├── Code ├── 1-1多项式拟合sin函数.py ├── 1-2欠拟合,拟合,过拟合.py ├── 1-3糖尿病的线性拟合.py ├── 1-4线性回归.py ├── 1-5弟弟妹妹身高的线性拟合.py ├── 10-1EM.py ├── 10-2Kmean和GaussianMixture.py ├── 11-1马尔可夫.py ├── 11-2马尔可夫训练模型.py ├── 11-3马尔可夫解码.py ├── 11-4马尔可夫(维特比最短路径).py ├── 11-5马尔可夫算法(对应课本P213习题).py ├── 12-1SVD数据压缩.py ├── 12-2PCA数据降维.py ├── 13-1文本特征处理(泰坦尼克).py ├── 13-2文本特征处理-词带.py ├── 13-3文本特征处理(泰坦尼克)-TFIDF.py ├── 13-4文本特征处理-单词二维化.py ├── 13-5文本特征处理-机器语言学习.py ├── 14-1交叉验证.py ├── 14-2Pipeline.py ├── 2-1(重要)二分类模型(感知器学习的原始算法).py ├── 2-2二分类模型(对偶算法).py ├── 2-3二分类模型(sklearn包里的分类算法).py ├── 2-4二分类的课后练习.py ├── 3-1K近邻的距离图.py ├── 3-2K近邻法距离加权与统一的对比.py ├── 3-2(1)K近邻法距离加权与统一的对比.py ├── 3-3KNN算法(原始和包).py ├── 3-4KNN(糖尿病).py ├── 3-5KNN(cifar-10).py ├── 4-1原始贝叶斯.py ├── 4-2导包的高斯贝叶斯.py ├── 4-3高斯伯努利多项式贝叶斯.py ├── 4-4高斯做的分类(数字样本).py ├── 4-5高斯(鱼样本).py ├── 4-6高斯(cifar-10).py ├── 5-1原始决策树.py ├── 5-2决策树(鸢尾样本).py ├── 5-3决策树(数字样本).py ├── 5-4多层决策树回归.py ├── 5-5决策树(鱼样本).py ├── 5-6决策树(cifar-10).py ├── 5-7决策树剪枝(乳腺癌样本).py ├── 5-8计算熵(entropy)的函数.py ├── 6-1逻辑斯蒂的概率分布.py ├── 6-2原始逻辑斯蒂(鸢尾样本).py ├── 6-2逻辑斯蒂(鸢尾样本).py ├── 6-4逻辑斯蒂(数字样本).py ├── 6-5逻辑斯蒂(乳腺癌样本)评估(二分类).py ├── 6-6逻辑斯蒂(广告样本).py ├── 7-1查找best参数.py ├── 7-2决策树(数字样本)评估(多分类).py ├── 7-2官网svm(花样本)评估(多分类).py ├── 8-1原始svm.py ├── 8-2svc参数讲解.py ├── 8-3核是可以选的.py ├── 8-4SVC(数字样本).py ├── 8-5SVC(cifar-10).py ├── 9-1bagging三种集成学习方式.py ├── 9-2原始Adaboost.py ├── 9-3Adaboost与RandomForest.py ├── 9-4集成学习(酒样本).py └── advertising.csv ├── Code_2022 ├── class10-test1.py ├── class10-test2.py ├── class10-test3.py ├── class11-test1.py ├── class11-test2.py ├── class12-test.py ├── class12-test2.py ├── class13-test1.py ├── class13-test2.py ├── class13-test3.py ├── class13-test4.py ├── class13-test5.py ├── class13-test6.py ├── class13-test7.py ├── class14-test1.py ├── class14-test2.py ├── class14-test3.py ├── class15-test1.py ├── class15-test2.py ├── class2-test1.py ├── class2-test2.py ├── class2-test3.py ├── class2-test4.py ├── class2-test5.py ├── class3-test1.py ├── class4-test1.py ├── class4-test2.py ├── class4-test3.py ├── class5-test1.py ├── class5-test2.py ├── class6-test1.py ├── class6-test2.py ├── class7-test1.py ├── class7-test2.py ├── class7-test3.py ├── class7-test4.py ├── class8-test1.py ├── class8-test2.py ├── class8-test3.py ├── class8-test4.py ├── class9-test1.py ├── class9-test2.py ├── class9-test3.py └── readme ├── LICENSE ├── README.md └── Slides ├── A Step by Step Backpropagation Example for Regression using an One-hot Encoded Categorical Variable .pdf ├── CH00 OverView.pdf ├── CH01 Statistical Learning.pdf ├── CH02 Perceptron.pdf ├── CH03 KNN.pdf ├── CH04 NaiveBayes.pdf ├── CH05 DecisionTree.pdf ├── CH06 LogicRegression and Maximum Entropy Model.pdf ├── CH07 SVM.pdf ├── CH08 Boosting.pdf ├── CH09 EM.pdf ├── CH10 Hidden Markov Model.pdf ├── CH16 PCA.pdf ├── CH21 PageRank.pdf ├── CH22 Transformer.pdf ├── CHX0 Summary.pdf └── CHX5 NN-CNN.pdf /Code/1-1多项式拟合sin函数.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.optimize import leastsq 3 | import matplotlib.pyplot as plt 4 | # %matplotlib inline 5 | 6 | # 目标函数 7 | def real_func(x): 8 | return np.sin(2*np.pi*x) 9 | 10 | # 多项式 11 | def fit_func(p, x): 12 | f = np.poly1d(p) 13 | return f(x) 14 | 15 | # 残差 16 | def residuals_func(p, x, y): 17 | ret = fit_func(p, x) - y 18 | return ret 19 | 20 | f = np.poly1d([1,1,1]) 21 | print(f(6)) 22 | 23 | # 十个点 24 | x = np.linspace(0, 1, 10) 25 | x_points = np.linspace(0, 1, 1000) 26 | # 加上正态分布噪音的目标函数的值 27 | y_ = real_func(x) 28 | y = [np.random.normal(0, 0.1) + y1 for y1 
in y_] 29 | 30 | def fitting(M=0): 31 | """ 32 | M 为 多项式的次数 33 | """ 34 | # 随机初始化多项式参数 35 | p_init = np.random.rand(M + 1) 36 | # 最小二乘法 37 | p_lsq = leastsq(residuals_func, p_init, args=(x, y)) 38 | print('Fitting Parameters:', p_lsq[0]) 39 | # 可视化 40 | plt.plot(x_points, real_func(x_points), label='real') 41 | plt.plot(x_points, fit_func(p_lsq[0], x_points), label='fitted curve') 42 | plt.plot(x, y, 'bo', label='noise') 43 | plt.legend() 44 | plt.show() 45 | return p_lsq 46 | 47 | p_lsq_0 = fitting(M=3) 48 | 49 | 50 | -------------------------------------------------------------------------------- /Code/1-2欠拟合,拟合,过拟合.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.model_selection import cross_val_score 7 | 8 | 9 | def true_fun(X): 10 | return np.cos(1.5 * np.pi * X) 11 | 12 | np.random.seed(0) 13 | 14 | n_samples = 30 15 | degrees = [1, 4, 15] 16 | 17 | X = np.sort(np.random.rand(n_samples)) 18 | y = true_fun(X) + np.random.randn(n_samples) * 0.1 19 | 20 | plt.figure(figsize=(14, 5)) 21 | for i in range(len(degrees)): 22 | ax = plt.subplot(1, len(degrees), i + 1) 23 | plt.setp(ax, xticks=(), yticks=()) 24 | 25 | polynomial_features = PolynomialFeatures(degree=degrees[i], 26 | include_bias=False) 27 | linear_regression = LinearRegression() 28 | pipeline = Pipeline([("polynomial_features", polynomial_features), 29 | ("linear_regression", linear_regression)]) 30 | pipeline.fit(X[:, np.newaxis], y) 31 | 32 | # Evaluate the models using crossvalidation 33 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 34 | scoring="neg_mean_squared_error", cv=10) 35 | 36 | X_test = np.linspace(0, 1, 100) 37 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 38 | plt.plot(X_test, true_fun(X_test), label="True function") 39 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 40 | plt.xlabel("x") 41 | plt.ylabel("y") 42 | plt.xlim((0, 1)) 43 | plt.ylim((-2, 2)) 44 | plt.legend(loc="best") 45 | plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format( 46 | degrees[i], -scores.mean(), scores.std())) 47 | plt.show() 48 | -------------------------------------------------------------------------------- /Code/1-3糖尿病的线性拟合.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from sklearn import datasets, linear_model 5 | from sklearn.metrics import mean_squared_error, r2_score 6 | 7 | # Load the diabetes dataset 8 | diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True) 9 | 10 | # Use only one feature 11 | diabetes_X_1 = diabetes_X[:, np.newaxis, 2] 12 | 13 | print(diabetes_X.shape) 14 | print(diabetes_X) 15 | 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | from sklearn import datasets, linear_model 19 | from sklearn.metrics import mean_squared_error, r2_score 20 | 21 | # Load the diabetes dataset 22 | diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True) 23 | 24 | # Use only one feature 25 | diabetes_X = diabetes_X[:, np.newaxis, 2] 26 | 27 | # Split the data into training/testing sets 28 | diabetes_X_train = diabetes_X[:-20] 29 | diabetes_X_test = diabetes_X[-20:] 30 | 31 | # Split the targets into training/testing sets 32 | diabetes_y_train = diabetes_y[:-20] 33 | diabetes_y_test = 
diabetes_y[-20:] 34 | 35 | # Create linear regression object 36 | regr = linear_model.LinearRegression() 37 | 38 | # Train the model using the training sets 39 | regr.fit(diabetes_X_train, diabetes_y_train) 40 | 41 | # Make predictions using the testing set 42 | diabetes_y_pred = regr.predict(diabetes_X_test) 43 | 44 | # The coefficients 45 | print('Coefficients: \n', regr.coef_) 46 | # The mean squared error 47 | print('Mean squared error: %.2f' 48 | % mean_squared_error(diabetes_y_test, diabetes_y_pred)) 49 | # The coefficient of determination: 1 is perfect prediction 50 | print('Coefficient of determination: %.2f' 51 | % r2_score(diabetes_y_test, diabetes_y_pred)) 52 | 53 | # Plot outputs 54 | plt.scatter(diabetes_X_test, diabetes_y_test, color='black') 55 | plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3) 56 | 57 | plt.xticks(()) 58 | plt.yticks(()) 59 | plt.show() 60 | 61 | -------------------------------------------------------------------------------- /Code/1-4线性回归.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) 4 | # y = 1 * x_0 + 2 * x_1 + 3 5 | y = np.dot(X, np.array([1, 2])) + 3 6 | reg = LinearRegression().fit(X, y) 7 | print(reg.score(X, y)) 8 | print(reg.coef_) 9 | print(reg.intercept_) 10 | print(reg.predict(np.array([[3, 5]]))) 11 | 12 | -------------------------------------------------------------------------------- /Code/1-5弟弟妹妹身高的线性拟合.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | import matplotlib.pyplot as plt 4 | a = [[71],[68],[66],[67],[70],[71],[70],[73],[72],[65],[66]] 5 | b = [69,64,65,63,65,62,65,64,66,59,62] 6 | def fit_func(p, x): 7 | f = np.poly1d(p) 8 | return f(x) 9 | def residuals_func(p, x, y): 10 | ret = fit_func(p, x) - y 11 | return ret 12 | 13 | plt.scatter(a, b, label = 'real data') 14 | plt.xlabel('bother height') 15 | plt.ylabel('sister height') 16 | plt.title('this is a demo') 17 | 18 | reg = LinearRegression().fit(a, b) 19 | y_pred = reg.predict(a) 20 | plt.plot(a, y_pred, color='red', label = 'prediect') 21 | plt.legend() # 将标注显示出来 22 | plt.show() 23 | -------------------------------------------------------------------------------- /Code/10-1EM.py: -------------------------------------------------------------------------------- 1 | 2 | import itertools 3 | 4 | import numpy as np 5 | from scipy import linalg 6 | import matplotlib.pyplot as plt 7 | import matplotlib as mpl 8 | 9 | from sklearn import mixture 10 | 11 | color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 12 | 'darkorange']) 13 | 14 | 15 | def plot_results(X, Y_, means, covariances, index, title): 16 | splot = plt.subplot(2, 1, 1 + index) 17 | for i, (mean, covar, color) in enumerate(zip( 18 | means, covariances, color_iter)): 19 | v, w = linalg.eigh(covar) 20 | v = 2. * np.sqrt(2.) * np.sqrt(v) 21 | u = w[0] / linalg.norm(w[0]) 22 | # as the DP will not use every component it has access to 23 | # unless it needs it, we shouldn't plot the redundant 24 | # components. 25 | if not np.any(Y_ == i): 26 | continue 27 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 28 | 29 | # Plot an ellipse to show the Gaussian component 30 | angle = np.arctan(u[1] / u[0]) 31 | angle = 180. 
* angle / np.pi # convert to degrees 32 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) 33 | ell.set_clip_box(splot.bbox) 34 | ell.set_alpha(0.5) 35 | splot.add_artist(ell) 36 | 37 | plt.xlim(-9., 5.) 38 | plt.ylim(-3., 6.) 39 | plt.xticks(()) 40 | plt.yticks(()) 41 | plt.title(title) 42 | 43 | 44 | # Number of samples per component 45 | n_samples = 500 46 | 47 | # Generate random sample, two components 48 | np.random.seed(0) 49 | C = np.array([[0., -0.1], [1.7, .4]]) 50 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), 51 | .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] 52 | 53 | # Fit a Gaussian mixture with EM using five components 54 | gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X) 55 | plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, 56 | 'Gaussian Mixture') 57 | 58 | 59 | 60 | plt.show() 61 | 62 | 63 | 64 | print(gmm.weights_) 65 | print(gmm.means_) 66 | print(gmm.covariances_) 67 | 68 | from sklearn import mixture 69 | 70 | 71 | 72 | color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 73 | 'darkorange']) 74 | 75 | 76 | def plot_results(X, Y_, means, covariances, index, title): 77 | splot = plt.subplot(2, 1, 1 + index) 78 | for i, (mean, covar, color) in enumerate(zip( 79 | means, covariances, color_iter)): 80 | v, w = linalg.eigh(covar) 81 | v = 2. * np.sqrt(2.) * np.sqrt(v) 82 | u = w[0] / linalg.norm(w[0]) 83 | # as the DP will not use every component it has access to 84 | # unless it needs it, we shouldn't plot the redundant 85 | # components. 86 | if not np.any(Y_ == i): 87 | continue 88 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 89 | 90 | # Plot an ellipse to show the Gaussian component 91 | angle = np.arctan(u[1] / u[0]) 92 | angle = 180. * angle / np.pi # convert to degrees 93 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) 94 | ell.set_clip_box(splot.bbox) 95 | ell.set_alpha(0.5) 96 | splot.add_artist(ell) 97 | 98 | plt.xlim(-9., 5.) 99 | plt.ylim(-3., 6.) 100 | plt.xticks(()) 101 | plt.yticks(()) 102 | plt.title(title) 103 | 104 | 105 | -------------------------------------------------------------------------------- /Code/10-2Kmean和GaussianMixture.py: -------------------------------------------------------------------------------- 1 | 2 | import itertools 3 | 4 | import numpy as np 5 | from scipy import linalg 6 | import matplotlib.pyplot as plt 7 | import matplotlib as mpl 8 | 9 | from sklearn import mixture 10 | 11 | color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 12 | 'darkorange']) 13 | 14 | 15 | def plot_results(X, Y_, means, covariances, index, title): 16 | splot = plt.subplot(2, 1, 1 + index) 17 | for i, (mean, covar, color) in enumerate(zip( 18 | means, covariances, color_iter)): 19 | v, w = linalg.eigh(covar) 20 | v = 2. * np.sqrt(2.) * np.sqrt(v) 21 | u = w[0] / linalg.norm(w[0]) 22 | # as the DP will not use every component it has access to 23 | # unless it needs it, we shouldn't plot the redundant 24 | # components. 25 | if not np.any(Y_ == i): 26 | continue 27 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 28 | 29 | # Plot an ellipse to show the Gaussian component 30 | angle = np.arctan(u[1] / u[0]) 31 | angle = 180. * angle / np.pi # convert to degrees 32 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) 33 | ell.set_clip_box(splot.bbox) 34 | ell.set_alpha(0.5) 35 | splot.add_artist(ell) 36 | 37 | plt.xlim(-9., 5.) 38 | plt.ylim(-3., 6.) 
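# Note on the ellipse drawn above: eigh(covar) returns the eigenvalues/eigenvectors of the
# 2x2 covariance matrix, v = 2*sqrt(2)*sqrt(eigenvalue) is used as the ellipse width/height
# (proportional to the standard deviation along each principal axis), and `angle` rotates the
# patch onto the leading eigenvector, so each ellipse sketches the shape of one Gaussian component.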
39 | plt.xticks(()) 40 | plt.yticks(()) 41 | plt.title(title) 42 | 43 | 44 | # Number of samples per component 45 | n_samples = 500 46 | 47 | # Generate random sample, two components 48 | np.random.seed(0) 49 | C = np.array([[0., -0.1], [1.7, .4]]) 50 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), 51 | .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] 52 | 53 | # Fit a Gaussian mixture with EM using five components 54 | gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X) 55 | plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, 56 | 'Gaussian Mixture') 57 | 58 | 59 | 60 | plt.show() 61 | 62 | print(gmm.weights_) 63 | print(gmm.means_) 64 | print(gmm.covariances_) 65 | 66 | from sklearn.cluster import KMeans 67 | import numpy as np 68 | X = np.array([[1, 2], [1, 4], [1, 0],[4, 2], [4, 4], [4, 0]]) 69 | kmeans = KMeans(n_clusters=2, random_state=0).fit(X) 70 | print(kmeans.labels_) 71 | print(kmeans.predict([[0, 0], [4, 4]])) 72 | print(kmeans.cluster_centers_) 73 | 74 | 75 | #euclidian distance between 2 data points. For as many data points as necessary. 76 | def euclidean_distance(a, b): 77 | return np.linalg.norm(a-b) 78 | 79 | 80 | 81 | 82 | 83 | def kmeans(data,k=3): 84 | m = data.shape[0] 85 | index = random.sample(range(m),k) 86 | mu = data[index] #随机选择初始均值向量 87 | 88 | 89 | while True: 90 | 91 | C = defaultdict(list) 92 | 93 | for j in range(0,m): 94 | dij = [euclidean_distance(data[j],mu[i]) for i in range(k)] 95 | lambda_j = np.argmin(dij) #选择最小的值得下标 96 | 97 | C[lambda_j].append(data[j].tolist()) 98 | 99 | new_mu = [np.mean(C[i],axis=0).tolist() for i in range(k)] 100 | 101 | if (euclidean_distance(np.array(new_mu),np.array(mu))>1e-9): 102 | mu = new_mu 103 | else: 104 | break 105 | 106 | return C,mu 107 | 108 | 109 | watermelon = np.array([[ 0.697 ,0.46 ], 110 | [ 0.774 ,0.376], 111 | [ 0.634 ,0.264], 112 | [ 0.608 ,0.318], 113 | [ 0.556 ,0.215], 114 | [ 0.403 ,0.237], 115 | [ 0.481 ,0.149], 116 | [ 0.437 ,0.211], 117 | [ 0.666 ,0.091], 118 | [ 0.243 ,0.267], 119 | [ 0.245 ,0.057], 120 | [ 0.343 ,0.099], 121 | [ 0.639 ,0.161], 122 | [ 0.657 ,0.198], 123 | [ 0.36 ,0.37 ], 124 | [ 0.593 ,0.042], 125 | [ 0.719 ,0.103], 126 | [ 0.359 ,0.188], 127 | [ 0.339 ,0.241], 128 | [ 0.282 ,0.257], 129 | [ 0.748 ,0.232], 130 | [ 0.714 ,0.346], 131 | [ 0.483 ,0.312], 132 | [ 0.478 ,0.437], 133 | [ 0.525 ,0.369], 134 | [ 0.751 ,0.489], 135 | [ 0.532 ,0.472], 136 | [ 0.473 ,0.376], 137 | [ 0.725 ,0.445], 138 | [ 0.446 ,0.459]]) 139 | 140 | 141 | k = 2 142 | res,mu = kmeans(watermelon,k) 143 | print(res) 144 | print('新的中心:',mu) 145 | 146 | 147 | class GaussianMixture: 148 | "Model mixture of two univariate Gaussians and their EM estimation" 149 | 150 | def __init__(self, data, mu_min=min(data), mu_max=max(data), sigma_min=.1, sigma_max=1, mix=.5): 151 | self.data = data 152 | # init with multiple gaussians 153 | self.one = Gaussian(uniform(mu_min, mu_max), 154 | uniform(sigma_min, sigma_max)) 155 | self.two = Gaussian(uniform(mu_min, mu_max), 156 | uniform(sigma_min, sigma_max)) 157 | 158 | # as well as how much to mix them 159 | self.mix = mix 160 | self.loglike = 0. # = log(p = 1) 161 | 162 | def Estep(self): 163 | "Perform an E(stimation)-step, freshening up self.loglike in the process" 164 | # compute weights 165 | self.loglike = 0. # = log(p = 1) 166 | for datum in self.data: 167 | # unnormalized weights 168 | wp1 = self.one.pdf(datum) * self.mix 169 | wp2 = self.two.pdf(datum) * (1. 
- self.mix) 170 | # compute denominator 171 | den = wp1 + wp2 172 | # normalize 173 | wp1 /= den 174 | wp2 /= den 175 | # add into loglike 176 | self.loglike += log(wp1 + wp2) 177 | # yield weight tuple 178 | yield (wp1, wp2) 179 | 180 | def Mstep(self, weights): 181 | "Perform an M(aximization)-step" 182 | # compute denominators 183 | (left, rigt) = zip(*weights) 184 | one_den = sum(left) 185 | two_den = sum(rigt) 186 | # compute new means 187 | self.one.mu = sum(w * d / one_den for (w, d) in zip(left, data)) 188 | self.two.mu = sum(w * d / two_den for (w, d) in zip(rigt, data)) 189 | # compute new sigmas 190 | self.one.sigma = sqrt(sum(w * ((d - self.one.mu) ** 2) 191 | for (w, d) in zip(left, data)) / one_den) 192 | self.two.sigma = sqrt(sum(w * ((d - self.two.mu) ** 2) 193 | for (w, d) in zip(rigt, data)) / two_den) 194 | # compute new mix 195 | self.mix = one_den / len(data) 196 | 197 | def iterate(self, N=1, verbose=False): 198 | "Perform N iterations, then compute log-likelihood" 199 | 200 | def pdf(self, x): 201 | return (self.mix) * self.one.pdf(x) + (1 - self.mix) * self.two.pdf(x) 202 | 203 | def __repr__(self): 204 | return 'GaussianMixture({0}, {1}, mix={2.03})'.format(self.one, 205 | self.two, 206 | self.mix) 207 | 208 | def __str__(self): 209 | return 'Mixture: {0}, {1}, mix={2:.03})'.format(self.one, 210 | self.two, 211 | self.mix) -------------------------------------------------------------------------------- /Code/11-1马尔可夫.py: -------------------------------------------------------------------------------- 1 | from math import exp 2 | 3 | import numpy as np 4 | from hmmlearn import hmm 5 | 6 | status = ['盒子1', '盒子2', '盒子3'] 7 | obs = ['白球', '黑球'] 8 | n_status = len(status) 9 | m_obs = len(obs) 10 | start_probability = np.array([0.2, 0.5, 0.3]) 11 | transition_probability = np.array([ 12 | [0.5, 0.4, 0.1], #盒子1到1,1到2,1到3的概率 13 | [0.2, 0.2, 0.6], 14 | [0.2, 0.5, 0.3] 15 | ]) 16 | emission_probalitity = np.array([ 17 | [0.4, 0.6], 18 | [0.8, 0.2], 19 | [0.5, 0.5] 20 | ]) 21 | 22 | model = hmm.MultinomialHMM(n_components=n_status) 23 | model.startprob_ = start_probability 24 | model.transmat_ = transition_probability 25 | model.emissionprob_ = emission_probalitity 26 | 27 | # 预测问题 28 | seen=np.array([0,1,0]) #白球,黑球,白球 29 | 30 | # 观测序列的概率计算问题 31 | # score函数返回的是以自然对数为底的对数概率值 32 | # ln0.13022≈−2.0385 33 | print(exp(model.score(seen.reshape(-1,1)))) -------------------------------------------------------------------------------- /Code/11-2马尔可夫训练模型.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import hmmlearn.hmm as hmm 3 | 4 | states = ['盒子1', '盒子2', '盒子3'] 5 | obs = ['白球', '黑球'] 6 | n_states = len(states) 7 | m_obs = len(obs) 8 | 9 | model2 = hmm.MultinomialHMM(n_components=n_states, n_iter=20, tol=0.001) 10 | X2 = np.array([ 11 | [0, 1, 0, 0, 1], 12 | [0, 0, 0, 1, 1], 13 | [1, 1, 0, 1, 0], 14 | [0, 1, 0, 1, 1], 15 | [0, 0, 0, 1, 0] 16 | ]) 17 | model2.fit(X2) 18 | print("输出根据数据训练出来的π") 19 | print(model2.startprob_) 20 | print("输出根据数据训练出来的A") 21 | print(model2.transmat_) 22 | print("输出根据数据训练出来的B") 23 | print(model2.emissionprob_) 24 | #从观测的结果反过去求盒子和球 -------------------------------------------------------------------------------- /Code/11-3马尔可夫解码.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hmmlearn import hmm 3 | status = ['盒子1', '盒子2', '盒子3'] 4 | obs = ['白球', '黑球'] 5 | n_status = len(status) 6 | m_obs = len(obs) 7 | start_probability = 
np.array([0.2, 0.5, 0.3]) 8 | transition_probability = np.array([ 9 | [0.5, 0.4, 0.1], 10 | [0.2, 0.2, 0.6], 11 | [0.2, 0.5, 0.3] 12 | ]) 13 | emission_probalitity = np.array([ 14 | [0.4, 0.6], 15 | [0.8, 0.2], 16 | [0.5, 0.5] 17 | ]) 18 | 19 | model = hmm.MultinomialHMM(n_components=n_status) 20 | model.startprob_ = start_probability 21 | model.transmat_ = transition_probability 22 | model.emissionprob_ = emission_probalitity 23 | 24 | se = np.array([[0, 1, 0, 0, 1]]).T 25 | logprob, box_index = model.decode(se, algorithm='viterbi') 26 | print("颜色:", end="") 27 | print(" ".join(map(lambda t: obs[t], [0, 1, 0, 0, 1]))) 28 | print("盒子:", end="") 29 | print(" ".join(map(lambda t: status[t], box_index))) 30 | print("概率值:", end="") 31 | print(np.exp(logprob)) # 这个是因为在hmmlearn底层将概率进行了对数化,防止出现乘积为0的情况 -------------------------------------------------------------------------------- /Code/11-4马尔可夫(维特比最短路径).py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hmmlearn import hmm 3 | startprob = np.array([0.6, 0.3, 0.1, 0.0]) 4 | # The transition matrix, note that there are no transitions possible 5 | # between component 1 and 3 6 | transmat = np.array([[0.7, 0.2, 0.0, 0.1], 7 | [0.3, 0.5, 0.2, 0.0], 8 | [0.0, 0.3, 0.5, 0.2], 9 | [0.2, 0.0, 0.2, 0.6]]) 10 | # The means of each component 11 | means = np.array([[0.0, 0.0], 12 | [0.0, 11.0], 13 | [9.0, 10.0], 14 | [11.0, -1.0]]) 15 | # The covariance of each component 16 | covars = .5 * np.tile(np.identity(2), (4, 1, 1)) 17 | 18 | # Build an HMM instance and set parameters 19 | model3 = hmm.GaussianHMM(n_components=4, covariance_type="full") 20 | 21 | # Instead of fitting it from the data, we directly set the estimated 22 | # parameters, the means and covariance of the components 23 | model3.startprob_ = startprob 24 | model3.transmat_ = transmat 25 | model3.means_ = means 26 | model3.covars_ = covars 27 | 28 | 29 | seen = np.array([[1.1,2.0],[-1,2.0],[3,7]]) 30 | logprob, state = model3.decode(seen, algorithm="viterbi") 31 | print(logprob,state) 32 | 33 | 34 | -------------------------------------------------------------------------------- /Code/11-5马尔可夫算法(对应课本P213习题).py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class HiddenMarkov: 4 | def forward(self, Q, V, A, B, O, PI): # 使用前向算法 5 | N = len(Q) # 状态序列的大小 6 | M = len(O) # 观测序列的大小 7 | alphas = np.zeros((N, M)) # alpha值 8 | T = M # 有几个时刻,有几个观测序列,就有几个时刻 9 | for t in range(T): # 遍历每一时刻,算出alpha值 10 | indexOfO = V.index(O[t]) # 找出序列对应的索引 11 | for i in range(N): 12 | if t == 0: # 计算初值 13 | alphas[i][t] = PI[t][i] * B[i][indexOfO] # P176(10.15) 14 | print('alpha1(%d)=p%db%db(o1)=%f' % (i, i, i, alphas[i][t])) 15 | else: 16 | alphas[i][t] = np.dot([alpha[t - 1] for alpha in alphas], [a[i] for a in A]) * B[i][ 17 | indexOfO] # 对应P176(10.16) 18 | print('alpha%d(%d)=[sigma alpha%d(i)ai%d]b%d(o%d)=%f' % (t, i, t - 1, i, i, t, alphas[i][t])) 19 | # print(alphas) 20 | P = np.sum([alpha[M - 1] for alpha in alphas]) # P176(10.17) 21 | # alpha11 = pi[0][0] * B[0][0] #代表a1(1) 22 | # alpha12 = pi[0][1] * B[1][0] #代表a1(2) 23 | # alpha13 = pi[0][2] * B[2][0] #代表a1(3) 24 | 25 | def backward(self, Q, V, A, B, O, PI): # 后向算法 26 | N = len(Q) # 状态序列的大小 27 | M = len(O) # 观测序列的大小 28 | betas = np.ones((N, M)) # beta 29 | for i in range(N): 30 | print('beta%d(%d)=1' % (M, i)) 31 | for t in range(M - 2, -1, -1): 32 | indexOfO = V.index(O[t + 1]) # 找出序列对应的索引 33 | for i in range(N): 34 | betas[i][t] = 
np.dot(np.multiply(A[i], [b[indexOfO] for b in B]), [beta[t + 1] for beta in betas]) 35 | realT = t + 1 36 | realI = i + 1 37 | print('beta%d(%d)=[sigma a%djbj(o%d)]beta%d(j)=(' % (realT, realI, realI, realT + 1, realT + 1), 38 | end='') 39 | for j in range(N): 40 | print("%.2f*%.2f*%.2f+" % (A[i][j], B[j][indexOfO], betas[j][t + 1]), end='') 41 | print("0)=%.3f" % betas[i][t]) 42 | # print(betas) 43 | indexOfO = V.index(O[0]) 44 | P = np.dot(np.multiply(PI, [b[indexOfO] for b in B]), [beta[0] for beta in betas]) 45 | print("P(O|lambda)=", end="") 46 | for i in range(N): 47 | print("%.1f*%.1f*%.5f+" % (PI[0][i], B[i][indexOfO], betas[i][0]), end="") 48 | print("0=%f" % P) 49 | 50 | def viterbi(self, Q, V, A, B, O, PI): 51 | N = len(Q) # 状态序列的大小 52 | M = len(O) # 观测序列的大小 53 | deltas = np.zeros((N, M)) 54 | psis = np.zeros((N, M)) 55 | I = np.zeros((1, M)) 56 | for t in range(M): 57 | realT = t+1 58 | indexOfO = V.index(O[t]) # 找出序列对应的索引 59 | for i in range(N): 60 | realI = i+1 61 | if t == 0: 62 | deltas[i][t] = PI[0][i] * B[i][indexOfO] 63 | psis[i][t] = 0 64 | print('delta1(%d)=pi%d * b%d(o1)=%.2f * %.2f=%.2f'%(realI, realI, realI, PI[0][i], B[i][indexOfO], deltas[i][t])) 65 | print('psis1(%d)=0' % (realI)) 66 | else: 67 | deltas[i][t] = np.max(np.multiply([delta[t-1] for delta in deltas], [a[i] for a in A])) * B[i][indexOfO] 68 | print('delta%d(%d)=max[delta%d(j)aj%d]b%d(o%d)=%.2f*%.2f=%.5f'%(realT, realI, realT-1, realI, realI, realT, np.max(np.multiply([delta[t-1] for delta in deltas], [a[i] for a in A])), B[i][indexOfO], deltas[i][t])) 69 | psis[i][t] = np.argmax(np.multiply([delta[t-1] for delta in deltas], [a[i] for a in A])) 70 | print('psis%d(%d)=argmax[delta%d(j)aj%d]=%d' % (realT, realI, realT-1, realI, psis[i][t])) 71 | print(deltas) 72 | print(psis) 73 | I[0][M-1] = np.argmax([delta[M-1] for delta in deltas]) 74 | print('i%d=argmax[deltaT(i)]=%d' % (M, I[0][M-1]+1)) 75 | for t in range(M-2, -1, -1): 76 | I[0][t] = psis[int(I[0][t+1])][t+1] 77 | print('i%d=psis%d(i%d)=%d' % (t+1, t+2, t+2, I[0][t]+1)) 78 | print(I) 79 | 80 | 81 | Q = [1, 2, 3] 82 | V = ['红', '白'] 83 | A = [[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]] 84 | B = [[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]] 85 | # O = ['红', '白', '红', '红', '白', '红', '白', '白'] 86 | O = ['红', '白', '红', '白'] 87 | PI = [[0.2, 0.4, 0.4]] 88 | 89 | 90 | HMM = HiddenMarkov() 91 | HMM.forward(Q, V, A, B, O, PI) 92 | HMM.backward(Q, V, A, B, O, PI) 93 | HMM.viterbi(Q, V, A, B, O, PI) 94 | 95 | -------------------------------------------------------------------------------- /Code/12-1SVD数据压缩.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.decomposition import TruncatedSVD 3 | from scipy.sparse import random as sparse_random 4 | from sklearn.random_projection import sparse_random_matrix 5 | X = sparse_random(100, 100, density=0.01, format='csr', 6 | random_state=42) 7 | svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) 8 | svd.fit(X) 9 | 10 | print(svd.explained_variance_ratio_) 11 | 12 | print(svd.explained_variance_ratio_.sum()) 13 | print(svd.singular_values_) 14 | import numpy as np 15 | 16 | a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6) 17 | b = np.random.randn(2, 7, 8, 3) + 1j*np.random.randn(2, 7, 8, 3) 18 | 19 | u, s, vh = np.linalg.svd(a, full_matrices=True) 20 | print(a.shape) 21 | print(u.shape, s.shape, vh.shape) 22 | 23 | from PIL import Image 24 | import matplotlib.image as mpimg 25 | 26 | 27 | I = 
mpimg.imread('data/F_test.jpeg') 28 | #Now, let's look at the size of this numpy array object img as well as plot it using imshow. 29 | print(I.shape) 30 | plt.axis('off') 31 | plt.imshow(I) 32 | 33 | def show_img(img): 34 | plt.figure(figsize = (10, 7.5)) 35 | plt.imshow(img, cmap = 'gray', vmin=0, vmax=255, aspect = 'auto') 36 | plt.axis('off') 37 | plt.show() 38 | 39 | U, S, V_T = np.linalg.svd(I) 40 | #U.shape, S.shape, V_T.shape 41 | 42 | 43 | 44 | I = I[:,:,1] 45 | print(I.shape) 46 | 47 | 48 | plt.figure(figsize = (9, 5)) 49 | plt.plot(np.arange(S.shape[0]), S) 50 | plt.yscale('log') 51 | plt.xlabel('Index of $\sigma$') 52 | plt.ylabel('log(value of $\sigma$)') 53 | plt.title('Singular values $\sigma_i$ vs its index') 54 | plt.show() 55 | plt.figure(figsize = (9, 5)) 56 | plt.plot(np.cumsum(S) / sum(S)) 57 | plt.xlabel('Index of $\sigma$') 58 | plt.ylabel('Value of $\sigma$') 59 | plt.title('Cumulative sum of $\sigma_i$ vs its index\n(Percent of explained variance)') 60 | plt.show() 61 | S_full = np.zeros((U.shape[0], V_T.shape[0])) 62 | 63 | #S_full.shape 64 | 65 | S_diag = np.diag(S) 66 | S_full[:S_diag.shape[0], :S_diag.shape[1]] = S_diag 67 | 68 | for i in [5, 10, 25, 50, 100, 200, U.shape[0]]: 69 | print(str(i) + '\n') 70 | show_img(U[:, :i].dot(S_full[:i, :i].dot(V_T[:i, :]))) 71 | print('-' * 100 + '\n') 72 | 73 | 74 | -------------------------------------------------------------------------------- /Code/12-2PCA数据降维.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.datasets import load_iris 3 | import numpy as np 4 | iris = load_iris() 5 | #checking to see what datasets are available in iris 6 | print(iris.keys()) 7 | print(iris.data.shape) 8 | print(iris.feature_names) 9 | 10 | from sklearn.decomposition import PCA 11 | pca = PCA(2) 12 | print(pca) 13 | 14 | X, y = iris.data, iris.target 15 | X_proj = pca.fit_transform(X) 16 | print(X_proj.shape) 17 | 18 | plt.scatter(X_proj[:,0], X_proj[:,1],c=y) 19 | plt.show() 20 | 21 | from sklearn.datasets import load_digits 22 | digits = load_digits() 23 | print(digits.keys()) 24 | 25 | print(digits.data.shape) 26 | 27 | print(digits.images.shape) 28 | 29 | X,y = digits.data, digits.target 30 | pca_digits=PCA(0.95) 31 | X_proj = pca_digits.fit_transform(X) 32 | print(X.shape, X_proj.shape) 33 | 34 | 35 | pca_digits=PCA(2) 36 | X_proj = pca_digits.fit_transform(X) 37 | print(np.sum(pca_digits.explained_variance_ratio_)) 38 | 39 | 40 | print(X_proj.shape) 41 | 42 | 43 | plt.scatter(X_proj[:,0], X_proj[:,1], c=y) 44 | plt.colorbar() 45 | plt.show() 46 | 47 | pca_digits = PCA(64).fit(X) 48 | plt.semilogx(np.cumsum(pca_digits.explained_variance_ratio_)) 49 | plt.xlabel('Number of Components') 50 | plt.ylabel('Variance retained') 51 | plt.ylim(0,1) 52 | plt.show() 53 | 54 | from PIL import Image 55 | 56 | im1 = Image.open('data/F_test.jpeg') 57 | im1.save('data/F_test.png') 58 | 59 | import matplotlib.image as mpimg 60 | img = mpimg.imread('data/F_test.png') 61 | #Now, let's look at the size of this numpy array object img as well as plot it using imshow. 
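# Note: mpimg.imread on a PNG returns float pixel values in [0, 1]; the later reshape to
# (800, 3600) assumes an 800x1200 image with 3 colour channels, flattening each row's RGB
# values into a single 3600-dimensional sample before PCA.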
62 | print(img.shape) 63 | plt.axis('off') 64 | plt.imshow(img) 65 | 66 | 67 | img_r = np.reshape(img, (800, 3600)) 68 | print(img_r.shape) 69 | 70 | 71 | ipca = PCA(64).fit(img_r) 72 | img_c = ipca.transform(img_r) 73 | print(img_c.shape) 74 | print(np.sum(ipca.explained_variance_ratio_)) 75 | 76 | 77 | temp = ipca.inverse_transform(img_c) 78 | print(temp.shape) 79 | 80 | 81 | temp = np.reshape(temp, (800,1200,3)) 82 | print(temp.shape) 83 | 84 | 85 | plt.axis('off') 86 | plt.imshow(temp) 87 | -------------------------------------------------------------------------------- /Code/13-1文本特征处理(泰坦尼克).py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | from sklearn.preprocessing import OneHotEncoder 4 | 5 | train = pd.read_csv('onehot/train.csv') 6 | train.head() 7 | 8 | 9 | train.info() 10 | 11 | 12 | data = train 13 | data['Died']= 1 - data['Survived'] 14 | plt.show(data.groupby('Sex').agg('sum')[['Survived','Died']].plot(kind='bar',stacked=True)) 15 | 16 | 17 | encoder = OneHotEncoder(sparse=False) 18 | En_ec = encoder.fit_transform(train[['Sex']]) 19 | En_ec = pd.DataFrame(En_ec) 20 | train_new = pd.concat([train,En_ec],axis=1) 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /Code/13-2文本特征处理-词带.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | corpus = [ 3 | 'This is the first document.', 4 | 'This document is the second document.', 5 | 'And this is the third one.', 6 | 'Is this the first document?', 7 | ] 8 | 9 | vectorizer = CountVectorizer() 10 | X = vectorizer.fit_transform(corpus) 11 | print(vectorizer.get_feature_names()) 12 | 13 | print(X.toarray()) 14 | 15 | 16 | vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2)) 17 | X2 = vectorizer2.fit_transform(corpus) 18 | print(vectorizer2.get_feature_names()) 19 | 20 | print(X2.toarray()) 21 | -------------------------------------------------------------------------------- /Code/13-3文本特征处理(泰坦尼克)-TFIDF.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | corpus = ['this is the first document', 3 | 'this document is the second document', 4 | 'and this is the third one', 5 | 'is this the first document'] 6 | 7 | def display_features(features,feature_names): 8 | df = pd.DataFrame(data=features, columns = feature_names) 9 | print(df) 10 | 11 | 12 | 13 | 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | 17 | def bow_extractor(corpus, ngram_range=(1, 1)): 18 | vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range) 19 | features = vectorizer.fit_transform(corpus) 20 | return vectorizer, features 21 | 22 | 23 | 24 | bow_vectorizer, bow_features = bow_extractor(corpus) 25 | print(bow_features.todense()) 26 | 27 | feature_names = bow_vectorizer.get_feature_names() 28 | 29 | print(feature_names) 30 | 31 | features = bow_features.todense() 32 | display_features(features, feature_names) 33 | 34 | 35 | from sklearn.feature_extraction.text import TfidfTransformer 36 | 37 | 38 | def tfidf_transformer(bow_matrix): 39 | transformer = TfidfTransformer(norm='l2', 40 | smooth_idf=True, 41 | use_idf=True) 42 | tfidf_matrix = transformer.fit_transform(bow_matrix) 43 | return transformer, tfidf_matrix 44 | 45 | 46 | 47 | import numpy as np 48 | 49 | feature_names = bow_vectorizer.get_feature_names() 
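# Note: with the settings used in tfidf_transformer below (smooth_idf=True, norm='l2'),
# each raw count is reweighted as tf * (ln((1 + n_docs) / (1 + df(term))) + 1) and every
# document vector is then L2-normalised, so terms occurring in all four documents
# ("is", "the", "this") receive the minimum idf of 1.0.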
50 | tfidf_trans, tdidf_features = tfidf_transformer(bow_features) 51 | 52 | features = np.round(tdidf_features.todense(), 2) 53 | display_features(features, feature_names) 54 | 55 | -------------------------------------------------------------------------------- /Code/13-4文本特征处理-单词二维化.py: -------------------------------------------------------------------------------- 1 | from gensim.models import Word2Vec 2 | # define training data 3 | sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'], 4 | ['this', 'is', 'the', 'second', 'sentence'], 5 | ['yet', 'another', 'sentence'], 6 | ['one', 'more', 'sentence'], 7 | ['and', 'the', 'final', 'sentence']] 8 | # train model 9 | model = Word2Vec(sentences, min_count=1) 10 | # summarize the loaded model 11 | print(model) 12 | # summarize vocabulary 13 | words = list(model.wv.vocab) 14 | print(words) 15 | # access vector for one word 16 | print(model['sentence']) 17 | 18 | 19 | from sklearn.decomposition import PCA 20 | 21 | 22 | 23 | from matplotlib import pyplot 24 | 25 | 26 | 27 | X = model[model.wv.vocab] 28 | pca = PCA(n_components=2) 29 | result = pca.fit_transform(X) 30 | # create a scatter plot of the projection 31 | pyplot.scatter(result[:, 0], result[:, 1]) 32 | words = list(model.wv.vocab) 33 | for i, word in enumerate(words): 34 | pyplot.annotate(word, xy=(result[i, 0], result[i, 1])) 35 | pyplot.show() 36 | 37 | -------------------------------------------------------------------------------- /Code/13-5文本特征处理-机器语言学习.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | from tqdm import tqdm 5 | tqdm.pandas(desc="progress-bar") 6 | from gensim.models import Doc2Vec 7 | from sklearn import utils 8 | from sklearn.model_selection import train_test_split 9 | import gensim 10 | from sklearn.linear_model import LogisticRegression 11 | from gensim.models.doc2vec import TaggedDocument 12 | import re 13 | import seaborn as sns 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | df = pd.read_csv('onehot/Consumer_Complaints_sim.csv') 18 | df = df[['Sub-issue','Product']] 19 | df = df[pd.notnull(df['Sub-issue'])] 20 | df.head(10) 21 | 22 | 23 | 24 | df.shape 25 | 26 | 27 | cnt_pro = df['Product'].value_counts() 28 | 29 | plt.figure(figsize=(12,4)) 30 | sns.barplot(cnt_pro.index, cnt_pro.values, alpha=0.8) 31 | plt.ylabel('Number of Occurrences', fontsize=12) 32 | plt.xlabel('Product', fontsize=12) 33 | plt.xticks(rotation=90) 34 | plt.show() 35 | 36 | 37 | df.rename(columns = {'Consumer complaint narrative':'narrative'}, inplace = True) 38 | df.rename(columns = {'Sub-issue':'narrative'}, inplace = True) 39 | 40 | 41 | from gensim.models import doc2vec 42 | 43 | def label_sentences(corpus, label_type): 44 | """ 45 | Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it. 46 | We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is 47 | a dummy index of the complaint narrative. 
48 | """ 49 | labeled = [] 50 | for i, v in enumerate(corpus): 51 | label = label_type + '_' + str(i) 52 | labeled.append(doc2vec.TaggedDocument(v.split(), [label])) 53 | return labeled 54 | 55 | 56 | X_train, X_test, y_train, y_test = train_test_split(df.narrative, df.Product, random_state=0, test_size=0.3) 57 | X_train = label_sentences(X_train, 'Train') 58 | X_test = label_sentences(X_test, 'Test') 59 | all_data = X_train + X_test 60 | 61 | 62 | model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065) 63 | model_dbow.build_vocab([x for x in tqdm(all_data)]) 64 | 65 | 66 | 67 | for epoch in range(30): 68 | model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1) 69 | model_dbow.alpha -= 0.002 70 | model_dbow.min_alpha = model_dbow.alpha 71 | 72 | 73 | def get_vectors(model, corpus_size, vectors_size, vectors_type): 74 | """ 75 | Get vectors from trained doc2vec model 76 | :param doc2vec_model: Trained Doc2Vec model 77 | :param corpus_size: Size of the data 78 | :param vectors_size: Size of the embedding vectors 79 | :param vectors_type: Training or Testing vectors 80 | :return: list of vectors 81 | """ 82 | vectors = np.zeros((corpus_size, vectors_size)) 83 | for i in range(0, corpus_size): 84 | prefix = vectors_type + '_' + str(i) 85 | vectors[i] = model.docvecs[prefix] 86 | return vectors 87 | 88 | 89 | train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train') 90 | test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test') 91 | print(test_vectors_dbow) 92 | print(train_vectors_dbow.shape) 93 | print(test_vectors_dbow.shape) 94 | 95 | 96 | from sklearn.linear_model import LogisticRegression 97 | 98 | logreg = LogisticRegression(multi_class='multinomial', solver = 'lbfgs') 99 | logreg.fit(train_vectors_dbow, y_train) 100 | 101 | 102 | print("逻辑斯蒂测准确率="+logreg.score(test_vectors_dbow, y_test)) 103 | 104 | -------------------------------------------------------------------------------- /Code/14-1交叉验证.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | from sklearn import datasets 4 | from sklearn import svm 5 | 6 | iris = datasets.load_iris() 7 | print(iris.data.shape, iris.target.shape) 8 | 9 | X_train, X_test, y_train, y_test = train_test_split( 10 | iris.data, iris.target, test_size=0.4, random_state=0) 11 | 12 | 13 | 14 | clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train) 15 | clf.score(X_test, y_test) 16 | 17 | from sklearn.model_selection import cross_val_score 18 | clf = svm.SVC(kernel='linear', C=1) 19 | scores = cross_val_score(clf, iris.data, iris.target, cv=5 ) #交叉测试了5次 20 | print(scores) 21 | print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) 22 | 23 | 24 | from sklearn.model_selection import ShuffleSplit 25 | n_samples = iris.data.shape[0] 26 | cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0) 27 | scores = cross_val_score(clf, iris.data, iris.target, cv=cv) 28 | print(scores) 29 | 30 | from sklearn.model_selection import cross_validate 31 | from sklearn.metrics import recall_score 32 | from sklearn.metrics.scorer import make_scorer 33 | scoring = {'prec_macro': 'precision_macro', 34 | 'rec_micro': make_scorer(recall_score, average='macro')} 35 | scores = cross_validate(clf, iris.data, iris.target, scoring=scoring, 36 | cv=5, return_train_score=True) 37 | print(scores) 
-------------------------------------------------------------------------------- /Code/14-2Pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | from sklearn import datasets 6 | from sklearn.decomposition import PCA 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.pipeline import Pipeline 9 | from sklearn.model_selection import GridSearchCV 10 | 11 | 12 | # Define a pipeline to search for the best combination of PCA truncation 13 | # and classifier regularization. 14 | pca = PCA() 15 | # set the tolerance to a large value to make the example faster 16 | logistic = LogisticRegression(max_iter=10000, tol=0.1) 17 | pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)]) 18 | 19 | X_digits, y_digits = datasets.load_digits(return_X_y=True) 20 | 21 | # Parameters of pipelines can be set using ‘__’ separated parameter names: 22 | param_grid = { 23 | 'pca__n_components': [5, 15, 30, 45, 64], 24 | 'logistic__C': np.logspace(-4, 4, 4), 25 | } 26 | search = GridSearchCV(pipe, param_grid, n_jobs=-1) 27 | search.fit(X_digits, y_digits) 28 | print("Best parameter (CV score=%0.3f):" % search.best_score_) 29 | print(search.best_params_) 30 | 31 | # Plot the PCA spectrum 32 | pca.fit(X_digits) 33 | 34 | fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6)) 35 | ax0.plot(np.arange(1, pca.n_components_ + 1), 36 | pca.explained_variance_ratio_, '+', linewidth=2) 37 | ax0.set_ylabel('PCA explained variance ratio') 38 | 39 | ax0.axvline(search.best_estimator_.named_steps['pca'].n_components, 40 | linestyle=':', label='n_components chosen') 41 | ax0.legend(prop=dict(size=12)) 42 | 43 | # For each number of components, find the best classifier results 44 | results = pd.DataFrame(search.cv_results_) 45 | components_col = 'param_pca__n_components' 46 | best_clfs = results.groupby(components_col).apply( 47 | lambda g: g.nlargest(1, 'mean_test_score')) 48 | 49 | best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score', 50 | legend=False, ax=ax1) 51 | ax1.set_ylabel('Classification accuracy (val)') 52 | ax1.set_xlabel('n_components') 53 | 54 | plt.xlim(-1, 70) 55 | 56 | plt.tight_layout() 57 | plt.show() 58 | -------------------------------------------------------------------------------- /Code/2-1(重要)二分类模型(感知器学习的原始算法).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | #%matplotlib inline 6 | iris = load_iris() 7 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 8 | df['label'] = iris.target 9 | print(df.head(10)) 10 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 11 | print(df.label.value_counts()) 12 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 13 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 14 | plt.xlabel('sepal length') 15 | plt.ylabel('sepal width') 16 | plt.legend() 17 | plt.show() 18 | data = np.array(df.iloc[:100, [0, 1, -1]]) 19 | print(data[:10,:]) 20 | 21 | 22 | X, y = data[:,:-1], data[:,-1] 23 | X[:10,:] 24 | 25 | 26 | y = np.array([1 if i == 1 else -1 for i in y ]) 27 | 28 | 29 | class Perceptron_Model: 30 | def __init__(self): 31 | self.w = np.ones(len(data[0]) - 1, dtype=np.float32) 32 | print(self.w) 33 | self.b = 0 34 | self.l_rate = 0.1 
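# w and b are the separating-hyperplane parameters and l_rate is the step size eta;
# fit() below runs the primal perceptron SGD update w <- w + eta*y*x, b <- b + eta*y
# whenever a training sample is misclassified, i.e. y * (w.x + b) <= 0.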
35 | # self.data = data 36 | 37 | def sign(self, x, w, b): 38 | y = np.dot(x, w) + b 39 | return y 40 | 41 | # 随机梯度下降法 42 | def fit(self, X_train, y_train): 43 | is_wrong = False 44 | while not is_wrong: 45 | wrong_count = 0 46 | for d in range(len(X_train)): 47 | X = X_train[d] 48 | y = y_train[d] 49 | if y * self.sign(X, self.w, self.b) <= 0: 50 | self.w = self.w + self.l_rate * np.dot(y, X) 51 | self.b = self.b + self.l_rate * y 52 | wrong_count += 1 53 | if wrong_count == 0: 54 | is_wrong = True 55 | return 'Perceptron Model!' 56 | 57 | def score(self): 58 | pass 59 | 60 | perceptron = Perceptron_Model() 61 | perceptron.fit(X, y) 62 | x_points = np.linspace(4, 7, 10) 63 | y_ = -(perceptron.w[0] * x_points + perceptron.b) / perceptron.w[1] 64 | plt.plot(x_points, y_) 65 | 66 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='0') 67 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 68 | plt.xlabel('sepal length') 69 | plt.ylabel('sepal width') 70 | plt.legend() 71 | plt.show() -------------------------------------------------------------------------------- /Code/2-2二分类模型(对偶算法).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | #%matplotlib inline 6 | iris = load_iris() 7 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 8 | df['label'] = iris.target 9 | print(df.head(10)) 10 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 11 | print(df.label.value_counts()) 12 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 13 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 14 | plt.xlabel('sepal length') 15 | plt.ylabel('sepal width') 16 | plt.legend() 17 | data = np.array(df.iloc[:100, [0, 1, -1]]) 18 | print(data[:10,:]) 19 | 20 | X, y = data[:,:-1], data[:,-1] 21 | X[:10,:] 22 | 23 | clf = Perceptron(tol=1e-3, random_state=0, max_iter=1000) 24 | clf.fit(X, y) 25 | print(clf.coef_) 26 | print(clf.intercept_) 27 | x_ponits = np.arange(4, 8) 28 | y_ = -(clf.coef_[0][0]*x_ponits + clf.intercept_)/clf.coef_[0][1] 29 | plt.plot(x_ponits, y_) 30 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='0') 31 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 32 | plt.xlabel('sepal length') 33 | plt.ylabel('sepal width') 34 | plt.legend() 35 | plt.show() 36 | # class PLA_dual: 37 | # def __init__(self, max_iter=1000): 38 | # self.b = 0 39 | # self.lr = 0.1 40 | # self.max_iter = max_iter 41 | # self.iter = 0 42 | # 43 | # def cal_w(self, X): 44 | # w = 0 45 | # for i in range(len(self.alpha)): 46 | # w += self.alpha[i] * y[i] * X[i] 47 | # return w 48 | # 49 | # def gram_matrix(self, X): 50 | # return np.dot(X, X.T) 51 | # 52 | # def fit(self, X, y): 53 | # N, M = X.shape 54 | # self.alpha = np.zeros(N) 55 | # gram = self.gram_matrix(X) 56 | # for n in range(self.max_iter): 57 | # self.iter = n 58 | # wrong_items = 0 59 | # for i in range(N): 60 | # tmp = 0 61 | # for j in range(N): 62 | # tmp += self.alpha[j] * y[j] * gram[i, j] 63 | # tmp += self.b 64 | # if y[i] * tmp <= 0: 65 | # self.alpha[i] += self.lr 66 | # self.b += self.lr * y[i] 67 | # wrong_items += 1 68 | # if wrong_items == 0: 69 | # self.w = self.cal_w(X) 70 | # print("finished at iters: {}, w: {}, b: {}".format(self.iter, self.w, self.b)) 71 | # return 72 | # self.w = self.cal_w(X) 73 | # 
print("finished for reaching the max_iter: {}, w: {}, b: {}".format(self.max_iter, self.w, self.b)) 74 | # return 75 | # 76 | # perceptron3 = PLA_dual() 77 | # perceptron3.fit(X, y) 78 | # def plot(model, tilte): 79 | # x_points = np.linspace(4, 7, 10) 80 | # y_ = -(model.w[0]*x_points + model.b)/model.w[1] 81 | # plt.plot(x_points, y_) 82 | # print(y_) 83 | # 84 | # plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='-1') 85 | # plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 86 | # plt.xlabel('sepal length') 87 | # plt.ylabel('sepal width') 88 | # plt.title(tilte) 89 | # plt.legend() 90 | # plt.show() 91 | # plot(perceptron3, 'PLA_dual') -------------------------------------------------------------------------------- /Code/2-3二分类模型(sklearn包里的分类算法).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | from sklearn.linear_model import Perceptron 6 | #%matplotlib inline 7 | iris = load_iris() 8 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 9 | df['label'] = iris.target 10 | print(df.head(10)) 11 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 12 | print(df.label.value_counts()) 13 | data = np.array(df.iloc[:100, [0, 1, -1]]) 14 | print(data[:10,:]) 15 | print() 16 | 17 | X, y = data[:,:-1], data[:,-1] 18 | X[:10,:] 19 | y = np.array([1 if i == 1 else -1 for i in y ]) 20 | #clf = Perceptron(fit_intercept=False, shuffle=False) 21 | clf = Perceptron(tol=1e-3, random_state=0, max_iter=1000) 22 | clf.fit(X, y) 23 | print(clf.coef_) 24 | print(clf.intercept_) 25 | x_ponits = np.arange(4, 8) 26 | y_ = -(clf.coef_[0][0]*x_ponits + clf.intercept_)/clf.coef_[0][1] 27 | plt.plot(x_ponits, y_) 28 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='0') 29 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 30 | plt.xlabel('sepal length') 31 | plt.ylabel('sepal width') 32 | plt.legend() 33 | plt.show() 34 | 35 | -------------------------------------------------------------------------------- /Code/2-4二分类的课后练习.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | Iteration=[] 6 | w_0=[-1.5] 7 | w_1=[0] 8 | w_2=[2] 9 | Training_Example=[] 10 | x1=[] 11 | x2=[] 12 | Class=[] 13 | s=[] 14 | Action=[] 15 | 16 | count=0 17 | 18 | 19 | 20 | data = np.array([[0,1,-1],[2, 0,-1],[1,1,1]]) 21 | X, y = data[:,:-1], data[:,-1] 22 | 23 | class Perceptron_Model: 24 | def __init__(self): 25 | self.w = np.array([0,2]) 26 | self.b = -1.5 27 | self.l_rate = 1.0 28 | 29 | # self.data = data 30 | 31 | 32 | def sign(self, x, w, b): 33 | 34 | y = np.dot(x, w) + b 35 | s.append(y) 36 | return y 37 | 38 | # 随机梯度下降法 39 | def fit(self, X_train, y_train): 40 | count=1 41 | is_wrong = False 42 | while not is_wrong: 43 | 44 | 45 | wrong_count = 0 46 | for d in range(len(X_train)): 47 | if d==0: 48 | Training_Example.append("a") 49 | elif d==1: 50 | Training_Example.append("b") 51 | else: 52 | Training_Example.append("c") 53 | X = X_train[d] 54 | x1.append(X[0]) 55 | x2.append(X[1]) 56 | y = y_train[d] 57 | if y<0: 58 | Class.append("-") 59 | else: 60 | Class.append("+") 61 | if y * self.sign(X, self.w, self.b) <= 0: 62 | Iteration.append(count) 63 | count = count + 1 64 | self.w = self.w + self.l_rate * np.dot(y, X) 65 | w_1.append(self.w[0]) 66 | 
w_2.append(self.w[1]) 67 | self.b = self.b + self.l_rate * y 68 | w_0.append(self.b) 69 | if(y>0):Action.append("Add") 70 | else:Action.append("Subtract") 71 | wrong_count += 1 72 | else: 73 | Iteration.append(count) 74 | count = count + 1 75 | w_1.append(self.w[0]) 76 | w_2.append(self.w[1]) 77 | w_0.append(self.b) 78 | Action.append("None") 79 | 80 | if wrong_count == 0: 81 | is_wrong = True 82 | return 'Perceptron Model!' 83 | 84 | def score(self): 85 | pass 86 | print() 87 | perceptron = Perceptron_Model() 88 | perceptron.fit(X, y) 89 | print(count) 90 | 91 | record = { 92 | 'Iteration':Iteration, 93 | 'w_0':w_0[0:12], 94 | 'w_1':w_1[0:12], 95 | 'w_2':w_2[0:12], 96 | 'Training_Example':Training_Example, 97 | 'x1':x1, 98 | 'x2':x2, 99 | 'Class':Class, 100 | 's=w_0+w_1x_1+w_2x_2':s, 101 | 'Action':Action 102 | } 103 | 104 | print(record) 105 | frame = pd.DataFrame(record) 106 | frame.to_csv(path_or_buf="tmp.csv",index=False) 107 | -------------------------------------------------------------------------------- /Code/3-1K近邻的距离图.py: -------------------------------------------------------------------------------- 1 | import math 2 | from itertools import combinations 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | r = 1 8 | 9 | linestyle = ['b-','k-','m-','r-','y-'] 10 | p_values = (0.25, 0.5, 1, 2, 4,100) 11 | 12 | for i,p in enumerate(p_values): 13 | x = np.arange(-r,r+1e-5,1/128.0) 14 | y = (r**p - (abs(x)**p))**(1.0/p) 15 | plt.plot(x,y,x,-y) 16 | 17 | ax = plt.gca() 18 | ax.set_aspect(1) 19 | plt.show() 20 | 21 | def L(x, y, p=2): 22 | # x1 = [1, 1], x2 = [5,1] 23 | if len(x) == len(y) and len(x) > 1: 24 | sum = 0 25 | for i in range(len(x)): 26 | sum += math.pow(abs(x[i] - y[i]), p) 27 | return math.pow(sum, 1/p) 28 | else: 29 | return 0 30 | 31 | x1 = [1, 1] 32 | x2 = [5, 1] 33 | x3 = [4, 4] 34 | 35 | def L(x, y, p=2): 36 | # x1 = [1, 1], x2 = [5,1] 37 | if len(x) == len(y) and len(x) > 1: 38 | sum = 0 39 | for i in range(len(x)): 40 | sum += math.pow(abs(x[i] - y[i]), p) 41 | return math.pow(sum, 1/p) 42 | else: 43 | return 0 44 | 45 | -------------------------------------------------------------------------------- /Code/3-2K近邻法距离加权与统一的对比.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from matplotlib.colors import ListedColormap 6 | from sklearn import neighbors, datasets 7 | 8 | 9 | # import some data to play with 10 | irisData = datasets.load_iris() 11 | irisData.data[0:5 ,:] 12 | 13 | 14 | X = irisData.data[:, :2] 15 | y = irisData.target 16 | X[:10 ,:] 17 | 18 | 19 | n_neighbors = 15 20 | 21 | 22 | step = .01 # step size in the mesh 23 | 24 | for weights in ['uniform', 'distance']: 25 | # we create an instance of Neighbours Classifier and fit the data. 26 | classifier = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) 27 | classifier.fit(X, y) 28 | 29 | print('KNN classifier accuracy - "%s" - %.3f' % (weights ,classifier.score(X ,y))) 30 | 31 | # Plot the decision boundary. For that, we will assign a color to each 32 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 
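# Note: with weights='uniform' each of the 15 neighbours gets an equal vote, while
# weights='distance' weights each neighbour's vote by the inverse of its distance,
# so nearby points dominate the prediction close to class boundaries.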
33 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 34 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 35 | x_grid, y_grid = np.meshgrid(np.arange(x_min, x_max, step = step), 36 | np.arange(y_min, y_max, step = step)) 37 | Z = classifier.predict(np.c_[x_grid.ravel(), y_grid.ravel()]) 38 | 39 | # Put the result into a color plot 40 | Z = Z.reshape(x_grid.shape) 41 | plt.figure() 42 | plt.pcolormesh(x_grid, y_grid, Z, cmap=ListedColormap(['lightblue', 'lightgreen', 'lightyellow']) ) 43 | 44 | # Plot also the training points 45 | plt.scatter(X[:, 0], X[:, 1], c=y, 46 | edgecolor='k', s=20) 47 | plt.xlim(x_grid.min(), x_grid.max()) 48 | plt.ylim(y_grid.min(), y_grid.max()) 49 | plt.title("KNN 3-Class Classification (k = %d, weights = '%s')" 50 | % (n_neighbors, weights)) 51 | 52 | 53 | plt.show() 54 | 55 | -------------------------------------------------------------------------------- /Code/3-2(1)K近邻法距离加权与统一的对比.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn import neighbors 5 | 6 | np.random.seed(0) 7 | X = np.sort(5 * np.random.rand(40, 1), axis=0) 8 | T = np.linspace(0, 5, 500)[:, np.newaxis] 9 | y = np.sin(X).ravel() 10 | 11 | # Add noise to targets 12 | y[::5] += 1 * (0.5 - np.random.rand(8)) 13 | 14 | plt.plot(X,y) 15 | 16 | 17 | n_neighbors = 5 18 | 19 | for i, weights in enumerate(['uniform', 'distance']): 20 | knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights) 21 | y_ = knn.fit(X, y).predict(T) 22 | 23 | plt.subplot(2, 1, i + 1) 24 | plt.scatter(X, y, c='k', label='data') 25 | plt.plot(T, y_, c='g', label='prediction') 26 | plt.axis('tight') 27 | plt.legend() 28 | plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors, 29 | weights)) 30 | 31 | plt.tight_layout() 32 | plt.show() 33 | 34 | -------------------------------------------------------------------------------- /Code/3-3KNN算法(原始和包).py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | #% matplotlib 6 | #inline 7 | 8 | from sklearn.datasets import load_iris 9 | from sklearn.model_selection import train_test_split 10 | 11 | from collections import Counter 12 | 13 | iris = load_iris() 14 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 15 | df['label'] = iris.target 16 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 17 | 18 | 19 | df.iloc[0:5] 20 | 21 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 22 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 23 | plt.xlabel('sepal length') 24 | plt.ylabel('sepal width') 25 | plt.legend() 26 | 27 | 28 | data = np.array(df.iloc[:100, [0, 1, -1]]) 29 | X, y = data[:, :-1], data[:, -1] 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 31 | 32 | 33 | 34 | class KNN: 35 | def __init__(self, X_train, y_train, n_neighbors=3, p=2): 36 | """ 37 | parameter: n_neighbors 临近点个数 38 | parameter: p 距离度量 39 | """ 40 | self.n = n_neighbors 41 | self.p = p 42 | self.X_train = X_train 43 | self.y_train = y_train 44 | 45 | def predict(self, X): 46 | # 取出n个点 47 | knn_list = [] 48 | for i in range(self.n): 49 | dist = np.linalg.norm(X - self.X_train[i], ord=self.p) 50 | knn_list.append((dist, self.y_train[i])) 51 | 52 | for i in range(self.n, len(self.X_train)): 53 | max_index = 
knn_list.index(max(knn_list, key=lambda x: x[0])) 54 | dist = np.linalg.norm(X - self.X_train[i], ord=self.p) 55 | if knn_list[max_index][0] > dist: 56 | knn_list[max_index] = (dist, self.y_train[i]) 57 | 58 | # 统计 59 | knn = [k[-1] for k in knn_list] 60 | count_pairs = Counter(knn) 61 | max_count = sorted(count_pairs, key=lambda x: x)[-1] 62 | return max_count 63 | 64 | def score(self, X_test, y_test): 65 | right_count = 0 66 | n = 10 67 | for X, y in zip(X_test, y_test): 68 | label = self.predict(X) 69 | if label == y: 70 | right_count += 1 71 | return right_count / len(X_test) 72 | 73 | 74 | 75 | clf = KNN(X_train, y_train) 76 | 77 | clf.score(X_test, y_test) 78 | 79 | 80 | test_point = [6.0, 3.0] 81 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 82 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 83 | plt.plot(test_point[0], test_point[1], 'bo', label='test_point') 84 | plt.xlabel('sepal length') 85 | plt.ylabel('sepal width') 86 | plt.legend() 87 | plt.show() 88 | 89 | 90 | from sklearn.neighbors import KNeighborsClassifier 91 | 92 | clf_sk = KNeighborsClassifier() 93 | clf_sk.fit(X_train, y_train) 94 | 95 | 96 | print(clf_sk.score(X_test, y_test)) 97 | 98 | 99 | clf_sk.predict([[6.0,3.0]]) -------------------------------------------------------------------------------- /Code/3-4KNN(糖尿病).py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | #% matplotlib 6 | #inline 7 | 8 | from sklearn.datasets import load_iris 9 | from sklearn.model_selection import train_test_split 10 | 11 | from collections import Counter 12 | 13 | dia = pd.read_csv("diabetes.csv") 14 | df = pd.DataFrame(dia) 15 | 16 | print(df) 17 | 18 | 19 | data = np.array(df.iloc[:767, [0,1,2,3,4,6,-1]]) 20 | print(data) 21 | X, y = data[:, :-1], data[:, -1] 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 23 | 24 | 25 | 26 | from sklearn.neighbors import KNeighborsClassifier 27 | # 28 | clf_sk = KNeighborsClassifier() 29 | clf_sk.fit(X_train, y_train) 30 | # 31 | # 32 | print(clf_sk.score(X_test, y_test)) 33 | # 34 | # 35 | -------------------------------------------------------------------------------- /Code/3-5KNN(cifar-10).py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsClassifier 2 | import pickle 3 | import cv2 4 | 5 | 6 | def load(filename): 7 | 8 | with open(filename, 'rb') as fo: 9 | 10 | data = pickle.load(fo, encoding='latin1') 11 | 12 | return data 13 | #读取第一个训练集——data_batch_1: 14 | train = 'cifar-10-batches-py\data_batch_' 15 | test=r'cifar-10-batches-py\test_batch' #字符串前加r防止转义字符/t 16 | print(test) 17 | clf = KNeighborsClassifier("nn") 18 | 19 | for i in range(1,6): #从文件cifar-10-batches-py中读取data集1-5 20 | d=load(train+str(i)) 21 | X, y = d["data"], d["labels"] 22 | X_train, y_train = X, y 23 | clf.fit(X_train, y_train) 24 | print("数据集" + str(i) + "训练完毕") 25 | d=load(test)#从文件cifar-10-batches-py中读取test集 26 | X, y = d["data"], d["labels"] 27 | X_test, y_test = X, y 28 | 29 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/4-1原始贝叶斯.py: -------------------------------------------------------------------------------- 1 | class NaiveBayes: 2 | def __init__(self): 3 | self.model = None 4 | 5 | # 数学期望 6 | @staticmethod 7 | def mean(X): 8 | return sum(X) / 
float(len(X)) 9 | 10 | # 标准差(方差) 11 | def stdev(self, X): 12 | avg = self.mean(X) 13 | return math.sqrt(sum([pow(x-avg, 2) for x in X]) / float(len(X))) 14 | 15 | # 概率密度函数 16 | def gaussian_probability(self, x, mean, stdev): 17 | exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2)))) 18 | return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent 19 | 20 | # 处理X_train 21 | def summarize(self, train_data): 22 | summaries = [(self.mean(i), self.stdev(i)) for i in zip(*train_data)] 23 | return summaries 24 | 25 | # 分类别求出数学期望和标准差 26 | def fit(self, X, y): 27 | labels = list(set(y)) 28 | data = {label:[] for label in labels} 29 | for f, label in zip(X, y): 30 | data[label].append(f) 31 | self.model = {label: self.summarize(value) for label, value in data.items()} 32 | return 'GaussianNB train done!' 33 | 34 | # 计算概率 35 | def calculate_probabilities(self, input_data): 36 | # summaries:{0.0: [(5.0, 0.37),(3.42, 0.40)], 1.0: [(5.8, 0.449),(2.7, 0.27)]} 37 | # input_data:[1.1, 2.2] 38 | probabilities = {} 39 | for label, value in self.model.items(): 40 | probabilities[label] = 1 41 | for i in range(len(value)): 42 | mean, stdev = value[i] 43 | probabilities[label] *= self.gaussian_probability(input_data[i], mean, stdev) 44 | return probabilities 45 | 46 | # 类别 47 | def predict(self, X_test): 48 | # {0.0: 2.9680340789325763e-27, 1.0: 3.5749783019849535e-26} 49 | label = sorted(self.calculate_probabilities(X_test).items(), key=lambda x: x[-1])[-1][0] 50 | return label 51 | 52 | def score(self, X_test, y_test): 53 | right = 0 54 | for X, y in zip(X_test, y_test): 55 | label = self.predict(X) 56 | if label == y: 57 | right += 1 58 | 59 | return right / float(len(X_test)) 60 | 61 | import math 62 | import numpy as np 63 | import pandas as pd 64 | 65 | import matplotlib.pyplot as plt 66 | 67 | 68 | from sklearn.datasets import load_iris 69 | from sklearn.model_selection import train_test_split 70 | 71 | 72 | iris = load_iris() 73 | X = iris.data 74 | Y = iris.target 75 | 76 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 77 | df['label'] = iris.target 78 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 79 | data = np.array(df.iloc[:100, :]) 80 | # print(data) 81 | 82 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3) 83 | 84 | X_train[:10,:] 85 | X_test[0], y_test[0] 86 | 87 | model = NaiveBayes() 88 | 89 | 90 | model.fit(X_train, y_train) 91 | 92 | 93 | x_train=[4.4, 3.2, 1.3, 0.2] 94 | 95 | print(model.predict(x_train)) 96 | -------------------------------------------------------------------------------- /Code/4-2导包的高斯贝叶斯.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 4 | Y = np.array([1, 1, 1, 2, 2, 2]) 5 | 6 | from sklearn.naive_bayes import GaussianNB 7 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 8 | clf.fit(X, Y) 9 | print(clf.predict([[-0.8, -1]])) -------------------------------------------------------------------------------- /Code/4-3高斯伯努利多项式贝叶斯.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.naive_bayes import GaussianNB 3 | from sklearn.naive_bayes import MultinomialNB 4 | from sklearn.naive_bayes import BernoulliNB 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | from sklearn.datasets import load_iris 13 | from sklearn.model_selection import 
train_test_split 14 | iris = load_iris() 15 | X = iris.data 16 | Y = iris.target 17 | 18 | X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0) 19 | 20 | nb = GaussianNB() 21 | nb.fit(X_train, y_train) 22 | 23 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 24 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 25 | 26 | print("Naive Gausian bayes score (sklearn): " +str(nb.score(X_test, y_test))) 27 | 28 | 29 | nb = MultinomialNB() 30 | nb.fit(X_train, y_train) 31 | 32 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 33 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 34 | 35 | print("Naive Gausian bayes score (sklearn): " +str(nb.score(X_test, y_test))) 36 | 37 | 38 | nb = BernoulliNB() 39 | nb.fit(X_train, y_train) 40 | 41 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 42 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 43 | print("Naive Gausian bayes score (sklearn): " +str(nb.score(X_test, y_test))) 44 | 45 | min_x=min(np.min(X_train.ravel()),np.min(X_test.ravel()))-0.1 46 | max_x=max(np.max(X_train.ravel()),np.max(X_test.ravel()))+0.1 47 | binarizes=np.linspace(min_x,max_x,endpoint=True,num=100) 48 | 49 | train_scores=[] 50 | test_scores=[] 51 | 52 | for binarize in binarizes: 53 | cls=BernoulliNB(binarize=binarize) 54 | cls.fit(X_train,y_train) 55 | train_scores.append(cls.score(X_train,y_train)) 56 | test_scores.append(cls.score(X_test, y_test)) 57 | 58 | fig=plt.figure() 59 | ax=fig.add_subplot(1,1,1) 60 | ax.plot(binarizes,train_scores,label="Training Score") 61 | ax.plot(binarizes,test_scores,label="Testing Score") 62 | ax.set_xlabel("binarize") 63 | ax.set_ylabel("score") 64 | ax.set_ylim(0,1.0) 65 | ax.set_xlim(min_x-1,max_x+1) 66 | ax.set_title("BernoulliNB") 67 | ax.legend(loc="best") 68 | plt.show() 69 | 70 | # 这几个都是naive bayes的模型,区别主要在于特征的分布。 71 | # 72 | # 73 | # 74 | # 如果特征是数值的,最好是正态分布的数值的,那么用 75 | # sklearn.naive_bayes.GaussianNB 76 | 77 | # 如果特征是binary的,那么用 78 | # sklearn.naive_bayes.BernoulliNB 79 | 80 | # 如果特征是categorical的,那么用 81 | # sklearn.naive_bayes.MultinomialNB -------------------------------------------------------------------------------- /Code/4-4高斯做的分类(数字样本).py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.datasets import load_digits 3 | from sklearn.naive_bayes import GaussianNB 4 | 5 | from sklearn.model_selection import train_test_split 6 | digits = load_digits() 7 | X, y = digits.data, digits.target 8 | 9 | 10 | print(digits.data.shape) 11 | 12 | import matplotlib.pyplot as plt 13 | # plt.gray() 14 | # plt.matshow(digits.images[0]) 15 | # plt.show() 16 | 17 | fig=plt.figure(figsize=(6,6)) 18 | fig.subplots_adjust(left=0,right=1,bottom=0,top=1,hspace=0.05,wspace=0.05) 19 | 20 | for i in range(64): 21 | ax=fig.add_subplot(8,8,i+1,xticks=[],yticks=[]) 22 | ax.imshow(digits.images[i],cmap=plt.cm.binary,interpolation='nearest') 23 | #用目标值标记图像 24 | ax.text(0,7,str(digits.target[i])) 25 | plt.show() 26 | 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0) 28 | from sklearn.naive_bayes import GaussianNB 29 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 30 | clf.fit(X_train, y_train) 31 | print(clf.score(X_test,y_test)) 
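A minimal sketch (assuming 5-fold cross-validation and default hyperparameters, which are not part of the original files) that compares the three naive Bayes variants discussed in the closing comments of 4-3 on the same digits data that 4-4 feeds to GaussianNB:

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

X, y = load_digits(return_X_y=True)
# digits features are pixel intensities 0-16: non-negative, so MultinomialNB applies,
# and BernoulliNB binarizes them at 0 by default
for name, nb in [('GaussianNB', GaussianNB()),
                 ('MultinomialNB', MultinomialNB()),
                 ('BernoulliNB', BernoulliNB())]:
    scores = cross_val_score(nb, X, y, cv=5)
    print('%s mean accuracy: %.3f' % (name, scores.mean()))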
-------------------------------------------------------------------------------- /Code/4-5高斯(鱼样本).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.naive_bayes import GaussianNB 3 | from sklearn.model_selection import train_test_split 4 | fish= pd.read_csv("fish-01.csv") 5 | X_train, X_test, y_train, y_test = train_test_split(fish.iloc[:,1:], fish.iloc[:,0], test_size=0.4, random_state=0) 6 | 7 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 8 | clf.fit(X_train, y_train) 9 | print(clf.predict([[120.0, 19.4, 21.0, 23.7, 25.8, 13.9]])) 10 | print(clf.score(X_test, y_test)) 11 | 12 | 13 | from sklearn.neural_network import MLPClassifier 14 | 15 | clf1=MLPClassifier(activation='logistic',max_iter=1000)# 构造分类器实例 16 | clf1.fit(X_train, y_train) 17 | print(clf.predict([[120.0, 19.4, 21.0, 23.7, 25.8, 13.9]])) 18 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/4-6高斯(cifar-10).py: -------------------------------------------------------------------------------- 1 | from sklearn.naive_bayes import GaussianNB 2 | import pickle 3 | 4 | 5 | 6 | def load(filename): 7 | 8 | with open(filename, 'rb') as fo: 9 | 10 | data = pickle.load(fo, encoding='latin1') 11 | 12 | return data 13 | #读取第一个训练集——data_batch_1: 14 | train = 'cifar-10-batches-py\data_batch_' 15 | test=r'cifar-10-batches-py\test_batch' #字符串前加r防止转义字符/t 16 | print(test) 17 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 18 | 19 | for i in range(1,6): #从文件cifar-10-batches-py中读取data集1-5 20 | d=load(train+str(i)) 21 | X, y = d["data"], d["labels"] 22 | X_train, y_train = X, y 23 | clf.fit(X_train, y_train,) 24 | print("数据集" + str(i) + "训练完毕") 25 | 26 | d=load(test)#从文件cifar-10-batches-py中读取test集 27 | X, y = d["data"], d["labels"] 28 | X_test, y_test = X, y 29 | 30 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/5-1原始决策树.py: -------------------------------------------------------------------------------- 1 | def create_data(): 2 | datasets = [[1, 'Sunny', 'Hot', 'High', 'Weak', 'No'], 3 | [2, 'Sunny', 'Hot', 'High', 'Strong', 'No'], 4 | [3, 'Overcast', 'Hot', 'High', 'Weak', 'Yes'], 5 | [4, 'Rainy', 'Mild', 'High', 'Weak', 'Yes'], 6 | [5, 'Rainy', 'Cool', 'Normal', 'Weak', 'Yes'], 7 | [6, 'Rainy', 'Cool', 'Normal', 'Strong', 'No'], 8 | [7, 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'], 9 | [8, 'Sunny', 'Mild', 'High', 'Weak', 'No'], 10 | [9, 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'], 11 | [10, 'Rainy', 'Mild', 'Normal', 'Weak', 'Yes'], 12 | [11, 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'], 13 | [12, 'Overcast', 'Mild', 'High', 'Strong', 'Yes'], 14 | [13, 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'], 15 | [14, 'Rainy', 'Mild', 'High', 'Strong', 'No'], 16 | ] 17 | 18 | labels = ['Day', 'OutLook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis'] 19 | 20 | # 返回数据集和每个维度的名称 21 | return datasets, labels 22 | from math import log 23 | 24 | # 以 Outlook 为分界的熵 25 | 26 | En_Sunny = -(2/5)*log(2/5,2) - (3/5)*log(3/5,2) 27 | En_Overcast = -(4/4)*log(4/4,2) 28 | En_Rainy = -(3/5)*log(3/5,2) - (2/5)*log(2/5,2) 29 | 30 | # Outlook 联合熵 31 | En_Outlook = 5/14*En_Sunny + 4/14*En_Overcast + 5/14*En_Rainy 32 | 33 | print(En_Sunny,En_Overcast,En_Rainy) 34 | print('联合熵:',En_Outlook) 35 | # Outlook 的分裂信息度量 熵 36 | 37 | IG=-(5/14)*log(5/14,2) - (9/14)*log(9/14,2)-En_Outlook 38 | print("信息增益",IG) 39 | OutLook = 
-5/14*log(5/14,2)-4/14*log(4/14,2)-5/14*log(5/14,2) 40 | # Outlook 增益率 41 | OutLook_Gain_Ratio = IG/OutLook 42 | 43 | print(OutLook,OutLook_Gain_Ratio) 44 | 45 | import numpy as np 46 | 47 | 48 | # 定义节点类 二叉树 49 | class Node: 50 | def __init__(self, root=True, label=None, feature_name=None, feature=None): 51 | self.root = root 52 | self.label = label 53 | self.feature_name = feature_name 54 | self.feature = feature 55 | self.tree = {} 56 | self.result = {'label:': self.label, 'feature': self.feature, 'tree': self.tree} 57 | 58 | def __repr__(self): 59 | return '{}'.format(self.result) 60 | 61 | def add_node(self, val, node): 62 | self.tree[val] = node 63 | 64 | def predict(self, features): 65 | if self.root is True: 66 | return self.label 67 | return self.tree[features[self.feature]].predict(features) 68 | 69 | 70 | class DTree: 71 | def __init__(self, epsilon=0.1): 72 | self.epsilon = epsilon 73 | self._tree = {} 74 | 75 | # 熵 76 | @staticmethod 77 | def calc_ent(datasets): 78 | data_length = len(datasets) 79 | label_count = {} 80 | for i in range(data_length): 81 | label = datasets[i][-1] 82 | if label not in label_count: 83 | label_count[label] = 0 84 | label_count[label] += 1 85 | ent = -sum([(p / data_length) * log(p / data_length, 2) for p in label_count.values()]) 86 | return ent 87 | 88 | # 经验条件熵 89 | def cond_ent(self, datasets, axis=0): 90 | data_length = len(datasets) 91 | feature_sets = {} 92 | for i in range(data_length): 93 | feature = datasets[i][axis] 94 | if feature not in feature_sets: 95 | feature_sets[feature] = [] 96 | feature_sets[feature].append(datasets[i]) 97 | cond_ent = sum([(len(p) / data_length) * self.calc_ent(p) for p in feature_sets.values()]) 98 | return cond_ent 99 | 100 | # 信息增益 101 | @staticmethod 102 | def info_gain(ent, cond_ent): 103 | return ent - cond_ent 104 | 105 | def info_gain_train(self, datasets): 106 | count = len(datasets[0]) - 1 107 | ent = self.calc_ent(datasets) 108 | best_feature = [] 109 | for c in range(count): 110 | c_info_gain = self.info_gain(ent, self.cond_ent(datasets, axis=c)) 111 | best_feature.append((c, c_info_gain)) 112 | # 比较大小 113 | best_ = max(best_feature, key=lambda x: x[-1]) 114 | return best_ 115 | 116 | def train(self, train_data): 117 | """ 118 | input:数据集D(DataFrame格式),特征集A,阈值eta 119 | output:决策树T 120 | """ 121 | _, y_train, features = train_data.iloc[:, :-1], train_data.iloc[:, -1], train_data.columns[:-1] 122 | # 1,若D中实例属于同一类Ck,则T为单节点树,并将类Ck作为结点的类标记,返回T 123 | if len(y_train.value_counts()) == 1: 124 | return Node(root=True, 125 | label=y_train.iloc[0]) 126 | 127 | # 2, 若A为空,则T为单节点树,将D中实例树最大的类Ck作为该节点的类标记,返回T 128 | if len(features) == 0: 129 | return Node(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0]) 130 | 131 | # 3,计算最大信息增益 同5.1,Ag为信息增益最大的特征 132 | max_feature, max_info_gain = self.info_gain_train(np.array(train_data)) 133 | max_feature_name = features[max_feature] 134 | 135 | # 4,Ag的信息增益小于阈值eta,则置T为单节点树,并将D中是实例数最大的类Ck作为该节点的类标记,返回T 136 | if max_info_gain < self.epsilon: 137 | return Node(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0]) 138 | 139 | # 5,构建Ag子集 140 | node_tree = Node(root=False, feature_name=max_feature_name, feature=max_feature) 141 | 142 | feature_list = train_data[max_feature_name].value_counts().index 143 | for f in feature_list: 144 | sub_train_df = train_data.loc[train_data[max_feature_name] == f].drop([max_feature_name], axis=1) 145 | 146 | # 6, 递归生成树 147 | sub_tree = self.train(sub_train_df) 148 | node_tree.add_node(f, sub_tree) 149 
| 150 | # pprint.pprint(node_tree.tree) 151 | return node_tree 152 | 153 | def fit(self, train_data): 154 | self._tree = self.train(train_data) 155 | return self._tree 156 | 157 | def predict(self, X_test): 158 | return self._tree.predict(X_test) 159 | 160 | 161 | import pandas as pd 162 | 163 | def create_data(): 164 | datasets = [['青年', '否', '否', '一般', '否'], 165 | ['青年', '否', '否', '好', '否'], 166 | ['青年', '是', '否', '好', '是'], 167 | ['青年', '是', '是', '一般', '是'], 168 | ['青年', '否', '否', '一般', '否'], 169 | ['中年', '否', '否', '一般', '否'], 170 | ['中年', '否', '否', '好', '否'], 171 | ['中年', '是', '是', '好', '是'], 172 | ['中年', '否', '是', '非常好', '是'], 173 | ['中年', '否', '是', '非常好', '是'], 174 | ['老年', '否', '是', '非常好', '是'], 175 | ['老年', '否', '是', '好', '是'], 176 | ['老年', '是', '否', '好', '是'], 177 | ['老年', '是', '否', '非常好', '是'], 178 | ['老年', '否', '否', '一般', '否'], 179 | ] 180 | labels = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别'] 181 | # 返回数据集和每个维度的名称 182 | return datasets, labels 183 | 184 | 185 | datasets, labels = create_data() 186 | data_df = pd.DataFrame(datasets, columns=labels) 187 | dt = DTree() 188 | tree = dt.fit(data_df) 189 | 190 | print(dt.predict(['老年', '否', '否', '一般'])) 191 | 192 | -------------------------------------------------------------------------------- /Code/5-2决策树(鸢尾样本).py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.tree import DecisionTreeClassifier 3 | 4 | from sklearn.tree import export_graphviz 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.model_selection import train_test_split 8 | 9 | import pandas as pd 10 | import numpy as np 11 | iris = load_iris() 12 | # data 13 | def iris_data(): 14 | iris = load_iris() 15 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 16 | df['label'] = iris.target 17 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 18 | data = np.array(df.iloc[:100, [0, 1, -1]]) 19 | # print(data) 20 | return data[:,:2], data[:,-1] 21 | 22 | X, y = iris_data() 23 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 24 | 25 | clf = DecisionTreeClassifier() 26 | 27 | clf.fit(X_train, y_train,) 28 | 29 | print(clf.score(X_test, y_test)) 30 | 31 | print(clf.predict([[4.4, 3. 
]])) 32 | 33 | 34 | import graphviz 35 | dot_data = export_graphviz(clf, out_file=None) 36 | graph = graphviz.Source(dot_data) 37 | graph.render("iris") 38 | 39 | dot_data = export_graphviz(clf, out_file=None, 40 | feature_names=iris.feature_names, 41 | class_names=iris.target_names, 42 | filled=True, rounded=True, 43 | special_characters=True) 44 | graph = graphviz.Source(dot_data) -------------------------------------------------------------------------------- /Code/5-3决策树(数字样本).py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_digits 2 | from sklearn.tree import DecisionTreeClassifier 3 | from sklearn.model_selection import train_test_split 4 | import matplotlib.pyplot as plt 5 | digits = load_digits() 6 | X, y = digits.data, digits.target 7 | 8 | 9 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 10 | 11 | 12 | clf = DecisionTreeClassifier() 13 | 14 | clf.fit(X_train, y_train,) 15 | 16 | print(clf.score(X_test, y_test)) 17 | 18 | -------------------------------------------------------------------------------- /Code/5-4多层决策树回归.py: -------------------------------------------------------------------------------- 1 | # Import the necessary modules and libraries 2 | import numpy as np 3 | from sklearn.tree import DecisionTreeRegressor 4 | import matplotlib.pyplot as plt # Create a random dataset 5 | rng = np.random.RandomState(1) 6 | X = np.sort(5 * rng.rand(80, 1), axis=0) 7 | y = np.sin(X).ravel() 8 | y[::5] += 3 * (0.5 - rng.rand(16)) 9 | 10 | # Fit regression model 11 | regr_1 = DecisionTreeRegressor(max_depth=2) 12 | regr_2 = DecisionTreeRegressor(max_depth=5) 13 | regr_1.fit(X, y) 14 | regr_2.fit(X, y) 15 | 16 | # Predict 17 | X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis] 18 | y_1 = regr_1.predict(X_test) 19 | y_2 = regr_2.predict(X_test) 20 | 21 | # Plot the results 22 | plt.figure() 23 | plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data") 24 | plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2) 25 | plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2) 26 | 27 | plt.xlabel("data") 28 | plt.ylabel("target") 29 | plt.title("Decision Tree Regression") 30 | plt.legend() 31 | plt.show() -------------------------------------------------------------------------------- /Code/5-5决策树(鱼样本).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.tree import DecisionTreeClassifier 3 | from sklearn.model_selection import train_test_split 4 | fish= pd.read_csv("fish-01.csv") 5 | X_train, X_test, y_train, y_test = train_test_split(fish.iloc[:,1:], fish.iloc[:,0], test_size=0.4, random_state=0) 6 | 7 | 8 | 9 | clf = DecisionTreeClassifier() 10 | clf.fit(X_train, y_train) 11 | 12 | print(clf.predict([[120.0, 19.4, 21.0, 23.7, 25.8, 13.9]])) 13 | 14 | print(clf.score(X_test, y_test)) 15 | 16 | 17 | -------------------------------------------------------------------------------- /Code/5-6决策树(cifar-10).py: -------------------------------------------------------------------------------- 1 | from sklearn.tree import DecisionTreeClassifier 2 | import pickle 3 | import cv2 4 | 5 | 6 | def load(filename): 7 | 8 | with open(filename, 'rb') as fo: 9 | 10 | data = pickle.load(fo, encoding='latin1') 11 | 12 | return data 13 | #读取第一个训练集——data_batch_1: 14 | train = 'cifar-10-batches-py\data_batch_' 15 | test=r'cifar-10-batches-py\test_batch' #字符串前加r防止转义字符/t 16 | print(test) 
17 | clf = DecisionTreeClassifier() 18 | 19 | for i in range(1,6): #从文件cifar-10-batches-py中读取data集1-5 20 | d=load(train+str(i)) 21 | X, y = d["data"], d["labels"] 22 | X_train, y_train = X, y 23 | clf.fit(X_train, y_train,) 24 | print("数据集" + str(i) + "训练完毕") 25 | 26 | d=load(test)#从文件cifar-10-batches-py中读取test集 27 | X, y = d["data"], d["labels"] 28 | X_test, y_test = X, y 29 | 30 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/5-7决策树剪枝(乳腺癌样本).py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.datasets import load_breast_cancer 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | 8 | X, y = load_breast_cancer(return_X_y=True) 9 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 10 | 11 | clf = DecisionTreeClassifier(random_state=0) 12 | path = clf.cost_complexity_pruning_path(X_train, y_train) 13 | ccp_alphas, impurities = path.ccp_alphas, path.impurities 14 | fig, ax = plt.subplots() 15 | ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post") 16 | ax.set_xlabel("effective alpha") 17 | ax.set_ylabel("total impurity of leaves") 18 | ax.set_title("Total Impurity vs effective alpha for training set") 19 | 20 | 21 | 22 | clfs = [] 23 | for ccp_alpha in ccp_alphas: 24 | clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha) 25 | clf.fit(X_train, y_train) 26 | clfs.append(clf) 27 | print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format( 28 | clfs[-1].tree_.node_count, ccp_alphas[-1])) 29 | 30 | 31 | clfs = clfs[:-1] 32 | ccp_alphas = ccp_alphas[:-1] 33 | 34 | node_counts = [clf.tree_.node_count for clf in clfs] 35 | depth = [clf.tree_.max_depth for clf in clfs] 36 | fig, ax = plt.subplots(2, 1) 37 | ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post") 38 | ax[0].set_xlabel("alpha") 39 | ax[0].set_ylabel("number of nodes") 40 | ax[0].set_title("Number of nodes vs alpha") 41 | ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post") 42 | ax[1].set_xlabel("alpha") 43 | ax[1].set_ylabel("depth of tree") 44 | ax[1].set_title("Depth vs alpha") 45 | fig.tight_layout() 46 | 47 | train_scores = [clf.score(X_train, y_train) for clf in clfs] 48 | test_scores = [clf.score(X_test, y_test) for clf in clfs] 49 | 50 | fig, ax = plt.subplots() 51 | ax.set_xlabel("alpha") 52 | ax.set_ylabel("accuracy") 53 | ax.set_title("Accuracy vs alpha for training and testing sets") 54 | ax.plot(ccp_alphas, train_scores, marker='o', label="train", 55 | drawstyle="steps-post") 56 | ax.plot(ccp_alphas, test_scores, marker='o', label="test", 57 | drawstyle="steps-post") 58 | ax.legend() 59 | plt.show() 60 | 61 | clfs = clfs[:-1] 62 | ccp_alphas = ccp_alphas[:-1] 63 | 64 | node_counts = [clf.tree_.node_count for clf in clfs] 65 | depth = [clf.tree_.max_depth for clf in clfs] 66 | fig, ax = plt.subplots(2, 1) 67 | ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post") 68 | ax[0].set_xlabel("alpha") 69 | ax[0].set_ylabel("number of nodes") 70 | ax[0].set_title("Number of nodes vs alpha") 71 | ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post") 72 | ax[1].set_xlabel("alpha") 73 | ax[1].set_ylabel("depth of tree") 74 | ax[1].set_title("Depth vs alpha") 75 | fig.tight_layout() 76 | 77 | 78 | 79 | train_scores = [clf.score(X_train, y_train) for clf in clfs] 80 | test_scores = 
[clf.score(X_test, y_test) for clf in clfs] 81 | 82 | fig, ax = plt.subplots() 83 | ax.set_xlabel("alpha") 84 | ax.set_ylabel("accuracy") 85 | ax.set_title("Accuracy vs alpha for training and testing sets") 86 | ax.plot(ccp_alphas, train_scores, marker='o', label="train", 87 | drawstyle="steps-post") 88 | ax.plot(ccp_alphas, test_scores, marker='o', label="test", 89 | drawstyle="steps-post") 90 | ax.legend() 91 | plt.show() 92 | 93 | -------------------------------------------------------------------------------- /Code/5-8计算熵(entropy)的函数.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | 6 | import math 7 | 8 | p=np.linspace(0.01,1,num=50,endpoint=False) 9 | 10 | entropy = -p*np.log2(p)-(1-p)*np.log2(1-p) 11 | 12 | 13 | #plt.plot(b) 14 | plt.plot(p,entropy) 15 | plt.grid(True) 16 | plt.xlabel('p') 17 | plt.ylabel('Entropy(bit)') 18 | #plt.plot(p,gini) 19 | 20 | max_en = 2*(-(1/2)*np.log2(1/2)) 21 | print(max_en) 22 | 23 | d=np.linspace(0.01,100,num=50,endpoint=False) 24 | ld=np.log2(d) 25 | 26 | plt.show() -------------------------------------------------------------------------------- /Code/6-1逻辑斯蒂的概率分布.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from math import exp 4 | 5 | def sigmod(x): 6 | return 1/(1+np.exp(-x)) 7 | 8 | x = np.arange(-10,10.,0.1) 9 | y = sigmod(x) 10 | 11 | plt.plot(x,y) 12 | plt.grid(True) 13 | plt.show() 14 | 15 | 16 | class LogisticReressionClassifier: 17 | def __init__(self, max_iter=200, learning_rate=0.01): 18 | self.max_iter = max_iter 19 | self.learning_rate = learning_rate 20 | 21 | def sigmoid(self, x): 22 | return 1 / (1 + exp(-x)) 23 | 24 | def data_matrix(self, X): 25 | data_mat = [] 26 | for d in X: 27 | data_mat.append([1.0, *d]) 28 | return data_mat 29 | 30 | def fit(self, X, y): 31 | # label = np.mat(y) 32 | data_mat = self.data_matrix(X) # m*n 33 | self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32) 34 | 35 | for iter_ in range(self.max_iter): 36 | for i in range(len(X)): 37 | result = self.sigmoid(np.dot(data_mat[i], self.weights)) 38 | error = y[i] - result 39 | self.weights += self.learning_rate * error * np.transpose([data_mat[i]]) 40 | print('LogisticRegression Model(learning_rate={},max_iter={})'.format(self.learning_rate, self.max_iter)) 41 | 42 | # def f(self, x): 43 | # return -(self.weights[0] + self.weights[1] * x) / self.weights[2] 44 | 45 | def score(self, X_test, y_test): 46 | right = 0 47 | X_test = self.data_matrix(X_test) 48 | for x, y in zip(X_test, y_test): 49 | result = np.dot(x, self.weights) 50 | if (result > 0 and y == 1) or (result < 0 and y == 0): 51 | right += 1 52 | return right / len(X_test) 53 | 54 | -------------------------------------------------------------------------------- /Code/6-2原始逻辑斯蒂(鸢尾样本).py: -------------------------------------------------------------------------------- 1 | 2 | from math import exp 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | from sklearn.datasets import load_iris 9 | from sklearn.model_selection import train_test_split 10 | 11 | from sklearn.linear_model import LogisticRegression 12 | 13 | 14 | 15 | 16 | 17 | # data 18 | def create_data(): 19 | iris = load_iris() 20 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 21 | df['label'] = iris.target 22 | df.columns = ['sepal length', 'sepal width', 'petal length', 
'petal width', 'label'] 23 | data = np.array(df.iloc[:100, [0,1,-1]]) 24 | # print(data) 25 | return data[:,:2], data[:,-1] 26 | 27 | 28 | X, y = create_data() 29 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 30 | 31 | 32 | class LogisticReressionClassifier: 33 | def __init__(self, max_iter=200, learning_rate=0.01): 34 | self.max_iter = max_iter 35 | self.learning_rate = learning_rate 36 | 37 | def sigmoid(self, x): 38 | return 1 / (1 + exp(-x)) 39 | 40 | def data_matrix(self, X): 41 | data_mat = [] 42 | for d in X: 43 | data_mat.append([1.0, *d]) 44 | return data_mat 45 | 46 | def fit(self, X, y): 47 | # label = np.mat(y) 48 | data_mat = self.data_matrix(X) # m*n 49 | self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32) 50 | 51 | for iter_ in range(self.max_iter): 52 | for i in range(len(X)): 53 | result = self.sigmoid(np.dot(data_mat[i], self.weights)) 54 | error = y[i] - result 55 | self.weights += self.learning_rate * error * np.transpose([data_mat[i]]) 56 | print('LogisticRegression Model(learning_rate={},max_iter={})'.format(self.learning_rate, self.max_iter)) 57 | 58 | # def f(self, x): 59 | # return -(self.weights[0] + self.weights[1] * x) / self.weights[2] 60 | 61 | def score(self, X_test, y_test): 62 | right = 0 63 | X_test = self.data_matrix(X_test) 64 | for x, y in zip(X_test, y_test): 65 | result = np.dot(x, self.weights) 66 | if (result > 0 and y == 1) or (result < 0 and y == 0): 67 | right += 1 68 | return right / len(X_test) 69 | 70 | lr_clf = LogisticReressionClassifier() 71 | lr_clf.fit(X_train, y_train) 72 | 73 | x_ponits = np.arange(4, 8) 74 | y_ = -(lr_clf.weights[1]*x_ponits + lr_clf.weights[0])/lr_clf.weights[2] 75 | plt.plot(x_ponits, y_) 76 | 77 | 78 | plt.scatter(X[:50,0],X[:50,1], label='0') 79 | plt.scatter(X[50:,0],X[50:,1], label='1') 80 | plt.legend() 81 | plt.show() 82 | 83 | -------------------------------------------------------------------------------- /Code/6-2逻辑斯蒂(鸢尾样本).py: -------------------------------------------------------------------------------- 1 | 2 | from math import exp 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | from sklearn.datasets import load_iris 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.linear_model import LogisticRegression 11 | 12 | 13 | clf = LogisticRegression(max_iter=200,solver='liblinear') 14 | def create_data(): 15 | iris = load_iris() 16 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 17 | df['label'] = iris.target 18 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 19 | data = np.array(df.iloc[:100, [0,1,-1]]) 20 | # print(data) 21 | return data[:,:2], data[:,-1] 22 | 23 | 24 | X, y = create_data() 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 26 | 27 | clf.fit(X_train, y_train) 28 | 29 | 30 | print(clf.coef_, clf.intercept_) 31 | 32 | 33 | x_ponits = np.arange(4, 8) 34 | y_ = -(clf.coef_[0][0]*x_ponits + clf.intercept_)/clf.coef_[0][1] 35 | plt.plot(x_ponits, y_) 36 | 37 | plt.plot(X[:50, 0], X[:50, 1], 'bo', color='blue', label='0') 38 | plt.plot(X[50:, 0], X[50:, 1], 'bo', color='orange', label='1') 39 | plt.xlabel('sepal length') 40 | plt.ylabel('sepal width') 41 | plt.legend() 42 | plt.show() 43 | -------------------------------------------------------------------------------- /Code/6-4逻辑斯蒂(数字样本).py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, 
neighbors, linear_model 2 | 3 | digits = datasets.load_digits() 4 | X_digits = digits.data / digits.data.max() 5 | y_digits = digits.target 6 | 7 | n_samples = len(X_digits) 8 | 9 | X_train = X_digits[:int(.9 * n_samples)] 10 | y_train = y_digits[:int(.9 * n_samples)] 11 | X_test = X_digits[int(.9 * n_samples):] 12 | y_test = y_digits[int(.9 * n_samples):] 13 | 14 | knn = neighbors.KNeighborsClassifier() 15 | logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000, 16 | multi_class='multinomial') 17 | 18 | print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test)) 19 | print('LogisticRegression score: %f' 20 | % logistic.fit(X_train, y_train).score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/6-5逻辑斯蒂(乳腺癌样本)评估(二分类).py: -------------------------------------------------------------------------------- 1 | 2 | # 载入数据 3 | from sklearn.datasets import load_breast_cancer 4 | import matplotlib.pyplot as plt 5 | cancer = load_breast_cancer() 6 | X = cancer.data 7 | y = cancer.target 8 | print('data shape: {0}; no. positive: {1}; no. negative: {2}'.format( 9 | X.shape, y[y==1].shape[0], y[y==0].shape[0])) 10 | print(cancer.data[0]) 11 | 12 | 13 | cancer.feature_names 14 | 15 | 16 | from sklearn.model_selection import train_test_split 17 | X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2) 18 | 19 | from sklearn.linear_model import LogisticRegression 20 | logmodel = LogisticRegression(max_iter=3000) 21 | logmodel.fit(X_train, y_train) 22 | 23 | train_score = logmodel.score(X_train,y_train) 24 | test_score = logmodel.score(X_test,y_test) 25 | 26 | #print(train_score,test_score) 27 | 28 | X_output=logmodel.predict(X_test) 29 | from sklearn.metrics import precision_score 30 | print(precision_score(y_test,X_output,average=None)) 31 | 32 | # (查全率)召回率 The best value is 1 and the worst value is 0. 
33 | # 就是所有准确的条目有多少被检索出来了。 34 | from sklearn.metrics import recall_score 35 | print(recall_score(y_test,X_output,average=None)) 36 | 37 | 38 | 39 | 40 | from sklearn.metrics import plot_precision_recall_curve 41 | 42 | pr = plot_precision_recall_curve(logmodel,X_test,y_test) 43 | 44 | #print(pr) 45 | 46 | from sklearn.metrics import plot_roc_curve 47 | 48 | roc = plot_roc_curve(logmodel, X_test, y_test) 49 | 50 | 51 | # from sklearn.metrics import roc_curve,auc 52 | # fpr, tpr, thresholds = roc_curve(y_test, X_output) 53 | # print(auc(fpr, tpr)) 54 | 55 | 56 | from sklearn.metrics import plot_confusion_matrix 57 | from sklearn.metrics import classification_report 58 | disp = plot_confusion_matrix(logmodel, X_test, y_test) #混淆矩阵 59 | disp.figure_.suptitle("Confusion Matrix") 60 | #print("Confusion matrix:\n%s" % disp.confusion_matrix) 61 | 62 | print(classification_report(y_test, X_output)) 63 | 64 | plt.show() -------------------------------------------------------------------------------- /Code/6-6逻辑斯蒂(广告样本).py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, neighbors, linear_model 2 | import pandas as pd 3 | from sklearn.model_selection import train_test_split 4 | 5 | advertise= pd.read_csv("advertising.csv") 6 | X_train, X_test, y_train, y_test = train_test_split(advertise.loc[:,["Daily Time Spent on Site","Age","Area Income","Male"]], advertise.iloc[:,[-1]], test_size=0.2, random_state=0) 7 | from sklearn.linear_model import LogisticRegression 8 | logmodel = LogisticRegression(max_iter=200) 9 | logmodel.fit(X_train, y_train) 10 | 11 | train_score = logmodel.score(X_train,y_train) 12 | test_score = logmodel.score(X_test,y_test) 13 | 14 | print(train_score,test_score) 15 | 16 | y_pred_proba = logmodel.predict_proba(X_test) 17 | print('sample of predict probability: {0}'.format(y_pred_proba[0]))#是0?还是1(概率)0在前1在后 18 | print(y_pred_proba) 19 | -------------------------------------------------------------------------------- /Code/7-1查找best参数.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm, datasets 2 | from sklearn.model_selection import GridSearchCV #找离散变量中最好的参数 3 | from sklearn.linear_model import LogisticRegression 4 | iris = datasets.load_iris() 5 | #parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} 6 | 7 | parameters = {'solver':('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'), 'C':[0.1,1,2]} 8 | print(parameters) 9 | classifier = LogisticRegression(max_iter=3000) 10 | 11 | 12 | clf = GridSearchCV(classifier, parameters) 13 | clf.fit(iris.data, iris.target) 14 | 15 | sorted(clf.cv_results_.keys()) 16 | 17 | print(clf.best_params_) 18 | print(clf.best_score_) 19 | 20 | from sklearn.model_selection import RandomizedSearchCV #找连续变量中 21 | 22 | clf = RandomizedSearchCV(classifier, parameters) 23 | 24 | clf.fit(iris.data, iris.target) 25 | 26 | sorted(clf.cv_results_.keys()) 27 | 28 | print(clf.best_params_) 29 | print(clf.best_score_) -------------------------------------------------------------------------------- /Code/7-2决策树(数字样本)评估(多分类).py: -------------------------------------------------------------------------------- 1 | # 对决策树模型在数字样本上的测试结果做评估 2 | from sklearn.datasets import load_digits 3 | from sklearn.tree import DecisionTreeClassifier 4 | from sklearn.model_selection import train_test_split 5 | import matplotlib.pyplot as plt 6 | digits = load_digits() 7 | X, y = digits.data, digits.target 8 | 9 | 10 | X_train, X_test, y_train, Y_test = 
train_test_split(X, y, test_size=0.3) 11 | clf = DecisionTreeClassifier() 12 | clf.fit(X_train, y_train,) 13 | print(clf.score(X_test, Y_test)) 14 | 15 | 16 | y_pred=clf.predict(X_test) 17 | from sklearn.metrics import precision_score 18 | print(precision_score(Y_test,y_pred,average=None)) 19 | 20 | 21 | from sklearn.metrics import recall_score 22 | print(recall_score(Y_test,y_pred,average=None)) 23 | 24 | 25 | from sklearn.preprocessing import label_binarize #给数字数据集做归类化 26 | 27 | # Use label_binarize to be multi-label like settings 28 | Y_test_b = label_binarize(Y_test, classes=[0, 1, 2,3,4,5,6,7,8,9]) 29 | Y_pred_b= label_binarize(y_pred, classes=[0, 1, 2,3,4,5,6,7,8,9]) 30 | n_classes = Y_test_b.shape[1] 31 | 32 | 33 | from sklearn.metrics import precision_recall_curve 34 | from sklearn.metrics import average_precision_score 35 | 36 | 37 | precision = dict() 38 | recall = dict() 39 | average_precision = dict() 40 | for i in range(n_classes): #计算每个标签的召回率 41 | precision[i], recall[i], _ = precision_recall_curve(Y_test_b[:, i], 42 | Y_pred_b[:, i]) 43 | average_precision[i] = average_precision_score(Y_test_b[:, i],Y_pred_b[:, i]) 44 | 45 | lines = [] 46 | labels = [] 47 | 48 | colors = ['red','blue','green','pink','gold','navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'] 49 | 50 | # precision["micro"], recall["micro"], _ = precision_recall_curve(Y_test_b.ravel(), 51 | # Y_pred_b.ravel()) 52 | # average_precision["micro"] = average_precision_score(Y_test_b,Y_pred_b, #平均召回率 53 | # average="micro") 54 | # print('Average precision score, micro-averaged over all classes: {0:0.2f}' 55 | # .format(average_precision["micro"])) 56 | 57 | plt.figure(figsize=(7, 8)) 58 | for i, color in zip(range(n_classes), colors): #在图里画出每个标签的召回率 59 | l, = plt.plot(recall[i], precision[i], color=color, lw=2) 60 | lines.append(l) 61 | labels.append('Precision-recall for class {0} (area = {1:0.2f})' 62 | ''.format(i, average_precision[i])) 63 | 64 | fig = plt.gcf() 65 | fig.subplots_adjust(bottom=0.25) 66 | plt.xlim([0.0, 1.0]) 67 | plt.ylim([0.0, 1.05]) 68 | plt.xlabel('Recall') 69 | plt.ylabel('Precision') 70 | plt.title('Extension of Precision-Recall curve to multi-class') 71 | plt.legend(lines, labels, loc=(0, -.0), prop=dict(size=9)) 72 | 73 | from sklearn.metrics import plot_confusion_matrix 74 | from sklearn.metrics import classification_report 75 | disp = plot_confusion_matrix(clf, X_test, Y_test) 76 | disp.figure_.suptitle("Confusion Matrix") 77 | 78 | 79 | print(classification_report(Y_test, y_pred)) 80 | 81 | plt.show() -------------------------------------------------------------------------------- /Code/7-2官网svm(花样本)评估(多分类).py: -------------------------------------------------------------------------------- 1 | from sklearn import svm, datasets 2 | from sklearn.model_selection import train_test_split 3 | import numpy as np 4 | 5 | iris = datasets.load_iris() 6 | X = iris.data 7 | y = iris.target 8 | 9 | # Add noisy features 10 | random_state = np.random.RandomState(0) 11 | n_samples, n_features = X.shape 12 | X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] 13 | 14 | # Limit to the two first classes, and split into training and test 15 | X_train, X_test, y_train, y_test = train_test_split(X[y < 2], y[y < 2], 16 | test_size=.5, 17 | random_state=random_state) 18 | 19 | # Create a simple classifier 20 | classifier = svm.LinearSVC(random_state=random_state) 21 | classifier.fit(X_train, y_train) 22 | y_score = classifier.decision_function(X_test) 23 | 24 | from sklearn.metrics import 
precision_recall_curve 25 | from sklearn.metrics import plot_precision_recall_curve 26 | import matplotlib.pyplot as plt 27 | from sklearn.metrics import average_precision_score 28 | average_precision = average_precision_score(y_test, y_score) 29 | 30 | print('Average precision-recall score: {0:0.2f}'.format( 31 | average_precision)) 32 | disp = plot_precision_recall_curve(classifier, X_test, y_test) 33 | disp.ax_.set_title('2-class Precision-Recall curve: ' 34 | 'AP={0:0.2f}'.format(average_precision)) 35 | 36 | 37 | from sklearn.preprocessing import label_binarize 38 | 39 | # Use label_binarize to be multi-label like settings 40 | Y = label_binarize(y, classes=[0, 1, 2]) 41 | n_classes = Y.shape[1] 42 | 43 | # Split into training and test 44 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5, 45 | random_state=random_state) 46 | 47 | # We use OneVsRestClassifier for multi-label prediction 48 | from sklearn.multiclass import OneVsRestClassifier 49 | 50 | # Run classifier 51 | classifier = OneVsRestClassifier(svm.LinearSVC(random_state=random_state)) 52 | classifier.fit(X_train, Y_train) 53 | y_score = classifier.decision_function(X_test) 54 | 55 | from sklearn.metrics import precision_recall_curve 56 | from sklearn.metrics import average_precision_score 57 | 58 | # For each class 59 | precision = dict() 60 | recall = dict() 61 | average_precision = dict() 62 | 63 | 64 | print(n_classes) 65 | for i in range(n_classes): 66 | print(Y_test[:, i]) 67 | precision[i], recall[i], _ = precision_recall_curve(Y_test[:, i], 68 | y_score[:, i]) 69 | average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i]) 70 | 71 | # A "micro-average": quantifying score on all classes jointly 72 | precision["micro"], recall["micro"], _ = precision_recall_curve(Y_test.ravel(), 73 | y_score.ravel()) 74 | average_precision["micro"] = average_precision_score(Y_test, y_score, 75 | average="micro") 76 | print('Average precision score, micro-averaged over all classes: {0:0.2f}' 77 | .format(average_precision["micro"])) 78 | 79 | plt.figure() 80 | plt.step(recall['micro'], precision['micro'], where='post') 81 | 82 | plt.xlabel('Recall') 83 | plt.ylabel('Precision') 84 | plt.ylim([0.0, 1.05]) 85 | plt.xlim([0.0, 1.0]) 86 | plt.title( 87 | 'Average precision score, micro-averaged over all classes: AP={0:0.2f}' 88 | .format(average_precision["micro"])) 89 | 90 | 91 | 92 | from itertools import cycle 93 | # setup plot details 94 | colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal']) 95 | 96 | plt.figure(figsize=(7, 8)) 97 | f_scores = np.linspace(0.2, 0.8, num=4) 98 | lines = [] 99 | labels = [] 100 | for f_score in f_scores: 101 | print(f_score) 102 | x = np.linspace(0.01, 1) 103 | y = f_score * x / (2 * x - f_score) 104 | l, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2) 105 | plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02)) 106 | 107 | lines.append(l) 108 | labels.append('iso-f1 curves') 109 | l, = plt.plot(recall["micro"], precision["micro"], color='gold', lw=2) 110 | lines.append(l) 111 | labels.append('micro-average Precision-recall (area = {0:0.2f})' 112 | ''.format(average_precision["micro"])) 113 | 114 | for i, color in zip(range(n_classes), colors): 115 | l, = plt.plot(recall[i], precision[i], color=color, lw=2) 116 | lines.append(l) 117 | labels.append('Precision-recall for class {0} (area = {1:0.2f})' 118 | ''.format(i, average_precision[i])) 119 | 120 | fig = plt.gcf() 121 | fig.subplots_adjust(bottom=0.25) 122 | 
plt.xlim([0.0, 1.0]) 123 | plt.ylim([0.0, 1.05]) 124 | plt.xlabel('Recall') 125 | plt.ylabel('Precision') 126 | plt.title('Extension of Precision-Recall curve to multi-class') 127 | plt.legend(lines, labels, loc=(0, -.38), prop=dict(size=14)) 128 | 129 | 130 | plt.show() -------------------------------------------------------------------------------- /Code/8-1原始svm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SVM: 4 | def __init__(self, max_iter=100, kernel='linear'): 5 | self.max_iter = max_iter 6 | self._kernel = kernel 7 | 8 | def init_args(self, features, labels): 9 | self.m, self.n = features.shape 10 | self.X = features 11 | self.Y = labels 12 | self.b = 0.0 13 | # 将Ei保存在⼀个列表⾥ 14 | self.alpha = np.ones(self.m) 15 | self.E = [self._E(i) for i in range(self.m)] 16 | # 松弛变量 17 | self.C = 1.0 18 | 19 | def _KKT(self, i): 20 | y_g = self._g(i) * self.Y[i] 21 | if self.alpha[i] == 0: 22 | return y_g >= 1 23 | elif 0 < self.alpha[i] < self.C: 24 | return y_g == 1 25 | else: 26 | return y_g <= 1 27 | # g(x)预测值,输⼊xi(X[i]) 28 | 29 | def _g(self, i): 30 | r = self.b 31 | for j in range(self.m): 32 | r += self.alpha[j] * self.Y[j] * self.kernel(self.X[i], self.X[j]) 33 | return r 34 | # 核函数 35 | 36 | def kernel(self, x1, x2): 37 | if self._kernel == 'linear': 38 | return sum([x1[k] * x2[k] for k in range(self.n)]) 39 | elif self._kernel == 'poly': 40 | return (sum([x1[k] * x2[k] for k in range(self.n)]) + 1) ** 2 41 | 42 | return 0 43 | # E(x)为g(x)对输⼊x的预测值和y的差 44 | 45 | def _E(self, i): 46 | return self._g(i) - self.Y[i] 47 | 48 | def _init_alpha(self): 49 | # 外层循环⾸先遍历所有满⾜0= 0: 60 | j = min(range(self.m), key=lambda x: self.E[x]) 61 | else: 62 | j = max(range(self.m), key=lambda x: self.E[x]) 63 | return i, j 64 | 65 | def _compare(self, _alpha, L, H): 66 | if _alpha > H: 67 | return H 68 | elif _alpha < L: 69 | return L 70 | else: 71 | return _alpha 72 | 73 | def fit(self, features, labels): 74 | self.init_args(features, labels) 75 | 76 | for t in range(self.max_iter): 77 | # train 78 | i1, i2 = self._init_alpha() 79 | # 边界 80 | if self.Y[i1] == self.Y[i2]: 81 | L = max(0, self.alpha[i1] + self.alpha[i2] - self.C) 82 | H = min(self.C, self.alpha[i1] + self.alpha[i2]) 83 | else: 84 | L = max(0, self.alpha[i2] - self.alpha[i1]) 85 | H = min(self.C, self.C + self.alpha[i2] - self.alpha[i1]) 86 | E1 = self.E[i1] 87 | E2 = self.E[i2] 88 | # eta=K11+K22-2K12 89 | eta = self.kernel(self.X[i1], self.X[i1]) + self.kernel(self.X[i2], self.X[i2]) - 2 * self.kernel( 90 | self.X[i1], self.X[i2]) 91 | if eta <= 0: 92 | # print('eta <= 0') 93 | continue 94 | alpha2_new_unc = self.alpha[i2] + self.Y[i2] * (E1 - E2) / eta 95 | alpha2_new = self._compare(alpha2_new_unc, L, H) 96 | alpha1_new = self.alpha[i1] + self.Y[i1] * self.Y[i2] * (self.alpha[i2] - alpha2_new) 97 | b1_new = -E1 - self.Y[i1] * self.kernel(self.X[i1], self.X[i1]) * (alpha1_new - self.alpha[i1]) - self.Y[ 98 | i2] * self.kernel(self.X[i2], self.X[i1]) * (alpha2_new - self.alpha[i2]) + self.b 99 | b2_new = -E2 - self.Y[i1] * self.kernel(self.X[i1], self.X[i2]) * (alpha1_new - self.alpha[i1]) - self.Y[ 100 | i2] * self.kernel(self.X[i2], self.X[i2]) * (alpha2_new - self.alpha[i2]) + self.b 101 | if 0 < alpha1_new < self.C: 102 | b_new = b1_new 103 | 104 | elif 0 < alpha2_new < self.C: 105 | b_new = b2_new 106 | else: 107 | # 选择中点 108 | b_new = (b1_new + b2_new) / 2 109 | # 更新参数 110 | self.alpha[i1] = alpha1_new 111 | self.alpha[i2] = alpha2_new 112 | self.b = b_new 113 
| self.E[i1] = self._E(i1) 114 | self.E[i2] = self._E(i2) 115 | return 'train done!' 116 | 117 | def predict(self, data): 118 | r = self.b 119 | for i in range(self.m): 120 | r += self.alpha[i] * self.Y[i] * self.kernel(data, self.X[i]) 121 | return 1 if r > 0 else -1 122 | 123 | def score(self, X_test, y_test): 124 | right_count = 0 125 | for i in range(len(X_test)): 126 | result = self.predict(X_test[i]) 127 | if result == y_test[i]: 128 | right_count += 1 129 | return right_count / len(X_test) 130 | 131 | def _weight(self): 132 | # linear model 133 | yx = self.Y.reshape(-1, 1) * self.X 134 | self.w = np.dot(yx.T, self.alpha) 135 | return self.w -------------------------------------------------------------------------------- /Code/8-2svc参数讲解.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.datasets import load_iris 5 | from sklearn.model_selection import train_test_split 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | 10 | # data 11 | def create_data(col = 2): 12 | iris = load_iris() 13 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 14 | df['label'] = iris.target 15 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 16 | data = np.array(df.iloc[:100, [0, 1, -1]]) 17 | for i in range(len(data)): 18 | if data[i,-1] == 0: 19 | data[i,-1] = -1 20 | # print(data) 21 | return data[:,:col], data[:,-1] 22 | 23 | 24 | X, y = create_data() 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) 26 | 27 | 28 | 29 | #svm = SVM(max_iter=400) 30 | 31 | 32 | #svm.fit(X_train, y_train) 33 | 34 | 35 | from sklearn.svm import SVC 36 | clf = SVC(gamma='auto') 37 | clf.fit(X_train, y_train) 38 | 39 | # sklearn.svm.SVC 40 | ''' 41 | *(C 42 | =1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=None) * 43 | 44 | 参数: 45 | 46 | - C:C - SVC的惩罚参数C?默认值是1 47 | .0 48 | 49 | C越大,相当于惩罚松弛变量,希望松弛变量接近0,即对误分类的惩罚增大,趋向于对训练集全分对的情况,这样对训练集测试时准确率很高,但泛化能力弱。C值小,对误分类的惩罚减小,允许容错,将他们当成噪声点,泛化能力较强。 50 | 51 | - kernel :核函数,默认是rbf,可以是‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ 52 | 53 | – 线性:u'v 54 | 55 | – 多项式:(gamma * u'*v + coef0)^degree 56 | 57 | – RBF函数:exp(-gamma | u-v | ^ 2) 58 | 59 | – sigmoid:tanh(gamma * u'*v + coef0) 60 | 61 | - decision_function_shape: ‘ovo’, ‘ovr’, default =’ovr’ 62 | 63 | Whether 64 | to 65 | return a 66 | one - vs - rest(‘ovr’) decision 67 | function 68 | of 69 | shape(n_samples, n_classes) as all 70 | other 71 | classifiers, or the 72 | original 73 | one - vs - one(‘ovo’) decision 74 | function 75 | of 76 | libsvm 77 | which 78 | has 79 | shape(n_samples, n_classes * (n_classes - 1) / 2).However, one - vs - one(‘ovo’) is always 80 | used as multi - 81 | 82 | 83 | class strategy. 
84 | 85 | 86 | 87 | a.一对多法(one - versus - rest, 简称1 - v - r 88 | SVMs)。训练时依次把某个类别的样本归为一类, 其他剩余的样本归为另一类,这样k个类别的样本就构造出了k个SVM。分类时将未知样本分类为具有最大分类函数值的那类。  89 | 90 | b.一对一法(one - versus - one, 简称1 - v - 1 91 | SVMs)。其做法是在任意两类样本之间设计一个SVM,因此k个类别的样本就需要设计k(k - 1) / 2 92 | 个SVM。当对一个未知样本进行分类时,最后得票最多的类别即为该未知样本的类别。Libsvm中的多类分类就是根据这个方法实现的。 93 | 94 | - degree :多项式poly函数的维度,默认是3,选择其他核函数时会被忽略。 95 | 96 | 97 | - gamma : ‘rbf’, ‘poly’ 和‘sigmoid’的核函数参数。默认是’auto’,则会选择1 / n_features 98 | 99 | - coef0 :核函数的常数项。对于‘poly’和 ‘sigmoid’有用。 100 | 101 | 102 | - probability :是否采用概率估计?.默认为False 103 | 104 | - shrinking :是否采用shrinking 105 | heuristic方法,默认为true 106 | 107 | - tol :停止训练的误差值大小,默认为1e - 3 108 | 109 | - cache_size :核函数cache缓存大小,默认为200 110 | 111 | - class_weight :类别的权重,字典形式传递。设置第几类的参数C为weight * C(C - SVC中的C) 112 | 113 | - verbose :允许冗余输出? 114 | 115 | 116 | - max_iter :最大迭代次数。-1 117 | 为无限制。 118 | 119 | 120 | - decision_function_shape :‘ovo’, ‘ovr’ or None, default = None3 121 | 122 | - random_state :数据洗牌时的种子值,int值 123 | 124 | 主要调节的参数有:C、kernel、degree、gamma、coef0。 125 | ''' -------------------------------------------------------------------------------- /Code/8-3核是可以选的.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm 2 | from sklearn import datasets 3 | from sklearn.model_selection import train_test_split as ts 4 | 5 | #import our data 6 | iris = datasets.load_iris() 7 | X = iris.data 8 | y = iris.target 9 | 10 | #split the data to 7:3 11 | X_train,X_test,y_train,y_test = ts(X,y,test_size=0.3) 12 | 13 | # select different type of kernel function and compare the score 14 | 15 | # kernel = 'rbf' 16 | clf_rbf = svm.SVC(kernel='rbf',gamma='auto') 17 | clf_rbf.fit(X_train,y_train) 18 | score_rbf = clf_rbf.score(X_test,y_test) 19 | print("The score of rbf is : %f"%score_rbf) 20 | 21 | # kernel = 'linear' 22 | clf_linear = svm.SVC(kernel='linear',gamma='auto') 23 | print("xxxxxxxxxxxxxxxx") 24 | print(X_train.shape) 25 | clf_linear.fit(X_train,y_train) 26 | score_linear = clf_linear.score(X_test,y_test) 27 | print("The score of linear is : %f"%score_linear) 28 | 29 | # kernel = 'poly' 30 | clf_poly = svm.SVC(kernel='poly',gamma='auto') 31 | clf_poly.fit(X_train,y_train) 32 | score_poly = clf_poly.score(X_test,y_test) 33 | print("The score of poly is : %f"%score_poly) 34 | 35 | print(clf_linear.coef_,clf_linear.intercept_) 36 | 37 | print(clf_linear.predict([[4.9,3.,1.4,0.2]])) 38 | # LinearSVC 39 | from sklearn.svm import LinearSVC 40 | from sklearn import datasets 41 | from sklearn.model_selection import train_test_split as ts 42 | 43 | 44 | iris = datasets.load_iris() 45 | X = iris.data 46 | y = iris.target 47 | 48 | #split the data to 7:3 49 | X_train,X_test,y_train,y_test = ts(X,y,test_size=0.3) 50 | 51 | 52 | clf = LinearSVC(random_state=0, tol=1e-5,max_iter = 10000) 53 | clf.fit(X, y) 54 | 55 | print(clf.coef_) 56 | print(clf.intercept_) 57 | print(clf.predict([[4.9,3.,1.4,0.2]])) 58 | 59 | 60 | from sklearn.svm import SVR 61 | 62 | X = [[0, 0], [2, 2]] 63 | y = [0.5, 2.5] 64 | clf = svm.SVR() 65 | clf.fit(X, y) 66 | SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, 67 | gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True, 68 | tol=0.001, verbose=False) 69 | clf.predict([[1, 1]]) 70 | 71 | 72 | 73 | from sklearn.svm import LinearSVR 74 | 75 | regr = LinearSVR(random_state=0, tol=1e-5) 76 | regr.fit(X, y) 77 | print(regr.coef_) 78 | 79 | print(regr.intercept_) 80 | print(regr.predict([[1,1]])) 81 | 82 | 
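8-3 above compares kernels by fitting one SVC per kernel by hand; the same comparison can be handed to GridSearchCV as in 7-1. A minimal sketch (the parameter grid and cv=5 are illustrative assumptions, not values from the original code):

from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
# search kernel and C jointly with 5-fold cross-validation
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [0.1, 1, 10]}
search = GridSearchCV(svm.SVC(gamma='auto'), param_grid, cv=5)
search.fit(iris.data, iris.target)
print(search.best_params_)  # best kernel/C combination found
print(search.best_score_)   # its mean cross-validated accuracy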
-------------------------------------------------------------------------------- /Code/8-4SVC(数字样本).py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, svm, metrics 2 | 3 | 4 | digits = datasets.load_digits() 5 | from sklearn.datasets import load_digits 6 | 7 | from sklearn.model_selection import train_test_split 8 | 9 | digits = load_digits() 10 | X, y = digits.data, digits.target 11 | 12 | 13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 14 | 15 | 16 | clf = svm.SVC() 17 | 18 | clf.fit(X_train, y_train,) 19 | 20 | print(clf.score(X_test, y_test)) 21 | 22 | -------------------------------------------------------------------------------- /Code/8-5SVC(cifar-10).py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn import svm 3 | import pickle 4 | 5 | from sklearn.model_selection import GridSearchCV 6 | 7 | 8 | def load(filename): 9 | 10 | with open(filename, 'rb') as fo: 11 | 12 | data = pickle.load(fo, encoding='latin1') 13 | 14 | return data 15 | #读取第一个训练集——data_batch_1: 16 | train = 'cifar-10-batches-py\data_batch_' 17 | test=r'cifar-10-batches-py\test_batch' #字符串前加r防止转义字符\t 18 | print(test) 19 | 20 | 21 | 22 | #parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid', 'precomputed'), 'C':[0.1,1,10,100]} 23 | #classifier=svm.SVC() 24 | #classifier.n_jobs=-1 25 | #print("1") 26 | #clf = GridSearchCV(classifier, parameters) 27 | clf=svm.SVC() 28 | for j in range(1,6): #从文件cifar-10-batches-py中读取data集1-5 (note: SVC is refit from scratch on each fit() call, so only batch 5 ends up in the final model) 29 | d=load(train+str(j)) 30 | print("数据集"+str(j)+"训练完毕") 31 | X, y = d["data"], d["labels"] 32 | X_train, y_train = X, y 33 | clf.fit(X_train, y_train,) 34 | 35 | d=load(test)#从文件cifar-10-batches-py中读取test集 36 | X, y = d["data"], d["labels"] 37 | X_test, y_test = X, y 38 | 39 | 40 | 41 | print(clf.score(X_test, y_test)) 42 | -------------------------------------------------------------------------------- /Code/9-1bagging三种集成学习方式.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.datasets import load_iris 4 | from sklearn.model_selection import train_test_split 5 | import matplotlib.pyplot as plt 6 | 7 | # data 8 | def create_data(): 9 | iris = load_iris() 10 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 11 | df['label'] = iris.target 12 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 13 | data = np.array(df.iloc[:100, [0, 1, -1]]) 14 | for i in range(len(data)): 15 | if data[i,-1] == 0: 16 | data[i,-1] = -1 17 | # print(data) 18 | return data[:,:2], data[:,-1] 19 | 20 | 21 | X, y = create_data() 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 23 | 24 | 25 | # bagging 算法 26 | from sklearn.ensemble import BaggingClassifier 27 | from sklearn.neighbors import KNeighborsClassifier 28 | 29 | bagging = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5) 30 | bagging.fit(X_train, y_train) 31 | 32 | in_score = bagging.score(X_train, y_train) 33 | out_score = bagging.score(X_test, y_test) 34 | print(in_score,out_score) 35 | 36 | # RandomForest 算法 37 | from sklearn.ensemble import RandomForestClassifier 38 | 39 | forest = RandomForestClassifier(n_estimators=300, max_depth=2,random_state=0) #用了三百棵树 40 | forest.fit(X_train, y_train) 41 | 42 | in_score = forest.score(X_train, y_train) 43 | out_score = forest.score(X_test, y_test) 44 |
print(in_score,out_score) 45 | 46 | 47 | from sklearn.linear_model import LogisticRegression 48 | from sklearn.naive_bayes import GaussianNB 49 | from sklearn.ensemble import RandomForestClassifier 50 | 51 | from sklearn.ensemble import VotingClassifier 52 | 53 | import numpy as np 54 | 55 | clf1 = LogisticRegression(multi_class='multinomial', random_state=1) 56 | clf2 = RandomForestClassifier(n_estimators=50, random_state=1) 57 | clf3 = GaussianNB() 58 | 59 | 60 | 61 | eclf1 = VotingClassifier(estimators=[ 62 | ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') 63 | eclf1 = eclf1.fit(X_train, y_train) 64 | # print(eclf1.predict(X)) 65 | in_score = eclf1.score(X_train, y_train) 66 | out_score = eclf1.score(X_test, y_test) 67 | print(in_score,out_score) 68 | 69 | # Adaboost 算法 70 | 71 | from sklearn.ensemble import AdaBoostClassifier 72 | clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) 73 | clf.fit(X_train, y_train) 74 | in_score = clf.score(X_train, y_train) 75 | out_score = clf.score(X_test, y_test) 76 | print(in_score,out_score) 77 | 78 | """ 79 | - n_estimators: AdaBoostClassifier和AdaBoostRegressor都有,就是我们的弱学习器的最大迭代次数,或者说最大的弱学习器的个数。一般来说n_estimators太小,容易欠拟合,n_estimators太大,又容易过拟合,一般选择一个适中的数值。默认是50。在实际调参的过程中,我们常常将n_estimators和下面介绍的参数learning_rate一起考虑。 80 | 81 | - learning_rate: AdaBoostClassifier和AdaBoostRegressor都有,即每个弱学习器的权重缩减系数ν 82 | 83 | - base_estimator:AdaBoostClassifier和AdaBoostRegressor都有,即我们的弱分类学习器或者弱回归学习器。理论上可以选择任何一个分类或者回归学习器,不过需要支持样本权重。我们常用的一般是CART决策树或者神经网络MLP。 84 | """ 85 | 86 | # boosting的方式 GBDT 算法 87 | from sklearn.ensemble import GradientBoostingRegressor 88 | 89 | model = GradientBoostingRegressor(n_estimators=500,learning_rate=0.25,min_samples_leaf=9,max_depth=8,random_state=4) 90 | model.fit(X_train, y_train) 91 | in_score = model.score(X_train, y_train) 92 | out_score = model.score(X_test, y_test) 93 | print(in_score,out_score) 94 | 95 | #stackking的方式 96 | from sklearn.datasets import load_iris 97 | from sklearn.ensemble import RandomForestClassifier 98 | from sklearn.svm import LinearSVC 99 | from sklearn.linear_model import LogisticRegression 100 | from sklearn.preprocessing import StandardScaler 101 | from sklearn.pipeline import make_pipeline 102 | from sklearn.ensemble import StackingClassifier 103 | 104 | 105 | X, y = load_iris(return_X_y=True) 106 | estimators = [ 107 | ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), 108 | ('svr', make_pipeline(StandardScaler(), 109 | LinearSVC(random_state=42)))] 110 | clf = StackingClassifier( 111 | estimators=estimators, final_estimator=LogisticRegression() 112 | ) 113 | 114 | from sklearn.model_selection import train_test_split 115 | X_train, X_test, y_train, y_test = train_test_split( 116 | X, y, stratify=y, random_state=42 117 | ) 118 | clf.fit(X_train, y_train) 119 | print(clf.score(X_test, y_test)) 120 | 121 | from sklearn.ensemble import AdaBoostRegressor 122 | from sklearn.datasets import make_regression 123 | X, y = make_regression(n_features=4, n_informative=2, 124 | random_state=0, shuffle=False) 125 | regr = AdaBoostRegressor(random_state=0, n_estimators=100) 126 | regr.fit(X, y) 127 | #regr.feature_importances_ 128 | print(regr.predict([[0, 0, 0, 0]])) 129 | print(regr.score(X, y)) -------------------------------------------------------------------------------- /Code/9-2原始Adaboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import datasets 3 | 4 | 5 | digits = datasets.load_digits() 6 | 7 | class 
AdaBoost: 8 | def __init__(self, n_estimators=50, learning_rate=1.0): 9 | self.clf_num = n_estimators 10 | self.learning_rate = learning_rate 11 | 12 | def init_args(self, datasets, labels): 13 | 14 | self.X = datasets 15 | self.Y = labels 16 | self.M, self.N = datasets.shape 17 | 18 | # 弱分类器数目和集合 19 | self.clf_sets = [] 20 | 21 | # 初始化weights 22 | self.weights = [1.0 / self.M] * self.M 23 | 24 | # G(x)系数 alpha 25 | self.alpha = [] 26 | 27 | def _G(self, features, labels, weights): 28 | m = len(features) 29 | error = 100000.0 # 无穷大 30 | best_v = 0.0 31 | # 单维features 32 | features_min = min(features) 33 | features_max = max(features) 34 | n_step = (features_max - features_min + self.learning_rate) // self.learning_rate 35 | # print('n_step:{}'.format(n_step)) 36 | direct, compare_array = None, None 37 | for i in range(1, int(n_step)): 38 | v = features_min + self.learning_rate * i 39 | 40 | if v not in features: 41 | # 误分类计算 42 | compare_array_positive = np.array([1 if features[k] > v else -1 for k in range(m)]) 43 | weight_error_positive = sum([weights[k] for k in range(m) if compare_array_positive[k] != labels[k]]) 44 | 45 | compare_array_nagetive = np.array([-1 if features[k] > v else 1 for k in range(m)]) 46 | weight_error_nagetive = sum([weights[k] for k in range(m) if compare_array_nagetive[k] != labels[k]]) 47 | 48 | if weight_error_positive < weight_error_nagetive: 49 | weight_error = weight_error_positive 50 | _compare_array = compare_array_positive 51 | direct = 'positive' 52 | else: 53 | weight_error = weight_error_nagetive 54 | _compare_array = compare_array_nagetive 55 | direct = 'nagetive' 56 | 57 | # print('v:{} error:{}'.format(v, weight_error)) 58 | if weight_error < error: 59 | error = weight_error 60 | compare_array = _compare_array 61 | best_v = v 62 | return best_v, direct, error, compare_array 63 | 64 | 65 | clf = AdaBoost(n_estimators=3, learning_rate=0.5) 66 | clf.fit(X, y) 67 | clf.score(X_test, y_test) -------------------------------------------------------------------------------- /Code/9-3Adaboost与RandomForest.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | 3 | digits = datasets.load_digits() 4 | from sklearn.datasets import load_digits 5 | 6 | from sklearn.model_selection import train_test_split 7 | 8 | digits = load_digits() 9 | X, y = digits.data, digits.target 10 | 11 | 12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 13 | 14 | 15 | 16 | 17 | 18 | from sklearn.ensemble import RandomForestClassifier 19 | 20 | forest = RandomForestClassifier(n_estimators=300, max_depth=2,random_state=0) #用了三百棵树 21 | forest.fit(X_train, y_train) 22 | 23 | in_score = forest.score(X_train, y_train) 24 | out_score = forest.score(X_test, y_test) 25 | print(in_score,out_score) 26 | 27 | from sklearn.ensemble import AdaBoostClassifier 28 | clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) 29 | clf.fit(X_train, y_train) 30 | in_score = clf.score(X_train, y_train) 31 | out_score = clf.score(X_test, y_test) 32 | print(in_score,out_score) 33 | 34 | -------------------------------------------------------------------------------- /Code/9-4集成学习(酒样本).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import train_test_split 3 | 4 | wine= pd.read_csv("wine.csv") 5 | 6 | 7 | X_train, X_test, y_train, y_test = train_test_split(wine.iloc[:,1:], wine.iloc[:,0], test_size=0.2, random_state=0) 8 
| 9 | 10 | 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | forest = RandomForestClassifier(n_estimators=300, max_depth=2,random_state=0) #用了三百棵树 14 | forest.fit(X_train, y_train) 15 | 16 | in_score = forest.score(X_train, y_train) 17 | out_score = forest.score(X_test, y_test) 18 | print(in_score,out_score) 19 | 20 | from sklearn.ensemble import AdaBoostClassifier 21 | clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) 22 | clf.fit(X_train, y_train) 23 | in_score = clf.score(X_train, y_train) 24 | out_score = clf.score(X_test, y_test) 25 | print(in_score,out_score) -------------------------------------------------------------------------------- /Code_2022/class10-test1.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from collections import defaultdict 5 | 6 | 7 | # euclidian distance between 2 data points. For as many data points as necessary. 8 | def euclidean_distance(a, b): 9 | return np.linalg.norm(a - b) 10 | 11 | 12 | def kmeans(data, k=3): 13 | m = data.shape[0] 14 | index = random.sample(range(m), k) 15 | mu = data[index] # 随机选择初始均值向量 16 | 17 | while True: 18 | 19 | C = defaultdict(list) 20 | 21 | for j in range(0, m): 22 | dij = [euclidean_distance(data[j], mu[i]) for i in range(k)] 23 | lambda_j = np.argmin(dij) # 选择最小的值得下标 24 | 25 | C[lambda_j].append(data[j].tolist()) 26 | 27 | new_mu = [np.mean(C[i], axis=0).tolist() for i in range(k)] 28 | 29 | if (euclidean_distance(np.array(new_mu), np.array(mu)) > 1e-9): 30 | mu = new_mu 31 | else: 32 | break 33 | 34 | return C, mu 35 | 36 | 37 | watermelon = np.array([[0.697, 0.46], 38 | [0.774, 0.376], 39 | [0.634, 0.264], 40 | [0.608, 0.318], 41 | [0.556, 0.215], 42 | [0.403, 0.237], 43 | [0.481, 0.149], 44 | [0.437, 0.211], 45 | [0.666, 0.091], 46 | [0.243, 0.267], 47 | [0.245, 0.057], 48 | [0.343, 0.099], 49 | [0.639, 0.161], 50 | [0.657, 0.198], 51 | [0.36, 0.37], 52 | [0.593, 0.042], 53 | [0.719, 0.103], 54 | [0.359, 0.188], 55 | [0.339, 0.241], 56 | [0.282, 0.257], 57 | [0.748, 0.232], 58 | [0.714, 0.346], 59 | [0.483, 0.312], 60 | [0.478, 0.437], 61 | [0.525, 0.369], 62 | [0.751, 0.489], 63 | [0.532, 0.472], 64 | [0.473, 0.376], 65 | [0.725, 0.445], 66 | [0.446, 0.459]]) 67 | 68 | k = 2 69 | res, mu = kmeans(watermelon, k) 70 | print(res) 71 | print('新的中心:', mu) 72 | 73 | for i in range(k): 74 | res_i = np.array(res[i]) 75 | plt.scatter(res_i[:, 0], res_i[:, 1]) 76 | plt.show() 77 | -------------------------------------------------------------------------------- /Code_2022/class10-test2.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | import numpy as np 4 | from scipy import linalg 5 | import matplotlib.pyplot as plt 6 | import matplotlib as mpl 7 | 8 | from sklearn import mixture 9 | from sklearn.cluster import KMeans 10 | 11 | X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]]) 12 | kmeans = KMeans(n_clusters=2, random_state=0).fit(X) 13 | print(kmeans.labels_) 14 | print(kmeans.predict([[0, 0], [4, 4]])) 15 | print(kmeans.cluster_centers_) 16 | 17 | color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 18 | 'darkorange']) 19 | 20 | 21 | def plot_results(X, Y_, means, covariances, index, title): 22 | splot = plt.subplot(2, 1, 1 + index) 23 | for i, (mean, covar, color) in enumerate(zip( 24 | means, covariances, color_iter)): 25 | v, w = linalg.eigh(covar) 26 | v = 2. * np.sqrt(2.) 
* np.sqrt(v) 27 | u = w[0] / linalg.norm(w[0]) 28 | # as the DP will not use every component it has access to 29 | # unless it needs it, we shouldn't plot the redundant 30 | # components. 31 | if not np.any(Y_ == i): 32 | continue 33 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 34 | 35 | # Plot an ellipse to show the Gaussian component 36 | angle = np.arctan(u[1] / u[0]) 37 | angle = 180. * angle / np.pi # convert to degrees 38 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) 39 | ell.set_clip_box(splot.bbox) 40 | ell.set_alpha(0.5) 41 | splot.add_artist(ell) 42 | 43 | 44 | # Number of samples per component 45 | n_samples = 500 46 | 47 | # Generate random sample, two components 48 | np.random.seed(0) 49 | C = np.array([[0., -0.1], [1.7, .4]]) 50 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), 51 | .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] 52 | 53 | # Fit a Gaussian mixture with EM using five components 54 | gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X) 55 | plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, 56 | 'Gaussian Mixture') 57 | 58 | # Fit a Dirichlet process Gaussian mixture using five components 59 | dpgmm = mixture.BayesianGaussianMixture(n_components=5, 60 | covariance_type='full').fit(X) 61 | plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1, 62 | 'Bayesian Gaussian Mixture with a Dirichlet process prior') 63 | 64 | plt.show() 65 | 66 | print(gmm.weights_) 67 | print(gmm.means_) 68 | print(gmm.covariances_) 69 | -------------------------------------------------------------------------------- /Code_2022/class10-test3.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | import matplotlib.pyplot as plt 3 | from sklearn.cluster import KMeans 4 | 5 | # 加载数据集,是一个字典类似Java中的map 6 | lris_df = datasets.load_iris() 7 | 8 | # 挑选出前两个维度作为x轴和y轴,你也可以选择其他维度 9 | x_axis = lris_df.data[:, 0] 10 | y_axis = lris_df.data[:, 2] 11 | 12 | # 这里已经知道了分3类,其他分类这里的参数需要调试 13 | model = KMeans(n_clusters=3) 14 | 15 | # 训练模型 16 | model.fit(lris_df.data) 17 | 18 | # 选取行标为100的那条数据,进行预测 19 | prddicted_label = model.predict([[6.3, 3.3, 6, 2.5]]) 20 | 21 | # 预测全部150条数据 22 | all_predictions = model.predict(lris_df.data) 23 | 24 | # 打印出来对150条数据的聚类散点图 25 | plt.scatter(x_axis, y_axis, c=all_predictions) 26 | plt.show() 27 | 28 | -------------------------------------------------------------------------------- /Code_2022/class11-test1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class HiddenMarkov: 5 | def forward(self, Q, V, A, B, O, PI): # 使用前向算法 6 | N = len(Q) # 状态序列的大小 7 | M = len(O) # 观测序列的大小 8 | alphas = np.zeros((N, M)) # alpha值 9 | T = M # 有几个时刻,有几个观测序列,就有几个时刻 10 | for t in range(T): # 遍历每一时刻,算出alpha值 11 | indexOfO = V.index(O[t]) # 找出序列对应的索引 12 | for i in range(N): 13 | if t == 0: # 计算初值 14 | alphas[i][t] = PI[t][i] * B[i][indexOfO] # P176(10.15) 15 | print('alpha1(%d)=p%db%db(o1)=%f' % (i, i, i, alphas[i][t])) 16 | else: 17 | alphas[i][t] = np.dot([alpha[t - 1] for alpha in alphas], [a[i] for a in A]) * B[i][ 18 | indexOfO] # 对应P176(10.16) 19 | print('alpha%d(%d)=[sigma alpha%d(i)ai%d]b%d(o%d)=%f' % (t, i, t - 1, i, i, t, alphas[i][t])) 20 | # print(alphas) 21 | P = np.sum([alpha[M - 1] for alpha in alphas]) # P176(10.17) 22 | # alpha11 = pi[0][0] * B[0][0] #代表a1(1) 23 | # alpha12 = pi[0][1] * B[1][0] #代表a1(2) 24 | # alpha13 = pi[0][2] * B[2][0] 
#代表a1(3) 25 | 26 | def backward(self, Q, V, A, B, O, PI): # 后向算法 27 | N = len(Q) # 状态序列的大小 28 | M = len(O) # 观测序列的大小 29 | betas = np.ones((N, M)) # beta 30 | for i in range(N): 31 | print('beta%d(%d)=1' % (M, i)) 32 | for t in range(M - 2, -1, -1): 33 | indexOfO = V.index(O[t + 1]) # 找出序列对应的索引 34 | for i in range(N): 35 | betas[i][t] = np.dot(np.multiply(A[i], [b[indexOfO] for b in B]), [beta[t + 1] for beta in betas]) 36 | realT = t + 1 37 | realI = i + 1 38 | print('beta%d(%d)=[sigma a%djbj(o%d)]beta%d(j)=(' % (realT, realI, realI, realT + 1, realT + 1), 39 | end='') 40 | for j in range(N): 41 | print("%.2f*%.2f*%.2f+" % (A[i][j], B[j][indexOfO], betas[j][t + 1]), end='') 42 | print("0)=%.3f" % betas[i][t]) 43 | # print(betas) 44 | indexOfO = V.index(O[0]) 45 | P = np.dot(np.multiply(PI, [b[indexOfO] for b in B]), [beta[0] for beta in betas]) 46 | print("P(O|lambda)=", end="") 47 | for i in range(N): 48 | print("%.1f*%.1f*%.5f+" % (PI[0][i], B[i][indexOfO], betas[i][0]), end="") 49 | print("0=%f" % P) 50 | 51 | def viterbi(self, Q, V, A, B, O, PI): 52 | N = len(Q) # 状态序列的大小 53 | M = len(O) # 观测序列的大小 54 | deltas = np.zeros((N, M)) 55 | psis = np.zeros((N, M)) 56 | I = np.zeros((1, M)) 57 | for t in range(M): 58 | realT = t + 1 59 | indexOfO = V.index(O[t]) # 找出序列对应的索引 60 | for i in range(N): 61 | realI = i + 1 62 | if t == 0: 63 | deltas[i][t] = PI[0][i] * B[i][indexOfO] 64 | psis[i][t] = 0 65 | print('delta1(%d)=pi%d * b%d(o1)=%.2f * %.2f=%.2f' % ( 66 | realI, realI, realI, PI[0][i], B[i][indexOfO], deltas[i][t])) 67 | print('psis1(%d)=0' % (realI)) 68 | else: 69 | deltas[i][t] = np.max(np.multiply([delta[t - 1] for delta in deltas], [a[i] for a in A])) * B[i][ 70 | indexOfO] 71 | print('delta%d(%d)=max[delta%d(j)aj%d]b%d(o%d)=%.2f*%.2f=%.5f' % ( 72 | realT, realI, realT - 1, realI, realI, realT, 73 | np.max(np.multiply([delta[t - 1] for delta in deltas], [a[i] for a in A])), B[i][indexOfO], 74 | deltas[i][t])) 75 | psis[i][t] = np.argmax(np.multiply([delta[t - 1] for delta in deltas], [a[i] for a in A])) 76 | print('psis%d(%d)=argmax[delta%d(j)aj%d]=%d' % (realT, realI, realT - 1, realI, psis[i][t])) 77 | print(deltas) 78 | print(psis) 79 | I[0][M - 1] = np.argmax([delta[M - 1] for delta in deltas]) 80 | print('i%d=argmax[deltaT(i)]=%d' % (M, I[0][M - 1] + 1)) 81 | for t in range(M - 2, -1, -1): 82 | I[0][t] = psis[int(I[0][t + 1])][t + 1] 83 | print('i%d=psis%d(i%d)=%d' % (t + 1, t + 2, t + 2, I[0][t] + 1)) 84 | print(I) 85 | 86 | 87 | Q = [1, 2, 3] 88 | V = ['红', '白'] 89 | A = [[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]] 90 | B = [[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]] 91 | # O = ['红', '白', '红', '红', '白', '红', '白', '白'] 92 | O = ['红', '白', '红', '白'] # 习题10.1的例子 93 | PI = [[0.2, 0.4, 0.4]] 94 | 95 | HMM = HiddenMarkov() 96 | # HMM.forward(Q, V, A, B, O, PI) 97 | # HMM.backward(Q, V, A, B, O, PI) 98 | HMM.viterbi(Q, V, A, B, O, PI) 99 | print('------------------------------------------------') 100 | 101 | Q = [1, 2, 3] 102 | V = ['红', '白'] 103 | A = [[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]] 104 | B = [[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]] 105 | O = ['红', '白', '红', '红', '白', '红', '白', '白'] 106 | PI = [[0.2, 0.3, 0.5]] 107 | 108 | HMM.forward(Q, V, A, B, O, PI) 109 | HMM.backward(Q, V, A, B, O, PI) 110 | -------------------------------------------------------------------------------- /Code_2022/class11-test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import hmmlearn.hmm as hmm 3 | 4 | states = ['盒子1', '盒子2', 
'盒子3'] 5 | obs = ['白球', '黑球'] 6 | n_states = len(states) 7 | m_obs = len(obs) 8 | 9 | model2 = hmm.MultinomialHMM(n_components=n_states, n_iter=20, tol=0.001) 10 | X2 = np.array([ 11 | [0, 1, 0, 0, 1], 12 | [0, 0, 0, 1, 1], 13 | [1, 1, 0, 1, 0], 14 | [0, 1, 0, 1, 1], 15 | [0, 0, 0, 1, 0] 16 | ]) 17 | model2.fit(X2) 18 | print("输出根据数据训练出来的π") 19 | print(model2.startprob_) 20 | print("输出根据数据训练出来的A") 21 | print(model2.transmat_) 22 | print("输出根据数据训练出来的B") 23 | print(model2.emissionprob_) 24 | 25 | status = ['盒子1', '盒子2', '盒子3'] 26 | obs = ['白球', '黑球'] 27 | n_status = len(status) 28 | m_obs = len(obs) 29 | start_probability = np.array([0.2, 0.5, 0.3]) 30 | transition_probability = np.array([ 31 | [0.5, 0.4, 0.1], 32 | [0.2, 0.2, 0.6], 33 | [0.2, 0.5, 0.3] 34 | ]) 35 | emission_probalitity = np.array([ 36 | [0.4, 0.6], 37 | [0.8, 0.2], 38 | [0.5, 0.5] 39 | ]) 40 | 41 | model = hmm.MultinomialHMM(n_components=n_status) 42 | model.startprob_ = start_probability 43 | model.transmat_ = transition_probability 44 | model.emissionprob_ = emission_probalitity 45 | 46 | se = np.array([[0, 1, 0, 0, 1]]).T 47 | logprob, box_index = model.decode(se, algorithm='viterbi') 48 | print("颜色:", end="") 49 | print(" ".join(map(lambda t: obs[t], [0, 1, 0, 0, 1]))) 50 | print("盒子:", end="") 51 | print(" ".join(map(lambda t: status[t], box_index))) 52 | print("概率值:", end="") 53 | print(np.exp(logprob)) # 这个是因为在hmmlearn底层将概率进行了对数化,防止出现乘积为0的情况 54 | 55 | status = ['盒子1', '盒子2', '盒子3'] 56 | obs = ['白球', '黑球'] 57 | n_status = len(status) 58 | m_obs = len(obs) 59 | start_probability = np.array([0.2, 0.5, 0.3]) 60 | transition_probability = np.array([ 61 | [0.5, 0.4, 0.1], 62 | [0.2, 0.2, 0.6], 63 | [0.2, 0.5, 0.3] 64 | ]) 65 | emission_probalitity = np.array([ 66 | [0.4, 0.6], 67 | [0.8, 0.2], 68 | [0.5, 0.5] 69 | ]) 70 | 71 | model = hmm.MultinomialHMM(n_components=n_status) 72 | model.startprob_ = start_probability 73 | model.transmat_ = transition_probability 74 | model.emissionprob_ = emission_probalitity 75 | 76 | # 预测问题 77 | seen = np.array([0, 1, 0]) 78 | 79 | # 观测序列的概率计算问题 80 | # score函数返回的是以自然对数为底的对数概率值 81 | # ln0.13022≈−2.0385 82 | print(model.score(seen.reshape(-1, 1))) 83 | 84 | print(np.exp(-1.81)) 85 | -------------------------------------------------------------------------------- /Code_2022/class12-test.py: -------------------------------------------------------------------------------- 1 | # Singular-value decomposition 2 | from numpy import array 3 | from scipy.linalg import svd 4 | 5 | # define a matrix 6 | A = array([[1, 2], [3, 4], [5, 6]]) 7 | print(A) 8 | # SVD 9 | U, s, VT = svd(A) 10 | print(U) 11 | print(s) 12 | print(VT) 13 | 14 | from numpy import array 15 | from numpy import diag 16 | from numpy import dot 17 | 18 | A = array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 19 | print(A) 20 | # Singular-value decomposition 21 | U, s, VT = svd(A) 22 | # create n x n Sigma matrix 23 | Sigma = diag(s) 24 | # reconstruct matrix 25 | B = U.dot(Sigma.dot(VT)) 26 | print(B) 27 | 28 | # create n x n Sigma matrix 29 | Sigma = diag(s) 30 | # reconstruct matrix 31 | B = U.dot(Sigma.dot(VT)) 32 | print(B) 33 | -------------------------------------------------------------------------------- /Code_2022/class12-test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import matplotlib.image as mpimg 4 | 5 | # import cv2 6 | import matplotlib.pyplot as plt 7 | 8 | # %matplotlib inline 9 | 10 | I = mpimg.imread('data/header.jpg') 11 | # 
Now, let's look at the size of this numpy array object img as well as plot it using imshow. 12 | print(I.shape) 13 | plt.axis('off') 14 | plt.imshow(I) 15 | plt.show() 16 | 17 | 18 | def show_img(img): 19 | plt.figure(figsize=(10, 7.5)) 20 | plt.imshow(img, cmap='gray', vmin=0, vmax=255, aspect='auto') 21 | plt.axis('off') 22 | plt.show() 23 | 24 | 25 | gray_img = I[:, :, 1] 26 | show_img(gray_img) 27 | print(gray_img.shape) 28 | U, S, V_T = np.linalg.svd(gray_img) 29 | print(U.shape, S.shape, V_T.shape) 30 | 31 | # Plot sigmas 32 | plt.figure(figsize=(9, 5)) 33 | plt.plot(np.arange(S.shape[0]), S) 34 | plt.yscale('log') 35 | plt.xlabel('Index of $\sigma$') 36 | plt.ylabel('log(value of $\sigma$)') 37 | plt.title('Singular values $\sigma_i$ vs its index') 38 | plt.show() 39 | 40 | # Plot cumsum of sigma 41 | plt.figure(figsize=(9, 5)) 42 | plt.plot(np.cumsum(S) / np.sum(S)) 43 | plt.xlabel('Index of $\sigma$') 44 | plt.ylabel('Value of $\sigma$') 45 | plt.title('Cumulative sum of $\sigma_i$ vs its index\n(Percent of explained variance)') 46 | plt.show() 47 | 48 | # Create an empty matrix to fill with sigma values (np.lialg.svd returns sigma as an array) 49 | S_full = np.zeros((U.shape[0], V_T.shape[0])) 50 | print(S_full.shape) 51 | 52 | # Populate sigma matrix 53 | S_diag = np.diag(S) 54 | print(S_diag.shape) 55 | S_full[:S_diag.shape[0], :S_diag.shape[1]] = S_diag 56 | 57 | # for i in [5, 10, 25, 50, 100, 200, U.shape[0]]: 58 | # print(str(i) + '\n') 59 | # show_img(U[:, :i].dot(S_full[:i, :i].dot(V_T[:i, :]))) 60 | # print('-' * 100 + '\n') 61 | # 62 | # print(U[:, :5]) 63 | i = 200 64 | print(U[:, :i].shape, S_full[:i, :i].shape, V_T[:i, :].shape) 65 | -------------------------------------------------------------------------------- /Code_2022/class13-test1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_iris 3 | from sklearn.decomposition import PCA 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def pca(data, n_dim): 8 | ''' 9 | pca is O(D^3) 10 | :param data: (n_samples, n_features(D)) 11 | :param n_dim: target dimensions 12 | :return: (n_samples, n_dim) 13 | ''' 14 | data = data - np.mean(data, axis=0, keepdims=True) 15 | 16 | cov = np.dot(data.T, data) 17 | 18 | eig_values, eig_vector = np.linalg.eig(cov) 19 | # print(eig_values) 20 | indexs_ = np.argsort(-eig_values)[:n_dim] 21 | picked_eig_values = eig_values[indexs_] 22 | picked_eig_vector = eig_vector[:, indexs_] 23 | data_ndim = np.dot(data, picked_eig_vector) 24 | return data_ndim 25 | 26 | 27 | data = load_iris() 28 | X = data.data 29 | Y = data.target 30 | data_2d1 = pca(X, 2) 31 | plt.figure(figsize=(8, 4)) 32 | plt.subplot(121) 33 | plt.title("my_PCA") 34 | plt.scatter(data_2d1[:, 0], data_2d1[:, 1], c=Y) 35 | plt.show() -------------------------------------------------------------------------------- /Code_2022/class13-test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.decomposition import PCA 3 | 4 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 5 | pca = PCA(n_components=2) 6 | pca.fit(X) 7 | print(pca.explained_variance_ratio_) 8 | print(pca.singular_values_) 9 | 10 | new_X = pca.fit_transform(X) 11 | print(new_X) 12 | -------------------------------------------------------------------------------- /Code_2022/class13-test3.py: -------------------------------------------------------------------------------- 1 | from 
matplotlib import pyplot as plt 2 | from sklearn.decomposition import PCA 3 | from sklearn.datasets import load_iris 4 | 5 | pca = PCA(2) 6 | print(pca) 7 | 8 | data = load_iris() 9 | X, y = data.data, data.target 10 | X_proj = pca.fit_transform(X) 11 | print(X_proj.shape) 12 | 13 | plt.scatter(X_proj[:, 0], X_proj[:, 1], c=y) 14 | plt.show() 15 | -------------------------------------------------------------------------------- /Code_2022/class13-test4.py: -------------------------------------------------------------------------------- 1 | import matplotlib.image as mpimg 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | from sklearn.decomposition import PCA 5 | 6 | img = mpimg.imread('data/header.jpg') 7 | # Now, let's look at the size of this numpy array object img as well as plot it using imshow. 8 | print(img.shape) 9 | plt.axis('off') 10 | plt.imshow(img) 11 | plt.show() 12 | 13 | img_r = np.reshape(img, (800, 3600)) 14 | print(img_r.shape) 15 | 16 | ipca = PCA(64).fit(img_r) 17 | img_c = ipca.transform(img_r) 18 | print(img_c.shape) 19 | print(np.sum(ipca.explained_variance_ratio_)) 20 | 21 | print(ipca) 22 | 23 | # OK, now to visualize how PCA has performed this compression, let's inverse transform the PCA output and 24 | # reshape for visualization using imshow. 25 | temp = ipca.inverse_transform(img_c) 26 | print(temp.shape) 27 | # reshaping 2988 back to the original 996 * 3 28 | temp = np.reshape(temp, (800, 1200, 3)) 29 | print(temp.shape) 30 | 31 | plt.axis('off') 32 | plt.imshow(temp) 33 | plt.imshow(temp.astype('uint8')) 34 | plt.show() 35 | -------------------------------------------------------------------------------- /Code_2022/class13-test5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits 3 | from sklearn.decomposition import PCA 4 | 5 | digits = load_digits() 6 | print(digits.keys()) 7 | 8 | # looking at data, there looks to be 64 features, what are these? 9 | print(digits.data.shape) 10 | # another available dataset is called images. Let's check this out. 
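# (digits.data holds the same pixels flattened to shape (n_samples, 64);
#  digits.images keeps each sample as its original 8x8 grid, which is what matshow below expects.)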
11 | print(digits.images.shape) 12 | 13 | import matplotlib.pyplot as plt 14 | 15 | plt.gray() 16 | plt.matshow(digits.images[0]) 17 | plt.show() 18 | 19 | X, y = digits.data, digits.target 20 | pca_digits = PCA(0.95) 21 | X_proj = pca_digits.fit_transform(X) 22 | print(X.shape, X_proj.shape) 23 | 24 | # Let's run PCA with 2 components so as to plot the data in 2D 25 | pca_digits = PCA(2) 26 | X_proj = pca_digits.fit_transform(X) 27 | print(np.sum(pca_digits.explained_variance_ratio_)) 28 | # Note we only retain about 28% of the variance by choosing 2 components 29 | 30 | print(X_proj.shape) 31 | 32 | # Let's plot the principal components as a scatter plot 33 | plt.scatter(X_proj[:, 0], X_proj[:, 1], c=y) 34 | plt.colorbar() 35 | plt.show() 36 | 37 | pca_digits = PCA(64).fit(X) 38 | plt.semilogx(np.cumsum(pca_digits.explained_variance_ratio_)) 39 | plt.xlabel('Number of Components') 40 | plt.ylabel('Variance retained') 41 | plt.ylim(0, 1) 42 | plt.show() 43 | -------------------------------------------------------------------------------- /Code_2022/class13-test6.py: -------------------------------------------------------------------------------- 1 | # SparsePCA 2 | import numpy as np 3 | from sklearn.datasets import make_friedman1, load_digits 4 | from sklearn.decomposition import SparsePCA 5 | 6 | X, _ = load_digits(return_X_y=True) 7 | transformer = SparsePCA(n_components=5, random_state=0) 8 | transformer.fit(X) 9 | X_transformed = transformer.transform(X) 10 | 11 | print(X_transformed.shape) 12 | 13 | # KernelPCA 14 | from sklearn.datasets import load_digits 15 | from sklearn.decomposition import KernelPCA 16 | 17 | X, y = load_digits(return_X_y=True) 18 | print(X.shape) 19 | transformer = KernelPCA(n_components=7, kernel='linear') 20 | X_transformed = transformer.fit_transform(X) 21 | print(X_transformed.shape) 22 | 23 | # Isomap 24 | from sklearn.manifold import Isomap 25 | 26 | isomap = Isomap(n_components=2, n_neighbors=5) 27 | new_X_isomap = isomap.fit_transform(X) 28 | print(new_X_isomap.shape) 29 | 30 | from sklearn.manifold import TSNE 31 | 32 | X_embedded = TSNE(n_components=2).fit_transform(X) 33 | print(X_embedded.shape) 34 | -------------------------------------------------------------------------------- /Code_2022/class13-test7.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from functools import partial 3 | from time import time 4 | 5 | import matplotlib.pyplot as plt 6 | from mpl_toolkits.mplot3d import Axes3D 7 | from matplotlib.ticker import NullFormatter 8 | 9 | from sklearn import manifold, datasets 10 | 11 | # Next line to silence pyflakes. This import is needed. 
12 | print(Axes3D) 13 | 14 | n_points = 1000 15 | X, color = datasets.make_s_curve(n_points, random_state=0) 16 | n_neighbors = 10 17 | n_components = 2 18 | 19 | # Create figure 20 | fig = plt.figure(figsize=(15, 8)) 21 | fig.suptitle("Manifold Learning with %i points, %i neighbors" 22 | % (1000, n_neighbors), fontsize=14) 23 | 24 | # Add 3d scatter plot 25 | ax = fig.add_subplot(251, projection='3d') 26 | ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral) 27 | ax.view_init(4, -72) 28 | 29 | # Set-up manifold methods 30 | LLE = partial(manifold.LocallyLinearEmbedding, 31 | n_neighbors, n_components, eigen_solver='auto') 32 | 33 | methods = OrderedDict() 34 | methods['LLE'] = LLE(method='standard') 35 | methods['LTSA'] = LLE(method='ltsa') 36 | methods['Hessian LLE'] = LLE(method='hessian') 37 | methods['Modified LLE'] = LLE(method='modified') 38 | methods['Isomap'] = manifold.Isomap(n_neighbors, n_components) 39 | methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1) 40 | methods['SE'] = manifold.SpectralEmbedding(n_components=n_components, 41 | n_neighbors=n_neighbors) 42 | methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca', 43 | random_state=0) 44 | 45 | # Plot results 46 | for i, (label, method) in enumerate(methods.items()): 47 | t0 = time() 48 | Y = method.fit_transform(X) 49 | t1 = time() 50 | print("%s: %.2g sec" % (label, t1 - t0)) 51 | ax = fig.add_subplot(2, 5, 2 + i + (i > 3)) 52 | ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) 53 | ax.set_title("%s (%.2g sec)" % (label, t1 - t0)) 54 | ax.xaxis.set_major_formatter(NullFormatter()) 55 | ax.yaxis.set_major_formatter(NullFormatter()) 56 | ax.axis('tight') 57 | 58 | plt.show() 59 | -------------------------------------------------------------------------------- /Code_2022/class14-test1.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | from gensim import corpora 3 | from pprint import pprint 4 | 5 | # How to create a dictionary from a list of sentences? 
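# corpora.Dictionary assigns each unique token an integer id (see token2id below);
# doc2bow then turns a tokenized document into sparse (token_id, count) pairs, which is
# the bag-of-words corpus that TfidfModel consumes further down.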
6 | 7 | documents = ["The Saudis are preparing a report that will acknowledge that", 8 | "Saudi journalist Jamal Khashoggi's death was the result of an", 9 | "interrogation that went wrong, one that was intended to lead", 10 | "to his abduction from Turkey, according to two sources."] 11 | 12 | # Tokenize(split) the sentences into words 13 | texts = [[text for text in doc.split()] for doc in documents] 14 | 15 | # Create dictionary 16 | dictionary = corpora.Dictionary(texts) 17 | 18 | # Get information about the dictionary 19 | print(dictionary) 20 | print(dictionary.token2id) 21 | 22 | documents_2 = ["The intersection graph of paths in trees", 23 | "Graph minors IV Widths of trees and well quasi ordering", 24 | "Graph minors A survey"] 25 | 26 | texts_2 = [[text for text in doc.split()] for doc in documents_2] 27 | 28 | dictionary.add_documents(texts_2) 29 | 30 | print(dictionary.token2id) 31 | 32 | new_corpus = [dictionary.doc2bow(text) for text in texts] 33 | 34 | print(new_corpus) 35 | 36 | from gensim import models 37 | 38 | tfidf = models.TfidfModel(new_corpus) 39 | 40 | corpus_tfidf = tfidf[new_corpus] 41 | print(corpus_tfidf) 42 | 43 | for i in range(len(corpus_tfidf)): 44 | print(corpus_tfidf[i]) 45 | 46 | string = 'the i first second name' 47 | string_bow = dictionary.doc2bow(string.lower().split()) 48 | string_tfidf = tfidf[string_bow] 49 | print(string_bow) 50 | print(string_tfidf) -------------------------------------------------------------------------------- /Code_2022/class14-test2.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | from gensim.models import Word2Vec 3 | 4 | # define training data 5 | sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'], 6 | ['this', 'is', 'the', 'second', 'sentence'], 7 | ['yet', 'another', 'sentence'], 8 | ['one', 'more', 'sentence'], 9 | ['and', 'the', 'final', 'sentence']] 10 | 11 | # train model 12 | model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4) 13 | # model.build_vocab(sentences, update=True) 14 | # model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) 15 | 16 | print(model) 17 | vector = list(model.wv['first']) 18 | print(vector) 19 | 20 | sims = model.wv.most_similar('first', topn=10) # get other similar words 21 | print(sims) 22 | 23 | word_vectors = model.wv 24 | print(word_vectors) 25 | 26 | for index, word in enumerate(model.wv.index_to_key): 27 | if index == 10: 28 | break 29 | print(f"word #{index}/{len(model.wv.index_to_key)} is {word}") 30 | 31 | similarity = word_vectors.similarity('first', 'second') 32 | print(similarity) 33 | 34 | result = word_vectors.similar_by_word("first") 35 | print(result) 36 | 37 | _idx = model.wv.key_to_index["first"] 38 | print(_idx) 39 | -------------------------------------------------------------------------------- /Code_2022/class14-test3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: MrYx 5 | @github: https://github.com/MrYxJ 6 | ''' 7 | 8 | import jieba 9 | from gensim.models import word2vec 10 | import re 11 | 12 | with open('../class/data/三国演义.txt') as f: 13 | document = f.read() 14 | document = re.sub('[,。?!:;、“”]+', ' ', document) # 去标点 15 | document_cut = jieba.cut(document) # 结巴分词 16 | result = ' '.join(document_cut) 17 | with open('1.txt', 'w') as f2: 18 | f2.write(result) 19 | 20 | sentences = word2vec.LineSentence('1.txt') 21 | model 
= word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3) 22 | 23 | s1 = model.wv.most_similar('曹操') 24 | s2 = model.wv.most_similar('玄德') 25 | 26 | 27 | def show(s, name): 28 | print(name + ':', end=' ') 29 | for i in s: 30 | print(i[0], end=' ') 31 | print() 32 | 33 | 34 | show(s1, '曹操') 35 | show(s2, '玄德') 36 | -------------------------------------------------------------------------------- /Code_2022/class15-test1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | tqdm.pandas(desc="progress-bar") 6 | from gensim.models import Doc2Vec 7 | from sklearn import utils 8 | from sklearn.model_selection import train_test_split 9 | import gensim 10 | from sklearn.linear_model import LogisticRegression 11 | from gensim.models.doc2vec import TaggedDocument 12 | import re 13 | import seaborn as sns 14 | import matplotlib.pyplot as plt 15 | 16 | df = pd.read_csv('data/Consumer_Complaints.csv') 17 | df = df[['Consumer Complaint', 'Product']] 18 | df = df[pd.notnull(df['Consumer Complaint'])] 19 | print(df.head(10)) 20 | print(df.shape) 21 | print(df.isnull().sum()) 22 | 23 | cnt_pro = df['Product'].value_counts() 24 | 25 | plt.figure(figsize=(12, 4)) 26 | sns.barplot(cnt_pro.index, cnt_pro.values, alpha=0.8) 27 | plt.ylabel('Number of Occurrences', fontsize=12) 28 | plt.xlabel('Product', fontsize=12) 29 | plt.xticks(rotation=90) 30 | plt.show() 31 | 32 | df.rename(columns={'Consumer Complaint': 'narrative'}, inplace=True) 33 | 34 | from gensim.models import doc2vec 35 | 36 | 37 | def label_sentences(corpus, label_type): 38 | """ 39 | Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it. 40 | We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is 41 | a dummy index of the complaint narrative. 
42 | """ 43 | labeled = [] 44 | for i, v in enumerate(corpus): 45 | label = label_type + '_' + str(i) 46 | labeled.append(doc2vec.TaggedDocument(v.split(), [label])) 47 | return labeled 48 | 49 | 50 | X_train, X_test, y_train, y_test = train_test_split(df.narrative, df.Product, random_state=0, test_size=0.3) 51 | X_train = label_sentences(X_train, 'Train') 52 | X_test = label_sentences(X_test, 'Test') 53 | all_data = X_train + X_test 54 | 55 | # print(all_data[:2]) 56 | 57 | model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065) 58 | model_dbow.build_vocab([x for x in tqdm(all_data)]) 59 | 60 | # %%time 61 | for epoch in range(2): 62 | model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1) 63 | model_dbow.alpha -= 0.002 64 | model_dbow.min_alpha = model_dbow.alpha 65 | 66 | 67 | # %%time 68 | def get_vectors(model, corpus_size, vectors_size, vectors_type): 69 | """ 70 | Get vectors from trained doc2vec model 71 | :param doc2vec_model: Trained Doc2Vec model 72 | :param corpus_size: Size of the data 73 | :param vectors_size: Size of the embedding vectors 74 | :param vectors_type: Training or Testing vectors 75 | :return: list of vectors 76 | """ 77 | vectors = np.zeros((corpus_size, vectors_size)) 78 | for i in range(0, corpus_size): 79 | prefix = vectors_type + '_' + str(i) 80 | vectors[i] = model.dv[prefix] 81 | return vectors 82 | 83 | 84 | train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train') 85 | test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test') 86 | 87 | print(len(train_vectors_dbow)) 88 | print(len(test_vectors_dbow)) 89 | # %%time 90 | from sklearn.linear_model import LogisticRegression 91 | 92 | logreg = LogisticRegression() # multi_class='multinomial', solver = 'lbfgs') 93 | logreg.fit(train_vectors_dbow, y_train) 94 | print(logreg.score(test_vectors_dbow, y_test)) 95 | # %%time 96 | new_doc = model_dbow.infer_vector(['violent', 'means', 'to', 'destroy', 'the', 'organization']) 97 | print(new_doc.shape) 98 | -------------------------------------------------------------------------------- /Code_2022/class15-test2.py: -------------------------------------------------------------------------------- 1 | from gensim import corpora, models, similarities 2 | from pprint import pprint 3 | import warnings 4 | 5 | f = open('data/LDA_test.txt') 6 | stop_list = set('for a of the and to in'.split()) 7 | 8 | texts = [[ 9 | word for word in line.strip().lower().split() if word not in stop_list 10 | ] for line in f] 11 | print('Text = ') 12 | pprint(texts) 13 | 14 | dictionary = corpora.Dictionary(texts) 15 | print(dictionary) 16 | 17 | V = len(dictionary) 18 | corpus = [dictionary.doc2bow(text) for text in texts] 19 | corpus_tfidf = models.TfidfModel(corpus)[corpus] 20 | corpus_tfidf = corpus 21 | 22 | print('\nTF-IDF:') 23 | for c in corpus_tfidf: 24 | print(c) 25 | 26 | print('\nLSI Model:') 27 | lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary) 28 | topic_result = [a for a in lsi[corpus_tfidf]] 29 | pprint(topic_result) 30 | 31 | print('\nLSI Topics:') 32 | pprint(lsi.print_topics(num_topics=2, num_words=5)) 33 | 34 | print('\nLDA Model:') 35 | num_topics = 2 36 | lda = models.LdaModel( 37 | corpus_tfidf, 38 | num_topics=num_topics, 39 | id2word=dictionary, 40 | alpha='auto', 41 | eta='auto', 42 | minimum_probability=0.001, 43 | passes=10) 44 | doc_topic = [doc_t for doc_t in lda[corpus_tfidf]] 45 | print('Document-Topic:') 46 | pprint(doc_topic) 
47 | -------------------------------------------------------------------------------- /Code_2022/class2-test1.py: -------------------------------------------------------------------------------- 1 | # Importing libraries 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | from sklearn.linear_model import LinearRegression 6 | 7 | # driver code 8 | # Create dataset 9 | 10 | X = np.array([[1], [2], [3], [4], [5], [6], [7]]) 11 | Y = np.array([45000, 50000, 60000, 80000, 110000, 150000, 200000]) 12 | 13 | # Model training 14 | 15 | model = LinearRegression() 16 | model.fit(X, Y) 17 | 18 | # Prediction 19 | Y_pred = model.predict(X) 20 | 21 | print(model.coef_, model.intercept_) 22 | # Visualization 23 | plt.scatter(X, Y, color='blue') 24 | plt.plot(X, Y_pred, color='orange') 25 | plt.title('X vs Y') 26 | plt.xlabel('X') 27 | plt.ylabel('Y') 28 | plt.show() -------------------------------------------------------------------------------- /Code_2022/class2-test2.py: -------------------------------------------------------------------------------- 1 | from sklearn import linear_model 2 | from sklearn.linear_model import LinearRegression 3 | 4 | reg = linear_model.LinearRegression() 5 | reg.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) 6 | print(reg.coef_) 7 | 8 | from sklearn.preprocessing import PolynomialFeatures 9 | from sklearn.pipeline import Pipeline 10 | 11 | polynomial_features = PolynomialFeatures(degree=3, 12 | include_bias=False) 13 | linear_regression = LinearRegression() 14 | pipeline = Pipeline([("polynomial_features", polynomial_features), 15 | ("linear_regression", linear_regression)]) 16 | pipeline.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) 17 | print(linear_regression.coef_, linear_regression.intercept_) 18 | -------------------------------------------------------------------------------- /Code_2022/class2-test3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.model_selection import cross_val_score 7 | 8 | 9 | def true_fun(X): 10 | return np.cos(1.5 * np.pi * X) 11 | 12 | np.random.seed(0) 13 | 14 | n_samples = 30 15 | degrees = [1, 4, 15, 17] 16 | 17 | X = np.sort(np.random.rand(n_samples)) 18 | y = true_fun(X) + np.random.randn(n_samples) * 0.1 # 加入随机噪声 19 | 20 | plt.figure(figsize=(14, 5)) 21 | for i in range(len(degrees)): 22 | ax = plt.subplot(1, len(degrees), i + 1) # 确认行列 23 | plt.setp(ax, xticks=(), yticks=()) 24 | 25 | polynomial_features = PolynomialFeatures(degree=degrees[i], 26 | include_bias=False) 27 | # 建模,组装,拟合 28 | linear_regression = LinearRegression() 29 | pipeline = Pipeline([("polynomial_features", polynomial_features), 30 | ("linear_regression", linear_regression)]) 31 | pipeline.fit(X[:, np.newaxis], y) 32 | 33 | # Evaluate the models using crossvalidation 评分 34 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 35 | scoring="neg_mean_squared_error", cv=10) 36 | 37 | X_test = np.linspace(0, 1, 100) 38 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 39 | plt.plot(X_test, true_fun(X_test), label="True function") 40 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 41 | plt.xlabel("x") 42 | plt.ylabel("y") 43 | plt.xlim((0, 1)) 44 | plt.ylim((-2, 2)) 45 | plt.legend(loc="best") 46 | plt.title("Degree {}\nMSE = {:.2e}(+/- 
{:.2e})".format( 47 | degrees[i], -scores.mean(), scores.std())) 48 | plt.show() 49 | 50 | print(X) 51 | print(y) -------------------------------------------------------------------------------- /Code_2022/class2-test4.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.model_selection import cross_val_score 7 | 8 | 9 | def true_fun(X): 10 | return np.cos(1.5 * np.pi * X) 11 | 12 | 13 | np.random.seed(0) 14 | 15 | n_samples = 30 16 | degrees = [1, 4, 15] 17 | 18 | X = np.sort(np.random.rand(n_samples)) 19 | y = true_fun(X) + np.random.randn(n_samples) * 0.1 # 加入随机噪声 20 | 21 | plt.figure(figsize=(14, 5)) 22 | for i in range(len(degrees)): 23 | ax = plt.subplot(1, len(degrees), i + 1) # 确认行列 24 | plt.setp(ax, xticks=(), yticks=()) 25 | 26 | polynomial_features = PolynomialFeatures(degree=degrees[i], 27 | include_bias=False) 28 | # 建模,组装,拟合 29 | linear_regression = LinearRegression() 30 | pipeline = Pipeline([("polynomial_features", polynomial_features), 31 | ("linear_regression", linear_regression)]) 32 | pipeline.fit(X[:, np.newaxis], y) 33 | 34 | # Evaluate the models using crossvalidation 评分 35 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 36 | scoring="neg_mean_squared_error", cv=10) 37 | 38 | X_test = np.linspace(0, 1, 100) 39 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 40 | plt.plot(X_test, true_fun(X_test), label="True function") 41 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 42 | plt.xlabel("x") 43 | plt.ylabel("y") 44 | plt.xlim((0, 1)) 45 | plt.ylim((-2, 2)) 46 | plt.legend(loc="best") 47 | plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format( 48 | degrees[i], -scores.mean(), scores.std())) 49 | plt.show() 50 | 51 | print(X) 52 | print(y) 53 | 54 | from sklearn.linear_model import Lasso 55 | from sklearn.linear_model import Ridge 56 | 57 | # pipeline = Ridge(alpha = 0.5) 58 | # pipeline.fit(X[:, np.newaxis], y) 59 | from sklearn.model_selection import cross_val_score 60 | from sklearn.pipeline import Pipeline 61 | from sklearn.preprocessing import PolynomialFeatures 62 | 63 | polynomial_features = PolynomialFeatures(degree=15, # 加入岭回归,避免15次时过拟合 64 | include_bias=False) 65 | linear_regression = Ridge(alpha=0.01) 66 | 67 | pipeline = Pipeline([("polynomial_features", polynomial_features), 68 | ("linear_regression", linear_regression)]) 69 | pipeline.fit(X[:, np.newaxis], y) 70 | 71 | # Evaluate the models using crossvalidation 72 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 73 | scoring="neg_mean_squared_error", cv=10) 74 | 75 | X_test = np.linspace(0, 1, 100) 76 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 77 | plt.plot(X_test, true_fun(X_test), label="True function") 78 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 79 | plt.xlabel("x") 80 | plt.ylabel("y") 81 | plt.xlim((0, 1)) 82 | plt.ylim((-2, 2)) 83 | plt.legend(loc="best") 84 | plt.show() -------------------------------------------------------------------------------- /Code_2022/class2-test5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from 
sklearn.linear_model import LinearRegression 6 | from sklearn.model_selection import cross_val_score 7 | 8 | 9 | def true_fun(X): 10 | return np.cos(1.5 * np.pi * X) 11 | 12 | 13 | np.random.seed(0) 14 | 15 | n_samples = 30 16 | degrees = [1, 4, 15] 17 | 18 | X = np.sort(np.random.rand(n_samples)) 19 | y = true_fun(X) + np.random.randn(n_samples) * 0.1 # 加入随机噪声 20 | 21 | plt.figure(figsize=(14, 5)) 22 | for i in range(len(degrees)): 23 | ax = plt.subplot(1, len(degrees), i + 1) # 确认行列 24 | plt.setp(ax, xticks=(), yticks=()) 25 | 26 | polynomial_features = PolynomialFeatures(degree=degrees[i], 27 | include_bias=False) 28 | # 建模,组装,拟合 29 | linear_regression = LinearRegression() 30 | pipeline = Pipeline([("polynomial_features", polynomial_features), 31 | ("linear_regression", linear_regression)]) 32 | pipeline.fit(X[:, np.newaxis], y) 33 | 34 | # Evaluate the models using crossvalidation 评分 35 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 36 | scoring="neg_mean_squared_error", cv=10) 37 | 38 | X_test = np.linspace(0, 1, 100) 39 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 40 | plt.plot(X_test, true_fun(X_test), label="True function") 41 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 42 | plt.xlabel("x") 43 | plt.ylabel("y") 44 | plt.xlim((0, 1)) 45 | plt.ylim((-2, 2)) 46 | plt.legend(loc="best") 47 | plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format( 48 | degrees[i], -scores.mean(), scores.std())) 49 | plt.show() 50 | 51 | print(X) 52 | print(y) 53 | 54 | from sklearn.linear_model import Lasso 55 | from sklearn.linear_model import Ridge 56 | 57 | # pipeline = Ridge(alpha = 0.5) 58 | # pipeline.fit(X[:, np.newaxis], y) 59 | from sklearn.model_selection import cross_val_score 60 | from sklearn.pipeline import Pipeline 61 | from sklearn.preprocessing import PolynomialFeatures 62 | 63 | polynomial_features = PolynomialFeatures(degree=15, # 加入岭回归,避免15次时过拟合 64 | include_bias=False) 65 | # linear_regression = Ridge(alpha = 0.01) # 替换 66 | linear_regression = Lasso(alpha=0.01) 67 | 68 | pipeline = Pipeline([("polynomial_features", polynomial_features), 69 | ("linear_regression", linear_regression)]) 70 | pipeline.fit(X[:, np.newaxis], y) 71 | 72 | # Evaluate the models using crossvalidation 73 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 74 | scoring="neg_mean_squared_error", cv=10) 75 | 76 | X_test = np.linspace(0, 1, 100) 77 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 78 | plt.plot(X_test, true_fun(X_test), label="True function") 79 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 80 | plt.xlabel("x") 81 | plt.ylabel("y") 82 | plt.xlim((0, 1)) 83 | plt.ylim((-2, 2)) 84 | plt.legend(loc="best") 85 | plt.show() 86 | -------------------------------------------------------------------------------- /Code_2022/class3-test1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | 6 | # load data 7 | iris = load_iris() 8 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 9 | df['label'] = iris.target 10 | 11 | df.head(5) 12 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 13 | df.label.value_counts() 14 | 15 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 16 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 17 | plt.xlabel('sepal length') 18 
| plt.ylabel('sepal width') 19 | plt.legend() 20 | plt.show() 21 | 22 | data = np.array(df.iloc[:100, [0, 1, -1]]) 23 | print(data[:10, :]) 24 | X, y = data[:, :-1], data[:, -1] 25 | print(X[:10, :]) 26 | y = np.array([1 if i == 1 else -1 for i in y]) 27 | 28 | 29 | class PLA: 30 | def __init__(self, max_iter=1000, shuffle=False): 31 | self.b = 0 32 | self.lr = 0.1 33 | self.max_iter = max_iter 34 | self.iter = 0 35 | self.shuffle = shuffle 36 | 37 | def sign(self, x, w, b): 38 | return np.dot(x, w) + b 39 | 40 | def fit(self, X, y): 41 | N, M = X.shape 42 | self.w = np.ones(M) 43 | for n in range(self.max_iter): 44 | self.iter = n 45 | wrong_items = 0 46 | if self.shuffle: # 每次迭代,是否打乱 47 | idx = np.random.permutation(range(N)) 48 | X, y = X[idx], y[idx] 49 | for i in range(N): 50 | if y[i] * self.sign(X[i], self.w, self.b) <= 0: 51 | self.w += self.lr * np.dot(y[i], X[i]) 52 | self.b += self.lr * y[i] 53 | wrong_items += 1 54 | if wrong_items == 0: 55 | print("finished at iters: {}, w: {}, b: {}".format(self.iter, self.w, self.b)) 56 | return 57 | print("finished for reaching the max_iter: {}, w: {}, b: {}".format(self.max_iter, self.w, self.b)) 58 | perceptron1 = PLA() 59 | perceptron1.fit(X, y) 60 | 61 | 62 | def plot(model, tilte): 63 | x_points = np.linspace(4, 7, 10) 64 | y_ = -(model.w[0] * x_points + model.b) / model.w[1] 65 | plt.plot(x_points, y_) 66 | print(y_) 67 | 68 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='-1') 69 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 70 | plt.xlabel('sepal length') 71 | plt.ylabel('sepal width') 72 | plt.title(tilte) 73 | plt.legend() 74 | 75 | 76 | perceptron1 = PLA() 77 | perceptron1.fit(X, y) 78 | plot(perceptron1, 'PLA_dual') 79 | plt.show() 80 | 81 | #################################################### 82 | from sklearn.linear_model import Perceptron 83 | from sklearn.model_selection import train_test_split 84 | 85 | # import numpy as np 86 | # import matplotlib.pyplot as plt 87 | iris = load_iris() 88 | X = iris.data 89 | Y = iris.target 90 | 91 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 92 | 93 | # df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 94 | df['label'] = iris.target 95 | 96 | # 97 | data = np.array(df.iloc[:100, [0, 1, -1]]) 98 | 99 | x, y = data[:, :-1], data[:, -1] 100 | 101 | # print(data) 102 | X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.9) 103 | 104 | clf = Perceptron(tol=1e-3, random_state=0, max_iter=1000) 105 | 106 | clf.fit(X_train, y_train) 107 | 108 | print(clf.coef_) 109 | 110 | print(clf.intercept_) 111 | 112 | x_ponits = np.arange(4, 8) 113 | y_ = -(clf.coef_[0][0] * x_ponits + clf.intercept_) / clf.coef_[0][1] 114 | plt.plot(x_ponits, y_) 115 | 116 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='0') 117 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 118 | plt.xlabel('sepal length') 119 | plt.ylabel('sepal width') 120 | plt.legend() 121 | plt.show() 122 | 123 | ################################################### 124 | from sklearn.metrics import plot_confusion_matrix 125 | 126 | disp = plot_confusion_matrix(clf, X_train, y_train) 127 | disp.figure_.suptitle("Confusion Matrix") 128 | print("Confusion matrix:\n%s" % disp.confusion_matrix) 129 | ################################################### 130 | 131 | from sklearn.metrics import plot_precision_recall_curve 132 | 133 | pr = plot_precision_recall_curve(clf, X_test, y_test) 134 | from 
sklearn.metrics import plot_roc_curve 135 | 136 | roc = plot_roc_curve(clf, X_test, y_test) 137 | -------------------------------------------------------------------------------- /Code_2022/class4-test1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | 6 | # load data 7 | iris = load_iris() 8 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 9 | df['label'] = iris.target 10 | 11 | df.head(5) 12 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 13 | df.label.value_counts() 14 | 15 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 16 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 17 | plt.xlabel('sepal length') 18 | plt.ylabel('sepal width') 19 | plt.legend() 20 | plt.show() 21 | 22 | data = np.array(df.iloc[:100, [0, 1, -1]]) 23 | print(data[:10, :]) 24 | X, y = data[:, :-1], data[:, -1] 25 | print(X[:10, :]) 26 | y = np.array([1 if i == 1 else -1 for i in y]) 27 | 28 | 29 | class PLA_dual: 30 | def __init__(self, max_iter=1000): 31 | self.b = 0 32 | self.lr = 0.1 33 | self.max_iter = max_iter 34 | self.iter = 0 35 | 36 | def cal_w(self, X): 37 | w = 0 38 | for i in range(len(self.alpha)): 39 | w += self.alpha[i] * y[i] * X[i] 40 | return w 41 | 42 | def gram_matrix(self, X): 43 | return np.dot(X, X.T) 44 | 45 | def fit(self, X, y): 46 | N, M = X.shape 47 | self.alpha = np.zeros(N) 48 | gram = self.gram_matrix(X) 49 | for n in range(self.max_iter): 50 | self.iter = n 51 | wrong_items = 0 52 | for i in range(N): 53 | tmp = 0 54 | for j in range(N): 55 | tmp += self.alpha[j] * y[j] * gram[i, j] 56 | tmp += self.b 57 | if y[i] * tmp <= 0: 58 | self.alpha[i] += self.lr 59 | self.b += self.lr * y[i] 60 | wrong_items += 1 61 | if wrong_items == 0: 62 | self.w = self.cal_w(X) 63 | print("finished at iters: {}, w: {}, b: {}".format(self.iter, self.w, self.b)) 64 | return 65 | self.w = self.cal_w(X) 66 | print("finished for reaching the max_iter: {}, w: {}, b: {}".format(self.max_iter, self.w, self.b)) 67 | return 68 | 69 | 70 | perceptron3 = PLA_dual() 71 | perceptron3.fit(X, y) 72 | 73 | 74 | def plot(model, tilte): 75 | x_points = np.linspace(4, 7, 10) 76 | y_ = -(model.w[0] * x_points + model.b) / model.w[1] 77 | plt.plot(x_points, y_) 78 | print(y_) 79 | 80 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='-1') 81 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 82 | plt.xlabel('sepal length') 83 | plt.ylabel('sepal width') 84 | plt.title(tilte) 85 | plt.legend() 86 | plt.show() 87 | 88 | 89 | plot(perceptron3, 'PLA_dual') 90 | 91 | from sklearn.datasets import load_iris 92 | from sklearn.model_selection import train_test_split 93 | from sklearn.neural_network import MLPClassifier 94 | 95 | iris = load_iris() 96 | X = iris.data 97 | Y = iris.target 98 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3) 99 | 100 | # clf=MLPClassifier(activation='logistic',max_iter=1000)# 构造分类器实例 101 | clf = MLPClassifier(solver='sgd', alpha=1e-5, 102 | hidden_layer_sizes=(20, 20, 20), random_state=1, max_iter=10000) 103 | # 4*20*20*20个参数 104 | 105 | clf.fit(X_train, y_train) # 训练分类器 106 | print(clf.score(X_test, y_test)) # 查看在训练集上的评价预测精度 107 | -------------------------------------------------------------------------------- /Code_2022/class4-test2.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | r = 1 6 | 7 | linestyle = ['b-', 'k-', 'm-', 'r-', 'y-'] 8 | p_values = (0.25, 0.5, 1, 2, 4, 100) 9 | 10 | for i, p in enumerate(p_values): 11 | x = np.arange(-r, r + 1e-5, 1 / 128.0) 12 | y = (r ** p - (abs(x) ** p)) ** (1.0 / p) 13 | plt.plot(x, y, x, -y) 14 | 15 | ax = plt.gca() 16 | ax.set_aspect(1) 17 | plt.show() 18 | 19 | ##### 20 | X = [[0], [1], [2], [3]] 21 | y = [0, 0, 1, 1] 22 | from sklearn.neighbors import KNeighborsClassifier 23 | neigh = KNeighborsClassifier(n_neighbors=3) 24 | neigh.fit(X, y) 25 | 26 | print(neigh.predict([[1.1]])) 27 | print(neigh.predict_proba([[0.9]])) 28 | -------------------------------------------------------------------------------- /Code_2022/class4-test3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.colors import ListedColormap 4 | from sklearn import neighbors, datasets 5 | 6 | irisData = datasets.load_iris() 7 | 8 | X = irisData.data[:, :4] 9 | y = irisData.target 10 | 11 | weights = 'uniform' 12 | n_neighbors=15 13 | # we create an instance of Neighbours Classifier and fit the data. 14 | classifier = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) 15 | classifier.fit(X, y) 16 | 17 | print('KNN classifier accuracy - "%s" - %.3f' % (weights, classifier.score(X, y))) 18 | -------------------------------------------------------------------------------- /Code_2022/class5-test1.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | class NaiveBayes: 5 | def __init__(self): 6 | self.model = None 7 | 8 | # 数学期望 9 | @staticmethod 10 | def mean(X): 11 | return sum(X) / float(len(X)) 12 | 13 | # 标准差(方差) 14 | def stdev(self, X): 15 | avg = self.mean(X) 16 | return math.sqrt(sum([pow(x - avg, 2) for x in X]) / float(len(X))) 17 | 18 | # 概率密度函数 19 | def gaussian_probability(self, x, mean, stdev): 20 | exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2)))) 21 | return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent 22 | 23 | # 处理X_train 24 | def summarize(self, train_data): 25 | summaries = [(self.mean(i), self.stdev(i)) for i in zip(*train_data)] 26 | return summaries 27 | 28 | # 分类别求出数学期望和标准差 29 | def fit(self, X, y): 30 | labels = list(set(y)) 31 | data = {label: [] for label in labels} 32 | for f, label in zip(X, y): 33 | data[label].append(f) 34 | self.model = {label: self.summarize(value) for label, value in data.items()} 35 | return 'GaussianNB train done!' 
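    # After fit(), self.model maps each class label to a list of (mean, stdev) pairs, one pair per feature.
    # predict() below scores a sample by multiplying the per-feature Gaussian densities and returns the label
    # with the largest product, i.e. Gaussian naive Bayes with the class prior P(y) left out (treated as uniform).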
36 | 37 | # 计算概率 38 | def calculate_probabilities(self, input_data): 39 | # summaries:{0.0: [(5.0, 0.37),(3.42, 0.40)], 1.0: [(5.8, 0.449),(2.7, 0.27)]} 40 | # input_data:[1.1, 2.2] 41 | probabilities = {} 42 | for label, value in self.model.items(): 43 | probabilities[label] = 1 44 | for i in range(len(value)): 45 | mean, stdev = value[i] 46 | probabilities[label] *= self.gaussian_probability(input_data[i], mean, stdev) 47 | return probabilities 48 | 49 | # 类别 50 | def predict(self, X_test): 51 | # {0.0: 2.9680340789325763e-27, 1.0: 3.5749783019849535e-26} 52 | label = sorted(self.calculate_probabilities(X_test).items(), key=lambda x: x[-1])[-1][0] 53 | return label 54 | 55 | def score(self, X_test, y_test): 56 | right = 0 57 | for X, y in zip(X_test, y_test): 58 | label = self.predict(X) 59 | if label == y: 60 | right += 1 61 | 62 | return right / float(len(X_test)) 63 | 64 | 65 | import numpy as np 66 | import pandas as pd 67 | 68 | import matplotlib.pyplot as plt 69 | # %matplotlib inline 70 | 71 | from sklearn.datasets import load_iris 72 | from sklearn.model_selection import train_test_split 73 | 74 | iris = load_iris() 75 | X = iris.data 76 | Y = iris.target 77 | ''' 78 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 79 | df['label'] = iris.target 80 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 81 | data = np.array(df.iloc[:100, :]) 82 | # print(data) 83 | ''' 84 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3) 85 | model = NaiveBayes() 86 | model.fit(X_train, y_train) 87 | print(model.score(X_test, y_test)) 88 | print(model.predict([4.4, 3.2, 1.3, 0.2])) 89 | 90 | from sklearn.naive_bayes import GaussianNB 91 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 92 | Y = np.array([1, 1, 1, 2, 2, 2]) 93 | Y = np.array(['a', 'a', 'a', 'b', 'b', 'b']) 94 | 95 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 96 | clf.fit(X, Y) 97 | print(clf.predict([[-0.8, -1]])) 98 | 99 | from sklearn import datasets 100 | iris = datasets.load_iris() 101 | 102 | from sklearn.naive_bayes import MultinomialNB 103 | clf = MultinomialNB() 104 | clf = clf.fit(iris.data, iris.target) 105 | y_pred=clf.predict(iris.data) 106 | print("多项分布朴素贝叶斯,样本总数: %d 错误样本数 : %d" % (iris.data.shape[0],(iris.target != y_pred).sum())) -------------------------------------------------------------------------------- /Code_2022/class5-test2.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn.naive_bayes import GaussianNB 3 | #from sklearn.cross_validation import train_test_split 4 | from sklearn.model_selection import train_test_split 5 | 6 | from sklearn.naive_bayes import BernoulliNB, MultinomialNB # 伯努利模型和多项式模型 7 | 8 | iris = load_iris() 9 | X = iris.data 10 | Y = iris.target 11 | 12 | X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0) 13 | 14 | nb = GaussianNB() 15 | nb.fit(X_train, y_train) 16 | 17 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 18 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 19 | 20 | print("Naive Gausian bayes score (sklearn): " +str(nb.score(X_test, y_test))) 21 | 22 | nb = MultinomialNB() 23 | nb.fit(X_train, y_train) 24 | 25 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 26 | print("Number of mislabeled points out of a total %d points : %d"% 
(iris.data.shape[0],(iris.target != y_pred).sum())) 27 | 28 | print("Multinomial naive Bayes score (sklearn): " +str(nb.score(X_test, y_test))) 29 | 30 | nb = BernoulliNB() 31 | nb.fit(X_train, y_train) 32 | 33 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 34 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 35 | 36 | print("Bernoulli naive Bayes score (sklearn): " +str(nb.score(X_test, y_test))) 37 | 38 | print(nb) -------------------------------------------------------------------------------- /Code_2022/class6-test1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | # %matplotlib inline 4 | 5 | import math 6 | 7 | p = np.linspace(0.01, 1, num=50, endpoint=False) 8 | 9 | entropy = -p * np.log2(p) - (1 - p) * np.log2(1 - p) 10 | 11 | # plt.plot(b) 12 | plt.plot(p, entropy) 13 | plt.grid(True) 14 | plt.xlabel('p') 15 | plt.ylabel('Entropy(bit)') 16 | # plt.plot(p,gini) 17 | 18 | max_en = 2 * (-(1 / 2) * np.log2(1 / 2)) 19 | print(max_en) 20 | 21 | d = np.linspace(0.01, 100, num=50, endpoint=False) 22 | ld = np.log2(d) 23 | plt.show() 24 | -------------------------------------------------------------------------------- /Code_2022/class6-test2.py: -------------------------------------------------------------------------------- 1 | def create_data(): 2 | datasets = [[1, 'Sunny', 'Hot', 'High', 'Weak', 'No'], 3 | [2, 'Sunny', 'Hot', 'High', 'Strong', 'No'], 4 | [3, 'Overcast', 'Hot', 'High', 'Weak', 'Yes'], 5 | [4, 'Rainy', 'Mild', 'High', 'Weak', 'Yes'], 6 | [5, 'Rainy', 'Cool', 'Normal', 'Weak', 'Yes'], 7 | [6, 'Rainy', 'Cool', 'Normal', 'Strong', 'No'], 8 | [7, 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'], 9 | [8, 'Sunny', 'Mild', 'High', 'Weak', 'No'], 10 | [9, 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'], 11 | [10, 'Rainy', 'Mild', 'Normal', 'Weak', 'Yes'], 12 | [11, 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'], 13 | [12, 'Overcast', 'Mild', 'High', 'Strong', 'Yes'], 14 | [13, 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'], 15 | [14, 'Rainy', 'Mild', 'High', 'Strong', 'No'], 16 | ] 17 | 18 | labels = ['Day', 'OutLook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis'] 19 | return datasets, labels 20 | # return the dataset and the name of each column 21 | 22 | 23 | import pandas as pd 24 | from math import log2 25 | 26 | datasets, labels = create_data() 27 | 28 | train_data = pd.DataFrame(datasets, columns=labels) 29 | 30 | print(train_data) 31 | 32 | # entropy of PlayTennis within each Outlook branch 33 | En_Sunny = -(2 / 5) * log2(2 / 5) - (3 / 5) * log2(3 / 5) 34 | En_Overcast = -(4 / 4) * log2(4 / 4) 35 | En_Rainy = -(3 / 5) * log2(3 / 5) - (2 / 5) * log2(2 / 5) 36 | 37 | # conditional entropy of PlayTennis given Outlook 38 | En_Outlook = 5 / 14 * En_Sunny + 4 / 14 * En_Overcast + 5 / 14 * En_Rainy 39 | 40 | print(En_Outlook) 41 | 42 | from sklearn import tree 43 | X = [[0, 0], [1, 1]] 44 | Y = [0, 1] 45 | clf = tree.DecisionTreeClassifier() 46 | clf = clf.fit(X, Y) 47 | 48 | print(clf.predict([[2., 2.]])) 49 | print(clf.predict_proba([[2., 2.]])) -------------------------------------------------------------------------------- /Code_2022/class7-test1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | # %matplotlib inline 4 | 5 | import math 6 | 7 | def sigmoid(x): 8 | return 1 / (1 + np.exp(-x)) 9 | 10 | 11 | x = np.arange(-10, 10., 0.1) 12 | y = sigmoid(x) 13 | 14 | plt.plot(x, y) 15 | plt.grid(True) 16 | plt.show() 17 |
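# Supplementary sketch (toy numbers, not from the course files): the sigmoid above maps a linear
# score w.x + b to a probability P(y=1|x); logistic regression, implemented from scratch in
# class7-test2.py, fits w and b by gradient ascent on the log-likelihood with the per-sample update
#   w <- w + lr * (y - P(y=1|x)) * x,   b <- b + lr * (y - P(y=1|x))
w, b, lr = np.zeros(2), 0.0, 0.1
x_i, y_i = np.array([0.5, -1.0]), 1                        # one made-up training sample
p_i = 1 / (1 + np.exp(-(np.dot(w, x_i) + b)))              # predicted P(y=1|x_i), 0.5 at the start
w, b = w + lr * (y_i - p_i) * x_i, b + lr * (y_i - p_i)    # single gradient-ascent step
print(w, b)                                                # -> [ 0.025 -0.05 ] 0.05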
-------------------------------------------------------------------------------- /Code_2022/class7-test2.py: -------------------------------------------------------------------------------- 1 | from math import exp 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | # %matplotlib inline 6 | 7 | from sklearn.datasets import load_iris 8 | from sklearn.model_selection import train_test_split 9 | 10 | 11 | class LogisticReressionClassifier: 12 | def __init__(self, max_iter=200, learning_rate=0.01): 13 | self.max_iter = max_iter 14 | self.learning_rate = learning_rate 15 | 16 | def sigmoid(self, x): 17 | return 1 / (1 + exp(-x)) 18 | 19 | def data_matrix(self, X): 20 | data_mat = [] 21 | for d in X: 22 | data_mat.append([1.0, *d]) 23 | return data_mat 24 | 25 | def fit(self, X, y): 26 | # label = np.mat(y) 27 | data_mat = self.data_matrix(X) # m*n 28 | self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32) 29 | 30 | for iter_ in range(self.max_iter): 31 | for i in range(len(X)): 32 | result = self.sigmoid(np.dot(data_mat[i], self.weights)) 33 | error = y[i] - result 34 | self.weights += self.learning_rate * error * np.transpose([data_mat[i]]) 35 | print('LogisticRegression Model(learning_rate={},max_iter={})'.format(self.learning_rate, self.max_iter)) 36 | 37 | # def f(self, x): 38 | # return -(self.weights[0] + self.weights[1] * x) / self.weights[2] 39 | 40 | def score(self, X_test, y_test): 41 | right = 0 42 | X_test = self.data_matrix(X_test) 43 | for x, y in zip(X_test, y_test): 44 | result = np.dot(x, self.weights) 45 | if (result > 0 and y == 1) or (result < 0 and y == 0): 46 | right += 1 47 | return right / len(X_test) 48 | 49 | 50 | def create_data(): 51 | iris = load_iris() 52 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 53 | df['label'] = iris.target 54 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 55 | data = np.array(df.iloc[:100, [0, 1, -1]]) 56 | # print(data) 57 | return data[:, :2], data[:, -1] 58 | 59 | 60 | X, y = create_data() 61 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 62 | lr_clf = LogisticReressionClassifier() 63 | lr_clf.fit(X_train, y_train) 64 | print(lr_clf.score(X_test, y_test)) 65 | 66 | x_ponits = np.arange(4, 8) 67 | y_ = -(lr_clf.weights[1] * x_ponits + lr_clf.weights[0]) / lr_clf.weights[2] 68 | plt.plot(x_ponits, y_) 69 | 70 | # lr_clf.show_graph() 71 | plt.scatter(X[:50, 0], X[:50, 1], label='0') 72 | plt.scatter(X[50:, 0], X[50:, 1], label='1') 73 | plt.legend() 74 | plt.show() 75 | -------------------------------------------------------------------------------- /Code_2022/class7-test3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from matplotlib import pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.model_selection import train_test_split 7 | 8 | 9 | def create_data(): 10 | iris = load_iris() 11 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 12 | df['label'] = iris.target 13 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 14 | data = np.array(df.iloc[:100, [0, 1, -1]]) 15 | # print(data) 16 | return data[:, :2], data[:, -1] 17 | 18 | 19 | X, y = create_data() 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 21 | clf = LogisticRegression(max_iter=200, solver='liblinear') 22 | 
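# The liblinear solver fits a regularized two-class logistic regression on the 70/30 split below;
# the plotted line is the decision boundary coef_[0][0]*x0 + coef_[0][1]*x1 + intercept_ = 0,
# solved for x1 over x0 in [4, 8) and overlaid on the two iris classes.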
clf.fit(X_train, y_train) 23 | clf.score(X_test, y_test) 24 | print(clf.coef_, clf.intercept_) 25 | 26 | x_ponits = np.arange(4, 8) 27 | y_ = -(clf.coef_[0][0] * x_ponits + clf.intercept_) / clf.coef_[0][1] 28 | plt.plot(x_ponits, y_) 29 | 30 | plt.plot(X[:50, 0], X[:50, 1], 'bo', color='blue', label='0') 31 | plt.plot(X[50:, 0], X[50:, 1], 'bo', color='orange', label='1') 32 | plt.xlabel('sepal length') 33 | plt.ylabel('sepal width') 34 | plt.legend() 35 | plt.show() 36 | -------------------------------------------------------------------------------- /Code_2022/class7-test4.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn import datasets 5 | 6 | # import some data to play with 7 | iris = datasets.load_iris() 8 | X = iris.data[:, :2] # we only take the first two features. 9 | Y = iris.target 10 | 11 | logreg = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial') 12 | 13 | # Create an instance of Logistic Regression Classifier and fit the data. 14 | logreg.fit(X, Y) 15 | 16 | # Plot the decision boundary. For that, we will assign a color to each 17 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 18 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 19 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 20 | h = .02 # step size in the mesh 21 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 22 | Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()]) 23 | 24 | # Put the result into a color plo 25 | Z = Z.reshape(xx.shape) 26 | plt.figure(1, figsize=(4, 3)) 27 | plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired) 28 | 29 | # Plot also the training points 30 | plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired) 31 | plt.xlabel('Sepal length') 32 | plt.ylabel('Sepal width') 33 | 34 | plt.xlim(xx.min(), xx.max()) 35 | plt.ylim(yy.min(), yy.max()) 36 | plt.xticks(()) 37 | plt.yticks(()) 38 | plt.show() 39 | -------------------------------------------------------------------------------- /Code_2022/class8-test1.py: -------------------------------------------------------------------------------- 1 | # Example 1 2 | import numpy as np 3 | 4 | X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) 5 | y = np.array([1, 1, 2, 2]) 6 | 7 | from sklearn.svm import SVC 8 | 9 | # clf = SVC(gamma='auto') 10 | 11 | clf = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, 12 | decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear', 13 | max_iter=-1, probability=False, random_state=None, shrinking=True, 14 | tol=0.001, verbose=False) # 可以根据前面介绍的参数,做出相应改变观察结果变化 15 | 16 | clf.fit(X, y) 17 | print(clf.predict([[-0.8, -1]])) 18 | 19 | print(clf.support_vectors_) 20 | print(clf.dual_coef_, clf.coef_, clf.intercept_) 21 | -------------------------------------------------------------------------------- /Code_2022/class8-test2.py: -------------------------------------------------------------------------------- 1 | # Example 2 2 | from sklearn import svm 3 | from sklearn import datasets 4 | from sklearn.model_selection import train_test_split as ts 5 | 6 | # import our data 7 | iris = datasets.load_iris() 8 | X = iris.data 9 | y = iris.target 10 | 11 | # split the data to 7:3 12 | X_train, X_test, y_train, y_test = ts(X, y, test_size=0.3) 13 | 14 | # select different type of kernel function and compare the score 15 | 16 | # kernel = 'rbf' 17 | clf_rbf = 
svm.SVC(kernel='rbf', gamma='auto') 18 | clf_rbf.fit(X_train, y_train) 19 | score_rbf = clf_rbf.score(X_test, y_test) 20 | print("The score of rbf is : %f" % score_rbf) 21 | 22 | # kernel = 'linear' 23 | clf_linear = svm.SVC(kernel='linear', gamma='auto') 24 | clf_linear.fit(X_train, y_train) 25 | score_linear = clf_linear.score(X_test, y_test) 26 | print("The score of linear is : %f" % score_linear) 27 | 28 | # kernel = 'poly' 29 | clf_poly = svm.SVC(kernel='poly', gamma='auto') 30 | clf_poly.fit(X_train, y_train) 31 | score_poly = clf_poly.score(X_test, y_test) 32 | print("The score of poly is : %f" % score_poly) 33 | 34 | print(clf_linear.coef_, clf_linear.intercept_) 35 | 36 | # print(clf.predict([[4.9, 3., 1.4, 0.2]])) 37 | -------------------------------------------------------------------------------- /Code_2022/class8-test3.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm 2 | from sklearn.svm import SVR 3 | 4 | X = [[0, 0], [2, 2]] 5 | y = [0.5, 2.5] 6 | clf = svm.SVR() 7 | clf.fit(X, y) 8 | SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, 9 | gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True, 10 | tol=0.001, verbose=False) 11 | print(clf.predict([[1, 1]])) 12 | 13 | from sklearn.svm import LinearSVR 14 | 15 | regr = LinearSVR(random_state=0, tol=1e-5) 16 | regr.fit(X, y) 17 | print(regr.coef_) 18 | 19 | print(regr.intercept_) 20 | print(regr.predict([[1, 1]])) 21 | -------------------------------------------------------------------------------- /Code_2022/class8-test4.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import LinearSVR 2 | 3 | regr = LinearSVR(random_state=0, tol=1e-5) 4 | regr.fit(X, y) 5 | print(regr.coef_) 6 | 7 | print(regr.intercept_) 8 | print(regr.predict([[1, 1]])) 9 | -------------------------------------------------------------------------------- /Code_2022/class9-test1.py: -------------------------------------------------------------------------------- 1 | class AdaBoost: 2 | def __init__(self, n_estimators=50, learning_rate=1.0): 3 | self.clf_num = n_estimators 4 | self.learning_rate = learning_rate 5 | 6 | def init_args(self, datasets, labels): 7 | 8 | self.X = datasets 9 | self.Y = labels 10 | self.M, self.N = datasets.shape 11 | 12 | # 弱分类器数目和集合 13 | self.clf_sets = [] 14 | 15 | # 初始化weights 16 | self.weights = [1.0 / self.M] * self.M 17 | 18 | # G(x)系数 alpha 19 | self.alpha = [] 20 | 21 | def _G(self, features, labels, weights): 22 | m = len(features) 23 | error = 100000.0 # 无穷大 24 | best_v = 0.0 25 | # 单维features 26 | features_min = min(features) 27 | features_max = max(features) 28 | n_step = (features_max - features_min + self.learning_rate) // self.learning_rate 29 | # print('n_step:{}'.format(n_step)) 30 | direct, compare_array = None, None 31 | for i in range(1, int(n_step)): 32 | v = features_min + self.learning_rate * i 33 | 34 | if v not in features: 35 | # 误分类计算 36 | compare_array_positive = np.array([1 if features[k] > v else -1 for k in range(m)]) 37 | weight_error_positive = sum([weights[k] for k in range(m) if compare_array_positive[k] != labels[k]]) 38 | 39 | compare_array_nagetive = np.array([-1 if features[k] > v else 1 for k in range(m)]) 40 | weight_error_nagetive = sum([weights[k] for k in range(m) if compare_array_nagetive[k] != labels[k]]) 41 | 42 | if weight_error_positive < weight_error_nagetive: 43 | weight_error = weight_error_positive 44 | _compare_array = 
compare_array_positive 45 | direct = 'positive' 46 | else: 47 | weight_error = weight_error_nagetive 48 | _compare_array = compare_array_nagetive 49 | direct = 'nagetive' 50 | 51 | # print('v:{} error:{}'.format(v, weight_error)) 52 | if weight_error < error: 53 | error = weight_error 54 | compare_array = _compare_array 55 | best_v = v 56 | return best_v, direct, error, compare_array 57 | 58 | # 计算alpha 59 | def _alpha(self, error): 60 | return 0.5 * np.log((1 - error) / error) 61 | 62 | # 规范化因子 63 | def _Z(self, weights, a, clf): 64 | return sum([weights[i] * np.exp(-1 * a * self.Y[i] * clf[i]) for i in range(self.M)]) 65 | 66 | # 权值更新 67 | def _w(self, a, clf, Z): 68 | for i in range(self.M): 69 | self.weights[i] = self.weights[i] * np.exp(-1 * a * self.Y[i] * clf[i]) / Z 70 | 71 | # G(x)的线性组合 72 | def _f(self, alpha, clf_sets): 73 | pass 74 | 75 | def G(self, x, v, direct): 76 | if direct == 'positive': 77 | return 1 if x > v else -1 78 | else: 79 | return -1 if x > v else 1 80 | 81 | def fit(self, X, y): 82 | self.init_args(X, y) 83 | 84 | for epoch in range(self.clf_num): 85 | best_clf_error, best_v, clf_result = 100000, None, None 86 | # 根据特征维度, 选择误差最小的 87 | for j in range(self.N): 88 | features = self.X[:, j] 89 | # 分类阈值,分类误差,分类结果 90 | v, direct, error, compare_array = self._G(features, self.Y, self.weights) 91 | 92 | if error < best_clf_error: 93 | best_clf_error = error 94 | best_v = v 95 | final_direct = direct 96 | clf_result = compare_array 97 | axis = j 98 | 99 | # print('epoch:{}/{} feature:{} error:{} v:{}'.format(epoch, self.clf_num, j, error, best_v)) 100 | if best_clf_error == 0: 101 | break 102 | 103 | # 计算G(x)系数a 104 | a = self._alpha(best_clf_error) 105 | self.alpha.append(a) 106 | # 记录分类器 107 | self.clf_sets.append((axis, best_v, final_direct)) 108 | # 规范化因子 109 | Z = self._Z(self.weights, a, clf_result) 110 | # 权值更新 111 | self._w(a, clf_result, Z) 112 | 113 | # print('classifier:{}/{} error:{:.3f} v:{} direct:{} a:{:.5f}'.format(epoch+1, self.clf_num, error, best_v, final_direct, a)) 114 | # print('weight:{}'.format(self.weights)) 115 | # print('\n') 116 | 117 | def predict(self, feature): 118 | result = 0.0 119 | for i in range(len(self.clf_sets)): 120 | axis, clf_v, direct = self.clf_sets[i] 121 | f_input = feature[axis] 122 | result += self.alpha[i] * self.G(f_input, clf_v, direct) 123 | # sign 124 | return 1 if result > 0 else -1 125 | 126 | def score(self, X_test, y_test): 127 | right_count = 0 128 | for i in range(len(X_test)): 129 | feature = X_test[i] 130 | if self.predict(feature) == y_test[i]: 131 | right_count += 1 132 | 133 | return right_count / len(X_test) 134 | 135 | 136 | import numpy as np 137 | import pandas as pd 138 | from sklearn.datasets import load_iris 139 | from sklearn.model_selection import train_test_split 140 | import matplotlib.pyplot as plt 141 | 142 | 143 | # %matplotlib inline 144 | 145 | def create_data(): 146 | iris = load_iris() 147 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 148 | df['label'] = iris.target 149 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 150 | data = np.array(df.iloc[:100, [0, 1, -1]]) 151 | for i in range(len(data)): 152 | if data[i, -1] == 0: 153 | data[i, -1] = -1 154 | # print(data) 155 | return data[:, :2], data[:, -1] 156 | 157 | 158 | X, y = create_data() 159 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 160 | clf = AdaBoost(n_estimators=3, learning_rate=0.5) 161 | clf.fit(X_train, y_train) 162 | print(clf.score(X_test, 
y_test)) 163 | -------------------------------------------------------------------------------- /Code_2022/class9-test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.datasets import load_iris 4 | from sklearn.ensemble import BaggingClassifier 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.neighbors import KNeighborsClassifier 7 | 8 | 9 | def create_data(): 10 | iris = load_iris() 11 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 12 | df['label'] = iris.target 13 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 14 | data = np.array(df.iloc[:100, [0, 1, -1]]) 15 | for i in range(len(data)): 16 | if data[i, -1] == 0: 17 | data[i, -1] = -1 18 | # print(data) 19 | return data[:, :2], data[:, -1] 20 | 21 | 22 | X, y = create_data() 23 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 24 | # bagging 算法 25 | bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5) 26 | bagging.fit(X_train, y_train) 27 | 28 | in_score = bagging.score(X_train, y_train) 29 | out_score = bagging.score(X_test, y_test) 30 | print(in_score, out_score) 31 | 32 | # RandomForest 算法 33 | from sklearn.ensemble import RandomForestClassifier 34 | 35 | forest = RandomForestClassifier(n_estimators=300, max_depth=2, random_state=0) 36 | forest.fit(X_train, y_train) 37 | 38 | in_score = forest.score(X_train, y_train) 39 | out_score = forest.score(X_test, y_test) 40 | print(in_score, out_score) 41 | 42 | # Adaboost 算法 43 | 44 | from sklearn.ensemble import AdaBoostClassifier 45 | 46 | clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) 47 | clf.fit(X_train, y_train) 48 | in_score = clf.score(X_train, y_train) 49 | out_score = clf.score(X_test, y_test) 50 | print(in_score, out_score) 51 | 52 | # 投票分类器 53 | from sklearn.linear_model import LogisticRegression 54 | from sklearn.naive_bayes import GaussianNB 55 | from sklearn.ensemble import RandomForestClassifier 56 | 57 | from sklearn.ensemble import VotingClassifier 58 | 59 | import numpy as np 60 | 61 | clf1 = LogisticRegression(multi_class='multinomial', random_state=1) 62 | clf2 = RandomForestClassifier(n_estimators=50, random_state=1) 63 | clf3 = GaussianNB() 64 | 65 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 66 | y = np.array([1, 1, 1, 2, 2, 2]) 67 | eclf1 = VotingClassifier(estimators=[ 68 | ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') 69 | eclf1 = eclf1.fit(X_train, y_train) 70 | # print(eclf1.predict(X)) 71 | in_score = eclf1.score(X_train, y_train) 72 | out_score = eclf1.score(X_test, y_test) 73 | print(in_score, out_score) 74 | 75 | 76 | from sklearn.datasets import load_iris 77 | from sklearn.ensemble import RandomForestClassifier 78 | from sklearn.svm import LinearSVC 79 | from sklearn.linear_model import LogisticRegression 80 | from sklearn.preprocessing import StandardScaler 81 | from sklearn.pipeline import make_pipeline 82 | from sklearn.ensemble import StackingClassifier 83 | 84 | 85 | X, y = load_iris(return_X_y=True) 86 | estimators = [ 87 | ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), 88 | ('svr', make_pipeline(StandardScaler(), 89 | LinearSVC(random_state=42)))] 90 | clf = StackingClassifier( 91 | estimators=estimators, final_estimator=LogisticRegression() 92 | ) 93 | 94 | from sklearn.model_selection import train_test_split 95 | X_train, X_test, y_train, y_test 
= train_test_split( 96 | X, y, stratify=y, random_state=42 97 | ) 98 | clf.fit(X_train, y_train) 99 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code_2022/class9-test3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib.gridspec as gridspec 4 | import itertools 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.naive_bayes import GaussianNB 7 | from sklearn.neighbors import KNeighborsClassifier 8 | from sklearn.svm import SVC 9 | from sklearn.ensemble import RandomForestClassifier, StackingClassifier 10 | 11 | from mlxtend.classifier import EnsembleVoteClassifier 12 | from mlxtend.data import iris_data 13 | from mlxtend.plotting import plot_decision_regions 14 | 15 | # Initializing Classifiers 16 | clf1 = LogisticRegression(random_state=0) 17 | clf2 = RandomForestClassifier(random_state=0) 18 | clf3 = SVC(random_state=0, probability=True) 19 | eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], 20 | weights=[2, 1, 1], voting='soft') 21 | 22 | # clf1 = KNeighborsClassifier(n_neighbors=1) 23 | # clf2 = RandomForestClassifier(random_state=1) 24 | # clf3 = GaussianNB() 25 | # lr = LogisticRegression() 26 | # sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 27 | # meta_classifier=lr) 28 | 29 | # Loading some example data 30 | X, y = iris_data() 31 | X = X[:, [0, 2]] 32 | 33 | # Plotting Decision Regions 34 | 35 | gs = gridspec.GridSpec(2, 2) 36 | fig = plt.figure(figsize=(10, 8)) 37 | 38 | labels = ['Logistic Regression', 39 | 'Random Forest', 40 | 'RBF kernel SVM', 41 | 'Ensemble'] 42 | 43 | for clf, lab, grd in zip([clf1, clf2, clf3, eclf], 44 | labels, 45 | itertools.product([0, 1], 46 | repeat=2)): 47 | clf.fit(X, y) 48 | ax = plt.subplot(gs[grd[0], grd[1]]) 49 | fig = plot_decision_regions(X=X, y=y, 50 | clf=clf, legend=2) 51 | plt.title(lab) 52 | 53 | plt.show() 54 | -------------------------------------------------------------------------------- /Code_2022/readme: -------------------------------------------------------------------------------- 1 | 2022 春季学期代码整理。 - 2023.2.8 上传 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17th, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Statistical-Learning-Slides-Code 2 | 3 | ## 2025 - Slides and reference code for the course 《数据挖掘技术》 (Data Mining Techniques) 4 | 5 | Last updated: 2025.02.25 6 | 7 | Update history: 8 | - 2024.02.26 9 | - 2023.02.08 10 | 11 | #### Textbook: 《机器学习方法》 (Machine Learning Methods), Li Hang 12 | 13 | Previous textbooks: 14 | - 《机器学习方法》, Li Hang 15 | - 《统计学习方法》 (Statistical Learning Methods, 2nd edition), Li Hang 16 | - 《统计学习方法》 (Statistical Learning Methods, 1st edition), Li Hang 17 | 18 | ## Contents 19 | 20 | #### Code 21 | - Code: reference code organized in 2020 22 | - Code_2022: code organized in 2022 23 | - Thanks to the student with ID 20195298 24 | 25 | #### Slides 26 | - CH00.pdf (2022.02.20 11:28) 27 | - CH01 Statistical Learning.pdf 28 | - CH02 Perceptron.pdf 29 | - CH03 KNN.pdf 30 | - CH04 NaiveBayes.pdf 31 | - CH05 DecisionTree.pdf (2022.03.26 22:04) 32 | - CH06 LogicRegression and Maximum Entropy Model.pdf (2022.03.26 22:04) 33 | - CH07 SVM.pdf (2022.03.26 22:04) 34 | - CH08 Boosting.pdf (2022.03.26 22:04) 35 | - CH09 EM.pdf (2022.03.26 22:04) 36 | - CH10 Hidden Markov Model.pdf (2022.03.26 22:04) 37 | - CH15 SVD.pdf (2022.03.26 22:04) 38 | - CH16 PCA.pdf (2022.03.26 22:04) 39 | - CH17 LSA.pdf (2022.03.26 22:04) 40 | - CH21 PageRank (2021.05.02 16:52) 41 | - CH22 Transformer.pdf (2025.02.25) 42 | - CHX0 Summary (2021.05.17 16:21) 43 | - CHX5 NN-CNN.pdf (2025.02.25) 44 | 45 | ------- 46 | ## History versions: 47 | 48 | - CH00.pdf (2021.03.23 16:03) 49 | - CH01 Statistical Learning.pdf 50 | - CH02 Perceptron.pdf 51 | - CH03 KNN.pdf 52 | - CH04 NaiveBayes.pdf 53 | - CH05 DecisionTree.pdf 54 | - CH06 LogicRegression and Maximum Entropy Model.pdf 55 | - CH07 SVM.pdf 56 | - CH08 Boosting.pdf (2021.03.26 22:04) 57 | - CH09 EM.pdf (2021.03.30 16:47) 58 | - CH10 Hidden Markov Model.pdf (2021.04.06 16:14) 59 | - CH15 SVD.pdf (2021.04.12 12:09) 60 | - CH16 PCA.pdf (2021.04.16 10:00) 61 | - CH17 LSA.pdf (2021.04.25 14:32) 62 | 63 | ...... End ......
64 | -------------------------------------------------------------------------------- /Slides/A Step by Step Backpropagation Example for Regression using an One-hot Encoded Categorical Variable .pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/A Step by Step Backpropagation Example for Regression using an One-hot Encoded Categorical Variable .pdf -------------------------------------------------------------------------------- /Slides/CH00 OverView.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH00 OverView.pdf -------------------------------------------------------------------------------- /Slides/CH01 Statistical Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH01 Statistical Learning.pdf -------------------------------------------------------------------------------- /Slides/CH02 Perceptron.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH02 Perceptron.pdf -------------------------------------------------------------------------------- /Slides/CH03 KNN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH03 KNN.pdf -------------------------------------------------------------------------------- /Slides/CH04 NaiveBayes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH04 NaiveBayes.pdf -------------------------------------------------------------------------------- /Slides/CH05 DecisionTree.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH05 DecisionTree.pdf -------------------------------------------------------------------------------- /Slides/CH06 LogicRegression and Maximum Entropy Model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH06 LogicRegression and Maximum Entropy Model.pdf -------------------------------------------------------------------------------- /Slides/CH07 SVM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH07 SVM.pdf -------------------------------------------------------------------------------- /Slides/CH08 Boosting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH08 Boosting.pdf 
-------------------------------------------------------------------------------- /Slides/CH09 EM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH09 EM.pdf -------------------------------------------------------------------------------- /Slides/CH10 Hidden Markov Model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH10 Hidden Markov Model.pdf -------------------------------------------------------------------------------- /Slides/CH16 PCA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH16 PCA.pdf -------------------------------------------------------------------------------- /Slides/CH21 PageRank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH21 PageRank.pdf -------------------------------------------------------------------------------- /Slides/CH22 Transformer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH22 Transformer.pdf -------------------------------------------------------------------------------- /Slides/CHX0 Summary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CHX0 Summary.pdf -------------------------------------------------------------------------------- /Slides/CHX5 NN-CNN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CHX5 NN-CNN.pdf --------------------------------------------------------------------------------