├── 11. 贝叶斯分类器
│   ├── __init__.py
│   ├── sk.py
│   ├── simple_byes.py
│   └── Laplace.py
├── 17. 高斯混合聚类
│   ├── 1.md
│   ├── result.jpg
│   ├── test.jpg
│   └── 3.py
├── 9. 神经网络
│   ├── 1_基本概念.md
│   ├── 2_activation_function.py
│   ├── 3_backpropagation.py
│   └── 4_pytorch_mnist.py
├── 12. EM算法
│   ├── 1_极大似然估计.md
│   ├── 2_EM_single_iteration.py
│   └── 3_EM_main_iteration.py
├── 21. PCA
│   ├── 1_维数灾难.md
│   ├── 2_PCA.py
│   └── 3_sklearn_PCA.py
├── 18. DBSCAN
│   ├── 1_basic_concept.md
│   ├── 3_sklearn_DBSCAN.py
│   └── 2_DBSCAN_algorithm.py
├── 3. 线性回归
│   ├── 1_简单与多元线性回归.md
│   ├── 4_sklearn_linearRegression.py
│   ├── 2_normal_equation.py
│   └── 3_metrics.py
├── README.md
├── 16. k-means
│   ├── 1.py
│   ├── 4.py
│   ├── 2.py
│   └── 3.py
├── 19. AGNES
│   ├── 3.py
│   ├── 1.py
│   └── 2.py
├── 22. 多维缩放
│   ├── 2.py
│   └── 1.py
├── .gitignore
├── 6. 线性判别分析
│   ├── 2_sklearn_LDA.py
│   └── 1_LDA.py
├── 15. 聚类性能评估指标
│   ├── 3_sklearn_metrics.py
│   ├── 1_external_index.py
│   └── 2_internal_index.py
├── 8. 感知机
│   ├── sk.py
│   └── preception.py
├── 24. 局部线性嵌入
│   ├── 2_sklearn_LLE.py
│   └── 1_LLE.py
├── 23 等度量映射
│   ├── sk.py
│   └── isomap.py
├── 4. 逻辑回归
│   ├── 3.py
│   ├── 5.py
│   └── 4.py
├── 14. 随机森林
│   ├── 3.Digit.py
│   ├── Bagging.py
│   └── RandomForest.py
├── 20 KNN
│   ├── sk.py
│   └── knn.py
├── 13. AdaBoost
│   ├── 3.py
│   └── 2.py
├── 2. 模型评估与选择
│   └── main.py
└── 5. 多分类学习
    ├── OvR.py
    └── OvO.py

--------------------------------------------------------------------------------
/11. 贝叶斯分类器/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/17. 高斯混合聚类/1.md:
--------------------------------------------------------------------------------
1. A B C
2. A

--------------------------------------------------------------------------------
/9. 神经网络/1_基本概念.md:
--------------------------------------------------------------------------------
1. C

--------------------------------------------------------------------------------
/12. EM算法/1_极大似然估计.md:
--------------------------------------------------------------------------------
1. A
2. B
3. D

--------------------------------------------------------------------------------
/21. PCA/1_维数灾难.md:
--------------------------------------------------------------------------------
### 1. B, C

### 2. C

--------------------------------------------------------------------------------
/18. DBSCAN/1_basic_concept.md:
--------------------------------------------------------------------------------
### 1. D

--------------------------------------------------------------------------------
/3. 线性回归/1_简单与多元线性回归.md:
--------------------------------------------------------------------------------
1. B, C
2. A, B, C
3. A

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# educoder-oj
## Solutions to the OJ exercises of the Nanjing University machine learning course

## Team up, grind the quests, let's go!

--------------------------------------------------------------------------------
/17. 高斯混合聚类/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nju-ml-course/educoder-oj/HEAD/17. 高斯混合聚类/result.jpg
--------------------------------------------------------------------------------
/17. 高斯混合聚类/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nju-ml-course/educoder-oj/HEAD/17. 高斯混合聚类/test.jpg

--------------------------------------------------------------------------------
/9. 神经网络/2_activation_function.py:
--------------------------------------------------------------------------------
# encoding=utf8

def relu(x):
    '''
    x: any real number
    '''
    # ********* Begin *********#
    if x <= 0:
        return 0
    else:
        return x
    # ********* End *********#

--------------------------------------------------------------------------------
/16. k-means/1.py:
--------------------------------------------------------------------------------
#encoding=utf8
import numpy as np

def distance(x, y, p=2):
    '''
    input: x(ndarray): coordinates of the first sample
           y(ndarray): coordinates of the second sample
           p(int): 1 for Manhattan distance, 2 for Euclidean distance
    output: distance(float): distance from x to y
    '''
    #********* Begin *********#
    return np.linalg.norm(x - y, p)
    #********* End *********#
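
A quick usage sketch for distance() above (an editor's illustration, not a repo file; the sample points are made up):
--------------------------------------------------------------------------------
import numpy as np

def distance(x, y, p=2):
    # same body as 16. k-means/1.py
    return np.linalg.norm(x - y, p)

a = np.array([0.0, 0.0])
b = np.array([3.0, 4.0])
print(distance(a, b, p=2))  # Euclidean distance: 5.0
print(distance(a, b, p=1))  # Manhattan distance: 7.0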
--------------------------------------------------------------------------------
/16. k-means/4.py:
--------------------------------------------------------------------------------
#encoding=utf8
from sklearn.cluster import KMeans

def kmeans_cluster(data):
    '''
    input: data(ndarray): sample data
    output: result(ndarray): cluster labels
    '''
    #********* Begin *********#
    km = KMeans(n_clusters=3, random_state=888)
    result = km.fit_predict(data)
    #********* End *********#
    return result

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# See https://help.github.com/ignore-files/ for more about ignoring files.

# dependencies
/node_modules

# testing
/coverage

# production
/build

# misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local


# idea
.idea

npm-debug.log*
yarn-debug.log*
yarn-error.log*

# umi
.umi

--------------------------------------------------------------------------------
/6. 线性判别分析/2_sklearn_LDA.py:
--------------------------------------------------------------------------------
# encoding=utf8
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def lda(x, y):
    """
    input: x(ndarray): data to be reduced
           y(ndarray): labels of the data
    output: x_new(ndarray): data after dimensionality reduction
    """
    # ********* Begin *********#
    lda = LinearDiscriminantAnalysis(n_components=2)
    lda.fit(x, y)
    x_new = lda.transform(x)
    # ********* End *********#
    return x_new

--------------------------------------------------------------------------------
/22. 多维缩放/2.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from sklearn.manifold import MDS


def mds(data, d):
    '''
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
    output: Z(ndarray): data after dimensionality reduction
    '''
    # ********* Begin *********#
    mds = MDS(d)
    Z = mds.fit_transform(data)
    # ********* End *********#
    return Z
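
A usage sketch for the mds() wrapper above (an editor's illustration, not a repo file; the dataset choice is arbitrary):
--------------------------------------------------------------------------------
from sklearn.datasets import load_iris
from sklearn.manifold import MDS

X = load_iris().data           # shape (150, 4)
Z = MDS(2).fit_transform(X)    # same call as mds(X, 2) above
print(Z.shape)                 # (150, 2)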
--------------------------------------------------------------------------------
/18. DBSCAN/3_sklearn_DBSCAN.py:
--------------------------------------------------------------------------------
# encoding=utf8
from sklearn.cluster import DBSCAN


def data_cluster(data):
    """
    input: data(ndarray): data to cluster
    output: result(ndarray): cluster labels
    """
    # ********* Begin *********#
    dbscan = DBSCAN(eps=0.5, min_samples=10)
    result = dbscan.fit_predict(data)
    return result
    # ********* End *********#

--------------------------------------------------------------------------------
/15. 聚类性能评估指标/3_sklearn_metrics.py:
--------------------------------------------------------------------------------
from sklearn.metrics.cluster import fowlkes_mallows_score, adjusted_rand_score


def cluster_performance(y_true, y_pred):
    """
    Return the FM index and the (adjusted) Rand index, in that order
    :param y_true: reference cluster assignment, ndarray
    :param y_pred: cluster assignment produced by the model, ndarray
    :return: FM index, adjusted Rand index
    """
    # ********* Begin *********#
    rand = adjusted_rand_score(y_true, y_pred)
    fm = fowlkes_mallows_score(y_true, y_pred)
    return fm, rand
    # ********* End *********#

--------------------------------------------------------------------------------
/8. 感知机/sk.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.linear_model import Perceptron

# load training data
train_data = pd.read_csv('./step2/train_data.csv')
# load training labels
train_label = pd.read_csv('./step2/train_label.csv')
train_label = train_label['target']
# load test data
test_data = pd.read_csv('./step2/test_data.csv')
clf = Perceptron(max_iter=100000)
clf.fit(train_data, train_label)
result = clf.predict(test_data)

pd.DataFrame({'result': result}).to_csv('./step2/result.csv', index=False)

--------------------------------------------------------------------------------
/19. AGNES/3.py:
--------------------------------------------------------------------------------
#encoding=utf8
from sklearn.cluster import AgglomerativeClustering

def Agglomerative_cluster(data):
    '''
    Cluster the wine dataset
    :param data: dataset, ndarray
    :return: cluster labels, ndarray
    '''
    #********* Begin *********#
    mean = data.mean()       # mean
    deviation = data.std()   # standard deviation
    # standardization: (value - mean) / standard deviation
    data = (data - mean) / deviation
    agnes = AgglomerativeClustering(n_clusters=3)
    result = agnes.fit_predict(data)
    return result
    #********* End *********#

--------------------------------------------------------------------------------
/21. PCA/2_PCA.py:
--------------------------------------------------------------------------------
import numpy as np


def pca(data, k):
    """
    Run PCA on data and return the projected data
    :param data: dataset, ndarray
    :param k: target number of dimensions, int
    :return: data after dimensionality reduction, ndarray
    """
    # ********* Begin *********#
    # center the data
    mean = np.mean(data, axis=0)
    after_demean = data - mean

    cov = np.cov(after_demean.T)

    value, vector = np.linalg.eig(cov)

    # keep the eigenvectors of the k largest eigenvalues
    index = np.argsort(-value)[: k]
    w = vector[:, index]

    return np.dot(after_demean, w)
    # ********* End *********#
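
A sanity-check sketch for pca() above (an editor's illustration, not a repo file): sklearn's PCA should recover the same projection, although individual components may differ in sign.
--------------------------------------------------------------------------------
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
data = rng.rand(20, 5)

# hand-rolled version, same steps as 21. PCA/2_PCA.py
demeaned = data - data.mean(axis=0)
value, vector = np.linalg.eig(np.cov(demeaned.T))
w = vector[:, np.argsort(-value)[:2]]
ours = demeaned @ w

theirs = PCA(n_components=2).fit_transform(data)
# the columns agree up to sign
print(np.allclose(np.abs(ours), np.abs(theirs)))  # expected: True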
--------------------------------------------------------------------------------
/24. 局部线性嵌入/2_sklearn_LLE.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from sklearn.manifold import LocallyLinearEmbedding


def lle(data, d, k):
    """
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
           k(int): number of samples in each neighbourhood
    output: Z(ndarray): data after dimensionality reduction
    """
    # ********* Begin *********#
    lle = LocallyLinearEmbedding(n_components=d, n_neighbors=k)
    Z = lle.fit_transform(data)
    # ********* End *********#
    return Z

--------------------------------------------------------------------------------
/3. 线性回归/4_sklearn_linearRegression.py:
--------------------------------------------------------------------------------
# encoding=utf8
# ********* Begin *********#
from sklearn.linear_model import LinearRegression
import pandas as pd

# load training data
train_data = pd.read_csv('./step3/train_data.csv')
# load training labels
train_label = pd.read_csv('./step3/train_label.csv')
train_label = train_label['target']
# load test data
test_data = pd.read_csv('./step3/test_data.csv')

model = LinearRegression(normalize=True)
model.fit(train_data, train_label)
test_y = model.predict(test_data)

pd.DataFrame(test_y, columns=['result']).to_csv('./step3/result.csv')

# ********* End *********#

--------------------------------------------------------------------------------
/23 等度量映射/sk.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from sklearn.manifold import Isomap
import isomap as isa
import sklearn.datasets as db


def isomap(data, d, k):
    '''
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
           k(int): number of nearest neighbours
    output: Z(ndarray): data after dimensionality reduction
    '''
    # ********* Begin *********#
    iso = Isomap(n_neighbors=k, n_components=d)
    return iso.fit_transform(data)


if __name__ == '__main__':
    ir = db.load_boston()
    X1 = isa.isomap(ir.data[:10], d=2, k=4)
    X2 = isomap(ir.data[:10], d=2, k=4)
    print(X1)
    print(X2)

--------------------------------------------------------------------------------
/4. 逻辑回归/3.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import numpy as np
import warnings
warnings.filterwarnings("ignore")

def gradient_descent(initial_theta, eta=0.05, n_iters=1e3, epslion=1e-8):
    '''
    Gradient descent
    :param initial_theta: initial value of the parameter, float
    :param eta: learning rate, float
    :param n_iters: number of training iterations, int
    :param epslion: error tolerance, float (unused here; kept for the OJ signature)
    :return: the parameter after training
    '''
    # add your implementation here #
    #********** Begin *********#
    i = 0
    while i < n_iters:
        # the loss is (theta - 3) ** 2, so the gradient is 2 * (theta - 3)
        initial_theta = initial_theta - eta * 2 * (initial_theta - 3)
        i += 1
    return initial_theta
    #********** End **********#
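
A worked usage example for gradient_descent() above (an editor's illustration, not a repo file):
--------------------------------------------------------------------------------
def gradient_descent(initial_theta, eta=0.05, n_iters=1e3):
    # same update rule as 4. 逻辑回归/3.py
    i = 0
    while i < n_iters:
        initial_theta -= eta * 2 * (initial_theta - 3)
        i += 1
    return initial_theta

# One step from 0 gives 0 - 0.05 * 2 * (0 - 3) = 0.3; each step multiplies the
# distance to the minimum at 3 by (1 - 2 * 0.05) = 0.9, so 1000 steps converge.
print(gradient_descent(0.0))  # ~3.0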
--------------------------------------------------------------------------------
/4. 逻辑回归/5.py:
--------------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression

def digit_predict(train_image, train_label, test_image):
    '''
    Train a model and return its predictions
    :param train_image: training samples, ndarray of shape [-1, 8, 8]
    :param train_label: training labels, ndarray
    :param test_image: test samples, ndarray
    :return: predicted labels for test_image
    '''
    #************* Begin ************#
    logreg = LogisticRegression(solver='newton-cg', max_iter=1000, C=1)
    logreg.fit(train_image.reshape(train_image.shape[0], -1), train_label)
    return logreg.predict(test_image.reshape(test_image.shape[0], -1))
    #************* End **************#

--------------------------------------------------------------------------------
/14. 随机森林/3.Digit.py:
--------------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def digit_predict(train_image, train_label, test_image):
    """
    Train a model and return its predictions
    :param train_image: training samples, ndarray of shape [-1, 8, 8]
    :param train_label: training labels, ndarray
    :param test_image: test samples, ndarray
    :return: predicted labels for test_image, ndarray
    """
    X = np.reshape(train_image, newshape=(-1, 64))
    clf = RandomForestClassifier(n_estimators=500, max_depth=10)
    clf.fit(X, y=train_label)
    # the test images must be flattened the same way as the training images
    return clf.predict(np.reshape(test_image, newshape=(-1, 64)))

--------------------------------------------------------------------------------
/20 KNN/sk.py:
--------------------------------------------------------------------------------
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


def classification(train_feature, train_label, test_feature):
    '''
    Classify the wine samples in test_feature
    :param train_feature: training data, ndarray
    :param train_label: training labels, ndarray
    :param test_feature: test data, ndarray
    :return: predicted classes of the test data
    '''
    # instantiate a StandardScaler
    scaler = StandardScaler()
    # standardize using the mean and standard deviation of the training data
    X = scaler.fit_transform(train_feature)
    # apply the same fitted scaler to the test data
    X_test = scaler.transform(test_feature)
    clf = KNeighborsClassifier()
    clf.fit(X, train_label)
    return clf.predict(X_test)

--------------------------------------------------------------------------------
/21. PCA/3_sklearn_PCA.py:
--------------------------------------------------------------------------------
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC


def cancer_predict(train_sample, train_label, test_sample):
    """
    Reduce dimensionality with PCA, classify, and return the predictions
    :param train_sample: training samples, ndarray
    :param train_label: training labels, ndarray
    :param test_sample: test samples, ndarray
    :return: predicted classes
    """
    # ********* Begin *********#
    pca = PCA(n_components=11)
    train_sample_transformed = pca.fit_transform(train_sample)
    test_sample_transformed = pca.transform(test_sample)

    clf = LinearSVC()
    clf.fit(train_sample_transformed, train_label)
    return clf.predict(test_sample_transformed)
    # ********* End *********#
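
A usage sketch for cancer_predict() above (an editor's illustration, not a repo file; the dataset choice is an assumption, picked because it has 30 features, enough for 11 PCA components):
--------------------------------------------------------------------------------
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

def cancer_predict(train_sample, train_label, test_sample):
    # same pipeline as 21. PCA/3_sklearn_PCA.py
    pca = PCA(n_components=11)
    clf = LinearSVC().fit(pca.fit_transform(train_sample), train_label)
    return clf.predict(pca.transform(test_sample))

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(accuracy_score(y_test, cancer_predict(X_train, y_train, X_test)))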
--------------------------------------------------------------------------------
/17. 高斯混合聚类/3.py:
--------------------------------------------------------------------------------
from PIL import Image
import numpy as np
from sklearn.mixture import GaussianMixture

# read test.jpg into im
im = Image.open('./step3/image/test.jpg')

# convert im to an ndarray
img = np.array(im)
# reshape img to [-1, 3] and save it as img_reshape
img_reshape = img.reshape(-1, 3)

# instantiate a Gaussian mixture model that clusters the data into 3 components
gmm = GaussianMixture(3)
# fit() estimates the parameters and mixing coefficients of each Gaussian
gmm.fit(img_reshape)
# cluster the data; the labels are 0, 1, 2 (gmm was asked for 3 clusters)
pred = gmm.predict(img_reshape)

img_reshape[pred == 0, :] = [255, 255, 0]  # yellow
img_reshape[pred == 1, :] = [0, 0, 255]    # blue
img_reshape[pred == 2, :] = [0, 255, 0]    # green
im = Image.fromarray(img.astype('uint8'))
# save im as result.jpg
im.save('./step3/dump/result.jpg')

--------------------------------------------------------------------------------
/13. AdaBoost/3.py:
--------------------------------------------------------------------------------
#encoding=utf8
from sklearn.ensemble import AdaBoostClassifier

def ada_classifier(train_data, train_label, test_data):
    '''
    input: train_data(ndarray): training data
           train_label(ndarray): training labels
           test_data(ndarray): test data
    output: predict(ndarray): predictions
    '''
    #********* Begin *********#
    ada = AdaBoostClassifier(n_estimators=80, learning_rate=1.0)
    ada.fit(train_data, train_label)
    predict = ada.predict(test_data)
    #********* End *********#
    return predict

--------------------------------------------------------------------------------
/4. 逻辑回归/4.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import numpy as np
import warnings
warnings.filterwarnings("ignore")

def sigmoid(x):
    '''
    The sigmoid function
    :param x: input value
    :return: the transformed probability
    '''
    return 1 / (1 + np.exp(-x))


def fit(x, y, eta=1e-3, n_iters=1e4):
    '''
    Train a logistic regression model
    :param x: training features, ndarray
    :param y: training labels, ndarray
    :param eta: learning rate, float
    :param n_iters: number of training iterations, int
    :return: model parameters, ndarray
    '''
    # add your implementation here #
    #********** Begin *********#
    i = 0
    # one weight per feature (the original hard-coded 31 for the OJ dataset)
    w = np.zeros(x.shape[1])
    while i < n_iters:
        a = sigmoid(x.dot(w))
        # batch gradient of the log loss: x^T (a - y)
        w = w - eta * x.T.dot(a - y)
        i += 1
    return w
    #********** End **********#

--------------------------------------------------------------------------------
/9. 
神经网络/3_backpropagation.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import os 3 | from sklearn.neural_network import MLPClassifier 4 | import pandas as pd 5 | 6 | if os.path.exists('./step2/result.csv'): 7 | os.remove('./step2/result.csv') 8 | 9 | # ********* Begin *********# 10 | # 获取训练数据 11 | train_data = pd.read_csv('./step2/train_data.csv') 12 | # 获取训练标签 13 | train_label = pd.read_csv('./step2/train_label.csv') 14 | train_label = train_label['target'] 15 | # 获取测试数据 16 | test_data = pd.read_csv('./step2/test_data.csv') 17 | 18 | mlp = MLPClassifier(solver='lbfgs', max_iter=100, 19 | alpha=1e-5, hidden_layer_sizes=(5, 10, 3)) 20 | mlp.fit(train_data, train_label) 21 | result = mlp.predict(test_data) 22 | 23 | result = pd.DataFrame(result, columns=['result']) 24 | 25 | result.to_csv('./step2/result.csv', index=False) 26 | 27 | # ********* End *********# 28 | -------------------------------------------------------------------------------- /6. 线性判别分析/1_LDA.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import numpy as np 3 | from numpy.linalg import inv 4 | 5 | 6 | def lda(X, y): 7 | ''' 8 | input:X(ndarray):待处理数据 9 | y(ndarray):待处理数据标签,标签分别为0和1 10 | output:X_new(ndarray):处理后的数据 11 | ''' 12 | # ********* Begin *********# 13 | 14 | # 划分出第一类样本与第二类样本 15 | p_data = np.transpose(X[y == 0]) 16 | n_data = np.transpose(X[y == 1]) 17 | 18 | # 计算第一类样本与第二类样本协方差矩阵 19 | p_cov = np.cov(p_data) 20 | n_cov = np.cov(n_data) 21 | # 计算类内散度矩阵 22 | S_w = p_cov + n_cov 23 | 24 | # 获取第一类样本与第二类样本中心点 25 | p_mu = np.mean(p_data, axis=1) 26 | n_mu = np.mean(n_data, axis=1) 27 | # 计算w 28 | w = inv(S_w).dot(n_mu - p_mu) 29 | # 计算新样本集 30 | X_new = X.dot(w).reshape(-1, 1) 31 | 32 | # ********* End *********# 33 | return X_new * 0.0623 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /11. 贝叶斯分类器/sk.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | from sklearn.naive_bayes import MultinomialNB 3 | from sklearn.feature_extraction.text import TfidfTransformer 4 | 5 | 6 | def news_predict(train_sample, train_label, test_sample): 7 | ''' 8 | 训练模型并进行预测,返回预测结果 9 | :param train_sample:原始训练集中的新闻文本,类型为ndarray 10 | :param train_label:训练集中新闻文本对应的主题标签,类型为ndarray 11 | :param test_sample:原始测试集中的新闻文本,类型为ndarray 12 | :return 预测结果,类型为ndarray 13 | ''' 14 | # 实例化向量化对象 15 | vec = CountVectorizer() 16 | # 将训练集中的新闻向量化 17 | X_train = vec.fit_transform(train_sample) 18 | # 将测试集中的新闻向量化 19 | X_test = vec.transform(test_sample) 20 | # 实例化tf-idf对象 21 | tfidf = TfidfTransformer() 22 | # 将训练集中的词频向量用tf-idf进行转换 23 | X_train = tfidf.fit_transform(X_train) 24 | # 将测试集中的词频向量用tf-idf进行转换 25 | X_test = tfidf.transform(X_test) 26 | 27 | clf = MultinomialNB(alpha=0.8) 28 | clf.fit(X_train, train_label) 29 | result = clf.predict(X_test) 30 | return result 31 | -------------------------------------------------------------------------------- /16. 
k-means/2.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


# distance between two samples
def distance(x, y, p=2):
    '''
    input: x(ndarray): coordinates of the first sample
           y(ndarray): coordinates of the second sample
           p(int): 1 for Manhattan distance, 2 for Euclidean distance
    output: distance(float): distance from x to y
    '''
    # ********* Begin *********#
    # take absolute differences so that p=1 really is the Manhattan distance
    return (np.sum(np.abs(np.subtract(x, y)) ** p)) ** (1 / p)
    # ********* End *********#


# centroid of a dataset
def cal_Cmass(data):
    '''
    input: data(ndarray): sample data
    output: mass(ndarray): centroid of the samples
    '''
    # ********* Begin *********#
    return [np.mean(col) for col in np.transpose(data)]
    # ********* End *********#


# distance of every sample to the centroid, sorted in ascending order
def sorted_list(data, Cmass):
    '''
    input: data(ndarray): sample data
           Cmass(ndarray): centroid of the samples
    output: dis_list(list): sorted distances to the centroid
    '''
    # ********* Begin *********#
    return sorted([distance(row, Cmass) for row in data])
    # ********* End *********#

--------------------------------------------------------------------------------
/8. 感知机/preception.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


# the perceptron algorithm
class Perceptron(object):
    def __init__(self, learning_rate=0.01, max_iter=200):
        self.lr = learning_rate
        self.max_iter = max_iter

    def fit(self, data, label):
        '''
        input: data(ndarray): training features
               label(ndarray): training labels
        output: w(ndarray): learned weights
                b(ndarray): learned bias
        '''
        # perceptron training; w is the weight vector, b the bias
        self.w = np.array([1.] * data.shape[1])
        self.b = np.array([1.])
        for i in range(self.max_iter):
            for row in range(data.shape[0]):
                # update only on misclassified samples
                if label[row] * (np.dot(data[row], np.transpose(self.w)) + self.b) < 0:
                    self.w += self.lr * label[row] * data[row]
                    self.b += self.lr * label[row]

    def predict(self, data):
        '''
        input: data(ndarray): test features
        output: predict(ndarray): predicted labels
        '''
        z = np.dot(data, np.transpose(self.w)) + self.b
        return [1 if item > 0 else -1 for item in z]
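
A usage sketch for the Perceptron class above (an editor's illustration, not a repo file; the toy data are made up, linearly separable, with labels in {-1, +1}):
--------------------------------------------------------------------------------
import numpy as np

X = np.array([[2.0, 3.0], [3.0, 3.0], [-1.0, -2.0], [-2.0, -1.0]])
y = np.array([1, 1, -1, -1])

clf = Perceptron(learning_rate=0.1, max_iter=50)  # the class from 8. 感知机/preception.py
clf.fit(X, y)
print(clf.predict(X))  # expected: [1, 1, -1, -1]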
--------------------------------------------------------------------------------
/24. 局部线性嵌入/1_LLE.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


def find_neighbors(data, i, k):
    dist = sorted(range(len(data)), key=lambda x: np.linalg.norm(data[x] - data[i]))
    return set(dist[1: k + 1])


def cal_c_jk(data, i, j, k):
    return np.dot((data[i] - data[j]), (data[i] - data[k]))


def lle(data, d, k):
    """
    input: data(ndarray): data to be reduced; one row per sample
           d(int): target dimensionality
           k(int): number of nearest neighbours
    output: Z(ndarray): data after dimensionality reduction
    """
    # ********* Begin *********#
    m = len(data)
    W = np.zeros((m, m))
    for i in range(m):
        # neighbourhood of sample i
        neighbors = find_neighbors(data, i, k)
        lower = sum(1 / cal_c_jk(data, i, l, s) for l in neighbors for s in neighbors)
        for j in neighbors:
            # inner loop variable named s so it does not shadow the parameter k
            upper = sum(1 / cal_c_jk(data, i, j, s) for s in neighbors)
            # reconstruction weight
            W[i][j] = upper / lower

    # build M and factorize it
    I = np.identity(m)
    M = np.dot((I - W).T, (I - W))

    value, vector = np.linalg.eig(M)
    index = np.argsort(value)[: d]
    # Z = (z1; z2; ...; zm), one row per projected sample
    Z = vector[:, index].T
    # ********* End *********#
    return Z

--------------------------------------------------------------------------------
/14. 随机森林/Bagging.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.tree import DecisionTreeClassifier


class BaggingClassifier(object):
    def __init__(self, n_model=10):
        '''
        Constructor
        '''
        # number of base classifiers, 10 by default
        self.n_model = n_model
        # list that stores the models; append each classifier after fitting it
        self.models = []

    def fit(self, feature, label):
        '''
        Train the models; remember to store them in self.models
        :param feature: training data, ndarray
        :param label: training labels, ndarray
        :return: None
        '''
        # each tree is fitted on a bootstrap sample (drawn with replacement);
        # without resampling all trees would be identical
        m = len(feature)
        for _ in range(self.n_model):
            idx = np.random.choice(m, m, replace=True)
            self.models.append(
                DecisionTreeClassifier(max_depth=3).fit(feature[idx], label[idx]))

    def predict(self, feature):
        '''
        :param feature: test data, ndarray
        :return: predictions, ndarray, e.g. np.array([0, 1, 2, 2, 1, 0])
        '''
        tmp_arr = np.transpose([clf_.predict(feature) for clf_ in self.models])
        predict = []
        for row in tmp_arr:
            # majority vote over the base classifiers
            dic = {}
            for item in row:
                if item not in dic.keys():
                    dic[item] = 1
                else:
                    dic[item] += 1
            predict.append(list(max(dic.items(), key=lambda d: d[1]))[0])
        return predict
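
A usage sketch for BaggingClassifier above (an editor's illustration, not a repo file; the dataset choice is arbitrary):
--------------------------------------------------------------------------------
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bag = BaggingClassifier(n_model=10)  # the class from 14. 随机森林/Bagging.py
bag.fit(X_train, y_train)
pred = bag.predict(X_test)
print(np.mean(np.array(pred) == y_test))  # accuracy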
--------------------------------------------------------------------------------
/22. 多维缩放/1.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np


def mds(data, d):
    '''
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
    output: Z(ndarray): data after dimensionality reduction
    '''
    # ********* Begin *********#
    # compute the matrix of squared distances
    DSquare = np.zeros([data.shape[0], data.shape[0]])
    for i in range(data.shape[0]):
        for j in range(data.shape[0]):
            DSquare[i][j] = np.sum(np.square(data[i] - data[j]))
    # compute B by double centring
    totalMean = np.mean(DSquare)
    rowMean = np.mean(DSquare, axis=1)
    columnMean = np.mean(DSquare, axis=0)
    B = np.zeros(DSquare.shape)
    for i in range(B.shape[0]):
        for j in range(B.shape[1]):
            B[i][j] = -0.5 * (DSquare[i][j] - rowMean[i] - columnMean[j] + totalMean)
    # factorize B into eigenvalues and eigenvectors
    eigVal, eigVec = np.linalg.eigh(B)
    # sort the eigenvalues in descending order
    eigValSorted_indices = np.argsort(-eigVal)
    # take the d eigenvectors with the largest eigenvalues and compute Z
    topd_eigVec = eigVec[:, eigValSorted_indices[:d]]
    Z = np.dot(topd_eigVec, np.sqrt(np.diag(eigVal[eigValSorted_indices[:d]])))
    return Z
    # ********* End *********#

--------------------------------------------------------------------------------
/23 等度量映射/isomap.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np


def isomap(data, d, k, Max=10000):
    """
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
           k(int): number of nearest neighbours
           Max(int): stands in for infinity
    output: Z(ndarray): data after dimensionality reduction
    """
    # ********* Begin *********#
    # build the k-nearest-neighbour graph; non-neighbours start at Max (infinity)
    m, n = data.shape
    dist = np.ones((m, m)) * Max
    disti = np.zeros(m)
    distj = np.zeros(m)
    B = np.zeros((m, m))
    for i in range(m):
        distance = np.power(np.tile(data[i], (m, 1)) - data, 2).sum(axis=1)
        index = np.argsort(distance)
        q = index[:k]
        for l in q:
            dist[i][l] = np.sqrt(np.power(data[i] - data[l], 2).sum())
    # geodesic distances: symmetrize the graph, then shortest paths (Floyd-Warshall)
    dist = np.minimum(dist, dist.T)
    for t in range(m):
        dist = np.minimum(dist, dist[:, t:t + 1] + dist[t, :])
    # MDS works on squared distances: compute dist2, dist2i, dist2j, dist2ij
    dist = dist ** 2
    for i in range(m):
        disti[i] = np.mean(dist[i, :])
        distj[i] = np.mean(dist[:, i])
    distij = np.mean(dist)
    # compute B
    for i in range(m):
        for j in range(m):
            B[i, j] = -0.5 * (dist[i, j] - disti[i] - distj[j] + distij)
    # factorize B into eigenvalues and eigenvectors
    lamda, V = np.linalg.eigh(B)
    # compute Z
    index = np.argsort(-lamda)[:d]
    diag_lamda = np.sqrt(np.diag(-np.sort(-lamda)[:d]))
    V_selected = V[:, index]
    Z = V_selected.dot(diag_lamda)
    # ********* End *********#
    return Z
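
A usage sketch for isomap() above (an editor's illustration, not a repo file; the random data are made up):
--------------------------------------------------------------------------------
import numpy as np

rng = np.random.RandomState(0)
data = rng.rand(10, 5)        # 10 samples with 5 features
Z = isomap(data, d=2, k=4)    # the function from 23 等度量映射/isomap.py
print(Z.shape)                # (10, 2)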
--------------------------------------------------------------------------------
/3. 线性回归/2_normal_equation.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


def mse_score(y_predict, y_test):
    """
    input: y_predict(ndarray): predicted values
           y_test(ndarray): true values
    output: mse(float): mean squared error
    """
    # ********* Begin *********#
    return 1 / len(y_predict) * sum([np.square(y - p) for y, p in zip(y_test, y_predict)])
    # ********* End *********#


class LinearRegression:
    def __init__(self):
        """Initialize the linear regression model"""
        self.theta = None

    def fit_normal(self, train_data, train_label):
        """
        input: train_data(ndarray): training samples
               train_label(ndarray): training labels
        """
        # ********* Begin *********#
        # append a column of ones for the intercept, then solve the normal equation
        ones = np.ones((len(train_data), 1))
        train_data = np.column_stack((train_data, ones))
        self.theta = np.linalg.inv(train_data.T @ train_data) @ train_data.T @ train_label
        # ********* End *********#
        return self

    def predict(self, test_data):
        """
        input: test_data(ndarray): test samples
        """
        # ********* Begin *********#
        ones = np.ones((len(test_data), 1))
        test_data = np.column_stack((test_data, ones))
        return test_data @ self.theta
        # ********* End *********#

--------------------------------------------------------------------------------
/20 KNN/knn.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


class kNNClassifier(object):
    def __init__(self, k):
        '''
        Constructor
        :param k: the k in the kNN algorithm
        '''
        self.k = k
        # training data, ndarray
        self.train_feature = None
        # training labels, ndarray
        self.train_label = None

    def fit(self, feature, label):
        """
        Training step of kNN
        :param feature: training data, ndarray
        :param label: training labels, ndarray
        :return: None
        """
        self.train_feature = feature
        self.train_label = label
        # store features and labels side by side; the label is the last column
        self.data = np.concatenate((feature, np.transpose([label])), axis=1)

    def predict(self, feature):
        """
        Prediction step of kNN
        :param feature: test data, ndarray
        :return: predictions, ndarray or list
        """

        # ********* Begin *********#
        def computeDistance(X, Y):
            return np.linalg.norm(np.subtract(X, Y))

        def moMax(X):
            # most frequent label; bincount needs integers
            return np.argmax(np.bincount(np.array(X, dtype=int)))

        ans = []
        for row in feature:
            # the k nearest training samples (not k + 1)
            arr = sorted(self.data, key=lambda item: computeDistance(item[:-1], row))[:self.k]
            ans.append(moMax([row[-1] for row in arr]))
        return ans
        # ********* End *********#
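
A usage sketch for kNNClassifier above (an editor's illustration, not a repo file; the dataset choice is arbitrary):
--------------------------------------------------------------------------------
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = kNNClassifier(k=5)  # the class from 20 KNN/knn.py
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(np.mean(np.array(pred) == y_test))  # accuracy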
--------------------------------------------------------------------------------
/19. AGNES/1.py:
--------------------------------------------------------------------------------
import numpy as np


def calc_min_dist(cluster1, cluster2):
    '''
    Minimum distance between two clusters
    :param cluster1: samples in cluster 1, ndarray
    :param cluster2: samples in cluster 2, ndarray
    :return: the minimum distance between cluster 1 and cluster 2
    '''
    #********* Begin *********#
    dis = 100000000
    for vec1 in cluster1:
        for vec2 in cluster2:
            dis = min(dis, np.linalg.norm(vec1 - vec2))
    return dis
    #********* End *********#


def calc_max_dist(cluster1, cluster2):
    '''
    Maximum distance between two clusters
    :param cluster1: samples in cluster 1, ndarray
    :param cluster2: samples in cluster 2, ndarray
    :return: the maximum distance between cluster 1 and cluster 2
    '''
    #********* Begin *********#
    dis = 0
    for vec1 in cluster1:
        for vec2 in cluster2:
            dis = max(dis, np.linalg.norm(vec1 - vec2))
    return dis
    #********* End *********#


def calc_avg_dist(cluster1, cluster2):
    '''
    Average distance between two clusters
    :param cluster1: samples in cluster 1, ndarray
    :param cluster2: samples in cluster 2, ndarray
    :return: the average distance between cluster 1 and cluster 2
    '''
    #********* Begin *********#
    dis = 0
    for vec1 in cluster1:
        for vec2 in cluster2:
            dis += np.linalg.norm(vec1 - vec2)
    return dis / (cluster1.shape[0] * cluster2.shape[0])
    #********* End *********#

--------------------------------------------------------------------------------
/19. AGNES/2.py:
--------------------------------------------------------------------------------
import numpy as np

def dist(cluster1, cluster2):
    # maximum (complete-link) euclidean distance, as the AGNES docstring requires
    dis = 0
    for vec1 in cluster1:
        for vec2 in cluster2:
            dis = max(dis, np.linalg.norm(vec1 - vec2))
    return dis

def find_Min(M):
    # index and value of the smallest off-diagonal entry of the distance matrix
    m = 100000000
    x = 0
    y = 0
    for i in range(len(M)):
        for j in range(len(M[i])):
            if i != j and M[i][j] < m:
                m = M[i][j]
                x = i
                y = j
    return x, y, m

def AGNES(feature, k):
    '''
    AGNES clustering; use the maximum euclidean distance between clusters.
    For the dataset [[1, 2], [10, 11], [1, 3]] the result could be [[[1, 2], [1, 3]], [[10, 11]]]
    :param feature: dataset, ndarray
    :param k: the number of clusters to produce, int
    :return: clustering result, list
    '''

    #********* Begin *********#
    # initialize C (the clusters) and M (the distance matrix)
    C = []; M = []
    for i in feature:
        Ci = []
        Ci.append(i)
        C.append(Ci)
    for i in C:
        Mi = []
        for j in C:
            Mi.append(dist(i, j))
        M.append(Mi)
    q = len(C)
    # merge the two closest clusters and update M
    while q > k:
        x, y, min_dis = find_Min(M)
        C[x].extend(C[y])
        C.remove(C[y])
        M = []
        for i in C:
            Mi = []
            for j in C:
                Mi.append(dist(i, j))
            M.append(Mi)
        q -= 1
    return C
    #********* End *********#
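
A usage sketch for AGNES() above, reusing the example from its docstring (an editor's illustration, not a repo file):
--------------------------------------------------------------------------------
import numpy as np

feature = np.array([[1, 2], [10, 11], [1, 3]])
print(AGNES(feature, k=2))  # the function from 19. AGNES/2.py
# expected grouping: [1, 2] and [1, 3] in one cluster, [10, 11] in the other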
--------------------------------------------------------------------------------
/12. EM算法/2_EM_single_iteration.py:
--------------------------------------------------------------------------------
import numpy as np
from collections import Counter


def em_single(init_values, observations):
    """
    Simulate the coin-tossing experiment and estimate, in a single iteration,
    the probabilities that coin A and coin B come up heads
    :param init_values: initial head probabilities of coins A and B, list;
                        e.g. [0.2, 0.7] means coin A comes up heads with
                        probability 0.2 and coin B with probability 0.7.
    :param observations: recorded results of the coin tosses, list.
    :return: the estimated head probabilities of coins A and B as a list;
             e.g. [0.4, 0.6] means you estimate 0.4 for coin A and 0.6 for coin B.
    """

    # ********* Begin *********#
    def get_likelihood(p, l):
        # likelihood of one run of tosses (1 = heads, 0 = tails)
        likelihood = 1
        for i in l:
            if i == 1:
                likelihood *= p
            else:
                likelihood *= 1 - p
        return likelihood

    # expected counts: rows are coins A/B, columns are heads/tails
    exist_matrix = np.zeros((2, 2))
    p_a, p_b = init_values[0], init_values[1]
    for experiment in observations:
        likelihood_a = get_likelihood(p_a, experiment)
        likelihood_b = get_likelihood(p_b, experiment)
        # E step: responsibility of each coin for this experiment
        prob_a = likelihood_a / (likelihood_a + likelihood_b)
        prob_b = likelihood_b / (likelihood_a + likelihood_b)
        c = Counter(experiment)
        exist_matrix[0][0] += prob_a * c[1]
        exist_matrix[0][1] += prob_a * c[0]
        exist_matrix[1][0] += prob_b * c[1]
        exist_matrix[1][1] += prob_b * c[0]
    # M step: re-estimate both head probabilities
    new_p_a = exist_matrix[0][0] / (exist_matrix[0][0] + exist_matrix[0][1])
    new_p_b = exist_matrix[1][0] / (exist_matrix[1][0] + exist_matrix[1][1])
    return [new_p_a, new_p_b]
    # ********* End *********#

--------------------------------------------------------------------------------
/3. 线性回归/3_metrics.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


# mse
def mse_score(y_predict, y_test):
    mse = np.mean((y_predict - y_test) ** 2)
    return mse


# r2
def r2_score(y_predict, y_test):
    '''
    input: y_predict(ndarray): predicted values
           y_test(ndarray): true values
    output: r2(float): the r2 score
    '''
    # ********* Begin *********#
    upper = sum((p - y) ** 2 for p, y in zip(y_predict, y_test))
    lower = sum((y_test.mean() - y) ** 2 for y in y_test)
    r2 = 1 - upper / lower
    # ********* End *********#
    return r2


class LinearRegression:
    def __init__(self):
        """Initialize the linear regression model"""
        self.theta = None

    def fit_normal(self, train_data, train_label):
        """
        input: train_data(ndarray): training samples
               train_label(ndarray): training labels
        """
        # ********* Begin *********#
        ones = np.ones((len(train_data), 1))
        train_data = np.column_stack((train_data, ones))
        self.theta = np.linalg.inv(train_data.T @ train_data) @ train_data.T @ train_label
        # ********* End *********#
        return self

    def predict(self, test_data):
        """
        input: test_data(ndarray): test samples
        """
        # ********* Begin *********#
        ones = np.ones((len(test_data), 1))
        test_data = np.column_stack((test_data, ones))
        return test_data @ self.theta
        # ********* End *********#

--------------------------------------------------------------------------------
/15. 
聚类性能评估指标/1_external_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def count_pairs(y_true, y_pred): 5 | m = len(y_true) 6 | SS, SD, DS, DD = 0, 0, 0, 0 7 | for i in range(m): 8 | for j in range(i + 1, m): 9 | if y_pred[i] == y_pred[j] and y_true[i] == y_true[j]: 10 | SS += 1 11 | elif y_pred[i] == y_pred[j] and y_true[i] != y_true[j]: 12 | SD += 1 13 | elif y_pred[i] != y_pred[j] and y_true[i] == y_true[j]: 14 | DS += 1 15 | else: 16 | DD += 1 17 | return SS, SD, DS, DD 18 | 19 | 20 | def calc_JC(y_true, y_pred): 21 | """ 22 | 计算并返回JC系数 23 | :param y_true: 参考模型给出的簇,类型为ndarray 24 | :param y_pred: 聚类模型给出的簇,类型为ndarray 25 | :return: JC系数 26 | """ 27 | 28 | # ******** Begin *******# 29 | a, b, c, d = count_pairs(y_true, y_pred) 30 | return a / (a + b + c) 31 | 32 | # ******** End *******# 33 | 34 | 35 | def calc_FM(y_true, y_pred): 36 | """ 37 | 计算并返回FM指数 38 | :param y_true: 参考模型给出的簇,类型为ndarray 39 | :param y_pred: 聚类模型给出的簇,类型为ndarray 40 | :return: FM指数 41 | """ 42 | 43 | # ******** Begin *******# 44 | a, b, c, d = count_pairs(y_true, y_pred) 45 | return a / np.sqrt((a + b) * (a + c)) 46 | # ******** End *******# 47 | 48 | 49 | def calc_Rand(y_true, y_pred): 50 | """ 51 | 计算并返回Rand指数 52 | :param y_true: 参考模型给出的簇,类型为ndarray 53 | :param y_pred: 聚类模型给出的簇,类型为ndarray 54 | :return: Rand指数 55 | """ 56 | 57 | # ******** Begin *******# 58 | a, b, c, d = count_pairs(y_true, y_pred) 59 | m = len(y_true) 60 | return 2 * (a + d) / (m * (m - 1)) 61 | # ******** End *******# 62 | -------------------------------------------------------------------------------- /18. DBSCAN/2_DBSCAN_algorithm.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import numpy as np 3 | import random 4 | from copy import copy 5 | from collections import deque 6 | 7 | 8 | # 寻找eps邻域内的点 9 | def findNeighbor(j, X, eps): 10 | return {p for p in range(X.shape[0]) if np.linalg.norm(X[j] - X[p]) <= eps} 11 | 12 | 13 | # dbscan算法 14 | def dbscan(X, eps, min_Pts): 15 | """ 16 | input:X(ndarray):样本数据 17 | eps(float):eps邻域半径 18 | min_Pts(int):eps邻域内最少点个数 19 | output:cluster(list):聚类结果 20 | """ 21 | # ********* Begin *********# 22 | 23 | # 初始化核心对象集合 24 | core_objects = {i for i in range(len(X)) if len(findNeighbor(i, X, eps)) >= min_Pts} 25 | 26 | # 初始化聚类簇数 27 | k = 0 28 | 29 | # 初始化未访问的样本集合 30 | not_visited = set(range(len(X))) 31 | 32 | # 初始化聚类结果 33 | cluster = np.zeros(len(X)) 34 | 35 | while len(core_objects) != 0: 36 | old_not_visited = copy(not_visited) 37 | # 初始化聚类簇队列 38 | o = random.choice(list(core_objects)) 39 | queue = deque() 40 | queue.append(o) 41 | not_visited.remove(o) 42 | 43 | while len(queue) != 0: 44 | q = queue.popleft() 45 | neighbor_list = findNeighbor(q, X, eps) 46 | if len(neighbor_list) >= min_Pts: 47 | # 寻找在邻域中并没被访问过的点 48 | delta = neighbor_list & not_visited 49 | for element in delta: 50 | queue.append(element) 51 | not_visited.remove(element) 52 | 53 | k += 1 54 | this_class = old_not_visited - not_visited 55 | cluster[list(this_class)] = k 56 | core_objects = core_objects - this_class 57 | 58 | # ********* End *********# 59 | return cluster 60 | -------------------------------------------------------------------------------- /15. 
聚类性能评估指标/2_internal_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def avg(feature, pred, c): 5 | feature_c = feature[pred == c] 6 | m = len(feature_c) 7 | mu = np.mean(feature_c, axis=0) 8 | return 1 / m * sum(np.linalg.norm(fea - mu) for fea in feature_c) 9 | 10 | 11 | def d_cen(feature, pred, c1, c2): 12 | feature_c1 = feature[pred == c1] 13 | feature_c2 = feature[pred == c2] 14 | mu1 = np.mean(feature_c1, axis=0) 15 | mu2 = np.mean(feature_c2, axis=0) 16 | return np.linalg.norm(mu1 - mu2) 17 | 18 | 19 | def d_min(feature, pred, c1, c2): 20 | feature_c1 = feature[pred == c1] 21 | feature_c2 = feature[pred == c2] 22 | return min(np.linalg.norm(f1 - f2) for f1 in feature_c1 for f2 in feature_c2) 23 | 24 | 25 | def diam(feature, pred, c): 26 | feature_c = feature[pred == c] 27 | m = len(feature_c) 28 | if m == 1: 29 | return 0 30 | return max(np.linalg.norm(feature_c[i] - feature_c[j]) for i in range(m) for j in range(i + 1, m)) 31 | 32 | 33 | def calc_DBI(feature, pred): 34 | """ 35 | 计算并返回DB指数 36 | :param feature: 待聚类数据的特征,类型为`ndarray` 37 | :param pred: 聚类后数据所对应的簇,类型为`ndarray` 38 | :return: DB指数 39 | """ 40 | 41 | # ********* Begin *********# 42 | class_set = set(pred) 43 | return 1 / len(class_set) * sum( 44 | max( 45 | (avg(feature, pred, i) + avg(feature, pred, j)) / d_cen(feature, pred, i, j) 46 | for j in class_set if j != i) 47 | for i in class_set) 48 | # ********* End *********# 49 | 50 | 51 | def calc_DI(feature, pred): 52 | """ 53 | 计算并返回Dunn指数 54 | :param feature: 待聚类数据的特征,类型为`ndarray` 55 | :param pred: 聚类后数据所对应的簇,类型为`ndarray` 56 | :return: Dunn指数 57 | """ 58 | 59 | # ********* Begin *********# 60 | class_set = list(set(pred)) 61 | m = len(class_set) 62 | lower = max(diam(feature, pred, c) for c in class_set) 63 | return min(d_min(feature, pred, class_set[i], class_set[j]) 64 | for i in range(m) for j in range(i+1, m)) / lower 65 | # ********* End *********# 66 | 67 | 68 | -------------------------------------------------------------------------------- /14. 
随机森林/RandomForest.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | # 建议代码,也算是Begin-End中的一部分 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | 8 | class RandomForestClassifier(): 9 | def __init__(self, n_model=10): 10 | ''' 11 | 初始化函数 12 | ''' 13 | # 分类器的数量,默认为10 14 | self.n_model = n_model 15 | # 用于保存模型的列表,训练好分类器后将对象append进去即可 16 | self.models = [] 17 | # 用于保存决策树训练时随机选取的列的索引 18 | self.col_indexs = [] 19 | self.feature_k = 3 20 | 21 | def fit(self, feature, label): 22 | """ 23 | 训练模型 24 | :param feature: 训练集数据,类型为ndarray 25 | :param label: 训练集标签,类型为ndarray 26 | :return: None 27 | """ 28 | 29 | def random_sampling(X, y): 30 | """ 31 | 自助采样 32 | :param X: 33 | :param y: 34 | :return: 自助采样之后的结果 35 | """ 36 | m, n = np.shape(X) 37 | # 有放回抽取 38 | row_indexes = [random.randint(0, m - 1) for _ in range(m)] 39 | # 选取随机k个特征 40 | col_indexes = random.sample(range(n), self.feature_k) 41 | 42 | X_res = [[X[index][col] for col in col_indexes] for index in row_indexes] 43 | y_res = [y[index] for index in row_indexes] 44 | return X_res, y_res, col_indexes 45 | 46 | for i in range(self.n_model): 47 | X, y, cols = random_sampling(feature, label) 48 | self.col_indexs.append(cols) 49 | self.models.append(DecisionTreeClassifier(max_depth=4).fit(X, y)) 50 | 51 | def predict(self, feature): 52 | ''' 53 | :param feature:测试集数据,类型为ndarray 54 | :return:预测结果,类型为ndarray,如np.array([0, 1, 2, 2, 1, 0]) 55 | ''' 56 | # ************* Begin ************# 57 | tmp_arr = np.transpose( 58 | [clf.predict(np.array(feature[:, self.col_indexs[i]])) for i, clf in enumerate(self.models)]) 59 | predict = [] 60 | for row in tmp_arr: 61 | di = {} 62 | for item in row: 63 | if item not in di.keys(): 64 | di[item] = 1 65 | else: 66 | di[item] += 1 67 | predict.append(list(max(di.items(), key=lambda d: d[1]))[0]) 68 | return predict 69 | # ************* End **************# 70 | -------------------------------------------------------------------------------- /12. EM算法/3_EM_main_iteration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import stats 3 | 4 | 5 | def em_single(init_values, observations): 6 | """ 7 | 模拟抛掷硬币实验并估计在一次迭代中,硬币A与硬币B正面朝上的概率。请不要修改!! 
8 | :param init_values:硬币A与硬币B正面朝上的概率的初始值,类型为list,如[0.2, 0.7]代表硬币A正面朝上的概率为0.2,硬币B正面朝上的概率为0.7。 9 | :param observations:抛掷硬币的实验结果记录,类型为list。 10 | :return:将估计出来的硬币A和硬币B正面朝上的概率组成list返回。如[0.4, 0.6]表示你认为硬币A正面朝上的概率为0.4,硬币B正面朝上的概率为0.6。 11 | """ 12 | observations = np.array(observations) 13 | counts = {'A': {'H': 0, 'T': 0}, 'B': {'H': 0, 'T': 0}} 14 | theta_A = init_values[0] 15 | theta_B = init_values[1] 16 | # E step 17 | for observation in observations: 18 | len_observation = len(observation) 19 | num_heads = observation.sum() 20 | num_tails = len_observation - num_heads 21 | # 两个二项分布 22 | contribution_A = stats.binom.pmf(num_heads, len_observation, theta_A) 23 | contribution_B = stats.binom.pmf(num_heads, len_observation, theta_B) 24 | weight_A = contribution_A / (contribution_A + contribution_B) 25 | weight_B = contribution_B / (contribution_A + contribution_B) 26 | # 更新在当前参数下A、B硬币产生的正反面次数 27 | counts['A']['H'] += weight_A * num_heads 28 | counts['A']['T'] += weight_A * num_tails 29 | counts['B']['H'] += weight_B * num_heads 30 | counts['B']['T'] += weight_B * num_tails 31 | # M step 32 | new_theta_A = counts['A']['H'] / (counts['A']['H'] + counts['A']['T']) 33 | new_theta_B = counts['B']['H'] / (counts['B']['H'] + counts['B']['T']) 34 | return np.array([new_theta_A, new_theta_B]) 35 | 36 | 37 | def em(observations, thetas, tol=1e-4, iterations=100): 38 | """ 39 | 模拟抛掷硬币实验并使用EM算法估计硬币A与硬币B正面朝上的概率。 40 | :param observations: 抛掷硬币的实验结果记录,类型为list。 41 | :param thetas: 硬币A与硬币B正面朝上的概率的初始值,类型为list,如[0.2, 0.7]代表硬币A正面朝上的概率为0.2,硬币B正面朝上的概率为0.7。 42 | :param tol: 差异容忍度,即当EM算法估计出来的参数theta不怎么变化时,可以提前挑出循环。例如容忍度为1e-4,则表示若这次迭代的估计结果与上一次迭代的估计结果之间的L1距离小于1e-4则跳出循环。为了正确的评测,请不要修改该值。 43 | :param iterations: EM算法的最大迭代次数。为了正确的评测,请不要修改该值。 44 | :return: 将估计出来的硬币A和硬币B正面朝上的概率组成list或者ndarray返回。如[0.4, 0.6]表示你认为硬币A正面朝上的概率为0.4,硬币B正面朝上的概率为0.6。 45 | """ 46 | 47 | # ********* Begin *********# 48 | old_theta = np.array(thetas) 49 | for _ in range(iterations): 50 | new_theta = em_single(old_theta, observations) 51 | if sum(np.abs(old_theta - new_theta)) < tol: 52 | break 53 | old_theta = new_theta 54 | return old_theta 55 | # ********* End *********# 56 | -------------------------------------------------------------------------------- /2. 
模型评估与选择/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score 3 | 4 | 5 | def confusion_matrix(y_true, y_predict): 6 | ''' 7 | 构建二分类的混淆矩阵,并将其返回 8 | :param y_true: 真实类别,类型为ndarray 9 | :param y_predict: 预测类别,类型为ndarray 10 | :return: 二维list或shape为(2, 2)的ndarray 11 | ''' 12 | ans = [[0, 0], [0, 0]] 13 | for i in range(len(y_predict)): 14 | ans[y_true[i]][y_predict[i]] += 1 15 | return np.array(ans) 16 | 17 | 18 | def precision_score_(y_true, y_predict): 19 | ''' 20 | 计算精准率并返回 21 | :param y_true: 真实类别,类型为ndarray 22 | :param y_predict: 预测类别,类型为ndarray 23 | :return: 精准率,类型为float 24 | ''' 25 | arr = confusion_matrix(y_true=y_true, y_predict=y_predict) 26 | return arr[1][1] / (arr[1][1] + arr[0][1]) 27 | 28 | 29 | def recall_score_(y_true, y_predict): 30 | ''' 31 | 计算召回率并召回 32 | :param y_true: 真实类别,类型为ndarray 33 | :param y_predict: 预测类别,类型为ndarray 34 | :return: 召回率,类型为float 35 | ''' 36 | arr = confusion_matrix(y_true=y_true, y_predict=y_predict) 37 | return arr[1][1] / (arr[1][1] + arr[1][0]) 38 | 39 | 40 | def calAUC(prob, labels): 41 | ''' 42 | 计算AUC并返回 43 | :param prob: 模型预测样本为Positive的概率列表,类型为ndarray 44 | :param labels: 样本的真实类别列表,其中1表示Positive,0表示Negtive,类型为ndarray 45 | :return: AUC,类型为float 46 | ''' 47 | M = len([_ for _ in labels if _ == 1]) 48 | N = len(labels) - M 49 | 50 | # i of the sorted arr,labels 51 | rank = [] 52 | for i, formal_index in enumerate(np.argsort(prob)): 53 | rank_item = i + 1 54 | rate = prob[formal_index] 55 | if labels[formal_index] == 1: 56 | if formal_index > 0 and prob[formal_index - 1] == rate and labels[formal_index - 1] == 0: 57 | rank.append(rank_item - 0.5) 58 | elif formal_index < len(prob) - 1 and prob[formal_index + 1] == rate and labels[formal_index + 1] == 0: 59 | rank.append(rank_item + 0.5) 60 | else: 61 | rank.append(rank_item) 62 | return (np.sum(rank) - (M + 1) * M / 2) / (M * N) 63 | 64 | 65 | def classification_performance(y_true, y_pred, y_prob): 66 | ''' 67 | 返回准确度、精准率、召回率、f1 Score和AUC 68 | :param y_true:样本的真实类别,类型为`ndarray` 69 | :param y_pred:模型预测出的类别,类型为`ndarray` 70 | :param y_prob:模型预测样本为`Positive`的概率,类型为`ndarray` 71 | :return: 72 | ''' 73 | return accuracy_score(y_true, y_pred), precision_score(y_true, y_pred), recall_score(y_true, y_pred), \ 74 | f1_score(y_true, y_pred), roc_auc_score(y_true, y_prob) 75 | 76 | 77 | -------------------------------------------------------------------------------- /9. 
神经网络/4_pytorch_mnist.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | import torch.utils.data as Data 6 | import torchvision 7 | import os 8 | 9 | if os.path.exists('./step3/cnn.pkl'): 10 | os.remove('./step3/cnn.pkl') 11 | 12 | # 加载数据 13 | train_data = torchvision.datasets.MNIST( 14 | root='./step3/mnist/', 15 | train=True, # this is training data 16 | transform=torchvision.transforms.ToTensor(), 17 | # Converts a PIL.Image or numpy.ndarray to 18 | download=False, 19 | ) 20 | # 取6000个样本为训练集 21 | train_data_tiny = [] 22 | 23 | for i in range(6000): 24 | train_data_tiny.append(train_data[i]) 25 | 26 | train_data = train_data_tiny 27 | 28 | # ********* Begin *********# 29 | train_loader = Data.DataLoader( 30 | dataset=train_data, 31 | batch_size=64, 32 | num_workers=2, 33 | shuffle=True 34 | ) 35 | 36 | 37 | # 构建卷积神经网络模型 38 | class CNN(nn.Module): 39 | def __init__(self): 40 | super(CNN, self).__init__() 41 | self.conv1 = nn.Sequential( # input shape (1, 28, 28) 42 | nn.Conv2d( 43 | in_channels=1, # input height 44 | out_channels=16, # n_filters 45 | kernel_size=5, # filter size 46 | stride=1, # filter movement/step 47 | padding=2, 48 | # if want same width and length of this image after con2d, padding=(kernel_size-1)/2 if stride=1 49 | ), # output shape (16, 28, 28) 50 | nn.ReLU(), # activation 51 | nn.MaxPool2d(kernel_size=2), # choose max value in 2x2 area, output shape (16, 14, 14) 52 | ) 53 | self.conv2 = nn.Sequential( # input shape (16, 14, 14) 54 | nn.Conv2d(16, 32, 5, 1, 2), # output shape (32, 14, 14) 55 | nn.ReLU(), # activation 56 | nn.MaxPool2d(2), # output shape (32, 7, 7) 57 | ) 58 | self.out = nn.Linear(32 * 7 * 7, 10) # fully connected layer, output 10 classes 59 | 60 | def forward(self, x): 61 | x = self.conv1(x) 62 | x = self.conv2(x) 63 | x = x.view(x.size(0), -1) # flatten the output of conv2 to (batch_size, 32 * 7 * 7) 64 | output = self.out(x) 65 | return output 66 | 67 | 68 | cnn = CNN() 69 | 70 | # SGD表示使用随机梯度下降方法,lr为学习率,momentum为动量项系数 71 | optimizer = torch.optim.SGD(cnn.parameters(), lr=0.01, momentum=0.9) 72 | # 交叉熵损失函数 73 | loss_func = nn.CrossEntropyLoss() 74 | 75 | EPOCH = 3 76 | for e in range(EPOCH): 77 | for x, y in train_loader: 78 | batch_x = Variable(x) 79 | batch_y = Variable(y) 80 | 81 | outputs = cnn(batch_x) 82 | 83 | loss = loss_func(outputs, batch_y) 84 | optimizer.zero_grad() 85 | loss.backward() 86 | optimizer.step() 87 | 88 | # ********* End *********# 89 | # 保存模型 90 | torch.save(cnn.state_dict(), './step3/cnn.pkl') 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /16. k-means/3.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import numpy as np 3 | 4 | 5 | # 计算一个样本与数据集中所有样本的欧氏距离的平方 6 | def euclidean_distance(one_sample, X): 7 | one_sample = one_sample.reshape(1, -1) 8 | distances = np.power(np.tile(one_sample, (X.shape[0], 1)) - X, 2).sum(axis=1) 9 | return distances 10 | 11 | 12 | def cal_dis(old_centroids, centroids): 13 | dis = 0 14 | for i in range(old_centroids.shape[0]): 15 | dis += np.linalg.norm(old_centroids[i] - centroids[i], 2) 16 | return dis 17 | 18 | 19 | class Kmeans(): 20 | """Kmeans聚类算法. 21 | Parameters: 22 | ----------- 23 | k: int 24 | 聚类的数目. 
/16. k-means/3.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


# squared euclidean distance between one sample and every sample in X
def euclidean_distance(one_sample, X):
    one_sample = one_sample.reshape(1, -1)
    distances = np.power(np.tile(one_sample, (X.shape[0], 1)) - X, 2).sum(axis=1)
    return distances


def cal_dis(old_centroids, centroids):
    dis = 0
    for i in range(old_centroids.shape[0]):
        dis += np.linalg.norm(old_centroids[i] - centroids[i], 2)
    return dis


class Kmeans():
    """Kmeans clustering.
    Parameters:
    -----------
    k: int
        number of clusters.
    max_iterations: int
        maximum number of iterations.
    varepsilon: float
        convergence threshold: the algorithm is considered converged when
        every one of the k centroids has moved less than varepsilon since
        the previous iteration
    """

    def __init__(self, k=2, max_iterations=500, varepsilon=0.0001):
        self.k = k
        self.max_iterations = max_iterations
        self.varepsilon = varepsilon
        np.random.seed(1)

    # ********* Begin *********#
    # pick self.k random samples as the initial centroids
    def init_random_centroids(self, X):
        m, n = X.shape
        center = np.zeros((self.k, n))
        for i in range(self.k):
            index = int(np.random.uniform(0, m))
            center[i] = X[index]
        return center

    # return the index [0, self.k) of the centroid closest to the sample
    def _closest_centroid(self, sample, centroids):
        distances = euclidean_distance(sample, centroids)
        return np.argmin(distances)

    # assign every sample to its closest centroid
    def create_clusters(self, centroids, X):
        m, n = X.shape
        clusters = np.zeros((m, 1))
        for i in range(m):
            index = self._closest_centroid(X[i], centroids)
            clusters[i] = index
        return clusters

    # update the centroids
    def update_centroids(self, clusters, X):
        centroids = np.zeros([self.k, X.shape[1]])
        for i in range(self.k):
            pointsInCluster = []
            for j in range(clusters.shape[0]):
                if clusters[j] == i:
                    pointsInCluster.append(X[j])
            if pointsInCluster:  # guard against empty clusters
                centroids[i] = np.mean(pointsInCluster, axis=0)  # column-wise mean
        return centroids

    # the index of the cluster a sample belongs to is its label
    def get_cluster_labels(self, clusters, X):
        return np.array(clusters).reshape(X.shape[0])

    # run Kmeans on the whole dataset X and return the cluster labels
    def predict(self, X):
        # pick self.k random samples as the initial centroids
        centroids = self.init_random_centroids(X)
        clusters = []
        iter = 0
        # iterate until the algorithm converges (the centroids barely move
        # between two iterations) or the maximum number of iterations is reached
        while iter < self.max_iterations:
            iter += 1

            # assign every sample to its closest centroid
            clusters = self.create_clusters(centroids, X)

            # compute the new centroids
            old_centroids = centroids.copy()
            centroids = self.update_centroids(clusters, X)
            if cal_dis(old_centroids, centroids) < self.varepsilon:
                break

        return np.array(clusters).reshape([X.shape[0], ])

    # ********* End *********#
--------------------------------------------------------------------------------
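(Editor-added demo, written as if appended to the end of 3.py; the two Gaussian blobs are synthetic data invented for illustration.)

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + [5, 5]])
    model = Kmeans(k=2, max_iterations=100, varepsilon=1e-4)
    labels = model.predict(X)
    # the two blobs are well separated, so each should land in its own cluster
    print(labels[:5], labels[-5:])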
/13. AdaBoost/2.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier


# AdaBoost algorithm
class AdaBoost:
    '''
    input:n_estimators(int): number of boosting rounds
          learning_rate(float): shrinkage coefficient for the weak classifiers
    '''

    def __init__(self, n_estimators=50, learning_rate=1.0):
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        self.X = datasets
        self.Y = labels
        self.M, self.N = datasets.shape
        # collection of weak classifiers
        self.clf_sets = []
        # initialize the sample weights
        self.weights = [1.0 / self.M] * self.M
        # alpha coefficients of G(x)
        self.alpha = []

    # ********* Begin *********#
    def _G(self, features, labels, weights):
        '''
        input:features(ndarray): data features
              labels(ndarray): data labels
              weights(ndarray): sample weights
        '''
        # weighted error: total weight of the samples the current ensemble
        # misclassifies (the ensemble returns a score, so compare its sign)
        e = 0
        for i in range(weights.shape[0]):
            if labels[i] != np.sign(self.G(features[i], self.clf_sets, self.alpha)):
                e += weights[i]
        return e

    # compute alpha
    def _alpha(self, error):
        return 0.5 * np.log((1 - error) / error)

    # normalization factor
    def _Z(self, weights, a, clf):
        return np.sum([weights[i] * np.exp(-a * self.Y[i] * self.G(self.X[i], clf, self.alpha))
                       for i in range(self.M)])

    # weight update
    def _w(self, a, clf, Z):
        w = np.zeros(len(self.weights))
        for i in range(self.M):
            w[i] = self.weights[i] * np.exp(-a * self.Y[i] * self.G(self.X[i], clf, self.alpha)) / Z
        self.weights = w

    # linear combination of the weak classifiers: G(x) = sum_i alpha_i * g_i(x)
    def G(self, x, v, direct):
        result = 0
        x = x.reshape(1, -1)
        for i in range(len(v)):
            result += v[i].predict(x) * direct[i]
        return result

    def fit(self, X, y):
        '''
        X(ndarray): training data
        y(ndarray): training labels
        '''
        self.init_args(X, y)
        # the hand-rolled boosting loop below was left disabled by the author;
        # predict() falls back to sklearn's AdaBoostClassifier instead
        '''
        for i in range(100):
            classifier = DecisionTreeClassifier(max_depth=3)
            classifier.fit(X, y)
            self.clf_sets.append(classifier)
            e = 0
            for i in range(len(self.weights)):
                temp = -1
                if classifier.predict(X[i].reshape(1, -1)) > 0:
                    temp = 1
                if self.Y[i] == temp:
                    e += self.weights[i]
            a = self._alpha(e)
            self.alpha.append(a)
            z = self._Z(self.weights, a, self.clf_sets)
            self._w(a, self.clf_sets, z)
        '''

        # record the classifiers

        # normalization factor

        # weight update

    def predict(self, data):
        '''
        input:data(ndarray): a single sample
        output: returns +1 for a positive prediction, -1 for a negative one
        '''
        # n_estimators and learning_rate are hard-coded here and deliberately
        # override the constructor arguments
        ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.1)
        ada.fit(self.X, self.Y)
        data = data.reshape(1, -1)
        predict = ada.predict(data)
        return predict[0]

    # ********* End *********#
--------------------------------------------------------------------------------
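(Editor-added sketch, written as if appended to 2.py; the six one-dimensional samples with ±1 labels are invented for illustration.)

if __name__ == '__main__':
    X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
    y = np.array([-1, -1, -1, 1, 1, 1])
    clf = AdaBoost()
    clf.fit(X, y)
    print(clf.predict(np.array([0.5])))  # expected -1
    print(clf.predict(np.array([4.5])))  # expected +1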
/5. 多分类学习/OvR.py:
--------------------------------------------------------------------------------
import numpy as np


# logistic regression
class tiny_logistic_regression(object):
    def __init__(self):
        # W
        self.coef_ = None
        # b
        self.intercept_ = None
        # all of W and b
        self._theta = None

    def _sigmoid(self, x):
        return 1. / (1. + np.exp(-x))

    # train; values in train_labels must be 0 or 1
    def fit(self, train_datas, train_labels, learning_rate=1e-4, n_iters=1e3):
        # loss
        def J(theta, X_b, y):
            y_hat = self._sigmoid(X_b.dot(theta))
            try:
                return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
            except:
                return float('inf')

        # partial derivative of the loss w.r.t. theta
        def dJ(theta, X_b, y):
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)

        # batch gradient descent
        def gradient_descent(X_b, y, initial_theta, learning_rate, n_iters=1e2, epsilon=1e-6):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - learning_rate * gradient
                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break
                cur_iter += 1
            return theta

        X_b = np.hstack([np.ones((len(train_datas), 1)), train_datas])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, train_labels, initial_theta, learning_rate, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    # probability that each sample in X has label 1
    def predict_proba(self, X):
        X_b = np.hstack([np.ones((len(X), 1)), X])
        return self._sigmoid(X_b.dot(self._theta))

    # predict
    def predict(self, X):
        proba = self.predict_proba(X)
        result = np.array(proba >= 0.5, dtype='int')
        return result


class OvR(object):
    def __init__(self):
        # list of the models trained in fit
        self.models = []
        # real_label[i] is the true label treated as positive by model i;
        # e.g. if model 0 takes label 2 as its positive class, real_label[0] = 2
        self.real_label = []

    def fit(self, train_datas, train_labels):
        '''
        OvR training: store one binary model per class in self.models
        :param train_datas: training data, ndarray
        :param train_labels: training labels, integers such as 0, 1, 2; ndarray of shape (-1,)
        :return: None
        '''
        # the classes are hard-coded to 0, 1, 2 to match the OJ data
        self.real_label = [0, 1, 2]
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, 0)
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, 1)
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, 2)

    def generate_one(self, tr, train_datas, train_labels, one):
        train_datas_ = []
        train_labels_ = []
        for i, item in enumerate(train_labels):
            train_datas_.append(train_datas[i])
            # tiny_logistic_regression expects 0/1 labels, so the "rest"
            # classes are mapped to 0
            train_labels_.append(1 if item == one else 0)
        self.models.append(tr.fit(train_datas=np.array(train_datas_), train_labels=np.array(train_labels_)))

    def predict(self, test_datas):
        '''
        OvR prediction
        :param test_datas: test data, ndarray
        :return: predictions, ndarray
        '''
        ans = []
        probs = []
        for i, classifier in enumerate(self.models):
            probs.append(classifier.predict_proba(test_datas))

        for col in range(len(probs[0])):
            pro_arr = [probs[0][col], probs[1][col], probs[2][col]]
            # pick the class whose one-vs-rest model is the most confident
            ans.append(int(np.argmax(pro_arr)))
        return np.array(ans)
--------------------------------------------------------------------------------
/5. 多分类学习/OvO.py:
--------------------------------------------------------------------------------
import numpy as np


# logistic regression
class tiny_logistic_regression(object):
    def __init__(self):
        # W
        self.coef_ = None
        # b
        self.intercept_ = None
        # all of W and b
        self._theta = None
        # maps 0/1 back to the original labels
        self.label_map = {}

    def _sigmoid(self, x):
        return 1. / (1. + np.exp(-x))

    # train; values in train_labels may be arbitrary
    def fit(self, train_datas, train_labels, learning_rate=1e-4, n_iters=1e3):
        # loss
        def J(theta, X_b, y):
            y_hat = self._sigmoid(X_b.dot(theta))
            try:
                return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
            except:
                return float('inf')

        # partial derivative of the loss w.r.t. theta
        def dJ(theta, X_b, y):
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)

        # batch gradient descent
        def gradient_descent(X_b, y, initial_theta, learning_rate, n_iters=1e2, epsilon=1e-6):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - learning_rate * gradient
                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break
                cur_iter += 1
            return theta

        unique_labels = list(set(train_labels))
        labels = train_labels.copy()

        # map the two labels to 0 and 1
        self.label_map[0] = unique_labels[0]
        labels[train_labels == unique_labels[0]] = 0
        self.label_map[1] = unique_labels[1]
        labels[train_labels == unique_labels[1]] = 1

        X_b = np.hstack([np.ones((len(train_datas), 1)), train_datas])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, labels, initial_theta, learning_rate, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    # probability that each sample in X has label 1
    def predict_proba(self, X):
        X_b = np.hstack([np.ones((len(X), 1)), X])
        return self._sigmoid(X_b.dot(self._theta))

    # predict
    def predict(self, X):
        proba = self.predict_proba(X)
        result = np.array(proba >= 0.5, dtype='int')
        # map 0/1 back to the original labels
        for i in range(len(result)):
            if result[i] == 0:
                result[i] = self.label_map[0]
            else:
                result[i] = self.label_map[1]
        return result


class OvO(object):
    def __init__(self):
        # list of the models trained in fit
        self.models = []

    def fit(self, train_datas, train_labels):
        '''
        OvO training: store the pairwise models in self.models
        :param train_datas: training data, ndarray
        :param train_labels: training labels, integers such as 0, 1, 2; ndarray of shape (-1,)
        :return: None
        '''
        # the class pairs are hard-coded to the OJ's three classes
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, (0, 1))
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, (1, 2))
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, (0, 2))

    def generate_one(self, tr, train_datas, train_labels, tup):
        train_datas_ = []
        train_labels_ = []
        for i, item in enumerate(train_labels):
            if item in tup:
                train_datas_.append(train_datas[i])
                train_labels_.append(train_labels[i])
        self.models.append(tr.fit(train_datas=np.array(train_datas_), train_labels=np.array(train_labels_)))

    def predict(self, test_datas):
        '''
        OvO prediction
        :param test_datas: test data, ndarray
        :return: predictions, ndarray
        '''
        pre = []
        ans = []
        for i, classifier in enumerate(self.models):
            predict = classifier.predict(test_datas)
            pre.append(predict)
        for i in range(len(pre[0])):
            a, b, c = pre[0][i], pre[1][i], pre[2][i]
            # with classifiers for the pairs (0,1), (1,2) and (0,2), the median
            # of the three votes equals the majority label whenever at least
            # two classifiers agree; if all three disagree, the middle value
            # is just an arbitrary tie-break
            arr = sorted([a, b, c])
            ans.append(arr[1])
        return np.array(ans)
--------------------------------------------------------------------------------
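(Editor-added sketch, written as if appended to OvO.py; iris is used only because it has three classes labelled 0/1/2, matching the hard-coded pairs above.)

if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
    model = OvO()
    model.fit(X_train, y_train)
    print('OvO accuracy:', accuracy_score(y_test, model.predict(X_test)))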
贝叶斯分类器/simple_byes.py:
--------------------------------------------------------------------------------
import numpy as np
import sklearn.datasets as db
from sklearn.metrics import accuracy_score


class NaiveBayesClassifier(object):
    def __init__(self):
        '''
        self.label_prob stores the probability of each class in the data;
        e.g. {0: 0.333, 1: 0.667} means class 0 appears with probability
        0.333 and class 1 with probability 0.667
        '''
        self.label_prob = {}     # class priors
        self.label_indexes = {}  # rows of the dataset belonging to each class
        '''
        self.condition_prob stores, for each class, the probability of every
        feature value given that class.
        For example, if the training features are [[2, 1, 1],
                                                   [1, 2, 2],
                                                   [2, 2, 2],
                                                   [2, 1, 2],
                                                   [1, 2, 3]]
        and the labels are [1, 0, 1, 0, 1], then:
        given label 0, column 0 takes value 1 with probability 0.5 and value 2 with probability 0.5;
        given label 0, column 1 takes value 1 with probability 0.5 and value 2 with probability 0.5;
        given label 0, column 2 takes value 1 with probability 0, value 2 with probability 1, value 3 with probability 0;
        given label 1, column 0 takes value 1 with probability 0.333 and value 2 with probability 0.666;
        given label 1, column 1 takes value 1 with probability 0.333 and value 2 with probability 0.666;
        given label 1, column 2 takes values 1, 2 and 3 each with probability 0.333.
        So self.condition_prob looks like:
        {
            0: {0: {1: 0.5, 2: 0.5},
                1: {1: 0.5, 2: 0.5},
                2: {1: 0, 2: 1, 3: 0}},
            1: {0: {1: 0.333, 2: 0.666},
                1: {1: 0.333, 2: 0.666},
                2: {1: 0.333, 2: 0.333, 3: 0.333}}
        }
        '''
        self.condition_prob = {}

    def fit(self, feature, label):
        """
        Train the model, storing the probabilities in self.label_prob and self.condition_prob
        :param feature: ndarray of all training features
        :param label: ndarray of all training labels
        :return: None
        """

        def store_prop():
            m = len(feature)     # number of rows
            n = len(feature[0])  # number of columns
            for i, item in enumerate(label):
                if item not in self.label_indexes.keys():
                    self.label_indexes[item] = [i]
                else:
                    self.label_indexes[item].append(i)
            for labelItem in self.label_indexes.keys():
                # plain frequency estimate of the prior
                # (the Laplace-corrected version lives in Laplace.py)
                self.label_prob[labelItem] = len(self.label_indexes[labelItem]) / m
            # ------------------------------
            # store the conditional probabilities
            for labelItem in self.label_indexes.keys():  # for every label
                self.condition_prob[labelItem] = {}
                # subRows = feature[self.label_indexes[labelItem]]  # rows belonging to this label
                subRows = [row for i, row in enumerate(feature)
                           if i in self.label_indexes[labelItem]]
                for i in range(n):  # for every column (x_i)
                    tmpDic = {}
                    for row in subRows:
                        if row[i] not in tmpDic.keys():
                            tmpDic[row[i]] = 1
                        else:
                            tmpDic[row[i]] += 1
                    for k, v in tmpDic.items():
                        tmpDic[k] = v / len(subRows)
                    self.condition_prob[labelItem][i] = tmpDic

        store_prop()
        return self

    def predict(self, feature):
        """
        Predict labels for the given samples
        :param feature: ndarray of all test features
        :return: ndarray of predicted labels
        """
        result = []
        # predict every test sample
        for i, f in enumerate(feature):
            # probability of each candidate class
            prob = np.zeros(len(self.label_prob.keys()))
            ii = 0
            for label, label_prob in self.label_prob.items():
                # multiply the prior by the conditional probabilities; a value
                # never seen with this class contributes probability 0
                prob[ii] = label_prob
                for j in range(len(feature[0])):
                    prob[ii] *= self.condition_prob[label][j][f[j]] if f[j] in self.condition_prob[label][j].keys() else 0
                ii += 1
            # pick the class with the largest probability
            result.append(list(self.label_prob.keys())[np.argmax(prob)])
        return np.array(result)


# boston = db.load_iris()
# X = boston.data
# y = boston.target
X = [[2, 1, 1],
     [1, 2, 2],
     [2, 2, 2],
     [2, 1, 2],
     [1, 2, 3]]
y = [1, 0, 1, 0, 1]
bayes = NaiveBayesClassifier()

bayes.fit(X, y)
predict = bayes.predict(X)
print(accuracy_score(y, predict))
--------------------------------------------------------------------------------
/11. 贝叶斯分类器/Laplace.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.metrics import accuracy_score


class NaiveBayesClassifier(object):
    def __init__(self):
        '''
        self.label_prob stores the probability of each class in the data;
        e.g. {0: 0.333, 1: 0.667} means class 0 appears with probability
        0.333 and class 1 with probability 0.667
        '''
        self.label_prob = {}     # class priors
        self.label_indexes = {}  # rows of the dataset belonging to each class
        '''
        self.condition_prob stores, for each class, the probability of every
        feature value given that class; see simple_byes.py for a worked
        example of its layout
        '''
        self.condition_prob = {}

    def fit(self, feature, label):
        """
        Train the model, storing the probabilities in self.label_prob and self.condition_prob
        :param feature: ndarray of all training features
        :param label: ndarray of all training labels
        :return: None
        """

        def store_prop():
            m = len(feature)     # number of rows
            n = len(feature[0])  # number of columns
            for i, item in enumerate(label):
                if item not in self.label_indexes.keys():
                    self.label_indexes[item] = [i]
                else:
                    self.label_indexes[item].append(i)
            for labelItem in self.label_indexes.keys():
                # Laplace correction: P(c) = (|D_c| + 1) / (|D| + number of classes)
                self.label_prob[labelItem] = (len(self.label_indexes[labelItem]) + 1) / (
                        m + len(self.label_indexes.keys()))
                # without the Laplace correction this would be:
                # self.label_prob[labelItem] = len(self.label_indexes[labelItem]) / m
            # ------------------------------
            # store the conditional probabilities
            for labelItem in self.label_indexes.keys():  # for every label
                self.condition_prob[labelItem] = {}
                # subRows = feature[self.label_indexes[labelItem]]  # rows belonging to this label
                subRows = [row for i, row in enumerate(feature)
                           if i in self.label_indexes[labelItem]]
                for i in range(n):  # for every column (x_i)
                    # the value domain of each column is hard-coded to match the
                    # OJ data: column 2 takes values {1, 2, 3}, the others {1, 2}
                    if i == 2:
                        tmpDic = {1: 0, 2: 0, 3: 0}
                    else:
                        tmpDic = {1: 0, 2: 0}

                    for row in subRows:
                        if row[i] not in tmpDic.keys():
                            tmpDic[row[i]] = 1
                        else:
                            tmpDic[row[i]] += 1
                    count = len(list(tmpDic.values()))  # N_i: number of possible values of column i
                    for k, v in tmpDic.items():
                        # Laplace correction: P(x_i|c) = (|D_{c,x_i}| + 1) / (|D_c| + N_i)
                        tmpDic[k] = (v + 1) / (len(subRows) + count)
                    self.condition_prob[labelItem][i] = tmpDic

        store_prop()
        return self

    def predict(self, feature):
        '''
        Predict labels for the given samples
        :param feature: ndarray of all test features
        :return: ndarray of predicted labels
        '''
        result = []
        # predict every test sample
        for i, f in enumerate(feature):
            # probability of each candidate class
            prob = np.zeros(len(self.label_prob.keys()))
            ii = 0
            for label, label_prob in self.label_prob.items():
                # multiply the prior by the (smoothed) conditional probabilities
                prob[ii] = label_prob
                for j in range(len(feature[0])):
                    prob[ii] *= self.condition_prob[label][j][f[j]]
                ii += 1
            # pick the class with the largest probability
            result.append(list(self.label_prob.keys())[np.argmax(prob)])
        return np.array(result)


# boston = db.load_iris()
# X = boston.data
# y = boston.target
X = [[1, 2, 3],
     [1, 1, 3],
     [2, 1, 3],
     [2, 2, 1],
     [2, 2, 2],
     [2, 1, 3],
     [1, 2, 3],
     [1, 2, 3],
     [1, 2, 3],
     [1, 2, 3],
     [1, 2, 3],
     [1, 2, 3]]
# only the first five samples are labelled in this demo, so the fit and the
# accuracy check are restricted to them
y = [1, 0, 1, 0, 1]
bayes = NaiveBayesClassifier()

bayes.fit(X[:len(y)], y)
predict = bayes.predict(X[:len(y)])
print(accuracy_score(y, predict))
--------------------------------------------------------------------------------
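(Editor-added sketch, not part of the original repo: a self-contained hand check of why the Laplace correction matters, using the small dataset from simple_byes.py.)

# Value 3 in column 2 never co-occurs with label 0, so the plain frequency
# estimate gives P(x_2 = 3 | c = 0) = 0 and wipes out that class's whole
# posterior, while the Laplace correction keeps it positive.
X = [[2, 1, 1],
     [1, 2, 2],
     [2, 2, 2],
     [2, 1, 2],
     [1, 2, 3]]
y = [1, 0, 1, 0, 1]

rows_label0 = [row for row, lab in zip(X, y) if lab == 0]
count_val3 = sum(1 for row in rows_label0 if row[2] == 3)
print(count_val3 / len(rows_label0))              # 0.0  (plain estimate)
print((count_val3 + 1) / (len(rows_label0) + 3))  # 0.2  (Laplace, N_i = 3)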