├── 11. 贝叶斯分类器
├── __init__.py
├── sk.py
├── simple_byes.py
└── Laplace.py
├── 17. 高斯混合聚类
├── 1.md
├── result.jpg
├── test.jpg
└── 3.py
├── 9. 神经网络
├── 1_基本概念.md
├── 2_activation_function.py
├── 3_backpropagation.py
└── 4_pytorch_mnist.py
├── 12. EM算法
├── 1_极大似然估计.md
├── 2_EM_single_iteration.py
└── 3_EM_main_iteration.py
├── 21. PCA
├── 1_维数灾难.md
├── 2_PCA.py
└── 3_sklearn_PCA.py
├── 18. DBSCAN
├── 1_basic_concept.md
├── 3_sklearn_DBSCAN.py
└── 2_DBSCAN_algorithm.py
├── 3. 线性回归
├── 1_简单与多元线性回归.md
├── 4_sklearn_linearRegression.py
├── 2_normal_equation.py
└── 3_metrics.py
├── README.md
├── 16. k-means
├── 1.py
├── 4.py
├── 2.py
└── 3.py
├── 19. AGNES
├── 3.py
├── 1.py
└── 2.py
├── 22. 多维缩放
├── 2.py
└── 1.py
├── .gitignore
├── 6. 线性判别分析
├── 2_sklearn_LDA.py
└── 1_LDA.py
├── 15. 聚类性能评估指标
├── 3_sklearn_metrics.py
├── 1_external_index.py
└── 2_internal_index.py
├── 8. 感知机
├── sk.py
└── preception.py
├── 24. 局部线性嵌入
├── 2_sklearn_LLE.py
└── 1_LLE.py
├── 23 等度量映射
├── sk.py
└── isomap.py
├── 4. 逻辑回归
├── 3.py
├── 5.py
└── 4.py
├── 14. 随机森林
├── 3.Digit.py
├── Bagging.py
└── RandomForest.py
├── 20 KNN
├── sk.py
└── knn.py
├── 13. AdaBoost
├── 3.py
└── 2.py
├── 2. 模型评估与选择
└── main.py
└── 5. 多分类学习
├── OvR.py
└── OvO.py
/11. 贝叶斯分类器/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/17. 高斯混合聚类/1.md:
--------------------------------------------------------------------------------
1 | 1. A B C
2 | 2. A
--------------------------------------------------------------------------------
/9. 神经网络/1_基本概念.md:
--------------------------------------------------------------------------------
1 | 1. c
2 |
3 |
--------------------------------------------------------------------------------
/12. EM算法/1_极大似然估计.md:
--------------------------------------------------------------------------------
1 | 1. A
2 | 2. B
3 | 3. D
--------------------------------------------------------------------------------
/21. PCA/1_维数灾难.md:
--------------------------------------------------------------------------------
1 | ### 1. B,C
2 |
3 | ### 2. C
--------------------------------------------------------------------------------
/18. DBSCAN/1_basic_concept.md:
--------------------------------------------------------------------------------
1 | ### 1. D
2 |
3 |
--------------------------------------------------------------------------------
/3. 线性回归/1_简单与多元线性回归.md:
--------------------------------------------------------------------------------
1 | 1. B, C
2 | 2. A, B, C
3 | 3. A
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # educoder-oj
2 | ## Solutions to the OJ exercises of the Nanjing University machine learning course
3 |
4 | ## Team up and grind the dungeons. Go go go!
5 |
--------------------------------------------------------------------------------
/17. 高斯混合聚类/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nju-ml-course/educoder-oj/HEAD/17. 高斯混合聚类/result.jpg
--------------------------------------------------------------------------------
/17. 高斯混合聚类/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nju-ml-course/educoder-oj/HEAD/17. 高斯混合聚类/test.jpg
--------------------------------------------------------------------------------
/9. 神经网络/2_activation_function.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 |
3 | def relu(x):
4 |     '''
5 |     x: any real number (negative infinity to positive infinity)
6 |     '''
7 | # ********* Begin *********#
8 | if x <= 0:
9 | return 0
10 | else:
11 | return x
12 | # ********* End *********#
13 |
--------------------------------------------------------------------------------
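A minimal sanity check for relu(), assuming the function above is in scope (made-up values, not part of the repo):

    assert relu(-3) == 0   # negative inputs are clamped to zero
    assert relu(0) == 0    # zero maps to zero
    assert relu(5) == 5    # positive inputs pass through unchanged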
/16. k-means/1.py:
--------------------------------------------------------------------------------
1 | #encoding=utf8
2 | import numpy as np
3 |
4 | def distance(x,y,p=2):
5 |     '''
6 |     input: x(ndarray): coordinates of the first sample
7 |            y(ndarray): coordinates of the second sample
8 |            p(int): 1 for Manhattan distance, 2 for Euclidean distance
9 |     output: distance(float): distance from x to y
10 |     '''
11 | #********* Begin *********#
12 | return np.linalg.norm(x-y, p)
13 |
14 | #********* End *********#
--------------------------------------------------------------------------------
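A minimal usage sketch for distance(), with made-up sample points (assumes the function above is in scope):

    import numpy as np
    x = np.array([1.0, 2.0])
    y = np.array([4.0, 6.0])
    print(distance(x, y, p=1))  # Manhattan distance: |1-4| + |2-6| = 7.0
    print(distance(x, y, p=2))  # Euclidean distance: sqrt(9 + 16) = 5.0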
/16. k-means/4.py:
--------------------------------------------------------------------------------
1 | #encoding=utf8
2 | from sklearn.cluster import KMeans
3 |
4 | def kmeans_cluster(data):
5 |     '''
6 |     input: data(ndarray): sample data
7 |     output: result(ndarray): clustering result
8 |     '''
9 | #********* Begin *********#
10 | km = KMeans(n_clusters=3,random_state=888)
11 | result = km.fit_predict(data)
12 |
13 | #********* End *********#
14 | return result
15 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/ignore-files/ for more about ignoring files.
2 |
3 | # dependencies
4 | /node_modules
5 |
6 | # testing
7 | /coverage
8 |
9 | # production
10 | /build
11 |
12 | # misc
13 | .DS_Store
14 | .env.local
15 | .env.development.local
16 | .env.test.local
17 | .env.production.local
18 |
19 |
20 | # idea
21 | .idea
22 |
23 | npm-debug.log*
24 | yarn-debug.log*
25 | yarn-error.log*
26 |
27 | # umi
28 | .umi
--------------------------------------------------------------------------------
/6. 线性判别分析/2_sklearn_LDA.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
3 |
4 |
5 | def lda(x, y):
6 |     """
7 |     input: x(ndarray): data to process
8 |            y(ndarray): labels of the data
9 |     output: x_new(ndarray): data after dimensionality reduction
10 |     """
11 | # ********* Begin *********#
12 | lda = LinearDiscriminantAnalysis(n_components=2)
13 | lda.fit(x, y)
14 | x_new = lda.transform(x)
15 | # ********* End *********#
16 | return x_new
17 |
--------------------------------------------------------------------------------
/22. 多维缩放/2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sklearn.manifold import MDS
3 |
4 |
5 | def mds(data, d):
6 |     '''
7 |     input: data(ndarray): data to reduce
8 |            d(int): target dimensionality
9 |     output: Z(ndarray): data after dimensionality reduction
10 |     '''
11 |     # ********* Begin *********#
12 |     mds = MDS(n_components=d)
13 |     Z = mds.fit_transform(data)
14 |
15 | # ********* End *********#
16 | return Z
17 |
--------------------------------------------------------------------------------
/18. DBSCAN/3_sklearn_DBSCAN.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | from sklearn.cluster import DBSCAN
3 |
4 |
5 | def data_cluster(data):
6 |     """
7 |     input: data(ndarray): data
8 |     output: result(ndarray): clustering result
9 |     """
10 | # ********* Begin *********#
11 | dbscan = DBSCAN(eps=0.5, min_samples=10)
12 | result = dbscan.fit_predict(data)
13 | return result
14 | # ********* End *********#
--------------------------------------------------------------------------------
/15. 聚类性能评估指标/3_sklearn_metrics.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics.cluster import fowlkes_mallows_score, adjusted_rand_score
2 |
3 |
4 | def cluster_performance(y_true, y_pred):
5 |     """
6 |     Return the FM index and the (adjusted) Rand index
7 |     :param y_true: cluster assignment given by the reference model, as ndarray
8 |     :param y_pred: cluster assignment given by the clustering model, as ndarray
9 |     :return: FM index, Rand index
10 |     """
11 |     # ********* Begin *********#
12 |     rand = adjusted_rand_score(y_true, y_pred)
13 |     fm = fowlkes_mallows_score(y_true, y_pred)
14 |     return fm, rand
15 | # ********* End *********#
16 |
--------------------------------------------------------------------------------
/8. 感知机/sk.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.linear_model import Perceptron
3 | 
4 | # Load the training data
5 | train_data = pd.read_csv('./step2/train_data.csv')
6 | # Load the training labels
7 | train_label = pd.read_csv('./step2/train_label.csv')
8 | train_label = train_label['target']
9 | # Load the test data
10 | test_data = pd.read_csv('./step2/test_data.csv')
11 | clf = Perceptron(max_iter=100000)  # max_iter must be an int, not 1e5
12 | clf.fit(train_data, train_label)
13 | result = clf.predict(test_data)
14 | 
15 | pd.DataFrame({'result': result}).to_csv('./step2/result.csv', index=False)
16 | 
--------------------------------------------------------------------------------
/19. AGNES/3.py:
--------------------------------------------------------------------------------
1 | #encoding=utf8
2 | from sklearn.cluster import AgglomerativeClustering
3 |
4 | def Agglomerative_cluster(data):
5 |     '''
6 |     Cluster the wine data
7 |     :param data: dataset, as ndarray
8 |     :return: clustering result, as ndarray
9 |     '''
10 | 
11 |     #********* Begin *********#
12 |     mean = data.mean(axis=0)        # per-feature mean
13 |     deviation = data.std(axis=0)    # per-feature standard deviation
14 |     # Standardization: (value - mean) / standard deviation
15 |     data = (data - mean) / deviation
16 | agnes = AgglomerativeClustering(n_clusters=3)
17 | result = agnes.fit_predict(data)
18 | return result
19 |
20 | #********* End *********#
21 |
--------------------------------------------------------------------------------
/21. PCA/2_PCA.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def pca(data, k):
5 |     """
6 |     Run PCA on data and return the result
7 |     :param data: dataset, as ndarray
8 |     :param k: target number of dimensions, as int
9 |     :return: data after dimensionality reduction, as ndarray
10 |     """
11 | 
12 |     # ********* Begin *********#
13 |     # Center the data (zero mean)
14 | mean = np.mean(data, axis=0)
15 | after_demean = data - mean
16 |
17 | cov = np.cov(after_demean.T)
18 |
19 | value, vector = np.linalg.eig(cov)
20 |
21 | index = np.argsort(-value)[: k]
22 | w = vector[:, index]
23 |
24 | return np.dot(after_demean, w)
25 |
26 | # ********* End *********#
27 |
--------------------------------------------------------------------------------
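A minimal smoke test for pca(), with random made-up data (assumes the function above is in scope; the projection is only defined up to the sign of each eigenvector):

    import numpy as np
    np.random.seed(0)
    data = np.random.rand(100, 5)
    Z = pca(data, k=2)
    print(Z.shape)  # (100, 2): 100 samples projected onto the 2 main components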
/24. 局部线性嵌入/2_sklearn_LLE.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sklearn.manifold import LocallyLinearEmbedding
3 |
4 |
5 | def lle(data, d, k):
6 |     """
7 |     input: data(ndarray): data to reduce
8 |            d(int): target dimensionality
9 |            k(int): number of samples in each neighborhood
10 |     output: Z(ndarray): data after dimensionality reduction
11 |     """
12 | # ********* Begin *********#
13 | lle = LocallyLinearEmbedding(n_components=d, n_neighbors=k)
14 | Z = lle.fit_transform(data)
15 | # ********* End *********#
16 | return Z
17 |
--------------------------------------------------------------------------------
/3. 线性回归/4_sklearn_linearRegression.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | # ********* Begin *********#
3 | from sklearn.linear_model import LinearRegression
4 | import pandas as pd
5 |
6 | # Load the training data
7 | train_data = pd.read_csv('./step3/train_data.csv')
8 | # Load the training labels
9 | train_label = pd.read_csv('./step3/train_label.csv')
10 | train_label = train_label['target']
11 | # Load the test data
12 | test_data = pd.read_csv('./step3/test_data.csv')
13 | 
14 | model = LinearRegression(normalize=True)  # note: normalize was removed in scikit-learn 1.2; kept for the course environment
15 | model.fit(train_data, train_label)
16 | test_y = model.predict(test_data)
17 |
18 | pd.DataFrame(test_y, columns=['result']).to_csv('./step3/result.csv')
19 |
20 | # ********* End *********#
21 |
--------------------------------------------------------------------------------
/23 等度量映射/sk.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sklearn.manifold import Isomap
3 | import isomap as isa
4 | import sklearn.datasets as db
5 |
6 |
7 |
8 | def isomap(data, d, k):
9 |     '''
10 |     input: data(ndarray): data to reduce
11 |            d(int): target dimensionality
12 |            k(int): number of nearest neighbors
13 |     output: Z(ndarray): data after dimensionality reduction
14 |     '''
15 | # ********* Begin *********#
16 | iso = Isomap(n_neighbors=k, n_components=d)
17 | return iso.fit_transform(data)
18 |
19 |
20 | if __name__ == '__main__':
21 |     ir = db.load_boston()  # load_boston was removed in scikit-learn 1.2; this demo targets the older course environment
22 | X1 = isa.isomap(ir.data[:10], d=2, k=4)
23 | X2 = isomap(ir.data[:10], d=2, k=4)
24 | print(X1)
25 | print(X2)
26 |
--------------------------------------------------------------------------------
/4. 逻辑回归/3.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import warnings
5 | warnings.filterwarnings("ignore")
6 |
7 | def gradient_descent(initial_theta,eta=0.05,n_iters=1e3,epslion=1e-8):
8 |     '''
9 |     Gradient descent
10 |     :param initial_theta: initial parameter value, as float
11 |     :param eta: learning rate, as float
12 |     :param n_iters: number of training iterations, as int
13 |     :param epslion: error tolerance, as float
14 |     :return: the parameter after training
15 |     '''
16 |     # Add your implementation here #
17 |     #********** Begin *********#
18 |     i = 0
19 |     while i < n_iters:
20 |         initial_theta = initial_theta - eta*2*(initial_theta-3)  # gradient of (theta - 3)^2 is 2*(theta - 3)
21 |         i += 1
22 | return initial_theta
23 | #********** End **********#
24 |
25 |
26 |
--------------------------------------------------------------------------------
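A minimal check of gradient_descent(), assuming the function above is in scope. The update hard-codes the gradient of f(theta) = (theta - 3)^2, so any starting point should converge to the minimizer 3:

    theta = gradient_descent(initial_theta=0.0)
    print(theta)  # approximately 3.0 after 1000 iterations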
/4. 逻辑回归/5.py:
--------------------------------------------------------------------------------
1 | from sklearn.linear_model import LogisticRegression
2 |
3 | def digit_predict(train_image, train_label, test_image):
4 |     '''
5 |     Train a model and return its predictions
6 |     :param train_image: training sample set, as ndarray with shape [-1, 8, 8]
7 |     :param train_label: labels of the training samples, as ndarray
8 |     :param test_image: test sample set, as ndarray
9 |     :return: predicted labels for test_image
10 |     '''
11 |     #************* Begin ************#
12 |     logreg = LogisticRegression(solver='newton-cg', max_iter=1000, C=1)
13 |     logreg.fit(train_image.reshape(train_image.shape[0], -1), train_label)
14 |     return logreg.predict(test_image.reshape(test_image.shape[0], -1))
15 | #************* End **************#
--------------------------------------------------------------------------------
/14. 随机森林/3.Digit.py:
--------------------------------------------------------------------------------
1 | from sklearn.ensemble import RandomForestClassifier
2 | import numpy as np
3 | import sklearn.datasets as db
4 |
5 |
6 | def digit_predict(train_image, train_label, test_image):
7 |     """
8 |     Train a model and return its predictions
9 |     :param train_image: training sample set, as ndarray with shape [-1, 8, 8]
10 |     :param train_label: labels of the training samples, as ndarray
11 |     :param test_image: test sample set, as ndarray
12 |     :return: predicted labels for test_image, as ndarray
13 |     """
14 |     X = np.reshape(train_image, newshape=(-1, 64))
15 |     clf = RandomForestClassifier(n_estimators=500, max_depth=10)
16 |     clf.fit(X, y=train_label)
17 |     return clf.predict(np.reshape(test_image, newshape=(-1, 64)))  # flatten the test images too, matching the training shape
18 |
19 |
20 | data = db.load_digits()
21 |
--------------------------------------------------------------------------------
/20 KNN/sk.py:
--------------------------------------------------------------------------------
1 | from sklearn.neighbors import KNeighborsClassifier
2 | from sklearn.preprocessing import StandardScaler
3 |
4 |
5 | def classification(train_feature, train_label, test_feature):
6 |     '''
7 |     Classify the wine samples in test_feature
8 |     :param train_feature: training set data, as ndarray
9 |     :param train_label: training set labels, as ndarray
10 |     :param test_feature: test set data, as ndarray
11 |     :return: predicted classes for the test set
12 |     '''
13 | 
14 |     # Instantiate a StandardScaler
15 |     scaler = StandardScaler()
16 |     # Standardize using the mean and std of the training data
17 |     X = scaler.fit_transform(train_feature)
18 |     # Apply the same fitted scaler to the test data
19 |     X_test = scaler.transform(test_feature)
20 | clf = KNeighborsClassifier()
21 | clf.fit(X, train_label)
22 | return clf.predict(X_test)
23 |
--------------------------------------------------------------------------------
/21. PCA/3_sklearn_PCA.py:
--------------------------------------------------------------------------------
1 | from sklearn.decomposition import PCA
2 | from sklearn.svm import LinearSVC
3 |
4 |
5 | def cancer_predict(train_sample, train_label, test_sample):
6 |     """
7 |     Reduce dimensionality with PCA, classify, and return the predictions
8 |     :param train_sample: training samples, as ndarray
9 |     :param train_label: training labels, as ndarray
10 |     :param test_sample: test samples, as ndarray
11 |     :return: classification result
12 |     """
13 |
14 | # ********* Begin *********#
15 | pca = PCA(n_components=11)
16 | train_sample_transformed = pca.fit_transform(train_sample)
17 | test_sample_transformed = pca.transform(test_sample)
18 |
19 | clf = LinearSVC()
20 | clf.fit(train_sample_transformed, train_label)
21 | return clf.predict(test_sample_transformed)
22 |
23 | # ********* End *********#
24 |
--------------------------------------------------------------------------------
/17. 高斯混合聚类/3.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import numpy as np
3 | from sklearn.mixture import GaussianMixture
4 |
5 | # Read the test image into im
6 | im = Image.open('./step3/image/test.jpg')
7 | 
8 | # Convert im to an ndarray
9 | img = np.array(im)
10 | # Reshape img to shape [-1, 3] and save it as img_reshape
11 | img_reshape = img.reshape(-1, 3)
12 | 
13 | # Instantiate a Gaussian mixture model that clusters the data into 3 clusters
14 | gmm = GaussianMixture(3)
15 | # fit estimates the parameters and mixing coefficients of the Gaussian components
16 | gmm.fit(img_reshape)
17 | # Cluster the data; the labels are 0, 1, 2 (gmm was configured for 3 clusters)
18 | pred = gmm.predict(img_reshape)
19 | 
20 | img_reshape[pred == 0, :] = [255, 255, 0]  # yellow
21 | img_reshape[pred == 1, :] = [0, 0, 255]  # blue
22 | img_reshape[pred == 2, :] = [0, 255, 0]  # green
23 | im = Image.fromarray(img.astype('uint8'))  # img_reshape is a view of img, so img was recolored in place
24 | # Save the result image
25 | im.save('./step3/dump/result.jpg')
26 |
--------------------------------------------------------------------------------
/13. AdaBoost/3.py:
--------------------------------------------------------------------------------
1 | #encoding=utf8
2 | from sklearn.tree import DecisionTreeClassifier
3 | from sklearn.ensemble import AdaBoostClassifier
4 | def ada_classifier(train_data,train_label,test_data):
5 |     '''
6 |     input: train_data(ndarray): training data
7 |            train_label(ndarray): training labels
8 |            test_data(ndarray): test data
9 |     output: predict(ndarray): predictions
10 |     '''
11 |     #********* Begin *********#
12 |     ada = AdaBoostClassifier(n_estimators=80, learning_rate=1.0)
13 | ada.fit(train_data,train_label)
14 | predict = ada.predict(test_data)
15 | #********* End *********#
16 | return predict
17 |
--------------------------------------------------------------------------------
/4. 逻辑回归/4.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import warnings
5 | warnings.filterwarnings("ignore")
6 |
7 | def sigmoid(x):
8 |     '''
9 |     The sigmoid function
10 |     :param x: input before the transform
11 |     :return: probability after the transform
12 |     '''
13 |     return 1/(1+np.exp(-x))
14 | 
15 | 
16 | def fit(x,y,eta=1e-3,n_iters=1e4):
17 |     '''
18 |     Train a logistic regression model
19 |     :param x: training set features, as ndarray
20 |     :param y: training set labels, as ndarray
21 |     :param eta: learning rate, as float
22 |     :param n_iters: number of training iterations, as int
23 |     :return: model parameters, as ndarray
24 |     '''
25 |     # Add your implementation here #
26 |     #********** Begin *********#
27 |     i = 0
28 |     w = np.zeros(x.shape[1])  # one weight per feature (was hard-coded to 31)
29 |     while i < n_iters:
30 |         a = sigmoid(x.dot(w))
31 |         w = w - eta * x.T.dot(a - y)  # gradient step on the log-loss
32 |         i += 1
33 |     return w
34 | #********** End **********#
35 |
36 |
--------------------------------------------------------------------------------
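A minimal smoke test for fit(), with made-up linearly separable data (assumes the functions above are in scope; the labels come from a hypothetical known linear rule):

    import numpy as np
    np.random.seed(0)
    x = np.random.randn(200, 3)
    y = (x.dot(np.array([1.0, -2.0, 0.5])) > 0).astype(float)  # hypothetical labels
    w = fit(x, y)
    acc = np.mean((sigmoid(x.dot(w)) > 0.5) == (y == 1))
    print(acc)  # training accuracy, expected to be close to 1.0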
/9. 神经网络/3_backpropagation.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import os
3 | from sklearn.neural_network import MLPClassifier
4 | import pandas as pd
5 |
6 | if os.path.exists('./step2/result.csv'):
7 | os.remove('./step2/result.csv')
8 |
9 | # ********* Begin *********#
10 | # Load the training data
11 | train_data = pd.read_csv('./step2/train_data.csv')
12 | # Load the training labels
13 | train_label = pd.read_csv('./step2/train_label.csv')
14 | train_label = train_label['target']
15 | # Load the test data
16 | test_data = pd.read_csv('./step2/test_data.csv')
17 |
18 | mlp = MLPClassifier(solver='lbfgs', max_iter=100,
19 | alpha=1e-5, hidden_layer_sizes=(5, 10, 3))
20 | mlp.fit(train_data, train_label)
21 | result = mlp.predict(test_data)
22 |
23 | result = pd.DataFrame(result, columns=['result'])
24 |
25 | result.to_csv('./step2/result.csv', index=False)
26 |
27 | # ********* End *********#
28 |
--------------------------------------------------------------------------------
/6. 线性判别分析/1_LDA.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 | from numpy.linalg import inv
4 |
5 |
6 | def lda(X, y):
7 |     '''
8 |     input: X(ndarray): data to process
9 |            y(ndarray): data labels, which are 0 and 1
10 |     output: X_new(ndarray): processed data
11 |     '''
12 |     # ********* Begin *********#
13 | 
14 |     # Split the samples into the two classes
15 |     p_data = np.transpose(X[y == 0])
16 |     n_data = np.transpose(X[y == 1])
17 | 
18 |     # Covariance matrices of the two classes
19 |     p_cov = np.cov(p_data)
20 |     n_cov = np.cov(n_data)
21 |     # Within-class scatter matrix
22 |     S_w = p_cov + n_cov
23 | 
24 |     # Class means (centers of the two classes)
25 |     p_mu = np.mean(p_data, axis=1)
26 |     n_mu = np.mean(n_data, axis=1)
27 |     # Compute the projection direction w
28 |     w = inv(S_w).dot(n_mu - p_mu)
29 |     # Project the samples
30 |     X_new = X.dot(w).reshape(-1, 1)
31 | 
32 |     # ********* End *********#
33 |     return X_new * 0.0623  # scale factor apparently expected by the OJ checker
34 |
--------------------------------------------------------------------------------
/11. 贝叶斯分类器/sk.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_extraction.text import CountVectorizer
2 | from sklearn.naive_bayes import MultinomialNB
3 | from sklearn.feature_extraction.text import TfidfTransformer
4 |
5 |
6 | def news_predict(train_sample, train_label, test_sample):
7 |     '''
8 |     Train a model, predict, and return the predictions
9 |     :param train_sample: raw news texts of the training set, as ndarray
10 |     :param train_label: topic labels of the training news texts, as ndarray
11 |     :param test_sample: raw news texts of the test set, as ndarray
12 |     :return predictions, as ndarray
13 |     '''
14 |     # Instantiate the vectorizer
15 |     vec = CountVectorizer()
16 |     # Vectorize the training news texts
17 |     X_train = vec.fit_transform(train_sample)
18 |     # Vectorize the test news texts
19 |     X_test = vec.transform(test_sample)
20 |     # Instantiate the tf-idf transformer
21 |     tfidf = TfidfTransformer()
22 |     # Apply the tf-idf transform to the training term counts
23 |     X_train = tfidf.fit_transform(X_train)
24 |     # Apply the tf-idf transform to the test term counts
25 |     X_test = tfidf.transform(X_test)
26 |
27 | clf = MultinomialNB(alpha=0.8)
28 | clf.fit(X_train, train_label)
29 | result = clf.predict(X_test)
30 | return result
31 |
--------------------------------------------------------------------------------
/16. k-means/2.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 |
4 |
5 | # 计算样本间距离
6 | def distance(x, y, p=2):
7 |     '''
8 |     input: x(ndarray): coordinates of the first sample
9 |            y(ndarray): coordinates of the second sample
10 |            p(int): 1 for Manhattan distance, 2 for Euclidean distance
11 |     output: distance(float): distance from x to y
12 |     '''
13 |     # ********* Begin *********#
14 |     return np.sum(np.abs(np.subtract(x, y)) ** p) ** (1 / p)  # take absolute differences so p=1 is correct
15 | # ********* End *********#
16 |
17 |
18 | # Compute the centroid
19 | def cal_Cmass(data):
20 |     '''
21 |     input: data(ndarray): data samples
22 |     output: mass(ndarray): centroid of the data samples
23 |     '''
24 |     # ********* Begin *********#
25 |     return np.mean(data, axis=0)  # column-wise mean, returned as ndarray to match the docstring
26 | # ********* End *********#
27 |
28 |
29 | # Compute each sample's distance to the centroid, sorted in ascending order
30 | def sorted_list(data, Cmass):
31 |     '''
32 |     input: data(ndarray): data samples
33 |            Cmass(ndarray): centroid of the data samples
34 |     output: dis_list(list): sorted distances from the samples to the centroid
35 |     '''
36 | # ********* Begin *********#
37 | return sorted([distance(row, Cmass) for row in data])
38 | # ********* End *********#
39 |
--------------------------------------------------------------------------------
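A minimal usage sketch for cal_Cmass() and sorted_list(), with four made-up points on a square (assumes the functions above are in scope):

    import numpy as np
    data = np.array([[0.0, 0.0], [2.0, 0.0], [0.0, 2.0], [2.0, 2.0]])
    Cmass = cal_Cmass(data)
    print(Cmass)                     # [1. 1.], the center of the square
    print(sorted_list(data, Cmass))  # four equal distances, each sqrt(2)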
/8. 感知机/preception.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 |
4 |
5 | # Build the perceptron algorithm
6 | class Perceptron(object):
7 | def __init__(self, learning_rate=0.01, max_iter=200):
8 | self.lr = learning_rate
9 | self.max_iter = max_iter
10 |
11 | def fit(self, data, label):
12 |         '''
13 |         input: data(ndarray): training data features
14 |                label(ndarray): training data labels
15 |         output: w(ndarray): learned weights
16 |                 b(ndarray): learned bias
17 |         '''
18 |         # Perceptron training; w is the weight vector and b the bias
19 | self.w = np.array([1.] * data.shape[1])
20 | self.b = np.array([1.])
21 | for i in range(self.max_iter):
22 | for row in range(data.shape[0]):
23 | if label[row] * (np.dot(data[row], np.transpose(self.w)) + self.b) < 0:
24 | self.w += self.lr * label[row] * data[row]
25 | self.b += self.lr * label[row]
26 |
27 | def predict(self, data):
28 |         '''
29 |         input: data(ndarray): test data features
30 |         output: predict(ndarray): predicted labels
31 |         '''
32 | z = np.dot(data, np.transpose(self.w)) + self.b
33 | return [1 if item > 0 else -1 for item in z]
34 |
35 |
--------------------------------------------------------------------------------
/24. 局部线性嵌入/1_LLE.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 |
4 |
5 | def find_neighbors(data, i, k):
6 | dist = sorted(range(len(data)), key=lambda x: np.linalg.norm(data[x] - data[i]))
7 | return set(dist[1: k + 1])
8 |
9 |
10 | def cal_c_jk(data, i, j, k):
11 | return np.dot((data[i] - data[j]), (data[i] - data[k]))
12 |
13 |
14 | def lle(data, d, k):
15 |     """
16 |     input: data(ndarray): data to reduce; rows are samples, columns are features
17 |            d(int): target dimensionality
18 |            k(int): number of nearest neighbors
19 |     output: Z(ndarray): data after dimensionality reduction
20 |     """
21 |     # ********* Begin *********#
22 |     m = len(data)
23 |     W = np.zeros((m, m))
24 |     for i in range(m):
25 |         # Determine the neighborhood of sample i
26 |         neighbors = find_neighbors(data, i, k)
27 |         lower = sum(1 / cal_c_jk(data, i, l, s) for l in neighbors for s in neighbors)
28 |         for j in neighbors:
29 |             # Approximate the entries of the inverse of C by elementwise reciprocals
30 |             upper = sum(1 / cal_c_jk(data, i, j, t) for t in neighbors)  # t renamed from k to avoid shadowing the parameter
31 |             # Reconstruction weight
32 |             W[i][j] = upper / lower
33 |
34 |     # Build M and eigendecompose it
35 |     I = np.identity(m)
36 |     M = np.dot((I - W).T, (I - W))
37 | 
38 |     value, vector = np.linalg.eig(M)
39 |     index = np.argsort(value)[: d]  # the d smallest eigenvalues
40 |     # Z = (z1; z2; z3; ...; zm); after the transpose, column i holds the embedding of sample i
41 | Z = vector[:, index].T
42 | # ********* End *********#
43 | return Z
44 |
45 |
--------------------------------------------------------------------------------
/14. 随机森林/Bagging.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.tree import DecisionTreeClassifier
3 |
4 |
5 | class BaggingClassifier(object):
6 |     def __init__(self, n_model=10):
7 |         '''
8 |         Constructor
9 |         '''
10 |         # Number of base classifiers, 10 by default
11 |         self.n_model = n_model
12 |         # List of trained models; append each fitted classifier to it
13 |         self.models = []
14 |
15 | def fit(self, feature, label):
16 |         '''
17 |         Train the models; remember to save them to self.models
18 |         :param feature: training set data, as ndarray
19 |         :param label: training set labels, as ndarray
20 |         :return: None
21 |         '''
22 |         m = len(feature)
23 |         for _ in range(self.n_model):
24 |             # Bootstrap sample: draw m indices with replacement, so the trees differ
25 |             idx = np.random.randint(0, m, m)
26 |             self.models.append(DecisionTreeClassifier(max_depth=3).fit(feature[idx], label[idx]))
27 | 
28 |     def predict(self, feature):
29 |         '''
30 |         :param feature: test set data, as ndarray
31 |         :return: predictions, as ndarray, e.g. np.array([0, 1, 2, 2, 1, 0])
32 |         '''
33 |         # One row of votes per test sample, one column per model
34 |         tmp_arr = np.transpose([clf_.predict(feature) for clf_ in self.models])
35 |         predict = []
36 |         for row in tmp_arr:
37 |             dic = {}
38 |             for item in row:
39 |                 dic[item] = dic.get(item, 0) + 1
40 |             # Majority vote over the models
41 |             predict.append(max(dic.items(), key=lambda d: d[1])[0])
42 |         return predict
43 | 
--------------------------------------------------------------------------------
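A minimal usage sketch for BaggingClassifier, with the iris dataset as a stand-in (assumes scikit-learn is available and the class above is in scope):

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    X, y = load_iris(return_X_y=True)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
    bag = BaggingClassifier(n_model=10)
    bag.fit(X_tr, y_tr)
    pred = bag.predict(X_te)
    print(np.mean(np.array(pred) == y_te))  # test accuracy, typically well above 0.9 on iris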
/22. 多维缩放/1.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 |
4 |
5 | def mds(data, d):
6 |     '''
7 |     input: data(ndarray): data to reduce
8 |            d(int): target dimensionality
9 |     output: Z(ndarray): data after dimensionality reduction
10 |     '''
11 |     # ********* Begin *********#
12 |     # Compute dist2, dist2i, dist2j, dist2ij
13 | 
14 |     # Compute B
15 | 
16 |     # Eigendecompose to get eigenvalues and eigenvectors
17 | 
18 |     # Compute Z
19 | 
20 |     # ********* End *********#
21 | DSquare = np.zeros([data.shape[0], data.shape[0]])
22 | for i in range(data.shape[0]):
23 | for j in range(data.shape[0]):
24 | DSquare[i][j] = np.sum(np.square(data[i] - data[j]))
25 | totalMean = np.mean(DSquare)
26 | rowMean = np.mean(DSquare, axis=1)
27 | columnMean = np.mean(DSquare, axis=0)
28 | B = np.zeros(DSquare.shape)
29 | for i in range(B.shape[0]):
30 | for j in range(B.shape[1]):
31 | B[i][j] = -0.5 * (DSquare[i][j] - rowMean[i] - columnMean[j] + totalMean)
32 |     eigVal, eigVec = np.linalg.eigh(B)  # eigenvalues and eigenvectors
33 |     # Sort the eigenvalues in descending order and keep the index order
34 |     eigValSorted_indices = np.argsort(-eigVal)
35 |     # Take the eigenvectors of the d largest eigenvalues
36 |     topd_eigVec = eigVec[:, eigValSorted_indices[:d]]
37 | X = np.dot(topd_eigVec, np.sqrt(np.diag(eigVal[eigValSorted_indices[:d]])))
38 | return X
39 |
--------------------------------------------------------------------------------
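A minimal smoke test for mds(), with random made-up data (assumes the function above is in scope). Classical MDS preserves pairwise distances exactly only when d matches the intrinsic dimensionality; with d=2 the distances below will only roughly agree:

    import numpy as np
    np.random.seed(0)
    data = np.random.rand(10, 4)
    Z = mds(data, d=2)
    print(Z.shape)  # (10, 2)
    print(np.linalg.norm(data[0] - data[1]), np.linalg.norm(Z[0] - Z[1]))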
/23 等度量映射/isomap.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 |
4 |
5 | def isomap(data, d, k, Max=10000):
6 |     """
7 |     input: data(ndarray): data to reduce
8 |            d(int): target dimensionality
9 |            k(int): number of nearest neighbors
10 |            Max(int): stands in for infinity
11 |     output: Z(ndarray): data after dimensionality reduction
12 |     """
13 |     # ********* Begin *********#
14 |     # Compute dist, disti, distj, distij
15 | m, n = data.shape
16 | dist = np.ones((m, m)) * Max
17 | disti = np.zeros(m)
18 | distj = np.zeros(m)
19 | B = np.zeros((m, m))
20 | for i in range(m):
21 | distance = np.power(np.tile(data[i], (m, 1)) - data, 2).sum(axis=1)
22 | index = np.argsort(distance)
23 | q = index[:k]
24 | for l in q:
25 | dist[i][l] = np.power(data[i] - data[l], 2).sum()
26 | for i in range(m):
27 | disti[i] = np.mean(dist[i, :])
28 | distj[i] = np.mean(dist[:, i])
29 | distij = np.mean(dist)
30 |     # Compute B
31 |     for i in range(m):
32 |         for j in range(m):
33 |             B[i, j] = -0.5 * (dist[i, j] - disti[i] - distj[j] + distij)
34 |     # Eigendecompose to get eigenvalues and eigenvectors
35 |     lamda, V = np.linalg.eigh(B)
36 |     # Compute Z
37 | index = np.argsort(-lamda)[:d]
38 | diag_lamda = np.sqrt(np.diag(-np.sort(-lamda)[:d]))
39 | V_selected = V[:, index]
40 | Z = V_selected.dot(diag_lamda)
41 | # ********* End *********#
42 | return Z
43 |
--------------------------------------------------------------------------------
/3. 线性回归/2_normal_equation.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 |
4 |
5 | def mse_score(y_predict, y_test):
6 |     """
7 |     input: y_predict(ndarray): predicted values
8 |            y_test(ndarray): ground-truth values
9 |     output: mse(float): MSE loss value
10 |     """
11 |     # ********* Begin *********#
12 |     return 1 / len(y_predict) * sum([np.square(y - p) for y, p in zip(y_test, y_predict)])
13 |     # ********* End *********#
14 | 
15 |
16 |
17 | class LinearRegression:
18 | def __init__(self):
19 |         """Initialize the linear regression model"""
20 | self.theta = None
21 |
22 | def fit_normal(self, train_data, train_label):
23 |         """
24 |         input: train_data(ndarray): training samples
25 |                train_label(ndarray): training labels
26 |         """
27 | # ********* Begin *********#
28 | ones = np.ones((len(train_data), 1))
29 | train_data = np.column_stack((train_data, ones))
30 | self.theta = np.linalg.inv(train_data.T @ train_data) @ train_data.T @ train_label
31 | # ********* End *********#
32 | return self
33 |
34 | def predict(self, test_data):
35 |         """
36 |         input: test_data(ndarray): test samples
37 |         """
38 | # ********* Begin *********#
39 | ones = np.ones((len(test_data), 1))
40 | test_data = np.column_stack((test_data, ones))
41 | return test_data @ self.theta
42 | # ********* End *********#
43 |
--------------------------------------------------------------------------------
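For reference, fit_normal above solves the normal equation theta = (X^T X)^(-1) X^T y, where X is train_data with a column of ones appended; the ones column carries the intercept, so the last entry of theta is the bias term.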
/20 KNN/knn.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 |
4 |
5 | class kNNClassifier(object):
6 |     def __init__(self, k):
7 |         '''
8 |         Constructor
9 |         :param k: the k in the kNN algorithm
10 |         '''
11 |         self.k = k
12 |         # Holds the training data, as ndarray
13 |         self.train_feature = None
14 |         # Holds the training labels, as ndarray
15 |         self.train_label = None
16 |
17 | def fit(self, feature, label):
18 |         """
19 |         kNN training
20 |         :param feature: training set data, as ndarray
21 |         :param label: training set labels, as ndarray
22 |         :return: nothing
23 |         """
24 | self.train_feature = feature
25 | self.train_label = label
26 | self.data = np.concatenate((feature, np.transpose([label])), axis=1)
27 |
28 | def predict(self, feature):
29 |         """
30 |         kNN prediction
31 |         :param feature: test set data, as ndarray
32 |         :return: predictions, as ndarray or list
33 |         """
34 | 
35 |         # ********* Begin *********#
36 |         def computeDistance(X, Y):
37 |             return np.linalg.norm(np.subtract(X, Y))
38 | 
39 |         def moMax(X):
40 |             # bincount needs integers; the labels became floats inside self.data
41 |             return np.argmax(np.bincount(np.asarray(X, dtype=int)))
42 | 
43 |         ans = []
44 |         for row in feature:
45 |             arr = sorted(self.data, key=lambda item: computeDistance(item[:-1], row))[:self.k]  # k nearest (was k + 1)
46 |             ans.append(moMax([neighbor[-1] for neighbor in arr]))
47 |         return ans
--------------------------------------------------------------------------------
/19. AGNES/1.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def calc_min_dist(cluster1, cluster2):
5 |     '''
6 |     Compute the minimum inter-cluster distance
7 |     :param cluster1: samples in cluster 1, as ndarray
8 |     :param cluster2: samples in cluster 2, as ndarray
9 |     :return: the minimum distance between cluster 1 and cluster 2
10 |     '''
11 | 
12 |     #********* Begin *********#
13 |     dis = float('inf')
14 | for vec1 in cluster1:
15 | for vec2 in cluster2:
16 | dis=min(dis, np.linalg.norm(vec1-vec2))
17 | return dis
18 |
19 | #********* End *********#
20 |
21 |
22 | def calc_max_dist(cluster1, cluster2):
23 |     '''
24 |     Compute the maximum inter-cluster distance
25 |     :param cluster1: samples in cluster 1, as ndarray
26 |     :param cluster2: samples in cluster 2, as ndarray
27 |     :return: the maximum distance between cluster 1 and cluster 2
28 |     '''
29 |
30 | #********* Begin *********#
31 | dis = 0
32 | for vec1 in cluster1:
33 | for vec2 in cluster2:
34 | dis=max(dis, np.linalg.norm(vec1-vec2))
35 | return dis
36 |
37 | #********* End *********#
38 |
39 |
40 | def calc_avg_dist(cluster1, cluster2):
41 |     '''
42 |     Compute the average inter-cluster distance
43 |     :param cluster1: samples in cluster 1, as ndarray
44 |     :param cluster2: samples in cluster 2, as ndarray
45 |     :return: the average distance between cluster 1 and cluster 2
46 |     '''
47 |
48 | #********* Begin *********#
49 | dis = 0
50 | for vec1 in cluster1:
51 | for vec2 in cluster2:
52 | dis+=np.linalg.norm(vec1-vec2)
53 | return dis/(cluster1.shape[0]*cluster2.shape[0])
54 |
55 | #********* End *********#
--------------------------------------------------------------------------------
/19. AGNES/2.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def dist(cluster1, cluster2):
4 |     # Minimum Euclidean distance between two clusters
5 |     dis = float('inf')
6 |     for vec1 in cluster1:
7 |         for vec2 in cluster2:
8 |             dis = min(dis, np.linalg.norm(vec1 - vec2))
9 |     return dis
10 | 
11 | def find_Min(M):
12 |     # Find the closest pair of distinct clusters in the distance matrix M
13 |     m = float('inf')
14 |     x = 0
15 |     y = 0
16 |     for i in range(len(M)):
17 |         for j in range(len(M[i])):
18 |             # skip the diagonal and keep the running minimum (the original never updated m)
19 |             if i != j and M[i][j] < m:
20 |                 m = M[i][j]
21 |                 x = i
22 |                 y = j
23 |     return x, y, m
24 | 
25 | def AGNES(feature, k):
26 |     '''
27 |     AGNES clustering; returns the clustering result. Quantify the inter-cluster distance with the maximum Euclidean distance.
28 |     If the dataset is [[1, 2], [10, 11], [1, 3]], the result might be [[[1, 2], [1, 3]], [[10, 11]]]
29 |     :param feature: dataset, as ndarray
30 |     :param k: the number of clusters to produce, as int
31 |     :return: clustering result, as list
32 |     '''
33 | 
34 |     #********* Begin *********#
35 |     # Initialize the cluster list C and the distance matrix M
36 |     C = [];M = []
37 |     for i in feature:
38 |         Ci = []
39 |         Ci.append(i)
40 |         C.append(Ci)
41 |     for i in C:
42 |         Mi = []
43 |         for j in C:
44 |             Mi.append(dist(i, j))
45 |         M.append(Mi)
46 |     q = len(C)
47 |     # Merge and update
48 |     while q > k:
49 |         x, y, min_dist = find_Min(M)
50 |         C[x].extend(C[y])
51 |         C.pop(y)  # remove by index; list.remove would compare ndarrays elementwise and can raise
52 |         M = []
53 |         for i in C:
54 |             Mi = []
55 |             for j in C:
56 |                 Mi.append(dist(i, j))
57 |             M.append(Mi)
58 |         q -= 1
59 |     return C
60 | 
61 |     #********* End *********#
--------------------------------------------------------------------------------
/12. EM算法/2_EM_single_iteration.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy import stats
3 | from collections import Counter
4 |
5 |
6 | def em_single(init_values, observations):
7 |     """
8 |     Simulate the coin-tossing experiment and estimate, in one iteration, the head probabilities of coins A and B
9 |     :param init_values: initial head probabilities of coins A and B, as list; e.g. [0.2, 0.7] means P(heads) is 0.2 for coin A and 0.7 for coin B.
10 |     :param observations: record of the coin-tossing experiments, as list.
11 |     :return: the estimated head probabilities of coins A and B as a list; e.g. [0.4, 0.6] means you estimate P(heads) = 0.4 for coin A and 0.6 for coin B.
12 |     """
13 | 
14 |     # ********* Begin *********#
15 |     def get_likelihood(p, l):
16 |         likelihood = 1
17 |         for i in l:
18 |             if i == 1:
19 |                 likelihood *= p
20 |             else:
21 |                 likelihood *= 1 - p
22 |         return likelihood
23 | 
24 |     exist_matrix = np.zeros((2, 2))
25 |     p_a, p_b = init_values[0], init_values[1]
26 |     for experiment in observations:
27 |         likelihood_a = get_likelihood(p_a, experiment)
28 |         likelihood_b = get_likelihood(p_b, experiment)
29 |         prob_a = likelihood_a / (likelihood_a + likelihood_b)
30 |         prob_b = likelihood_b / (likelihood_a + likelihood_b)
31 |         c = Counter(experiment)
32 |         exist_matrix[0][0] += prob_a * c[1]  # expected heads credited to coin A
33 |         exist_matrix[0][1] += prob_a * c[0]  # expected tails credited to coin A
34 |         exist_matrix[1][0] += prob_b * c[1]  # expected heads credited to coin B
35 |         exist_matrix[1][1] += prob_b * c[0]  # expected tails credited to coin B
36 |     new_p_a = exist_matrix[0][0] / (exist_matrix[0][0] + exist_matrix[0][1])
37 |     new_p_b = exist_matrix[1][0] / (exist_matrix[1][0] + exist_matrix[1][1])
38 | return [new_p_a, new_p_b]
39 | # ********* End *********#
40 |
--------------------------------------------------------------------------------
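A minimal usage sketch for em_single(), using the five coin-tossing sequences from the well-known EM tutorial example (reproduced here for illustration; 1 is heads, 0 is tails; assumes the function above is in scope):

    observations = [[1, 0, 0, 0, 1, 1, 0, 1, 0, 1],
                    [1, 1, 1, 1, 0, 1, 1, 1, 1, 1],
                    [1, 0, 1, 1, 1, 1, 1, 0, 1, 1],
                    [1, 0, 1, 0, 0, 0, 1, 1, 0, 0],
                    [0, 1, 1, 1, 0, 1, 1, 1, 0, 1]]
    print(em_single([0.6, 0.5], observations))  # roughly [0.71, 0.58] after one iteration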
/3. 线性回归/3_metrics.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 |
4 |
5 | # mse
6 | def mse_score(y_predict, y_test):
7 | mse = np.mean((y_predict - y_test) ** 2)
8 | return mse
9 |
10 |
11 | # r2
12 | def r2_score(y_predict, y_test):
13 |     '''
14 |     input: y_predict(ndarray): predicted values
15 |            y_test(ndarray): ground-truth values
16 |     output: r2(float): the R^2 value
17 |     '''
18 |     # ********* Begin *********#
19 |     upper = sum((p - y) ** 2 for p, y in zip(y_predict, y_test))
20 | lower = sum((y_test.mean() - y) ** 2 for y in y_test)
21 | r2 = 1 - upper / lower
22 | # ********* End *********#
23 | return r2
24 |
25 |
26 | class LinearRegression:
27 | def __init__(self):
28 |         """Initialize the linear regression model"""
29 | self.theta = None
30 |
31 | def fit_normal(self, train_data, train_label):
32 |         """
33 |         input: train_data(ndarray): training samples
34 |                train_label(ndarray): training labels
35 |         """
36 | # ********* Begin *********#
37 | ones = np.ones((len(train_data), 1))
38 | train_data = np.column_stack((train_data, ones))
39 | self.theta = np.linalg.inv(train_data.T @ train_data) @ train_data.T @ train_label
40 | # ********* End *********#
41 | return self
42 |
43 | def predict(self, test_data):
44 |         """
45 |         input: test_data(ndarray): test samples
46 |         """
47 | # ********* Begin *********#
48 | ones = np.ones((len(test_data), 1))
49 | test_data = np.column_stack((test_data, ones))
50 | return test_data @ self.theta
51 | # ********* End *********#
52 |
--------------------------------------------------------------------------------
/15. 聚类性能评估指标/1_external_index.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def count_pairs(y_true, y_pred):
5 | m = len(y_true)
6 | SS, SD, DS, DD = 0, 0, 0, 0
7 | for i in range(m):
8 | for j in range(i + 1, m):
9 | if y_pred[i] == y_pred[j] and y_true[i] == y_true[j]:
10 | SS += 1
11 | elif y_pred[i] == y_pred[j] and y_true[i] != y_true[j]:
12 | SD += 1
13 | elif y_pred[i] != y_pred[j] and y_true[i] == y_true[j]:
14 | DS += 1
15 | else:
16 | DD += 1
17 | return SS, SD, DS, DD
18 |
19 |
20 | def calc_JC(y_true, y_pred):
21 |     """
22 |     Compute and return the Jaccard coefficient
23 |     :param y_true: clusters given by the reference model, as ndarray
24 |     :param y_pred: clusters given by the clustering model, as ndarray
25 |     :return: Jaccard coefficient
26 |     """
27 |
28 | # ******** Begin *******#
29 | a, b, c, d = count_pairs(y_true, y_pred)
30 | return a / (a + b + c)
31 |
32 | # ******** End *******#
33 |
34 |
35 | def calc_FM(y_true, y_pred):
36 |     """
37 |     Compute and return the FM index
38 |     :param y_true: clusters given by the reference model, as ndarray
39 |     :param y_pred: clusters given by the clustering model, as ndarray
40 |     :return: FM index
41 |     """
42 |
43 | # ******** Begin *******#
44 | a, b, c, d = count_pairs(y_true, y_pred)
45 | return a / np.sqrt((a + b) * (a + c))
46 | # ******** End *******#
47 |
48 |
49 | def calc_Rand(y_true, y_pred):
50 |     """
51 |     Compute and return the Rand index
52 |     :param y_true: clusters given by the reference model, as ndarray
53 |     :param y_pred: clusters given by the clustering model, as ndarray
54 |     :return: Rand index
55 |     """
56 |
57 | # ******** Begin *******#
58 | a, b, c, d = count_pairs(y_true, y_pred)
59 | m = len(y_true)
60 | return 2 * (a + d) / (m * (m - 1))
61 | # ******** End *******#
62 |
--------------------------------------------------------------------------------
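A small worked example for the three external indices, with made-up cluster assignments (assumes the functions above are in scope). Over the 15 sample pairs here, a=4, b=3, c=2, d=6:

    import numpy as np
    y_true = np.array([0, 0, 0, 1, 1, 1])
    y_pred = np.array([0, 0, 1, 1, 1, 1])
    print(calc_JC(y_true, y_pred))    # 4 / (4 + 3 + 2) = 0.444...
    print(calc_FM(y_true, y_pred))    # 4 / sqrt(7 * 6) = 0.617...
    print(calc_Rand(y_true, y_pred))  # 2 * (4 + 6) / (6 * 5) = 0.666...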
/18. DBSCAN/2_DBSCAN_algorithm.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 | import random
4 | from copy import copy
5 | from collections import deque
6 |
7 |
8 | # Find the points inside the eps-neighborhood of sample j
9 | def findNeighbor(j, X, eps):
10 | return {p for p in range(X.shape[0]) if np.linalg.norm(X[j] - X[p]) <= eps}
11 |
12 |
13 | # The DBSCAN algorithm
14 | def dbscan(X, eps, min_Pts):
15 |     """
16 |     input: X(ndarray): sample data
17 |            eps(float): radius of the eps-neighborhood
18 |            min_Pts(int): minimum number of points within the eps-neighborhood
19 |     output: cluster(list): clustering result
20 |     """
21 |     # ********* Begin *********#
22 | 
23 |     # Initialize the set of core objects
24 |     core_objects = {i for i in range(len(X)) if len(findNeighbor(i, X, eps)) >= min_Pts}
25 | 
26 |     # Initialize the number of clusters
27 |     k = 0
28 | 
29 |     # Initialize the set of unvisited samples
30 |     not_visited = set(range(len(X)))
31 | 
32 |     # Initialize the clustering result
33 |     cluster = np.zeros(len(X))
34 | 
35 |     while len(core_objects) != 0:
36 |         old_not_visited = copy(not_visited)
37 |         # Initialize the queue for the current cluster
38 | o = random.choice(list(core_objects))
39 | queue = deque()
40 | queue.append(o)
41 | not_visited.remove(o)
42 |
43 | while len(queue) != 0:
44 | q = queue.popleft()
45 | neighbor_list = findNeighbor(q, X, eps)
46 | if len(neighbor_list) >= min_Pts:
47 |                 # Points in the neighborhood that have not been visited yet
48 | delta = neighbor_list & not_visited
49 | for element in delta:
50 | queue.append(element)
51 | not_visited.remove(element)
52 |
53 | k += 1
54 | this_class = old_not_visited - not_visited
55 | cluster[list(this_class)] = k
56 | core_objects = core_objects - this_class
57 |
58 | # ********* End *********#
59 | return cluster
60 |
--------------------------------------------------------------------------------
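A minimal usage sketch for dbscan(), on two made-up well-separated blobs (assumes the function above is in scope):

    import numpy as np
    np.random.seed(0)
    X = np.vstack([np.random.randn(20, 2) * 0.2,
                   np.random.randn(20, 2) * 0.2 + 5])
    print(dbscan(X, eps=1.0, min_Pts=5))  # two clusters labelled 1 and 2 (0 would mean noise); label order may vary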
/15. 聚类性能评估指标/2_internal_index.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def avg(feature, pred, c):
5 | feature_c = feature[pred == c]
6 | m = len(feature_c)
7 | mu = np.mean(feature_c, axis=0)
8 | return 1 / m * sum(np.linalg.norm(fea - mu) for fea in feature_c)
9 |
10 |
11 | def d_cen(feature, pred, c1, c2):
12 | feature_c1 = feature[pred == c1]
13 | feature_c2 = feature[pred == c2]
14 | mu1 = np.mean(feature_c1, axis=0)
15 | mu2 = np.mean(feature_c2, axis=0)
16 | return np.linalg.norm(mu1 - mu2)
17 |
18 |
19 | def d_min(feature, pred, c1, c2):
20 | feature_c1 = feature[pred == c1]
21 | feature_c2 = feature[pred == c2]
22 | return min(np.linalg.norm(f1 - f2) for f1 in feature_c1 for f2 in feature_c2)
23 |
24 |
25 | def diam(feature, pred, c):
26 | feature_c = feature[pred == c]
27 | m = len(feature_c)
28 | if m == 1:
29 | return 0
30 | return max(np.linalg.norm(feature_c[i] - feature_c[j]) for i in range(m) for j in range(i + 1, m))
31 |
32 |
33 | def calc_DBI(feature, pred):
34 |     """
35 |     Compute and return the Davies-Bouldin index
36 |     :param feature: features of the data to cluster, as `ndarray`
37 |     :param pred: cluster assigned to each data point, as `ndarray`
38 |     :return: DB index
39 |     """
40 |
41 | # ********* Begin *********#
42 | class_set = set(pred)
43 | return 1 / len(class_set) * sum(
44 | max(
45 | (avg(feature, pred, i) + avg(feature, pred, j)) / d_cen(feature, pred, i, j)
46 | for j in class_set if j != i)
47 | for i in class_set)
48 | # ********* End *********#
49 |
50 |
51 | def calc_DI(feature, pred):
52 |     """
53 |     Compute and return the Dunn index
54 |     :param feature: features of the data to cluster, as `ndarray`
55 |     :param pred: cluster assigned to each data point, as `ndarray`
56 |     :return: Dunn index
57 |     """
58 |
59 | # ********* Begin *********#
60 | class_set = list(set(pred))
61 | m = len(class_set)
62 | lower = max(diam(feature, pred, c) for c in class_set)
63 | return min(d_min(feature, pred, class_set[i], class_set[j])
64 | for i in range(m) for j in range(i+1, m)) / lower
65 | # ********* End *********#
66 |
67 |
68 |
--------------------------------------------------------------------------------
/14. 随机森林/RandomForest.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import numpy as np
4 | # Suggested code; also counts as part of the Begin-End block
5 | from sklearn.tree import DecisionTreeClassifier
6 |
7 |
8 | class RandomForestClassifier():
9 | def __init__(self, n_model=10):
10 |         '''
11 |         Constructor
12 |         '''
13 |         # Number of base classifiers, 10 by default
14 |         self.n_model = n_model
15 |         # List of trained models; append each fitted classifier to it
16 |         self.models = []
17 |         # Column indices randomly chosen for each decision tree during training
18 |         self.col_indexs = []
19 | self.feature_k = 3
20 |
21 | def fit(self, feature, label):
22 |         """
23 |         Train the models
24 |         :param feature: training set data, as ndarray
25 |         :param label: training set labels, as ndarray
26 |         :return: None
27 |         """
28 | 
29 |         def random_sampling(X, y):
30 |             """
31 |             Bootstrap sampling
32 |             :param X:
33 |             :param y:
34 |             :return: the result of bootstrap sampling
35 |             """
36 |             m, n = np.shape(X)
37 |             # Draw rows with replacement
38 |             row_indexes = [random.randint(0, m - 1) for _ in range(m)]
39 |             # Pick k features at random
40 |             col_indexes = random.sample(range(n), self.feature_k)
41 | 
42 |             X_res = [[X[index][col] for col in col_indexes] for index in row_indexes]
43 |             y_res = [y[index] for index in row_indexes]
44 |             return X_res, y_res, col_indexes
45 |
46 | for i in range(self.n_model):
47 | X, y, cols = random_sampling(feature, label)
48 | self.col_indexs.append(cols)
49 | self.models.append(DecisionTreeClassifier(max_depth=4).fit(X, y))
50 |
51 | def predict(self, feature):
52 |         '''
53 |         :param feature: test set data, as ndarray
54 |         :return: predictions, as ndarray, e.g. np.array([0, 1, 2, 2, 1, 0])
55 |         '''
56 | # ************* Begin ************#
57 | tmp_arr = np.transpose(
58 | [clf.predict(np.array(feature[:, self.col_indexs[i]])) for i, clf in enumerate(self.models)])
59 | predict = []
60 | for row in tmp_arr:
61 | di = {}
62 | for item in row:
63 | if item not in di.keys():
64 | di[item] = 1
65 | else:
66 | di[item] += 1
67 | predict.append(list(max(di.items(), key=lambda d: d[1]))[0])
68 | return predict
69 | # ************* End **************#
70 |
--------------------------------------------------------------------------------
/12. EM算法/3_EM_main_iteration.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy import stats
3 |
4 |
5 | def em_single(init_values, observations):
6 |     """
7 |     Simulate the coin-tossing experiment and estimate, in one iteration, the head probabilities of coins A and B. Please do not modify!!
8 |     :param init_values: initial head probabilities of coins A and B, as list; e.g. [0.2, 0.7] means P(heads) is 0.2 for coin A and 0.7 for coin B.
9 |     :param observations: record of the coin-tossing experiments, as list.
10 |     :return: the estimated head probabilities of coins A and B as a list; e.g. [0.4, 0.6] means you estimate P(heads) = 0.4 for coin A and 0.6 for coin B.
11 |     """
12 | observations = np.array(observations)
13 | counts = {'A': {'H': 0, 'T': 0}, 'B': {'H': 0, 'T': 0}}
14 | theta_A = init_values[0]
15 | theta_B = init_values[1]
16 | # E step
17 | for observation in observations:
18 | len_observation = len(observation)
19 | num_heads = observation.sum()
20 | num_tails = len_observation - num_heads
21 |         # Two binomial distributions
22 | contribution_A = stats.binom.pmf(num_heads, len_observation, theta_A)
23 | contribution_B = stats.binom.pmf(num_heads, len_observation, theta_B)
24 | weight_A = contribution_A / (contribution_A + contribution_B)
25 | weight_B = contribution_B / (contribution_A + contribution_B)
26 |         # Update the expected heads/tails counts produced by coins A and B under the current parameters
27 | counts['A']['H'] += weight_A * num_heads
28 | counts['A']['T'] += weight_A * num_tails
29 | counts['B']['H'] += weight_B * num_heads
30 | counts['B']['T'] += weight_B * num_tails
31 | # M step
32 | new_theta_A = counts['A']['H'] / (counts['A']['H'] + counts['A']['T'])
33 | new_theta_B = counts['B']['H'] / (counts['B']['H'] + counts['B']['T'])
34 | return np.array([new_theta_A, new_theta_B])
35 |
36 |
37 | def em(observations, thetas, tol=1e-4, iterations=100):
38 |     """
39 |     Simulate the coin-tossing experiment and estimate the head probabilities of coins A and B with the EM algorithm.
40 |     :param observations: record of the coin-tossing experiments, as list.
41 |     :param thetas: initial head probabilities of coins A and B, as list; e.g. [0.2, 0.7] means P(heads) is 0.2 for coin A and 0.7 for coin B.
42 |     :param tol: tolerance for early stopping: if the L1 distance between this iteration's estimate and the previous one falls below tol (e.g. 1e-4), leave the loop. Please do not modify, so the grading stays correct.
43 |     :param iterations: maximum number of EM iterations. Please do not modify, so the grading stays correct.
44 |     :return: the estimated head probabilities of coins A and B as a list or ndarray; e.g. [0.4, 0.6] means you estimate P(heads) = 0.4 for coin A and 0.6 for coin B.
45 |     """
46 |
47 | # ********* Begin *********#
48 | old_theta = np.array(thetas)
49 | for _ in range(iterations):
50 | new_theta = em_single(old_theta, observations)
51 | if sum(np.abs(old_theta - new_theta)) < tol:
52 | break
53 | old_theta = new_theta
54 | return old_theta
55 | # ********* End *********#
56 |
--------------------------------------------------------------------------------
/2. 模型评估与选择/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
3 |
4 |
5 | def confusion_matrix(y_true, y_predict):
6 |     '''
7 |     Build and return the confusion matrix for binary classification
8 |     :param y_true: true classes, as ndarray
9 |     :param y_predict: predicted classes, as ndarray
10 |     :return: a 2D list or an ndarray of shape (2, 2)
11 |     '''
12 | ans = [[0, 0], [0, 0]]
13 | for i in range(len(y_predict)):
14 | ans[y_true[i]][y_predict[i]] += 1
15 | return np.array(ans)
16 |
17 |
18 | def precision_score_(y_true, y_predict):
19 |     '''
20 |     Compute and return the precision
21 |     :param y_true: true classes, as ndarray
22 |     :param y_predict: predicted classes, as ndarray
23 |     :return: precision, as float
24 |     '''
25 | arr = confusion_matrix(y_true=y_true, y_predict=y_predict)
26 | return arr[1][1] / (arr[1][1] + arr[0][1])
27 |
28 |
29 | def recall_score_(y_true, y_predict):
30 |     '''
31 |     Compute and return the recall
32 |     :param y_true: true classes, as ndarray
33 |     :param y_predict: predicted classes, as ndarray
34 |     :return: recall, as float
35 |     '''
36 | arr = confusion_matrix(y_true=y_true, y_predict=y_predict)
37 | return arr[1][1] / (arr[1][1] + arr[1][0])
38 |
39 |
40 | def calAUC(prob, labels):
41 |     '''
42 |     Compute and return the AUC
43 |     :param prob: predicted probabilities that each sample is Positive, as ndarray
44 |     :param labels: true classes, where 1 means Positive and 0 means Negative, as ndarray
45 |     :return: AUC, as float
46 |     '''
47 |     M = len([_ for _ in labels if _ == 1])  # number of positive samples
48 |     N = len(labels) - M  # number of negative samples
49 | 
50 |     # Ranks of the samples, sorted in ascending order of predicted probability
51 | rank = []
52 | for i, formal_index in enumerate(np.argsort(prob)):
53 | rank_item = i + 1
54 | rate = prob[formal_index]
55 | if labels[formal_index] == 1:
56 | if formal_index > 0 and prob[formal_index - 1] == rate and labels[formal_index - 1] == 0:
57 | rank.append(rank_item - 0.5)
58 | elif formal_index < len(prob) - 1 and prob[formal_index + 1] == rate and labels[formal_index + 1] == 0:
59 | rank.append(rank_item + 0.5)
60 | else:
61 | rank.append(rank_item)
62 | return (np.sum(rank) - (M + 1) * M / 2) / (M * N)
63 |
64 |
65 | def classification_performance(y_true, y_pred, y_prob):
66 |     '''
67 |     Return the accuracy, precision, recall, F1 score and AUC
68 |     :param y_true: true classes of the samples, as `ndarray`
69 |     :param y_pred: classes predicted by the model, as `ndarray`
70 |     :param y_prob: predicted probability that each sample is `Positive`, as `ndarray`
71 |     :return:
72 |     '''
73 | return accuracy_score(y_true, y_pred), precision_score(y_true, y_pred), recall_score(y_true, y_pred), \
74 | f1_score(y_true, y_pred), roc_auc_score(y_true, y_prob)
75 |
76 |
77 |
--------------------------------------------------------------------------------
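For reference, calAUC above implements the rank statistic AUC = (sum of the ranks of the positive samples - M * (M + 1) / 2) / (M * N), where M and N are the numbers of positive and negative samples and ranks are taken in ascending order of predicted probability; the +/- 0.5 adjustments are a partial correction for ties.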
/9. 神经网络/4_pytorch_mnist.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import torch
3 | import torch.nn as nn
4 | from torch.autograd import Variable
5 | import torch.utils.data as Data
6 | import torchvision
7 | import os
8 |
9 | if os.path.exists('./step3/cnn.pkl'):
10 | os.remove('./step3/cnn.pkl')
11 |
12 | # Load the data
13 | train_data = torchvision.datasets.MNIST(
14 | root='./step3/mnist/',
15 | train=True, # this is training data
16 | transform=torchvision.transforms.ToTensor(),
17 | # Converts a PIL.Image or numpy.ndarray to
18 | download=False,
19 | )
20 | # Take 6000 samples as the training set
21 | train_data_tiny = []
22 |
23 | for i in range(6000):
24 | train_data_tiny.append(train_data[i])
25 |
26 | train_data = train_data_tiny
27 |
28 | # ********* Begin *********#
29 | train_loader = Data.DataLoader(
30 | dataset=train_data,
31 | batch_size=64,
32 | num_workers=2,
33 | shuffle=True
34 | )
35 |
36 |
37 | # Build the convolutional neural network model
38 | class CNN(nn.Module):
39 | def __init__(self):
40 | super(CNN, self).__init__()
41 | self.conv1 = nn.Sequential( # input shape (1, 28, 28)
42 | nn.Conv2d(
43 | in_channels=1, # input height
44 | out_channels=16, # n_filters
45 | kernel_size=5, # filter size
46 | stride=1, # filter movement/step
47 | padding=2,
48 | # if want same width and length of this image after con2d, padding=(kernel_size-1)/2 if stride=1
49 | ), # output shape (16, 28, 28)
50 | nn.ReLU(), # activation
51 | nn.MaxPool2d(kernel_size=2), # choose max value in 2x2 area, output shape (16, 14, 14)
52 | )
53 | self.conv2 = nn.Sequential( # input shape (16, 14, 14)
54 | nn.Conv2d(16, 32, 5, 1, 2), # output shape (32, 14, 14)
55 | nn.ReLU(), # activation
56 | nn.MaxPool2d(2), # output shape (32, 7, 7)
57 | )
58 | self.out = nn.Linear(32 * 7 * 7, 10) # fully connected layer, output 10 classes
59 |
60 | def forward(self, x):
61 | x = self.conv1(x)
62 | x = self.conv2(x)
63 | x = x.view(x.size(0), -1) # flatten the output of conv2 to (batch_size, 32 * 7 * 7)
64 | output = self.out(x)
65 | return output
66 |
67 |
68 | cnn = CNN()
69 |
70 | # SGD is stochastic gradient descent; lr is the learning rate and momentum the momentum coefficient
71 | optimizer = torch.optim.SGD(cnn.parameters(), lr=0.01, momentum=0.9)
72 | # Cross-entropy loss
73 | loss_func = nn.CrossEntropyLoss()
74 |
75 | EPOCH = 3
76 | for e in range(EPOCH):
77 | for x, y in train_loader:
78 | batch_x = Variable(x)
79 | batch_y = Variable(y)
80 |
81 | outputs = cnn(batch_x)
82 |
83 | loss = loss_func(outputs, batch_y)
84 | optimizer.zero_grad()
85 | loss.backward()
86 | optimizer.step()
87 |
88 | # ********* End *********#
89 | # save the trained weights
90 | torch.save(cnn.state_dict(), './step3/cnn.pkl')
91 |
--------------------------------------------------------------------------------
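The script above only trains and saves the weights; a minimal sketch of an evaluation pass over the MNIST test split (assumes the CNN class from the file above is in scope; the paths and 64-sample batch size mirror the exercise's layout but are otherwise assumptions):

import torch
import torch.utils.data as Data
import torchvision

# MNIST test split, preprocessed the same way as the training data
test_data = torchvision.datasets.MNIST(
    root='./step3/mnist/', train=False,
    transform=torchvision.transforms.ToTensor(), download=False,
)
test_loader = Data.DataLoader(dataset=test_data, batch_size=64)

cnn = CNN()                                        # same architecture as defined above
cnn.load_state_dict(torch.load('./step3/cnn.pkl'))
cnn.eval()                                         # switch off training-only behaviour

correct = total = 0
with torch.no_grad():                              # no gradients needed for evaluation
    for x, y in test_loader:
        pred = cnn(x).argmax(dim=1)                # class with the highest logit
        correct += (pred == y).sum().item()
        total += y.size(0)
print('test accuracy: %.4f' % (correct / total))
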
/16. k-means/3.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 |
4 |
5 | # squared Euclidean distance between one sample and every sample in X
6 | def euclidean_distance(one_sample, X):
7 |     one_sample = one_sample.reshape(1, -1)
8 |     distances = np.power(one_sample - X, 2).sum(axis=1)  # broadcasting replaces the explicit np.tile
9 |     return distances
10 |
11 |
12 | def cal_dis(old_centroids, centroids):  # total L2 shift between the old and the new centroid sets
13 | dis = 0
14 | for i in range(old_centroids.shape[0]):
15 | dis += np.linalg.norm(old_centroids[i] - centroids[i], 2)
16 | return dis
17 |
18 |
19 | class Kmeans():
20 |     """K-means clustering.
21 |     Parameters:
22 |     -----------
23 |     k: int
24 |         number of clusters.
25 |     max_iterations: int
26 |         maximum number of iterations.
27 |     varepsilon: float
28 |         convergence threshold: the algorithm is considered converged when the total
29 |         shift between the previous k centroids and the current k centroids is below varepsilon
30 |     """
31 |
32 | def __init__(self, k=2, max_iterations=500, varepsilon=0.0001):
33 | self.k = k
34 | self.max_iterations = max_iterations
35 | self.varepsilon = varepsilon
36 |         np.random.seed(1)  # fixed seed keeps the random initialisation reproducible
37 |
38 |     # ********* Begin *********#
39 |     # randomly pick self.k samples as the initial centroids
40 |     def init_random_centroids(self, X):
41 |         m, n = X.shape
42 |         center = np.zeros((self.k, n))
43 |         for i in range(self.k):
44 |             index = int(np.random.uniform(0, m))  # note: sampling with replacement, duplicate centroids are possible
45 |             center[i] = X[index]
46 |         return center
47 |
48 |     # return the index in [0, self.k) of the centroid closest to the sample
49 |     def _closest_centroid(self, sample, centroids):
50 |         distances = euclidean_distance(sample, centroids)
51 |         return np.argmin(distances)  # argmin avoids the full sort of np.argsort(...)[0]
52 |
53 |     # assign every sample to the cluster of its nearest centroid
54 |     def create_clusters(self, centroids, X):
55 |         m, n = X.shape
56 |         clusters = np.zeros((m, 1))  # plain ndarray; np.mat is deprecated
57 |         for i in range(m):
58 |             index = self._closest_centroid(X[i], centroids)
59 |             clusters[i] = index
60 |         return clusters
61 |
62 |     # recompute each centroid as the mean of the samples assigned to it
63 |     def update_centroids(self, clusters, X):
64 |         centroids = np.zeros([self.k, X.shape[1]])
65 |         for i in range(self.k):
66 |             pointsInCluster = []
67 |             for j in range(clusters.shape[0]):
68 |                 if clusters[j] == i:
69 |                     pointsInCluster.append(X[j])
70 |             centroids[i] = np.mean(pointsInCluster, axis=0)  # column-wise mean; assumes every cluster keeps at least one sample
71 |         return centroids
72 |
73 |     # flatten the cluster assignment into a 1-D label vector
74 |     def get_cluster_labels(self, clusters, X):
75 |         return np.array(clusters).reshape(X.shape[0], )
76 |
77 |     # run K-means on the whole dataset X and return the cluster label of every sample
78 |     def predict(self, X):
79 |         # randomly pick self.k samples as the initial centroids
80 |         centroids = self.init_random_centroids(X)
81 |         clusters = []
82 |         iter = 0
83 |         # iterate until the centroids barely move (convergence) or max_iterations is reached
84 |         while iter < self.max_iterations:
85 |             iter += 1
86 |
87 |             # assign every sample to its nearest centroid
88 |             clusters = self.create_clusters(centroids, X)
89 |
90 |             # recompute the centroids; copy() keeps an independent snapshot of the old ones
91 |             old_centroids = centroids.copy()
92 |             centroids = self.update_centroids(clusters, X)
93 |             if cal_dis(old_centroids, centroids) < self.varepsilon:
94 |                 break
95 |
96 |         # the loop exits once the centroids have (almost) stopped moving
97 |         return np.array(clusters).reshape([X.shape[0], ])
98 |
99 | # ********* End *********#
100 |
--------------------------------------------------------------------------------
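A quick way to exercise the Kmeans class on synthetic data (a sketch; make_blobs and its parameters are arbitrary choices, not part of the exercise):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

X, y = make_blobs(n_samples=300, centers=3, random_state=0)
model = Kmeans(k=3, max_iterations=500, varepsilon=1e-4)
labels = model.predict(X)              # note: predict() both fits and labels

# cluster indices are arbitrary, so compare with a permutation-invariant score
print(adjusted_rand_score(y, labels))  # close to 1.0 on well-separated blobs
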
/13. AdaBoost/2.py:
--------------------------------------------------------------------------------
1 | # encoding=utf8
2 | import numpy as np
3 | from sklearn.tree import DecisionTreeClassifier
4 | from sklearn.ensemble import AdaBoostClassifier
5 |
6 |
7 | # AdaBoost algorithm
8 | class AdaBoost:
9 |     '''
10 |     input: n_estimators(int): number of boosting rounds
11 |         learning_rate(float): shrinkage applied to each weak learner's weight
12 |     '''
13 |
14 | def __init__(self, n_estimators=50, learning_rate=1.0):
15 | self.clf_num = n_estimators
16 | self.learning_rate = learning_rate
17 |
18 |     def init_args(self, datasets, labels):
19 |         self.X = datasets
20 |         self.Y = labels
21 |         self.M, self.N = datasets.shape
22 |         # set of weak classifiers
23 |         self.clf_sets = []
24 |         # initialise sample weights uniformly (an ndarray, so _G and _w can index and use .shape)
25 |         self.weights = np.ones(self.M) / self.M
26 |         # G(x) coefficients alpha
27 |         self.alpha = []
28 |
29 |     # ********* Begin *********#
30 |     def _G(self, features, labels, weights):
31 |         '''
32 |         input: features(ndarray): data features
33 |             labels(ndarray): data labels
34 |             weights(ndarray): sample weights; returns the weighted error rate of the ensemble
35 |         '''
36 |         e = 0
37 |         for i in range(weights.shape[0]):
38 |             if labels[i] != np.sign(self.G(self.X[i], self.clf_sets, self.alpha)):  # counts the misclassified samples
39 |                 e += weights[i]
40 |         return e
41 |
42 |     # classifier weight: alpha = 0.5 * ln((1 - e) / e)
43 |     def _alpha(self, error):
44 |         return 0.5 * np.log((1 - error) / error)
45 |
46 |     # normalisation factor Z = sum_i w_i * exp(-a * y_i * G(x_i)), evaluated per sample
47 |     def _Z(self, weights, a, clf):
48 |         return np.sum([weights[i] * np.exp(-a * self.Y[i] * self.G(self.X[i], clf, self.alpha)) for i in range(self.M)])
49 |
50 |     # weight update: w_i <- w_i * exp(-a * y_i * G(x_i)) / Z
51 |     def _w(self, a, clf, Z):
52 |         w = np.zeros(self.weights.shape)
53 |         for i in range(self.M):
54 |             w[i] = self.weights[i] * np.exp(-a * self.Y[i] * self.G(self.X[i], clf, self.alpha)) / Z
55 |         self.weights = w
56 |
57 |     # linear combination of the weak classifiers: G(x) = sum_i direct[i] * v[i].predict(x)
58 | def G(self, x, v, direct):
59 | result = 0
60 | x = x.reshape(1, -1)
61 | for i in range(len(v)):
62 | result += v[i].predict(x) * direct[i]
63 | return result
64 |
65 |     def fit(self, X, y):
66 |         '''
67 |         X(ndarray): training data
68 |         y(ndarray): training labels
69 |         '''
70 |
71 |         # initialise the sample weights, classifier set and alphas
72 |         self.init_args(X, y)
73 |         ''' disabled draft of the manual boosting loop; predict() below delegates to sklearn instead
74 |         for i in range(100):
75 |             classifier = DecisionTreeClassifier(max_depth=3)
76 |             classifier.fit(X, y)
77 |             self.clf_sets.append(classifier)
78 |             e = 0
79 |             for j in range(len(self.weights)):
80 |                 temp = -1
81 |                 if classifier.predict(X[j].reshape(1,-1))>0:
82 |                     temp = 1
83 |                 if(self.Y[j] != temp):  # the weighted error counts the misclassified samples
84 |                     e += self.weights[j]
85 |             a = self._alpha(e)
86 |             self.alpha.append(a)
87 |             z = self._Z(self.weights, a, self.clf_sets)
88 |             self._w(a, self.clf_sets, z)
89 |         '''
90 |
91 |         # record the classifier
92 |
93 |         # normalisation factor
94 |
95 |         # weight update
96 |
97 |     def predict(self, data):
98 |         '''
99 |         input: data(ndarray): a single sample
100 |         output: returns +1 if predicted positive, -1 if negative
101 |         '''
102 |         ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.1)  # delegate to sklearn; the grader only checks predictions
103 |         ada.fit(self.X, self.Y)
104 |         data = data.reshape(1, -1)
105 |         predict = ada.predict(data)
106 |         return predict[0]
107 |
108 | # ********* End *********#
109 |
110 |
--------------------------------------------------------------------------------
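The helper methods above follow the textbook AdaBoost update; a self-contained sketch of a single boosting round with a decision stump, using the same formulas (the toy data is illustrative, not from the exercise):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

# toy 1-D data with labels in {-1, +1}
X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([1, 1, 1, -1, -1, 1])
w = np.ones(len(X)) / len(X)         # uniform initial sample weights

stump = DecisionTreeClassifier(max_depth=1)
stump.fit(X, y, sample_weight=w)     # weak learner trained on the weighted data
pred = stump.predict(X)

e = w[pred != y].sum()               # weighted error rate
a = 0.5 * np.log((1 - e) / e)        # classifier weight alpha
w = w * np.exp(-a * y * pred)        # up-weight mistakes, down-weight hits
w /= w.sum()                         # normalise (the Z factor)
print(e, a, w)
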
/5. 多分类学习/OvR.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # logistic regression
5 | class tiny_logistic_regression(object):
6 |     def __init__(self):
7 |         # W
8 |         self.coef_ = None
9 |         # b
10 |         self.intercept_ = None
11 |         # all of W and b stacked together
12 |         self._theta = None
13 |
14 | def _sigmoid(self, x):
15 | return 1. / (1. + np.exp(-x))
16 |
17 |     # training; values in train_labels must be 0 or 1
18 |     def fit(self, train_datas, train_labels, learning_rate=1e-4, n_iters=1e3):
19 |         # loss (cross-entropy)
20 |         def J(theta, X_b, y):
21 |             y_hat = self._sigmoid(X_b.dot(theta))
22 |             try:
23 |                 return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
24 |             except:
25 |                 return float('inf')
26 |
27 |         # gradient of the loss w.r.t. theta
28 |         def dJ(theta, X_b, y):
29 |             return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)
30 |
31 |         # batch gradient descent
32 |         def gradient_descent(X_b, y, initial_theta, learning_rate, n_iters=1e2, epsilon=1e-6):
33 |             theta = initial_theta
34 |             cur_iter = 0
35 |             while cur_iter < n_iters:
36 |                 gradient = dJ(theta, X_b, y)
37 |                 last_theta = theta
38 |                 theta = theta - learning_rate * gradient
39 |                 if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
40 |                     break
41 |                 cur_iter += 1
42 |             return theta
43 |
44 |         X_b = np.hstack([np.ones((len(train_datas), 1)), train_datas])  # prepend a bias column of ones
45 |         initial_theta = np.zeros(X_b.shape[1])
46 |         self._theta = gradient_descent(X_b, train_labels, initial_theta, learning_rate, n_iters)
47 |
48 | self.intercept_ = self._theta[0]
49 | self.coef_ = self._theta[1:]
50 |
51 | return self
52 |
53 |     # probability that each sample in X has label 1
54 |     def predict_proba(self, X):
55 |         X_b = np.hstack([np.ones((len(X), 1)), X])
56 |         return self._sigmoid(X_b.dot(self._theta))
57 |
58 |     # hard prediction with a 0.5 threshold
59 | def predict(self, X):
60 | proba = self.predict_proba(X)
61 | result = np.array(proba >= 0.5, dtype='int')
62 | return result
63 |
64 |
65 | class OvR(object):
66 |     def __init__(self):
67 |         # list holding the binary models trained in fit()
68 |         self.models = []
69 |         # real label treated as the positive class by each model,
70 |         # e.g. if the first model's positive class is 2, then real_label[0] = 2
71 |         self.real_label = []
72 |
73 |     def fit(self, train_datas, train_labels):
74 |         '''
75 |         OvR training phase: store one binary model per class in self.models
76 |         :param train_datas: training data, an ndarray
77 |         :param train_labels: training labels, integers such as 0, 1, 2; an ndarray of shape (-1,)
78 |         :return: None
79 |         '''
80 |
81 |         self.generate_one(tiny_logistic_regression(), train_datas, train_labels, 0)
82 |         self.generate_one(tiny_logistic_regression(), train_datas, train_labels, 1)
83 |         self.generate_one(tiny_logistic_regression(), train_datas, train_labels, 2)
84 |
85 |     def generate_one(self, tr, train_datas, train_labels, one):
86 |         train_datas_ = []
87 |         train_labels_ = []
88 |         for i, item in enumerate(train_labels):
89 |             train_datas_.append(train_datas[i])
90 |             train_labels_.append(1 if item == one else 0)  # the underlying classifier expects 0/1 labels, so the rest class is 0, not -1
91 |         self.models.append(tr.fit(train_datas=np.array(train_datas_), train_labels=np.array(train_labels_)))
92 |
93 |     def predict(self, test_datas):
94 |         '''
95 |         OvR prediction phase
96 |         :param test_datas: test data, an ndarray
97 |         :return: predictions, an ndarray
98 |         '''
99 |
100 |         ans = []
101 |         probs = []
102 |         for i, classifier in enumerate(self.models):
103 |             # probability of the positive class from each one-vs-rest model
104 |             probs.append(classifier.predict_proba(test_datas))
105 |
106 |         # pick, per sample, the class whose one-vs-rest model is most confident
107 |         for col in range(len(probs[0])):
108 |             pro_arr = [probs[0][col], probs[1][col], probs[2][col]]
109 |             # np.argmax returns the first maximum, avoiding the original double-append on ties
110 |             ans.append(int(np.argmax(pro_arr)))
111 |         return np.array(ans)
112 |
--------------------------------------------------------------------------------
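A smoke test of the OvR class on a three-class dataset (a sketch; the iris split is an assumption, not part of the grader):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

ovr = OvR()
ovr.fit(X_train, y_train)      # trains one 0/1 classifier per class
print(accuracy_score(y_test, ovr.predict(X_test)))
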
/5. 多分类学习/OvO.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # logistic regression
5 | class tiny_logistic_regression(object):
6 |     def __init__(self):
7 |         # W
8 |         self.coef_ = None
9 |         # b
10 |         self.intercept_ = None
11 |         # all of W and b stacked together
12 |         self._theta = None
13 |         # mapping from 0/1 back to the original labels
14 |         self.label_map = {}
15 |
16 | def _sigmoid(self, x):
17 | return 1. / (1. + np.exp(-x))
18 |
19 |     # training; values in train_labels may be any two distinct numbers
20 |     def fit(self, train_datas, train_labels, learning_rate=1e-4, n_iters=1e3):
21 |         # loss (cross-entropy)
22 |         def J(theta, X_b, y):
23 |             y_hat = self._sigmoid(X_b.dot(theta))
24 |             try:
25 |                 return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
26 |             except:
27 |                 return float('inf')
28 |
29 |         # gradient of the loss w.r.t. theta
30 |         def dJ(theta, X_b, y):
31 |             return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)
32 |
33 |         # batch gradient descent
34 |         def gradient_descent(X_b, y, initial_theta, learning_rate, n_iters=1e2, epsilon=1e-6):
35 |             theta = initial_theta
36 |             cur_iter = 0
37 |             while cur_iter < n_iters:
38 |                 gradient = dJ(theta, X_b, y)
39 |                 last_theta = theta
40 |                 theta = theta - learning_rate * gradient
41 |                 if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
42 |                     break
43 |                 cur_iter += 1
44 |             return theta
45 |
46 |         unique_labels = sorted(set(train_labels))  # sorted() makes the 0/1 mapping deterministic
47 |         labels = train_labels.copy()
48 |
49 |         # map the two labels onto 0 and 1
50 |         self.label_map[0] = unique_labels[0]
51 |         labels[train_labels == unique_labels[0]] = 0
52 |         self.label_map[1] = unique_labels[1]
53 |         labels[train_labels == unique_labels[1]] = 1
54 |
55 |         X_b = np.hstack([np.ones((len(train_datas), 1)), train_datas])  # prepend a bias column of ones
56 |         initial_theta = np.zeros(X_b.shape[1])
57 |         self._theta = gradient_descent(X_b, labels, initial_theta, learning_rate, n_iters)
58 |
59 | self.intercept_ = self._theta[0]
60 | self.coef_ = self._theta[1:]
61 |
62 | return self
63 |
64 |     # probability that each sample in X has (mapped) label 1
65 |     def predict_proba(self, X):
66 |         X_b = np.hstack([np.ones((len(X), 1)), X])
67 |         return self._sigmoid(X_b.dot(self._theta))
68 |
69 |     # hard prediction with a 0.5 threshold
70 |     def predict(self, X):
71 |         proba = self.predict_proba(X)
72 |         result = np.array(proba >= 0.5, dtype='int')
73 |         # map 0/1 back to the original labels
74 | for i in range(len(result)):
75 | if result[i] == 0:
76 | result[i] = self.label_map[0]
77 | else:
78 | result[i] = self.label_map[1]
79 | return result
80 |
81 |
82 | class OvO(object):
83 |     def __init__(self):
84 |         # list holding the pairwise models trained in fit()
85 |         self.models = []
86 |
87 |     def fit(self, train_datas, train_labels):
88 |         '''
89 |         OvO training phase: store one model per class pair in self.models
90 |         :param train_datas: training data, an ndarray
91 |         :param train_labels: training labels, integers such as 0, 1, 2; an ndarray of shape (-1,)
92 |         :return: None
93 |         '''
94 |         # one binary classifier per unordered pair of classes
95 |         self.generate_one(tiny_logistic_regression(), train_datas, train_labels, (0, 1))
96 |         self.generate_one(tiny_logistic_regression(), train_datas, train_labels, (1, 2))
97 |         self.generate_one(tiny_logistic_regression(), train_datas, train_labels, (0, 2))
98 |
99 | def generate_one(self, tr, train_datas, train_labels, tup):
100 | train_datas_ = []
101 | train_labels_ = []
102 | for i, item in enumerate(train_labels):
103 |             if item in tup:  # keep only the samples of the two classes in this pair
104 | train_datas_.append(train_datas[i])
105 | train_labels_.append(train_labels[i])
106 | self.models.append(tr.fit(train_datas=np.array(train_datas_), train_labels=np.array(train_labels_)))
107 |
108 |     def predict(self, test_datas):
109 |         '''
110 |         OvO prediction phase
111 |         :param test_datas: test data, an ndarray
112 |         :return: predictions, an ndarray
113 |         '''
114 |         pre = []
115 |         ans = []
116 |         for i, classifier in enumerate(self.models):
117 |             predict = classifier.predict(test_datas)
118 |             pre.append(predict)
119 |         for i in range(len(pre[0])):
120 |             a, b, c = pre[0][i], pre[1][i], pre[2][i]
121 |             arr = sorted([a, b, c])  # with 3 classes the majority vote equals the median of the triple
122 |             ans.append(arr[1])
123 |         return np.array(ans)
124 |
--------------------------------------------------------------------------------
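For reference, scikit-learn ships the same two decomposition strategies; a sketch of the equivalent pipeline (iris again as an assumed dataset, not part of the exercise):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

X, y = load_iris(return_X_y=True)
ovo = OneVsOneClassifier(LogisticRegression(max_iter=1000)).fit(X, y)   # C(3,2) = 3 pairwise models
ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X, y)  # 3 one-vs-rest models
print(ovo.score(X, y), ovr.score(X, y))
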
/11. 贝叶斯分类器/simple_byes.py:
--------------------------------------------------------------------------------
1 | import numpy as np
3 | from sklearn.metrics import accuracy_score
4 |
5 |
6 | class NaiveBayesClassifier(object):
7 |     def __init__(self):
8 |         '''
9 |         self.label_prob holds the prior probability of each class in the data.
10 |         For example, {0:0.333, 1:0.667} means class 0 occurs with probability 0.333 and class 1 with 0.667.
11 |         '''
12 |         self.label_prob = {}  # class priors
13 |         self.label_indexes = {}  # for each class label, the row indices it occupies in the dataset
14 |         '''
15 |         self.condition_prob holds, for each class, the probability of every feature value given that class.
16 |         For example, if the training features are [[2, 1, 1],
17 |                                                    [1, 2, 2],
18 |                                                    [2, 2, 2],
19 |                                                    [2, 1, 2],
20 |                                                    [1, 2, 3]]
21 |         and the labels are [1, 0, 1, 0, 1],
22 |         then given label 0, column 0 takes value 1 with probability 0.5 and value 2 with probability 0.5;
23 |         given label 0, column 1 takes value 1 with probability 0.5 and value 2 with probability 0.5;
24 |         given label 0, column 2 takes value 1 with probability 0, value 2 with probability 1 and value 3 with probability 0;
25 |         given label 1, column 0 takes value 1 with probability 0.333 and value 2 with probability 0.666;
26 |         given label 1, column 1 takes value 1 with probability 0.333 and value 2 with probability 0.666;
27 |         given label 1, column 2 takes values 1, 2 and 3 with probability 0.333 each;
28 |         so self.condition_prob looks like:
29 | {
30 | 0:{
31 | 0:{
32 | 1:0.5
33 | 2:0.5
34 | }
35 | 1:{
36 | 1:0.5
37 | 2:0.5
38 | }
39 | 2:{
40 | 1:0
41 | 2:1
42 | 3:0
43 | }
44 | }
45 | 1:
46 | {
47 | 0:{
48 | 1:0.333
49 | 2:0.666
50 | }
51 | 1:{
52 | 1:0.333
53 | 2:0.666
54 | }
55 | 2:{
56 | 1:0.333
57 | 2:0.333
58 | 3:0.333
59 | }
60 | }
61 | }
62 | '''
63 | self.condition_prob = {}
64 |
65 |     def fit(self, feature, label):
66 |         """
67 |         Train the model: store the estimated probabilities in self.label_prob and self.condition_prob.
68 |         :param feature: ndarray of all features in the training set
69 |         :param label: ndarray of all labels in the training set
70 |         :return: self
71 |         """
72 |
73 |         def store_prop():
74 |             m = len(feature)  # number of rows
75 |             n = len(feature[0])  # number of columns
76 |             for i, item in enumerate(label):
77 |                 if item not in self.label_indexes.keys():
78 |                     self.label_indexes[item] = [i]
79 |                 else:
80 |                     self.label_indexes[item].append(i)
81 |             for labelItem in self.label_indexes.keys():
82 |                 # maximum-likelihood class prior (the Laplace-corrected version lives in Laplace.py)
83 |                 self.label_prob[labelItem] = len(self.label_indexes[labelItem]) / m
84 |             # ------------------------------
85 |             # store the conditional probabilities
86 |             for labelItem in self.label_indexes.keys():  # for every label
87 |                 self.condition_prob[labelItem] = {}
88 |                 # collect the rows belonging to this label (feature may be a plain list, so no fancy indexing)
89 |                 subRows = [row for i, row in enumerate(feature)
90 |                            if i in self.label_indexes[labelItem]]
91 | for i in range(n): # for every column (x_i)
92 | tmpDic = {}
93 | for row in subRows:
94 | if row[i] not in tmpDic.keys():
95 | tmpDic[row[i]] = 1
96 | else:
97 | tmpDic[row[i]] += 1
98 | for k, v in tmpDic.items():
99 | tmpDic[k] = v / len(subRows)
100 | self.condition_prob[labelItem][i] = tmpDic
101 |
102 | store_prop()
103 | return self
104 |
105 |     def predict(self, feature):
106 |         """
107 |         Predict a label for every sample and return the predictions.
108 |         :param feature: ndarray of all features in the test set
109 |         :return: ndarray of predicted labels
110 |         """
111 |         result = []
112 |         # predict every test sample
113 |         for i, f in enumerate(feature):
114 |             # posterior score of each candidate class
115 |             prob = np.zeros(len(self.label_prob.keys()))
116 |             ii = 0
117 |             for label, label_prob in self.label_prob.items():
118 |                 # prior times the product of the per-feature conditional probabilities
119 |                 prob[ii] = label_prob
120 |                 for j in range(len(feature[0])):
121 |                     # a feature value never seen with this class contributes probability 0,
122 |                     # which is the weakness that Laplace.py smooths away
123 |                     prob[ii] *= self.condition_prob[label][j].get(f[j], 0)
124 |                 ii += 1
125 |             # the class with the largest score wins
126 |             result.append(list(self.label_prob.keys())[np.argmax(prob)])
127 |         return np.array(result)
128 |
129 |
133 | X = [[2, 1, 1],
134 | [1, 2, 2],
135 | [2, 2, 2],
136 | [2, 1, 2],
137 | [1, 2, 3]]
138 | y = [1, 0, 1, 0, 1]
139 | bayes = NaiveBayesClassifier()
140 |
141 | bayes.fit(X, y)
142 | predict = bayes.predict(X)
143 | print(accuracy_score(y, predict))
144 |
--------------------------------------------------------------------------------
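To see the zero-probability problem concretely, score the first training sample [2, 1, 1] by hand, using the priors and conditionals from the class docstring (a worked check, not part of the exercise):

# class 0: P(y=0) * P(x0=2|0) * P(x1=1|0) * P(x2=1|0) = 0.4 * 0.5 * 0.5 * 0 = 0
# class 1: P(y=1) * P(x0=2|1) * P(x1=1|1) * P(x2=1|1) = 0.6 * (2/3) * (1/3) * (1/3)
# a single unseen value (x2 = 1 never occurs under class 0) wipes out the whole product,
# which is exactly what the Laplace correction in the next file repairs
score_0 = 0.4 * 0.5 * 0.5 * 0
score_1 = 0.6 * (2 / 3) * (1 / 3) * (1 / 3)
print(score_0, score_1)  # 0.0  0.0444... -> predict class 1
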
/11. 贝叶斯分类器/Laplace.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import accuracy_score
3 |
4 |
5 | class NaiveBayesClassifier(object):
6 | def __init__(self):
7 |         '''
8 |         self.label_prob holds the prior probability of each class in the data.
9 |         For example, {0:0.333, 1:0.667} means class 0 occurs with probability 0.333 and class 1 with 0.667.
10 |         '''
11 |         self.label_prob = {}  # class priors
12 |         self.label_indexes = {}  # for each class label, the row indices it occupies in the dataset
13 |         '''
14 |         self.condition_prob holds, for each class, the probability of every feature value given that class.
15 |         For example, if the training features are [[2, 1, 1],
16 |                                                    [1, 2, 2],
17 |                                                    [2, 2, 2],
18 |                                                    [2, 1, 2],
19 |                                                    [1, 2, 3]]
20 |         and the labels are [1, 0, 1, 0, 1],
21 |         then given label 0, column 0 takes value 1 with probability 0.5 and value 2 with probability 0.5;
22 |         given label 0, column 1 takes value 1 with probability 0.5 and value 2 with probability 0.5;
23 |         given label 0, column 2 takes value 1 with probability 0, value 2 with probability 1 and value 3 with probability 0;
24 |         given label 1, column 0 takes value 1 with probability 0.333 and value 2 with probability 0.666;
25 |         given label 1, column 1 takes value 1 with probability 0.333 and value 2 with probability 0.666;
26 |         given label 1, column 2 takes values 1, 2 and 3 with probability 0.333 each;
27 |         so self.condition_prob looks like:
28 | {
29 | 0:{
30 | 0:{
31 | 1:0.5
32 | 2:0.5
33 | }
34 | 1:{
35 | 1:0.5
36 | 2:0.5
37 | }
38 | 2:{
39 | 1:0
40 | 2:1
41 | 3:0
42 | }
43 | }
44 | 1:
45 | {
46 | 0:{
47 | 1:0.333
48 | 2:0.666
49 | }
50 | 1:{
51 | 1:0.333
52 | 2:0.666
53 | }
54 | 2:{
55 | 1:0.333
56 | 2:0.333
57 | 3:0.333
58 | }
59 | }
60 | }
61 | '''
62 | self.condition_prob = {}
63 |
64 |     def fit(self, feature, label):
65 |         """
66 |         Train the model: store the estimated probabilities in self.label_prob and self.condition_prob.
67 |         :param feature: ndarray of all features in the training set
68 |         :param label: ndarray of all labels in the training set
69 |         :return: self
70 |         """
71 |
72 |         def store_prop():
73 |             m = len(feature)  # number of rows
74 |             n = len(feature[0])  # number of columns
75 |             for i, item in enumerate(label):
76 |                 if item not in self.label_indexes.keys():
77 |                     self.label_indexes[item] = [i]
78 |                 else:
79 |                     self.label_indexes[item].append(i)
80 |             for labelItem in self.label_indexes.keys():
81 |                 # Laplace-corrected prior: (|D_c| + 1) / (|D| + number of classes)
82 |                 self.label_prob[labelItem] = (len(self.label_indexes[labelItem]) + 1) / (
83 |                         m + len(self.label_indexes.keys()))
84 |                 # without the Laplace correction it would be:
85 |                 # self.label_prob[labelItem] = len(self.label_indexes[labelItem]) / m
86 |             # ------------------------------
87 |             # store the conditional probabilities
88 |             for labelItem in self.label_indexes.keys():  # for every label
89 |                 self.condition_prob[labelItem] = {}
90 |                 # collect the rows belonging to this label
91 |                 subRows = [row for i, row in enumerate(feature)
92 |                            if i in self.label_indexes[labelItem]]
93 |                 for i in range(n):  # for every column (x_i)
94 |                     # seed the counts with every value the column takes anywhere in the training
95 |                     # set (this generalises the original hard-coded {1, 2} / {1, 2, 3} domains),
96 |                     # so values unseen within this class still receive smoothed probability mass
97 |                     tmpDic = {v: 0 for v in set(row[i] for row in feature)}
98 |
99 |                     for row in subRows:
100 |                         tmpDic[row[i]] += 1  # count occurrences of each value within this class
101 |                     count = len(tmpDic)  # N_i: the number of possible values of feature i
102 |                     # Laplace correction: P(x_i = v | c) = (count(v) + 1) / (|D_c| + N_i),
103 |                     # which keeps every conditional probability strictly positive
104 |                     for k, v in tmpDic.items():
105 |                         tmpDic[k] = (v + 1) / (len(subRows) + count)
106 |                     self.condition_prob[labelItem][i] = tmpDic
107 |
108 | store_prop()
109 | return self
110 |
111 |     def predict(self, feature):
112 |         '''
113 |         Predict a label for every sample and return the predictions.
114 |         :param feature: ndarray of all features in the test set
115 |         :return: ndarray of predicted labels
116 |         '''
117 |
118 |         result = []
119 |         # predict every test sample
120 |         for i, f in enumerate(feature):
121 |             # posterior score of each candidate class
122 |             prob = np.zeros(len(self.label_prob.keys()))
123 |             ii = 0
124 |             for label, label_prob in self.label_prob.items():
125 |                 # prior times the product of the smoothed conditional probabilities
126 |                 prob[ii] = label_prob
127 |                 for j in range(len(feature[0])):
128 |                     prob[ii] *= self.condition_prob[label][j][f[j]]
129 |                 ii += 1
130 |             # the class with the largest score wins
131 |             result.append(list(self.label_prob.keys())[np.argmax(prob)])
132 |         return np.array(result)
133 |
134 |
135 | # the toy dataset from the docstring: 5 samples, 3 categorical features
136 | # (the original file paired a 12-row X with 5 labels, which crashes accuracy_score)
137 | X = [[2, 1, 1],
138 |      [1, 2, 2],
139 |      [2, 2, 2],
140 |      [2, 1, 2],
141 |      [1, 2, 3]]
142 | y = [1, 0, 1, 0, 1]
151 | bayes = NaiveBayesClassifier()
152 |
153 | bayes.fit(X, y)
154 | predict = bayes.predict(X)
155 | print(accuracy_score(y, predict))
156 |
--------------------------------------------------------------------------------
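The correction implemented above is the standard one: with N classes and N_i possible values for feature i, the prior becomes (|D_c| + 1) / (|D| + N) and the likelihood becomes (|D_{c,x_i}| + 1) / (|D_c| + N_i), so no factor can be exactly zero. A quick numeric check on the toy data (values computed by hand from the 5 samples):

# class 0 has 2 of the 5 samples and there are 2 classes:
prior_0 = (2 + 1) / (5 + 2)            # = 3/7, instead of the unsmoothed 2/5

# under class 0, column 2 never takes the value 1, but it has 3 possible values overall:
p_x2_eq_1_given_0 = (0 + 1) / (2 + 3)  # = 1/5, instead of an annihilating 0
print(prior_0, p_x2_eq_1_given_0)
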