├── 11. 贝叶斯分类器
│   ├── __init__.py
│   ├── sk.py
│   ├── simple_byes.py
│   └── Laplace.py
├── 17. 高斯混合聚类
│   ├── 1.md
│   ├── result.jpg
│   ├── test.jpg
│   └── 3.py
├── 9. 神经网络
│   ├── 1_基本概念.md
│   ├── 2_activation_function.py
│   ├── 3_backpropagation.py
│   └── 4_pytorch_mnist.py
├── 12. EM算法
│   ├── 1_极大似然估计.md
│   ├── 2_EM_single_iteration.py
│   └── 3_EM_main_iteration.py
├── 21. PCA
│   ├── 1_维数灾难.md
│   ├── 2_PCA.py
│   └── 3_sklearn_PCA.py
├── 18. DBSCAN
│   ├── 1_basic_concept.md
│   ├── 3_sklearn_DBSCAN.py
│   └── 2_DBSCAN_algorithm.py
├── 3. 线性回归
│   ├── 1_简单与多元线性回归.md
│   ├── 4_sklearn_linearRegression.py
│   ├── 2_normal_equation.py
│   └── 3_metrics.py
├── README.md
├── 16. k-means
│   ├── 1.py
│   ├── 4.py
│   ├── 2.py
│   └── 3.py
├── 19. AGNES
│   ├── 3.py
│   ├── 1.py
│   └── 2.py
├── 22. 多维缩放
│   ├── 2.py
│   └── 1.py
├── .gitignore
├── 6. 线性判别分析
│   ├── 2_sklearn_LDA.py
│   └── 1_LDA.py
├── 15. 聚类性能评估指标
│   ├── 3_sklearn_metrics.py
│   ├── 1_external_index.py
│   └── 2_internal_index.py
├── 8. 感知机
│   ├── sk.py
│   └── preception.py
├── 24. 局部线性嵌入
│   ├── 2_sklearn_LLE.py
│   └── 1_LLE.py
├── 23 等度量映射
│   ├── sk.py
│   └── isomap.py
├── 4. 逻辑回归
│   ├── 3.py
│   ├── 5.py
│   └── 4.py
├── 14. 随机森林
│   ├── 3.Digit.py
│   ├── Bagging.py
│   └── RandomForest.py
├── 20 KNN
│   ├── sk.py
│   └── knn.py
├── 13. AdaBoost
│   ├── 3.py
│   └── 2.py
├── 2. 模型评估与选择
│   └── main.py
└── 5. 多分类学习
    ├── OvR.py
    └── OvO.py

--------------------------------------------------------------------------------
/11. 贝叶斯分类器/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/17. 高斯混合聚类/1.md:
--------------------------------------------------------------------------------
1. A B C
2. A

--------------------------------------------------------------------------------
/9. 神经网络/1_基本概念.md:
--------------------------------------------------------------------------------
1. C

--------------------------------------------------------------------------------
/12. EM算法/1_极大似然估计.md:
--------------------------------------------------------------------------------
1. A
2. B
3. D

--------------------------------------------------------------------------------
/21. PCA/1_维数灾难.md:
--------------------------------------------------------------------------------
### 1. B, C

### 2. C

--------------------------------------------------------------------------------
/18. DBSCAN/1_basic_concept.md:
--------------------------------------------------------------------------------
### 1. D

--------------------------------------------------------------------------------
/3. 线性回归/1_简单与多元线性回归.md:
--------------------------------------------------------------------------------
1. B, C
2. A, B, C
3. A

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# educoder-oj
## Solutions to the OJ exercises of the Nanjing University machine learning course

## Team up, grind the quests, let's go!

--------------------------------------------------------------------------------
/17. 高斯混合聚类/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nju-ml-course/educoder-oj/HEAD/17. 高斯混合聚类/result.jpg
--------------------------------------------------------------------------------
/17. 高斯混合聚类/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nju-ml-course/educoder-oj/HEAD/17. 高斯混合聚类/test.jpg

--------------------------------------------------------------------------------
/9. 神经网络/2_activation_function.py:
--------------------------------------------------------------------------------
# encoding=utf8

def relu(x):
    '''
    x: any real number
    '''
    # ********* Begin *********#
    if x <= 0:
        return 0
    else:
        return x
    # ********* End *********#

--------------------------------------------------------------------------------
/16. k-means/1.py:
--------------------------------------------------------------------------------
#encoding=utf8
import numpy as np

def distance(x, y, p=2):
    '''
    input: x(ndarray): coordinates of the first sample
           y(ndarray): coordinates of the second sample
           p(int): 1 for Manhattan distance, 2 for Euclidean distance
    output: distance(float): distance from x to y
    '''
    #********* Begin *********#
    return np.linalg.norm(x - y, p)
    #********* End *********#
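
A quick usage sketch for distance() above (an editor's illustration, not a repo file; the sample points are made up):
--------------------------------------------------------------------------------
import numpy as np

def distance(x, y, p=2):
    # same body as 16. k-means/1.py
    return np.linalg.norm(x - y, p)

a = np.array([0.0, 0.0])
b = np.array([3.0, 4.0])
print(distance(a, b, p=2))  # Euclidean distance: 5.0
print(distance(a, b, p=1))  # Manhattan distance: 7.0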
--------------------------------------------------------------------------------
/16. k-means/4.py:
--------------------------------------------------------------------------------
#encoding=utf8
from sklearn.cluster import KMeans

def kmeans_cluster(data):
    '''
    input: data(ndarray): sample data
    output: result(ndarray): cluster labels
    '''
    #********* Begin *********#
    km = KMeans(n_clusters=3, random_state=888)
    result = km.fit_predict(data)
    #********* End *********#
    return result

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# See https://help.github.com/ignore-files/ for more about ignoring files.

# dependencies
/node_modules

# testing
/coverage

# production
/build

# misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local


# idea
.idea

npm-debug.log*
yarn-debug.log*
yarn-error.log*

# umi
.umi

--------------------------------------------------------------------------------
/6. 线性判别分析/2_sklearn_LDA.py:
--------------------------------------------------------------------------------
# encoding=utf8
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def lda(x, y):
    """
    input: x(ndarray): data to be reduced
           y(ndarray): labels of the data
    output: x_new(ndarray): data after dimensionality reduction
    """
    # ********* Begin *********#
    lda = LinearDiscriminantAnalysis(n_components=2)
    lda.fit(x, y)
    x_new = lda.transform(x)
    # ********* End *********#
    return x_new

--------------------------------------------------------------------------------
/22. 多维缩放/2.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from sklearn.manifold import MDS


def mds(data, d):
    '''
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
    output: Z(ndarray): data after dimensionality reduction
    '''
    # ********* Begin *********#
    mds = MDS(d)
    Z = mds.fit_transform(data)
    # ********* End *********#
    return Z
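
A usage sketch for the mds() wrapper above (an editor's illustration, not a repo file; the dataset choice is arbitrary):
--------------------------------------------------------------------------------
from sklearn.datasets import load_iris
from sklearn.manifold import MDS

X = load_iris().data           # shape (150, 4)
Z = MDS(2).fit_transform(X)    # same call as mds(X, 2) above
print(Z.shape)                 # (150, 2)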
--------------------------------------------------------------------------------
/18. DBSCAN/3_sklearn_DBSCAN.py:
--------------------------------------------------------------------------------
# encoding=utf8
from sklearn.cluster import DBSCAN


def data_cluster(data):
    """
    input: data(ndarray): data to cluster
    output: result(ndarray): cluster labels
    """
    # ********* Begin *********#
    dbscan = DBSCAN(eps=0.5, min_samples=10)
    result = dbscan.fit_predict(data)
    return result
    # ********* End *********#

--------------------------------------------------------------------------------
/15. 聚类性能评估指标/3_sklearn_metrics.py:
--------------------------------------------------------------------------------
from sklearn.metrics.cluster import fowlkes_mallows_score, adjusted_rand_score


def cluster_performance(y_true, y_pred):
    """
    Return the FM index and the (adjusted) Rand index, in that order
    :param y_true: reference cluster assignment, ndarray
    :param y_pred: cluster assignment produced by the model, ndarray
    :return: FM index, adjusted Rand index
    """
    # ********* Begin *********#
    rand = adjusted_rand_score(y_true, y_pred)
    fm = fowlkes_mallows_score(y_true, y_pred)
    return fm, rand
    # ********* End *********#

--------------------------------------------------------------------------------
/8. 感知机/sk.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.linear_model import Perceptron

# load training data
train_data = pd.read_csv('./step2/train_data.csv')
# load training labels
train_label = pd.read_csv('./step2/train_label.csv')
train_label = train_label['target']
# load test data
test_data = pd.read_csv('./step2/test_data.csv')
clf = Perceptron(max_iter=100000)
clf.fit(train_data, train_label)
result = clf.predict(test_data)

pd.DataFrame({'result': result}).to_csv('./step2/result.csv', index=False)

--------------------------------------------------------------------------------
/19. AGNES/3.py:
--------------------------------------------------------------------------------
#encoding=utf8
from sklearn.cluster import AgglomerativeClustering

def Agglomerative_cluster(data):
    '''
    Cluster the wine dataset
    :param data: dataset, ndarray
    :return: cluster labels, ndarray
    '''
    #********* Begin *********#
    mean = data.mean()       # mean
    deviation = data.std()   # standard deviation
    # standardization: (value - mean) / standard deviation
    data = (data - mean) / deviation
    agnes = AgglomerativeClustering(n_clusters=3)
    result = agnes.fit_predict(data)
    return result
    #********* End *********#

--------------------------------------------------------------------------------
/21. PCA/2_PCA.py:
--------------------------------------------------------------------------------
import numpy as np


def pca(data, k):
    """
    Run PCA on data and return the projected data
    :param data: dataset, ndarray
    :param k: target number of dimensions, int
    :return: data after dimensionality reduction, ndarray
    """
    # ********* Begin *********#
    # center the data
    mean = np.mean(data, axis=0)
    after_demean = data - mean

    cov = np.cov(after_demean.T)

    value, vector = np.linalg.eig(cov)

    # keep the eigenvectors of the k largest eigenvalues
    index = np.argsort(-value)[: k]
    w = vector[:, index]

    return np.dot(after_demean, w)
    # ********* End *********#
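
A sanity-check sketch for pca() above (an editor's illustration, not a repo file): sklearn's PCA should recover the same projection, although individual components may differ in sign.
--------------------------------------------------------------------------------
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
data = rng.rand(20, 5)

# hand-rolled version, same steps as 21. PCA/2_PCA.py
demeaned = data - data.mean(axis=0)
value, vector = np.linalg.eig(np.cov(demeaned.T))
w = vector[:, np.argsort(-value)[:2]]
ours = demeaned @ w

theirs = PCA(n_components=2).fit_transform(data)
# the columns agree up to sign
print(np.allclose(np.abs(ours), np.abs(theirs)))  # expected: True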
--------------------------------------------------------------------------------
/24. 局部线性嵌入/2_sklearn_LLE.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from sklearn.manifold import LocallyLinearEmbedding


def lle(data, d, k):
    """
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
           k(int): number of samples in each neighbourhood
    output: Z(ndarray): data after dimensionality reduction
    """
    # ********* Begin *********#
    lle = LocallyLinearEmbedding(n_components=d, n_neighbors=k)
    Z = lle.fit_transform(data)
    # ********* End *********#
    return Z

--------------------------------------------------------------------------------
/3. 线性回归/4_sklearn_linearRegression.py:
--------------------------------------------------------------------------------
# encoding=utf8
# ********* Begin *********#
from sklearn.linear_model import LinearRegression
import pandas as pd

# load training data
train_data = pd.read_csv('./step3/train_data.csv')
# load training labels
train_label = pd.read_csv('./step3/train_label.csv')
train_label = train_label['target']
# load test data
test_data = pd.read_csv('./step3/test_data.csv')

model = LinearRegression(normalize=True)
model.fit(train_data, train_label)
test_y = model.predict(test_data)

pd.DataFrame(test_y, columns=['result']).to_csv('./step3/result.csv')

# ********* End *********#

--------------------------------------------------------------------------------
/23 等度量映射/sk.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from sklearn.manifold import Isomap
import isomap as isa
import sklearn.datasets as db


def isomap(data, d, k):
    '''
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
           k(int): number of nearest neighbours
    output: Z(ndarray): data after dimensionality reduction
    '''
    # ********* Begin *********#
    iso = Isomap(n_neighbors=k, n_components=d)
    return iso.fit_transform(data)


if __name__ == '__main__':
    ir = db.load_boston()
    X1 = isa.isomap(ir.data[:10], d=2, k=4)
    X2 = isomap(ir.data[:10], d=2, k=4)
    print(X1)
    print(X2)

--------------------------------------------------------------------------------
/4. 逻辑回归/3.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import numpy as np
import warnings
warnings.filterwarnings("ignore")

def gradient_descent(initial_theta, eta=0.05, n_iters=1e3, epslion=1e-8):
    '''
    Gradient descent
    :param initial_theta: initial value of the parameter, float
    :param eta: learning rate, float
    :param n_iters: number of training iterations, int
    :param epslion: error tolerance, float (unused here; kept for the OJ signature)
    :return: the parameter after training
    '''
    # add your implementation here #
    #********** Begin *********#
    i = 0
    while i < n_iters:
        # the loss is (theta - 3) ** 2, so the gradient is 2 * (theta - 3)
        initial_theta = initial_theta - eta * 2 * (initial_theta - 3)
        i += 1
    return initial_theta
    #********** End **********#
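
A worked usage example for gradient_descent() above (an editor's illustration, not a repo file):
--------------------------------------------------------------------------------
def gradient_descent(initial_theta, eta=0.05, n_iters=1e3):
    # same update rule as 4. 逻辑回归/3.py
    i = 0
    while i < n_iters:
        initial_theta -= eta * 2 * (initial_theta - 3)
        i += 1
    return initial_theta

# One step from 0 gives 0 - 0.05 * 2 * (0 - 3) = 0.3; each step multiplies the
# distance to the minimum at 3 by (1 - 2 * 0.05) = 0.9, so 1000 steps converge.
print(gradient_descent(0.0))  # ~3.0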
--------------------------------------------------------------------------------
/4. 逻辑回归/5.py:
--------------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression

def digit_predict(train_image, train_label, test_image):
    '''
    Train a model and return its predictions
    :param train_image: training samples, ndarray of shape [-1, 8, 8]
    :param train_label: training labels, ndarray
    :param test_image: test samples, ndarray
    :return: predicted labels for test_image
    '''
    #************* Begin ************#
    logreg = LogisticRegression(solver='newton-cg', max_iter=1000, C=1)
    logreg.fit(train_image.reshape(train_image.shape[0], -1), train_label)
    return logreg.predict(test_image.reshape(test_image.shape[0], -1))
    #************* End **************#

--------------------------------------------------------------------------------
/14. 随机森林/3.Digit.py:
--------------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def digit_predict(train_image, train_label, test_image):
    """
    Train a model and return its predictions
    :param train_image: training samples, ndarray of shape [-1, 8, 8]
    :param train_label: training labels, ndarray
    :param test_image: test samples, ndarray
    :return: predicted labels for test_image, ndarray
    """
    X = np.reshape(train_image, newshape=(-1, 64))
    clf = RandomForestClassifier(n_estimators=500, max_depth=10)
    clf.fit(X, y=train_label)
    # the test images must be flattened the same way as the training images
    return clf.predict(np.reshape(test_image, newshape=(-1, 64)))

--------------------------------------------------------------------------------
/20 KNN/sk.py:
--------------------------------------------------------------------------------
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


def classification(train_feature, train_label, test_feature):
    '''
    Classify the wine samples in test_feature
    :param train_feature: training data, ndarray
    :param train_label: training labels, ndarray
    :param test_feature: test data, ndarray
    :return: predicted classes of the test data
    '''
    # instantiate a StandardScaler
    scaler = StandardScaler()
    # standardize using the mean and standard deviation of the training data
    X = scaler.fit_transform(train_feature)
    # apply the same fitted scaler to the test data
    X_test = scaler.transform(test_feature)
    clf = KNeighborsClassifier()
    clf.fit(X, train_label)
    return clf.predict(X_test)

--------------------------------------------------------------------------------
/21. PCA/3_sklearn_PCA.py:
--------------------------------------------------------------------------------
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC


def cancer_predict(train_sample, train_label, test_sample):
    """
    Reduce dimensionality with PCA, classify, and return the predictions
    :param train_sample: training samples, ndarray
    :param train_label: training labels, ndarray
    :param test_sample: test samples, ndarray
    :return: predicted classes
    """
    # ********* Begin *********#
    pca = PCA(n_components=11)
    train_sample_transformed = pca.fit_transform(train_sample)
    test_sample_transformed = pca.transform(test_sample)

    clf = LinearSVC()
    clf.fit(train_sample_transformed, train_label)
    return clf.predict(test_sample_transformed)
    # ********* End *********#
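
A usage sketch for cancer_predict() above (an editor's illustration, not a repo file; the dataset choice is an assumption, picked because it has 30 features, enough for 11 PCA components):
--------------------------------------------------------------------------------
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

def cancer_predict(train_sample, train_label, test_sample):
    # same pipeline as 21. PCA/3_sklearn_PCA.py
    pca = PCA(n_components=11)
    clf = LinearSVC().fit(pca.fit_transform(train_sample), train_label)
    return clf.predict(pca.transform(test_sample))

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(accuracy_score(y_test, cancer_predict(X_train, y_train, X_test)))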
--------------------------------------------------------------------------------
/17. 高斯混合聚类/3.py:
--------------------------------------------------------------------------------
from PIL import Image
import numpy as np
from sklearn.mixture import GaussianMixture

# read test.jpg into im
im = Image.open('./step3/image/test.jpg')

# convert im to an ndarray
img = np.array(im)
# reshape img to [-1, 3] and save it as img_reshape
img_reshape = img.reshape(-1, 3)

# instantiate a Gaussian mixture model that clusters the data into 3 components
gmm = GaussianMixture(3)
# fit() estimates the parameters and mixing coefficients of each Gaussian
gmm.fit(img_reshape)
# cluster the data; the labels are 0, 1, 2 (gmm was asked for 3 clusters)
pred = gmm.predict(img_reshape)

img_reshape[pred == 0, :] = [255, 255, 0]  # yellow
img_reshape[pred == 1, :] = [0, 0, 255]    # blue
img_reshape[pred == 2, :] = [0, 255, 0]    # green
im = Image.fromarray(img.astype('uint8'))
# save im as result.jpg
im.save('./step3/dump/result.jpg')

--------------------------------------------------------------------------------
/13. AdaBoost/3.py:
--------------------------------------------------------------------------------
#encoding=utf8
from sklearn.ensemble import AdaBoostClassifier

def ada_classifier(train_data, train_label, test_data):
    '''
    input: train_data(ndarray): training data
           train_label(ndarray): training labels
           test_data(ndarray): test data
    output: predict(ndarray): predictions
    '''
    #********* Begin *********#
    ada = AdaBoostClassifier(n_estimators=80, learning_rate=1.0)
    ada.fit(train_data, train_label)
    predict = ada.predict(test_data)
    #********* End *********#
    return predict

--------------------------------------------------------------------------------
/4. 逻辑回归/4.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import numpy as np
import warnings
warnings.filterwarnings("ignore")

def sigmoid(x):
    '''
    The sigmoid function
    :param x: input value
    :return: the transformed probability
    '''
    return 1 / (1 + np.exp(-x))


def fit(x, y, eta=1e-3, n_iters=1e4):
    '''
    Train a logistic regression model
    :param x: training features, ndarray
    :param y: training labels, ndarray
    :param eta: learning rate, float
    :param n_iters: number of training iterations, int
    :return: model parameters, ndarray
    '''
    # add your implementation here #
    #********** Begin *********#
    i = 0
    # one weight per feature (the original hard-coded 31 for the OJ dataset)
    w = np.zeros(x.shape[1])
    while i < n_iters:
        a = sigmoid(x.dot(w))
        # batch gradient of the log loss: x^T (a - y)
        w = w - eta * x.T.dot(a - y)
        i += 1
    return w
    #********** End **********#

--------------------------------------------------------------------------------
/9. 
神经网络/3_backpropagation.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import os 3 | from sklearn.neural_network import MLPClassifier 4 | import pandas as pd 5 | 6 | if os.path.exists('./step2/result.csv'): 7 | os.remove('./step2/result.csv') 8 | 9 | # ********* Begin *********# 10 | # 获取训练数据 11 | train_data = pd.read_csv('./step2/train_data.csv') 12 | # 获取训练标签 13 | train_label = pd.read_csv('./step2/train_label.csv') 14 | train_label = train_label['target'] 15 | # 获取测试数据 16 | test_data = pd.read_csv('./step2/test_data.csv') 17 | 18 | mlp = MLPClassifier(solver='lbfgs', max_iter=100, 19 | alpha=1e-5, hidden_layer_sizes=(5, 10, 3)) 20 | mlp.fit(train_data, train_label) 21 | result = mlp.predict(test_data) 22 | 23 | result = pd.DataFrame(result, columns=['result']) 24 | 25 | result.to_csv('./step2/result.csv', index=False) 26 | 27 | # ********* End *********# 28 | -------------------------------------------------------------------------------- /6. 线性判别分析/1_LDA.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import numpy as np 3 | from numpy.linalg import inv 4 | 5 | 6 | def lda(X, y): 7 | ''' 8 | input:X(ndarray):待处理数据 9 | y(ndarray):待处理数据标签,标签分别为0和1 10 | output:X_new(ndarray):处理后的数据 11 | ''' 12 | # ********* Begin *********# 13 | 14 | # 划分出第一类样本与第二类样本 15 | p_data = np.transpose(X[y == 0]) 16 | n_data = np.transpose(X[y == 1]) 17 | 18 | # 计算第一类样本与第二类样本协方差矩阵 19 | p_cov = np.cov(p_data) 20 | n_cov = np.cov(n_data) 21 | # 计算类内散度矩阵 22 | S_w = p_cov + n_cov 23 | 24 | # 获取第一类样本与第二类样本中心点 25 | p_mu = np.mean(p_data, axis=1) 26 | n_mu = np.mean(n_data, axis=1) 27 | # 计算w 28 | w = inv(S_w).dot(n_mu - p_mu) 29 | # 计算新样本集 30 | X_new = X.dot(w).reshape(-1, 1) 31 | 32 | # ********* End *********# 33 | return X_new * 0.0623 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /11. 贝叶斯分类器/sk.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | from sklearn.naive_bayes import MultinomialNB 3 | from sklearn.feature_extraction.text import TfidfTransformer 4 | 5 | 6 | def news_predict(train_sample, train_label, test_sample): 7 | ''' 8 | 训练模型并进行预测,返回预测结果 9 | :param train_sample:原始训练集中的新闻文本,类型为ndarray 10 | :param train_label:训练集中新闻文本对应的主题标签,类型为ndarray 11 | :param test_sample:原始测试集中的新闻文本,类型为ndarray 12 | :return 预测结果,类型为ndarray 13 | ''' 14 | # 实例化向量化对象 15 | vec = CountVectorizer() 16 | # 将训练集中的新闻向量化 17 | X_train = vec.fit_transform(train_sample) 18 | # 将测试集中的新闻向量化 19 | X_test = vec.transform(test_sample) 20 | # 实例化tf-idf对象 21 | tfidf = TfidfTransformer() 22 | # 将训练集中的词频向量用tf-idf进行转换 23 | X_train = tfidf.fit_transform(X_train) 24 | # 将测试集中的词频向量用tf-idf进行转换 25 | X_test = tfidf.transform(X_test) 26 | 27 | clf = MultinomialNB(alpha=0.8) 28 | clf.fit(X_train, train_label) 29 | result = clf.predict(X_test) 30 | return result 31 | -------------------------------------------------------------------------------- /16. 
k-means/2.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


# distance between two samples
def distance(x, y, p=2):
    '''
    input: x(ndarray): coordinates of the first sample
           y(ndarray): coordinates of the second sample
           p(int): 1 for Manhattan distance, 2 for Euclidean distance
    output: distance(float): distance from x to y
    '''
    # ********* Begin *********#
    # take absolute differences so that p=1 really is the Manhattan distance
    return (np.sum(np.abs(np.subtract(x, y)) ** p)) ** (1 / p)
    # ********* End *********#


# centroid of a dataset
def cal_Cmass(data):
    '''
    input: data(ndarray): sample data
    output: mass(ndarray): centroid of the samples
    '''
    # ********* Begin *********#
    return [np.mean(col) for col in np.transpose(data)]
    # ********* End *********#


# distance of every sample to the centroid, sorted in ascending order
def sorted_list(data, Cmass):
    '''
    input: data(ndarray): sample data
           Cmass(ndarray): centroid of the samples
    output: dis_list(list): sorted distances to the centroid
    '''
    # ********* Begin *********#
    return sorted([distance(row, Cmass) for row in data])
    # ********* End *********#

--------------------------------------------------------------------------------
/8. 感知机/preception.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


# the perceptron algorithm
class Perceptron(object):
    def __init__(self, learning_rate=0.01, max_iter=200):
        self.lr = learning_rate
        self.max_iter = max_iter

    def fit(self, data, label):
        '''
        input: data(ndarray): training features
               label(ndarray): training labels
        output: w(ndarray): learned weights
                b(ndarray): learned bias
        '''
        # perceptron training; w is the weight vector, b the bias
        self.w = np.array([1.] * data.shape[1])
        self.b = np.array([1.])
        for i in range(self.max_iter):
            for row in range(data.shape[0]):
                # update only on misclassified samples
                if label[row] * (np.dot(data[row], np.transpose(self.w)) + self.b) < 0:
                    self.w += self.lr * label[row] * data[row]
                    self.b += self.lr * label[row]

    def predict(self, data):
        '''
        input: data(ndarray): test features
        output: predict(ndarray): predicted labels
        '''
        z = np.dot(data, np.transpose(self.w)) + self.b
        return [1 if item > 0 else -1 for item in z]
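
A usage sketch for the Perceptron class above (an editor's illustration, not a repo file; the toy data are made up, linearly separable, with labels in {-1, +1}):
--------------------------------------------------------------------------------
import numpy as np

X = np.array([[2.0, 3.0], [3.0, 3.0], [-1.0, -2.0], [-2.0, -1.0]])
y = np.array([1, 1, -1, -1])

clf = Perceptron(learning_rate=0.1, max_iter=50)  # the class from 8. 感知机/preception.py
clf.fit(X, y)
print(clf.predict(X))  # expected: [1, 1, -1, -1]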
--------------------------------------------------------------------------------
/24. 局部线性嵌入/1_LLE.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


def find_neighbors(data, i, k):
    dist = sorted(range(len(data)), key=lambda x: np.linalg.norm(data[x] - data[i]))
    return set(dist[1: k + 1])


def cal_c_jk(data, i, j, k):
    return np.dot((data[i] - data[j]), (data[i] - data[k]))


def lle(data, d, k):
    """
    input: data(ndarray): data to be reduced; one row per sample
           d(int): target dimensionality
           k(int): number of nearest neighbours
    output: Z(ndarray): data after dimensionality reduction
    """
    # ********* Begin *********#
    m = len(data)
    W = np.zeros((m, m))
    for i in range(m):
        # neighbourhood of sample i
        neighbors = find_neighbors(data, i, k)
        lower = sum(1 / cal_c_jk(data, i, l, s) for l in neighbors for s in neighbors)
        for j in neighbors:
            # inner loop variable named s so it does not shadow the parameter k
            upper = sum(1 / cal_c_jk(data, i, j, s) for s in neighbors)
            # reconstruction weight
            W[i][j] = upper / lower

    # build M and factorize it
    I = np.identity(m)
    M = np.dot((I - W).T, (I - W))

    value, vector = np.linalg.eig(M)
    index = np.argsort(value)[: d]
    # Z = (z1; z2; ...; zm), one row per projected sample
    Z = vector[:, index].T
    # ********* End *********#
    return Z

--------------------------------------------------------------------------------
/14. 随机森林/Bagging.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.tree import DecisionTreeClassifier


class BaggingClassifier(object):
    def __init__(self, n_model=10):
        '''
        Constructor
        '''
        # number of base classifiers, 10 by default
        self.n_model = n_model
        # list that stores the models; append each classifier after fitting it
        self.models = []

    def fit(self, feature, label):
        '''
        Train the models; remember to store them in self.models
        :param feature: training data, ndarray
        :param label: training labels, ndarray
        :return: None
        '''
        # each tree is fitted on a bootstrap sample (drawn with replacement);
        # without resampling all trees would be identical
        m = len(feature)
        for _ in range(self.n_model):
            idx = np.random.choice(m, m, replace=True)
            self.models.append(
                DecisionTreeClassifier(max_depth=3).fit(feature[idx], label[idx]))

    def predict(self, feature):
        '''
        :param feature: test data, ndarray
        :return: predictions, ndarray, e.g. np.array([0, 1, 2, 2, 1, 0])
        '''
        tmp_arr = np.transpose([clf_.predict(feature) for clf_ in self.models])
        predict = []
        for row in tmp_arr:
            # majority vote over the base classifiers
            dic = {}
            for item in row:
                if item not in dic.keys():
                    dic[item] = 1
                else:
                    dic[item] += 1
            predict.append(list(max(dic.items(), key=lambda d: d[1]))[0])
        return predict
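
A usage sketch for BaggingClassifier above (an editor's illustration, not a repo file; the dataset choice is arbitrary):
--------------------------------------------------------------------------------
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bag = BaggingClassifier(n_model=10)  # the class from 14. 随机森林/Bagging.py
bag.fit(X_train, y_train)
pred = bag.predict(X_test)
print(np.mean(np.array(pred) == y_test))  # accuracy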
--------------------------------------------------------------------------------
/22. 多维缩放/1.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np


def mds(data, d):
    '''
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
    output: Z(ndarray): data after dimensionality reduction
    '''
    # ********* Begin *********#
    # compute the matrix of squared distances
    DSquare = np.zeros([data.shape[0], data.shape[0]])
    for i in range(data.shape[0]):
        for j in range(data.shape[0]):
            DSquare[i][j] = np.sum(np.square(data[i] - data[j]))
    # compute B by double centring
    totalMean = np.mean(DSquare)
    rowMean = np.mean(DSquare, axis=1)
    columnMean = np.mean(DSquare, axis=0)
    B = np.zeros(DSquare.shape)
    for i in range(B.shape[0]):
        for j in range(B.shape[1]):
            B[i][j] = -0.5 * (DSquare[i][j] - rowMean[i] - columnMean[j] + totalMean)
    # factorize B into eigenvalues and eigenvectors
    eigVal, eigVec = np.linalg.eigh(B)
    # sort the eigenvalues in descending order
    eigValSorted_indices = np.argsort(-eigVal)
    # take the d eigenvectors with the largest eigenvalues and compute Z
    topd_eigVec = eigVec[:, eigValSorted_indices[:d]]
    Z = np.dot(topd_eigVec, np.sqrt(np.diag(eigVal[eigValSorted_indices[:d]])))
    return Z
    # ********* End *********#

--------------------------------------------------------------------------------
/23 等度量映射/isomap.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import numpy as np


def isomap(data, d, k, Max=10000):
    """
    input: data(ndarray): data to be reduced
           d(int): target dimensionality
           k(int): number of nearest neighbours
           Max(int): stands in for infinity
    output: Z(ndarray): data after dimensionality reduction
    """
    # ********* Begin *********#
    # build the k-nearest-neighbour graph; non-neighbours start at Max (infinity)
    m, n = data.shape
    dist = np.ones((m, m)) * Max
    disti = np.zeros(m)
    distj = np.zeros(m)
    B = np.zeros((m, m))
    for i in range(m):
        distance = np.power(np.tile(data[i], (m, 1)) - data, 2).sum(axis=1)
        index = np.argsort(distance)
        q = index[:k]
        for l in q:
            dist[i][l] = np.sqrt(np.power(data[i] - data[l], 2).sum())
    # geodesic distances: symmetrize the graph, then shortest paths (Floyd-Warshall)
    dist = np.minimum(dist, dist.T)
    for t in range(m):
        dist = np.minimum(dist, dist[:, t:t + 1] + dist[t, :])
    # MDS works on squared distances: compute dist2, dist2i, dist2j, dist2ij
    dist = dist ** 2
    for i in range(m):
        disti[i] = np.mean(dist[i, :])
        distj[i] = np.mean(dist[:, i])
    distij = np.mean(dist)
    # compute B
    for i in range(m):
        for j in range(m):
            B[i, j] = -0.5 * (dist[i, j] - disti[i] - distj[j] + distij)
    # factorize B into eigenvalues and eigenvectors
    lamda, V = np.linalg.eigh(B)
    # compute Z
    index = np.argsort(-lamda)[:d]
    diag_lamda = np.sqrt(np.diag(-np.sort(-lamda)[:d]))
    V_selected = V[:, index]
    Z = V_selected.dot(diag_lamda)
    # ********* End *********#
    return Z
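
A usage sketch for isomap() above (an editor's illustration, not a repo file; the random data are made up):
--------------------------------------------------------------------------------
import numpy as np

rng = np.random.RandomState(0)
data = rng.rand(10, 5)        # 10 samples with 5 features
Z = isomap(data, d=2, k=4)    # the function from 23 等度量映射/isomap.py
print(Z.shape)                # (10, 2)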
--------------------------------------------------------------------------------
/3. 线性回归/2_normal_equation.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


def mse_score(y_predict, y_test):
    """
    input: y_predict(ndarray): predicted values
           y_test(ndarray): true values
    output: mse(float): mean squared error
    """
    # ********* Begin *********#
    return 1 / len(y_predict) * sum([np.square(y - p) for y, p in zip(y_test, y_predict)])
    # ********* End *********#


class LinearRegression:
    def __init__(self):
        """Initialize the linear regression model"""
        self.theta = None

    def fit_normal(self, train_data, train_label):
        """
        input: train_data(ndarray): training samples
               train_label(ndarray): training labels
        """
        # ********* Begin *********#
        # append a column of ones for the intercept, then solve the normal equation
        ones = np.ones((len(train_data), 1))
        train_data = np.column_stack((train_data, ones))
        self.theta = np.linalg.inv(train_data.T @ train_data) @ train_data.T @ train_label
        # ********* End *********#
        return self

    def predict(self, test_data):
        """
        input: test_data(ndarray): test samples
        """
        # ********* Begin *********#
        ones = np.ones((len(test_data), 1))
        test_data = np.column_stack((test_data, ones))
        return test_data @ self.theta
        # ********* End *********#

--------------------------------------------------------------------------------
/20 KNN/knn.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


class kNNClassifier(object):
    def __init__(self, k):
        '''
        Constructor
        :param k: the k in the kNN algorithm
        '''
        self.k = k
        # training data, ndarray
        self.train_feature = None
        # training labels, ndarray
        self.train_label = None

    def fit(self, feature, label):
        """
        Training step of kNN
        :param feature: training data, ndarray
        :param label: training labels, ndarray
        :return: None
        """
        self.train_feature = feature
        self.train_label = label
        # store features and labels side by side; the label is the last column
        self.data = np.concatenate((feature, np.transpose([label])), axis=1)

    def predict(self, feature):
        """
        Prediction step of kNN
        :param feature: test data, ndarray
        :return: predictions, ndarray or list
        """

        # ********* Begin *********#
        def computeDistance(X, Y):
            return np.linalg.norm(np.subtract(X, Y))

        def moMax(X):
            # most frequent label; bincount needs integers
            return np.argmax(np.bincount(np.array(X, dtype=int)))

        ans = []
        for row in feature:
            # the k nearest training samples (not k + 1)
            arr = sorted(self.data, key=lambda item: computeDistance(item[:-1], row))[:self.k]
            ans.append(moMax([row[-1] for row in arr]))
        return ans
        # ********* End *********#
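
A usage sketch for kNNClassifier above (an editor's illustration, not a repo file; the dataset choice is arbitrary):
--------------------------------------------------------------------------------
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = kNNClassifier(k=5)  # the class from 20 KNN/knn.py
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(np.mean(np.array(pred) == y_test))  # accuracy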
--------------------------------------------------------------------------------
/19. AGNES/1.py:
--------------------------------------------------------------------------------
import numpy as np


def calc_min_dist(cluster1, cluster2):
    '''
    Minimum distance between two clusters
    :param cluster1: samples in cluster 1, ndarray
    :param cluster2: samples in cluster 2, ndarray
    :return: the minimum distance between cluster 1 and cluster 2
    '''
    #********* Begin *********#
    dis = 100000000
    for vec1 in cluster1:
        for vec2 in cluster2:
            dis = min(dis, np.linalg.norm(vec1 - vec2))
    return dis
    #********* End *********#


def calc_max_dist(cluster1, cluster2):
    '''
    Maximum distance between two clusters
    :param cluster1: samples in cluster 1, ndarray
    :param cluster2: samples in cluster 2, ndarray
    :return: the maximum distance between cluster 1 and cluster 2
    '''
    #********* Begin *********#
    dis = 0
    for vec1 in cluster1:
        for vec2 in cluster2:
            dis = max(dis, np.linalg.norm(vec1 - vec2))
    return dis
    #********* End *********#


def calc_avg_dist(cluster1, cluster2):
    '''
    Average distance between two clusters
    :param cluster1: samples in cluster 1, ndarray
    :param cluster2: samples in cluster 2, ndarray
    :return: the average distance between cluster 1 and cluster 2
    '''
    #********* Begin *********#
    dis = 0
    for vec1 in cluster1:
        for vec2 in cluster2:
            dis += np.linalg.norm(vec1 - vec2)
    return dis / (cluster1.shape[0] * cluster2.shape[0])
    #********* End *********#

--------------------------------------------------------------------------------
/19. AGNES/2.py:
--------------------------------------------------------------------------------
import numpy as np

def dist(cluster1, cluster2):
    # maximum (complete-link) euclidean distance, as the AGNES docstring requires
    dis = 0
    for vec1 in cluster1:
        for vec2 in cluster2:
            dis = max(dis, np.linalg.norm(vec1 - vec2))
    return dis

def find_Min(M):
    # index and value of the smallest off-diagonal entry of the distance matrix
    m = 100000000
    x = 0
    y = 0
    for i in range(len(M)):
        for j in range(len(M[i])):
            if i != j and M[i][j] < m:
                m = M[i][j]
                x = i
                y = j
    return x, y, m

def AGNES(feature, k):
    '''
    AGNES clustering; use the maximum euclidean distance between clusters.
    For the dataset [[1, 2], [10, 11], [1, 3]] the result could be [[[1, 2], [1, 3]], [[10, 11]]]
    :param feature: dataset, ndarray
    :param k: the number of clusters to produce, int
    :return: clustering result, list
    '''

    #********* Begin *********#
    # initialize C (the clusters) and M (the distance matrix)
    C = []; M = []
    for i in feature:
        Ci = []
        Ci.append(i)
        C.append(Ci)
    for i in C:
        Mi = []
        for j in C:
            Mi.append(dist(i, j))
        M.append(Mi)
    q = len(C)
    # merge the two closest clusters and update M
    while q > k:
        x, y, min_dis = find_Min(M)
        C[x].extend(C[y])
        C.remove(C[y])
        M = []
        for i in C:
            Mi = []
            for j in C:
                Mi.append(dist(i, j))
            M.append(Mi)
        q -= 1
    return C
    #********* End *********#
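
A usage sketch for AGNES() above, reusing the example from its docstring (an editor's illustration, not a repo file):
--------------------------------------------------------------------------------
import numpy as np

feature = np.array([[1, 2], [10, 11], [1, 3]])
print(AGNES(feature, k=2))  # the function from 19. AGNES/2.py
# expected grouping: [1, 2] and [1, 3] in one cluster, [10, 11] in the other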
--------------------------------------------------------------------------------
/12. EM算法/2_EM_single_iteration.py:
--------------------------------------------------------------------------------
import numpy as np
from collections import Counter


def em_single(init_values, observations):
    """
    Simulate the coin-tossing experiment and estimate, in a single iteration,
    the probabilities that coin A and coin B come up heads
    :param init_values: initial head probabilities of coins A and B, list;
                        e.g. [0.2, 0.7] means coin A comes up heads with
                        probability 0.2 and coin B with probability 0.7.
    :param observations: recorded results of the coin tosses, list.
    :return: the estimated head probabilities of coins A and B as a list;
             e.g. [0.4, 0.6] means you estimate 0.4 for coin A and 0.6 for coin B.
    """

    # ********* Begin *********#
    def get_likelihood(p, l):
        # likelihood of one run of tosses (1 = heads, 0 = tails)
        likelihood = 1
        for i in l:
            if i == 1:
                likelihood *= p
            else:
                likelihood *= 1 - p
        return likelihood

    # expected counts: rows are coins A/B, columns are heads/tails
    exist_matrix = np.zeros((2, 2))
    p_a, p_b = init_values[0], init_values[1]
    for experiment in observations:
        likelihood_a = get_likelihood(p_a, experiment)
        likelihood_b = get_likelihood(p_b, experiment)
        # E step: responsibility of each coin for this experiment
        prob_a = likelihood_a / (likelihood_a + likelihood_b)
        prob_b = likelihood_b / (likelihood_a + likelihood_b)
        c = Counter(experiment)
        exist_matrix[0][0] += prob_a * c[1]
        exist_matrix[0][1] += prob_a * c[0]
        exist_matrix[1][0] += prob_b * c[1]
        exist_matrix[1][1] += prob_b * c[0]
    # M step: re-estimate both head probabilities
    new_p_a = exist_matrix[0][0] / (exist_matrix[0][0] + exist_matrix[0][1])
    new_p_b = exist_matrix[1][0] / (exist_matrix[1][0] + exist_matrix[1][1])
    return [new_p_a, new_p_b]
    # ********* End *********#

--------------------------------------------------------------------------------
/3. 线性回归/3_metrics.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


# mse
def mse_score(y_predict, y_test):
    mse = np.mean((y_predict - y_test) ** 2)
    return mse


# r2
def r2_score(y_predict, y_test):
    '''
    input: y_predict(ndarray): predicted values
           y_test(ndarray): true values
    output: r2(float): the r2 score
    '''
    # ********* Begin *********#
    upper = sum((p - y) ** 2 for p, y in zip(y_predict, y_test))
    lower = sum((y_test.mean() - y) ** 2 for y in y_test)
    r2 = 1 - upper / lower
    # ********* End *********#
    return r2


class LinearRegression:
    def __init__(self):
        """Initialize the linear regression model"""
        self.theta = None

    def fit_normal(self, train_data, train_label):
        """
        input: train_data(ndarray): training samples
               train_label(ndarray): training labels
        """
        # ********* Begin *********#
        ones = np.ones((len(train_data), 1))
        train_data = np.column_stack((train_data, ones))
        self.theta = np.linalg.inv(train_data.T @ train_data) @ train_data.T @ train_label
        # ********* End *********#
        return self

    def predict(self, test_data):
        """
        input: test_data(ndarray): test samples
        """
        # ********* Begin *********#
        ones = np.ones((len(test_data), 1))
        test_data = np.column_stack((test_data, ones))
        return test_data @ self.theta
        # ********* End *********#

--------------------------------------------------------------------------------
/15. 
聚类性能评估指标/1_external_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def count_pairs(y_true, y_pred): 5 | m = len(y_true) 6 | SS, SD, DS, DD = 0, 0, 0, 0 7 | for i in range(m): 8 | for j in range(i + 1, m): 9 | if y_pred[i] == y_pred[j] and y_true[i] == y_true[j]: 10 | SS += 1 11 | elif y_pred[i] == y_pred[j] and y_true[i] != y_true[j]: 12 | SD += 1 13 | elif y_pred[i] != y_pred[j] and y_true[i] == y_true[j]: 14 | DS += 1 15 | else: 16 | DD += 1 17 | return SS, SD, DS, DD 18 | 19 | 20 | def calc_JC(y_true, y_pred): 21 | """ 22 | 计算并返回JC系数 23 | :param y_true: 参考模型给出的簇,类型为ndarray 24 | :param y_pred: 聚类模型给出的簇,类型为ndarray 25 | :return: JC系数 26 | """ 27 | 28 | # ******** Begin *******# 29 | a, b, c, d = count_pairs(y_true, y_pred) 30 | return a / (a + b + c) 31 | 32 | # ******** End *******# 33 | 34 | 35 | def calc_FM(y_true, y_pred): 36 | """ 37 | 计算并返回FM指数 38 | :param y_true: 参考模型给出的簇,类型为ndarray 39 | :param y_pred: 聚类模型给出的簇,类型为ndarray 40 | :return: FM指数 41 | """ 42 | 43 | # ******** Begin *******# 44 | a, b, c, d = count_pairs(y_true, y_pred) 45 | return a / np.sqrt((a + b) * (a + c)) 46 | # ******** End *******# 47 | 48 | 49 | def calc_Rand(y_true, y_pred): 50 | """ 51 | 计算并返回Rand指数 52 | :param y_true: 参考模型给出的簇,类型为ndarray 53 | :param y_pred: 聚类模型给出的簇,类型为ndarray 54 | :return: Rand指数 55 | """ 56 | 57 | # ******** Begin *******# 58 | a, b, c, d = count_pairs(y_true, y_pred) 59 | m = len(y_true) 60 | return 2 * (a + d) / (m * (m - 1)) 61 | # ******** End *******# 62 | -------------------------------------------------------------------------------- /18. DBSCAN/2_DBSCAN_algorithm.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import numpy as np 3 | import random 4 | from copy import copy 5 | from collections import deque 6 | 7 | 8 | # 寻找eps邻域内的点 9 | def findNeighbor(j, X, eps): 10 | return {p for p in range(X.shape[0]) if np.linalg.norm(X[j] - X[p]) <= eps} 11 | 12 | 13 | # dbscan算法 14 | def dbscan(X, eps, min_Pts): 15 | """ 16 | input:X(ndarray):样本数据 17 | eps(float):eps邻域半径 18 | min_Pts(int):eps邻域内最少点个数 19 | output:cluster(list):聚类结果 20 | """ 21 | # ********* Begin *********# 22 | 23 | # 初始化核心对象集合 24 | core_objects = {i for i in range(len(X)) if len(findNeighbor(i, X, eps)) >= min_Pts} 25 | 26 | # 初始化聚类簇数 27 | k = 0 28 | 29 | # 初始化未访问的样本集合 30 | not_visited = set(range(len(X))) 31 | 32 | # 初始化聚类结果 33 | cluster = np.zeros(len(X)) 34 | 35 | while len(core_objects) != 0: 36 | old_not_visited = copy(not_visited) 37 | # 初始化聚类簇队列 38 | o = random.choice(list(core_objects)) 39 | queue = deque() 40 | queue.append(o) 41 | not_visited.remove(o) 42 | 43 | while len(queue) != 0: 44 | q = queue.popleft() 45 | neighbor_list = findNeighbor(q, X, eps) 46 | if len(neighbor_list) >= min_Pts: 47 | # 寻找在邻域中并没被访问过的点 48 | delta = neighbor_list & not_visited 49 | for element in delta: 50 | queue.append(element) 51 | not_visited.remove(element) 52 | 53 | k += 1 54 | this_class = old_not_visited - not_visited 55 | cluster[list(this_class)] = k 56 | core_objects = core_objects - this_class 57 | 58 | # ********* End *********# 59 | return cluster 60 | -------------------------------------------------------------------------------- /15. 
聚类性能评估指标/2_internal_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def avg(feature, pred, c): 5 | feature_c = feature[pred == c] 6 | m = len(feature_c) 7 | mu = np.mean(feature_c, axis=0) 8 | return 1 / m * sum(np.linalg.norm(fea - mu) for fea in feature_c) 9 | 10 | 11 | def d_cen(feature, pred, c1, c2): 12 | feature_c1 = feature[pred == c1] 13 | feature_c2 = feature[pred == c2] 14 | mu1 = np.mean(feature_c1, axis=0) 15 | mu2 = np.mean(feature_c2, axis=0) 16 | return np.linalg.norm(mu1 - mu2) 17 | 18 | 19 | def d_min(feature, pred, c1, c2): 20 | feature_c1 = feature[pred == c1] 21 | feature_c2 = feature[pred == c2] 22 | return min(np.linalg.norm(f1 - f2) for f1 in feature_c1 for f2 in feature_c2) 23 | 24 | 25 | def diam(feature, pred, c): 26 | feature_c = feature[pred == c] 27 | m = len(feature_c) 28 | if m == 1: 29 | return 0 30 | return max(np.linalg.norm(feature_c[i] - feature_c[j]) for i in range(m) for j in range(i + 1, m)) 31 | 32 | 33 | def calc_DBI(feature, pred): 34 | """ 35 | 计算并返回DB指数 36 | :param feature: 待聚类数据的特征,类型为`ndarray` 37 | :param pred: 聚类后数据所对应的簇,类型为`ndarray` 38 | :return: DB指数 39 | """ 40 | 41 | # ********* Begin *********# 42 | class_set = set(pred) 43 | return 1 / len(class_set) * sum( 44 | max( 45 | (avg(feature, pred, i) + avg(feature, pred, j)) / d_cen(feature, pred, i, j) 46 | for j in class_set if j != i) 47 | for i in class_set) 48 | # ********* End *********# 49 | 50 | 51 | def calc_DI(feature, pred): 52 | """ 53 | 计算并返回Dunn指数 54 | :param feature: 待聚类数据的特征,类型为`ndarray` 55 | :param pred: 聚类后数据所对应的簇,类型为`ndarray` 56 | :return: Dunn指数 57 | """ 58 | 59 | # ********* Begin *********# 60 | class_set = list(set(pred)) 61 | m = len(class_set) 62 | lower = max(diam(feature, pred, c) for c in class_set) 63 | return min(d_min(feature, pred, class_set[i], class_set[j]) 64 | for i in range(m) for j in range(i+1, m)) / lower 65 | # ********* End *********# 66 | 67 | 68 | -------------------------------------------------------------------------------- /14. 
随机森林/RandomForest.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | # 建议代码,也算是Begin-End中的一部分 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | 8 | class RandomForestClassifier(): 9 | def __init__(self, n_model=10): 10 | ''' 11 | 初始化函数 12 | ''' 13 | # 分类器的数量,默认为10 14 | self.n_model = n_model 15 | # 用于保存模型的列表,训练好分类器后将对象append进去即可 16 | self.models = [] 17 | # 用于保存决策树训练时随机选取的列的索引 18 | self.col_indexs = [] 19 | self.feature_k = 3 20 | 21 | def fit(self, feature, label): 22 | """ 23 | 训练模型 24 | :param feature: 训练集数据,类型为ndarray 25 | :param label: 训练集标签,类型为ndarray 26 | :return: None 27 | """ 28 | 29 | def random_sampling(X, y): 30 | """ 31 | 自助采样 32 | :param X: 33 | :param y: 34 | :return: 自助采样之后的结果 35 | """ 36 | m, n = np.shape(X) 37 | # 有放回抽取 38 | row_indexes = [random.randint(0, m - 1) for _ in range(m)] 39 | # 选取随机k个特征 40 | col_indexes = random.sample(range(n), self.feature_k) 41 | 42 | X_res = [[X[index][col] for col in col_indexes] for index in row_indexes] 43 | y_res = [y[index] for index in row_indexes] 44 | return X_res, y_res, col_indexes 45 | 46 | for i in range(self.n_model): 47 | X, y, cols = random_sampling(feature, label) 48 | self.col_indexs.append(cols) 49 | self.models.append(DecisionTreeClassifier(max_depth=4).fit(X, y)) 50 | 51 | def predict(self, feature): 52 | ''' 53 | :param feature:测试集数据,类型为ndarray 54 | :return:预测结果,类型为ndarray,如np.array([0, 1, 2, 2, 1, 0]) 55 | ''' 56 | # ************* Begin ************# 57 | tmp_arr = np.transpose( 58 | [clf.predict(np.array(feature[:, self.col_indexs[i]])) for i, clf in enumerate(self.models)]) 59 | predict = [] 60 | for row in tmp_arr: 61 | di = {} 62 | for item in row: 63 | if item not in di.keys(): 64 | di[item] = 1 65 | else: 66 | di[item] += 1 67 | predict.append(list(max(di.items(), key=lambda d: d[1]))[0]) 68 | return predict 69 | # ************* End **************# 70 | -------------------------------------------------------------------------------- /12. EM算法/3_EM_main_iteration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import stats 3 | 4 | 5 | def em_single(init_values, observations): 6 | """ 7 | 模拟抛掷硬币实验并估计在一次迭代中,硬币A与硬币B正面朝上的概率。请不要修改!! 
8 | :param init_values:硬币A与硬币B正面朝上的概率的初始值,类型为list,如[0.2, 0.7]代表硬币A正面朝上的概率为0.2,硬币B正面朝上的概率为0.7。 9 | :param observations:抛掷硬币的实验结果记录,类型为list。 10 | :return:将估计出来的硬币A和硬币B正面朝上的概率组成list返回。如[0.4, 0.6]表示你认为硬币A正面朝上的概率为0.4,硬币B正面朝上的概率为0.6。 11 | """ 12 | observations = np.array(observations) 13 | counts = {'A': {'H': 0, 'T': 0}, 'B': {'H': 0, 'T': 0}} 14 | theta_A = init_values[0] 15 | theta_B = init_values[1] 16 | # E step 17 | for observation in observations: 18 | len_observation = len(observation) 19 | num_heads = observation.sum() 20 | num_tails = len_observation - num_heads 21 | # 两个二项分布 22 | contribution_A = stats.binom.pmf(num_heads, len_observation, theta_A) 23 | contribution_B = stats.binom.pmf(num_heads, len_observation, theta_B) 24 | weight_A = contribution_A / (contribution_A + contribution_B) 25 | weight_B = contribution_B / (contribution_A + contribution_B) 26 | # 更新在当前参数下A、B硬币产生的正反面次数 27 | counts['A']['H'] += weight_A * num_heads 28 | counts['A']['T'] += weight_A * num_tails 29 | counts['B']['H'] += weight_B * num_heads 30 | counts['B']['T'] += weight_B * num_tails 31 | # M step 32 | new_theta_A = counts['A']['H'] / (counts['A']['H'] + counts['A']['T']) 33 | new_theta_B = counts['B']['H'] / (counts['B']['H'] + counts['B']['T']) 34 | return np.array([new_theta_A, new_theta_B]) 35 | 36 | 37 | def em(observations, thetas, tol=1e-4, iterations=100): 38 | """ 39 | 模拟抛掷硬币实验并使用EM算法估计硬币A与硬币B正面朝上的概率。 40 | :param observations: 抛掷硬币的实验结果记录,类型为list。 41 | :param thetas: 硬币A与硬币B正面朝上的概率的初始值,类型为list,如[0.2, 0.7]代表硬币A正面朝上的概率为0.2,硬币B正面朝上的概率为0.7。 42 | :param tol: 差异容忍度,即当EM算法估计出来的参数theta不怎么变化时,可以提前挑出循环。例如容忍度为1e-4,则表示若这次迭代的估计结果与上一次迭代的估计结果之间的L1距离小于1e-4则跳出循环。为了正确的评测,请不要修改该值。 43 | :param iterations: EM算法的最大迭代次数。为了正确的评测,请不要修改该值。 44 | :return: 将估计出来的硬币A和硬币B正面朝上的概率组成list或者ndarray返回。如[0.4, 0.6]表示你认为硬币A正面朝上的概率为0.4,硬币B正面朝上的概率为0.6。 45 | """ 46 | 47 | # ********* Begin *********# 48 | old_theta = np.array(thetas) 49 | for _ in range(iterations): 50 | new_theta = em_single(old_theta, observations) 51 | if sum(np.abs(old_theta - new_theta)) < tol: 52 | break 53 | old_theta = new_theta 54 | return old_theta 55 | # ********* End *********# 56 | -------------------------------------------------------------------------------- /2. 
模型评估与选择/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score 3 | 4 | 5 | def confusion_matrix(y_true, y_predict): 6 | ''' 7 | 构建二分类的混淆矩阵,并将其返回 8 | :param y_true: 真实类别,类型为ndarray 9 | :param y_predict: 预测类别,类型为ndarray 10 | :return: 二维list或shape为(2, 2)的ndarray 11 | ''' 12 | ans = [[0, 0], [0, 0]] 13 | for i in range(len(y_predict)): 14 | ans[y_true[i]][y_predict[i]] += 1 15 | return np.array(ans) 16 | 17 | 18 | def precision_score_(y_true, y_predict): 19 | ''' 20 | 计算精准率并返回 21 | :param y_true: 真实类别,类型为ndarray 22 | :param y_predict: 预测类别,类型为ndarray 23 | :return: 精准率,类型为float 24 | ''' 25 | arr = confusion_matrix(y_true=y_true, y_predict=y_predict) 26 | return arr[1][1] / (arr[1][1] + arr[0][1]) 27 | 28 | 29 | def recall_score_(y_true, y_predict): 30 | ''' 31 | 计算召回率并召回 32 | :param y_true: 真实类别,类型为ndarray 33 | :param y_predict: 预测类别,类型为ndarray 34 | :return: 召回率,类型为float 35 | ''' 36 | arr = confusion_matrix(y_true=y_true, y_predict=y_predict) 37 | return arr[1][1] / (arr[1][1] + arr[1][0]) 38 | 39 | 40 | def calAUC(prob, labels): 41 | ''' 42 | 计算AUC并返回 43 | :param prob: 模型预测样本为Positive的概率列表,类型为ndarray 44 | :param labels: 样本的真实类别列表,其中1表示Positive,0表示Negtive,类型为ndarray 45 | :return: AUC,类型为float 46 | ''' 47 | M = len([_ for _ in labels if _ == 1]) 48 | N = len(labels) - M 49 | 50 | # i of the sorted arr,labels 51 | rank = [] 52 | for i, formal_index in enumerate(np.argsort(prob)): 53 | rank_item = i + 1 54 | rate = prob[formal_index] 55 | if labels[formal_index] == 1: 56 | if formal_index > 0 and prob[formal_index - 1] == rate and labels[formal_index - 1] == 0: 57 | rank.append(rank_item - 0.5) 58 | elif formal_index < len(prob) - 1 and prob[formal_index + 1] == rate and labels[formal_index + 1] == 0: 59 | rank.append(rank_item + 0.5) 60 | else: 61 | rank.append(rank_item) 62 | return (np.sum(rank) - (M + 1) * M / 2) / (M * N) 63 | 64 | 65 | def classification_performance(y_true, y_pred, y_prob): 66 | ''' 67 | 返回准确度、精准率、召回率、f1 Score和AUC 68 | :param y_true:样本的真实类别,类型为`ndarray` 69 | :param y_pred:模型预测出的类别,类型为`ndarray` 70 | :param y_prob:模型预测样本为`Positive`的概率,类型为`ndarray` 71 | :return: 72 | ''' 73 | return accuracy_score(y_true, y_pred), precision_score(y_true, y_pred), recall_score(y_true, y_pred), \ 74 | f1_score(y_true, y_pred), roc_auc_score(y_true, y_prob) 75 | 76 | 77 | -------------------------------------------------------------------------------- /9. 
神经网络/4_pytorch_mnist.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | import torch.utils.data as Data 6 | import torchvision 7 | import os 8 | 9 | if os.path.exists('./step3/cnn.pkl'): 10 | os.remove('./step3/cnn.pkl') 11 | 12 | # 加载数据 13 | train_data = torchvision.datasets.MNIST( 14 | root='./step3/mnist/', 15 | train=True, # this is training data 16 | transform=torchvision.transforms.ToTensor(), 17 | # Converts a PIL.Image or numpy.ndarray to 18 | download=False, 19 | ) 20 | # 取6000个样本为训练集 21 | train_data_tiny = [] 22 | 23 | for i in range(6000): 24 | train_data_tiny.append(train_data[i]) 25 | 26 | train_data = train_data_tiny 27 | 28 | # ********* Begin *********# 29 | train_loader = Data.DataLoader( 30 | dataset=train_data, 31 | batch_size=64, 32 | num_workers=2, 33 | shuffle=True 34 | ) 35 | 36 | 37 | # 构建卷积神经网络模型 38 | class CNN(nn.Module): 39 | def __init__(self): 40 | super(CNN, self).__init__() 41 | self.conv1 = nn.Sequential( # input shape (1, 28, 28) 42 | nn.Conv2d( 43 | in_channels=1, # input height 44 | out_channels=16, # n_filters 45 | kernel_size=5, # filter size 46 | stride=1, # filter movement/step 47 | padding=2, 48 | # if want same width and length of this image after con2d, padding=(kernel_size-1)/2 if stride=1 49 | ), # output shape (16, 28, 28) 50 | nn.ReLU(), # activation 51 | nn.MaxPool2d(kernel_size=2), # choose max value in 2x2 area, output shape (16, 14, 14) 52 | ) 53 | self.conv2 = nn.Sequential( # input shape (16, 14, 14) 54 | nn.Conv2d(16, 32, 5, 1, 2), # output shape (32, 14, 14) 55 | nn.ReLU(), # activation 56 | nn.MaxPool2d(2), # output shape (32, 7, 7) 57 | ) 58 | self.out = nn.Linear(32 * 7 * 7, 10) # fully connected layer, output 10 classes 59 | 60 | def forward(self, x): 61 | x = self.conv1(x) 62 | x = self.conv2(x) 63 | x = x.view(x.size(0), -1) # flatten the output of conv2 to (batch_size, 32 * 7 * 7) 64 | output = self.out(x) 65 | return output 66 | 67 | 68 | cnn = CNN() 69 | 70 | # SGD表示使用随机梯度下降方法,lr为学习率,momentum为动量项系数 71 | optimizer = torch.optim.SGD(cnn.parameters(), lr=0.01, momentum=0.9) 72 | # 交叉熵损失函数 73 | loss_func = nn.CrossEntropyLoss() 74 | 75 | EPOCH = 3 76 | for e in range(EPOCH): 77 | for x, y in train_loader: 78 | batch_x = Variable(x) 79 | batch_y = Variable(y) 80 | 81 | outputs = cnn(batch_x) 82 | 83 | loss = loss_func(outputs, batch_y) 84 | optimizer.zero_grad() 85 | loss.backward() 86 | optimizer.step() 87 | 88 | # ********* End *********# 89 | # 保存模型 90 | torch.save(cnn.state_dict(), './step3/cnn.pkl') 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /16. k-means/3.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import numpy as np 3 | 4 | 5 | # 计算一个样本与数据集中所有样本的欧氏距离的平方 6 | def euclidean_distance(one_sample, X): 7 | one_sample = one_sample.reshape(1, -1) 8 | distances = np.power(np.tile(one_sample, (X.shape[0], 1)) - X, 2).sum(axis=1) 9 | return distances 10 | 11 | 12 | def cal_dis(old_centroids, centroids): 13 | dis = 0 14 | for i in range(old_centroids.shape[0]): 15 | dis += np.linalg.norm(old_centroids[i] - centroids[i], 2) 16 | return dis 17 | 18 | 19 | class Kmeans(): 20 | """Kmeans聚类算法. 21 | Parameters: 22 | ----------- 23 | k: int 24 | 聚类的数目. 
/16. k-means/3.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np


# squared euclidean distance between one sample and every sample in X
def euclidean_distance(one_sample, X):
    one_sample = one_sample.reshape(1, -1)
    distances = np.power(np.tile(one_sample, (X.shape[0], 1)) - X, 2).sum(axis=1)
    return distances


def cal_dis(old_centroids, centroids):
    dis = 0
    for i in range(old_centroids.shape[0]):
        dis += np.linalg.norm(old_centroids[i] - centroids[i], 2)
    return dis


class Kmeans():
    """Kmeans clustering.
    Parameters:
    -----------
    k: int
        number of clusters.
    max_iterations: int
        maximum number of iterations.
    varepsilon: float
        convergence threshold: the algorithm is considered converged when
        every one of the k centroids has moved less than varepsilon since
        the previous iteration
    """

    def __init__(self, k=2, max_iterations=500, varepsilon=0.0001):
        self.k = k
        self.max_iterations = max_iterations
        self.varepsilon = varepsilon
        np.random.seed(1)

    # ********* Begin *********#
    # pick self.k random samples as the initial centroids
    def init_random_centroids(self, X):
        m, n = X.shape
        center = np.zeros((self.k, n))
        for i in range(self.k):
            index = int(np.random.uniform(0, m))
            center[i] = X[index]
        return center

    # return the index [0, self.k) of the centroid closest to the sample
    def _closest_centroid(self, sample, centroids):
        distances = euclidean_distance(sample, centroids)
        return np.argmin(distances)

    # assign every sample to its closest centroid
    def create_clusters(self, centroids, X):
        m, n = X.shape
        clusters = np.zeros((m, 1))
        for i in range(m):
            index = self._closest_centroid(X[i], centroids)
            clusters[i] = index
        return clusters

    # update the centroids
    def update_centroids(self, clusters, X):
        centroids = np.zeros([self.k, X.shape[1]])
        for i in range(self.k):
            pointsInCluster = []
            for j in range(clusters.shape[0]):
                if clusters[j] == i:
                    pointsInCluster.append(X[j])
            if pointsInCluster:  # guard against empty clusters
                centroids[i] = np.mean(pointsInCluster, axis=0)  # column-wise mean
        return centroids

    # the index of the cluster a sample belongs to is its label
    def get_cluster_labels(self, clusters, X):
        return np.array(clusters).reshape(X.shape[0])

    # run Kmeans on the whole dataset X and return the cluster labels
    def predict(self, X):
        # pick self.k random samples as the initial centroids
        centroids = self.init_random_centroids(X)
        clusters = []
        iter = 0
        # iterate until the algorithm converges (the centroids barely move
        # between two iterations) or the maximum number of iterations is reached
        while iter < self.max_iterations:
            iter += 1

            # assign every sample to its closest centroid
            clusters = self.create_clusters(centroids, X)

            # compute the new centroids
            old_centroids = centroids.copy()
            centroids = self.update_centroids(clusters, X)
            if cal_dis(old_centroids, centroids) < self.varepsilon:
                break

        return np.array(clusters).reshape([X.shape[0], ])

    # ********* End *********#
--------------------------------------------------------------------------------
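(Editor-added demo, written as if appended to the end of 3.py; the two Gaussian blobs are synthetic data invented for illustration.)

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + [5, 5]])
    model = Kmeans(k=2, max_iterations=100, varepsilon=1e-4)
    labels = model.predict(X)
    # the two blobs are well separated, so each should land in its own cluster
    print(labels[:5], labels[-5:])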
/13. AdaBoost/2.py:
--------------------------------------------------------------------------------
# encoding=utf8
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier


# AdaBoost algorithm
class AdaBoost:
    '''
    input:n_estimators(int): number of boosting rounds
          learning_rate(float): shrinkage coefficient for the weak classifiers
    '''

    def __init__(self, n_estimators=50, learning_rate=1.0):
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        self.X = datasets
        self.Y = labels
        self.M, self.N = datasets.shape
        # collection of weak classifiers
        self.clf_sets = []
        # initialize the sample weights
        self.weights = [1.0 / self.M] * self.M
        # alpha coefficients of G(x)
        self.alpha = []

    # ********* Begin *********#
    def _G(self, features, labels, weights):
        '''
        input:features(ndarray): data features
              labels(ndarray): data labels
              weights(ndarray): sample weights
        '''
        # weighted error: total weight of the samples the current ensemble
        # misclassifies (the ensemble returns a score, so compare its sign)
        e = 0
        for i in range(weights.shape[0]):
            if labels[i] != np.sign(self.G(features[i], self.clf_sets, self.alpha)):
                e += weights[i]
        return e

    # compute alpha
    def _alpha(self, error):
        return 0.5 * np.log((1 - error) / error)

    # normalization factor
    def _Z(self, weights, a, clf):
        return np.sum([weights[i] * np.exp(-a * self.Y[i] * self.G(self.X[i], clf, self.alpha))
                       for i in range(self.M)])

    # weight update
    def _w(self, a, clf, Z):
        w = np.zeros(len(self.weights))
        for i in range(self.M):
            w[i] = self.weights[i] * np.exp(-a * self.Y[i] * self.G(self.X[i], clf, self.alpha)) / Z
        self.weights = w

    # linear combination of the weak classifiers: G(x) = sum_i alpha_i * g_i(x)
    def G(self, x, v, direct):
        result = 0
        x = x.reshape(1, -1)
        for i in range(len(v)):
            result += v[i].predict(x) * direct[i]
        return result

    def fit(self, X, y):
        '''
        X(ndarray): training data
        y(ndarray): training labels
        '''
        self.init_args(X, y)
        # the hand-rolled boosting loop below was left disabled by the author;
        # predict() falls back to sklearn's AdaBoostClassifier instead
        '''
        for i in range(100):
            classifier = DecisionTreeClassifier(max_depth=3)
            classifier.fit(X, y)
            self.clf_sets.append(classifier)
            e = 0
            for i in range(len(self.weights)):
                temp = -1
                if classifier.predict(X[i].reshape(1, -1)) > 0:
                    temp = 1
                if self.Y[i] == temp:
                    e += self.weights[i]
            a = self._alpha(e)
            self.alpha.append(a)
            z = self._Z(self.weights, a, self.clf_sets)
            self._w(a, self.clf_sets, z)
        '''

        # record the classifiers

        # normalization factor

        # weight update

    def predict(self, data):
        '''
        input:data(ndarray): a single sample
        output: returns +1 for a positive prediction, -1 for a negative one
        '''
        # n_estimators and learning_rate are hard-coded here and deliberately
        # override the constructor arguments
        ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.1)
        ada.fit(self.X, self.Y)
        data = data.reshape(1, -1)
        predict = ada.predict(data)
        return predict[0]

    # ********* End *********#
--------------------------------------------------------------------------------
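(Editor-added sketch, written as if appended to 2.py; the six one-dimensional samples with ±1 labels are invented for illustration.)

if __name__ == '__main__':
    X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
    y = np.array([-1, -1, -1, 1, 1, 1])
    clf = AdaBoost()
    clf.fit(X, y)
    print(clf.predict(np.array([0.5])))  # expected -1
    print(clf.predict(np.array([4.5])))  # expected +1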
/5. 多分类学习/OvR.py:
--------------------------------------------------------------------------------
import numpy as np


# logistic regression
class tiny_logistic_regression(object):
    def __init__(self):
        # W
        self.coef_ = None
        # b
        self.intercept_ = None
        # all of W and b
        self._theta = None

    def _sigmoid(self, x):
        return 1. / (1. + np.exp(-x))

    # train; values in train_labels must be 0 or 1
    def fit(self, train_datas, train_labels, learning_rate=1e-4, n_iters=1e3):
        # loss
        def J(theta, X_b, y):
            y_hat = self._sigmoid(X_b.dot(theta))
            try:
                return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
            except:
                return float('inf')

        # partial derivative of the loss w.r.t. theta
        def dJ(theta, X_b, y):
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)

        # batch gradient descent
        def gradient_descent(X_b, y, initial_theta, learning_rate, n_iters=1e2, epsilon=1e-6):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - learning_rate * gradient
                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break
                cur_iter += 1
            return theta

        X_b = np.hstack([np.ones((len(train_datas), 1)), train_datas])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, train_labels, initial_theta, learning_rate, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    # probability that each sample in X has label 1
    def predict_proba(self, X):
        X_b = np.hstack([np.ones((len(X), 1)), X])
        return self._sigmoid(X_b.dot(self._theta))

    # predict
    def predict(self, X):
        proba = self.predict_proba(X)
        result = np.array(proba >= 0.5, dtype='int')
        return result


class OvR(object):
    def __init__(self):
        # list of the models trained in fit
        self.models = []
        # real_label[i] is the true label treated as positive by model i;
        # e.g. if model 0 takes label 2 as its positive class, real_label[0] = 2
        self.real_label = []

    def fit(self, train_datas, train_labels):
        '''
        OvR training: store one binary model per class in self.models
        :param train_datas: training data, ndarray
        :param train_labels: training labels, integers such as 0, 1, 2; ndarray of shape (-1,)
        :return: None
        '''
        # the classes are hard-coded to 0, 1, 2 to match the OJ data
        self.real_label = [0, 1, 2]
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, 0)
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, 1)
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, 2)

    def generate_one(self, tr, train_datas, train_labels, one):
        train_datas_ = []
        train_labels_ = []
        for i, item in enumerate(train_labels):
            train_datas_.append(train_datas[i])
            # tiny_logistic_regression expects 0/1 labels, so the "rest"
            # classes are mapped to 0
            train_labels_.append(1 if item == one else 0)
        self.models.append(tr.fit(train_datas=np.array(train_datas_), train_labels=np.array(train_labels_)))

    def predict(self, test_datas):
        '''
        OvR prediction
        :param test_datas: test data, ndarray
        :return: predictions, ndarray
        '''
        ans = []
        probs = []
        for i, classifier in enumerate(self.models):
            probs.append(classifier.predict_proba(test_datas))

        for col in range(len(probs[0])):
            pro_arr = [probs[0][col], probs[1][col], probs[2][col]]
            # pick the class whose one-vs-rest model is the most confident
            ans.append(int(np.argmax(pro_arr)))
        return np.array(ans)
--------------------------------------------------------------------------------
/5. 多分类学习/OvO.py:
--------------------------------------------------------------------------------
import numpy as np


# logistic regression
class tiny_logistic_regression(object):
    def __init__(self):
        # W
        self.coef_ = None
        # b
        self.intercept_ = None
        # all of W and b
        self._theta = None
        # maps 0/1 back to the original labels
        self.label_map = {}

    def _sigmoid(self, x):
        return 1. / (1. + np.exp(-x))

    # train; values in train_labels may be arbitrary
    def fit(self, train_datas, train_labels, learning_rate=1e-4, n_iters=1e3):
        # loss
        def J(theta, X_b, y):
            y_hat = self._sigmoid(X_b.dot(theta))
            try:
                return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
            except:
                return float('inf')

        # partial derivative of the loss w.r.t. theta
        def dJ(theta, X_b, y):
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)

        # batch gradient descent
        def gradient_descent(X_b, y, initial_theta, learning_rate, n_iters=1e2, epsilon=1e-6):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - learning_rate * gradient
                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break
                cur_iter += 1
            return theta

        unique_labels = list(set(train_labels))
        labels = train_labels.copy()

        # map the two labels to 0 and 1
        self.label_map[0] = unique_labels[0]
        labels[train_labels == unique_labels[0]] = 0
        self.label_map[1] = unique_labels[1]
        labels[train_labels == unique_labels[1]] = 1

        X_b = np.hstack([np.ones((len(train_datas), 1)), train_datas])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, labels, initial_theta, learning_rate, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    # probability that each sample in X has label 1
    def predict_proba(self, X):
        X_b = np.hstack([np.ones((len(X), 1)), X])
        return self._sigmoid(X_b.dot(self._theta))

    # predict
    def predict(self, X):
        proba = self.predict_proba(X)
        result = np.array(proba >= 0.5, dtype='int')
        # map 0/1 back to the original labels
        for i in range(len(result)):
            if result[i] == 0:
                result[i] = self.label_map[0]
            else:
                result[i] = self.label_map[1]
        return result


class OvO(object):
    def __init__(self):
        # list of the models trained in fit
        self.models = []

    def fit(self, train_datas, train_labels):
        '''
        OvO training: store the pairwise models in self.models
        :param train_datas: training data, ndarray
        :param train_labels: training labels, integers such as 0, 1, 2; ndarray of shape (-1,)
        :return: None
        '''
        # the class pairs are hard-coded to the OJ's three classes
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, (0, 1))
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, (1, 2))
        self.generate_one(tiny_logistic_regression(), train_datas, train_labels, (0, 2))

    def generate_one(self, tr, train_datas, train_labels, tup):
        train_datas_ = []
        train_labels_ = []
        for i, item in enumerate(train_labels):
            if item in tup:
                train_datas_.append(train_datas[i])
                train_labels_.append(train_labels[i])
        self.models.append(tr.fit(train_datas=np.array(train_datas_), train_labels=np.array(train_labels_)))

    def predict(self, test_datas):
        '''
        OvO prediction
        :param test_datas: test data, ndarray
        :return: predictions, ndarray
        '''
        pre = []
        ans = []
        for i, classifier in enumerate(self.models):
            predict = classifier.predict(test_datas)
            pre.append(predict)
        for i in range(len(pre[0])):
            a, b, c = pre[0][i], pre[1][i], pre[2][i]
            # with classifiers for the pairs (0,1), (1,2) and (0,2), the median
            # of the three votes equals the majority label whenever at least
            # two classifiers agree; if all three disagree, the middle value
            # is just an arbitrary tie-break
            arr = sorted([a, b, c])
            ans.append(arr[1])
        return np.array(ans)
--------------------------------------------------------------------------------
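(Editor-added sketch, written as if appended to OvO.py; iris is used only because it has three classes labelled 0/1/2, matching the hard-coded pairs above.)

if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
    model = OvO()
    model.fit(X_train, y_train)
    print('OvO accuracy:', accuracy_score(y_test, model.predict(X_test)))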
贝叶斯分类器/simple_byes.py:
--------------------------------------------------------------------------------
import numpy as np
import sklearn.datasets as db
from sklearn.metrics import accuracy_score


class NaiveBayesClassifier(object):
    def __init__(self):
        '''
        self.label_prob stores the probability of each class in the data;
        e.g. {0: 0.333, 1: 0.667} means class 0 appears with probability
        0.333 and class 1 with probability 0.667
        '''
        self.label_prob = {}     # class priors
        self.label_indexes = {}  # rows of the dataset belonging to each class
        '''
        self.condition_prob stores, for each class, the probability of every
        feature value given that class.
        For example, if the training features are [[2, 1, 1],
                                                   [1, 2, 2],
                                                   [2, 2, 2],
                                                   [2, 1, 2],
                                                   [1, 2, 3]]
        and the labels are [1, 0, 1, 0, 1], then:
        given label 0, column 0 takes value 1 with probability 0.5 and value 2 with probability 0.5;
        given label 0, column 1 takes value 1 with probability 0.5 and value 2 with probability 0.5;
        given label 0, column 2 takes value 1 with probability 0, value 2 with probability 1, value 3 with probability 0;
        given label 1, column 0 takes value 1 with probability 0.333 and value 2 with probability 0.666;
        given label 1, column 1 takes value 1 with probability 0.333 and value 2 with probability 0.666;
        given label 1, column 2 takes values 1, 2 and 3 each with probability 0.333.
        So self.condition_prob looks like:
        {
            0: {0: {1: 0.5, 2: 0.5},
                1: {1: 0.5, 2: 0.5},
                2: {1: 0, 2: 1, 3: 0}},
            1: {0: {1: 0.333, 2: 0.666},
                1: {1: 0.333, 2: 0.666},
                2: {1: 0.333, 2: 0.333, 3: 0.333}}
        }
        '''
        self.condition_prob = {}

    def fit(self, feature, label):
        """
        Train the model, storing the probabilities in self.label_prob and self.condition_prob
        :param feature: ndarray of all training features
        :param label: ndarray of all training labels
        :return: None
        """

        def store_prop():
            m = len(feature)     # number of rows
            n = len(feature[0])  # number of columns
            for i, item in enumerate(label):
                if item not in self.label_indexes.keys():
                    self.label_indexes[item] = [i]
                else:
                    self.label_indexes[item].append(i)
            for labelItem in self.label_indexes.keys():
                # plain frequency estimate of the prior
                # (the Laplace-corrected version lives in Laplace.py)
                self.label_prob[labelItem] = len(self.label_indexes[labelItem]) / m
            # ------------------------------
            # store the conditional probabilities
            for labelItem in self.label_indexes.keys():  # for every label
                self.condition_prob[labelItem] = {}
                # subRows = feature[self.label_indexes[labelItem]]  # rows belonging to this label
                subRows = [row for i, row in enumerate(feature)
                           if i in self.label_indexes[labelItem]]
                for i in range(n):  # for every column (x_i)
                    tmpDic = {}
                    for row in subRows:
                        if row[i] not in tmpDic.keys():
                            tmpDic[row[i]] = 1
                        else:
                            tmpDic[row[i]] += 1
                    for k, v in tmpDic.items():
                        tmpDic[k] = v / len(subRows)
                    self.condition_prob[labelItem][i] = tmpDic

        store_prop()
        return self

    def predict(self, feature):
        """
        Predict labels for the given samples
        :param feature: ndarray of all test features
        :return: ndarray of predicted labels
        """
        result = []
        # predict every test sample
        for i, f in enumerate(feature):
            # probability of each candidate class
            prob = np.zeros(len(self.label_prob.keys()))
            ii = 0
            for label, label_prob in self.label_prob.items():
                # multiply the prior by the conditional probabilities; a value
                # never seen with this class contributes probability 0
                prob[ii] = label_prob
                for j in range(len(feature[0])):
                    prob[ii] *= self.condition_prob[label][j][f[j]] if f[j] in self.condition_prob[label][j].keys() else 0
                ii += 1
            # pick the class with the largest probability
            result.append(list(self.label_prob.keys())[np.argmax(prob)])
        return np.array(result)


# boston = db.load_iris()
# X = boston.data
# y = boston.target
X = [[2, 1, 1],
     [1, 2, 2],
     [2, 2, 2],
     [2, 1, 2],
     [1, 2, 3]]
y = [1, 0, 1, 0, 1]
bayes = NaiveBayesClassifier()

bayes.fit(X, y)
predict = bayes.predict(X)
print(accuracy_score(y, predict))
--------------------------------------------------------------------------------
/11. 贝叶斯分类器/Laplace.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.metrics import accuracy_score


class NaiveBayesClassifier(object):
    def __init__(self):
        '''
        self.label_prob stores the probability of each class in the data;
        e.g. {0: 0.333, 1: 0.667} means class 0 appears with probability
        0.333 and class 1 with probability 0.667
        '''
        self.label_prob = {}     # class priors
        self.label_indexes = {}  # rows of the dataset belonging to each class
        '''
        self.condition_prob stores, for each class, the probability of every
        feature value given that class; see simple_byes.py for a worked
        example of its layout
        '''
        self.condition_prob = {}

    def fit(self, feature, label):
        """
        Train the model, storing the probabilities in self.label_prob and self.condition_prob
        :param feature: ndarray of all training features
        :param label: ndarray of all training labels
        :return: None
        """

        def store_prop():
            m = len(feature)     # number of rows
            n = len(feature[0])  # number of columns
            for i, item in enumerate(label):
                if item not in self.label_indexes.keys():
                    self.label_indexes[item] = [i]
                else:
                    self.label_indexes[item].append(i)
            for labelItem in self.label_indexes.keys():
                # Laplace correction: P(c) = (|D_c| + 1) / (|D| + number of classes)
                self.label_prob[labelItem] = (len(self.label_indexes[labelItem]) + 1) / (
                        m + len(self.label_indexes.keys()))
                # without the Laplace correction this would be:
                # self.label_prob[labelItem] = len(self.label_indexes[labelItem]) / m
            # ------------------------------
            # store the conditional probabilities
            for labelItem in self.label_indexes.keys():  # for every label
                self.condition_prob[labelItem] = {}
                # subRows = feature[self.label_indexes[labelItem]]  # rows belonging to this label
                subRows = [row for i, row in enumerate(feature)
                           if i in self.label_indexes[labelItem]]
                for i in range(n):  # for every column (x_i)
                    # the value domain of each column is hard-coded to match the
                    # OJ data: column 2 takes values {1, 2, 3}, the others {1, 2}
                    if i == 2:
                        tmpDic = {1: 0, 2: 0, 3: 0}
                    else:
                        tmpDic = {1: 0, 2: 0}

                    for row in subRows:
                        if row[i] not in tmpDic.keys():
                            tmpDic[row[i]] = 1
                        else:
                            tmpDic[row[i]] += 1
                    count = len(list(tmpDic.values()))  # N_i: number of possible values of column i
                    for k, v in tmpDic.items():
                        # Laplace correction: P(x_i|c) = (|D_{c,x_i}| + 1) / (|D_c| + N_i)
                        tmpDic[k] = (v + 1) / (len(subRows) + count)
                    self.condition_prob[labelItem][i] = tmpDic

        store_prop()
        return self

    def predict(self, feature):
        '''
        Predict labels for the given samples
        :param feature: ndarray of all test features
        :return: ndarray of predicted labels
        '''
        result = []
        # predict every test sample
        for i, f in enumerate(feature):
            # probability of each candidate class
            prob = np.zeros(len(self.label_prob.keys()))
            ii = 0
            for label, label_prob in self.label_prob.items():
                # multiply the prior by the (smoothed) conditional probabilities
                prob[ii] = label_prob
                for j in range(len(feature[0])):
                    prob[ii] *= self.condition_prob[label][j][f[j]]
                ii += 1
            # pick the class with the largest probability
            result.append(list(self.label_prob.keys())[np.argmax(prob)])
        return np.array(result)


# boston = db.load_iris()
# X = boston.data
# y = boston.target
X = [[1, 2, 3],
     [1, 1, 3],
     [2, 1, 3],
     [2, 2, 1],
     [2, 2, 2],
     [2, 1, 3],
     [1, 2, 3],
     [1, 2, 3],
     [1, 2, 3],
     [1, 2, 3],
     [1, 2, 3],
     [1, 2, 3]]
# only the first five samples are labelled in this demo, so the fit and the
# accuracy check are restricted to them
y = [1, 0, 1, 0, 1]
bayes = NaiveBayesClassifier()

bayes.fit(X[:len(y)], y)
predict = bayes.predict(X[:len(y)])
print(accuracy_score(y, predict))
--------------------------------------------------------------------------------
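(Editor-added sketch, not part of the original repo: a self-contained hand check of why the Laplace correction matters, using the small dataset from simple_byes.py.)

# Value 3 in column 2 never co-occurs with label 0, so the plain frequency
# estimate gives P(x_2 = 3 | c = 0) = 0 and wipes out that class's whole
# posterior, while the Laplace correction keeps it positive.
X = [[2, 1, 1],
     [1, 2, 2],
     [2, 2, 2],
     [2, 1, 2],
     [1, 2, 3]]
y = [1, 0, 1, 0, 1]

rows_label0 = [row for row, lab in zip(X, y) if lab == 0]
count_val3 = sum(1 for row in rows_label0 if row[2] == 3)
print(count_val3 / len(rows_label0))              # 0.0  (plain estimate)
print((count_val3 + 1) / (len(rows_label0) + 3))  # 0.2  (Laplace, N_i = 3)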