├── data └── __init__.py ├── .gitignore ├── result ├── lr_example.png ├── nb_example.png ├── cart_example.png ├── gbdt_example.png ├── gda_example.png ├── knn_example.png ├── pca_example.png ├── svm_example.png ├── perception_example.png ├── maximum_entropy_example.png └── random_forest_example.png ├── models_with_sklearn.py ├── linear_discriminant_analysis.py ├── gradient_boosting_decision_tree.py ├── principal_component_analysis.py ├── util_kd_tree.py ├── gaussian_discriminant_analysis.py ├── random_forest.py ├── kmeans.py ├── perception.py ├── support_vector_machine.py ├── README.md ├── k_nearest_neighbor.py ├── maximum_entropy.py ├── naive_bayes.py ├── logistic_regression.py └── decision_tree.py /data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.name 2 | *.xml 3 | *.iml 4 | *.pyc 5 | ttt.py -------------------------------------------------------------------------------- /result/lr_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/lr_example.png -------------------------------------------------------------------------------- /result/nb_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/nb_example.png -------------------------------------------------------------------------------- /result/cart_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/cart_example.png -------------------------------------------------------------------------------- /result/gbdt_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/gbdt_example.png -------------------------------------------------------------------------------- /result/gda_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/gda_example.png -------------------------------------------------------------------------------- /result/knn_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/knn_example.png -------------------------------------------------------------------------------- /result/pca_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/pca_example.png -------------------------------------------------------------------------------- /result/svm_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/svm_example.png -------------------------------------------------------------------------------- /result/perception_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/perception_example.png -------------------------------------------------------------------------------- /result/maximum_entropy_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/maximum_entropy_example.png -------------------------------------------------------------------------------- /result/random_forest_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/random_forest_example.png -------------------------------------------------------------------------------- /models_with_sklearn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from sklearn import * 4 | 5 | nb = naive_bayes.BernoulliNB() 6 | lr = linear_model.LogisticRegression() 7 | svm_model = svm.SVC() 8 | knn = neighbors.KNeighborsClassifier() 9 | -------------------------------------------------------------------------------- /linear_discriminant_analysis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from numpy import * 4 | 5 | 6 | def lda(c1, c2, top_n_feat=1): 7 | """ 8 | lda特征维度压缩函数 9 | :param c1: 第一类样本矩阵,每行是一个样本 10 | :param c2: 第二类样本矩阵,每行是一个样本 11 | :param top_n_feat: 需要保留的特征维度,即要压缩成的维度数 12 | :return: 13 | """ 14 | # 第一类样本均值 15 | m1 = mean(c1, axis=0) 16 | # 第二类样本均值 17 | m2 = mean(c2, axis=0) 18 | # 所有样本矩阵 19 | c = vstack((c1, c2)) 20 | # 所有样本的均值 21 | m = mean(c, axis=0) 22 | # 第一类样本数 23 | n1 = c1.shape[0] 24 | # 第二类样本数 25 | n2 = c2.shape[0] 26 | # 求第一类样本的散列矩阵s1 27 | s1 = 0 28 | for i in range(0, n1): 29 | s1 += (c1[i, :]-m1).T*(c1[i, :]-m1) 30 | # 求第二类样本的散列矩阵 s2 31 | s2 = 0 32 | for i in range(0, n2): 33 | s2 += (c2[i, :]-m2).T*(c2[i, :]-m2) 34 | # 计算类内离散度矩阵Sw 35 | sw = (n1*s1+n2*s2)/(n1+n2) 36 | # 计算类间离散度矩阵Sb 37 | sb = (n1*(m-m1).T*(m-m1) + n2*(m-m2).T*(m-m2))/(n1+n2) 38 | # 求最大特征值对应的特征值和特征向量(重点) 39 | eig_value, eig_vector = linalg.eig(mat(sw).I*sb) 40 | # 对eig_value从大到小排序,返回对应排序后的索引 41 | index_vec = argsort(-eig_value) 42 | # 取出最大的特征值对应的索引 43 | n_largest_index = index_vec[:top_n_feat] 44 | # 取出最大的特征值对应的特征向量 45 | W = eig_vector[:, n_largest_index] 46 | # 返回降维后结果 47 | return W 48 | 49 | 50 | if __name__ == '__main__': 51 | data1 = [[1, 0], [3, 2]] 52 | data2 = [[0, 1], [1, 3]] 53 | w = lda(array(data1), array(data2), 2) 54 | print(w) 55 | -------------------------------------------------------------------------------- /gradient_boosting_decision_tree.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import decision_tree 5 | 6 | class GBDT(object): 7 | 8 | def __init__(self, max_iter=10, sample_rate=0.5, learn_rate=1.0, max_depth=3): 9 | self.max_iter = max_iter 10 | self.sample_rate = sample_rate # 0 < sample_rate <= 1 11 | self.learn_rate = learn_rate 12 | self.max_depth = max_depth 13 | self.dtrees = dict() 14 | self.original_f = None 15 | 16 | def fit(self, X_train, Y_train): 17 | n, m = X_train.shape 18 | # 记录每个样本对应的预测值,这个偏移值需要加到GBDT的预测结果中 19 | f = np.ones(n) * np.mean(Y_train) 20 | self.original_f = np.array(f) 21 | # 数据集随机抽样,减少模型方差 22 | n_sample = int(n*self.sample_rate) 23 | print('') 24 | for iter_ in range(self.max_iter): 25 | sample_idx = np.random.permutation(n)[:n_sample] 26 | X_train_subset, Y_train_subset = X_train[sample_idx, :], Y_train[sample_idx] 27 | y_predict_subset = np.zeros(n_sample) 28 | # 用损失函数的负梯度作为回归树的残差近似值 29 | for j in range(n_sample): 30 | k = sample_idx[j] 31 | y_predict_subset[j] = f[k] 32 | residual = Y_train_subset - y_predict_subset 33 | print('Iter %r/%r: %r(residual)' % (iter_, self.max_iter, np.mean(residual))) 34 | # 用残差作为新标签训练一颗新树 35 | dtree = decision_tree.DTreeRegressionCART(max_depth=self.max_depth) 36 | dtree.fit(X_train_subset, residual) 37 | self.dtrees[iter_] = dtree 38 | # 更新样本预测值 39 | for j in range(n): 40 | f[j] += self.learn_rate * dtree.predict(np.array([X_train[j]])) 41 | 42 | def predict(self, X): 43 | n = X.shape[0] 44 | Y = np.zeros([n, self.max_iter]) 45 | for iter_ in range(self.max_iter): 46 | dtree = self.dtrees[iter_] 47 | Y[:, iter_] = dtree.predict(X) 48 | # 将GBDT初始化时的偏移值需要加到预测结果中 49 | return np.sum(Y, axis=1) + self.original_f 50 | 51 | if __name__ == '__main__': 52 | model = GBDT() 53 | X_ = np.array([[0, 1], [1, 2], [2, 3], [3, 4]]) 54 | Y_ = np.array([1, 2, 3, 4]) 55 | model.fit(X_, Y_) 56 | print('