├── data
    └── __init__.py
├── .gitignore
├── result
    ├── lr_example.png
    ├── nb_example.png
    ├── cart_example.png
    ├── gbdt_example.png
    ├── gda_example.png
    ├── knn_example.png
    ├── pca_example.png
    ├── svm_example.png
    ├── perception_example.png
    ├── maximum_entropy_example.png
    └── random_forest_example.png
├── models_with_sklearn.py
├── linear_discriminant_analysis.py
├── gradient_boosting_decision_tree.py
├── principal_component_analysis.py
├── util_kd_tree.py
├── gaussian_discriminant_analysis.py
├── random_forest.py
├── kmeans.py
├── perception.py
├── support_vector_machine.py
├── README.md
├── k_nearest_neighbor.py
├── maximum_entropy.py
├── naive_bayes.py
├── logistic_regression.py
└── decision_tree.py


/data/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.name
2 | *.xml
3 | *.iml
4 | *.pyc
5 | ttt.py


--------------------------------------------------------------------------------
/result/lr_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/lr_example.png


--------------------------------------------------------------------------------
/result/nb_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/nb_example.png


--------------------------------------------------------------------------------
/result/cart_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/cart_example.png


--------------------------------------------------------------------------------
/result/gbdt_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/gbdt_example.png


--------------------------------------------------------------------------------
/result/gda_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/gda_example.png


--------------------------------------------------------------------------------
/result/knn_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/knn_example.png


--------------------------------------------------------------------------------
/result/pca_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/pca_example.png


--------------------------------------------------------------------------------
/result/svm_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/svm_example.png


--------------------------------------------------------------------------------
/result/perception_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/perception_example.png


--------------------------------------------------------------------------------
/result/maximum_entropy_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/maximum_entropy_example.png


--------------------------------------------------------------------------------
/result/random_forest_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KangCai/Machine-Learning-Algorithm/HEAD/result/random_forest_example.png


--------------------------------------------------------------------------------
/models_with_sklearn.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | from sklearn import *
4 | 
# Default-configured scikit-learn estimators, one per model family.
# The submodule names used below are expected to come from the star import
# at the top of this file.
# Bernoulli naive Bayes classifier.
nb = naive_bayes.BernoulliNB()
# Logistic regression classifier.
lr = linear_model.LogisticRegression()
# Support vector classifier.
svm_model = svm.SVC()
# k-nearest-neighbours classifier.
knn = neighbors.KNeighborsClassifier()
9 | 


--------------------------------------------------------------------------------
/linear_discriminant_analysis.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | from numpy import *
 4 | 
 5 | 
 6 | def lda(c1, c2, top_n_feat=1):
 7 |     """
 8 |     lda特征维度压缩函数
 9 |     :param c1: 第一类样本矩阵，每行是一个样本
10 |     :param c2: 第二类样本矩阵，每行是一个样本
11 |     :param top_n_feat: 需要保留的特征维度，即要压缩成的维度数
12 |     :return:
13 |     """
14 |     # 第一类样本均值
15 |     m1 = mean(c1, axis=0)
16 |     # 第二类样本均值
17 |     m2 = mean(c2, axis=0)
18 |     # 所有样本矩阵
19 |     c = vstack((c1, c2))
20 |     # 所有样本的均值
21 |     m = mean(c, axis=0)
22 |     # 第一类样本数
23 |     n1 = c1.shape[0]
24 |     # 第二类样本数
25 |     n2 = c2.shape[0]
26 |     # 求第一类样本的散列矩阵s1
27 |     s1 = 0
28 |     for i in range(0, n1):
29 |         s1 += (c1[i, :]-m1).T*(c1[i, :]-m1)
30 |     # 求第二类样本的散列矩阵 s2
31 |     s2 = 0
32 |     for i in range(0, n2):
33 |         s2 += (c2[i, :]-m2).T*(c2[i, :]-m2)
34 |     # 计算类内离散度矩阵Sw
35 |     sw = (n1*s1+n2*s2)/(n1+n2)
36 |     # 计算类间离散度矩阵Sb
37 |     sb = (n1*(m-m1).T*(m-m1) + n2*(m-m2).T*(m-m2))/(n1+n2)
38 |     # 求最大特征值对应的特征值和特征向量（重点）
39 |     eig_value, eig_vector = linalg.eig(mat(sw).I*sb)
40 |     # 对eig_value从大到小排序，返回对应排序后的索引
41 |     index_vec = argsort(-eig_value)
42 |     # 取出最大的特征值对应的索引
43 |     n_largest_index = index_vec[:top_n_feat]
44 |     # 取出最大的特征值对应的特征向量
45 |     W = eig_vector[:, n_largest_index]
46 |     # 返回降维后结果
47 |     return W
48 | 
49 | 
if __name__ == '__main__':
    # Tiny demo: two 2-D samples per class, keep both discriminant directions.
    data1, data2 = [[1, 0], [3, 2]], [[0, 1], [1, 3]]
    w = lda(c1=array(data1), c2=array(data2), top_n_feat=2)
    print(w)
55 | 


--------------------------------------------------------------------------------
/gradient_boosting_decision_tree.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import numpy as np
 4 | import decision_tree
 5 | 
 6 | class GBDT(object):
 7 | 
 8 |     def __init__(self, max_iter=10, sample_rate=0.5, learn_rate=1.0, max_depth=3):
 9 |         self.max_iter = max_iter
10 |         self.sample_rate = sample_rate # 0 < sample_rate <= 1
11 |         self.learn_rate = learn_rate
12 |         self.max_depth = max_depth
13 |         self.dtrees = dict()
14 |         self.original_f = None
15 | 
16 |     def fit(self, X_train, Y_train):
17 |         n, m = X_train.shape
18 |         # 记录每个样本对应的预测值，这个偏移值需要加到GBDT的预测结果中
19 |         f = np.ones(n) * np.mean(Y_train)
20 |         self.original_f = np.array(f)
21 |         # 数据集随机抽样，减少模型方差
22 |         n_sample = int(n*self.sample_rate)
23 |         print('<Train begins>')
24 |         for iter_ in range(self.max_iter):
25 |             sample_idx = np.random.permutation(n)[:n_sample]
26 |             X_train_subset, Y_train_subset = X_train[sample_idx, :], Y_train[sample_idx]
27 |             y_predict_subset = np.zeros(n_sample)
28 |             # 用损失函数的负梯度作为回归树的残差近似值
29 |             for j in range(n_sample):
30 |                 k = sample_idx[j]
31 |                 y_predict_subset[j] = f[k]
32 |             residual = Y_train_subset - y_predict_subset
33 |             print('Iter %r/%r: %r(residual)' % (iter_, self.max_iter, np.mean(residual)))
34 |             # 用残差作为新标签训练一颗新树
35 |             dtree = decision_tree.DTreeRegressionCART(max_depth=self.max_depth)
36 |             dtree.fit(X_train_subset, residual)
37 |             self.dtrees[iter_] = dtree
38 |             # 更新样本预测值
39 |             for j in range(n):
40 |                 f[j] += self.learn_rate * dtree.predict(np.array([X_train[j]]))
41 | 
42 |     def predict(self, X):
43 |         n = X.shape[0]
44 |         Y = np.zeros([n, self.max_iter])
45 |         for iter_ in range(self.max_iter):
46 |             dtree = self.dtrees[iter_]
47 |             Y[:, iter_] = dtree.predict(X)
48 |         # 将GBDT初始化时的偏移值需要加到预测结果中
49 |         return np.sum(Y, axis=1) + self.original_f
50 | 
if __name__ == '__main__':
    # Fit on a four-point toy set and show labels next to the model output.
    X_ = np.array([[0, 1], [1, 2], [2, 3], [3, 4]])
    Y_ = np.array([1, 2, 3, 4])
    model = GBDT()
    model.fit(X_train=X_, Y_train=Y_)
    print('<Label Ground Truth>')
    print(Y_)
    print('<Label Output>')
    print(model.predict(X=X_))


--------------------------------------------------------------------------------
/principal_component_analysis.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import numpy as np
 4 | 
 5 | 
 6 | def pca(data_mat, top_n_feat=1, draw=False):
 7 |     """
 8 |     pca特征维度压缩函数
 9 |     :param data_mat: 数据集矩阵
10 |     :param top_n_feat: 需要保留的特征维度，即要压缩成的维度数
11 |     :param draw:
12 |     :return:
13 |     """
14 |     # 求数据矩阵每一列的均值
15 |     mean_val = np.mean(data_mat, axis=0)
16 |     # 数据矩阵每一列特征减去该列的特征均值
17 |     mean_removed = data_mat - mean_val
18 |     # 计算协方差矩阵，除数n-1是为了得到协方差的无偏估计
19 |     cov_mat = np.cov(mean_removed, rowvar=False)
20 |     # 计算协方差矩阵的特征值eig_val及对应的特征向量eig_vec
21 |     eig_val, eig_vec = np.linalg.eig(np.mat(cov_mat))
22 |     # argsort():对特征值矩阵进行由小到大排序，返回对应排序后的索引
23 |     eig_val_ind = np.argsort(eig_val)
24 |     # 从排序后的矩阵最后一个开始自下而上选取最大的N个特征值，返回其对应的索引
25 |     eig_val_ind = eig_val_ind[:-(top_n_feat + 1):-1]
26 |     # 将特征值最大的N个特征值对应索引的特征向量提取出来，组成压缩矩阵
27 |     red_eig_vec = eig_vec[:, eig_val_ind]
28 |     # 将去除均值后的数据矩阵*压缩矩阵，转换到新的空间，使维度降低为N
29 |     low_dim_data_mat = mean_removed * red_eig_vec
30 |     # 利用降维后的矩阵反构出原数据矩阵(用作测试，可跟未压缩的原矩阵比对)
31 |     recon_mat = (low_dim_data_mat * red_eig_vec.T) + mean_val
32 |     # 画图
33 |     if draw:
34 |         import matplotlib.pyplot as plt
35 |         color = np.array(['r', 'g', 'b', 'm', 'c'])
36 |         plt.scatter(data_mat[:, 0], data_mat[:, 1], c=color)
37 |         plt.scatter(np.array(recon_mat)[:, 0], np.array(recon_mat)[:, 1], marker='s', c=color)
38 |         x_min, x_max, y_min, y_max = np.min(data_mat[:, 0]) - 1, np.max(data_mat[:, 0]) + 1, \
39 |             np.min(data_mat[:, 1]) - 1, np.max(data_mat[:, 1]) + 1
40 |         if top_n_feat == 1:
41 |             for i in range(data_mat.shape[0]):
42 |                 plt.plot([np.array(recon_mat)[:, 0], data_mat[:, 0]], [np.array(recon_mat)[:, 1], data_mat[:, 1]], linestyle=':')
43 |             w = float(red_eig_vec[1][0] / red_eig_vec[0][0])
44 |             b = mean_val[1] - mean_val[0] * w
45 |             plt.plot([x_min, x_max], [x_min * w + b, x_max * w + b], linestyle='--')
46 |         plt.xlim(x_min, x_max)
47 |         plt.ylim(y_min, y_max)
48 |         plt.title('PCA example')
49 |         plt.show()
50 |     # 返回压缩后的数据矩阵即该矩阵反构出原始数据矩阵
51 |     return low_dim_data_mat, recon_mat
52 | 
53 | 
if __name__ == '__main__':
    # Five 2-D points: print the first column, then the 1-D projection and
    # the reconstruction, and show the plot.
    data = np.array([[1, 0], [3, 2], [2, 2], [0, 2], [1, 3]])
    first_column = data[:, 0]
    print('Raw data: %r\n' % first_column)
    lowDData, recon = pca(data, top_n_feat=1, draw=True)
    print('PCA data: %r\n' % lowDData)
    print('Reconstructed data: %r' % recon)
60 | 


--------------------------------------------------------------------------------
/util_kd_tree.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | class KDTree(object):
 4 | 
 5 |     def __init__(self, X, DATA):
 6 |         # Attach extra data to point.
 7 |         point_container_list = []
 8 |         row, col = X.shape
 9 |         for i in range(row):
10 |             point_container_list.append(PointContainer(X[i], DATA[i]))
11 |         # Create KD tree.
12 |         self.kd_node = self._create_kd_tree(point_container_list, col)
13 | 
14 |     def _create_kd_tree(self, points, dim, i=0):
15 |         """
16 | 
17 |         :param points:
18 |         :param i:
19 |         :return:
20 |         """
21 |         if len(points) > 1:
22 |             points.sort(key=lambda p: p.x[i])
23 |             i = (i + 1) % dim
24 |             half = len(points) >> 1
25 |             return self._create_kd_tree(points[:half], dim, i), self._create_kd_tree(points[half+1:], dim, i), points[half]
26 |         elif len(points) == 1:
27 |             return None, None, points[0]
28 | 
29 |     def search(self, x, k):
30 |         """
31 |         This k is not the k in 'kmeans'.
32 |         :param x:
33 |         :param k:
34 |         :return:
35 |         """
36 |         p = PointContainer(x, None)
37 |         dim = len(x)
38 |         return self._search_kd_tree(self.kd_node, p, k, dim, lambda a, b: sum((a[i] - b[i]) ** 2 for i in range(dim)))
39 | 
40 |     def _search_kd_tree(self, kd_p, p, k, dim, dist_func, i=0, heap=None):
41 |         import heapq
42 |         is_root = not heap
43 |         if is_root:
44 |             heap = []
45 |         if kd_p and isinstance(kd_p, tuple) and len(kd_p) == 3:
46 |             mid_kd_p = kd_p[2]
47 |             dist = dist_func(p.x, mid_kd_p.x)
48 |             dx = mid_kd_p.x[i] - p.x[i]
49 |             if len(heap) < k:
50 |                 heapq.heappush(heap, (-dist, mid_kd_p))
51 |             elif dist < -heap[0][0]: # -heap[0][0] is the maximum distance in heap.
52 |                 heapq.heappushpop(heap, (-dist, mid_kd_p))
53 |             i = (i + 1) % dim
54 |             self._search_kd_tree(kd_p[dx < 0], p, k, dim, dist_func, i, heap)
55 |             if dx * dx < -heap[0][0]: # -heap[0][0] is the maximum distance in heap.
56 |                 self._search_kd_tree(kd_p[dx >= 0], p, k, dim, dist_func, i, heap)
57 |         if is_root:
58 |             nn_result = sorted((-h[0], h[1]) for h in heap)
59 |             return [n[1] for n in nn_result]
60 | 
61 | class PointContainer(object):
62 | 
63 |     def __init__(self, x, data):
64 |         self.x = x
65 |         self.data = data


--------------------------------------------------------------------------------
/gaussian_discriminant_analysis.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 | 
 6 | class GDA(object):
 7 | 
 8 |     def __init__(self):
 9 |         self.Mu0, self.Mu1, self.Sigma = None, None, None
10 |         self.w, self.b, self.sign = None, None, None
11 | 
12 |     def fit(self, X_train, Y_train):
13 |         n, m = X_train.shape
14 |         X0, X1 = X_train[Y_train==0], X_train[Y_train==1]
15 |         self.Mu0, self.Mu1 = np.mean(X0, axis=0), np.mean(X1, axis=0)
16 |         X_sub_Mu = np.vstack([X0 - self.Mu0, X1 - self.Mu1])
17 |         self.Sigma = (1.0/m) * np.dot(X_sub_Mu.T, X_sub_Mu)
18 |         # 判别平面计算
19 |         normal_vec = self.Mu1 - self.Mu0
20 |         normal_vec = normal_vec / np.sqrt(np.sum(normal_vec * normal_vec))
21 |         self.w = normal_vec
22 |         self.b = - np.dot(self.w.T, (self.Mu0 + self.Mu1) / 2.0)
23 |         self.sign = int(np.dot(self.w.T, self.Mu1) + self.b > 0)
24 | 
25 |     def predict(self, X):
26 |         return (np.dot(X, self.w) + self.b > 0).astype(int) * self.sign
27 | 
def _GenerateData():
    """
    Generate a random two-class toy data set and scatter-plot the training part.

    Class i (0 or 1) is drawn uniformly from a square whose bounds depend on
    the class index; labels are the class indices.

    :return: (X_train, X_val, Y_train, Y_val) as numpy arrays
    """
    import random
    m, n_train, n_val = 2, 10, 2
    X_train, X_val, Y_train, Y_val = [], [], [], []
    color = ['c', 'r']

    def _generateOne(X, Y, i):
        # Append one random point of class i to X and its label to Y.
        i += 1
        low, high = (int(i / 2) + 0.1) * 10, (int(i / 2) + 0.9) * 10
        x = random.uniform(low, high)
        y = random.uniform(low, high)
        X.append((x, y))
        Y.append(i - 1)
        return x, y

    for i_ in range(m):
        for _ in range(n_train):
            x_, y_ = _generateOne(X_train, Y_train, i_)
            plt.scatter(x_, y_, s=60, c=color[i_], alpha=0.3)
        for _ in range(n_val):
            # Labels go to Y_val (they used to be appended to X_val).
            _generateOne(X_val, Y_val, i_)
    return np.array(X_train), np.array(X_val), np.array(Y_train), np.array(Y_val)
46 | 
if __name__ == '__main__':
    # Train on freshly generated toy data and print labels next to predictions.
    X_t, X_v, Y_t, Y_v = _GenerateData()
    model = GDA()
    print('<Y_t>')
    print(Y_t)
    model.fit(X_t, Y_t)
    print('<Label Output>')
    print(model.predict(X_t))
    # Draw the two class means.
    mu0, mu1 = model.Mu0, model.Mu1
    plt.scatter([mu0[0], mu1[0]], [mu0[1], mu1[1]], s=100, c=['c', 'r'])
    # Decision boundary: the line through the midpoint of the means,
    # perpendicular to the segment joining them.
    midPoint = [(mu0[0] + mu1[0]) / 2.0, (mu0[1] + mu1[1]) / 2.0]
    k = (mu1[1] - mu0[1]) / (mu1[0] - mu0[0])
    bx = range(-5, 25)
    by = [(-1.0 / k) * (px - midPoint[0]) + midPoint[1] for px in bx]
    plt.plot(bx, by)
    plt.xlim(0, 20)
    plt.ylim(0, 20)
    plt.title('Gaussian discriminant analysis')
    plt.show()


--------------------------------------------------------------------------------
/random_forest.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import numpy as np
 4 | import decision_tree
 5 | 
 6 | class RandomForest(object):
 7 | 
 8 |     def __init__(self, tree_count=10):
 9 |         self.tree_list = []
10 |         self.tree_count = tree_count
11 | 
12 |     def fit(self, X_train, Y_train):
13 |         # Generate decision tree
14 |         for i in range(self.tree_count):
15 |             dt_CART = decision_tree.DTreeCART()
16 |             # Bagging data
17 |             n, m = X_train.shape
18 |             sample_idx = np.random.permutation(n)
19 |             feature_idx = np.random.permutation(m)[:int(np.sqrt(m))]
20 |             X_t_ = X_train[:, feature_idx]
21 |             X_t_, Y_t_ = X_t_[sample_idx, :], Y_train[sample_idx]
22 |             # Train
23 |             dt_CART.fit(X_t_, Y_t_)
24 |             self.tree_list.append((dt_CART, feature_idx))
25 |             print('=' * 10 + ' %r/%r tree trained ' % (i + 1, self.tree_count) + '=' * 10)
26 |             # print(dt_CART.visualization())
27 | 
28 |     def predict(self, X):
29 |         output_matrix = np.zeros((self.tree_count, X.shape[0]))
30 |         output_label = np.zeros(X.shape[0])
31 |         for i, (tree, feature_idx) in enumerate(self.tree_list):
32 |             output_matrix[i, :] = tree.predict(X[:, feature_idx])
33 |         for col in range(output_matrix.shape[1]):
34 |             output_label[col] = np.argmax(np.bincount(output_matrix[:, col].astype(int)))
35 |         return output_label.astype(int)
36 | 
# Column names of the toy data set: age (feature 1), has a job (feature 2),
# owns a house (feature 3), credit rating (feature 4), class (label).
datalabel = np.array(['年龄(特征1)', '有工作(特征2)', '有自己的房子(特征3)', '信贷情况(特征4)', '类别(标签)'])
# Toy training set, one sample per row, columns in the order of `datalabel`.
# Age is young / middle-aged / elderly, the yes-no columns use yes / no, and
# the credit rating is fair / good / very good.
train_sets = np.array([
                    ['青年', '否', '否', '一般', '否'],
                    ['青年', '否', '否', '好', '否'],
                    ['青年', '是', '否', '好', '是'],
                    ['青年', '是', '是', '一般', '是'],
                    ['青年', '否', '否', '一般', '否'],
                    ['中年', '否', '否', '一般', '否'],
                    ['中年', '否', '否', '好', '否'],
                    ['中年', '是', '是', '好', '是'],
                    ['中年', '否', '是', '非常好', '是'],
                    ['中年', '否', '是', '非常好', '是'],
                    ['老年', '否', '是', '非常好', '是'],
                    ['老年', '否', '是', '好', '是'],
                    ['老年', '是', '否', '好', '是'],
                    ['老年', '是', '否', '非常好', '是'],
                    ['老年', '否', '否', '一般', '否'],
                    ['青年', '否', '否', '一般', '是']])
# Integer code of every categorical value; the same yes/no codes serve both
# the feature columns and the label column.
map_table = {'青年': 0, '中年': 1, '老年': 2,
             '否': 0, '是': 1,
             '一般': 0, '好': 1, '非常好': 2}
58 | 
if __name__ == '__main__':
    # Encode the categorical table as integers, train, and compare the
    # predictions on the training set with the true labels.
    train_sets_encode = np.array([[map_table[cell] for cell in sample] for sample in train_sets])
    X_t = train_sets_encode[:, :-1]
    Y_t = train_sets_encode[:, -1]
    model = RandomForest()
    model.fit(X_t, Y_t)
    print('Ground truth   : %r' % (Y_t,))
    print('Label predicted: %r' % (model.predict(X_t),))
66 | 
67 | 
68 | 


--------------------------------------------------------------------------------
/kmeans.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import numpy as np
 4 | import util_kd_tree as kdtree
 5 | 
 6 | class KmeansModel(object):
 7 | 
 8 |     def cluster(self, X, k):
 9 |         """
10 |         X: shape(n, d)
11 |         :param X:
12 |         :param k:
13 |         :return:
14 |         """
15 |         row, col = X.shape
16 |         # Init centers according to value range.
17 |         rand_idx = random.sample(range(0, row), k)
18 |         centers = X[rand_idx, :]
19 |         # Iteration of assigning cluster ID to each point
20 |         assignments = self._assign_points(X, centers)
21 |         old_assignments = None
22 |         count = 0
23 |         while (assignments != old_assignments).any():
24 |             count += 1
25 |             # New centers
26 |             centers = self._update_centers(X, k, assignments)
27 |             # Store old assignments
28 |             old_assignments = assignments
29 |             # New assignments
30 |             assignments = self._assign_points(X, centers)
31 |         return zip(assignments, X)
32 | 
33 |     def _update_centers(self, X, k, assignments):
34 |         # Statistic
35 |         store_center_points = {i : [] for i in range(k)}
36 |         for i, p in zip(assignments, X):
37 |             store_center_points[i].append(p)
38 |         # Calculate new centers
39 |         row, col = X.shape
40 |         center_points = np.zeros((k, col))
41 |         for i, points in store_center_points.items():
42 |             store_center_points[i] = np.array(store_center_points[i])
43 |             center_points[i] = np.mean(store_center_points[i], axis=0)
44 |         return center_points
45 | 
46 |     def _assign_points(self, X, centers):
47 |         row, col = X.shape
48 |         assignments = np.zeros(row)
49 |         for i in range(row):
50 |             dists = np.linalg.norm(X[i] - centers, axis=1)
51 |             assignments[i] = np.argmin(dists)
52 |         return assignments
53 | 
class KmeansModelKDTree(KmeansModel):
    """k-means whose nearest-centre lookup goes through a KD-tree of the centres."""

    def _assign_points(self, X, centers):
        """
        Assign every row of X to its nearest centre.

        :param X: array of shape (n, d)
        :param centers: array of shape (k, d)
        :return: float array of length n holding the index of the nearest centre
        """
        # Rebuild the tree on every call: the centres move between
        # iterations, and a tree cached from the first call would keep
        # answering with the initial centres.
        self.kd_tree = kdtree.KDTree(centers, range(0, len(centers)))
        row, col = X.shape
        assignments = np.zeros(row)
        for i in range(row):
            # The payload attached to each tree point is the centre's index.
            nn_center_nodes = self.kd_tree.search(X[i], 1)
            assignments[i] = nn_center_nodes[0].data
        return assignments
65 | 
66 | 
if __name__ == '__main__':
    # Benchmark: time both k-means variants on the same random 2-D points.
    import random
    import time
    a_ = []
    # Timings noted by the original author for k = 3:
    # 100: Naive kmeans 0.004s; KD tree kmeans 0.004s
    # 1,000: Naive kmeans 0.19s; KD tree kmeans 0.05s
    # 10,000: Naive kmeans 4.9s; KD tree kmeans 0.4s
    # 100,000: Naive kmeans 66.6s; KD tree kmeans 4.2s
    for _ in range(10000):
        a_.append((random.uniform(0, 100), random.uniform(0, 100)))
    a_ = np.array(a_)
    k_ = 3
    print('=' * 5 + ' Naive kmeans ' + '=' * 5)
    model = KmeansModel()
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    t1 = time.perf_counter()
    model.cluster(a_, k_)
    print('Total used time: %r s' % (time.perf_counter() - t1))
    print('=' * 5 + ' KD tree kmeans ' + '=' * 5)
    model = KmeansModelKDTree()
    t1 = time.perf_counter()
    res = model.cluster(a_, k_)
    print('Total used time: %r s' % (time.perf_counter() - t1))


--------------------------------------------------------------------------------
/perception.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import numpy as np
 4 | import matplotlib.pyplot as plt
 5 | 
 6 | class PerceptionPrimitive(object):
 7 | 
 8 |     def __init__(self, eta=0.1, n_iter=10000):
 9 |         self.eta = eta
10 |         self.n_iter = n_iter
11 |         self.w = self.b = None
12 |         self.error_count_history = []
13 | 
14 |     def fit(self, X_train, Y_train):
15 |         # Sum( -y(wx + b) ) -> min
16 |         self.w, self.b = np.zeros(X_train.shape[1]), 0
17 |         # Iteration
18 |         for _ in range(self.n_iter):
19 |             error_count = 0
20 |             for xi, yi in zip(X_train, Y_train):
21 |                 if yi * self.predict(xi) <= 0:
22 |                     self.w += self.eta * yi * xi
23 |                     self.b += self.eta * yi
24 |                     error_count += 1
25 |             self.error_count_history.append(error_count)
26 |             if error_count == 0:
27 |                 break
28 |             # if self.w[1]:
29 |             #     plt.plot([10, 0], [(-self.b - self.w[0] * 10) / self.w[1], -self.b / self.w[1]])
30 | 
31 |     def predict_raw(self, x):
32 |         return np.dot(x, self.w) + self.b
33 | 
34 |     def predict(self, x):
35 |         print(self.predict_raw(x), np.sign(self.predict_raw(x)))
36 |         return np.sign(self.predict_raw(x))
37 | 
38 | 
class PerceptionDual(PerceptionPrimitive):
    """Perceptron in its dual form, trained on the Gram matrix of the samples."""

    def __init__(self, eta=0.1, n_iter=10000):
        """
        :param eta: learning rate
        :param n_iter: training budget; at most ``n_iter * n_samples``
                       updates are performed by fit()
        """
        super(PerceptionDual, self).__init__(eta=eta, n_iter=n_iter)
        self.alpha, self.Gram_matrix = None, None

    def fit(self, X_train, Y_train):
        """
        :param X_train: 2-D array, one sample per row
        :param Y_train: 1-D numpy array of +1 / -1 labels
        """
        n_samples, dim = X_train.shape
        self.alpha, self.w, self.b = np.zeros(n_samples), np.zeros(dim), 0
        # Gram matrix: inner product of every pair of samples.
        self.Gram_matrix = np.dot(X_train, X_train.T)
        # Cap the number of updates so that data which is not linearly
        # separable cannot keep the loop running forever.
        max_updates = self.n_iter * n_samples
        n_updates = 0
        i = 0
        while i < n_samples and n_updates < max_updates:
            # w . x_i expressed through the dual coefficients.
            wx = np.dot(self.Gram_matrix[i, :], self.alpha * Y_train)
            if Y_train[i] * (wx + self.b) <= 0:
                # a <- a + eta, b <- b + eta * y_i, then rescan from the start.
                self.alpha[i] += self.eta
                self.b += self.eta * Y_train[i]
                n_updates += 1
                i = 0
            else:
                i += 1
        # Recover the primal weights: w = sum_i alpha_i * y_i * x_i.
        self.w = np.dot(self.alpha * Y_train, X_train)
66 | 
def _GenerateData():
    """
    Generate a random two-class toy data set and scatter-plot the training part.

    Class i (0 or 1) is drawn uniformly from a rectangle whose bounds depend
    on the class index; labels are -1 for class 0 and +1 for class 1.

    :return: (X_train, X_val, Y_train, Y_val) as numpy arrays
    """
    import random
    m, n_train, n_val, interval = 2, 5, 2, 1
    X_train, X_val, Y_train, Y_val = [], [], [], []
    color = ['c', 'g', 'b', 'r']

    def _generateOne(X, Y, i):
        # Append one random point of class i to X and its +/-1 label to Y.
        x = random.uniform((int(i / 2) + 0.1) * 10, (int(i / 2) + 0.9) * 10)
        y = random.uniform((i % 2 * interval + 0.1) * 10, (i % 2 * interval + 0.9) * 10)
        X.append((x, y))
        Y.append((i - 0.5) * 2)
        return x, y

    for i_ in range(m):
        for _ in range(n_train):
            x_, y_ = _generateOne(X_train, Y_train, i_)
            plt.scatter(x_, y_, s=100, c=color[i_])
        for _ in range(n_val):
            # Labels go to Y_val (they used to be appended to X_val).
            _generateOne(X_val, Y_val, i_)
    return np.array(X_train), np.array(X_val), np.array(Y_train), np.array(Y_val)
84 | 
if __name__ == '__main__':
    # Train the primal perceptron on random toy data and draw its boundary.
    X_t, X_v, Y_t, Y_v = _GenerateData()
    model = PerceptionPrimitive()
    model.fit(X_t, Y_t)
    # Boundary w0*x + w1*y + b = 0 evaluated at x = 10 and x = 0.
    y_at_10 = (-model.b - model.w[0] * 10) / model.w[1]
    y_at_0 = -model.b / model.w[1]
    plt.plot([10, 0], [y_at_10, y_at_0])
    plt.grid()
    plt.xlim(0, 10)
    plt.ylim(0, 20)
    plt.show()
94 | 
95 | 


--------------------------------------------------------------------------------
/support_vector_machine.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | import numpy as np
  4 | import random
  5 | import matplotlib.pyplot as plt
  6 | import re
  7 | import time
  8 | 
class SVMModel(object):
    """
    Support vector machine trained by repeatedly optimising pairs of
    multipliers (alpha_i, alpha_j) under the box constraint 0 <= alpha <= C.

    Attributes ``w`` and ``b`` are created during training (inside
    ``_iteration``); they do not exist before ``fit`` has been called.
    """
    def __init__(self, max_iter=10000, kernel_type='linear', C=1.0, epsilon=0.00001):
        """
        :param max_iter: maximum number of passes over the training set
        :param kernel_type: 'linear' or 'quadratic' (key of kernel_func_list)
        :param C: upper bound of every multiplier
        :param epsilon: training stops once the multipliers change by less
                        than this (Euclidean norm) in one pass
        """
        self.max_iter = max_iter
        self.kernel_type = kernel_type
        # Kernel name -> implementation; an unknown name raises KeyError below.
        self.kernel_func_list = {
            'linear': self._kernel_linear,
            'quadratic': self._kernel_quadratic,
        }
        self.kernel_func = self.kernel_func_list[kernel_type]
        self.C = C
        self.epsilon = epsilon
        self.alpha = None

    def fit(self, X_train, Y_train):
        """
        Training model. Prints the progress every 100 passes.
        :param X_train: shape = num_train, dim_feature
        :param Y_train: labels, one per training sample
                        (NOTE(review): presumably +1 / -1 - confirm)
        :return: None
        """
        n, d = X_train.shape[0], X_train.shape[1]
        self.alpha = np.zeros(n)
        # Iteration
        for i in range(self.max_iter):
            # One pass over all samples; diff is the norm of the alpha change.
            diff = self._iteration(X_train, Y_train)
            if i % 100 == 0:
                print('Iter %r / %r, Diff %r' % (i, self.max_iter, diff))
            if diff < self.epsilon:
                break

    def predict_raw(self, X):
        """Return the signed score ``w . x + b`` for the sample(s) in X."""
        return np.dot(self.w.T, X.T) + self.b

    def predict(self, X):
        """Return the sign of the score as int (0 when the score is exactly 0)."""
        return np.sign(np.dot(self.w.T, X.T) + self.b).astype(int)

    def _iteration(self, X_train, Y_train):
        """
        One pass: for every index j pick a random partner i != j and update
        the pair (alpha_i, alpha_j) in place.

        :return: Euclidean norm of the change of alpha during this pass
        """
        # `alpha` aliases self.alpha, so the writes below update the model.
        alpha = self.alpha
        alpha_prev = np.copy(alpha)
        n = alpha.shape[0]
        for j in range(n):
            # Find i not equal to j randomly
            # (gives up after 1000 draws, e.g. when there is a single sample).
            i = j
            for _ in range(1000):
                if i != j:
                    break
                i = random.randint(0, n - 1)
            x_i, x_j, y_i, y_j = X_train[i, :], X_train[j, :], Y_train[i], Y_train[j]
            # Define the similarity of instances. K11 + K22 - 2K12
            k_ij = self.kernel_func(x_i, x_i) + self.kernel_func(x_j, x_j) - 2 * self.kernel_func(x_i, x_j)
            # Zero denominator for the update below: skip this pair.
            if k_ij == 0:
                continue
            a_i, a_j = alpha[i], alpha[j]
            # Calculate the boundary of alpha
            L, H = self._cal_L_H(self.C, a_j, a_i, y_j, y_i)
            # Calculate model parameters
            # NOTE(review): w is always built with the linear formula, also
            # when kernel_type is 'quadratic' - confirm this is intended.
            self.w = np.dot(X_train.T, np.multiply(alpha, Y_train))
            self.b = np.mean(Y_train - np.dot(self.w.T, X_train.T))
            # Iterate alpha_j and alpha_i according to 'Delta W(a_j)'
            # NOTE(review): the errors use predict() (the sign of the score),
            # not predict_raw() - confirm this is intended.
            E_i = self.predict(x_i) - y_i
            E_j = self.predict(x_j) - y_j
            alpha[j] = a_j + (y_j * (E_i - E_j) * 1.0) / k_ij
            # Clip alpha_j to [L, H].
            alpha[j] = min(H, max(L, alpha[j]))
            # Move alpha_i by the opposite amount, weighted by the labels.
            alpha[i] = a_i + y_i * y_j * (a_j - alpha[j])
        diff = np.linalg.norm(alpha - alpha_prev)
        return diff

    def _kernel_linear(self, x1, x2):
        """Linear kernel: the inner product of x1 and x2."""
        return np.dot(x1, x2.T)

    def _kernel_quadratic(self, x1, x2):
        """Quadratic kernel: the squared inner product of x1 and x2."""
        return np.dot(x1, x2.T) ** 2

    def _cal_L_H(self, C, a_j, a_i, y_j, y_i):
        """
        Lower and upper bound (L, H) allowed for the new alpha_j; the
        formula depends on whether the two labels differ.
        """
        if y_i != y_j:
            L = max(0, a_j - a_i)
            H = min(C, C - a_i + a_j)
        else:
            L = max(0, a_i + a_j - C)
            H = min(C, a_i + a_j)
        return L, H
 93 | 
 94 | def _GenerateData():
 95 |     k, m, n_train, n_val = 5, 2, 5, 2
 96 |     X_train, X_val, Y_train, Y_val = [], [], [], []
 97 |     color = ['c', 'g', 'b', 'r']
 98 |     def _generateOne(X, Y, i):
 99 |         x, y, l = random.uniform((int(i / 2) + 0.1) * 10, (int(i / 2) + 0.9) * 10), random.uniform((i % 2 * 0.5 + 0.1) * 10, (i % 2 * 0.5 + 0.9) * 10), i
100 |         X.append((x, y))
101 |         Y.append((i - 0.5)*2)
102 |         return x, y
103 |     for i_ in range(m):
104 |         for _ in range(n_train):
105 |             x_, y_ = _generateOne(X_train, Y_train, i_)
106 |             plt.scatter(x_, y_, s=100, c=color[i_])
107 |         for _ in range(n_val):
108 |             _generateOne(X_val, X_val, i_)
109 | 
110 |     return np.array(X_train), np.array(X_val), np.array(Y_train), np.array(Y_val)
111 | 
if __name__ == '__main__':
    # Train on the toy data and draw the line w[0]*x + w[1]*y + b = 0 between x=0 and x=10.
    model = SVMModel()
    X_t, X_v, Y_t, Y_v = _GenerateData()
    model.fit(X_t, Y_t)
    plt.plot([10, 0], [(-model.b-model.w[0]*10)/model.w[1], -model.b/model.w[1]])
    # Circle the training points whose multiplier is clearly above zero.
    for i__ in range(len(model.alpha)):
        if model.alpha[i__] > 0.1:
            # facecolors='none' draws a hollow marker; an empty colour string is
            # rejected by current matplotlib versions.
            plt.scatter(X_t[i__, 0], X_t[i__, 1], facecolors='none', marker='o', s=300, edgecolors='r')
    # Annotate every training point with its multiplier and raw decision value.
    for i__ in range(X_t.shape[0]):
        plt.text(X_t[i__, 0], X_t[i__, 1], s='%0.1f, %0.1f' % (model.alpha[i__], model.predict_raw(X_t[i__, :])))
    plt.xlim(0, 10)
    plt.ylim(0, 15)
    plt.show()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Machine-Learning-Algorithm
  2 | 
  3 | **注意: 每个文件只有开始的 class 是模型本身，其它代码都是用来测试的，每个模型的实现都在 100 行以内**
  4 | 
  5 | **Note: Only the class at the beginning of each file is the model itself, the rest of the code is for testing, and the implementation of each model is within 100 lines**
  6 | 
  7 | ---
  8 | 
  9 | ### 1. Logistic Regression
 10 | 
 11 | **File** - [logistic_regression.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/logistic_regression.py)
 12 | 
 13 | **Cost Function** -
 14 |  
 15 |  <img src="https://latex.codecogs.com/gif.latex?H(p,q)=-\sum_{i}^{&space;}[y^{(i)}\&space;log\&space;h_\theta(x^{(i)})&plus;(1-y^{(i)})\&space;log(1-h_\theta(x^{(i)}))]&space;" />
 16 | 
 17 | **Optimization Algorithm** - Gradient descent method
 18 | 
 19 | <img src="https://user-images.githubusercontent.com/11633302/67949923-a1cafb00-fc23-11e9-9249-363cfee52f87.png" width=80%/>
 20 | 
 21 | ---
 22 | 
 23 | ### 2. Support Vector Machine
 24 | 
 25 | **File** - [support_vector_machine.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/support_vector_machine.py)
 26 | 
 27 | **Example** -
 28 | 
 29 | <img src="https://user-images.githubusercontent.com/11633302/67949933-a394be80-fc23-11e9-8b7e-bbeaa6670c3e.png" width=60%/>
 30 | 
 31 | **Cost Function** -
 32 | 
 33 | <img src="https://latex.codecogs.com/gif.latex?\begin{aligned}&space;max\&space;W(\alpha)&=\sum_{i=1}^{n}\alpha_i-\frac{1}{2}\sum_{i,j=1}^{n}y_iy_j\alpha_i\alpha_j&space;K(x_i,x_j)\\&space;s.t.\sum_{i=1}^{n}y_i\alpha_i&=0&space;\\&space;0&space;\leq&space;\alpha_i&space;\leq&space;C&(i=1,2...n)&space;\end{aligned}" />
 34 | 
 35 | **Optimization Algorithm** - Sequential minimal optimization (SMO)
 36 | 
 37 | ---
 38 | 
 39 | ### 3. Perceptron
 40 | 
 41 | **File** - [perception.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/perception.py)
 42 | 
 43 | **Example** -
 44 | 
 45 | <img src="https://user-images.githubusercontent.com/11633302/67949931-a2fc2800-fc23-11e9-8825-d5a61778148f.png" width=50%/>
 46 | 
 47 | ---
 48 | 
 49 | ### 4. Naive Bayes
 50 | 
 51 | **File** - [naive_bayes.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/naive_bayes.py)
 52 | 
 53 | **Example** -
 54 | 
 55 | <img src="https://user-images.githubusercontent.com/11633302/67949927-a2fc2800-fc23-11e9-8386-ead2ef4322bb.png" width=80%/>
 56 | 
 57 | ---
 58 | 
 59 | ### 5. K-Nearest Neighbor
 60 | 
 61 | **File** - [k_nearest_neighbor.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/k_nearest_neighbor.py) | [util_kd_tree.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/util_kd_tree.py)
 62 | 
 63 | **Example** -
 64 | 
 65 | <img src="https://user-images.githubusercontent.com/11633302/67949920-a099ce00-fc23-11e9-90f2-3720c5ce7830.png" width=50%/>
 66 | 
 67 | ---
 68 | 
 69 | ### 6. Decision Tree
 70 | 
 71 | **File** - [decision_tree.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/decision_tree.py)
 72 | 
 73 | **Algorithm** - Classification and Regression Tree (CART)
 74 | 
 75 | **Example** -
 76 | 
 77 | <img src="https://user-images.githubusercontent.com/11633302/67949914-a0013780-fc23-11e9-9b99-8633380d3164.png" width=45%/>
 78 | 
 79 | ---
 80 | 
 81 | ### 7. Random Forest
 82 | 
 83 | **File** - [random_forest.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/random_forest.py) | [decision_tree.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/decision_tree.py)
 84 | 
 85 | **Example** -
 86 | 
 87 | <img src="https://user-images.githubusercontent.com/11633302/67949932-a394be80-fc23-11e9-9b6d-e04c5bf85849.png" width=70%/>
 88 | 
 89 | ---
 90 | 
 91 | ### 8. Gradient Boosting Decision Tree
 92 | 
 93 | **File** - [gradient_boosting_decision_tree.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/gradient_boosting_decision_tree.py) | [decision_tree.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/decision_tree.py)
 94 | 
 95 | <img src="https://user-images.githubusercontent.com/11633302/67949915-a0013780-fc23-11e9-81a5-36bd7b6c9e9c.png" width=25%/>
 96 | 
 97 | ---
 98 | 
 99 | ### 9. Linear Discriminant Analysis
100 | 
101 | **File** - [linear_discriminant_analysis.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/linear_discriminant_analysis.py)
102 | 
103 | ---
104 | 
105 | ### 10. Maximum Entropy
106 | 
107 | **File** - [maximum_entropy.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/maximum_entropy.py)
108 | 
109 | **Example** -
110 | 
111 | <img src="https://user-images.githubusercontent.com/11633302/67949926-a2639180-fc23-11e9-8830-4b7da5dc5e1c.png" width=80%/>
112 | 
113 | ---
114 | 
115 | ### 11. Gaussian Discriminant Analysis
116 | 
117 | **File** - [gaussian_discriminant_analysis.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/gaussian_discriminant_analysis.py)
118 | 
119 | <img src="https://user-images.githubusercontent.com/11633302/67949918-a099ce00-fc23-11e9-8550-953cc6f344f1.png" width=50%/>
120 | 
121 | ---
122 | 
123 | ### 12. Principal Component Analysis
124 | 
125 | **File** - [principal_component_analysis.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/principal_component_analysis.py)
126 | 
127 | **Example** -
128 | 
129 | <img src="https://user-images.githubusercontent.com/11633302/67949929-a2fc2800-fc23-11e9-9c9d-1baf77b219be.png" width=50%/>
130 | 
131 | ---
132 | 
133 | ### 13. K-means
134 | 
135 | **File** - [kmeans.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/kmeans.py) | [util_kd_tree.py](https://github.com/KangCai/Machine-Learning-Algorithm/blob/master/util_kd_tree.py)
136 | 


--------------------------------------------------------------------------------
/k_nearest_neighbor.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | import numpy as np
  4 | import util_kd_tree as kdtree
  5 | 
  6 | class KNNModel_Naive(object):
  7 | 
  8 |     def __init__(self, k, X_train, Y_train):
  9 |         """
 10 |         Train model implicitly. No explicit training process.
 11 |         :param:
 12 |         :param X_train:
 13 |         :param Y_train:
 14 |         """
 15 |         self.k = k
 16 |         self.X_train = X_train
 17 |         self.Y_train = Y_train
 18 | 
 19 |     def validate(self, X_val, Y_val):
 20 |         """
 21 |         Validate the trained model.
 22 |         :param X_val:
 23 |         :param Y_val:
 24 |         :return:
 25 |         """
 26 |         label_list = []
 27 |         row, col = X_val.shape
 28 |         for i in range(row):
 29 |             dist = np.linalg.norm(X_val[i, :] - self.X_train, axis=1)
 30 |             res_idx = np.argsort(dist)[:self.k]
 31 |             res = [self.Y_train[i] for i in res_idx]
 32 |             label = np.argmax(np.bincount(res))
 33 |             label_list.append(label)
 34 |         label_array = np.array(label_list)
 35 |         accuracy = len(np.where(label_array == Y_val)) / row
 36 |         return accuracy, label_array
 37 | 
 38 | class KNNModel_Heap(KNNModel_Naive):
 39 | 
 40 |     def validate(self, X_val, Y_val):
 41 |         """
 42 |         Validate the trained model.
 43 |         :param X_val:
 44 |         :param Y_val:
 45 |         :return:
 46 |         """
 47 |         import heapq
 48 |         label_list = []
 49 |         row, col = X_val.shape
 50 |         for i in range(row):
 51 |             heap = []
 52 |             dist = np.linalg.norm(X_val[i, :] - self.X_train, axis=1)
 53 |             for idx, d in enumerate(dist):
 54 |                 if len(heap) < self.k:
 55 |                     heapq.heappush(heap, (-d, idx))
 56 |                 elif d < -heap[0][0]: # -heap[0][0] is the maximum distance in heap.
 57 |                     heapq.heappushpop(heap, (-d, idx))
 58 |             res = [self.Y_train[r[1]] for r in heap]
 59 |             label = np.argmax(np.bincount(res))
 60 |             label_list.append(label)
 61 |         label_array = np.array(label_list)
 62 |         accuracy = len(np.where(label_array == Y_val)) / row
 63 |         return accuracy, label_array
 64 | 
 65 | class KNNModel_KDTree(KNNModel_Naive):
 66 | 
 67 |     def __init__(self, k, X_train, Y_train):
 68 |         """
 69 |         Train model.
 70 |         :param X_train:
 71 |         :param Y_train:
 72 |         """
 73 |         super().__init__(k, X_train, Y_train)
 74 |         self.kd_node = kdtree.KDTree(X_train, range(X_train.shape[0]))
 75 | 
 76 |     def validate(self, X_val, Y_val):
 77 |         """
 78 |         Validate the trained model.
 79 |         :param X_val:
 80 |         :param Y_val:
 81 |         :return:
 82 |         """
 83 |         label_list = []
 84 |         row, col = X_val.shape
 85 |         for i in range(row):
 86 |             nn_nodes = self.kd_node.search(X_val[i], self.k)
 87 |             res = [self.Y_train[n.data] for n in nn_nodes]
 88 |             label = np.argmax(np.bincount(res))
 89 |             label_list.append(label)
 90 |         label_array = np.array(label_list)
 91 |         accuracy = len(np.where(label_array == Y_val)) / row
 92 |         return accuracy, label_array
 93 | 
 94 | def _TestEfficiency():
 95 |     import random
 96 |     X_train, Y_train = [], []
 97 |     # k=5, n=100000, m=200: 2.4(Naive), 6.0(Heap), 0.05(KDTree)
 98 |     k_, n_, t_num, d = 5, 1000, 200, 500
 99 |     # Train
100 |     for _ in range(n_):
101 |         X_train.append([random.uniform(0, 50) for _ in range(d)])
102 |         Y_train.append(random.randint(0, 1))
103 |     X_train, Y_train = np.array(X_train), np.array(Y_train)
104 |     # Validate
105 |     X_val, Y_val = [], []
106 |     for _ in range(t_num):
107 |         X_val.append([random.uniform(0, 50) for _ in range(d)])
108 |         Y_val.append(random.randint(0, 1))
109 |     X_val, Y_val = np.array(X_val), np.array(Y_val)
110 |     for model_class in (KNNModel_Naive, KNNModel_Heap, KNNModel_KDTree):
111 |         model = model_class(k_, X_train, Y_train)
112 |         print('=' * 5 + 'Model type %r' % (model,) + '=' * 5)
113 |         t1 = time.clock()
114 |         print('Accuracy is %r' % (model.validate(X_val, Y_val),))
115 |         print('Total used time is %r' % (time.clock() - t1,))
116 | 
def _TestVisualization():
    """
    Plot a four-class toy problem and the KD-tree KNN predictions on its validation points.

    Training points are drawn solid in their class colour. Validation points are
    drawn semi-transparent, filled with the true class colour and outlined with
    the predicted class colour.
    """
    import matplotlib.pyplot as plt
    # Imported here so the function does not depend on the import in the __main__ guard.
    import random
    k, m, n_train, n_val = 5, 4, 5, 2
    X_train, X_val, Y_train, Y_val = [], [], [], []
    color = ['c', 'g', 'b', 'r']

    def _sample(i):
        # One random point inside the unit cell of class i (cells form a 2x2 grid).
        x = random.uniform(int(i/2)+0.1, int(i/2)+0.9)
        y = random.uniform(i%2+0.1, i%2+0.9)
        return x, y

    for i in range(m):
        for _ in range(n_train):
            x, y = _sample(i)
            X_train.append((x, y))
            Y_train.append(i)
            plt.scatter(x, y, s=100, c=color[i])
        for _ in range(n_val):
            X_val.append(_sample(i))
            Y_val.append(i)
    X_train, X_val, Y_train, Y_val = np.array(X_train), np.array(X_val), np.array(Y_train), np.array(Y_val)
    for model_class in (KNNModel_KDTree,):
        model = model_class(k, X_train, Y_train)
        accuracy, label_val = model.validate(X_val, Y_val)
        for i in range(len(label_val)):
            plt.scatter(X_val[i, 0], X_val[i, 1], alpha=0.3, s=100, c=color[Y_val[i]], linewidths=2, edgecolors=color[label_val[i]])
    plt.grid()
    plt.xlim(0, 2)
    plt.ylim(0, 2)
    plt.show()
142 | 
if __name__ == '__main__':
    # These imports become module globals that the test helpers above can read.
    import random
    import time
    # Benchmark run
    _TestEfficiency()
    # Visualization
    # _TestVisualization()
148 | 
149 | 
150 | 


--------------------------------------------------------------------------------
/maximum_entropy.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | import numpy as np
  4 | import math
  5 | from collections import defaultdict
  6 | 
  7 | class MaxEntropy(object):
  8 | 
  9 |     def __init__(self, epsilon=0.001, n_iter=100):
 10 |         self.epsilon = epsilon
 11 |         self.n_iter = n_iter
 12 |         self.w = None
 13 |         # 训练样本量，特征维度，x 的经验边缘分布，(x,y) 的联合概率分布，特征函数关于 (x,y) 联合分布的期望值，特征函数 f(x,y)，记录标签集
 14 |         self.N, self.M, self.px, self.pxy, self.e_feat, self.feat_list, self.labels = \
 15 |             None, None, defaultdict(lambda: 0), defaultdict(lambda: 0), defaultdict(lambda: 0), [], []
 16 | 
 17 |     def fit(self, X_train, Y_train):
 18 |         self.N, self.M = X_train.shape
 19 |         self.labels = np.arange(np.bincount(Y_train).size)
 20 |         # 统计 (x,y) 的联合概率分布，x 的经验边缘分布
 21 |         feat_set = set()
 22 |         for X,y in zip(X_train, Y_train):
 23 |             X = tuple(X)
 24 |             self.px[X] += 1.0 / self.N
 25 |             self.pxy[(X, y)] += 1.0 / self.N
 26 |             for idx, val in enumerate(X):
 27 |                 key = (idx, val, y)
 28 |                 feat_set.add(key)
 29 |         self.feat_list = list(feat_set)
 30 |         self.w = np.zeros(len(self.feat_list))
 31 |         print(self.px, self.pxy)
 32 |         # 计算特征的经验期望值, E_p~(f) = Sum( P~(x,y) * f(x,y) )
 33 |         for X,y in zip(X_train, Y_train):
 34 |             X = tuple(X)
 35 |             for idx, val in enumerate(X):
 36 |                 key = (idx, val, y)
 37 |                 self.e_feat[key] += self.pxy[(X, y)]
 38 |         # 迭代找到最优参数 self.w
 39 |         for i in range(self.n_iter):
 40 |             delta = self._GIS(X_train, Y_train)
 41 |             print('Iter %d/%d, Delta %r' % (i, self.n_iter, np.max(np.abs(delta))))
 42 |             if np.max(np.abs(delta)) < self.epsilon:
 43 |                 break
 44 |             self.w += delta
 45 | 
 46 |     def predict(self, X):
 47 |         n, m = X.shape
 48 |         result_array = np.zeros(n)
 49 |         for i in range(n):
 50 |             output = self._cal_py_X(X[i, :])
 51 |             result_array[i] = max(output, key=output.get)
 52 |         return result_array
 53 | 
 54 |     def _GIS(self, X_train, Y_train):
 55 |         n_feat = len(self.feat_list)
 56 |         # 基于当前模型，获取每个特征估计期望, E_p(f) = Sum( P~(x) * P(y|x) * f(x,y) )
 57 |         delta = np.zeros(n_feat)
 58 |         estimate_feat = defaultdict(float)
 59 |         for X,y in zip(X_train, Y_train):
 60 |             X = tuple(X)
 61 |             py_x = self._cal_py_X(X)[y]
 62 |             for idx, val in enumerate(X):
 63 |                 key = (idx, val, y)
 64 |                 estimate_feat[key] += self.px[X] * py_x
 65 |         # 更新 delta
 66 |         for j in range(n_feat):
 67 |             feat_key = self.feat_list[j]
 68 |             e_feat_exp = self.e_feat[feat_key]
 69 |             e_feat_estimate = estimate_feat[feat_key]
 70 |             if e_feat_estimate == 0 or e_feat_exp / e_feat_estimate <= 0:
 71 |                 continue
 72 |             delta[j] = 1.0 / self.M * math.log(e_feat_exp / e_feat_estimate)
 73 |         delta /= np.sum(delta)
 74 |         return delta
 75 | 
 76 |     def _cal_py_X(self, X):
 77 |         # 计算条件分布概率 P(y|x)
 78 |         py_X = defaultdict(float)
 79 |         for y in self.labels:
 80 |             s = 0
 81 |             for idx, val in enumerate(X):
 82 |                 feat_key = (idx, val, y)
 83 |                 if feat_key in self.feat_list:
 84 |                     dim_idx = self.feat_list.index(feat_key)
 85 |                     s += self.w[dim_idx]
 86 |             py_X[y] = math.exp(s)
 87 |         normalizer = sum(py_X.values())
 88 |         for label, val in py_X.items():
 89 |             py_X[label] = val / normalizer
 90 |         return py_X
 91 | 
# Column names of the demo tables: age (feature 1), has a job (feature 2),
# owns a house (feature 3), credit rating (feature 4), class (label).
datalabel = np.array(['年龄(特征1)', '有工作(特征2)', '有自己的房子(特征3)', '信贷情况(特征4)', '类别(标签)'])
# Training table: one row per sample, columns in the order given by datalabel;
# the last column is the label.
train_sets = np.array([
                    ['青年', '否', '否', '一般', '否'],
                    ['青年', '否', '否', '好', '否'],
                    ['青年', '是', '否', '好', '是'],
                    ['青年', '是', '是', '一般', '是'],
                    ['青年', '否', '否', '一般', '否'],
                    ['中年', '否', '否', '一般', '否'],
                    ['中年', '否', '否', '好', '否'],
                    ['中年', '是', '是', '好', '是'],
                    ['中年', '否', '是', '非常好', '是'],
                    ['中年', '否', '是', '非常好', '是'],
                    ['老年', '否', '是', '非常好', '是'],
                    ['老年', '否', '是', '好', '是'],
                    ['老年', '是', '否', '好', '是'],
                    ['老年', '是', '否', '非常好', '是'],
                    ['老年', '否', '否', '一般', '否'],
                    ['青年', '否', '否', '一般', '是']])
# Validation table, same column layout as train_sets.
validate_sets = np.array([
    ['青年', '是', '是', '好', '是'],
    ['青年', '是', '否', '一般', '是'],
    ['中年', '否', '否', '一般', '否'],
    ['老年', '是', '是', '一般', '是'],
    ['青年', '否', '否', '非常好', '是'],
])
# Integer code for every categorical value in the tables:
# young / middle-aged / old -> 0 / 1 / 2, no / yes -> 0 / 1,
# fair / good / very good -> 0 / 1 / 2.
map_table = {'青年': 0, '中年': 1, '老年': 2,
             '否': 0, '是': 1,
             '一般': 0, '好': 1, '非常好': 2}
120 | 
if __name__ == '__main__':
    # Encode the training table to integers; the last column is the label.
    train_sets_encode = np.array([[map_table[cell] for cell in record] for record in train_sets])
    X_t = train_sets_encode[:, :-1]
    Y_t = train_sets_encode[:, -1]
    # Fit, then check the predictions on the data the model was trained on.
    model = MaxEntropy()
    model.fit(X_t, Y_t)
    res = model.predict(X_t)
    print('Ground truth   on train set: %r' % (Y_t,))
    print('Predict result on train set: %r' % (res.astype(int),))
    # Same encoding and check for the validation table.
    validate_sets_encode = np.array([[map_table[cell] for cell in record] for record in validate_sets])
    X_v = validate_sets_encode[:, :-1]
    Y_v = validate_sets_encode[:, -1]
    res = model.predict(X_v)
    print('Ground truth   on validate set: %r' % (Y_v,))
    print('Predict result on validate set: %r' % (res.astype(int),))


--------------------------------------------------------------------------------
/naive_bayes.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | import numpy as np
  4 | import matplotlib.pyplot as plt
  5 | import re
  6 | import time
  7 | 
  8 | 
  9 | class NaiveBayesClassificationModel(object):
 10 |     """
 11 |     朴素贝叶斯模型
 12 |     """
 13 |     def __init__(self, kw_set):
 14 |         # 关键字集合，即哪些单词是我们要当做是特征属性的单词
 15 |         self.kw_set = kw_set
 16 |         # P(类别) 样本类别本身在样本中出现的先验概率
 17 |         self.label_prior_prob = dict()
 18 |         # P(关键字|类别) 这一条件概率
 19 |         self.kw_posterior_prob = dict()
 20 | 
 21 |     def train(self, data):
 22 |         """
 23 |         训练模型
 24 |         :param data: 以 [[label] [input_text_words]] 的形式构成的list
 25 |         :return: None
 26 |         """
 27 |         # 计算条件概率 P(关键字|类别)
 28 |         for label, input_text_words in data:
 29 |             if label not in self.kw_posterior_prob:
 30 |                 self.kw_posterior_prob[label] = dict()
 31 |             if label not in self.label_prior_prob:
 32 |                 self.label_prior_prob[label] = 0
 33 |             self.label_prior_prob[label] += 1
 34 |             for word in input_text_words:
 35 |                 if word not in self.kw_set:
 36 |                     continue
 37 |                 if word not in self.kw_posterior_prob[label]:
 38 |                     self.kw_posterior_prob[label][word] = 0
 39 |                 self.kw_posterior_prob[label][word] += 1
 40 |         for label, kw_posterior_prob in self.kw_posterior_prob.items():
 41 |             for word in self.kw_set:
 42 |                 if word in kw_posterior_prob:
 43 |                     self.kw_posterior_prob[label][word] /= self.label_prior_prob[label] * 1.0
 44 |                 else:
 45 |                     self.kw_posterior_prob[label][word] = 0
 46 |         # 样本类别本身在样本中出现的先验概率 P(类别)
 47 |         for label in self.label_prior_prob:
 48 |             self.label_prior_prob[label] /= len(data) * 1.0
 49 | 
 50 |     def predict(self, input_text):
 51 |         """
 52 |         预测过程
 53 |         :param input_text: 处理过后的单词集合
 54 |         :return:
 55 |         """
 56 |         predicted_label = None
 57 |         max_prob = None
 58 |         for label in self.label_prior_prob:
 59 |             prob = 1.0
 60 |             for word in self.kw_set:
 61 |                 if word in input_text:
 62 |                     prob *= self.kw_posterior_prob[label][word]
 63 |                 else:
 64 |                     prob *= 1 - self.kw_posterior_prob[label][word]
 65 |             if max_prob is None or prob > max_prob:
 66 |                 predicted_label = label
 67 |                 max_prob = prob
 68 |         return predicted_label
 69 | 
 70 |     def validate(self, data):
 71 |         """
 72 |         验证模型效果
 73 |         :param data: 以 [[label] [input_text_words]] 的形式形构成的list
 74 |         :return:
 75 |         """
 76 |         # 计算 正误频次混淆矩阵
 77 |         mtc = {label_1: {label_2: 0 for label_2 in self.label_prior_prob} for label_1 in self.label_prior_prob}
 78 |         for gd_label, input_text_words in data:
 79 |             predicted_label = self.predict(input_text_words)
 80 |             mtc[gd_label][predicted_label] += 1
 81 |         # 计算 准确率混淆矩阵 和 总准确率
 82 |         acc = 0
 83 |         for gd_label in mtc:
 84 |             for predicted_label in mtc[gd_label]:
 85 |                 mtc[gd_label][predicted_label] /= len(data) * self.label_prior_prob[gd_label]
 86 |                 if predicted_label == gd_label:
 87 |                     acc += mtc[gd_label][predicted_label] * self.label_prior_prob[gd_label]
 88 |         return acc, mtc
 89 | 
 90 | 
 91 | def data_pre_process(data_file_name):
 92 |     """
 93 |     句子切分成单词，由于是英文，所以这里处理方式比较暴力，按照空格和除'之外的符号来切分了；然后全部转小写
 94 |     :param data_file_name:
 95 |     :return:
 96 |     """
 97 |     fh = open(data_file_name, encoding='utf-8')
 98 |     data = list()
 99 |     for line in fh.readlines():
100 |         label_text_pair = line.split('\t')
101 |         word_list = re.split('[^\'a-zA-Z]', label_text_pair[1])
102 |         word_in_doc_set = set()
103 |         for raw_word in word_list:
104 |             word = raw_word.lower()
105 |             if word == '':
106 |                 continue
107 |             word_in_doc_set.add(word)
108 |         # 组成 [[label] [input_text_words]] 的形式
109 |         data.append((label_text_pair[0], list(word_in_doc_set)))
110 |     return data
111 | 
112 | 
def statistic_key_word(data, cut_off=None):
    """
    Compute the document frequency of every word and keep the most frequent ones as keywords.

    :param data: list of (label, words) pairs, one per document
    :param cut_off: number of top-ranked words to keep; a falsy value keeps all of them
    :return: (w_count_list, kw_set) where w_count_list is a list of
        (word, document fraction) pairs sorted from most to least frequent and
        kw_set is the set of kept words
    """
    doc_total = len(data)
    # Number of documents each word occurs in
    doc_count = dict()
    for _, words in data:
        for word in words:
            doc_count[word] = doc_count.get(word, 0) + 1
    # Convert the counts to fractions of the document total
    doc_fraction = {word: count / (doc_total * 1.0) for word, count in doc_count.items()}
    # Rank the words from most to least frequent
    ranked = sorted(doc_fraction.items(), key=lambda pair: pair[1], reverse=True)
    # Drop everything after the cut-off rank
    keep = cut_off if cut_off else len(ranked)
    selected = {word for word, _ in ranked[:keep]}
    return ranked, selected
138 | 
139 | 
def shuffle(data, k):
    """
    Shuffle the data per label and split it into k cross-validation folds.

    Every label contributes int(count / k) samples to each fold; samples left
    over after that division are not used.

    :param data: list of (label, words) pairs
    :param k: number of folds
    :return: list of k (train_data, test_data) pairs, where test_data is one
        fold and train_data is the concatenation of all other folds
    """
    # Group the samples by label so every fold gets a similar label mix
    samples_by_label = dict()
    for label, words in data:
        samples_by_label.setdefault(label, list()).append((label, words))
    # Shuffle each label group and deal it out over the folds
    folds = [list() for _ in range(k)]
    for samples in samples_by_label.values():
        order = np.random.permutation(range(len(samples)))
        per_fold = int(len(samples) / k)
        for fold_idx, fold in enumerate(folds):
            start = fold_idx * per_fold
            for pos in range(start, start + per_fold):
                fold.append(samples[order[pos]])
    # Pair every fold with the union of the remaining folds
    result = list()
    for test_idx in range(k):
        train_data = []
        for fold_idx, fold in enumerate(folds):
            if fold_idx != test_idx:
                train_data.extend(fold)
        result.append((train_data, folds[test_idx]))
    return result
171 | 
172 | 
def draw(kw_count_list):
    """
    Plot the per-word values of kw_count_list to help choose a cut-off.

    :param kw_count_list: iterable of (key word, count) pairs
    :return: None
    """
    pairs = [(key_word, count) for key_word, count in kw_count_list]
    key_word_list = [key_word for key_word, _ in pairs]
    count_list = [count for _, count in pairs]

    plt.figure(figsize=(8, 4))
    plt.xlabel('Rank of key word')
    plt.ylabel('count of doc containing key word')
    plt.plot(key_word_list, count_list)
    # One tick every 1000 entries
    tick_positions = range(0, len(count_list), 1000)
    plt.xticks(tick_positions, tick_positions)
    plt.xlim(0, len(count_list))
    plt.ylim(0, 0.35)
    plt.grid(True)
194 | 
195 | 
if __name__ == '__main__':
    # K-fold cross-validation of the naive Bayes model on the SMS spam data set.
    file_name = './data/SMSSpamCollection.txt'
    raw_data_list = data_pre_process(file_name)
    fold_count = 4
    fold_data_list = shuffle(raw_data_list, fold_count)
    acc_average = 0
    cut_off = 500
    # time.clock() was removed in Python 3.8; perf_counter() is the interval timer.
    t1 = time.perf_counter()
    for fold, data_list in enumerate(fold_data_list):
        train_data_list, test_data_list = data_list
        # Keywords are chosen from the training part of the fold only
        word_count_list, key_word_set = statistic_key_word(train_data_list, cut_off=cut_off)
        nbc_model = NaiveBayesClassificationModel(key_word_set)
        nbc_model.train(train_data_list)
        accuracy, metric = nbc_model.validate(test_data_list)
        acc_average += accuracy
        print('Fold %r/%r - Acc:%r Metric:%r' % (fold+1, fold_count, accuracy, metric))
    print('Average Acc:%r Average Cost Time:%r' % (acc_average / len(fold_data_list),
            (time.perf_counter() - t1) / len(fold_data_list)))


--------------------------------------------------------------------------------
/logistic_regression.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | import numpy as np
  4 | import matplotlib.pyplot as plt
  5 | import re
  6 | import time
  7 | 
  8 | class RegressionModel(object):
  9 |     """
 10 |     逻辑回归模型
 11 |     """
 12 |     def __init__(self):
 13 |         self.W = None
 14 | 
 15 |     def train(self, x_train, y_train, learning_rate=0.1, num_iters=10000):
 16 |         """
 17 |         模型训练
 18 |         :param x_train: shape = num_train, dim_feature
 19 |         :param y_train: shape = num_train, 1
 20 |         :param learning_rate
 21 |         :param num_iters
 22 |         :return: loss_history
 23 |         """
 24 |         num_train, dim_feature = x_train.shape
 25 |         # w * x + b
 26 |         x_train_ = np.hstack((x_train, np.ones((num_train, 1))))
 27 |         self.W = 0.001 * np.random.randn(dim_feature + 1, 1)
 28 |         loss_history = []
 29 |         for i in range(num_iters+1):
 30 |             # linear transformation: w * x + b
 31 |             g = np.dot(x_train_, self.W)
 32 |             # sigmoid: 1 / (1 + e**-x)
 33 |             h = 1 / (1 + np.exp(-g))
 34 |             # cross entropy: 1/m * sum((y*np.log(h) + (1-y)*np.log((1-h))))
 35 |             loss = -np.sum(y_train * np.log(h) + (1 - y_train) * np.log(1 - h)) / num_train
 36 |             loss_history.append(loss)
 37 |             # dW = cross entropy' = 1/m * sum(h-y) * x
 38 |             dW = x_train_.T.dot(h - y_train) / num_train
 39 |             # W = W - dW
 40 |             self.W -= learning_rate * dW
 41 |             # debug
 42 |             if i % 100 == 0:
 43 |                 print('Iters: %r/%r Loss: %r' % (i, num_iters, loss))
 44 |         return loss_history
 45 | 
 46 |     def validate(self, x_val, y_val):
 47 |         """
 48 |         验证模型效果
 49 |         :param x_val: shape = num_val, dim_feature
 50 |         :param y_val: shape = num_val, 1
 51 |         :return: accuracy, metric
 52 |         """
 53 |         num_val, dim_feature = x_val.shape
 54 |         x_val_ = np.hstack((x_val, np.ones((num_val, 1))))
 55 |         # linear transformation: w * x + b
 56 |         g = np.dot(x_val_, self.W)
 57 |         # sigmoid: 1 / (1 + e**-x)
 58 |         h = 1 / (1 + np.exp(-g))
 59 |         # predict
 60 |         y_val_ = h
 61 |         y_val_[y_val_ >= 0.5] = 1
 62 |         y_val_[y_val_ < 0.5] = 0
 63 |         true_positive = len(np.where(((y_val_ == 1).astype(int) + (y_val == 1).astype(int) == 2) == True)[0]) * 1.0 / num_val
 64 |         true_negative = len(np.where(((y_val_ == 0).astype(int) + (y_val == 0).astype(int) == 2) == True)[0]) * 1.0 / num_val
 65 |         false_positive = len(np.where(((y_val_ == 1).astype(int) + (y_val == 0).astype(int) == 2) == True)[0]) * 1.0 / num_val
 66 |         false_negative = len(np.where(((y_val_ == 0).astype(int) + (y_val == 1).astype(int) == 2) == True)[0]) * 1.0 / num_val
 67 |         negative_instance = true_negative + false_positive
 68 |         positive_instance = false_negative + true_positive
 69 |         metric = np.array([[true_negative / negative_instance, false_positive / negative_instance],
 70 |                            [false_negative / positive_instance, true_positive / positive_instance]])
 71 |         accuracy = true_positive + true_negative
 72 |         return accuracy, metric
 73 | 
 74 | def feature_batch_extraction(d_list, kw_set):
 75 |     """
 76 |     特征批量提取
 77 |     :param d_list: 原始数据集
 78 |     :param kw_set: 关键字列表
 79 |     :return:
 80 |     """
 81 |     kw_2_idx_dict = dict(zip(list(kw_set), range(len(kw_set))))
 82 |     feature_data = np.zeros((len(d_list), len(kw_set)))
 83 |     label_data = np.zeros((len(d_list), 1))
 84 |     for i in range(len(d_list)):
 85 |         label, words = d_list[i]
 86 |         for word in words:
 87 |             if word in kw_2_idx_dict:
 88 |                 feature_data[i, kw_2_idx_dict[word]] = 1
 89 |         label_data[i] = 1 if label == 'spam' else 0
 90 |     return feature_data, label_data
 91 | 
 92 | 
 93 | def data_pre_process(data_file_name):
 94 |     """
 95 |     句子切分成单词，由于是英文，所以这里处理方式比较暴力，按照空格和除'之外的符号来切分了；然后全部转小写
 96 |     :param data_file_name:
 97 |     :return:
 98 |     """
 99 |     fh = open(data_file_name, encoding='utf-8')
100 |     data = list()
101 |     for line in fh.readlines():
102 |         label_text_pair = line.split('\t')
103 |         word_list = re.split('[^\'a-zA-Z]', label_text_pair[1])
104 |         word_in_doc_set = set()
105 |         for raw_word in word_list:
106 |             word = raw_word.lower()
107 |             if word == '':
108 |                 continue
109 |             word_in_doc_set.add(word)
110 |         # 组成 [[label] [input_text_words]] 的形式
111 |         data.append((label_text_pair[0], list(word_in_doc_set)))
112 |     return data
113 | 
114 | 
def statistic_key_word(data, cut_off=None):
    """
    Compute the document frequency of every word and keep the most frequent ones as key words.

    :param data: sequence of (label, words) pairs, one per document
    :param cut_off: number of top-ranked words to keep; a falsy value (None or 0) keeps all words
    :return: w_count_list, kw_set
             w_count_list - (word, share of documents containing it) pairs, highest share first
             kw_set - set of the words that survived the cut
    """
    doc_total = len(data)
    # Number of documents each word occurs in
    doc_hits = {}
    for _, words in data:
        for word in words:
            doc_hits[word] = doc_hits.get(word, 0) + 1
    doc_share = {word: hits / (doc_total * 1.0) for word, hits in doc_hits.items()}
    # Rank words from the most to the least widespread
    w_count_list = sorted(doc_share.items(), key=lambda pair: pair[1], reverse=True)
    # Drop the tail of rarely seen words
    if cut_off:
        keep = cut_off
    else:
        keep = len(w_count_list)
    kw_set = {word for word, _ in w_count_list[:keep]}
    return w_count_list, kw_set
140 | 
141 | 
def shuffle(data, k):
    """
    Shuffle the data and cut it into k stratified folds for cross validation.

    Samples are grouped by label first so that every fold receives roughly the same class mix.
    Every sample lands in exactly one validation fold; when a class size is not divisible by k,
    the first folds receive one extra sample of that class.

    :param data: sequence of (label, words) pairs
    :param k: number of folds
    :return: list of k (train_data, validate_data) tuples; train_data is the concatenation of the
             other k - 1 folds
    """
    # Group samples by label so that the class distribution can be balanced across folds
    label_data_dict = dict()
    for label, word_in_doc_set in data:
        label_data_dict.setdefault(label, list()).append((label, word_in_doc_set))
    # Shuffle every class separately and deal it out over the folds
    k_group_data_list = [list() for _ in range(k)]
    for label_data_list in label_data_dict.values():
        seq = np.random.permutation(range(len(label_data_list)))
        # array_split tolerates a remainder, so no sample is silently discarded
        for fold, fold_indices in enumerate(np.array_split(seq, k)):
            k_group_data_list[fold].extend(label_data_list[idx] for idx in fold_indices)
    k_fold_data_list = list()
    for i in range(k):
        train_data = []
        for j in range(k):
            if i != j:
                train_data.extend(k_group_data_list[j])
        k_fold_data_list.append((train_data, k_group_data_list[i]))
    return k_fold_data_list
173 | 
def draw_loss_list(loss_list):
    """
    Plot one training-loss curve per cut-off value on a shared figure and show it.

    The length of the first loss history fixes the x-axis ticks and range. The lengths of the
    histories are also printed as a debugging aid.

    :param loss_list: sequence of (cut_off, loss_history) pairs
    :return: None
    """
    plt.figure(figsize=(8, 4))
    plt.xlabel('Train iteration')
    plt.ylabel('Loss')
    reference_length = len(loss_list[0][1])
    print(reference_length)
    for cut_off, loss in loss_list:
        curve_length = len(loss)
        print(curve_length)
        plt.plot(range(0, curve_length), loss, label='cut off %r' % (cut_off,))
    # One tick every 1000 iterations
    tick_positions = range(0, reference_length, 1000)
    plt.xticks(tick_positions, tick_positions)
    plt.xlim(1, reference_length + 1)
    plt.ylim(0, 0.7)
    plt.legend()
    plt.show()
193 | 
def performance_with_cut_off():
    """
    Compare loss curves, accuracy and running time for several vocabulary cut-off sizes.

    Only the first cross-validation fold is used, so every cut-off is trained and validated on
    the same split. The results are written to './result/lr_loss_list.txt' (one Python literal
    per line), the loss curves are plotted, and the remaining figures are printed.

    :return: None
    """
    file_name = './data/SMSSpamCollection.txt'
    raw_data_list = data_pre_process(file_name)
    fold_count = 4
    fold_data_list = shuffle(raw_data_list, fold_count)
    # The split does not depend on the cut-off, so pick it once
    train_data_list, test_data_list = fold_data_list[0]
    loss_list = list()
    accuracy_list = list()
    metric_list = list()
    time_cost_list = list()
    for cut_off in (200, 500, 2000, 5000, 7956):
        # time.clock() was removed in Python 3.8; perf_counter() is the supported timer
        t1 = time.perf_counter()
        _, key_word_set = statistic_key_word(train_data_list, cut_off=cut_off)
        # Feature extraction
        train_feature, train_label = feature_batch_extraction(train_data_list, key_word_set)
        validate_feature, validate_label = feature_batch_extraction(test_data_list, key_word_set)
        # Train model
        lr_model = RegressionModel()
        loss_history = lr_model.train(train_feature, train_label, num_iters=10000)
        # Plain floats keep the text written below a valid Python literal on every numpy version
        loss_list.append((cut_off, [float(loss) for loss in loss_history]))
        accuracy, metric = lr_model.validate(validate_feature, validate_label)
        accuracy_list.append(float(accuracy))
        metric_list.append(metric)
        time_cost_list.append(time.perf_counter() - t1)
    with open('./result/lr_loss_list.txt', 'w') as f:
        f.write(str(loss_list) + '\n')
        f.write(str(accuracy_list) + '\n')
        f.write(str(time_cost_list) + '\n')
        # Nested lists stay on one line; the repr of a numpy array spans several lines
        f.write(str([np.asarray(metric).tolist() for metric in metric_list]))
    # Report straight from memory: reading the file back with eval() broke on the array repr
    # and executed whatever text the file happened to contain.
    draw_loss_list(loss_list)
    print(accuracy_list)
    print(time_cost_list)
    print(metric_list)
237 | 
def performance_with_fold():
    """
    Run 4-fold cross validation with a fixed vocabulary cut-off of 500 words.

    Prints accuracy and the rate matrix of every fold, then the average accuracy and the average
    time spent per fold.

    :return: None
    """
    file_name = './data/SMSSpamCollection.txt'
    raw_data_list = data_pre_process(file_name)
    fold_count = 4
    fold_data_list = shuffle(raw_data_list, fold_count)
    acc_average = 0
    cut_off = 500
    # time.clock() was removed in Python 3.8; perf_counter() is the supported timer
    t1 = time.perf_counter()
    for fold, (train_data_list, test_data_list) in enumerate(fold_data_list):
        _, key_word_set = statistic_key_word(train_data_list, cut_off=cut_off)
        # Feature extraction
        train_feature, train_label = feature_batch_extraction(train_data_list, key_word_set)
        validate_feature, validate_label = feature_batch_extraction(test_data_list, key_word_set)
        # Train model
        lr_model = RegressionModel()
        lr_model.train(train_feature, train_label)
        # Validate
        accuracy, metric = lr_model.validate(validate_feature, validate_label)
        acc_average += accuracy
        print('Fold %r/%r - Acc:%r Metric:%r' % (fold + 1, fold_count, accuracy, metric))
    print('Average Acc:%r Average Cost Time:%r' % (acc_average / len(fold_data_list),
            (time.perf_counter() - t1) / len(fold_data_list)))
265 | 
# Script entry point: when the file is executed directly, run the cut-off comparison experiment.
if __name__ == '__main__':
    performance_with_cut_off()
268 | 
269 | 


--------------------------------------------------------------------------------
/decision_tree.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | import numpy as np
  4 | 
  5 | class DTreeID3(object):
  6 | 
  7 |     def __init__(self, epsilon=0.0001):
  8 |         self.tree = Node()
  9 |         self.epsilon = epsilon
 10 | 
 11 |     def fit(self, X_train, Y_train):
 12 |         A_recorder = np.arange(X_train.shape[1])
 13 |         self._train(X_train, Y_train, self.tree, A_recorder)
 14 | 
 15 |     def predict(self, X):
 16 |         n = X.shape[0]
 17 |         Y = np.zeros(n)
 18 |         for i in range(n):
 19 |             Y[i] = self.tree.predict_classification(X[i, :])
 20 |         return Y
 21 | 
 22 |     def visualization(self):
 23 |         return self._visualization_dfs(self.tree)
 24 | 
 25 |     def _train(self, A, D, node, AR):
 26 |         # 1. 结束条件：若 D 中所有实例属于同一类，决策树成单节点树，直接返回
 27 |         if np.any(np.bincount(D) == len(D)):
 28 |             node.y = D[0]
 29 |             return
 30 |         # 2. 结束条件：若 A 为空，则返回单结点树 T，标记类别为样本默认输出最多的类别
 31 |         if A.size == 0:
 32 |             node.y = np.argmax(np.bincount(D))
 33 |             return
 34 |         # 3. 计算特征集 A 中各特征对 D 的信息增益，选择信息增益最大的特征 A_g
 35 |         max_info_gain, g = self._feature_choose_standard(A, D)
 36 |         # 4. 结束条件：如果 A_g 的信息增益小于阈值 epsilon，决策树成单节点树，直接返回
 37 |         if max_info_gain <= self.epsilon:
 38 |             node.y = np.argmax(np.bincount(D))
 39 |             return
 40 |         # 5. 对于 A_g 的每一可能值 a_i，依据 A_g = a_i 将 D 分割为若干非空子集 D_i，将当前结点的标记设为样本数最大的 D_i 对应
 41 |             # 的类别，即对第 i 个子节点，以 D_i 为训练集，以 A - {A_g} 为特征集，递归调用以上步骤，得到子树 T_i，返回 T_i
 42 |         node.label = AR[g]
 43 |         a_cls = np.bincount(A[:, g])
 44 |         new_A, AR = np.hstack((A[:, 0:g], A[:, g+1:])), np.hstack((AR[0:g], AR[g+1:]))
 45 |         for k in range(len(a_cls)):
 46 |             a_row_idxs = np.argwhere(A[:, g] == k).T[0].T
 47 |             child = Node(k)
 48 |             node.append(child)
 49 |             A_child, D_child = new_A[a_row_idxs, :], D[a_row_idxs]
 50 |             self._train(A_child, D_child, child, AR)
 51 | 
 52 |     def _feature_choose_standard(self, A, D):
 53 |         row, col = A.shape
 54 |         prob = self._cal_prob(D)
 55 |         prob = np.array([a if 0 < a <= 1 else 1 for a in prob])
 56 |         entropy = -np.sum(prob * np.log2(prob))
 57 |         max_info_gain_ratio = None
 58 |         g = None
 59 |         for j in range(col):
 60 |             a_cls = np.bincount(A[:, j])
 61 |             condition_entropy = 0
 62 |             for k in range(len(a_cls)):
 63 |                 a_row_idxs = np.argwhere(A[:, j] == k)
 64 |                 # H(D)
 65 |                 prob = self._cal_prob(D[a_row_idxs].T[0])
 66 |                 prob = np.array([a if 0 < a <= 1 else 1 for a in prob])
 67 |                 H_D = -np.sum(prob * np.log2(prob))
 68 |                 # H(D|A)=SUM(p_i * H(D|A=a_i))
 69 |                 condition_entropy += a_cls[k] / np.sum(a_cls) * H_D
 70 |             feature_choose_std = entropy - condition_entropy
 71 |             if max_info_gain_ratio is None or max_info_gain_ratio < feature_choose_std:
 72 |                 max_info_gain_ratio = feature_choose_std
 73 |                 g = j
 74 |         return max_info_gain_ratio, g
 75 | 
 76 |     def _cal_prob(self, D):
 77 |         statistic = np.bincount(D)
 78 |         prob = statistic / np.sum(statistic)
 79 |         return prob
 80 | 
 81 |     def _visualization_dfs(self, node, layer=0):
 82 |         prefix = '\n' if layer else ''
 83 |         output_str = [prefix + ' ' * 4 * layer, '%r+%r ' % (node.y, node.label)]
 84 |         if not node.child:
 85 |             return ''.join(output_str)
 86 |         for child in node.child:
 87 |             output_str.append(self._visualization_dfs(child, layer=layer + 1))
 88 |         return ''.join(output_str)
 89 | 
 90 | class DTreeC45(DTreeID3):
 91 | 
 92 |     def _feature_choose_standard(self, A, D):
 93 |         row, col = A.shape
 94 |         prob = self._cal_prob(D)
 95 |         prob = np.array([a if 0 < a <= 1 else 1 for a in prob])
 96 |         entropy = -np.sum(prob * np.log2(prob))
 97 |         max_info_gain_ratio = None
 98 |         g = None
 99 |         for j in range(col):
100 |             a_cls = np.bincount(A[:, j])
101 |             condition_entropy = 0
102 |             for k in range(len(a_cls)):
103 |                 a_row_idxs = np.argwhere(A[:, j] == k)
104 |                 # H(D) = -SUM(p_i * log(p_i))
105 |                 prob = self._cal_prob(D[a_row_idxs].T[0])
106 |                 prob = np.array([a if 0 < a <= 1 else 1 for a in prob])
107 |                 H_D = -np.sum(prob * np.log2(prob))
108 |                 # H(D|A)=SUM(p_i * H(D|A=a_i))
109 |                 condition_entropy += a_cls[k] / np.sum(a_cls) * H_D
110 |             feature_choose_std = entropy / (condition_entropy + 0.0001)
111 |             if max_info_gain_ratio is None or max_info_gain_ratio < feature_choose_std:
112 |                 max_info_gain_ratio = feature_choose_std
113 |                 g = j
114 |         return max_info_gain_ratio, g
115 | 
class DTreeCART(DTreeID3):
    """
    CART classification tree for features and labels coded as non-negative integers.

    Unlike ID3 / C4.5 every internal node is binary: samples whose chosen feature equals the
    chosen value go to child 0, all others go to child 1. Features are not removed after use.
    """

    def _train(self, A, D, node, AR):
        """Grow the subtree rooted at node; see _train_helper for the parameters."""
        self._train_helper(A, D, node, AR)

    def _train_helper(self, A, D, node, AR):
        """
        Recursively grow the subtree rooted at node.

        :param A: feature matrix of the samples that reached this node
        :param D: labels of those samples
        :param node: node to fill in
        :param AR: for every column of A, its column index in the original training matrix
        """
        # 1. Stop: all samples belong to one class -> single-node tree
        if np.any(np.bincount(D) == len(D)):
            node.y = D[0]
            return
        majority = np.argmax(np.bincount(D))
        # 2. Stop: nothing to split on
        if A.size == 0:
            node.y = majority
            return
        # 3. Unlike ID3 / C4.5, both the split feature and the split value are chosen
        min_gini, g, v, a_idx, other_idx = self._feature_choose_standard(A, D)
        prob = self._cal_prob(D)
        gini_D = 1 - np.sum(prob * prob)
        # 4. Stop: the split sends every sample to one side, or it lowers the Gini impurity by no
        #    more than epsilon. Each accepted split leaves both children strictly smaller than
        #    the parent, so the recursion always terminates.
        if g is None or a_idx.size == 0 or other_idx.size == 0 or gini_D - min_gini <= self.epsilon:
            node.y = majority
            return
        # 5. Binary split. Child 0 remembers the split value in x, so Node.predict_classification
        #    routes "feature == v" to it. Child 1 keeps x = None, which never equals a feature
        #    value, so it is reached through that method's fall-back to child[1].
        node.label = AR[g]
        for branch_value, row_idx in ((v, a_idx), (None, other_idx)):
            row_idx = row_idx.T[0].T
            child = Node(branch_value)
            node.append(child)
            self._train_helper(A[row_idx, :], D[row_idx], child, AR)

    def _feature_choose_standard(self, A, D):
        """
        Find the (feature, value) pair whose "== value" / "!= value" split has the smallest
        weighted Gini impurity.

        :return: (min_gini, feature column, split value, row indices of the "==" side, row indices
                 of the "!=" side); the index arrays have shape (n, 1) as returned by np.argwhere.
                 All five entries are None if A has no columns.
        """
        row, col = A.shape
        min_gini, g, v, a_idx, other_idx = None, None, None, None, None
        for j in range(col):
            column = A[:, j]
            a_cls = np.bincount(column)
            total = np.sum(a_cls)
            for k in range(len(a_cls)):
                # Partition the rows by the candidate split value
                a_row_idxs, other_row_idxs = np.argwhere(column == k), np.argwhere(column != k)
                a_prob = self._cal_prob(D[a_row_idxs].T[0])
                other_prob = self._cal_prob(D[other_row_idxs].T[0])
                # Gini(D_i) = 1 - SUM(p_c ** 2)
                a_gini_D = 1 - np.sum(a_prob * a_prob)
                other_gini = 1 - np.sum(other_prob * other_prob)
                # Gini(D, A) = weighted sum of the impurities of both sides
                weight = a_cls[k] / total
                gini_DA = weight * a_gini_D + (1 - weight) * other_gini
                if min_gini is None or min_gini > gini_DA:
                    min_gini, g, v, a_idx, other_idx = gini_DA, j, k, a_row_idxs, other_row_idxs

        return min_gini, g, v, a_idx, other_idx
169 | 
class DTreeRegressionCART(object):
    """
    Depth-limited CART regression tree.

    Every internal node splits one feature at the midpoint between its smallest and largest
    value; the feature whose split gives the smallest total squared error is used. Leaves
    predict the mean target of their samples.
    """

    def __init__(self, max_depth=1):
        """
        :param max_depth: number of split levels; nodes at this depth are always leaves
        """
        self.tree = Node()
        self.max_depth = max_depth

    def fit(self, X_train, Y_train):
        """
        Build the tree.

        :param X_train: shape = num_samples, num_features
        :param Y_train: shape = num_samples
        """
        # Start from a fresh root: growing into the previous root would append new children
        # behind the old ones, and prediction would keep using the old ones.
        self.tree = Node()
        A_recorder = np.arange(X_train.shape[1])
        self._train(X_train, Y_train, self.tree, A_recorder)

    def predict(self, X):
        """
        :param X: shape = num_samples, num_features
        :return: float array of shape (num_samples,) with the predicted targets
        """
        n = X.shape[0]
        Y = np.zeros(n)
        for i in range(n):
            Y[i] = self.tree.predict_regression(X[i, :])
        return Y

    def _train(self, A, D, node, AR, depth=0):
        """
        Grow the subtree rooted at node.

        :param A: feature matrix of the samples that reached this node
        :param D: targets of those samples
        :param node: node to fill in
        :param AR: for every column of A, its column index in the original training matrix
        :param depth: depth of node, the root being 0
        """
        # 1. Stop: depth limit reached, or all targets / all feature rows are identical
        if depth == self.max_depth or np.all(D == D[0]) or np.all(A == A[0]):
            node.y = np.mean(D)
            return
        # 2. Pick the splitting variable A_j and the splitting point s
        min_f, min_j, min_s, min_idx1, min_idx2 = None, None, None, None, None
        row, col = A.shape
        for j in range(col):
            a_col = A[:, j]
            # Simplification: s is the midpoint between the extreme values of the column
            s = (np.max(a_col) + np.min(a_col)) * 0.5
            R1_idx, R2_idx = np.flatnonzero(a_col <= s), np.flatnonzero(a_col > s)
            if R1_idx.size == 0 or R2_idx.size == 0:
                continue
            c1, c2 = np.mean(D[R1_idx]), np.mean(D[R2_idx])
            # Squared error of both regions around their own mean
            f1, f2 = np.sum(np.square(D[R1_idx] - c1)), np.sum(np.square(D[R2_idx] - c2))
            if min_f is None or min_f > f1 + f2:
                min_f, min_j, min_s, min_idx1, min_idx2 = f1 + f2, j, s, R1_idx, R2_idx
        if min_f is None:
            # No column can separate the samples
            node.y = np.mean(D)
            return
        # 3. Expand one level down: child 0 takes "<= s", child 1 takes "> s"
        node.label, node.s = AR[min_j], min_s
        for i, idx_list in enumerate((min_idx1, min_idx2)):
            child = Node(i)
            node.append(child)
            self._train(A[idx_list, :], D[idx_list], child, AR, depth+1)

    def visualization(self):
        """Return an indented text rendering of the tree, one node per line."""
        return self._visualization_dfs(self.tree)

    def _visualization_dfs(self, node, layer=0):
        """Render node as 'y+label+s' followed by its children, each on its own line indented 4 spaces per level."""
        prefix = '\n' if layer else ''
        output_str = [prefix + ' ' * 4 * layer, '%r+%r+%r' % (node.y, node.label, node.s)]
        if not node.child:
            return ''.join(output_str)
        for child in node.child:
            output_str.append(self._visualization_dfs(child, layer=layer + 1))
        return ''.join(output_str)
227 | 
class Node(object):
    """
    Tree node shared by the classification and regression trees in this file.

    A node whose y is not None acts as a leaf and returns y. Otherwise it forwards the sample to
    one of its children, based on the feature stored in label.
    """

    def __init__(self, x=None):
        # x: value handed over by whoever creates the node; predict_classification compares the
        #    children's x with the sample's feature value
        self.x = x
        # label: index of the feature this node looks at when routing a sample
        self.label = None
        # s: numeric threshold used by predict_regression
        self.s = None  # Number
        # y: prediction returned when the node is a leaf
        self.y = None
        self.child = []
        # data: not read or written by the methods of this class
        self.data = None

    def append(self, child):
        """Add child as the last child of this node."""
        self.child.append(child)

    def predict_classification(self, features):
        """
        Route features down the tree and return the y of the leaf that is reached.

        The first child whose x equals features[label] is followed; if there is none, the
        second child (index 1) is followed.
        """
        if self.y is not None:
            return self.y
        matching = next((branch for branch in self.child if branch.x == features[self.label]), None)
        target = self.child[1] if matching is None else matching
        return target.predict_classification(features)

    def predict_regression(self, features):
        """
        Route features down the tree and return the y of the leaf that is reached.

        Child 0 is followed when features[label] <= s, child 1 otherwise.
        """
        current = self
        while current.y is None:
            if features[current.label] <= current.s:
                current = current.child[0]
            else:
                current = current.child[1]
        return current.y
254 | 
255 | 
# Demo data: a small loan-approval table.
# Column names, in order: age (feature 1), has a job (feature 2), owns a house (feature 3),
# credit standing (feature 4), class (label).
datalabel = np.array(['年龄(特征1)', '有工作(特征2)', '有自己的房子(特征3)', '信贷情况(特征4)', '类别(标签)'])
# 16 samples, one per row: four categorical features followed by the yes/no label.
# The first and the last row carry the same features but different labels.
train_sets = np.array([
                    ['青年', '否', '否', '一般', '否'],
                    ['青年', '否', '否', '好', '否'],
                    ['青年', '是', '否', '好', '是'],
                    ['青年', '是', '是', '一般', '是'],
                    ['青年', '否', '否', '一般', '否'],
                    ['中年', '否', '否', '一般', '否'],
                    ['中年', '否', '否', '好', '否'],
                    ['中年', '是', '是', '好', '是'],
                    ['中年', '否', '是', '非常好', '是'],
                    ['中年', '否', '是', '非常好', '是'],
                    ['老年', '否', '是', '非常好', '是'],
                    ['老年', '否', '是', '好', '是'],
                    ['老年', '是', '否', '好', '是'],
                    ['老年', '是', '否', '非常好', '是'],
                    ['老年', '否', '否', '一般', '否'],
                    ['青年', '否', '否', '一般', '是']])
# Integer code of every categorical value. One table serves all columns:
# young / middle-aged / old -> 0 / 1 / 2, no / yes -> 0 / 1, fair / good / very good -> 0 / 1 / 2.
map_table = {'青年': 0, '中年': 1, '老年': 2,
             '否': 0, '是': 1,
             '一般': 0, '好': 1, '非常好': 2}
277 | 
if __name__ == '__main__':
    # Replace every categorical value by its integer code; the last column is the label
    train_sets_encode = np.array([[map_table[cell] for cell in sample] for sample in train_sets])
    X_t = train_sets_encode[:, :-1]
    Y_t = train_sets_encode[:, -1]
    rule = '=' * 20

    # Fit each classification tree on the full table and show how it labels the same samples
    classifiers = (DTreeID3(), DTreeC45(), DTreeCART())
    for model in classifiers:
        model.fit(X_t, Y_t)
        print(rule + type(model).__name__ + rule)
        print('\n<Tree Strucutre>')
        print(model.visualization())
        print('\n<Label Groundtruth>')
        print(Y_t)
        print('\n<Label Output>')
        predicted = model.predict(X_t)
        print(predicted.astype(int))
        print()

    # Regression tree on the same data, treating the 0/1 label as a numeric target
    model = DTreeRegressionCART(max_depth=2)
    print(rule + type(model).__name__ + rule)
    model.fit(X_t, Y_t)
    print('\n<Tree Strucutre>')
    print(model.visualization())
    print('\n<Label Output>')
    print(model.predict(X_t))
300 | 
301 | 
302 | 
303 | 
304 | 


--------------------------------------------------------------------------------