├── .idea
│   ├── .gitignore
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── statisticml.iml
├── Chapter02
│   └── perceptron.py
├── Chapter03
│   ├── KNN.py
│   └── kd_tree.py
├── Chapter04
│   └── naive_baysian.py
├── Chapter05
│   ├── classify_decision_tree.py
│   └── tree.py
├── Chapter06
│   ├── LR.py
│   ├── MEM.py
│   └── logistic_regression.py
├── Chapter07
│   └── SVM.py
├── Chapter08
│   └── Adaboost.py
├── Chapter09
│   └── GMM.py
├── Chapter10
│   ├── HMM.py
│   ├── backward.py
│   ├── baum_welch.py
│   ├── forward.py
│   └── viterbi.py
├── Chapter11
│   ├── BFGS.py
│   ├── CRF.py
│   ├── backward.py
│   └── forward.py
└── readme.md
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/Chapter02/perceptron.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 感知机(以下标号跟书上的章节标号没有关系,后同)
4 | 1. 感知机的出发点是什么:找到一个平面尽可能将正实例、负实例分别分到平面两侧,即对y=+1的点,w·x+b>0,反之<0
5 | 2. 平面的表示形式:y = w·x + b
6 | 3. 1中"尽可能"该如何表达:误分类点的个数越少越好。但这个个数不是w,b的导数,不易优化;改为所有误分类点和平面的总距离尽可能小
7 | 4. 误分类点怎么表达:-y(w·x+b)>0
8 | 5. 故目标函数:L(w,b)=-Σ_{(x,y)属于误分类点} [y(w·x+b)]
9 | 6. 最小化目标函数的方法,求偏导,梯度下降
10 | ------到此为止,足以写出代码,但还需要学习以下内容------
11 | 7. 算法的收敛性
12 | """
13 |
14 |
15 | def sgd_perceptron(w, b, x, y, lr=1):
16 | """
17 | 根据误分类实例(x,y)更新参数w, b。仅用于感知机
18 | """
19 | w = [w_i + lr * x_i * y for w_i, x_i in zip(w, x)]
20 | b += lr * y
21 | return w, b
22 |
23 |
24 | class Perceptron:
25 | def __init__(self, max_epoch=1000):
26 | self.w = []
27 | self.b = 0
28 | self.max_epoch = max_epoch
29 |
30 | def fit(self, X, Y):
31 | self.w = [0] * len(X[0])
32 |
33 | epoch = 0
34 | while True:
35 | epoch += 1
36 | all_right = True # 全都被正确分类
37 | for x, y in zip(X, Y):
38 |             if (sum([w_i * x_i for w_i, x_i in zip(self.w, x)]) + self.b) * y <= 0:  # 误分类点: y(w·x+b) <= 0
39 | print(f"误分类点为{(x, y)}")
40 | self.w, self.b = sgd_perceptron(self.w, self.b, x, y)
41 | all_right = False # 进入这个if意味着有点没有被正确分类,all_right置为False
42 | break
43 | # 如果经过上述的循环,确实每个点都正确分类,那么可以跳出while循环
44 | # 或者这个训练集就是无法通过一个超平面分割,那么循环再多次也无法达到all_right,我们设定一个最大循环次数
45 | if all_right or epoch > self.max_epoch:
46 | break
47 |
48 | def predict(self, X):
49 | return [self.predict_single(x) for x in X]
50 |
51 | def predict_single(self, x):
52 | if sum([w_i * x_i for w_i, x_i in zip(self.w, x)]) + self.b > 0:
53 | return 1
54 | else:
55 | return -1
56 |
57 |
58 | def demo():
59 | X = [
60 | [3, 3],
61 | [4, 3],
62 | [1, 1]
63 | ]
64 | Y = [
65 | 1,
66 | 1,
67 | -1
68 | ]
69 | clf = Perceptron(max_epoch=20)
70 | clf.fit(X, Y)
71 | print(f"w={clf.w}, b={clf.b}")
72 | print(f"预测结果{clf.predict(X)}")
73 |
74 |
75 | if __name__ == '__main__':
76 | demo()
77 |
--------------------------------------------------------------------------------
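Note on Chapter02/perceptron.py: the docstring reduces training to one SGD step per misclassified point, w ← w + lr·y·x and b ← b + lr·y. A minimal hand-worked check of a single update (hypothetical numbers, lr=1), reusing sgd_perceptron from the file above:

    # initially w=0, b=0, so for (x, y) = ([3, 3], 1) we have y*(w·x+b) = 0 <= 0, i.e. misclassified
    w, b = [0, 0], 0
    w, b = sgd_perceptron(w, b, [3, 3], 1, lr=1)
    print(w, b)  # -> [3, 3] 1: w moved by lr*y*x, b by lr*y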
/Chapter03/KNN.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | KNN
4 | 目的:对于给定目标点A,想要知道A的label
5 | 方法:找到训练集中最近的k个点,我们认为这k个点的类别最多的类就是这个点A的类别
6 | 也就是说KNN的核心概念只有3个:
7 | 1. 距离最近
8 | 2. k个
9 | 3. 用这k个点的频数最高的label作为目标点A的label预测。
10 |
11 | 问题:
12 | 但是如果遍历搜索所有训练集中的点,来找到最近的距离,这样很耗时。怎么办?
13 | 答案:
14 | 这就是KDTree的意义,它就是让我们搜索得快一点的办法
15 | 所以需要知道,KDTree本质上只是我们为了快速搜索最近k个点的实现手段,它本身不是KNN,只是KDTree这种数据结构具有快速
16 | 搜索最近k个点的优点。
17 |
18 | """
19 | from collections import Counter
20 |
21 | from Chapter03.kd_tree import KDTree
22 |
23 |
24 | class KNN:
25 | """KNN = k nearest neighbour"""
26 |
27 | def __init__(self, k):
28 | self.k = k
29 | self.model = None
30 |
31 | def fit(self, X, Y):
32 | """用KDTree方法来拟合数据,构建模型"""
33 | self.model = KDTree(X, Y)
34 |
35 | def predict_single(self, x):
36 | # 找到包含节点的叶节点
37 | knn_list = self.model.search(x, self.k)
38 | label_list = [i[1][1] for i in knn_list]
39 | label_count = Counter(label_list)
40 | return sorted(label_count.items(), key=lambda t: t[1])[-1][0]
41 |
42 | def predict(self, X):
43 | return [self.predict_single(x) for x in X]
44 |
45 |
46 | def demo():
47 | my_X = [
48 | [2, 3],
49 | [5, 4],
50 | [7, 2],
51 | [9, 6],
52 | [8, 1],
53 | [4, 7]
54 | ]
55 | my_Y = [
56 | 0,
57 | 1,
58 | 1,
59 | 0,
60 | 1,
61 | 0
62 | ]
63 | knn = KNN(2)
64 | knn.fit(my_X, my_Y)
65 | print(knn.model)
66 | print(knn.predict(my_X))
67 |
68 |
69 | def demo2():
70 | my_X = [
71 | [6.27, 5.5],
72 | [1.24, -2.86],
73 | [17.05, -12.79],
74 | [-6.88, -5.4],
75 | [-2.96, -0.5],
76 | [-4.6, -10.55],
77 | [-4.96, 12.61],
78 | [1.75, 12.26],
79 | [7.75, -22.68],
80 | [10.8, -5.03],
81 | [15.31, -13.16],
82 | [7.83, 15.70],
83 | [14.63, -0.35],
84 | ]
85 |
86 | my_Y = [
87 | 1,
88 | 1,
89 | 0,
90 | 1,
91 | 1,
92 | 0,
93 | 1,
94 | 1,
95 | 0,
96 | 1,
97 | 0,
98 | 1,
99 | 0
100 | ]
101 |
102 | knn = KNN(k=1)
103 | knn.fit(my_X, my_Y)
104 | print(knn.model)
105 | print(knn.predict(my_X))
106 |
107 |
108 | if __name__ == '__main__':
109 | demo2()
110 |
--------------------------------------------------------------------------------
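Note on Chapter03/KNN.py: the docstring stresses that the KDTree is only a faster way to run the same nearest-neighbour search. A brute-force reference (hypothetical helper, not part of the repo) is useful for sanity-checking KNN.predict on small data:

    import math
    from collections import Counter

    def knn_brute_force(X, Y, x, k):
        """O(n) reference: sort every training point by distance to x and vote over the k closest."""
        neighbours = sorted(zip(X, Y), key=lambda xy: math.dist(xy[0], x))[:k]
        return Counter(y for _, y in neighbours).most_common(1)[0][0]

    # e.g. knn_brute_force(my_X, my_Y, [6.27, 5.5], k=1) should agree with KNN(k=1).predict on the same point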
/Chapter03/kd_tree.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import math
3 | from heapq import heappop, heappush, nsmallest
4 |
5 |
6 | def find_middle(X, Y, dim):
7 | """
8 | 找到dim纬度上处于中位数的实例,返回这个实例和更小、更大的X,Y
9 | :param X:
10 | :param Y:
11 | :param dim:
12 | :return:
13 | """
14 | # print(X, Y)
15 | sorted_X_Y = sorted(zip(X, Y), key=lambda x_and_y: x_and_y[0][dim])
16 | middle_index = len(X) >> 1
17 | middle = sorted_X_Y[middle_index]
18 |
19 | smaller = sorted_X_Y[:middle_index]
20 | bigger = sorted_X_Y[middle_index + 1:]
21 |
22 | smaller_X, smaller_Y = [i[0] for i in smaller], [i[1] for i in smaller]
23 | bigger_X, bigger_Y = [i[0] for i in bigger], [i[1] for i in bigger]
24 | smaller_X, smaller_Y, bigger_X, bigger_Y = list(smaller_X), list(smaller_Y), list(bigger_X), list(bigger_Y)
25 | return middle, smaller_X, smaller_Y, bigger_X, bigger_Y
26 |
27 |
28 | def l2(x1, x2):
29 | return math.sqrt(sum([(x_1_i - x_2_i) ** 2 for x_1_i, x_2_i in zip(x1, x2)]))
30 |
31 |
32 | class Node:
33 | """Node的实例代表KDTree的一个节点"""
34 |
35 | def __repr__(self):
36 | return f"深度为{self.level}, 以第{self.dim}个特征作为分割标准, 实例点为{self.instance}"
37 |
38 | def __init__(self, instance, level=0):
39 | self.instance = instance
40 | self.level = level
41 | self.left = None
42 | self.right = None
43 | self.parent = None
44 |
45 | @property
46 | def dim(self):
47 | return self.level % len(self.instance)
48 |
49 | @property
50 | def is_leaf(self):
51 | return self.left is None and self.right is None
52 |
53 | @property
54 | def brother(self):
55 | if self.parent is None:
56 | return None
57 | if self.parent.left is self: # 当自己是父节点的左子节点,则兄弟节点为父节点的右节点
58 | return self.parent.right
59 | return self.parent.left # 反之
60 |
61 | def plane_distance(self, x):
62 | """节点所代表的超平面与目标点的距离"""
63 | return abs(x[self.dim] - self.instance[0][self.dim])
64 |
65 | def point_distance(self, x):
66 | return l2(self.instance[0], x)
67 |
68 | def find_leaf(self, x):
69 | node = self
70 | while not node.is_leaf:
71 | if node.left is None:
72 | node = node.right
73 | elif node.right is None:
74 | node = node.left
75 | elif x[node.dim] < node.instance[0][node.dim]:
76 | node = node.left
77 | else:
78 | node = node.right
79 | return node
80 |
81 |
82 | class KDTree:
83 | def __repr__(self):
84 | representation = ""
85 | queue = [self.root]
86 | while queue:
87 | node = queue.pop(0)
88 | representation += str(node)
89 | representation += '\n'
90 | if node.left:
91 | queue.append(node.left)
92 | if node.right:
93 | queue.append(node.right)
94 | return representation
95 |
96 | def __init__(self, X, Y):
97 | def _build_node(_X, _Y, _level, _dim):
98 | """递归地方式构建节点"""
99 | _middle, _smaller_X, _smaller_Y, _bigger_X, _bigger_Y = find_middle(_X, _Y, _dim)
100 | # print(_middle, _smaller_X, _smaller_Y, _bigger_X, _bigger_Y)
101 | _node = Node(_middle, _level)
102 | _next_level = _level + 1
103 | _next_dim = _next_level % len(_middle)
104 | if _smaller_X:
105 | _node.left = _build_node(_smaller_X, _smaller_Y, _next_level, _next_dim)
106 | if _bigger_X:
107 | _node.right = _build_node(_bigger_X, _bigger_Y, _next_level, _next_dim)
108 | return _node
109 |
110 | self.root = _build_node(X, Y, 0, 0)
111 | # 递归设置父节点
112 | queue = [self.root]
113 | while queue:
114 | node = queue.pop(0)
115 | if node.left:
116 | node.left.parent = node
117 | queue.append(node.left)
118 | if node.right:
119 | node.right.parent = node
120 | queue.append(node.right)
121 |
122 | def search(self, x, k):
123 | """找到最接近x的k个实例"""
124 |
125 | def backtrack(root, knn_list, is_visited):
126 | if root is self.root and root in is_visited:
127 | return
128 |
129 | node = root.find_leaf(x)
130 | is_visited.append(node)
131 | dist = node.point_distance(x)
132 |
133 | if len(knn_list) < k:
134 | # record = (-距离, 实例点),heappush构造的是小顶堆,而我们想知道的是最大距离点,故对距离取相反数
135 | heappush(knn_list, (-dist, node.instance))
136 | else:
137 | # 先比较这个叶节点是否比knn_list中最远点近,是的话替换,否则不换
138 | farthest_dist, farthest_point = nsmallest(1, knn_list)[0]
139 | if -farthest_dist > dist:
140 | heappop(knn_list)
141 | heappush(knn_list, (-dist, node.instance))
142 |
143 | # 往上寻找没有被访问过的父节点,并将兄弟节点取出备用
144 | brother = node.brother
145 | node = node.parent
146 | while node in is_visited and node.parent:
147 | brother = node.brother
148 | node = node.parent
149 | # 如果遍历到顶
150 | if node is self.root and node in is_visited:
151 | return
152 |
153 | while True:
154 | # 否则计算父节点是否能满足条件、并把父节点计入被访问列表
155 | is_visited.append(node)
156 | dist = node.point_distance(x)
157 | if len(knn_list) < k:
158 | # record = (距离, 实例点)
159 | # heappush构造的是小顶堆,而我们想知道的是最大距离点,故对距离取相反数
160 | heappush(knn_list, (-dist, node.instance))
161 | else:
162 | # 先比较这个叶节点是否比knn_list中最远点近,是的话替换,否则不换
163 | farthest_dist, farthest_point = nsmallest(1, knn_list)[0]
164 | if -farthest_dist > dist:
165 | heappop(knn_list)
166 | heappush(knn_list, (-dist, node.instance))
167 |
168 | # 再看超平面
169 | farthest_dist, farthest_point = nsmallest(1, knn_list)[0]
170 | if (node.plane_distance(x) < -farthest_dist or len(knn_list) < k) and brother is not None:
171 | backtrack(brother, knn_list, is_visited)
172 | break
173 | else:
174 | while node in is_visited and node.parent:
175 | brother = node.brother
176 | node = node.parent
177 | # 如果遍历到顶
178 | if node is self.root and node in is_visited:
179 | return
180 |
181 | _knn_list = []
182 | _is_visited = []
183 | backtrack(self.root, _knn_list, _is_visited)
184 | print(_knn_list)
185 | return _knn_list
186 |
--------------------------------------------------------------------------------
/Chapter04/naive_baysian.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from collections import Counter
3 |
4 | """
5 | 朴素贝叶斯
6 | 0. 在实现朴素贝叶斯的时候,笔者已经是第N次回顾朴素贝叶斯了,但直到这一次才开始有意识地将它与上一章的感知机做一些对比,
7 | 它也给了笔者一些收获。这种与前面的模型/方法做比较的意识,将贯彻整个repository。
8 | 1. 朴素贝叶斯的出发点是什么:当已知特征x的条件下,求概率最高的y,所以需要对P(y|x)建模。
9 | 而回顾下上一章,感知机的建模是f(x)。
10 | 2. 怎么建模: 根据贝叶斯公式:P(y|x)=P(x,y) / P(x)
11 | =[P(x|y) * P(y)] / [Σ_{y_i}P(x,y_i)]
12 | =[P(x|y) * P(y)] / [Σ_{y_i}P(x|y_i) * P(y_i)]
13 | 故需要对P(x|y)和P(y)建模 --> 为什么不能直接对P(y|x)建模,而可以反过来对P(x|y)建模 (其实可以!看看逻辑斯蒂回归)
14 | 但这里的任务转化为P(x|y)和P(y)建模后,这个模型必须得具备为P(x|y)和P(y)建模的能力才说得过去!
15 | 这就是"朴素贝叶斯法"的贝叶斯。
16 | 3. 进一步地,在P(x|y)中,x可能是多维特征,实际上这些特征可能是有关系的。
17 | 但朴素贝叶斯做了一个简单的、天真的、朴素的假设:特征之间没有关系。
18 | 这就是"朴素贝叶斯"的朴素之处。但是这个朴素的假设有什么用呢 (问题A的答案,下面揭晓)
19 | 4. 剩下的问题就是如何为P(x|y)和P(y)建模了
20 | 4.1 使用极大似然估计法估计相应的概率
21 | 4.1.2 P(y)用频数即可
22 | 4.1.3 P(x|y) = P(x1, x2, ..., xn|y)
23 | = P(x1|y) * P(x2|y) * ... * P(xn|y) (从上一行到这一行就是基于朴素的"特征之间没有关系"的假设)
24 |              = [频数(x1, y) / 频数(y)] * [频数(x2, y) / 频数(y)] * ... * [频数(xn, y) / 频数(y)]
25 | 这里就是朴素假设的用途了,通过这个朴素假设,我们可以通过简单地估计各个P(xi|y)来达到目的
26 | # todo: P(y|x) = P(y|x1) * P(y|x2) * ... * P(y|xn)???
27 | 4.2 使用贝叶斯估计来避免概率为0的情况
28 | 5. 对比下感知机和朴素贝叶斯法。朴素贝叶斯有一步很特别,就是它对P(x,y)建模了,
29 | 换句话说,原则上它掌握了(x,y)的生成规律,可以用来生成数据。我们把这类模型叫做生成模型
30 | 后续的逻辑斯蒂回归直接对P(y|x)建模,则没有这个生成的过程!
31 | todo: 为什么我们需要对这个特性那么在意?有什么好处吗?
32 | """
33 |
34 |
35 | class NaiveBaysian:
36 | def __init__(self):
37 |         """
38 |         初始化模型参数
39 |         先验概率、条件概率、y的取值集合都在fit时根据训练集填充
40 |         """
41 | self.prior_proba = {}
42 | self.conditional_proba = []
43 | self.y_options = {}
44 |
45 | def fit(self, X, Y):
46 | Y_counts = dict(Counter(Y))
47 | self.prior_proba = {y: count / len(Y) for y, count in Y_counts.items()}
48 | self.y_options = set(Y)
49 |
50 | for i in range(len(X[0])):
51 | X_i = [x[i] for x in X]
52 | X_i_Y = list(zip(X_i, Y))
53 | X_i_Y_count = dict(Counter(X_i_Y))
54 | # P(xi, yi)
55 | X_i_Y_proba = {x_i_y: count / len(Y) for x_i_y, count in X_i_Y_count.items()}
56 | # P(xi|yi) = P(xi,yi) / P(yi)
57 | conditional_proba = {x_i_y: proba / self.prior_proba[x_i_y[1]] for x_i_y, proba in # x_i_y[1]就是y
58 | X_i_Y_proba.items()}
59 | self.conditional_proba.append(conditional_proba)
60 | # 最后self.conditional_proba形如
61 | # [
62 | # 第一个特征的条件概率:P(x1|y)={(x1=a, y): p1, (x1=b,y): p2, ..., (x1=z,y): pn}, # 这里的(x1=a,y)代表x1=a|y
63 | # 第二个特征的条件概率:P(x2|y)={(x1=a, y): p1, (x2=b,y): p2, ..., (x2=z,y): pn},
64 | # ...
65 | # 最后的特征的条件概率:P(xm|y)={(xm=a, y): p1, (xm=b,y): p2, ..., (xm=z,y): pn},
66 | # ]
67 |
68 | def predict_single(self, x):
69 | assert len(x) == len(self.conditional_proba)
70 | y_result = 0
71 | proba_result = 0
72 | for y in self.y_options:
73 | prior_proba = self.prior_proba.get(y, 0) # 这里要防止训练集中没有出现y
74 | conditional_proba = 1
75 | for idx, x_i in enumerate(x):
76 | conditional_proba *= self.conditional_proba[idx].get((x_i, y), 0) # 这里要防止训练集中没有出现(x_i, y)
77 | proba = prior_proba * conditional_proba
78 | if proba > proba_result:
79 | proba_result = proba
80 | y_result = y
81 | return y_result
82 |
83 | def predict(self, X):
84 | return [self.predict_single(x) for x in X]
85 |
86 |
87 | def demo():
88 | X = [
89 | [1, 'S'],
90 | [1, 'M'],
91 | [1, 'M'],
92 | [1, 'S'],
93 | [1, 'S'],
94 | [2, 'S'],
95 | [2, 'M'],
96 | [2, 'M'],
97 | [2, 'L'],
98 | [2, 'L'],
99 | [3, 'L'],
100 | [3, 'M'],
101 | [3, 'M'],
102 | [3, 'L'],
103 | [3, 'L'],
104 | ]
105 | Y = [
106 | -1,
107 | -1,
108 | -1,
109 | -1,
110 | -1,
111 | -1,
112 | -1,
113 | 1,
114 | 1,
115 | 1,
116 | 1,
117 | 1,
118 | 1,
119 | 1,
120 | -1
121 | ]
122 | nb = NaiveBaysian()
123 | nb.fit(X, Y)
124 | prediction = nb.predict(X)
125 | print(prediction)
126 | print(f"正确率为{sum([1 if i == j else 0 for i, j in zip(prediction, Y)]) / len(prediction)}")
127 |
128 |
129 | if __name__ == '__main__':
130 | demo()
131 |
--------------------------------------------------------------------------------
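Note on Chapter04/naive_baysian.py: point 4.2 of the docstring mentions Bayesian estimation (Laplace smoothing) to avoid zero probabilities, but the class above simply falls back to probability 0 via .get(..., 0). A sketch of the smoothed estimate with λ = 1 (hypothetical helper, not wired into the class):

    def smoothed_conditional_proba(X_i, Y, x_value, y_value, lam=1.0):
        """P_λ(X_i = x_value | Y = y_value) = (count(x_value, y_value) + λ) / (count(y_value) + S_i*λ),
        where S_i is the number of distinct values feature i takes in the training set."""
        S_i = len(set(X_i))
        joint = sum(1 for x, y in zip(X_i, Y) if x == x_value and y == y_value)
        prior = sum(1 for y in Y if y == y_value)
        return (joint + lam) / (prior + S_i * lam)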
/Chapter05/classify_decision_tree.py:
--------------------------------------------------------------------------------
1 | # !/Applications/anaconda/envs/4PyCharm/bin/python3.4
2 | # -*- coding: utf-8 -*-
3 | import math
4 | from collections import Counter, deque
5 | from functools import reduce
6 |
7 |
8 | def calculate_entropy(labels):
9 | """
10 | 计算label集的熵
11 | :param labels: list
12 | :return: 熵: float
13 | """
14 | total = len(labels)
15 |
16 | # 每个类的数量,计算熵的时候,类本身并不重要,重要的是每个类各种的数量/比例
17 | counter_of_every_class = Counter(labels).values()
18 | # 每个类的比例
19 | scale_of_every_class = map(lambda x: x / total, counter_of_every_class)
20 | res = sum(map(lambda i: -i * math.log(i), scale_of_every_class))
21 | # my_print(res)
22 | return res
23 |
24 |
25 | class _Node:
26 | """
27 | 树的节点,每个节点用来fit一个特征
28 | """
29 |
30 | def __init__(self, epsilon=0.1):
31 | self.epsilon = epsilon
32 | self.label = None
33 | self.idx_feature = None # idx_feature用来记载这个节点选择了哪个特征分量来拆分树
34 | self.child_dict = {} # 选择了特征分量,按照这个特征分量的n个取值划分出若干子集合,这个节点的子节点分别一个子集合
35 |
36 | def fit(self, features, labels):
37 | """
38 | :param features: X = 样本 * [特征0, 特征1, ……]
39 | :param labels: Y = 样本 * label
40 | :return:
41 | """
42 |
43 | assert len(features) == len(labels), "X和Y的个数不一致"
44 |
45 | # 当labels都为一样的,这个节点就有自己的label了,没有子节点
46 | if len(set(labels)) == 1:
47 | self.label = labels[0]
48 | return
49 |
50 | # 如果已经没有特征的话,跟上面一样
51 | num_features = len(features[0]) # 特征的个数
52 | if not num_features:
53 | self.label = Counter(labels).most_common(1)[0][0] # 计数,然后选最多的那个
54 | return
55 |
56 | """
57 | 计算每个特征列的信息熵
58 | """
59 | cols = [[sample[idx] for sample in features] for idx in range(num_features)]
60 | entropy_list = []
61 | for col in cols: # 对于每个特征列
62 | set_of_types_in_col = set(col)
63 | total_entropy = 0
64 | for s in set_of_types_in_col: # 对于这个特征列的每个取值
65 | subset = [label for c, label in zip(col, labels) if c == s]
66 | total_entropy += calculate_entropy(subset) * (len(subset) / len(labels))
67 | entropy_list.append(total_entropy)
68 |
69 | # 挑选出【使得分割后集合的信息熵最少】的特征
70 | min_idx, min_entropy = reduce(lambda x, y: x if x[1] < y[1] else y, enumerate(entropy_list))
71 |
72 | """
73 | 这个特征会使得互信息最大(信息不确定性的减少最多)
74 | 如果连这个互信息都达不到epsilon,我们认为每个特征都提供不了多少信息,那再继续分支也没有什么价值
75 | 所以直接取占比最高的类作为这个节点的label
76 | """
77 | if calculate_entropy(labels) - min_entropy < self.epsilon:
78 | self.label = Counter(labels).most_common(1)[0][0]
79 | return
80 |
81 | # 否则就挑选这个特征
82 | self.idx_feature = min_idx
83 |
84 | # 挑选之后,按照这个特征的n个取值,它会产生n个子节点
85 | # 同时我们需要划分集合
86 | # 每个子节点(child)对应处理一个子集(sub_feature和sub_labels)
87 | set_n_value = set([sample[min_idx] for sample in features]) # n个取值的集合,形如{0, 1}、{1, 2, 3}这样
88 | for value in set_n_value:
89 | sub_features = [] # 子特征集
90 | sub_labels = [] # 子label集
91 | for sample, label in zip(features, labels):
92 | if sample[min_idx] == value:
93 | sub_features.append(sample[:min_idx] + sample[min_idx + 1:])
94 | sub_labels.append(label)
95 | child = _Node(epsilon=self.epsilon)
96 | child.fit(sub_features, sub_labels)
97 | self.child_dict[value] = child
98 |
99 | def __str__(self):
100 | node_information = f"node's idx_feature={self.idx_feature}\n" \
101 | f"node's child_dict={self.child_dict}\n" \
102 | f"node's label={self.label}\n"
103 | return node_information
104 |
105 |
106 | class ClassifyDecisionTree(_Node):
107 | """
108 | 分类决策树
109 | """
110 |
111 | def predict(self, feature):
112 | """
113 | 预测数据
114 | :param feature: 特征
115 | :return: 预测的结果
116 | """
117 | print('*' * 10, '预测正在进行', '*' * 10)
118 | node = self
119 | while node.label is None: # 注意不能用while not node.label,因为label可能为0
120 | to_delete_idx = node.idx_feature
121 | node = node.child_dict[feature[node.idx_feature]]
122 | feature.pop(to_delete_idx)
123 | return node.label
124 |
125 |
126 | if __name__ == "__main__":
127 | # 《统计学习方法》的贷款申请样本数据表
128 | sample_with_labels = [
129 | [[0, 0, 0, 0], 0],
130 | [[0, 0, 0, 1], 0],
131 | [[0, 1, 0, 1], 1],
132 | [[0, 1, 1, 0], 1],
133 | [[0, 0, 0, 0], 0],
134 | [[1, 0, 0, 0], 0],
135 | [[1, 0, 0, 1], 0],
136 | [[1, 1, 1, 1], 1],
137 | [[1, 0, 1, 2], 1],
138 | [[1, 0, 1, 2], 1],
139 | [[2, 0, 1, 2], 1],
140 | [[2, 0, 1, 1], 1],
141 | [[2, 1, 0, 1], 1],
142 | [[2, 1, 0, 2], 1],
143 | [[2, 0, 0, 0], 0],
144 | ]
145 | test_features = [i[0] for i in sample_with_labels]
146 | test_labels = [i[1] for i in sample_with_labels]
147 | cdt = ClassifyDecisionTree(epsilon=0.1)
148 | cdt.fit(test_features, test_labels)
149 | print(cdt.predict([0, 1, 0, 0]))
150 |
151 | """
152 | 用队列来先序遍历决策树的节点,打印出来
153 | 方便按照打印信息来验证自己的树
154 | """
155 | q = deque([cdt])
156 | while q:
157 |         if q[0].label is not None:  # label可能为0,不能直接用真值判断
158 | print(q.popleft())
159 | else:
160 | q.extend(q[0].child_dict.values())
161 | print(q.popleft())
162 |
--------------------------------------------------------------------------------
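Note on Chapter05/classify_decision_tree.py: choosing the feature with the smallest conditional entropy is the same as maximising information gain g(D, A) = H(D) − H(D|A). A small sketch (hypothetical helper) that makes the criterion explicit, reusing calculate_entropy from the file above:

    def information_gain(features, labels, idx):
        """g(D, A_idx) = H(labels) - Σ_v |D_v|/|D| * H(labels restricted to D_v)."""
        col = [sample[idx] for sample in features]
        conditional = 0.0
        for v in set(col):
            subset = [label for c, label in zip(col, labels) if c == v]
            conditional += calculate_entropy(subset) * len(subset) / len(labels)
        return calculate_entropy(labels) - conditional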
/Chapter05/tree.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn.metrics import mean_squared_error
6 |
7 | INF = np.inf
8 | EPSILON = 1e-2
9 |
10 |
11 | def _best_split(X, Y):
12 | """找到最佳的切分特征j和对应的切分点s"""
13 | rows, cols = X.shape
14 | if rows <= 1:
15 | return 0, X[0, 0], 0, 0, 0
16 | best_j = -1
17 | best_s = INF
18 | c1 = INF
19 | c2 = INF
20 | best_loss = INF
21 | for j in range(cols):
22 | for i in range(rows):
23 | s = X[i, j]
24 | R1 = Y[X[:, j] <= s]
25 | R2 = Y[X[:, j] > s]
26 | c1_hat = R1.mean()
27 | c2_hat = R2.mean()
28 | loss = sum((R1 - c1_hat) ** 2) + sum((R2 - c2_hat) ** 2)
29 | if loss < best_loss:
30 | best_j = j
31 | best_s = s
32 | c1 = c1_hat
33 | c2 = c2_hat
34 | best_loss = loss
35 |
36 | return best_j, best_s, c1, c2, best_loss
37 |
38 |
39 | class Node:
40 | def __repr__(self):
41 | return f"划分特征={self.j} 划分点={self.s} 左标签为{self.c1} 右标签为{self.c2} loss为{self.loss}"
42 |
43 | def __init__(self, j, s, c1, c2, loss, left=None, right=None):
44 | self.j = j
45 | self.s = s
46 | self.c1 = c1
47 | self.c2 = c2
48 | self.loss = loss
49 | self.left = left
50 | self.right = right
51 | # self.is_leaf = True
52 |
53 |
54 | class CartRegressor:
55 | def __init__(self, max_depth=3):
56 | self._tree = None
57 | self.max_depth = max_depth
58 |         self.n_nodes = 2 ** max_depth - 1  # CART是二叉树,深度为max_depth时节点数不超过2^max_depth - 1
59 |
60 | def fit(self, X, Y, max_depth):
61 |         """自顶向下地构建树,逐个节点fit"""
62 |         self.n_nodes = 2 ** max_depth - 1
63 | self._tree = Node(*_best_split(X, Y))
64 | # self._tree = Node(-1, INF, INF, INF)
65 | n_nodes = 1
66 | node_list = [(self._tree, X, Y)] # (节点,节点需要fit的X,Y)
67 | while node_list:
68 | node, x, y = node_list.pop(0)
69 | # print(node)
70 | # 如果这个节点的loss为0,就不用再细分了
71 | if node.loss <= EPSILON:
72 | # node.is_leaf = True
73 | continue
74 | part1_index = x[:, node.j] <= node.s
75 | part2_index = x[:, node.j] > node.s
76 | x1, y1 = x[part1_index], y[part1_index]
77 | x2, y2 = x[part2_index], y[part2_index]
78 | if n_nodes == self.n_nodes:
79 | continue
80 | left = Node(*_best_split(x1, y1))
81 | node_list.append((left, x1, y1))
82 | node.left = left
83 | n_nodes += 1
84 | right = Node(*_best_split(x2, y2))
85 | node_list.append((right, x2, y2))
86 | node.right = right
87 | n_nodes += 1
88 |
89 | def predict_single(self, x):
90 | node = self._tree
91 | while node.left or node.right:
92 | node = node.left if x[node.j] <= node.s else node.right
93 | return node.c1 if x[node.j] <= node.s else node.c2
94 |
95 | def predict(self, X):
96 | return np.asarray([self.predict_single(x) for x in X])
97 |
98 | def score(self, X, Y):
99 | return mean_squared_error(self.predict(X), Y)
100 |
101 |
102 | def main():
103 | np.random.seed(0)
104 | x = np.linspace(-10, 10, 100).reshape((-1, 1))
105 | y = np.linspace(-20, 20, 100) + np.random.normal(loc=0, scale=3.5, size=(100,))
106 | # x, y = make_regression(n_samples=500, n_features=2, n_informative=2)
107 | t = CartRegressor(4)
108 | df = pd.DataFrame()
109 | df['x'] = x.reshape((-1,))
110 | df = df.set_index('x')
111 |
112 | for max_depth in range(2, 8):
113 | t.fit(x, y, max_depth=max_depth)
114 | print(f"MAX_DEPTH_{max_depth}: {t.score(x, y)}")
115 | y_predict = t.predict(x)
116 |
117 | df['MAX_DEPTH_{}'.format(max_depth)] = y_predict
118 |
119 | plt.figure(figsize=(12, 7))
120 | plt.scatter(x, y, s=10, color='r')
121 |
122 | for max_depth in range(2, 8):
123 | col_name = 'MAX_DEPTH_{}'.format(max_depth)
124 | plt.plot(x, df[col_name], label=col_name)
125 | # plt.show()
126 | plt.title('Regression Tree')
127 | plt.legend(loc='best')
128 | plt.xlabel('x')
129 | plt.ylabel('y')
130 | plt.show()
131 |
132 |
133 | if __name__ == '__main__':
134 | main()
135 |
--------------------------------------------------------------------------------
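Note on Chapter05/tree.py: for reference, _best_split is a direct implementation of the least-squares splitting criterion of a CART regression tree, where c1 and c2 are the means of the two regions:

    \min_{j,s}\Big[\min_{c_1}\sum_{x_i\in R_1(j,s)}(y_i-c_1)^2+\min_{c_2}\sum_{x_i\in R_2(j,s)}(y_i-c_2)^2\Big],
    \qquad \hat c_m=\frac{1}{|R_m(j,s)|}\sum_{x_i\in R_m(j,s)}y_i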
/Chapter06/LR.py:
--------------------------------------------------------------------------------
1 | # !/Applications/anaconda/envs/4PyCharm/bin/python3.4
2 | # -*- coding: utf-8 -*-
3 | import numpy as np
4 | import torch
5 | from sklearn.datasets import load_iris
6 | from sklearn.preprocessing import LabelBinarizer
7 | from torch.nn import Parameter
8 | from torch.optim import SGD
9 |
10 |
11 | class LR:
12 | def __init__(self):
13 | self.w = torch.tensor(0.)
14 | self.b = torch.tensor(0.)
15 | self.step = 100
16 |
17 | def fit(self, X, Y):
18 |         # as_tensor可直接处理numpy数组并指定dtype,避免torch.tensor(tensor)的拷贝与警告
19 |         X, Y = torch.as_tensor(X, dtype=torch.float32), torch.as_tensor(Y, dtype=torch.float32)
20 | n_feature = len(X[0])
21 | n_class = len(Y[0])
22 | self.w = Parameter(torch.zeros((n_feature, n_class - 1)), requires_grad=True)
23 | self.b = Parameter(torch.zeros((n_class - 1,)), requires_grad=True)
24 | optimizer = SGD([self.w, self.b], lr=.1)
25 | Y = Y.argmax(dim=1)
26 |
27 | for _ in range(self.step):
28 | optimizer.zero_grad()
29 |
30 | Y_hat_along_label = torch.exp(torch.matmul(X, self.w) + self.b)
31 | Y_hat_along_label = torch.cat([Y_hat_along_label, torch.ones((len(Y), 1))], 1)
32 | denominator = Y_hat_along_label.sum(dim=1)
33 | distribution = Y_hat_along_label / denominator[:, None]
34 |             # NLLLoss的输入应为log概率,故先取对数(等价于对logits使用CrossEntropyLoss)
35 |             loss = torch.nn.NLLLoss()(torch.log(distribution), Y)
36 | loss.backward()
37 | optimizer.step()
38 |
39 | def predict_prob(self, X):
40 |         X = torch.as_tensor(X, dtype=torch.float32)
41 |         X = torch.atleast_2d(X)  # 兼容单个样本(1维)输入
42 | Y_hat_along_label = torch.exp(torch.matmul(X, self.w) + self.b)
43 | Y_hat_along_label = torch.cat([Y_hat_along_label, torch.ones((len(Y_hat_along_label), 1))], 1)
44 | denominator = Y_hat_along_label.sum(dim=1)
45 | distribution = Y_hat_along_label / denominator[:, None]
46 | return distribution
47 |
48 |     def predict_single(self, x):
49 |         prob = self.predict_prob(x)[0]
50 |         res = np.zeros(prob.shape[0])
51 |         res[int(prob.argmax())] = 1  # 取概率最大的类,输出one-hot
52 |         return res
53 | 
54 |     def predict(self, X):
55 |         X = np.asarray(X)
56 |         return np.asarray([self.predict_single(x) for x in X])
57 |
58 |
59 | def main():
60 | iris = load_iris()
61 | X, Y = iris.data, iris.target
62 | lb = LabelBinarizer()
63 | Y = lb.fit_transform(Y)
64 | lr = LR()
65 | lr.fit(X, Y)
66 | print(lr.predict_prob(X))
67 | return lr
68 |
69 |
70 | if __name__ == '__main__':
71 | my_lr = main()
72 |
--------------------------------------------------------------------------------
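Note on Chapter06/LR.py: the exp(Xw + b) padded with a column of ones and then normalised corresponds to the multi-class logistic model with the last class as the reference class (K classes, K−1 weight vectors):

    P(Y=k\mid x)=\frac{\exp(w_k\cdot x+b_k)}{1+\sum_{j=1}^{K-1}\exp(w_j\cdot x+b_j)}\quad(k=1,\dots,K-1),
    \qquad P(Y=K\mid x)=\frac{1}{1+\sum_{j=1}^{K-1}\exp(w_j\cdot x+b_j)}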
/Chapter06/MEM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """MEM = Maximum entropy model"""
3 | # 最大熵模型原理
4 | # todo: 拉格朗日对偶性,学完这个才知道为什么可以转化为求解max_min_问题
5 | # todo: 理解最大熵模型的最大似然估计等价于最大熵模型的对偶函数
6 | # todo: 牛顿法和拟牛顿法
7 | """
8 | 1. 原则上,MEM需要传入特征函数,如果未传入,则可以简单(朴素)地以每个特征与label的共现作为feature_function
9 | 2. 根据特征函数、训练集求得Pw(y|x),接下来的任务是求得最好的w,当得到了w后,模型就固定了
10 | 3. 求得w的方法:IIS
11 | # todo: 这个模型需要遍历、存储x、y的一些性质,这跟生成模型、判别模型有关系吗
12 |
13 | feature function定义了哪些数据作为模型的约束,以及数据如何转化为约束
14 | feature function形如
15 | y1 y2 ... yt
16 | f1 x1 0/1 0/1 0/1
17 | f2 x2 0/1 0/1 0/1
18 | ...
19 | fs xs 0/1 0/1 0/1
20 | 例如
21 | 假设在训练集中,x有
22 |
23 | """
24 | import numpy as np
25 | import pandas as pd
26 | from itertools import product
27 |
28 |
29 | def get_P_XY_and_P_X(X, Y):
30 | """
31 | 获取联合概率分布和X分布
32 | 联合概率形如
33 | feature1, feature2, ..., feature, prob_y1, prob_y2, ..., prob_ym
34 | 0 , 0 , ..., 0 , 0.1 , 0.1 , ..., 0
35 | 1 , 0 , ..., 0 , 0.2 , 0 , ..., 0
36 | ...
37 | 如果总共有10个样本,特征为(1, 0, 0)样本总共有2个,其中有一个y是1,一个y是2,总共可能的y是[1, 2, 3],那么对应的,它的联合概率如下
38 | feature1, feature2, feature3, prob_y=1, prob_y=2, prob_y=3
39 | 1 , 0 , 0 , 0.1 , 0.1 , 0
40 | """
41 | # 将Y转化成
42 |     XY = np.concatenate((X, Y.reshape(-1, 1)), axis=1)
43 |     XY_unique, counts = np.unique(XY, axis=0, return_counts=True)  # 以每行(一个样本)为单位计数
44 | freq = counts / XY.shape[0]
45 | df_XY = pd.DataFrame(XY_unique, columns=[f"feature_{i}" for i in range(len(X[0]))] + ['y'])
46 | df_XY = df_XY.set_index([f"feature_{i}" for i in range(len(X[0]))])['y']
47 | df_XY = df_XY.unstack().reset_index()
48 |
49 | df_XY.loc[:, 'freq'] = freq
50 | df_XY = df_XY.groupby([col for col in df_XY.columns if col != 'y']).apply(
51 | lambda _df: dict(zip(_df['y'], _df['freq']))
52 | ).reset_index().rename(columns={0: 'distribution'})
53 |
54 | unique_list = [np.unique(X[:, i]) for i in range(len(X[0]))]
55 |     array = np.array(list(product(*unique_list)))
56 | df = pd.DataFrame(data=array, columns=[f"feature_{i}" for i in range(len(X[0]))])
57 | zero_distribution = dict(zip(Y.unique(), np.zeros_like(Y.unique())))
58 |     df.loc[:, 'distribution_0'] = [zero_distribution for _ in range(len(df))]
59 |     df = pd.merge(df, df_XY, on=[f"feature_{i}" for i in range(len(X[0]))], how='left')
60 |     df['distribution'] = df['distribution'].where(df['distribution'].notna(), df['distribution_0'])  # todo: 该函数尚未完成
61 |
62 |
63 | def get_P_X(X):
64 |     pass  # todo: 获取X的经验边缘分布~P(X),尚未实现
65 |
66 | class MEM:
67 | def __init__(self, method='BFGS', epsilon=1e-3):
68 | """
69 | """
70 | self.method = method
71 | self.epsilon = epsilon
72 | self.X = np.array([])
73 | self.Y = np.array([])
74 | self.p_X = {}
75 | self.p_XY = {}
76 | self.n_feature = 1
77 | self.w = np.random.rand(self.n_feature)
78 | self.y_options = np.array([])
79 |
80 | def f(self, w):
81 | pass
82 |
83 | loss_function = f
84 |
85 | def _empirical_joint_distribution(self):
86 | n_samples = self.X.shape[0]
87 | X_Y = np.concatenate((self.X, self.Y), axis=1)
88 | # 以每行作为一个元素计数
89 | element, freq = np.unique(X_Y, axis=0, return_counts=True)
90 | element = [tuple(i) for i in element]
91 |         freq = freq / n_samples  # counts是整型数组,不能原地做真除法
92 | distribution = dict(zip(element, freq))
93 |
94 | def inner(x, y):
95 | return distribution[tuple(x) + (y,)]
96 |
97 | return inner
98 |
99 | def get_Pw_y_x(self, w):
100 | """
101 | 给定参数下的最大熵模型Pw(y|x)
102 | 所谓Pw(y|x)是个概率模型,它可以表示为一个接受x,输出概率分布{y1: p1, y2: p2, ...}的函数(当然也可以有其他表示方法)
103 | """
104 |
105 | def inner(x):
106 | numerator_array = np.array([])
107 | for y in self.y_options:
108 | numerator = np.exp(w * np.array([f(x, y) for f in self.ffs]))
109 | numerator_array = np.append(numerator_array, numerator)
110 | denominator = numerator_array.sum()
111 | distribution = numerator_array / denominator
112 | return dict(zip(self.y_options, distribution))
113 |
114 | return inner
115 |
116 |     def distribution_matrix(self, X, Y):
117 |         pass  # todo: 尚未实现
118 |
119 | def fit(self, X, Y):
120 | X, Y = np.asarray(X), np.asarray(Y)
121 | # 根据训练数据做一些必要的初始化
122 | # 1. 获取经验联合分布~P(X,Y)
123 | empirical_joint_distribution = self._empirical_joint_distribution()
124 | # 2. 获取给定参数w下的最大熵模型Pw(y|x)
125 |
126 | if self.method == 'IIS':
127 | pass
128 | else:
129 | # 输入特征函数、经验联合分布,目标函数f(w), 梯度函数g(w)
130 | # 1. 根据特征函数、给定的w,求得最大熵模型Pw_y_x
131 | # 2. 然后任务是求得最佳的w,将w代进Pw_y_x,就是最终的P_y_x
132 | # 3. 求解w的方法是
133 | # 3.1 初始化w、B(正定对称矩阵)
134 | # 3.2 求梯度g,如果梯度<=epsilon,则停止,否则进入3.3~3.7
135 | # 3.3 根据B·p = -g,求得p
136 | # 3.4 一维搜索λ, 使得f(w+pλ)最小
137 | # 3.5 更新w=w+λp
138 | # 3.6 更新g,如果g<=epsilon,w_best=w;否则计算新B
139 | # 3.7 转3.3
140 | # 备注:另外可以限定循环的次数不超过epochs次
141 |
142 | # 3.1 初始化w, B
143 | w = np.random.rand(len(self.ffs))
144 | B = np.eye(len(self.ffs))
145 | # 3.2 求梯度g
146 | Pw_y_x = self.get_Pw_y_x(w)
147 |
148 | g = g_w = 0
149 |             for epoch in range(100):  # todo: 迭代轮数应作为超参数传入
150 |                 if g <= self.epsilon:
151 | break
152 |
--------------------------------------------------------------------------------
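Note on Chapter06/MEM.py: point 1 of the docstring says that, when no feature functions are supplied, each (feature value, label) co-occurrence in the training set can serve as an indicator feature function. A minimal sketch of such a builder (hypothetical helper, e.g. to populate the ffs attribute that get_Pw_y_x expects):

    def build_feature_functions(X, Y):
        """One indicator f(x, y) = 1 per (feature index, value, label) triple seen in the training data."""
        seen, ffs = set(), []
        for x, y in zip(X, Y):
            for i, v in enumerate(x):
                if (i, v, y) not in seen:
                    seen.add((i, v, y))
                    ffs.append(lambda x_, y_, i=i, v=v, lab=y: 1 if (x_[i] == v and y_ == lab) else 0)
        return ffs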
/Chapter06/logistic_regression.py:
--------------------------------------------------------------------------------
1 | # !/Applications/anaconda/envs/4PyCharm/bin/python3.4
2 | # -*- coding: utf-8 -*-
3 | # author: frank
4 | # time : 2019-06-16 14:40
5 | # file : logistic_regression.py
6 | import numpy as np
7 | import logging
8 | from collections import Counter
9 | logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
10 | level=logging.DEBUG)
11 | """
12 | 逻辑斯蒂回归
13 | 1. 以二分类为例子
14 | 2. 给定了实例的Feature,判断实例的label是0还是1
15 | 3. 两个问题:
16 | 3.1 为什么二分类的模型可以是"逻辑斯蒂回归模型"?
17 | 我们预设二分类模型服从伯努利分布,伯努利分布是GLM之一,写出它的GLM形式。
18 | 根据最大熵原则,它是sigmoid形式,或者说是逻辑斯蒂回归模型
19 | 根据最大熵原则,它是sigmoid形式,或者说是逻辑斯蒂回归模型。
20 | 3.2 逻辑斯蒂回归的参数估计怎么做?
21 | 极大似然原则,其实跟最大熵原则殊途同归。
22 | 换句话说,最大熵原则既决定了模型的"公式"的样子,又决定了参数。
23 |
24 | """
25 |
26 |
27 | def sigmoid(x):
28 | activation = 1 / (1 + np.exp(-x))
29 | return activation
30 |
31 |
32 | def propagate(features, labels, w, b):
33 | """
34 | 反向传播梯度下降,此处为了简单起见只做全局梯度下降
35 | :param features: 特征
36 | :param labels: 标签
37 | :param w: 系数
38 | :param b: 截距
39 | :return:
40 | """
41 |
42 | n = features.shape[1]
43 |
44 | # 前向传播
45 | predictions = sigmoid(np.dot(w.T, features) + b)
46 | cost = -np.sum(labels * np.log(predictions) + (1 - labels) * np.log(1 - predictions)) / n
47 |
48 | # 反向传播
49 | d_Z = predictions - labels
50 | d_w = np.dot(features, d_Z.T) / n
51 | d_b = np.sum(d_Z) / n
52 |
53 | # w = w - lr * d_w
54 | # b = b - lr * d_b
55 | return d_w, d_b, cost
56 |
57 |
58 | class LogisticRegression:
59 | """
60 | 初始化
61 | """
62 | def __init__(self, lr=0.001, num_epochs=100):
63 | self.lr = lr
64 | self.num_epochs = num_epochs
65 |
66 | # 模型的参数
67 | self.dim = 0
68 | self.w = np.zeros((0, ))
69 | self.b = 0
70 |
71 | def fit(self, features, labels):
72 | """
73 | 拟合、改变参数
74 | """
75 | logging.info("开始训练")
76 | self.dim = features.shape[0]
77 | self.w = np.ones((self.dim, 1)) * .5
78 |
79 | # 对训练集反向传播
80 | for epoch in range(self.num_epochs):
81 | d_w, d_b, cost = propagate(features, labels, self.w, self.b)
82 | self.w -= d_w * self.lr
83 | self.b -= d_b * self.lr
84 |
85 | # ==========================================
86 |             # ================ 学习率衰减 ===============
87 | # ==========================================
88 | if epoch == self.num_epochs * .6:
89 | self.lr *= .5
90 | if epoch == self.num_epochs * .8:
91 | self.lr *= .2
92 | if epoch % 100 == 0:
93 | logging.info(f"cost = {cost}")
94 | logging.info(f"===============训练完毕===========")
95 |
96 | def predict(self, instance):
97 | # p_1 = instance的label是1的概率
98 | p_1 = sigmoid(np.dot(self.w.T, instance) + self.b)
99 | return np.where(p_1 > 0.5, 1, 0)
100 |
101 |
102 | if __name__ == '__main__':
103 |
104 | # 参数设置
105 | num_cases = 10000
106 | num_features = 6
107 | test_lr = 0.1
108 | test_num_epochs = 5000
109 |
110 | # ==========================================
111 | # ================ 生成数据 ===============
112 | # ==========================================
113 | test_features = np.random.rand(num_features, num_cases)
114 | true_w = (np.arange(1, 7) * np.array([1, -1, 1, -1, 1, -1])).reshape(6, 1)
115 | true_b = .2
116 | logging.info(f"true_w=\n{true_w}")
117 | logging.info(f"true_b={true_b}")
118 |
119 | # w * x + b
120 | linear_result = np.dot(true_w.T, test_features) + true_b
121 | # sigmoid(w * x + b)
122 | test_labels = np.where(sigmoid(linear_result) > 0.5, 1, 0)
123 | logging.info(f"labels counts are {Counter(test_labels[0])}")
124 |
125 | # 实例化并训练
126 | LR = LogisticRegression(lr=test_lr, num_epochs=test_num_epochs)
127 | LR.fit(test_features, test_labels)
128 | logging.info(f"w=\n{LR.w}")
129 | logging.info(f"b={LR.b}")
130 |
131 | # accuracy on train data
132 | train_predictions = LR.predict(test_features)
133 | result = (train_predictions == test_labels)[0]
134 | accuracy = Counter(result)[True] / num_cases
135 | logging.info(f"正确率为{accuracy}")
136 |
137 | # 开始预测
138 | sample = np.random.rand(num_features, 5)
139 | true_label = np.where(sigmoid(np.dot(true_w.T, sample) + true_b) > .5, 1, 0)
140 | logging.info(f"true_label = {true_label}")
141 | prediction = LR.predict(sample)
142 | logging.info(f"\nsample=\n{sample}\nprediction={prediction}")
143 |
--------------------------------------------------------------------------------
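Note on Chapter06/logistic_regression.py: the gradients hard-coded in propagate come from the binary cross-entropy loss. With p = σ(wᵀX + b), n samples and features stored column-wise as in this file:

    J(w,b)=-\frac{1}{n}\sum_{i=1}^{n}\big[y_i\log p_i+(1-y_i)\log(1-p_i)\big],\qquad
    \frac{\partial J}{\partial w}=\frac{1}{n}X(p-y)^{\mathsf T},\qquad
    \frac{\partial J}{\partial b}=\frac{1}{n}\sum_{i=1}^{n}(p_i-y_i)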
/Chapter07/SVM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | SVM
4 | 按照解决问题的难度从易到难排序,SVM相关的算法有
5 | 线性可分SVM
6 | 线性SVM
7 | 非线性SVM
8 | 由上到下,前序(简单)模型都是后序(复杂)模型的基础、特殊情况,所以只实现非线性,使它兼容前序的模型。
9 |
10 | 非线性SVM的理解流程:
11 | 1. 线性可分SVM的初衷是什么+线性可分SVM参数的计算方法
12 | 2. 从硬间隔过渡到软间隔,创造出线性支持SVM,来解决 [近似可分训练集]的分类问题
13 | 3. 对于非近似线性可分的训练集,我们的目标是通过映射函数将输入空间映射映射为特征空间,使得训练集在特征空间中近似线性可分,然后应用线性SVM
14 |
15 | 接下来再针对每个大点做具体理解
16 | 1. 线性可分
17 | 1.1 SVM的初衷是对空间中正负例画出一个超平面,使得正负例可以被完美隔开,并且不同于感知机,我们还希望无论是每个点都能离这个超平面足够远,
18 | 越远我们才会觉得越靠谱。这就有点像及格线60分,成绩越超过60,我们越相信这是个学霸,成绩越低于60,我们越相信这是学渣。
19 | 1.2 根据1.1引出函数间隔,进而引出几何间隔
20 | 1.3 根据初始最优化的目标的形式,确定最终优化目标是min 1/2 * ||w|| s.t Σ yi(w·xi + b) - 1 >= 0 ,变量是w, b
21 | 1.4【重要,不懂拉格朗日就没有必要再看下去了】
22 | 1.4.1 构建拉格朗日函数 L(w,b,α)
23 | 1.4.2 拉格朗日对偶性,确定对偶优化目标是max min L。
24 | 1.4.2.1 min L,变量是w,b。求偏导,得到w关于α的公式,b关于α的公式
25 | 1.4.2.2 代入到L中,得到min L = -1/2 ∑i∑j αi*αj*yi*yj*(xi·xj) + ∑i αi
26 | 1.4.2.3 max [min L],求得α=(α1, α2, ..., αn) 【这里有个伏笔,当数据量很大的时候,求α其实是非常耗时】
27 | 1.4.3 根据附录定理C.3,我们可以根据对偶问题的最优解α,反过来求得原始问题的最优价w,b
28 |
29 | 2. 线性不可分,但近似线性可分的情况
30 | 2.1 我们对yi(w·xi+b) >= 1的要求放宽一点,允许每个点都能不同程度地达不到这个目标,设置松弛变量ξi,使得 yi(w·xi+b) >= 1 - ξi
31 | 2.2 对应优化目标也要对松弛变量加惩罚C,目标变为min 1/2 * ||w|| + Cξ,ξi不为0时,意味着(xi,yi)没有被平面正确分类,否则没有必要松弛。
32 | 所以min 1/2 * ||w|| + Cξ,ξi的后半部分蕴含着少分错点的目标
33 | 2.3 同样经过拉格朗日那一套(不过比线性可分的推导过程要复杂),min L = -1/2 ∑i∑j αi*αj*yi*yj*(xi·xj) + ∑i αi s.t. ∑αi yi=0 , 0<=α<=C
34 | 2.4 用合页损失来理解线性SVM,这样更容易理解它和感知机的区别。
35 |
36 | 3. 非线性分类问题(非近似线性可分的训练集)。既然在当前输入空间上,训练集看起来不可分,那能不能通过对空间的映射,使得训练集在映射后
37 | 的空间是线性可分的,或者近似线性可分。
38 | 3.1 一个直接的想法,是找到这么一个映射函数φ,但是这个可不好找,怎么就知道φ后的训练集就可分呢?让我们倒过来想,假如我们映射后的训练集可分,那么
39 | 它应该可以用线性SVM搞,那么届时它的目标就是min L = -1/2 ∑i∑j αi*αj*yi*yj*(xi'·xj') +∑i αi,里面的xi',xj'是映射后的,也就是说,
40 | 这里的xi'=φ(xi),xj'=φ(xj),我们观察到运算单元其实是φ(xi)·φ(xj),也就是说,我们要是能直接定义出K(xi,xj)=φ(xi)·φ(xj),也是够用的,
41 | 这个K就是核函数,这种不直接找φ而是找K的方法就是核技巧。
42 | 3.2 但是,不能说以xi,xj为变量的二元函数K就是核函数,核函数本意上=两个经过映射后的向量的内积。所以我们需要知道一个K是不是核函数。
43 | 这里有一堆数学知识,按住不表了。
44 | 3.3 但即使有3.2,要证明K是核函数还是挺麻烦的,所以一般都是直接应用一些常见的核函数:多项式核函数、高斯核函数核字符串核函数。
45 | 3.4 这里我有个问题,好像没有直接证明核函数后的训练集就(近似)线性可分了,大概是拿常用的核函数尝试后,准确率达到一定程度就认为有效吧
46 | 3.5 最后我们回到1.4.2.3的伏笔,求α是很麻烦的。好在Platt在1998年提出了SMO(sequential minimal optimization)。实际上,我们手动实现
47 | SVM,大多数篇幅就是在实现SMO而已。但是不懂前序这些知识,就算是照猫画虎把SMO实现了,笔者认为还不足够
48 | """
49 |
50 | import numpy as np
51 | from functools import partial
52 |
53 |
54 | def W(K, Y, alpha, i, j):
55 | """
56 | i, j分别是第一、第二变量的下标
57 | """
58 |     _W = .5 * K[i, i] * alpha[i] ** 2 + .5 * K[j, j] * alpha[j] ** 2 + Y[i] * Y[j] * K[i, j] * alpha[i] * alpha[j] - \
59 |          (alpha[i] + alpha[j]) + Y[i] * alpha[i] * ((Y * alpha * K[i])[np.r_[:i, i + 1:j, j + 1:]]).sum() + \
60 |          Y[j] * alpha[j] * ((Y * alpha * K[j])[np.r_[:i, i + 1:j, j + 1:]]).sum()  # 书中式(7.101),求和项为Σ_{k≠i,j} y_k·α_k·K_{k,·}
61 | return _W
62 |
63 |
64 | def SMO(K, Y, alpha, b, epsilon, C):
65 | """
66 | SMO要解决如下问题
67 | min 1/2 ∑i∑j αi*αj*yi*yj*K(xi,xj)-∑i αi
68 | α
69 | """
70 | # 选择变量
71 | # 先选择第一个变量,选择违反KKT条件最严重的变量作为第一个变量
72 | pred = np.dot(K, (alpha * Y)) + b # 书上的g_xi其实就是预测pred
73 | interval = Y * pred
74 | error = Y - pred
75 |
76 | # 注意到P129页在“第2个变量的选择”这一节中,最后说明了可能会找不到合适的α2使得目标函数有足够的下降,所以需要遍历寻找直到满足就退出
77 | # 先在间隔边界上的支持向量点,检验他们是否满足KKT条件(为什么书上说要优先从这里找呢)
78 | # 记选择的第一个变量是αi,第二个变量αj,即他们的下标分别为i,j
79 |     i_candidate = np.where(  # numpy数组不能用and/or与链式比较,需用&、|并加括号
80 |         ((0 < alpha) & (alpha < C) & (np.abs(interval - 1) > epsilon)) |  # 间隔边界上的支持向量应满足yi*g(xi)=1
81 |         ((alpha == 0) & (interval < 1)) |
82 |         ((alpha == C) & (interval > 1))
83 |     )[0]
84 | for i in i_candidate:
85 | # 找到第二个变量
86 | Ei = error[i]
87 | Ei_minus_Ej = np.abs(error - Ei)
88 | j_candidate = np.argsort(-Ei_minus_Ej) # 要对Ei_minus_Ej降序获得下标,np.argsort只支持升序,故排序的时候用相反数
89 | for j in j_candidate:
90 | # 更新选定的αi,αj,并计算更新后的αi,αj是否使得子问题W有足够的下降
91 | # 所以在更新前还得先计算、保存W(αi,αj)
92 | W_prev = W(K, Y, alpha, i, j)
93 |
94 | # 更新αi,αj
95 | if Y[i] != Y[j]:
96 | L = max(0, alpha[j] - alpha[i])
97 | H = min(C, C + alpha[j] - alpha[i])
98 | else:
99 | L = max(0, alpha[j] + alpha[i] - C)
100 | H = min(C, alpha[j] + alpha[i])
101 |
102 | # 求解未经剪辑的αj_new
103 | eta = K[i, i] + K[j, j] - 2 * K[i, j]
104 | Ej = error[j]
105 | alpha_j_new_unc = alpha[j] + Y[j] * (Ei - Ej) / eta
106 | # 经剪辑后的αj_new
107 | if alpha_j_new_unc > H:
108 | alpha_j_new = H
109 | elif alpha_j_new_unc >= L:
110 | alpha_j_new = alpha_j_new_unc
111 | else:
112 | alpha_j_new = L
113 | # 求解αi_new
114 | alpha_i_new = alpha[i] + Y[i] * Y[j] * (alpha_j_new - alpha[j])
115 | # 计算是否满足要求
116 | alpha_new = alpha.copy()
117 | alpha_new[i] = alpha_i_new
118 | alpha_new[j] = alpha_j_new
119 | W_next = W(K, Y, alpha_new, i, j)
120 | if W_prev - W_next > epsilon:
121 | b1_new = -Ei - Y[i] *K[i, i] * (alpha_i_new - alpha[i]) - \
122 | Y[j] * K[j, i] * (alpha_j_new - alpha[j]) + b
123 | b2_new = -Ei - Y[i] * K[i, j] * (alpha_i_new - alpha[i]) - \
124 | Y[j] * K[j, j] * (alpha_j_new - alpha[j]) + b
125 | b_new = (b1_new + b2_new) / 2
126 | return alpha_new, b_new
127 |
128 | return alpha, b
129 |
130 |
131 | class SVM:
132 | def __init__(self, C, epsilon, kernel=np.dot):
133 | self.C = C
134 | self.epsilon = epsilon
135 | self.kernel = kernel # 默认不经过映射函数,此时核函数就是向量点积而已
136 | self.w = np.empty((1,))
137 | self.b = np.random.rand()
138 | self.alpha = np.empty((1,))
139 |
140 | @staticmethod
141 | def _data_check(X, Y):
142 | assert set(Y) == {-1, 1}, "要求训练集中只能用+1,-1的标签"
143 | assert X.shape[0] == Y.shape[0]
144 |
145 | def fit(self, X, Y):
146 | self._data_check(X, Y)
147 |         # 根据X初始化w(维度=特征数)和α(每个样本一个,从0开始)
148 |         self.w = np.zeros(X.shape[1])
149 |         self.alpha = np.zeros(X.shape[0])
150 |
151 | # 先用SMO算法求解α
152 | K = self.kernel(X, X.T)
153 | self.alpha, self.b = SMO(K, Y, self.alpha, self.b, self.epsilon, self.C)
154 |
155 |         self.w = ((self.alpha * Y)[:, None] * X).sum(axis=0)  # w = Σ_i α_i·y_i·x_i
156 |
157 | def predict(self, X):
158 | return np.sign(np.dot(X, self.w.T) + self.b)
--------------------------------------------------------------------------------
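Note on Chapter07/SVM.py: section 3.3 of the docstring mentions the Gaussian (RBF) kernel, while the class defaults to a plain dot product. A sketch of an RBF kernel that could be plugged in (hypothetical helper; the gamma value is arbitrary). Since fit calls self.kernel(X, X.T), it would be passed as kernel=lambda a, b: rbf_kernel(a, b.T):

    import numpy as np

    def rbf_kernel(X1, X2, gamma=0.5):
        """K(x, z) = exp(-gamma * ||x - z||^2) for every pair of rows of X1 (n, d) and X2 (m, d)."""
        sq_dist = np.sum(X1 ** 2, axis=1)[:, None] + np.sum(X2 ** 2, axis=1)[None, :] - 2 * X1 @ X2.T
        return np.exp(-gamma * sq_dist)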
/Chapter08/Adaboost.py:
--------------------------------------------------------------------------------
1 | # !/Applications/anaconda/envs/4PyCharm/bin/python3.4
2 | # -*- coding: utf-8 -*-
3 |
4 | import numpy as np
5 | 
6 |
7 | INF = float('inf')
8 |
9 |
10 | # @lru_cache()
11 | def compute_error(pred, Y, weight):
12 | return sum(weight * (pred != Y))
13 |
14 |
15 | class SignClassifier:
16 | def __repr__(self):
17 | return ("< " if self.sign == 1 else "> ") + str(self.threshold)
18 |
19 | def __init__(self):
20 | self.sign = 1
21 | self.threshold = INF
22 |
23 | def fit(self, X, Y, weight):
24 | assert len(X) == len(Y) == len(weight)
25 | X, Y, weight = zip(*sorted(zip(X, Y, weight), key=lambda t: t[0]))
26 | X, Y, weight = np.array(X), np.array(Y), np.array(weight)
27 | cost = INF
28 | for x in np.arange(min(X), max(X), 0.5):
29 | for sign in [-1, 1]:
30 | cur_pred = np.array(list(map(lambda t: 1 if t < 0 else -1, X - x))) * sign
31 | cur_cost = compute_error(cur_pred, Y, weight)
32 | if cur_cost < cost:
33 | cost = cur_cost
34 | self.threshold = x
35 | self.sign = sign
36 | if cur_cost == 0:
37 | break
38 |
39 | def predict(self, X):
40 | X = np.array(X)
41 | return np.array(list(map(lambda t: 1 if t < 0 else -1, X - self.threshold))) * self.sign
42 |
43 |
44 | class AdaClassifier:
45 | __slots__ = ['weight', 'n_estimate', 'base_estimate', 'estimate_list', 'am_list']
46 |
47 | def __init__(self, base_estimate, n_estimate):
48 | self.base_estimate = base_estimate
49 | self.n_estimate = n_estimate
50 | # self.weight = 0
51 | self.estimate_list = []
52 | self.am_list = []
53 |
54 | def fit(self, X, Y):
55 | X, Y = np.array(X), np.array(Y)
56 |         weight = np.ones(X.shape[0]) / X.shape[0]  # 初始化权重,每个样本一个
57 | for i in range(self.n_estimate):
58 | clf = self.base_estimate()
59 | clf.fit(X, Y, weight)
60 | self.estimate_list.append(clf)
61 | # 计算错误率
62 | em = compute_error(clf.predict(X), Y, weight)
63 | # 计算指数
64 | am = .5 * np.log((1 - em) / em)
65 | self.am_list.append(am)
66 | # 更新权重
67 | pred = clf.predict(X)
68 | exp_list = weight * np.exp(-am * Y * pred)
69 | Z = sum(exp_list)
70 | weight = exp_list / Z
71 |
72 | def predict(self, X):
73 | return np.sign(self.decision_function(X).sum(axis=0))
74 |
75 | def decision_function(self, X):
76 | return np.array([am * clf.predict(X) for am, clf in zip(self.am_list, self.estimate_list)])
77 |
78 | def score(self, X, Y):
79 | X, Y = np.array(X), np.array(Y)
80 | return sum(self.predict(X) == Y) / X.shape[0]
81 |
82 |
83 | class AdaRegression:
84 | pass
85 |
86 |
87 | def main():
88 | X = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
89 | Y = [1, 1, 1, -1, -1, -1, 1, 1, 1, -1]
90 | ada = AdaClassifier(base_estimate=SignClassifier, n_estimate=3)
91 | ada.fit(X, Y)
92 | print(ada.decision_function(X))
93 | print(ada.predict(X))
94 | print(ada.score(X, Y))
95 |
96 |
97 | if __name__ == '__main__':
98 | main()
99 |
--------------------------------------------------------------------------------
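Note on Chapter08/Adaboost.py: the two quantities updated in AdaClassifier.fit are the classifier weight α_m and the sample weights. A quick arithmetic check of the first (the error value 0.3 is hypothetical, roughly what the first weak classifier produces on the demo data):

    import numpy as np
    em = 0.3                             # weighted error e_m of one weak classifier
    am = 0.5 * np.log((1 - em) / em)     # classifier weight alpha_m ≈ 0.4236
    # the sample weights are then rescaled by exp(-am * y_i * G_m(x_i)) and renormalised by Z_m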
/Chapter09/GMM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 | from scipy.stats import norm
4 |
5 | PENDING = -1
6 |
7 |
8 | class GMM:
9 | def __init__(self, k, step=100, epsilon=1e-3):
10 | self.k = k # k个高斯分布
11 | self.alpha = np.ones(k) / k
12 | # mu形如[mu_1, mu_2, ..., mu_k]
13 | self.mu = PENDING
14 |
15 | # sigma形如[sigma_1, sigma_2, ..., sigma_k]
16 | self.sigma = PENDING
17 |
18 | # lambda_matrix形如
19 | # [
20 | # [λ_11, λ_12, ..., λ_1k],
21 | # [λ_21, λ_22, ..., λ_2k],
22 | # ...,
23 | # [λ_n1, λ_n2, ..., λ_nk]
24 | # ], n是样本的数量,lambda_matrix[j,k]记录的是第k个模型对第j个数据的响应度
25 | self.lambda_matrix = PENDING
26 |
27 | self.step = step
28 | self.epsilon = epsilon
29 |
30 | @property
31 | def k_model(self):
32 |         # P(y|θ) = Σ_{k=1}^{K} α_k · φ(y | μ_k, σ_k),其中φ是高斯密度
33 | # 因为norm(loc=self.mu, scale=self.sigma)的shape是(k,)
34 | # X的shape是(n,),形如[x1, x2, ..., xn]
35 | # 而我们希望每个模型都分别n个样本计算概率分布pdf
36 | # 故需要将X包装成[[x1], [x2], ..., [xn]], 所以用X[:, None]
37 | return lambda X: self.alpha * norm(loc=self.mu, scale=self.sigma).pdf(X[:, None])
38 |
39 | def fit(self, X):
40 | """
41 | GMM学习的是X的分布,是一个无监督学习
42 | """
43 |         # 根据训练集初始化每个高斯分布的参数μ和σ(μ需加少量扰动,否则各分模型完全对称,EM无法把它们区分开)
44 |         self.mu = np.mean(X) + np.random.randn(self.k) * np.std(X) * 0.1
45 |         self.sigma = np.ones(self.k) * np.std(X)
46 |
47 | # 开始迭代
48 | for step in range(self.step):
49 | # E步:依据当前模型参数,计算分模型k对观测数据y_j的响应度
50 | self.lambda_matrix = self.k_model(X)
51 | self.lambda_matrix /= self.lambda_matrix.sum(axis=1)[:, None]
52 |
53 | # M步:计算新一轮的模型参数μ_k, σ_k, α_k
54 | self.mu = (self.lambda_matrix * X[:, None]).sum(axis=0) / self.lambda_matrix.sum(axis=0)
55 |             self.sigma = np.sqrt((self.lambda_matrix * (X[:, None] - self.mu) ** 2).sum(axis=0) / self.lambda_matrix.sum(axis=0))  # norm的scale是标准差,需开方
56 | self.alpha = self.lambda_matrix.sum(axis=0) / X.shape[0]
57 |
58 | def predict(self, X):
59 |         return self.k_model(X).sum(axis=1)  # 每个样本在混合模型下的概率密度
--------------------------------------------------------------------------------
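Note on Chapter09/GMM.py: the file ships without a demo; a minimal usage sketch is below (hypothetical data: a 30/70 mixture of two 1-D Gaussians). EM is sensitive to initialisation, so the order of the recovered components and the exact values will vary:

    import numpy as np

    def demo():
        np.random.seed(0)
        X = np.concatenate([np.random.normal(-2.0, 0.5, 300),
                            np.random.normal(3.0, 1.0, 700)])
        gmm = GMM(k=2, step=200)
        gmm.fit(X)
        print("alpha:", gmm.alpha)   # mixture weights, roughly 0.3 / 0.7
        print("mu:   ", gmm.mu)      # component means, roughly -2 and 3
        print("sigma:", gmm.sigma)   # component standard deviations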
/Chapter10/HMM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 重点: 学会手写Baum-Welch的手推
4 | """
5 | from itertools import product
6 |
7 | import numpy as np
8 |
9 | from Chapter10.backward import backward
10 | from Chapter10.forward import forward
11 |
12 | PENDING = np.array([0])
13 |
14 |
15 | class HMM:
16 | def __init__(self, n_state=1, n_output=1, epsilon=1e-3, max_epoch=1000):
17 | # 状态转移概率矩阵,,shape = j * j, 形如
18 | # [
19 | # [a11, a12, ..., a1j],
20 | # [a21, a22, ..., a2j],
21 | # ...,
22 |         #     [aj1, aj2, ..., ajj]
23 | # ]
24 | self.A = np.random.random(size=(n_state, n_state))
25 | self.A /= self.A.sum(axis=1)[:, None] # 按行求和,然后按行除以和,确保最终的A每一行的和为1,这样才符合概率和为1
26 |
27 | # 状态->观察的概率矩阵,shape = j * m, 形如
28 | # [
29 | # [b11, b12, ..., b1m],
30 | # [b21, b22, ..., b2m],
31 | # ...,
32 | # [bj1, bj2, ..., bjm],
33 | # ]
34 | self.B = np.random.random(size=(n_state, n_output))
35 | self.B /= self.B.sum(axis=1)[:, None]
36 |
37 | # 初始隐变量的概率分布,shape = (j, ), 形如
38 | # [p0 p1 p2 ..., pj]
39 |         self.pi = np.ones(n_state) / n_state
40 |
41 | self.epsilon = epsilon
42 | self.max_epoch = max_epoch
43 |
44 | def probability(self, O, method='forward'):
45 | """
46 | 已知λ=(A, B, π)和观测序列O,计算O出现的概率P(O|λ)
47 | """
48 | if method == 'forward':
49 | return forward(self.pi, self.A, self.B, O)
50 | else:
51 | return backward(self.pi, self.A, self.B, O)
52 |
53 | def fit(self, O, I):
54 | """
55 | 正常来说,观测数据是多条O1=(o11, o12, ..., o1s), ..., 按照书上的提示,将这多条拼接成一条大的
56 | O=(o1, o2, oT)
57 | """
58 | O = O.reshape(1, -1)
59 |         I = I.reshape(1, -1)
60 | if I.size != 0: # 即有状态序列,使用监督的学习方法
61 | assert O.shape == I.shape
62 | # todo: 这里O的shape改了
63 |             self.A = np.zeros_like(self.A)  # 1. 状态转移概率A的估计:从零开始计数,最后按行归一化
64 |             for i in I:
65 |                 for i_prev, i_next in zip(i[:-1], i[1:]):
66 |                     self.A[i_prev, i_next] += 1
67 |             self.A /= self.A.sum(axis=1)[:, None]
68 |             self.B = np.zeros_like(self.B)  # 2. 观测概率B的估计:同样从零计数、按行归一化
69 |             rows, columns = I.shape
70 |             for row, column in product(range(rows), range(columns)):
71 |                 self.B[I[row, column], O[row, column]] += 1
72 |             self.B /= self.B.sum(axis=1)[:, None]
73 |             # 3. 估计π
74 |             self.pi = np.unique(I[:, 0], return_counts=True)[1] / I.shape[0]
75 |
76 | else: # 没有状态序列,则需要用非监督的学习方法——Baum-Welch,背后是EM算法
77 | for _ in range(self.max_epoch):
78 | # new_A
79 | # 1. ξ = (ξ1, ξ2, ..., ξt-1)
80 | # ξ1形如
81 | # 下一时刻状态为1 下一时刻状态为2 ... 下一时刻状态为n
82 | # 此时刻状态为1 p11 p12 p1n
83 | # 此时刻状态为2 p21 p22 p2n
84 | # ...
85 | # 此时刻状态为n pn1 pn2 pnn
86 | ksi = []
87 | gamma = []
88 | for t in range(len(O[0]) - 1):
89 | alpha = forward(self.pi, self.A, self.B, O[0:, t])[0]
90 | beta = backward(self.pi, self.A, self.B, O[0:, t])[0]
91 | ksi_t = alpha[:, None] * self.A * self.B[:, O[0][t]][None] * beta
92 | ksi_t = ksi_t / ksi_t.sum()
93 | ksi.append(ksi_t)
94 |
95 | gamma_t = alpha * beta
96 | gamma.append(gamma_t)
97 |
98 | alpha_last = forward(self.pi, self.A, self.B, O[0:, -1])
99 | beta_last = backward(self.pi, self.A, self.B, O[:, -1])
100 | gamma_last = alpha_last * beta_last
101 | gamma.append(gamma_last)
102 |
103 | ksi = np.array(ksi)
104 | gamma = np.array(gamma)
105 | new_A = ksi.sum(axis=-1) / gamma.sum(axis=-1)[:, None]
106 |
107 | new_B = 0
108 | new_pi = 0
109 | self.A, prev_A = new_A, self.A
110 | self.B, prev_B = new_B, self.B
111 | self.pi, prev_pi = new_pi, self.pi
112 | if np.max(np.abs(self.A - prev_A)) < self.epsilon and np.max(np.abs(self.B - prev_B)) < self.epsilon \
113 | and np.max(np.abs(self.pi - prev_pi)) < self.epsilon:
114 | break
115 |
116 | def predict(self):
117 | pass
118 |
--------------------------------------------------------------------------------
/Chapter10/backward.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def backward(pi, A, B, O):
5 | n_state, _ = A.shape
6 | assert pi.shape[0] == n_state
7 | assert B.shape[0] == n_state
8 | # 初始化β,形如
9 | # 状态1 状态2 ... 状态n
10 | # 观测序列1 1 1 1
11 | # 观测序列2 1 1 1
12 | # ...
13 | # 观测序列S 1 1 1
14 | beta_prev = np.ones((O.shape[0], pi.shape[0]))
15 | # block: 迭代
16 | for i in range(O.shape[1] - 1, 0, -1):
17 | beta_next = beta_prev
18 | # o形如
19 | # 观测
20 | # 观测序列1 o1
21 | # 观测序列2 o2
22 | # ...
23 | # 观测序列S oS
24 | o = O[:, i]
25 | b = B[:, o]
26 | beta_prev = np.dot(A, (b * beta_next.T)).T
27 |
28 |     # 此时得到的beta_prev即β_1;P(O|λ) = Σ_i π_i · b_i(o_1) · β_1(i)
29 | o = O[:, 0]
30 | beta_prev = (pi * B[:, o].T) * beta_prev
31 | return beta_prev.sum(axis=1)
32 |
33 |
34 | def demo():
35 | pi = np.array([.2, .4, .4])
36 | A = np.array([
37 | [.5, .2, .3],
38 | [.3, .5, .2],
39 | [.2, .3, .5]
40 | ])
41 | B = np.array([
42 | [.5, .5],
43 | [.4, .6],
44 | [.7, .3]
45 | ])
46 | O = np.array([
47 | [0, 1, 0],
48 | [0, 1, 0],
49 | ])
50 | print(f"P(O|λ) = {backward(pi, A, B, O)}")
51 |
52 |
53 | if __name__ == '__main__':
54 | demo()
55 |
--------------------------------------------------------------------------------
/Chapter10/baum_welch.py:
--------------------------------------------------------------------------------
1 | """
2 | 输入数据O=(o1, o2, ..., oT)
3 | 输出隐变量pi, A, B
4 | todo: new_B公式的含义其实如下
5 | 遍历所有时刻t,
6 | 1. t时状态为j且t时刻观察为k的概率
7 | 2. t是状态为j的概率
8 |
9 | """
10 | import numpy as np
11 |
12 | from Chapter10.backward import backward
13 | from Chapter10.forward import forward
14 |
15 |
16 | def baum_welch(pi, A, B, O, epsilon, max_epoch):
17 | """
18 | 根据观测数据O来学习、输出隐马尔科夫模型λ=(A, B, π)
19 | """
20 | epoch = 0
21 | T = len(O[0]) - 1
22 | while epoch < max_epoch:
23 | print(f"A = \n{A}, \nB = \n{B}, \nπ = \n{pi}")
24 | epoch += 1
25 | # 先求ξ_t和γ_t
26 | # ξ_t形如
27 | # 下时刻状态为1 下时刻状态为2 ... 下时刻状态为n
28 | # 此时刻状态为1 p11 p12 p1n
29 | # 此时刻状态为2 p21 p22 p2n
30 | # ...
31 | # 此时刻状态为n pn1 pn2 pnn
32 |
33 | # γ_t形如
34 | # 处于状态1 处于状态2 ... 处于状态n
35 | # p1 p2 pn
36 |
37 | # 求ξ_t和γ_t需要借助α_t、β_t、β_t+1
38 | ksi = []
39 | gamma = []
40 | # new_B需要知道t时刻状态为j且观察为k的概率,这个量如下计算
41 | gamma_with_o = []
42 | for t in range(T):
43 | alpha_t = forward(pi, A, B, O[:, t])[0]
44 | beta_t = backward(pi, A, B, O[:, t])[0]
45 | beta_t_add_1 = backward(pi, A, B, O[:, t + 1])[0]
46 |
47 |             ksi_t = alpha_t[:, None] * A * B[:, O[0, t + 1]][None, :] * beta_t_add_1[None, :]  # b_j(o_{t+1})与β_{t+1}(j)都沿j(列)变化
48 | ksi_t = ksi_t / ksi_t.sum()
49 | ksi.append(ksi_t)
50 |
51 | gamma_t = alpha_t * beta_t
52 | gamma_t = gamma_t / gamma_t.sum()
53 | gamma.append(gamma_t)
54 |
55 | # 接下来计算t时刻的gamma_with_o,代表t时刻状态为j且观察为o的概率
56 | # 形如
57 | # 观察1 观察2 ... 观察S
58 | # 状态1
59 | # 状态2
60 | # ...
61 | # 状态n
62 | output_is_o = np.zeros((B.shape[0],))
63 | output_is_o[O[:, t][0]] = 1
64 | gamma_with_o_t = np.dot(gamma_t[:, None], output_is_o[None])
65 | gamma_with_o.append(gamma_with_o_t)
66 | ksi = np.array(ksi)
67 | gamma = np.array(gamma)
68 | gamma_with_o = np.array(gamma_with_o)
69 |
70 |         new_A = ksi.sum(axis=0) / gamma.sum(axis=0)[:, None]           # Σ_t ξ_t(i,j) / Σ_t γ_t(i)
71 |         new_B = gamma_with_o.sum(axis=0) / gamma.sum(axis=0)[:, None]  # Σ_t γ_t(j)·1(o_t=k) / Σ_t γ_t(j)
72 | new_pi = gamma[0]
73 | if stop(new_A - A, new_B - B, new_pi - pi, epsilon=epsilon):
74 | return new_pi, new_A, new_B
75 | else:
76 | pi = new_pi
77 | A = new_A
78 | B = new_B
79 |
80 |
81 | def stop(*diffs, epsilon):
82 |     for diff in diffs:  # 只有当所有参数的变化量都小于epsilon时才停止迭代
83 |         if np.abs(diff).max() >= epsilon:
84 |             return False
85 |     return True
86 |
87 |
88 | def demo():
89 | pass
90 |
91 |
92 | if __name__ == '__main__':
93 | demo()
94 |
95 |
--------------------------------------------------------------------------------
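Note on Chapter10/baum_welch.py: for reference, the per-step quantities built inside the loop are

    \xi_t(i,j)=\frac{\alpha_t(i)\,a_{ij}\,b_j(o_{t+1})\,\beta_{t+1}(j)}{\sum_{i'}\sum_{j'}\alpha_t(i')\,a_{i'j'}\,b_{j'}(o_{t+1})\,\beta_{t+1}(j')},\qquad
    \gamma_t(i)=\frac{\alpha_t(i)\,\beta_t(i)}{\sum_{i'}\alpha_t(i')\,\beta_t(i')}

and the re-estimation formulas they feed are a_ij = Σ_{t=1}^{T-1} ξ_t(i,j) / Σ_{t=1}^{T-1} γ_t(i), b_j(k) = Σ_{t: o_t=k} γ_t(j) / Σ_t γ_t(j), π_i = γ_1(i).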
/Chapter10/forward.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def forward(pi, A, B, O):
5 | """
6 |     pi: 初始状态概率分布(initial state distribution), shape = (n_state,)
7 |     A: 状态转移概率矩阵(state transition matrix)
8 |     B: 状态-->观察概率矩阵(state-->output matrix)
9 |     O: array([观察序列1, 观察序列2, ..., 观察序列S]), 每行是一条观察序列
10 | 任务是计算每条观察序列的概率,形如[p1, p2, ..., pn]
11 | 需要借助α来计算, α形如[α1, α2, ..., αn]
12 | """
13 | n_state, _ = A.shape
14 | assert pi.shape[0] == n_state
15 | assert B.shape[0] == n_state
16 | # block: 初始化alpha
17 | # 1. 每条观察序列的第一个观察值
18 | o = O[:, 0]
19 | # 2. 每个初始状态转移到每条观察序列第一个观察值的概率矩阵,形如
20 | # 第一条序列第一个观测值 第二条序列第一个观测值 ... 第S条序列第一个观测值
21 | # 状态1 p11 p12 p1s
22 | # 状态2 p21 p22 p2s
23 | # ...
24 | # 状态n pn1 pn2 pns
25 | b = B[:, o]
26 | # 3. 每条观测序列的初始alpha,形如
27 | # 状态1 状态2 ... 状态n
28 | # 观测序列1
29 | # 观测序列2
30 | # ...
31 | # 观测序列S
32 | alpha_next = pi * b.T
33 |
34 | # block: 迭代
35 | for i in range(1, O.shape[1]):
36 | alpha_prev = alpha_next
37 | o = O[:, i]
38 | b = B[:, o]
39 | alpha_next = (np.dot(alpha_prev, A)) * b.T
40 |
41 | return alpha_next.sum(axis=1)
42 |
43 |
44 | def demo():
45 | pi = np.array([.2, .4, .4])
46 | A = np.array([
47 | [.5, .2, .3],
48 | [.3, .5, .2],
49 | [.2, .3, .5]
50 | ])
51 | B = np.array([
52 | [.5, .5],
53 | [.4, .6],
54 | [.7, .3]
55 | ])
56 | O = np.array([
57 | [0, 1, 0],
58 | [0, 1, 0],
59 | ])
60 | print(f"P(O|λ) = {forward(pi, A, B, O)}")
61 |
62 |
63 | if __name__ == '__main__':
64 | demo()
--------------------------------------------------------------------------------
/Chapter10/viterbi.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 维特比算法
4 | 任务:
5 | 当模型的参数λ=(π,A,B)已知,想要根据观察序列O=(o1, o2, ..., oT),来预测状态序列(i1_, i2_, ..., iT_)【下划线代表它是未知的变量】
6 | 每个时刻t都有n种状态的可能,那么长度为T的状态序列就有n^T种可能,这个计算量是很大的。
7 | 我们需要一种更快捷的办法
8 |
9 | 假设最优状态序列为I=(i1, i2, ..., iT),那么它应该具备以下特性:
10 | 已知它截止T-1时刻的状态序列为(i1, i2, ..., i_{T-1})。从T-1时刻到T时刻,有状态转移概率iT-1-->iT_的概率,
11 | 且iT_还有发射出观察为oT的概率iT_-->oT。iT_有n种可能,其中使得(iT-1-->iT_的概率)*(iT_-->oT的概率)最大的,一定就是iT。
12 | 否则,就存在另外一个iT',使得整条序列的概率更大,有矛盾。
13 | 这就意味着,算法在求解最后一个时刻T的状态时,答案必须要使得(iT-1-->iT_的概率)*(iT_-->oT的概率)最大。
14 | 【关键步骤】现在,让我们把目光往前推一步到T-1,T-1也需要满足这样的条件,T-2也需要,直到t=2时刻(t=1时刻是初始状态)。
15 | 因此,我们只需要从t=2开始,每次都求解基于i_{t-1},分别计算it=1, it=2, ..., it=n的概率最大化的情况
16 | 这里举个例子辅助理解:t-1时刻i_{t-1}=1, 2, ..., n, 对应的t时刻it=1概率分别是P11, P12, ..., P1n,如果P1j最大,那么此时应该选择
17 | it=1搭配的i_{t-1}=j,对应最大概率为P1j;同理,计算
18 | it=2搭配的i_{t-1}=k, 对应最大概率为P2k;
19 | ...;
20 | it=n搭配的i_{t-1}=m, 对应最大概率为Pnm;
21 |
22 | 然后递归到下一步,我们可以提炼出一个公式
23 | P_max[t][i] = max(P_max[t-1][1] * a1i * bi(o), P_max[t-1][2] * a2i * bi(o), ..., P_max[t-1][n] * ani * bi(o))
24 | 这就是动态规划的公式了。
25 | 当然,这个动态规划的任务比一般的动态规划要多一个步骤,因为我们要输出序列,而不是最终最大概率是多少,所以我们还需要记录
26 | 第t步it=i时,搭配的i_{t-1}是什么才行。
27 |
28 | """
29 | import numpy as np
30 |
31 |
32 | def viterbi(pi, A, B, O):
33 | """
34 | 注意,这里的O是多条观察序列,即O=(O1, O2, ..., Os),假设每条Oi=(oi1,oi2, ..., oiT),即每条oi有n_step个时刻
35 | 需要对每条观察序列预测、输出最大概率的状态序列
36 | """
37 | A = np.array(A)
38 | B = np.array(B)
39 | pi = np.array(pi)
40 | O = np.array(O)
41 |
42 |     # 时刻数(步数)
43 | _, n_step = O.shape
44 |
45 | # 多条状态序列的shape应该跟O一致
46 | I = np.empty_like(O)
47 |
48 | # δ代表第t步时,状态分别为1, 2, ..., n的最大概率,形如
49 | # 第一条观测 状态为1的最大概率 状态为2的最大概率 ... 状态为n的最大概率
50 | # 第二条观测 状态为1的最大概率 状态为2的最大概率 ... 状态为n的最大概率
51 | # ...
52 | # 最后条观测 状态为1的最大概率 状态为2的最大概率 ... 状态为n的最大概率
53 |
54 | # 第0步的delta是根据π和B来初始化的
55 | delta = pi[None] * B[:, O[:, 0]].T
56 | psi = np.zeros(shape=(*O.shape, pi.shape[0])) # psi[k][t][i]代表,第k条观察序列对应的第t步选择状态为i时,搭配t-1的状态
57 |
58 | for t in range(1, n_step):
59 | psi_t = np.argmax(delta[..., None] * A, axis=1)
60 | delta = np.max((delta[:, None] * A.T) * B[:, O[:, t]].T[..., None], axis=2)
61 | psi[:, t] = psi_t
62 |
63 | best_T = np.argmax(delta, axis=1)
64 | I[:, -1] = best_T
65 | for t in range(n_step - 2, -1, -1):
 66 |         best_t = psi[np.arange(psi.shape[0]), t + 1, I[:, t + 1]]  # 每条观察序列分别回溯:ψ[k][t+1][最优的i_{t+1}]
 67 |         I[:, t] = best_t
68 | return I
69 |
70 |
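# 补充示例(仅作示意,非原实现的一部分):针对单条观察序列,用朴素循环实现同一条递推公式
# P_max[t][i] = max_j(P_max[t-1][j] * a_ji) * b_i(o_t),便于与上面的向量化实现互相对照。
# 函数名viterbi_single为编者虚构,参数含义与viterbi一致,o为一维观察序列。
def viterbi_single(pi, A, B, o):
    pi, A, B = np.array(pi), np.array(A), np.array(B)
    n = len(pi)
    T = len(o)
    delta = [pi[i] * B[i, o[0]] for i in range(n)]  # δ_1(i)
    psi = [[0] * n]                                 # ψ_1(i)占位
    for t in range(1, T):
        delta_t, psi_t = [], []
        for i in range(n):
            cand = [delta[j] * A[j, i] for j in range(n)]
            psi_t.append(int(np.argmax(cand)))          # 记录搭配的上一步状态
            delta_t.append(max(cand) * B[i, o[t]])      # 当前步状态为i的最大概率
        delta, psi = delta_t, psi + [psi_t]
    path = [int(np.argmax(delta))]
    for t in range(T - 1, 0, -1):                       # 自后向前回溯
        path.append(psi[t][path[-1]])
    return path[::-1]
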
71 | def demo():
72 | A = [
73 | [.5, .2, .3],
74 | [.3, .5, .2],
75 | [.2, .3, .5]
76 | ]
77 |
78 | B = [
79 | [.5, .5],
80 | [.4, .6],
81 | [.7, .3]
82 | ]
83 |
84 | pi = [.2, .4, .4]
85 |
86 | O = [
87 | [0, 1, 0],
88 | [0, 1, 0]
89 | ]
90 | print(viterbi(pi, A, B, O))
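    # 编者补充:按书上例10.3,这组参数下每条观察序列的最优状态序列应为(3, 3, 3),对应这里0-based的输出[2, 2, 2]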
91 |
92 |
93 | if __name__ == '__main__':
94 | demo()
95 |
--------------------------------------------------------------------------------
/Chapter11/BFGS.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 实现BFGS
4 | """
5 | import numpy as np
6 |
7 |
8 | # 用一维搜索求λk
9 | def line_search_wolfe(fun, grad, x, p, max_epoch=100, c1=10 ** (-3), c2=0.9, alpha_1=1.0, alpha_max=10 ** 6):
10 | if alpha_1 >= alpha_max:
11 | raise ValueError('Argument alpha_1 should be less than alpha_max')
12 |
13 | def phi(alpha):
14 | return fun(x + alpha * p)
15 |
16 | def phi_grad(alpha):
17 | return np.dot(grad(x + alpha * p).T, p)
18 |
19 | alpha_old = 0
20 | alpha_new = alpha_1
21 |
22 | final_alpha = None
23 |
24 | for i in np.arange(1, max_epoch + 1):
25 | phi_alpha = phi(alpha_new)
26 |
27 | if (i == 1 and phi_alpha > phi(0) + c1 * alpha_new * phi_grad(0)) or (i > 1 and phi_alpha >= phi(alpha_old)):
28 | final_alpha = zoom(x, p, phi, phi_grad, alpha_old, alpha_new, c1, c2)
29 | break
30 |
31 | phi_grad_alpha = phi_grad(alpha_new)
32 |
33 | if np.abs(phi_grad_alpha) <= -c2 * phi_grad(0):
34 | final_alpha = alpha_new
35 | break
36 |
37 | if phi_grad_alpha >= 0:
38 | final_alpha = zoom(x, p, phi, phi_grad, alpha_new, alpha_old, c1, c2)
39 | break
40 |
41 | alpha_old = alpha_new
42 | alpha_new = alpha_new + (alpha_max - alpha_new) * np.random.rand(1)
43 |
44 | if i == max_epoch and final_alpha is None:
45 | return None
46 |
47 | return final_alpha
48 |
49 |
50 | # 一维搜索中的辅助函数
51 | def zoom(x, p, phi, phi_grad, alpha_lo, alpha_hi, c1, c2):
52 | while True:
53 | alpha_j = (alpha_hi + alpha_lo) / 2
54 |
55 | phi_alpha_j = phi(alpha_j)
56 |
57 | if (phi_alpha_j > phi(0) + c1 * alpha_j * phi_grad(0)) or (phi_alpha_j >= phi(alpha_lo)):
58 | alpha_hi = alpha_j
59 | else:
60 | phi_grad_alpha_j = phi_grad(alpha_j)
61 |
62 | if np.abs(phi_grad_alpha_j) <= -c2 * phi_grad(0):
63 | return alpha_j
64 |
65 | if phi_grad_alpha_j * (alpha_hi - alpha_lo) >= 0:
66 | alpha_hi = alpha_lo
67 |
68 | alpha_lo = alpha_j
69 |
70 |
71 | def BFGS(func, grad, w_start, eps, max_iterations=100, verbose=False):
72 | n = len(w_start)
73 |
74 | # We are starting with identity matrix
75 | # as approximation of the inverse of the Hessian.
76 | # It will be updated on every iteration.
77 | # We are using the notation H_k = (B_k)^{-1},
78 | # where B_k is the approximation of the Hessian.
79 |     # B矩阵(及其逆H)需要是对称正定矩阵,这里用单位矩阵初始化H矩阵
80 | H_old = np.diag(np.ones(n))
81 | w_old = w_start
82 |
83 | for i in np.arange(1, max_iterations + 1):
84 | # 搜索方向p=-H * gk = -Bk^{-1} * gk
85 | p = -1 * np.dot(H_old, grad(w_old))
86 |
87 | # Calculating the step into the direction p
88 |         # using the Wolfe conditions as constraints on the step.
89 | lambda_ = line_search_wolfe(func, grad, w_old, p, max_epoch=max_iterations)
90 |
91 | if lambda_ is None:
92 | print('Wolfe line search did not converge')
93 | return w_old, i
94 |
95 | w_new = w_old + lambda_ * p
96 |
97 | s = (w_new - w_old).reshape((n, 1))
98 | y = (grad(w_new) - grad(w_old)).reshape((n, 1))
99 | sT = s.T.reshape((1, n))
100 | yT = y.T.reshape((1, n))
101 |
102 | yT_s = np.dot(yT, s).reshape(())
103 |
104 | I = np.diag(np.ones(n))
105 | rho = 1 / yT_s
106 | rho2 = rho ** 2
107 |
108 | # The next products are being used
109 | # in the calculation of the H_{k+1} from H_k.
110 | # Only the matrices of dimension (n x n) will be used in the final formula.
111 | H_y = np.dot(H_old, y).reshape((n, 1)) # H_k * y_k
112 | Hy_sT = np.dot(H_y, sT).reshape((n, n)) # (H_k*y_k) * s^T
113 | yT_H = np.dot(yT, H_old).reshape((1, n)) # y_k^T * H_k
114 | s_yTH = np.dot(s, yT_H).reshape((n, n)) # s_k * (y_k^T*H_k)
115 | syTH_y = np.dot(s_yTH, y).reshape((n, 1)) # (s_k*(y_k^T*H_k)) * y_k
116 | syTHy_sT = np.dot(syTH_y, sT).reshape((n, n)) # ((s_k*(y_k^T*H_k))*y_k) * s_k^T
117 | s_sT = np.dot(s, sT).reshape((n, n)) # s_k * s_k^T
118 |
119 | # The initial formula
120 |         # H_{k+1} = (I - rho_k*s_k*y_k^T)H_k(I - rho_k*y_k*s_k^T) + rho_k*s_k*s_k^T
121 | # can be rewritten as
122 | # H_{k+1} = H_k - rho_k*(H_k*y_k)*s_k^T - rho_k*s_k*(y_k^T*H_k) + rho_k^2*((s_k*(y_k^T*H_k))*y_k)*s_k^T + rho_k*s_k*s_k^T
123 |         # to avoid calculations of asymptotic complexity O(n^3).
124 | H_new = H_old - rho * Hy_sT - rho * s_yTH + rho2 * syTHy_sT + rho * s_sT
125 |
126 | if verbose:
127 | print('x_k = {0} converges to x_(k+1) = {1}'.format(w_old, w_new))
128 |
129 | # We are using the 2-norm value
130 | # between the previous and the next gradient
131 | # of the approximation of the function minima
132 | # as the stopping condition for the BFGS algorithm.
133 | grad_dist = np.linalg.norm(grad(w_old) - grad(w_new))
134 | if grad_dist < eps:
135 | break
136 | elif verbose:
137 | print('There is still {0} left for approximations to converge'.format(np.abs(grad_dist - eps)), '\n')
138 |
139 | w_old = w_new
140 | H_old = H_new
141 |
142 | if verbose:
143 | print('\nFinal approximation of the minima is {0}.'.format(w_new))
144 | if i != max_iterations:
145 | print('Optimization process converged in {0} steps'.format(i))
146 | else:
147 | print('Optimization process did not converge')
148 |
149 | return w_new, i
150 |
151 |
152 | def demo():
153 | pass
154 |
155 |
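# Supplementary sketch (not part of the original file): a minimal usage example of the BFGS
# function above on a strictly convex quadratic with a hand-written gradient. The objective,
# its gradient and the helper name demo_quadratic are made up purely for illustration.
def demo_quadratic():
    def f(w):
        return (w[0] - 3) ** 2 + (w[1] + 1) ** 2 + w[0] * w[1]

    def grad_f(w):
        return np.array([2 * (w[0] - 3) + w[1], 2 * (w[1] + 1) + w[0]])

    w_opt, n_iter = BFGS(f, grad_f, np.array([0.0, 0.0]), eps=1e-5)
    # The analytic minimizer of this quadratic is w = (14/3, -10/3).
    print(f'argmin ≈ {w_opt}, found in {n_iter} iterations')
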
156 | if __name__ == '__main__':
157 | demo()
158 |
--------------------------------------------------------------------------------
/Chapter11/CRF.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 条件随机场(CRF, conditional random field)
4 |
5 | 1. 初衷:和HMM一样,想要解决有隐变量的序列概率问题,即求解argmax P(I|O,λ=(A,B,π))
6 | I
7 | 2. 区别:
8 | 2.1 HMM
 9 |         HMM最重要的预设:每个it只跟i_{t-1}有关(齐次马尔可夫假设),每个ot只跟当前时刻的状态it有关(观测独立性假设)。
10 |         换句话说i_{t-1}决定it,it决定ot,前者相当于预设了状态转移矩阵A,后者预设了发射矩阵B。所以HMM需要学习出这两个
11 | 矩阵,再加上初始状态概率分布矩阵π。这三个矩阵学习出来后,意味着HMM已经完全掌握了无论是状态还是观察的生成规律了。所以它就是生成模型。
12 | 这里细细品味一下,正是HMM的这两个假设决定了模型的学习目标,进而决定了模型是生成模型。
13 |
14 | 这就是HMM的基本方法论,剩下的难点无非是如何学习出这几个矩阵。
15 | 2.2 CRF(只说最简单的线性CRF)
16 |         CRF则不仅仅是假设每个it只跟i_{t-1}有关,而是假设it跟i_{t-1}和i_{t+1}都有关。也就是说,
17 |         P(it|O,i1,...,i_{t-1},i_{t+1},...,iT)=P(it|O,i_{t-1},i_{t+1})
18 |         所以P(I|O)=(公式11.10)
19 | # todo: 位置这个概念,我有时候用t,有时候跟书上一致用的是i,得统一一波
20 | # todo: 为什么LinearCRF和HMM一样,在预测时都用维特比,在计算概率时都用前向后向
21 |     根据例11.1,可以发现,特征函数的定义非常宽泛灵活,笔者一开始以为特征函数限定在相对位置,即(前、后位置的标记转移关系)
22 | 但后面才发现特征函数可以限定第几个位置,比如t4就限定序列的第二个状态为2和第三个状态为2时才算满足条件
23 | 另外,书上的s1,s2,s3,s4都不依赖于具体的
24 | 这里用闭包来定义转移特征和状态特征,当然也可以用类定义
25 | 统一标识:
26 | X = (X1, X2, ..., Xn),即n条观察序列。其中Xi = (xi1, xi2, ..., xiT),即每条观察序列有T个位置。同理:
27 | Y = (Y1, Y2, ..., Yn),即n条标识序列。其中Yi = (yi1, yi2, ..., yiT)。
28 | 每个yit可能的取值有N个
29 | # todo: 一个大问题:看起来,CRF的概率计算、学习、预测都跟x没有任何关系,尤其是根据11.4.1节的对数似然函数,可以发现训练过程中根本用不到x
30 | # todo: 因为fk(yj,xj)的计算过程中,完全用不到xj。(待求证李航老师)但假设现在是给语句分词作标注,我们定义一个状态特征:"的"字的标注为O(非实体词),这说明
31 | # todo: 这种依赖于观察的状态特征是完全合理的,笔者擅自按照这种思路来拓宽、细化状态特征的定义。
32 | """
33 | from functools import lru_cache
34 | from functools import reduce
35 | from itertools import product
36 |
37 | import numpy as np
38 |
39 | from Chapter11.BFGS import BFGS
40 | from Chapter11.backward import backward
41 | from Chapter11.forward import forward
42 |
43 | TRANSITION = 'transition'
44 | STATE = 'state'
45 |
46 |
47 | class FeatureFunc:
48 | def __init__(self, category, required_y_prev, required_y_next, required_x=None, required_i=None):
49 | self.category = category
50 | self.required_y_prev = required_y_prev
51 | self.required_y_next = required_y_next
52 | self.required_x = required_x
53 | self.required_i = required_i
54 |
55 | @lru_cache()
56 | def cal_single(self, test_y_prev, test_y_next, test_x, test_i):
57 | """计算给定位置的特征得分"""
58 | if self.category == TRANSITION:
59 | if test_y_prev != self.required_y_prev or test_y_next != self.required_y_next:
60 | return 0
61 | if self.required_x is not None and test_x != self.required_x:
62 | return 0
63 | if self.required_i is not None and test_i != self.required_i:
64 | return 0
65 | return 1
66 | elif self.category == STATE: # 状态特征只看y_next和位置(如果有要求)
67 |             if test_y_next != self.required_y_next:
68 | return 0
69 | if self.required_i is not None and test_i != self.required_i:
70 | return 0
71 | return 1
72 |
73 | @lru_cache()
74 | def cal_sequence(self, x, y):
75 | """计算一整个序列的特征得分"""
76 | score = 0
77 | start_index = 0 if self.category == STATE else 1
78 | for test_i in range(start_index, len(x)):
79 | test_y_prev = y[test_i - 1]
80 | test_y_next = y[test_i]
81 | test_x = x[test_i]
82 | score += self.cal_single(test_y_prev, test_y_next, test_x, test_i)
83 | return score
84 |
85 |
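# 补充示例(仅作示意,非原实现的一部分):演示FeatureFunc的用法。
# 这里的两个特征、权重1.0和0.5以及函数名_feature_func_example均为编者虚构,仅用于说明接口。
def _feature_func_example():
    # 转移特征:当y_{i-1}=0且y_i=1时取1
    t1 = FeatureFunc(TRANSITION, required_y_prev=0, required_y_next=1)
    # 状态特征:当y_i=1时取1(为稳妥起见,这里把要求的状态同时写进两个参数)
    s1 = FeatureFunc(STATE, required_y_prev=1, required_y_next=1)
    x = (0, 1, 0)  # 一条观察序列
    y = (0, 1, 1)  # 一条标记序列
    # 未归一化的打分 Σ_k w_k * f_k(y, x)
    return 1.0 * t1.cal_sequence(x, y) + 0.5 * s1.cal_sequence(x, y)
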
86 | class LinearCRF:
87 | def __init__(self, X, Y, y_option, ff, epsilon):
88 | """
 89 |         :param y_option: 状态的可能值
 90 |         :param X: 观察序列,shape=(样本数, 序列长度)
91 | """
92 | # 直接根据这个X,Y来初始化M,α,β,甚至那两个期望值 todo
93 | assert len(X) == len(Y)
94 | self.X = X
95 | self.Y = Y
96 | # 计算联合(x,y)的经验概率分布和x的概率分布
97 | self.x_prob, self.x_y_prob = self._cal_empirical_distribution()
98 | self.n_sample, self.T = X.shape
99 | self.y_option = y_option
100 | self.n = len(self.y_option) # 状态可能值的数目
101 | self.ff = ff
102 |         self.w = np.random.dirichlet(np.ones(len(ff)))  # dirichlet需要传入alpha参数,这里随机初始化一组和为1的权重
103 | self.epsilon = epsilon
104 |
105 | def _cal_empirical_distribution(self):
106 | n_sample = len(self.X)
107 | x_prob = dict()
108 | x_y_prob = dict()
109 | for idx in range(n_sample):
110 | x, y = tuple(self.X[idx]), tuple(self.Y[idx])
111 |             assert len(x) == len(y), f"第{idx}条样本的观察长度{len(x)}和标记长度{len(y)}不相等"
112 | x_y = (x, y)
113 | x_prob[x] = x_prob.get(x, 0) + 1 / n_sample
114 |             x_y_prob[x_y] = x_y_prob.get(x_y, 0) + 1 / n_sample
115 | return x_prob, x_y_prob
116 |
117 | def cal_F(self, x, y):
118 | """
119 | 给定x,y来生成特征矩阵F(y,x)=(f1(y,x),f2(y,x),...,fK(y,x))T
120 | """
121 | pass
122 |
123 | def cal_M(self, x):
124 | """计算给定观察x的前提下的M矩阵"""
125 | # M是各个时间步上的状态转移矩阵,即M=(M1,M2,...,MT)
126 | # 形如
127 | # [
128 | # 第一个时间步 [ 第一个时间步处于状态1 第一个时间步处于状态2 ... 第一个时间步处于状态n
129 | # 第零个时间步处于状态1 M11 M12 M1n
130 | # 第零个时间步处于状态2 M21 M22 M2n
131 | # ...
132 | # 第零个时间步处于状态n Mn1 Mn2 Mnn
133 | # ]
134 | # 第二个时间步 [ 第二个时间步处于状态1 第二个时间步处于状态2 ... 第二个时间步处于状态n
135 | # 第一个时间步处于状态1 M11 M12 M1n
136 | # 第一个时间步处于状态2 M21 M22 M2n
137 | # ...
138 | # 第一个时间步处于状态n Mn1 Mn2 Mnn
139 | # ]
140 | # ...
141 | # 第T+1个时间步 [ 第T+1个时间步处于状态1 第T+1个时间步处于状态2 ... 第T+1个时间步处于状态n
142 | # 第T个时间步处于状态1 M11 M12 M1n
143 | # 第T个时间步处于状态2 M21 M22 M2n
144 | # ...
145 | # 第T个时间步处于状态n Mn1 Mn2 Mnn
146 | # ]
147 | # ]
148 | # 而Mij=f1(yi,yj,x,1) + f2(yi,yj,x,1) + ...
149 | # feature_matrix = np.zeros(shape=(self.n, self.n))
150 | T = len(x)
151 |
152 | M = []
153 | for test_i in range(T + 1):
154 | M_t = []
155 |             test_x = x[test_i] if test_i < T else None  # 第T+1个位置是虚拟的终点,没有对应的观察
156 | for test_y_prev, test_y_next in product(range(self.n), range(self.n)):
157 | score = 0 # 在x下,y_prev, y_next, i在特征函数下的得分
158 | for w, f in zip(self.w, self.ff):
159 |                     score += w * f.cal_single(test_y_prev, test_y_next, test_x, test_i)  # FeatureFunc本身不可调用,用cal_single计算
160 | M_t.append(score)
161 | M_t = np.array(M_t).reshape((self.n, self.n)) # 其实到这里,仅仅是书上的W矩阵
162 | M_t = np.exp(M_t)
163 | M.append(M_t)
164 | M = np.array(M)
165 | return M
166 |
167 | def inference(self, x, y):
168 | """给定x,y,求Pw(y|x),利用M矩阵"""
169 |         T = len(x)
170 |         M = self.cal_M(x)
171 |         Zw = reduce(np.dot, M)[0, 0]  # 书上取连乘结果的(start, stop)元素;这里把虚拟的start、stop都约定为状态0
172 |         numerator = 1
173 |         y_prev = 0  # 虚拟起点start(约定为状态0)
174 |         for i, y_next in enumerate(y):
175 |             numerator *= M[i, y_prev, y_next]
176 |             y_prev = y_next
177 |         return numerator * M[T, y_prev, 0] / Zw  # 最后再乘上转移到虚拟终点stop的一项
178 |
179 | def fit(self):
180 | """
181 | 这里用拟牛顿法
182 | 输入:
183 | 1. 原始func: Pw(y|x),注意这里的func的参数是w,而训练集(X,Y)其实是常数了
184 | 2. func的梯度grad:同样的,grad也是w的梯度
185 | 将1、2传入给BFGS函数,求得最后的w
186 | :return:
187 | """
188 |
189 | def loss(w):
190 | # 先算f(w)的第一项
191 | term1 = 0
192 | for x, x_prob in self.x_prob.items():
193 |                 exp = 0
194 |                 for (x_, y) in self.x_y_prob:  # 在训练集中出现的(x,y)(近似:只对训练集中出现过的y求和)
195 |                     if x_ == x:
196 |                         score = sum(w_k * ff_k.cal_sequence(x, y) for w_k, ff_k in zip(w, self.ff))
197 |                         exp += np.exp(score)
198 |                 term1 += x_prob * np.log(exp)
199 |
200 | term2 = 0
201 | for (x, y), x_y_prob in self.x_y_prob.items():
202 | # 计算
203 | score = 0
204 | for w_k, ff_k in zip(w, self.ff):
205 |                     score += w_k * ff_k.cal_sequence(x, y)
206 | term2 += x_y_prob * score
207 | cost = term1 - term2
208 | return cost
209 |
210 | def grad_loss(w):
211 | self.w = w # todo
212 | grad = []
213 | for w_k, ff_k in zip(self.w, self.ff):
214 | score = 0
215 | for (x, y), x_y_prob in self.x_y_prob.items():
216 | score += ff_k.cal_sequence(x, y)
217 | M = self.cal_M(x)
218 | Zm = reduce(np.dot, M)
219 | alpha = forward(M, len(x))
220 | beta = backward(M, len(x))
221 | # todo: 还没弄完
222 | return 0
223 |
224 | self.w, _ = BFGS(loss, grad_loss, self.w, self.epsilon)
225 |
226 | def probability_single(self, x, i):
227 | """
228 | :param x: 已知的观察
229 | :param i: 位置
230 | :return:
231 | """
232 | M = self.cal_M(x)
233 | # 在当前x下扫描记录α和β
234 | alpha = []
235 | alpha_0 = 1 # 书上这里设置的y_0=start时才为1,否则为0,但我没有想出有不为start的必要
236 | alpha.append(alpha_0)
237 |
238 |
239 | if __name__ == '__main__':
240 | pass
241 |
--------------------------------------------------------------------------------
/Chapter11/backward.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 |
4 |
5 | def backward(M_x, t):
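    """
    与Chapter11/forward.py中的前向向量α相对称:从β_{T+1}=全1开始,按 β_i = M_{i+1} · β_{i+1} 逐步向前递推。
    注意:本函数沿途打印每个β,但最终只返回最后一次递推得到的β向量。
    """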
6 | beta = []
7 | beta_prev = np.ones(M_x.shape[1])
8 | beta.append(beta_prev)
9 | print(f"初始β={beta_prev}")
10 | for i in range(t - 1, -1, -1):
11 | beta_next = beta_prev
12 | M_t = M_x[i + 1]
13 | beta_prev = np.dot(M_t, beta_next)
14 | beta.append(beta_prev)
15 | print(f"β{i}={beta_prev}")
16 | return beta_prev
17 |
18 |
19 | def demo():
20 | M = [
21 | [
22 | [.5, .5],
23 | [.0, .0]
24 | ],
25 | [
26 | [.7, .3],
27 | [.4, .6]
28 |
29 | ],
30 | [
31 | [.2, .8],
32 | [.5, .5]
33 | ],
34 | [
35 | [.9, .1],
36 | [.8, .2]
37 | ]
38 | ]
39 | M = np.array(M)
40 | beta = backward(M, 3)
41 | print(beta)
42 |
43 |
44 | if __name__ == '__main__':
45 | demo()
46 |
--------------------------------------------------------------------------------
/Chapter11/forward.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 为什么CRF算概率时,不像HMM那样只计算P(I|O,λ),而是计算
4 | P(yi|x)和P(y_{i-1},yi|x)?
5 | """
6 | import numpy as np
7 |
8 |
9 | def forward(M_x, t):
10 | """
11 |     根据M矩阵,来计算前向向量αi(yi|x):即在第i个位置(第i个时间步)状态为yi,且
12 | 截止到第i个位置,观察为(x0,x1, ... xi)的概率。
13 | yi的取值有n个,所以α.shape = (n, )
14 |
15 | 注意,书上的M矩阵,指的是从位置i=1,2,..., T+1,各有一个Mi矩阵。
16 | 从而M矩阵由T+1个Mi矩阵组成
17 | """
18 | # 书上写的是当y0=start时,才为1,但笔者想不出有什么必要,因为这个start事实上也是虚构头,
19 | # 这个虚构头的状态为任意一个,我们都无所谓才对,所以这里概率都取为1
20 | alpha = []
21 | alpha_next = np.ones(M_x.shape[1])
22 | alpha.append(alpha_next)
23 | print(f"初始α={alpha_next}")
24 | for i in range(t):
25 | alpha_prev = alpha_next
26 | M = M_x[i]
27 | alpha_next = np.dot(alpha_prev, M)
28 | alpha.append(alpha_next)
29 | print(f"α{i}={alpha_next}")
30 | return alpha
31 |
32 |
33 | def demo():
34 | M = [
35 | [
36 | [.5, .5],
37 | [.0, .0]
38 | ],
39 | [
40 | [.7, .3],
41 | [.4, .6]
42 |
43 | ],
44 | [
45 | [.2, .8],
46 | [.5, .5]
47 | ],
48 | [
49 | [.9, .1],
50 | [.8, .2]
51 | ]
52 | ]
53 | M = np.array(M)
54 | alpha = forward(M, 3)
55 | print(alpha)
56 |
57 |
58 | if __name__ == '__main__':
59 | demo()
60 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | 本项目复现李航《统计学习方法》每一章节的算法
2 |
3 | # 特点:
4 | - 笔记摘要:在每个文件开头都会有一些核心的摘要
5 | - pythonic:这里会用尽可能规范的方式来实现,包括编程风格几乎严格按照PEP8
6 | - 循序渐进:前期的算法会用list的方式来做计算,可读性比较强;后期几乎完全用numpy.array来计算,并辅以详细的注释。
7 |
8 | # 完成情况:
9 | - ✅ perceptron
10 | - ✅ KNN
11 | - ✅ naive baysian
12 | - ✅ 决策树
13 | - ✅ 逻辑斯蒂回归
14 | - [ ] SVM
15 | - ✅ Adaboost
16 | - ✅ GMM
17 | - ✅ HMM
18 | - [ ] CRF
19 |
20 | # requirements
21 | python 3.7
22 | sklearn 0.21.3
23 | numpy 1.17.2
24 | matplotlib 3.1.1
25 |
--------------------------------------------------------------------------------