├── .idea
│   ├── .gitignore
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── statisticml.iml
├── Chapter02
│   └── perceptron.py
├── Chapter03
│   ├── KNN.py
│   └── kd_tree.py
├── Chapter04
│   └── naive_baysian.py
├── Chapter05
│   ├── classify_decision_tree.py
│   └── tree.py
├── Chapter06
│   ├── LR.py
│   ├── MEM.py
│   └── logistic_regression.py
├── Chapter07
│   └── SVM.py
├── Chapter08
│   └── Adaboost.py
├── Chapter09
│   └── GMM.py
├── Chapter10
│   ├── HMM.py
│   ├── backward.py
│   ├── baum_welch.py
│   ├── forward.py
│   └── viterbi.py
├── Chapter11
│   ├── BFGS.py
│   ├── CRF.py
│   ├── backward.py
│   └── forward.py
└── readme.md
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/Chapter02/perceptron.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 感知机(以下标号跟书上的章节标号没有关系,后同)
4 | 1. 感知机的出发点是什么:找到一个平面尽可能将正实例、负实例分别分到平面两侧,即对y=+1的点,w·x+b>0,反之<0
5 | 2. 平面的表示形式:y = w·x + b
6 | 3. 1中"尽可能"该如何表达:误分类点的个数越少越好。但这个个数不是w,b的导数,不易优化;改为所有误分类点和平面的总距离尽可能小
7 | 4. 误分类点怎么表达:-y(w·x+b)>0
8 | 5. 故目标函数:L(w,b)=-Σ_{(x,y)属于误分类点} [y(w·x+b)]
9 | 6. 最小化目标函数的方法,求偏导,梯度下降
10 | ------到此为止,足以写出代码,但还需要学习以下内容------
11 | 7. 算法的收敛性
12 | """
13 |
14 |
15 | def sgd_perceptron(w, b, x, y, lr=1):
16 | """
17 | 根据误分类实例(x,y)更新参数w, b。仅用于感知机
18 | """
19 | w = [w_i + lr * x_i * y for w_i, x_i in zip(w, x)]
20 | b += lr * y
21 | return w, b
22 |
23 |
24 | class Perceptron:
25 | def __init__(self, max_epoch=1000):
26 | self.w = []
27 | self.b = 0
28 | self.max_epoch = max_epoch
29 |
30 | def fit(self, X, Y):
31 | self.w = [0] * len(X[0])
32 |
33 | epoch = 0
34 | while True:
35 | epoch += 1
36 | all_right = True # 全都被正确分类
37 | for x, y in zip(X, Y):
38 |             if (sum([w_i * x_i for w_i, x_i in zip(self.w, x)]) + self.b) * y <= 0:  # 误分类点: y(w·x+b) <= 0
39 | print(f"误分类点为{(x, y)}")
40 | self.w, self.b = sgd_perceptron(self.w, self.b, x, y)
41 | all_right = False # 进入这个if意味着有点没有被正确分类,all_right置为False
42 | break
43 | # 如果经过上述的循环,确实每个点都正确分类,那么可以跳出while循环
44 | # 或者这个训练集就是无法通过一个超平面分割,那么循环再多次也无法达到all_right,我们设定一个最大循环次数
45 | if all_right or epoch > self.max_epoch:
46 | break
47 |
48 | def predict(self, X):
49 | return [self.predict_single(x) for x in X]
50 |
51 | def predict_single(self, x):
52 | if sum([w_i * x_i for w_i, x_i in zip(self.w, x)]) + self.b > 0:
53 | return 1
54 | else:
55 | return -1
56 |
57 |
58 | def demo():
59 | X = [
60 | [3, 3],
61 | [4, 3],
62 | [1, 1]
63 | ]
64 | Y = [
65 | 1,
66 | 1,
67 | -1
68 | ]
69 | clf = Perceptron(max_epoch=20)
70 | clf.fit(X, Y)
71 | print(f"w={clf.w}, b={clf.b}")
72 | print(f"预测结果{clf.predict(X)}")
73 |
74 |
75 | if __name__ == '__main__':
76 | demo()
77 |
--------------------------------------------------------------------------------
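Note on Chapter02/perceptron.py: the docstring reduces training to one SGD step per misclassified point, w ← w + lr·y·x and b ← b + lr·y. A minimal hand-worked check of a single update (hypothetical numbers, lr=1), reusing sgd_perceptron from the file above:

    # initially w=0, b=0, so for (x, y) = ([3, 3], 1) we have y*(w·x+b) = 0 <= 0, i.e. misclassified
    w, b = [0, 0], 0
    w, b = sgd_perceptron(w, b, [3, 3], 1, lr=1)
    print(w, b)  # -> [3, 3] 1: w moved by lr*y*x, b by lr*y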
/Chapter03/KNN.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | KNN
4 | 目的:对于给定目标点A,想要知道A的label
5 | 方法:找到训练集中最近的k个点,我们认为这k个点的类别最多的类就是这个点A的类别
6 | 也就是说KNN的核心概念只有3个:
7 | 1. 距离最近
8 | 2. k个
9 | 3. 用这k个点的频数最高的label作为目标点A的label预测。
10 |
11 | 问题:
12 | 但是如果遍历搜索所有训练集中的点,来找到最近的距离,这样很耗时。怎么办?
13 | 答案:
14 | 这就是KDTree的意义,它就是让我们搜索得快一点的办法
15 | 所以需要知道,KDTree本质上只是我们为了快速搜索最近k个点的实现手段,它本身不是KNN,只是KDTree这种数据结构具有快速
16 | 搜索最近k个点的优点。
17 |
18 | """
19 | from collections import Counter
20 |
21 | from Chapter03.kd_tree import KDTree
22 |
23 |
24 | class KNN:
25 | """KNN = k nearest neighbour"""
26 |
27 | def __init__(self, k):
28 | self.k = k
29 | self.model = None
30 |
31 | def fit(self, X, Y):
32 | """用KDTree方法来拟合数据,构建模型"""
33 | self.model = KDTree(X, Y)
34 |
35 | def predict_single(self, x):
36 | # 找到包含节点的叶节点
37 | knn_list = self.model.search(x, self.k)
38 | label_list = [i[1][1] for i in knn_list]
39 | label_count = Counter(label_list)
40 | return sorted(label_count.items(), key=lambda t: t[1])[-1][0]
41 |
42 | def predict(self, X):
43 | return [self.predict_single(x) for x in X]
44 |
45 |
46 | def demo():
47 | my_X = [
48 | [2, 3],
49 | [5, 4],
50 | [7, 2],
51 | [9, 6],
52 | [8, 1],
53 | [4, 7]
54 | ]
55 | my_Y = [
56 | 0,
57 | 1,
58 | 1,
59 | 0,
60 | 1,
61 | 0
62 | ]
63 | knn = KNN(2)
64 | knn.fit(my_X, my_Y)
65 | print(knn.model)
66 | print(knn.predict(my_X))
67 |
68 |
69 | def demo2():
70 | my_X = [
71 | [6.27, 5.5],
72 | [1.24, -2.86],
73 | [17.05, -12.79],
74 | [-6.88, -5.4],
75 | [-2.96, -0.5],
76 | [-4.6, -10.55],
77 | [-4.96, 12.61],
78 | [1.75, 12.26],
79 | [7.75, -22.68],
80 | [10.8, -5.03],
81 | [15.31, -13.16],
82 | [7.83, 15.70],
83 | [14.63, -0.35],
84 | ]
85 |
86 | my_Y = [
87 | 1,
88 | 1,
89 | 0,
90 | 1,
91 | 1,
92 | 0,
93 | 1,
94 | 1,
95 | 0,
96 | 1,
97 | 0,
98 | 1,
99 | 0
100 | ]
101 |
102 | knn = KNN(k=1)
103 | knn.fit(my_X, my_Y)
104 | print(knn.model)
105 | print(knn.predict(my_X))
106 |
107 |
108 | if __name__ == '__main__':
109 | demo2()
110 |
--------------------------------------------------------------------------------
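Note on Chapter03/KNN.py: the docstring stresses that the KDTree is only a faster way to run the same nearest-neighbour search. A brute-force reference (hypothetical helper, not part of the repo) is useful for sanity-checking KNN.predict on small data:

    import math
    from collections import Counter

    def knn_brute_force(X, Y, x, k):
        """O(n) reference: sort every training point by distance to x and vote over the k closest."""
        neighbours = sorted(zip(X, Y), key=lambda xy: math.dist(xy[0], x))[:k]
        return Counter(y for _, y in neighbours).most_common(1)[0][0]

    # e.g. knn_brute_force(my_X, my_Y, [6.27, 5.5], k=1) should agree with KNN(k=1).predict on the same point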
/Chapter03/kd_tree.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import math
3 | from heapq import heappop, heappush, nsmallest
4 |
5 |
6 | def find_middle(X, Y, dim):
7 | """
8 | 找到dim纬度上处于中位数的实例,返回这个实例和更小、更大的X,Y
9 | :param X:
10 | :param Y:
11 | :param dim:
12 | :return:
13 | """
14 | # print(X, Y)
15 | sorted_X_Y = sorted(zip(X, Y), key=lambda x_and_y: x_and_y[0][dim])
16 | middle_index = len(X) >> 1
17 | middle = sorted_X_Y[middle_index]
18 |
19 | smaller = sorted_X_Y[:middle_index]
20 | bigger = sorted_X_Y[middle_index + 1:]
21 |
22 | smaller_X, smaller_Y = [i[0] for i in smaller], [i[1] for i in smaller]
23 | bigger_X, bigger_Y = [i[0] for i in bigger], [i[1] for i in bigger]
24 | smaller_X, smaller_Y, bigger_X, bigger_Y = list(smaller_X), list(smaller_Y), list(bigger_X), list(bigger_Y)
25 | return middle, smaller_X, smaller_Y, bigger_X, bigger_Y
26 |
27 |
28 | def l2(x1, x2):
29 | return math.sqrt(sum([(x_1_i - x_2_i) ** 2 for x_1_i, x_2_i in zip(x1, x2)]))
30 |
31 |
32 | class Node:
33 | """Node的实例代表KDTree的一个节点"""
34 |
35 | def __repr__(self):
36 | return f"深度为{self.level}, 以第{self.dim}个特征作为分割标准, 实例点为{self.instance}"
37 |
38 | def __init__(self, instance, level=0):
39 | self.instance = instance
40 | self.level = level
41 | self.left = None
42 | self.right = None
43 | self.parent = None
44 |
45 | @property
46 | def dim(self):
47 | return self.level % len(self.instance)
48 |
49 | @property
50 | def is_leaf(self):
51 | return self.left is None and self.right is None
52 |
53 | @property
54 | def brother(self):
55 | if self.parent is None:
56 | return None
57 | if self.parent.left is self: # 当自己是父节点的左子节点,则兄弟节点为父节点的右节点
58 | return self.parent.right
59 | return self.parent.left # 反之
60 |
61 | def plane_distance(self, x):
62 | """节点所代表的超平面与目标点的距离"""
63 | return abs(x[self.dim] - self.instance[0][self.dim])
64 |
65 | def point_distance(self, x):
66 | return l2(self.instance[0], x)
67 |
68 | def find_leaf(self, x):
69 | node = self
70 | while not node.is_leaf:
71 | if node.left is None:
72 | node = node.right
73 | elif node.right is None:
74 | node = node.left
75 | elif x[node.dim] < node.instance[0][node.dim]:
76 | node = node.left
77 | else:
78 | node = node.right
79 | return node
80 |
81 |
82 | class KDTree:
83 | def __repr__(self):
84 | representation = ""
85 | queue = [self.root]
86 | while queue:
87 | node = queue.pop(0)
88 | representation += str(node)
89 | representation += '\n'
90 | if node.left:
91 | queue.append(node.left)
92 | if node.right:
93 | queue.append(node.right)
94 | return representation
95 |
96 | def __init__(self, X, Y):
97 | def _build_node(_X, _Y, _level, _dim):
98 | """递归地方式构建节点"""
99 | _middle, _smaller_X, _smaller_Y, _bigger_X, _bigger_Y = find_middle(_X, _Y, _dim)
100 | # print(_middle, _smaller_X, _smaller_Y, _bigger_X, _bigger_Y)
101 | _node = Node(_middle, _level)
102 | _next_level = _level + 1
103 | _next_dim = _next_level % len(_middle)
104 | if _smaller_X:
105 | _node.left = _build_node(_smaller_X, _smaller_Y, _next_level, _next_dim)
106 | if _bigger_X:
107 | _node.right = _build_node(_bigger_X, _bigger_Y, _next_level, _next_dim)
108 | return _node
109 |
110 | self.root = _build_node(X, Y, 0, 0)
111 | # 递归设置父节点
112 | queue = [self.root]
113 | while queue:
114 | node = queue.pop(0)
115 | if node.left:
116 | node.left.parent = node
117 | queue.append(node.left)
118 | if node.right:
119 | node.right.parent = node
120 | queue.append(node.right)
121 |
122 | def search(self, x, k):
123 | """找到最接近x的k个实例"""
124 |
125 | def backtrack(root, knn_list, is_visited):
126 | if root is self.root and root in is_visited:
127 | return
128 |
129 | node = root.find_leaf(x)
130 | is_visited.append(node)
131 | dist = node.point_distance(x)
132 |
133 | if len(knn_list) < k:
134 | # record = (-距离, 实例点),heappush构造的是小顶堆,而我们想知道的是最大距离点,故对距离取相反数
135 | heappush(knn_list, (-dist, node.instance))
136 | else:
137 | # 先比较这个叶节点是否比knn_list中最远点近,是的话替换,否则不换
138 | farthest_dist, farthest_point = nsmallest(1, knn_list)[0]
139 | if -farthest_dist > dist:
140 | heappop(knn_list)
141 | heappush(knn_list, (-dist, node.instance))
142 |
143 | # 往上寻找没有被访问过的父节点,并将兄弟节点取出备用
144 | brother = node.brother
145 | node = node.parent
146 | while node in is_visited and node.parent:
147 | brother = node.brother
148 | node = node.parent
149 | # 如果遍历到顶
150 | if node is self.root and node in is_visited:
151 | return
152 |
153 | while True:
154 | # 否则计算父节点是否能满足条件、并把父节点计入被访问列表
155 | is_visited.append(node)
156 | dist = node.point_distance(x)
157 | if len(knn_list) < k:
158 | # record = (距离, 实例点)
159 | # heappush构造的是小顶堆,而我们想知道的是最大距离点,故对距离取相反数
160 | heappush(knn_list, (-dist, node.instance))
161 | else:
162 | # 先比较这个叶节点是否比knn_list中最远点近,是的话替换,否则不换
163 | farthest_dist, farthest_point = nsmallest(1, knn_list)[0]
164 | if -farthest_dist > dist:
165 | heappop(knn_list)
166 | heappush(knn_list, (-dist, node.instance))
167 |
168 | # 再看超平面
169 | farthest_dist, farthest_point = nsmallest(1, knn_list)[0]
170 | if (node.plane_distance(x) < -farthest_dist or len(knn_list) < k) and brother is not None:
171 | backtrack(brother, knn_list, is_visited)
172 | break
173 | else:
174 | while node in is_visited and node.parent:
175 | brother = node.brother
176 | node = node.parent
177 | # 如果遍历到顶
178 | if node is self.root and node in is_visited:
179 | return
180 |
181 | _knn_list = []
182 | _is_visited = []
183 | backtrack(self.root, _knn_list, _is_visited)
184 | print(_knn_list)
185 | return _knn_list
186 |
--------------------------------------------------------------------------------
/Chapter04/naive_baysian.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from collections import Counter
3 |
4 | """
5 | 朴素贝叶斯
6 | 0. 在实现朴素贝叶斯的时候,笔者已经是第N次回顾朴素贝叶斯了,但直到这一次才开始有意识地将它与上一章的感知机做一些对比,
7 | 它也给了笔者一些收获。这种与前面的模型/方法做比较的意识,将贯彻整个repository。
8 | 1. 朴素贝叶斯的出发点是什么:当已知特征x的条件下,求概率最高的y,所以需要对P(y|x)建模。
9 | 而回顾下上一章,感知机的建模是f(x)。
10 | 2. 怎么建模: 根据贝叶斯公式:P(y|x)=P(x,y) / P(x)
11 | =[P(x|y) * P(y)] / [Σ_{y_i}P(x,y_i)]
12 | =[P(x|y) * P(y)] / [Σ_{y_i}P(x|y_i) * P(y_i)]
13 | 故需要对P(x|y)和P(y)建模 --> 为什么不能直接对P(y|x)建模,而可以反过来对P(x|y)建模 (其实可以!看看逻辑斯蒂回归)
14 | 但这里的任务转化为P(x|y)和P(y)建模后,这个模型必须得具备为P(x|y)和P(y)建模的能力才说得过去!
15 | 这就是"朴素贝叶斯法"的贝叶斯。
16 | 3. 进一步地,在P(x|y)中,x可能是多维特征,实际上这些特征可能是有关系的。
17 | 但朴素贝叶斯做了一个简单的、天真的、朴素的假设:特征之间没有关系。
18 | 这就是"朴素贝叶斯"的朴素之处。但是这个朴素的假设有什么用呢 (问题A的答案,下面揭晓)
19 | 4. 剩下的问题就是如何为P(x|y)和P(y)建模了
20 | 4.1 使用极大似然估计法估计相应的概率
21 | 4.1.2 P(y)用频数即可
22 | 4.1.3 P(x|y) = P(x1, x2, ..., xn|y)
23 | = P(x1|y) * P(x2|y) * ... * P(xn|y) (从上一行到这一行就是基于朴素的"特征之间没有关系"的假设)
24 |              = [频数(x1, y) / 频数(y)] * [频数(x2, y) / 频数(y)] * ... * [频数(xn, y) / 频数(y)]
25 | 这里就是朴素假设的用途了,通过这个朴素假设,我们可以通过简单地估计各个P(xi|y)来达到目的
26 | # todo: P(y|x) = P(y|x1) * P(y|x2) * ... * P(y|xn)???
27 | 4.2 使用贝叶斯估计来避免概率为0的情况
28 | 5. 对比下感知机和朴素贝叶斯法。朴素贝叶斯有一步很特别,就是它对P(x,y)建模了,
29 | 换句话说,原则上它掌握了(x,y)的生成规律,可以用来生成数据。我们把这类模型叫做生成模型
30 | 后续的逻辑斯蒂回归直接对P(y|x)建模,则没有这个生成的过程!
31 | todo: 为什么我们需要对这个特性那么在意?有什么好处吗?
32 | """
33 |
34 |
35 | class NaiveBaysian:
36 | def __init__(self):
37 |         """
38 |         初始化模型参数
39 |         先验概率、条件概率、y的取值集合都在fit时根据训练集填充
40 |         """
41 | self.prior_proba = {}
42 | self.conditional_proba = []
43 | self.y_options = {}
44 |
45 | def fit(self, X, Y):
46 | Y_counts = dict(Counter(Y))
47 | self.prior_proba = {y: count / len(Y) for y, count in Y_counts.items()}
48 | self.y_options = set(Y)
49 |
50 | for i in range(len(X[0])):
51 | X_i = [x[i] for x in X]
52 | X_i_Y = list(zip(X_i, Y))
53 | X_i_Y_count = dict(Counter(X_i_Y))
54 | # P(xi, yi)
55 | X_i_Y_proba = {x_i_y: count / len(Y) for x_i_y, count in X_i_Y_count.items()}
56 | # P(xi|yi) = P(xi,yi) / P(yi)
57 | conditional_proba = {x_i_y: proba / self.prior_proba[x_i_y[1]] for x_i_y, proba in # x_i_y[1]就是y
58 | X_i_Y_proba.items()}
59 | self.conditional_proba.append(conditional_proba)
60 | # 最后self.conditional_proba形如
61 | # [
62 | # 第一个特征的条件概率:P(x1|y)={(x1=a, y): p1, (x1=b,y): p2, ..., (x1=z,y): pn}, # 这里的(x1=a,y)代表x1=a|y
63 | # 第二个特征的条件概率:P(x2|y)={(x1=a, y): p1, (x2=b,y): p2, ..., (x2=z,y): pn},
64 | # ...
65 | # 最后的特征的条件概率:P(xm|y)={(xm=a, y): p1, (xm=b,y): p2, ..., (xm=z,y): pn},
66 | # ]
67 |
68 | def predict_single(self, x):
69 | assert len(x) == len(self.conditional_proba)
70 | y_result = 0
71 | proba_result = 0
72 | for y in self.y_options:
73 | prior_proba = self.prior_proba.get(y, 0) # 这里要防止训练集中没有出现y
74 | conditional_proba = 1
75 | for idx, x_i in enumerate(x):
76 | conditional_proba *= self.conditional_proba[idx].get((x_i, y), 0) # 这里要防止训练集中没有出现(x_i, y)
77 | proba = prior_proba * conditional_proba
78 | if proba > proba_result:
79 | proba_result = proba
80 | y_result = y
81 | return y_result
82 |
83 | def predict(self, X):
84 | return [self.predict_single(x) for x in X]
85 |
86 |
87 | def demo():
88 | X = [
89 | [1, 'S'],
90 | [1, 'M'],
91 | [1, 'M'],
92 | [1, 'S'],
93 | [1, 'S'],
94 | [2, 'S'],
95 | [2, 'M'],
96 | [2, 'M'],
97 | [2, 'L'],
98 | [2, 'L'],
99 | [3, 'L'],
100 | [3, 'M'],
101 | [3, 'M'],
102 | [3, 'L'],
103 | [3, 'L'],
104 | ]
105 | Y = [
106 | -1,
107 | -1,
108 | -1,
109 | -1,
110 | -1,
111 | -1,
112 | -1,
113 | 1,
114 | 1,
115 | 1,
116 | 1,
117 | 1,
118 | 1,
119 | 1,
120 | -1
121 | ]
122 | nb = NaiveBaysian()
123 | nb.fit(X, Y)
124 | prediction = nb.predict(X)
125 | print(prediction)
126 | print(f"正确率为{sum([1 if i == j else 0 for i, j in zip(prediction, Y)]) / len(prediction)}")
127 |
128 |
129 | if __name__ == '__main__':
130 | demo()
131 |
--------------------------------------------------------------------------------
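Note on Chapter04/naive_baysian.py: point 4.2 of the docstring mentions Bayesian estimation (Laplace smoothing) to avoid zero probabilities, but the class above simply falls back to probability 0 via .get(..., 0). A sketch of the smoothed estimate with λ = 1 (hypothetical helper, not wired into the class):

    def smoothed_conditional_proba(X_i, Y, x_value, y_value, lam=1.0):
        """P_λ(X_i = x_value | Y = y_value) = (count(x_value, y_value) + λ) / (count(y_value) + S_i*λ),
        where S_i is the number of distinct values feature i takes in the training set."""
        S_i = len(set(X_i))
        joint = sum(1 for x, y in zip(X_i, Y) if x == x_value and y == y_value)
        prior = sum(1 for y in Y if y == y_value)
        return (joint + lam) / (prior + S_i * lam)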
/Chapter05/classify_decision_tree.py:
--------------------------------------------------------------------------------
1 | # !/Applications/anaconda/envs/4PyCharm/bin/python3.4
2 | # -*- coding: utf-8 -*-
3 | import math
4 | from collections import Counter, deque
5 | from functools import reduce
6 |
7 |
8 | def calculate_entropy(labels):
9 | """
10 | 计算label集的熵
11 | :param labels: list
12 | :return: 熵: float
13 | """
14 | total = len(labels)
15 |
16 | # 每个类的数量,计算熵的时候,类本身并不重要,重要的是每个类各种的数量/比例
17 | counter_of_every_class = Counter(labels).values()
18 | # 每个类的比例
19 | scale_of_every_class = map(lambda x: x / total, counter_of_every_class)
20 | res = sum(map(lambda i: -i * math.log(i), scale_of_every_class))
21 | # my_print(res)
22 | return res
23 |
24 |
25 | class _Node:
26 | """
27 | 树的节点,每个节点用来fit一个特征
28 | """
29 |
30 | def __init__(self, epsilon=0.1):
31 | self.epsilon = epsilon
32 | self.label = None
33 | self.idx_feature = None # idx_feature用来记载这个节点选择了哪个特征分量来拆分树
34 | self.child_dict = {} # 选择了特征分量,按照这个特征分量的n个取值划分出若干子集合,这个节点的子节点分别一个子集合
35 |
36 | def fit(self, features, labels):
37 | """
38 | :param features: X = 样本 * [特征0, 特征1, ……]
39 | :param labels: Y = 样本 * label
40 | :return:
41 | """
42 |
43 | assert len(features) == len(labels), "X和Y的个数不一致"
44 |
45 | # 当labels都为一样的,这个节点就有自己的label了,没有子节点
46 | if len(set(labels)) == 1:
47 | self.label = labels[0]
48 | return
49 |
50 | # 如果已经没有特征的话,跟上面一样
51 | num_features = len(features[0]) # 特征的个数
52 | if not num_features:
53 | self.label = Counter(labels).most_common(1)[0][0] # 计数,然后选最多的那个
54 | return
55 |
56 | """
57 | 计算每个特征列的信息熵
58 | """
59 | cols = [[sample[idx] for sample in features] for idx in range(num_features)]
60 | entropy_list = []
61 | for col in cols: # 对于每个特征列
62 | set_of_types_in_col = set(col)
63 | total_entropy = 0
64 | for s in set_of_types_in_col: # 对于这个特征列的每个取值
65 | subset = [label for c, label in zip(col, labels) if c == s]
66 | total_entropy += calculate_entropy(subset) * (len(subset) / len(labels))
67 | entropy_list.append(total_entropy)
68 |
69 | # 挑选出【使得分割后集合的信息熵最少】的特征
70 | min_idx, min_entropy = reduce(lambda x, y: x if x[1] < y[1] else y, enumerate(entropy_list))
71 |
72 | """
73 | 这个特征会使得互信息最大(信息不确定性的减少最多)
74 | 如果连这个互信息都达不到epsilon,我们认为每个特征都提供不了多少信息,那再继续分支也没有什么价值
75 | 所以直接取占比最高的类作为这个节点的label
76 | """
77 | if calculate_entropy(labels) - min_entropy < self.epsilon:
78 | self.label = Counter(labels).most_common(1)[0][0]
79 | return
80 |
81 | # 否则就挑选这个特征
82 | self.idx_feature = min_idx
83 |
84 | # 挑选之后,按照这个特征的n个取值,它会产生n个子节点
85 | # 同时我们需要划分集合
86 | # 每个子节点(child)对应处理一个子集(sub_feature和sub_labels)
87 | set_n_value = set([sample[min_idx] for sample in features]) # n个取值的集合,形如{0, 1}、{1, 2, 3}这样
88 | for value in set_n_value:
89 | sub_features = [] # 子特征集
90 | sub_labels = [] # 子label集
91 | for sample, label in zip(features, labels):
92 | if sample[min_idx] == value:
93 | sub_features.append(sample[:min_idx] + sample[min_idx + 1:])
94 | sub_labels.append(label)
95 | child = _Node(epsilon=self.epsilon)
96 | child.fit(sub_features, sub_labels)
97 | self.child_dict[value] = child
98 |
99 | def __str__(self):
100 | node_information = f"node's idx_feature={self.idx_feature}\n" \
101 | f"node's child_dict={self.child_dict}\n" \
102 | f"node's label={self.label}\n"
103 | return node_information
104 |
105 |
106 | class ClassifyDecisionTree(_Node):
107 | """
108 | 分类决策树
109 | """
110 |
111 | def predict(self, feature):
112 | """
113 | 预测数据
114 | :param feature: 特征
115 | :return: 预测的结果
116 | """
117 | print('*' * 10, '预测正在进行', '*' * 10)
118 | node = self
119 | while node.label is None: # 注意不能用while not node.label,因为label可能为0
120 | to_delete_idx = node.idx_feature
121 | node = node.child_dict[feature[node.idx_feature]]
122 | feature.pop(to_delete_idx)
123 | return node.label
124 |
125 |
126 | if __name__ == "__main__":
127 | # 《统计学习方法》的贷款申请样本数据表
128 | sample_with_labels = [
129 | [[0, 0, 0, 0], 0],
130 | [[0, 0, 0, 1], 0],
131 | [[0, 1, 0, 1], 1],
132 | [[0, 1, 1, 0], 1],
133 | [[0, 0, 0, 0], 0],
134 | [[1, 0, 0, 0], 0],
135 | [[1, 0, 0, 1], 0],
136 | [[1, 1, 1, 1], 1],
137 | [[1, 0, 1, 2], 1],
138 | [[1, 0, 1, 2], 1],
139 | [[2, 0, 1, 2], 1],
140 | [[2, 0, 1, 1], 1],
141 | [[2, 1, 0, 1], 1],
142 | [[2, 1, 0, 2], 1],
143 | [[2, 0, 0, 0], 0],
144 | ]
145 | test_features = [i[0] for i in sample_with_labels]
146 | test_labels = [i[1] for i in sample_with_labels]
147 | cdt = ClassifyDecisionTree(epsilon=0.1)
148 | cdt.fit(test_features, test_labels)
149 | print(cdt.predict([0, 1, 0, 0]))
150 |
151 | """
152 | 用队列来先序遍历决策树的节点,打印出来
153 | 方便按照打印信息来验证自己的树
154 | """
155 | q = deque([cdt])
156 | while q:
157 |         if q[0].label is not None:  # label可能为0,不能直接用真值判断
158 | print(q.popleft())
159 | else:
160 | q.extend(q[0].child_dict.values())
161 | print(q.popleft())
162 |
--------------------------------------------------------------------------------
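Note on Chapter05/classify_decision_tree.py: choosing the feature with the smallest conditional entropy is the same as maximising information gain g(D, A) = H(D) − H(D|A). A small sketch (hypothetical helper) that makes the criterion explicit, reusing calculate_entropy from the file above:

    def information_gain(features, labels, idx):
        """g(D, A_idx) = H(labels) - Σ_v |D_v|/|D| * H(labels restricted to D_v)."""
        col = [sample[idx] for sample in features]
        conditional = 0.0
        for v in set(col):
            subset = [label for c, label in zip(col, labels) if c == v]
            conditional += calculate_entropy(subset) * len(subset) / len(labels)
        return calculate_entropy(labels) - conditional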
/Chapter05/tree.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn.metrics import mean_squared_error
6 |
7 | INF = np.inf
8 | EPSILON = 1e-2
9 |
10 |
11 | def _best_split(X, Y):
12 | """找到最佳的切分特征j和对应的切分点s"""
13 | rows, cols = X.shape
14 | if rows <= 1:
15 | return 0, X[0, 0], 0, 0, 0
16 | best_j = -1
17 | best_s = INF
18 | c1 = INF
19 | c2 = INF
20 | best_loss = INF
21 | for j in range(cols):
22 | for i in range(rows):
23 | s = X[i, j]
24 | R1 = Y[X[:, j] <= s]
25 | R2 = Y[X[:, j] > s]
26 | c1_hat = R1.mean()
27 | c2_hat = R2.mean()
28 | loss = sum((R1 - c1_hat) ** 2) + sum((R2 - c2_hat) ** 2)
29 | if loss < best_loss:
30 | best_j = j
31 | best_s = s
32 | c1 = c1_hat
33 | c2 = c2_hat
34 | best_loss = loss
35 |
36 | return best_j, best_s, c1, c2, best_loss
37 |
38 |
39 | class Node:
40 | def __repr__(self):
41 | return f"划分特征={self.j} 划分点={self.s} 左标签为{self.c1} 右标签为{self.c2} loss为{self.loss}"
42 |
43 | def __init__(self, j, s, c1, c2, loss, left=None, right=None):
44 | self.j = j
45 | self.s = s
46 | self.c1 = c1
47 | self.c2 = c2
48 | self.loss = loss
49 | self.left = left
50 | self.right = right
51 | # self.is_leaf = True
52 |
53 |
54 | class CartRegressor:
55 | def __init__(self, max_depth=3):
56 | self._tree = None
57 | self.max_depth = max_depth
58 |         self.n_nodes = 2 ** max_depth - 1  # CART是二叉树,深度为max_depth时节点数不超过2^max_depth - 1
59 |
60 | def fit(self, X, Y, max_depth):
61 |         """自顶向下地构建树,逐个节点fit"""
62 |         self.n_nodes = 2 ** max_depth - 1
63 | self._tree = Node(*_best_split(X, Y))
64 | # self._tree = Node(-1, INF, INF, INF)
65 | n_nodes = 1
66 | node_list = [(self._tree, X, Y)] # (节点,节点需要fit的X,Y)
67 | while node_list:
68 | node, x, y = node_list.pop(0)
69 | # print(node)
70 | # 如果这个节点的loss为0,就不用再细分了
71 | if node.loss <= EPSILON:
72 | # node.is_leaf = True
73 | continue
74 | part1_index = x[:, node.j] <= node.s
75 | part2_index = x[:, node.j] > node.s
76 | x1, y1 = x[part1_index], y[part1_index]
77 | x2, y2 = x[part2_index], y[part2_index]
78 | if n_nodes == self.n_nodes:
79 | continue
80 | left = Node(*_best_split(x1, y1))
81 | node_list.append((left, x1, y1))
82 | node.left = left
83 | n_nodes += 1
84 | right = Node(*_best_split(x2, y2))
85 | node_list.append((right, x2, y2))
86 | node.right = right
87 | n_nodes += 1
88 |
89 | def predict_single(self, x):
90 | node = self._tree
91 | while node.left or node.right:
92 | node = node.left if x[node.j] <= node.s else node.right
93 | return node.c1 if x[node.j] <= node.s else node.c2
94 |
95 | def predict(self, X):
96 | return np.asarray([self.predict_single(x) for x in X])
97 |
98 | def score(self, X, Y):
99 | return mean_squared_error(self.predict(X), Y)
100 |
101 |
102 | def main():
103 | np.random.seed(0)
104 | x = np.linspace(-10, 10, 100).reshape((-1, 1))
105 | y = np.linspace(-20, 20, 100) + np.random.normal(loc=0, scale=3.5, size=(100,))
106 | # x, y = make_regression(n_samples=500, n_features=2, n_informative=2)
107 | t = CartRegressor(4)
108 | df = pd.DataFrame()
109 | df['x'] = x.reshape((-1,))
110 | df = df.set_index('x')
111 |
112 | for max_depth in range(2, 8):
113 | t.fit(x, y, max_depth=max_depth)
114 | print(f"MAX_DEPTH_{max_depth}: {t.score(x, y)}")
115 | y_predict = t.predict(x)
116 |
117 | df['MAX_DEPTH_{}'.format(max_depth)] = y_predict
118 |
119 | plt.figure(figsize=(12, 7))
120 | plt.scatter(x, y, s=10, color='r')
121 |
122 | for max_depth in range(2, 8):
123 | col_name = 'MAX_DEPTH_{}'.format(max_depth)
124 | plt.plot(x, df[col_name], label=col_name)
125 | # plt.show()
126 | plt.title('Regression Tree')
127 | plt.legend(loc='best')
128 | plt.xlabel('x')
129 | plt.ylabel('y')
130 | plt.show()
131 |
132 |
133 | if __name__ == '__main__':
134 | main()
135 |
--------------------------------------------------------------------------------
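Note on Chapter05/tree.py: for reference, _best_split is a direct implementation of the least-squares splitting criterion of a CART regression tree, where c1 and c2 are the means of the two regions:

    \min_{j,s}\Big[\min_{c_1}\sum_{x_i\in R_1(j,s)}(y_i-c_1)^2+\min_{c_2}\sum_{x_i\in R_2(j,s)}(y_i-c_2)^2\Big],
    \qquad \hat c_m=\frac{1}{|R_m(j,s)|}\sum_{x_i\in R_m(j,s)}y_i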
/Chapter06/LR.py:
--------------------------------------------------------------------------------
1 | # !/Applications/anaconda/envs/4PyCharm/bin/python3.4
2 | # -*- coding: utf-8 -*-
3 | import numpy as np
4 | import torch
5 | from sklearn.datasets import load_iris
6 | from sklearn.preprocessing import LabelBinarizer
7 | from torch.nn import Parameter
8 | from torch.optim import SGD
9 |
10 |
11 | class LR:
12 | def __init__(self):
13 | self.w = torch.tensor(0.)
14 | self.b = torch.tensor(0.)
15 | self.step = 100
16 |
17 | def fit(self, X, Y):
18 |         # as_tensor可直接处理numpy数组并指定dtype,避免torch.tensor(tensor)的拷贝与警告
19 |         X, Y = torch.as_tensor(X, dtype=torch.float32), torch.as_tensor(Y, dtype=torch.float32)
20 | n_feature = len(X[0])
21 | n_class = len(Y[0])
22 | self.w = Parameter(torch.zeros((n_feature, n_class - 1)), requires_grad=True)
23 | self.b = Parameter(torch.zeros((n_class - 1,)), requires_grad=True)
24 | optimizer = SGD([self.w, self.b], lr=.1)
25 | Y = Y.argmax(dim=1)
26 |
27 | for _ in range(self.step):
28 | optimizer.zero_grad()
29 |
30 | Y_hat_along_label = torch.exp(torch.matmul(X, self.w) + self.b)
31 | Y_hat_along_label = torch.cat([Y_hat_along_label, torch.ones((len(Y), 1))], 1)
32 | denominator = Y_hat_along_label.sum(dim=1)
33 | distribution = Y_hat_along_label / denominator[:, None]
34 |             # NLLLoss的输入应为log概率,故先取对数(等价于对logits使用CrossEntropyLoss)
35 |             loss = torch.nn.NLLLoss()(torch.log(distribution), Y)
36 | loss.backward()
37 | optimizer.step()
38 |
39 | def predict_prob(self, X):
40 |         X = torch.as_tensor(X, dtype=torch.float32)
41 |         X = torch.atleast_2d(X)  # 兼容单个样本(1维)输入
42 | Y_hat_along_label = torch.exp(torch.matmul(X, self.w) + self.b)
43 | Y_hat_along_label = torch.cat([Y_hat_along_label, torch.ones((len(Y_hat_along_label), 1))], 1)
44 | denominator = Y_hat_along_label.sum(dim=1)
45 | distribution = Y_hat_along_label / denominator[:, None]
46 | return distribution
47 |
48 |     def predict_single(self, x):
49 |         prob = self.predict_prob(x)[0]
50 |         res = np.zeros(prob.shape[0])
51 |         res[int(prob.argmax())] = 1  # 取概率最大的类,输出one-hot
52 |         return res
53 | 
54 |     def predict(self, X):
55 |         X = np.asarray(X)
56 |         return np.asarray([self.predict_single(x) for x in X])
57 |
58 |
59 | def main():
60 | iris = load_iris()
61 | X, Y = iris.data, iris.target
62 | lb = LabelBinarizer()
63 | Y = lb.fit_transform(Y)
64 | lr = LR()
65 | lr.fit(X, Y)
66 | print(lr.predict_prob(X))
67 | return lr
68 |
69 |
70 | if __name__ == '__main__':
71 | my_lr = main()
72 |
--------------------------------------------------------------------------------
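Note on Chapter06/LR.py: the exp(Xw + b) padded with a column of ones and then normalised corresponds to the multi-class logistic model with the last class as the reference class (K classes, K−1 weight vectors):

    P(Y=k\mid x)=\frac{\exp(w_k\cdot x+b_k)}{1+\sum_{j=1}^{K-1}\exp(w_j\cdot x+b_j)}\quad(k=1,\dots,K-1),
    \qquad P(Y=K\mid x)=\frac{1}{1+\sum_{j=1}^{K-1}\exp(w_j\cdot x+b_j)}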
/Chapter06/MEM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """MEM = Maximum entropy model"""
3 | # 最大熵模型原理
4 | # todo: 拉格朗日对偶性,学完这个才知道为什么可以转化为求解max_min_问题
5 | # todo: 理解最大熵模型的最大似然估计等价于最大熵模型的对偶函数
6 | # todo: 牛顿法和拟牛顿法
7 | """
8 | 1. 原则上,MEM需要传入特征函数,如果未传入,则可以简单(朴素)地以每个特征与label的共现作为feature_function
9 | 2. 根据特征函数、训练集求得Pw(y|x),接下来的任务是求得最好的w,当得到了w后,模型就固定了
10 | 3. 求得w的方法:IIS
11 | # todo: 这个模型需要遍历、存储x、y的一些性质,这跟生成模型、判别模型有关系吗
12 |
13 | feature function定义了哪些数据作为模型的约束,以及数据如何转化为约束
14 | feature function形如
15 | y1 y2 ... yt
16 | f1 x1 0/1 0/1 0/1
17 | f2 x2 0/1 0/1 0/1
18 | ...
19 | fs xs 0/1 0/1 0/1
20 | 例如
21 | 假设在训练集中,x有
22 |
23 | """
24 | import numpy as np
25 | import pandas as pd
26 | from itertools import product
27 |
28 |
29 | def get_P_XY_and_P_X(X, Y):
30 | """
31 | 获取联合概率分布和X分布
32 | 联合概率形如
33 | feature1, feature2, ..., feature, prob_y1, prob_y2, ..., prob_ym
34 | 0 , 0 , ..., 0 , 0.1 , 0.1 , ..., 0
35 | 1 , 0 , ..., 0 , 0.2 , 0 , ..., 0
36 | ...
37 | 如果总共有10个样本,特征为(1, 0, 0)样本总共有2个,其中有一个y是1,一个y是2,总共可能的y是[1, 2, 3],那么对应的,它的联合概率如下
38 | feature1, feature2, feature3, prob_y=1, prob_y=2, prob_y=3
39 | 1 , 0 , 0 , 0.1 , 0.1 , 0
40 | """
41 | # 将Y转化成
42 |     XY = np.concatenate((X, Y.reshape(-1, 1)), axis=1)
43 |     XY_unique, counts = np.unique(XY, axis=0, return_counts=True)  # 以每行(一个样本)为单位计数
44 | freq = counts / XY.shape[0]
45 | df_XY = pd.DataFrame(XY_unique, columns=[f"feature_{i}" for i in range(len(X[0]))] + ['y'])
46 | df_XY = df_XY.set_index([f"feature_{i}" for i in range(len(X[0]))])['y']
47 | df_XY = df_XY.unstack().reset_index()
48 |
49 | df_XY.loc[:, 'freq'] = freq
50 | df_XY = df_XY.groupby([col for col in df_XY.columns if col != 'y']).apply(
51 | lambda _df: dict(zip(_df['y'], _df['freq']))
52 | ).reset_index().rename(columns={0: 'distribution'})
53 |
54 | unique_list = [np.unique(X[:, i]) for i in range(len(X[0]))]
55 |     array = np.array(list(product(*unique_list)))
56 | df = pd.DataFrame(data=array, columns=[f"feature_{i}" for i in range(len(X[0]))])
57 | zero_distribution = dict(zip(Y.unique(), np.zeros_like(Y.unique())))
58 |     df.loc[:, 'distribution_0'] = [zero_distribution for _ in range(len(df))]
59 |     df = pd.merge(df, df_XY, on=[f"feature_{i}" for i in range(len(X[0]))], how='left')
60 |     df['distribution'] = df['distribution'].where(df['distribution'].notna(), df['distribution_0'])  # todo: 该函数尚未完成
61 |
62 |
63 | def get_P_X(X):
64 |     pass  # todo: 获取X的经验边缘分布~P(X),尚未实现
65 |
66 | class MEM:
67 | def __init__(self, method='BFGS', epsilon=1e-3):
68 | """
69 | """
70 | self.method = method
71 | self.epsilon = epsilon
72 | self.X = np.array([])
73 | self.Y = np.array([])
74 | self.p_X = {}
75 | self.p_XY = {}
76 | self.n_feature = 1
77 | self.w = np.random.rand(self.n_feature)
78 | self.y_options = np.array([])
79 |
80 | def f(self, w):
81 | pass
82 |
83 | loss_function = f
84 |
85 | def _empirical_joint_distribution(self):
86 | n_samples = self.X.shape[0]
87 | X_Y = np.concatenate((self.X, self.Y), axis=1)
88 | # 以每行作为一个元素计数
89 | element, freq = np.unique(X_Y, axis=0, return_counts=True)
90 | element = [tuple(i) for i in element]
91 |         freq = freq / n_samples  # counts是整型数组,不能原地做真除法
92 | distribution = dict(zip(element, freq))
93 |
94 | def inner(x, y):
95 | return distribution[tuple(x) + (y,)]
96 |
97 | return inner
98 |
99 | def get_Pw_y_x(self, w):
100 | """
101 | 给定参数下的最大熵模型Pw(y|x)
102 | 所谓Pw(y|x)是个概率模型,它可以表示为一个接受x,输出概率分布{y1: p1, y2: p2, ...}的函数(当然也可以有其他表示方法)
103 | """
104 |
105 | def inner(x):
106 | numerator_array = np.array([])
107 | for y in self.y_options:
108 | numerator = np.exp(w * np.array([f(x, y) for f in self.ffs]))
109 | numerator_array = np.append(numerator_array, numerator)
110 | denominator = numerator_array.sum()
111 | distribution = numerator_array / denominator
112 | return dict(zip(self.y_options, distribution))
113 |
114 | return inner
115 |
116 |     def distribution_matrix(self, X, Y):
117 |         pass  # todo: 尚未实现
118 |
119 | def fit(self, X, Y):
120 | X, Y = np.asarray(X), np.asarray(Y)
121 | # 根据训练数据做一些必要的初始化
122 | # 1. 获取经验联合分布~P(X,Y)
123 | empirical_joint_distribution = self._empirical_joint_distribution()
124 | # 2. 获取给定参数w下的最大熵模型Pw(y|x)
125 |
126 | if self.method == 'IIS':
127 | pass
128 | else:
129 | # 输入特征函数、经验联合分布,目标函数f(w), 梯度函数g(w)
130 | # 1. 根据特征函数、给定的w,求得最大熵模型Pw_y_x
131 | # 2. 然后任务是求得最佳的w,将w代进Pw_y_x,就是最终的P_y_x
132 | # 3. 求解w的方法是
133 | # 3.1 初始化w、B(正定对称矩阵)
134 | # 3.2 求梯度g,如果梯度<=epsilon,则停止,否则进入3.3~3.7
135 | # 3.3 根据B·p = -g,求得p
136 | # 3.4 一维搜索λ, 使得f(w+pλ)最小
137 | # 3.5 更新w=w+λp
138 | # 3.6 更新g,如果g<=epsilon,w_best=w;否则计算新B
139 | # 3.7 转3.3
140 | # 备注:另外可以限定循环的次数不超过epochs次
141 |
142 | # 3.1 初始化w, B
143 | w = np.random.rand(len(self.ffs))
144 | B = np.eye(len(self.ffs))
145 | # 3.2 求梯度g
146 | Pw_y_x = self.get_Pw_y_x(w)
147 |
148 | g = g_w = 0
149 |             for epoch in range(100):  # todo: 迭代轮数应作为超参数传入
150 |                 if g <= self.epsilon:
151 | break
152 |
--------------------------------------------------------------------------------
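Note on Chapter06/MEM.py: point 1 of the docstring says that, when no feature functions are supplied, each (feature value, label) co-occurrence in the training set can serve as an indicator feature function. A minimal sketch of such a builder (hypothetical helper, e.g. to populate the ffs attribute that get_Pw_y_x expects):

    def build_feature_functions(X, Y):
        """One indicator f(x, y) = 1 per (feature index, value, label) triple seen in the training data."""
        seen, ffs = set(), []
        for x, y in zip(X, Y):
            for i, v in enumerate(x):
                if (i, v, y) not in seen:
                    seen.add((i, v, y))
                    ffs.append(lambda x_, y_, i=i, v=v, lab=y: 1 if (x_[i] == v and y_ == lab) else 0)
        return ffs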
/Chapter06/logistic_regression.py:
--------------------------------------------------------------------------------
1 | # !/Applications/anaconda/envs/4PyCharm/bin/python3.4
2 | # -*- coding: utf-8 -*-
3 | # author: frank
4 | # time : 2019-06-16 14:40
5 | # file : logistic_regression.py
6 | import numpy as np
7 | import logging
8 | from collections import Counter
9 | logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
10 | level=logging.DEBUG)
11 | """
12 | 逻辑斯蒂回归
13 | 1. 以二分类为例子
14 | 2. 给定了实例的Feature,判断实例的label是0还是1
15 | 3. 两个问题:
16 | 3.1 为什么二分类的模型可以是"逻辑斯蒂回归模型"?
17 | 我们预设二分类模型服从伯努利分布,伯努利分布是GLM之一,写出它的GLM形式。
18 | 根据最大熵原则,它是sigmoid形式,或者说是逻辑斯蒂回归模型
19 | 根据最大熵原则,它是sigmoid形式,或者说是逻辑斯蒂回归模型。
20 | 3.2 逻辑斯蒂回归的参数估计怎么做?
21 | 极大似然原则,其实跟最大熵原则殊途同归。
22 | 换句话说,最大熵原则既决定了模型的"公式"的样子,又决定了参数。
23 |
24 | """
25 |
26 |
27 | def sigmoid(x):
28 | activation = 1 / (1 + np.exp(-x))
29 | return activation
30 |
31 |
32 | def propagate(features, labels, w, b):
33 | """
34 | 反向传播梯度下降,此处为了简单起见只做全局梯度下降
35 | :param features: 特征
36 | :param labels: 标签
37 | :param w: 系数
38 | :param b: 截距
39 | :return:
40 | """
41 |
42 | n = features.shape[1]
43 |
44 | # 前向传播
45 | predictions = sigmoid(np.dot(w.T, features) + b)
46 | cost = -np.sum(labels * np.log(predictions) + (1 - labels) * np.log(1 - predictions)) / n
47 |
48 | # 反向传播
49 | d_Z = predictions - labels
50 | d_w = np.dot(features, d_Z.T) / n
51 | d_b = np.sum(d_Z) / n
52 |
53 | # w = w - lr * d_w
54 | # b = b - lr * d_b
55 | return d_w, d_b, cost
56 |
57 |
58 | class LogisticRegression:
59 | """
60 | 初始化
61 | """
62 | def __init__(self, lr=0.001, num_epochs=100):
63 | self.lr = lr
64 | self.num_epochs = num_epochs
65 |
66 | # 模型的参数
67 | self.dim = 0
68 | self.w = np.zeros((0, ))
69 | self.b = 0
70 |
71 | def fit(self, features, labels):
72 | """
73 | 拟合、改变参数
74 | """
75 | logging.info("开始训练")
76 | self.dim = features.shape[0]
77 | self.w = np.ones((self.dim, 1)) * .5
78 |
79 | # 对训练集反向传播
80 | for epoch in range(self.num_epochs):
81 | d_w, d_b, cost = propagate(features, labels, self.w, self.b)
82 | self.w -= d_w * self.lr
83 | self.b -= d_b * self.lr
84 |
85 | # ==========================================
86 |             # ================ 学习率衰减 ===============
87 | # ==========================================
88 | if epoch == self.num_epochs * .6:
89 | self.lr *= .5
90 | if epoch == self.num_epochs * .8:
91 | self.lr *= .2
92 | if epoch % 100 == 0:
93 | logging.info(f"cost = {cost}")
94 | logging.info(f"===============训练完毕===========")
95 |
96 | def predict(self, instance):
97 | # p_1 = instance的label是1的概率
98 | p_1 = sigmoid(np.dot(self.w.T, instance) + self.b)
99 | return np.where(p_1 > 0.5, 1, 0)
100 |
101 |
102 | if __name__ == '__main__':
103 |
104 | # 参数设置
105 | num_cases = 10000
106 | num_features = 6
107 | test_lr = 0.1
108 | test_num_epochs = 5000
109 |
110 | # ==========================================
111 | # ================ 生成数据 ===============
112 | # ==========================================
113 | test_features = np.random.rand(num_features, num_cases)
114 | true_w = (np.arange(1, 7) * np.array([1, -1, 1, -1, 1, -1])).reshape(6, 1)
115 | true_b = .2
116 | logging.info(f"true_w=\n{true_w}")
117 | logging.info(f"true_b={true_b}")
118 |
119 | # w * x + b
120 | linear_result = np.dot(true_w.T, test_features) + true_b
121 | # sigmoid(w * x + b)
122 | test_labels = np.where(sigmoid(linear_result) > 0.5, 1, 0)
123 | logging.info(f"labels counts are {Counter(test_labels[0])}")
124 |
125 | # 实例化并训练
126 | LR = LogisticRegression(lr=test_lr, num_epochs=test_num_epochs)
127 | LR.fit(test_features, test_labels)
128 | logging.info(f"w=\n{LR.w}")
129 | logging.info(f"b={LR.b}")
130 |
131 | # accuracy on train data
132 | train_predictions = LR.predict(test_features)
133 | result = (train_predictions == test_labels)[0]
134 | accuracy = Counter(result)[True] / num_cases
135 | logging.info(f"正确率为{accuracy}")
136 |
137 | # 开始预测
138 | sample = np.random.rand(num_features, 5)
139 | true_label = np.where(sigmoid(np.dot(true_w.T, sample) + true_b) > .5, 1, 0)
140 | logging.info(f"true_label = {true_label}")
141 | prediction = LR.predict(sample)
142 | logging.info(f"\nsample=\n{sample}\nprediction={prediction}")
143 |
--------------------------------------------------------------------------------
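Note on Chapter06/logistic_regression.py: the gradients hard-coded in propagate come from the binary cross-entropy loss. With p = σ(wᵀX + b), n samples and features stored column-wise as in this file:

    J(w,b)=-\frac{1}{n}\sum_{i=1}^{n}\big[y_i\log p_i+(1-y_i)\log(1-p_i)\big],\qquad
    \frac{\partial J}{\partial w}=\frac{1}{n}X(p-y)^{\mathsf T},\qquad
    \frac{\partial J}{\partial b}=\frac{1}{n}\sum_{i=1}^{n}(p_i-y_i)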
/Chapter07/SVM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | SVM
4 | 按照解决问题的难度从易到难排序,SVM相关的算法有
5 | 线性可分SVM
6 | 线性SVM
7 | 非线性SVM
8 | 由上到下,前序(简单)模型都是后序(复杂)模型的基础、特殊情况,所以只实现非线性,使它兼容前序的模型。
9 |
10 | 非线性SVM的理解流程:
11 | 1. 线性可分SVM的初衷是什么+线性可分SVM参数的计算方法
12 | 2. 从硬间隔过渡到软间隔,创造出线性支持SVM,来解决 [近似可分训练集]的分类问题
13 | 3. 对于非近似线性可分的训练集,我们的目标是通过映射函数将输入空间映射映射为特征空间,使得训练集在特征空间中近似线性可分,然后应用线性SVM
14 |
15 | 接下来再针对每个大点做具体理解
16 | 1. 线性可分
17 | 1.1 SVM的初衷是对空间中正负例画出一个超平面,使得正负例可以被完美隔开,并且不同于感知机,我们还希望无论是每个点都能离这个超平面足够远,
18 | 越远我们才会觉得越靠谱。这就有点像及格线60分,成绩越超过60,我们越相信这是个学霸,成绩越低于60,我们越相信这是学渣。
19 | 1.2 根据1.1引出函数间隔,进而引出几何间隔
20 | 1.3 根据初始最优化的目标的形式,确定最终优化目标是min 1/2 * ||w|| s.t Σ yi(w·xi + b) - 1 >= 0 ,变量是w, b
21 | 1.4【重要,不懂拉格朗日就没有必要再看下去了】
22 | 1.4.1 构建拉格朗日函数 L(w,b,α)
23 | 1.4.2 拉格朗日对偶性,确定对偶优化目标是max min L。
24 | 1.4.2.1 min L,变量是w,b。求偏导,得到w关于α的公式,b关于α的公式
25 | 1.4.2.2 代入到L中,得到min L = -1/2 ∑i∑j αi*αj*yi*yj*(xi·xj) + ∑i αi
26 | 1.4.2.3 max [min L],求得α=(α1, α2, ..., αn) 【这里有个伏笔,当数据量很大的时候,求α其实是非常耗时】
27 | 1.4.3 根据附录定理C.3,我们可以根据对偶问题的最优解α,反过来求得原始问题的最优价w,b
28 |
29 | 2. 线性不可分,但近似线性可分的情况
30 | 2.1 我们对yi(w·xi+b) >= 1的要求放宽一点,允许每个点都能不同程度地达不到这个目标,设置松弛变量ξi,使得 yi(w·xi+b) >= 1 - ξi
31 | 2.2 对应优化目标也要对松弛变量加惩罚C,目标变为min 1/2 * ||w|| + Cξ,ξi不为0时,意味着(xi,yi)没有被平面正确分类,否则没有必要松弛。
32 | 所以min 1/2 * ||w|| + Cξ,ξi的后半部分蕴含着少分错点的目标
33 | 2.3 同样经过拉格朗日那一套(不过比线性可分的推导过程要复杂),min L = -1/2 ∑i∑j αi*αj*yi*yj*(xi·xj) + ∑i αi s.t. ∑αi yi=0 , 0<=α<=C
34 | 2.4 用合页损失来理解线性SVM,这样更容易理解它和感知机的区别。
35 |
36 | 3. 非线性分类问题(非近似线性可分的训练集)。既然在当前输入空间上,训练集看起来不可分,那能不能通过对空间的映射,使得训练集在映射后
37 | 的空间是线性可分的,或者近似线性可分。
38 | 3.1 一个直接的想法,是找到这么一个映射函数φ,但是这个可不好找,怎么就知道φ后的训练集就可分呢?让我们倒过来想,假如我们映射后的训练集可分,那么
39 | 它应该可以用线性SVM搞,那么届时它的目标就是min L = -1/2 ∑i∑j αi*αj*yi*yj*(xi'·xj') +∑i αi,里面的xi',xj'是映射后的,也就是说,
40 | 这里的xi'=φ(xi),xj'=φ(xj),我们观察到运算单元其实是φ(xi)·φ(xj),也就是说,我们要是能直接定义出K(xi,xj)=φ(xi)·φ(xj),也是够用的,
41 | 这个K就是核函数,这种不直接找φ而是找K的方法就是核技巧。
42 | 3.2 但是,不能说以xi,xj为变量的二元函数K就是核函数,核函数本意上=两个经过映射后的向量的内积。所以我们需要知道一个K是不是核函数。
43 | 这里有一堆数学知识,按住不表了。
44 | 3.3 但即使有3.2,要证明K是核函数还是挺麻烦的,所以一般都是直接应用一些常见的核函数:多项式核函数、高斯核函数核字符串核函数。
45 | 3.4 这里我有个问题,好像没有直接证明核函数后的训练集就(近似)线性可分了,大概是拿常用的核函数尝试后,准确率达到一定程度就认为有效吧
46 | 3.5 最后我们回到1.4.2.3的伏笔,求α是很麻烦的。好在Platt在1998年提出了SMO(sequential minimal optimization)。实际上,我们手动实现
47 | SVM,大多数篇幅就是在实现SMO而已。但是不懂前序这些知识,就算是照猫画虎把SMO实现了,笔者认为还不足够
48 | """
49 |
50 | import numpy as np
51 | from functools import partial
52 |
53 |
54 | def W(K, Y, alpha, i, j):
55 | """
56 | i, j分别是第一、第二变量的下标
57 | """
58 |     _W = .5 * K[i, i] * alpha[i] ** 2 + .5 * K[j, j] * alpha[j] ** 2 + Y[i] * Y[j] * K[i, j] * alpha[i] * alpha[j] - \
59 |          (alpha[i] + alpha[j]) + Y[i] * alpha[i] * ((Y * alpha * K[i])[np.r_[:i, i + 1:j, j + 1:]]).sum() + \
60 |          Y[j] * alpha[j] * ((Y * alpha * K[j])[np.r_[:i, i + 1:j, j + 1:]]).sum()  # 书中式(7.101),求和项为Σ_{k≠i,j} y_k·α_k·K_{k,·}
61 | return _W
62 |
63 |
64 | def SMO(K, Y, alpha, b, epsilon, C):
65 | """
66 | SMO要解决如下问题
67 | min 1/2 ∑i∑j αi*αj*yi*yj*K(xi,xj)-∑i αi
68 | α
69 | """
70 | # 选择变量
71 | # 先选择第一个变量,选择违反KKT条件最严重的变量作为第一个变量
72 | pred = np.dot(K, (alpha * Y)) + b # 书上的g_xi其实就是预测pred
73 | interval = Y * pred
74 | error = Y - pred
75 |
76 | # 注意到P129页在“第2个变量的选择”这一节中,最后说明了可能会找不到合适的α2使得目标函数有足够的下降,所以需要遍历寻找直到满足就退出
77 | # 先在间隔边界上的支持向量点,检验他们是否满足KKT条件(为什么书上说要优先从这里找呢)
78 | # 记选择的第一个变量是αi,第二个变量αj,即他们的下标分别为i,j
79 |     i_candidate = np.where(  # numpy数组不能用and/or与链式比较,需用&、|并加括号
80 |         ((0 < alpha) & (alpha < C) & (np.abs(interval - 1) > epsilon)) |  # 间隔边界上的支持向量应满足yi*g(xi)=1
81 |         ((alpha == 0) & (interval < 1)) |
82 |         ((alpha == C) & (interval > 1))
83 |     )[0]
84 | for i in i_candidate:
85 | # 找到第二个变量
86 | Ei = error[i]
87 | Ei_minus_Ej = np.abs(error - Ei)
88 | j_candidate = np.argsort(-Ei_minus_Ej) # 要对Ei_minus_Ej降序获得下标,np.argsort只支持升序,故排序的时候用相反数
89 | for j in j_candidate:
90 | # 更新选定的αi,αj,并计算更新后的αi,αj是否使得子问题W有足够的下降
91 | # 所以在更新前还得先计算、保存W(αi,αj)
92 | W_prev = W(K, Y, alpha, i, j)
93 |
94 | # 更新αi,αj
95 | if Y[i] != Y[j]:
96 | L = max(0, alpha[j] - alpha[i])
97 | H = min(C, C + alpha[j] - alpha[i])
98 | else:
99 | L = max(0, alpha[j] + alpha[i] - C)
100 | H = min(C, alpha[j] + alpha[i])
101 |
102 | # 求解未经剪辑的αj_new
103 | eta = K[i, i] + K[j, j] - 2 * K[i, j]
104 | Ej = error[j]
105 | alpha_j_new_unc = alpha[j] + Y[j] * (Ei - Ej) / eta
106 | # 经剪辑后的αj_new
107 | if alpha_j_new_unc > H:
108 | alpha_j_new = H
109 | elif alpha_j_new_unc >= L:
110 | alpha_j_new = alpha_j_new_unc
111 | else:
112 | alpha_j_new = L
113 | # 求解αi_new
114 | alpha_i_new = alpha[i] + Y[i] * Y[j] * (alpha_j_new - alpha[j])
115 | # 计算是否满足要求
116 | alpha_new = alpha.copy()
117 | alpha_new[i] = alpha_i_new
118 | alpha_new[j] = alpha_j_new
119 | W_next = W(K, Y, alpha_new, i, j)
120 | if W_prev - W_next > epsilon:
121 | b1_new = -Ei - Y[i] *K[i, i] * (alpha_i_new - alpha[i]) - \
122 | Y[j] * K[j, i] * (alpha_j_new - alpha[j]) + b
123 | b2_new = -Ei - Y[i] * K[i, j] * (alpha_i_new - alpha[i]) - \
124 | Y[j] * K[j, j] * (alpha_j_new - alpha[j]) + b
125 | b_new = (b1_new + b2_new) / 2
126 | return alpha_new, b_new
127 |
128 | return alpha, b
129 |
130 |
131 | class SVM:
132 | def __init__(self, C, epsilon, kernel=np.dot):
133 | self.C = C
134 | self.epsilon = epsilon
135 | self.kernel = kernel # 默认不经过映射函数,此时核函数就是向量点积而已
136 | self.w = np.empty((1,))
137 | self.b = np.random.rand()
138 | self.alpha = np.empty((1,))
139 |
140 | @staticmethod
141 | def _data_check(X, Y):
142 | assert set(Y) == {-1, 1}, "要求训练集中只能用+1,-1的标签"
143 | assert X.shape[0] == Y.shape[0]
144 |
145 | def fit(self, X, Y):
146 | self._data_check(X, Y)
147 |         # 根据X初始化w(维度=特征数)和α(每个样本一个,从0开始)
148 |         self.w = np.zeros(X.shape[1])
149 |         self.alpha = np.zeros(X.shape[0])
150 |
151 | # 先用SMO算法求解α
152 | K = self.kernel(X, X.T)
153 | self.alpha, self.b = SMO(K, Y, self.alpha, self.b, self.epsilon, self.C)
154 |
155 |         self.w = ((self.alpha * Y)[:, None] * X).sum(axis=0)  # w = Σ_i α_i·y_i·x_i
156 |
157 | def predict(self, X):
158 | return np.sign(np.dot(X, self.w.T) + self.b)
--------------------------------------------------------------------------------
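Note on Chapter07/SVM.py: section 3.3 of the docstring mentions the Gaussian (RBF) kernel, while the class defaults to a plain dot product. A sketch of an RBF kernel that could be plugged in (hypothetical helper; the gamma value is arbitrary). Since fit calls self.kernel(X, X.T), it would be passed as kernel=lambda a, b: rbf_kernel(a, b.T):

    import numpy as np

    def rbf_kernel(X1, X2, gamma=0.5):
        """K(x, z) = exp(-gamma * ||x - z||^2) for every pair of rows of X1 (n, d) and X2 (m, d)."""
        sq_dist = np.sum(X1 ** 2, axis=1)[:, None] + np.sum(X2 ** 2, axis=1)[None, :] - 2 * X1 @ X2.T
        return np.exp(-gamma * sq_dist)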
/Chapter08/Adaboost.py:
--------------------------------------------------------------------------------
1 | # !/Applications/anaconda/envs/4PyCharm/bin/python3.4
2 | # -*- coding: utf-8 -*-
3 |
4 | import numpy as np
5 | 
6 |
7 | INF = float('inf')
8 |
9 |
10 | # @lru_cache()
11 | def compute_error(pred, Y, weight):
12 | return sum(weight * (pred != Y))
13 |
14 |
15 | class SignClassifier:
16 | def __repr__(self):
17 | return ("< " if self.sign == 1 else "> ") + str(self.threshold)
18 |
19 | def __init__(self):
20 | self.sign = 1
21 | self.threshold = INF
22 |
23 | def fit(self, X, Y, weight):
24 | assert len(X) == len(Y) == len(weight)
25 | X, Y, weight = zip(*sorted(zip(X, Y, weight), key=lambda t: t[0]))
26 | X, Y, weight = np.array(X), np.array(Y), np.array(weight)
27 | cost = INF
28 | for x in np.arange(min(X), max(X), 0.5):
29 | for sign in [-1, 1]:
30 | cur_pred = np.array(list(map(lambda t: 1 if t < 0 else -1, X - x))) * sign
31 | cur_cost = compute_error(cur_pred, Y, weight)
32 | if cur_cost < cost:
33 | cost = cur_cost
34 | self.threshold = x
35 | self.sign = sign
36 | if cur_cost == 0:
37 | break
38 |
39 | def predict(self, X):
40 | X = np.array(X)
41 | return np.array(list(map(lambda t: 1 if t < 0 else -1, X - self.threshold))) * self.sign
42 |
43 |
44 | class AdaClassifier:
45 | __slots__ = ['weight', 'n_estimate', 'base_estimate', 'estimate_list', 'am_list']
46 |
47 | def __init__(self, base_estimate, n_estimate):
48 | self.base_estimate = base_estimate
49 | self.n_estimate = n_estimate
50 | # self.weight = 0
51 | self.estimate_list = []
52 | self.am_list = []
53 |
54 | def fit(self, X, Y):
55 | X, Y = np.array(X), np.array(Y)
56 |         weight = np.ones(X.shape[0]) / X.shape[0]  # 初始化权重,每个样本一个
57 | for i in range(self.n_estimate):
58 | clf = self.base_estimate()
59 | clf.fit(X, Y, weight)
60 | self.estimate_list.append(clf)
61 | # 计算错误率
62 | em = compute_error(clf.predict(X), Y, weight)
63 | # 计算指数
64 | am = .5 * np.log((1 - em) / em)
65 | self.am_list.append(am)
66 | # 更新权重
67 | pred = clf.predict(X)
68 | exp_list = weight * np.exp(-am * Y * pred)
69 | Z = sum(exp_list)
70 | weight = exp_list / Z
71 |
72 | def predict(self, X):
73 | return np.sign(self.decision_function(X).sum(axis=0))
74 |
75 | def decision_function(self, X):
76 | return np.array([am * clf.predict(X) for am, clf in zip(self.am_list, self.estimate_list)])
77 |
78 | def score(self, X, Y):
79 | X, Y = np.array(X), np.array(Y)
80 | return sum(self.predict(X) == Y) / X.shape[0]
81 |
82 |
83 | class AdaRegression:
84 | pass
85 |
86 |
87 | def main():
88 | X = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
89 | Y = [1, 1, 1, -1, -1, -1, 1, 1, 1, -1]
90 | ada = AdaClassifier(base_estimate=SignClassifier, n_estimate=3)
91 | ada.fit(X, Y)
92 | print(ada.decision_function(X))
93 | print(ada.predict(X))
94 | print(ada.score(X, Y))
95 |
96 |
97 | if __name__ == '__main__':
98 | main()
99 |
--------------------------------------------------------------------------------
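Note on Chapter08/Adaboost.py: the two quantities updated in AdaClassifier.fit are the classifier weight α_m and the sample weights. A quick arithmetic check of the first (the error value 0.3 is hypothetical, roughly what the first weak classifier produces on the demo data):

    import numpy as np
    em = 0.3                             # weighted error e_m of one weak classifier
    am = 0.5 * np.log((1 - em) / em)     # classifier weight alpha_m ≈ 0.4236
    # the sample weights are then rescaled by exp(-am * y_i * G_m(x_i)) and renormalised by Z_m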
/Chapter09/GMM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 | from scipy.stats import norm
4 |
5 | PENDING = -1
6 |
7 |
8 | class GMM:
9 | def __init__(self, k, step=100, epsilon=1e-3):
10 | self.k = k # k个高斯分布
11 | self.alpha = np.ones(k) / k
12 | # mu形如[mu_1, mu_2, ..., mu_k]
13 | self.mu = PENDING
14 |
15 | # sigma形如[sigma_1, sigma_2, ..., sigma_k]
16 | self.sigma = PENDING
17 |
18 | # lambda_matrix形如
19 | # [
20 | # [λ_11, λ_12, ..., λ_1k],
21 | # [λ_21, λ_22, ..., λ_2k],
22 | # ...,
23 | # [λ_n1, λ_n2, ..., λ_nk]
24 | # ], n是样本的数量,lambda_matrix[j,k]记录的是第k个模型对第j个数据的响应度
25 | self.lambda_matrix = PENDING
26 |
27 | self.step = step
28 | self.epsilon = epsilon
29 |
30 | @property
31 | def k_model(self):
32 |         # P(y|θ) = Σ_{k=1}^{K} α_k · φ(y | μ_k, σ_k),其中φ是高斯密度
33 | # 因为norm(loc=self.mu, scale=self.sigma)的shape是(k,)
34 | # X的shape是(n,),形如[x1, x2, ..., xn]
35 | # 而我们希望每个模型都分别n个样本计算概率分布pdf
36 | # 故需要将X包装成[[x1], [x2], ..., [xn]], 所以用X[:, None]
37 | return lambda X: self.alpha * norm(loc=self.mu, scale=self.sigma).pdf(X[:, None])
38 |
39 | def fit(self, X):
40 | """
41 | GMM学习的是X的分布,是一个无监督学习
42 | """
43 |         # 根据训练集初始化每个高斯分布的参数μ和σ(μ需加少量扰动,否则各分模型完全对称,EM无法把它们区分开)
44 |         self.mu = np.mean(X) + np.random.randn(self.k) * np.std(X) * 0.1
45 |         self.sigma = np.ones(self.k) * np.std(X)
46 |
47 | # 开始迭代
48 | for step in range(self.step):
49 | # E步:依据当前模型参数,计算分模型k对观测数据y_j的响应度
50 | self.lambda_matrix = self.k_model(X)
51 | self.lambda_matrix /= self.lambda_matrix.sum(axis=1)[:, None]
52 |
53 | # M步:计算新一轮的模型参数μ_k, σ_k, α_k
54 | self.mu = (self.lambda_matrix * X[:, None]).sum(axis=0) / self.lambda_matrix.sum(axis=0)
55 |             self.sigma = np.sqrt((self.lambda_matrix * (X[:, None] - self.mu) ** 2).sum(axis=0) / self.lambda_matrix.sum(axis=0))  # norm的scale是标准差,需开方
56 | self.alpha = self.lambda_matrix.sum(axis=0) / X.shape[0]
57 |
58 | def predict(self, X):
59 |         return self.k_model(X).sum(axis=1)  # 每个样本在混合模型下的概率密度
--------------------------------------------------------------------------------
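Note on Chapter09/GMM.py: the file ships without a demo; a minimal usage sketch is below (hypothetical data: a 30/70 mixture of two 1-D Gaussians). EM is sensitive to initialisation, so the order of the recovered components and the exact values will vary:

    import numpy as np

    def demo():
        np.random.seed(0)
        X = np.concatenate([np.random.normal(-2.0, 0.5, 300),
                            np.random.normal(3.0, 1.0, 700)])
        gmm = GMM(k=2, step=200)
        gmm.fit(X)
        print("alpha:", gmm.alpha)   # mixture weights, roughly 0.3 / 0.7
        print("mu:   ", gmm.mu)      # component means, roughly -2 and 3
        print("sigma:", gmm.sigma)   # component standard deviations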
/Chapter10/HMM.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 重点: 学会手写Baum-Welch的手推
4 | """
5 | from itertools import product
6 |
7 | import numpy as np
8 |
9 | from Chapter10.backward import backward
10 | from Chapter10.forward import forward
11 |
12 | PENDING = np.array([0])
13 |
14 |
15 | class HMM:
16 | def __init__(self, n_state=1, n_output=1, epsilon=1e-3, max_epoch=1000):
17 | # 状态转移概率矩阵,,shape = j * j, 形如
18 | # [
19 | # [a11, a12, ..., a1j],
20 | # [a21, a22, ..., a2j],
21 | # ...,
22 |         #     [aj1, aj2, ..., ajj]
23 | # ]
24 | self.A = np.random.random(size=(n_state, n_state))
25 | self.A /= self.A.sum(axis=1)[:, None] # 按行求和,然后按行除以和,确保最终的A每一行的和为1,这样才符合概率和为1
26 |
27 | # 状态->观察的概率矩阵,shape = j * m, 形如
28 | # [
29 | # [b11, b12, ..., b1m],
30 | # [b21, b22, ..., b2m],
31 | # ...,
32 | # [bj1, bj2, ..., bjm],
33 | # ]
34 | self.B = np.random.random(size=(n_state, n_output))
35 | self.B /= self.B.sum(axis=1)[:, None]
36 |
37 | # 初始隐变量的概率分布,shape = (j, ), 形如
38 | # [p0 p1 p2 ..., pj]
39 |         self.pi = np.ones(n_state) / n_state
40 |
41 | self.epsilon = epsilon
42 | self.max_epoch = max_epoch
43 |
44 | def probability(self, O, method='forward'):
45 | """
46 | 已知λ=(A, B, π)和观测序列O,计算O出现的概率P(O|λ)
47 | """
48 | if method == 'forward':
49 | return forward(self.pi, self.A, self.B, O)
50 | else:
51 | return backward(self.pi, self.A, self.B, O)
52 |
53 | def fit(self, O, I):
54 | """
55 | 正常来说,观测数据是多条O1=(o11, o12, ..., o1s), ..., 按照书上的提示,将这多条拼接成一条大的
56 | O=(o1, o2, oT)
57 | """
58 | O = O.reshape(1, -1)
59 |         I = I.reshape(1, -1)
60 | if I.size != 0: # 即有状态序列,使用监督的学习方法
61 | assert O.shape == I.shape
62 | # todo: 这里O的shape改了
63 |             self.A = np.zeros_like(self.A)  # 1. 状态转移概率A的估计:从零开始计数,最后按行归一化
64 |             for i in I:
65 |                 for i_prev, i_next in zip(i[:-1], i[1:]):
66 |                     self.A[i_prev, i_next] += 1
67 |             self.A /= self.A.sum(axis=1)[:, None]
68 |             self.B = np.zeros_like(self.B)  # 2. 观测概率B的估计:同样从零计数、按行归一化
69 |             rows, columns = I.shape
70 |             for row, column in product(range(rows), range(columns)):
71 |                 self.B[I[row, column], O[row, column]] += 1
72 |             self.B /= self.B.sum(axis=1)[:, None]
73 |             # 3. 估计π
74 |             self.pi = np.unique(I[:, 0], return_counts=True)[1] / I.shape[0]
75 |
76 | else: # 没有状态序列,则需要用非监督的学习方法——Baum-Welch,背后是EM算法
77 | for _ in range(self.max_epoch):
78 | # new_A
79 | # 1. ξ = (ξ1, ξ2, ..., ξt-1)
80 | # ξ1形如
81 | # 下一时刻状态为1 下一时刻状态为2 ... 下一时刻状态为n
82 | # 此时刻状态为1 p11 p12 p1n
83 | # 此时刻状态为2 p21 p22 p2n
84 | # ...
85 | # 此时刻状态为n pn1 pn2 pnn
86 | ksi = []
87 | gamma = []
88 | for t in range(len(O[0]) - 1):
89 | alpha = forward(self.pi, self.A, self.B, O[0:, t])[0]
90 | beta = backward(self.pi, self.A, self.B, O[0:, t])[0]
91 | ksi_t = alpha[:, None] * self.A * self.B[:, O[0][t]][None] * beta
92 | ksi_t = ksi_t / ksi_t.sum()
93 | ksi.append(ksi_t)
94 |
95 | gamma_t = alpha * beta
96 | gamma.append(gamma_t)
97 |
98 | alpha_last = forward(self.pi, self.A, self.B, O[0:, -1])
99 | beta_last = backward(self.pi, self.A, self.B, O[:, -1])
100 | gamma_last = alpha_last * beta_last
101 | gamma.append(gamma_last)
102 |
103 | ksi = np.array(ksi)
104 | gamma = np.array(gamma)
105 | new_A = ksi.sum(axis=-1) / gamma.sum(axis=-1)[:, None]
106 |
107 | new_B = 0
108 | new_pi = 0
109 | self.A, prev_A = new_A, self.A
110 | self.B, prev_B = new_B, self.B
111 | self.pi, prev_pi = new_pi, self.pi
112 | if np.max(np.abs(self.A - prev_A)) < self.epsilon and np.max(np.abs(self.B - prev_B)) < self.epsilon \
113 | and np.max(np.abs(self.pi - prev_pi)) < self.epsilon:
114 | break
115 |
116 | def predict(self):
117 | pass
118 |
--------------------------------------------------------------------------------
/Chapter10/backward.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def backward(pi, A, B, O):
5 | n_state, _ = A.shape
6 | assert pi.shape[0] == n_state
7 | assert B.shape[0] == n_state
8 | # 初始化β,形如
9 | # 状态1 状态2 ... 状态n
10 | # 观测序列1 1 1 1
11 | # 观测序列2 1 1 1
12 | # ...
13 | # 观测序列S 1 1 1
14 | beta_prev = np.ones((O.shape[0], pi.shape[0]))
15 | # block: 迭代
16 | for i in range(O.shape[1] - 1, 0, -1):
17 | beta_next = beta_prev
18 | # o形如
19 | # 观测
20 | # 观测序列1 o1
21 | # 观测序列2 o2
22 | # ...
23 | # 观测序列S oS
24 | o = O[:, i]
25 | b = B[:, o]
26 | beta_prev = np.dot(A, (b * beta_next.T)).T
27 |
28 |     # 此时得到的beta_prev即β_1;P(O|λ) = Σ_i π_i · b_i(o_1) · β_1(i)
29 | o = O[:, 0]
30 | beta_prev = (pi * B[:, o].T) * beta_prev
31 | return beta_prev.sum(axis=1)
32 |
33 |
34 | def demo():
35 | pi = np.array([.2, .4, .4])
36 | A = np.array([
37 | [.5, .2, .3],
38 | [.3, .5, .2],
39 | [.2, .3, .5]
40 | ])
41 | B = np.array([
42 | [.5, .5],
43 | [.4, .6],
44 | [.7, .3]
45 | ])
46 | O = np.array([
47 | [0, 1, 0],
48 | [0, 1, 0],
49 | ])
50 | print(f"P(O|λ) = {backward(pi, A, B, O)}")
51 |
52 |
53 | if __name__ == '__main__':
54 | demo()
55 |
--------------------------------------------------------------------------------
/Chapter10/baum_welch.py:
--------------------------------------------------------------------------------
1 | """
2 | 输入数据O=(o1, o2, ..., oT)
3 | 输出隐变量pi, A, B
4 | todo: new_B公式的含义其实如下
5 | 遍历所有时刻t,
6 | 1. t时状态为j且t时刻观察为k的概率
7 | 2. t是状态为j的概率
8 |
9 | """
10 | import numpy as np
11 |
12 | from Chapter10.backward import backward
13 | from Chapter10.forward import forward
14 |
15 |
16 | def baum_welch(pi, A, B, O, epsilon, max_epoch):
17 | """
18 | 根据观测数据O来学习、输出隐马尔科夫模型λ=(A, B, π)
19 | """
20 | epoch = 0
21 | T = len(O[0]) - 1
22 | while epoch < max_epoch:
23 | print(f"A = \n{A}, \nB = \n{B}, \nπ = \n{pi}")
24 | epoch += 1
25 | # 先求ξ_t和γ_t
26 | # ξ_t形如
27 | # 下时刻状态为1 下时刻状态为2 ... 下时刻状态为n
28 | # 此时刻状态为1 p11 p12 p1n
29 | # 此时刻状态为2 p21 p22 p2n
30 | # ...
31 | # 此时刻状态为n pn1 pn2 pnn
32 |
33 | # γ_t形如
34 | # 处于状态1 处于状态2 ... 处于状态n
35 | # p1 p2 pn
36 |
37 | # 求ξ_t和γ_t需要借助α_t、β_t、β_t+1
38 | ksi = []
39 | gamma = []
40 | # new_B需要知道t时刻状态为j且观察为k的概率,这个量如下计算
41 | gamma_with_o = []
42 | for t in range(T):
43 | alpha_t = forward(pi, A, B, O[:, t])[0]
44 | beta_t = backward(pi, A, B, O[:, t])[0]
45 | beta_t_add_1 = backward(pi, A, B, O[:, t + 1])[0]
46 |
47 |             ksi_t = alpha_t[:, None] * A * B[:, O[0, t + 1]][None, :] * beta_t_add_1[None, :]  # b_j(o_{t+1})与β_{t+1}(j)都沿j(列)变化
48 | ksi_t = ksi_t / ksi_t.sum()
49 | ksi.append(ksi_t)
50 |
51 | gamma_t = alpha_t * beta_t
52 | gamma_t = gamma_t / gamma_t.sum()
53 | gamma.append(gamma_t)
54 |
55 | # 接下来计算t时刻的gamma_with_o,代表t时刻状态为j且观察为o的概率
56 | # 形如
57 | # 观察1 观察2 ... 观察S
58 | # 状态1
59 | # 状态2
60 | # ...
61 | # 状态n
62 | output_is_o = np.zeros((B.shape[0],))
63 | output_is_o[O[:, t][0]] = 1
64 | gamma_with_o_t = np.dot(gamma_t[:, None], output_is_o[None])
65 | gamma_with_o.append(gamma_with_o_t)
66 | ksi = np.array(ksi)
67 | gamma = np.array(gamma)
68 | gamma_with_o = np.array(gamma_with_o)
69 |
70 |         new_A = ksi.sum(axis=0) / gamma.sum(axis=0)[:, None]           # Σ_t ξ_t(i,j) / Σ_t γ_t(i)
71 |         new_B = gamma_with_o.sum(axis=0) / gamma.sum(axis=0)[:, None]  # Σ_t γ_t(j)·1(o_t=k) / Σ_t γ_t(j)
72 | new_pi = gamma[0]
73 | if stop(new_A - A, new_B - B, new_pi - pi, epsilon=epsilon):
74 | return new_pi, new_A, new_B
75 | else:
76 | pi = new_pi
77 | A = new_A
78 | B = new_B
79 |
80 |
81 | def stop(*diffs, epsilon):
82 |     for diff in diffs:  # 只有当所有参数的变化量都小于epsilon时才停止迭代
83 |         if np.abs(diff).max() >= epsilon:
84 |             return False
85 |     return True
86 |
87 |
88 | def demo():
89 | pass
90 |
91 |
92 | if __name__ == '__main__':
93 | demo()
94 |
95 |
--------------------------------------------------------------------------------
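Note on Chapter10/baum_welch.py: for reference, the per-step quantities built inside the loop are

    \xi_t(i,j)=\frac{\alpha_t(i)\,a_{ij}\,b_j(o_{t+1})\,\beta_{t+1}(j)}{\sum_{i'}\sum_{j'}\alpha_t(i')\,a_{i'j'}\,b_{j'}(o_{t+1})\,\beta_{t+1}(j')},\qquad
    \gamma_t(i)=\frac{\alpha_t(i)\,\beta_t(i)}{\sum_{i'}\alpha_t(i')\,\beta_t(i')}

and the re-estimation formulas they feed are a_ij = Σ_{t=1}^{T-1} ξ_t(i,j) / Σ_{t=1}^{T-1} γ_t(i), b_j(k) = Σ_{t: o_t=k} γ_t(j) / Σ_t γ_t(j), π_i = γ_1(i).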
/Chapter10/forward.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def forward(pi, A, B, O):
5 | """
6 |     pi: 初始状态概率分布(initial state distribution), shape = (n_state,)
7 |     A: 状态转移概率矩阵(state transition matrix)
8 |     B: 状态-->观察概率矩阵(state-->output matrix)
9 |     O: array([观察序列1, 观察序列2, ..., 观察序列S]), 每行是一条观察序列
10 | 任务是计算每条观察序列的概率,形如[p1, p2, ..., pn]
11 | 需要借助α来计算, α形如[α1, α2, ..., αn]
12 | """
13 | n_state, _ = A.shape
14 | assert pi.shape[0] == n_state
15 | assert B.shape[0] == n_state
16 | # block: 初始化alpha
17 | # 1. 每条观察序列的第一个观察值
18 | o = O[:, 0]
19 | # 2. 每个初始状态转移到每条观察序列第一个观察值的概率矩阵,形如
20 | # 第一条序列第一个观测值 第二条序列第一个观测值 ... 第S条序列第一个观测值
21 | # 状态1 p11 p12 p1s
22 | # 状态2 p21 p22 p2s
23 | # ...
24 | # 状态n pn1 pn2 pns
25 | b = B[:, o]
26 | # 3. 每条观测序列的初始alpha,形如
27 | # 状态1 状态2 ... 状态n
28 | # 观测序列1
29 | # 观测序列2
30 | # ...
31 | # 观测序列S
32 | alpha_next = pi * b.T
33 |
34 | # block: 迭代
35 | for i in range(1, O.shape[1]):
36 | alpha_prev = alpha_next
37 | o = O[:, i]
38 | b = B[:, o]
39 | alpha_next = (np.dot(alpha_prev, A)) * b.T
40 |
41 | return alpha_next.sum(axis=1)
42 |
43 |
44 | def demo():
45 | pi = np.array([.2, .4, .4])
46 | A = np.array([
47 | [.5, .2, .3],
48 | [.3, .5, .2],
49 | [.2, .3, .5]
50 | ])
51 | B = np.array([
52 | [.5, .5],
53 | [.4, .6],
54 | [.7, .3]
55 | ])
56 | O = np.array([
57 | [0, 1, 0],
58 | [0, 1, 0],
59 | ])
60 | print(f"P(O|λ) = {forward(pi, A, B, O)}")
61 |
62 |
63 | if __name__ == '__main__':
64 | demo()
--------------------------------------------------------------------------------
/Chapter10/viterbi.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 维特比算法
4 | 任务:
5 | 当模型的参数λ=(π,A,B)已知,想要根据观察序列O=(o1, o2, ..., oT),来预测状态序列(i1_, i2_, ..., iT_)【下划线代表它是未知的变量】
6 | 每个时刻t都有n种状态的可能,那么长度为T的状态序列就有n^T种可能,这个计算量是很大的。
7 | 我们需要一种更快捷的办法
8 |
9 | 假设最优状态序列为I=(i1, i2, ..., iT),那么它应该具备以下特性:
10 | 已知它截止T-1时刻的状态序列为(i1, i2, ..., i_{T-1})。从T-1时刻到T时刻,有状态转移概率iT-1-->iT_的概率,
11 | 且iT_还有发射出观察为oT的概率iT_-->oT。iT_有n种可能,其中使得(iT-1-->iT_的概率)*(iT_-->oT的概率)最大的,一定就是iT。
12 | 否则,就存在另外一个iT',使得整条序列的概率更大,有矛盾。
13 | 这就意味着,算法在求解最后一个时刻T的状态时,答案必须要使得(iT-1-->iT_的概率)*(iT_-->oT的概率)最大。
14 | 【关键步骤】现在,让我们把目光往前推一步到T-1,T-1也需要满足这样的条件,T-2也需要,直到t=2时刻(t=1时刻是初始状态)。
15 | 因此,我们只需要从t=2开始,每次都求解基于i_{t-1},分别计算it=1, it=2, ..., it=n的概率最大化的情况
16 | 这里举个例子辅助理解:t-1时刻i_{t-1}=1, 2, ..., n, 对应的t时刻it=1概率分别是P11, P12, ..., P1n,如果P1j最大,那么此时应该选择
17 | it=1搭配的i_{t-1}=j,对应最大概率为P1j;同理,计算
18 | it=2搭配的i_{t-1}=k, 对应最大概率为P2k;
19 | ...;
20 | it=n搭配的i_{t-1}=m, 对应最大概率为Pnm;
21 |
22 | 然后递归到下一步,我们可以提炼出一个公式
23 | P_max[t][i] = max(P_max[t-1][1] * a1i * bi(o), P_max[t-1][2] * a2i * bi(o), ..., P_max[t-1][n] * ani * bi(o))
24 | 这就是动态规划的公式了。
25 | 当然,这个动态规划的任务比一般的动态规划要多一个步骤,因为我们要输出序列,而不是最终最大概率是多少,所以我们还需要记录
26 | 第t步it=i时,搭配的i_{t-1}是什么才行。
27 |
28 | """
29 | import numpy as np
30 |
31 |
32 | def viterbi(pi, A, B, O):
33 | """
34 | 注意,这里的O是多条观察序列,即O=(O1, O2, ..., Os),假设每条Oi=(oi1,oi2, ..., oiT),即每条oi有n_step个时刻
35 | 需要对每条观察序列预测、输出最大概率的状态序列
36 | """
37 | A = np.array(A)
38 | B = np.array(B)
39 | pi = np.array(pi)
40 | O = np.array(O)
41 |
42 |     # 时刻数(步数)
43 | _, n_step = O.shape
44 |
45 | # 多条状态序列的shape应该跟O一致
46 | I = np.empty_like(O)
47 |
48 | # δ代表第t步时,状态分别为1, 2, ..., n的最大概率,形如
49 | # 第一条观测 状态为1的最大概率 状态为2的最大概率 ... 状态为n的最大概率
50 | # 第二条观测 状态为1的最大概率 状态为2的最大概率 ... 状态为n的最大概率
51 | # ...
52 | # 最后条观测 状态为1的最大概率 状态为2的最大概率 ... 状态为n的最大概率
53 |
54 | # 第0步的delta是根据π和B来初始化的
55 | delta = pi[None] * B[:, O[:, 0]].T
56 | psi = np.zeros(shape=(*O.shape, pi.shape[0])) # psi[k][t][i]代表,第k条观察序列对应的第t步选择状态为i时,搭配t-1的状态
57 |
58 | for t in range(1, n_step):
59 | psi_t = np.argmax(delta[..., None] * A, axis=1)
60 | delta = np.max((delta[:, None] * A.T) * B[:, O[:, t]].T[..., None], axis=2)
61 | psi[:, t] = psi_t
62 |
63 | best_T = np.argmax(delta, axis=1)
64 | I[:, -1] = best_T
65 | for t in range(n_step - 2, -1, -1):
 66 |         best_t = psi[np.arange(psi.shape[0]), t + 1, I[:, t + 1]]  # 每条观察序列分别回溯:ψ[k][t+1][最优的i_{t+1}]
 67 |         I[:, t] = best_t
68 | return I
69 |
70 |
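# 补充示例(仅作示意,非原实现的一部分):针对单条观察序列,用朴素循环实现同一条递推公式
# P_max[t][i] = max_j(P_max[t-1][j] * a_ji) * b_i(o_t),便于与上面的向量化实现互相对照。
# 函数名viterbi_single为编者虚构,参数含义与viterbi一致,o为一维观察序列。
def viterbi_single(pi, A, B, o):
    pi, A, B = np.array(pi), np.array(A), np.array(B)
    n = len(pi)
    T = len(o)
    delta = [pi[i] * B[i, o[0]] for i in range(n)]  # δ_1(i)
    psi = [[0] * n]                                 # ψ_1(i)占位
    for t in range(1, T):
        delta_t, psi_t = [], []
        for i in range(n):
            cand = [delta[j] * A[j, i] for j in range(n)]
            psi_t.append(int(np.argmax(cand)))          # 记录搭配的上一步状态
            delta_t.append(max(cand) * B[i, o[t]])      # 当前步状态为i的最大概率
        delta, psi = delta_t, psi + [psi_t]
    path = [int(np.argmax(delta))]
    for t in range(T - 1, 0, -1):                       # 自后向前回溯
        path.append(psi[t][path[-1]])
    return path[::-1]
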
71 | def demo():
72 | A = [
73 | [.5, .2, .3],
74 | [.3, .5, .2],
75 | [.2, .3, .5]
76 | ]
77 |
78 | B = [
79 | [.5, .5],
80 | [.4, .6],
81 | [.7, .3]
82 | ]
83 |
84 | pi = [.2, .4, .4]
85 |
86 | O = [
87 | [0, 1, 0],
88 | [0, 1, 0]
89 | ]
90 | print(viterbi(pi, A, B, O))
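    # 编者补充:按书上例10.3,这组参数下每条观察序列的最优状态序列应为(3, 3, 3),对应这里0-based的输出[2, 2, 2]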
91 |
92 |
93 | if __name__ == '__main__':
94 | demo()
95 |
--------------------------------------------------------------------------------
/Chapter11/BFGS.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 实现BFGS
4 | """
5 | import numpy as np
6 |
7 |
8 | # 用一维搜索求λk
9 | def line_search_wolfe(fun, grad, x, p, max_epoch=100, c1=10 ** (-3), c2=0.9, alpha_1=1.0, alpha_max=10 ** 6):
10 | if alpha_1 >= alpha_max:
11 | raise ValueError('Argument alpha_1 should be less than alpha_max')
12 |
13 | def phi(alpha):
14 | return fun(x + alpha * p)
15 |
16 | def phi_grad(alpha):
17 | return np.dot(grad(x + alpha * p).T, p)
18 |
19 | alpha_old = 0
20 | alpha_new = alpha_1
21 |
22 | final_alpha = None
23 |
24 | for i in np.arange(1, max_epoch + 1):
25 | phi_alpha = phi(alpha_new)
26 |
27 | if (i == 1 and phi_alpha > phi(0) + c1 * alpha_new * phi_grad(0)) or (i > 1 and phi_alpha >= phi(alpha_old)):
28 | final_alpha = zoom(x, p, phi, phi_grad, alpha_old, alpha_new, c1, c2)
29 | break
30 |
31 | phi_grad_alpha = phi_grad(alpha_new)
32 |
33 | if np.abs(phi_grad_alpha) <= -c2 * phi_grad(0):
34 | final_alpha = alpha_new
35 | break
36 |
37 | if phi_grad_alpha >= 0:
38 | final_alpha = zoom(x, p, phi, phi_grad, alpha_new, alpha_old, c1, c2)
39 | break
40 |
41 | alpha_old = alpha_new
42 | alpha_new = alpha_new + (alpha_max - alpha_new) * np.random.rand(1)
43 |
44 | if i == max_epoch and final_alpha is None:
45 | return None
46 |
47 | return final_alpha
48 |
49 |
50 | # 一维搜索中的辅助函数
51 | def zoom(x, p, phi, phi_grad, alpha_lo, alpha_hi, c1, c2):
52 | while True:
53 | alpha_j = (alpha_hi + alpha_lo) / 2
54 |
55 | phi_alpha_j = phi(alpha_j)
56 |
57 | if (phi_alpha_j > phi(0) + c1 * alpha_j * phi_grad(0)) or (phi_alpha_j >= phi(alpha_lo)):
58 | alpha_hi = alpha_j
59 | else:
60 | phi_grad_alpha_j = phi_grad(alpha_j)
61 |
62 | if np.abs(phi_grad_alpha_j) <= -c2 * phi_grad(0):
63 | return alpha_j
64 |
65 | if phi_grad_alpha_j * (alpha_hi - alpha_lo) >= 0:
66 | alpha_hi = alpha_lo
67 |
68 | alpha_lo = alpha_j
69 |
70 |
71 | def BFGS(func, grad, w_start, eps, max_iterations=100, verbose=False):
72 | n = len(w_start)
73 |
74 | # We are starting with identity matrix
75 | # as approximation of the inverse of the Hessian.
76 | # It will be updated on every iteration.
77 | # We are using the notation H_k = (B_k)^{-1},
78 | # where B_k is the approximation of the Hessian.
79 |     # B矩阵(及其逆H)需要是对称正定矩阵,这里用单位矩阵初始化H矩阵
80 | H_old = np.diag(np.ones(n))
81 | w_old = w_start
82 |
83 | for i in np.arange(1, max_iterations + 1):
84 | # 搜索方向p=-H * gk = -Bk^{-1} * gk
85 | p = -1 * np.dot(H_old, grad(w_old))
86 |
87 | # Calculating the step into the direction p
88 |         # using the Wolfe conditions as constraints on the step.
89 | lambda_ = line_search_wolfe(func, grad, w_old, p, max_epoch=max_iterations)
90 |
91 | if lambda_ is None:
92 | print('Wolfe line search did not converge')
93 | return w_old, i
94 |
95 | w_new = w_old + lambda_ * p
96 |
97 | s = (w_new - w_old).reshape((n, 1))
98 | y = (grad(w_new) - grad(w_old)).reshape((n, 1))
99 | sT = s.T.reshape((1, n))
100 | yT = y.T.reshape((1, n))
101 |
102 | yT_s = np.dot(yT, s).reshape(())
103 |
104 | I = np.diag(np.ones(n))
105 | rho = 1 / yT_s
106 | rho2 = rho ** 2
107 |
108 | # The next products are being used
109 | # in the calculation of the H_{k+1} from H_k.
110 | # Only the matrices of dimension (n x n) will be used in the final formula.
111 | H_y = np.dot(H_old, y).reshape((n, 1)) # H_k * y_k
112 | Hy_sT = np.dot(H_y, sT).reshape((n, n)) # (H_k*y_k) * s^T
113 | yT_H = np.dot(yT, H_old).reshape((1, n)) # y_k^T * H_k
114 | s_yTH = np.dot(s, yT_H).reshape((n, n)) # s_k * (y_k^T*H_k)
115 | syTH_y = np.dot(s_yTH, y).reshape((n, 1)) # (s_k*(y_k^T*H_k)) * y_k
116 | syTHy_sT = np.dot(syTH_y, sT).reshape((n, n)) # ((s_k*(y_k^T*H_k))*y_k) * s_k^T
117 | s_sT = np.dot(s, sT).reshape((n, n)) # s_k * s_k^T
118 |
119 | # The initial formula
120 |         # H_{k+1} = (I - rho_k*s_k*y_k^T)H_k(I - rho_k*y_k*s_k^T) + rho_k*s_k*s_k^T
121 | # can be rewritten as
122 | # H_{k+1} = H_k - rho_k*(H_k*y_k)*s_k^T - rho_k*s_k*(y_k^T*H_k) + rho_k^2*((s_k*(y_k^T*H_k))*y_k)*s_k^T + rho_k*s_k*s_k^T
123 |         # to avoid calculations of asymptotic complexity O(n^3).
124 | H_new = H_old - rho * Hy_sT - rho * s_yTH + rho2 * syTHy_sT + rho * s_sT
125 |
126 | if verbose:
127 | print('x_k = {0} converges to x_(k+1) = {1}'.format(w_old, w_new))
128 |
129 | # We are using the 2-norm value
130 | # between the previous and the next gradient
131 | # of the approximation of the function minima
132 | # as the stopping condition for the BFGS algorithm.
133 | grad_dist = np.linalg.norm(grad(w_old) - grad(w_new))
134 | if grad_dist < eps:
135 | break
136 | elif verbose:
137 | print('There is still {0} left for approximations to converge'.format(np.abs(grad_dist - eps)), '\n')
138 |
139 | w_old = w_new
140 | H_old = H_new
141 |
142 | if verbose:
143 | print('\nFinal approximation of the minima is {0}.'.format(w_new))
144 | if i != max_iterations:
145 | print('Optimization process converged in {0} steps'.format(i))
146 | else:
147 | print('Optimization process did not converge')
148 |
149 | return w_new, i
150 |
151 |
152 | def demo():
153 | pass
154 |
155 |
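# Supplementary sketch (not part of the original file): a minimal usage example of the BFGS
# function above on a strictly convex quadratic with a hand-written gradient. The objective,
# its gradient and the helper name demo_quadratic are made up purely for illustration.
def demo_quadratic():
    def f(w):
        return (w[0] - 3) ** 2 + (w[1] + 1) ** 2 + w[0] * w[1]

    def grad_f(w):
        return np.array([2 * (w[0] - 3) + w[1], 2 * (w[1] + 1) + w[0]])

    w_opt, n_iter = BFGS(f, grad_f, np.array([0.0, 0.0]), eps=1e-5)
    # The analytic minimizer of this quadratic is w = (14/3, -10/3).
    print(f'argmin ≈ {w_opt}, found in {n_iter} iterations')
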
156 | if __name__ == '__main__':
157 | demo()
158 |
--------------------------------------------------------------------------------
/Chapter11/CRF.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 条件随机场(CRF, conditional random field)
4 |
5 | 1. 初衷:和HMM一样,想要解决有隐变量的序列概率问题,即求解argmax P(I|O,λ=(A,B,π))
6 | I
7 | 2. 区别:
8 | 2.1 HMM
 9 |         HMM最重要的预设:每个it只跟i_{t-1}有关(齐次马尔可夫假设),每个ot只跟当前时刻的状态it有关(观测独立性假设)。
10 |         换句话说i_{t-1}决定it,it决定ot,前者相当于预设了状态转移矩阵A,后者预设了发射矩阵B。所以HMM需要学习出这两个
11 | 矩阵,再加上初始状态概率分布矩阵π。这三个矩阵学习出来后,意味着HMM已经完全掌握了无论是状态还是观察的生成规律了。所以它就是生成模型。
12 | 这里细细品味一下,正是HMM的这两个假设决定了模型的学习目标,进而决定了模型是生成模型。
13 |
14 | 这就是HMM的基本方法论,剩下的难点无非是如何学习出这几个矩阵。
15 | 2.2 CRF(只说最简单的线性CRF)
16 |         CRF则不仅仅是假设每个it只跟i_{t-1}有关,而是假设it跟i_{t-1}和i_{t+1}都有关。也就是说,
17 |         P(it|O,i1,...,i_{t-1},i_{t+1},...,iT)=P(it|O,i_{t-1},i_{t+1})
18 |         所以P(I|O)=(公式11.10)
19 | # todo: 位置这个概念,我有时候用t,有时候跟书上一致用的是i,得统一一波
20 | # todo: 为什么LinearCRF和HMM一样,在预测时都用维特比,在计算概率时都用前向后向
21 |     根据例11.1,可以发现,特征函数的定义非常宽泛灵活,笔者一开始以为特征函数限定在相对位置,即(前、后位置的标记转移关系)
22 | 但后面才发现特征函数可以限定第几个位置,比如t4就限定序列的第二个状态为2和第三个状态为2时才算满足条件
23 | 另外,书上的s1,s2,s3,s4都不依赖于具体的
24 | 这里用闭包来定义转移特征和状态特征,当然也可以用类定义
25 | 统一标识:
26 | X = (X1, X2, ..., Xn),即n条观察序列。其中Xi = (xi1, xi2, ..., xiT),即每条观察序列有T个位置。同理:
27 | Y = (Y1, Y2, ..., Yn),即n条标识序列。其中Yi = (yi1, yi2, ..., yiT)。
28 | 每个yit可能的取值有N个
29 | # todo: 一个大问题:看起来,CRF的概率计算、学习、预测都跟x没有任何关系,尤其是根据11.4.1节的对数似然函数,可以发现训练过程中根本用不到x
30 | # todo: 因为fk(yj,xj)的计算过程中,完全用不到xj。(待求证李航老师)但假设现在是给语句分词作标注,我们定义一个状态特征:"的"字的标注为O(非实体词),这说明
31 | # todo: 这种依赖于观察的状态特征是完全合理的,笔者擅自按照这种思路来拓宽、细化状态特征的定义。
32 | """
33 | from functools import lru_cache
34 | from functools import reduce
35 | from itertools import product
36 |
37 | import numpy as np
38 |
39 | from Chapter11.BFGS import BFGS
40 | from Chapter11.backward import backward
41 | from Chapter11.forward import forward
42 |
43 | TRANSITION = 'transition'
44 | STATE = 'state'
45 |
46 |
47 | class FeatureFunc:
48 | def __init__(self, category, required_y_prev, required_y_next, required_x=None, required_i=None):
49 | self.category = category
50 | self.required_y_prev = required_y_prev
51 | self.required_y_next = required_y_next
52 | self.required_x = required_x
53 | self.required_i = required_i
54 |
55 | @lru_cache()
56 | def cal_single(self, test_y_prev, test_y_next, test_x, test_i):
57 | """计算给定位置的特征得分"""
58 | if self.category == TRANSITION:
59 | if test_y_prev != self.required_y_prev or test_y_next != self.required_y_next:
60 | return 0
61 | if self.required_x is not None and test_x != self.required_x:
62 | return 0
63 | if self.required_i is not None and test_i != self.required_i:
64 | return 0
65 | return 1
66 | elif self.category == STATE: # 状态特征只看y_next和位置(如果有要求)
67 |             if test_y_next != self.required_y_next:
68 | return 0
69 | if self.required_i is not None and test_i != self.required_i:
70 | return 0
71 | return 1
72 |
73 | @lru_cache()
74 | def cal_sequence(self, x, y):
75 | """计算一整个序列的特征得分"""
76 | score = 0
77 | start_index = 0 if self.category == STATE else 1
78 | for test_i in range(start_index, len(x)):
79 | test_y_prev = y[test_i - 1]
80 | test_y_next = y[test_i]
81 | test_x = x[test_i]
82 | score += self.cal_single(test_y_prev, test_y_next, test_x, test_i)
83 | return score
84 |
85 |
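# 补充示例(仅作示意,非原实现的一部分):演示FeatureFunc的用法。
# 这里的两个特征、权重1.0和0.5以及函数名_feature_func_example均为编者虚构,仅用于说明接口。
def _feature_func_example():
    # 转移特征:当y_{i-1}=0且y_i=1时取1
    t1 = FeatureFunc(TRANSITION, required_y_prev=0, required_y_next=1)
    # 状态特征:当y_i=1时取1(为稳妥起见,这里把要求的状态同时写进两个参数)
    s1 = FeatureFunc(STATE, required_y_prev=1, required_y_next=1)
    x = (0, 1, 0)  # 一条观察序列
    y = (0, 1, 1)  # 一条标记序列
    # 未归一化的打分 Σ_k w_k * f_k(y, x)
    return 1.0 * t1.cal_sequence(x, y) + 0.5 * s1.cal_sequence(x, y)
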
86 | class LinearCRF:
87 | def __init__(self, X, Y, y_option, ff, epsilon):
88 | """
 89 |         :param y_option: 状态的可能值
 90 |         :param X: 观察序列,shape=(样本数, 序列长度)
91 | """
92 | # 直接根据这个X,Y来初始化M,α,β,甚至那两个期望值 todo
93 | assert len(X) == len(Y)
94 | self.X = X
95 | self.Y = Y
96 | # 计算联合(x,y)的经验概率分布和x的概率分布
97 | self.x_prob, self.x_y_prob = self._cal_empirical_distribution()
98 | self.n_sample, self.T = X.shape
99 | self.y_option = y_option
100 | self.n = len(self.y_option) # 状态可能值的数目
101 | self.ff = ff
102 |         self.w = np.random.dirichlet(np.ones(len(ff)))  # dirichlet需要传入alpha参数,这里随机初始化一组和为1的权重
103 | self.epsilon = epsilon
104 |
105 | def _cal_empirical_distribution(self):
106 | n_sample = len(self.X)
107 | x_prob = dict()
108 | x_y_prob = dict()
109 | for idx in range(n_sample):
110 | x, y = tuple(self.X[idx]), tuple(self.Y[idx])
111 |             assert len(x) == len(y), f"第{idx}条样本的观察长度{len(x)}和标记长度{len(y)}不相等"
112 | x_y = (x, y)
113 | x_prob[x] = x_prob.get(x, 0) + 1 / n_sample
114 |             x_y_prob[x_y] = x_y_prob.get(x_y, 0) + 1 / n_sample
115 | return x_prob, x_y_prob
116 |
117 | def cal_F(self, x, y):
118 | """
119 | 给定x,y来生成特征矩阵F(y,x)=(f1(y,x),f2(y,x),...,fK(y,x))T
120 | """
121 | pass
122 |
123 | def cal_M(self, x):
124 | """计算给定观察x的前提下的M矩阵"""
125 | # M是各个时间步上的状态转移矩阵,即M=(M1,M2,...,MT)
126 | # 形如
127 | # [
128 | # 第一个时间步 [ 第一个时间步处于状态1 第一个时间步处于状态2 ... 第一个时间步处于状态n
129 | # 第零个时间步处于状态1 M11 M12 M1n
130 | # 第零个时间步处于状态2 M21 M22 M2n
131 | # ...
132 | # 第零个时间步处于状态n Mn1 Mn2 Mnn
133 | # ]
134 | # 第二个时间步 [ 第二个时间步处于状态1 第二个时间步处于状态2 ... 第二个时间步处于状态n
135 | # 第一个时间步处于状态1 M11 M12 M1n
136 | # 第一个时间步处于状态2 M21 M22 M2n
137 | # ...
138 | # 第一个时间步处于状态n Mn1 Mn2 Mnn
139 | # ]
140 | # ...
141 | # 第T+1个时间步 [ 第T+1个时间步处于状态1 第T+1个时间步处于状态2 ... 第T+1个时间步处于状态n
142 | # 第T个时间步处于状态1 M11 M12 M1n
143 | # 第T个时间步处于状态2 M21 M22 M2n
144 | # ...
145 | # 第T个时间步处于状态n Mn1 Mn2 Mnn
146 | # ]
147 | # ]
148 | # 而Mij=f1(yi,yj,x,1) + f2(yi,yj,x,1) + ...
149 | # feature_matrix = np.zeros(shape=(self.n, self.n))
150 | T = len(x)
151 |
152 | M = []
153 | for test_i in range(T + 1):
154 | M_t = []
155 |             test_x = x[test_i] if test_i < T else None  # 第T+1个位置是虚拟的终点,没有对应的观察
156 | for test_y_prev, test_y_next in product(range(self.n), range(self.n)):
157 | score = 0 # 在x下,y_prev, y_next, i在特征函数下的得分
158 | for w, f in zip(self.w, self.ff):
159 |                     score += w * f.cal_single(test_y_prev, test_y_next, test_x, test_i)  # FeatureFunc本身不可调用,用cal_single计算
160 | M_t.append(score)
161 | M_t = np.array(M_t).reshape((self.n, self.n)) # 其实到这里,仅仅是书上的W矩阵
162 | M_t = np.exp(M_t)
163 | M.append(M_t)
164 | M = np.array(M)
165 | return M
166 |
167 | def inference(self, x, y):
168 | """给定x,y,求Pw(y|x),利用M矩阵"""
169 |         T = len(x)
170 |         M = self.cal_M(x)
171 |         Zw = reduce(np.dot, M)[0, 0]  # 书上取连乘结果的(start, stop)元素;这里把虚拟的start、stop都约定为状态0
172 |         numerator = 1
173 |         y_prev = 0  # 虚拟起点start(约定为状态0)
174 |         for i, y_next in enumerate(y):
175 |             numerator *= M[i, y_prev, y_next]
176 |             y_prev = y_next
177 |         return numerator * M[T, y_prev, 0] / Zw  # 最后再乘上转移到虚拟终点stop的一项
178 |
179 | def fit(self):
180 | """
181 | 这里用拟牛顿法
182 | 输入:
183 | 1. 原始func: Pw(y|x),注意这里的func的参数是w,而训练集(X,Y)其实是常数了
184 | 2. func的梯度grad:同样的,grad也是w的梯度
185 | 将1、2传入给BFGS函数,求得最后的w
186 | :return:
187 | """
188 |
189 | def loss(w):
190 | # 先算f(w)的第一项
191 | term1 = 0
192 | for x, x_prob in self.x_prob.items():
193 |                 exp = 0
194 |                 for (x_, y) in self.x_y_prob:  # 在训练集中出现的(x,y)(近似:只对训练集中出现过的y求和)
195 |                     if x_ == x:
196 |                         score = sum(w_k * ff_k.cal_sequence(x, y) for w_k, ff_k in zip(w, self.ff))
197 |                         exp += np.exp(score)
198 |                 term1 += x_prob * np.log(exp)
199 |
200 | term2 = 0
201 | for (x, y), x_y_prob in self.x_y_prob.items():
202 | # 计算
203 | score = 0
204 | for w_k, ff_k in zip(w, self.ff):
205 |                     score += w_k * ff_k.cal_sequence(x, y)
206 | term2 += x_y_prob * score
207 | cost = term1 - term2
208 | return cost
209 |
210 | def grad_loss(w):
211 | self.w = w # todo
212 | grad = []
213 | for w_k, ff_k in zip(self.w, self.ff):
214 | score = 0
215 | for (x, y), x_y_prob in self.x_y_prob.items():
216 | score += ff_k.cal_sequence(x, y)
217 | M = self.cal_M(x)
218 | Zm = reduce(np.dot, M)
219 | alpha = forward(M, len(x))
220 | beta = backward(M, len(x))
221 | # todo: 还没弄完
222 | return 0
223 |
224 | self.w, _ = BFGS(loss, grad_loss, self.w, self.epsilon)
225 |
226 | def probability_single(self, x, i):
227 | """
228 | :param x: 已知的观察
229 | :param i: 位置
230 | :return:
231 | """
232 | M = self.cal_M(x)
233 | # 在当前x下扫描记录α和β
234 | alpha = []
235 | alpha_0 = 1 # 书上这里设置的y_0=start时才为1,否则为0,但我没有想出有不为start的必要
236 | alpha.append(alpha_0)
237 |
238 |
239 | if __name__ == '__main__':
240 | pass
241 |
--------------------------------------------------------------------------------
/Chapter11/backward.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 |
4 |
5 | def backward(M_x, t):
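    """
    与Chapter11/forward.py中的前向向量α相对称:从β_{T+1}=全1开始,按 β_i = M_{i+1} · β_{i+1} 逐步向前递推。
    注意:本函数沿途打印每个β,但最终只返回最后一次递推得到的β向量。
    """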
6 | beta = []
7 | beta_prev = np.ones(M_x.shape[1])
8 | beta.append(beta_prev)
9 | print(f"初始β={beta_prev}")
10 | for i in range(t - 1, -1, -1):
11 | beta_next = beta_prev
12 | M_t = M_x[i + 1]
13 | beta_prev = np.dot(M_t, beta_next)
14 | beta.append(beta_prev)
15 | print(f"β{i}={beta_prev}")
16 | return beta_prev
17 |
18 |
19 | def demo():
20 | M = [
21 | [
22 | [.5, .5],
23 | [.0, .0]
24 | ],
25 | [
26 | [.7, .3],
27 | [.4, .6]
28 |
29 | ],
30 | [
31 | [.2, .8],
32 | [.5, .5]
33 | ],
34 | [
35 | [.9, .1],
36 | [.8, .2]
37 | ]
38 | ]
39 | M = np.array(M)
40 | beta = backward(M, 3)
41 | print(beta)
42 |
43 |
44 | if __name__ == '__main__':
45 | demo()
46 |
--------------------------------------------------------------------------------
/Chapter11/forward.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | 为什么CRF算概率时,不像HMM那样只计算P(I|O,λ),而是计算
4 | P(yi|x)和P(y_{i-1},yi|x)?
5 | """
6 | import numpy as np
7 |
8 |
9 | def forward(M_x, t):
10 | """
11 |     根据M矩阵,来计算前向向量αi(yi|x):即在第i个位置(第i个时间步)状态为yi,且
12 | 截止到第i个位置,观察为(x0,x1, ... xi)的概率。
13 | yi的取值有n个,所以α.shape = (n, )
14 |
15 | 注意,书上的M矩阵,指的是从位置i=1,2,..., T+1,各有一个Mi矩阵。
16 | 从而M矩阵由T+1个Mi矩阵组成
17 | """
18 | # 书上写的是当y0=start时,才为1,但笔者想不出有什么必要,因为这个start事实上也是虚构头,
19 | # 这个虚构头的状态为任意一个,我们都无所谓才对,所以这里概率都取为1
20 | alpha = []
21 | alpha_next = np.ones(M_x.shape[1])
22 | alpha.append(alpha_next)
23 | print(f"初始α={alpha_next}")
24 | for i in range(t):
25 | alpha_prev = alpha_next
26 | M = M_x[i]
27 | alpha_next = np.dot(alpha_prev, M)
28 | alpha.append(alpha_next)
29 | print(f"α{i}={alpha_next}")
30 | return alpha
31 |
32 |
33 | def demo():
34 | M = [
35 | [
36 | [.5, .5],
37 | [.0, .0]
38 | ],
39 | [
40 | [.7, .3],
41 | [.4, .6]
42 |
43 | ],
44 | [
45 | [.2, .8],
46 | [.5, .5]
47 | ],
48 | [
49 | [.9, .1],
50 | [.8, .2]
51 | ]
52 | ]
53 | M = np.array(M)
54 | alpha = forward(M, 3)
55 | print(alpha)
56 |
57 |
58 | if __name__ == '__main__':
59 | demo()
60 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | 本项目复现李航《统计学习方法》每一章节的算法
2 |
3 | # 特点:
4 | - 笔记摘要:在每个文件开头都会有一些核心的摘要
5 | - pythonic:这里会用尽可能规范的方式来实现,包括编程风格几乎严格按照PEP8
6 | - 循序渐进:前期的算法会用list的方式来做计算,可读性比较强;后期几乎完全用numpy.array来计算,并辅以详细的注释。
7 |
8 | # 完成情况:
9 | - ✅ perceptron
10 | - ✅ KNN
11 | - ✅ naive baysian
12 | - ✅ 决策树
13 | - ✅ 逻辑斯蒂回归
14 | - [ ] SVM
15 | - ✅ Adaboost
16 | - ✅ GMM
17 | - ✅ HMM
18 | - [ ] CRF
19 |
20 | # requirements
21 | python 3.7
22 | sklearn 0.21.3
23 | numpy 1.17.2
24 | matplotlib 3.1.1
25 |
--------------------------------------------------------------------------------