├── .gitignore
├── README.md
├── crf.py
├── example.py
└── sample.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | __pycache__
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Simple CRF
 2 | 
 3 | 线性链条件随机场简洁实现版本，参考：https://github.com/shawntan/python-crf
 4 | 
 5 | 代码的符号系统风格和李航老师的《统计学习方法》第11章保持一致，可对照书本查看代码，代码有比较详细的注释。
 6 | 
 7 | 亦可查阅我的博客，内有详细介绍：https://applenob.github.io/crf.html 
 8 | 
 9 | ## 执行demo
10 | `python3 example.py`
11 | 
12 | 注：这里的训练数据只有4条，因此结果不佳，主要用作原理理解。
13 | 
14 | 


--------------------------------------------------------------------------------
/crf.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import numpy as np
  3 | from scipy import special, optimize
  4 | 
  5 | # 引进特殊的起点和终点标记
  6 | START = '|-'
  7 | END = '-|'
  8 | 
  9 | 
 10 | def log_dot_vm(loga, logM):
 11 |     """通过log向量和log矩阵，计算log(向量 点乘 矩阵)"""
 12 |     return special.logsumexp(np.expand_dims(loga, axis=1) + logM, axis=0)
 13 | 
 14 | 
 15 | def log_dot_mv(logM, logb):
 16 |     """通过log向量和log矩阵，计算log(矩阵 点乘 向量)"""
 17 |     return special.logsumexp(logM + np.expand_dims(logb, axis=0), axis=1)
 18 | 
 19 | 
 20 | class CRF:
 21 |     def __init__(self, feature_functions, labels):
 22 |         self.ft_fun = feature_functions
 23 |         # 特征函数的权值，模型的主要参数，size：K
 24 |         self.w = np.random.randn(len(self.ft_fun))
 25 |         # self.labels = [START] + labels + [END]
 26 |         self.labels = labels
 27 |         self.label_id = {l: i for i, l in enumerate(self.labels)}
 28 | 
 29 |     def get_all_features(self, x_vec):
 30 |         """
 31 |         给定一个输入x_vec，计算这个输入上的所有的(y',y)组合的特征值。
 32 |         size: len(x_vec) + 1, Y, Y, K
 33 |         Axes:
 34 |         0 - T or time or sequence index
 35 |         1 - y' or previous label
 36 |         2 - y  or current  label
 37 |         3 - f(y', y, x_vec, i) for i s
 38 |         """
 39 |         result = np.zeros((len(x_vec) + 1, len(self.labels), len(self.labels), len(self.ft_fun)))
 40 |         for i in range(len(x_vec) + 1):
 41 |             for j, yp in enumerate(self.labels):
 42 |                 for k, y in enumerate(self.labels):
 43 |                     for l, f in enumerate(self.ft_fun):
 44 |                         result[i, j, k, l] = f(yp, y, x_vec, i)
 45 |         return result
 46 | 
 47 |     def forward(self, log_M_s, start):
 48 |         T = log_M_s.shape[0]
 49 |         Y = log_M_s.shape[1]
 50 |         alphas = np.NINF * np.ones((T+1, Y))  # log0 = ninf
 51 |         alpha = alphas[0]
 52 |         alpha[start] = 0  # log1 = 0
 53 |         for t in range(1, T+1):
 54 |             alphas[t] = log_dot_vm(alpha, log_M_s[t - 1])
 55 |             alpha = alphas[t]
 56 |         return alphas
 57 | 
 58 |     def backward(self, log_M_s, end):
 59 |         T = log_M_s.shape[0]
 60 |         Y = log_M_s.shape[1]
 61 |         betas = np.NINF * np.ones((T+1, Y))  # log0 = ninf
 62 |         # betas = np.zeros((T+1, Y))
 63 |         beta = betas[-1]
 64 |         beta[end] = 0  # log1 = 0
 65 |         for t in reversed(range(T)):
 66 |             betas[t] = log_dot_mv(log_M_s[t], beta)
 67 |             beta = betas[t]
 68 |         return betas
 69 | 
 70 |     def create_vector_list(self, x_vecs, y_vecs):
 71 |         """
 72 |         数据预处理
 73 |         """
 74 |         print("create vector list ...")
 75 |         print("total training data num:", len(x_vecs))
 76 |         observations = [self.get_all_features(x_vec) for x_vec in x_vecs]
 77 |         labels = len(y_vecs) * [None]
 78 | 
 79 |         for i in range(len(y_vecs)):
 80 |             assert (len(y_vecs[i]) == len(x_vecs[i]))
 81 |             y_vecs[i].insert(0, START)
 82 |             y_vecs[i].append(END)
 83 |             labels[i] = np.array([self.label_id[y] for y in y_vecs[i]], copy=False, dtype=np.int)
 84 | 
 85 |         return observations, labels
 86 | 
 87 |     def neg_likelihood_and_deriv(self, x_vec_list, y_vec_list, w, debug=False):
 88 |         """
 89 |         求负对数似然函数和关于w的偏导。
 90 |         关键变量的尺寸中，Y是标注空间的个数，K是特征函数的个数。
 91 |         """
 92 |         likelihood = 0
 93 |         derivative = np.zeros(len(self.w))
 94 |         # 对观测序列x的每一个位置
 95 |         for x_vec, y_vec in zip(x_vec_list, y_vec_list):
 96 |             # all_features, len(x_vec) + 1, Y, Y, K
 97 |             # all_features = self.get_all_features(x_vec)
 98 |             all_features = x_vec  # 这里为了提高速度，把上面这步统一在外面做
 99 |             length = x_vec.shape[0]
100 |             # y_vec = [START] + y_vec + [END]
101 |             yp_vec_ids = y_vec[:-1]
102 |             y_vec_ids = y_vec[1:]
103 |             # log_M_s: len(x_vec) + 1, Y, Y
104 |             log_M_s = np.dot(all_features, w)
105 |             # alphas: len(x_vec) + 2, Y
106 |             log_alphas = self.forward(log_M_s, self.label_id[START])
107 |             last = log_alphas[-1]
108 |             # betas: len(x_vec) + 2, Y
109 |             log_betas = self.backward(log_M_s, self.label_id[END])
110 |             log_Z = special.logsumexp(last)
111 |             # reshape
112 |             log_alphas1 = np.expand_dims(log_alphas[1:], axis=2)
113 |             log_betas1 = np.expand_dims(log_betas[:-1], axis=1)
114 |             #  log_probs: len(x_vec) + 1, Y, Y
115 |             log_probs = log_alphas1 + log_M_s + log_betas1 - log_Z
116 |             log_probs = np.expand_dims(log_probs, axis=3)
117 |             # 计算特征函数关于模型的期望
118 |             exp_features = np.sum(np.exp(log_probs) * all_features, axis=(0, 1, 2))
119 |             # 计算特征函数关于训练数据的期望
120 |             emp_features = np.sum(all_features[range(length), yp_vec_ids, y_vec_ids], axis=0)
121 |             # 计算似然函数
122 |             likelihood += np.sum(log_M_s[range(length), yp_vec_ids, y_vec_ids]) - log_Z
123 |             # 计算似然函数的偏导
124 |             derivative += emp_features - exp_features
125 | 
126 |         return -likelihood, -derivative
127 | 
128 |     def train(self, x_vecs, y_vecs, debug=False):
129 |         """训练模型，更新w"""
130 |         vectorised_x_vecs, vectorised_y_vecs = self.create_vector_list(x_vecs, y_vecs)
131 |         print("start training ...")
132 |         l = lambda w: self.neg_likelihood_and_deriv(vectorised_x_vecs, vectorised_y_vecs, w)
133 |         val = optimize.fmin_l_bfgs_b(l, self.w)
134 |         if debug:
135 |             print(val)
136 |         self.w, _, _ = val
137 |         return self.w
138 | 
139 |     def predict(self, x_vec, debug=False):
140 |         """给定x，预测y。使用Viterbi算法"""
141 |         # all_features, len(x_vec) + 1, Y, Y, K
142 |         all_features = self.get_all_features(x_vec)
143 |         # log_potential: len(x_vec) + 1, Y, Y  保存各个下标的非规范化概率
144 |         log_potential = np.dot(all_features, self.w)
145 |         T = len(x_vec)
146 |         Y = len(self.labels)
147 |         # Psi保存每个时刻最优情况的下标
148 |         Psi = np.ones((T, Y), dtype=np.int32) * -1
149 |         # 初始化
150 |         delta = log_potential[0, 0]
151 |         # 递推
152 |         for t in range(1, T):
153 |             next_delta = np.zeros(Y)
154 |             for y in range(Y):
155 |                 w = delta + log_potential[t, :, y]
156 |                 Psi[t, y] = psi = w.argmax()
157 |                 next_delta[y] = w[psi]
158 |             delta = next_delta
159 |         # 回溯找到最优路径
160 |         y = delta.argmax()
161 |         trace = []
162 |         for t in reversed(range(T)):
163 |             trace.append(y)
164 |             y = Psi[t, y]
165 |         trace.reverse()
166 |         return [self.labels[i] for i in trace]
167 | 
168 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
 1 | from crf import *
 2 | from collections import defaultdict
 3 | import re
 4 | import sys
 5 | 
 6 | 
 7 | def get_feature_functions(word_sets, labels, observes):
 8 |     """生成各种特征函数"""
 9 |     print("get feature functions ...")
10 |     transition_functions = [
11 |         lambda yp, y, x_v, i, _yp=_yp, _y=_y: 1 if yp == _yp and y == _y else 0
12 |         for _yp in labels[:-1] for _y in labels[1:]
13 |         ]
14 | 
15 |     def set_membership(tag, word_sets):
16 |         def fun(yp, y, x_v, i):
17 |             if i < len(x_v) and x_v[i].lower() in word_sets[tag]:
18 |                 return 1
19 |             else:
20 |                 return 0
21 |         return fun
22 | 
23 |     observation_functions = [set_membership(t, word_sets) for t in word_sets]
24 | 
25 |     misc_functions = [
26 |         lambda yp, y, x_v, i: 1 if i < len(x_v) and re.match('^[^0-9a-zA-Z]+$', x_v[i]) else 0,
27 |         lambda yp, y, x_v, i: 1 if i < len(x_v) and re.match('^[A-Z\.]+$', x_v[i]) else 0,
28 |         lambda yp, y, x_v, i: 1 if i < len(x_v) and re.match('^[0-9\.]+$', x_v[i]) else 0
29 |     ]
30 | 
31 |     tagval_functions = [
32 |         lambda yp, y, x_v, i, _y=_y, _x=_x: 1 if i < len(x_v) and y == _y and x_v[i].lower() == _x else 0
33 |         for _y in labels
34 |         for _x in observes]
35 | 
36 |     return transition_functions + tagval_functions + observation_functions + misc_functions
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     word_data = []
41 |     label_data = []
42 |     all_labels = set()
43 |     word_sets = defaultdict(set)
44 |     observes = set()
45 |     for line in open("sample.txt"):
46 |         words, labels = [], []
47 |         for token in line.strip().split():
48 |             word, label = token.split('/')
49 |             all_labels.add(label)
50 |             word_sets[label].add(word.lower())
51 |             observes.add(word.lower())
52 |             words.append(word)
53 |             labels.append(label)
54 | 
55 |         word_data.append(words)
56 |         label_data.append(labels)
57 | 
58 |     labels = [START, END] + list(all_labels)
59 |     feature_functions = get_feature_functions(word_sets, labels, observes)
60 | 
61 |     crf = CRF(labels=labels, feature_functions=feature_functions)
62 |     crf.train(word_data, label_data)
63 |     for x_vec, y_vec in zip(word_data[-5:], label_data[-5:]):
64 |         print("raw data: ", x_vec)
65 |         print("prediction: ", crf.predict(x_vec))
66 |         print("ground truth: ", y_vec)
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/sample.txt:
--------------------------------------------------------------------------------
1 | Confidence/NN in/IN the/DT pound/NN is/VBZ widely/RB expected/VBN to/TO take/VB another/DT sharp/JJ dive/NN if/IN trade/NN figures/NNS for/IN September/NNP ,/, due/JJ for/IN release/NN tomorrow/NN ,/, fail/VB to/TO show/VB a/DT substantial/JJ improvement/NN from/IN July/NNP and/CC August/NNP 's/POS near-record/JJ deficits/NNS ./.
2 | Chancellor/NNP of/IN the/DT Exchequer/NNP Nigel/NNP Lawson/NNP 's/POS restated/VBN commitment/NN to/TO a/DT firm/NN monetary/JJ policy/NN has/VBZ helped/VBN to/TO prevent/VB a/DT freefall/NN in/IN sterling/NN over/IN the/DT past/JJ week/NN ./.
3 | But/CC analysts/NNS reckon/VBP underlying/VBG support/NN for/IN sterling/NN has/VBZ been/VBN eroded/VBN by/IN the/DT chancellor/NN 's/POS failure/NN to/TO announce/VB any/DT new/JJ policy/NN measures/NNS in/IN his/PRP$ Mansion/NNP House/NNP speech/NN last/JJ Thursday/NNP ./.
4 | This/DT has/VBZ increased/VBN the/DT risk/NN of/IN the/DT government/NN being/VBG forced/VBN to/TO increase/VB base/NN rates/NNS to/TO 16/CD %/NN from/IN their/PRP$ current/JJ 15/CD %/NN level/NN to/TO defend/VB the/DT pound/NN ,/, economists/NNS and/CC foreign/JJ exchange/NN market/NN analysts/NNS say/VBP ./.
5 | 


--------------------------------------------------------------------------------