├── .gitignore ├── README.md ├── crf.py ├── example.py └── sample.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Simple CRF 2 | 3 | 线性链条件随机场简洁实现版本,参考:https://github.com/shawntan/python-crf 4 | 5 | 代码的符号系统风格和李航老师的《统计学习方法》第11章保持一致,可对照书本查看代码,代码有比较详细的注释。 6 | 7 | 亦可查阅我的博客,内有详细介绍:https://applenob.github.io/crf.html 8 | 9 | ## 执行demo 10 | `python3 example.py` 11 | 12 | 注:这里的训练数据只有4条,因此结果不佳,主要用作原理理解。 13 | 14 | -------------------------------------------------------------------------------- /crf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import numpy as np 3 | from scipy import special, optimize 4 | 5 | # 引进特殊的起点和终点标记 6 | START = '|-' 7 | END = '-|' 8 | 9 | 10 | def log_dot_vm(loga, logM): 11 | """通过log向量和log矩阵,计算log(向量 点乘 矩阵)""" 12 | return special.logsumexp(np.expand_dims(loga, axis=1) + logM, axis=0) 13 | 14 | 15 | def log_dot_mv(logM, logb): 16 | """通过log向量和log矩阵,计算log(矩阵 点乘 向量)""" 17 | return special.logsumexp(logM + np.expand_dims(logb, axis=0), axis=1) 18 | 19 | 20 | class CRF: 21 | def __init__(self, feature_functions, labels): 22 | self.ft_fun = feature_functions 23 | # 特征函数的权值,模型的主要参数,size:K 24 | self.w = np.random.randn(len(self.ft_fun)) 25 | # self.labels = [START] + labels + [END] 26 | self.labels = labels 27 | self.label_id = {l: i for i, l in enumerate(self.labels)} 28 | 29 | def get_all_features(self, x_vec): 30 | """ 31 | 给定一个输入x_vec,计算这个输入上的所有的(y',y)组合的特征值。 32 | size: len(x_vec) + 1, Y, Y, K 33 | Axes: 34 | 0 - T or time or sequence index 35 | 1 - y' or previous label 36 | 2 - y or current label 37 | 3 - f(y', y, x_vec, i) for i s 38 | """ 39 | result = np.zeros((len(x_vec) + 1, len(self.labels), len(self.labels), len(self.ft_fun))) 40 | for i in range(len(x_vec) + 1): 41 | for j, yp in enumerate(self.labels): 42 | for k, y in enumerate(self.labels): 43 | for l, f in enumerate(self.ft_fun): 44 | result[i, j, k, l] = f(yp, y, x_vec, i) 45 | return result 46 | 47 | def forward(self, log_M_s, start): 48 | T = log_M_s.shape[0] 49 | Y = log_M_s.shape[1] 50 | alphas = np.NINF * np.ones((T+1, Y)) # log0 = ninf 51 | alpha = alphas[0] 52 | alpha[start] = 0 # log1 = 0 53 | for t in range(1, T+1): 54 | alphas[t] = log_dot_vm(alpha, log_M_s[t - 1]) 55 | alpha = alphas[t] 56 | return alphas 57 | 58 | def backward(self, log_M_s, end): 59 | T = log_M_s.shape[0] 60 | Y = log_M_s.shape[1] 61 | betas = np.NINF * np.ones((T+1, Y)) # log0 = ninf 62 | # betas = np.zeros((T+1, Y)) 63 | beta = betas[-1] 64 | beta[end] = 0 # log1 = 0 65 | for t in reversed(range(T)): 66 | betas[t] = log_dot_mv(log_M_s[t], beta) 67 | beta = betas[t] 68 | return betas 69 | 70 | def create_vector_list(self, x_vecs, y_vecs): 71 | """ 72 | 数据预处理 73 | """ 74 | print("create vector list ...") 75 | print("total training data num:", len(x_vecs)) 76 | observations = [self.get_all_features(x_vec) for x_vec in x_vecs] 77 | labels = len(y_vecs) * [None] 78 | 79 | for i in range(len(y_vecs)): 80 | assert (len(y_vecs[i]) == len(x_vecs[i])) 81 | y_vecs[i].insert(0, START) 82 | y_vecs[i].append(END) 83 | labels[i] = np.array([self.label_id[y] for y in y_vecs[i]], copy=False, dtype=np.int) 84 | 85 | return observations, labels 86 | 87 | def neg_likelihood_and_deriv(self, x_vec_list, y_vec_list, w, debug=False): 88 | """ 89 | 求负对数似然函数和关于w的偏导。 90 | 关键变量的尺寸中,Y是标注空间的个数,K是特征函数的个数。 91 | """ 92 | likelihood = 0 93 | derivative = np.zeros(len(self.w)) 94 | # 对观测序列x的每一个位置 95 | for x_vec, y_vec in zip(x_vec_list, y_vec_list): 96 | # all_features, len(x_vec) + 1, Y, Y, K 97 | # all_features = self.get_all_features(x_vec) 98 | all_features = x_vec # 这里为了提高速度,把上面这步统一在外面做 99 | length = x_vec.shape[0] 100 | # y_vec = [START] + y_vec + [END] 101 | yp_vec_ids = y_vec[:-1] 102 | y_vec_ids = y_vec[1:] 103 | # log_M_s: len(x_vec) + 1, Y, Y 104 | log_M_s = np.dot(all_features, w) 105 | # alphas: len(x_vec) + 2, Y 106 | log_alphas = self.forward(log_M_s, self.label_id[START]) 107 | last = log_alphas[-1] 108 | # betas: len(x_vec) + 2, Y 109 | log_betas = self.backward(log_M_s, self.label_id[END]) 110 | log_Z = special.logsumexp(last) 111 | # reshape 112 | log_alphas1 = np.expand_dims(log_alphas[1:], axis=2) 113 | log_betas1 = np.expand_dims(log_betas[:-1], axis=1) 114 | # log_probs: len(x_vec) + 1, Y, Y 115 | log_probs = log_alphas1 + log_M_s + log_betas1 - log_Z 116 | log_probs = np.expand_dims(log_probs, axis=3) 117 | # 计算特征函数关于模型的期望 118 | exp_features = np.sum(np.exp(log_probs) * all_features, axis=(0, 1, 2)) 119 | # 计算特征函数关于训练数据的期望 120 | emp_features = np.sum(all_features[range(length), yp_vec_ids, y_vec_ids], axis=0) 121 | # 计算似然函数 122 | likelihood += np.sum(log_M_s[range(length), yp_vec_ids, y_vec_ids]) - log_Z 123 | # 计算似然函数的偏导 124 | derivative += emp_features - exp_features 125 | 126 | return -likelihood, -derivative 127 | 128 | def train(self, x_vecs, y_vecs, debug=False): 129 | """训练模型,更新w""" 130 | vectorised_x_vecs, vectorised_y_vecs = self.create_vector_list(x_vecs, y_vecs) 131 | print("start training ...") 132 | l = lambda w: self.neg_likelihood_and_deriv(vectorised_x_vecs, vectorised_y_vecs, w) 133 | val = optimize.fmin_l_bfgs_b(l, self.w) 134 | if debug: 135 | print(val) 136 | self.w, _, _ = val 137 | return self.w 138 | 139 | def predict(self, x_vec, debug=False): 140 | """给定x,预测y。使用Viterbi算法""" 141 | # all_features, len(x_vec) + 1, Y, Y, K 142 | all_features = self.get_all_features(x_vec) 143 | # log_potential: len(x_vec) + 1, Y, Y 保存各个下标的非规范化概率 144 | log_potential = np.dot(all_features, self.w) 145 | T = len(x_vec) 146 | Y = len(self.labels) 147 | # Psi保存每个时刻最优情况的下标 148 | Psi = np.ones((T, Y), dtype=np.int32) * -1 149 | # 初始化 150 | delta = log_potential[0, 0] 151 | # 递推 152 | for t in range(1, T): 153 | next_delta = np.zeros(Y) 154 | for y in range(Y): 155 | w = delta + log_potential[t, :, y] 156 | Psi[t, y] = psi = w.argmax() 157 | next_delta[y] = w[psi] 158 | delta = next_delta 159 | # 回溯找到最优路径 160 | y = delta.argmax() 161 | trace = [] 162 | for t in reversed(range(T)): 163 | trace.append(y) 164 | y = Psi[t, y] 165 | trace.reverse() 166 | return [self.labels[i] for i in trace] 167 | 168 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from crf import * 2 | from collections import defaultdict 3 | import re 4 | import sys 5 | 6 | 7 | def get_feature_functions(word_sets, labels, observes): 8 | """生成各种特征函数""" 9 | print("get feature functions ...") 10 | transition_functions = [ 11 | lambda yp, y, x_v, i, _yp=_yp, _y=_y: 1 if yp == _yp and y == _y else 0 12 | for _yp in labels[:-1] for _y in labels[1:] 13 | ] 14 | 15 | def set_membership(tag, word_sets): 16 | def fun(yp, y, x_v, i): 17 | if i < len(x_v) and x_v[i].lower() in word_sets[tag]: 18 | return 1 19 | else: 20 | return 0 21 | return fun 22 | 23 | observation_functions = [set_membership(t, word_sets) for t in word_sets] 24 | 25 | misc_functions = [ 26 | lambda yp, y, x_v, i: 1 if i < len(x_v) and re.match('^[^0-9a-zA-Z]+$', x_v[i]) else 0, 27 | lambda yp, y, x_v, i: 1 if i < len(x_v) and re.match('^[A-Z\.]+$', x_v[i]) else 0, 28 | lambda yp, y, x_v, i: 1 if i < len(x_v) and re.match('^[0-9\.]+$', x_v[i]) else 0 29 | ] 30 | 31 | tagval_functions = [ 32 | lambda yp, y, x_v, i, _y=_y, _x=_x: 1 if i < len(x_v) and y == _y and x_v[i].lower() == _x else 0 33 | for _y in labels 34 | for _x in observes] 35 | 36 | return transition_functions + tagval_functions + observation_functions + misc_functions 37 | 38 | 39 | if __name__ == '__main__': 40 | word_data = [] 41 | label_data = [] 42 | all_labels = set() 43 | word_sets = defaultdict(set) 44 | observes = set() 45 | for line in open("sample.txt"): 46 | words, labels = [], [] 47 | for token in line.strip().split(): 48 | word, label = token.split('/') 49 | all_labels.add(label) 50 | word_sets[label].add(word.lower()) 51 | observes.add(word.lower()) 52 | words.append(word) 53 | labels.append(label) 54 | 55 | word_data.append(words) 56 | label_data.append(labels) 57 | 58 | labels = [START, END] + list(all_labels) 59 | feature_functions = get_feature_functions(word_sets, labels, observes) 60 | 61 | crf = CRF(labels=labels, feature_functions=feature_functions) 62 | crf.train(word_data, label_data) 63 | for x_vec, y_vec in zip(word_data[-5:], label_data[-5:]): 64 | print("raw data: ", x_vec) 65 | print("prediction: ", crf.predict(x_vec)) 66 | print("ground truth: ", y_vec) 67 | 68 | 69 | -------------------------------------------------------------------------------- /sample.txt: -------------------------------------------------------------------------------- 1 | Confidence/NN in/IN the/DT pound/NN is/VBZ widely/RB expected/VBN to/TO take/VB another/DT sharp/JJ dive/NN if/IN trade/NN figures/NNS for/IN September/NNP ,/, due/JJ for/IN release/NN tomorrow/NN ,/, fail/VB to/TO show/VB a/DT substantial/JJ improvement/NN from/IN July/NNP and/CC August/NNP 's/POS near-record/JJ deficits/NNS ./. 2 | Chancellor/NNP of/IN the/DT Exchequer/NNP Nigel/NNP Lawson/NNP 's/POS restated/VBN commitment/NN to/TO a/DT firm/NN monetary/JJ policy/NN has/VBZ helped/VBN to/TO prevent/VB a/DT freefall/NN in/IN sterling/NN over/IN the/DT past/JJ week/NN ./. 3 | But/CC analysts/NNS reckon/VBP underlying/VBG support/NN for/IN sterling/NN has/VBZ been/VBN eroded/VBN by/IN the/DT chancellor/NN 's/POS failure/NN to/TO announce/VB any/DT new/JJ policy/NN measures/NNS in/IN his/PRP$ Mansion/NNP House/NNP speech/NN last/JJ Thursday/NNP ./. 4 | This/DT has/VBZ increased/VBN the/DT risk/NN of/IN the/DT government/NN being/VBG forced/VBN to/TO increase/VB base/NN rates/NNS to/TO 16/CD %/NN from/IN their/PRP$ current/JJ 15/CD %/NN level/NN to/TO defend/VB the/DT pound/NN ,/, economists/NNS and/CC foreign/JJ exchange/NN market/NN analysts/NNS say/VBP ./. 5 | --------------------------------------------------------------------------------