├── .gitignore
├── README.md
├── pretreat.py
├── run_pretreat.cmd
├── run_test_keras_model.cmd
├── run_train_keras_model.cmd
├── test_keras_model.py
├── train_keras_model.py
├── viterbi.py
├── word2vec_model
│   ├── .gitignore
│   ├── prepare_word2vec_train_dataset.cmd
│   ├── prepare_word2vec_train_dataset.py
│   ├── train_word2vec_model.cmd
│   └── train_word2vec_model.py
└── wwwrun.sh

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
dataset

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Description:
Named entity recognition implemented with an RNN.
Source code reference: http://www.jianshu.com/p/7e233ef57cb6
Dataset download: http://pan.baidu.com/s/1jIyNT7w

Training steps:
1. Train a word2vec model on the existing (already segmented) corpus.
2. Pre-process the corpus to produce the training and test inputs.
3. Build the RNN and train it, measuring accuracy while training.
4. Use the trained model to obtain the candidate tag sequences and select the most likely one with the Viterbi algorithm.
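
The steps above correspond to the run scripts in this repository. Below is a minimal end-to-end sketch of the pipeline, assuming the dataset archive has been unpacked into the `dataset/` directories that the run scripts expect:

```sh
# 1. build the segmented corpus and train the word2vec model
cd word2vec_model
python ./prepare_word2vec_train_dataset.py ./dataset/original_with_tag.utf8 ./dataset/original_split.utf8
python ./train_word2vec_model.py ./dataset/original_split.utf8 ./ner_training_word2vec.model ./ner_training_word2vec.vector
cd ..

# 2. pre-process the tagged corpus into training info (vocab + probabilities) and training data
python ./pretreat.py ./dataset/original_with_tag.utf8 ./ner_training.info ./ner_training.data

# 3. train the Keras LSTM tagger
python ./train_keras_model.py ./ner_training.info ./ner_training.data ./ner_keras_model ./keras_model_weights ./word2vec_model/ner_training_word2vec.model

# 4. tag the test file and decode with Viterbi
python ./test_keras_model.py ./ner_training.info ./ner_keras_model ./keras_model_weights ./dataset/ner_test.utf8 ./dataset/ner_test.utf8.result
```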

--------------------------------------------------------------------------------
/pretreat.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python pretreat.py input_file training_info_filePath training_data_filePath
'''

# 2016-03-03 Thursday 11:01:05 CST, by Demobin

import json
import h5py
import string
import codecs
import sys
import time

mappings = {
    # People's Daily tagset -> 863 tagset
    'w': 'wp',
    't': 'nt',
    'nr': 'nh',
    'nx': 'nz',
    'nn': 'n',
    'nzz': 'n',
    'na': 'n',
    'Ng': 'n',
    'f': 'nd',
    's': 'nl',
    'Vg': 'v',
    'vd': 'v',
    'vn': 'v',
    'vnn': 'v',
    'ad': 'a',
    'an': 'a',
    'Ag': 'a',
    'l': 'i',
    'z': 'a',
    'mq': 'm',
    'Mg': 'm',
    'Tg': 'nt',
    'y': 'u',
    'Yg': 'u',
    'Dg': 'd',
    'Rg': 'r',
    'Bg': 'b',
    'pn': 'p',
    'vvn': 'v',
}

tags_863 = {
    'a' : [0, '形容词'],
    'b' : [1, '区别词'],
    'c' : [2, '连词'],
    'd' : [3, '副词'],
    'e' : [4, '叹词'],
    'g' : [5, '语素字'],
    'h' : [6, '前接成分'],
    'i' : [7, '习用语'],
    'j' : [8, '简称'],
    'k' : [9, '后接成分'],
    'm' : [10, '数词'],
    'n' : [11, '名词'],
    'nd': [12, '方位名词'],
    'nh': [13, '人名'],
    'ni': [14, '团体、机构、组织的专名'],
    'nl': [15, '处所名词'],
    'ns': [16, '地名'],
    'nt': [17, '时间名词'],
    'nz': [18, '其它专名'],
    'o' : [19, '拟声词'],
    'p' : [20, '介词'],
    'q' : [21, '量词'],
    'r' : [22, '代词'],
    'u' : [23, '助词'],
    'v' : [24, '动词'],
    'wp': [25, '标点'],
    'ws': [26, '字符串'],
    'x' : [27, '非语素字'],
}

def genCorpusTags():
    # helper that generates the corpus_tags list below from the 863 tagset
    s = ''
    features = ['b', 'm', 'e', 's']
    for tag in tags_863:
        for f in features:
            s += '\'' + tag + '-' + f + '\'' + ',\n'
    print s

corpus_tags = [
    'nh-b','nh-m','nh-e','nh-s',
    'ni-b','ni-m','ni-e','ni-s',
    'nl-b','nl-m','nl-e','nl-s',
    'nd-b','nd-m','nd-e','nd-s',
    'nz-b','nz-m','nz-e','nz-s',
    'ns-b','ns-m','ns-e','ns-s',
    'nt-b','nt-m','nt-e','nt-s',
    'ws-b','ws-m','ws-e','ws-s',
    'wp-b','wp-m','wp-e','wp-s',
    'a-b','a-m','a-e','a-s',
    'c-b','c-m','c-e','c-s',
    'b-b','b-m','b-e','b-s',
    'e-b','e-m','e-e','e-s',
    'd-b','d-m','d-e','d-s',
    'g-b','g-m','g-e','g-s',
    'i-b','i-m','i-e','i-s',
    'h-b','h-m','h-e','h-s',
    'k-b','k-m','k-e','k-s',
    'j-b','j-m','j-e','j-s',
    'm-b','m-m','m-e','m-s',
    'o-b','o-m','o-e','o-s',
    'n-b','n-m','n-e','n-s',
    'q-b','q-m','q-e','q-s',
    'p-b','p-m','p-e','p-s',
    'r-b','r-m','r-e','r-s',
    'u-b','u-m','u-e','u-s',
    'v-b','v-m','v-e','v-s',
    'x-b','x-m','x-e','x-s'
]

retain_unknown = 'retain-unknown'
retain_padding = 'retain-padding'

def saveTrainingInfo(path, trainingInfo):
    '''Save the training dictionary and probabilities.'''
    print('save training info to %s'%path)
    fd = open(path, 'w')
    (initProb, tranProb), (vocab, indexVocab) = trainingInfo
    j = json.dumps((initProb, tranProb))
    fd.write(j + '\n')
    for char in vocab:
        fd.write(char.encode('utf-8') + '\t' + str(vocab[char]) + '\n')
    fd.close()

def loadTrainingInfo(path):
    '''Load the training dictionary and probabilities.'''
    print('load training info from %s'%path)
    fd = open(path, 'r')
    line = fd.readline()
    j = json.loads(line.strip())
    initProb, tranProb = j[0], j[1]
    lines = fd.readlines()
    fd.close()
    vocab = {}
    indexVocab = [0 for i in range(len(lines))]
    for line in lines:
        rst = line.strip().split('\t')
        if len(rst) < 2: continue
        char, index = rst[0].decode('utf-8'), int(rst[1])
        vocab[char] = index
        indexVocab[index] = char
    return (initProb, tranProb), (vocab, indexVocab)

def saveTrainingData(path, trainingData):
    '''Save the training input samples.'''
    print('save training data to %s'%path)
    # HDF5 is the most efficient way to store large matrices
    fd = h5py.File(path, 'w')
    (X, y) = trainingData
    fd.create_dataset('X', data = X)
    fd.create_dataset('y', data = y)
    fd.close()

def loadTrainingData(path):
    '''Load the training input samples.'''
    print('load training data from %s'%path)
    fd = h5py.File(path, 'r')
    X = fd['X'][:]
    y = fd['y'][:]
    fd.close()
    return (X, y)

def sent2vec2(sent, vocab, ctxWindows = 5):

    charVec = []
    for char in sent:
        if char in vocab:
            charVec.append(vocab[char])
        else:
            charVec.append(vocab[retain_unknown])
    # pad both ends
    num = len(charVec)
    pad = int((ctxWindows - 1)/2)
    for i in range(pad):
        charVec.insert(0, vocab[retain_padding])
        charVec.append(vocab[retain_padding])
    X = []
    for i in range(num):
        X.append(charVec[i:i + ctxWindows])
    return X

def sent2vec(sent, vocab, ctxWindows = 5):
    chars = []
    for char in sent:
        chars.append(char)
    return sent2vec2(chars, vocab, ctxWindows = ctxWindows)

def doc2vec(fname, vocab):
    '''Convert a document to vectors.'''

    # read the whole file at once; watch memory usage
    fd = codecs.open(fname, 'r', 'utf-8')
    lines = fd.readlines()
    fd.close()

    # sample set
    X = []
    y = []

    # tag statistics
    tagSize = len(corpus_tags)
    tagCnt = [0 for i in range(tagSize)]
    tagTranCnt = [[0 for i in range(tagSize)] for j in range(tagSize)]

    # iterate over lines
    for line in lines:
        # split on whitespace
        words = line.strip().split()
        # characters and tags of this line
        chars = []
        tags = []
        for word in words:
            word = word.strip('[ ')
            end_index = word.find(']')
            if end_index >= 0:
                word = word[0:end_index]
            rst = word.split('/')
            if len(rst) < 2:
                continue
            word, tag = rst[0], rst[1]
            if tag not in tags_863:
                tag = mappings[tag]

            # words with two or more characters
            if len(word) > 1:
                # first character of the word
                chars.append(word[0])
                tags.append(corpus_tags.index(tag + '-b'))
                # middle characters of the word
                for char in word[1:(len(word) - 1)]:
                    chars.append(char)
                    tags.append(corpus_tags.index(tag + '-m'))
                # last character of the word
                chars.append(word[-1])
                tags.append(corpus_tags.index(tag + '-e'))
            # single-character word
            else:
                chars.append(word)
                tags.append(corpus_tags.index(tag + '-s'))

        # character-vector representation of the line
        lineVecX = sent2vec2(chars, vocab, ctxWindows = 7)

        # collect tag statistics
        lineVecY = []
        lastTag = -1
        for tag in tags:
            # label vector
            lineVecY.append(tag)
            #lineVecY.append(corpus_tags[tag])
            # tag frequency
            tagCnt[tag] += 1
            # tag transition frequency
            if lastTag != -1:
                tagTranCnt[lastTag][tag] += 1
            # remember the previous tag
            lastTag = tag

        X.extend(lineVecX)
        y.extend(lineVecY)

    # total character count
    charCnt = sum(tagCnt)
    # total transition count
    tranCnt = sum([sum(tag) for tag in tagTranCnt])
    # initial tag probabilities
    initProb = []
    for i in range(tagSize):
        initProb.append(tagCnt[i]/float(charCnt))
    # tag transition probabilities
    tranProb = []
    for i in range(tagSize):
        p = []
        for j in range(tagSize):
            p.append(tagTranCnt[i][j]/float(tranCnt))
        tranProb.append(p)

    return X, y, initProb, tranProb

def vocabAddChar(vocab, indexVocab, index, char):
    if char not in vocab:
        vocab[char] = index
        indexVocab.append(char)
        index += 1
    return index

def genVocab(fname, delimiters = [' ', '\n']):

    # read the whole file at once; watch memory usage
    fd = codecs.open(fname, 'r', 'utf-8')
    lines = fd.readlines()
    fd.close()

    vocab = {}
    indexVocab = []
    # iterate over all lines
    index = 0
    for line in lines:
        words = line.strip().split()
        if len(words) <= 0: continue
        # iterate over all words
        # delimiters are not added to the dictionary
        for word in words:
            word = word.strip('[ ')
            end_index = word.find(']')
            if end_index >= 0:
                word = word[0:end_index]
            rst = word.split('/')
            if len(rst) < 2:
                continue
            word, tag = rst[0], rst[1]

            if word not in delimiters:
                index = vocabAddChar(vocab, indexVocab, index, word)

    # add the unknown-word and padding tokens
    vocab[retain_unknown] = len(vocab)
    vocab[retain_padding] = len(vocab)
    indexVocab.append(retain_unknown)
    indexVocab.append(retain_padding)
    # return the dictionary and the index list
    return vocab, indexVocab

def load(fname):
    print 'train from file', fname
    delims = [' ', '\n']
    vocab, indexVocab = genVocab(fname)
    X, y, initProb, tranProb = doc2vec(fname, vocab)
    print len(X), len(y), len(vocab), len(indexVocab)
    return (X, y), (initProb, tranProb), (vocab, indexVocab)

if __name__ == '__main__':
    start_time = time.time()

    if len(sys.argv) < 4:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    input_file, training_info_filePath, training_data_filePath = sys.argv[1:4]

    (X, y), (initProb, tranProb), (vocab, indexVocab) = load(input_file)
    saveTrainingInfo(training_info_filePath, ((initProb, tranProb), (vocab, indexVocab)))
    saveTrainingData(training_data_filePath, (X, y))

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))

--------------------------------------------------------------------------------
/run_pretreat.cmd:
--------------------------------------------------------------------------------
python ./pretreat.py ./dataset/original_with_tag.utf8 ./ner_training.info ./ner_training.data ./word2vec_model/ner_training_word2vec.model

--------------------------------------------------------------------------------
/run_test_keras_model.cmd:
--------------------------------------------------------------------------------
python ./test_keras_model.py ./ner_training.info ./ner_keras_model ./keras_model_weights ./dataset/ner_test.utf8 ./dataset/ner_test.utf8.result

--------------------------------------------------------------------------------
/run_train_keras_model.cmd:
--------------------------------------------------------------------------------
python ./train_keras_model.py ./ner_training.info ./ner_training.data ./ner_keras_model ./keras_model_weights ./word2vec_model/ner_training_word2vec.model

--------------------------------------------------------------------------------
/test_keras_model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python test_keras_model.py training_info_file keras_model_file keras_model_weights_file test_data_file output_file
'''

import numpy as np
import json
import h5py
import codecs
import time
import sys

import pretreat
import viterbi

from sklearn import model_selection

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential, Graph, model_from_json
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN

from gensim.models import Word2Vec

def loadModel(modelPath, weightPath):

    fd = open(modelPath, 'r')
    j = fd.read()
    fd.close()

    model = model_from_json(j)

    model.load_weights(weightPath)

    return model


# infer the tag sequence for an input sentence
def testSent(sent, model, trainingInfo):
    (initProb, tranProb), (vocab, indexVocab) = trainingInfo
    vec = pretreat.sent2vec(sent, vocab, ctxWindows = 7)
    vec = np.array(vec)
    probs = model.predict_proba(vec)
    #classes = model.predict_classes(vec)

    prob, path = viterbi.viterbi(vec, pretreat.corpus_tags, initProb, tranProb, probs.transpose())

    ss = ''
    for i, t in enumerate(path):
        ss += '%s/%s '%(sent[i], pretreat.corpus_tags[t])
    # ss = ''
    # word = ''
    # for i, t in enumerate(path):
    #     if cws.corpus_tags[t] == 'S':
    #         ss += sent[i] + ' '
    #         word = ''
    #     elif cws.corpus_tags[t] == 'B':
    #         word += sent[i]
    #     elif cws.corpus_tags[t] == 'E':
    #         word += sent[i]
    #         ss += word + ' '
    #         word = ''
    #     elif cws.corpus_tags[t] == 'M':
    #         word += sent[i]

    return ss

def testFile(fname, dstname, model, trainingInfo):
    fd = codecs.open(fname, 'r', 'utf-8')
    lines = fd.readlines()
    fd.close()

    fd = open(dstname, 'w')
    for line in lines:
        rst = testSent(line.strip(), model, trainingInfo)
        fd.write(rst.encode('utf-8') + '\n')
    fd.close()

if __name__ == '__main__':
    if len(sys.argv) < 6:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    training_info_file, keras_model_file, keras_model_weights_file, test_data_file, output_file = sys.argv[1:6]

    training_info = pretreat.loadTrainingInfo(training_info_file)
    print 'Loading model...'
    start_time = time.time()
    model = loadModel(keras_model_file, keras_model_weights_file)
    print("Loading used time : ", time.time() - start_time)
    print 'Done!'
    print '-------------start predict----------------'
    #s = u'为寂寞的夜空画上一个月亮'
    #print testSent(s, model, cwsInfo)
    testFile(test_data_file, output_file, model, training_info)

--------------------------------------------------------------------------------
/train_keras_model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python train_keras_model.py training_info_filePath training_data_filePath output_keras_model_file output_keras_model_weights_file word2vec_model_file
'''

import numpy as np
import json
import h5py
import codecs
import time
import sys

import pretreat
import viterbi

from sklearn import model_selection

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential, Graph, model_from_json
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN

from gensim.models import Word2Vec

def train(trainingInfo, trainingData, modelPath, weightPath, word2vec_model_file):

    (initProb, tranProb), (vocab, indexVocab) = trainingInfo
    (X, y) = trainingData

    train_X, test_X, train_y, test_y = model_selection.train_test_split(X, y, train_size=0.9, random_state=1)

    train_X = np.array(train_X)
    train_y = np.array(train_y)
    test_X = np.array(test_X)
    test_y = np.array(test_y)

    outputDims = len(pretreat.corpus_tags)
    Y_train = np_utils.to_categorical(train_y, outputDims)
    Y_test = np_utils.to_categorical(test_y, outputDims)
    batchSize = 128
    vocabSize = len(vocab) + 1
    wordDims = 100
    maxlen = 7
    hiddenDims = 100

    w2vModel = Word2Vec.load(word2vec_model_file)
    embeddingDim = w2vModel.vector_size
    embeddingUnknown = [0 for i in range(embeddingDim)]
    embeddingWeights = np.zeros((vocabSize + 1, embeddingDim))
    for word, index in vocab.items():
        if word in w2vModel:
            e = w2vModel[word]
        else:
            e = embeddingUnknown
        embeddingWeights[index, :] = e

    #LSTM
    model = Sequential()
    model.add(Embedding(output_dim = embeddingDim, input_dim = vocabSize + 1,
        input_length = maxlen, mask_zero = True, weights = [embeddingWeights]))
    model.add(LSTM(output_dim = hiddenDims, return_sequences = True))
    model.add(LSTM(output_dim = hiddenDims, return_sequences = False))
    model.add(Dropout(0.5))
    model.add(Dense(outputDims))
    model.add(Activation('softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics=["accuracy"])

    result = model.fit(train_X, Y_train, batch_size = batchSize,
        nb_epoch = 20, validation_data = (test_X, Y_test))

    j = model.to_json()
    fd = open(modelPath, 'w')
    fd.write(j)
    fd.close()

    model.save_weights(weightPath)

    return model

if __name__ == '__main__':
    if len(sys.argv) < 6:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    training_info_filePath, training_data_filePath, output_keras_model_file, output_keras_model_weights_file, word2vec_model_file = sys.argv[1:6]

    print 'Loading vocab...'
    start_time = time.time()
    trainingInfo = pretreat.loadTrainingInfo(training_info_filePath)
    trainingData = pretreat.loadTrainingData(training_data_filePath)
    print("Loading used time : ", time.time() - start_time)
    print 'Done!'

    print 'Training model...'
    start_time = time.time()
    model = train(trainingInfo, trainingData, output_keras_model_file, output_keras_model_weights_file, word2vec_model_file)
    print("Training used time : ", time.time() - start_time)
    print 'Done!'

--------------------------------------------------------------------------------
/viterbi.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-

# 2016-01-28 Thursday 17:14:03 CST, by Demobin

def _print(hiddenstates, V):
    s = " " + " ".join(("%7d" % i) for i in range(len(V))) + "\n"
    for i, state in enumerate(hiddenstates):
        s += "%.5s: " % state
        s += " ".join("%.7s" % ("%f" % v[i]) for v in V)
        s += "\n"
    print(s)

# Standard Viterbi algorithm; arguments are the observations, the hidden states,
# and the probability triple (initial, transition, emission)
def viterbi(obs, states, start_p, trans_p, emit_p):

    lenObs = len(obs)
    lenStates = len(states)

    V = [[0.0 for col in range(lenStates)] for row in range(lenObs)]
    path = [[0 for col in range(lenObs)] for row in range(lenStates)]

    # t = 0
    for y in range(lenStates):
        #V[0][y] = start_p[y] * emit_p[y][obs[0]]
        V[0][y] = start_p[y] * emit_p[y][0]
        path[y][0] = y

    # t >= 1
    for t in range(1, lenObs):
        newpath = [[0.0 for col in range(lenObs)] for row in range(lenStates)]

        for y in range(lenStates):
            prob = -1
            state = 0
            for y0 in range(lenStates):
                #nprob = V[t - 1][y0] * trans_p[y0][y] * emit_p[y][obs[t]]
                nprob = V[t - 1][y0] * trans_p[y0][y] * emit_p[y][t]
                if nprob > prob:
                    prob = nprob
                    state = y0
            # record the best probability
            V[t][y] = prob
            # record the path
            newpath[y][:t] = path[state][:t]
            newpath[y][t] = y

        path = newpath

    prob = -1
    state = 0
    for y in range(lenStates):
        if V[lenObs - 1][y] > prob:
            prob = V[lenObs - 1][y]
            state = y

    #_print(states, V)
    return prob, path[state]

def example():
    # hidden states
    hiddenstates = ('Healthy', 'Fever')
    # observations
    observations = ('normal', 'cold', 'dizzy')

    # initial probabilities
    '''
    'Healthy': 0.6, 'Fever': 0.4
    '''
    start_p = [0.6, 0.4]
    # transition probabilities
    '''
    'Healthy' : {'Healthy': 0.7, 'Fever': 0.3},
    'Fever'   : {'Healthy': 0.4, 'Fever': 0.6}
    '''
    trans_p = [[0.7, 0.3], [0.4, 0.6]]
    # emission / output / observation probabilities
    '''
    'Healthy' : {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
    'Fever'   : {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}
    '''
    emit_p = [[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]]

    return viterbi(observations,
                   hiddenstates,
                   start_p,
                   trans_p,
                   emit_p)

if __name__ == '__main__':
    print(example())

--------------------------------------------------------------------------------
/word2vec_model/.gitignore:
--------------------------------------------------------------------------------
dataset

--------------------------------------------------------------------------------
/word2vec_model/prepare_word2vec_train_dataset.cmd:
--------------------------------------------------------------------------------
python ./prepare_word2vec_train_dataset.py ./dataset/original_with_tag.utf8 ./dataset/original_split.utf8

--------------------------------------------------------------------------------
/word2vec_model/prepare_word2vec_train_dataset.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python prepare_word2vec_train_dataset.py input_file output_file
'''

import os
import sys
import logging
import multiprocessing
import time
import json

if __name__ == '__main__':
    start_time = time.time()

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    input_file, output_file = sys.argv[1:3]

    output_file_handler = open(output_file, 'w')
    for line in open(input_file, 'r'):
        new_line = ''
        words = line.strip().split()
        for word in words:
            word = word.strip('[ ')
            end_index = word.find(']')
            if end_index >= 0:
                word = word[0:end_index]
            # skip malformed tokens without a tag separator
            rst = word.split('/')
            if len(rst) < 2:
                continue
            word, tag = rst[0], rst[1]
            new_line = new_line + word + ' '
        output_file_handler.write(new_line.strip() + '\n')
    output_file_handler.flush()
    output_file_handler.close()

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))

--------------------------------------------------------------------------------
/word2vec_model/train_word2vec_model.cmd:
--------------------------------------------------------------------------------
python ./train_word2vec_model.py ./dataset/original_split.utf8 ./ner_training_word2vec.model ./ner_training_word2vec.vector

--------------------------------------------------------------------------------
/word2vec_model/train_word2vec_model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python train_word2vec_model.py input_file output_model_file output_vector_file
'''

# import modules & set up logging
import os
import sys
import logging
import multiprocessing
import time
import json

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

def output_vocab(vocab):
    for k, v in vocab.items():
        print(k)

if __name__ == '__main__':
    start_time = time.time()

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 4:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    input_file, output_model_file, output_vector_file = sys.argv[1:4]

    model = Word2Vec(LineSentence(input_file), size=128, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use (much) less RAM
    #model.init_sims(replace=True)
    model.save(output_model_file)
    model.save_word2vec_format(output_vector_file, binary=False)

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))

--------------------------------------------------------------------------------
/wwwrun.sh:
--------------------------------------------------------------------------------
/home/escenter11/gym/anaconda/bin/python ./train_keras_model.py ./ner_training.info ./ner_training.data ./cws_keras_model ./keras_model_weights ./word2vec_model/ner_training_word2vec.model
--------------------------------------------------------------------------------