├── .gitignore
├── README.md
├── pretreat.py
├── run_pretreat.cmd
├── run_test_keras_model.cmd
├── run_train_keras_model.cmd
├── test_keras_model.py
├── train_keras_model.py
├── viterbi.py
├── word2vec_model
│   ├── .gitignore
│   ├── prepare_word2vec_train_dataset.cmd
│   ├── prepare_word2vec_train_dataset.py
│   ├── train_word2vec_model.cmd
│   └── train_word2vec_model.py
└── wwwrun.sh

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
dataset

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Description:
Named entity recognition implemented with an RNN.
Source code reference: http://www.jianshu.com/p/7e233ef57cb6
Dataset download: http://pan.baidu.com/s/1jIyNT7w

Training steps:
1. Train a word2vec model on the existing (already segmented) corpus.
2. Pre-process the corpus to produce the training and test inputs.
3. Build the RNN and train it, measuring accuracy while training.
4. Use the trained model to obtain the candidate tag sequences and select the most likely one with the Viterbi algorithm.
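
The steps above correspond to the run scripts in this repository. Below is a minimal end-to-end sketch of the pipeline, assuming the dataset archive has been unpacked into the `dataset/` directories that the run scripts expect:

```sh
# 1. build the segmented corpus and train the word2vec model
cd word2vec_model
python ./prepare_word2vec_train_dataset.py ./dataset/original_with_tag.utf8 ./dataset/original_split.utf8
python ./train_word2vec_model.py ./dataset/original_split.utf8 ./ner_training_word2vec.model ./ner_training_word2vec.vector
cd ..

# 2. pre-process the tagged corpus into training info (vocab + probabilities) and training data
python ./pretreat.py ./dataset/original_with_tag.utf8 ./ner_training.info ./ner_training.data

# 3. train the Keras LSTM tagger
python ./train_keras_model.py ./ner_training.info ./ner_training.data ./ner_keras_model ./keras_model_weights ./word2vec_model/ner_training_word2vec.model

# 4. tag the test file and decode with Viterbi
python ./test_keras_model.py ./ner_training.info ./ner_keras_model ./keras_model_weights ./dataset/ner_test.utf8 ./dataset/ner_test.utf8.result
```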

--------------------------------------------------------------------------------
/pretreat.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python pretreat.py input_file training_info_filePath training_data_filePath
'''

# 2016-03-03 Thursday 11:01:05 CST, by Demobin

import json
import h5py
import string
import codecs
import sys
import time

mappings = {
    # People's Daily tagset -> 863 tagset
    'w': 'wp',
    't': 'nt',
    'nr': 'nh',
    'nx': 'nz',
    'nn': 'n',
    'nzz': 'n',
    'na': 'n',
    'Ng': 'n',
    'f': 'nd',
    's': 'nl',
    'Vg': 'v',
    'vd': 'v',
    'vn': 'v',
    'vnn': 'v',
    'ad': 'a',
    'an': 'a',
    'Ag': 'a',
    'l': 'i',
    'z': 'a',
    'mq': 'm',
    'Mg': 'm',
    'Tg': 'nt',
    'y': 'u',
    'Yg': 'u',
    'Dg': 'd',
    'Rg': 'r',
    'Bg': 'b',
    'pn': 'p',
    'vvn': 'v',
}

tags_863 = {
    'a' : [0, '形容词'],
    'b' : [1, '区别词'],
    'c' : [2, '连词'],
    'd' : [3, '副词'],
    'e' : [4, '叹词'],
    'g' : [5, '语素字'],
    'h' : [6, '前接成分'],
    'i' : [7, '习用语'],
    'j' : [8, '简称'],
    'k' : [9, '后接成分'],
    'm' : [10, '数词'],
    'n' : [11, '名词'],
    'nd': [12, '方位名词'],
    'nh': [13, '人名'],
    'ni': [14, '团体、机构、组织的专名'],
    'nl': [15, '处所名词'],
    'ns': [16, '地名'],
    'nt': [17, '时间名词'],
    'nz': [18, '其它专名'],
    'o' : [19, '拟声词'],
    'p' : [20, '介词'],
    'q' : [21, '量词'],
    'r' : [22, '代词'],
    'u' : [23, '助词'],
    'v' : [24, '动词'],
    'wp': [25, '标点'],
    'ws': [26, '字符串'],
    'x' : [27, '非语素字'],
}

def genCorpusTags():
    # helper that generates the corpus_tags list below from the 863 tagset
    s = ''
    features = ['b', 'm', 'e', 's']
    for tag in tags_863:
        for f in features:
            s += '\'' + tag + '-' + f + '\'' + ',\n'
    print s

corpus_tags = [
    'nh-b','nh-m','nh-e','nh-s',
    'ni-b','ni-m','ni-e','ni-s',
    'nl-b','nl-m','nl-e','nl-s',
    'nd-b','nd-m','nd-e','nd-s',
    'nz-b','nz-m','nz-e','nz-s',
    'ns-b','ns-m','ns-e','ns-s',
    'nt-b','nt-m','nt-e','nt-s',
    'ws-b','ws-m','ws-e','ws-s',
    'wp-b','wp-m','wp-e','wp-s',
    'a-b','a-m','a-e','a-s',
    'c-b','c-m','c-e','c-s',
    'b-b','b-m','b-e','b-s',
    'e-b','e-m','e-e','e-s',
    'd-b','d-m','d-e','d-s',
    'g-b','g-m','g-e','g-s',
    'i-b','i-m','i-e','i-s',
    'h-b','h-m','h-e','h-s',
    'k-b','k-m','k-e','k-s',
    'j-b','j-m','j-e','j-s',
    'm-b','m-m','m-e','m-s',
    'o-b','o-m','o-e','o-s',
    'n-b','n-m','n-e','n-s',
    'q-b','q-m','q-e','q-s',
    'p-b','p-m','p-e','p-s',
    'r-b','r-m','r-e','r-s',
    'u-b','u-m','u-e','u-s',
    'v-b','v-m','v-e','v-s',
    'x-b','x-m','x-e','x-s'
]

retain_unknown = 'retain-unknown'
retain_padding = 'retain-padding'

def saveTrainingInfo(path, trainingInfo):
    '''Save the training dictionary and probabilities.'''
    print('save training info to %s'%path)
    fd = open(path, 'w')
    (initProb, tranProb), (vocab, indexVocab) = trainingInfo
    j = json.dumps((initProb, tranProb))
    fd.write(j + '\n')
    for char in vocab:
        fd.write(char.encode('utf-8') + '\t' + str(vocab[char]) + '\n')
    fd.close()

def loadTrainingInfo(path):
    '''Load the training dictionary and probabilities.'''
    print('load training info from %s'%path)
    fd = open(path, 'r')
    line = fd.readline()
    j = json.loads(line.strip())
    initProb, tranProb = j[0], j[1]
    lines = fd.readlines()
    fd.close()
    vocab = {}
    indexVocab = [0 for i in range(len(lines))]
    for line in lines:
        rst = line.strip().split('\t')
        if len(rst) < 2: continue
        char, index = rst[0].decode('utf-8'), int(rst[1])
        vocab[char] = index
        indexVocab[index] = char
    return (initProb, tranProb), (vocab, indexVocab)

def saveTrainingData(path, trainingData):
    '''Save the training input samples.'''
    print('save training data to %s'%path)
    # HDF5 is the most efficient way to store large matrices
    fd = h5py.File(path, 'w')
    (X, y) = trainingData
    fd.create_dataset('X', data = X)
    fd.create_dataset('y', data = y)
    fd.close()

def loadTrainingData(path):
    '''Load the training input samples.'''
    print('load training data from %s'%path)
    fd = h5py.File(path, 'r')
    X = fd['X'][:]
    y = fd['y'][:]
    fd.close()
    return (X, y)

def sent2vec2(sent, vocab, ctxWindows = 5):

    charVec = []
    for char in sent:
        if char in vocab:
            charVec.append(vocab[char])
        else:
            charVec.append(vocab[retain_unknown])
    # pad both ends
    num = len(charVec)
    pad = int((ctxWindows - 1)/2)
    for i in range(pad):
        charVec.insert(0, vocab[retain_padding])
        charVec.append(vocab[retain_padding])
    X = []
    for i in range(num):
        X.append(charVec[i:i + ctxWindows])
    return X

def sent2vec(sent, vocab, ctxWindows = 5):
    chars = []
    for char in sent:
        chars.append(char)
    return sent2vec2(chars, vocab, ctxWindows = ctxWindows)

def doc2vec(fname, vocab):
    '''Convert a document to vectors.'''

    # read the whole file at once; watch memory usage
    fd = codecs.open(fname, 'r', 'utf-8')
    lines = fd.readlines()
    fd.close()

    # sample set
    X = []
    y = []

    # tag statistics
    tagSize = len(corpus_tags)
    tagCnt = [0 for i in range(tagSize)]
    tagTranCnt = [[0 for i in range(tagSize)] for j in range(tagSize)]

    # iterate over lines
    for line in lines:
        # split on whitespace
        words = line.strip().split()
        # characters and tags of this line
        chars = []
        tags = []
        for word in words:
            word = word.strip('[ ')
            end_index = word.find(']')
            if end_index >= 0:
                word = word[0:end_index]
            rst = word.split('/')
            if len(rst) < 2:
                continue
            word, tag = rst[0], rst[1]
            if tag not in tags_863:
                tag = mappings[tag]

            # words with two or more characters
            if len(word) > 1:
                # first character of the word
                chars.append(word[0])
                tags.append(corpus_tags.index(tag + '-b'))
                # middle characters of the word
                for char in word[1:(len(word) - 1)]:
                    chars.append(char)
                    tags.append(corpus_tags.index(tag + '-m'))
                # last character of the word
                chars.append(word[-1])
                tags.append(corpus_tags.index(tag + '-e'))
            # single-character word
            else:
                chars.append(word)
                tags.append(corpus_tags.index(tag + '-s'))

        # character-vector representation of the line
        lineVecX = sent2vec2(chars, vocab, ctxWindows = 7)

        # collect tag statistics
        lineVecY = []
        lastTag = -1
        for tag in tags:
            # label vector
            lineVecY.append(tag)
            #lineVecY.append(corpus_tags[tag])
            # tag frequency
            tagCnt[tag] += 1
            # tag transition frequency
            if lastTag != -1:
                tagTranCnt[lastTag][tag] += 1
            # remember the previous tag
            lastTag = tag

        X.extend(lineVecX)
        y.extend(lineVecY)

    # total character count
    charCnt = sum(tagCnt)
    # total transition count
    tranCnt = sum([sum(tag) for tag in tagTranCnt])
    # initial tag probabilities
    initProb = []
    for i in range(tagSize):
        initProb.append(tagCnt[i]/float(charCnt))
    # tag transition probabilities
    tranProb = []
    for i in range(tagSize):
        p = []
        for j in range(tagSize):
            p.append(tagTranCnt[i][j]/float(tranCnt))
        tranProb.append(p)

    return X, y, initProb, tranProb

def vocabAddChar(vocab, indexVocab, index, char):
    if char not in vocab:
        vocab[char] = index
        indexVocab.append(char)
        index += 1
    return index

def genVocab(fname, delimiters = [' ', '\n']):

    # read the whole file at once; watch memory usage
    fd = codecs.open(fname, 'r', 'utf-8')
    lines = fd.readlines()
    fd.close()

    vocab = {}
    indexVocab = []
    # iterate over all lines
    index = 0
    for line in lines:
        words = line.strip().split()
        if len(words) <= 0: continue
        # iterate over all words
        # delimiters are not added to the dictionary
        for word in words:
            word = word.strip('[ ')
            end_index = word.find(']')
            if end_index >= 0:
                word = word[0:end_index]
            rst = word.split('/')
            if len(rst) < 2:
                continue
            word, tag = rst[0], rst[1]

            if word not in delimiters:
                index = vocabAddChar(vocab, indexVocab, index, word)

    # add the unknown-word and padding tokens
    vocab[retain_unknown] = len(vocab)
    vocab[retain_padding] = len(vocab)
    indexVocab.append(retain_unknown)
    indexVocab.append(retain_padding)
    # return the dictionary and the index list
    return vocab, indexVocab

def load(fname):
    print 'train from file', fname
    delims = [' ', '\n']
    vocab, indexVocab = genVocab(fname)
    X, y, initProb, tranProb = doc2vec(fname, vocab)
    print len(X), len(y), len(vocab), len(indexVocab)
    return (X, y), (initProb, tranProb), (vocab, indexVocab)

if __name__ == '__main__':
    start_time = time.time()

    if len(sys.argv) < 4:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    input_file, training_info_filePath, training_data_filePath = sys.argv[1:4]

    (X, y), (initProb, tranProb), (vocab, indexVocab) = load(input_file)
    saveTrainingInfo(training_info_filePath, ((initProb, tranProb), (vocab, indexVocab)))
    saveTrainingData(training_data_filePath, (X, y))

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))

--------------------------------------------------------------------------------
/run_pretreat.cmd:
--------------------------------------------------------------------------------
python ./pretreat.py ./dataset/original_with_tag.utf8 ./ner_training.info ./ner_training.data ./word2vec_model/ner_training_word2vec.model

--------------------------------------------------------------------------------
/run_test_keras_model.cmd:
--------------------------------------------------------------------------------
python ./test_keras_model.py ./ner_training.info ./ner_keras_model ./keras_model_weights ./dataset/ner_test.utf8 ./dataset/ner_test.utf8.result

--------------------------------------------------------------------------------
/run_train_keras_model.cmd:
--------------------------------------------------------------------------------
python ./train_keras_model.py ./ner_training.info ./ner_training.data ./ner_keras_model ./keras_model_weights ./word2vec_model/ner_training_word2vec.model

--------------------------------------------------------------------------------
/test_keras_model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python test_keras_model.py training_info_file keras_model_file keras_model_weights_file test_data_file output_file
'''

import numpy as np
import json
import h5py
import codecs
import time
import sys

import pretreat
import viterbi

from sklearn import model_selection

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential, Graph, model_from_json
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN

from gensim.models import Word2Vec

def loadModel(modelPath, weightPath):

    fd = open(modelPath, 'r')
    j = fd.read()
    fd.close()

    model = model_from_json(j)

    model.load_weights(weightPath)

    return model


# infer the tag sequence for an input sentence
def testSent(sent, model, trainingInfo):
    (initProb, tranProb), (vocab, indexVocab) = trainingInfo
    vec = pretreat.sent2vec(sent, vocab, ctxWindows = 7)
    vec = np.array(vec)
    probs = model.predict_proba(vec)
    #classes = model.predict_classes(vec)

    prob, path = viterbi.viterbi(vec, pretreat.corpus_tags, initProb, tranProb, probs.transpose())

    ss = ''
    for i, t in enumerate(path):
        ss += '%s/%s '%(sent[i], pretreat.corpus_tags[t])
    # ss = ''
    # word = ''
    # for i, t in enumerate(path):
    #     if cws.corpus_tags[t] == 'S':
    #         ss += sent[i] + ' '
    #         word = ''
    #     elif cws.corpus_tags[t] == 'B':
    #         word += sent[i]
    #     elif cws.corpus_tags[t] == 'E':
    #         word += sent[i]
    #         ss += word + ' '
    #         word = ''
    #     elif cws.corpus_tags[t] == 'M':
    #         word += sent[i]

    return ss

def testFile(fname, dstname, model, trainingInfo):
    fd = codecs.open(fname, 'r', 'utf-8')
    lines = fd.readlines()
    fd.close()

    fd = open(dstname, 'w')
    for line in lines:
        rst = testSent(line.strip(), model, trainingInfo)
        fd.write(rst.encode('utf-8') + '\n')
    fd.close()

if __name__ == '__main__':
    if len(sys.argv) < 6:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    training_info_file, keras_model_file, keras_model_weights_file, test_data_file, output_file = sys.argv[1:6]

    training_info = pretreat.loadTrainingInfo(training_info_file)
    print 'Loading model...'
    start_time = time.time()
    model = loadModel(keras_model_file, keras_model_weights_file)
    print("Loading used time : ", time.time() - start_time)
    print 'Done!'
    print '-------------start predict----------------'
    #s = u'为寂寞的夜空画上一个月亮'
    #print testSent(s, model, cwsInfo)
    testFile(test_data_file, output_file, model, training_info)

--------------------------------------------------------------------------------
/train_keras_model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python train_keras_model.py training_info_filePath training_data_filePath output_keras_model_file output_keras_model_weights_file word2vec_model_file
'''

import numpy as np
import json
import h5py
import codecs
import time
import sys

import pretreat
import viterbi

from sklearn import model_selection

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential, Graph, model_from_json
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN

from gensim.models import Word2Vec

def train(trainingInfo, trainingData, modelPath, weightPath, word2vec_model_file):

    (initProb, tranProb), (vocab, indexVocab) = trainingInfo
    (X, y) = trainingData

    train_X, test_X, train_y, test_y = model_selection.train_test_split(X, y, train_size=0.9, random_state=1)

    train_X = np.array(train_X)
    train_y = np.array(train_y)
    test_X = np.array(test_X)
    test_y = np.array(test_y)

    outputDims = len(pretreat.corpus_tags)
    Y_train = np_utils.to_categorical(train_y, outputDims)
    Y_test = np_utils.to_categorical(test_y, outputDims)
    batchSize = 128
    vocabSize = len(vocab) + 1
    wordDims = 100
    maxlen = 7
    hiddenDims = 100

    w2vModel = Word2Vec.load(word2vec_model_file)
    embeddingDim = w2vModel.vector_size
    embeddingUnknown = [0 for i in range(embeddingDim)]
    embeddingWeights = np.zeros((vocabSize + 1, embeddingDim))
    for word, index in vocab.items():
        if word in w2vModel:
            e = w2vModel[word]
        else:
            e = embeddingUnknown
        embeddingWeights[index, :] = e

    #LSTM
    model = Sequential()
    model.add(Embedding(output_dim = embeddingDim, input_dim = vocabSize + 1,
        input_length = maxlen, mask_zero = True, weights = [embeddingWeights]))
    model.add(LSTM(output_dim = hiddenDims, return_sequences = True))
    model.add(LSTM(output_dim = hiddenDims, return_sequences = False))
    model.add(Dropout(0.5))
    model.add(Dense(outputDims))
    model.add(Activation('softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics=["accuracy"])

    result = model.fit(train_X, Y_train, batch_size = batchSize,
        nb_epoch = 20, validation_data = (test_X, Y_test))

    j = model.to_json()
    fd = open(modelPath, 'w')
    fd.write(j)
    fd.close()

    model.save_weights(weightPath)

    return model

if __name__ == '__main__':
    if len(sys.argv) < 6:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    training_info_filePath, training_data_filePath, output_keras_model_file, output_keras_model_weights_file, word2vec_model_file = sys.argv[1:6]

    print 'Loading vocab...'
    start_time = time.time()
    trainingInfo = pretreat.loadTrainingInfo(training_info_filePath)
    trainingData = pretreat.loadTrainingData(training_data_filePath)
    print("Loading used time : ", time.time() - start_time)
    print 'Done!'

    print 'Training model...'
    start_time = time.time()
    model = train(trainingInfo, trainingData, output_keras_model_file, output_keras_model_weights_file, word2vec_model_file)
    print("Training used time : ", time.time() - start_time)
    print 'Done!'

--------------------------------------------------------------------------------
/viterbi.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-

# 2016-01-28 Thursday 17:14:03 CST, by Demobin

def _print(hiddenstates, V):
    s = " " + " ".join(("%7d" % i) for i in range(len(V))) + "\n"
    for i, state in enumerate(hiddenstates):
        s += "%.5s: " % state
        s += " ".join("%.7s" % ("%f" % v[i]) for v in V)
        s += "\n"
    print(s)

# Standard Viterbi algorithm; arguments are the observations, the hidden states,
# and the probability triple (initial, transition, emission)
def viterbi(obs, states, start_p, trans_p, emit_p):

    lenObs = len(obs)
    lenStates = len(states)

    V = [[0.0 for col in range(lenStates)] for row in range(lenObs)]
    path = [[0 for col in range(lenObs)] for row in range(lenStates)]

    # t = 0
    for y in range(lenStates):
        #V[0][y] = start_p[y] * emit_p[y][obs[0]]
        V[0][y] = start_p[y] * emit_p[y][0]
        path[y][0] = y

    # t >= 1
    for t in range(1, lenObs):
        newpath = [[0.0 for col in range(lenObs)] for row in range(lenStates)]

        for y in range(lenStates):
            prob = -1
            state = 0
            for y0 in range(lenStates):
                #nprob = V[t - 1][y0] * trans_p[y0][y] * emit_p[y][obs[t]]
                nprob = V[t - 1][y0] * trans_p[y0][y] * emit_p[y][t]
                if nprob > prob:
                    prob = nprob
                    state = y0
            # record the best probability
            V[t][y] = prob
            # record the path
            newpath[y][:t] = path[state][:t]
            newpath[y][t] = y

        path = newpath

    prob = -1
    state = 0
    for y in range(lenStates):
        if V[lenObs - 1][y] > prob:
            prob = V[lenObs - 1][y]
            state = y

    #_print(states, V)
    return prob, path[state]

def example():
    # hidden states
    hiddenstates = ('Healthy', 'Fever')
    # observations
    observations = ('normal', 'cold', 'dizzy')

    # initial probabilities
    '''
    'Healthy': 0.6, 'Fever': 0.4
    '''
    start_p = [0.6, 0.4]
    # transition probabilities
    '''
    'Healthy' : {'Healthy': 0.7, 'Fever': 0.3},
    'Fever'   : {'Healthy': 0.4, 'Fever': 0.6}
    '''
    trans_p = [[0.7, 0.3], [0.4, 0.6]]
    # emission / output / observation probabilities
    '''
    'Healthy' : {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
    'Fever'   : {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}
    '''
    emit_p = [[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]]

    return viterbi(observations,
                   hiddenstates,
                   start_p,
                   trans_p,
                   emit_p)

if __name__ == '__main__':
    print(example())

--------------------------------------------------------------------------------
/word2vec_model/.gitignore:
--------------------------------------------------------------------------------
dataset

--------------------------------------------------------------------------------
/word2vec_model/prepare_word2vec_train_dataset.cmd:
--------------------------------------------------------------------------------
python ./prepare_word2vec_train_dataset.py ./dataset/original_with_tag.utf8 ./dataset/original_split.utf8

--------------------------------------------------------------------------------
/word2vec_model/prepare_word2vec_train_dataset.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python prepare_word2vec_train_dataset.py input_file output_file
'''

import os
import sys
import logging
import multiprocessing
import time
import json

if __name__ == '__main__':
    start_time = time.time()

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    input_file, output_file = sys.argv[1:3]

    output_file_handler = open(output_file, 'w')
    for line in open(input_file, 'r'):
        new_line = ''
        words = line.strip().split()
        for word in words:
            word = word.strip('[ ')
            end_index = word.find(']')
            if end_index >= 0:
                word = word[0:end_index]
            # skip malformed tokens without a tag separator
            rst = word.split('/')
            if len(rst) < 2:
                continue
            word, tag = rst[0], rst[1]
            new_line = new_line + word + ' '
        output_file_handler.write(new_line.strip() + '\n')
    output_file_handler.flush()
    output_file_handler.close()

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))

--------------------------------------------------------------------------------
/word2vec_model/train_word2vec_model.cmd:
--------------------------------------------------------------------------------
python ./train_word2vec_model.py ./dataset/original_split.utf8 ./ner_training_word2vec.model ./ner_training_word2vec.vector

--------------------------------------------------------------------------------
/word2vec_model/train_word2vec_model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

'''
python train_word2vec_model.py input_file output_model_file output_vector_file
'''

# import modules & set up logging
import os
import sys
import logging
import multiprocessing
import time
import json

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

def output_vocab(vocab):
    for k, v in vocab.items():
        print(k)

if __name__ == '__main__':
    start_time = time.time()

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 4:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    input_file, output_model_file, output_vector_file = sys.argv[1:4]

    model = Word2Vec(LineSentence(input_file), size=128, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use (much) less RAM
    #model.init_sims(replace=True)
    model.save(output_model_file)
    model.save_word2vec_format(output_vector_file, binary=False)

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))

--------------------------------------------------------------------------------
/wwwrun.sh:
--------------------------------------------------------------------------------
/home/escenter11/gym/anaconda/bin/python ./train_keras_model.py ./ner_training.info ./ner_training.data ./cws_keras_model ./keras_model_weights ./word2vec_model/ner_training_word2vec.model
--------------------------------------------------------------------------------