├── pyseg
    ├── data
    │   ├── test.utf8
    │   ├── corpus.pkl
    │   ├── extra_dict.pkl
    │   ├── fmm_model.pkl
    │   ├── hmm_model.pkl
    │   ├── partspeech.pkl
    │   └── tagger.py
    ├── LSTMmodel.py
    ├── __init__.py
    ├── FMMmodel.py
    └── HMMmodel.py
├── test.py
└── README.md

/pyseg/data/test.utf8:
--------------------------------------------------------------------------------
建设 中国
特色
社会主义 道路
--------------------------------------------------------------------------------
/pyseg/data/corpus.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/corpus.pkl
--------------------------------------------------------------------------------
/pyseg/data/extra_dict.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/extra_dict.pkl
--------------------------------------------------------------------------------
/pyseg/data/fmm_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/fmm_model.pkl
--------------------------------------------------------------------------------
/pyseg/data/hmm_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/hmm_model.pkl
--------------------------------------------------------------------------------
/pyseg/data/partspeech.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/partspeech.pkl
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import pyseg
from pyseg.LSTMmodel import LSTMTagger

# Hidden Markov Model

res1 = pyseg.cut('这个程序不能准确的分割出喜欢,这也就是概率模型的问题所在。')
res2 = pyseg.cut('中国特色社会主义,邓小平改革开放')
res3 = pyseg.cut('发言人强调,该法案涉台内容严重违反一个中国原则和中美三个联合公报规定,严重损害中美关系和台海和平稳定。')
res4 = pyseg.cut('通知强调,各地要做好统筹安排,按照国务院关于保障义务教育教师工资待遇的工作部署,加大工作力度。')
res5,tag = pyseg.cut_mark('江泽民主席来到北京负责燃料工业部的指导建设工作。')

for item in res4:
    print(item,end="")
    print('\\',end="")
print("\n\n")

for idx in range(len(res5)):
    print(res5[idx],end="\\")
    print(tag[idx],end=" ")
--------------------------------------------------------------------------------
/pyseg/LSTMmodel.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMTagger(nn.Module):
    def __init__(self,embedding_dim,hidden_dim,vocab_size,tagset_size):
        super(LSTMTagger,self).__init__()
        self.hidden_dim=hidden_dim
        self.word_embeddings=nn.Embedding(vocab_size,embedding_dim)
        self.lstm=nn.LSTM(embedding_dim,hidden_dim)
        self.hidden2tag=nn.Linear(hidden_dim,tagset_size)
        self.hidden=self.init_hidden()

    def init_hidden(self):
        # the axes semantics are (num_layers,minibatch_size,hidden_size)
        return (torch.zeros(1,1,self.hidden_dim),
                torch.zeros(1,1,self.hidden_dim))

    def forward(self, sentence):
        embeds=self.word_embeddings(sentence)
        lstm_out,self.hidden=self.lstm(embeds.view(len(sentence),1,-1),self.hidden)
        tag_space=self.hidden2tag(lstm_out.view(len(sentence),-1))
        tag_scores=F.log_softmax(tag_space,dim=1)
        return tag_scores
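
# Illustrative only (not part of the original file): a minimal smoke test of
# LSTMTagger on a made-up vocabulary, to show the expected tensor shapes.
if __name__ == '__main__':
    toy_model = LSTMTagger(embedding_dim=8, hidden_dim=8, vocab_size=5, tagset_size=3)
    toy_sentence = torch.tensor([0, 1, 2, 3], dtype=torch.long)  # four token indices
    with torch.no_grad():
        scores = toy_model(toy_sentence)
    print(scores.shape)  # torch.Size([4, 3]): one row of tag log-probabilities per token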
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PySeg

> Chinese word segmentation and part-of-speech tagging library
>
> Chinese Words Segmentation and Tagger Library via Python

[![](https://img.shields.io/badge/python-3.5.7-blue.svg)]()

[![](https://img.shields.io/badge/Torch-1.0-orange)]()


## Usage

- Place the `pyseg` package from this project in your working directory or in `site-packages`
- `import pyseg`

```python
import pyseg

# By default, cut() segments with a Hidden Markov Model (HMM) trained on the 2013 People's Daily corpus
words = pyseg.cut('发言人强调,该法案涉台内容严重违反一个中国原则,损害中美关系。')
for w in words:
    print(w)

# cut_fmm() segments with a binary forward maximum matching model (BFMM) trained on the same corpus; O(n log n)
words = pyseg.cut_fmm('中国特色社会主义,邓小平改革开放。')
for w in words:
    print(w)

# Put a corpus file under pyseg/data and pass its file name to retrain the HMM and BFMM parameters
pyseg.load_corpus("trainCorpus.txt_utf8")

# Part-of-speech tagging was added on Dec. 15; annotated corpora are still scarce, so it is of limited applicability
# The HMM model segments the text first, then an LSTM assigns the part-of-speech tags
res5,res5_tag = pyseg.cut_mark('江泽民主席来到北京负责燃料工业部的指导建设工作。')
for tag in res5_tag:
    print(tag)
```


## Algorithm

- The default `cut` method segments with a Hidden Markov Model (HMM) trained on the 2013 People's Daily corpus; the implementation draws on the MCMC model and the Viterbi algorithm (dynamic programming)
- `cut_fmm` segments with a binary forward maximum matching model (BFMM) trained on the 2013 People's Daily corpus; it improves on plain FMM by replacing character-by-character shortening with a binary search for the longest dictionary prefix. See the segmentation posts on the blog: [https://kylinchen.top](https://kylinchen.top)
- Part-of-speech tagging combines HMM segmentation with an embedding + LSTM tagger; the training corpus is still small, and the goal is to train on the 1998 People's Daily corpus


## Example

> Using the HMM model as an example

- Input

```python
res1 = pyseg.cut('这个程序不能准确的分割出喜欢,这也就是概率模型的问题所在。')
res2 = pyseg.cut('中国特色社会主义,邓小平改革开放')
res3 = pyseg.cut('发言人强调,该法案涉台内容严重违反一个中国原则和中美三个联合公报规定,严重损害中美关系和台海和平稳定。')
res4 = pyseg.cut('通知强调,各地要做好统筹安排,按照国务院关于保障义务教育教师工资待遇的工作部署,加大工作力度。')
res5,tag = pyseg.cut_mark('江泽民主席来到北京负责燃料工业部的指导建设工作。')
```

- Output

```
这个\程序\不能\准确\的\分割出\喜欢\,\这\也\就\是\概率\模型\的\问题\所在\。\

中国\特色\社会\主义\,\邓小平\改革\开放\

发言人\强调\,\该\法案\涉台\内容\严重\违反\一个\中国\原则\和\中美\三个\联合\公报\规定\,\严重\损害\中美\关系\和\台海\和\平\稳定\。\

通知\强调\,\各地\要\做好\统筹\安排\,\按照\国务院\关于\保障\义务\教育\教师\工资待遇\的\工作\部署\,\加大\工作\力度\。\

江泽民主席\n 来到\v 北京\ns 负责\v 燃料\n 工业部\n 的\uj 指导\n 建设\vn 工作\vn 。\x
```
--------------------------------------------------------------------------------
/pyseg/__init__.py:
--------------------------------------------------------------------------------
import re
import io
import math
import os,sys
import bisect
import pickle

# ML dependency
import torch

# system path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# data path
_localDir=os.path.dirname(__file__)
_curpath=os.path.normpath(os.path.join(os.getcwd(),_localDir))

torch_path=os.path.normpath(os.path.join(os.getcwd(),_localDir,"data/partspeech.pkl"))
dict_path=os.path.normpath(os.path.join(os.getcwd(),_localDir,"data/extra_dict.pkl"))

# Hidden Markov
from HMMmodel import HMM
from FMMmodel import FMM
from LSTMmodel import LSTMTagger

hmm = HMM()
fmm = FMM()

with open(dict_path, 'rb') as inp:
    word_to_ix = pickle.load(inp)
    tag_to_ix = pickle.load(inp)
    ix_to_tag = pickle.load(inp)

def cut(text):
    # split the input into CJK runs and non-CJK runs; only CJK runs go through the HMM
    blocks = re.split("([^\u4E00-\u9FA5]+)",text)
    result = []
    for block in blocks:
        if re.match("[\u4E00-\u9FA5]+",block):
            result.extend(hmm.cut(block))
        else:
            # tmp = re.split("[^a-zA-Z0-9+#]",block)
            result.extend([x for x in block if x.strip()!=""])
    return result
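
# Illustrative only (not part of the original module): how the block split in cut()
# behaves on mixed input, e.g.
#   re.split("([^\u4E00-\u9FA5]+)", "GDP增长6.5%") -> ['', 'GDP', '增长', '6.5%', '']
# The CJK run '增长' is segmented by hmm.cut(), while the non-CJK runs are kept
# character by character ('G', 'D', 'P', '6', '.', '5', '%').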

def load_corpus(file_name):
    hmm.train(file_name)
    fmm.train(file_name)

def cut_fmm(text):
    blocks = re.split("([^\u4E00-\u9FA5]+)",text)
    result = []
    for block in blocks:
        if re.match("[\u4E00-\u9FA5]+",block):
            result.extend(fmm.cut(block))
        else:
            # tmp = re.split("[^a-zA-Z0-9+#]",block)
            result.extend([x for x in block if x.strip()!=""])
    return result

def prepare_sequence(seq,to_ix):
    idxs=[]
    for w in seq:
        if(w in to_ix.keys()):
            idxs.append(to_ix[w])
        else:
            # out-of-vocabulary words fall back to the reserved "None" index
            idxs.append(to_ix["None"])
    return torch.tensor(idxs,dtype=torch.long)

def cut_mark(text):
    lstm_model = torch.load(torch_path)
    cut_list = []
    mark_list = []
    sentence = cut(text)
    with torch.no_grad():
        inputs=prepare_sequence(sentence,word_to_ix)
        tag_scores=lstm_model(inputs)
        tem=tag_scores.argmax(dim=1).numpy().tolist()
    for idx in range(len(sentence)):
        cut_list.append(sentence[idx])
        mark_list.append(ix_to_tag[tem[idx]])
    return cut_list,mark_list

if __name__ == '__main__':
    # load_corpus("trainCorpus.txt_utf8")
    res1 = cut('这个程序不能准确的分割出喜欢,这也就是概率模型的问题所在。')
    res2 = cut_fmm('中国特色社会主义,邓小平改革开放')
    print(res1)
    print(res2)
    print(cut_mark('江泽民主席来到北京负责燃料工业部的指导建设工作。'))
--------------------------------------------------------------------------------
/pyseg/FMMmodel.py:
--------------------------------------------------------------------------------
import os
import io
import bisect
import pickle
import collections

class RunError(Exception):
    def __init__(self):
        Exception.__init__(self)
    def __str__(self):
        return repr('RunError')

class FMM(object):
    def __init__(self,train_switch=False):
        _localDir=os.path.dirname(__file__)
        self._curpath=os.path.normpath(os.path.join(os.getcwd(),_localDir))
        self.model_file = os.path.join(self._curpath,"data/fmm_model.pkl")
        self.word_list = []
        self.max_length = 0
        if not train_switch:
            try:
                with open(self.model_file, 'rb') as f:
                    self.word_list = pickle.load(f)
            except:
                pass
            num_list = [len(one) for one in self.word_list]
            self.max_length = max(num_list)
            # print(self.max_length)

    def flatten(self,x):
        result = []
        for el in x:
            if isinstance(el, collections.abc.Iterable) and not isinstance(el, str):
                result.extend(self.flatten(el))
            else:
                result.append(el)
        return result

    def train(self, file_name):
        path = os.path.join(self._curpath,"data/"+file_name)
        with io.open(path, encoding='utf8') as f:
            self.word_list = [line.strip().split() for line in f]
        self.word_list = self.flatten(self.word_list)
        self.word_list.sort()
        num_list = [len(one) for one in self.word_list]
        self.max_length = max(num_list)
        with open(self.model_file, 'wb') as f:
            pickle.dump(self.word_list, f)

    def check_prefix(self,text):
        # the vocabulary is kept sorted, so the longest dictionary prefix of `text`
        # is located with a binary search instead of shortening the window one
        # character at a time
        idx = bisect.bisect_right(self.word_list,text)
        if(text.startswith(self.word_list[idx-1])):
            return len(self.word_list[idx-1])
        else:
            return 1

    def cut(self,text):
        begin = 0
        while(begin
--------------------------------------------------------------------------------
/pyseg/HMMmodel.py:
--------------------------------------------------------------------------------
0]
                )
                V[t][y] = prob
                newpath[y] = path[state] + [y]
            path = newpath
        if (emit_p['M'].get(text[-1], 0)):
            (prob, state) = max([(V[len(text) - 1][y], y) for y in ('E', 'M')])
        else:
            (prob, state) = max([(V[len(text) - 1][y], y) for y in states])
        return (prob, path[state])
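
    # Illustrative note (not part of the original file): viterbi() returns the most
    # probable BMES tag path over the input characters. For example, a path such as
    # ['B', 'E', 'B', 'E', 'S', 'S'] over the six characters of '概率模型不错' is decoded
    # by cut() below into the words '概率', '模型', '不', '错'.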

    def cut(self, text):
        prob, pos_list = self.viterbi(text, self.state_list, self.start_p, self.trans_p, self.emit_p)
        begin, next = 0, 0
        for i, char in enumerate(text):
            pos = pos_list[i]
            if pos == 'B':
                begin = i
            elif pos == 'E':
                yield text[begin: i+1]
                next = i + 1
            elif pos == 'S':
                yield char
                next = i + 1
        if next < len(text):
            yield text[next:]


if __name__ == '__main__':
    hmm = HMM()
    # hmm.train('trainCorpus.txt_utf8')
    # hmm.load_model(True)
    text = '这个程序不能准确的分割出喜欢,这也就是概率模型的问题所在。'
    res = hmm.cut(text)
    print(list(res))
--------------------------------------------------------------------------------
/pyseg/data/tagger.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Untitled9.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1-Wmm-4G31z5hLGzujFFnCaqFqoTi0bdi
"""

import pickle
import io
from google.colab import files
import jieba
import jieba.posseg as pseg

df = open('corpus.pkl','rb')
train=pickle.load(df)
df.close()

root_check_dict = {
    "n": "名词",
    "nr": "人名",
    "nr1": "汉语姓氏",
    "nr2": "汉语名字",
    "nrj": "日语人名",
    "nrf": "音译人名",
    "ns": "地名",
    "nsf": "音译地名",
    "nt": "机构团体名",
    "nz": "其它专名",
    "nl": "名词性惯用语",
    "ng": "名词性语素",
    "t": "时间词",
    "tg": "时间词性语素",
    "s": "处所词",
    "f": "方位词",
    "v": "动词",
    "vd": "副动词",
    "vn": "名动词",
    "vshi": "动词“是”",
    "vyou": "动词“有”",
    "vf": "趋向动词",
    "vx": "形式动词",
    "vi": "不及物动词(内动词)",
    "vl": "动词性惯用语",
    "vg": "动词性语素",
    "a": "形容词",
    "ad": "副形词",
    "an": "名形词",
    "ag": "形容词性语素",
    "al": "形容词性惯用语",
    "b": "区别词",
    "bl": "区别词性惯用语",
    "z": "状态词",
    "r": "代词",
    "rr": "人称代词",
    "rz": "指示代词",
    "rzt": "时间指示代词",
    "rzs": "处所指示代词",
    "rzv": "谓词性指示代词",
    "ry": "疑问代词",
    "ryt": "时间疑问代词",
    "rys": "处所疑问代词",
    "ryv": "谓词性疑问代词",
    "rg": "代词性语素",
    "m": "数词",
    "mq": "数量词",
    "q": "量词",
    "qv": "动量词",
    "qt": "时量词",
    "d": "副词",
    "p": "介词",
    "pba": "介词“把”",
    "pbei": "介词“被”",
    "c": "连词",
    "cc": "并列连词",
    "u": "助词",
    "uzhe": "着",
    "ule": "了 喽",
    "uguo": "过",
    "ude1": "的 底",
    "ude2": "地",
    "ude3": "得",
    "usuo": "所",
    "udeng": "等 等等 云云",
    "uyy": "一样 一般 似的 般",
    "udh": "的话",
    "uls": "来讲 来说 而言 说来",
    "uzhi": "之",
    "ulian": "连",
    "e": "叹词",
    "y": "语气词(delete yg)",
    "o": "拟声词",
    "h": "前缀",
    "k": "后缀",
    "x": "字符串",
    "xx": "非语素字",
    "xu": "网址URL",
    "w": "标点符号",
    "wkz": "左括号,全角:( 〔 [ { 《 【 〖 〈 半角:( [ { <",
    "wky": "右括号,全角:) 〕 ] } 》 】 〗 〉 半角: ) ] { >",
    "wyz": "左引号,全角:“ ‘ 『",
    "wyy": "右引号,全角:” ’ 』",
    "wj": "句号,全角:。",
    "ww": "问号,全角:? 半角:?",
    "wt": "叹号,全角:! 半角:!",
    "wd": "逗号,全角:, 半角:,",
    "wf": "分号,全角:; 半角: ; ",
    "wn": "顿号,全角:、",
    "wm": "冒号,全角:: 半角: :",
    "ws": "省略号,全角:…… …",
    "wp": "破折号,全角:—— -- ——- 半角:—",
    "wb": "百分号千分号,全角:% ‰ 半角:%",
    "wh": "单位符号,全角:¥ $ £ ° ℃ 半角 $"
}

training_set=[]

for item in train:
    words = pseg.cut(item[0][0])
    tmp_word_list=[]
    tmp_tag_list=[]
    for word,tag in words:
        tmp_word_list.append(word)
        tmp_tag_list.append(tag)
    training_set.append((tmp_word_list,tmp_tag_list))
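
# Illustrative only (not part of the original script): each training_set entry pairs
# a word list with the tag list jieba.posseg produced for it, e.g. something like
#   (['江泽民', '主席', '来到', '北京'], ['nr', 'n', 'v', 'ns'])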

word_to_ix={}
for sent,tag in training_set:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word]=len(word_to_ix)
print(word_to_ix)

word_to_ix["None"]=len(word_to_ix)

tag_to_ix={}
for sent,tag in training_set:
    for word in tag:
        if word not in tag_to_ix:
            tag_to_ix[word]=len(tag_to_ix)
print(tag_to_ix)

ix_to_tag={}
for key,value in tag_to_ix.items():
    ix_to_tag[value]=key

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

EMBEDDING_DIM=50
HIDDEN_DIM=50

class LSTMTagger(nn.Module):
    def __init__(self,embedding_dim,hidden_dim,vocab_size,tagset_size):
        super(LSTMTagger,self).__init__()
        self.hidden_dim=hidden_dim
        self.word_embeddings=nn.Embedding(vocab_size,embedding_dim)
        self.lstm=nn.LSTM(embedding_dim,hidden_dim)
        self.hidden2tag=nn.Linear(hidden_dim,tagset_size)
        self.hidden=self.init_hidden()
    def init_hidden(self):
        # the axes semantics are (num_layers,minibatch_size,hidden_size)
        return (torch.zeros(1,1,self.hidden_dim),
                torch.zeros(1,1,self.hidden_dim))
    def forward(self, sentence):
        embeds=self.word_embeddings(sentence)
        lstm_out,self.hidden=self.lstm(embeds.view(len(sentence),1,-1),self.hidden)
        tag_space=self.hidden2tag(lstm_out.view(len(sentence),-1))
        tag_scores=F.log_softmax(tag_space,dim=1)
        return tag_scores

model=LSTMTagger(EMBEDDING_DIM,HIDDEN_DIM,len(word_to_ix),len(tag_to_ix))
loss_function=nn.NLLLoss()
optimizer=optim.SGD(model.parameters(),lr=0.1)

def prepare_sequence(seq,to_ix):
    idxs=[]
    for w in seq:
        if(w in to_ix.keys()):
            idxs.append(to_ix[w])
        else:
            idxs.append(to_ix["None"])
    return torch.tensor(idxs,dtype=torch.long)

model=LSTMTagger(EMBEDDING_DIM,HIDDEN_DIM,len(word_to_ix),len(tag_to_ix))
loss_function=nn.NLLLoss()
optimizer=optim.SGD(model.parameters(),lr=0.1)
# before training
# with torch.no_grad():
#     inputs=prepare_sequence(training_data[0][0],word_to_ix)
#     tag_scores=model(inputs)
#     print(tag_scores)

for epoch in range(20):
    for sentence,tags in training_set:
        model.zero_grad()
        # reset the LSTM hidden state so no history (or gradients) carries over across sentences
        model.hidden=model.init_hidden()
        sentence_in=prepare_sequence(sentence,word_to_ix)
        targets=prepare_sequence(tags,tag_to_ix)
        tag_scores=model(sentence_in)
        loss=loss_function(tag_scores,targets)
        loss.backward()
        optimizer.step()
    print(epoch)
# after training
# with torch.no_grad():
#     inputs=prepare_sequence(training_set[0][0],word_to_ix)
#     tag_scores=model(inputs)
#     print(tag_scores.argmax(dim=1).numpy().tolist())
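
# Illustrative only (not part of the original script): a rough token-level accuracy
# check over the training set, assuming the names defined above.
# with torch.no_grad():
#     correct, total = 0, 0
#     for sent, tags in training_set:
#         pred = model(prepare_sequence(sent, word_to_ix)).argmax(dim=1)
#         gold = prepare_sequence(tags, tag_to_ix)
#         correct += int((pred == gold).sum())
#         total += len(gold)
#     print(correct / total)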

with torch.no_grad():
    inputs=prepare_sequence(training_set[273][0],word_to_ix)
    tag_scores=model(inputs)
    tem=tag_scores.argmax(dim=1).numpy().tolist()
sen=training_set[273][0]

for idx in range(len(sen)):
    print(sen[idx],end="/")
    print(ix_to_tag[tem[idx]])

torch.save(model,"partspeech.pkl")

model=torch.load("partspeech.pkl")

with open('extra_dict.pkl', 'wb') as outp:
    pickle.dump(word_to_ix, outp)
    pickle.dump(tag_to_ix, outp)
    pickle.dump(ix_to_tag, outp)
    print('** Finished saving the data.')

training_set[273][0]
--------------------------------------------------------------------------------