├── pyseg
    ├── data
    │   ├── test.utf8
    │   ├── corpus.pkl
    │   ├── extra_dict.pkl
    │   ├── fmm_model.pkl
    │   ├── hmm_model.pkl
    │   ├── partspeech.pkl
    │   └── tagger.py
    ├── LSTMmodel.py
    ├── __init__.py
    ├── FMMmodel.py
    └── HMMmodel.py
├── test.py
└── README.md

/pyseg/data/test.utf8:
--------------------------------------------------------------------------------
建设 中国
特色
社会主义 道路
--------------------------------------------------------------------------------
/pyseg/data/corpus.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/corpus.pkl
--------------------------------------------------------------------------------
/pyseg/data/extra_dict.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/extra_dict.pkl
--------------------------------------------------------------------------------
/pyseg/data/fmm_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/fmm_model.pkl
--------------------------------------------------------------------------------
/pyseg/data/hmm_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/hmm_model.pkl
--------------------------------------------------------------------------------
/pyseg/data/partspeech.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KylinC/PySeg/HEAD/pyseg/data/partspeech.pkl
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import pyseg
from pyseg.LSTMmodel import LSTMTagger

# Hidden Markov Model

res1 = pyseg.cut('这个程序不能准确的分割出喜欢,这也就是概率模型的问题所在。')
res2 = pyseg.cut('中国特色社会主义,邓小平改革开放')
res3 = pyseg.cut('发言人强调,该法案涉台内容严重违反一个中国原则和中美三个联合公报规定,严重损害中美关系和台海和平稳定。')
res4 = pyseg.cut('通知强调,各地要做好统筹安排,按照国务院关于保障义务教育教师工资待遇的工作部署,加大工作力度。')
res5,tag = pyseg.cut_mark('江泽民主席来到北京负责燃料工业部的指导建设工作。')

for item in res4:
    print(item,end="")
    print('\\',end="")
print("\n\n")

for idx in range(len(res5)):
    print(res5[idx],end="\\")
    print(tag[idx],end=" ")
--------------------------------------------------------------------------------
/pyseg/LSTMmodel.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMTagger(nn.Module):
    def __init__(self,embedding_dim,hidden_dim,vocab_size,tagset_size):
        super(LSTMTagger,self).__init__()
        self.hidden_dim=hidden_dim
        self.word_embeddings=nn.Embedding(vocab_size,embedding_dim)
        self.lstm=nn.LSTM(embedding_dim,hidden_dim)
        self.hidden2tag=nn.Linear(hidden_dim,tagset_size)
        self.hidden=self.init_hidden()

    def init_hidden(self):
        # the axes semantics are (num_layers,minibatch_size,hidden_size)
        return (torch.zeros(1,1,self.hidden_dim),
                torch.zeros(1,1,self.hidden_dim))

    def forward(self, sentence):
        embeds=self.word_embeddings(sentence)
        lstm_out,self.hidden=self.lstm(embeds.view(len(sentence),1,-1),self.hidden)
        tag_space=self.hidden2tag(lstm_out.view(len(sentence),-1))
        tag_scores=F.log_softmax(tag_space,dim=1)
        return tag_scores
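
# Illustrative only (not part of the original file): a minimal smoke test of
# LSTMTagger on a made-up vocabulary, to show the expected tensor shapes.
if __name__ == '__main__':
    toy_model = LSTMTagger(embedding_dim=8, hidden_dim=8, vocab_size=5, tagset_size=3)
    toy_sentence = torch.tensor([0, 1, 2, 3], dtype=torch.long)  # four token indices
    with torch.no_grad():
        scores = toy_model(toy_sentence)
    print(scores.shape)  # torch.Size([4, 3]): one row of tag log-probabilities per token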
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PySeg

> Chinese word segmentation and part-of-speech tagging library
>
> Chinese Words Segmentation and Tagger Library via Python

[![](https://img.shields.io/badge/python-3.5.7-blue.svg)]()

[![](https://img.shields.io/badge/Torch-1.0-orange)]()


## Usage

- Place the `pyseg` package from this project in your working directory or in `site-packages`
- `import pyseg`

```python
import pyseg

# By default, cut() segments with a Hidden Markov Model (HMM) trained on the 2013 People's Daily corpus
words = pyseg.cut('发言人强调,该法案涉台内容严重违反一个中国原则,损害中美关系。')
for w in words:
    print(w)

# cut_fmm() segments with a binary forward maximum matching model (BFMM) trained on the same corpus; O(n log n)
words = pyseg.cut_fmm('中国特色社会主义,邓小平改革开放。')
for w in words:
    print(w)

# Put a corpus file under pyseg/data and pass its file name to retrain the HMM and BFMM parameters
pyseg.load_corpus("trainCorpus.txt_utf8")

# Part-of-speech tagging was added on Dec. 15; annotated corpora are still scarce, so it is of limited applicability
# The HMM model segments the text first, then an LSTM assigns the part-of-speech tags
res5,res5_tag = pyseg.cut_mark('江泽民主席来到北京负责燃料工业部的指导建设工作。')
for tag in res5_tag:
    print(tag)
```


## Algorithm

- The default `cut` method segments with a Hidden Markov Model (HMM) trained on the 2013 People's Daily corpus; the implementation draws on the MCMC model and the Viterbi algorithm (dynamic programming)
- `cut_fmm` segments with a binary forward maximum matching model (BFMM) trained on the 2013 People's Daily corpus; it improves on plain FMM by replacing character-by-character shortening with a binary search for the longest dictionary prefix. See the segmentation posts on the blog: [https://kylinchen.top](https://kylinchen.top)
- Part-of-speech tagging combines HMM segmentation with an embedding + LSTM tagger; the training corpus is still small, and the goal is to train on the 1998 People's Daily corpus


## Example

> Using the HMM model as an example

- Input

```python
res1 = pyseg.cut('这个程序不能准确的分割出喜欢,这也就是概率模型的问题所在。')
res2 = pyseg.cut('中国特色社会主义,邓小平改革开放')
res3 = pyseg.cut('发言人强调,该法案涉台内容严重违反一个中国原则和中美三个联合公报规定,严重损害中美关系和台海和平稳定。')
res4 = pyseg.cut('通知强调,各地要做好统筹安排,按照国务院关于保障义务教育教师工资待遇的工作部署,加大工作力度。')
res5,tag = pyseg.cut_mark('江泽民主席来到北京负责燃料工业部的指导建设工作。')
```

- Output

```
这个\程序\不能\准确\的\分割出\喜欢\,\这\也\就\是\概率\模型\的\问题\所在\。\

中国\特色\社会\主义\,\邓小平\改革\开放\

发言人\强调\,\该\法案\涉台\内容\严重\违反\一个\中国\原则\和\中美\三个\联合\公报\规定\,\严重\损害\中美\关系\和\台海\和\平\稳定\。\

通知\强调\,\各地\要\做好\统筹\安排\,\按照\国务院\关于\保障\义务\教育\教师\工资待遇\的\工作\部署\,\加大\工作\力度\。\

江泽民主席\n 来到\v 北京\ns 负责\v 燃料\n 工业部\n 的\uj 指导\n 建设\vn 工作\vn 。\x
```
--------------------------------------------------------------------------------
/pyseg/__init__.py:
--------------------------------------------------------------------------------
import re
import io
import math
import os,sys
import bisect
import pickle

# ML dependency
import torch

# system path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# data path
_localDir=os.path.dirname(__file__)
_curpath=os.path.normpath(os.path.join(os.getcwd(),_localDir))

torch_path=os.path.normpath(os.path.join(os.getcwd(),_localDir,"data/partspeech.pkl"))
dict_path=os.path.normpath(os.path.join(os.getcwd(),_localDir,"data/extra_dict.pkl"))

# Hidden Markov
from HMMmodel import HMM
from FMMmodel import FMM
from LSTMmodel import LSTMTagger

hmm = HMM()
fmm = FMM()

with open(dict_path, 'rb') as inp:
    word_to_ix = pickle.load(inp)
    tag_to_ix = pickle.load(inp)
    ix_to_tag = pickle.load(inp)

def cut(text):
    # split the input into CJK runs and non-CJK runs; only CJK runs go through the HMM
    blocks = re.split("([^\u4E00-\u9FA5]+)",text)
    result = []
    for block in blocks:
        if re.match("[\u4E00-\u9FA5]+",block):
            result.extend(hmm.cut(block))
        else:
            # tmp = re.split("[^a-zA-Z0-9+#]",block)
            result.extend([x for x in block if x.strip()!=""])
    return result
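
# Illustrative only (not part of the original module): how the block split in cut()
# behaves on mixed input, e.g.
#   re.split("([^\u4E00-\u9FA5]+)", "GDP增长6.5%") -> ['', 'GDP', '增长', '6.5%', '']
# The CJK run '增长' is segmented by hmm.cut(), while the non-CJK runs are kept
# character by character ('G', 'D', 'P', '6', '.', '5', '%').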

def load_corpus(file_name):
    hmm.train(file_name)
    fmm.train(file_name)

def cut_fmm(text):
    blocks = re.split("([^\u4E00-\u9FA5]+)",text)
    result = []
    for block in blocks:
        if re.match("[\u4E00-\u9FA5]+",block):
            result.extend(fmm.cut(block))
        else:
            # tmp = re.split("[^a-zA-Z0-9+#]",block)
            result.extend([x for x in block if x.strip()!=""])
    return result

def prepare_sequence(seq,to_ix):
    idxs=[]
    for w in seq:
        if(w in to_ix.keys()):
            idxs.append(to_ix[w])
        else:
            # out-of-vocabulary words fall back to the reserved "None" index
            idxs.append(to_ix["None"])
    return torch.tensor(idxs,dtype=torch.long)

def cut_mark(text):
    lstm_model = torch.load(torch_path)
    cut_list = []
    mark_list = []
    sentence = cut(text)
    with torch.no_grad():
        inputs=prepare_sequence(sentence,word_to_ix)
        tag_scores=lstm_model(inputs)
        tem=tag_scores.argmax(dim=1).numpy().tolist()
    for idx in range(len(sentence)):
        cut_list.append(sentence[idx])
        mark_list.append(ix_to_tag[tem[idx]])
    return cut_list,mark_list

if __name__ == '__main__':
    # load_corpus("trainCorpus.txt_utf8")
    res1 = cut('这个程序不能准确的分割出喜欢,这也就是概率模型的问题所在。')
    res2 = cut_fmm('中国特色社会主义,邓小平改革开放')
    print(res1)
    print(res2)
    print(cut_mark('江泽民主席来到北京负责燃料工业部的指导建设工作。'))
--------------------------------------------------------------------------------
/pyseg/FMMmodel.py:
--------------------------------------------------------------------------------
import os
import io
import bisect
import pickle
import collections

class RunError(Exception):
    def __init__(self):
        Exception.__init__(self)
    def __str__(self):
        return repr('RunError')

class FMM(object):
    def __init__(self,train_switch=False):
        _localDir=os.path.dirname(__file__)
        self._curpath=os.path.normpath(os.path.join(os.getcwd(),_localDir))
        self.model_file = os.path.join(self._curpath,"data/fmm_model.pkl")
        self.word_list = []
        self.max_length = 0
        if not train_switch:
            try:
                with open(self.model_file, 'rb') as f:
                    self.word_list = pickle.load(f)
            except:
                pass
            num_list = [len(one) for one in self.word_list]
            self.max_length = max(num_list)
            # print(self.max_length)

    def flatten(self,x):
        result = []
        for el in x:
            if isinstance(el, collections.abc.Iterable) and not isinstance(el, str):
                result.extend(self.flatten(el))
            else:
                result.append(el)
        return result

    def train(self, file_name):
        path = os.path.join(self._curpath,"data/"+file_name)
        with io.open(path, encoding='utf8') as f:
            self.word_list = [line.strip().split() for line in f]
        self.word_list = self.flatten(self.word_list)
        self.word_list.sort()
        num_list = [len(one) for one in self.word_list]
        self.max_length = max(num_list)
        with open(self.model_file, 'wb') as f:
            pickle.dump(self.word_list, f)

    def check_prefix(self,text):
        # the vocabulary is kept sorted, so the longest dictionary prefix of `text`
        # is located with a binary search instead of shortening the window one
        # character at a time
        idx = bisect.bisect_right(self.word_list,text)
        if(text.startswith(self.word_list[idx-1])):
            return len(self.word_list[idx-1])
        else:
            return 1

    def cut(self,text):
        begin = 0
        while(begin
--------------------------------------------------------------------------------
/pyseg/HMMmodel.py:
--------------------------------------------------------------------------------
0]
                )
                V[t][y] = prob
                newpath[y] = path[state] + [y]
            path = newpath
        if (emit_p['M'].get(text[-1], 0)):
            (prob, state) = max([(V[len(text) - 1][y], y) for y in ('E', 'M')])
        else:
            (prob, state) = max([(V[len(text) - 1][y], y) for y in states])
        return (prob, path[state])
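
    # Illustrative note (not part of the original file): viterbi() returns the most
    # probable BMES tag path over the input characters. For example, a path such as
    # ['B', 'E', 'B', 'E', 'S', 'S'] over the six characters of '概率模型不错' is decoded
    # by cut() below into the words '概率', '模型', '不', '错'.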

    def cut(self, text):
        prob, pos_list = self.viterbi(text, self.state_list, self.start_p, self.trans_p, self.emit_p)
        begin, next = 0, 0
        for i, char in enumerate(text):
            pos = pos_list[i]
            if pos == 'B':
                begin = i
            elif pos == 'E':
                yield text[begin: i+1]
                next = i + 1
            elif pos == 'S':
                yield char
                next = i + 1
        if next < len(text):
            yield text[next:]


if __name__ == '__main__':
    hmm = HMM()
    # hmm.train('trainCorpus.txt_utf8')
    # hmm.load_model(True)
    text = '这个程序不能准确的分割出喜欢,这也就是概率模型的问题所在。'
    res = hmm.cut(text)
    print(list(res))
--------------------------------------------------------------------------------
/pyseg/data/tagger.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Untitled9.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1-Wmm-4G31z5hLGzujFFnCaqFqoTi0bdi
"""

import pickle
import io
from google.colab import files
import jieba
import jieba.posseg as pseg

df = open('corpus.pkl','rb')
train=pickle.load(df)
df.close()

root_check_dict = {
    "n": "名词",
    "nr": "人名",
    "nr1": "汉语姓氏",
    "nr2": "汉语名字",
    "nrj": "日语人名",
    "nrf": "音译人名",
    "ns": "地名",
    "nsf": "音译地名",
    "nt": "机构团体名",
    "nz": "其它专名",
    "nl": "名词性惯用语",
    "ng": "名词性语素",
    "t": "时间词",
    "tg": "时间词性语素",
    "s": "处所词",
    "f": "方位词",
    "v": "动词",
    "vd": "副动词",
    "vn": "名动词",
    "vshi": "动词“是”",
    "vyou": "动词“有”",
    "vf": "趋向动词",
    "vx": "形式动词",
    "vi": "不及物动词(内动词)",
    "vl": "动词性惯用语",
    "vg": "动词性语素",
    "a": "形容词",
    "ad": "副形词",
    "an": "名形词",
    "ag": "形容词性语素",
    "al": "形容词性惯用语",
    "b": "区别词",
    "bl": "区别词性惯用语",
    "z": "状态词",
    "r": "代词",
    "rr": "人称代词",
    "rz": "指示代词",
    "rzt": "时间指示代词",
    "rzs": "处所指示代词",
    "rzv": "谓词性指示代词",
    "ry": "疑问代词",
    "ryt": "时间疑问代词",
    "rys": "处所疑问代词",
    "ryv": "谓词性疑问代词",
    "rg": "代词性语素",
    "m": "数词",
    "mq": "数量词",
    "q": "量词",
    "qv": "动量词",
    "qt": "时量词",
    "d": "副词",
    "p": "介词",
    "pba": "介词“把”",
    "pbei": "介词“被”",
    "c": "连词",
    "cc": "并列连词",
    "u": "助词",
    "uzhe": "着",
    "ule": "了 喽",
    "uguo": "过",
    "ude1": "的 底",
    "ude2": "地",
    "ude3": "得",
    "usuo": "所",
    "udeng": "等 等等 云云",
    "uyy": "一样 一般 似的 般",
    "udh": "的话",
    "uls": "来讲 来说 而言 说来",
    "uzhi": "之",
    "ulian": "连",
    "e": "叹词",
    "y": "语气词(delete yg)",
    "o": "拟声词",
    "h": "前缀",
    "k": "后缀",
    "x": "字符串",
    "xx": "非语素字",
    "xu": "网址URL",
    "w": "标点符号",
    "wkz": "左括号,全角:( 〔 [ { 《 【 〖 〈 半角:( [ { <",
    "wky": "右括号,全角:) 〕 ] } 》 】 〗 〉 半角: ) ] { >",
    "wyz": "左引号,全角:“ ‘ 『",
    "wyy": "右引号,全角:” ’ 』",
    "wj": "句号,全角:。",
    "ww": "问号,全角:? 半角:?",
    "wt": "叹号,全角:! 半角:!",
    "wd": "逗号,全角:, 半角:,",
    "wf": "分号,全角:; 半角: ; ",
    "wn": "顿号,全角:、",
    "wm": "冒号,全角:: 半角: :",
    "ws": "省略号,全角:…… …",
    "wp": "破折号,全角:—— -- ——- 半角:—",
    "wb": "百分号千分号,全角:% ‰ 半角:%",
    "wh": "单位符号,全角:¥ $ £ ° ℃ 半角 $"
}

training_set=[]

for item in train:
    words = pseg.cut(item[0][0])
    tmp_word_list=[]
    tmp_tag_list=[]
    for word,tag in words:
        tmp_word_list.append(word)
        tmp_tag_list.append(tag)
    training_set.append((tmp_word_list,tmp_tag_list))
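
# Illustrative only (not part of the original script): each training_set entry pairs
# a word list with the tag list jieba.posseg produced for it, e.g. something like
#   (['江泽民', '主席', '来到', '北京'], ['nr', 'n', 'v', 'ns'])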

word_to_ix={}
for sent,tag in training_set:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word]=len(word_to_ix)
print(word_to_ix)

word_to_ix["None"]=len(word_to_ix)

tag_to_ix={}
for sent,tag in training_set:
    for word in tag:
        if word not in tag_to_ix:
            tag_to_ix[word]=len(tag_to_ix)
print(tag_to_ix)

ix_to_tag={}
for key,value in tag_to_ix.items():
    ix_to_tag[value]=key

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

EMBEDDING_DIM=50
HIDDEN_DIM=50

class LSTMTagger(nn.Module):
    def __init__(self,embedding_dim,hidden_dim,vocab_size,tagset_size):
        super(LSTMTagger,self).__init__()
        self.hidden_dim=hidden_dim
        self.word_embeddings=nn.Embedding(vocab_size,embedding_dim)
        self.lstm=nn.LSTM(embedding_dim,hidden_dim)
        self.hidden2tag=nn.Linear(hidden_dim,tagset_size)
        self.hidden=self.init_hidden()
    def init_hidden(self):
        # the axes semantics are (num_layers,minibatch_size,hidden_size)
        return (torch.zeros(1,1,self.hidden_dim),
                torch.zeros(1,1,self.hidden_dim))
    def forward(self, sentence):
        embeds=self.word_embeddings(sentence)
        lstm_out,self.hidden=self.lstm(embeds.view(len(sentence),1,-1),self.hidden)
        tag_space=self.hidden2tag(lstm_out.view(len(sentence),-1))
        tag_scores=F.log_softmax(tag_space,dim=1)
        return tag_scores

model=LSTMTagger(EMBEDDING_DIM,HIDDEN_DIM,len(word_to_ix),len(tag_to_ix))
loss_function=nn.NLLLoss()
optimizer=optim.SGD(model.parameters(),lr=0.1)

def prepare_sequence(seq,to_ix):
    idxs=[]
    for w in seq:
        if(w in to_ix.keys()):
            idxs.append(to_ix[w])
        else:
            idxs.append(to_ix["None"])
    return torch.tensor(idxs,dtype=torch.long)

model=LSTMTagger(EMBEDDING_DIM,HIDDEN_DIM,len(word_to_ix),len(tag_to_ix))
loss_function=nn.NLLLoss()
optimizer=optim.SGD(model.parameters(),lr=0.1)
# before training
# with torch.no_grad():
#     inputs=prepare_sequence(training_data[0][0],word_to_ix)
#     tag_scores=model(inputs)
#     print(tag_scores)

for epoch in range(20):
    for sentence,tags in training_set:
        model.zero_grad()
        # reset the LSTM hidden state so no history (or gradients) carries over across sentences
        model.hidden=model.init_hidden()
        sentence_in=prepare_sequence(sentence,word_to_ix)
        targets=prepare_sequence(tags,tag_to_ix)
        tag_scores=model(sentence_in)
        loss=loss_function(tag_scores,targets)
        loss.backward()
        optimizer.step()
    print(epoch)
# after training
# with torch.no_grad():
#     inputs=prepare_sequence(training_set[0][0],word_to_ix)
#     tag_scores=model(inputs)
#     print(tag_scores.argmax(dim=1).numpy().tolist())
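
# Illustrative only (not part of the original script): a rough token-level accuracy
# check over the training set, assuming the names defined above.
# with torch.no_grad():
#     correct, total = 0, 0
#     for sent, tags in training_set:
#         pred = model(prepare_sequence(sent, word_to_ix)).argmax(dim=1)
#         gold = prepare_sequence(tags, tag_to_ix)
#         correct += int((pred == gold).sum())
#         total += len(gold)
#     print(correct / total)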

with torch.no_grad():
    inputs=prepare_sequence(training_set[273][0],word_to_ix)
    tag_scores=model(inputs)
    tem=tag_scores.argmax(dim=1).numpy().tolist()
sen=training_set[273][0]

for idx in range(len(sen)):
    print(sen[idx],end="/")
    print(ix_to_tag[tem[idx]])

torch.save(model,"partspeech.pkl")

model=torch.load("partspeech.pkl")

with open('extra_dict.pkl', 'wb') as outp:
    pickle.dump(word_to_ix, outp)
    pickle.dump(tag_to_ix, outp)
    pickle.dump(ix_to_tag, outp)
    print('** Finished saving the data.')

training_set[273][0]
--------------------------------------------------------------------------------