├── .travis.yml
├── README.md
├── analyse
│   ├── example_test.go
│   ├── idf.go
│   ├── idf.txt
│   ├── stop_words.txt
│   ├── stopwords.go
│   ├── tag_extracker.go
│   ├── tag_extracker_test.go
│   ├── textrank.go
│   └── textrank_test.go
├── dict.txt
├── dictionary.go
├── dictionary
│   ├── dictionary.go
│   ├── dictionary_test.go
│   └── token.go
├── example_parallel_cut_test.go
├── example_test.go
├── finalseg
│   ├── finalseg.go
│   ├── finalseg_test.go
│   ├── prob_emit.go
│   ├── prob_trans.go
│   └── viterbi.go
├── foobar.txt
├── jieba.go
├── jieba_test.go
├── posseg
│   ├── char_state_tab.go
│   ├── char_state_tab_test.go
│   ├── dictionary.go
│   ├── example_test.go
│   ├── posseg.go
│   ├── posseg_test.go
│   ├── prob_emit.go
│   ├── prob_start.go
│   ├── prob_trans.go
│   ├── viterbi.go
│   └── viterbi_test.go
├── tokenizers
│   ├── example_bleve_test.go
│   ├── example_test.go
│   ├── tokenizer.go
│   └── tokenizer_test.go
├── userdict.txt
└── util
    ├── util.go
    └── util_test.go
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 | go:
3 |   - 1.4.2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Jiebago: Jieba (结巴分词) in Go
2 | 
3 | 
4 | [![Build Status](https://travis-ci.org/wangbin/jiebago.png?branch=master)](https://travis-ci.org/wangbin/jiebago) [![GoDoc](https://godoc.org/github.com/wangbin/jiebago?status.svg)](https://godoc.org/github.com/wangbin/jiebago)
5 | 
6 | [结巴分词 (Jieba)](https://github.com/fxsjy/jieba) is a Chinese text segmentation component written in Python by [@fxsjy](https://github.com/fxsjy); Jiebago is its Golang implementation.
7 | 
8 | 
9 | ## Installation
10 | 
11 | ```
12 | go get github.com/wangbin/jiebago/...
13 | ```
14 | 
15 | ## Usage
16 | 
17 | ```
18 | package main
19 | 
20 | import (
21 |     "fmt"
22 | 
23 |     "github.com/wangbin/jiebago"
24 | )
25 | 
26 | var seg jiebago.Segmenter
27 | 
28 | func init() {
29 |     seg.LoadDictionary("dict.txt")
30 | }
31 | 
32 | func print(ch <-chan string) {
33 |     for word := range ch {
34 |         fmt.Printf(" %s /", word)
35 |     }
36 |     fmt.Println()
37 | }
38 | 
39 | func main() {
40 |     fmt.Print("【全模式】:")
41 |     print(seg.CutAll("我来到北京清华大学"))
42 | 
43 |     fmt.Print("【精确模式】:")
44 |     print(seg.Cut("我来到北京清华大学", false))
45 | 
46 |     fmt.Print("【新词识别】:")
47 |     print(seg.Cut("他来到了网易杭研大厦", true))
48 | 
49 |     fmt.Print("【搜索引擎模式】:")
50 |     print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
51 | }
52 | ```
53 | Output:
54 | 
55 | ```
56 | 【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
57 | 
58 | 【精确模式】: 我 / 来到 / 北京 / 清华大学 /
59 | 
60 | 【新词识别】: 他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
61 | 
62 | 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
63 | ```
64 | 
65 | For more information, see the [documentation](https://godoc.org/github.com/wangbin/jiebago).
66 | 
67 | ## Segmentation Speed
68 | 
69 | - 2MB / Second in Full Mode
70 | - 700KB / Second in Default Mode
71 | - Test Env: AMD Phenom(tm) II X6 1055T CPU @ 2.8GHz; test text: 《金庸全集》 (the complete works of Jin Yong)
72 | 
73 | ## License
74 | 
75 | MIT: http://wangbin.mit-license.org
76 | 
--------------------------------------------------------------------------------
/analyse/example_test.go:
--------------------------------------------------------------------------------
1 | package analyse_test
2 | 
3 | import (
4 |     "fmt"
5 | 
6 |     "github.com/wangbin/jiebago/analyse"
7 | )
8 | 
9 | func Example_extractTags() {
10 |     var t analyse.TagExtracter
11 |     t.LoadDictionary("../dict.txt")
12 |     t.LoadIdf("idf.txt")
13 | 
14 |     sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"
15 |     segments := t.ExtractTags(sentence, 5)
16 |     fmt.Printf("Top %d tags:", len(segments))
17 |     for _, segment
:= range segments {
18 |         fmt.Printf(" %s /", segment.Text())
19 |     }
20 |     // Output:
21 |     // Top 5 tags: Python / C++ / 伸手不见五指 / 孙悟空 / 黑夜 /
22 | }
23 | 
24 | func Example_textRank() {
25 |     var t analyse.TextRanker
26 |     t.LoadDictionary("../dict.txt")
27 |     sentence := "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
28 | 
29 |     result := t.TextRank(sentence, 10)
30 |     for _, segment := range result {
31 |         fmt.Printf("%s %f\n", segment.Text(), segment.Weight())
32 |     }
33 |     // Output:
34 |     // 吉林 1.000000
35 |     // 欧亚 0.878078
36 |     // 置业 0.562048
37 |     // 实现 0.520906
38 |     // 收入 0.384284
39 |     // 增资 0.360591
40 |     // 子公司 0.353132
41 |     // 城市 0.307509
42 |     // 全资 0.306324
43 |     // 商业 0.306138
44 | }
45 | 
--------------------------------------------------------------------------------
/analyse/idf.go:
--------------------------------------------------------------------------------
1 | package analyse
2 | 
3 | import (
4 |     "sort"
5 |     "sync"
6 | 
7 |     "github.com/wangbin/jiebago/dictionary"
8 | )
9 | 
10 | // Idf represents a thread-safe dictionary for all words with their
11 | // IDFs (Inverse Document Frequencies).
12 | type Idf struct {
13 |     freqMap map[string]float64
14 |     median  float64
15 |     freqs   []float64
16 |     sync.RWMutex
17 | }
18 | 
19 | // AddToken adds a new word with its IDF into the dictionary.
20 | func (i *Idf) AddToken(token dictionary.Token) {
21 |     i.Lock()
22 |     i.freqMap[token.Text()] = token.Frequency()
23 |     i.freqs = append(i.freqs, token.Frequency())
24 |     sort.Float64s(i.freqs)
25 |     i.median = i.freqs[len(i.freqs)/2]
26 |     i.Unlock()
27 | }
28 | 
29 | // Load loads all tokens from the channel into the dictionary.
30 | func (i *Idf) Load(ch <-chan dictionary.Token) {
31 |     i.Lock()
32 |     for token := range ch {
33 |         i.freqMap[token.Text()] = token.Frequency()
34 |         i.freqs = append(i.freqs, token.Frequency())
35 |     }
36 |     sort.Float64s(i.freqs)
37 |     i.median = i.freqs[len(i.freqs)/2]
38 |     i.Unlock()
39 | }
40 | 
41 | func (i *Idf) loadDictionary(fileName string) error {
42 |     return dictionary.LoadDictionary(i, fileName)
43 | }
44 | 
45 | // Frequency returns the IDF of the given word.
46 | func (i *Idf) Frequency(key string) (float64, bool) {
47 |     i.RLock()
48 |     freq, ok := i.freqMap[key]
49 |     i.RUnlock()
50 |     return freq, ok
51 | }
52 | 
53 | // NewIdf creates a new Idf instance.
54 | func NewIdf() *Idf {
55 |     return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
56 | }
--------------------------------------------------------------------------------
/analyse/stop_words.txt:
--------------------------------------------------------------------------------
1 | the
2 | of
3 | is
4 | and
5 | to
6 | in
7 | that
8 | we
9 | for
10 | an
11 | are
12 | by
13 | be
14 | as
15 | on
16 | with
17 | can
18 | if
19 | from
20 | which
21 | you
22 | it
23 | this
24 | then
25 | at
26 | have
27 | all
28 | not
29 | one
30 | has
31 | or
32 | that
33 | 的
34 | 了
35 | 和
36 | 是
37 | 就
38 | 都
39 | 而
40 | 及
41 | 與
42 | 著
43 | 或
44 | 一個
45 | 沒有
46 | 我們
47 | 你們
48 | 妳們
49 | 他們
50 | 她們
51 | 是否
--------------------------------------------------------------------------------
/analyse/stopwords.go:
--------------------------------------------------------------------------------
1 | package analyse
2 | 
3 | import (
4 |     "sync"
5 | 
6 |     "github.com/wangbin/jiebago/dictionary"
7 | )
8 | 
9 | // DefaultStopWordMap contains some stop words.
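// Only key membership matters to IsStopWord; the value 1 is a placeholder.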
10 | var DefaultStopWordMap = map[string]int{
11 |     "the":   1,
12 |     "of":    1,
13 |     "is":    1,
14 |     "and":   1,
15 |     "to":    1,
16 |     "in":    1,
17 |     "that":  1,
18 |     "we":    1,
19 |     "for":   1,
20 |     "an":    1,
21 |     "are":   1,
22 |     "by":    1,
23 |     "be":    1,
24 |     "as":    1,
25 |     "on":    1,
26 |     "with":  1,
27 |     "can":   1,
28 |     "if":    1,
29 |     "from":  1,
30 |     "which": 1,
31 |     "you":   1,
32 |     "it":    1,
33 |     "this":  1,
34 |     "then":  1,
35 |     "at":    1,
36 |     "have":  1,
37 |     "all":   1,
38 |     "not":   1,
39 |     "one":   1,
40 |     "has":   1,
41 |     "or":    1,
42 | }
43 | 
44 | // StopWord is a thread-safe dictionary for all stop words.
45 | type StopWord struct {
46 |     stopWordMap map[string]int
47 |     sync.RWMutex
48 | }
49 | 
50 | // AddToken adds a token into the StopWord dictionary.
51 | func (s *StopWord) AddToken(token dictionary.Token) {
52 |     s.Lock()
53 |     s.stopWordMap[token.Text()] = 1
54 |     s.Unlock()
55 | }
56 | 
57 | // NewStopWord creates a new StopWord with default stop words.
58 | func NewStopWord() *StopWord {
59 |     s := new(StopWord)
60 |     s.stopWordMap = DefaultStopWordMap
61 |     return s
62 | }
63 | 
64 | // IsStopWord checks whether a given word is a stop word.
65 | func (s *StopWord) IsStopWord(word string) bool {
66 |     s.RLock()
67 |     _, ok := s.stopWordMap[word]
68 |     s.RUnlock()
69 |     return ok
70 | }
71 | 
72 | // Load loads all tokens from the given channel into the StopWord dictionary.
73 | func (s *StopWord) Load(ch <-chan dictionary.Token) {
74 |     s.Lock()
75 |     for token := range ch {
76 |         s.stopWordMap[token.Text()] = 1
77 |     }
78 |     s.Unlock()
79 | }
80 | 
81 | func (s *StopWord) loadDictionary(fileName string) error {
82 |     return dictionary.LoadDictionary(s, fileName)
83 | }
84 | 
--------------------------------------------------------------------------------
/analyse/tag_extracker.go:
--------------------------------------------------------------------------------
1 | // Package analyse is the Golang implementation of Jieba's analyse module.
2 | package analyse
3 | 
4 | import (
5 |     "sort"
6 |     "strings"
7 |     "unicode/utf8"
8 | 
9 |     "github.com/wangbin/jiebago"
10 | )
11 | 
12 | // Segment represents a word with weight.
13 | type Segment struct {
14 |     text   string
15 |     weight float64
16 | }
17 | 
18 | // Text returns the segment's text.
19 | func (s Segment) Text() string {
20 |     return s.text
21 | }
22 | 
23 | // Weight returns the segment's weight.
24 | func (s Segment) Weight() float64 {
25 |     return s.weight
26 | }
27 | 
28 | // Segments represents a slice of Segment.
29 | type Segments []Segment
30 | 
31 | func (ss Segments) Len() int {
32 |     return len(ss)
33 | }
34 | 
35 | func (ss Segments) Less(i, j int) bool {
36 |     if ss[i].weight == ss[j].weight {
37 |         return ss[i].text < ss[j].text
38 |     }
39 | 
40 |     return ss[i].weight < ss[j].weight
41 | }
42 | 
43 | func (ss Segments) Swap(i, j int) {
44 |     ss[i], ss[j] = ss[j], ss[i]
45 | }
46 | 
47 | // TagExtracter is used to extract tags from a sentence.
48 | type TagExtracter struct {
49 |     seg      *jiebago.Segmenter
50 |     idf      *Idf
51 |     stopWord *StopWord
52 | }
53 | 
54 | // LoadDictionary reads the given filename and creates a new dictionary.
55 | func (t *TagExtracter) LoadDictionary(fileName string) error {
56 |     t.stopWord = NewStopWord()
57 |     t.seg = new(jiebago.Segmenter)
58 |     return t.seg.LoadDictionary(fileName)
59 | }
60 | 
61 | // LoadIdf reads the given file and creates a new Idf dictionary.
62 | func (t *TagExtracter) LoadIdf(fileName string) error {
63 |     t.idf = NewIdf()
64 |     return t.idf.loadDictionary(fileName)
65 | }
66 | 
67 | // LoadStopWords reads the given file and creates a new StopWord dictionary.
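//
// A minimal sketch of the whole tagging pipeline, assuming dict.txt, idf.txt
// and stop_words.txt are available as in this package's tests:
//
//	var t TagExtracter
//	t.LoadDictionary("dict.txt")
//	t.LoadIdf("idf.txt")
//	t.LoadStopWords("stop_words.txt")
//	for _, s := range t.ExtractTags("这是一个伸手不见五指的黑夜。", 5) {
//		fmt.Println(s.Text(), s.Weight())
//	}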
68 | func (t *TagExtracter) LoadStopWords(fileName string) error { 69 | t.stopWord = NewStopWord() 70 | return t.stopWord.loadDictionary(fileName) 71 | } 72 | 73 | // ExtractTags extracts the topK key words from sentence. 74 | func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) { 75 | freqMap := make(map[string]float64) 76 | 77 | for w := range t.seg.Cut(sentence, true) { 78 | w = strings.TrimSpace(w) 79 | if utf8.RuneCountInString(w) < 2 { 80 | continue 81 | } 82 | if t.stopWord.IsStopWord(w) { 83 | continue 84 | } 85 | if f, ok := freqMap[w]; ok { 86 | freqMap[w] = f + 1.0 87 | } else { 88 | freqMap[w] = 1.0 89 | } 90 | } 91 | total := 0.0 92 | for _, freq := range freqMap { 93 | total += freq 94 | } 95 | for k, v := range freqMap { 96 | freqMap[k] = v / total 97 | } 98 | ws := make(Segments, 0) 99 | var s Segment 100 | for k, v := range freqMap { 101 | if freq, ok := t.idf.Frequency(k); ok { 102 | s = Segment{text: k, weight: freq * v} 103 | } else { 104 | s = Segment{text: k, weight: t.idf.median * v} 105 | } 106 | ws = append(ws, s) 107 | } 108 | sort.Sort(sort.Reverse(ws)) 109 | if len(ws) > topK { 110 | tags = ws[:topK] 111 | } else { 112 | tags = ws 113 | } 114 | return tags 115 | } 116 | -------------------------------------------------------------------------------- /analyse/tag_extracker_test.go: -------------------------------------------------------------------------------- 1 | package analyse 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | var ( 9 | testContents = []string{ 10 | "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", 11 | "我不喜欢日本和服。", 12 | "雷猴回归人间。", 13 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 14 | "我需要廉租房", 15 | "永和服装饰品有限公司", 16 | "我爱北京天安门", 17 | "abc", 18 | "隐马尔可夫", 19 | "雷猴是个好网站", 20 | "“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", 21 | "草泥马和欺实马是今年的流行词汇", 22 | "伊藤洋华堂总府店", 23 | "中国科学院计算技术研究所", 24 | "罗密欧与朱丽叶", 25 | "我购买了道具和服装", 26 | "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍", 27 | "湖北省石首市", 28 | "湖北省十堰市", 29 | "总经理完成了这件事情", 30 | "电脑修好了", 31 | "做好了这件事情就一了百了了", 32 | "人们审美的观点是不同的", 33 | "我们买了一个美的空调", 34 | "线程初始化时我们要注意", 35 | "一个分子是由好多原子组织成的", 36 | "祝你马到功成", 37 | "他掉进了无底洞里", 38 | "中国的首都是北京", 39 | "孙君意", 40 | "外交部发言人马朝旭", 41 | "领导人会议和第四届东亚峰会", 42 | "在过去的这五年", 43 | "还需要很长的路要走", 44 | "60周年首都阅兵", 45 | "你好人们审美的观点是不同的", 46 | "买水果然后来世博园", 47 | "买水果然后去世博园", 48 | "但是后来我才知道你是对的", 49 | "存在即合理", 50 | "的的的的的在的的的的就以和和和", 51 | "I love你,不以为耻,反以为rong", 52 | "因", 53 | "", 54 | "hello你好人们审美的观点是不同的", 55 | "很好但主要是基于网页形式", 56 | "hello你好人们审美的观点是不同的", 57 | "为什么我不能拥有想要的生活", 58 | "后来我才", 59 | "此次来中国是为了", 60 | "使用了它就可以解决一些问题", 61 | ",使用了它就可以解决一些问题", 62 | "其实使用了它就可以解决一些问题", 63 | "好人使用了它就可以解决一些问题", 64 | "是因为和国家", 65 | "老年搜索还支持", 66 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ", 67 | "大", 68 | "", 69 | "他说的确实在理", 70 | "长春市长春节讲话", 71 | "结婚的和尚未结婚的", 72 | "结合成分子时", 73 | "旅游和服务是最好的", 74 | "这件事情的确是我的错", 75 | "供大家参考指正", 76 | "哈尔滨政府公布塌桥原因", 77 | "我在机场入口处", 78 | "邢永臣摄影报道", 79 | "BP神经网络如何训练才能在分类时增加区分度?", 80 | "南京市长江大桥", 81 | "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究", 82 | "长春市长春药店", 83 | "邓颖超生前最喜欢的衣服", 84 | "胡锦涛是热爱世界和平的政治局常委", 85 | "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪", 86 | "一次性交多少钱", 87 | "两块五一套,三块八一斤,四块七一本,五块六一条", 88 | "小和尚留了一个像大和尚一样的和尚头", 89 | "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站", 90 | "张晓梅去人民医院做了个B超然后去买了件T恤", 91 | "AT&T是一件不错的公司,给你发offer了吗?", 92 | "C++和c#是什么关系?11+122=133,是吗?PI=3.14159", 93 | "你认识那个和主席握手的的哥吗?他开一辆黑色的士。", 94 | "枪杆子中出政权"} 95 | 96 | Tags = [][]string{ 97 | []string{"Python", "C++", "伸手不见五指", 
"孙悟空", "黑夜", "北京", "这是", "一个"}, 98 | []string{"和服", "喜欢", "日本"}, 99 | []string{"雷猴", "人间", "回归"}, 100 | []string{"工信处", "女干事", "24", "交换机", "科室", "亲口", "器件", "技术性", "下属", "交代", "每月", "安装", "经过", "工作"}, 101 | []string{"廉租房", "需要"}, 102 | []string{"饰品", "永和", "服装", "有限公司"}, 103 | []string{"天安门", "北京"}, 104 | []string{"abc"}, 105 | []string{"马尔可夫"}, 106 | []string{"雷猴", "网站"}, 107 | []string{"SOFTware", "Microsoft", "MICROcomputer", "微型", "一词", "软件", "计算机", "组成", "部分"}, 108 | []string{"草泥马", "欺实", "词汇", "流行", "今年"}, 109 | []string{"洋华堂", "总府", "伊藤"}, 110 | []string{"中国科学院计算技术研究所"}, 111 | []string{"朱丽叶", "罗密欧"}, 112 | []string{"道具", "服装", "购买"}, 113 | []string{"自珍", "敞帚", "PS", "开源", "不断改进", "敦促", "好处", "避免", "能够", "觉得", "就是", "自己", "一个"}, 114 | []string{"石首市", "湖北省"}, 115 | []string{"十堰市", "湖北省"}, 116 | []string{"总经理", "这件", "完成", "事情"}, 117 | []string{"修好", "电脑"}, 118 | []string{"一了百了", "做好", "这件", "事情"}, 119 | []string{"审美", "观点", "人们", "不同"}, 120 | []string{"美的", "空调", "我们", "一个"}, 121 | []string{"线程", "初始化", "注意", "我们"}, 122 | []string{"好多", "原子", "分子", "组织", "一个"}, 123 | []string{"马到功成"}, 124 | []string{"无底洞"}, 125 | []string{"首都", "北京", "中国"}, 126 | []string{"孙君意"}, 127 | []string{"马朝旭", "外交部", "发言人"}, 128 | []string{"第四届", "东亚", "峰会", "领导人", "会议"}, 129 | []string{"五年", "过去"}, 130 | []string{"很长", "需要"}, 131 | []string{"60", "阅兵", "周年", "首都"}, 132 | []string{"审美", "你好", "观点", "人们", "不同"}, 133 | []string{"世博园", "水果", "然后"}, 134 | []string{"世博园", "水果", "然后"}, 135 | []string{"后来", "但是", "知道"}, 136 | []string{"合理", "存在"}, 137 | []string{}, 138 | []string{"rong", "love", "不以为耻", "以为"}, 139 | []string{}, 140 | []string{}, 141 | []string{"hello", "审美", "你好", "观点", "人们", "不同"}, 142 | []string{"网页", "基于", "形式", "主要"}, 143 | []string{"hello", "审美", "你好", "观点", "人们", "不同"}, 144 | []string{"想要", "拥有", "为什么", "生活", "不能"}, 145 | []string{"后来"}, 146 | []string{"此次", "为了", "中国"}, 147 | []string{"解决", "使用", "一些", "问题", "可以"}, 148 | []string{"解决", "使用", "一些", "问题", "可以"}, 149 | []string{"解决", "其实", "使用", "一些", "问题", "可以"}, 150 | []string{"好人", "解决", "使用", "一些", "问题", "可以"}, 151 | []string{"是因为", "国家"}, 152 | []string{"老年", "搜索", "支持"}, 153 | []string{"闲法", "中本", "laoshipukong", "RT", "27", "责任法", "蒙人", "万劫不复", "举证", "倒置", "医患", "那部", "拉倒", "侵权", "全国人大常委会", "草案", "境地", "纠纷", "删除", "弱势"}, 154 | []string{}, 155 | []string{}, 156 | []string{"在理", "确实"}, 157 | []string{"长春", "春节", "讲话", "市长"}, 158 | []string{"结婚", "尚未"}, 159 | []string{"分子", "结合"}, 160 | []string{"旅游", "最好", "服务"}, 161 | []string{"的确", "这件", "事情"}, 162 | []string{"指正", "参考", "大家"}, 163 | []string{"塌桥", "哈尔滨", "公布", "原因", "政府"}, 164 | []string{"入口处", "机场"}, 165 | []string{"邢永臣", "摄影", "报道"}, 166 | []string{"区分度", "BP", "神经网络", "训练", "分类", "才能", "如何", "增加"}, 167 | []string{"长江大桥", "南京市"}, 168 | []string{"SMT", "NiuTrans", "使用者", "便于", "用于", "建议", "利用", "为了", "研究", "一些"}, 169 | []string{"长春市", "药店", "长春"}, 170 | []string{"邓颖超", "生前", "衣服", "喜欢"}, 171 | []string{"政治局", "热爱", "常委", "胡锦涛", "和平", "世界"}, 172 | []string{"右面", "孙健", "范凯", "李松洪", "朱会震", "海林", "左面", "程序员", "再往"}, 173 | []string{"一次性", "多少"}, 174 | []string{"四块", "五块", "三块", "一斤", "两块", "一本", "一套", "一条"}, 175 | []string{"和尚", "和尚头", "一样", "一个"}, 176 | []string{"和平门", "共和党", "地铁", "党员", "公民", "爸爸", "中华人民共和国"}, 177 | []string{"张晓梅", "T恤", "B超", "医院", "人民", "然后"}, 178 | []string{"offer", "AT&T", "不错", "一件", "公司"}, 179 | []string{"c#", "PI", "C++", "3.14159", "133", "122", "11", "关系", "什么"}, 180 | []string{"的士", "的哥", "他开", "握手", "一辆", "黑色", "主席", "认识", "那个"}, 181 | []string{"枪杆子", "政权"}, 182 
| } 183 | 184 | Lyric = ` 185 | 我沒有心 186 | 我沒有真實的自我 187 | 我只有消瘦的臉孔 188 | 所謂軟弱 189 | 所謂的順從一向是我 190 | 的座右銘 191 | 192 | 而我 193 | 沒有那海洋的寬闊 194 | 我只要熱情的撫摸 195 | 所謂空洞 196 | 所謂不安全感是我 197 | 的墓誌銘 198 | 199 | 而你 200 | 是否和我一般怯懦 201 | 是否和我一般矯作 202 | 和我一般囉唆 203 | 204 | 而你 205 | 是否和我一般退縮 206 | 是否和我一般肌迫 207 | 一般地困惑 208 | 209 | 我沒有力 210 | 我沒有滿腔的熱火 211 | 我只有滿肚的如果 212 | 所謂勇氣 213 | 所謂的認同感是我 214 | 隨便說說 215 | 216 | 而你 217 | 是否和我一般怯懦 218 | 是否和我一般矯作 219 | 是否對你來說 220 | 只是一場遊戲 221 | 雖然沒有把握 222 | 223 | 而你 224 | 是否和我一般退縮 225 | 是否和我一般肌迫 226 | 是否對你來說 227 | 只是逼不得已 228 | 雖然沒有藉口 229 | ` 230 | LyciWeight = Segments{ 231 | Segment{text: "所謂", weight: 1.010262}, 232 | Segment{text: "是否", weight: 0.738650}, 233 | Segment{text: "一般", weight: 0.607600}, 234 | Segment{text: "雖然", weight: 0.336754}, 235 | Segment{text: "退縮", weight: 0.336754}, 236 | Segment{text: "肌迫", weight: 0.336754}, 237 | Segment{text: "矯作", weight: 0.336754}, 238 | Segment{text: "沒有", weight: 0.336754}, 239 | Segment{text: "怯懦", weight: 0.271099}, 240 | Segment{text: "隨便", weight: 0.168377}, 241 | } 242 | 243 | LyciWeight2 = Segments{ 244 | Segment{text: "所謂", weight: 1.215739}, 245 | Segment{text: "一般", weight: 0.731179}, 246 | Segment{text: "雖然", weight: 0.405246}, 247 | Segment{text: "退縮", weight: 0.405246}, 248 | Segment{text: "肌迫", weight: 0.405246}, 249 | Segment{text: "矯作", weight: 0.405246}, 250 | Segment{text: "怯懦", weight: 0.326238}, 251 | Segment{text: "逼不得已", weight: 0.202623}, 252 | Segment{text: "右銘", weight: 0.202623}, 253 | Segment{text: "寬闊", weight: 0.202623}, 254 | } 255 | ) 256 | 257 | func TestExtractTags(t *testing.T) { 258 | var te TagExtracter 259 | te.LoadDictionary("../dict.txt") 260 | te.LoadIdf("idf.txt") 261 | 262 | for index, sentence := range testContents { 263 | result := te.ExtractTags(sentence, 20) 264 | if len(result) != len(Tags[index]) { 265 | t.Fatalf("%s = %v", sentence, result) 266 | } 267 | for i, tag := range result { 268 | if tag.text != Tags[index][i] { 269 | t.Fatalf("%s != %s", tag, Tags[index][i]) 270 | } 271 | } 272 | } 273 | } 274 | 275 | func TestExtratTagsWithWeight(t *testing.T) { 276 | var te TagExtracter 277 | te.LoadDictionary("../dict.txt") 278 | te.LoadIdf("idf.txt") 279 | result := te.ExtractTags(Lyric, 10) 280 | for index, tag := range result { 281 | if LyciWeight[index].text != tag.text || 282 | math.Abs(LyciWeight[index].weight-tag.weight) > 1e-6 { 283 | t.Fatalf("%v != %v", tag, LyciWeight[index]) 284 | } 285 | } 286 | } 287 | 288 | func TestExtractTagsWithStopWordsFile(t *testing.T) { 289 | var te TagExtracter 290 | te.LoadDictionary("../dict.txt") 291 | te.LoadIdf("idf.txt") 292 | te.LoadStopWords("stop_words.txt") 293 | result := te.ExtractTags(Lyric, 7) 294 | for index, tag := range result { 295 | if LyciWeight2[index].text != tag.text || 296 | math.Abs(LyciWeight2[index].weight-tag.weight) > 1e-6 { 297 | t.Fatalf("%v != %v", tag, LyciWeight2[index]) 298 | } 299 | } 300 | } 301 | -------------------------------------------------------------------------------- /analyse/textrank.go: -------------------------------------------------------------------------------- 1 | package analyse 2 | 3 | import ( 4 | "math" 5 | "sort" 6 | 7 | "github.com/wangbin/jiebago/posseg" 8 | ) 9 | 10 | const dampingFactor = 0.85 11 | 12 | var ( 13 | defaultAllowPOS = []string{"ns", "n", "vn", "v"} 14 | ) 15 | 16 | type edge struct { 17 | start string 18 | end string 19 | weight float64 20 | } 21 | 22 | type edges []edge 23 | 24 | func (es edges) Len() int { 25 | return len(es) 26 | } 27 | 28 | func (es edges) Less(i, 
j int) bool {
29 |     return es[i].weight < es[j].weight
30 | }
31 | 
32 | func (es edges) Swap(i, j int) {
33 |     es[i], es[j] = es[j], es[i]
34 | }
35 | 
36 | type undirectWeightedGraph struct {
37 |     graph map[string]edges
38 |     keys  sort.StringSlice
39 | }
40 | 
41 | func newUndirectWeightedGraph() *undirectWeightedGraph {
42 |     u := new(undirectWeightedGraph)
43 |     u.graph = make(map[string]edges)
44 |     u.keys = make(sort.StringSlice, 0)
45 |     return u
46 | }
47 | 
48 | func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
49 |     if _, ok := u.graph[start]; !ok {
50 |         u.keys = append(u.keys, start)
51 |         u.graph[start] = edges{edge{start: start, end: end, weight: weight}}
52 |     } else {
53 |         u.graph[start] = append(u.graph[start], edge{start: start, end: end, weight: weight})
54 |     }
55 | 
56 |     if _, ok := u.graph[end]; !ok {
57 |         u.keys = append(u.keys, end)
58 |         u.graph[end] = edges{edge{start: end, end: start, weight: weight}}
59 |     } else {
60 |         u.graph[end] = append(u.graph[end], edge{start: end, end: start, weight: weight})
61 |     }
62 | }
63 | 
64 | func (u *undirectWeightedGraph) rank() Segments {
65 |     if !sort.IsSorted(u.keys) {
66 |         sort.Sort(u.keys)
67 |     }
68 | 
69 |     ws := make(map[string]float64)
70 |     outSum := make(map[string]float64)
71 | 
72 |     wsdef := 1.0
73 |     if len(u.graph) > 0 {
74 |         wsdef /= float64(len(u.graph))
75 |     }
76 |     for n, out := range u.graph {
77 |         ws[n] = wsdef
78 |         sum := 0.0
79 |         for _, e := range out {
80 |             sum += e.weight
81 |         }
82 |         outSum[n] = sum
83 |     }
84 | 
85 |     for x := 0; x < 10; x++ {
86 |         for _, n := range u.keys {
87 |             s := 0.0
88 |             inedges := u.graph[n]
89 |             for _, e := range inedges {
90 |                 s += e.weight / outSum[e.end] * ws[e.end]
91 |             }
92 |             ws[n] = (1 - dampingFactor) + dampingFactor*s
93 |         }
94 |     }
95 |     minRank := math.MaxFloat64
96 |     maxRank := math.SmallestNonzeroFloat64
97 |     for _, w := range ws {
98 |         if w < minRank {
99 |             minRank = w
100 |         } else if w > maxRank {
101 |             maxRank = w
102 |         }
103 |     }
104 |     result := make(Segments, 0)
105 |     for n, w := range ws {
106 |         result = append(result, Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
107 |     }
108 |     sort.Sort(sort.Reverse(result))
109 |     return result
110 | }
111 | 
112 | // TextRankWithPOS extracts keywords from a sentence using the TextRank algorithm.
113 | // Parameter allowPOS allows a customized POS list.
114 | func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
115 |     posFilt := make(map[string]int)
116 |     for _, pos := range allowPOS {
117 |         posFilt[pos] = 1
118 |     }
119 |     g := newUndirectWeightedGraph()
120 |     cm := make(map[[2]string]float64)
121 |     span := 5
122 |     var pairs []posseg.Segment
123 |     for pair := range t.seg.Cut(sentence, true) {
124 |         pairs = append(pairs, pair)
125 |     }
126 |     for i := range pairs {
127 |         if _, ok := posFilt[pairs[i].Pos()]; ok {
128 |             for j := i + 1; j < i+span && j < len(pairs); j++ {
129 |                 if _, ok := posFilt[pairs[j].Pos()]; !ok {
130 |                     continue
131 |                 }
132 |                 if _, ok := cm[[2]string{pairs[i].Text(), pairs[j].Text()}]; !ok {
133 |                     cm[[2]string{pairs[i].Text(), pairs[j].Text()}] = 1.0
134 |                 } else {
135 |                     cm[[2]string{pairs[i].Text(), pairs[j].Text()}] += 1.0
136 |                 }
137 |             }
138 |         }
139 |     }
140 |     for startEnd, weight := range cm {
141 |         g.addEdge(startEnd[0], startEnd[1], weight)
142 |     }
143 |     tags := g.rank()
144 |     if topK > 0 && len(tags) > topK {
145 |         tags = tags[:topK]
146 |     }
147 |     return tags
148 | }
149 | 
150 | // TextRank extracts keywords from a sentence using the TextRank algorithm.
151 | // Parameter topK specifies how many top keywords are returned at most.
152 | func (t *TextRanker) TextRank(sentence string, topK int) Segments {
153 |     return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
154 | }
155 | 
156 | // TextRanker is used to extract tags from a sentence.
157 | type TextRanker struct {
158 |     seg *posseg.Segmenter
159 | }
160 | 
161 | // LoadDictionary reads a given file and creates a new dictionary for the TextRanker.
162 | func (t *TextRanker) LoadDictionary(fileName string) error {
163 |     t.seg = new(posseg.Segmenter)
164 |     return t.seg.LoadDictionary(fileName)
165 | }
166 | 
--------------------------------------------------------------------------------
/analyse/textrank_test.go:
--------------------------------------------------------------------------------
1 | package analyse
2 | 
3 | import (
4 |     "math"
5 |     "testing"
6 | )
7 | 
8 | var (
9 |     sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
10 | 
11 |     tagRanks = Segments{
12 |         Segment{text: "吉林", weight: 1.0},
13 |         Segment{text: "欧亚", weight: 0.87807810644},
14 |         Segment{text: "置业", weight: 0.562048250306},
15 |         Segment{text: "实现", weight: 0.520905743929},
16 |         Segment{text: "收入", weight: 0.384283870648},
17 |         Segment{text: "增资", weight: 0.360590945312},
18 |         Segment{text: "子公司", weight: 0.353131980904},
19 |         Segment{text: "城市", weight: 0.307509449283},
20 |         Segment{text: "全资", weight: 0.306324426665},
21 |         Segment{text: "商业", weight: 0.306138241063},
22 |     }
23 | )
24 | 
25 | func TestTextRank(t *testing.T) {
26 |     var tr TextRanker
27 |     tr.LoadDictionary("../dict.txt")
28 |     results := tr.TextRank(sentence, 10)
29 |     for index, tw := range results {
30 |         if tw.text != tagRanks[index].text || math.Abs(tw.weight-tagRanks[index].weight) > 1e-6 {
31 |             t.Fatalf("%v != %v", tw, tagRanks[index])
32 |         }
33 |     }
34 | }
35 | 
--------------------------------------------------------------------------------
/dictionary.go:
--------------------------------------------------------------------------------
1 | package jiebago
2 | 
3 | import (
4 |     "math"
5 |     "sync"
6 | 
7 |     "github.com/wangbin/jiebago/dictionary"
8 | )
9 | 
10 | // A Dictionary represents a thread-safe dictionary used for word segmentation.
11 | type Dictionary struct {
12 |     total, logTotal float64
13 |     freqMap         map[string]float64
14 |     sync.RWMutex
15 | }
16 | 
17 | // Load loads all tokens from the given channel.
18 | func (d *Dictionary) Load(ch <-chan dictionary.Token) {
19 |     d.Lock()
20 |     for token := range ch {
21 |         d.addToken(token)
22 |     }
23 |     d.Unlock()
24 |     d.updateLogTotal()
25 | }
26 | 
27 | // AddToken adds one token.
28 | func (d *Dictionary) AddToken(token dictionary.Token) {
29 |     d.Lock()
30 |     d.addToken(token)
31 |     d.Unlock()
32 |     d.updateLogTotal()
33 | }
34 | 
35 | func (d *Dictionary) addToken(token dictionary.Token) {
36 |     d.freqMap[token.Text()] = token.Frequency()
37 |     d.total += token.Frequency()
38 |     runes := []rune(token.Text())
39 |     n := len(runes)
40 |     for i := 0; i < n; i++ { //TODO: n-1?
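        // Each proper prefix of the token is recorded with frequency 0 so the
        // DAG builder (Segmenter.dag in jieba.go) keeps extending a fragment
        // while a prefix entry exists and stops at the first missing lookup.
        // The final iteration (the whole token) is a no-op: it was added above.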
41 |         frag := string(runes[:i+1])
42 |         if _, ok := d.freqMap[frag]; !ok {
43 |             d.freqMap[frag] = 0.0
44 |         }
45 |     }
46 | }
47 | 
48 | func (d *Dictionary) updateLogTotal() {
49 |     d.logTotal = math.Log(d.total)
50 | }
51 | 
52 | // Frequency returns the frequency and existence of the given word.
53 | func (d *Dictionary) Frequency(key string) (float64, bool) {
54 |     d.RLock()
55 |     freq, ok := d.freqMap[key]
56 |     d.RUnlock()
57 |     return freq, ok
58 | }
59 | 
60 | func (d *Dictionary) loadDictionary(fileName string) error {
61 |     return dictionary.LoadDictionary(d, fileName)
62 | }
63 | 
--------------------------------------------------------------------------------
/dictionary/dictionary.go:
--------------------------------------------------------------------------------
1 | // Package dictionary contains an interface and wraps all I/O related work.
2 | // It is used by the jiebago module to read/write files.
3 | package dictionary
4 | 
5 | import (
6 |     "bufio"
7 |     "os"
8 |     "path/filepath"
9 |     "strconv"
10 |     "strings"
11 | )
12 | 
13 | // DictLoader is the interface that can add one token or load
14 | // tokens from a channel.
15 | type DictLoader interface {
16 |     Load(<-chan Token)
17 |     AddToken(Token)
18 | }
19 | 
20 | func loadDictionary(file *os.File) (<-chan Token, <-chan error) {
21 |     tokenCh, errCh := make(chan Token), make(chan error)
22 | 
23 |     go func() {
24 |         defer close(tokenCh)
25 |         defer close(errCh)
26 |         scanner := bufio.NewScanner(file)
27 |         var token Token
28 |         var line string
29 |         var fields []string
30 |         var err error
31 |         for scanner.Scan() {
32 |             line = scanner.Text()
33 |             fields = strings.Split(line, " ")
34 |             token.text = strings.TrimSpace(strings.Replace(fields[0], "\ufeff", "", 1))
35 |             if length := len(fields); length > 1 {
36 |                 token.frequency, err = strconv.ParseFloat(fields[1], 64)
37 |                 if err != nil {
38 |                     errCh <- err
39 |                     return
40 |                 }
41 |                 if length > 2 {
42 |                     token.pos = strings.TrimSpace(fields[2])
43 |                 }
44 |             }
45 |             tokenCh <- token
46 |         }
47 | 
48 |         if err = scanner.Err(); err != nil {
49 |             errCh <- err
50 |         }
51 |     }()
52 |     return tokenCh, errCh
53 | 
54 | }
55 | 
56 | // LoadDictionary reads the given file and passes all tokens to a DictLoader.
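// A typical call site, where dl is any DictLoader implementation and each
// dictionary line holds a whitespace-separated "word frequency [POS]" entry:
//
//	if err := dictionary.LoadDictionary(dl, "dict.txt"); err != nil {
//		log.Fatal(err)
//	}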
57 | func LoadDictionary(dl DictLoader, fileName string) error { 58 | filePath, err := dictPath(fileName) 59 | if err != nil { 60 | return err 61 | } 62 | dictFile, err := os.Open(filePath) 63 | if err != nil { 64 | return err 65 | } 66 | defer dictFile.Close() 67 | tokenCh, errCh := loadDictionary(dictFile) 68 | dl.Load(tokenCh) 69 | 70 | return <-errCh 71 | 72 | } 73 | 74 | func dictPath(dictFileName string) (string, error) { 75 | if filepath.IsAbs(dictFileName) { 76 | return dictFileName, nil 77 | } 78 | var dictFilePath string 79 | cwd, err := os.Getwd() 80 | if err != nil { 81 | return dictFilePath, err 82 | } 83 | dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName)) 84 | return dictFilePath, nil 85 | } 86 | -------------------------------------------------------------------------------- /dictionary/dictionary_test.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | ) 7 | 8 | type Dict struct { 9 | freqMap map[string]float64 10 | posMap map[string]string 11 | sync.RWMutex 12 | } 13 | 14 | func (d *Dict) Load(ch <-chan Token) { 15 | d.Lock() 16 | for token := range ch { 17 | d.freqMap[token.Text()] = token.Frequency() 18 | if len(token.Pos()) > 0 { 19 | d.posMap[token.Text()] = token.Pos() 20 | } 21 | } 22 | d.Unlock() 23 | } 24 | 25 | func (d *Dict) AddToken(token Token) { 26 | d.Lock() 27 | d.freqMap[token.Text()] = token.Frequency() 28 | if len(token.Pos()) > 0 { 29 | d.posMap[token.Text()] = token.Pos() 30 | } 31 | d.Unlock() 32 | } 33 | 34 | func TestLoadDictionary(t *testing.T) { 35 | d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)} 36 | err := LoadDictionary(d, "../userdict.txt") 37 | if err != nil { 38 | t.Fatalf(err.Error()) 39 | } 40 | if len(d.freqMap) != 7 { 41 | t.Fatalf("Failed to load userdict.txt, got %d tokens with frequency, expected 7", 42 | len(d.freqMap)) 43 | } 44 | if len(d.posMap) != 6 { 45 | t.Fatalf("Failed to load userdict.txt, got %d tokens with pos, expected 6", len(d.posMap)) 46 | } 47 | } 48 | 49 | func TestAddToken(t *testing.T) { 50 | d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)} 51 | LoadDictionary(d, "../userdict.txt") 52 | d.AddToken(Token{"好用", 99, "a"}) 53 | if d.freqMap["好用"] != 99 { 54 | t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"]) 55 | } 56 | if d.posMap["好用"] != "a" { 57 | t.Fatalf("Failed to add token, got pos %s, expected \"a\"", d.posMap["好用"]) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /dictionary/token.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | // Token represents a Chinese word with (optional) frequency and POS. 4 | type Token struct { 5 | text string 6 | frequency float64 7 | pos string 8 | } 9 | 10 | //Text returns token's text. 11 | func (t Token) Text() string { 12 | return t.text 13 | } 14 | 15 | // Frequency returns token's frequency. 16 | func (t Token) Frequency() float64 { 17 | return t.frequency 18 | } 19 | 20 | // Pos returns token's POS. 21 | func (t Token) Pos() string { 22 | return t.pos 23 | } 24 | 25 | // NewToken creates a new token. 
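// For example, mirroring the token used in this package's tests:
//
//	tok := dictionary.NewToken("好用", 99, "a")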
26 | func NewToken(text string, frequency float64, pos string) Token {
27 |     return Token{text: text, frequency: frequency, pos: pos}
28 | }
29 | 
--------------------------------------------------------------------------------
/example_parallel_cut_test.go:
--------------------------------------------------------------------------------
1 | package jiebago_test
2 | 
3 | import (
4 |     "bufio"
5 |     "fmt"
6 |     "log"
7 |     "os"
8 |     "runtime"
9 |     "strings"
10 |     "time"
11 | 
12 |     "github.com/wangbin/jiebago"
13 | )
14 | 
15 | type line struct {
16 |     number int
17 |     text   string
18 | }
19 | 
20 | var (
21 |     segmenter  = jiebago.Segmenter{}
22 |     numThreads = runtime.NumCPU()
23 |     task       = make(chan line, numThreads)
24 |     result     = make(chan line, numThreads)
25 | )
26 | 
27 | func worker() {
28 |     for l := range task {
29 |         var segments []string
30 |         for segment := range segmenter.Cut(l.text, true) {
31 |             segments = append(segments, segment)
32 |         }
33 | 
34 |         l.text = fmt.Sprintf("%s\n", strings.Join(segments, " / "))
35 |         result <- l
36 |     }
37 | }
38 | 
39 | func Example_parallelCut() {
40 |     // Set the number of OS threads available for goroutine execution
41 |     runtime.GOMAXPROCS(numThreads)
42 | 
43 |     // Load dictionary
44 |     segmenter.LoadDictionary("dict.txt")
45 | 
46 |     // Open the file to be segmented
47 |     file, err := os.Open("README.md")
48 |     if err != nil {
49 |         log.Fatal(err)
50 |     }
51 |     defer file.Close()
52 | 
53 |     // Start worker goroutines
54 |     for i := 0; i < numThreads; i++ {
55 |         go worker()
56 |     }
57 | 
58 |     var length, size int
59 |     scanner := bufio.NewScanner(file)
60 | 
61 |     t0 := time.Now()
62 | 
63 |     lines := make([]string, 0)
64 | 
65 |     // Read lines
66 |     for scanner.Scan() {
67 |         t := scanner.Text()
68 |         size += len(t)
69 |         lines = append(lines, t)
70 |     }
71 |     length = len(lines)
72 | 
73 |     // Segmentation
74 |     go func() {
75 |         for i := 0; i < length; i++ {
76 |             task <- line{number: i, text: lines[i]}
77 |         }
78 |         close(task)
79 |     }()
80 | 
81 |     // Make sure the segmentation result contains the same lines as the original file
82 |     for i := 0; i < length; i++ {
83 |         l := <-result
84 |         lines[l.number] = l.text
85 |     }
86 | 
87 |     t1 := time.Now()
88 | 
89 |     // Write the segments into a file for verification
90 |     outputFile, _ := os.OpenFile("parallelCut.log", os.O_CREATE|os.O_WRONLY, 0600)
91 |     defer outputFile.Close()
92 |     writer := bufio.NewWriter(outputFile)
93 |     for _, l := range lines {
94 |         writer.WriteString(l)
95 |     }
96 |     writer.Flush()
97 | 
98 |     log.Printf("Time consumed: %v", t1.Sub(t0))
99 |     log.Printf("Segmentation speed: %f MB/s", float64(size)/t1.Sub(t0).Seconds()/(1024*1024))
100 | }
101 | 
--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
1 | package jiebago_test
2 | 
3 | import (
4 |     "fmt"
5 | 
6 |     "github.com/wangbin/jiebago"
7 | )
8 | 
9 | func Example() {
10 |     var seg jiebago.Segmenter
11 |     seg.LoadDictionary("dict.txt")
12 | 
13 |     print := func(ch <-chan string) {
14 |         for word := range ch {
15 |             fmt.Printf(" %s /", word)
16 |         }
17 |         fmt.Println()
18 |     }
19 | 
20 |     fmt.Print("【全模式】:")
21 |     print(seg.CutAll("我来到北京清华大学"))
22 | 
23 |     fmt.Print("【精确模式】:")
24 |     print(seg.Cut("我来到北京清华大学", false))
25 | 
26 |     fmt.Print("【新词识别】:")
27 |     print(seg.Cut("他来到了网易杭研大厦", true))
28 | 
29 |     fmt.Print("【搜索引擎模式】:")
30 |     print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
31 |     // Output:
32 |     // 【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
33 |     // 【精确模式】: 我 / 来到 / 北京 / 清华大学 /
34 |     // 【新词识别】: 他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
35 |     // 【搜索引擎模式】:
小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 / 36 | } 37 | 38 | func Example_suggestFrequency() { 39 | var seg jiebago.Segmenter 40 | seg.LoadDictionary("dict.txt") 41 | 42 | print := func(ch <-chan string) { 43 | for word := range ch { 44 | fmt.Printf(" %s /", word) 45 | } 46 | fmt.Println() 47 | } 48 | sentence := "超敏C反应蛋白是什么?" 49 | fmt.Print("Before:") 50 | print(seg.Cut(sentence, false)) 51 | word := "超敏C反应蛋白" 52 | oldFrequency, _ := seg.Frequency(word) 53 | frequency := seg.SuggestFrequency(word) 54 | fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency) 55 | seg.AddWord(word, frequency) 56 | fmt.Print("After:") 57 | print(seg.Cut(sentence, false)) 58 | 59 | sentence = "如果放到post中将出错" 60 | fmt.Print("Before:") 61 | print(seg.Cut(sentence, false)) 62 | word = "中将" 63 | oldFrequency, _ = seg.Frequency(word) 64 | frequency = seg.SuggestFrequency("中", "将") 65 | fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency) 66 | seg.AddWord(word, frequency) 67 | fmt.Print("After:") 68 | print(seg.Cut(sentence, false)) 69 | 70 | sentence = "今天天气不错" 71 | fmt.Print("Before:") 72 | print(seg.Cut(sentence, false)) 73 | word = "今天天气" 74 | oldFrequency, _ = seg.Frequency(word) 75 | frequency = seg.SuggestFrequency("今天", "天气") 76 | fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency) 77 | seg.AddWord(word, frequency) 78 | fmt.Print("After:") 79 | print(seg.Cut(sentence, false)) 80 | // Output: 81 | // Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / ? / 82 | // 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000. 83 | // After: 超敏C反应蛋白 / 是 / 什么 / ? / 84 | // Before: 如果 / 放到 / post / 中将 / 出错 / 85 | // 中将 current frequency: 763.000000, suggest: 494.000000. 86 | // After: 如果 / 放到 / post / 中 / 将 / 出错 / 87 | // Before: 今天天气 / 不错 / 88 | // 今天天气 current frequency: 3.000000, suggest: 0.000000. 89 | // After: 今天 / 天气 / 不错 / 90 | } 91 | 92 | func Example_loadUserDictionary() { 93 | var seg jiebago.Segmenter 94 | seg.LoadDictionary("dict.txt") 95 | 96 | print := func(ch <-chan string) { 97 | for word := range ch { 98 | fmt.Printf(" %s /", word) 99 | } 100 | fmt.Println() 101 | } 102 | sentence := "李小福是创新办主任也是云计算方面的专家" 103 | fmt.Print("Before:") 104 | print(seg.Cut(sentence, true)) 105 | 106 | seg.LoadUserDictionary("userdict.txt") 107 | 108 | fmt.Print("After:") 109 | print(seg.Cut(sentence, true)) 110 | // Output: 111 | // Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 / 112 | // After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / 113 | } 114 | -------------------------------------------------------------------------------- /finalseg/finalseg.go: -------------------------------------------------------------------------------- 1 | // Package finalseg is the Golang implementation of Jieba's finalseg module. 
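//
// A minimal usage sketch; the sentence and its expected segmentation come
// from this package's tests:
//
//	for word := range finalseg.Cut("我们是程序员") {
//		fmt.Println(word) // 我们, 是, 程序员
//	}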
2 | package finalseg
3 | 
4 | import (
5 |     "regexp"
6 | )
7 | 
8 | var (
9 |     reHan  = regexp.MustCompile(`\p{Han}+`)
10 |     reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
11 | )
12 | 
13 | func cutHan(sentence string) chan string {
14 |     result := make(chan string)
15 |     go func() {
16 |         runes := []rune(sentence)
17 |         _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
18 |         begin, next := 0, 0
19 |         for i, char := range runes {
20 |             pos := posList[i]
21 |             switch pos {
22 |             case 'B':
23 |                 begin = i
24 |             case 'E':
25 |                 result <- string(runes[begin : i+1])
26 |                 next = i + 1
27 |             case 'S':
28 |                 result <- string(char)
29 |                 next = i + 1
30 |             }
31 |         }
32 |         if next < len(runes) {
33 |             result <- string(runes[next:])
34 |         }
35 |         close(result)
36 |     }()
37 |     return result
38 | }
39 | 
40 | // Cut cuts a sentence into words using a Hidden Markov Model and the Viterbi
41 | // algorithm. It is used by Jiebago for unknown words.
42 | func Cut(sentence string) chan string {
43 |     result := make(chan string)
44 |     s := sentence
45 |     var hans string
46 |     var hanLoc []int
47 |     var nonhanLoc []int
48 |     go func() {
49 |         for {
50 |             hanLoc = reHan.FindStringIndex(s)
51 |             if hanLoc == nil {
52 |                 if len(s) == 0 {
53 |                     break
54 |                 }
55 |             } else if hanLoc[0] == 0 {
56 |                 hans = s[hanLoc[0]:hanLoc[1]]
57 |                 s = s[hanLoc[1]:]
58 |                 for han := range cutHan(hans) {
59 |                     result <- han
60 |                 }
61 |                 continue
62 |             }
63 |             nonhanLoc = reSkip.FindStringIndex(s)
64 |             if nonhanLoc == nil {
65 |                 if len(s) == 0 {
66 |                     break
67 |                 }
68 |             } else if nonhanLoc[0] == 0 {
69 |                 nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
70 |                 s = s[nonhanLoc[1]:]
71 |                 if nonhans != "" {
72 |                     result <- nonhans
73 |                     continue
74 |                 }
75 |             }
76 |             var loc []int
77 |             if hanLoc == nil && nonhanLoc == nil {
78 |                 if len(s) > 0 {
79 |                     result <- s
80 |                     break
81 |                 }
82 |             } else if hanLoc == nil {
83 |                 loc = nonhanLoc
84 |             } else if nonhanLoc == nil {
85 |                 loc = hanLoc
86 |             } else if hanLoc[0] < nonhanLoc[0] {
87 |                 loc = hanLoc
88 |             } else {
89 |                 loc = nonhanLoc
90 |             }
91 |             result <- s[:loc[0]]
92 |             s = s[loc[0]:]
93 |         }
94 |         close(result)
95 |     }()
96 |     return result
97 | }
98 | 
--------------------------------------------------------------------------------
/finalseg/finalseg_test.go:
--------------------------------------------------------------------------------
1 | package finalseg
2 | 
3 | import (
4 |     "math"
5 |     "testing"
6 | )
7 | 
8 | func chanToArray(ch chan string) []string {
9 |     var result []string
10 |     for word := range ch {
11 |         result = append(result, word)
12 |     }
13 |     return result
14 | }
15 | 
16 | func TestViterbi(t *testing.T) {
17 |     obs := "我们是程序员"
18 |     states := []byte{'B', 'M', 'E', 'S'}
19 |     prob, path := viterbi([]rune(obs), states)
20 |     if math.Abs(prob+39.68824128493802) > 1e-10 {
21 |         t.Fatal(prob)
22 |     }
23 |     for index, state := range []byte{'B', 'E', 'S', 'B', 'M', 'E'} {
24 |         if path[index] != state {
25 |             t.Fatal(path)
26 |         }
27 |     }
28 | }
29 | 
30 | func TestCutHan(t *testing.T) {
31 |     obs := "我们是程序员"
32 |     result := chanToArray(cutHan(obs))
33 |     if len(result) != 3 {
34 |         t.Fatal(result)
35 |     }
36 |     if result[0] != "我们" {
37 |         t.Fatal(result[0])
38 |     }
39 |     if result[1] != "是" {
40 |         t.Fatal(result[1])
41 |     }
42 |     if result[2] != "程序员" {
43 |         t.Fatal(result[2])
44 |     }
45 | }
46 | 
47 | func TestCut(t *testing.T) {
48 |     sentence := "我们是程序员"
49 |     result := chanToArray(Cut(sentence))
50 |     if len(result) != 3 {
51 |         t.Fatal(len(result))
52 |     }
53 |     if result[0] != "我们" {
54 |         t.Fatal(result[0])
55 |     }
56 |     if result[1] != "是" {
57 |         t.Fatal(result[1])
58 |     }
59 |
if result[2] != "程序员" { 60 | t.Fatal(result[2]) 61 | } 62 | result2 := chanToArray(Cut("I'm a programmer!")) 63 | if len(result2) != 8 { 64 | t.Fatal(result2) 65 | } 66 | result3 := chanToArray(Cut("程序员average年龄28.6岁。")) 67 | if len(result3) != 6 { 68 | t.Fatal(result3) 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /finalseg/prob_trans.go: -------------------------------------------------------------------------------- 1 | package finalseg 2 | 3 | var probTrans = make(map[byte]map[byte]float64) 4 | 5 | func init() { 6 | probTrans['B'] = map[byte]float64{'E': -0.510825623765990, 7 | 'M': -0.916290731874155} 8 | probTrans['E'] = map[byte]float64{'B': -0.5897149736854513, 9 | 'S': -0.8085250474669937} 10 | probTrans['M'] = map[byte]float64{'E': -0.33344856811948514, 11 | 'M': -1.2603623820268226} 12 | probTrans['S'] = map[byte]float64{'B': -0.7211965654669841, 13 | 'S': -0.6658631448798212} 14 | } 15 | -------------------------------------------------------------------------------- /finalseg/viterbi.go: -------------------------------------------------------------------------------- 1 | package finalseg 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | ) 7 | 8 | const minFloat = -3.14e100 9 | 10 | var ( 11 | prevStatus = make(map[byte][]byte) 12 | probStart = make(map[byte]float64) 13 | ) 14 | 15 | func init() { 16 | prevStatus['B'] = []byte{'E', 'S'} 17 | prevStatus['M'] = []byte{'M', 'B'} 18 | prevStatus['S'] = []byte{'S', 'E'} 19 | prevStatus['E'] = []byte{'B', 'M'} 20 | probStart['B'] = -0.26268660809250016 21 | probStart['E'] = -3.14e+100 22 | probStart['M'] = -3.14e+100 23 | probStart['S'] = -1.4652633398537678 24 | } 25 | 26 | type probState struct { 27 | prob float64 28 | state byte 29 | } 30 | 31 | func (p probState) String() string { 32 | return fmt.Sprintf("(%f, %x)", p.prob, p.state) 33 | } 34 | 35 | type probStates []*probState 36 | 37 | func (ps probStates) Len() int { 38 | return len(ps) 39 | } 40 | 41 | func (ps probStates) Less(i, j int) bool { 42 | if ps[i].prob == ps[j].prob { 43 | return ps[i].state < ps[j].state 44 | } 45 | return ps[i].prob < ps[j].prob 46 | } 47 | 48 | func (ps probStates) Swap(i, j int) { 49 | ps[i], ps[j] = ps[j], ps[i] 50 | } 51 | 52 | func viterbi(obs []rune, states []byte) (float64, []byte) { 53 | path := make(map[byte][]byte) 54 | V := make([]map[byte]float64, len(obs)) 55 | V[0] = make(map[byte]float64) 56 | for _, y := range states { 57 | if val, ok := probEmit[y][obs[0]]; ok { 58 | V[0][y] = val + probStart[y] 59 | } else { 60 | V[0][y] = minFloat + probStart[y] 61 | } 62 | path[y] = []byte{y} 63 | } 64 | 65 | for t := 1; t < len(obs); t++ { 66 | newPath := make(map[byte][]byte) 67 | V[t] = make(map[byte]float64) 68 | for _, y := range states { 69 | ps0 := make(probStates, 0) 70 | var emP float64 71 | if val, ok := probEmit[y][obs[t]]; ok { 72 | emP = val 73 | } else { 74 | emP = minFloat 75 | } 76 | for _, y0 := range prevStatus[y] { 77 | var transP float64 78 | if tp, ok := probTrans[y0][y]; ok { 79 | transP = tp 80 | } else { 81 | transP = minFloat 82 | } 83 | prob0 := V[t-1][y0] + transP + emP 84 | ps0 = append(ps0, &probState{prob: prob0, state: y0}) 85 | } 86 | sort.Sort(sort.Reverse(ps0)) 87 | V[t][y] = ps0[0].prob 88 | pp := make([]byte, len(path[ps0[0].state])) 89 | copy(pp, path[ps0[0].state]) 90 | newPath[y] = append(pp, y) 91 | } 92 | path = newPath 93 | } 94 | ps := make(probStates, 0) 95 | for _, y := range []byte{'E', 'S'} { 96 | ps = append(ps, &probState{V[len(obs)-1][y], y}) 
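        // Only 'E' (word end) and 'S' (single character) are valid states
        // for the last character, so only they compete for the best path.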
97 |     }
98 |     sort.Sort(sort.Reverse(ps))
99 |     v := ps[0]
100 |     return v.prob, path[v.state]
101 | }
102 | 
--------------------------------------------------------------------------------
/foobar.txt:
--------------------------------------------------------------------------------
1 | 好人 12 n
--------------------------------------------------------------------------------
/jieba.go:
--------------------------------------------------------------------------------
1 | // Package jiebago is the Golang implementation of [Jieba](https://github.com/fxsjy/jieba), the Python Chinese text segmentation module.
2 | package jiebago
3 | 
4 | import (
5 |     "math"
6 |     "regexp"
7 |     "strings"
8 | 
9 |     "github.com/wangbin/jiebago/dictionary"
10 |     "github.com/wangbin/jiebago/finalseg"
11 |     "github.com/wangbin/jiebago/util"
12 | )
13 | 
14 | var (
15 |     reEng         = regexp.MustCompile(`[[:alnum:]]`)
16 |     reHanCutAll   = regexp.MustCompile(`(\p{Han}+)`)
17 |     reSkipCutAll  = regexp.MustCompile(`[^[:alnum:]+#\n]`)
18 |     reHanDefault  = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
19 |     reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
20 | )
21 | 
22 | // Segmenter is a Chinese word segmentation struct.
23 | type Segmenter struct {
24 |     dict *Dictionary
25 | }
26 | 
27 | // Frequency returns a word's frequency and existence.
28 | func (seg *Segmenter) Frequency(word string) (float64, bool) {
29 |     return seg.dict.Frequency(word)
30 | }
31 | 
32 | // AddWord adds a new word with frequency to the dictionary.
33 | func (seg *Segmenter) AddWord(word string, frequency float64) {
34 |     seg.dict.AddToken(dictionary.NewToken(word, frequency, ""))
35 | }
36 | 
37 | // DeleteWord removes a word from the dictionary.
38 | func (seg *Segmenter) DeleteWord(word string) {
39 |     seg.dict.AddToken(dictionary.NewToken(word, 0.0, ""))
40 | }
41 | 
42 | /*
43 | SuggestFrequency returns a suggested frequency for a word, or for a long word
44 | cut into several short words.
45 | 
46 | This method is useful when a word in the sentence is not cut out correctly.
47 | 
48 | If a word should not be further cut, for example the word "石墨烯" should not be
49 | cut into "石墨" and "烯", SuggestFrequency("石墨烯") will return the maximum
50 | frequency for this word.
51 | 
52 | If a word should be further cut, for example the word "今天天气" should be
53 | further cut into the two words "今天" and "天气", SuggestFrequency("今天", "天气")
54 | will return the minimum frequency for the word "今天天气".
55 | */
56 | func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
57 |     frequency := 1.0
58 |     if len(words) > 1 {
59 |         for _, word := range words {
60 |             if freq, ok := seg.dict.Frequency(word); ok {
61 |                 frequency *= freq
62 |             }
63 |             frequency /= seg.dict.total
64 |         }
65 |         frequency, _ = math.Modf(frequency * seg.dict.total)
66 |         wordFreq := 0.0
67 |         if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
68 |             wordFreq = freq
69 |         }
70 |         if wordFreq < frequency {
71 |             frequency = wordFreq
72 |         }
73 |     } else {
74 |         word := words[0]
75 |         for segment := range seg.Cut(word, false) {
76 |             if freq, ok := seg.dict.Frequency(segment); ok {
77 |                 frequency *= freq
78 |             }
79 |             frequency /= seg.dict.total
80 |         }
81 |         frequency, _ = math.Modf(frequency * seg.dict.total)
82 |         frequency += 1.0
83 |         wordFreq := 1.0
84 |         if freq, ok := seg.dict.Frequency(word); ok {
85 |             wordFreq = freq
86 |         }
87 |         if wordFreq > frequency {
88 |             frequency = wordFreq
89 |         }
90 |     }
91 |     return frequency
92 | }
93 | 
94 | // LoadDictionary loads a dictionary from the given file name. Every time
95 | // LoadDictionary is called, the previously loaded dictionary will be cleared.
96 | func (seg *Segmenter) LoadDictionary(fileName string) error {
97 |     seg.dict = &Dictionary{freqMap: make(map[string]float64)}
98 |     return seg.dict.loadDictionary(fileName)
99 | }
100 | 
101 | // LoadUserDictionary loads a user-specified dictionary. It must be called
102 | // after LoadDictionary, and it will not clear any previously loaded dictionary;
103 | // instead it will override existing entries.
104 | func (seg *Segmenter) LoadUserDictionary(fileName string) error {
105 |     return seg.dict.loadDictionary(fileName)
106 | }
107 | 
108 | func (seg *Segmenter) dag(runes []rune) map[int][]int {
109 |     dag := make(map[int][]int)
110 |     n := len(runes)
111 |     var frag []rune
112 |     var i int
113 |     for k := 0; k < n; k++ {
114 |         dag[k] = make([]int, 0)
115 |         i = k
116 |         frag = runes[k : k+1]
117 |         for {
118 |             freq, ok := seg.dict.Frequency(string(frag))
119 |             if !ok {
120 |                 break
121 |             }
122 |             if freq > 0.0 {
123 |                 dag[k] = append(dag[k], i)
124 |             }
125 |             i++
126 |             if i >= n {
127 |                 break
128 |             }
129 |             frag = runes[k : i+1]
130 |         }
131 |         if len(dag[k]) == 0 {
132 |             dag[k] = append(dag[k], k)
133 |         }
134 |     }
135 |     return dag
136 | }
137 | 
138 | type route struct {
139 |     frequency float64
140 |     index     int
141 | }
142 | 
143 | func (seg *Segmenter) calc(runes []rune) map[int]route {
144 |     dag := seg.dag(runes)
145 |     n := len(runes)
146 |     rs := make(map[int]route)
147 |     rs[n] = route{frequency: 0.0, index: 0}
148 |     var r route
149 |     for idx := n - 1; idx >= 0; idx-- {
150 |         for _, i := range dag[idx] {
151 |             if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
152 |                 r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
153 |             } else {
154 |                 r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
155 |             }
156 |             if v, ok := rs[idx]; !ok {
157 |                 rs[idx] = r
158 |             } else {
159 |                 if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
160 |                     rs[idx] = r
161 |                 }
162 |             }
163 |         }
164 |     }
165 |     return rs
166 | }
167 | 
168 | type cutFunc func(sentence string) <-chan string
169 | 
170 | func (seg *Segmenter) cutDAG(sentence string) <-chan string {
171 |     result := make(chan string)
172 |     go func() {
173 |         runes := []rune(sentence)
174 |         routes := seg.calc(runes)
175 |         var y int
176 |         length := len(runes)
177 |         var buf []rune
178 |         for x := 0; x < length; {
179 |             y = routes[x].index + 1
180 |             frag := runes[x:y]
181 |             if y-x == 1 {
182 |                 buf = append(buf, frag...)
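                // A one-rune fragment is buffered; when the buffer is flushed
                // below, text unknown to the dictionary goes through the HMM
                // (finalseg.Cut) so that new words can be discovered.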
183 | } else { 184 | if len(buf) > 0 { 185 | bufString := string(buf) 186 | if len(buf) == 1 { 187 | result <- bufString 188 | } else { 189 | if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { 190 | for x := range finalseg.Cut(bufString) { 191 | result <- x 192 | } 193 | } else { 194 | for _, elem := range buf { 195 | result <- string(elem) 196 | } 197 | } 198 | } 199 | buf = make([]rune, 0) 200 | } 201 | result <- string(frag) 202 | } 203 | x = y 204 | } 205 | 206 | if len(buf) > 0 { 207 | bufString := string(buf) 208 | if len(buf) == 1 { 209 | result <- bufString 210 | } else { 211 | if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { 212 | for t := range finalseg.Cut(bufString) { 213 | result <- t 214 | } 215 | } else { 216 | for _, elem := range buf { 217 | result <- string(elem) 218 | } 219 | } 220 | } 221 | } 222 | close(result) 223 | }() 224 | return result 225 | } 226 | 227 | func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string { 228 | result := make(chan string) 229 | 230 | go func() { 231 | runes := []rune(sentence) 232 | routes := seg.calc(runes) 233 | var y int 234 | length := len(runes) 235 | var buf []rune 236 | for x := 0; x < length; { 237 | y = routes[x].index + 1 238 | frag := runes[x:y] 239 | if reEng.MatchString(string(frag)) && len(frag) == 1 { 240 | buf = append(buf, frag...) 241 | x = y 242 | continue 243 | } 244 | if len(buf) > 0 { 245 | result <- string(buf) 246 | buf = make([]rune, 0) 247 | } 248 | result <- string(frag) 249 | x = y 250 | } 251 | if len(buf) > 0 { 252 | result <- string(buf) 253 | buf = make([]rune, 0) 254 | } 255 | close(result) 256 | }() 257 | return result 258 | } 259 | 260 | // Cut cuts a sentence into words using accurate mode. 261 | // Parameter hmm controls whether to use the Hidden Markov Model. 262 | // Accurate mode attempts to cut the sentence into the most accurate 263 | // segmentations, which is suitable for text analysis. 264 | func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string { 265 | result := make(chan string) 266 | var cut cutFunc 267 | if hmm { 268 | cut = seg.cutDAG 269 | } else { 270 | cut = seg.cutDAGNoHMM 271 | } 272 | 273 | go func() { 274 | for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) { 275 | if len(block) == 0 { 276 | continue 277 | } 278 | if reHanDefault.MatchString(block) { 279 | for x := range cut(block) { 280 | result <- x 281 | } 282 | continue 283 | } 284 | for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) { 285 | if reSkipDefault.MatchString(subBlock) { 286 | result <- subBlock 287 | continue 288 | } 289 | for _, r := range subBlock { 290 | result <- string(r) 291 | } 292 | } 293 | } 294 | close(result) 295 | }() 296 | return result 297 | } 298 | 299 | func (seg *Segmenter) cutAll(sentence string) <-chan string { 300 | result := make(chan string) 301 | go func() { 302 | runes := []rune(sentence) 303 | dag := seg.dag(runes) 304 | start := -1 305 | ks := make([]int, len(dag)) 306 | for k := range dag { 307 | ks[k] = k 308 | } 309 | var l []int 310 | for k := range ks { 311 | l = dag[k] 312 | if len(l) == 1 && k > start { 313 | result <- string(runes[k : l[0]+1]) 314 | start = l[0] 315 | continue 316 | } 317 | for _, j := range l { 318 | if j > k { 319 | result <- string(runes[k : j+1]) 320 | start = j 321 | } 322 | } 323 | } 324 | close(result) 325 | }() 326 | return result 327 | } 328 | 329 | // CutAll cuts a sentence into words using full mode. 330 | // Full mode gets all the possible words from the sentence. 
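// For example, "我来到北京清华大学" yields 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学.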
331 | // Fast but not accurate. 332 | func (seg *Segmenter) CutAll(sentence string) <-chan string { 333 | result := make(chan string) 334 | go func() { 335 | for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) { 336 | if len(block) == 0 { 337 | continue 338 | } 339 | if reHanCutAll.MatchString(block) { 340 | for x := range seg.cutAll(block) { 341 | result <- x 342 | } 343 | continue 344 | } 345 | for _, subBlock := range reSkipCutAll.Split(block, -1) { 346 | result <- subBlock 347 | } 348 | } 349 | close(result) 350 | }() 351 | return result 352 | } 353 | 354 | // CutForSearch cuts sentence into words using search engine mode. 355 | // Search engine mode, based on the accurate mode, attempts to cut long words 356 | // into several short words, which can raise the recall rate. 357 | // Suitable for search engines. 358 | func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string { 359 | result := make(chan string) 360 | go func() { 361 | for word := range seg.Cut(sentence, hmm) { 362 | runes := []rune(word) 363 | for _, increment := range []int{2, 3} { 364 | if len(runes) <= increment { 365 | continue 366 | } 367 | var gram string 368 | for i := 0; i < len(runes)-increment+1; i++ { 369 | gram = string(runes[i : i+increment]) 370 | if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 { 371 | result <- gram 372 | } 373 | } 374 | } 375 | result <- word 376 | } 377 | close(result) 378 | }() 379 | return result 380 | } 381 | -------------------------------------------------------------------------------- /jieba_test.go: -------------------------------------------------------------------------------- 1 | package jiebago 2 | 3 | import "testing" 4 | 5 | var ( 6 | seg Segmenter 7 | testContents = []string{ 8 | "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", 9 | "我不喜欢日本和服。", 10 | "雷猴回归人间。", 11 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 12 | "我需要廉租房", 13 | "永和服装饰品有限公司", 14 | "我爱北京天安门", 15 | "abc", 16 | "隐马尔可夫", 17 | "雷猴是个好网站", 18 | "“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", 19 | "草泥马和欺实马是今年的流行词汇", 20 | "伊藤洋华堂总府店", 21 | "中国科学院计算技术研究所", 22 | "罗密欧与朱丽叶", 23 | "我购买了道具和服装", 24 | "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍", 25 | "湖北省石首市", 26 | "湖北省十堰市", 27 | "总经理完成了这件事情", 28 | "电脑修好了", 29 | "做好了这件事情就一了百了了", 30 | "人们审美的观点是不同的", 31 | "我们买了一个美的空调", 32 | "线程初始化时我们要注意", 33 | "一个分子是由好多原子组织成的", 34 | "祝你马到功成", 35 | "他掉进了无底洞里", 36 | "中国的首都是北京", 37 | "孙君意", 38 | "外交部发言人马朝旭", 39 | "领导人会议和第四届东亚峰会", 40 | "在过去的这五年", 41 | "还需要很长的路要走", 42 | "60周年首都阅兵", 43 | "你好人们审美的观点是不同的", 44 | "买水果然后来世博园", 45 | "买水果然后去世博园", 46 | "但是后来我才知道你是对的", 47 | "存在即合理", 48 | "的的的的的在的的的的就以和和和", 49 | "I love你,不以为耻,反以为rong", 50 | "因", 51 | "", 52 | "hello你好人们审美的观点是不同的", 53 | "很好但主要是基于网页形式", 54 | "hello你好人们审美的观点是不同的", 55 | "为什么我不能拥有想要的生活", 56 | "后来我才", 57 | "此次来中国是为了", 58 | "使用了它就可以解决一些问题", 59 | ",使用了它就可以解决一些问题", 60 | "其实使用了它就可以解决一些问题", 61 | "好人使用了它就可以解决一些问题", 62 | "是因为和国家", 63 | "老年搜索还支持", 64 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ", 65 | "大", 66 | "", 67 | "他说的确实在理", 68 | "长春市长春节讲话", 69 | "结婚的和尚未结婚的", 70 | "结合成分子时", 71 | "旅游和服务是最好的", 72 | "这件事情的确是我的错", 73 | "供大家参考指正", 74 | "哈尔滨政府公布塌桥原因", 75 | "我在机场入口处", 76 | "邢永臣摄影报道", 77 | "BP神经网络如何训练才能在分类时增加区分度?", 78 | "南京市长江大桥", 79 | "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究", 80 | "长春市长春药店", 81 | "邓颖超生前最喜欢的衣服", 82 | "胡锦涛是热爱世界和平的政治局常委", 83 | "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪", 84 | "一次性交多少钱", 85 | "两块五一套,三块八一斤,四块七一本,五块六一条", 86 | "小和尚留了一个像大和尚一样的和尚头", 87 | "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站", 88 | 
"张晓梅去人民医院做了个B超然后去买了件T恤", 89 | "AT&T是一件不错的公司,给你发offer了吗?", 90 | "C++和c#是什么关系?11+122=133,是吗?PI=3.14159", 91 | "你认识那个和主席握手的的哥吗?他开一辆黑色的士。", 92 | "枪杆子中出政权"} 93 | 94 | defaultCutResult = [][]string{[]string{"这是", "一个", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"}, 95 | []string{"我", "不", "喜欢", "日本", "和服", "。"}, 96 | []string{"雷猴", "回归", "人间", "。"}, 97 | []string{"工信处", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换机", "等", "技术性", "器件", "的", "安装", "工作"}, 98 | []string{"我", "需要", "廉租房"}, 99 | []string{"永和", "服装", "饰品", "有限公司"}, 100 | []string{"我", "爱", "北京", "天安门"}, 101 | []string{"abc"}, 102 | []string{"隐", "马尔可夫"}, 103 | []string{"雷猴", "是", "个", "好", "网站"}, 104 | []string{"“", "Microsoft", "”", "一词", "由", "“", "MICROcomputer", "(", "微型", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"}, 105 | []string{"草泥马", "和", "欺实", "马", "是", "今年", "的", "流行", "词汇"}, 106 | []string{"伊藤", "洋华堂", "总府", "店"}, 107 | []string{"中国科学院计算技术研究所"}, 108 | []string{"罗密欧", "与", "朱丽叶"}, 109 | []string{"我", "购买", "了", "道具", "和", "服装"}, 110 | []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断改进", ",", "避免", "敞帚", "自珍"}, 111 | []string{"湖北省", "石首市"}, 112 | []string{"湖北省", "十堰市"}, 113 | []string{"总经理", "完成", "了", "这件", "事情"}, 114 | []string{"电脑", "修好", "了"}, 115 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"}, 116 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, 117 | []string{"我们", "买", "了", "一个", "美的", "空调"}, 118 | []string{"线程", "初始化", "时", "我们", "要", "注意"}, 119 | []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"}, 120 | []string{"祝", "你", "马到功成"}, 121 | []string{"他", "掉", "进", "了", "无底洞", "里"}, 122 | []string{"中国", "的", "首都", "是", "北京"}, 123 | []string{"孙君意"}, 124 | []string{"外交部", "发言人", "马朝旭"}, 125 | []string{"领导人", "会议", "和", "第四届", "东亚", "峰会"}, 126 | []string{"在", "过去", "的", "这", "五年"}, 127 | []string{"还", "需要", "很长", "的", "路", "要", "走"}, 128 | []string{"60", "周年", "首都", "阅兵"}, 129 | []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 130 | []string{"买", "水果", "然后", "来", "世博园"}, 131 | []string{"买", "水果", "然后", "去", "世博园"}, 132 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, 133 | []string{"存在", "即", "合理"}, 134 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, 135 | []string{"I", " ", "love", "你", ",", "不以为耻", ",", "反", "以为", "rong"}, 136 | []string{"因"}, 137 | []string{}, 138 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 139 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, 140 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 141 | []string{"为什么", "我", "不能", "拥有", "想要", "的", "生活"}, 142 | []string{"后来", "我", "才"}, 143 | []string{"此次", "来", "中国", "是", "为了"}, 144 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 145 | []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 146 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 147 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 148 | []string{"是因为", "和", "国家"}, 149 | []string{"老年", "搜索", "还", "支持"}, 150 | []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国人大常委会", "第三次", "审议", "侵权", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中本", "已", "处于", "弱势", "地位", "的", "消费者", "由此", "将", "陷入", "万劫不复", "的", "境地", "。", " "}, 151 | []string{"大"}, 
152 | []string{}, 153 | []string{"他", "说", "的", "确实", "在理"}, 154 | []string{"长春", "市长", "春节", "讲话"}, 155 | []string{"结婚", "的", "和", "尚未", "结婚", "的"}, 156 | []string{"结合", "成", "分子", "时"}, 157 | []string{"旅游", "和", "服务", "是", "最好", "的"}, 158 | []string{"这件", "事情", "的确", "是", "我", "的", "错"}, 159 | []string{"供", "大家", "参考", "指正"}, 160 | []string{"哈尔滨", "政府", "公布", "塌桥", "原因"}, 161 | []string{"我", "在", "机场", "入口处"}, 162 | []string{"邢永臣", "摄影", "报道"}, 163 | []string{"BP", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分度", "?"}, 164 | []string{"南京市", "长江大桥"}, 165 | []string{"应", "一些", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, 166 | []string{"长春市", "长春", "药店"}, 167 | []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"}, 168 | []string{"胡锦涛", "是", "热爱", "世界", "和平", "的", "政治局", "常委"}, 169 | []string{"程序员", "祝", "海林", "和", "朱会震", "是", "在", "孙健", "的", "左面", "和", "右面", ",", " ", "范凯", "在", "最", "右面", ".", "再往", "左", "是", "李松洪"}, 170 | []string{"一次性", "交", "多少", "钱"}, 171 | []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"}, 172 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚头"}, 173 | []string{"我", "是", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和党", "党员", ";", " ", "地铁", "和平门", "站"}, 174 | []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"}, 175 | []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"}, 176 | []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3.14159"}, 177 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他开", "一辆", "黑色", "的士", "。"}, 178 | []string{"枪杆子", "中", "出", "政权"}, 179 | } 180 | 181 | cutAllResult = [][]string{[]string{"这", "是", "一个", "伸手", "伸手不见", "伸手不见五指", "不见", "五指", "的", "黑夜", "", "", "我", "叫", "孙悟空", "悟空", "", "", "我", "爱", "北京", "", "", "我", "爱", "Python", "和", "C++", ""}, 182 | []string{"我", "不", "喜欢", "日本", "和服", "", ""}, 183 | []string{"雷猴", "回归", "人间", "", ""}, 184 | []string{"工信处", "处女", "女干事", "干事", "每月", "月经", "经过", "下属", "科室", "都", "要", "亲口", "口交", "交代", "24", "口交", "交换", "交换机", "换机", "等", "技术", "技术性", "性器", "器件", "的", "安装", "安装工", "装工", "工作"}, 185 | []string{"我", "需要", "廉租", "廉租房", "租房"}, 186 | []string{"永和", "和服", "服装", "装饰", "装饰品", "饰品", "有限", "有限公司", "公司"}, 187 | []string{"我", "爱", "北京", "天安", "天安门"}, 188 | []string{"abc"}, 189 | []string{"隐", "马尔可", "马尔可夫", "可夫"}, 190 | []string{"雷猴", "是", "个", "好", "网站"}, 191 | []string{"", "Microsoft", "", "一", "词", "由", "", "MICROcomputer", "", "微型", "计算", "计算机", "算机", "", "", "", "和", "", "SOFTware", "", "软件", "", "", "", "两部", "部分", "分组", "组成"}, 192 | []string{"草泥马", "和", "欺", "实", "马", "是", "今年", "的", "流行", "词汇"}, 193 | []string{"伊", "藤", "洋华堂", "总府", "店"}, 194 | []string{"中国", "中国科学院", "中国科学院计算技术研究所", "科学", "科学院", "学院", "计算", "计算技术", "技术", "研究", "研究所"}, 195 | []string{"罗密欧", "与", "朱丽叶"}, 196 | []string{"我", "购买", "了", "道具", "和服", "服装"}, 197 | []string{"PS", "", "", "我", "觉得", "开源", "有", "一个", "好处", "", "", "就是", "能够", "敦促", "自己", "不断", "不断改进", "改进", "", "", "避免", "敞", "帚", "自珍"}, 198 | []string{"湖北", "湖北省", "石首", "石首市"}, 199 | []string{"湖北", "湖北省", "十堰", "十堰市"}, 200 | []string{"总经理", "经理", "理完", "完成", "了", "这件", "事情"}, 201 | []string{"电脑", "修好", "了"}, 202 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了了"}, 203 | []string{"人们", "审美", "美的", "观点", "是", "不同", "的"}, 204 | []string{"我们", "买", "了", "一个", "美的", "空调"}, 205 | []string{"线程", "初始", "初始化", "化时", "我们", "要", "注意"}, 206 | 
[]string{"一个", "分子", "是", "由", "好多", "原子", "组织", "织成", "的"}, 207 | []string{"祝", "你", "马到功成"}, 208 | []string{"他", "掉", "进", "了", "无底", "无底洞", "里"}, 209 | []string{"中国", "的", "首都", "是", "北京"}, 210 | []string{"孙", "君", "意"}, 211 | []string{"外交", "外交部", "部发", "发言", "发言人", "人马", "马朝旭"}, 212 | []string{"领导", "领导人", "会议", "议和", "第四", "第四届", "四届", "东亚", "峰会"}, 213 | []string{"在", "过去", "的", "这", "五年"}, 214 | []string{"还", "需要", "很", "长", "的", "路", "要", "走"}, 215 | []string{"60", "周年", "首都", "阅兵"}, 216 | []string{"你好", "好人", "人们", "审美", "美的", "观点", "是", "不同", "的"}, 217 | []string{"买", "水果", "果然", "然后", "后来", "来世", "世博", "世博园", "博园"}, 218 | []string{"买", "水果", "果然", "然后", "后去", "去世", "世博", "世博园", "博园"}, 219 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, 220 | []string{"存在", "即", "合理"}, 221 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, 222 | []string{"I", "love", "你", "", "", "不以", "不以为耻", "以为", "耻", "", "", "反", "以为", "rong"}, 223 | []string{"因"}, 224 | []string{}, 225 | []string{"hello", "你好", "好人", "人们", "审美", "美的", "观点", "是", "不同", "的"}, 226 | []string{"很", "好", "但", "主要", "要是", "基于", "网页", "形式"}, 227 | []string{"hello", "你好", "好人", "人们", "审美", "美的", "观点", "是", "不同", "的"}, 228 | []string{"为什么", "什么", "我", "不能", "拥有", "想要", "的", "生活"}, 229 | []string{"后来", "我", "才"}, 230 | []string{"此次", "来", "中国", "国是", "为了"}, 231 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 232 | []string{"", "", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 233 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 234 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 235 | []string{"是因为", "因为", "和", "国家"}, 236 | []string{"老年", "搜索", "索还", "支持"}, 237 | []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲", "法", "给", "废", "了", "拉倒", "", "RT", "", "laoshipukong", "", "", "27", "日", "", "", "全国", "全国人大", "全国人大常委会", "国人", "人大", "人大常委会", "常委", "常委会", "委会", "第三", "第三次", "三次", "审议", "侵权", "权责", "责任", "责任法", "草案", "", "", "删除", "除了", "有关", "医疗", "损害", "责任", "", "", "举证", "倒置", "", "", "的", "规定", "", "", "在", "医患", "纠纷", "中", "本", "已", "处于", "弱势", "地位", "的", "消费", "消费者", "由此", "将", "陷入", "万劫不复", "不复", "的", "境地", "", "", ""}, 238 | []string{"大"}, 239 | []string{}, 240 | []string{"他", "说", "的确", "确实", "实在", "理"}, 241 | []string{"长春", "长春市", "市长", "长春", "春节", "讲话"}, 242 | []string{"结婚", "的", "和尚", "尚未", "未结", "结婚", "的"}, 243 | []string{"结合", "合成", "成分", "分子", "时"}, 244 | []string{"旅游", "和服", "服务", "是", "最好", "的"}, 245 | []string{"这件", "事情", "的确", "是", "我", "的", "错"}, 246 | []string{"供", "大家", "参考", "指正"}, 247 | []string{"哈尔", "哈尔滨", "政府", "公布", "塌", "桥", "原因"}, 248 | []string{"我", "在", "机场", "入口", "入口处"}, 249 | []string{"邢", "永", "臣", "摄影", "报道"}, 250 | []string{"BP", "神经", "神经网", "神经网络", "网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "加区", "区分", "区分度", "分度", "", ""}, 251 | []string{"南京", "南京市", "京市", "市长", "长江", "长江大桥", "大桥"}, 252 | []string{"应", "一些", "使用", "使用者", "用者", "的", "建议", "", "", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, 253 | []string{"长春", "长春市", "市长", "长春", "春药", "药店"}, 254 | []string{"邓颖超", "超生", "生前", "最", "喜欢", "的", "衣服"}, 255 | []string{"胡锦涛", "锦涛", "是", "热爱", "世界", "和平", "的", "政治", "政治局", "常委"}, 256 | []string{"程序", "程序员", "祝", "海林", "和", "朱", "会", "震", "是", "在", "孙", "健", "的", "左面", "和", "右面", "", "", "", "范", "凯", "在", "最", "右面", "", "", "再往", "左", "是", "李", "松", "洪"}, 257 | []string{"一次", "一次性", "性交", "多少", "多少钱"}, 258 | []string{"两块", "五一", "一套", "", "", "三块", "八一", "一斤", "", "", "四块", "七一", "一本", "", "", "五块", "六一", "一条"}, 
259 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚", "和尚头"}, 260 | []string{"我", "是", "中华", "中华人民", "中华人民共和国", "华人", "人民", "人民共和国", "共和", "共和国", "国公", "公民", "", "", "我", "爸爸", "是", "共和", "共和党", "党员", "", "", "", "地铁", "和平", "和平门", "站"}, 261 | []string{"张晓梅", "去", "人民", "民医院", "医院", "做", "了", "个", "B", "超然", "然后", "后去", "买", "了", "件", "T", "恤"}, 262 | []string{"AT", "T", "是", "一件", "不错", "的", "公司", "", "", "给", "你", "发", "offer", "了", "吗", "", ""}, 263 | []string{"C++", "和", "c#", "是", "什么", "关系", "", "11+122", "133", "", "是", "吗", "", "PI", "3", "14159"}, 264 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "", "", "他", "开", "一辆", "黑色", "的士", "", ""}, 265 | []string{"枪杆", "枪杆子", "杆子", "中出", "政权"}, 266 | } 267 | 268 | defaultCutNoHMMResult = [][]string{[]string{"这", "是", "一个", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"}, 269 | []string{"我", "不", "喜欢", "日本", "和服", "。"}, 270 | []string{"雷猴", "回归", "人间", "。"}, 271 | []string{"工信处", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换机", "等", "技术性", "器件", "的", "安装", "工作"}, 272 | []string{"我", "需要", "廉租房"}, 273 | []string{"永和", "服装", "饰品", "有限公司"}, 274 | []string{"我", "爱", "北京", "天安门"}, 275 | []string{"abc"}, 276 | []string{"隐", "马尔可夫"}, 277 | []string{"雷猴", "是", "个", "好", "网站"}, 278 | []string{"“", "Microsoft", "”", "一", "词", "由", "“", "MICROcomputer", "(", "微型", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"}, 279 | []string{"草泥马", "和", "欺", "实", "马", "是", "今年", "的", "流行", "词汇"}, 280 | []string{"伊", "藤", "洋华堂", "总府", "店"}, 281 | []string{"中国科学院计算技术研究所"}, 282 | []string{"罗密欧", "与", "朱丽叶"}, 283 | []string{"我", "购买", "了", "道具", "和", "服装"}, 284 | []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断改进", ",", "避免", "敞", "帚", "自珍"}, 285 | []string{"湖北省", "石首市"}, 286 | []string{"湖北省", "十堰市"}, 287 | []string{"总经理", "完成", "了", "这件", "事情"}, 288 | []string{"电脑", "修好", "了"}, 289 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"}, 290 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, 291 | []string{"我们", "买", "了", "一个", "美的", "空调"}, 292 | []string{"线程", "初始化", "时", "我们", "要", "注意"}, 293 | []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"}, 294 | []string{"祝", "你", "马到功成"}, 295 | []string{"他", "掉", "进", "了", "无底洞", "里"}, 296 | []string{"中国", "的", "首都", "是", "北京"}, 297 | []string{"孙", "君", "意"}, 298 | []string{"外交部", "发言人", "马朝旭"}, 299 | []string{"领导人", "会议", "和", "第四届", "东亚", "峰会"}, 300 | []string{"在", "过去", "的", "这", "五年"}, 301 | []string{"还", "需要", "很", "长", "的", "路", "要", "走"}, 302 | []string{"60", "周年", "首都", "阅兵"}, 303 | []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 304 | []string{"买", "水果", "然后", "来", "世博园"}, 305 | []string{"买", "水果", "然后", "去", "世博园"}, 306 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, 307 | []string{"存在", "即", "合理"}, 308 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, 309 | []string{"I", " ", "love", "你", ",", "不以为耻", ",", "反", "以为", "rong"}, 310 | []string{"因"}, 311 | []string{}, 312 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 313 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, 314 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 315 | []string{"为什么", "我", "不能", "拥有", "想要", "的", "生活"}, 316 | []string{"后来", "我", "才"}, 317 | []string{"此次", "来", "中国", "是", "为了"}, 318 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 319 | []string{",", 
"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 320 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 321 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 322 | []string{"是因为", "和", "国家"}, 323 | []string{"老年", "搜索", "还", "支持"}, 324 | []string{"干脆", "就", "把", "那", "部", "蒙", "人", "的", "闲", "法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国人大常委会", "第三次", "审议", "侵权", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中", "本", "已", "处于", "弱势", "地位", "的", "消费者", "由此", "将", "陷入", "万劫不复", "的", "境地", "。", " "}, 325 | []string{"大"}, 326 | []string{}, 327 | []string{"他", "说", "的", "确实", "在", "理"}, 328 | []string{"长春", "市长", "春节", "讲话"}, 329 | []string{"结婚", "的", "和", "尚未", "结婚", "的"}, 330 | []string{"结合", "成", "分子", "时"}, 331 | []string{"旅游", "和", "服务", "是", "最好", "的"}, 332 | []string{"这件", "事情", "的确", "是", "我", "的", "错"}, 333 | []string{"供", "大家", "参考", "指正"}, 334 | []string{"哈尔滨", "政府", "公布", "塌", "桥", "原因"}, 335 | []string{"我", "在", "机场", "入口处"}, 336 | []string{"邢", "永", "臣", "摄影", "报道"}, 337 | []string{"BP", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分度", "?"}, 338 | []string{"南京市", "长江大桥"}, 339 | []string{"应", "一些", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, 340 | []string{"长春市", "长春", "药店"}, 341 | []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"}, 342 | []string{"胡锦涛", "是", "热爱", "世界", "和平", "的", "政治局", "常委"}, 343 | []string{"程序员", "祝", "海林", "和", "朱", "会", "震", "是", "在", "孙", "健", "的", "左面", "和", "右面", ",", " ", "范", "凯", "在", "最", "右面", ".", "再", "往", "左", "是", "李", "松", "洪"}, 344 | []string{"一次性", "交", "多少", "钱"}, 345 | []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"}, 346 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚头"}, 347 | []string{"我", "是", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和党", "党员", ";", " ", "地铁", "和平门", "站"}, 348 | []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"}, 349 | []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"}, 350 | []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3", ".", "14159"}, 351 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他", "开", "一辆", "黑色", "的士", "。"}, 352 | []string{"枪杆子", "中", "出", "政权"}, 353 | } 354 | 355 | cutForSearchResult = [][]string{[]string{"这是", "一个", "伸手", "不见", "五指", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "悟空", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"}, 356 | []string{"我", "不", "喜欢", "日本", "和服", "。"}, 357 | []string{"雷猴", "回归", "人间", "。"}, 358 | []string{"工信处", "干事", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换", "换机", "交换机", "等", "技术", "技术性", "器件", "的", "安装", "工作"}, 359 | []string{"我", "需要", "廉租", "租房", "廉租房"}, 360 | []string{"永和", "服装", "饰品", "有限", "公司", "有限公司"}, 361 | []string{"我", "爱", "北京", "天安", "天安门"}, 362 | []string{"abc"}, 363 | []string{"隐", "可夫", "马尔可", "马尔可夫"}, 364 | []string{"雷猴", "是", "个", "好", "网站"}, 365 | []string{"“", "Microsoft", "”", "一词", "由", "“", "MICROcomputer", "(", "微型", "计算", "算机", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"}, 366 | []string{"草泥马", "和", "欺实", "马", "是", "今年", "的", "流行", "词汇"}, 367 | []string{"伊藤", "洋华堂", "总府", "店"}, 368 | []string{"中国", "科学", "学院", "计算", "技术", "研究", "科学院", "研究所", "中国科学院计算技术研究所"}, 369 | []string{"罗密欧", "与", "朱丽叶"}, 370 | 
[]string{"我", "购买", "了", "道具", "和", "服装"}, 371 | []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断", "改进", "不断改进", ",", "避免", "敞帚", "自珍"}, 372 | []string{"湖北", "湖北省", "石首", "石首市"}, 373 | []string{"湖北", "湖北省", "十堰", "十堰市"}, 374 | []string{"经理", "总经理", "完成", "了", "这件", "事情"}, 375 | []string{"电脑", "修好", "了"}, 376 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"}, 377 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, 378 | []string{"我们", "买", "了", "一个", "美的", "空调"}, 379 | []string{"线程", "初始", "初始化", "时", "我们", "要", "注意"}, 380 | []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"}, 381 | []string{"祝", "你", "马到功成"}, 382 | []string{"他", "掉", "进", "了", "无底", "无底洞", "里"}, 383 | []string{"中国", "的", "首都", "是", "北京"}, 384 | []string{"孙君意"}, 385 | []string{"外交", "外交部", "发言", "发言人", "马朝旭"}, 386 | []string{"领导", "领导人", "会议", "和", "第四", "四届", "第四届", "东亚", "峰会"}, 387 | []string{"在", "过去", "的", "这", "五年"}, 388 | []string{"还", "需要", "很长", "的", "路", "要", "走"}, 389 | []string{"60", "周年", "首都", "阅兵"}, 390 | []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 391 | []string{"买", "水果", "然后", "来", "世博", "博园", "世博园"}, 392 | []string{"买", "水果", "然后", "去", "世博", "博园", "世博园"}, 393 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, 394 | []string{"存在", "即", "合理"}, 395 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, 396 | []string{"I", " ", "love", "你", ",", "不以", "以为", "不以为耻", ",", "反", "以为", "rong"}, 397 | []string{"因"}, 398 | []string{}, 399 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 400 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, 401 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 402 | []string{"什么", "为什么", "我", "不能", "拥有", "想要", "的", "生活"}, 403 | []string{"后来", "我", "才"}, 404 | []string{"此次", "来", "中国", "是", "为了"}, 405 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 406 | []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 407 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 408 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 409 | []string{"因为", "是因为", "和", "国家"}, 410 | []string{"老年", "搜索", "还", "支持"}, 411 | []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国", "国人", "人大", "常委", "委会", "常委会", "全国人大常委会", "第三", "三次", "第三次", "审议", "侵权", "责任", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中本", "已", "处于", "弱势", "地位", "的", "消费", "消费者", "由此", "将", "陷入", "不复", "万劫不复", "的", "境地", "。", " "}, 412 | []string{"大"}, 413 | []string{}, 414 | []string{"他", "说", "的", "确实", "在理"}, 415 | []string{"长春", "市长", "春节", "讲话"}, 416 | []string{"结婚", "的", "和", "尚未", "结婚", "的"}, 417 | []string{"结合", "成", "分子", "时"}, 418 | []string{"旅游", "和", "服务", "是", "最好", "的"}, 419 | []string{"这件", "事情", "的确", "是", "我", "的", "错"}, 420 | []string{"供", "大家", "参考", "指正"}, 421 | []string{"哈尔", "哈尔滨", "政府", "公布", "塌桥", "原因"}, 422 | []string{"我", "在", "机场", "入口", "入口处"}, 423 | []string{"邢永臣", "摄影", "报道"}, 424 | []string{"BP", "神经", "网络", "神经网", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分", "分度", "区分度", "?"}, 425 | []string{"南京", "京市", "南京市", "长江", "大桥", "长江大桥"}, 426 | []string{"应", "一些", "使用", "用者", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, 427 | []string{"长春", "长春市", "长春", "药店"}, 428 | []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"}, 429 | []string{"锦涛", "胡锦涛", "是", "热爱", 
"世界", "和平", "的", "政治", "政治局", "常委"}, 430 | []string{"程序", "程序员", "祝", "海林", "和", "朱会震", "是", "在", "孙健", "的", "左面", "和", "右面", ",", " ", "范凯", "在", "最", "右面", ".", "再往", "左", "是", "李松洪"}, 431 | []string{"一次", "一次性", "交", "多少", "钱"}, 432 | []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"}, 433 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚", "和尚头"}, 434 | []string{"我", "是", "中华", "华人", "人民", "共和", "共和国", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和", "共和党", "党员", ";", " ", "地铁", "和平", "和平门", "站"}, 435 | []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"}, 436 | []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"}, 437 | []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3.14159"}, 438 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他开", "一辆", "黑色", "的士", "。"}, 439 | []string{"枪杆", "杆子", "枪杆子", "中", "出", "政权"}, 440 | } 441 | 442 | cutForSearchNoHMMResult = [][]string{[]string{"这", "是", "一个", "伸手", "不见", "五指", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "悟空", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"}, 443 | []string{"我", "不", "喜欢", "日本", "和服", "。"}, 444 | []string{"雷猴", "回归", "人间", "。"}, 445 | []string{"工信处", "干事", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换", "换机", "交换机", "等", "技术", "技术性", "器件", "的", "安装", "工作"}, 446 | []string{"我", "需要", "廉租", "租房", "廉租房"}, 447 | []string{"永和", "服装", "饰品", "有限", "公司", "有限公司"}, 448 | []string{"我", "爱", "北京", "天安", "天安门"}, 449 | []string{"abc"}, 450 | []string{"隐", "可夫", "马尔可", "马尔可夫"}, 451 | []string{"雷猴", "是", "个", "好", "网站"}, 452 | []string{"“", "Microsoft", "”", "一", "词", "由", "“", "MICROcomputer", "(", "微型", "计算", "算机", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"}, 453 | []string{"草泥马", "和", "欺", "实", "马", "是", "今年", "的", "流行", "词汇"}, 454 | []string{"伊", "藤", "洋华堂", "总府", "店"}, 455 | []string{"中国", "科学", "学院", "计算", "技术", "研究", "科学院", "研究所", "中国科学院计算技术研究所"}, 456 | []string{"罗密欧", "与", "朱丽叶"}, 457 | []string{"我", "购买", "了", "道具", "和", "服装"}, 458 | []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断", "改进", "不断改进", ",", "避免", "敞", "帚", "自珍"}, 459 | []string{"湖北", "湖北省", "石首", "石首市"}, 460 | []string{"湖北", "湖北省", "十堰", "十堰市"}, 461 | []string{"经理", "总经理", "完成", "了", "这件", "事情"}, 462 | []string{"电脑", "修好", "了"}, 463 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"}, 464 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, 465 | []string{"我们", "买", "了", "一个", "美的", "空调"}, 466 | []string{"线程", "初始", "初始化", "时", "我们", "要", "注意"}, 467 | []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"}, 468 | []string{"祝", "你", "马到功成"}, 469 | []string{"他", "掉", "进", "了", "无底", "无底洞", "里"}, 470 | []string{"中国", "的", "首都", "是", "北京"}, 471 | []string{"孙", "君", "意"}, 472 | []string{"外交", "外交部", "发言", "发言人", "马朝旭"}, 473 | []string{"领导", "领导人", "会议", "和", "第四", "四届", "第四届", "东亚", "峰会"}, 474 | []string{"在", "过去", "的", "这", "五年"}, 475 | []string{"还", "需要", "很", "长", "的", "路", "要", "走"}, 476 | []string{"60", "周年", "首都", "阅兵"}, 477 | []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 478 | []string{"买", "水果", "然后", "来", "世博", "博园", "世博园"}, 479 | []string{"买", "水果", "然后", "去", "世博", "博园", "世博园"}, 480 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"}, 481 | []string{"存在", "即", "合理"}, 482 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", 
"以", "和", "和", "和"}, 483 | []string{"I", " ", "love", "你", ",", "不以", "以为", "不以为耻", ",", "反", "以为", "rong"}, 484 | []string{"因"}, 485 | []string{}, 486 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 487 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, 488 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"}, 489 | []string{"什么", "为什么", "我", "不能", "拥有", "想要", "的", "生活"}, 490 | []string{"后来", "我", "才"}, 491 | []string{"此次", "来", "中国", "是", "为了"}, 492 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 493 | []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 494 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 495 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 496 | []string{"因为", "是因为", "和", "国家"}, 497 | []string{"老年", "搜索", "还", "支持"}, 498 | []string{"干脆", "就", "把", "那", "部", "蒙", "人", "的", "闲", "法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国", "国人", "人大", "常委", "委会", "常委会", "全国人大常委会", "第三", "三次", "第三次", "审议", "侵权", "责任", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中", "本", "已", "处于", "弱势", "地位", "的", "消费", "消费者", "由此", "将", "陷入", "不复", "万劫不复", "的", "境地", "。", " "}, 499 | []string{"大"}, 500 | []string{}, 501 | []string{"他", "说", "的", "确实", "在", "理"}, 502 | []string{"长春", "市长", "春节", "讲话"}, 503 | []string{"结婚", "的", "和", "尚未", "结婚", "的"}, 504 | []string{"结合", "成", "分子", "时"}, 505 | []string{"旅游", "和", "服务", "是", "最好", "的"}, 506 | []string{"这件", "事情", "的确", "是", "我", "的", "错"}, 507 | []string{"供", "大家", "参考", "指正"}, 508 | []string{"哈尔", "哈尔滨", "政府", "公布", "塌", "桥", "原因"}, 509 | []string{"我", "在", "机场", "入口", "入口处"}, 510 | []string{"邢", "永", "臣", "摄影", "报道"}, 511 | []string{"BP", "神经", "网络", "神经网", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分", "分度", "区分度", "?"}, 512 | []string{"南京", "京市", "南京市", "长江", "大桥", "长江大桥"}, 513 | []string{"应", "一些", "使用", "用者", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"}, 514 | []string{"长春", "长春市", "长春", "药店"}, 515 | []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"}, 516 | []string{"锦涛", "胡锦涛", "是", "热爱", "世界", "和平", "的", "政治", "政治局", "常委"}, 517 | []string{"程序", "程序员", "祝", "海林", "和", "朱", "会", "震", "是", "在", "孙", "健", "的", "左面", "和", "右面", ",", " ", "范", "凯", "在", "最", "右面", ".", "再", "往", "左", "是", "李", "松", "洪"}, 518 | []string{"一次", "一次性", "交", "多少", "钱"}, 519 | []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"}, 520 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚", "和尚头"}, 521 | []string{"我", "是", "中华", "华人", "人民", "共和", "共和国", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和", "共和党", "党员", ";", " ", "地铁", "和平", "和平门", "站"}, 522 | []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"}, 523 | []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"}, 524 | []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3", ".", "14159"}, 525 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他", "开", "一辆", "黑色", "的士", "。"}, 526 | []string{"枪杆", "杆子", "枪杆子", "中", "出", "政权"}, 527 | } 528 | 529 | userDictCutResult = [][]string{ 530 | []string{"这是", "一个", "伸手", "不见", "五指", "的", "黑夜", "。", "我", "叫", "孙悟空", ",", "我", "爱北京", ",", "我", "爱", "Python", "和", "C", "++", "。"}, 531 | []string{"我", "不", "喜欢", "日本", "和", "服", "。"}, 532 | []string{"雷猴", "回归人间", "。"}, 533 | 
[]string{"工信", "处女", "干事", "每", "月", "经过", "下", "属", "科室", "都", "要", "亲口", "交代", "24", "口交换机", "等", "技术性", "器件", "的", "安装", "工作"}, 534 | []string{"我", "需要", "廉租房"}, 535 | []string{"永和服", "装饰品", "有", "限公司"}, 536 | []string{"我", "爱北京", "天安门"}, 537 | []string{"abc"}, 538 | []string{"隐马尔", "可夫"}, 539 | []string{"雷猴", "是", "个", "好", "网站"}, 540 | []string{"“", "Microsoft", "”", "一词", "由", "“", "MICROcomputer", "(", "微型", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两部分", "组成"}, 541 | []string{"草泥", "马", "和", "欺实", "马", "是", "今", "年", "的", "流行", "词汇"}, 542 | []string{"伊藤洋华堂", "总府", "店"}, 543 | []string{"中国", "科学院", "计算", "技术", "研究", "所"}, 544 | []string{"罗密欧", "与", "朱丽叶"}, 545 | []string{"我购", "买", "了", "道", "具", "和", "服装"}, 546 | []string{"PS", ":", " ", "我觉", "得", "开源", "有", "一个", "好", "处", ",", "就", "是", "能够", "敦促", "自己", "不断", "改进", ",", "避免", "敞帚", "自珍"}, 547 | []string{"湖北省", "石首市"}, 548 | []string{"湖北省", "十堰市"}, 549 | []string{"总经理", "完成", "了", "这件", "事情"}, 550 | []string{"电脑", "修好", "了"}, 551 | []string{"做", "好", "了", "这件", "事情", "就", "一", "了", "百", "了", "了"}, 552 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"}, 553 | []string{"我们", "买", "了", "一个", "美", "的", "空调"}, 554 | []string{"线程", "初始", "化时", "我们", "要", "注意"}, 555 | []string{"一个", "分子", "是", "由", "好", "多", "原子", "组织成", "的"}, 556 | []string{"祝", "你", "马到", "功成"}, 557 | []string{"他", "掉", "进", "了", "无底", "洞里"}, 558 | []string{"中国", "的", "首", "都", "是", "北京"}, 559 | []string{"孙君意"}, 560 | []string{"外交部", "发言人", "马朝旭"}, 561 | []string{"领导", "人会议", "和", "第四届", "东亚峰", "会"}, 562 | []string{"在", "过", "去", "的", "这五年"}, 563 | []string{"还", "需要", "很长", "的", "路", "要", "走"}, 564 | []string{"60", "周年首", "都", "阅兵"}, 565 | []string{"你", "好人", "们", "审美", "的", "观点", "是", "不同", "的"}, 566 | []string{"买水果", "然后", "来", "世博园"}, 567 | []string{"买水果", "然后", "去", "世博园"}, 568 | []string{"但", "是", "后", "来", "我", "才", "知道", "你", "是", "对", "的"}, 569 | []string{"存在", "即", "合理"}, 570 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"}, 571 | []string{"I", " ", "love", "你", ",", "不以", "为耻", ",", "反以", "为", "rong"}, 572 | []string{"因"}, 573 | []string{}, 574 | []string{"hello", "你", "好人", "们", "审美", "的", "观点", "是", "不同", "的"}, 575 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"}, 576 | []string{"hello", "你", "好人", "们", "审美", "的", "观点", "是", "不同", "的"}, 577 | []string{"为", "什么", "我", "不能", "拥有", "想", "要", "的", "生活"}, 578 | []string{"后来", "我", "才"}, 579 | []string{"此次", "来", "中国", "是", "为", "了"}, 580 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 581 | []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 582 | []string{"其实", "使", "用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 583 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"}, 584 | []string{"是", "因为", "和", "国家"}, 585 | []string{"老年", "搜索", "还", "支持"}, 586 | []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲法", "给", "废", "了", "拉", "倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国人", "大常委会", "第三次", "审议", "侵权责", "任法", "草案", ",", "删除", "了", "有", "关医疗", "损害", "责任", "“", "举证", "倒", "置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中本", "已", "处于", "弱势", "地位", "的", "消费者", "由", "此", "将", "陷入", "万劫", "不复", "的", "境地", "。", " "}, 587 | []string{"大"}, 588 | []string{}, 589 | []string{"他", "说", "的", "确实", "在", "理"}, 590 | []string{"长春市", "长春节", "讲话"}, 591 | []string{"结婚", "的", "和", "尚未", "结婚", "的"}, 592 | []string{"结合成", "分子", "时"}, 593 | []string{"旅游", "和", "服务", "是", "最", "好", "的"}, 594 | []string{"这件", "事情", "的", "确是", "我", "的", 
"错"}, 595 | []string{"供大家", "参考", "指正"}, 596 | []string{"哈尔滨", "政府", "公布塌桥", "原因"}, 597 | []string{"我", "在", "机场", "入口", "处"}, 598 | []string{"邢永臣", "摄影", "报道"}, 599 | []string{"BP", "神经", "网络", "如何", "训练", "才", "能", "在", "分类", "时", "增加区", "分度", "?"}, 600 | []string{"南京市", "长江大桥"}, 601 | []string{"应一些", "使", "用者", "的", "建议", ",", "也", "为", "了", "便", "于", "利用", "NiuTrans", "用于", "SMT", "研究"}, 602 | []string{"长春市", "长春药店"}, 603 | []string{"邓颖", "超生", "前", "最", "喜欢", "的", "衣服"}, 604 | []string{"胡锦涛", "是", "热爱世界", "和", "平", "的", "政治局", "常委"}, 605 | []string{"程序员", "祝海林", "和", "朱会震", "是", "在", "孙健", "的", "左面", "和", "右面", ",", " ", "范凯", "在", "最右面", ".", "再往", "左", "是", "李松洪"}, 606 | []string{"一次性", "交多少", "钱"}, 607 | []string{"两块", "五一套", ",", "三块", "八一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"}, 608 | []string{"小", "和", "尚留", "了", "一个", "像", "大", "和", "尚", "一样", "的", "和", "尚头"}, 609 | []string{"我", "是", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共", "和", "党", "党员", ";", " ", "地铁", "和", "平门", "站"}, 610 | []string{"张晓梅", "去", "人民医院", "做", "了", "个", "B", "超然", "后", "去", "买", "了", "件", "T", "恤"}, 611 | []string{"AT", "&", "T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"}, 612 | []string{"C", "++", "和", "c", "#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3.14159"}, 613 | []string{"你", "认识", "那个", "和", "主席握", "手", "的", "的", "哥", "吗", "?", "他开", "一辆", "黑色", "的", "士", "。"}, 614 | []string{"枪杆子", "中", "出政权"}, 615 | } 616 | ) 617 | 618 | func init() { 619 | seg.LoadDictionary("dict.txt") 620 | } 621 | 622 | func chanToArray(ch <-chan string) []string { 623 | var result []string 624 | for word := range ch { 625 | result = append(result, word) 626 | } 627 | return result 628 | } 629 | 630 | func TestCutDAG(t *testing.T) { 631 | result := chanToArray(seg.cutDAG("BP神经网络如何训练才能在分类时增加区分度?")) 632 | if len(result) != 11 { 633 | t.Fatal(result) 634 | } 635 | } 636 | 637 | func TestCutDAGNoHmm(t *testing.T) { 638 | result := chanToArray(seg.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?")) 639 | if len(result) != 11 { 640 | t.Fatal(result) 641 | } 642 | } 643 | 644 | func TestDefaultCut(t *testing.T) { 645 | var result []string 646 | for index, content := range testContents { 647 | result = chanToArray(seg.Cut(content, true)) 648 | if len(result) != len(defaultCutResult[index]) { 649 | t.Errorf("default cut for %s length should be %d not %d\n", 650 | content, len(defaultCutResult[index]), len(result)) 651 | t.Errorf("expect: %v\n", defaultCutResult[index]) 652 | t.Fatalf("got: %v\n", result) 653 | } 654 | for i, r := range result { 655 | if r != defaultCutResult[index][i] { 656 | t.Fatal(r) 657 | } 658 | } 659 | } 660 | } 661 | 662 | func TestCutAll(t *testing.T) { 663 | var result []string 664 | for index, content := range testContents { 665 | result = chanToArray(seg.CutAll(content)) 666 | if len(result) != len(cutAllResult[index]) { 667 | t.Errorf("cut all for %s length should be %d not %d\n", 668 | content, len(cutAllResult[index]), len(result)) 669 | t.Errorf("expect: %v\n", defaultCutResult[index]) 670 | t.Fatalf("got: %v\n", result) 671 | } 672 | for i, c := range result { 673 | if c != cutAllResult[index][i] { 674 | t.Fatal(c) 675 | } 676 | } 677 | } 678 | } 679 | 680 | func TestDefaultCutNoHMM(t *testing.T) { 681 | var result []string 682 | for index, content := range testContents { 683 | result = chanToArray(seg.Cut(content, false)) 684 | if len(result) != len(defaultCutNoHMMResult[index]) { 685 | t.Fatalf("default cut no hmm for %s length should 
be %d not %d\n",
686 | content, len(defaultCutNoHMMResult[index]), len(result))
687 | }
688 | for i, c := range result {
689 | if c != defaultCutNoHMMResult[index][i] {
690 | t.Fatal(c)
691 | }
692 | }
693 | }
694 | }
695 | 
696 | func TestCutForSearch(t *testing.T) {
697 | var result []string
698 | for index, content := range testContents {
699 | result = chanToArray(seg.CutForSearch(content, true))
700 | if len(result) != len(cutForSearchResult[index]) {
701 | t.Fatalf("cut for search for %s length should be %d not %d\n",
702 | content, len(cutForSearchResult[index]), len(result))
703 | }
704 | for i, c := range result {
705 | if c != cutForSearchResult[index][i] {
706 | t.Fatal(c)
707 | }
708 | }
709 | }
710 | for index, content := range testContents {
711 | result = chanToArray(seg.CutForSearch(content, false))
712 | if len(result) != len(cutForSearchNoHMMResult[index]) {
713 | t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
714 | content, len(cutForSearchNoHMMResult[index]), len(result))
715 | }
716 | for i, c := range result {
717 | if c != cutForSearchNoHMMResult[index][i] {
718 | t.Fatal(c)
719 | }
720 | }
721 | }
722 | }
723 | 
724 | func TestLoadDictionary(t *testing.T) {
725 | var result []string
726 | seg.LoadDictionary("foobar.txt")
727 | for index, content := range testContents {
728 | result = chanToArray(seg.Cut(content, true))
729 | if len(result) != len(userDictCutResult[index]) {
730 | t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",
731 | content, len(userDictCutResult[index]), len(result))
732 | }
733 | for i, c := range result {
734 | if c != userDictCutResult[index][i] {
735 | t.Fatal(c)
736 | }
737 | }
738 | }
739 | seg.LoadDictionary("dict.txt")
740 | }
741 | 
742 | func TestLoadUserDictionary(t *testing.T) {
743 | seg.LoadUserDictionary("userdict.txt")
744 | 
745 | sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
746 | result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
747 | 
748 | words := chanToArray(seg.Cut(sentence, true))
749 | if len(words) != len(result) {
750 | t.Fatal(len(words))
751 | }
752 | for index, word := range words {
753 | if word != result[index] {
754 | t.Fatal(word)
755 | }
756 | }
757 | 
758 | sentence = "easy_install is great"
759 | result = []string{"easy_install", " ", "is", " ", "great"}
760 | words = chanToArray(seg.Cut(sentence, true))
761 | if len(words) != len(result) {
762 | t.Fatal(len(words))
763 | }
764 | for index, word := range words {
765 | if word != result[index] {
766 | t.Fatal(word)
767 | }
768 | }
769 | 
770 | sentence = "python 的正则表达式是好用的"
771 | result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
772 | words = chanToArray(seg.Cut(sentence, true))
773 | if len(words) != len(result) {
774 | t.Errorf("got: %v\n", words)
775 | t.Fatalf("want: %v\n", result)
776 | }
777 | for index, word := range words {
778 | if word != result[index] {
779 | t.Fatal(word)
780 | }
781 | }
782 | seg.LoadDictionary("dict.txt")
783 | }
784 | 
785 | func BenchmarkCutNoHMM(b *testing.B) {
786 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
787 | b.ResetTimer()
788 | for i := 0; i < b.N; i++ {
789 | chanToArray(seg.Cut(sentence, false))
790 | }
791 | }
792 | 
793 | func BenchmarkCut(b *testing.B) {
794 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
795 | b.ResetTimer()
796 | for i := 0; i < b.N; i++ {
797 | 
chanToArray(seg.Cut(sentence, true))
798 | }
799 | }
800 | 
801 | func BenchmarkCutAll(b *testing.B) {
802 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
803 | b.ResetTimer()
804 | for i := 0; i < b.N; i++ {
805 | chanToArray(seg.CutAll(sentence))
806 | }
807 | }
808 | 
809 | func BenchmarkCutForSearchNoHMM(b *testing.B) {
810 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
811 | b.ResetTimer()
812 | for i := 0; i < b.N; i++ {
813 | chanToArray(seg.CutForSearch(sentence, false))
814 | }
815 | }
816 | 
817 | func BenchmarkCutForSearch(b *testing.B) {
818 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
819 | b.ResetTimer()
820 | for i := 0; i < b.N; i++ {
821 | chanToArray(seg.CutForSearch(sentence, true))
822 | }
823 | }
824 | 
-------------------------------------------------------------------------------- /posseg/char_state_tab_test.go: --------------------------------------------------------------------------------
1 | package posseg
2 | 
3 | import (
4 | "testing"
5 | )
6 | 
7 | func TestGet(t *testing.T) {
8 | result := charStateTab.get('\u8000')
9 | if len(result) != 17 {
10 | t.FailNow()
11 | }
12 | result = charStateTab.get('\uaaaa')
13 | if len(result) == 17 {
14 | t.FailNow()
15 | }
16 | }
17 | 
-------------------------------------------------------------------------------- /posseg/dictionary.go: --------------------------------------------------------------------------------
1 | package posseg
2 | 
3 | import (
4 | "math"
5 | "sync"
6 | 
7 | "github.com/wangbin/jiebago/dictionary"
8 | )
9 | 
10 | // A Dictionary represents a thread-safe dictionary used for word segmentation.
11 | type Dictionary struct {
12 | total, logTotal float64
13 | freqMap map[string]float64
14 | posMap map[string]string
15 | sync.RWMutex
16 | }
17 | 
18 | // Load loads all tokens from the given channel.
19 | func (d *Dictionary) Load(ch <-chan dictionary.Token) {
20 | d.Lock()
21 | for token := range ch {
22 | d.addToken(token)
23 | }
24 | d.Unlock()
25 | d.updateLogTotal()
26 | }
27 | 
28 | // AddToken adds one token to the dictionary.
29 | func (d *Dictionary) AddToken(token dictionary.Token) {
30 | d.Lock()
31 | d.addToken(token)
32 | d.Unlock()
33 | d.updateLogTotal()
34 | }
35 | 
36 | func (d *Dictionary) addToken(token dictionary.Token) {
37 | d.freqMap[token.Text()] = token.Frequency()
38 | d.total += token.Frequency()
39 | runes := []rune(token.Text())
40 | n := len(runes)
41 | for i := 0; i < n; i++ {
42 | frag := string(runes[:i+1])
43 | if _, ok := d.freqMap[frag]; !ok {
44 | d.freqMap[frag] = 0.0
45 | }
46 | }
47 | if len(token.Pos()) > 0 {
48 | d.posMap[token.Text()] = token.Pos()
49 | }
50 | }
51 | 
52 | func (d *Dictionary) updateLogTotal() {
53 | d.logTotal = math.Log(d.total)
54 | }
55 | 
56 | // Frequency returns the frequency and existence of the given word.
57 | func (d *Dictionary) Frequency(key string) (float64, bool) {
58 | d.RLock()
59 | freq, ok := d.freqMap[key]
60 | d.RUnlock()
61 | return freq, ok
62 | }
63 | 
64 | // Pos returns the POS and existence of the given word.
65 | func (d *Dictionary) Pos(key string) (string, bool) {
66 | d.RLock()
67 | pos, ok := d.posMap[key]
68 | d.RUnlock()
69 | return pos, ok
70 | }
71 | 
72 | func (d *Dictionary) loadDictionary(fileName string) error {
73 | return dictionary.LoadDictionary(d, fileName)
74 | }
75 | 
-------------------------------------------------------------------------------- /posseg/example_test.go: --------------------------------------------------------------------------------
1 | package posseg_test
2 | 
3 | import (
4 | "fmt"
5 | 
6 |
"github.com/wangbin/jiebago/posseg" 7 | ) 8 | 9 | func Example() { 10 | var seg posseg.Segmenter 11 | seg.LoadDictionary("../dict.txt") 12 | 13 | for segment := range seg.Cut("我爱北京天安门", true) { 14 | fmt.Printf("%s %s\n", segment.Text(), segment.Pos()) 15 | } 16 | // Output: 17 | // 我 r 18 | // 爱 v 19 | // 北京 ns 20 | // 天安门 ns 21 | } 22 | -------------------------------------------------------------------------------- /posseg/posseg.go: -------------------------------------------------------------------------------- 1 | // Package posseg is the Golang implementation of Jieba's posseg module. 2 | package posseg 3 | 4 | import ( 5 | "math" 6 | "regexp" 7 | 8 | "github.com/wangbin/jiebago/util" 9 | ) 10 | 11 | var ( 12 | reHanDetail = regexp.MustCompile(`(\p{Han}+)`) 13 | reSkipDetail = regexp.MustCompile(`([[\.[:digit:]]+|[:alnum:]]+)`) 14 | reEng = regexp.MustCompile(`[[:alnum:]]`) 15 | reNum = regexp.MustCompile(`[\.[:digit:]]+`) 16 | reEng1 = regexp.MustCompile(`[[:alnum:]]$`) 17 | reHanInternal = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) 18 | reSkipInternal = regexp.MustCompile(`(\r\n|\s)`) 19 | ) 20 | 21 | // Segment represents a word with it's POS 22 | type Segment struct { 23 | text, pos string 24 | } 25 | 26 | // Text returns the Segment's text. 27 | func (s Segment) Text() string { 28 | return s.text 29 | } 30 | 31 | // Pos returns the Segment's POS. 32 | func (s Segment) Pos() string { 33 | return s.pos 34 | } 35 | 36 | // Segmenter is a Chinese words segmentation struct. 37 | type Segmenter struct { 38 | dict *Dictionary 39 | } 40 | 41 | // LoadDictionary loads dictionary from given file name. 42 | // Everytime LoadDictionary is called, previously loaded dictionary will be cleard. 43 | func (seg *Segmenter) LoadDictionary(fileName string) error { 44 | seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)} 45 | return seg.dict.loadDictionary(fileName) 46 | } 47 | 48 | // LoadUserDictionary loads a user specified dictionary, it must be called 49 | // after LoadDictionary, and it will not clear any previous loaded dictionary, 50 | // instead it will override exist entries. 
51 | func (seg *Segmenter) LoadUserDictionary(fileName string) error { 52 | return seg.dict.loadDictionary(fileName) 53 | } 54 | 55 | func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment { 56 | result := make(chan Segment) 57 | 58 | go func() { 59 | runes := []rune(sentence) 60 | posList := viterbi(runes) 61 | begin := 0 62 | next := 0 63 | for i, char := range runes { 64 | pos := posList[i] 65 | switch pos.position() { 66 | case "B": 67 | begin = i 68 | case "E": 69 | result <- Segment{string(runes[begin : i+1]), pos.pos()} 70 | next = i + 1 71 | case "S": 72 | result <- Segment{string(char), pos.pos()} 73 | next = i + 1 74 | } 75 | } 76 | if next < len(runes) { 77 | result <- Segment{string(runes[next:]), posList[next].pos()} 78 | } 79 | close(result) 80 | }() 81 | return result 82 | } 83 | 84 | func (seg *Segmenter) cutDetail(sentence string) <-chan Segment { 85 | result := make(chan Segment) 86 | go func() { 87 | for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) { 88 | if reHanDetail.MatchString(blk) { 89 | for segment := range seg.cutDetailInternal(blk) { 90 | result <- segment 91 | } 92 | continue 93 | } 94 | for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) { 95 | if len(x) == 0 { 96 | continue 97 | } 98 | switch { 99 | case reNum.MatchString(x): 100 | result <- Segment{x, "m"} 101 | case reEng.MatchString(x): 102 | result <- Segment{x, "eng"} 103 | default: 104 | result <- Segment{x, "x"} 105 | } 106 | } 107 | } 108 | close(result) 109 | }() 110 | return result 111 | } 112 | 113 | func (seg *Segmenter) dag(runes []rune) map[int][]int { 114 | dag := make(map[int][]int) 115 | n := len(runes) 116 | var frag []rune 117 | var i int 118 | for k := 0; k < n; k++ { 119 | dag[k] = make([]int, 0) 120 | i = k 121 | frag = runes[k : k+1] 122 | for { 123 | freq, ok := seg.dict.Frequency(string(frag)) 124 | if !ok { 125 | break 126 | } 127 | if freq > 0.0 { 128 | dag[k] = append(dag[k], i) 129 | } 130 | i++ 131 | if i >= n { 132 | break 133 | } 134 | frag = runes[k : i+1] 135 | } 136 | if len(dag[k]) == 0 { 137 | dag[k] = append(dag[k], k) 138 | } 139 | } 140 | return dag 141 | } 142 | 143 | type route struct { 144 | frequency float64 145 | index int 146 | } 147 | 148 | func (seg *Segmenter) calc(runes []rune) map[int]route { 149 | dag := seg.dag(runes) 150 | n := len(runes) 151 | rs := make(map[int]route) 152 | rs[n] = route{frequency: 0.0, index: 0} 153 | var r route 154 | for idx := n - 1; idx >= 0; idx-- { 155 | for _, i := range dag[idx] { 156 | if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok { 157 | r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i} 158 | } else { 159 | r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i} 160 | } 161 | if v, ok := rs[idx]; !ok { 162 | rs[idx] = r 163 | } else { 164 | if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) { 165 | rs[idx] = r 166 | } 167 | } 168 | } 169 | } 170 | return rs 171 | } 172 | 173 | type cutFunc func(sentence string) <-chan Segment 174 | 175 | func (seg *Segmenter) cutDAG(sentence string) <-chan Segment { 176 | result := make(chan Segment) 177 | 178 | go func() { 179 | runes := []rune(sentence) 180 | routes := seg.calc(runes) 181 | var y int 182 | length := len(runes) 183 | var buf []rune 184 | for x := 0; x < length; { 185 | y = routes[x].index + 1 186 | frag := runes[x:y] 187 | if y-x == 1 { 188 | buf = append(buf, frag...) 
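// Single runes are buffered here; once a multi-rune word (or the end
// of input) is reached, the buffered run is flushed below, going
// through cutDetail's HMM when the dictionary cannot explain it.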
189 | x = y 190 | continue 191 | } 192 | if len(buf) > 0 { 193 | bufString := string(buf) 194 | if len(buf) == 1 { 195 | if tag, ok := seg.dict.Pos(bufString); ok { 196 | result <- Segment{bufString, tag} 197 | } else { 198 | result <- Segment{bufString, "x"} 199 | } 200 | buf = make([]rune, 0) 201 | continue 202 | } 203 | if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { 204 | for t := range seg.cutDetail(bufString) { 205 | result <- t 206 | } 207 | } else { 208 | for _, elem := range buf { 209 | selem := string(elem) 210 | if tag, ok := seg.dict.Pos(selem); ok { 211 | result <- Segment{selem, tag} 212 | } else { 213 | result <- Segment{selem, "x"} 214 | } 215 | 216 | } 217 | } 218 | buf = make([]rune, 0) 219 | } 220 | word := string(frag) 221 | if tag, ok := seg.dict.Pos(word); ok { 222 | result <- Segment{word, tag} 223 | } else { 224 | result <- Segment{word, "x"} 225 | } 226 | x = y 227 | } 228 | 229 | if len(buf) > 0 { 230 | bufString := string(buf) 231 | if len(buf) == 1 { 232 | if tag, ok := seg.dict.Pos(bufString); ok { 233 | result <- Segment{bufString, tag} 234 | } else { 235 | result <- Segment{bufString, "x"} 236 | } 237 | } else { 238 | if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 { 239 | for t := range seg.cutDetail(bufString) { 240 | result <- t 241 | } 242 | } else { 243 | for _, elem := range buf { 244 | selem := string(elem) 245 | if tag, ok := seg.dict.Pos(selem); ok { 246 | result <- Segment{selem, tag} 247 | } else { 248 | result <- Segment{selem, "x"} 249 | } 250 | } 251 | } 252 | } 253 | } 254 | close(result) 255 | }() 256 | return result 257 | } 258 | 259 | func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment { 260 | result := make(chan Segment) 261 | 262 | go func() { 263 | runes := []rune(sentence) 264 | routes := seg.calc(runes) 265 | var y int 266 | length := len(runes) 267 | var buf []rune 268 | for x := 0; x < length; { 269 | y = routes[x].index + 1 270 | frag := runes[x:y] 271 | if reEng1.MatchString(string(frag)) && len(frag) == 1 { 272 | buf = append(buf, frag...) 273 | x = y 274 | continue 275 | } 276 | if len(buf) > 0 { 277 | result <- Segment{string(buf), "eng"} 278 | buf = make([]rune, 0) 279 | } 280 | word := string(frag) 281 | if tag, ok := seg.dict.Pos(word); ok { 282 | result <- Segment{word, tag} 283 | } else { 284 | result <- Segment{word, "x"} 285 | } 286 | x = y 287 | 288 | } 289 | if len(buf) > 0 { 290 | result <- Segment{string(buf), "eng"} 291 | buf = make([]rune, 0) 292 | } 293 | close(result) 294 | }() 295 | return result 296 | } 297 | 298 | // Cut cuts a sentence into words. 299 | // Parameter hmm controls whether to use the Hidden Markov Model. 
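// When hmm is true, runs of runes that the dictionary cannot explain are
// re-segmented by the Viterbi-based cutDetail step, so out-of-vocabulary
// words (personal names, for instance) can still be grouped and tagged;
// when hmm is false such runs are emitted as single runes, except for
// alphanumeric runs, which are buffered and tagged "eng".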
300 | func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment { 301 | result := make(chan Segment) 302 | var cut cutFunc 303 | if hmm { 304 | cut = seg.cutDAG 305 | } else { 306 | cut = seg.cutDAGNoHMM 307 | } 308 | go func() { 309 | for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) { 310 | if reHanInternal.MatchString(blk) { 311 | for wordTag := range cut(blk) { 312 | result <- wordTag 313 | } 314 | continue 315 | } 316 | for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) { 317 | if reSkipInternal.MatchString(x) { 318 | result <- Segment{x, "x"} 319 | continue 320 | } 321 | for _, xx := range x { 322 | s := string(xx) 323 | switch { 324 | case reNum.MatchString(s): 325 | result <- Segment{s, "m"} 326 | case reEng.MatchString(x): 327 | result <- Segment{x, "eng"} 328 | default: 329 | result <- Segment{s, "x"} 330 | } 331 | } 332 | } 333 | } 334 | close(result) 335 | }() 336 | return result 337 | } 338 | -------------------------------------------------------------------------------- /posseg/posseg_test.go: -------------------------------------------------------------------------------- 1 | package posseg 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | var ( 8 | seg Segmenter 9 | testContents = []string{ 10 | "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", 11 | "我不喜欢日本和服。", 12 | "雷猴回归人间。", 13 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 14 | "我需要廉租房", 15 | "永和服装饰品有限公司", 16 | "我爱北京天安门", 17 | "abc", 18 | "隐马尔可夫", 19 | "雷猴是个好网站", 20 | "“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成", 21 | "草泥马和欺实马是今年的流行词汇", 22 | "伊藤洋华堂总府店", 23 | "中国科学院计算技术研究所", 24 | "罗密欧与朱丽叶", 25 | "我购买了道具和服装", 26 | "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍", 27 | "湖北省石首市", 28 | "湖北省十堰市", 29 | "总经理完成了这件事情", 30 | "电脑修好了", 31 | "做好了这件事情就一了百了了", 32 | "人们审美的观点是不同的", 33 | "我们买了一个美的空调", 34 | "线程初始化时我们要注意", 35 | "一个分子是由好多原子组织成的", 36 | "祝你马到功成", 37 | "他掉进了无底洞里", 38 | "中国的首都是北京", 39 | "孙君意", 40 | "外交部发言人马朝旭", 41 | "领导人会议和第四届东亚峰会", 42 | "在过去的这五年", 43 | "还需要很长的路要走", 44 | "60周年首都阅兵", 45 | "你好人们审美的观点是不同的", 46 | "买水果然后来世博园", 47 | "买水果然后去世博园", 48 | "但是后来我才知道你是对的", 49 | "存在即合理", 50 | "的的的的的在的的的的就以和和和", 51 | "I love你,不以为耻,反以为rong", 52 | "因", 53 | "", 54 | "hello你好人们审美的观点是不同的", 55 | "很好但主要是基于网页形式", 56 | "hello你好人们审美的观点是不同的", 57 | "为什么我不能拥有想要的生活", 58 | "后来我才", 59 | "此次来中国是为了", 60 | "使用了它就可以解决一些问题", 61 | ",使用了它就可以解决一些问题", 62 | "其实使用了它就可以解决一些问题", 63 | "好人使用了它就可以解决一些问题", 64 | "是因为和国家", 65 | "老年搜索还支持", 66 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ", 67 | "大", 68 | "", 69 | "他说的确实在理", 70 | "长春市长春节讲话", 71 | "结婚的和尚未结婚的", 72 | "结合成分子时", 73 | "旅游和服务是最好的", 74 | "这件事情的确是我的错", 75 | "供大家参考指正", 76 | "哈尔滨政府公布塌桥原因", 77 | "我在机场入口处", 78 | "邢永臣摄影报道", 79 | "BP神经网络如何训练才能在分类时增加区分度?", 80 | "南京市长江大桥", 81 | "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究", 82 | "长春市长春药店", 83 | "邓颖超生前最喜欢的衣服", 84 | "胡锦涛是热爱世界和平的政治局常委", 85 | "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪", 86 | "一次性交多少钱", 87 | "两块五一套,三块八一斤,四块七一本,五块六一条", 88 | "小和尚留了一个像大和尚一样的和尚头", 89 | "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站", 90 | "张晓梅去人民医院做了个B超然后去买了件T恤", 91 | "AT&T是一件不错的公司,给你发offer了吗?", 92 | "C++和c#是什么关系?11+122=133,是吗?PI=3.14159", 93 | "你认识那个和主席握手的的哥吗?他开一辆黑色的士。", 94 | "枪杆子中出政权"} 95 | 96 | defaultCutResult = [][]Segment{[]Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{",", "x"}, Segment{"我", 
"r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}}, 97 | []Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}}, 98 | []Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}}, 99 | []Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "m"}, Segment{"口", "n"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}}, 100 | []Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}}, 101 | []Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}}, 102 | []Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}}, 103 | []Segment{Segment{"abc", "eng"}}, 104 | []Segment{Segment{"隐", "n"}, Segment{"马尔可夫", "nr"}}, 105 | []Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}}, 106 | []Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"(", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, Segment{"(", "x"}, Segment{"软件", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}}, 107 | []Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺实", "v"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}}, 108 | []Segment{Segment{"伊藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}}, 109 | []Segment{Segment{"中国科学院计算技术研究所", "nt"}}, 110 | []Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}}, 111 | []Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}}, 112 | []Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{",", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{",", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}}, 113 | []Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}}, 114 | []Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}}, 115 | []Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}}, 116 | []Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}}, 117 | []Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", "d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}}, 118 | []Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, 119 | []Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}}, 120 | []Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", 
"r"}, Segment{"要", "v"}, Segment{"注意", "v"}}, 121 | []Segment{Segment{"一个", "m"}, Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "v"}, Segment{"的", "uj"}}, 122 | []Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}}, 123 | []Segment{Segment{"他", "r"}, Segment{"掉", "v"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}}, 124 | []Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}}, 125 | []Segment{Segment{"孙君意", "nr"}}, 126 | []Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}}, 127 | []Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}}, 128 | []Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}}, 129 | []Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "d"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}}, 130 | []Segment{Segment{"60", "m"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}}, 131 | []Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, 132 | []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}}, 133 | []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"世博园", "nr"}}, 134 | []Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}}, 135 | []Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}}, 136 | []Segment{Segment{"的的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"在的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和和", "nz"}, Segment{"和", "c"}}, 137 | []Segment{Segment{"I", "x"}, Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{",", "x"}, Segment{"不以为耻", "i"}, Segment{",", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}}, 138 | []Segment{Segment{"因", "p"}}, 139 | []Segment{}, 140 | []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, 141 | []Segment{Segment{"很好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}}, 142 | []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, 143 | []Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, Segment{"生活", "vn"}}, 144 | []Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}}, 145 | []Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}}, 146 | []Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, 147 | []Segment{Segment{",", "x"}, Segment{"使用", "v"}, Segment{"了", "ul"}, 
Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, 148 | []Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, 149 | []Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, 150 | []Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}}, 151 | []Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", "d"}, Segment{"支持", "v"}}, 152 | []Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那部", "r"}, Segment{"蒙人", "n"}, Segment{"的", "uj"}, Segment{"闲法", "n"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"!", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "m"}, Segment{"日", "m"}, Segment{",", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{",", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中本", "ns"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}}, 153 | []Segment{Segment{"大", "a"}}, 154 | []Segment{}, 155 | []Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}}, 156 | []Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}}, 157 | []Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}}, 158 | []Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}}, 159 | []Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}}, 160 | []Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", "uj"}, Segment{"错", "n"}}, 161 | []Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}}, 162 | []Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}}, 163 | []Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", "n"}, Segment{"入口处", "i"}}, 164 | []Segment{Segment{"邢永臣", "nr"}, Segment{"摄影", "n"}, Segment{"报道", "v"}}, 165 | []Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"?", "x"}}, 166 | []Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}}, 167 | []Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{",", 
"x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}}, 168 | []Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}}, 169 | []Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, Segment{"衣服", "n"}}, 170 | []Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}}, 171 | []Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱会震", "nr"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙健", "nr"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范凯", "nr"}, Segment{"在", "p"}, Segment{"最", "a"}, Segment{"右面", "f"}, Segment{".", "m"}, Segment{"再往", "d"}, Segment{"左", "f"}, Segment{"是", "v"}, Segment{"李松洪", "nr"}}, 172 | []Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}}, 173 | []Segment{Segment{"两块", "m"}, Segment{"五", "m"}, Segment{"一套", "m"}, Segment{",", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{",", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{",", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}}, 174 | []Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}}, 175 | []Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}}, 176 | []Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"件", "q"}, Segment{"T恤", "n"}}, 177 | []Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{",", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"?", "x"}}, 178 | []Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"?", "x"}, Segment{"11", "m"}, Segment{"+", "x"}, Segment{"122", "m"}, Segment{"=", "x"}, Segment{"133", "m"}, Segment{",", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3.14159", "m"}}, 179 | []Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}}, 180 | []Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}}, 181 | } 182 | noHMMCutResult = [][]Segment{ 183 | []Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, 
Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}}, 184 | []Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}}, 185 | []Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}}, 186 | []Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "eng"}, Segment{"口", "q"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}}, 187 | []Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}}, 188 | []Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}}, 189 | []Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}}, 190 | []Segment{Segment{"abc", "eng"}}, 191 | []Segment{Segment{"隐", "n"}, Segment{"马尔可夫", "nr"}}, 192 | []Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}}, 193 | []Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"(", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, Segment{"(", "x"}, Segment{"软件", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}}, 194 | []Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺", "vn"}, Segment{"实", "n"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}}, 195 | []Segment{Segment{"伊", "ns"}, Segment{"藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}}, 196 | []Segment{Segment{"中国科学院计算技术研究所", "nt"}}, 197 | []Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}}, 198 | []Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}}, 199 | []Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{",", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{",", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}}, 200 | []Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}}, 201 | []Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}}, 202 | []Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}}, 203 | []Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}}, 204 | []Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", "d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}}, 205 | []Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, 
Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, 206 | []Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}}, 207 | []Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", "r"}, Segment{"要", "v"}, Segment{"注意", "v"}}, 208 | []Segment{Segment{"一个", "m"}, Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "n"}, Segment{"的", "uj"}}, 209 | []Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}}, 210 | []Segment{Segment{"他", "r"}, Segment{"掉", "zg"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}}, 211 | []Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}}, 212 | []Segment{Segment{"孙", "zg"}, Segment{"君", "nz"}, Segment{"意", "n"}}, 213 | []Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}}, 214 | []Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}}, 215 | []Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}}, 216 | []Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "zg"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}}, 217 | []Segment{Segment{"60", "eng"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}}, 218 | []Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, 219 | []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}}, 220 | []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"世博园", "nr"}}, 221 | []Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}}, 222 | []Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}}, 223 | []Segment{Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"在", "p"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和", "c"}, Segment{"和", "c"}, Segment{"和", "c"}}, 224 | []Segment{Segment{"I", "eng"}, Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{",", "x"}, Segment{"不以为耻", "i"}, Segment{",", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}}, 225 | []Segment{Segment{"因", "p"}}, 226 | []Segment{}, 227 | []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, 228 | []Segment{Segment{"很", "zg"}, Segment{"好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}}, 229 | []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}}, 230 | []Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, 
Segment{"生活", "vn"}}, 231 | []Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}}, 232 | []Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}}, 233 | []Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, 234 | []Segment{Segment{",", "x"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, 235 | []Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, 236 | []Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}}, 237 | []Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}}, 238 | []Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", "d"}, Segment{"支持", "v"}}, 239 | []Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那", "r"}, Segment{"部", "n"}, Segment{"蒙", "v"}, Segment{"人", "n"}, Segment{"的", "uj"}, Segment{"闲", "n"}, Segment{"法", "j"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"!", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "eng"}, Segment{"日", "m"}, Segment{",", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{",", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中", "f"}, Segment{"本", "r"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}}, 240 | []Segment{Segment{"大", "a"}}, 241 | []Segment{}, 242 | []Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}}, 243 | []Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}}, 244 | []Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}}, 245 | []Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}}, 246 | []Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}}, 247 | []Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", "uj"}, Segment{"错", "v"}}, 248 | []Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}}, 249 | []Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}}, 250 | []Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", 
"n"}, Segment{"入口处", "i"}}, 251 | []Segment{Segment{"邢", "nr"}, Segment{"永", "ns"}, Segment{"臣", "n"}, Segment{"摄影", "n"}, Segment{"报道", "v"}}, 252 | []Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"?", "x"}}, 253 | []Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}}, 254 | []Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{",", "x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}}, 255 | []Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}}, 256 | []Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, Segment{"衣服", "n"}}, 257 | []Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}}, 258 | []Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱", "nr"}, Segment{"会", "v"}, Segment{"震", "v"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙", "zg"}, Segment{"健", "a"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范", "nr"}, Segment{"凯", "nr"}, Segment{"在", "p"}, Segment{"最", "d"}, Segment{"右面", "f"}, Segment{".", "x"}, Segment{"再", "d"}, Segment{"往", "zg"}, Segment{"左", "m"}, Segment{"是", "v"}, Segment{"李", "nr"}, Segment{"松", "v"}, Segment{"洪", "nr"}}, 259 | []Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}}, 260 | []Segment{Segment{"两块", "m"}, Segment{"五", "m"}, Segment{"一套", "m"}, Segment{",", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{",", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{",", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}}, 261 | []Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}}, 262 | []Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}}, 263 | []Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"件", "zg"}, Segment{"T恤", "n"}}, 264 | []Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{",", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"?", "x"}}, 265 | []Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"?", "x"}, Segment{"11", "eng"}, Segment{"+", "x"}, Segment{"122", "eng"}, Segment{"=", "x"}, 
Segment{"133", "eng"}, Segment{",", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3", "eng"}, Segment{".", "x"}, Segment{"14159", "eng"}}, 266 | []Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}}, 267 | []Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}}, 268 | } 269 | ) 270 | 271 | func init() { 272 | seg.LoadDictionary("../dict.txt") 273 | } 274 | 275 | func chanToArray(ch <-chan Segment) []Segment { 276 | var result []Segment 277 | for word := range ch { 278 | result = append(result, word) 279 | } 280 | return result 281 | } 282 | 283 | func TestCut(t *testing.T) { 284 | for index, content := range testContents { 285 | result := chanToArray(seg.Cut(content, true)) 286 | if len(defaultCutResult[index]) != len(result) { 287 | t.Errorf("default cut for %s length should be %d not %d\n", 288 | content, len(defaultCutResult[index]), len(result)) 289 | t.Errorf("expect: %v\n", defaultCutResult[index]) 290 | t.Fatalf("got: %v\n", result) 291 | } 292 | for i := range result { 293 | if result[i] != defaultCutResult[index][i] { 294 | t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i]) 295 | } 296 | } 297 | result = chanToArray(seg.Cut(content, false)) 298 | if len(noHMMCutResult[index]) != len(result) { 299 | t.Fatal(content) 300 | } 301 | for i := range result { 302 | if result[i] != noHMMCutResult[index][i] { 303 | t.Fatal(content) 304 | } 305 | } 306 | 307 | } 308 | } 309 | 310 | // https://github.com/fxsjy/jieba/issues/132 311 | func TestBug132(t *testing.T) { 312 | sentence := "又跛又啞" 313 | cutResult := []Segment{ 314 | Segment{"又", "d"}, 315 | Segment{"跛", "a"}, 316 | Segment{"又", "d"}, 317 | Segment{"啞", "v"}, 318 | } 319 | result := chanToArray(seg.Cut(sentence, true)) 320 | if len(cutResult) != len(result) { 321 | t.Fatal(result) 322 | } 323 | for i := range result { 324 | if result[i] != cutResult[i] { 325 | t.Fatal(result[i]) 326 | } 327 | } 328 | } 329 | 330 | // https://github.com/fxsjy/jieba/issues/137 331 | func TestBug137(t *testing.T) { 332 | sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組" 333 | cutResult := []Segment{ 334 | Segment{"前", "f"}, 335 | Segment{"港督", "n"}, 336 | Segment{"衛奕", "z"}, 337 | Segment{"信", "n"}, 338 | Segment{"在", "p"}, 339 | Segment{"八八年", "m"}, 340 | Segment{"十月", "t"}, 341 | Segment{"宣布", "v"}, 342 | Segment{"成立", "v"}, 343 | Segment{"中央", "n"}, 344 | Segment{"政策", "n"}, 345 | Segment{"研究", "vn"}, 346 | Segment{"組", "x"}, 347 | } 348 | result := chanToArray(seg.Cut(sentence, true)) 349 | if len(cutResult) != len(result) { 350 | t.Fatal(result) 351 | } 352 | for i := range result { 353 | if result[i] != cutResult[i] { 354 | t.Fatal(result[i]) 355 | } 356 | } 357 | } 358 | 359 | func TestUserDict(t *testing.T) { 360 | seg.LoadUserDictionary("../userdict.txt") 361 | defer seg.LoadDictionary("../dict.txt") 362 | sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" 363 | 364 | cutResult := []Segment{ 365 | Segment{"李小福", "nr"}, 366 | Segment{"是", "v"}, 367 | Segment{"创新办", "i"}, 368 | Segment{"主任", "b"}, 369 | Segment{"也", "d"}, 370 | Segment{"是", "v"}, 371 | Segment{"云计算", "x"}, 372 | Segment{"方面", "n"}, 373 | Segment{"的", "uj"}, 374 | Segment{"专家", "n"}, 375 
| Segment{";", "x"}, 376 | Segment{" ", "x"}, 377 | Segment{"什么", "r"}, 378 | Segment{"是", "v"}, 379 | Segment{"八一双鹿", "nz"}, 380 | Segment{"例如", "v"}, 381 | Segment{"我", "r"}, 382 | Segment{"输入", "v"}, 383 | Segment{"一个", "m"}, 384 | Segment{"带", "v"}, 385 | Segment{"“", "x"}, 386 | Segment{"韩玉赏鉴", "nz"}, 387 | Segment{"”", "x"}, 388 | Segment{"的", "uj"}, 389 | Segment{"标题", "n"}, 390 | Segment{",", "x"}, 391 | Segment{"在", "p"}, 392 | Segment{"自定义词", "n"}, 393 | Segment{"库中", "nrt"}, 394 | Segment{"也", "d"}, 395 | Segment{"增加", "v"}, 396 | Segment{"了", "ul"}, 397 | Segment{"此", "r"}, 398 | Segment{"词", "n"}, 399 | Segment{"为", "p"}, 400 | Segment{"N", "eng"}, 401 | Segment{"类型", "n"}} 402 | 403 | result := chanToArray(seg.Cut(sentence, true)) 404 | if len(cutResult) != len(result) { 405 | t.Fatal(result) 406 | } 407 | for i := range result { 408 | if result[i] != cutResult[i] { 409 | t.Fatal(result[i]) 410 | } 411 | } 412 | } 413 | 414 | func BenchmarkCutNoHMM(b *testing.B) { 415 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" 416 | b.ResetTimer() 417 | for i := 0; i < b.N; i++ { 418 | chanToArray(seg.Cut(sentence, false)) 419 | } 420 | } 421 | 422 | func BenchmarkCut(b *testing.B) { 423 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" 424 | b.ResetTimer() 425 | for i := 0; i < b.N; i++ { 426 | chanToArray(seg.Cut(sentence, true)) 427 | } 428 | } 429 | -------------------------------------------------------------------------------- /posseg/prob_start.go: -------------------------------------------------------------------------------- 1 | package posseg 2 | 3 | var probStart = map[uint16]float64{ 4 | 100: -4.762305214596967, 5 | 101: -6.680066036784177, 6 | 102: -3.14e+100, 7 | 103: -8.697083223018778, 8 | 104: -5.018374362109218, 9 | 105: -3.14e+100, 10 | 106: -3.423880184954888, 11 | 107: -3.9750475297585357, 12 | 108: -8.888974230828882, 13 | 109: -3.14e+100, 14 | 110: -8.563551830394255, 15 | 111: -3.14e+100, 16 | 112: -5.491630418482717, 17 | 113: -3.14e+100, 18 | 114: -13.533365129970255, 19 | 115: -6.1157847275557105, 20 | 116: -3.14e+100, 21 | 117: -5.0576191284681915, 22 | 118: -3.14e+100, 23 | 119: -3.14e+100, 24 | 120: -4.905883584659895, 25 | 121: -3.14e+100, 26 | 122: -3.6524299819046386, 27 | 123: -3.14e+100, 28 | 124: -6.78695300139688, 29 | 125: -1.6966257797548328, 30 | 126: -3.14e+100, 31 | 127: -2.2310495913769506, 32 | 128: -5.873722175405573, 33 | 129: -4.985642733519195, 34 | 130: -2.8228438314969213, 35 | 131: -4.846091668182416, 36 | 132: -3.94698846057672, 37 | 133: -8.433498702146057, 38 | 134: -4.200984132085048, 39 | 135: -6.998123858956596, 40 | 136: -3.14e+100, 41 | 137: -3.14e+100, 42 | 138: -3.4098187790818413, 43 | 139: -3.14e+100, 44 | 140: -12.434752841302146, 45 | 141: -7.946116471570005, 46 | 142: -5.522673590839954, 47 | 143: -3.3647479094528574, 48 | 144: -3.14e+100, 49 | 145: -9.163917277503234, 50 | 146: -3.14e+100, 51 | 147: -3.14e+100, 52 | 148: -3.14e+100, 53 | 149: -3.14e+100, 54 | 150: -3.14e+100, 55 | 151: -3.14e+100, 56 | 152: -2.6740584874265685, 57 | 153: -9.044728760238115, 58 | 154: -3.14e+100, 59 | 155: -12.434752841302146, 60 | 156: -4.3315610890163585, 61 | 157: -12.147070768850364, 62 | 158: -3.14e+100, 63 | 159: -3.14e+100, 64 | 160: -9.844485675856319, 65 | 161: -3.14e+100, 66 | 162: -7.045681111485645, 67 | 163: -3.14e+100, 68 | 200: -3.14e+100, 69 | 201: -3.14e+100, 70 | 202: -3.14e+100, 71 | 203: -3.14e+100, 72 | 204: -3.14e+100, 73 | 205: -3.14e+100, 74 | 206: -3.14e+100, 75 | 207: -3.14e+100, 76 | 208: -3.14e+100, 
77 | 209: -3.14e+100, 78 | 210: -3.14e+100, 79 | 211: -3.14e+100, 80 | 212: -3.14e+100, 81 | 213: -3.14e+100, 82 | 214: -3.14e+100, 83 | 215: -3.14e+100, 84 | 216: -3.14e+100, 85 | 217: -3.14e+100, 86 | 218: -3.14e+100, 87 | 219: -3.14e+100, 88 | 220: -3.14e+100, 89 | 221: -3.14e+100, 90 | 222: -3.14e+100, 91 | 223: -3.14e+100, 92 | 224: -3.14e+100, 93 | 225: -3.14e+100, 94 | 226: -3.14e+100, 95 | 227: -3.14e+100, 96 | 228: -3.14e+100, 97 | 229: -3.14e+100, 98 | 230: -3.14e+100, 99 | 231: -3.14e+100, 100 | 232: -3.14e+100, 101 | 233: -3.14e+100, 102 | 234: -3.14e+100, 103 | 235: -3.14e+100, 104 | 236: -3.14e+100, 105 | 237: -3.14e+100, 106 | 238: -3.14e+100, 107 | 239: -3.14e+100, 108 | 240: -3.14e+100, 109 | 241: -3.14e+100, 110 | 242: -3.14e+100, 111 | 243: -3.14e+100, 112 | 244: -3.14e+100, 113 | 245: -3.14e+100, 114 | 246: -3.14e+100, 115 | 247: -3.14e+100, 116 | 248: -3.14e+100, 117 | 249: -3.14e+100, 118 | 250: -3.14e+100, 119 | 251: -3.14e+100, 120 | 252: -3.14e+100, 121 | 253: -3.14e+100, 122 | 254: -3.14e+100, 123 | 255: -3.14e+100, 124 | 256: -3.14e+100, 125 | 257: -3.14e+100, 126 | 258: -3.14e+100, 127 | 259: -3.14e+100, 128 | 260: -3.14e+100, 129 | 261: -3.14e+100, 130 | 262: -3.14e+100, 131 | 263: -3.14e+100, 132 | 300: -3.14e+100, 133 | 301: -3.14e+100, 134 | 302: -3.14e+100, 135 | 303: -3.14e+100, 136 | 304: -3.14e+100, 137 | 305: -3.14e+100, 138 | 306: -3.14e+100, 139 | 307: -3.14e+100, 140 | 308: -3.14e+100, 141 | 309: -3.14e+100, 142 | 310: -3.14e+100, 143 | 311: -3.14e+100, 144 | 312: -3.14e+100, 145 | 313: -3.14e+100, 146 | 314: -3.14e+100, 147 | 315: -3.14e+100, 148 | 316: -3.14e+100, 149 | 317: -3.14e+100, 150 | 318: -3.14e+100, 151 | 319: -3.14e+100, 152 | 320: -3.14e+100, 153 | 321: -3.14e+100, 154 | 322: -3.14e+100, 155 | 323: -3.14e+100, 156 | 324: -3.14e+100, 157 | 325: -3.14e+100, 158 | 326: -3.14e+100, 159 | 327: -3.14e+100, 160 | 328: -3.14e+100, 161 | 329: -3.14e+100, 162 | 330: -3.14e+100, 163 | 331: -3.14e+100, 164 | 332: -3.14e+100, 165 | 333: -3.14e+100, 166 | 334: -3.14e+100, 167 | 335: -3.14e+100, 168 | 336: -3.14e+100, 169 | 337: -3.14e+100, 170 | 338: -3.14e+100, 171 | 339: -3.14e+100, 172 | 340: -3.14e+100, 173 | 341: -3.14e+100, 174 | 342: -3.14e+100, 175 | 343: -3.14e+100, 176 | 344: -3.14e+100, 177 | 345: -3.14e+100, 178 | 346: -3.14e+100, 179 | 347: -3.14e+100, 180 | 348: -3.14e+100, 181 | 349: -3.14e+100, 182 | 350: -3.14e+100, 183 | 351: -3.14e+100, 184 | 352: -3.14e+100, 185 | 353: -3.14e+100, 186 | 354: -3.14e+100, 187 | 355: -3.14e+100, 188 | 356: -3.14e+100, 189 | 357: -3.14e+100, 190 | 358: -3.14e+100, 191 | 359: -3.14e+100, 192 | 360: -3.14e+100, 193 | 361: -3.14e+100, 194 | 362: -3.14e+100, 195 | 363: -3.14e+100, 196 | 400: -3.9025396831295227, 197 | 401: -11.048458480182255, 198 | 402: -6.954113917960154, 199 | 403: -12.84021794941031, 200 | 404: -6.472888763970454, 201 | 405: -3.14e+100, 202 | 406: -4.786966795861212, 203 | 407: -3.903919764181873, 204 | 408: -3.14e+100, 205 | 409: -8.948397651299683, 206 | 410: -5.942513006281674, 207 | 411: -3.14e+100, 208 | 412: -5.194820249981676, 209 | 413: -6.507826815331734, 210 | 414: -8.650563207383884, 211 | 415: -3.14e+100, 212 | 416: -3.14e+100, 213 | 417: -4.911992119644354, 214 | 418: -3.14e+100, 215 | 419: -6.940320595827818, 216 | 420: -3.14e+100, 217 | 421: -3.14e+100, 218 | 422: -3.269200652116097, 219 | 423: -10.825314928868044, 220 | 424: -3.14e+100, 221 | 425: -3.8551483897645107, 222 | 426: -4.913434861102905, 223 | 427: -4.483663103956885, 224 | 428: -3.14e+100, 225 | 429: 
-3.14e+100, 226 | 430: -3.14e+100, 227 | 431: -12.147070768850364, 228 | 432: -3.14e+100, 229 | 433: -8.464460927750023, 230 | 434: -2.9868401813596317, 231 | 435: -4.888658618255058, 232 | 436: -3.14e+100, 233 | 437: -3.14e+100, 234 | 438: -2.7635336784127853, 235 | 439: -10.275268591948773, 236 | 440: -3.14e+100, 237 | 441: -3.14e+100, 238 | 442: -3.14e+100, 239 | 443: -3.14e+100, 240 | 444: -6.272842531880403, 241 | 445: -6.940320595827818, 242 | 446: -7.728230161053767, 243 | 447: -7.5394037026636855, 244 | 448: -6.85251045118004, 245 | 449: -8.4153713175535, 246 | 450: -8.15808672228609, 247 | 451: -9.299258625372996, 248 | 452: -3.053292303412302, 249 | 453: -3.14e+100, 250 | 454: -5.9430181843676895, 251 | 455: -3.14e+100, 252 | 456: -11.453923588290419, 253 | 457: -3.14e+100, 254 | 458: -3.14e+100, 255 | 459: -8.427419656069674, 256 | 460: -6.1970794699489575, 257 | 461: -13.533365129970255, 258 | 462: -3.14e+100, 259 | 463: -3.14e+100, 260 | } 261 | -------------------------------------------------------------------------------- /posseg/viterbi.go: -------------------------------------------------------------------------------- 1 | package posseg 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | ) 7 | 8 | type probState struct { 9 | prob float64 10 | state uint16 11 | } 12 | 13 | func (ps probState) String() string { 14 | return fmt.Sprintf("(%v: %f)", ps.state, ps.prob) 15 | } 16 | 17 | type probStates []probState 18 | 19 | func (pss probStates) Len() int { 20 | return len(pss) 21 | } 22 | 23 | func (pss probStates) Less(i, j int) bool { 24 | if pss[i].prob == pss[j].prob { 25 | return pss[i].state < pss[j].state 26 | } 27 | return pss[i].prob < pss[j].prob 28 | } 29 | 30 | func (pss probStates) Swap(i, j int) { 31 | pss[i], pss[j] = pss[j], pss[i] 32 | } 33 | 34 | func viterbi(obs []rune) []tag { 35 | obsLength := len(obs) 36 | V := make([]map[uint16]float64, obsLength) 37 | V[0] = make(map[uint16]float64) 38 | memPath := make([]map[uint16]uint16, obsLength) 39 | memPath[0] = make(map[uint16]uint16) 40 | ys := charStateTab.get(obs[0]) // default is all_states 41 | for _, y := range ys { 42 | V[0][y] = probEmit[y].get(obs[0]) + probStart[y] 43 | memPath[0][y] = 0 44 | } 45 | for t := 1; t < obsLength; t++ { 46 | var prevStates []uint16 47 | for x := range memPath[t-1] { 48 | if len(probTrans[x]) > 0 { 49 | prevStates = append(prevStates, x) 50 | } 51 | } 52 | //use Go's map to implement Python's Set() 53 | prevStatesExpectNext := make(map[uint16]int) 54 | for _, x := range prevStates { 55 | for y := range probTrans[x] { 56 | prevStatesExpectNext[y] = 1 57 | } 58 | } 59 | tmpObsStates := charStateTab.get(obs[t]) 60 | 61 | var obsStates []uint16 62 | for index := range tmpObsStates { 63 | if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok { 64 | obsStates = append(obsStates, tmpObsStates[index]) 65 | } 66 | } 67 | if len(obsStates) == 0 { 68 | for key := range prevStatesExpectNext { 69 | obsStates = append(obsStates, key) 70 | } 71 | } 72 | if len(obsStates) == 0 { 73 | obsStates = probTransKeys 74 | } 75 | memPath[t] = make(map[uint16]uint16) 76 | V[t] = make(map[uint16]float64) 77 | for _, y := range obsStates { 78 | var max, ps probState 79 | for i, y0 := range prevStates { 80 | ps = probState{ 81 | prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]), 82 | state: y0} 83 | if i == 0 || ps.prob > max.prob || (ps.prob == max.prob && ps.state > max.state) { 84 | max = ps 85 | } 86 | } 87 | V[t][y] = max.prob 88 | memPath[t][y] = max.state 89 | } 90 | } 91 | last := 
make(probStates, 0) 92 | length := len(memPath) 93 | vlength := len(V) 94 | for y := range memPath[length-1] { 95 | ps := probState{prob: V[vlength-1][y], state: y} 96 | last = append(last, ps) 97 | } 98 | sort.Sort(sort.Reverse(last)) 99 | state := last[0].state 100 | route := make([]tag, len(obs)) 101 | 102 | for i := obsLength - 1; i >= 0; i-- { 103 | route[i] = tag(state) 104 | state = memPath[i][state] 105 | } 106 | return route 107 | } 108 | -------------------------------------------------------------------------------- /posseg/viterbi_test.go: -------------------------------------------------------------------------------- 1 | package posseg 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | var defaultRoute []tag 8 | 9 | func init() { 10 | var t tag 11 | t, _ = newTag("B", "nr") 12 | defaultRoute = append(defaultRoute, t) 13 | t, _ = newTag("M", "nr") 14 | defaultRoute = append(defaultRoute, t) 15 | t, _ = newTag("E", "nr") 16 | defaultRoute = append(defaultRoute, t) 17 | t, _ = newTag("S", "v") 18 | defaultRoute = append(defaultRoute, t) 19 | t, _ = newTag("B", "v") 20 | defaultRoute = append(defaultRoute, t) 21 | t, _ = newTag("E", "v") 22 | defaultRoute = append(defaultRoute, t) 23 | t, _ = newTag("B", "n") 24 | defaultRoute = append(defaultRoute, t) 25 | t, _ = newTag("M", "n") 26 | defaultRoute = append(defaultRoute, t) 27 | t, _ = newTag("E", "n") 28 | defaultRoute = append(defaultRoute, t) 29 | t, _ = newTag("S", "d") 30 | defaultRoute = append(defaultRoute, t) 31 | t, _ = newTag("S", "v") 32 | defaultRoute = append(defaultRoute, t) 33 | t, _ = newTag("S", "n") 34 | defaultRoute = append(defaultRoute, t) 35 | t, _ = newTag("B", "v") 36 | defaultRoute = append(defaultRoute, t) 37 | t, _ = newTag("E", "v") 38 | defaultRoute = append(defaultRoute, t) 39 | t, _ = newTag("B", "nr") 40 | defaultRoute = append(defaultRoute, t) 41 | t, _ = newTag("M", "nr") 42 | defaultRoute = append(defaultRoute, t) 43 | t, _ = newTag("M", "nr") 44 | defaultRoute = append(defaultRoute, t) 45 | t, _ = newTag("M", "nr") 46 | defaultRoute = append(defaultRoute, t) 47 | t, _ = newTag("E", "nr") 48 | defaultRoute = append(defaultRoute, t) 49 | t, _ = newTag("S", "zg") 50 | defaultRoute = append(defaultRoute, t) 51 | } 52 | 53 | func TestViterbi(t *testing.T) { 54 | ss := "李小福是创新办主任也是云计算方面的专家;" 55 | route := viterbi([]rune(ss)) 56 | if len(route) != len(defaultRoute) { 57 | t.Fatal(len(route)) 58 | } 59 | for index := range route { 60 | if route[index] != defaultRoute[index] { 61 | t.Fatal(route[index]) 62 | } 63 | } 64 | } 65 | 66 | func BenchmarkViterbi(b *testing.B) { 67 | ss := "李小福是创新办主任也是云计算方面的专家;" 68 | for i := 0; i < b.N; i++ { 69 | viterbi([]rune(ss)) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /tokenizers/example_bleve_test.go: -------------------------------------------------------------------------------- 1 | package tokenizers_test 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | 8 | "github.com/blevesearch/bleve" 9 | _ "github.com/wangbin/jiebago/tokenizers" 10 | ) 11 | 12 | func Example_beleveSearch() { 13 | // open a new index 14 | indexMapping := bleve.NewIndexMapping() 15 | 16 | err := indexMapping.AddCustomTokenizer("jieba", 17 | map[string]interface{}{ 18 | "file": "../dict.txt", 19 | "type": "jieba", 20 | }) 21 | if err != nil { 22 | log.Fatal(err) 23 | } 24 | 25 | // create a custom analyzer 26 | err = indexMapping.AddCustomAnalyzer("jieba", 27 | map[string]interface{}{ 28 | "type": "custom", 29 | "tokenizer": "jieba", 30 | 
"token_filters": []string{ 31 | "possessive_en", 32 | "to_lower", 33 | "stop_en", 34 | }, 35 | }) 36 | 37 | if err != nil { 38 | log.Fatal(err) 39 | } 40 | 41 | indexMapping.DefaultAnalyzer = "jieba" 42 | cacheDir := "jieba.beleve" 43 | os.RemoveAll(cacheDir) 44 | index, err := bleve.New(cacheDir, indexMapping) 45 | 46 | if err != nil { 47 | log.Fatal(err) 48 | } 49 | 50 | docs := []struct { 51 | Title string 52 | Name string 53 | }{ 54 | { 55 | Title: "Doc 1", 56 | Name: "This is the first document we’ve added", 57 | }, 58 | { 59 | Title: "Doc 2", 60 | Name: "The second one 你 中文测试中文 is even more interesting! 吃水果", 61 | }, 62 | { 63 | Title: "Doc 3", 64 | Name: "买水果然后来世博园。", 65 | }, 66 | { 67 | Title: "Doc 4", 68 | Name: "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 69 | }, 70 | { 71 | Title: "Doc 5", 72 | Name: "咱俩交换一下吧。", 73 | }, 74 | } 75 | // index docs 76 | for _, doc := range docs { 77 | index.Index(doc.Title, doc) 78 | } 79 | 80 | // search for some text 81 | for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} { 82 | query := bleve.NewQueryStringQuery(keyword) 83 | search := bleve.NewSearchRequest(query) 84 | search.Highlight = bleve.NewHighlight() 85 | searchResults, err := index.Search(search) 86 | if err != nil { 87 | log.Fatal(err) 88 | } 89 | fmt.Printf("Result of \"%s\": %d matches:\n", keyword, searchResults.Total) 90 | for i, hit := range searchResults.Hits { 91 | rv := fmt.Sprintf("%d. %s, (%f)\n", i+searchResults.Request.From+1, hit.ID, hit.Score) 92 | for fragmentField, fragments := range hit.Fragments { 93 | rv += fmt.Sprintf("%s: ", fragmentField) 94 | for _, fragment := range fragments { 95 | rv += fmt.Sprintf("%s", fragment) 96 | } 97 | } 98 | fmt.Printf("%s\n", rv) 99 | } 100 | } 101 | // Output: 102 | // Result of "水果世博园": 2 matches: 103 | // 1. Doc 3, (1.099550) 104 | // Name: 买水果然后来世博园。 105 | // 2. Doc 2, (0.031941) 106 | // Name: The second one 你 中文测试中文 is even more interesting! 吃水果 107 | // Result of "你": 1 matches: 108 | // 1. Doc 2, (0.391161) 109 | // Name: The second one 中文测试中文 is even more interesting! 吃水果 110 | // Result of "first": 1 matches: 111 | // 1. Doc 1, (0.512150) 112 | // Name: This is the first document we’ve added 113 | // Result of "中文": 1 matches: 114 | // 1. Doc 2, (0.553186) 115 | // Name: The second one 你 中文测试中文 is even more interesting! 吃水果 116 | // Result of "交换机": 2 matches: 117 | // 1. Doc 4, (0.608495) 118 | // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 119 | // 2. Doc 5, (0.086700) 120 | // Name: 咱俩交换一下吧。 121 | // Result of "交换": 2 matches: 122 | // 1. Doc 5, (0.534158) 123 | // Name: 咱俩交换一下吧。 124 | // 2. 
Doc 4, (0.296297) 125 | // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 126 | } 127 | -------------------------------------------------------------------------------- /tokenizers/example_test.go: -------------------------------------------------------------------------------- 1 | package tokenizers_test 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/wangbin/jiebago/tokenizers" 7 | ) 8 | 9 | func Example() { 10 | sentence := []byte("永和服装饰品有限公司") 11 | 12 | // default mode 13 | tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false) 14 | fmt.Println("Default Mode:") 15 | for _, token := range tokenizer.Tokenize(sentence) { 16 | fmt.Printf( 17 | "Term: %s Start: %d End: %d Position: %d Type: %d\n", 18 | token.Term, token.Start, token.End, token.Position, token.Type) 19 | } 20 | 21 | // search mode 22 | tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true) 23 | fmt.Println("Search Mode:") 24 | for _, token := range tokenizer.Tokenize(sentence) { 25 | fmt.Printf( 26 | "Term: %s Start: %d End: %d Position: %d Type: %d\n", 27 | token.Term, token.Start, token.End, token.Position, token.Type) 28 | } 29 | // Output: 30 | // Default Mode: 31 | // Term: 永和 Start: 0 End: 6 Position: 1 Type: 1 32 | // Term: 服装 Start: 6 End: 12 Position: 2 Type: 1 33 | // Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1 34 | // Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1 35 | // Search Mode: 36 | // Term: 永和 Start: 0 End: 6 Position: 1 Type: 1 37 | // Term: 服装 Start: 6 End: 12 Position: 2 Type: 1 38 | // Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1 39 | // Term: 有限 Start: 18 End: 24 Position: 4 Type: 1 40 | // Term: 公司 Start: 24 End: 30 Position: 5 Type: 1 41 | // Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1 42 | } 43 | -------------------------------------------------------------------------------- /tokenizers/tokenizer.go: -------------------------------------------------------------------------------- 1 | package tokenizers 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | "strconv" 7 | 8 | "github.com/blevesearch/bleve/analysis" 9 | "github.com/blevesearch/bleve/registry" 10 | "github.com/wangbin/jiebago" 11 | ) 12 | 13 | // Name is the jieba tokenizer name. 14 | const Name = "jieba" 15 | 16 | var ideographRegexp = regexp.MustCompile(`\p{Han}+`) 17 | 18 | // JiebaTokenizer is the bleve tokenizer for jiebago. 19 | type JiebaTokenizer struct { 20 | seg jiebago.Segmenter 21 | hmm, searchMode bool 22 | } 23 | 24 | /* 25 | NewJiebaTokenizer creates a new JiebaTokenizer. 26 | 27 | Parameters: 28 | 29 | dictFilePath: path of the dictionary file. 30 | 31 | hmm: whether to use the Hidden Markov Model to cut unknown words, 32 | i.e. words not found in the dictionary. For example, the word "安卓" ("Android" 33 | in English) is not in the dictionary file. If hmm is set to false, it will be 34 | cut into the two single-character words "安" and "卓"; if hmm is set to true, it 35 | will be treated as one word, because jieba uses a Hidden Markov Model with the 36 | Viterbi algorithm to guess the most likely segmentation. 37 | 38 | searchMode: whether to further cut long words into several short words. 39 | In Chinese, some long words contain other words; for example "交换机" 40 | is the Chinese word for "switch" (the network device). If searchMode is false, 41 | "交换机" is kept as a single word. If searchMode is true, it is further split 42 | into "交换" and "换机", which are also valid Chinese words.
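
A minimal usage sketch (the path and the printed format are illustrative; the
package's own Example loads "../dict.txt"):

	tokenizer, err := NewJiebaTokenizer("dict.txt", true, false)
	if err != nil {
		log.Fatal(err) // the dictionary failed to load
	}
	for _, token := range tokenizer.Tokenize([]byte("永和服装饰品有限公司")) {
		fmt.Printf("%s [%d:%d]\n", token.Term, token.Start, token.End)
	}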
43 | */ 44 | func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) { 45 | var seg jiebago.Segmenter 46 | err := seg.LoadDictionary(dictFilePath) 47 | return &JiebaTokenizer{ 48 | seg: seg, 49 | hmm: hmm, 50 | searchMode: searchMode, 51 | }, err 52 | } 53 | 54 | // Tokenize cuts input into a bleve token stream. 55 | func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream { 56 | rv := make(analysis.TokenStream, 0) 57 | runeStart := 0 58 | start := 0 59 | end := 0 60 | pos := 1 61 | var width int 62 | var gram string 63 | for word := range jt.seg.Cut(string(input), jt.hmm) { 64 | if jt.searchMode { 65 | runes := []rune(word) 66 | width = len(runes) 67 | for _, step := range [2]int{2, 3} { 68 | if width <= step { 69 | continue 70 | } 71 | for i := 0; i < width-step+1; i++ { 72 | gram = string(runes[i : i+step]) 73 | gramLen := len(gram) 74 | if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 { 75 | gramStart := start + len(string(runes[:i])) 76 | token := analysis.Token{ 77 | Term: []byte(gram), 78 | Start: gramStart, 79 | End: gramStart + gramLen, 80 | Position: pos, 81 | Type: detectTokenType(gram), 82 | } 83 | rv = append(rv, &token) 84 | pos++ 85 | } 86 | } 87 | } 88 | } 89 | end = start + len(word) 90 | token := analysis.Token{ 91 | Term: []byte(word), 92 | Start: start, 93 | End: end, 94 | Position: pos, 95 | Type: detectTokenType(word), 96 | } 97 | rv = append(rv, &token) 98 | pos++ 99 | runeStart += width 100 | start = end 101 | } 102 | return rv 103 | } 104 | 105 | /* 106 | JiebaTokenizerConstructor creates a JiebaTokenizer. 107 | 108 | Parameter config should contain at least one entry: 109 | 110 | file: the path of the dictionary file. 111 | 112 | hmm: optional, specify whether to use the Hidden Markov Model, see NewJiebaTokenizer for details. 113 | 114 | search: optional, specify whether to use search mode, see NewJiebaTokenizer for details. 115 | */ 116 | func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) ( 117 | analysis.Tokenizer, error) { 118 | dictFilePath, ok := config["file"].(string) 119 | if !ok { 120 | return nil, fmt.Errorf("must specify dictionary file path") 121 | } 122 | hmm, ok := config["hmm"].(bool) 123 | if !ok { 124 | hmm = true 125 | } 126 | searchMode, ok := config["search"].(bool) 127 | if !ok { 128 | searchMode = true 129 | } 130 | 131 | return NewJiebaTokenizer(dictFilePath, hmm, searchMode) 132 | } 133 | 134 | func detectTokenType(term string) analysis.TokenType { 135 | if ideographRegexp.MatchString(term) { 136 | return analysis.Ideographic 137 | } 138 | _, err := strconv.ParseFloat(term, 64) 139 | if err == nil { 140 | return analysis.Numeric 141 | } 142 | return analysis.AlphaNumeric 143 | } 144 | 145 | func init() { 146 | registry.RegisterTokenizer(Name, JiebaTokenizerConstructor) 147 | } 148 | -------------------------------------------------------------------------------- /userdict.txt: -------------------------------------------------------------------------------- 1 | 云计算 5 2 | 李小福 2 nr 3 | 创新办 3 i 4 | easy_install 3 eng 5 | 好用 300 6 | 韩玉赏鉴 3 nz 7 | 八一双鹿 3 nz 8 | -------------------------------------------------------------------------------- /util/util.go: -------------------------------------------------------------------------------- 1 | // Package util contains some util functions used by jiebago.
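// Its only export is RegexpSplit, the re.split-style splitter the segmenter
// uses to carve sentences into Han and non-Han blocks.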
2 | package util 3 | 4 | import "regexp" 5 | 6 | /* 7 | RegexpSplit slices s into substrings separated by matches of the expression and 8 | returns a slice of the substrings between those expression matches. 9 | If capturing parentheses are used in the expression, then the text of all groups 10 | in the expression is also returned as part of the resulting slice. 11 | 12 | This function behaves consistently with Python's re.split function. 13 | */ 14 | func RegexpSplit(re *regexp.Regexp, s string, n int) []string { 15 | if n == 0 { 16 | return nil 17 | } 18 | 19 | if len(re.String()) > 0 && len(s) == 0 { 20 | return []string{""} 21 | } 22 | 23 | var matches [][]int 24 | if len(re.SubexpNames()) > 1 { 25 | matches = re.FindAllStringSubmatchIndex(s, n) 26 | } else { 27 | matches = re.FindAllStringIndex(s, n) 28 | } 29 | strings := make([]string, 0, len(matches)) 30 | 31 | beg := 0 32 | end := 0 33 | for _, match := range matches { 34 | if n > 0 && len(strings) >= n-1 { 35 | break 36 | } 37 | 38 | end = match[0] 39 | if match[1] != 0 { 40 | strings = append(strings, s[beg:end]) 41 | } 42 | beg = match[1] 43 | if len(re.SubexpNames()) > 1 { 44 | strings = append(strings, s[match[0]:match[1]]) 45 | } 46 | } 47 | 48 | if end != len(s) { 49 | strings = append(strings, s[beg:]) 50 | } 51 | 52 | return strings 53 | } 54 | -------------------------------------------------------------------------------- /util/util_test.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "regexp" 5 | "testing" 6 | ) 7 | 8 | func TestRegexpSplit(t *testing.T) { 9 | result := RegexpSplit(regexp.MustCompile(`\p{Han}+`), 10 | "BP神经网络如何训练才能在分类时增加区分度?", -1) 11 | if len(result) != 2 { 12 | t.Fatal(result) 13 | } 14 | result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`), 15 | "BP神经网络如何训练才能在分类时增加区分度?", -1) 16 | if len(result) != 3 { 17 | t.Fatal(result) 18 | } 19 | result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`), 20 | ",BP神经网络如何训练才能在分类时#增加区分度?", -1) 21 | if len(result) != 3 { 22 | t.Fatal(result) 23 | } 24 | } 25 | --------------------------------------------------------------------------------
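A short, self-contained sketch (a hypothetical program, not part of the repository) illustrating the re.split-compatible semantics that TestRegexpSplit exercises: with a capturing group, the separators themselves are kept in the result slice.

package main

import (
	"fmt"
	"regexp"

	"github.com/wangbin/jiebago/util"
)

func main() {
	s := "BP神经网络如何训练"

	// Non-capturing pattern: only the text between Han runs survives,
	// here "BP" plus the empty tail after the final match.
	fmt.Println(util.RegexpSplit(regexp.MustCompile(`\p{Han}+`), s, -1))

	// Capturing pattern: the matched Han run itself is kept as well,
	// mirroring Python's re.split with a capturing group:
	// "BP", "神经网络如何训练", and the trailing empty string.
	fmt.Println(util.RegexpSplit(regexp.MustCompile(`(\p{Han}+)`), s, -1))
}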