├── .travis.yml
├── README.md
├── analyse
│   ├── example_test.go
│   ├── idf.go
│   ├── idf.txt
│   ├── stop_words.txt
│   ├── stopwords.go
│   ├── tag_extracker.go
│   ├── tag_extracker_test.go
│   ├── textrank.go
│   └── textrank_test.go
├── dict.txt
├── dictionary.go
├── dictionary
│   ├── dictionary.go
│   ├── dictionary_test.go
│   └── token.go
├── example_parallel_cut_test.go
├── example_test.go
├── finalseg
│   ├── finalseg.go
│   ├── finalseg_test.go
│   ├── prob_emit.go
│   ├── prob_trans.go
│   └── viterbi.go
├── foobar.txt
├── jieba.go
├── jieba_test.go
├── posseg
│   ├── char_state_tab.go
│   ├── char_state_tab_test.go
│   ├── dictionary.go
│   ├── example_test.go
│   ├── posseg.go
│   ├── posseg_test.go
│   ├── prob_emit.go
│   ├── prob_start.go
│   ├── prob_trans.go
│   ├── viterbi.go
│   └── viterbi_test.go
├── tokenizers
│   ├── example_bleve_test.go
│   ├── example_test.go
│   ├── tokenizer.go
│   └── tokenizer_test.go
├── userdict.txt
└── util
    ├── util.go
    └── util_test.go
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 | go:
3 | - 1.4.2
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Jiebago: Jieba Chinese Word Segmentation in Go
2 |
3 |
4 | [Build Status](https://travis-ci.org/wangbin/jiebago) [GoDoc](https://godoc.org/github.com/wangbin/jiebago)
5 |
6 | [Jieba (结巴分词)](https://github.com/fxsjy/jieba) is a Chinese word segmentation component written in Python by [@fxsjy](https://github.com/fxsjy); Jiebago is its Golang implementation.
7 |
8 |
9 | ## Installation
10 |
11 | ```
12 | go get github.com/wangbin/jiebago/...
13 | ```
14 |
15 | ## Usage
16 |
17 | ```go
18 | package main
19 |
20 | import (
21 | "fmt"
22 |
23 | "github.com/wangbin/jiebago"
24 | )
25 |
26 | var seg jiebago.Segmenter
27 |
28 | func init() {
29 | seg.LoadDictionary("dict.txt")
30 | }
31 |
32 | func print(ch <-chan string) {
33 | for word := range ch {
34 | fmt.Printf(" %s /", word)
35 | }
36 | fmt.Println()
37 | }
38 |
39 | func Example() {
40 | fmt.Print("【全模式】:")
41 | print(seg.CutAll("我来到北京清华大学"))
42 |
43 | fmt.Print("【精确模式】:")
44 | print(seg.Cut("我来到北京清华大学", false))
45 |
46 | fmt.Print("【新词识别】:")
47 | print(seg.Cut("他来到了网易杭研大厦", true))
48 |
49 | fmt.Print("【搜索引擎模式】:")
50 | print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
51 | }
52 | ```
53 | Output:
54 |
55 | ```
56 | 【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
57 |
58 | 【精确模式】: 我 / 来到 / 北京 / 清华大学 /
59 |
60 | 【新词识别】: 他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
61 |
62 | 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
63 | ```
64 |
65 | For more information, see the [documentation](https://godoc.org/github.com/wangbin/jiebago).
66 |
67 | ## Segmentation Speed
68 |
69 | - 2MB / Second in Full Mode
70 | - 700KB / Second in Default Mode
71 | - Test environment: AMD Phenom(tm) II X6 1055T CPU @ 2.8GHz; test corpus: 《金庸全集》 (the complete works of Jin Yong)
72 |
73 | ## License
74 |
75 | MIT: http://wangbin.mit-license.org
76 |
--------------------------------------------------------------------------------
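Note on the API shown in the README: each of the `Cut`, `CutAll` and `CutForSearch` methods streams its words over a `<-chan string` instead of returning a slice. A minimal sketch of draining that channel into a slice (the helper name `cutToSlice` is illustrative, not part of jiebago's API):

```go
package main

import (
	"fmt"

	"github.com/wangbin/jiebago"
)

// cutToSlice drains a segmentation channel into a slice of words.
// The helper is illustrative; it is not part of jiebago's API.
func cutToSlice(ch <-chan string) []string {
	var words []string
	for word := range ch {
		words = append(words, word)
	}
	return words
}

func main() {
	var seg jiebago.Segmenter
	// Assumes dict.txt is in the working directory, as in the README example.
	if err := seg.LoadDictionary("dict.txt"); err != nil {
		panic(err)
	}
	fmt.Println(cutToSlice(seg.Cut("我来到北京清华大学", true)))
}
```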
/analyse/example_test.go:
--------------------------------------------------------------------------------
1 | package analyse_test
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/wangbin/jiebago/analyse"
7 | )
8 |
9 | func Example_extractTags() {
10 | var t analyse.TagExtracter
11 | t.LoadDictionary("../dict.txt")
12 | t.LoadIdf("idf.txt")
13 |
14 | sentence := "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"
15 | segments := t.ExtractTags(sentence, 5)
16 | fmt.Printf("Top %d tags:", len(segments))
17 | for _, segment := range segments {
18 | fmt.Printf(" %s /", segment.Text())
19 | }
20 | // Output:
21 | // Top 5 tags: Python / C++ / 伸手不见五指 / 孙悟空 / 黑夜 /
22 | }
23 |
24 | func Example_textRank() {
25 | var t analyse.TextRanker
26 | t.LoadDictionary("../dict.txt")
27 | sentence := "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
28 |
29 | result := t.TextRank(sentence, 10)
30 | for _, segment := range result {
31 | fmt.Printf("%s %f\n", segment.Text(), segment.Weight())
32 | }
33 | // Output:
34 | // 吉林 1.000000
35 | // 欧亚 0.878078
36 | // 置业 0.562048
37 | // 实现 0.520906
38 | // 收入 0.384284
39 | // 增资 0.360591
40 | // 子公司 0.353132
41 | // 城市 0.307509
42 | // 全资 0.306324
43 | // 商业 0.306138
44 | }
45 |
--------------------------------------------------------------------------------
/analyse/idf.go:
--------------------------------------------------------------------------------
1 | package analyse
2 |
3 | import (
4 | "sort"
5 | "sync"
6 |
7 | "github.com/wangbin/jiebago/dictionary"
8 | )
9 |
10 | // Idf represents a thread-safe dictionary for all words with their
11 | // IDFs (Inverse Document Frequencies).
12 | type Idf struct {
13 | freqMap map[string]float64
14 | median float64
15 | freqs []float64
16 | sync.RWMutex
17 | }
18 |
19 | // AddToken adds a new word with its IDF into the dictionary.
20 | func (i *Idf) AddToken(token dictionary.Token) {
21 | i.Lock()
22 | i.freqMap[token.Text()] = token.Frequency()
23 | i.freqs = append(i.freqs, token.Frequency())
24 | sort.Float64s(i.freqs)
25 | i.median = i.freqs[len(i.freqs)/2]
26 | i.Unlock()
27 | }
28 |
29 | // Load loads all tokens from the channel into the dictionary.
30 | func (i *Idf) Load(ch <-chan dictionary.Token) {
31 | i.Lock()
32 | for token := range ch {
33 | i.freqMap[token.Text()] = token.Frequency()
34 | i.freqs = append(i.freqs, token.Frequency())
35 | }
36 | sort.Float64s(i.freqs)
37 | i.median = i.freqs[len(i.freqs)/2]
38 | i.Unlock()
39 | }
40 |
41 | func (i *Idf) loadDictionary(fileName string) error {
42 | return dictionary.LoadDictionary(i, fileName)
43 | }
44 |
45 | // Frequency returns the IDF of given word.
46 | func (i *Idf) Frequency(key string) (float64, bool) {
47 | i.RLock()
48 | freq, ok := i.freqMap[key]
49 | i.RUnlock()
50 | return freq, ok
51 | }
52 |
53 | // NewIdf creates a new Idf instance.
54 | func NewIdf() *Idf {
55 | return &Idf{freqMap: make(map[string]float64), freqs: make([]float64, 0)}
56 | }
57 |
--------------------------------------------------------------------------------
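A quick sketch of using `Idf` on its own: `AddToken` keeps `freqs` sorted, so the median IDF is always up to date and can later serve as the fallback weight for words missing from the IDF table (the IDF values below are invented for illustration):

```go
package main

import (
	"fmt"

	"github.com/wangbin/jiebago/analyse"
	"github.com/wangbin/jiebago/dictionary"
)

func main() {
	// Sketch only; the IDF values here are invented for illustration.
	idf := analyse.NewIdf()
	idf.AddToken(dictionary.NewToken("清华大学", 9.5, ""))
	idf.AddToken(dictionary.NewToken("北京", 5.2, ""))
	if freq, ok := idf.Frequency("北京"); ok {
		fmt.Println(freq) // 5.2
	}
}
```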
/analyse/stop_words.txt:
--------------------------------------------------------------------------------
1 | the
2 | of
3 | is
4 | and
5 | to
6 | in
7 | that
8 | we
9 | for
10 | an
11 | are
12 | by
13 | be
14 | as
15 | on
16 | with
17 | can
18 | if
19 | from
20 | which
21 | you
22 | it
23 | this
24 | then
25 | at
26 | have
27 | all
28 | not
29 | one
30 | has
31 | or
32 | that
33 | 的
34 | 了
35 | 和
36 | 是
37 | 就
38 | 都
39 | 而
40 | 及
41 | 與
42 | 著
43 | 或
44 | 一個
45 | 沒有
46 | 我們
47 | 你們
48 | 妳們
49 | 他們
50 | 她們
51 | 是否
--------------------------------------------------------------------------------
/analyse/stopwords.go:
--------------------------------------------------------------------------------
1 | package analyse
2 |
3 | import (
4 | "sync"
5 |
6 | "github.com/wangbin/jiebago/dictionary"
7 | )
8 |
9 | // DefaultStopWordMap contains some stop words.
10 | var DefaultStopWordMap = map[string]int{
11 | "the": 1,
12 | "of": 1,
13 | "is": 1,
14 | "and": 1,
15 | "to": 1,
16 | "in": 1,
17 | "that": 1,
18 | "we": 1,
19 | "for": 1,
20 | "an": 1,
21 | "are": 1,
22 | "by": 1,
23 | "be": 1,
24 | "as": 1,
25 | "on": 1,
26 | "with": 1,
27 | "can": 1,
28 | "if": 1,
29 | "from": 1,
30 | "which": 1,
31 | "you": 1,
32 | "it": 1,
33 | "this": 1,
34 | "then": 1,
35 | "at": 1,
36 | "have": 1,
37 | "all": 1,
38 | "not": 1,
39 | "one": 1,
40 | "has": 1,
41 | "or": 1,
42 | }
43 |
44 | // StopWord is a thread-safe dictionary for all stop words.
45 | type StopWord struct {
46 | stopWordMap map[string]int
47 | sync.RWMutex
48 | }
49 |
50 | // AddToken adds a token into the StopWord dictionary.
51 | func (s *StopWord) AddToken(token dictionary.Token) {
52 | s.Lock()
53 | s.stopWordMap[token.Text()] = 1
54 | s.Unlock()
55 | }
56 |
57 | // NewStopWord creates a new StopWord with default stop words.
58 | func NewStopWord() *StopWord {
59 | s := new(StopWord)
60 | s.stopWordMap = DefaultStopWordMap
61 | return s
62 | }
63 |
64 | // IsStopWord checks whether a given word is a stop word.
65 | func (s *StopWord) IsStopWord(word string) bool {
66 | s.RLock()
67 | _, ok := s.stopWordMap[word]
68 | s.RUnlock()
69 | return ok
70 | }
71 |
72 | // Load loads all tokens from the given channel into the StopWord dictionary.
73 | func (s *StopWord) Load(ch <-chan dictionary.Token) {
74 | s.Lock()
75 | for token := range ch {
76 | s.stopWordMap[token.Text()] = 1
77 | }
78 | s.Unlock()
79 | }
80 |
81 | func (s *StopWord) loadDictionary(fileName string) error {
82 | return dictionary.LoadDictionary(s, fileName)
83 | }
84 |
--------------------------------------------------------------------------------
/analyse/tag_extracker.go:
--------------------------------------------------------------------------------
1 | // Package analyse is the Golang implementation of Jieba's analyse module.
2 | package analyse
3 |
4 | import (
5 | "sort"
6 | "strings"
7 | "unicode/utf8"
8 |
9 | "github.com/wangbin/jiebago"
10 | )
11 |
12 | // Segment represents a word with weight.
13 | type Segment struct {
14 | text string
15 | weight float64
16 | }
17 |
18 | // Text returns the segment's text.
19 | func (s Segment) Text() string {
20 | return s.text
21 | }
22 |
23 | // Weight returns the segment's weight.
24 | func (s Segment) Weight() float64 {
25 | return s.weight
26 | }
27 |
28 | // Segments represents a slice of Segment.
29 | type Segments []Segment
30 |
31 | func (ss Segments) Len() int {
32 | return len(ss)
33 | }
34 |
35 | func (ss Segments) Less(i, j int) bool {
36 | if ss[i].weight == ss[j].weight {
37 | return ss[i].text < ss[j].text
38 | }
39 |
40 | return ss[i].weight < ss[j].weight
41 | }
42 |
43 | func (ss Segments) Swap(i, j int) {
44 | ss[i], ss[j] = ss[j], ss[i]
45 | }
46 |
47 | // TagExtracter is used to extract tags from a sentence.
48 | type TagExtracter struct {
49 | seg *jiebago.Segmenter
50 | idf *Idf
51 | stopWord *StopWord
52 | }
53 |
54 | // LoadDictionary reads the given file and creates a new dictionary.
55 | func (t *TagExtracter) LoadDictionary(fileName string) error {
56 | t.stopWord = NewStopWord()
57 | t.seg = new(jiebago.Segmenter)
58 | return t.seg.LoadDictionary(fileName)
59 | }
60 |
61 | // LoadIdf reads the given file and creates a new Idf dictionary.
62 | func (t *TagExtracter) LoadIdf(fileName string) error {
63 | t.idf = NewIdf()
64 | return t.idf.loadDictionary(fileName)
65 | }
66 |
67 | // LoadStopWords reads the given file and creates a new StopWord dictionary.
68 | func (t *TagExtracter) LoadStopWords(fileName string) error {
69 | t.stopWord = NewStopWord()
70 | return t.stopWord.loadDictionary(fileName)
71 | }
72 |
73 | // ExtractTags extracts the topK keywords from a sentence.
74 | func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
75 | freqMap := make(map[string]float64)
76 |
77 | for w := range t.seg.Cut(sentence, true) {
78 | w = strings.TrimSpace(w)
79 | if utf8.RuneCountInString(w) < 2 {
80 | continue
81 | }
82 | if t.stopWord.IsStopWord(w) {
83 | continue
84 | }
85 | if f, ok := freqMap[w]; ok {
86 | freqMap[w] = f + 1.0
87 | } else {
88 | freqMap[w] = 1.0
89 | }
90 | }
91 | total := 0.0
92 | for _, freq := range freqMap {
93 | total += freq
94 | }
95 | for k, v := range freqMap {
96 | freqMap[k] = v / total
97 | }
98 | ws := make(Segments, 0)
99 | var s Segment
100 | for k, v := range freqMap {
101 | if freq, ok := t.idf.Frequency(k); ok {
102 | s = Segment{text: k, weight: freq * v}
103 | } else {
104 | s = Segment{text: k, weight: t.idf.median * v}
105 | }
106 | ws = append(ws, s)
107 | }
108 | sort.Sort(sort.Reverse(ws))
109 | if len(ws) > topK {
110 | tags = ws[:topK]
111 | } else {
112 | tags = ws
113 | }
114 | return tags
115 | }
116 |
--------------------------------------------------------------------------------
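For reference, the weighting `ExtractTags` computes above is plain TF-IDF. With tf(w) being the word's in-sentence frequency normalized by the total count, each candidate's weight is:

```
weight(w) = tf(w) × IDF(w)          if w is in the IDF dictionary
weight(w) = tf(w) × median(IDF)     otherwise
```

Words shorter than two runes and stop words are dropped before weighting, and the top `topK` segments are returned in descending weight order.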
/analyse/tag_extracker_test.go:
--------------------------------------------------------------------------------
1 | package analyse
2 |
3 | import (
4 | "math"
5 | "testing"
6 | )
7 |
8 | var (
9 | testContents = []string{
10 | "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
11 | "我不喜欢日本和服。",
12 | "雷猴回归人间。",
13 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
14 | "我需要廉租房",
15 | "永和服装饰品有限公司",
16 | "我爱北京天安门",
17 | "abc",
18 | "隐马尔可夫",
19 | "雷猴是个好网站",
20 | "“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成",
21 | "草泥马和欺实马是今年的流行词汇",
22 | "伊藤洋华堂总府店",
23 | "中国科学院计算技术研究所",
24 | "罗密欧与朱丽叶",
25 | "我购买了道具和服装",
26 | "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍",
27 | "湖北省石首市",
28 | "湖北省十堰市",
29 | "总经理完成了这件事情",
30 | "电脑修好了",
31 | "做好了这件事情就一了百了了",
32 | "人们审美的观点是不同的",
33 | "我们买了一个美的空调",
34 | "线程初始化时我们要注意",
35 | "一个分子是由好多原子组织成的",
36 | "祝你马到功成",
37 | "他掉进了无底洞里",
38 | "中国的首都是北京",
39 | "孙君意",
40 | "外交部发言人马朝旭",
41 | "领导人会议和第四届东亚峰会",
42 | "在过去的这五年",
43 | "还需要很长的路要走",
44 | "60周年首都阅兵",
45 | "你好人们审美的观点是不同的",
46 | "买水果然后来世博园",
47 | "买水果然后去世博园",
48 | "但是后来我才知道你是对的",
49 | "存在即合理",
50 | "的的的的的在的的的的就以和和和",
51 | "I love你,不以为耻,反以为rong",
52 | "因",
53 | "",
54 | "hello你好人们审美的观点是不同的",
55 | "很好但主要是基于网页形式",
56 | "hello你好人们审美的观点是不同的",
57 | "为什么我不能拥有想要的生活",
58 | "后来我才",
59 | "此次来中国是为了",
60 | "使用了它就可以解决一些问题",
61 | ",使用了它就可以解决一些问题",
62 | "其实使用了它就可以解决一些问题",
63 | "好人使用了它就可以解决一些问题",
64 | "是因为和国家",
65 | "老年搜索还支持",
66 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ",
67 | "大",
68 | "",
69 | "他说的确实在理",
70 | "长春市长春节讲话",
71 | "结婚的和尚未结婚的",
72 | "结合成分子时",
73 | "旅游和服务是最好的",
74 | "这件事情的确是我的错",
75 | "供大家参考指正",
76 | "哈尔滨政府公布塌桥原因",
77 | "我在机场入口处",
78 | "邢永臣摄影报道",
79 | "BP神经网络如何训练才能在分类时增加区分度?",
80 | "南京市长江大桥",
81 | "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究",
82 | "长春市长春药店",
83 | "邓颖超生前最喜欢的衣服",
84 | "胡锦涛是热爱世界和平的政治局常委",
85 | "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪",
86 | "一次性交多少钱",
87 | "两块五一套,三块八一斤,四块七一本,五块六一条",
88 | "小和尚留了一个像大和尚一样的和尚头",
89 | "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站",
90 | "张晓梅去人民医院做了个B超然后去买了件T恤",
91 | "AT&T是一件不错的公司,给你发offer了吗?",
92 | "C++和c#是什么关系?11+122=133,是吗?PI=3.14159",
93 | "你认识那个和主席握手的的哥吗?他开一辆黑色的士。",
94 | "枪杆子中出政权"}
95 |
96 | Tags = [][]string{
97 | []string{"Python", "C++", "伸手不见五指", "孙悟空", "黑夜", "北京", "这是", "一个"},
98 | []string{"和服", "喜欢", "日本"},
99 | []string{"雷猴", "人间", "回归"},
100 | []string{"工信处", "女干事", "24", "交换机", "科室", "亲口", "器件", "技术性", "下属", "交代", "每月", "安装", "经过", "工作"},
101 | []string{"廉租房", "需要"},
102 | []string{"饰品", "永和", "服装", "有限公司"},
103 | []string{"天安门", "北京"},
104 | []string{"abc"},
105 | []string{"马尔可夫"},
106 | []string{"雷猴", "网站"},
107 | []string{"SOFTware", "Microsoft", "MICROcomputer", "微型", "一词", "软件", "计算机", "组成", "部分"},
108 | []string{"草泥马", "欺实", "词汇", "流行", "今年"},
109 | []string{"洋华堂", "总府", "伊藤"},
110 | []string{"中国科学院计算技术研究所"},
111 | []string{"朱丽叶", "罗密欧"},
112 | []string{"道具", "服装", "购买"},
113 | []string{"自珍", "敞帚", "PS", "开源", "不断改进", "敦促", "好处", "避免", "能够", "觉得", "就是", "自己", "一个"},
114 | []string{"石首市", "湖北省"},
115 | []string{"十堰市", "湖北省"},
116 | []string{"总经理", "这件", "完成", "事情"},
117 | []string{"修好", "电脑"},
118 | []string{"一了百了", "做好", "这件", "事情"},
119 | []string{"审美", "观点", "人们", "不同"},
120 | []string{"美的", "空调", "我们", "一个"},
121 | []string{"线程", "初始化", "注意", "我们"},
122 | []string{"好多", "原子", "分子", "组织", "一个"},
123 | []string{"马到功成"},
124 | []string{"无底洞"},
125 | []string{"首都", "北京", "中国"},
126 | []string{"孙君意"},
127 | []string{"马朝旭", "外交部", "发言人"},
128 | []string{"第四届", "东亚", "峰会", "领导人", "会议"},
129 | []string{"五年", "过去"},
130 | []string{"很长", "需要"},
131 | []string{"60", "阅兵", "周年", "首都"},
132 | []string{"审美", "你好", "观点", "人们", "不同"},
133 | []string{"世博园", "水果", "然后"},
134 | []string{"世博园", "水果", "然后"},
135 | []string{"后来", "但是", "知道"},
136 | []string{"合理", "存在"},
137 | []string{},
138 | []string{"rong", "love", "不以为耻", "以为"},
139 | []string{},
140 | []string{},
141 | []string{"hello", "审美", "你好", "观点", "人们", "不同"},
142 | []string{"网页", "基于", "形式", "主要"},
143 | []string{"hello", "审美", "你好", "观点", "人们", "不同"},
144 | []string{"想要", "拥有", "为什么", "生活", "不能"},
145 | []string{"后来"},
146 | []string{"此次", "为了", "中国"},
147 | []string{"解决", "使用", "一些", "问题", "可以"},
148 | []string{"解决", "使用", "一些", "问题", "可以"},
149 | []string{"解决", "其实", "使用", "一些", "问题", "可以"},
150 | []string{"好人", "解决", "使用", "一些", "问题", "可以"},
151 | []string{"是因为", "国家"},
152 | []string{"老年", "搜索", "支持"},
153 | []string{"闲法", "中本", "laoshipukong", "RT", "27", "责任法", "蒙人", "万劫不复", "举证", "倒置", "医患", "那部", "拉倒", "侵权", "全国人大常委会", "草案", "境地", "纠纷", "删除", "弱势"},
154 | []string{},
155 | []string{},
156 | []string{"在理", "确实"},
157 | []string{"长春", "春节", "讲话", "市长"},
158 | []string{"结婚", "尚未"},
159 | []string{"分子", "结合"},
160 | []string{"旅游", "最好", "服务"},
161 | []string{"的确", "这件", "事情"},
162 | []string{"指正", "参考", "大家"},
163 | []string{"塌桥", "哈尔滨", "公布", "原因", "政府"},
164 | []string{"入口处", "机场"},
165 | []string{"邢永臣", "摄影", "报道"},
166 | []string{"区分度", "BP", "神经网络", "训练", "分类", "才能", "如何", "增加"},
167 | []string{"长江大桥", "南京市"},
168 | []string{"SMT", "NiuTrans", "使用者", "便于", "用于", "建议", "利用", "为了", "研究", "一些"},
169 | []string{"长春市", "药店", "长春"},
170 | []string{"邓颖超", "生前", "衣服", "喜欢"},
171 | []string{"政治局", "热爱", "常委", "胡锦涛", "和平", "世界"},
172 | []string{"右面", "孙健", "范凯", "李松洪", "朱会震", "海林", "左面", "程序员", "再往"},
173 | []string{"一次性", "多少"},
174 | []string{"四块", "五块", "三块", "一斤", "两块", "一本", "一套", "一条"},
175 | []string{"和尚", "和尚头", "一样", "一个"},
176 | []string{"和平门", "共和党", "地铁", "党员", "公民", "爸爸", "中华人民共和国"},
177 | []string{"张晓梅", "T恤", "B超", "医院", "人民", "然后"},
178 | []string{"offer", "AT&T", "不错", "一件", "公司"},
179 | []string{"c#", "PI", "C++", "3.14159", "133", "122", "11", "关系", "什么"},
180 | []string{"的士", "的哥", "他开", "握手", "一辆", "黑色", "主席", "认识", "那个"},
181 | []string{"枪杆子", "政权"},
182 | }
183 |
184 | Lyric = `
185 | 我沒有心
186 | 我沒有真實的自我
187 | 我只有消瘦的臉孔
188 | 所謂軟弱
189 | 所謂的順從一向是我
190 | 的座右銘
191 |
192 | 而我
193 | 沒有那海洋的寬闊
194 | 我只要熱情的撫摸
195 | 所謂空洞
196 | 所謂不安全感是我
197 | 的墓誌銘
198 |
199 | 而你
200 | 是否和我一般怯懦
201 | 是否和我一般矯作
202 | 和我一般囉唆
203 |
204 | 而你
205 | 是否和我一般退縮
206 | 是否和我一般肌迫
207 | 一般地困惑
208 |
209 | 我沒有力
210 | 我沒有滿腔的熱火
211 | 我只有滿肚的如果
212 | 所謂勇氣
213 | 所謂的認同感是我
214 | 隨便說說
215 |
216 | 而你
217 | 是否和我一般怯懦
218 | 是否和我一般矯作
219 | 是否對你來說
220 | 只是一場遊戲
221 | 雖然沒有把握
222 |
223 | 而你
224 | 是否和我一般退縮
225 | 是否和我一般肌迫
226 | 是否對你來說
227 | 只是逼不得已
228 | 雖然沒有藉口
229 | `
230 | LyciWeight = Segments{
231 | Segment{text: "所謂", weight: 1.010262},
232 | Segment{text: "是否", weight: 0.738650},
233 | Segment{text: "一般", weight: 0.607600},
234 | Segment{text: "雖然", weight: 0.336754},
235 | Segment{text: "退縮", weight: 0.336754},
236 | Segment{text: "肌迫", weight: 0.336754},
237 | Segment{text: "矯作", weight: 0.336754},
238 | Segment{text: "沒有", weight: 0.336754},
239 | Segment{text: "怯懦", weight: 0.271099},
240 | Segment{text: "隨便", weight: 0.168377},
241 | }
242 |
243 | LyciWeight2 = Segments{
244 | Segment{text: "所謂", weight: 1.215739},
245 | Segment{text: "一般", weight: 0.731179},
246 | Segment{text: "雖然", weight: 0.405246},
247 | Segment{text: "退縮", weight: 0.405246},
248 | Segment{text: "肌迫", weight: 0.405246},
249 | Segment{text: "矯作", weight: 0.405246},
250 | Segment{text: "怯懦", weight: 0.326238},
251 | Segment{text: "逼不得已", weight: 0.202623},
252 | Segment{text: "右銘", weight: 0.202623},
253 | Segment{text: "寬闊", weight: 0.202623},
254 | }
255 | )
256 |
257 | func TestExtractTags(t *testing.T) {
258 | var te TagExtracter
259 | te.LoadDictionary("../dict.txt")
260 | te.LoadIdf("idf.txt")
261 |
262 | for index, sentence := range testContents {
263 | result := te.ExtractTags(sentence, 20)
264 | if len(result) != len(Tags[index]) {
265 | t.Fatalf("%s = %v", sentence, result)
266 | }
267 | for i, tag := range result {
268 | if tag.text != Tags[index][i] {
269 | t.Fatalf("%s != %s", tag, Tags[index][i])
270 | }
271 | }
272 | }
273 | }
274 |
275 | func TestExtratTagsWithWeight(t *testing.T) {
276 | var te TagExtracter
277 | te.LoadDictionary("../dict.txt")
278 | te.LoadIdf("idf.txt")
279 | result := te.ExtractTags(Lyric, 10)
280 | for index, tag := range result {
281 | if LyciWeight[index].text != tag.text ||
282 | math.Abs(LyciWeight[index].weight-tag.weight) > 1e-6 {
283 | t.Fatalf("%v != %v", tag, LyciWeight[index])
284 | }
285 | }
286 | }
287 |
288 | func TestExtractTagsWithStopWordsFile(t *testing.T) {
289 | var te TagExtracter
290 | te.LoadDictionary("../dict.txt")
291 | te.LoadIdf("idf.txt")
292 | te.LoadStopWords("stop_words.txt")
293 | result := te.ExtractTags(Lyric, 7)
294 | for index, tag := range result {
295 | if LyciWeight2[index].text != tag.text ||
296 | math.Abs(LyciWeight2[index].weight-tag.weight) > 1e-6 {
297 | t.Fatalf("%v != %v", tag, LyciWeight2[index])
298 | }
299 | }
300 | }
301 |
--------------------------------------------------------------------------------
/analyse/textrank.go:
--------------------------------------------------------------------------------
1 | package analyse
2 |
3 | import (
4 | "math"
5 | "sort"
6 |
7 | "github.com/wangbin/jiebago/posseg"
8 | )
9 |
10 | const dampingFactor = 0.85
11 |
12 | var (
13 | defaultAllowPOS = []string{"ns", "n", "vn", "v"}
14 | )
15 |
16 | type edge struct {
17 | start string
18 | end string
19 | weight float64
20 | }
21 |
22 | type edges []edge
23 |
24 | func (es edges) Len() int {
25 | return len(es)
26 | }
27 |
28 | func (es edges) Less(i, j int) bool {
29 | return es[i].weight < es[j].weight
30 | }
31 |
32 | func (es edges) Swap(i, j int) {
33 | es[i], es[j] = es[j], es[i]
34 | }
35 |
36 | type undirectWeightedGraph struct {
37 | graph map[string]edges
38 | keys sort.StringSlice
39 | }
40 |
41 | func newUndirectWeightedGraph() *undirectWeightedGraph {
42 | u := new(undirectWeightedGraph)
43 | u.graph = make(map[string]edges)
44 | u.keys = make(sort.StringSlice, 0)
45 | return u
46 | }
47 |
48 | func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
49 | if _, ok := u.graph[start]; !ok {
50 | u.keys = append(u.keys, start)
51 | u.graph[start] = edges{edge{start: start, end: end, weight: weight}}
52 | } else {
53 | u.graph[start] = append(u.graph[start], edge{start: start, end: end, weight: weight})
54 | }
55 |
56 | if _, ok := u.graph[end]; !ok {
57 | u.keys = append(u.keys, end)
58 | u.graph[end] = edges{edge{start: end, end: start, weight: weight}}
59 | } else {
60 | u.graph[end] = append(u.graph[end], edge{start: end, end: start, weight: weight})
61 | }
62 | }
63 |
64 | func (u *undirectWeightedGraph) rank() Segments {
65 | if !sort.IsSorted(u.keys) {
66 | sort.Sort(u.keys)
67 | }
68 |
69 | ws := make(map[string]float64)
70 | outSum := make(map[string]float64)
71 |
72 | wsdef := 1.0
73 | if len(u.graph) > 0 {
74 | wsdef /= float64(len(u.graph))
75 | }
76 | for n, out := range u.graph {
77 | ws[n] = wsdef
78 | sum := 0.0
79 | for _, e := range out {
80 | sum += e.weight
81 | }
82 | outSum[n] = sum
83 | }
84 |
85 | for x := 0; x < 10; x++ {
86 | for _, n := range u.keys {
87 | s := 0.0
88 | inedges := u.graph[n]
89 | for _, e := range inedges {
90 | s += e.weight / outSum[e.end] * ws[e.end]
91 | }
92 | ws[n] = (1 - dampingFactor) + dampingFactor*s
93 | }
94 | }
95 | minRank := math.MaxFloat64
96 | maxRank := math.SmallestNonzeroFloat64
97 | for _, w := range ws {
98 | if w < minRank {
99 | minRank = w
100 | } else if w > maxRank {
101 | maxRank = w
102 | }
103 | }
104 | result := make(Segments, 0)
105 | for n, w := range ws {
106 | result = append(result, Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)})
107 | }
108 | sort.Sort(sort.Reverse(result))
109 | return result
110 | }
111 |
112 | // TextRankWithPOS extracts keywords from a sentence using the TextRank algorithm.
113 | // Parameter allowPOS specifies a customized POS filter list.
114 | func (t *TextRanker) TextRankWithPOS(sentence string, topK int, allowPOS []string) Segments {
115 | posFilt := make(map[string]int)
116 | for _, pos := range allowPOS {
117 | posFilt[pos] = 1
118 | }
119 | g := newUndirectWeightedGraph()
120 | cm := make(map[[2]string]float64)
121 | span := 5
122 | var pairs []posseg.Segment
123 | for pair := range t.seg.Cut(sentence, true) {
124 | pairs = append(pairs, pair)
125 | }
126 | for i := range pairs {
127 | if _, ok := posFilt[pairs[i].Pos()]; ok {
128 | for j := i + 1; j < i+span && j < len(pairs); j++ {
129 | if _, ok := posFilt[pairs[j].Pos()]; !ok {
130 | continue
131 | }
132 | if _, ok := cm[[2]string{pairs[i].Text(), pairs[j].Text()}]; !ok {
133 | cm[[2]string{pairs[i].Text(), pairs[j].Text()}] = 1.0
134 | } else {
135 | cm[[2]string{pairs[i].Text(), pairs[j].Text()}] += 1.0
136 | }
137 | }
138 | }
139 | }
140 | for startEnd, weight := range cm {
141 | g.addEdge(startEnd[0], startEnd[1], weight)
142 | }
143 | tags := g.rank()
144 | if topK > 0 && len(tags) > topK {
145 | tags = tags[:topK]
146 | }
147 | return tags
148 | }
149 |
150 | // TextRank extracts keywords from a sentence using the TextRank algorithm.
151 | // Parameter topK specifies the maximum number of keywords to return.
152 | func (t *TextRanker) TextRank(sentence string, topK int) Segments {
153 | return t.TextRankWithPOS(sentence, topK, defaultAllowPOS)
154 | }
155 |
156 | // TextRanker is used to extract tags from a sentence.
157 | type TextRanker struct {
158 | seg *posseg.Segmenter
159 | }
160 |
161 | // LoadDictionary reads the given file and creates a new dictionary for TextRanker.
162 | func (t *TextRanker) LoadDictionary(fileName string) error {
163 | t.seg = new(posseg.Segmenter)
164 | return t.seg.LoadDictionary(fileName)
165 | }
166 |
--------------------------------------------------------------------------------
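For reference, the fixed ten-iteration loop in `rank` above is the standard TextRank update with damping factor d = 0.85:

```
WS(v) = (1 - d) + d × Σ over u adjacent to v of ( w(u,v) / outSum(u) ) × WS(u)
```

followed by a min/max-style rescaling of the scores, which is why the top-ranked word comes out with weight exactly 1.0 in the test expectations below.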
/analyse/textrank_test.go:
--------------------------------------------------------------------------------
1 | package analyse
2 |
3 | import (
4 | "math"
5 | "testing"
6 | )
7 |
8 | var (
9 | sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
10 |
11 | tagRanks = Segments{
12 | Segment{text: "吉林", weight: 1.0},
13 | Segment{text: "欧亚", weight: 0.87807810644},
14 | Segment{text: "置业", weight: 0.562048250306},
15 | Segment{text: "实现", weight: 0.520905743929},
16 | Segment{text: "收入", weight: 0.384283870648},
17 | Segment{text: "增资", weight: 0.360590945312},
18 | Segment{text: "子公司", weight: 0.353131980904},
19 | Segment{text: "城市", weight: 0.307509449283},
20 | Segment{text: "全资", weight: 0.306324426665},
21 | Segment{text: "商业", weight: 0.306138241063},
22 | }
23 | )
24 |
25 | func TestTextRank(t *testing.T) {
26 | var tr TextRanker
27 | tr.LoadDictionary("../dict.txt")
28 | results := tr.TextRank(sentence, 10)
29 | for index, tw := range results {
30 | if tw.text != tagRanks[index].text || math.Abs(tw.weight-tagRanks[index].weight) > 1e-6 {
31 | t.Fatalf("%v != %v", tw, tagRanks[index])
32 | }
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/dictionary.go:
--------------------------------------------------------------------------------
1 | package jiebago
2 |
3 | import (
4 | "math"
5 | "sync"
6 |
7 | "github.com/wangbin/jiebago/dictionary"
8 | )
9 |
10 | // A Dictionary represents a thread-safe dictionary used for word segmentation.
11 | type Dictionary struct {
12 | total, logTotal float64
13 | freqMap map[string]float64
14 | sync.RWMutex
15 | }
16 |
17 | // Load loads all tokens from the given channel.
18 | func (d *Dictionary) Load(ch <-chan dictionary.Token) {
19 | d.Lock()
20 | for token := range ch {
21 | d.addToken(token)
22 | }
23 | d.Unlock()
24 | d.updateLogTotal()
25 | }
26 |
27 | // AddToken adds one token
28 | func (d *Dictionary) AddToken(token dictionary.Token) {
29 | d.Lock()
30 | d.addToken(token)
31 | d.Unlock()
32 | d.updateLogTotal()
33 | }
34 |
35 | func (d *Dictionary) addToken(token dictionary.Token) {
36 | d.freqMap[token.Text()] = token.Frequency()
37 | d.total += token.Frequency()
38 | runes := []rune(token.Text())
39 | n := len(runes)
40 | for i := 0; i < n; i++ { //TODO: n-1?
41 | frag := string(runes[:i+1])
42 | if _, ok := d.freqMap[frag]; !ok {
43 | d.freqMap[frag] = 0.0
44 | }
45 | }
46 | }
47 |
48 | func (d *Dictionary) updateLogTotal() {
49 | d.logTotal = math.Log(d.total)
50 | }
51 |
52 | // Frequency returns the frequency and existence of the given word.
53 | func (d *Dictionary) Frequency(key string) (float64, bool) {
54 | d.RLock()
55 | freq, ok := d.freqMap[key]
56 | d.RUnlock()
57 | return freq, ok
58 | }
59 |
60 | func (d *Dictionary) loadDictionary(fileName string) error {
61 | return dictionary.LoadDictionary(d, fileName)
62 | }
63 |
--------------------------------------------------------------------------------
/dictionary/dictionary.go:
--------------------------------------------------------------------------------
1 | // Package dictionary contains an interface and wraps all IO-related work.
2 | // It is used by the jiebago module to read/write files.
3 | package dictionary
4 |
5 | import (
6 | "bufio"
7 | "os"
8 | "path/filepath"
9 | "strconv"
10 | "strings"
11 | )
12 |
13 | // DictLoader is the interface that can add one token or load
14 | // tokens from a channel.
15 | type DictLoader interface {
16 | Load(<-chan Token)
17 | AddToken(Token)
18 | }
19 |
20 | func loadDictionary(file *os.File) (<-chan Token, <-chan error) {
21 | tokenCh, errCh := make(chan Token), make(chan error)
22 |
23 | go func() {
24 | defer close(tokenCh)
25 | defer close(errCh)
26 | scanner := bufio.NewScanner(file)
27 | var token Token
28 | var line string
29 | var fields []string
30 | var err error
31 | for scanner.Scan() {
32 | line = scanner.Text()
33 | fields = strings.Split(line, " ")
34 | token.text = strings.TrimSpace(strings.Replace(fields[0], "\ufeff", "", 1))
35 | if length := len(fields); length > 1 {
36 | token.frequency, err = strconv.ParseFloat(fields[1], 64)
37 | if err != nil {
38 | errCh <- err
39 | return
40 | }
41 | if length > 2 {
42 | token.pos = strings.TrimSpace(fields[2])
43 | }
44 | }
45 | tokenCh <- token
46 | }
47 |
48 | if err = scanner.Err(); err != nil {
49 | errCh <- err
50 | }
51 | }()
52 | return tokenCh, errCh
53 |
54 | }
55 |
56 | // LoadDictionary reads the given file and passes all tokens to a DictLoader.
57 | func LoadDictionary(dl DictLoader, fileName string) error {
58 | filePath, err := dictPath(fileName)
59 | if err != nil {
60 | return err
61 | }
62 | dictFile, err := os.Open(filePath)
63 | if err != nil {
64 | return err
65 | }
66 | defer dictFile.Close()
67 | tokenCh, errCh := loadDictionary(dictFile)
68 | dl.Load(tokenCh)
69 |
70 | return <-errCh
71 |
72 | }
73 |
74 | func dictPath(dictFileName string) (string, error) {
75 | if filepath.IsAbs(dictFileName) {
76 | return dictFileName, nil
77 | }
78 | var dictFilePath string
79 | cwd, err := os.Getwd()
80 | if err != nil {
81 | return dictFilePath, err
82 | }
83 | dictFilePath = filepath.Clean(filepath.Join(cwd, dictFileName))
84 | return dictFilePath, nil
85 | }
86 |
--------------------------------------------------------------------------------
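The format `loadDictionary` parses above is one token per line, space-separated: the word text, then an optional frequency, then an optional POS tag. For example (the first line is hypothetical; the second is the repository's `foobar.txt` verbatim):

```
云计算 5 n
好人 12 n
```

Any UTF-8 BOM is stripped from the word text, and a malformed frequency field aborts loading with the parse error.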
/dictionary/dictionary_test.go:
--------------------------------------------------------------------------------
1 | package dictionary
2 |
3 | import (
4 | "sync"
5 | "testing"
6 | )
7 |
8 | type Dict struct {
9 | freqMap map[string]float64
10 | posMap map[string]string
11 | sync.RWMutex
12 | }
13 |
14 | func (d *Dict) Load(ch <-chan Token) {
15 | d.Lock()
16 | for token := range ch {
17 | d.freqMap[token.Text()] = token.Frequency()
18 | if len(token.Pos()) > 0 {
19 | d.posMap[token.Text()] = token.Pos()
20 | }
21 | }
22 | d.Unlock()
23 | }
24 |
25 | func (d *Dict) AddToken(token Token) {
26 | d.Lock()
27 | d.freqMap[token.Text()] = token.Frequency()
28 | if len(token.Pos()) > 0 {
29 | d.posMap[token.Text()] = token.Pos()
30 | }
31 | d.Unlock()
32 | }
33 |
34 | func TestLoadDictionary(t *testing.T) {
35 | d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
36 | err := LoadDictionary(d, "../userdict.txt")
37 | if err != nil {
38 | t.Fatalf(err.Error())
39 | }
40 | if len(d.freqMap) != 7 {
41 | t.Fatalf("Failed to load userdict.txt, got %d tokens with frequency, expected 7",
42 | len(d.freqMap))
43 | }
44 | if len(d.posMap) != 6 {
45 | t.Fatalf("Failed to load userdict.txt, got %d tokens with pos, expected 6", len(d.posMap))
46 | }
47 | }
48 |
49 | func TestAddToken(t *testing.T) {
50 | d := &Dict{freqMap: make(map[string]float64), posMap: make(map[string]string)}
51 | LoadDictionary(d, "../userdict.txt")
52 | d.AddToken(Token{"好用", 99, "a"})
53 | if d.freqMap["好用"] != 99 {
54 | t.Fatalf("Failed to add token, got frequency %f, expected 99", d.freqMap["好用"])
55 | }
56 | if d.posMap["好用"] != "a" {
57 | t.Fatalf("Failed to add token, got pos %s, expected \"a\"", d.posMap["好用"])
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/dictionary/token.go:
--------------------------------------------------------------------------------
1 | package dictionary
2 |
3 | // Token represents a Chinese word with (optional) frequency and POS.
4 | type Token struct {
5 | text string
6 | frequency float64
7 | pos string
8 | }
9 |
10 | // Text returns the token's text.
11 | func (t Token) Text() string {
12 | return t.text
13 | }
14 |
15 | // Frequency returns token's frequency.
16 | func (t Token) Frequency() float64 {
17 | return t.frequency
18 | }
19 |
20 | // Pos returns token's POS.
21 | func (t Token) Pos() string {
22 | return t.pos
23 | }
24 |
25 | // NewToken creates a new token.
26 | func NewToken(text string, frequency float64, pos string) Token {
27 | return Token{text: text, frequency: frequency, pos: pos}
28 | }
29 |
--------------------------------------------------------------------------------
/example_parallel_cut_test.go:
--------------------------------------------------------------------------------
1 | package jiebago_test
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "log"
7 | "os"
8 | "runtime"
9 | "strings"
10 | "time"
11 |
12 | "github.com/wangbin/jiebago"
13 | )
14 |
15 | type line struct {
16 | number int
17 | text string
18 | }
19 |
20 | var (
21 | segmenter = jiebago.Segmenter{}
22 | numThreads = runtime.NumCPU()
23 | task = make(chan line, numThreads)
24 | result = make(chan line, numThreads)
25 | )
26 |
27 | func worker() {
28 | for l := range task {
29 | var segments []string
30 | for segment := range segmenter.Cut(l.text, true) {
31 | segments = append(segments, segment)
32 | }
33 |
34 | l.text = fmt.Sprintf("%s\n", strings.Join(segments, " / "))
35 | result <- l
36 | }
37 | }
38 |
39 | func Example_parallelCut() {
40 | // Set the number of goroutines
41 | runtime.GOMAXPROCS(numThreads)
42 |
43 | // Load dictionary
44 | segmenter.LoadDictionary("dict.txt")
45 |
46 | // open file for segmentation
47 | file, err := os.Open("README.md")
48 | if err != nil {
49 | log.Fatal(err)
50 | }
51 | defer file.Close()
52 |
53 | // start worker routines
54 | for i := 0; i < numThreads; i++ {
55 | go worker()
56 | }
57 |
58 | var length, size int
59 | scanner := bufio.NewScanner(file)
60 |
61 | t0 := time.Now()
62 |
63 | lines := make([]string, 0)
64 |
65 | // Read lines
66 | for scanner.Scan() {
67 | t := scanner.Text()
68 | size += len(t)
69 | lines = append(lines, t)
70 | }
71 | length = len(lines)
72 |
73 | // Segmentation
74 | go func() {
75 | for i := 0; i < length; i++ {
76 | task <- line{number: i, text: lines[i]}
77 | }
78 | close(task)
79 | }()
80 |
81 | // Make sure the segmentation result contains the same lines as the original file
82 | for i := 0; i < length; i++ {
83 | l := <-result
84 | lines[l.number] = l.text
85 | }
86 |
87 | t1 := time.Now()
88 |
89 | // Write the segments into a file for verification
90 | outputFile, _ := os.OpenFile("parallelCut.log", os.O_CREATE|os.O_WRONLY, 0600)
91 | defer outputFile.Close()
92 | writer := bufio.NewWriter(outputFile)
93 | for _, l := range lines {
94 | writer.WriteString(l)
95 | }
96 | writer.Flush()
97 |
98 | log.Printf("Time cousumed: %v", t1.Sub(t0))
99 | log.Printf("Segmentation speed: %f MB/s", float64(size)/t1.Sub(t0).Seconds()/(1024*1024))
100 | }
101 |
--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
1 | package jiebago_test
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/wangbin/jiebago"
7 | )
8 |
9 | func Example() {
10 | var seg jiebago.Segmenter
11 | seg.LoadDictionary("dict.txt")
12 |
13 | print := func(ch <-chan string) {
14 | for word := range ch {
15 | fmt.Printf(" %s /", word)
16 | }
17 | fmt.Println()
18 | }
19 |
20 | fmt.Print("【全模式】:")
21 | print(seg.CutAll("我来到北京清华大学"))
22 |
23 | fmt.Print("【精确模式】:")
24 | print(seg.Cut("我来到北京清华大学", false))
25 |
26 | fmt.Print("【新词识别】:")
27 | print(seg.Cut("他来到了网易杭研大厦", true))
28 |
29 | fmt.Print("【搜索引擎模式】:")
30 | print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))
31 | // Output:
32 | // 【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
33 | // 【精确模式】: 我 / 来到 / 北京 / 清华大学 /
34 | // 【新词识别】: 他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
35 | // 【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /
36 | }
37 |
38 | func Example_suggestFrequency() {
39 | var seg jiebago.Segmenter
40 | seg.LoadDictionary("dict.txt")
41 |
42 | print := func(ch <-chan string) {
43 | for word := range ch {
44 | fmt.Printf(" %s /", word)
45 | }
46 | fmt.Println()
47 | }
48 | sentence := "超敏C反应蛋白是什么?"
49 | fmt.Print("Before:")
50 | print(seg.Cut(sentence, false))
51 | word := "超敏C反应蛋白"
52 | oldFrequency, _ := seg.Frequency(word)
53 | frequency := seg.SuggestFrequency(word)
54 | fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
55 | seg.AddWord(word, frequency)
56 | fmt.Print("After:")
57 | print(seg.Cut(sentence, false))
58 |
59 | sentence = "如果放到post中将出错"
60 | fmt.Print("Before:")
61 | print(seg.Cut(sentence, false))
62 | word = "中将"
63 | oldFrequency, _ = seg.Frequency(word)
64 | frequency = seg.SuggestFrequency("中", "将")
65 | fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
66 | seg.AddWord(word, frequency)
67 | fmt.Print("After:")
68 | print(seg.Cut(sentence, false))
69 |
70 | sentence = "今天天气不错"
71 | fmt.Print("Before:")
72 | print(seg.Cut(sentence, false))
73 | word = "今天天气"
74 | oldFrequency, _ = seg.Frequency(word)
75 | frequency = seg.SuggestFrequency("今天", "天气")
76 | fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
77 | seg.AddWord(word, frequency)
78 | fmt.Print("After:")
79 | print(seg.Cut(sentence, false))
80 | // Output:
81 | // Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / ? /
82 | // 超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000.
83 | // After: 超敏C反应蛋白 / 是 / 什么 / ? /
84 | // Before: 如果 / 放到 / post / 中将 / 出错 /
85 | // 中将 current frequency: 763.000000, suggest: 494.000000.
86 | // After: 如果 / 放到 / post / 中 / 将 / 出错 /
87 | // Before: 今天天气 / 不错 /
88 | // 今天天气 current frequency: 3.000000, suggest: 0.000000.
89 | // After: 今天 / 天气 / 不错 /
90 | }
91 |
92 | func Example_loadUserDictionary() {
93 | var seg jiebago.Segmenter
94 | seg.LoadDictionary("dict.txt")
95 |
96 | print := func(ch <-chan string) {
97 | for word := range ch {
98 | fmt.Printf(" %s /", word)
99 | }
100 | fmt.Println()
101 | }
102 | sentence := "李小福是创新办主任也是云计算方面的专家"
103 | fmt.Print("Before:")
104 | print(seg.Cut(sentence, true))
105 |
106 | seg.LoadUserDictionary("userdict.txt")
107 |
108 | fmt.Print("After:")
109 | print(seg.Cut(sentence, true))
110 | // Output:
111 | // Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
112 | // After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
113 | }
114 |
--------------------------------------------------------------------------------
/finalseg/finalseg.go:
--------------------------------------------------------------------------------
1 | // Package finalseg is the Golang implementation of Jieba's finalseg module.
2 | package finalseg
3 |
4 | import (
5 | "regexp"
6 | )
7 |
8 | var (
9 | reHan = regexp.MustCompile(`\p{Han}+`)
10 | reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
11 | )
12 |
13 | func cutHan(sentence string) chan string {
14 | result := make(chan string)
15 | go func() {
16 | runes := []rune(sentence)
17 | _, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
18 | begin, next := 0, 0
19 | for i, char := range runes {
20 | pos := posList[i]
21 | switch pos {
22 | case 'B':
23 | begin = i
24 | case 'E':
25 | result <- string(runes[begin : i+1])
26 | next = i + 1
27 | case 'S':
28 | result <- string(char)
29 | next = i + 1
30 | }
31 | }
32 | if next < len(runes) {
33 | result <- string(runes[next:])
34 | }
35 | close(result)
36 | }()
37 | return result
38 | }
39 |
40 | // Cut cuts a sentence into words using the Hidden Markov Model with the Viterbi
41 | // algorithm. It is used by Jiebago for unknown words.
42 | func Cut(sentence string) chan string {
43 | result := make(chan string)
44 | s := sentence
45 | var hans string
46 | var hanLoc []int
47 | var nonhanLoc []int
48 | go func() {
49 | for {
50 | hanLoc = reHan.FindStringIndex(s)
51 | if hanLoc == nil {
52 | if len(s) == 0 {
53 | break
54 | }
55 | } else if hanLoc[0] == 0 {
56 | hans = s[hanLoc[0]:hanLoc[1]]
57 | s = s[hanLoc[1]:]
58 | for han := range cutHan(hans) {
59 | result <- han
60 | }
61 | continue
62 | }
63 | nonhanLoc = reSkip.FindStringIndex(s)
64 | if nonhanLoc == nil {
65 | if len(s) == 0 {
66 | break
67 | }
68 | } else if nonhanLoc[0] == 0 {
69 | nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
70 | s = s[nonhanLoc[1]:]
71 | if nonhans != "" {
72 | result <- nonhans
73 | continue
74 | }
75 | }
76 | var loc []int
77 | if hanLoc == nil && nonhanLoc == nil {
78 | if len(s) > 0 {
79 | result <- s
80 | break
81 | }
82 | } else if hanLoc == nil {
83 | loc = nonhanLoc
84 | } else if nonhanLoc == nil {
85 | loc = hanLoc
86 | } else if hanLoc[0] < nonhanLoc[0] {
87 | loc = hanLoc
88 | } else {
89 | loc = nonhanLoc
90 | }
91 | result <- s[:loc[0]]
92 | s = s[loc[0]:]
93 | }
94 | close(result)
95 | }()
96 | return result
97 | }
98 |
--------------------------------------------------------------------------------
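The `{'B', 'M', 'E', 'S'}` states passed to `viterbi` above are the usual character-tagging scheme for Chinese segmentation: B begins a multi-character word, M continues it, E ends it, and S marks a single-character word. For example, this is the decoding that `finalseg_test.go` below asserts:

```
我  们  是  程  序  员
B   E   S   B   M   E    →   我们 / 是 / 程序员
```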
/finalseg/finalseg_test.go:
--------------------------------------------------------------------------------
1 | package finalseg
2 |
3 | import (
4 | "math"
5 | "testing"
6 | )
7 |
8 | func chanToArray(ch chan string) []string {
9 | var result []string
10 | for word := range ch {
11 | result = append(result, word)
12 | }
13 | return result
14 | }
15 |
16 | func TestViterbi(t *testing.T) {
17 | obs := "我们是程序员"
18 | states := []byte{'B', 'M', 'E', 'S'}
19 | prob, path := viterbi([]rune(obs), states)
20 | if math.Abs(prob+39.68824128493802) > 1e-10 {
21 | t.Fatal(prob)
22 | }
23 | for index, state := range []byte{'B', 'E', 'S', 'B', 'M', 'E'} {
24 | if path[index] != state {
25 | t.Fatal(path)
26 | }
27 | }
28 | }
29 |
30 | func TestCutHan(t *testing.T) {
31 | obs := "我们是程序员"
32 | result := chanToArray(cutHan(obs))
33 | if len(result) != 3 {
34 | t.Fatal(result)
35 | }
36 | if result[0] != "我们" {
37 | t.Fatal(result[0])
38 | }
39 | if result[1] != "是" {
40 | t.Fatal(result[1])
41 | }
42 | if result[2] != "程序员" {
43 | t.Fatal(result[2])
44 | }
45 | }
46 |
47 | func TestCut(t *testing.T) {
48 | sentence := "我们是程序员"
49 | result := chanToArray(Cut(sentence))
50 | if len(result) != 3 {
51 | t.Fatal(len(result))
52 | }
53 | if result[0] != "我们" {
54 | t.Fatal(result[0])
55 | }
56 | if result[1] != "是" {
57 | t.Fatal(result[1])
58 | }
59 | if result[2] != "程序员" {
60 | t.Fatal(result[2])
61 | }
62 | result2 := chanToArray(Cut("I'm a programmer!"))
63 | if len(result2) != 8 {
64 | t.Fatal(result2)
65 | }
66 | result3 := chanToArray(Cut("程序员average年龄28.6岁。"))
67 | if len(result3) != 6 {
68 | t.Fatal(result3)
69 | }
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/finalseg/prob_trans.go:
--------------------------------------------------------------------------------
1 | package finalseg
2 |
3 | var probTrans = make(map[byte]map[byte]float64)
4 |
5 | func init() {
6 | probTrans['B'] = map[byte]float64{'E': -0.510825623765990,
7 | 'M': -0.916290731874155}
8 | probTrans['E'] = map[byte]float64{'B': -0.5897149736854513,
9 | 'S': -0.8085250474669937}
10 | probTrans['M'] = map[byte]float64{'E': -0.33344856811948514,
11 | 'M': -1.2603623820268226}
12 | probTrans['S'] = map[byte]float64{'B': -0.7211965654669841,
13 | 'S': -0.6658631448798212}
14 | }
15 |
--------------------------------------------------------------------------------
/finalseg/viterbi.go:
--------------------------------------------------------------------------------
1 | package finalseg
2 |
3 | import (
4 | "fmt"
5 | "sort"
6 | )
7 |
8 | const minFloat = -3.14e100
9 |
10 | var (
11 | prevStatus = make(map[byte][]byte)
12 | probStart = make(map[byte]float64)
13 | )
14 |
15 | func init() {
16 | prevStatus['B'] = []byte{'E', 'S'}
17 | prevStatus['M'] = []byte{'M', 'B'}
18 | prevStatus['S'] = []byte{'S', 'E'}
19 | prevStatus['E'] = []byte{'B', 'M'}
20 | probStart['B'] = -0.26268660809250016
21 | probStart['E'] = -3.14e+100
22 | probStart['M'] = -3.14e+100
23 | probStart['S'] = -1.4652633398537678
24 | }
25 |
26 | type probState struct {
27 | prob float64
28 | state byte
29 | }
30 |
31 | func (p probState) String() string {
32 | return fmt.Sprintf("(%f, %x)", p.prob, p.state)
33 | }
34 |
35 | type probStates []*probState
36 |
37 | func (ps probStates) Len() int {
38 | return len(ps)
39 | }
40 |
41 | func (ps probStates) Less(i, j int) bool {
42 | if ps[i].prob == ps[j].prob {
43 | return ps[i].state < ps[j].state
44 | }
45 | return ps[i].prob < ps[j].prob
46 | }
47 |
48 | func (ps probStates) Swap(i, j int) {
49 | ps[i], ps[j] = ps[j], ps[i]
50 | }
51 |
52 | func viterbi(obs []rune, states []byte) (float64, []byte) {
53 | path := make(map[byte][]byte)
54 | V := make([]map[byte]float64, len(obs))
55 | V[0] = make(map[byte]float64)
56 | for _, y := range states {
57 | if val, ok := probEmit[y][obs[0]]; ok {
58 | V[0][y] = val + probStart[y]
59 | } else {
60 | V[0][y] = minFloat + probStart[y]
61 | }
62 | path[y] = []byte{y}
63 | }
64 |
65 | for t := 1; t < len(obs); t++ {
66 | newPath := make(map[byte][]byte)
67 | V[t] = make(map[byte]float64)
68 | for _, y := range states {
69 | ps0 := make(probStates, 0)
70 | var emP float64
71 | if val, ok := probEmit[y][obs[t]]; ok {
72 | emP = val
73 | } else {
74 | emP = minFloat
75 | }
76 | for _, y0 := range prevStatus[y] {
77 | var transP float64
78 | if tp, ok := probTrans[y0][y]; ok {
79 | transP = tp
80 | } else {
81 | transP = minFloat
82 | }
83 | prob0 := V[t-1][y0] + transP + emP
84 | ps0 = append(ps0, &probState{prob: prob0, state: y0})
85 | }
86 | sort.Sort(sort.Reverse(ps0))
87 | V[t][y] = ps0[0].prob
88 | pp := make([]byte, len(path[ps0[0].state]))
89 | copy(pp, path[ps0[0].state])
90 | newPath[y] = append(pp, y)
91 | }
92 | path = newPath
93 | }
94 | ps := make(probStates, 0)
95 | for _, y := range []byte{'E', 'S'} {
96 | ps = append(ps, &probState{V[len(obs)-1][y], y})
97 | }
98 | sort.Sort(sort.Reverse(ps))
99 | v := ps[0]
100 | return v.prob, path[v.state]
101 | }
102 |
--------------------------------------------------------------------------------
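In log space, the dynamic program in `viterbi` above computes, for each position t and state y:

```
V[t][y] = max over y0 in prevStatus[y] of ( V[t-1][y0] + logP(y0 → y) ) + logP(obs[t] | y)
```

with `minFloat` standing in for log 0 whenever a transition or emission probability is missing. The final path is the better of those ending in E or S, since a valid segmentation cannot stop in the middle of a word (state B or M).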
/foobar.txt:
--------------------------------------------------------------------------------
1 | 好人 12 n
--------------------------------------------------------------------------------
/jieba.go:
--------------------------------------------------------------------------------
1 | // Package jiebago is the Golang implementation of [Jieba](https://github.com/fxsjy/jieba), a Python Chinese text segmentation module.
2 | package jiebago
3 |
4 | import (
5 | "math"
6 | "regexp"
7 | "strings"
8 |
9 | "github.com/wangbin/jiebago/dictionary"
10 | "github.com/wangbin/jiebago/finalseg"
11 | "github.com/wangbin/jiebago/util"
12 | )
13 |
14 | var (
15 | reEng = regexp.MustCompile(`[[:alnum:]]`)
16 | reHanCutAll = regexp.MustCompile(`(\p{Han}+)`)
17 | reSkipCutAll = regexp.MustCompile(`[^[:alnum:]+#\n]`)
18 | reHanDefault = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
19 | reSkipDefault = regexp.MustCompile(`(\r\n|\s)`)
20 | )
21 |
22 | // Segmenter is a Chinese words segmentation struct.
23 | type Segmenter struct {
24 | dict *Dictionary
25 | }
26 |
27 | // Frequency returns a word's frequency and existence
28 | func (seg *Segmenter) Frequency(word string) (float64, bool) {
29 | return seg.dict.Frequency(word)
30 | }
31 |
32 | // AddWord adds a new word with frequency to the dictionary
33 | func (seg *Segmenter) AddWord(word string, frequency float64) {
34 | seg.dict.AddToken(dictionary.NewToken(word, frequency, ""))
35 | }
36 |
37 | // DeleteWord removes a word from the dictionary
38 | func (seg *Segmenter) DeleteWord(word string) {
39 | seg.dict.AddToken(dictionary.NewToken(word, 0.0, ""))
40 | }
41 |
42 | /*
43 | SuggestFrequency returns a suggested frequency of a word, or of a long word
44 | cut into several short words.
45 |
46 | This method is useful when a word in the sentence is not cut out correctly.
47 |
48 | If a word should not be further cut, for example the word "石墨烯" should not
49 | be cut into "石墨" and "烯", SuggestFrequency("石墨烯") will return the maximum
50 | frequency for this word.
51 |
52 | If a word should be further cut, for example the word "今天天气" should be
53 | further cut into the two words "今天" and "天气", SuggestFrequency("今天", "天气")
54 | should return the minimum frequency for the word "今天天气".
55 | */
56 | func (seg *Segmenter) SuggestFrequency(words ...string) float64 {
57 | frequency := 1.0
58 | if len(words) > 1 {
59 | for _, word := range words {
60 | if freq, ok := seg.dict.Frequency(word); ok {
61 | frequency *= freq
62 | }
63 | frequency /= seg.dict.total
64 | }
65 | frequency, _ = math.Modf(frequency * seg.dict.total)
66 | wordFreq := 0.0
67 | if freq, ok := seg.dict.Frequency(strings.Join(words, "")); ok {
68 | wordFreq = freq
69 | }
70 | if wordFreq < frequency {
71 | frequency = wordFreq
72 | }
73 | } else {
74 | word := words[0]
75 | for segment := range seg.Cut(word, false) {
76 | if freq, ok := seg.dict.Frequency(segment); ok {
77 | frequency *= freq
78 | }
79 | frequency /= seg.dict.total
80 | }
81 | frequency, _ = math.Modf(frequency * seg.dict.total)
82 | frequency += 1.0
83 | wordFreq := 1.0
84 | if freq, ok := seg.dict.Frequency(word); ok {
85 | wordFreq = freq
86 | }
87 | if wordFreq > frequency {
88 | frequency = wordFreq
89 | }
90 | }
91 | return frequency
92 | }
93 |
94 | // LoadDictionary loads a dictionary from the given file name. Every time
95 | // LoadDictionary is called, the previously loaded dictionary is cleared.
96 | func (seg *Segmenter) LoadDictionary(fileName string) error {
97 | seg.dict = &Dictionary{freqMap: make(map[string]float64)}
98 | return seg.dict.loadDictionary(fileName)
99 | }
100 |
101 | // LoadUserDictionary loads a user-specified dictionary. It must be called
102 | // after LoadDictionary, and it will not clear any previously loaded
103 | // dictionary; instead it overrides existing entries.
104 | func (seg *Segmenter) LoadUserDictionary(fileName string) error {
105 | return seg.dict.loadDictionary(fileName)
106 | }
107 |
108 | func (seg *Segmenter) dag(runes []rune) map[int][]int {
109 | dag := make(map[int][]int)
110 | n := len(runes)
111 | var frag []rune
112 | var i int
113 | for k := 0; k < n; k++ {
114 | dag[k] = make([]int, 0)
115 | i = k
116 | frag = runes[k : k+1]
117 | for {
118 | freq, ok := seg.dict.Frequency(string(frag))
119 | if !ok {
120 | break
121 | }
122 | if freq > 0.0 {
123 | dag[k] = append(dag[k], i)
124 | }
125 | i++
126 | if i >= n {
127 | break
128 | }
129 | frag = runes[k : i+1]
130 | }
131 | if len(dag[k]) == 0 {
132 | dag[k] = append(dag[k], k)
133 | }
134 | }
135 | return dag
136 | }
137 |
138 | type route struct {
139 | frequency float64
140 | index int
141 | }
142 |
143 | func (seg *Segmenter) calc(runes []rune) map[int]route {
144 | dag := seg.dag(runes)
145 | n := len(runes)
146 | rs := make(map[int]route)
147 | rs[n] = route{frequency: 0.0, index: 0}
148 | var r route
149 | for idx := n - 1; idx >= 0; idx-- {
150 | for _, i := range dag[idx] {
151 | if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
152 | r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
153 | } else {
154 | r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
155 | }
156 | if v, ok := rs[idx]; !ok {
157 | rs[idx] = r
158 | } else {
159 | if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
160 | rs[idx] = r
161 | }
162 | }
163 | }
164 | }
165 | return rs
166 | }
167 |
168 | type cutFunc func(sentence string) <-chan string
169 |
170 | func (seg *Segmenter) cutDAG(sentence string) <-chan string {
171 | result := make(chan string)
172 | go func() {
173 | runes := []rune(sentence)
174 | routes := seg.calc(runes)
175 | var y int
176 | length := len(runes)
177 | var buf []rune
178 | for x := 0; x < length; {
179 | y = routes[x].index + 1
180 | frag := runes[x:y]
181 | if y-x == 1 {
182 | buf = append(buf, frag...)
183 | } else {
184 | if len(buf) > 0 {
185 | bufString := string(buf)
186 | if len(buf) == 1 {
187 | result <- bufString
188 | } else {
189 | if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
190 | for x := range finalseg.Cut(bufString) {
191 | result <- x
192 | }
193 | } else {
194 | for _, elem := range buf {
195 | result <- string(elem)
196 | }
197 | }
198 | }
199 | buf = make([]rune, 0)
200 | }
201 | result <- string(frag)
202 | }
203 | x = y
204 | }
205 |
206 | if len(buf) > 0 {
207 | bufString := string(buf)
208 | if len(buf) == 1 {
209 | result <- bufString
210 | } else {
211 | if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
212 | for t := range finalseg.Cut(bufString) {
213 | result <- t
214 | }
215 | } else {
216 | for _, elem := range buf {
217 | result <- string(elem)
218 | }
219 | }
220 | }
221 | }
222 | close(result)
223 | }()
224 | return result
225 | }
226 |
227 | func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan string {
228 | result := make(chan string)
229 |
230 | go func() {
231 | runes := []rune(sentence)
232 | routes := seg.calc(runes)
233 | var y int
234 | length := len(runes)
235 | var buf []rune
236 | for x := 0; x < length; {
237 | y = routes[x].index + 1
238 | frag := runes[x:y]
239 | if reEng.MatchString(string(frag)) && len(frag) == 1 {
240 | buf = append(buf, frag...)
241 | x = y
242 | continue
243 | }
244 | if len(buf) > 0 {
245 | result <- string(buf)
246 | buf = make([]rune, 0)
247 | }
248 | result <- string(frag)
249 | x = y
250 | }
251 | if len(buf) > 0 {
252 | result <- string(buf)
253 | buf = make([]rune, 0)
254 | }
255 | close(result)
256 | }()
257 | return result
258 | }
259 |
260 | // Cut cuts a sentence into words using accurate mode.
261 | // Parameter hmm controls whether to use the Hidden Markov Model.
262 | // Accurate mode attempts to cut the sentence into the most accurate
263 | // segmentations, which is suitable for text analysis.
264 | func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string {
265 | result := make(chan string)
266 | var cut cutFunc
267 | if hmm {
268 | cut = seg.cutDAG
269 | } else {
270 | cut = seg.cutDAGNoHMM
271 | }
272 |
273 | go func() {
274 | for _, block := range util.RegexpSplit(reHanDefault, sentence, -1) {
275 | if len(block) == 0 {
276 | continue
277 | }
278 | if reHanDefault.MatchString(block) {
279 | for x := range cut(block) {
280 | result <- x
281 | }
282 | continue
283 | }
284 | for _, subBlock := range util.RegexpSplit(reSkipDefault, block, -1) {
285 | if reSkipDefault.MatchString(subBlock) {
286 | result <- subBlock
287 | continue
288 | }
289 | for _, r := range subBlock {
290 | result <- string(r)
291 | }
292 | }
293 | }
294 | close(result)
295 | }()
296 | return result
297 | }
298 |
299 | func (seg *Segmenter) cutAll(sentence string) <-chan string {
300 | result := make(chan string)
301 | go func() {
302 | runes := []rune(sentence)
303 | dag := seg.dag(runes)
304 | start := -1
305 | ks := make([]int, len(dag))
306 | for k := range dag {
307 | ks[k] = k
308 | }
309 | var l []int
310 | for k := range ks {
311 | l = dag[k]
312 | if len(l) == 1 && k > start {
313 | result <- string(runes[k : l[0]+1])
314 | start = l[0]
315 | continue
316 | }
317 | for _, j := range l {
318 | if j > k {
319 | result <- string(runes[k : j+1])
320 | start = j
321 | }
322 | }
323 | }
324 | close(result)
325 | }()
326 | return result
327 | }
328 |
329 | // CutAll cuts a sentence into words using full mode.
330 | // Full mode gets all the possible words from the sentence.
331 | // Fast but not accurate.
332 | func (seg *Segmenter) CutAll(sentence string) <-chan string {
333 | result := make(chan string)
334 | go func() {
335 | for _, block := range util.RegexpSplit(reHanCutAll, sentence, -1) {
336 | if len(block) == 0 {
337 | continue
338 | }
339 | if reHanCutAll.MatchString(block) {
340 | for x := range seg.cutAll(block) {
341 | result <- x
342 | }
343 | continue
344 | }
345 | for _, subBlock := range reSkipCutAll.Split(block, -1) {
346 | result <- subBlock
347 | }
348 | }
349 | close(result)
350 | }()
351 | return result
352 | }
353 |
354 | // CutForSearch cuts a sentence into words using search engine mode.
355 | // Search engine mode, based on the accurate mode, attempts to cut long words
356 | // into several short words, which can raise the recall rate.
357 | // Suitable for search engines.
358 | func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string {
359 | result := make(chan string)
360 | go func() {
361 | for word := range seg.Cut(sentence, hmm) {
362 | runes := []rune(word)
363 | for _, increment := range []int{2, 3} {
364 | if len(runes) <= increment {
365 | continue
366 | }
367 | var gram string
368 | for i := 0; i < len(runes)-increment+1; i++ {
369 | gram = string(runes[i : i+increment])
370 | if v, ok := seg.dict.Frequency(gram); ok && v > 0.0 {
371 | result <- gram
372 | }
373 | }
374 | }
375 | result <- word
376 | }
377 | close(result)
378 | }()
379 | return result
380 | }
381 |
--------------------------------------------------------------------------------
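
A minimal, self-contained sketch of the 2-gram/3-gram enumeration that CutForSearch above layers on top of Cut. The `ngrams` helper and the stub `freq` map are illustrative assumptions, not part of the package API; they mirror the inner loop that probes `seg.dict.Frequency` for each candidate gram:

```
package main

import "fmt"

// freq stands in for seg.dict.Frequency: only substrings present here
// with a positive frequency are emitted before the full word.
var freq = map[string]float64{"长江": 1.0, "大桥": 1.0}

// ngrams lists every substring of n runes, mirroring the inner loop of
// CutForSearch for increment n (first 2, then 3).
func ngrams(word string, n int) []string {
	runes := []rune(word)
	if len(runes) <= n {
		return nil
	}
	var out []string
	for i := 0; i+n <= len(runes); i++ {
		out = append(out, string(runes[i:i+n]))
	}
	return out
}

func main() {
	word := "长江大桥"
	for _, n := range []int{2, 3} {
		for _, gram := range ngrams(word, n) {
			if v, ok := freq[gram]; ok && v > 0.0 {
				fmt.Println(gram) // 长江, 大桥
			}
		}
	}
	fmt.Println(word) // the original word is always emitted last
}
```
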
/jieba_test.go:
--------------------------------------------------------------------------------
1 | package jiebago
2 |
3 | import "testing"
4 |
5 | var (
6 | seg Segmenter
7 | testContents = []string{
8 | "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
9 | "我不喜欢日本和服。",
10 | "雷猴回归人间。",
11 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
12 | "我需要廉租房",
13 | "永和服装饰品有限公司",
14 | "我爱北京天安门",
15 | "abc",
16 | "隐马尔可夫",
17 | "雷猴是个好网站",
18 | "“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成",
19 | "草泥马和欺实马是今年的流行词汇",
20 | "伊藤洋华堂总府店",
21 | "中国科学院计算技术研究所",
22 | "罗密欧与朱丽叶",
23 | "我购买了道具和服装",
24 | "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍",
25 | "湖北省石首市",
26 | "湖北省十堰市",
27 | "总经理完成了这件事情",
28 | "电脑修好了",
29 | "做好了这件事情就一了百了了",
30 | "人们审美的观点是不同的",
31 | "我们买了一个美的空调",
32 | "线程初始化时我们要注意",
33 | "一个分子是由好多原子组织成的",
34 | "祝你马到功成",
35 | "他掉进了无底洞里",
36 | "中国的首都是北京",
37 | "孙君意",
38 | "外交部发言人马朝旭",
39 | "领导人会议和第四届东亚峰会",
40 | "在过去的这五年",
41 | "还需要很长的路要走",
42 | "60周年首都阅兵",
43 | "你好人们审美的观点是不同的",
44 | "买水果然后来世博园",
45 | "买水果然后去世博园",
46 | "但是后来我才知道你是对的",
47 | "存在即合理",
48 | "的的的的的在的的的的就以和和和",
49 | "I love你,不以为耻,反以为rong",
50 | "因",
51 | "",
52 | "hello你好人们审美的观点是不同的",
53 | "很好但主要是基于网页形式",
54 | "hello你好人们审美的观点是不同的",
55 | "为什么我不能拥有想要的生活",
56 | "后来我才",
57 | "此次来中国是为了",
58 | "使用了它就可以解决一些问题",
59 | ",使用了它就可以解决一些问题",
60 | "其实使用了它就可以解决一些问题",
61 | "好人使用了它就可以解决一些问题",
62 | "是因为和国家",
63 | "老年搜索还支持",
64 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ",
65 | "大",
66 | "",
67 | "他说的确实在理",
68 | "长春市长春节讲话",
69 | "结婚的和尚未结婚的",
70 | "结合成分子时",
71 | "旅游和服务是最好的",
72 | "这件事情的确是我的错",
73 | "供大家参考指正",
74 | "哈尔滨政府公布塌桥原因",
75 | "我在机场入口处",
76 | "邢永臣摄影报道",
77 | "BP神经网络如何训练才能在分类时增加区分度?",
78 | "南京市长江大桥",
79 | "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究",
80 | "长春市长春药店",
81 | "邓颖超生前最喜欢的衣服",
82 | "胡锦涛是热爱世界和平的政治局常委",
83 | "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪",
84 | "一次性交多少钱",
85 | "两块五一套,三块八一斤,四块七一本,五块六一条",
86 | "小和尚留了一个像大和尚一样的和尚头",
87 | "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站",
88 | "张晓梅去人民医院做了个B超然后去买了件T恤",
89 | "AT&T是一件不错的公司,给你发offer了吗?",
90 | "C++和c#是什么关系?11+122=133,是吗?PI=3.14159",
91 | "你认识那个和主席握手的的哥吗?他开一辆黑色的士。",
92 | "枪杆子中出政权"}
93 |
94 | defaultCutResult = [][]string{[]string{"这是", "一个", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"},
95 | []string{"我", "不", "喜欢", "日本", "和服", "。"},
96 | []string{"雷猴", "回归", "人间", "。"},
97 | []string{"工信处", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换机", "等", "技术性", "器件", "的", "安装", "工作"},
98 | []string{"我", "需要", "廉租房"},
99 | []string{"永和", "服装", "饰品", "有限公司"},
100 | []string{"我", "爱", "北京", "天安门"},
101 | []string{"abc"},
102 | []string{"隐", "马尔可夫"},
103 | []string{"雷猴", "是", "个", "好", "网站"},
104 | []string{"“", "Microsoft", "”", "一词", "由", "“", "MICROcomputer", "(", "微型", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"},
105 | []string{"草泥马", "和", "欺实", "马", "是", "今年", "的", "流行", "词汇"},
106 | []string{"伊藤", "洋华堂", "总府", "店"},
107 | []string{"中国科学院计算技术研究所"},
108 | []string{"罗密欧", "与", "朱丽叶"},
109 | []string{"我", "购买", "了", "道具", "和", "服装"},
110 | []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断改进", ",", "避免", "敞帚", "自珍"},
111 | []string{"湖北省", "石首市"},
112 | []string{"湖北省", "十堰市"},
113 | []string{"总经理", "完成", "了", "这件", "事情"},
114 | []string{"电脑", "修好", "了"},
115 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"},
116 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"},
117 | []string{"我们", "买", "了", "一个", "美的", "空调"},
118 | []string{"线程", "初始化", "时", "我们", "要", "注意"},
119 | []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"},
120 | []string{"祝", "你", "马到功成"},
121 | []string{"他", "掉", "进", "了", "无底洞", "里"},
122 | []string{"中国", "的", "首都", "是", "北京"},
123 | []string{"孙君意"},
124 | []string{"外交部", "发言人", "马朝旭"},
125 | []string{"领导人", "会议", "和", "第四届", "东亚", "峰会"},
126 | []string{"在", "过去", "的", "这", "五年"},
127 | []string{"还", "需要", "很长", "的", "路", "要", "走"},
128 | []string{"60", "周年", "首都", "阅兵"},
129 | []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
130 | []string{"买", "水果", "然后", "来", "世博园"},
131 | []string{"买", "水果", "然后", "去", "世博园"},
132 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"},
133 | []string{"存在", "即", "合理"},
134 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"},
135 | []string{"I", " ", "love", "你", ",", "不以为耻", ",", "反", "以为", "rong"},
136 | []string{"因"},
137 | []string{},
138 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
139 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"},
140 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
141 | []string{"为什么", "我", "不能", "拥有", "想要", "的", "生活"},
142 | []string{"后来", "我", "才"},
143 | []string{"此次", "来", "中国", "是", "为了"},
144 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
145 | []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
146 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
147 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
148 | []string{"是因为", "和", "国家"},
149 | []string{"老年", "搜索", "还", "支持"},
150 | []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国人大常委会", "第三次", "审议", "侵权", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中本", "已", "处于", "弱势", "地位", "的", "消费者", "由此", "将", "陷入", "万劫不复", "的", "境地", "。", " "},
151 | []string{"大"},
152 | []string{},
153 | []string{"他", "说", "的", "确实", "在理"},
154 | []string{"长春", "市长", "春节", "讲话"},
155 | []string{"结婚", "的", "和", "尚未", "结婚", "的"},
156 | []string{"结合", "成", "分子", "时"},
157 | []string{"旅游", "和", "服务", "是", "最好", "的"},
158 | []string{"这件", "事情", "的确", "是", "我", "的", "错"},
159 | []string{"供", "大家", "参考", "指正"},
160 | []string{"哈尔滨", "政府", "公布", "塌桥", "原因"},
161 | []string{"我", "在", "机场", "入口处"},
162 | []string{"邢永臣", "摄影", "报道"},
163 | []string{"BP", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分度", "?"},
164 | []string{"南京市", "长江大桥"},
165 | []string{"应", "一些", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"},
166 | []string{"长春市", "长春", "药店"},
167 | []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"},
168 | []string{"胡锦涛", "是", "热爱", "世界", "和平", "的", "政治局", "常委"},
169 | []string{"程序员", "祝", "海林", "和", "朱会震", "是", "在", "孙健", "的", "左面", "和", "右面", ",", " ", "范凯", "在", "最", "右面", ".", "再往", "左", "是", "李松洪"},
170 | []string{"一次性", "交", "多少", "钱"},
171 | []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"},
172 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚头"},
173 | []string{"我", "是", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和党", "党员", ";", " ", "地铁", "和平门", "站"},
174 | []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"},
175 | []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"},
176 | []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3.14159"},
177 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他开", "一辆", "黑色", "的士", "。"},
178 | []string{"枪杆子", "中", "出", "政权"},
179 | }
180 |
181 | cutAllResult = [][]string{[]string{"这", "是", "一个", "伸手", "伸手不见", "伸手不见五指", "不见", "五指", "的", "黑夜", "", "", "我", "叫", "孙悟空", "悟空", "", "", "我", "爱", "北京", "", "", "我", "爱", "Python", "和", "C++", ""},
182 | []string{"我", "不", "喜欢", "日本", "和服", "", ""},
183 | []string{"雷猴", "回归", "人间", "", ""},
184 | []string{"工信处", "处女", "女干事", "干事", "每月", "月经", "经过", "下属", "科室", "都", "要", "亲口", "口交", "交代", "24", "口交", "交换", "交换机", "换机", "等", "技术", "技术性", "性器", "器件", "的", "安装", "安装工", "装工", "工作"},
185 | []string{"我", "需要", "廉租", "廉租房", "租房"},
186 | []string{"永和", "和服", "服装", "装饰", "装饰品", "饰品", "有限", "有限公司", "公司"},
187 | []string{"我", "爱", "北京", "天安", "天安门"},
188 | []string{"abc"},
189 | []string{"隐", "马尔可", "马尔可夫", "可夫"},
190 | []string{"雷猴", "是", "个", "好", "网站"},
191 | []string{"", "Microsoft", "", "一", "词", "由", "", "MICROcomputer", "", "微型", "计算", "计算机", "算机", "", "", "", "和", "", "SOFTware", "", "软件", "", "", "", "两部", "部分", "分组", "组成"},
192 | []string{"草泥马", "和", "欺", "实", "马", "是", "今年", "的", "流行", "词汇"},
193 | []string{"伊", "藤", "洋华堂", "总府", "店"},
194 | []string{"中国", "中国科学院", "中国科学院计算技术研究所", "科学", "科学院", "学院", "计算", "计算技术", "技术", "研究", "研究所"},
195 | []string{"罗密欧", "与", "朱丽叶"},
196 | []string{"我", "购买", "了", "道具", "和服", "服装"},
197 | []string{"PS", "", "", "我", "觉得", "开源", "有", "一个", "好处", "", "", "就是", "能够", "敦促", "自己", "不断", "不断改进", "改进", "", "", "避免", "敞", "帚", "自珍"},
198 | []string{"湖北", "湖北省", "石首", "石首市"},
199 | []string{"湖北", "湖北省", "十堰", "十堰市"},
200 | []string{"总经理", "经理", "理完", "完成", "了", "这件", "事情"},
201 | []string{"电脑", "修好", "了"},
202 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了了"},
203 | []string{"人们", "审美", "美的", "观点", "是", "不同", "的"},
204 | []string{"我们", "买", "了", "一个", "美的", "空调"},
205 | []string{"线程", "初始", "初始化", "化时", "我们", "要", "注意"},
206 | []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "织成", "的"},
207 | []string{"祝", "你", "马到功成"},
208 | []string{"他", "掉", "进", "了", "无底", "无底洞", "里"},
209 | []string{"中国", "的", "首都", "是", "北京"},
210 | []string{"孙", "君", "意"},
211 | []string{"外交", "外交部", "部发", "发言", "发言人", "人马", "马朝旭"},
212 | []string{"领导", "领导人", "会议", "议和", "第四", "第四届", "四届", "东亚", "峰会"},
213 | []string{"在", "过去", "的", "这", "五年"},
214 | []string{"还", "需要", "很", "长", "的", "路", "要", "走"},
215 | []string{"60", "周年", "首都", "阅兵"},
216 | []string{"你好", "好人", "人们", "审美", "美的", "观点", "是", "不同", "的"},
217 | []string{"买", "水果", "果然", "然后", "后来", "来世", "世博", "世博园", "博园"},
218 | []string{"买", "水果", "果然", "然后", "后去", "去世", "世博", "世博园", "博园"},
219 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"},
220 | []string{"存在", "即", "合理"},
221 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"},
222 | []string{"I", "love", "你", "", "", "不以", "不以为耻", "以为", "耻", "", "", "反", "以为", "rong"},
223 | []string{"因"},
224 | []string{},
225 | []string{"hello", "你好", "好人", "人们", "审美", "美的", "观点", "是", "不同", "的"},
226 | []string{"很", "好", "但", "主要", "要是", "基于", "网页", "形式"},
227 | []string{"hello", "你好", "好人", "人们", "审美", "美的", "观点", "是", "不同", "的"},
228 | []string{"为什么", "什么", "我", "不能", "拥有", "想要", "的", "生活"},
229 | []string{"后来", "我", "才"},
230 | []string{"此次", "来", "中国", "国是", "为了"},
231 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
232 | []string{"", "", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
233 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
234 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
235 | []string{"是因为", "因为", "和", "国家"},
236 | []string{"老年", "搜索", "索还", "支持"},
237 | []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲", "法", "给", "废", "了", "拉倒", "", "RT", "", "laoshipukong", "", "", "27", "日", "", "", "全国", "全国人大", "全国人大常委会", "国人", "人大", "人大常委会", "常委", "常委会", "委会", "第三", "第三次", "三次", "审议", "侵权", "权责", "责任", "责任法", "草案", "", "", "删除", "除了", "有关", "医疗", "损害", "责任", "", "", "举证", "倒置", "", "", "的", "规定", "", "", "在", "医患", "纠纷", "中", "本", "已", "处于", "弱势", "地位", "的", "消费", "消费者", "由此", "将", "陷入", "万劫不复", "不复", "的", "境地", "", "", ""},
238 | []string{"大"},
239 | []string{},
240 | []string{"他", "说", "的确", "确实", "实在", "理"},
241 | []string{"长春", "长春市", "市长", "长春", "春节", "讲话"},
242 | []string{"结婚", "的", "和尚", "尚未", "未结", "结婚", "的"},
243 | []string{"结合", "合成", "成分", "分子", "时"},
244 | []string{"旅游", "和服", "服务", "是", "最好", "的"},
245 | []string{"这件", "事情", "的确", "是", "我", "的", "错"},
246 | []string{"供", "大家", "参考", "指正"},
247 | []string{"哈尔", "哈尔滨", "政府", "公布", "塌", "桥", "原因"},
248 | []string{"我", "在", "机场", "入口", "入口处"},
249 | []string{"邢", "永", "臣", "摄影", "报道"},
250 | []string{"BP", "神经", "神经网", "神经网络", "网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "加区", "区分", "区分度", "分度", "", ""},
251 | []string{"南京", "南京市", "京市", "市长", "长江", "长江大桥", "大桥"},
252 | []string{"应", "一些", "使用", "使用者", "用者", "的", "建议", "", "", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"},
253 | []string{"长春", "长春市", "市长", "长春", "春药", "药店"},
254 | []string{"邓颖超", "超生", "生前", "最", "喜欢", "的", "衣服"},
255 | []string{"胡锦涛", "锦涛", "是", "热爱", "世界", "和平", "的", "政治", "政治局", "常委"},
256 | []string{"程序", "程序员", "祝", "海林", "和", "朱", "会", "震", "是", "在", "孙", "健", "的", "左面", "和", "右面", "", "", "", "范", "凯", "在", "最", "右面", "", "", "再往", "左", "是", "李", "松", "洪"},
257 | []string{"一次", "一次性", "性交", "多少", "多少钱"},
258 | []string{"两块", "五一", "一套", "", "", "三块", "八一", "一斤", "", "", "四块", "七一", "一本", "", "", "五块", "六一", "一条"},
259 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚", "和尚头"},
260 | []string{"我", "是", "中华", "中华人民", "中华人民共和国", "华人", "人民", "人民共和国", "共和", "共和国", "国公", "公民", "", "", "我", "爸爸", "是", "共和", "共和党", "党员", "", "", "", "地铁", "和平", "和平门", "站"},
261 | []string{"张晓梅", "去", "人民", "民医院", "医院", "做", "了", "个", "B", "超然", "然后", "后去", "买", "了", "件", "T", "恤"},
262 | []string{"AT", "T", "是", "一件", "不错", "的", "公司", "", "", "给", "你", "发", "offer", "了", "吗", "", ""},
263 | []string{"C++", "和", "c#", "是", "什么", "关系", "", "11+122", "133", "", "是", "吗", "", "PI", "3", "14159"},
264 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "", "", "他", "开", "一辆", "黑色", "的士", "", ""},
265 | []string{"枪杆", "枪杆子", "杆子", "中出", "政权"},
266 | }
267 |
268 | defaultCutNoHMMResult = [][]string{[]string{"这", "是", "一个", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"},
269 | []string{"我", "不", "喜欢", "日本", "和服", "。"},
270 | []string{"雷猴", "回归", "人间", "。"},
271 | []string{"工信处", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换机", "等", "技术性", "器件", "的", "安装", "工作"},
272 | []string{"我", "需要", "廉租房"},
273 | []string{"永和", "服装", "饰品", "有限公司"},
274 | []string{"我", "爱", "北京", "天安门"},
275 | []string{"abc"},
276 | []string{"隐", "马尔可夫"},
277 | []string{"雷猴", "是", "个", "好", "网站"},
278 | []string{"“", "Microsoft", "”", "一", "词", "由", "“", "MICROcomputer", "(", "微型", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"},
279 | []string{"草泥马", "和", "欺", "实", "马", "是", "今年", "的", "流行", "词汇"},
280 | []string{"伊", "藤", "洋华堂", "总府", "店"},
281 | []string{"中国科学院计算技术研究所"},
282 | []string{"罗密欧", "与", "朱丽叶"},
283 | []string{"我", "购买", "了", "道具", "和", "服装"},
284 | []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断改进", ",", "避免", "敞", "帚", "自珍"},
285 | []string{"湖北省", "石首市"},
286 | []string{"湖北省", "十堰市"},
287 | []string{"总经理", "完成", "了", "这件", "事情"},
288 | []string{"电脑", "修好", "了"},
289 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"},
290 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"},
291 | []string{"我们", "买", "了", "一个", "美的", "空调"},
292 | []string{"线程", "初始化", "时", "我们", "要", "注意"},
293 | []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"},
294 | []string{"祝", "你", "马到功成"},
295 | []string{"他", "掉", "进", "了", "无底洞", "里"},
296 | []string{"中国", "的", "首都", "是", "北京"},
297 | []string{"孙", "君", "意"},
298 | []string{"外交部", "发言人", "马朝旭"},
299 | []string{"领导人", "会议", "和", "第四届", "东亚", "峰会"},
300 | []string{"在", "过去", "的", "这", "五年"},
301 | []string{"还", "需要", "很", "长", "的", "路", "要", "走"},
302 | []string{"60", "周年", "首都", "阅兵"},
303 | []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
304 | []string{"买", "水果", "然后", "来", "世博园"},
305 | []string{"买", "水果", "然后", "去", "世博园"},
306 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"},
307 | []string{"存在", "即", "合理"},
308 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"},
309 | []string{"I", " ", "love", "你", ",", "不以为耻", ",", "反", "以为", "rong"},
310 | []string{"因"},
311 | []string{},
312 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
313 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"},
314 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
315 | []string{"为什么", "我", "不能", "拥有", "想要", "的", "生活"},
316 | []string{"后来", "我", "才"},
317 | []string{"此次", "来", "中国", "是", "为了"},
318 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
319 | []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
320 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
321 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
322 | []string{"是因为", "和", "国家"},
323 | []string{"老年", "搜索", "还", "支持"},
324 | []string{"干脆", "就", "把", "那", "部", "蒙", "人", "的", "闲", "法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国人大常委会", "第三次", "审议", "侵权", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中", "本", "已", "处于", "弱势", "地位", "的", "消费者", "由此", "将", "陷入", "万劫不复", "的", "境地", "。", " "},
325 | []string{"大"},
326 | []string{},
327 | []string{"他", "说", "的", "确实", "在", "理"},
328 | []string{"长春", "市长", "春节", "讲话"},
329 | []string{"结婚", "的", "和", "尚未", "结婚", "的"},
330 | []string{"结合", "成", "分子", "时"},
331 | []string{"旅游", "和", "服务", "是", "最好", "的"},
332 | []string{"这件", "事情", "的确", "是", "我", "的", "错"},
333 | []string{"供", "大家", "参考", "指正"},
334 | []string{"哈尔滨", "政府", "公布", "塌", "桥", "原因"},
335 | []string{"我", "在", "机场", "入口处"},
336 | []string{"邢", "永", "臣", "摄影", "报道"},
337 | []string{"BP", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分度", "?"},
338 | []string{"南京市", "长江大桥"},
339 | []string{"应", "一些", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"},
340 | []string{"长春市", "长春", "药店"},
341 | []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"},
342 | []string{"胡锦涛", "是", "热爱", "世界", "和平", "的", "政治局", "常委"},
343 | []string{"程序员", "祝", "海林", "和", "朱", "会", "震", "是", "在", "孙", "健", "的", "左面", "和", "右面", ",", " ", "范", "凯", "在", "最", "右面", ".", "再", "往", "左", "是", "李", "松", "洪"},
344 | []string{"一次性", "交", "多少", "钱"},
345 | []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"},
346 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚头"},
347 | []string{"我", "是", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和党", "党员", ";", " ", "地铁", "和平门", "站"},
348 | []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"},
349 | []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"},
350 | []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3", ".", "14159"},
351 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他", "开", "一辆", "黑色", "的士", "。"},
352 | []string{"枪杆子", "中", "出", "政权"},
353 | }
354 |
355 | cutForSearchResult = [][]string{[]string{"这是", "一个", "伸手", "不见", "五指", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "悟空", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"},
356 | []string{"我", "不", "喜欢", "日本", "和服", "。"},
357 | []string{"雷猴", "回归", "人间", "。"},
358 | []string{"工信处", "干事", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换", "换机", "交换机", "等", "技术", "技术性", "器件", "的", "安装", "工作"},
359 | []string{"我", "需要", "廉租", "租房", "廉租房"},
360 | []string{"永和", "服装", "饰品", "有限", "公司", "有限公司"},
361 | []string{"我", "爱", "北京", "天安", "天安门"},
362 | []string{"abc"},
363 | []string{"隐", "可夫", "马尔可", "马尔可夫"},
364 | []string{"雷猴", "是", "个", "好", "网站"},
365 | []string{"“", "Microsoft", "”", "一词", "由", "“", "MICROcomputer", "(", "微型", "计算", "算机", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"},
366 | []string{"草泥马", "和", "欺实", "马", "是", "今年", "的", "流行", "词汇"},
367 | []string{"伊藤", "洋华堂", "总府", "店"},
368 | []string{"中国", "科学", "学院", "计算", "技术", "研究", "科学院", "研究所", "中国科学院计算技术研究所"},
369 | []string{"罗密欧", "与", "朱丽叶"},
370 | []string{"我", "购买", "了", "道具", "和", "服装"},
371 | []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断", "改进", "不断改进", ",", "避免", "敞帚", "自珍"},
372 | []string{"湖北", "湖北省", "石首", "石首市"},
373 | []string{"湖北", "湖北省", "十堰", "十堰市"},
374 | []string{"经理", "总经理", "完成", "了", "这件", "事情"},
375 | []string{"电脑", "修好", "了"},
376 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"},
377 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"},
378 | []string{"我们", "买", "了", "一个", "美的", "空调"},
379 | []string{"线程", "初始", "初始化", "时", "我们", "要", "注意"},
380 | []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"},
381 | []string{"祝", "你", "马到功成"},
382 | []string{"他", "掉", "进", "了", "无底", "无底洞", "里"},
383 | []string{"中国", "的", "首都", "是", "北京"},
384 | []string{"孙君意"},
385 | []string{"外交", "外交部", "发言", "发言人", "马朝旭"},
386 | []string{"领导", "领导人", "会议", "和", "第四", "四届", "第四届", "东亚", "峰会"},
387 | []string{"在", "过去", "的", "这", "五年"},
388 | []string{"还", "需要", "很长", "的", "路", "要", "走"},
389 | []string{"60", "周年", "首都", "阅兵"},
390 | []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
391 | []string{"买", "水果", "然后", "来", "世博", "博园", "世博园"},
392 | []string{"买", "水果", "然后", "去", "世博", "博园", "世博园"},
393 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"},
394 | []string{"存在", "即", "合理"},
395 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"},
396 | []string{"I", " ", "love", "你", ",", "不以", "以为", "不以为耻", ",", "反", "以为", "rong"},
397 | []string{"因"},
398 | []string{},
399 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
400 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"},
401 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
402 | []string{"什么", "为什么", "我", "不能", "拥有", "想要", "的", "生活"},
403 | []string{"后来", "我", "才"},
404 | []string{"此次", "来", "中国", "是", "为了"},
405 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
406 | []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
407 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
408 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
409 | []string{"因为", "是因为", "和", "国家"},
410 | []string{"老年", "搜索", "还", "支持"},
411 | []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国", "国人", "人大", "常委", "委会", "常委会", "全国人大常委会", "第三", "三次", "第三次", "审议", "侵权", "责任", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中本", "已", "处于", "弱势", "地位", "的", "消费", "消费者", "由此", "将", "陷入", "不复", "万劫不复", "的", "境地", "。", " "},
412 | []string{"大"},
413 | []string{},
414 | []string{"他", "说", "的", "确实", "在理"},
415 | []string{"长春", "市长", "春节", "讲话"},
416 | []string{"结婚", "的", "和", "尚未", "结婚", "的"},
417 | []string{"结合", "成", "分子", "时"},
418 | []string{"旅游", "和", "服务", "是", "最好", "的"},
419 | []string{"这件", "事情", "的确", "是", "我", "的", "错"},
420 | []string{"供", "大家", "参考", "指正"},
421 | []string{"哈尔", "哈尔滨", "政府", "公布", "塌桥", "原因"},
422 | []string{"我", "在", "机场", "入口", "入口处"},
423 | []string{"邢永臣", "摄影", "报道"},
424 | []string{"BP", "神经", "网络", "神经网", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分", "分度", "区分度", "?"},
425 | []string{"南京", "京市", "南京市", "长江", "大桥", "长江大桥"},
426 | []string{"应", "一些", "使用", "用者", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"},
427 | []string{"长春", "长春市", "长春", "药店"},
428 | []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"},
429 | []string{"锦涛", "胡锦涛", "是", "热爱", "世界", "和平", "的", "政治", "政治局", "常委"},
430 | []string{"程序", "程序员", "祝", "海林", "和", "朱会震", "是", "在", "孙健", "的", "左面", "和", "右面", ",", " ", "范凯", "在", "最", "右面", ".", "再往", "左", "是", "李松洪"},
431 | []string{"一次", "一次性", "交", "多少", "钱"},
432 | []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"},
433 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚", "和尚头"},
434 | []string{"我", "是", "中华", "华人", "人民", "共和", "共和国", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和", "共和党", "党员", ";", " ", "地铁", "和平", "和平门", "站"},
435 | []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"},
436 | []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"},
437 | []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3.14159"},
438 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他开", "一辆", "黑色", "的士", "。"},
439 | []string{"枪杆", "杆子", "枪杆子", "中", "出", "政权"},
440 | }
441 |
442 | cutForSearchNoHMMResult = [][]string{[]string{"这", "是", "一个", "伸手", "不见", "五指", "伸手不见五指", "的", "黑夜", "。", "我", "叫", "悟空", "孙悟空", ",", "我", "爱", "北京", ",", "我", "爱", "Python", "和", "C++", "。"},
443 | []string{"我", "不", "喜欢", "日本", "和服", "。"},
444 | []string{"雷猴", "回归", "人间", "。"},
445 | []string{"工信处", "干事", "女干事", "每月", "经过", "下属", "科室", "都", "要", "亲口", "交代", "24", "口", "交换", "换机", "交换机", "等", "技术", "技术性", "器件", "的", "安装", "工作"},
446 | []string{"我", "需要", "廉租", "租房", "廉租房"},
447 | []string{"永和", "服装", "饰品", "有限", "公司", "有限公司"},
448 | []string{"我", "爱", "北京", "天安", "天安门"},
449 | []string{"abc"},
450 | []string{"隐", "可夫", "马尔可", "马尔可夫"},
451 | []string{"雷猴", "是", "个", "好", "网站"},
452 | []string{"“", "Microsoft", "”", "一", "词", "由", "“", "MICROcomputer", "(", "微型", "计算", "算机", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两", "部分", "组成"},
453 | []string{"草泥马", "和", "欺", "实", "马", "是", "今年", "的", "流行", "词汇"},
454 | []string{"伊", "藤", "洋华堂", "总府", "店"},
455 | []string{"中国", "科学", "学院", "计算", "技术", "研究", "科学院", "研究所", "中国科学院计算技术研究所"},
456 | []string{"罗密欧", "与", "朱丽叶"},
457 | []string{"我", "购买", "了", "道具", "和", "服装"},
458 | []string{"PS", ":", " ", "我", "觉得", "开源", "有", "一个", "好处", ",", "就是", "能够", "敦促", "自己", "不断", "改进", "不断改进", ",", "避免", "敞", "帚", "自珍"},
459 | []string{"湖北", "湖北省", "石首", "石首市"},
460 | []string{"湖北", "湖北省", "十堰", "十堰市"},
461 | []string{"经理", "总经理", "完成", "了", "这件", "事情"},
462 | []string{"电脑", "修好", "了"},
463 | []string{"做好", "了", "这件", "事情", "就", "一了百了", "了"},
464 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"},
465 | []string{"我们", "买", "了", "一个", "美的", "空调"},
466 | []string{"线程", "初始", "初始化", "时", "我们", "要", "注意"},
467 | []string{"一个", "分子", "是", "由", "好多", "原子", "组织", "成", "的"},
468 | []string{"祝", "你", "马到功成"},
469 | []string{"他", "掉", "进", "了", "无底", "无底洞", "里"},
470 | []string{"中国", "的", "首都", "是", "北京"},
471 | []string{"孙", "君", "意"},
472 | []string{"外交", "外交部", "发言", "发言人", "马朝旭"},
473 | []string{"领导", "领导人", "会议", "和", "第四", "四届", "第四届", "东亚", "峰会"},
474 | []string{"在", "过去", "的", "这", "五年"},
475 | []string{"还", "需要", "很", "长", "的", "路", "要", "走"},
476 | []string{"60", "周年", "首都", "阅兵"},
477 | []string{"你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
478 | []string{"买", "水果", "然后", "来", "世博", "博园", "世博园"},
479 | []string{"买", "水果", "然后", "去", "世博", "博园", "世博园"},
480 | []string{"但是", "后来", "我", "才", "知道", "你", "是", "对", "的"},
481 | []string{"存在", "即", "合理"},
482 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"},
483 | []string{"I", " ", "love", "你", ",", "不以", "以为", "不以为耻", ",", "反", "以为", "rong"},
484 | []string{"因"},
485 | []string{},
486 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
487 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"},
488 | []string{"hello", "你好", "人们", "审美", "的", "观点", "是", "不同", "的"},
489 | []string{"什么", "为什么", "我", "不能", "拥有", "想要", "的", "生活"},
490 | []string{"后来", "我", "才"},
491 | []string{"此次", "来", "中国", "是", "为了"},
492 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
493 | []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
494 | []string{"其实", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
495 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
496 | []string{"因为", "是因为", "和", "国家"},
497 | []string{"老年", "搜索", "还", "支持"},
498 | []string{"干脆", "就", "把", "那", "部", "蒙", "人", "的", "闲", "法", "给", "废", "了", "拉倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国", "国人", "人大", "常委", "委会", "常委会", "全国人大常委会", "第三", "三次", "第三次", "审议", "侵权", "责任", "责任法", "草案", ",", "删除", "了", "有关", "医疗", "损害", "责任", "“", "举证", "倒置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中", "本", "已", "处于", "弱势", "地位", "的", "消费", "消费者", "由此", "将", "陷入", "不复", "万劫不复", "的", "境地", "。", " "},
499 | []string{"大"},
500 | []string{},
501 | []string{"他", "说", "的", "确实", "在", "理"},
502 | []string{"长春", "市长", "春节", "讲话"},
503 | []string{"结婚", "的", "和", "尚未", "结婚", "的"},
504 | []string{"结合", "成", "分子", "时"},
505 | []string{"旅游", "和", "服务", "是", "最好", "的"},
506 | []string{"这件", "事情", "的确", "是", "我", "的", "错"},
507 | []string{"供", "大家", "参考", "指正"},
508 | []string{"哈尔", "哈尔滨", "政府", "公布", "塌", "桥", "原因"},
509 | []string{"我", "在", "机场", "入口", "入口处"},
510 | []string{"邢", "永", "臣", "摄影", "报道"},
511 | []string{"BP", "神经", "网络", "神经网", "神经网络", "如何", "训练", "才能", "在", "分类", "时", "增加", "区分", "分度", "区分度", "?"},
512 | []string{"南京", "京市", "南京市", "长江", "大桥", "长江大桥"},
513 | []string{"应", "一些", "使用", "用者", "使用者", "的", "建议", ",", "也", "为了", "便于", "利用", "NiuTrans", "用于", "SMT", "研究"},
514 | []string{"长春", "长春市", "长春", "药店"},
515 | []string{"邓颖超", "生前", "最", "喜欢", "的", "衣服"},
516 | []string{"锦涛", "胡锦涛", "是", "热爱", "世界", "和平", "的", "政治", "政治局", "常委"},
517 | []string{"程序", "程序员", "祝", "海林", "和", "朱", "会", "震", "是", "在", "孙", "健", "的", "左面", "和", "右面", ",", " ", "范", "凯", "在", "最", "右面", ".", "再", "往", "左", "是", "李", "松", "洪"},
518 | []string{"一次", "一次性", "交", "多少", "钱"},
519 | []string{"两块", "五", "一套", ",", "三块", "八", "一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"},
520 | []string{"小", "和尚", "留", "了", "一个", "像", "大", "和尚", "一样", "的", "和尚", "和尚头"},
521 | []string{"我", "是", "中华", "华人", "人民", "共和", "共和国", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共和", "共和党", "党员", ";", " ", "地铁", "和平", "和平门", "站"},
522 | []string{"张晓梅", "去", "人民", "医院", "做", "了", "个", "B超", "然后", "去", "买", "了", "件", "T恤"},
523 | []string{"AT&T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"},
524 | []string{"C++", "和", "c#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3", ".", "14159"},
525 | []string{"你", "认识", "那个", "和", "主席", "握手", "的", "的哥", "吗", "?", "他", "开", "一辆", "黑色", "的士", "。"},
526 | []string{"枪杆", "杆子", "枪杆子", "中", "出", "政权"},
527 | }
528 |
529 | userDictCutResult = [][]string{
530 | []string{"这是", "一个", "伸手", "不见", "五指", "的", "黑夜", "。", "我", "叫", "孙悟空", ",", "我", "爱北京", ",", "我", "爱", "Python", "和", "C", "++", "。"},
531 | []string{"我", "不", "喜欢", "日本", "和", "服", "。"},
532 | []string{"雷猴", "回归人间", "。"},
533 | []string{"工信", "处女", "干事", "每", "月", "经过", "下", "属", "科室", "都", "要", "亲口", "交代", "24", "口交换机", "等", "技术性", "器件", "的", "安装", "工作"},
534 | []string{"我", "需要", "廉租房"},
535 | []string{"永和服", "装饰品", "有", "限公司"},
536 | []string{"我", "爱北京", "天安门"},
537 | []string{"abc"},
538 | []string{"隐马尔", "可夫"},
539 | []string{"雷猴", "是", "个", "好", "网站"},
540 | []string{"“", "Microsoft", "”", "一词", "由", "“", "MICROcomputer", "(", "微型", "计算机", ")", "”", "和", "“", "SOFTware", "(", "软件", ")", "”", "两部分", "组成"},
541 | []string{"草泥", "马", "和", "欺实", "马", "是", "今", "年", "的", "流行", "词汇"},
542 | []string{"伊藤洋华堂", "总府", "店"},
543 | []string{"中国", "科学院", "计算", "技术", "研究", "所"},
544 | []string{"罗密欧", "与", "朱丽叶"},
545 | []string{"我购", "买", "了", "道", "具", "和", "服装"},
546 | []string{"PS", ":", " ", "我觉", "得", "开源", "有", "一个", "好", "处", ",", "就", "是", "能够", "敦促", "自己", "不断", "改进", ",", "避免", "敞帚", "自珍"},
547 | []string{"湖北省", "石首市"},
548 | []string{"湖北省", "十堰市"},
549 | []string{"总经理", "完成", "了", "这件", "事情"},
550 | []string{"电脑", "修好", "了"},
551 | []string{"做", "好", "了", "这件", "事情", "就", "一", "了", "百", "了", "了"},
552 | []string{"人们", "审美", "的", "观点", "是", "不同", "的"},
553 | []string{"我们", "买", "了", "一个", "美", "的", "空调"},
554 | []string{"线程", "初始", "化时", "我们", "要", "注意"},
555 | []string{"一个", "分子", "是", "由", "好", "多", "原子", "组织成", "的"},
556 | []string{"祝", "你", "马到", "功成"},
557 | []string{"他", "掉", "进", "了", "无底", "洞里"},
558 | []string{"中国", "的", "首", "都", "是", "北京"},
559 | []string{"孙君意"},
560 | []string{"外交部", "发言人", "马朝旭"},
561 | []string{"领导", "人会议", "和", "第四届", "东亚峰", "会"},
562 | []string{"在", "过", "去", "的", "这五年"},
563 | []string{"还", "需要", "很长", "的", "路", "要", "走"},
564 | []string{"60", "周年首", "都", "阅兵"},
565 | []string{"你", "好人", "们", "审美", "的", "观点", "是", "不同", "的"},
566 | []string{"买水果", "然后", "来", "世博园"},
567 | []string{"买水果", "然后", "去", "世博园"},
568 | []string{"但", "是", "后", "来", "我", "才", "知道", "你", "是", "对", "的"},
569 | []string{"存在", "即", "合理"},
570 | []string{"的", "的", "的", "的", "的", "在", "的", "的", "的", "的", "就", "以", "和", "和", "和"},
571 | []string{"I", " ", "love", "你", ",", "不以", "为耻", ",", "反以", "为", "rong"},
572 | []string{"因"},
573 | []string{},
574 | []string{"hello", "你", "好人", "们", "审美", "的", "观点", "是", "不同", "的"},
575 | []string{"很", "好", "但", "主要", "是", "基于", "网页", "形式"},
576 | []string{"hello", "你", "好人", "们", "审美", "的", "观点", "是", "不同", "的"},
577 | []string{"为", "什么", "我", "不能", "拥有", "想", "要", "的", "生活"},
578 | []string{"后来", "我", "才"},
579 | []string{"此次", "来", "中国", "是", "为", "了"},
580 | []string{"使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
581 | []string{",", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
582 | []string{"其实", "使", "用", "了", "它", "就", "可以", "解决", "一些", "问题"},
583 | []string{"好人", "使用", "了", "它", "就", "可以", "解决", "一些", "问题"},
584 | []string{"是", "因为", "和", "国家"},
585 | []string{"老年", "搜索", "还", "支持"},
586 | []string{"干脆", "就", "把", "那部", "蒙人", "的", "闲法", "给", "废", "了", "拉", "倒", "!", "RT", " ", "@", "laoshipukong", " ", ":", " ", "27", "日", ",", "全国人", "大常委会", "第三次", "审议", "侵权责", "任法", "草案", ",", "删除", "了", "有", "关医疗", "损害", "责任", "“", "举证", "倒", "置", "”", "的", "规定", "。", "在", "医患", "纠纷", "中本", "已", "处于", "弱势", "地位", "的", "消费者", "由", "此", "将", "陷入", "万劫", "不复", "的", "境地", "。", " "},
587 | []string{"大"},
588 | []string{},
589 | []string{"他", "说", "的", "确实", "在", "理"},
590 | []string{"长春市", "长春节", "讲话"},
591 | []string{"结婚", "的", "和", "尚未", "结婚", "的"},
592 | []string{"结合成", "分子", "时"},
593 | []string{"旅游", "和", "服务", "是", "最", "好", "的"},
594 | []string{"这件", "事情", "的", "确是", "我", "的", "错"},
595 | []string{"供大家", "参考", "指正"},
596 | []string{"哈尔滨", "政府", "公布塌桥", "原因"},
597 | []string{"我", "在", "机场", "入口", "处"},
598 | []string{"邢永臣", "摄影", "报道"},
599 | []string{"BP", "神经", "网络", "如何", "训练", "才", "能", "在", "分类", "时", "增加区", "分度", "?"},
600 | []string{"南京市", "长江大桥"},
601 | []string{"应一些", "使", "用者", "的", "建议", ",", "也", "为", "了", "便", "于", "利用", "NiuTrans", "用于", "SMT", "研究"},
602 | []string{"长春市", "长春药店"},
603 | []string{"邓颖", "超生", "前", "最", "喜欢", "的", "衣服"},
604 | []string{"胡锦涛", "是", "热爱世界", "和", "平", "的", "政治局", "常委"},
605 | []string{"程序员", "祝海林", "和", "朱会震", "是", "在", "孙健", "的", "左面", "和", "右面", ",", " ", "范凯", "在", "最右面", ".", "再往", "左", "是", "李松洪"},
606 | []string{"一次性", "交多少", "钱"},
607 | []string{"两块", "五一套", ",", "三块", "八一斤", ",", "四块", "七", "一本", ",", "五块", "六", "一条"},
608 | []string{"小", "和", "尚留", "了", "一个", "像", "大", "和", "尚", "一样", "的", "和", "尚头"},
609 | []string{"我", "是", "中华人民共和国", "公民", ";", "我", "爸爸", "是", "共", "和", "党", "党员", ";", " ", "地铁", "和", "平门", "站"},
610 | []string{"张晓梅", "去", "人民医院", "做", "了", "个", "B", "超然", "后", "去", "买", "了", "件", "T", "恤"},
611 | []string{"AT", "&", "T", "是", "一件", "不错", "的", "公司", ",", "给", "你", "发", "offer", "了", "吗", "?"},
612 | []string{"C", "++", "和", "c", "#", "是", "什么", "关系", "?", "11", "+", "122", "=", "133", ",", "是", "吗", "?", "PI", "=", "3.14159"},
613 | []string{"你", "认识", "那个", "和", "主席握", "手", "的", "的", "哥", "吗", "?", "他开", "一辆", "黑色", "的", "士", "。"},
614 | []string{"枪杆子", "中", "出政权"},
615 | }
616 | )
617 |
618 | func init() {
619 | seg.LoadDictionary("dict.txt")
620 | }
621 |
622 | func chanToArray(ch <-chan string) []string {
623 | var result []string
624 | for word := range ch {
625 | result = append(result, word)
626 | }
627 | return result
628 | }
629 |
630 | func TestCutDAG(t *testing.T) {
631 | result := chanToArray(seg.cutDAG("BP神经网络如何训练才能在分类时增加区分度?"))
632 | if len(result) != 11 {
633 | t.Fatal(result)
634 | }
635 | }
636 |
637 | func TestCutDAGNoHmm(t *testing.T) {
638 | result := chanToArray(seg.cutDAGNoHMM("BP神经网络如何训练才能在分类时增加区分度?"))
639 | if len(result) != 11 {
640 | t.Fatal(result)
641 | }
642 | }
643 |
644 | func TestDefaultCut(t *testing.T) {
645 | var result []string
646 | for index, content := range testContents {
647 | result = chanToArray(seg.Cut(content, true))
648 | if len(result) != len(defaultCutResult[index]) {
649 | t.Errorf("default cut for %s length should be %d not %d\n",
650 | content, len(defaultCutResult[index]), len(result))
651 | t.Errorf("expect: %v\n", defaultCutResult[index])
652 | t.Fatalf("got: %v\n", result)
653 | }
654 | for i, r := range result {
655 | if r != defaultCutResult[index][i] {
656 | t.Fatal(r)
657 | }
658 | }
659 | }
660 | }
661 |
662 | func TestCutAll(t *testing.T) {
663 | var result []string
664 | for index, content := range testContents {
665 | result = chanToArray(seg.CutAll(content))
666 | if len(result) != len(cutAllResult[index]) {
667 | t.Errorf("cut all for %s length should be %d not %d\n",
668 | content, len(cutAllResult[index]), len(result))
669 | 			t.Errorf("expect: %v\n", cutAllResult[index])
670 | t.Fatalf("got: %v\n", result)
671 | }
672 | for i, c := range result {
673 | if c != cutAllResult[index][i] {
674 | t.Fatal(c)
675 | }
676 | }
677 | }
678 | }
679 |
680 | func TestDefaultCutNoHMM(t *testing.T) {
681 | var result []string
682 | for index, content := range testContents {
683 | result = chanToArray(seg.Cut(content, false))
684 | if len(result) != len(defaultCutNoHMMResult[index]) {
685 | t.Fatalf("default cut no hmm for %s length should be %d not %d\n",
686 | content, len(defaultCutNoHMMResult[index]), len(result))
687 | }
688 | for i, c := range result {
689 | if c != defaultCutNoHMMResult[index][i] {
690 | t.Fatal(c)
691 | }
692 | }
693 | }
694 | }
695 |
696 | func TestCutForSearch(t *testing.T) {
697 | var result []string
698 | for index, content := range testContents {
699 | result = chanToArray(seg.CutForSearch(content, true))
700 | if len(result) != len(cutForSearchResult[index]) {
701 | t.Fatalf("cut for search for %s length should be %d not %d\n",
702 | content, len(cutForSearchResult[index]), len(result))
703 | }
704 | for i, c := range result {
705 | if c != cutForSearchResult[index][i] {
706 | t.Fatal(c)
707 | }
708 | }
709 | }
710 | for index, content := range testContents {
711 | result = chanToArray(seg.CutForSearch(content, false))
712 | if len(result) != len(cutForSearchNoHMMResult[index]) {
713 | t.Fatalf("cut for search no hmm for %s length should be %d not %d\n",
714 | content, len(cutForSearchNoHMMResult[index]), len(result))
715 | }
716 | for i, c := range result {
717 | if c != cutForSearchNoHMMResult[index][i] {
718 | t.Fatal(c)
719 | }
720 | }
721 | }
722 | }
723 |
724 | func TestLoadDictionary(t *testing.T) {
725 | var result []string
726 | seg.LoadDictionary("foobar.txt")
727 | for index, content := range testContents {
728 | result = chanToArray(seg.Cut(content, true))
729 | if len(result) != len(userDictCutResult[index]) {
730 | t.Fatalf("default cut with user dictionary for %s length should be %d not %d\n",
731 | content, len(userDictCutResult[index]), len(result))
732 | }
733 | for i, c := range result {
734 | if c != userDictCutResult[index][i] {
735 | t.Fatal(c)
736 | }
737 | }
738 | }
739 | seg.LoadDictionary("dict.txt")
740 | }
741 |
742 | func TestLoadUserDictionary(t *testing.T) {
743 | seg.LoadUserDictionary("userdict.txt")
744 |
745 | sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
746 | result := []string{"李小福", "是", "创新办", "主任", "也", "是", "云计算", "方面", "的", "专家", ";", " ", "什么", "是", "八一双鹿", "例如", "我", "输入", "一个", "带", "“", "韩玉赏鉴", "”", "的", "标题", ",", "在", "自定义词", "库中", "也", "增加", "了", "此", "词为", "N", "类型"}
747 |
748 | words := chanToArray(seg.Cut(sentence, true))
749 | if len(words) != len(result) {
750 | t.Fatal(len(words))
751 | }
752 | for index, word := range words {
753 | if word != result[index] {
754 | t.Fatal(word)
755 | }
756 | }
757 |
758 | sentence = "easy_install is great"
759 | result = []string{"easy_install", " ", "is", " ", "great"}
760 | words = chanToArray(seg.Cut(sentence, true))
761 | if len(words) != len(result) {
762 | t.Fatal(len(words))
763 | }
764 | for index, word := range words {
765 | if word != result[index] {
766 | t.Fatal(word)
767 | }
768 | }
769 |
770 | sentence = "python 的正则表达式是好用的"
771 | result = []string{"python", " ", "的", "正则表达式", "是", "好用", "的"}
772 | words = chanToArray(seg.Cut(sentence, true))
773 | if len(words) != len(result) {
774 | 		t.Errorf("expect: %v\n", result)
775 | 		t.Fatalf("got: %v\n", words)
776 | }
777 | for index, word := range words {
778 | if word != result[index] {
779 | t.Fatal(word)
780 | }
781 | }
782 | seg.LoadDictionary("dict.txt")
783 | }
784 |
785 | func BenchmarkCutNoHMM(b *testing.B) {
786 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
787 | b.ResetTimer()
788 | for i := 0; i < b.N; i++ {
789 | chanToArray(seg.Cut(sentence, false))
790 | }
791 | }
792 |
793 | func BenchmarkCut(b *testing.B) {
794 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
795 | b.ResetTimer()
796 | for i := 0; i < b.N; i++ {
797 | chanToArray(seg.Cut(sentence, true))
798 | }
799 | }
800 |
801 | func BenchmarkCutAll(b *testing.B) {
802 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
803 | b.ResetTimer()
804 | for i := 0; i < b.N; i++ {
805 | chanToArray(seg.CutAll(sentence))
806 | }
807 | }
808 |
809 | func BenchmarkCutForSearchNoHMM(b *testing.B) {
810 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
811 | b.ResetTimer()
812 | for i := 0; i < b.N; i++ {
813 | chanToArray(seg.CutForSearch(sentence, false))
814 | }
815 | }
816 |
817 | func BenchmarkCutForSearch(b *testing.B) {
818 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
819 | b.ResetTimer()
820 | for i := 0; i < b.N; i++ {
821 | chanToArray(seg.CutForSearch(sentence, true))
822 | }
823 | }
824 |
--------------------------------------------------------------------------------
/posseg/char_state_tab_test.go:
--------------------------------------------------------------------------------
1 | package posseg
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestGet(t *testing.T) {
8 | result := charStateTab.get('\u8000')
9 | if len(result) != 17 {
10 | t.FailNow()
11 | }
12 | result = charStateTab.get('\uaaaa')
13 | if len(result) == 17 {
14 | t.FailNow()
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/posseg/dictionary.go:
--------------------------------------------------------------------------------
1 | package posseg
2 |
3 | import (
4 | "math"
5 | "sync"
6 |
7 | "github.com/wangbin/jiebago/dictionary"
8 | )
9 |
10 | // A Dictionary represents a thread-safe dictionary used for word segmentation.
11 | type Dictionary struct {
12 | total, logTotal float64
13 | freqMap map[string]float64
14 | posMap map[string]string
15 | sync.RWMutex
16 | }
17 |
18 | // Load loads all tokens from the given channel.
19 | func (d *Dictionary) Load(ch <-chan dictionary.Token) {
20 | d.Lock()
21 | for token := range ch {
22 | d.addToken(token)
23 | }
24 | d.Unlock()
25 | d.updateLogTotal()
26 | }
27 |
28 | // AddToken adds one token
29 | func (d *Dictionary) AddToken(token dictionary.Token) {
30 | d.Lock()
31 | d.addToken(token)
32 | d.Unlock()
33 | d.updateLogTotal()
34 | }
35 |
36 | func (d *Dictionary) addToken(token dictionary.Token) {
37 | d.freqMap[token.Text()] = token.Frequency()
38 | d.total += token.Frequency()
39 | runes := []rune(token.Text())
40 | n := len(runes)
41 | for i := 0; i < n; i++ {
42 | frag := string(runes[:i+1])
43 | if _, ok := d.freqMap[frag]; !ok {
44 | d.freqMap[frag] = 0.0
45 | }
46 | }
47 | if len(token.Pos()) > 0 {
48 | d.posMap[token.Text()] = token.Pos()
49 | }
50 | }
51 |
52 | func (d *Dictionary) updateLogTotal() {
53 | d.logTotal = math.Log(d.total)
54 | }
55 |
56 | // Frequency returns the frequency and existence of the given word.
57 | func (d *Dictionary) Frequency(key string) (float64, bool) {
58 | d.RLock()
59 | freq, ok := d.freqMap[key]
60 | d.RUnlock()
61 | return freq, ok
62 | }
63 |
64 | // Pos returns the POS and existence of the given word.
65 | func (d *Dictionary) Pos(key string) (string, bool) {
66 | d.RLock()
67 | pos, ok := d.posMap[key]
68 | d.RUnlock()
69 | return pos, ok
70 | }
71 |
72 | func (d *Dictionary) loadDictionary(fileName string) error {
73 | return dictionary.LoadDictionary(d, fileName)
74 | }
75 |
--------------------------------------------------------------------------------
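
A note on addToken above: besides the word itself, every rune prefix of the word is recorded in freqMap with frequency 0.0. That is what lets the DAG builder in posseg.go tell "prefix exists, keep extending" apart from "no such prefix, stop scanning". A sketch with a hand-built stand-in map (values invented for illustration, not taken from dict.txt):

```
package main

import "fmt"

// A stand-in for Dictionary.freqMap after loading the single token
// "天安门" with frequency 3: addToken records the word itself with its
// frequency and each bare rune prefix with 0.0.
var freqMap = map[string]float64{
	"天":   0.0, // prefix only
	"天安":  0.0, // prefix only
	"天安门": 3.0, // actual word
}

func main() {
	for _, key := range []string{"天安门", "天安", "北"} {
		freq, ok := freqMap[key]
		switch {
		case ok && freq > 0.0:
			fmt.Println(key, "is a word") // dag() adds an edge here
		case ok:
			fmt.Println(key, "is only a prefix; keep extending")
		default:
			fmt.Println(key, "is not a known prefix; stop scanning")
		}
	}
}
```
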
/posseg/example_test.go:
--------------------------------------------------------------------------------
1 | package posseg_test
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/wangbin/jiebago/posseg"
7 | )
8 |
9 | func Example() {
10 | var seg posseg.Segmenter
11 | seg.LoadDictionary("../dict.txt")
12 |
13 | for segment := range seg.Cut("我爱北京天安门", true) {
14 | fmt.Printf("%s %s\n", segment.Text(), segment.Pos())
15 | }
16 | // Output:
17 | // 我 r
18 | // 爱 v
19 | // 北京 ns
20 | // 天安门 ns
21 | }
22 |
--------------------------------------------------------------------------------
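
The example above prints every segment with its POS tag. As a further usage sketch (not part of the package's test suite), the same channel can be filtered by tag, for instance to keep only noun-like segments; the `../dict.txt` path is assumed as in the example:

```
package main

import (
	"fmt"
	"strings"

	"github.com/wangbin/jiebago/posseg"
)

func main() {
	var seg posseg.Segmenter
	seg.LoadDictionary("../dict.txt") // assumes the repo's dict.txt location

	// Keep only segments whose POS tag starts with "n" (nouns, place
	// names "ns", organization names "nt", ...).
	for segment := range seg.Cut("我爱北京天安门", true) {
		if strings.HasPrefix(segment.Pos(), "n") {
			fmt.Println(segment.Text())
		}
	}
	// Given the example output above, this would print 北京 and 天安门.
}
```
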
/posseg/posseg.go:
--------------------------------------------------------------------------------
1 | // Package posseg is the Golang implementation of Jieba's posseg module.
2 | package posseg
3 |
4 | import (
5 | "math"
6 | "regexp"
7 |
8 | "github.com/wangbin/jiebago/util"
9 | )
10 |
11 | var (
12 | reHanDetail = regexp.MustCompile(`(\p{Han}+)`)
13 | 	reSkipDetail = regexp.MustCompile(`([\.[:digit:]]+|[[:alnum:]]+)`)
14 | reEng = regexp.MustCompile(`[[:alnum:]]`)
15 | reNum = regexp.MustCompile(`[\.[:digit:]]+`)
16 | reEng1 = regexp.MustCompile(`[[:alnum:]]$`)
17 | reHanInternal = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
18 | reSkipInternal = regexp.MustCompile(`(\r\n|\s)`)
19 | )
20 |
21 | // Segment represents a word with its POS.
22 | type Segment struct {
23 | text, pos string
24 | }
25 |
26 | // Text returns the Segment's text.
27 | func (s Segment) Text() string {
28 | return s.text
29 | }
30 |
31 | // Pos returns the Segment's POS.
32 | func (s Segment) Pos() string {
33 | return s.pos
34 | }
35 |
36 | // Segmenter is a Chinese word segmentation struct.
37 | type Segmenter struct {
38 | dict *Dictionary
39 | }
40 |
41 | // LoadDictionary loads a dictionary from the given file name.
42 | // Every time LoadDictionary is called, the previously loaded dictionary will be cleared.
43 | func (seg *Segmenter) LoadDictionary(fileName string) error {
44 | seg.dict = &Dictionary{freqMap: make(map[string]float64), posMap: make(map[string]string)}
45 | return seg.dict.loadDictionary(fileName)
46 | }
47 |
48 | // LoadUserDictionary loads a user-specified dictionary. It must be called
49 | // after LoadDictionary; it does not clear any previously loaded dictionary,
50 | // but overrides existing entries instead.
51 | func (seg *Segmenter) LoadUserDictionary(fileName string) error {
52 | return seg.dict.loadDictionary(fileName)
53 | }
54 |
55 | func (seg *Segmenter) cutDetailInternal(sentence string) <-chan Segment {
56 | result := make(chan Segment)
57 |
58 | go func() {
59 | runes := []rune(sentence)
60 | posList := viterbi(runes)
61 | begin := 0
62 | next := 0
63 | for i, char := range runes {
64 | pos := posList[i]
65 | switch pos.position() {
66 | case "B":
67 | begin = i
68 | case "E":
69 | result <- Segment{string(runes[begin : i+1]), pos.pos()}
70 | next = i + 1
71 | case "S":
72 | result <- Segment{string(char), pos.pos()}
73 | next = i + 1
74 | }
75 | }
76 | if next < len(runes) {
77 | result <- Segment{string(runes[next:]), posList[next].pos()}
78 | }
79 | close(result)
80 | }()
81 | return result
82 | }
83 |
84 | func (seg *Segmenter) cutDetail(sentence string) <-chan Segment {
85 | result := make(chan Segment)
86 | go func() {
87 | for _, blk := range util.RegexpSplit(reHanDetail, sentence, -1) {
88 | if reHanDetail.MatchString(blk) {
89 | for segment := range seg.cutDetailInternal(blk) {
90 | result <- segment
91 | }
92 | continue
93 | }
94 | for _, x := range util.RegexpSplit(reSkipDetail, blk, -1) {
95 | if len(x) == 0 {
96 | continue
97 | }
98 | switch {
99 | case reNum.MatchString(x):
100 | result <- Segment{x, "m"}
101 | case reEng.MatchString(x):
102 | result <- Segment{x, "eng"}
103 | default:
104 | result <- Segment{x, "x"}
105 | }
106 | }
107 | }
108 | close(result)
109 | }()
110 | return result
111 | }
112 |
113 | func (seg *Segmenter) dag(runes []rune) map[int][]int {
114 | dag := make(map[int][]int)
115 | n := len(runes)
116 | var frag []rune
117 | var i int
118 | for k := 0; k < n; k++ {
119 | dag[k] = make([]int, 0)
120 | i = k
121 | frag = runes[k : k+1]
122 | for {
123 | freq, ok := seg.dict.Frequency(string(frag))
124 | if !ok {
125 | break
126 | }
127 | if freq > 0.0 {
128 | dag[k] = append(dag[k], i)
129 | }
130 | i++
131 | if i >= n {
132 | break
133 | }
134 | frag = runes[k : i+1]
135 | }
136 | if len(dag[k]) == 0 {
137 | dag[k] = append(dag[k], k)
138 | }
139 | }
140 | return dag
141 | }
142 |
143 | type route struct {
144 | frequency float64
145 | index int
146 | }
147 |
148 | func (seg *Segmenter) calc(runes []rune) map[int]route {
149 | dag := seg.dag(runes)
150 | n := len(runes)
151 | rs := make(map[int]route)
152 | rs[n] = route{frequency: 0.0, index: 0}
153 | var r route
154 | for idx := n - 1; idx >= 0; idx-- {
155 | for _, i := range dag[idx] {
156 | if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
157 | r = route{frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency, index: i}
158 | } else {
159 | r = route{frequency: math.Log(1.0) - seg.dict.logTotal + rs[i+1].frequency, index: i}
160 | }
161 | if v, ok := rs[idx]; !ok {
162 | rs[idx] = r
163 | } else {
164 | if v.frequency < r.frequency || (v.frequency == r.frequency && v.index < r.index) {
165 | rs[idx] = r
166 | }
167 | }
168 | }
169 | }
170 | return rs
171 | }
172 |
173 | type cutFunc func(sentence string) <-chan Segment
174 |
175 | func (seg *Segmenter) cutDAG(sentence string) <-chan Segment {
176 | result := make(chan Segment)
177 |
178 | go func() {
179 | runes := []rune(sentence)
180 | routes := seg.calc(runes)
181 | var y int
182 | length := len(runes)
183 | var buf []rune
184 | for x := 0; x < length; {
185 | y = routes[x].index + 1
186 | frag := runes[x:y]
187 | if y-x == 1 {
188 | buf = append(buf, frag...)
189 | x = y
190 | continue
191 | }
192 | if len(buf) > 0 {
193 | bufString := string(buf)
194 | if len(buf) == 1 {
195 | if tag, ok := seg.dict.Pos(bufString); ok {
196 | result <- Segment{bufString, tag}
197 | } else {
198 | result <- Segment{bufString, "x"}
199 | }
200 | buf = make([]rune, 0)
201 | continue
202 | }
203 | if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
204 | for t := range seg.cutDetail(bufString) {
205 | result <- t
206 | }
207 | } else {
208 | for _, elem := range buf {
209 | selem := string(elem)
210 | if tag, ok := seg.dict.Pos(selem); ok {
211 | result <- Segment{selem, tag}
212 | } else {
213 | result <- Segment{selem, "x"}
214 | }
215 |
216 | }
217 | }
218 | buf = make([]rune, 0)
219 | }
220 | word := string(frag)
221 | if tag, ok := seg.dict.Pos(word); ok {
222 | result <- Segment{word, tag}
223 | } else {
224 | result <- Segment{word, "x"}
225 | }
226 | x = y
227 | }
228 |
229 | if len(buf) > 0 {
230 | bufString := string(buf)
231 | if len(buf) == 1 {
232 | if tag, ok := seg.dict.Pos(bufString); ok {
233 | result <- Segment{bufString, tag}
234 | } else {
235 | result <- Segment{bufString, "x"}
236 | }
237 | } else {
238 | if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
239 | for t := range seg.cutDetail(bufString) {
240 | result <- t
241 | }
242 | } else {
243 | for _, elem := range buf {
244 | selem := string(elem)
245 | if tag, ok := seg.dict.Pos(selem); ok {
246 | result <- Segment{selem, tag}
247 | } else {
248 | result <- Segment{selem, "x"}
249 | }
250 | }
251 | }
252 | }
253 | }
254 | close(result)
255 | }()
256 | return result
257 | }
258 |
259 | func (seg *Segmenter) cutDAGNoHMM(sentence string) <-chan Segment {
260 | result := make(chan Segment)
261 |
262 | go func() {
263 | runes := []rune(sentence)
264 | routes := seg.calc(runes)
265 | var y int
266 | length := len(runes)
267 | var buf []rune
268 | for x := 0; x < length; {
269 | y = routes[x].index + 1
270 | frag := runes[x:y]
271 | if reEng1.MatchString(string(frag)) && len(frag) == 1 {
272 | buf = append(buf, frag...)
273 | x = y
274 | continue
275 | }
276 | if len(buf) > 0 {
277 | result <- Segment{string(buf), "eng"}
278 | buf = make([]rune, 0)
279 | }
280 | word := string(frag)
281 | if tag, ok := seg.dict.Pos(word); ok {
282 | result <- Segment{word, tag}
283 | } else {
284 | result <- Segment{word, "x"}
285 | }
286 | x = y
287 |
288 | }
289 | if len(buf) > 0 {
290 | result <- Segment{string(buf), "eng"}
291 | buf = make([]rune, 0)
292 | }
293 | close(result)
294 | }()
295 | return result
296 | }
297 |
298 | // Cut cuts a sentence into words.
299 | // Parameter hmm controls whether to use the Hidden Markov Model.
300 | func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan Segment {
301 | result := make(chan Segment)
302 | var cut cutFunc
303 | if hmm {
304 | cut = seg.cutDAG
305 | } else {
306 | cut = seg.cutDAGNoHMM
307 | }
308 | go func() {
309 | for _, blk := range util.RegexpSplit(reHanInternal, sentence, -1) {
310 | if reHanInternal.MatchString(blk) {
311 | for wordTag := range cut(blk) {
312 | result <- wordTag
313 | }
314 | continue
315 | }
316 | for _, x := range util.RegexpSplit(reSkipInternal, blk, -1) {
317 | if reSkipInternal.MatchString(x) {
318 | result <- Segment{x, "x"}
319 | continue
320 | }
321 | for _, xx := range x {
322 | s := string(xx)
323 | switch {
324 | case reNum.MatchString(s):
325 | result <- Segment{s, "m"}
326 | case reEng.MatchString(x):
327 | 						result <- Segment{s, "eng"} // tag the single rune, as upstream jieba does
328 | default:
329 | result <- Segment{s, "x"}
330 | }
331 | }
332 | }
333 | }
334 | close(result)
335 | }()
336 | return result
337 | }
338 |
--------------------------------------------------------------------------------
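
For reference, the dynamic program in calc above fills the route map from right to left using the recurrence route(idx) = max over i in DAG(idx) of [log freq(runes[idx:i+1]) - log(total) + route(i+1)]. A self-contained sketch with invented frequencies (buildDAG is a brute-force stand-in for Segmenter.dag and the numbers are purely illustrative):

```
package main

import (
	"fmt"
	"math"
)

// A toy dictionary; frequencies are made up for illustration only.
var freq = map[string]float64{
	"市": 2, "南京": 8, "南京市": 6, "市长": 5, "长江": 9, "大桥": 9, "长江大桥": 7,
	"长": 1, "江": 1, "大": 1, "桥": 1, "南": 1, "京": 1,
}

// buildDAG lists, for each start index k, every index i such that
// runes[k:i+1] is a dictionary word — the same edge set Segmenter.dag
// derives via prefix lookups.
func buildDAG(runes []rune) map[int][]int {
	dag := make(map[int][]int)
	for k := range runes {
		for i := k; i < len(runes); i++ {
			if f, ok := freq[string(runes[k:i+1])]; ok && f > 0 {
				dag[k] = append(dag[k], i)
			}
		}
		if len(dag[k]) == 0 {
			dag[k] = []int{k} // fall back to the single rune
		}
	}
	return dag
}

func main() {
	runes := []rune("南京市长江大桥")
	dag := buildDAG(runes)
	total := 0.0
	for _, f := range freq {
		total += f
	}
	logTotal := math.Log(total)

	// calc's recurrence, filled right to left; best[n] = 0 for the
	// empty suffix, end[idx] remembers the chosen word boundary.
	n := len(runes)
	best := make([]float64, n+1)
	end := make([]int, n)
	for idx := n - 1; idx >= 0; idx-- {
		best[idx] = math.Inf(-1)
		for _, i := range dag[idx] {
			f := freq[string(runes[idx:i+1])]
			if f == 0 {
				f = 1 // calc uses log(1.0) for unknown fragments
			}
			if s := math.Log(f) - logTotal + best[i+1]; s > best[idx] {
				best[idx], end[idx] = s, i
			}
		}
	}
	for x := 0; x < n; x = end[x] + 1 {
		fmt.Println(string(runes[x : end[x]+1])) // 南京市 / 长江大桥
	}
}
```
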
/posseg/posseg_test.go:
--------------------------------------------------------------------------------
1 | package posseg
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | var (
8 | seg Segmenter
9 | testContents = []string{
10 | "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
11 | "我不喜欢日本和服。",
12 | "雷猴回归人间。",
13 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
14 | "我需要廉租房",
15 | "永和服装饰品有限公司",
16 | "我爱北京天安门",
17 | "abc",
18 | "隐马尔可夫",
19 | "雷猴是个好网站",
20 | "“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成",
21 | "草泥马和欺实马是今年的流行词汇",
22 | "伊藤洋华堂总府店",
23 | "中国科学院计算技术研究所",
24 | "罗密欧与朱丽叶",
25 | "我购买了道具和服装",
26 | "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍",
27 | "湖北省石首市",
28 | "湖北省十堰市",
29 | "总经理完成了这件事情",
30 | "电脑修好了",
31 | "做好了这件事情就一了百了了",
32 | "人们审美的观点是不同的",
33 | "我们买了一个美的空调",
34 | "线程初始化时我们要注意",
35 | "一个分子是由好多原子组织成的",
36 | "祝你马到功成",
37 | "他掉进了无底洞里",
38 | "中国的首都是北京",
39 | "孙君意",
40 | "外交部发言人马朝旭",
41 | "领导人会议和第四届东亚峰会",
42 | "在过去的这五年",
43 | "还需要很长的路要走",
44 | "60周年首都阅兵",
45 | "你好人们审美的观点是不同的",
46 | "买水果然后来世博园",
47 | "买水果然后去世博园",
48 | "但是后来我才知道你是对的",
49 | "存在即合理",
50 | "的的的的的在的的的的就以和和和",
51 | "I love你,不以为耻,反以为rong",
52 | "因",
53 | "",
54 | "hello你好人们审美的观点是不同的",
55 | "很好但主要是基于网页形式",
56 | "hello你好人们审美的观点是不同的",
57 | "为什么我不能拥有想要的生活",
58 | "后来我才",
59 | "此次来中国是为了",
60 | "使用了它就可以解决一些问题",
61 | ",使用了它就可以解决一些问题",
62 | "其实使用了它就可以解决一些问题",
63 | "好人使用了它就可以解决一些问题",
64 | "是因为和国家",
65 | "老年搜索还支持",
66 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ",
67 | "大",
68 | "",
69 | "他说的确实在理",
70 | "长春市长春节讲话",
71 | "结婚的和尚未结婚的",
72 | "结合成分子时",
73 | "旅游和服务是最好的",
74 | "这件事情的确是我的错",
75 | "供大家参考指正",
76 | "哈尔滨政府公布塌桥原因",
77 | "我在机场入口处",
78 | "邢永臣摄影报道",
79 | "BP神经网络如何训练才能在分类时增加区分度?",
80 | "南京市长江大桥",
81 | "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究",
82 | "长春市长春药店",
83 | "邓颖超生前最喜欢的衣服",
84 | "胡锦涛是热爱世界和平的政治局常委",
85 | "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪",
86 | "一次性交多少钱",
87 | "两块五一套,三块八一斤,四块七一本,五块六一条",
88 | "小和尚留了一个像大和尚一样的和尚头",
89 | "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站",
90 | "张晓梅去人民医院做了个B超然后去买了件T恤",
91 | "AT&T是一件不错的公司,给你发offer了吗?",
92 | "C++和c#是什么关系?11+122=133,是吗?PI=3.14159",
93 | "你认识那个和主席握手的的哥吗?他开一辆黑色的士。",
94 | "枪杆子中出政权"}
95 |
96 | defaultCutResult = [][]Segment{[]Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}},
97 | []Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}},
98 | []Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}},
99 | []Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "m"}, Segment{"口", "n"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}},
100 | []Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}},
101 | []Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}},
102 | []Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}},
103 | []Segment{Segment{"abc", "eng"}},
104 | []Segment{Segment{"隐", "n"}, Segment{"马尔可夫", "nr"}},
105 | []Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}},
106 | []Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"(", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, Segment{"(", "x"}, Segment{"软件", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}},
107 | []Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺实", "v"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}},
108 | []Segment{Segment{"伊藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}},
109 | []Segment{Segment{"中国科学院计算技术研究所", "nt"}},
110 | []Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}},
111 | []Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}},
112 | []Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{",", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{",", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}},
113 | []Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}},
114 | []Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}},
115 | []Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}},
116 | []Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}},
117 | []Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", "d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}},
118 | []Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
119 | []Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}},
120 | []Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", "r"}, Segment{"要", "v"}, Segment{"注意", "v"}},
121 | []Segment{Segment{"一个", "m"}, Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "v"}, Segment{"的", "uj"}},
122 | []Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}},
123 | []Segment{Segment{"他", "r"}, Segment{"掉", "v"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}},
124 | []Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}},
125 | []Segment{Segment{"孙君意", "nr"}},
126 | []Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}},
127 | []Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}},
128 | []Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}},
129 | []Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "d"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}},
130 | []Segment{Segment{"60", "m"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}},
131 | []Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
132 | []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}},
133 | []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"世博园", "nr"}},
134 | []Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}},
135 | []Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}},
136 | []Segment{Segment{"的的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"在的", "u"}, Segment{"的的", "u"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和和", "nz"}, Segment{"和", "c"}},
137 | []Segment{Segment{"I", "x"}, Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{",", "x"}, Segment{"不以为耻", "i"}, Segment{",", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}},
138 | []Segment{Segment{"因", "p"}},
139 | []Segment{},
140 | []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
141 | []Segment{Segment{"很好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}},
142 | []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
143 | []Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, Segment{"生活", "vn"}},
144 | []Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}},
145 | []Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}},
146 | []Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
147 | []Segment{Segment{",", "x"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
148 | []Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
149 | []Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
150 | []Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}},
151 | []Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", "d"}, Segment{"支持", "v"}},
152 | []Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那部", "r"}, Segment{"蒙人", "n"}, Segment{"的", "uj"}, Segment{"闲法", "n"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"!", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "m"}, Segment{"日", "m"}, Segment{",", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{",", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中本", "ns"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}},
153 | []Segment{Segment{"大", "a"}},
154 | []Segment{},
155 | []Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}},
156 | []Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}},
157 | []Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}},
158 | []Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}},
159 | []Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}},
160 | []Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", "uj"}, Segment{"错", "n"}},
161 | []Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}},
162 | []Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}},
163 | []Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", "n"}, Segment{"入口处", "i"}},
164 | []Segment{Segment{"邢永臣", "nr"}, Segment{"摄影", "n"}, Segment{"报道", "v"}},
165 | []Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"?", "x"}},
166 | []Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}},
167 | []Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{",", "x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}},
168 | []Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}},
169 | []Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, Segment{"衣服", "n"}},
170 | []Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}},
171 | []Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱会震", "nr"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙健", "nr"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范凯", "nr"}, Segment{"在", "p"}, Segment{"最", "a"}, Segment{"右面", "f"}, Segment{".", "m"}, Segment{"再往", "d"}, Segment{"左", "f"}, Segment{"是", "v"}, Segment{"李松洪", "nr"}},
172 | []Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}},
173 | []Segment{Segment{"两块", "m"}, Segment{"五", "m"}, Segment{"一套", "m"}, Segment{",", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{",", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{",", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}},
174 | []Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}},
175 | []Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}},
176 | []Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"件", "q"}, Segment{"T恤", "n"}},
177 | []Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{",", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"?", "x"}},
178 | []Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"?", "x"}, Segment{"11", "m"}, Segment{"+", "x"}, Segment{"122", "m"}, Segment{"=", "x"}, Segment{"133", "m"}, Segment{",", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3.14159", "m"}},
179 | []Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}},
180 | []Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}},
181 | }
182 | noHMMCutResult = [][]Segment{
183 | []Segment{Segment{"这", "r"}, Segment{"是", "v"}, Segment{"一个", "m"}, Segment{"伸手不见五指", "i"}, Segment{"的", "uj"}, Segment{"黑夜", "n"}, Segment{"。", "x"}, Segment{"我", "r"}, Segment{"叫", "v"}, Segment{"孙悟空", "nr"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{",", "x"}, Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"Python", "eng"}, Segment{"和", "c"}, Segment{"C++", "nz"}, Segment{"。", "x"}},
184 | []Segment{Segment{"我", "r"}, Segment{"不", "d"}, Segment{"喜欢", "v"}, Segment{"日本", "ns"}, Segment{"和服", "nz"}, Segment{"。", "x"}},
185 | []Segment{Segment{"雷猴", "n"}, Segment{"回归", "v"}, Segment{"人间", "n"}, Segment{"。", "x"}},
186 | []Segment{Segment{"工信处", "n"}, Segment{"女干事", "n"}, Segment{"每月", "r"}, Segment{"经过", "p"}, Segment{"下属", "v"}, Segment{"科室", "n"}, Segment{"都", "d"}, Segment{"要", "v"}, Segment{"亲口", "n"}, Segment{"交代", "n"}, Segment{"24", "eng"}, Segment{"口", "q"}, Segment{"交换机", "n"}, Segment{"等", "u"}, Segment{"技术性", "n"}, Segment{"器件", "n"}, Segment{"的", "uj"}, Segment{"安装", "v"}, Segment{"工作", "vn"}},
187 | []Segment{Segment{"我", "r"}, Segment{"需要", "v"}, Segment{"廉租房", "n"}},
188 | []Segment{Segment{"永和", "nz"}, Segment{"服装", "vn"}, Segment{"饰品", "n"}, Segment{"有限公司", "n"}},
189 | []Segment{Segment{"我", "r"}, Segment{"爱", "v"}, Segment{"北京", "ns"}, Segment{"天安门", "ns"}},
190 | []Segment{Segment{"abc", "eng"}},
191 | []Segment{Segment{"隐", "n"}, Segment{"马尔可夫", "nr"}},
192 | []Segment{Segment{"雷猴", "n"}, Segment{"是", "v"}, Segment{"个", "q"}, Segment{"好", "a"}, Segment{"网站", "n"}},
193 | []Segment{Segment{"“", "x"}, Segment{"Microsoft", "eng"}, Segment{"”", "x"}, Segment{"一", "m"}, Segment{"词", "n"}, Segment{"由", "p"}, Segment{"“", "x"}, Segment{"MICROcomputer", "eng"}, Segment{"(", "x"}, Segment{"微型", "b"}, Segment{"计算机", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"和", "c"}, Segment{"“", "x"}, Segment{"SOFTware", "eng"}, Segment{"(", "x"}, Segment{"软件", "n"}, Segment{")", "x"}, Segment{"”", "x"}, Segment{"两", "m"}, Segment{"部分", "n"}, Segment{"组成", "v"}},
194 | []Segment{Segment{"草泥马", "n"}, Segment{"和", "c"}, Segment{"欺", "vn"}, Segment{"实", "n"}, Segment{"马", "n"}, Segment{"是", "v"}, Segment{"今年", "t"}, Segment{"的", "uj"}, Segment{"流行", "v"}, Segment{"词汇", "n"}},
195 | []Segment{Segment{"伊", "ns"}, Segment{"藤", "nr"}, Segment{"洋华堂", "n"}, Segment{"总府", "n"}, Segment{"店", "n"}},
196 | []Segment{Segment{"中国科学院计算技术研究所", "nt"}},
197 | []Segment{Segment{"罗密欧", "nr"}, Segment{"与", "p"}, Segment{"朱丽叶", "nr"}},
198 | []Segment{Segment{"我", "r"}, Segment{"购买", "v"}, Segment{"了", "ul"}, Segment{"道具", "n"}, Segment{"和", "c"}, Segment{"服装", "vn"}},
199 | []Segment{Segment{"PS", "eng"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"我", "r"}, Segment{"觉得", "v"}, Segment{"开源", "n"}, Segment{"有", "v"}, Segment{"一个", "m"}, Segment{"好处", "d"}, Segment{",", "x"}, Segment{"就是", "d"}, Segment{"能够", "v"}, Segment{"敦促", "v"}, Segment{"自己", "r"}, Segment{"不断改进", "l"}, Segment{",", "x"}, Segment{"避免", "v"}, Segment{"敞", "v"}, Segment{"帚", "ng"}, Segment{"自珍", "b"}},
200 | []Segment{Segment{"湖北省", "ns"}, Segment{"石首市", "ns"}},
201 | []Segment{Segment{"湖北省", "ns"}, Segment{"十堰市", "ns"}},
202 | []Segment{Segment{"总经理", "n"}, Segment{"完成", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}},
203 | []Segment{Segment{"电脑", "n"}, Segment{"修好", "v"}, Segment{"了", "ul"}},
204 | []Segment{Segment{"做好", "v"}, Segment{"了", "ul"}, Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"就", "d"}, Segment{"一了百了", "l"}, Segment{"了", "ul"}},
205 | []Segment{Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
206 | []Segment{Segment{"我们", "r"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"美的", "nr"}, Segment{"空调", "n"}},
207 | []Segment{Segment{"线程", "n"}, Segment{"初始化", "l"}, Segment{"时", "n"}, Segment{"我们", "r"}, Segment{"要", "v"}, Segment{"注意", "v"}},
208 | []Segment{Segment{"一个", "m"}, Segment{"分子", "n"}, Segment{"是", "v"}, Segment{"由", "p"}, Segment{"好多", "m"}, Segment{"原子", "n"}, Segment{"组织", "v"}, Segment{"成", "n"}, Segment{"的", "uj"}},
209 | []Segment{Segment{"祝", "v"}, Segment{"你", "r"}, Segment{"马到功成", "i"}},
210 | []Segment{Segment{"他", "r"}, Segment{"掉", "zg"}, Segment{"进", "v"}, Segment{"了", "ul"}, Segment{"无底洞", "ns"}, Segment{"里", "f"}},
211 | []Segment{Segment{"中国", "ns"}, Segment{"的", "uj"}, Segment{"首都", "d"}, Segment{"是", "v"}, Segment{"北京", "ns"}},
212 | []Segment{Segment{"孙", "zg"}, Segment{"君", "nz"}, Segment{"意", "n"}},
213 | []Segment{Segment{"外交部", "nt"}, Segment{"发言人", "l"}, Segment{"马朝旭", "nr"}},
214 | []Segment{Segment{"领导人", "n"}, Segment{"会议", "n"}, Segment{"和", "c"}, Segment{"第四届", "m"}, Segment{"东亚", "ns"}, Segment{"峰会", "n"}},
215 | []Segment{Segment{"在", "p"}, Segment{"过去", "t"}, Segment{"的", "uj"}, Segment{"这", "r"}, Segment{"五年", "t"}},
216 | []Segment{Segment{"还", "d"}, Segment{"需要", "v"}, Segment{"很", "zg"}, Segment{"长", "a"}, Segment{"的", "uj"}, Segment{"路", "n"}, Segment{"要", "v"}, Segment{"走", "v"}},
217 | []Segment{Segment{"60", "eng"}, Segment{"周年", "t"}, Segment{"首都", "d"}, Segment{"阅兵", "v"}},
218 | []Segment{Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
219 | []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"来", "v"}, Segment{"世博园", "nr"}},
220 | []Segment{Segment{"买", "v"}, Segment{"水果", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"世博园", "nr"}},
221 | []Segment{Segment{"但是", "c"}, Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}, Segment{"知道", "v"}, Segment{"你", "r"}, Segment{"是", "v"}, Segment{"对", "p"}, Segment{"的", "uj"}},
222 | []Segment{Segment{"存在", "v"}, Segment{"即", "v"}, Segment{"合理", "vn"}},
223 | []Segment{Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"在", "p"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"的", "uj"}, Segment{"就", "d"}, Segment{"以", "p"}, Segment{"和", "c"}, Segment{"和", "c"}, Segment{"和", "c"}},
224 | []Segment{Segment{"I", "eng"}, Segment{" ", "x"}, Segment{"love", "eng"}, Segment{"你", "r"}, Segment{",", "x"}, Segment{"不以为耻", "i"}, Segment{",", "x"}, Segment{"反", "zg"}, Segment{"以为", "c"}, Segment{"rong", "eng"}},
225 | []Segment{Segment{"因", "p"}},
226 | []Segment{},
227 | []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
228 | []Segment{Segment{"很", "zg"}, Segment{"好", "a"}, Segment{"但", "c"}, Segment{"主要", "b"}, Segment{"是", "v"}, Segment{"基于", "p"}, Segment{"网页", "n"}, Segment{"形式", "n"}},
229 | []Segment{Segment{"hello", "eng"}, Segment{"你好", "l"}, Segment{"人们", "n"}, Segment{"审美", "vn"}, Segment{"的", "uj"}, Segment{"观点", "n"}, Segment{"是", "v"}, Segment{"不同", "a"}, Segment{"的", "uj"}},
230 | []Segment{Segment{"为什么", "r"}, Segment{"我", "r"}, Segment{"不能", "v"}, Segment{"拥有", "v"}, Segment{"想要", "v"}, Segment{"的", "uj"}, Segment{"生活", "vn"}},
231 | []Segment{Segment{"后来", "t"}, Segment{"我", "r"}, Segment{"才", "d"}},
232 | []Segment{Segment{"此次", "r"}, Segment{"来", "v"}, Segment{"中国", "ns"}, Segment{"是", "v"}, Segment{"为了", "p"}},
233 | []Segment{Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
234 | []Segment{Segment{",", "x"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
235 | []Segment{Segment{"其实", "d"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
236 | []Segment{Segment{"好人", "n"}, Segment{"使用", "v"}, Segment{"了", "ul"}, Segment{"它", "r"}, Segment{"就", "d"}, Segment{"可以", "c"}, Segment{"解决", "v"}, Segment{"一些", "m"}, Segment{"问题", "n"}},
237 | []Segment{Segment{"是因为", "c"}, Segment{"和", "c"}, Segment{"国家", "n"}},
238 | []Segment{Segment{"老年", "t"}, Segment{"搜索", "v"}, Segment{"还", "d"}, Segment{"支持", "v"}},
239 | []Segment{Segment{"干脆", "d"}, Segment{"就", "d"}, Segment{"把", "p"}, Segment{"那", "r"}, Segment{"部", "n"}, Segment{"蒙", "v"}, Segment{"人", "n"}, Segment{"的", "uj"}, Segment{"闲", "n"}, Segment{"法", "j"}, Segment{"给", "p"}, Segment{"废", "v"}, Segment{"了", "ul"}, Segment{"拉倒", "v"}, Segment{"!", "x"}, Segment{"RT", "eng"}, Segment{" ", "x"}, Segment{"@", "x"}, Segment{"laoshipukong", "eng"}, Segment{" ", "x"}, Segment{":", "x"}, Segment{" ", "x"}, Segment{"27", "eng"}, Segment{"日", "m"}, Segment{",", "x"}, Segment{"全国人大常委会", "nt"}, Segment{"第三次", "m"}, Segment{"审议", "v"}, Segment{"侵权", "v"}, Segment{"责任法", "n"}, Segment{"草案", "n"}, Segment{",", "x"}, Segment{"删除", "v"}, Segment{"了", "ul"}, Segment{"有关", "vn"}, Segment{"医疗", "n"}, Segment{"损害", "v"}, Segment{"责任", "n"}, Segment{"“", "x"}, Segment{"举证", "v"}, Segment{"倒置", "v"}, Segment{"”", "x"}, Segment{"的", "uj"}, Segment{"规定", "n"}, Segment{"。", "x"}, Segment{"在", "p"}, Segment{"医患", "n"}, Segment{"纠纷", "n"}, Segment{"中", "f"}, Segment{"本", "r"}, Segment{"已", "d"}, Segment{"处于", "v"}, Segment{"弱势", "n"}, Segment{"地位", "n"}, Segment{"的", "uj"}, Segment{"消费者", "n"}, Segment{"由此", "c"}, Segment{"将", "d"}, Segment{"陷入", "v"}, Segment{"万劫不复", "i"}, Segment{"的", "uj"}, Segment{"境地", "s"}, Segment{"。", "x"}, Segment{" ", "x"}},
240 | []Segment{Segment{"大", "a"}},
241 | []Segment{},
242 | []Segment{Segment{"他", "r"}, Segment{"说", "v"}, Segment{"的", "uj"}, Segment{"确实", "ad"}, Segment{"在", "p"}, Segment{"理", "n"}},
243 | []Segment{Segment{"长春", "ns"}, Segment{"市长", "n"}, Segment{"春节", "t"}, Segment{"讲话", "n"}},
244 | []Segment{Segment{"结婚", "v"}, Segment{"的", "uj"}, Segment{"和", "c"}, Segment{"尚未", "d"}, Segment{"结婚", "v"}, Segment{"的", "uj"}},
245 | []Segment{Segment{"结合", "v"}, Segment{"成", "n"}, Segment{"分子", "n"}, Segment{"时", "n"}},
246 | []Segment{Segment{"旅游", "vn"}, Segment{"和", "c"}, Segment{"服务", "vn"}, Segment{"是", "v"}, Segment{"最好", "a"}, Segment{"的", "uj"}},
247 | []Segment{Segment{"这件", "mq"}, Segment{"事情", "n"}, Segment{"的确", "d"}, Segment{"是", "v"}, Segment{"我", "r"}, Segment{"的", "uj"}, Segment{"错", "v"}},
248 | []Segment{Segment{"供", "v"}, Segment{"大家", "n"}, Segment{"参考", "v"}, Segment{"指正", "v"}},
249 | []Segment{Segment{"哈尔滨", "ns"}, Segment{"政府", "n"}, Segment{"公布", "v"}, Segment{"塌", "v"}, Segment{"桥", "n"}, Segment{"原因", "n"}},
250 | []Segment{Segment{"我", "r"}, Segment{"在", "p"}, Segment{"机场", "n"}, Segment{"入口处", "i"}},
251 | []Segment{Segment{"邢", "nr"}, Segment{"永", "ns"}, Segment{"臣", "n"}, Segment{"摄影", "n"}, Segment{"报道", "v"}},
252 | []Segment{Segment{"BP", "eng"}, Segment{"神经网络", "n"}, Segment{"如何", "r"}, Segment{"训练", "vn"}, Segment{"才能", "v"}, Segment{"在", "p"}, Segment{"分类", "n"}, Segment{"时", "n"}, Segment{"增加", "v"}, Segment{"区分度", "n"}, Segment{"?", "x"}},
253 | []Segment{Segment{"南京市", "ns"}, Segment{"长江大桥", "ns"}},
254 | []Segment{Segment{"应", "v"}, Segment{"一些", "m"}, Segment{"使用者", "n"}, Segment{"的", "uj"}, Segment{"建议", "n"}, Segment{",", "x"}, Segment{"也", "d"}, Segment{"为了", "p"}, Segment{"便于", "v"}, Segment{"利用", "n"}, Segment{"NiuTrans", "eng"}, Segment{"用于", "v"}, Segment{"SMT", "eng"}, Segment{"研究", "vn"}},
255 | []Segment{Segment{"长春市", "ns"}, Segment{"长春", "ns"}, Segment{"药店", "n"}},
256 | []Segment{Segment{"邓颖超", "nr"}, Segment{"生前", "t"}, Segment{"最", "d"}, Segment{"喜欢", "v"}, Segment{"的", "uj"}, Segment{"衣服", "n"}},
257 | []Segment{Segment{"胡锦涛", "nr"}, Segment{"是", "v"}, Segment{"热爱", "a"}, Segment{"世界", "n"}, Segment{"和平", "nz"}, Segment{"的", "uj"}, Segment{"政治局", "n"}, Segment{"常委", "j"}},
258 | []Segment{Segment{"程序员", "n"}, Segment{"祝", "v"}, Segment{"海林", "nz"}, Segment{"和", "c"}, Segment{"朱", "nr"}, Segment{"会", "v"}, Segment{"震", "v"}, Segment{"是", "v"}, Segment{"在", "p"}, Segment{"孙", "zg"}, Segment{"健", "a"}, Segment{"的", "uj"}, Segment{"左面", "f"}, Segment{"和", "c"}, Segment{"右面", "f"}, Segment{",", "x"}, Segment{" ", "x"}, Segment{"范", "nr"}, Segment{"凯", "nr"}, Segment{"在", "p"}, Segment{"最", "d"}, Segment{"右面", "f"}, Segment{".", "x"}, Segment{"再", "d"}, Segment{"往", "zg"}, Segment{"左", "m"}, Segment{"是", "v"}, Segment{"李", "nr"}, Segment{"松", "v"}, Segment{"洪", "nr"}},
259 | []Segment{Segment{"一次性", "d"}, Segment{"交", "v"}, Segment{"多少", "m"}, Segment{"钱", "n"}},
260 | []Segment{Segment{"两块", "m"}, Segment{"五", "m"}, Segment{"一套", "m"}, Segment{",", "x"}, Segment{"三块", "m"}, Segment{"八", "m"}, Segment{"一斤", "m"}, Segment{",", "x"}, Segment{"四块", "m"}, Segment{"七", "m"}, Segment{"一本", "m"}, Segment{",", "x"}, Segment{"五块", "m"}, Segment{"六", "m"}, Segment{"一条", "m"}},
261 | []Segment{Segment{"小", "a"}, Segment{"和尚", "nr"}, Segment{"留", "v"}, Segment{"了", "ul"}, Segment{"一个", "m"}, Segment{"像", "v"}, Segment{"大", "a"}, Segment{"和尚", "nr"}, Segment{"一样", "r"}, Segment{"的", "uj"}, Segment{"和尚头", "nr"}},
262 | []Segment{Segment{"我", "r"}, Segment{"是", "v"}, Segment{"中华人民共和国", "ns"}, Segment{"公民", "n"}, Segment{";", "x"}, Segment{"我", "r"}, Segment{"爸爸", "n"}, Segment{"是", "v"}, Segment{"共和党", "nt"}, Segment{"党员", "n"}, Segment{";", "x"}, Segment{" ", "x"}, Segment{"地铁", "n"}, Segment{"和平门", "ns"}, Segment{"站", "v"}},
263 | []Segment{Segment{"张晓梅", "nr"}, Segment{"去", "v"}, Segment{"人民", "n"}, Segment{"医院", "n"}, Segment{"做", "v"}, Segment{"了", "ul"}, Segment{"个", "q"}, Segment{"B超", "n"}, Segment{"然后", "c"}, Segment{"去", "v"}, Segment{"买", "v"}, Segment{"了", "ul"}, Segment{"件", "zg"}, Segment{"T恤", "n"}},
264 | []Segment{Segment{"AT&T", "nz"}, Segment{"是", "v"}, Segment{"一件", "m"}, Segment{"不错", "a"}, Segment{"的", "uj"}, Segment{"公司", "n"}, Segment{",", "x"}, Segment{"给", "p"}, Segment{"你", "r"}, Segment{"发", "v"}, Segment{"offer", "eng"}, Segment{"了", "ul"}, Segment{"吗", "y"}, Segment{"?", "x"}},
265 | []Segment{Segment{"C++", "nz"}, Segment{"和", "c"}, Segment{"c#", "nz"}, Segment{"是", "v"}, Segment{"什么", "r"}, Segment{"关系", "n"}, Segment{"?", "x"}, Segment{"11", "eng"}, Segment{"+", "x"}, Segment{"122", "eng"}, Segment{"=", "x"}, Segment{"133", "eng"}, Segment{",", "x"}, Segment{"是", "v"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"PI", "eng"}, Segment{"=", "x"}, Segment{"3", "eng"}, Segment{".", "x"}, Segment{"14159", "eng"}},
266 | []Segment{Segment{"你", "r"}, Segment{"认识", "v"}, Segment{"那个", "r"}, Segment{"和", "c"}, Segment{"主席", "n"}, Segment{"握手", "v"}, Segment{"的", "uj"}, Segment{"的哥", "n"}, Segment{"吗", "y"}, Segment{"?", "x"}, Segment{"他", "r"}, Segment{"开", "v"}, Segment{"一辆", "m"}, Segment{"黑色", "n"}, Segment{"的士", "n"}, Segment{"。", "x"}},
267 | []Segment{Segment{"枪杆子", "n"}, Segment{"中", "f"}, Segment{"出", "v"}, Segment{"政权", "n"}},
268 | }
269 | )
270 |
271 | func init() {
272 | seg.LoadDictionary("../dict.txt")
273 | }
274 |
275 | func chanToArray(ch <-chan Segment) []Segment {
276 | var result []Segment
277 | for word := range ch {
278 | result = append(result, word)
279 | }
280 | return result
281 | }
282 |
283 | func TestCut(t *testing.T) {
284 | for index, content := range testContents {
285 | result := chanToArray(seg.Cut(content, true))
286 | if len(defaultCutResult[index]) != len(result) {
287 | t.Errorf("default cut for %s length should be %d not %d\n",
288 | content, len(defaultCutResult[index]), len(result))
289 | t.Errorf("expect: %v\n", defaultCutResult[index])
290 | t.Fatalf("got: %v\n", result)
291 | }
292 | for i := range result {
293 | if result[i] != defaultCutResult[index][i] {
294 | t.Fatalf("expect %s, got %s", defaultCutResult[index][i], result[i])
295 | }
296 | }
297 | result = chanToArray(seg.Cut(content, false))
298 | if len(noHMMCutResult[index]) != len(result) {
299 | 			t.Fatalf("no-HMM cut of %q: expected %d segments, got %d", content, len(noHMMCutResult[index]), len(result))
300 | }
301 | for i := range result {
302 | if result[i] != noHMMCutResult[index][i] {
303 | 				t.Fatalf("no-HMM cut of %q: expected %v, got %v", content, noHMMCutResult[index][i], result[i])
304 | }
305 | }
306 |
307 | }
308 | }
309 |
310 | // https://github.com/fxsjy/jieba/issues/132
311 | func TestBug132(t *testing.T) {
312 | sentence := "又跛又啞"
313 | cutResult := []Segment{
314 | Segment{"又", "d"},
315 | Segment{"跛", "a"},
316 | Segment{"又", "d"},
317 | Segment{"啞", "v"},
318 | }
319 | result := chanToArray(seg.Cut(sentence, true))
320 | if len(cutResult) != len(result) {
321 | t.Fatal(result)
322 | }
323 | for i := range result {
324 | if result[i] != cutResult[i] {
325 | t.Fatal(result[i])
326 | }
327 | }
328 | }
329 |
330 | // https://github.com/fxsjy/jieba/issues/137
331 | func TestBug137(t *testing.T) {
332 | sentence := "前港督衛奕信在八八年十月宣布成立中央政策研究組"
333 | cutResult := []Segment{
334 | Segment{"前", "f"},
335 | Segment{"港督", "n"},
336 | Segment{"衛奕", "z"},
337 | Segment{"信", "n"},
338 | Segment{"在", "p"},
339 | Segment{"八八年", "m"},
340 | Segment{"十月", "t"},
341 | Segment{"宣布", "v"},
342 | Segment{"成立", "v"},
343 | Segment{"中央", "n"},
344 | Segment{"政策", "n"},
345 | Segment{"研究", "vn"},
346 | Segment{"組", "x"},
347 | }
348 | result := chanToArray(seg.Cut(sentence, true))
349 | if len(cutResult) != len(result) {
350 | t.Fatal(result)
351 | }
352 | for i := range result {
353 | if result[i] != cutResult[i] {
354 | t.Fatal(result[i])
355 | }
356 | }
357 | }
358 |
359 | func TestUserDict(t *testing.T) {
360 | seg.LoadUserDictionary("../userdict.txt")
361 | defer seg.LoadDictionary("../dict.txt")
362 | sentence := "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
363 |
364 | cutResult := []Segment{
365 | Segment{"李小福", "nr"},
366 | Segment{"是", "v"},
367 | Segment{"创新办", "i"},
368 | Segment{"主任", "b"},
369 | Segment{"也", "d"},
370 | Segment{"是", "v"},
371 | Segment{"云计算", "x"},
372 | Segment{"方面", "n"},
373 | Segment{"的", "uj"},
374 | Segment{"专家", "n"},
375 | Segment{";", "x"},
376 | Segment{" ", "x"},
377 | Segment{"什么", "r"},
378 | Segment{"是", "v"},
379 | Segment{"八一双鹿", "nz"},
380 | Segment{"例如", "v"},
381 | Segment{"我", "r"},
382 | Segment{"输入", "v"},
383 | Segment{"一个", "m"},
384 | Segment{"带", "v"},
385 | Segment{"“", "x"},
386 | Segment{"韩玉赏鉴", "nz"},
387 | Segment{"”", "x"},
388 | Segment{"的", "uj"},
389 | Segment{"标题", "n"},
390 | Segment{",", "x"},
391 | Segment{"在", "p"},
392 | Segment{"自定义词", "n"},
393 | Segment{"库中", "nrt"},
394 | Segment{"也", "d"},
395 | Segment{"增加", "v"},
396 | Segment{"了", "ul"},
397 | Segment{"此", "r"},
398 | Segment{"词", "n"},
399 | Segment{"为", "p"},
400 | Segment{"N", "eng"},
401 | Segment{"类型", "n"}}
402 |
403 | result := chanToArray(seg.Cut(sentence, true))
404 | if len(cutResult) != len(result) {
405 | t.Fatal(result)
406 | }
407 | for i := range result {
408 | if result[i] != cutResult[i] {
409 | t.Fatal(result[i])
410 | }
411 | }
412 | }
413 |
414 | func BenchmarkCutNoHMM(b *testing.B) {
415 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
416 | b.ResetTimer()
417 | for i := 0; i < b.N; i++ {
418 | chanToArray(seg.Cut(sentence, false))
419 | }
420 | }
421 |
422 | func BenchmarkCut(b *testing.B) {
423 | sentence := "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
424 | b.ResetTimer()
425 | for i := 0; i < b.N; i++ {
426 | chanToArray(seg.Cut(sentence, true))
427 | }
428 | }
429 |
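430 | // The tests and benchmarks above can be run from the repository root with a
431 | // standard Go toolchain, for example:
432 | //
433 | //	go test -run TestCut ./posseg
434 | //	go test -bench . ./posseg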
--------------------------------------------------------------------------------
/posseg/prob_start.go:
--------------------------------------------------------------------------------
1 | package posseg
2 |
3 | var probStart = map[uint16]float64{ // log start-probabilities; -3.14e+100 stands in for log(0)
4 | 100: -4.762305214596967,
5 | 101: -6.680066036784177,
6 | 102: -3.14e+100,
7 | 103: -8.697083223018778,
8 | 104: -5.018374362109218,
9 | 105: -3.14e+100,
10 | 106: -3.423880184954888,
11 | 107: -3.9750475297585357,
12 | 108: -8.888974230828882,
13 | 109: -3.14e+100,
14 | 110: -8.563551830394255,
15 | 111: -3.14e+100,
16 | 112: -5.491630418482717,
17 | 113: -3.14e+100,
18 | 114: -13.533365129970255,
19 | 115: -6.1157847275557105,
20 | 116: -3.14e+100,
21 | 117: -5.0576191284681915,
22 | 118: -3.14e+100,
23 | 119: -3.14e+100,
24 | 120: -4.905883584659895,
25 | 121: -3.14e+100,
26 | 122: -3.6524299819046386,
27 | 123: -3.14e+100,
28 | 124: -6.78695300139688,
29 | 125: -1.6966257797548328,
30 | 126: -3.14e+100,
31 | 127: -2.2310495913769506,
32 | 128: -5.873722175405573,
33 | 129: -4.985642733519195,
34 | 130: -2.8228438314969213,
35 | 131: -4.846091668182416,
36 | 132: -3.94698846057672,
37 | 133: -8.433498702146057,
38 | 134: -4.200984132085048,
39 | 135: -6.998123858956596,
40 | 136: -3.14e+100,
41 | 137: -3.14e+100,
42 | 138: -3.4098187790818413,
43 | 139: -3.14e+100,
44 | 140: -12.434752841302146,
45 | 141: -7.946116471570005,
46 | 142: -5.522673590839954,
47 | 143: -3.3647479094528574,
48 | 144: -3.14e+100,
49 | 145: -9.163917277503234,
50 | 146: -3.14e+100,
51 | 147: -3.14e+100,
52 | 148: -3.14e+100,
53 | 149: -3.14e+100,
54 | 150: -3.14e+100,
55 | 151: -3.14e+100,
56 | 152: -2.6740584874265685,
57 | 153: -9.044728760238115,
58 | 154: -3.14e+100,
59 | 155: -12.434752841302146,
60 | 156: -4.3315610890163585,
61 | 157: -12.147070768850364,
62 | 158: -3.14e+100,
63 | 159: -3.14e+100,
64 | 160: -9.844485675856319,
65 | 161: -3.14e+100,
66 | 162: -7.045681111485645,
67 | 163: -3.14e+100,
68 | 200: -3.14e+100,
69 | 201: -3.14e+100,
70 | 202: -3.14e+100,
71 | 203: -3.14e+100,
72 | 204: -3.14e+100,
73 | 205: -3.14e+100,
74 | 206: -3.14e+100,
75 | 207: -3.14e+100,
76 | 208: -3.14e+100,
77 | 209: -3.14e+100,
78 | 210: -3.14e+100,
79 | 211: -3.14e+100,
80 | 212: -3.14e+100,
81 | 213: -3.14e+100,
82 | 214: -3.14e+100,
83 | 215: -3.14e+100,
84 | 216: -3.14e+100,
85 | 217: -3.14e+100,
86 | 218: -3.14e+100,
87 | 219: -3.14e+100,
88 | 220: -3.14e+100,
89 | 221: -3.14e+100,
90 | 222: -3.14e+100,
91 | 223: -3.14e+100,
92 | 224: -3.14e+100,
93 | 225: -3.14e+100,
94 | 226: -3.14e+100,
95 | 227: -3.14e+100,
96 | 228: -3.14e+100,
97 | 229: -3.14e+100,
98 | 230: -3.14e+100,
99 | 231: -3.14e+100,
100 | 232: -3.14e+100,
101 | 233: -3.14e+100,
102 | 234: -3.14e+100,
103 | 235: -3.14e+100,
104 | 236: -3.14e+100,
105 | 237: -3.14e+100,
106 | 238: -3.14e+100,
107 | 239: -3.14e+100,
108 | 240: -3.14e+100,
109 | 241: -3.14e+100,
110 | 242: -3.14e+100,
111 | 243: -3.14e+100,
112 | 244: -3.14e+100,
113 | 245: -3.14e+100,
114 | 246: -3.14e+100,
115 | 247: -3.14e+100,
116 | 248: -3.14e+100,
117 | 249: -3.14e+100,
118 | 250: -3.14e+100,
119 | 251: -3.14e+100,
120 | 252: -3.14e+100,
121 | 253: -3.14e+100,
122 | 254: -3.14e+100,
123 | 255: -3.14e+100,
124 | 256: -3.14e+100,
125 | 257: -3.14e+100,
126 | 258: -3.14e+100,
127 | 259: -3.14e+100,
128 | 260: -3.14e+100,
129 | 261: -3.14e+100,
130 | 262: -3.14e+100,
131 | 263: -3.14e+100,
132 | 300: -3.14e+100,
133 | 301: -3.14e+100,
134 | 302: -3.14e+100,
135 | 303: -3.14e+100,
136 | 304: -3.14e+100,
137 | 305: -3.14e+100,
138 | 306: -3.14e+100,
139 | 307: -3.14e+100,
140 | 308: -3.14e+100,
141 | 309: -3.14e+100,
142 | 310: -3.14e+100,
143 | 311: -3.14e+100,
144 | 312: -3.14e+100,
145 | 313: -3.14e+100,
146 | 314: -3.14e+100,
147 | 315: -3.14e+100,
148 | 316: -3.14e+100,
149 | 317: -3.14e+100,
150 | 318: -3.14e+100,
151 | 319: -3.14e+100,
152 | 320: -3.14e+100,
153 | 321: -3.14e+100,
154 | 322: -3.14e+100,
155 | 323: -3.14e+100,
156 | 324: -3.14e+100,
157 | 325: -3.14e+100,
158 | 326: -3.14e+100,
159 | 327: -3.14e+100,
160 | 328: -3.14e+100,
161 | 329: -3.14e+100,
162 | 330: -3.14e+100,
163 | 331: -3.14e+100,
164 | 332: -3.14e+100,
165 | 333: -3.14e+100,
166 | 334: -3.14e+100,
167 | 335: -3.14e+100,
168 | 336: -3.14e+100,
169 | 337: -3.14e+100,
170 | 338: -3.14e+100,
171 | 339: -3.14e+100,
172 | 340: -3.14e+100,
173 | 341: -3.14e+100,
174 | 342: -3.14e+100,
175 | 343: -3.14e+100,
176 | 344: -3.14e+100,
177 | 345: -3.14e+100,
178 | 346: -3.14e+100,
179 | 347: -3.14e+100,
180 | 348: -3.14e+100,
181 | 349: -3.14e+100,
182 | 350: -3.14e+100,
183 | 351: -3.14e+100,
184 | 352: -3.14e+100,
185 | 353: -3.14e+100,
186 | 354: -3.14e+100,
187 | 355: -3.14e+100,
188 | 356: -3.14e+100,
189 | 357: -3.14e+100,
190 | 358: -3.14e+100,
191 | 359: -3.14e+100,
192 | 360: -3.14e+100,
193 | 361: -3.14e+100,
194 | 362: -3.14e+100,
195 | 363: -3.14e+100,
196 | 400: -3.9025396831295227,
197 | 401: -11.048458480182255,
198 | 402: -6.954113917960154,
199 | 403: -12.84021794941031,
200 | 404: -6.472888763970454,
201 | 405: -3.14e+100,
202 | 406: -4.786966795861212,
203 | 407: -3.903919764181873,
204 | 408: -3.14e+100,
205 | 409: -8.948397651299683,
206 | 410: -5.942513006281674,
207 | 411: -3.14e+100,
208 | 412: -5.194820249981676,
209 | 413: -6.507826815331734,
210 | 414: -8.650563207383884,
211 | 415: -3.14e+100,
212 | 416: -3.14e+100,
213 | 417: -4.911992119644354,
214 | 418: -3.14e+100,
215 | 419: -6.940320595827818,
216 | 420: -3.14e+100,
217 | 421: -3.14e+100,
218 | 422: -3.269200652116097,
219 | 423: -10.825314928868044,
220 | 424: -3.14e+100,
221 | 425: -3.8551483897645107,
222 | 426: -4.913434861102905,
223 | 427: -4.483663103956885,
224 | 428: -3.14e+100,
225 | 429: -3.14e+100,
226 | 430: -3.14e+100,
227 | 431: -12.147070768850364,
228 | 432: -3.14e+100,
229 | 433: -8.464460927750023,
230 | 434: -2.9868401813596317,
231 | 435: -4.888658618255058,
232 | 436: -3.14e+100,
233 | 437: -3.14e+100,
234 | 438: -2.7635336784127853,
235 | 439: -10.275268591948773,
236 | 440: -3.14e+100,
237 | 441: -3.14e+100,
238 | 442: -3.14e+100,
239 | 443: -3.14e+100,
240 | 444: -6.272842531880403,
241 | 445: -6.940320595827818,
242 | 446: -7.728230161053767,
243 | 447: -7.5394037026636855,
244 | 448: -6.85251045118004,
245 | 449: -8.4153713175535,
246 | 450: -8.15808672228609,
247 | 451: -9.299258625372996,
248 | 452: -3.053292303412302,
249 | 453: -3.14e+100,
250 | 454: -5.9430181843676895,
251 | 455: -3.14e+100,
252 | 456: -11.453923588290419,
253 | 457: -3.14e+100,
254 | 458: -3.14e+100,
255 | 459: -8.427419656069674,
256 | 460: -6.1970794699489575,
257 | 461: -13.533365129970255,
258 | 462: -3.14e+100,
259 | 463: -3.14e+100,
260 | }
261 |
--------------------------------------------------------------------------------
/posseg/viterbi.go:
--------------------------------------------------------------------------------
1 | package posseg
2 |
3 | import (
4 | "fmt"
5 | "sort"
6 | )
7 |
8 | type probState struct {
9 | prob float64
10 | state uint16
11 | }
12 |
13 | func (ps probState) String() string {
14 | return fmt.Sprintf("(%v: %f)", ps.state, ps.prob)
15 | }
16 |
17 | type probStates []probState
18 |
19 | func (pss probStates) Len() int {
20 | return len(pss)
21 | }
22 |
23 | func (pss probStates) Less(i, j int) bool {
24 | if pss[i].prob == pss[j].prob {
25 | return pss[i].state < pss[j].state
26 | }
27 | return pss[i].prob < pss[j].prob
28 | }
29 |
30 | func (pss probStates) Swap(i, j int) {
31 | pss[i], pss[j] = pss[j], pss[i]
32 | }
33 |
34 | func viterbi(obs []rune) []tag { // most probable (position, POS) tag per rune, via dynamic programming
35 | obsLength := len(obs)
36 | V := make([]map[uint16]float64, obsLength)
37 | V[0] = make(map[uint16]float64)
38 | memPath := make([]map[uint16]uint16, obsLength)
39 | memPath[0] = make(map[uint16]uint16)
40 | ys := charStateTab.get(obs[0]) // default is all_states
41 | for _, y := range ys {
42 | V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
43 | memPath[0][y] = 0
44 | }
45 | for t := 1; t < obsLength; t++ {
46 | var prevStates []uint16
47 | for x := range memPath[t-1] {
48 | if len(probTrans[x]) > 0 {
49 | prevStates = append(prevStates, x)
50 | }
51 | }
52 | 		// use Go's map to emulate Python's set()
53 | prevStatesExpectNext := make(map[uint16]int)
54 | for _, x := range prevStates {
55 | for y := range probTrans[x] {
56 | prevStatesExpectNext[y] = 1
57 | }
58 | }
59 | tmpObsStates := charStateTab.get(obs[t])
60 |
61 | var obsStates []uint16
62 | for index := range tmpObsStates {
63 | if _, ok := prevStatesExpectNext[tmpObsStates[index]]; ok {
64 | obsStates = append(obsStates, tmpObsStates[index])
65 | }
66 | }
67 | if len(obsStates) == 0 {
68 | for key := range prevStatesExpectNext {
69 | obsStates = append(obsStates, key)
70 | }
71 | }
72 | if len(obsStates) == 0 {
73 | obsStates = probTransKeys
74 | }
75 | memPath[t] = make(map[uint16]uint16)
76 | V[t] = make(map[uint16]float64)
77 | for _, y := range obsStates {
78 | var max, ps probState
79 | for i, y0 := range prevStates {
80 | ps = probState{
81 | prob: V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t]),
82 | state: y0}
83 | if i == 0 || ps.prob > max.prob || (ps.prob == max.prob && ps.state > max.state) {
84 | max = ps
85 | }
86 | }
87 | V[t][y] = max.prob
88 | memPath[t][y] = max.state
89 | }
90 | }
91 | last := make(probStates, 0)
92 | length := len(memPath)
93 | vlength := len(V)
94 | for y := range memPath[length-1] {
95 | ps := probState{prob: V[vlength-1][y], state: y}
96 | last = append(last, ps)
97 | }
98 | sort.Sort(sort.Reverse(last))
99 | state := last[0].state
100 | route := make([]tag, len(obs))
101 |
102 | for i := obsLength - 1; i >= 0; i-- {
103 | route[i] = tag(state)
104 | state = memPath[i][state]
105 | }
106 | return route
107 | }
108 |
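109 | // Schematically, viterbi implements the standard Viterbi recurrence
110 | // (pseudocode mirroring the loops above):
111 | //
112 | //	V[0][y] = probEmit[y].get(obs[0]) + probStart[y]
113 | //	V[t][y] = max over y0 of V[t-1][y0] + probTrans[y0].Get(y) + probEmit[y].get(obs[t])
114 | //	route   = backtrack through memPath from the best state in V[len(obs)-1]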
--------------------------------------------------------------------------------
/posseg/viterbi_test.go:
--------------------------------------------------------------------------------
1 | package posseg
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | var defaultRoute []tag
8 |
9 | func init() {
10 | var t tag
11 | t, _ = newTag("B", "nr")
12 | defaultRoute = append(defaultRoute, t)
13 | t, _ = newTag("M", "nr")
14 | defaultRoute = append(defaultRoute, t)
15 | t, _ = newTag("E", "nr")
16 | defaultRoute = append(defaultRoute, t)
17 | t, _ = newTag("S", "v")
18 | defaultRoute = append(defaultRoute, t)
19 | t, _ = newTag("B", "v")
20 | defaultRoute = append(defaultRoute, t)
21 | t, _ = newTag("E", "v")
22 | defaultRoute = append(defaultRoute, t)
23 | t, _ = newTag("B", "n")
24 | defaultRoute = append(defaultRoute, t)
25 | t, _ = newTag("M", "n")
26 | defaultRoute = append(defaultRoute, t)
27 | t, _ = newTag("E", "n")
28 | defaultRoute = append(defaultRoute, t)
29 | t, _ = newTag("S", "d")
30 | defaultRoute = append(defaultRoute, t)
31 | t, _ = newTag("S", "v")
32 | defaultRoute = append(defaultRoute, t)
33 | t, _ = newTag("S", "n")
34 | defaultRoute = append(defaultRoute, t)
35 | t, _ = newTag("B", "v")
36 | defaultRoute = append(defaultRoute, t)
37 | t, _ = newTag("E", "v")
38 | defaultRoute = append(defaultRoute, t)
39 | t, _ = newTag("B", "nr")
40 | defaultRoute = append(defaultRoute, t)
41 | t, _ = newTag("M", "nr")
42 | defaultRoute = append(defaultRoute, t)
43 | t, _ = newTag("M", "nr")
44 | defaultRoute = append(defaultRoute, t)
45 | t, _ = newTag("M", "nr")
46 | defaultRoute = append(defaultRoute, t)
47 | t, _ = newTag("E", "nr")
48 | defaultRoute = append(defaultRoute, t)
49 | t, _ = newTag("S", "zg")
50 | defaultRoute = append(defaultRoute, t)
51 | }
52 |
53 | func TestViterbi(t *testing.T) {
54 | ss := "李小福是创新办主任也是云计算方面的专家;"
55 | route := viterbi([]rune(ss))
56 | if len(route) != len(defaultRoute) {
57 | t.Fatal(len(route))
58 | }
59 | for index := range route {
60 | if route[index] != defaultRoute[index] {
61 | t.Fatal(route[index])
62 | }
63 | }
64 | }
65 |
66 | func BenchmarkViterbi(b *testing.B) {
67 | ss := "李小福是创新办主任也是云计算方面的专家;"
68 | for i := 0; i < b.N; i++ {
69 | viterbi([]rune(ss))
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/tokenizers/example_bleve_test.go:
--------------------------------------------------------------------------------
1 | package tokenizers_test
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | "os"
7 |
8 | "github.com/blevesearch/bleve"
9 | _ "github.com/wangbin/jiebago/tokenizers"
10 | )
11 |
12 | func Example_bleveSearch() {
13 | // open a new index
14 | indexMapping := bleve.NewIndexMapping()
15 |
16 | err := indexMapping.AddCustomTokenizer("jieba",
17 | map[string]interface{}{
18 | "file": "../dict.txt",
19 | "type": "jieba",
20 | })
21 | if err != nil {
22 | log.Fatal(err)
23 | }
24 |
25 | // create a custom analyzer
26 | err = indexMapping.AddCustomAnalyzer("jieba",
27 | map[string]interface{}{
28 | "type": "custom",
29 | "tokenizer": "jieba",
30 | "token_filters": []string{
31 | "possessive_en",
32 | "to_lower",
33 | "stop_en",
34 | },
35 | })
36 |
37 | if err != nil {
38 | log.Fatal(err)
39 | }
40 |
41 | indexMapping.DefaultAnalyzer = "jieba"
42 | 	cacheDir := "jieba.bleve"
43 | os.RemoveAll(cacheDir)
44 | index, err := bleve.New(cacheDir, indexMapping)
45 |
46 | if err != nil {
47 | log.Fatal(err)
48 | }
49 |
50 | docs := []struct {
51 | Title string
52 | Name string
53 | }{
54 | {
55 | Title: "Doc 1",
56 | Name: "This is the first document we’ve added",
57 | },
58 | {
59 | Title: "Doc 2",
60 | Name: "The second one 你 中文测试中文 is even more interesting! 吃水果",
61 | },
62 | {
63 | Title: "Doc 3",
64 | Name: "买水果然后来世博园。",
65 | },
66 | {
67 | Title: "Doc 4",
68 | Name: "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
69 | },
70 | {
71 | Title: "Doc 5",
72 | Name: "咱俩交换一下吧。",
73 | },
74 | }
75 | // index docs
76 | for _, doc := range docs {
77 | index.Index(doc.Title, doc)
78 | }
79 |
80 | // search for some text
81 | for _, keyword := range []string{"水果世博园", "你", "first", "中文", "交换机", "交换"} {
82 | query := bleve.NewQueryStringQuery(keyword)
83 | search := bleve.NewSearchRequest(query)
84 | search.Highlight = bleve.NewHighlight()
85 | searchResults, err := index.Search(search)
86 | if err != nil {
87 | log.Fatal(err)
88 | }
89 | fmt.Printf("Result of \"%s\": %d matches:\n", keyword, searchResults.Total)
90 | for i, hit := range searchResults.Hits {
91 | rv := fmt.Sprintf("%d. %s, (%f)\n", i+searchResults.Request.From+1, hit.ID, hit.Score)
92 | for fragmentField, fragments := range hit.Fragments {
93 | rv += fmt.Sprintf("%s: ", fragmentField)
94 | for _, fragment := range fragments {
95 | rv += fmt.Sprintf("%s", fragment)
96 | }
97 | }
98 | fmt.Printf("%s\n", rv)
99 | }
100 | }
101 | // Output:
102 | // Result of "水果世博园": 2 matches:
103 | // 1. Doc 3, (1.099550)
104 | // Name: 买水果然后来世博园。
105 | // 2. Doc 2, (0.031941)
106 | // Name: The second one 你 中文测试中文 is even more interesting! 吃水果
107 | // Result of "你": 1 matches:
108 | // 1. Doc 2, (0.391161)
109 | // Name: The second one 你 中文测试中文 is even more interesting! 吃水果
110 | // Result of "first": 1 matches:
111 | // 1. Doc 1, (0.512150)
112 | // Name: This is the first document we’ve added
113 | // Result of "中文": 1 matches:
114 | // 1. Doc 2, (0.553186)
115 | // Name: The second one 你 中文测试中文 is even more interesting! 吃水果
116 | // Result of "交换机": 2 matches:
117 | // 1. Doc 4, (0.608495)
118 | // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
119 | // 2. Doc 5, (0.086700)
120 | // Name: 咱俩交换一下吧。
121 | // Result of "交换": 2 matches:
122 | // 1. Doc 5, (0.534158)
123 | // Name: 咱俩交换一下吧。
124 | // 2. Doc 4, (0.296297)
125 | // Name: 工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
126 | }
127 |
--------------------------------------------------------------------------------
/tokenizers/example_test.go:
--------------------------------------------------------------------------------
1 | package tokenizers_test
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/wangbin/jiebago/tokenizers"
7 | )
8 |
9 | func Example() {
10 | sentence := []byte("永和服装饰品有限公司")
11 |
12 | // default mode
13 | tokenizer, _ := tokenizers.NewJiebaTokenizer("../dict.txt", true, false)
14 | fmt.Println("Default Mode:")
15 | for _, token := range tokenizer.Tokenize(sentence) {
16 | fmt.Printf(
17 | "Term: %s Start: %d End: %d Position: %d Type: %d\n",
18 | token.Term, token.Start, token.End, token.Position, token.Type)
19 | }
20 |
21 | 	// search mode
22 | tokenizer, _ = tokenizers.NewJiebaTokenizer("../dict.txt", true, true)
23 | fmt.Println("Search Mode:")
24 | for _, token := range tokenizer.Tokenize(sentence) {
25 | fmt.Printf(
26 | "Term: %s Start: %d End: %d Position: %d Type: %d\n",
27 | token.Term, token.Start, token.End, token.Position, token.Type)
28 | }
29 | // Output:
30 | // Default Mode:
31 | // Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
32 | // Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
33 | // Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
34 | // Term: 有限公司 Start: 18 End: 30 Position: 4 Type: 1
35 | // Search Mode:
36 | // Term: 永和 Start: 0 End: 6 Position: 1 Type: 1
37 | // Term: 服装 Start: 6 End: 12 Position: 2 Type: 1
38 | // Term: 饰品 Start: 12 End: 18 Position: 3 Type: 1
39 | // Term: 有限 Start: 18 End: 24 Position: 4 Type: 1
40 | // Term: 公司 Start: 24 End: 30 Position: 5 Type: 1
41 | // Term: 有限公司 Start: 18 End: 30 Position: 6 Type: 1
42 | }
43 |
--------------------------------------------------------------------------------
/tokenizers/tokenizer.go:
--------------------------------------------------------------------------------
1 | package tokenizers
2 |
3 | import (
4 | "fmt"
5 | "regexp"
6 | "strconv"
7 |
8 | "github.com/blevesearch/bleve/analysis"
9 | "github.com/blevesearch/bleve/registry"
10 | "github.com/wangbin/jiebago"
11 | )
12 |
13 | // Name is the jieba tokenizer name.
14 | const Name = "jieba"
15 |
16 | var ideographRegexp = regexp.MustCompile(`\p{Han}+`)
17 |
18 | // JiebaTokenizer is the bleve tokenizer for jiebago.
19 | type JiebaTokenizer struct {
20 | seg jiebago.Segmenter
21 | hmm, searchMode bool
22 | }
23 |
24 | /*
25 | NewJiebaTokenizer creates a new JiebaTokenizer.
26 |
27 | Parameters:
28 |
29 | dictFilePath: path of the dictionary file.
30 | 
31 | hmm: whether to use the Hidden Markov Model to cut unknown words, i.e.
32 | words not found in the dictionary. For example, the word "安卓" ("Android"
33 | in English) is not in the dictionary file. If hmm is set to false, it is
34 | cut into two single-character words, "安" and "卓"; if hmm is set to true,
35 | it is treated as one word, because jieba uses a Hidden Markov Model with
36 | the Viterbi algorithm to guess the most likely segmentation.
37 | 
38 | searchMode: whether to further cut long words into several short words.
39 | In Chinese, a long word may contain other words; for example, "交换机" is
40 | the Chinese word for "switch" (the network device). If searchMode is
41 | false, "交换机" is treated as a single word; if searchMode is true, it is
42 | further split into "交换" and "换机", which are also valid Chinese words.
43 | */
44 | func NewJiebaTokenizer(dictFilePath string, hmm, searchMode bool) (analysis.Tokenizer, error) {
45 | var seg jiebago.Segmenter
46 | err := seg.LoadDictionary(dictFilePath)
47 | return &JiebaTokenizer{
48 | seg: seg,
49 | hmm: hmm,
50 | searchMode: searchMode,
51 | }, err
52 | }
53 |
54 | // Tokenize cuts input into bleve token stream.
55 | func (jt *JiebaTokenizer) Tokenize(input []byte) analysis.TokenStream {
56 | rv := make(analysis.TokenStream, 0)
57 | runeStart := 0
58 | start := 0
59 | end := 0
60 | pos := 1
61 | var width int
62 | var gram string
63 | for word := range jt.seg.Cut(string(input), jt.hmm) {
64 | if jt.searchMode {
65 | runes := []rune(word)
66 | width = len(runes)
67 | 			for _, step := range [2]int{2, 3} { // collect dictionary 2-grams and 3-grams inside the word
68 | if width <= step {
69 | continue
70 | }
71 | for i := 0; i < width-step+1; i++ {
72 | gram = string(runes[i : i+step])
73 | gramLen := len(gram)
74 | if frequency, ok := jt.seg.Frequency(gram); ok && frequency > 0 {
75 | gramStart := start + len(string(runes[:i]))
76 | token := analysis.Token{
77 | Term: []byte(gram),
78 | Start: gramStart,
79 | End: gramStart + gramLen,
80 | Position: pos,
81 | Type: detectTokenType(gram),
82 | }
83 | rv = append(rv, &token)
84 | pos++
85 | }
86 | }
87 | }
88 | }
89 | end = start + len(word)
90 | token := analysis.Token{
91 | Term: []byte(word),
92 | Start: start,
93 | End: end,
94 | Position: pos,
95 | Type: detectTokenType(word),
96 | }
97 | rv = append(rv, &token)
98 | pos++
99 | runeStart += width
100 | start = end
101 | }
102 | return rv
103 | }
104 |
105 | /*
106 | JiebaTokenizerConstructor creates a JiebaTokenizer.
107 |
108 | Parameter config should contain at least one parameter:
109 |
110 | file: the path of the dictionary file.
111 |
112 | hmm: optional, specify whether to use Hidden Markov Model, see NewJiebaTokenizer for details.
113 |
114 | search: optional, specify whether to use search mode, see NewJiebaTokenizer for details.
115 | */
116 | func JiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (
117 | analysis.Tokenizer, error) {
118 | dictFilePath, ok := config["file"].(string)
119 | if !ok {
120 | return nil, fmt.Errorf("must specify dictionary file path")
121 | }
122 | hmm, ok := config["hmm"].(bool)
123 | if !ok {
124 | hmm = true
125 | }
126 | searchMode, ok := config["search"].(bool)
127 | if !ok {
128 | searchMode = true
129 | }
130 |
131 | return NewJiebaTokenizer(dictFilePath, hmm, searchMode)
132 | }
133 |
134 | func detectTokenType(term string) analysis.TokenType {
135 | if ideographRegexp.MatchString(term) {
136 | return analysis.Ideographic
137 | }
138 | _, err := strconv.ParseFloat(term, 64)
139 | if err == nil {
140 | return analysis.Numeric
141 | }
142 | return analysis.AlphaNumeric
143 | }
144 |
145 | func init() {
146 | registry.RegisterTokenizer(Name, JiebaTokenizerConstructor)
147 | }
148 |
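149 | // A minimal configuration sketch for JiebaTokenizerConstructor (illustrative
150 | // only; the cache argument is not used by this constructor, so nil works for
151 | // direct calls):
152 | //
153 | //	tokenizer, err := JiebaTokenizerConstructor(map[string]interface{}{
154 | //		"file":   "../dict.txt",
155 | //		"hmm":    true,
156 | //		"search": false,
157 | //	}, nil)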
--------------------------------------------------------------------------------
/userdict.txt:
--------------------------------------------------------------------------------
1 | 云计算 5
2 | 李小福 2 nr
3 | 创新办 3 i
4 | easy_install 3 eng
5 | 好用 300
6 | 韩玉赏鉴 3 nz
7 | 八一双鹿 3 nz
8 |
--------------------------------------------------------------------------------
/util/util.go:
--------------------------------------------------------------------------------
1 | // Package util contains utility functions used by jiebago.
2 | package util
3 |
4 | import "regexp"
5 |
6 | /*
7 | RegexpSplit slices s into substrings separated by matches of the expression
8 | and returns a slice of the substrings between those matches.
9 | If capturing parentheses are used in the expression, then the text of all
10 | groups in the expression is also returned as part of the resulting slice.
11 | 
12 | This function behaves consistently with Python's re.split function.
13 | */
14 | func RegexpSplit(re *regexp.Regexp, s string, n int) []string {
15 | if n == 0 {
16 | return nil
17 | }
18 |
19 | if len(re.String()) > 0 && len(s) == 0 {
20 | return []string{""}
21 | }
22 |
23 | var matches [][]int
24 | if len(re.SubexpNames()) > 1 {
25 | matches = re.FindAllStringSubmatchIndex(s, n)
26 | } else {
27 | matches = re.FindAllStringIndex(s, n)
28 | }
29 | strings := make([]string, 0, len(matches))
30 |
31 | beg := 0
32 | end := 0
33 | for _, match := range matches {
34 | if n > 0 && len(strings) >= n-1 {
35 | break
36 | }
37 |
38 | end = match[0]
39 | if match[1] != 0 {
40 | strings = append(strings, s[beg:end])
41 | }
42 | beg = match[1]
43 | if len(re.SubexpNames()) > 1 {
44 | strings = append(strings, s[match[0]:match[1]])
45 | }
46 | }
47 |
48 | if end != len(s) {
49 | strings = append(strings, s[beg:])
50 | }
51 |
52 | return strings
53 | }
54 |
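55 | // A usage sketch mirroring util_test.go: with capturing parentheses, the
56 | // matched text is kept in the result, as with Python's re.split:
57 | //
58 | //	parts := RegexpSplit(regexp.MustCompile(`([\p{Han}]+)`),
59 | //		"BP神经网络如何训练才能在分类时增加区分度?", -1)
60 | //	// parts holds "BP", the Han run, and the trailing "?".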
--------------------------------------------------------------------------------
/util/util_test.go:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import (
4 | "regexp"
5 | "testing"
6 | )
7 |
8 | func TestRegexpSplit(t *testing.T) {
9 | result := RegexpSplit(regexp.MustCompile(`\p{Han}+`),
10 | "BP神经网络如何训练才能在分类时增加区分度?", -1)
11 | if len(result) != 2 {
12 | t.Fatal(result)
13 | }
14 | result = RegexpSplit(regexp.MustCompile(`(\p{Han})+`),
15 | "BP神经网络如何训练才能在分类时增加区分度?", -1)
16 | if len(result) != 3 {
17 | t.Fatal(result)
18 | }
19 | result = RegexpSplit(regexp.MustCompile(`([\p{Han}#]+)`),
20 | ",BP神经网络如何训练才能在分类时#增加区分度?", -1)
21 | if len(result) != 3 {
22 | t.Fatal(result)
23 | }
24 | }
25 |
--------------------------------------------------------------------------------