├── .gitignore ├── LICENSE ├── README.md ├── api ├── Dockerfile ├── Makefile ├── jiebago.go └── jiebago_test.go ├── dictionary ├── dict_std_utf8.txt ├── dict_user_utf8.txt ├── fs_pbemit.json ├── fs_pbstart.json ├── fs_pbtrans.json ├── idf_std_utf8.txt ├── stop_words_std_utf8.txt └── stop_words_user_utf8.txt ├── dll ├── Makefile └── jiebago.go ├── example ├── main.go └── main_api.go ├── go.mod ├── go.sum ├── jiebago.go ├── jiebago_test.go └── tokenizer ├── analyzer.go ├── common.go ├── cutword.go ├── dictionary.go ├── fstokenizer.go └── sentence.go /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .idea 3 | 4 | 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2022 Ze-Bin Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![logo](http://static.codebaoku.com/images/blogo.png)](http://www.codebaoku.com) 2 | 3 | JiebaGo 是 jieba 中文分词的 Go 语言版本。 4 | 5 | ## 功能特点 6 | 7 | + 支持多种分词方式,包括: 最大概率模式, HMM新词发现模式, 搜索引擎模式, 全模式 8 | + 支持抽取关键词,包括: 无权重关键词, 权重关键词 9 | + 支持多种使用方式,包括: Go语言包, Windows Dll, Web API, Docker 10 | + 支持在线并行添加字典词库和停止词 11 | + 全部代码使用 go 语言实现,全面兼容 jieba python 词库 12 | 13 | ## 引用方法 14 | 15 | 不使用包管理工具: 16 | ```bash 17 | go get github.com/wangshizebin/jiebago 18 | ``` 19 | 20 | 使用 go mod 管理: 21 | 代码中直接引用 github.com/wangshizebin/jiebago 即可。 22 | 23 | ## 特别注意 24 | 25 | 由于分词和提取关键词使用了中文预置词库和TF-IDF统计库,所以使用 jiebago,需要先下载项目中词库 dictionary 目录,并将 dictionary 放入项目的工作目录中。 26 | 我们也可以自己指定字典库的位置,不过需要在初始化 jiebago 对象的时候进行设置: 27 | 28 | ```golang 29 | jieBaGo := jiebago.NewJieBaGo("/data/mydict") 30 | ``` 31 | 32 | ## 功能示例 33 | 34 | ```golang 35 | package main 36 | 37 | import ( 38 | "fmt" 39 | "strings" 40 | 41 | "github.com/wangshizebin/jiebago" 42 | ) 43 | 44 | func main() { 45 | jieBaGo := jiebago.NewJieBaGo() 46 | // 可以指定字典库的位置 47 | // jieBaGo := jiebago.NewJieBaGo("/data/mydict") 48 | 49 | sentence := "Shell 位于用户与系统之间,用来帮助用户与操作系统进行沟通。通常都是文字模式的 Shell。" 50 | fmt.Println("原始语句:", sentence) 51 | fmt.Println() 52 | 53 | // 默认模式分词 54 | words := jieBaGo.Cut(sentence) 55 | fmt.Println("默认模式分词:", strings.Join(words,"/")) 56 | 57 | // 精确模式分词 58 | words = jieBaGo.CutAccurate(sentence) 59 | fmt.Println("精确模式分词:", strings.Join(words,"/")) 60 | 61 | // 全模式分词 62 | words = jieBaGo.CutFull(sentence) 63 | fmt.Println("全模式分词:", strings.Join(words,"/")) 64 | 65 | // 
NoHMM模式分词 66 | words = jieBaGo.CutNoHMM(sentence) 67 | fmt.Println("NoHMM模式分词:", strings.Join(words,"/")) 68 | 69 | // 搜索引擎模式分词 70 | words = jieBaGo.CutForSearch(sentence) 71 | fmt.Println("搜索引擎模式分词:", strings.Join(words,"/")) 72 | fmt.Println() 73 | 74 | // 提取关键词,即Tag标签 75 | keywords := jieBaGo.ExtractKeywords(sentence, 20) 76 | fmt.Println("提取关键词:", strings.Join(keywords,"/")) 77 | 78 | // 提取带权重的关键词,即Tag标签 79 | keywordsWeight := jieBaGo.ExtractKeywordsWeight(sentence, 20) 80 | fmt.Println("提取带权重的关键词:", keywordsWeight) 81 | fmt.Println() 82 | 83 | // 向字典加入单词 84 | exist, err := jieBaGo.AddDictWord("编程宝库", 3, "n") 85 | if err != nil { 86 | fmt.Println(err) 87 | } else { 88 | fmt.Println("向字典加入单词:编程宝库") 89 | if exist { 90 | fmt.Println("单词已经存在") 91 | } 92 | } 93 | 94 | // 向字典加入停止词 95 | exist, err = jieBaGo.AddStopWord("the") 96 | if err != nil { 97 | fmt.Println(err) 98 | } else { 99 | fmt.Println("向字典加入停止词:the") 100 | if exist { 101 | fmt.Println("单词已经存在") 102 | } 103 | } 104 | } 105 | ``` 106 | 107 | ``` 108 | 原始语句: Shell位于用户与系统之间,用来帮助用户与操作系统进行沟通。 109 | 110 | 默认模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作系统/进行/沟通/。 111 | 精确模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作系统/进行/沟通/。 112 | 全模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作/操作系统/系统/进行/沟通/。 113 | NoHMM模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作系统/进行/沟通/。 114 | 搜索引擎模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作/系统/操作系/操作系统/进行/沟通/。 115 | 116 | 提取关键词: 用户/Shell/操作系统/沟通/帮助/位于/系统/之间/进行 117 | 提取带权重的关键词: [{用户 1.364467214484} {Shell 1.19547675029} {操作系统 0.9265948663750001} {沟通 0.694890548758} {帮助 0.5809050240370001} {位于 0.496609078159} {系统 0.49601794343199995} {之间 0.446152979906} {进行 0.372712479502}] 118 | 119 | 向字典加入单词:编程宝库 120 | 向字典加入停止词:the 121 | ``` 122 | 123 | 更详细的例子参照 example/main.go, jiebago_test.go, api/iebago_test.go 中的代码。 124 | 125 | ## 单元测试 126 | go 包 127 | 128 | ```bash 129 | go test 130 | ``` 131 | 132 | Web API 133 | 134 | ```bash 135 | cd api 136 | go test 137 | ``` 138 | 139 | ## Contact 140 | 141 | + Email: 
`wangzebin@vip.163.com` 142 | + weixin: `bkra50` -------------------------------------------------------------------------------- /api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine 2 | ENV TZ=Asia/Shanghai 3 | RUN mkdir -p /jiebago/dictionary 4 | COPY ./jiebago /jiebago 5 | COPY ./dictionary /jiebago/dictionary 6 | WORKDIR /jiebago 7 | CMD ["./jiebago","-http_addr=:8118"] 8 | -------------------------------------------------------------------------------- /api/Makefile: -------------------------------------------------------------------------------- 1 | PROJECT = jiebago 2 | 3 | .PHONY: all clean build docker 4 | all:clean build 5 | @echo "Done!" 6 | 7 | build: 8 | go build -o $(PROJECT) $(PROJECT).go 9 | 10 | clean: 11 | rm -rf $(PROJECT) 12 | rm -rf dictionary 13 | 14 | docker: clean 15 | CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o $(PROJECT) $(PROJECT).go 16 | mkdir dictionary 17 | cp -r ../dictionary/ dictionary/ 18 | docker build . -t $(PROJECT):v1 19 | rm -rf $(PROJECT) 20 | rm -rf dictionary 21 | 22 | -------------------------------------------------------------------------------- /api/jiebago.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package main 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "log" 11 | "net/http" 12 | "strconv" 13 | "strings" 14 | 15 | "github.com/gin-gonic/gin" 16 | "github.com/wangshizebin/jiebago" 17 | "github.com/wangshizebin/jiebago/tokenizer" 18 | ) 19 | 20 | const ( 21 | Success = iota 22 | ErrorFail 23 | ErrorRequestMethod 24 | ErrorJsonData 25 | ErrorWordEmpty 26 | ErrorWeightInteger 27 | ErrorWeightRange 28 | ErrorCountInteger 29 | ) 30 | 31 | var ( 32 | jieBaGo = &jiebago.JieBaGo{} 33 | ) 34 | 35 | func main() { 36 | httpAddr := flag.String("http_addr", ":8118", 37 | "http_addr specifies the listening ip and port, for example: -http_addr 1.2.3.4:8888") 38 | 39 | dictPath := flag.String("dict_path", "", 40 | "dict_path specifies the path of dictionary, for example: -dict_path /data/dictionary") 41 | 42 | flag.Parse() 43 | 44 | jieBaGo = jiebago.NewJieBaGo(*dictPath) 45 | 46 | engine := gin.Default() 47 | 48 | engine.Any("/cut_words", cutWordsHandler) 49 | engine.Any("/extract_keywords", extractKeywordsHandler) 50 | engine.Any("/add_dict_word", addDictWordHandler) 51 | engine.Any("/add_stop_word", addStopWordHandler) 52 | 53 | if err := engine.Run(*httpAddr); err != nil { 54 | log.Print(err) 55 | } 56 | } 57 | 58 | type RequestCutWord struct { 59 | Sentence string `json:"s"` 60 | Mode string `json:"mode"` 61 | } 62 | 63 | type RequestExtractWord struct { 64 | Sentence string `json:"s"` 65 | Mode string `json:"mode"` 66 | Count int `json:"count"` 67 | } 68 | 69 | type RequestAddWord struct { 70 | Word string `json:"s"` 71 | Weight int `json:"weight"` 72 | Prop string `json:"prop"` 73 | } 74 | 75 | type Response struct { 76 | ErrCode int `json:"errcode"` 77 | ErrMsg string `json:"errmsg"` 78 | } 79 | 80 | func cutWordsHandler(c *gin.Context) { 81 | sentence := "" 82 | mode := "" 83 | if c.Request.Method == "GET" { 84 | mode = strings.ToLower(c.DefaultQuery("mode", "")) 85 | sentence = c.DefaultQuery("s", "") 86 | } else if c.Request.Method == "POST" { 87 | var 
request RequestCutWord 88 | err := c.BindJSON(&request) 89 | if err != nil { 90 | c.JSON(http.StatusOK, struct { 91 | Response 92 | Words []string `json:"words"` 93 | }{ 94 | Response: Response{ 95 | ErrCode: ErrorJsonData, 96 | ErrMsg: fmt.Sprintf(`invalid json data, the proper data format is {"s":"xx","mode":"xx"}`), 97 | }, 98 | Words: []string{}, 99 | }) 100 | return 101 | } 102 | mode = request.Mode 103 | sentence = request.Sentence 104 | } else { 105 | c.JSON(http.StatusOK, struct { 106 | Response 107 | Words []string `json:"words"` 108 | }{ 109 | Response: Response{ 110 | ErrCode: ErrorRequestMethod, 111 | ErrMsg: fmt.Sprintf(`iinvalid request method, only GET and POST methods are supported`), 112 | }, 113 | Words: []string{}, 114 | }) 115 | return 116 | } 117 | 118 | var words []string 119 | if mode == "full" { 120 | words = jieBaGo.CutFull(sentence) 121 | } else if mode == "accurate" { 122 | words = jieBaGo.CutAccurate(sentence) 123 | } else if mode == "nohmm" { 124 | words = jieBaGo.CutNoHMM(sentence) 125 | } else if mode == "search" { 126 | words = jieBaGo.CutForSearch(sentence) 127 | } else { 128 | words = jieBaGo.Cut(sentence) 129 | } 130 | 131 | c.JSON(http.StatusOK, struct { 132 | Response 133 | Words []string `json:"words"` 134 | }{ 135 | Response: Response{ 136 | ErrCode: Success, 137 | ErrMsg: "success", 138 | }, 139 | Words: words, 140 | }) 141 | } 142 | 143 | func extractKeywordsHandler(c *gin.Context) { 144 | sentence := "" 145 | count := 0 146 | mode := "" 147 | if c.Request.Method == "GET" { 148 | sentence = c.DefaultQuery("s", "") 149 | mode = c.DefaultQuery("mode", "") 150 | w := c.DefaultQuery("count", "0") 151 | var err error 152 | count, err = strconv.Atoi(w) 153 | if err != nil { 154 | c.JSON(http.StatusOK, struct { 155 | Response 156 | Tags []string `json:"tags"` 157 | }{ 158 | Response: Response{ 159 | ErrCode: ErrorCountInteger, 160 | ErrMsg: "the count must be an integer", 161 | }, 162 | Tags: []string{}, 163 | }) 164 | return 165 | 
} 166 | } else if c.Request.Method == "POST" { 167 | var request RequestExtractWord 168 | err := c.BindJSON(&request) 169 | if err != nil { 170 | c.JSON(http.StatusOK, struct { 171 | Response 172 | Tags []string `json:"tags"` 173 | }{ 174 | Response: Response{ 175 | ErrCode: ErrorJsonData, 176 | ErrMsg: fmt.Sprintf(`invalid json data, the proper data format is {"s":"xx","count":xx,"mode":"xx"}`), 177 | }, 178 | Tags: []string{}, 179 | }) 180 | return 181 | } 182 | sentence = request.Sentence 183 | mode = request.Mode 184 | count = request.Count 185 | } else { 186 | c.JSON(http.StatusOK, struct { 187 | Response 188 | Tags []string `json:"tags"` 189 | }{ 190 | Response: Response{ 191 | ErrCode: ErrorRequestMethod, 192 | ErrMsg: fmt.Sprintf(`iinvalid request method, only GET and POST methods are supported`), 193 | }, 194 | Tags: []string{}, 195 | }) 196 | return 197 | } 198 | if count <= 0 { 199 | count = 20 200 | } 201 | 202 | if mode == "weight" { 203 | tags := jieBaGo.ExtractKeywordsWeight(sentence, count) 204 | c.JSON(http.StatusOK, struct { 205 | Response 206 | Tags []tokenizer.Keyword `json:"tags"` 207 | }{ 208 | Response: Response{ 209 | ErrCode: Success, 210 | ErrMsg: "success", 211 | }, 212 | Tags: tags, 213 | }) 214 | } else { 215 | tags := jieBaGo.ExtractKeywords(sentence, count) 216 | c.JSON(http.StatusOK, struct { 217 | Response 218 | Tags []string `json:"tags"` 219 | }{ 220 | Response: Response{ 221 | ErrCode: Success, 222 | ErrMsg: "success", 223 | }, 224 | Tags: tags, 225 | }) 226 | } 227 | } 228 | 229 | func addDictWordHandler(c *gin.Context) { 230 | word := "" 231 | weight := 0 232 | prop := "" 233 | if c.Request.Method == "GET" { 234 | word = c.DefaultQuery("s", "") 235 | w := c.DefaultQuery("weight", "0") 236 | var err error 237 | weight, err = strconv.Atoi(w) 238 | if err != nil { 239 | c.JSON(http.StatusOK, Response{ 240 | ErrCode: ErrorWeightInteger, 241 | ErrMsg: "the weight must be an integer", 242 | }) 243 | return 244 | } 245 | prop = 
c.DefaultQuery("prop", "") 246 | } else if c.Request.Method == "POST" { 247 | var request RequestAddWord 248 | err := c.BindJSON(&request) 249 | if err != nil { 250 | c.JSON(http.StatusOK, Response{ 251 | ErrCode: ErrorJsonData, 252 | ErrMsg: fmt.Sprintf(`invalid json data, the proper data format is {"s":"xx","weight":xx,"prop":"xx"}`), 253 | }) 254 | return 255 | } 256 | word = request.Word 257 | weight = request.Weight 258 | prop = request.Prop 259 | } 260 | 261 | word = strings.TrimSpace(word) 262 | if len(word) == 0 { 263 | c.JSON(http.StatusOK, Response{ 264 | ErrCode: ErrorWordEmpty, 265 | ErrMsg: "the word is empty", 266 | }) 267 | return 268 | } 269 | 270 | if weight < 0 || weight > 5000 { 271 | c.JSON(http.StatusOK, Response{ 272 | ErrCode: ErrorWeightRange, 273 | ErrMsg: "the weight must be between 0 and 5000", 274 | }) 275 | return 276 | } 277 | 278 | if prop == "" { 279 | prop = "n" 280 | } 281 | 282 | exist, err := jieBaGo.AddDictWord(word, weight, prop) 283 | if err != nil { 284 | c.JSON(http.StatusOK, Response{ 285 | ErrCode: ErrorFail, 286 | ErrMsg: err.Error(), 287 | }) 288 | return 289 | } 290 | 291 | message := "success" 292 | if exist { 293 | message = "the word already exists" 294 | } 295 | c.JSON(http.StatusOK, Response{ 296 | ErrCode: Success, 297 | ErrMsg: message, 298 | }) 299 | } 300 | 301 | func addStopWordHandler(c *gin.Context) { 302 | word := "" 303 | if c.Request.Method == "GET" { 304 | word = c.DefaultQuery("s", "") 305 | } else if c.Request.Method == "POST" { 306 | var request RequestAddWord 307 | err := c.BindJSON(&request) 308 | if err != nil { 309 | c.JSON(http.StatusOK, Response{ 310 | ErrCode: ErrorJsonData, 311 | ErrMsg: fmt.Sprintf(`invalid json data, the proper data format is {"s":"xx"}`), 312 | }) 313 | return 314 | } 315 | word = request.Word 316 | } 317 | 318 | word = strings.TrimSpace(word) 319 | if len(word) == 0 { 320 | c.JSON(http.StatusOK, Response{ 321 | ErrCode: ErrorWordEmpty, 322 | ErrMsg: "the word is empty", 
323 | }) 324 | return 325 | } 326 | 327 | exist, err := jieBaGo.AddStopWord(word) 328 | if err != nil { 329 | c.JSON(http.StatusOK, Response{ 330 | ErrCode: ErrorFail, 331 | ErrMsg: err.Error(), 332 | }) 333 | return 334 | } 335 | 336 | message := "success" 337 | if exist { 338 | message = "the word already exists" 339 | } 340 | c.JSON(http.StatusOK, Response{ 341 | ErrCode: Success, 342 | ErrMsg: message, 343 | }) 344 | } 345 | -------------------------------------------------------------------------------- /api/jiebago_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "bytes" 9 | "encoding/json" 10 | "errors" 11 | "fmt" 12 | "io" 13 | "io/ioutil" 14 | "net/http" 15 | "strconv" 16 | "strings" 17 | "testing" 18 | "time" 19 | 20 | "github.com/wangshizebin/jiebago/tokenizer" 21 | ) 22 | 23 | var ( 24 | sentence = "Shell位于用户与系统之间,用来帮助用户与操作系统进行沟通。" 25 | resultTest = []string{"Shell", "操作系统", "用户"} 26 | ) 27 | 28 | func TestCutWordsGet(t *testing.T) { 29 | t.Log(sentence) 30 | 31 | url := "http://localhost:8118/cut_words?s=" + sentence 32 | modes := []string{"", "accurate", "full", "nohmm", "search"} 33 | for _, mode := range modes { 34 | t.Log("=== mode: " + mode) 35 | result, err := Get(url + "&mode=" + mode) 36 | if err != nil { 37 | t.Error(err) 38 | return 39 | } 40 | var w struct { 41 | Words []string `json:"words"` 42 | } 43 | err = json.Unmarshal([]byte(result), &w) 44 | if err != nil { 45 | t.Error(err) 46 | return 47 | } 48 | t.Log("结果:", strings.Join(w.Words, "/")) 49 | 50 | for _, word := range resultTest { 51 | ok := false 52 | for _, v := range w.Words { 53 | if word == v { 54 | ok = true 55 | } 56 | } 57 | if !ok { 58 | t.Error(word + " not pass") 59 | } else { 60 | t.Log(word + " OK") 61 | } 62 | } 63 | } 64 
| } 65 | 66 | func TestCutWordsPost(t *testing.T) { 67 | t.Log(sentence) 68 | 69 | url := "http://localhost:8118/cut_words" 70 | modes := []string{"", "accurate", "full", "nohmm", "search"} 71 | for _, mode := range modes { 72 | t.Log("=== mode: " + mode) 73 | data := fmt.Sprintf(`{"s":"%s", "mode":"%s"}`, sentence, mode) 74 | result, err := Post(url, data, "application/json") 75 | if err != nil { 76 | t.Error(err) 77 | } 78 | var w struct { 79 | Words []string `json:"words"` 80 | } 81 | err = json.Unmarshal([]byte(result), &w) 82 | if err != nil { 83 | t.Error(err) 84 | return 85 | } 86 | t.Log("结果:", strings.Join(w.Words, "/")) 87 | 88 | for _, word := range resultTest { 89 | ok := false 90 | for _, v := range w.Words { 91 | if word == v { 92 | ok = true 93 | } 94 | } 95 | if !ok { 96 | t.Error(word + " not pass") 97 | } else { 98 | t.Log(word + " OK") 99 | } 100 | } 101 | } 102 | } 103 | 104 | func TestExtractKeywordsGet(t *testing.T) { 105 | t.Log(sentence) 106 | 107 | url := "http://localhost:8118/extract_keywords?s=" + sentence + "&count=3" 108 | result, err := Get(url) 109 | if err != nil { 110 | t.Error(err) 111 | return 112 | } 113 | 114 | var w struct { 115 | Tags []string `json:"tags"` 116 | } 117 | 118 | err = json.Unmarshal([]byte(result), &w) 119 | if err != nil { 120 | t.Error(err) 121 | return 122 | } 123 | t.Log("提取关键字:", strings.Join(w.Tags, "/")) 124 | 125 | for _, word := range resultTest { 126 | ok := false 127 | for _, v := range w.Tags { 128 | if word == v { 129 | ok = true 130 | } 131 | } 132 | if !ok { 133 | t.Error(word + " not pass") 134 | } else { 135 | t.Log(word + " OK") 136 | } 137 | } 138 | } 139 | 140 | func TestExtractKeywordsPost(t *testing.T) { 141 | t.Log(sentence) 142 | 143 | url := "http://localhost:8118/extract_keywords" 144 | data := fmt.Sprintf(`{"s":"%s", "count":%d}`, sentence, 3) 145 | result, err := Post(url, data, "application/json") 146 | if err != nil { 147 | t.Error(err) 148 | return 149 | } 150 | 151 | var w struct 
{ 152 | Tags []string `json:"tags"` 153 | } 154 | 155 | err = json.Unmarshal([]byte(result), &w) 156 | if err != nil { 157 | t.Error(err) 158 | return 159 | } 160 | t.Log("提取关键字:", strings.Join(w.Tags, "/")) 161 | 162 | for _, word := range resultTest { 163 | ok := false 164 | for _, v := range w.Tags { 165 | if word == v { 166 | ok = true 167 | } 168 | } 169 | if !ok { 170 | t.Error(word + " not pass") 171 | } else { 172 | t.Log(word + " OK") 173 | } 174 | } 175 | } 176 | 177 | func TestExtractKeywordsWeightGet(t *testing.T) { 178 | t.Log(sentence) 179 | 180 | url := "http://localhost:8118/extract_keywords?s=" + sentence + "&mode=weight&count=3" 181 | result, err := Get(url) 182 | if err != nil { 183 | t.Error(err) 184 | return 185 | } 186 | 187 | var w struct { 188 | Tags []tokenizer.Keyword `json:"tags"` 189 | } 190 | 191 | err = json.Unmarshal([]byte(result), &w) 192 | if err != nil { 193 | t.Error(err) 194 | return 195 | } 196 | t.Log("提取关键字:", w) 197 | 198 | for _, word := range resultTest { 199 | ok := false 200 | for _, v := range w.Tags { 201 | if word == v.Word { 202 | ok = true 203 | } 204 | } 205 | if !ok { 206 | t.Error(word + " not pass") 207 | } else { 208 | t.Log(word + " OK") 209 | } 210 | } 211 | } 212 | 213 | func TestExtractKeywordsWeightPost(t *testing.T) { 214 | t.Log(sentence) 215 | 216 | url := "http://localhost:8118/extract_keywords" 217 | data := fmt.Sprintf(`{"s":"%s", "mode":"%s","count":%d}`, sentence, "weight", 3) 218 | result, err := Post(url, data, "application/json") 219 | if err != nil { 220 | t.Error(err) 221 | return 222 | } 223 | 224 | var w struct { 225 | Tags []tokenizer.Keyword `json:"tags"` 226 | } 227 | 228 | err = json.Unmarshal([]byte(result), &w) 229 | if err != nil { 230 | t.Error(err) 231 | return 232 | } 233 | t.Log("提取关键字:", w) 234 | 235 | for _, word := range resultTest { 236 | ok := false 237 | for _, v := range w.Tags { 238 | if word == v.Word { 239 | ok = true 240 | } 241 | } 242 | if !ok { 243 | t.Error(word + " 
not pass") 244 | } else { 245 | t.Log(word + " OK") 246 | } 247 | } 248 | } 249 | 250 | func TestAddDictWordsGet(t *testing.T) { 251 | word := "编程宝库" 252 | t.Log("=== 添加字典单词: " + word) 253 | url := fmt.Sprintf(`http://localhost:8118/add_dict_word?s=%s&weight=%d&prop=%s`, word, 3, "n") 254 | result, err := Get(url) 255 | if err != nil { 256 | t.Error(err) 257 | return 258 | } 259 | var response struct { 260 | ErrCode int `json:"errcode"` 261 | ErrMsg string `json:"errmsg"` 262 | } 263 | err = json.Unmarshal([]byte(result), &response) 264 | if err != nil { 265 | t.Error(err) 266 | return 267 | } 268 | if response.ErrMsg != "" { 269 | t.Log(response.ErrMsg) 270 | } 271 | } 272 | 273 | func TestAddDictWordsPost(t *testing.T) { 274 | url := "http://localhost:8118/add_dict_word" 275 | 276 | word := "编程宝库" 277 | t.Log("=== 添加字典单词: " + word) 278 | data := fmt.Sprintf(`{"s":"%s", "weight":%d,"prop":"%s"}`, word, 3, "n") 279 | result, err := Post(url, data, "application/json") 280 | if err != nil { 281 | t.Error(err) 282 | return 283 | } 284 | var response struct { 285 | ErrCode int `json:"errcode"` 286 | ErrMsg string `json:"errmsg"` 287 | } 288 | err = json.Unmarshal([]byte(result), &response) 289 | if err != nil { 290 | t.Error(err) 291 | return 292 | } 293 | if response.ErrMsg != "" { 294 | t.Log(response.ErrMsg) 295 | } 296 | } 297 | 298 | func TestAddStopWordsGet(t *testing.T) { 299 | word := "the" 300 | t.Log("=== 添加停止词: " + word) 301 | url := fmt.Sprintf(`http://localhost:8118/add_stop_word?s=%s`, word) 302 | result, err := Get(url) 303 | if err != nil { 304 | t.Error(err) 305 | return 306 | } 307 | var response struct { 308 | ErrCode int `json:"errcode"` 309 | ErrMsg string `json:"errmsg"` 310 | } 311 | err = json.Unmarshal([]byte(result), &response) 312 | if err != nil { 313 | t.Error(err) 314 | return 315 | } 316 | if response.ErrMsg != "" { 317 | t.Log(response.ErrMsg) 318 | } 319 | } 320 | 321 | func TestAddStopWordsPost(t *testing.T) { 322 | url := 
"http://localhost:8118/add_stop_word" 323 | 324 | word := "the" 325 | t.Log("=== 添加停止词: " + word) 326 | data := fmt.Sprintf(`{"s":"%s"}`, word) 327 | result, err := Post(url, data, "application/json") 328 | if err != nil { 329 | t.Error(err) 330 | return 331 | } 332 | var response struct { 333 | ErrCode int `json:"errcode"` 334 | ErrMsg string `json:"errmsg"` 335 | } 336 | err = json.Unmarshal([]byte(result), &response) 337 | if err != nil { 338 | t.Error(err) 339 | return 340 | } 341 | if response.ErrMsg != "" { 342 | t.Log(response.ErrMsg) 343 | } 344 | } 345 | 346 | // 发送GET请求 347 | // url: 请求地址 348 | // response: 请求返回的内容 349 | func Get(url string) (string, error) { 350 | // 超时时间:5秒 351 | client := &http.Client{Timeout: 5 * time.Second} 352 | resp, err := client.Get(url) 353 | if err != nil { 354 | return "", err 355 | } 356 | defer resp.Body.Close() 357 | if resp.StatusCode != http.StatusOK { 358 | return "", errors.New("status code:" + strconv.Itoa(resp.StatusCode)) 359 | } 360 | var buffer [256]byte 361 | result := bytes.NewBuffer(nil) 362 | for { 363 | n, err := resp.Body.Read(buffer[0:]) 364 | result.Write(buffer[0:n]) 365 | if err != nil && err == io.EOF { 366 | break 367 | } else if err != nil { 368 | return "", err 369 | } 370 | } 371 | 372 | return result.String(), nil 373 | } 374 | 375 | // 发送POST请求 376 | // url: 请求地址 377 | // data: POST请求提交的数据 378 | // contentType: 请求体格式,如:application/json 379 | // content: 请求放回的内容 380 | func Post(url string, data string, contentType string) (string, error) { 381 | // 超时时间:5秒 382 | client := &http.Client{Timeout: 5 * time.Second} 383 | resp, err := client.Post(url, contentType, bytes.NewBuffer([]byte(data))) 384 | if err != nil { 385 | return "", err 386 | } 387 | defer resp.Body.Close() 388 | 389 | result, err := ioutil.ReadAll(resp.Body) 390 | if err != nil { 391 | return "", err 392 | } 393 | return string(result), nil 394 | } 395 | -------------------------------------------------------------------------------- 
/dictionary/dict_user_utf8.txt: -------------------------------------------------------------------------------- 1 | 编程宝库 3 n 2 | 王泽宾 3 n 3 | codebaoku 3 n 4 | -------------------------------------------------------------------------------- /dictionary/fs_pbstart.json: -------------------------------------------------------------------------------- 1 | {"B":-0.26268660809250016,"E":-3.14e+100,"M":-3.14e+100,"S":-1.4652633398537678} -------------------------------------------------------------------------------- /dictionary/fs_pbtrans.json: -------------------------------------------------------------------------------- 1 | {"B": {"E": -0.510825623765990, "M": -0.916290731874155}, 2 | "E": {"B": -0.5897149736854513, "S": -0.8085250474669937}, 3 | "M": {"E": -0.33344856811948514, "M": -1.2603623820268226}, 4 | "S": {"B": -0.7211965654669841, "S": -0.6658631448798212}} -------------------------------------------------------------------------------- /dictionary/stop_words_std_utf8.txt: -------------------------------------------------------------------------------- 1 | " 2 | . 3 | 。 4 | , 5 | 、 6 | ! 7 | ? 8 | : 9 | ; 10 | ` 11 | ﹑ 12 | • 13 | " 14 | ^ 15 | … 16 | ‘ 17 | ’ 18 | “ 19 | ” 20 | 〝 21 | 〞 22 | ~ 23 | \ 24 | ∕ 25 | | 26 | ¦ 27 | ‖ 28 | —  29 | ( 30 | ) 31 | 〈 32 | 〉 33 | ﹞ 34 | ﹝ 35 | 「 36 | 」 37 | ‹ 38 | › 39 | 〖 40 | 〗 41 | 】 42 | 【 43 | » 44 | « 45 | 』 46 | 『 47 | 〕 48 | 〔 49 | 》 50 | 《 51 | } 52 | { 53 | ] 54 | [ 55 | ﹐ 56 | ¸ 57 | ﹕ 58 | ︰ 59 | ﹔ 60 | ; 61 | ! 62 | ¡ 63 | ? 
64 | ¿ 65 | ﹖ 66 | ﹌ 67 | ﹏ 68 | ﹋ 69 | ' 70 | ´ 71 | ˊ 72 | ˋ 73 | - 74 | ― 75 | ﹫ 76 | @ 77 | ︳ 78 | ︴ 79 | _ 80 | ¯ 81 | _ 82 |  ̄ 83 | ﹢ 84 | + 85 | ﹦ 86 | = 87 | ﹤ 88 | ‐ 89 | < 90 | ­ 91 | ˜ 92 | ~ 93 | ﹟ 94 | # 95 | ﹩ 96 | $ 97 | ﹠ 98 | & 99 | ﹪ 100 | % 101 | ﹡ 102 | * 103 | ﹨ 104 | \ 105 | ﹍ 106 | ﹉ 107 | ﹎ 108 | ﹊ 109 | ˇ 110 | ︵ 111 | ︶ 112 | ︷ 113 | ︸ 114 | ︹ 115 | ︿ 116 | ﹀ 117 | ︺ 118 | ︽ 119 | ︾ 120 | _ 121 | ˉ 122 | ﹁ 123 | ﹂ 124 | ﹃ 125 | ﹄ 126 | ︻ 127 | ︼ 128 | 的 129 | 了 130 | the 131 | a 132 | an 133 | that 134 | those 135 | this 136 | that 137 | $ 138 | 0 139 | 1 140 | 2 141 | 3 142 | 4 143 | 5 144 | 6 145 | 7 146 | 8 147 | 9 148 | ? 149 | _ 150 | “ 151 | ” 152 | 、 153 | 。 154 | 《 155 | 》 156 | 一 157 | 一些 158 | 一何 159 | 一切 160 | 一则 161 | 一方面 162 | 一旦 163 | 一来 164 | 一样 165 | 一般 166 | 一转眼 167 | 万一 168 | 上 169 | 上下 170 | 下 171 | 不 172 | 不仅 173 | 不但 174 | 不光 175 | 不单 176 | 不只 177 | 不外乎 178 | 不如 179 | 不妨 180 | 不尽 181 | 不尽然 182 | 不得 183 | 不怕 184 | 不惟 185 | 不成 186 | 不拘 187 | 不料 188 | 不是 189 | 不比 190 | 不然 191 | 不特 192 | 不独 193 | 不管 194 | 不至于 195 | 不若 196 | 不论 197 | 不过 198 | 不问 199 | 与 200 | 与其 201 | 与其说 202 | 与否 203 | 与此同时 204 | 且 205 | 且不说 206 | 且说 207 | 两者 208 | 个 209 | 个别 210 | 临 211 | 为 212 | 为了 213 | 为什么 214 | 为何 215 | 为止 216 | 为此 217 | 为着 218 | 乃 219 | 乃至 220 | 乃至于 221 | 么 222 | 之 223 | 之一 224 | 之所以 225 | 之类 226 | 乌乎 227 | 乎 228 | 乘 229 | 也 230 | 也好 231 | 也罢 232 | 了 233 | 二来 234 | 于 235 | 于是 236 | 于是乎 237 | 云云 238 | 云尔 239 | 些 240 | 亦 241 | 人 242 | 人们 243 | 人家 244 | 什么 245 | 什么样 246 | 今 247 | 介于 248 | 仍 249 | 仍旧 250 | 从 251 | 从此 252 | 从而 253 | 他 254 | 他人 255 | 他们 256 | 以 257 | 以上 258 | 以为 259 | 以便 260 | 以免 261 | 以及 262 | 以故 263 | 以期 264 | 以来 265 | 以至 266 | 以至于 267 | 以致 268 | 们 269 | 任 270 | 任何 271 | 任凭 272 | 似的 273 | 但 274 | 但凡 275 | 但是 276 | 何 277 | 何以 278 | 何况 279 | 何处 280 | 何时 281 | 余外 282 | 作为 283 | 你 284 | 你们 285 | 使 286 | 使得 287 | 例如 288 | 依 289 | 依据 290 | 依照 291 | 便于 292 | 俺 293 | 俺们 294 | 倘 295 | 倘使 296 | 倘或 297 | 倘然 298 | 倘若 299 | 借 300 | 
假使 301 | 假如 302 | 假若 303 | 傥然 304 | 像 305 | 儿 306 | 先不先 307 | 光是 308 | 全体 309 | 全部 310 | 兮 311 | 关于 312 | 其 313 | 其一 314 | 其中 315 | 其二 316 | 其他 317 | 其余 318 | 其它 319 | 其次 320 | 具体地说 321 | 具体说来 322 | 兼之 323 | 内 324 | 再 325 | 再其次 326 | 再则 327 | 再有 328 | 再者 329 | 再者说 330 | 再说 331 | 冒 332 | 冲 333 | 况且 334 | 几 335 | 几时 336 | 凡 337 | 凡是 338 | 凭 339 | 凭借 340 | 出于 341 | 出来 342 | 分别 343 | 则 344 | 则甚 345 | 别 346 | 别人 347 | 别处 348 | 别是 349 | 别的 350 | 别管 351 | 别说 352 | 到 353 | 前后 354 | 前此 355 | 前者 356 | 加之 357 | 加以 358 | 即 359 | 即令 360 | 即使 361 | 即便 362 | 即如 363 | 即或 364 | 即若 365 | 却 366 | 去 367 | 又 368 | 又及 369 | 及 370 | 及其 371 | 及至 372 | 反之 373 | 反而 374 | 反过来 375 | 反过来说 376 | 受到 377 | 另 378 | 另一方面 379 | 另外 380 | 另悉 381 | 只 382 | 只当 383 | 只怕 384 | 只是 385 | 只有 386 | 只消 387 | 只要 388 | 只限 389 | 叫 390 | 叮咚 391 | 可 392 | 可以 393 | 可是 394 | 可见 395 | 各 396 | 各个 397 | 各位 398 | 各种 399 | 各自 400 | 同 401 | 同时 402 | 后 403 | 后者 404 | 向 405 | 向使 406 | 向着 407 | 吓 408 | 吗 409 | 否则 410 | 吧 411 | 吧哒 412 | 吱 413 | 呀 414 | 呃 415 | 呕 416 | 呗 417 | 呜 418 | 呜呼 419 | 呢 420 | 呵 421 | 呵呵 422 | 呸 423 | 呼哧 424 | 咋 425 | 和 426 | 咚 427 | 咦 428 | 咧 429 | 咱 430 | 咱们 431 | 咳 432 | 哇 433 | 哈 434 | 哈哈 435 | 哉 436 | 哎 437 | 哎呀 438 | 哎哟 439 | 哗 440 | 哟 441 | 哦 442 | 哩 443 | 哪 444 | 哪个 445 | 哪些 446 | 哪儿 447 | 哪天 448 | 哪年 449 | 哪怕 450 | 哪样 451 | 哪边 452 | 哪里 453 | 哼 454 | 哼唷 455 | 唉 456 | 唯有 457 | 啊 458 | 啐 459 | 啥 460 | 啦 461 | 啪达 462 | 啷当 463 | 喂 464 | 喏 465 | 喔唷 466 | 喽 467 | 嗡 468 | 嗡嗡 469 | 嗬 470 | 嗯 471 | 嗳 472 | 嘎 473 | 嘎登 474 | 嘘 475 | 嘛 476 | 嘻 477 | 嘿 478 | 嘿嘿 479 | 因 480 | 因为 481 | 因了 482 | 因此 483 | 因着 484 | 因而 485 | 固然 486 | 在 487 | 在下 488 | 在于 489 | 地 490 | 基于 491 | 处在 492 | 多 493 | 多么 494 | 多少 495 | 大 496 | 大家 497 | 她 498 | 她们 499 | 好 500 | 如 501 | 如上 502 | 如上所述 503 | 如下 504 | 如何 505 | 如其 506 | 如同 507 | 如是 508 | 如果 509 | 如此 510 | 如若 511 | 始而 512 | 孰料 513 | 孰知 514 | 宁 515 | 宁可 516 | 宁愿 517 | 宁肯 518 | 它 519 | 它们 520 | 对 521 | 对于 522 | 对待 523 | 对方 524 | 对比 525 | 将 526 | 小 527 | 尔 528 | 尔后 529 | 尔尔 530 | 尚且 
531 | 就 532 | 就是 533 | 就是了 534 | 就是说 535 | 就算 536 | 就要 537 | 尽 538 | 尽管 539 | 尽管如此 540 | 岂但 541 | 己 542 | 已 543 | 已矣 544 | 巴 545 | 巴巴 546 | 并 547 | 并且 548 | 并非 549 | 庶乎 550 | 庶几 551 | 开外 552 | 开始 553 | 归 554 | 归齐 555 | 当 556 | 当地 557 | 当然 558 | 当着 559 | 彼 560 | 彼时 561 | 彼此 562 | 往 563 | 待 564 | 很 565 | 得 566 | 得了 567 | 怎 568 | 怎么 569 | 怎么办 570 | 怎么样 571 | 怎奈 572 | 怎样 573 | 总之 574 | 总的来看 575 | 总的来说 576 | 总的说来 577 | 总而言之 578 | 恰恰相反 579 | 您 580 | 惟其 581 | 慢说 582 | 我 583 | 我们 584 | 或 585 | 或则 586 | 或是 587 | 或曰 588 | 或者 589 | 截至 590 | 所 591 | 所以 592 | 所在 593 | 所幸 594 | 所有 595 | 才 596 | 才能 597 | 打 598 | 打从 599 | 把 600 | 抑或 601 | 拿 602 | 按 603 | 按照 604 | 换句话说 605 | 换言之 606 | 据 607 | 据此 608 | 接着 609 | 故 610 | 故此 611 | 故而 612 | 旁人 613 | 无 614 | 无宁 615 | 无论 616 | 既 617 | 既往 618 | 既是 619 | 既然 620 | 时候 621 | 是 622 | 是以 623 | 是的 624 | 曾 625 | 替 626 | 替代 627 | 最 628 | 有 629 | 有些 630 | 有关 631 | 有及 632 | 有时 633 | 有的 634 | 望 635 | 朝 636 | 朝着 637 | 本 638 | 本人 639 | 本地 640 | 本着 641 | 本身 642 | 来 643 | 来着 644 | 来自 645 | 来说 646 | 极了 647 | 果然 648 | 果真 649 | 某 650 | 某个 651 | 某些 652 | 某某 653 | 根据 654 | 欤 655 | 正值 656 | 正如 657 | 正巧 658 | 正是 659 | 此 660 | 此地 661 | 此处 662 | 此外 663 | 此时 664 | 此次 665 | 此间 666 | 毋宁 667 | 每 668 | 每当 669 | 比 670 | 比及 671 | 比如 672 | 比方 673 | 没奈何 674 | 沿 675 | 沿着 676 | 漫说 677 | 焉 678 | 然则 679 | 然后 680 | 然而 681 | 照 682 | 照着 683 | 犹且 684 | 犹自 685 | 甚且 686 | 甚么 687 | 甚或 688 | 甚而 689 | 甚至 690 | 甚至于 691 | 用 692 | 用来 693 | 由 694 | 由于 695 | 由是 696 | 由此 697 | 由此可见 698 | 的 699 | 的确 700 | 的话 701 | 直到 702 | 相对而言 703 | 省得 704 | 看 705 | 眨眼 706 | 着 707 | 着呢 708 | 矣 709 | 矣乎 710 | 矣哉 711 | 离 712 | 竟而 713 | 第 714 | 等 715 | 等到 716 | 等等 717 | 简言之 718 | 管 719 | 类如 720 | 紧接着 721 | 纵 722 | 纵令 723 | 纵使 724 | 纵然 725 | 经 726 | 经过 727 | 结果 728 | 给 729 | 继之 730 | 继后 731 | 继而 732 | 综上所述 733 | 罢了 734 | 者 735 | 而 736 | 而且 737 | 而况 738 | 而后 739 | 而外 740 | 而已 741 | 而是 742 | 而言 743 | 能 744 | 能否 745 | 腾 746 | 自 747 | 自个儿 748 | 自从 749 | 自各儿 750 | 自后 751 | 自家 752 | 自己 753 | 自打 754 | 自身 755 | 至 756 | 
至于 757 | 至今 758 | 至若 759 | 致 760 | 般的 761 | 若 762 | 若夫 763 | 若是 764 | 若果 765 | 若非 766 | 莫不然 767 | 莫如 768 | 莫若 769 | 虽 770 | 虽则 771 | 虽然 772 | 虽说 773 | 被 774 | 要 775 | 要不 776 | 要不是 777 | 要不然 778 | 要么 779 | 要是 780 | 譬喻 781 | 譬如 782 | 让 783 | 许多 784 | 论 785 | 设使 786 | 设或 787 | 设若 788 | 诚如 789 | 诚然 790 | 该 791 | 说来 792 | 诸 793 | 诸位 794 | 诸如 795 | 谁 796 | 谁人 797 | 谁料 798 | 谁知 799 | 贼死 800 | 赖以 801 | 赶 802 | 起 803 | 起见 804 | 趁 805 | 趁着 806 | 越是 807 | 距 808 | 跟 809 | 较 810 | 较之 811 | 边 812 | 过 813 | 还 814 | 还是 815 | 还有 816 | 还要 817 | 这 818 | 这一来 819 | 这个 820 | 这么 821 | 这么些 822 | 这么样 823 | 这么点儿 824 | 这些 825 | 这会儿 826 | 这儿 827 | 这就是说 828 | 这时 829 | 这样 830 | 这次 831 | 这般 832 | 这边 833 | 这里 834 | 进而 835 | 连 836 | 连同 837 | 逐步 838 | 通过 839 | 遵循 840 | 遵照 841 | 那 842 | 那个 843 | 那么 844 | 那么些 845 | 那么样 846 | 那些 847 | 那会儿 848 | 那儿 849 | 那时 850 | 那样 851 | 那般 852 | 那边 853 | 那里 854 | 都 855 | 鄙人 856 | 鉴于 857 | 针对 858 | 阿 859 | 除 860 | 除了 861 | 除外 862 | 除开 863 | 除此之外 864 | 除非 865 | 随 866 | 随后 867 | 随时 868 | 随着 869 | 难道说 870 | 非但 871 | 非徒 872 | 非特 873 | 非独 874 | 靠 875 | 顺 876 | 顺着 877 | 首先 878 | ! 879 | , 880 | : 881 | ; 882 | ? 
883 | to 884 | can 885 | could 886 | dare 887 | do 888 | did 889 | does 890 | may 891 | might 892 | would 893 | should 894 | must 895 | will 896 | ought 897 | shall 898 | need 899 | is 900 | a 901 | am 902 | are 903 | about 904 | according 905 | after 906 | against 907 | all 908 | almost 909 | also 910 | although 911 | among 912 | an 913 | and 914 | another 915 | any 916 | anything 917 | approximately 918 | as 919 | asked 920 | at 921 | back 922 | because 923 | before 924 | besides 925 | between 926 | both 927 | but 928 | by 929 | call 930 | called 931 | currently 932 | despite 933 | did 934 | do 935 | dr 936 | during 937 | each 938 | earlier 939 | eight 940 | even 941 | eventually 942 | every 943 | everything 944 | five 945 | for 946 | four 947 | from 948 | he 949 | her 950 | here 951 | his 952 | how 953 | however 954 | i 955 | if 956 | in 957 | indeed 958 | instead 959 | it 960 | its 961 | just 962 | last 963 | like 964 | major 965 | many 966 | may 967 | maybe 968 | meanwhile 969 | more 970 | moreover 971 | most 972 | mr 973 | mrs 974 | ms 975 | much 976 | my 977 | neither 978 | net 979 | never 980 | nevertheless 981 | nine 982 | no 983 | none 984 | not 985 | nothing 986 | now 987 | of 988 | on 989 | once 990 | one 991 | only 992 | or 993 | other 994 | our 995 | over 996 | partly 997 | perhaps 998 | prior 999 | regarding 1000 | separately 1001 | seven 1002 | several 1003 | she 1004 | should 1005 | similarly 1006 | since 1007 | six 1008 | so 1009 | some 1010 | somehow 1011 | still 1012 | such 1013 | ten 1014 | that 1015 | the 1016 | their 1017 | then 1018 | there 1019 | therefore 1020 | these 1021 | they 1022 | this 1023 | those 1024 | though 1025 | three 1026 | to 1027 | two 1028 | under 1029 | unless 1030 | unlike 1031 | until 1032 | volume 1033 | we 1034 | what 1035 | whatever 1036 | whats 1037 | when 1038 | where 1039 | which 1040 | while 1041 | why 1042 | with 1043 | without 1044 | yesterday 1045 | yet 1046 | you 1047 | your 1048 | aboard 1049 | about 1050 | 
above 1051 | according to 1052 | across 1053 | afore 1054 | after 1055 | against 1056 | agin 1057 | along 1058 | alongside 1059 | amid 1060 | amidst 1061 | among 1062 | amongst 1063 | anent 1064 | around 1065 | as 1066 | aslant 1067 | astride 1068 | at 1069 | athwart 1070 | bar 1071 | because of 1072 | before 1073 | behind 1074 | below 1075 | beneath 1076 | beside 1077 | besides 1078 | between 1079 | betwixt 1080 | beyond 1081 | but 1082 | by 1083 | circa 1084 | despite 1085 | down 1086 | during 1087 | due to 1088 | ere 1089 | except 1090 | for 1091 | from 1092 | in 1093 | inside 1094 | into 1095 | less 1096 | like 1097 | mid 1098 | midst 1099 | minus 1100 | near 1101 | next 1102 | nigh 1103 | nigher 1104 | nighest 1105 | notwithstanding 1106 | of 1107 | off 1108 | on 1109 | on to 1110 | onto 1111 | out 1112 | out of 1113 | outside 1114 | over 1115 | past 1116 | pending 1117 | per 1118 | plus 1119 | qua 1120 | re 1121 | round 1122 | sans 1123 | save 1124 | since 1125 | through 1126 | throughout 1127 | thru 1128 | till 1129 | to 1130 | toward 1131 | towards 1132 | under 1133 | underneath 1134 | unlike 1135 | until 1136 | unto 1137 | up 1138 | upon 1139 | versus 1140 | via 1141 | vice 1142 | with 1143 | within 1144 | without 1145 | he 1146 | her 1147 | herself 1148 | hers 1149 | him 1150 | himself 1151 | his 1152 | I 1153 | it 1154 | its 1155 | itself 1156 | me 1157 | mine 1158 | my 1159 | myself 1160 | ours 1161 | she 1162 | their 1163 | theirs 1164 | them 1165 | themselves 1166 | they 1167 | us 1168 | we 1169 | our 1170 | ourselves 1171 | you 1172 | your 1173 | yours 1174 | yourselves 1175 | yourself 1176 | this 1177 | that 1178 | these 1179 | those 1180 | " 1181 | ' 1182 | '' 1183 | ( 1184 | ) 1185 | *LRB* 1186 | *RRB* 1187 | 1188 | 1189 | 1190 | 1191 | 1192 | @ 1193 | & 1194 | [ 1195 | ] 1196 | ` 1197 | `` 1198 | e.g., 1199 | { 1200 | } 1201 | " 1202 | “ 1203 | ” 1204 | -RRB- 1205 | -LRB- 1206 | -- 1207 | a 1208 | about 1209 | above 1210 | across 1211 | after 
1212 | afterwards 1213 | again 1214 | against 1215 | all 1216 | almost 1217 | alone 1218 | along 1219 | already 1220 | also 1221 | although 1222 | always 1223 | am 1224 | among 1225 | amongst 1226 | amoungst 1227 | amount 1228 | an 1229 | and 1230 | another 1231 | any 1232 | anyhow 1233 | anyone 1234 | anything 1235 | anyway 1236 | anywhere 1237 | are 1238 | around 1239 | as 1240 | at 1241 | back 1242 | be 1243 | became 1244 | because 1245 | become 1246 | becomes 1247 | becoming 1248 | been 1249 | before 1250 | beforehand 1251 | behind 1252 | being 1253 | below 1254 | beside 1255 | besides 1256 | between 1257 | beyond 1258 | bill 1259 | both 1260 | bottom 1261 | but 1262 | by 1263 | call 1264 | can 1265 | cannot 1266 | cant 1267 | co 1268 | computer 1269 | con 1270 | could 1271 | couldnt 1272 | cry 1273 | de 1274 | describe 1275 | detail 1276 | do 1277 | done 1278 | down 1279 | due 1280 | during 1281 | each 1282 | eg 1283 | eight 1284 | either 1285 | eleven 1286 | else 1287 | elsewhere 1288 | empty 1289 | enough 1290 | etc 1291 | even 1292 | ever 1293 | every 1294 | everyone 1295 | everything 1296 | everywhere 1297 | except 1298 | few 1299 | fifteen 1300 | fify 1301 | fill 1302 | find 1303 | fire 1304 | first 1305 | five 1306 | for 1307 | former 1308 | formerly 1309 | forty 1310 | found 1311 | four 1312 | from 1313 | front 1314 | full 1315 | further 1316 | get 1317 | give 1318 | go 1319 | had 1320 | has 1321 | hasnt 1322 | have 1323 | he 1324 | hence 1325 | her 1326 | here 1327 | hereafter 1328 | hereby 1329 | herein 1330 | hereupon 1331 | hers 1332 | herself 1333 | him 1334 | himself 1335 | his 1336 | how 1337 | however 1338 | hundred 1339 | i 1340 | ie 1341 | if 1342 | in 1343 | inc 1344 | indeed 1345 | interest 1346 | into 1347 | is 1348 | it 1349 | its 1350 | itself 1351 | keep 1352 | last 1353 | latter 1354 | latterly 1355 | least 1356 | less 1357 | ltd 1358 | made 1359 | many 1360 | may 1361 | me 1362 | meanwhile 1363 | might 1364 | mill 1365 | mine 1366 | 
more 1367 | moreover 1368 | most 1369 | mostly 1370 | move 1371 | much 1372 | must 1373 | my 1374 | myself 1375 | name 1376 | namely 1377 | neither 1378 | never 1379 | nevertheless 1380 | next 1381 | nine 1382 | no 1383 | nobody 1384 | none 1385 | noone 1386 | nor 1387 | not 1388 | nothing 1389 | now 1390 | nowhere 1391 | of 1392 | off 1393 | often 1394 | on 1395 | once 1396 | one 1397 | only 1398 | onto 1399 | or 1400 | other 1401 | others 1402 | otherwise 1403 | our 1404 | ours 1405 | ourselves 1406 | out 1407 | over 1408 | own 1409 | part 1410 | per 1411 | perhaps 1412 | please 1413 | put 1414 | rather 1415 | re 1416 | same 1417 | see 1418 | seem 1419 | seemed 1420 | seeming 1421 | seems 1422 | serious 1423 | several 1424 | she 1425 | should 1426 | show 1427 | side 1428 | since 1429 | sincere 1430 | six 1431 | sixty 1432 | so 1433 | some 1434 | somehow 1435 | someone 1436 | something 1437 | sometime 1438 | sometimes 1439 | somewhere 1440 | still 1441 | such 1442 | system 1443 | take 1444 | ten 1445 | than 1446 | that 1447 | the 1448 | their 1449 | them 1450 | themselves 1451 | then 1452 | thence 1453 | there 1454 | thereafter 1455 | thereby 1456 | therefore 1457 | therein 1458 | thereupon 1459 | these 1460 | they 1461 | thick 1462 | thin 1463 | third 1464 | this 1465 | those 1466 | though 1467 | three 1468 | through 1469 | throughout 1470 | thru 1471 | thus 1472 | to 1473 | together 1474 | too 1475 | top 1476 | toward 1477 | towards 1478 | twelve 1479 | twenty 1480 | two 1481 | un 1482 | under 1483 | until 1484 | up 1485 | upon 1486 | us 1487 | very 1488 | via 1489 | was 1490 | we 1491 | well 1492 | were 1493 | what 1494 | whatever 1495 | when 1496 | whence 1497 | whenever 1498 | where 1499 | whereafter 1500 | whereas 1501 | whereby 1502 | wherein 1503 | whereupon 1504 | wherever 1505 | whether 1506 | which 1507 | while 1508 | whither 1509 | who 1510 | whoever 1511 | whole 1512 | whom 1513 | whose 1514 | why 1515 | will 1516 | with 1517 | within 1518 | without 
1519 | would 1520 | yet 1521 | you 1522 | your 1523 | yours 1524 | yourself 1525 | yourselves 1526 | 1527 | 1528 | : 1529 | / 1530 | ( 1531 | > 1532 | ) 1533 | < 1534 | ! 1535 | -------------------------------------------------------------------------------- /dictionary/stop_words_user_utf8.txt: -------------------------------------------------------------------------------- 1 | the 2 | of 3 | is 4 | and 5 | to 6 | in 7 | that 8 | we 9 | for 10 | an 11 | are 12 | by 13 | be 14 | as 15 | on 16 | with 17 | can 18 | if 19 | from 20 | which 21 | you 22 | it 23 | this 24 | then 25 | at 26 | have 27 | all 28 | not 29 | one 30 | has 31 | or 32 | that 33 | -------------------------------------------------------------------------------- /dll/Makefile: -------------------------------------------------------------------------------- 1 | PROJECT = jiebago 2 | 3 | .PHONY: all clean build 4 | all:clean build 5 | @echo "Done!" 6 | 7 | build: 8 | go build -ldflags "-s -w" -buildmode=c-shared -o $(PROJECT).dll $(PROJECT).go 9 | 10 | clean: 11 | rm -rf $(PROJECT).dll $(PROJECT).h 12 | -------------------------------------------------------------------------------- /dll/jiebago.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package main 6 | 7 | import ( 8 | "C" 9 | "encoding/json" 10 | 11 | "github.com/wangshizebin/jiebago" 12 | "github.com/wangshizebin/jiebago/tokenizer" 13 | ) 14 | 15 | var ( 16 | jieBaGo *jiebago.JieBaGo 17 | ) 18 | 19 | //export Init 20 | func Init(path string) { 21 | jieBaGo = jiebago.NewJieBaGo(path) 22 | } 23 | 24 | //export Cut 25 | func Cut(sentence string) string { 26 | if jieBaGo == nil { 27 | return "" 28 | } 29 | words := jieBaGo.Cut(sentence) 30 | return wordsToJson(&words) 31 | } 32 | 33 | //export CutFull 34 | func CutFull(sentence string) string { 35 | if jieBaGo == nil { 36 | return "" 37 | } 38 | words := jieBaGo.Cut(sentence) 39 | return wordsToJson(&words) 40 | } 41 | 42 | //export CutAccurate 43 | func CutAccurate(sentence string) string { 44 | if jieBaGo == nil { 45 | return "" 46 | } 47 | words := jieBaGo.Cut(sentence) 48 | return wordsToJson(&words) 49 | } 50 | 51 | //export CutNoHMM 52 | func CutNoHMM(sentence string) string { 53 | if jieBaGo == nil { 54 | return "" 55 | } 56 | words := jieBaGo.Cut(sentence) 57 | return wordsToJson(&words) 58 | } 59 | 60 | //export CutForSearch 61 | func CutForSearch(sentence string) string { 62 | if jieBaGo == nil { 63 | return "" 64 | } 65 | words := jieBaGo.Cut(sentence) 66 | return wordsToJson(&words) 67 | } 68 | 69 | //export ExtractKeywords 70 | func ExtractKeywords(s string, count int) string { 71 | if jieBaGo == nil { 72 | return "" 73 | } 74 | words := jieBaGo.ExtractKeywords(s, count) 75 | return wordsToJson(&words) 76 | } 77 | 78 | //export ExtractKeywordsWeight 79 | func ExtractKeywordsWeight(s string, count int) string { 80 | if jieBaGo == nil { 81 | return "" 82 | } 83 | tags := jieBaGo.ExtractKeywordsWeight(s, count) 84 | return wordsWeightToJson(&tags) 85 | } 86 | 87 | //export AddDictWord 88 | func AddDictWord(word string, freq int, prop string) bool { 89 | if jieBaGo == nil { 90 | return false 91 | } 92 | _, err := jieBaGo.AddDictWord(word, freq, prop) 93 | if err != nil { 94 | 
return false 95 | } 96 | return true 97 | } 98 | 99 | //export AddStopWord 100 | func AddStopWord(word string) bool { 101 | if jieBaGo == nil { 102 | return false 103 | } 104 | _, err := jieBaGo.AddStopWord(word) 105 | if err != nil { 106 | return false 107 | } 108 | return true 109 | } 110 | 111 | func wordsToJson(words *[]string) string { 112 | w := struct { 113 | Words *[]string `json:"words"` 114 | }{ 115 | Words: words, 116 | } 117 | v, _ := json.Marshal(w) 118 | return string(v) 119 | } 120 | 121 | func wordsWeightToJson(tags *[]tokenizer.Keyword) string { 122 | w := struct { 123 | Tags *[]tokenizer.Keyword `json:"tags"` 124 | }{ 125 | Tags: tags, 126 | } 127 | v, _ := json.Marshal(w) 128 | return string(v) 129 | } 130 | 131 | func main() { 132 | // Need a main function to make CGO compile package as C shared library 133 | } 134 | -------------------------------------------------------------------------------- /example/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package main 6 | 7 | import ( 8 | "fmt" 9 | "strings" 10 | 11 | "github.com/wangshizebin/jiebago" 12 | ) 13 | 14 | func main() { 15 | jieBaGo := jiebago.NewJieBaGo() 16 | // 可以指定字典库的位置 17 | // jieBaGo := jiebago.NewJieBaGo("/data/mydict") 18 | 19 | sentence := "Shell 位于用户与系统之间,用来帮助用户与操作系统进行沟通。通常都是文字模式的 Shell。" 20 | fmt.Println("原始语句:", sentence) 21 | fmt.Println() 22 | 23 | // 默认模式分词 24 | words := jieBaGo.Cut(sentence) 25 | fmt.Println("默认模式分词:", strings.Join(words, "/")) 26 | 27 | // 精确模式分词 28 | words = jieBaGo.CutAccurate(sentence) 29 | fmt.Println("精确模式分词:", strings.Join(words, "/")) 30 | 31 | // 全模式分词 32 | words = jieBaGo.CutFull(sentence) 33 | fmt.Println("全模式分词:", strings.Join(words, "/")) 34 | 35 | // NoHMM模式分词 36 | words = jieBaGo.CutNoHMM(sentence) 37 | fmt.Println("NoHMM模式分词:", strings.Join(words, "/")) 38 | 39 | // 搜索引擎模式分词 40 | words = jieBaGo.CutForSearch(sentence) 41 | fmt.Println("搜索引擎模式分词:", strings.Join(words, "/")) 42 | fmt.Println() 43 | 44 | // 提取关键词,即Tag标签 45 | keywords := jieBaGo.ExtractKeywords(sentence, 20) 46 | fmt.Println("提取关键词:", strings.Join(keywords, "/")) 47 | 48 | // 提取带权重的关键词,即Tag标签 49 | keywordsWeight := jieBaGo.ExtractKeywordsWeight(sentence, 20) 50 | fmt.Println("提取带权重的关键词:", keywordsWeight) 51 | fmt.Println() 52 | 53 | // 向字典加入单词 54 | exist, err := jieBaGo.AddDictWord("编程宝库", 3, "n") 55 | if err != nil { 56 | fmt.Println(err) 57 | } else { 58 | fmt.Println("向字典加入单词:编程宝库") 59 | if exist { 60 | fmt.Println("单词已经存在") 61 | } 62 | } 63 | 64 | // 向字典加入停止词 65 | exist, err = jieBaGo.AddStopWord("the") 66 | if err != nil { 67 | fmt.Println(err) 68 | } else { 69 | fmt.Println("向字典加入停止词:the") 70 | if exist { 71 | fmt.Println("单词已经存在") 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /example/main_api.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 
2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "fmt" 9 | 10 | "github.com/wangshizebin/jiebago" 11 | ) 12 | 13 | func main() { 14 | sentence := "Shell 位于用户与系统之间,用来帮助用户与操作系统进行沟通。我们通常所说的 Shell 都指的是文字模式的 Shell。" 15 | fmt.Println("原始语句:", sentence) 16 | fmt.Println() 17 | 18 | // 默认模式分词 19 | words := jiebago.Cut(sentence) 20 | fmt.Println("默认模式分词:", words) 21 | 22 | // 精确模式分词 23 | words = jiebago.CutAccurate(sentence) 24 | fmt.Println("精确模式分词:", words) 25 | 26 | // 全模式分词 27 | words = jiebago.CutFull(sentence) 28 | fmt.Println("全模式分词:", words) 29 | 30 | // NoHMM模式分词 31 | words = jiebago.CutNoHMM(sentence) 32 | fmt.Println("NoHMM模式分词:", words) 33 | 34 | // 搜索引擎模式分词 35 | words = jiebago.CutForSearch(sentence) 36 | fmt.Println("搜索引擎模式分词:", words) 37 | fmt.Println() 38 | 39 | // 提取关键词,即Tag标签 40 | keywords := jiebago.ExtractKeywords(sentence, 20) 41 | fmt.Println("提取关键词:", keywords) 42 | 43 | // 提取带权重的关键词,即Tag标签 44 | keywordsWeight := jiebago.ExtractKeywordsWeight(sentence, 20) 45 | fmt.Println("提取带权重的关键词:", keywordsWeight) 46 | fmt.Println() 47 | 48 | // 向字典加入单词 49 | exist, err := jiebago.AddDictWord("编程宝库", 3, "n") 50 | if err != nil { 51 | fmt.Println(err) 52 | } else { 53 | fmt.Println("向字典加入单词:编程宝库") 54 | if exist { 55 | fmt.Println("单词已经存在") 56 | } 57 | } 58 | 59 | // 向字典加入停止词 60 | exist, err = jiebago.AddStopWord("the") 61 | if err != nil { 62 | fmt.Println(err) 63 | } else { 64 | fmt.Println("向字典加入停止词:the") 65 | if exist { 66 | fmt.Println("单词已经存在") 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/wangshizebin/jiebago 2 | 3 | go 1.16 4 | 5 | require github.com/gin-gonic/gin v1.7.7 6 | -------------------------------------------------------------------------------- /go.sum: 
-------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= 5 | github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= 6 | github.com/gin-gonic/gin v1.7.7 h1:3DoBmSbJbZAWqXJC3SLjAPfutPJJRN1U5pALB7EeTTs= 7 | github.com/gin-gonic/gin v1.7.7/go.mod h1:axIBovoeJpVj8S3BwE0uPMTeReE4+AfFtqpqaZ1qq1U= 8 | github.com/go-playground/assert/v2 v2.0.1 h1:MsBgLAaY856+nPRTKrp3/OZK38U/wa0CcBYNjji3q3A= 9 | github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= 10 | github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q= 11 | github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= 12 | github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no= 13 | github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= 14 | github.com/go-playground/validator/v10 v10.4.1 h1:pH2c5ADXtd66mxoE0Zm9SUhxE20r7aM3F26W0hOn+GE= 15 | github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= 16 | github.com/golang/protobuf v1.3.3 h1:gyjaxf+svBWX08ZjK86iN9geUJF0H6gp2IRKX6Nf6/I= 17 | github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= 18 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 19 | github.com/json-iterator/go v1.1.9 h1:9yzud/Ht36ygwatGx56VwCZtlI/2AD15T1X2sjSuGns= 20 | github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= 21 | 
github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= 22 | github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= 23 | github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= 24 | github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 25 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc= 26 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 27 | github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742 h1:Esafd1046DLDQ0W1YjYsBW+p8U2u7vzgW2SQVmlNazg= 28 | github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= 29 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 30 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 31 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 32 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 33 | github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= 34 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 35 | github.com/ugorji/go v1.1.7 h1:/68gy2h+1mWMrwZFeD1kQialdSzAb432dtpeJ42ovdo= 36 | github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= 37 | github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs= 38 | github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= 39 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 40 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9 h1:psW17arqaxU48Z5kZ0CQnkZWQJsqcURM6tKiBApRjXI= 41 | 
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 42 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 43 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 44 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 45 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42 h1:vEOn+mP2zCOVzKckCZy6YsCtDblrpj/w7B9nxGNELpg= 46 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 47 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 48 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= 49 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 50 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 51 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 52 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 53 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 54 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 55 | -------------------------------------------------------------------------------- /jiebago.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package jiebago 6 | 7 | import ( 8 | "strings" 9 | 10 | "github.com/wangshizebin/jiebago/tokenizer" 11 | ) 12 | 13 | type JieBaGo struct { 14 | } 15 | 16 | func NewJieBaGo(path ...string) *JieBaGo { 17 | configPath := "" 18 | if len(path) > 0 { 19 | configPath = path[0] 20 | } 21 | tokenizer.Init(configPath) 22 | jieBaGo := &JieBaGo{} 23 | return jieBaGo 24 | } 25 | 26 | func (g *JieBaGo) Cut(sentence string) []string { 27 | return g.CutAccurate(sentence) 28 | } 29 | 30 | func (g *JieBaGo) CutFull(s string) []string { 31 | wordsRet := make([]string, 0, tokenizer.DefaultWordsLen) 32 | 33 | segments := tokenizer.SplitTextSeg(s) 34 | for _, segment := range segments { 35 | if strings.Trim(segment, " ") == "" { 36 | continue 37 | } 38 | if tokenizer.IsTextChars(segment) { 39 | tokenizer.CutFullW(segment, &wordsRet) 40 | } else { 41 | tokenizer.CutSymbolW(segment, &wordsRet) 42 | } 43 | } 44 | return wordsRet 45 | } 46 | 47 | func (g *JieBaGo) CutAccurate(s string) []string { 48 | wordsRet := make([]string, 0, tokenizer.DefaultWordsLen) 49 | 50 | segments := tokenizer.SplitTextSeg(s) 51 | for _, segment := range segments { 52 | if strings.Trim(segment, " ") == "" { 53 | continue 54 | } 55 | if tokenizer.IsTextChars(segment) { 56 | tokenizer.CutAccurateW(segment, &wordsRet) 57 | } else { 58 | tokenizer.CutSymbolW(segment, &wordsRet) 59 | } 60 | } 61 | 62 | return wordsRet 63 | } 64 | 65 | func (g *JieBaGo) CutNoHMM(s string) []string { 66 | wordsRet := make([]string, 0, tokenizer.DefaultWordsLen) 67 | 68 | segments := tokenizer.SplitTextSeg(s) 69 | for _, segment := range segments { 70 | if strings.Trim(segment, " ") == "" { 71 | continue 72 | } 73 | if tokenizer.IsTextChars(segment) { 74 | tokenizer.CutNoHMMW(segment, &wordsRet) 75 | } else { 76 | tokenizer.CutSymbolW(segment, &wordsRet) 77 | } 78 | } 79 | 80 | return wordsRet 81 | } 82 | 83 | func (g *JieBaGo) CutForSearch(s string) []string { 84 | wordsRet := make([]string, 0, tokenizer.DefaultWordsLen) 85 | 
86 | segments := tokenizer.SplitTextSeg(s) 87 | for _, segment := range segments { 88 | if strings.Trim(segment, " ") == "" { 89 | continue 90 | } 91 | if tokenizer.IsTextChars(segment) { 92 | g.cutForSearchW(segment, &wordsRet) 93 | } else { 94 | tokenizer.CutSymbolW(segment, &wordsRet) 95 | } 96 | } 97 | 98 | return wordsRet 99 | } 100 | 101 | func (g *JieBaGo) cutForSearchW(s string, words *[]string) { 102 | dictionary := tokenizer.GetDictionary() 103 | 104 | for _, word := range g.CutAccurate(s) { 105 | wordRune := []rune(word) 106 | if len(wordRune) > 2 { 107 | for i := 0; i < len(wordRune)-1; i++ { 108 | s := string(wordRune[i : i+2]) 109 | if dictionary.Exist(s) { 110 | *words = append(*words, s) 111 | } 112 | } 113 | } 114 | if len(wordRune) > 3 { 115 | for i := 0; i < len(wordRune)-2; i++ { 116 | s := string(wordRune[i : i+3]) 117 | if dictionary.Exist(s) { 118 | *words = append(*words, s) 119 | } 120 | } 121 | } 122 | *words = append(*words, word) 123 | } 124 | } 125 | 126 | func (g *JieBaGo) ExtractKeywords(s string, count int) []string { 127 | keywords := tokenizer.GetTFIDF().ExtractKeywords(s, count, false) 128 | return keywords.([]string) 129 | } 130 | 131 | func (g *JieBaGo) ExtractKeywordsWeight(s string, count int) []tokenizer.Keyword { 132 | keywords := tokenizer.GetTFIDF().ExtractKeywords(s, count, true) 133 | return []tokenizer.Keyword(keywords.(tokenizer.Keywords)) 134 | } 135 | 136 | func (g *JieBaGo) AddDictWord(word string, freq int, prop string) (exist bool, err error) { 137 | return tokenizer.GetDictionary().AddWord(word, freq, prop) 138 | } 139 | 140 | func (g *JieBaGo) AddStopWord(word string) (exist bool, err error) { 141 | return tokenizer.GetTFIDF().AddStopWord(word) 142 | } 143 | -------------------------------------------------------------------------------- /jiebago_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 
2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package jiebago 6 | 7 | import ( 8 | "strings" 9 | "testing" 10 | ) 11 | 12 | var ( 13 | sentence = "Shell位于用户与系统之间,用来帮助用户与操作系统进行沟通。" 14 | resultTest = []string{"Shell", "操作系统", "沟通"} 15 | jieBaGo = NewJieBaGo() 16 | ) 17 | 18 | func TestCut(t *testing.T) { 19 | testCutWords(jieBaGo.Cut, t) 20 | } 21 | 22 | func TestCutFull(t *testing.T) { 23 | testCutWords(jieBaGo.CutFull, t) 24 | } 25 | 26 | func TestCutAccurate(t *testing.T) { 27 | testCutWords(jieBaGo.CutAccurate, t) 28 | } 29 | 30 | func TestCutNoHMM(t *testing.T) { 31 | testCutWords(jieBaGo.CutNoHMM, t) 32 | } 33 | 34 | func TestCutForSearch(t *testing.T) { 35 | testCutWords(jieBaGo.CutForSearch, t) 36 | } 37 | 38 | func TestExtractKeywords(t *testing.T) { 39 | t.Log("原始语句: " + sentence) 40 | 41 | words := jieBaGo.ExtractKeywords(sentence, 20) 42 | t.Log("提取关键字:", strings.Join(words, "/")) 43 | for _, word := range resultTest { 44 | ok := false 45 | for _, v := range words { 46 | if word == v { 47 | ok = true 48 | } 49 | } 50 | if !ok { 51 | t.Error(word + " not pass") 52 | } else { 53 | t.Log(word + " OK") 54 | } 55 | } 56 | } 57 | 58 | func TestExtractKeywordsWeight(t *testing.T) { 59 | t.Log("原始语句: " + sentence) 60 | 61 | words := jieBaGo.ExtractKeywordsWeight(sentence, 20) 62 | t.Log("提取关键字:", words) 63 | for _, word := range resultTest { 64 | ok := false 65 | for _, v := range words { 66 | if word == v.Word { 67 | ok = true 68 | } 69 | } 70 | if !ok { 71 | t.Error(word + " not pass") 72 | } else { 73 | t.Log(word + " OK") 74 | } 75 | } 76 | } 77 | 78 | func TestAddWordToDict(t *testing.T) { 79 | words := []string{"编程宝库", "王泽宾", "codebaoku"} 80 | t.Log("加入词典:", words) 81 | for _, word := range words { 82 | exist, err := jieBaGo.AddDictWord(word, 3, "n") 83 | if err != nil { 84 | t.Error(err) 85 | } else { 86 | if exist { 87 | t.Log(word + " 已经存在") 88 | } else { 89 | t.Log(word + " 添加入库") 
90 | } 91 | } 92 | } 93 | } 94 | 95 | func TestAddStopWord(t *testing.T) { 96 | words := []string{"the", "of", "is"} 97 | t.Log("加入停止词:", words) 98 | for _, word := range words { 99 | exist, err := jieBaGo.AddStopWord(word) 100 | if err != nil { 101 | t.Error(err) 102 | } else { 103 | if exist { 104 | t.Log(word + " 已经存在") 105 | } else { 106 | t.Log(word + " 添加入库") 107 | } 108 | } 109 | } 110 | } 111 | 112 | func testCutWords(f func(string) []string, t *testing.T) { 113 | t.Log("原始语句: " + sentence) 114 | 115 | wordsResult := f(sentence) 116 | t.Log("分词结果:", strings.Join(wordsResult, "/")) 117 | for _, word := range resultTest { 118 | ok := false 119 | for _, v := range wordsResult { 120 | if word == v { 121 | ok = true 122 | } 123 | } 124 | if !ok { 125 | t.Error(word + " not pass") 126 | } else { 127 | t.Log(word + " OK") 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /tokenizer/analyzer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
package tokenizer

import (
	"bufio"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"
)

const (
	// DefaultIDFSize is the initial capacity used when collecting IDF
	// frequencies from the dictionary file.
	DefaultIDFSize = 300000
)

// tfIDF is the package-wide TF-IDF extractor singleton; its frequency
// and stop-word tables are empty until the dictionary files are loaded.
var tfIDF = &TFIDF{
	idfLoader: &IDFLoader{
		idfFreq: make(map[string]float64),
	},
	stopWords: &StopWords{
		dictMap: make(map[string]struct{}),
	},
}

// Keyword is one extracted keyword together with its TF-IDF weight.
type Keyword struct {
	Word   string  `json:"word"`
	Weight float64 `json:"weight"`
}

// Keywords implements sort.Interface, ordering by descending weight.
type Keywords []Keyword

func (k Keywords) Len() int {
	return len(k)
}

// Less sorts higher weights first (descending order for sort.Sort).
func (k Keywords) Less(i, j int) bool {
	if k[i].Weight > k[j].Weight {
		return true
	}
	return false
}

func (k Keywords) Swap(i, j int) {
	k[i], k[j] = k[j], k[i]
}

// StopWords is a lowercase stop-word set guarded by an RWMutex so it
// can be read during extraction while words are added concurrently.
type StopWords struct {
	dictMap map[string]struct{}
	mu      sync.RWMutex
}

// load reads whitespace-separated stop words from fileStopWords into
// the set (lowercased), holding the write lock for the whole load.
func (d *StopWords) load(fileStopWords string) error {
	d.mu.Lock()
	defer d.mu.Unlock()

	timeStart := time.Now()

	f, err := os.Open(fileStopWords)
	if err != nil {
		log.Println(err)
		return errors.New("unable to load the stop words library:" + filepath.Base(fileStopWords))
	}
	defer func() {
		_ = f.Close()
	}()

	itemCount := 0
	reader := bufio.NewReader(f)
	for {
		// ReadString returns the partial final line together with
		// io.EOF, so EOF is handled after processing the line.
		line, err := reader.ReadString('\n')
		if err != nil && err != io.EOF {
			break
		}

		elem := strings.Fields(line)
		if len(elem) == 0 {
			if err == io.EOF {
				break
			}
			continue
		}

		// A line may carry several words; each is stored lowercased.
		for _, v := range elem {
			if v == "" {
				continue
			}

			itemCount++
			d.dictMap[strings.ToLower(v)] = struct{}{}
		}

		if err == io.EOF {
			break
		}
	}

	log.Printf("%v stop words are loaded, and take %v\n",
		itemCount, time.Now().Sub(timeStart))
	return nil
}

// exist reports whether s (case-insensitive) is a stop word.
func (d *StopWords) exist(s string) bool {
	d.mu.RLock()
	defer d.mu.RUnlock()
	_, ok := d.dictMap[strings.ToLower(s)]
	return ok
}

// add inserts s into the in-memory set and appends it to the user
// stop-word file so the addition survives restarts. exist reports that
// the word was already present (in which case nothing is written).
func (d *StopWords) add(s string) (exist bool, err error) {
	d.mu.Lock()
	defer d.mu.Unlock()

	s = strings.ToLower(strings.TrimSpace(s))
	if s == "" {
		return
	}
	if _, ok := d.dictMap[s]; ok {
		exist = true
		return
	}

	// The user file lives next to the standard stop-word file.
	stopWordsStdFile, err := GetDictFile(StopWordsStdFile)
	if err != nil {
		return
	}

	stopWordsUserFile := filepath.Dir(stopWordsStdFile)
	stopWordsUserFile += string(filepath.Separator) + StopWordsUserFile
	f, err := os.OpenFile(stopWordsUserFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0666)
	if err != nil {
		return
	}
	defer func() {
		_ = f.Close()
	}()

	stat, err := f.Stat()
	if err != nil {
		return
	}

	// If the file does not already end in a newline, prepend one so the
	// new word starts on its own line.
	line := ""
	n := stat.Size()
	if n > 0 {
		buf := make([]byte, 1, 1)
		_, err = f.ReadAt(buf, n-1)
		if err != nil {
			return
		}
		if buf[0] != '\n' {
			line += "\n"
		}
	}
	line += s + "\n"
	_, err = f.Write([]byte(line))
	if err != nil {
		log.Println(err)
		return
	}

	// Only update the in-memory set after the file write succeeded.
	d.dictMap[s] = struct{}{}
	return
}

// IDFLoader holds per-word inverse document frequencies plus their
// median, which is used as the fallback weight for unknown words.
type IDFLoader struct {
	idfFreq   map[string]float64
	idfMedian float64
}

// load parses an IDF dictionary of "word frequency" lines into
// idfFreq (keys lowercased) and records the median frequency.
func (d *IDFLoader) load(idfFile string) error {
	timeStart := time.Now()

	f, err := os.Open(idfFile)
	if err != nil {
		log.Println(err)
		return errors.New("unable to load the IDF library")
	}
	defer func() {
		_ = f.Close()
	}()

	itemCount := 0
	freqSlice := make([]float64, 0, DefaultIDFSize)
	reader := bufio.NewReader(f)
	for {
		line, err := reader.ReadString('\n')
		if err != nil && err != io.EOF {
			break
		}

		// Only well-formed "word frequency" pairs are accepted.
		elem := strings.Fields(line)
		if len(elem) != 2 {
			if err == io.EOF {
				break
			}
			continue
		}

		itemCount++
		// An unparsable frequency is logged and treated as zero rather
		// than aborting the whole load.
		freq, err := strconv.ParseFloat(elem[1], 64)
		if err != nil {
			log.Println(err)
			freq = float64(0)
		}

		d.idfFreq[strings.ToLower(elem[0])] = freq
		freqSlice = append(freqSlice, freq)

		if err == io.EOF {
			break
		}
	}

	sort.Float64s(freqSlice)
	// NOTE(review): this indexes freqSlice without checking itemCount,
	// so an empty IDF file would panic here — confirm that callers
	// guarantee a non-empty dictionary.
	d.idfMedian = freqSlice[itemCount/2]

	log.Printf("%v idfs are loaded, and take %v\n",
		itemCount, time.Now().Sub(timeStart))
	return nil
}

// TFIDF combines the IDF table and the stop-word set to extract
// keywords from text.
type TFIDF struct {
	idfLoader *IDFLoader
	stopWords *StopWords
}

// ExtractKeywords returns up to count keywords of s ranked by TF-IDF
// weight. Callers elsewhere in the package type-assert the result to
// Keywords when withWeight is true and []string otherwise.
func (t *TFIDF) ExtractKeywords(s string, count int, withWeight bool) interface{} {
	// Segment the text: accurate cutting for textual segments, symbol
	// cutting for the rest.
	words := make([]string, 0, DefaultWordsLen)
	segments := SplitTextSeg(s)
	for _, segment := range segments {
		if IsTextChars(segment) {
			CutAccurateW(segment, &words)
		} else {
			CutSymbolW(segment, &words)
		}
	}

	freqMap, freqMedian := t.idfLoader.idfFreq, t.idfLoader.idfMedian

	// Count term frequencies, skipping single-rune words and stop
	// words (ExistStopWord is defined elsewhere in the package —
	// presumably a wrapper over stopWords.exist).
	freqTotal := 0
	freqWords := make(map[string]int)
	for _, word := range words {
		if len([]rune(word)) < 2 || t.ExistStopWord(word) {
			continue
		}
		if val, ok := freqWords[word]; ok {
			freqWords[word] = val + 1
			freqTotal++
			continue
		}
		freqTotal++
		freqWords[word] = 1
	}

	// Weight = term frequency * (IDF / total terms); the median IDF is
	// the fallback for words absent from the IDF table.
	i := 0
	wordsRet := make(Keywords, len(freqWords))
	for word, s := range freqWords {
		val := freqMedian
		if v, ok := freqMap[word]; ok {
			val = v
		}
		wordsRet[i] = Keyword{
			Word:   word,
			Weight: float64(s) * (val / float64(freqTotal)),
		}
		i++
	}

	// Sort by descending weight and truncate; count == 0 means the
	// default of 20 keywords.
	sort.Sort(wordsRet)
	if count == 0 {
		count = 20
	}
	if count < len(wordsRet) {
		wordsRet = wordsRet[:count]
	}
	if
withWeight { 286 | return wordsRet 287 | } 288 | stringSet := make([]string, len(wordsRet)) 289 | for i, v := range wordsRet { 290 | stringSet[i] = v.Word 291 | } 292 | return stringSet 293 | } 294 | 295 | func (t *TFIDF) ExistStopWord(word string) bool { 296 | return t.stopWords.exist(word) 297 | } 298 | 299 | func (t *TFIDF) AddStopWord(word string) (exist bool, err error) { 300 | return t.stopWords.add(word) 301 | } 302 | 303 | func InitTFIDF() { 304 | // load the tf-idf library 305 | idfFile, err := GetDictFile(IDFStdFile) 306 | if err != nil { 307 | log.Panic(err) 308 | } 309 | 310 | err = tfIDF.idfLoader.load(idfFile) 311 | if err != nil { 312 | log.Panic(err) 313 | } 314 | 315 | // load the standard stop words library 316 | stopWordsStdFile, err := GetDictFile(StopWordsStdFile) 317 | if err == nil { 318 | tfIDF.stopWords.load(stopWordsStdFile) 319 | } 320 | 321 | // load the user-defined stop words library 322 | stopWordsUserFile, err := GetDictFile(StopWordsUserFile) 323 | if err == nil { 324 | tfIDF.stopWords.load(stopWordsUserFile) 325 | } 326 | } 327 | 328 | func GetTFIDF() *TFIDF { 329 | return tfIDF 330 | } 331 | -------------------------------------------------------------------------------- /tokenizer/common.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
package tokenizer

import (
	"errors"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strings"
)

const (
	DictStdFile       = "dict_std_utf8.txt"        // standard dictionary file
	DictUserFile      = "dict_user_utf8.txt"       // user-defined dictionary file
	IDFStdFile        = "idf_std_utf8.txt"         // standard IDF file
	StopWordsStdFile  = "stop_words_std_utf8.txt"  // standard stop words file
	StopWordsUserFile = "stop_words_user_utf8.txt" // user-defined stop words file

	RegExpEnglish   = "([a-zA-Z0-9])+"                     // English regular expression
	RegExpChinese   = "([\u4e00-\u9fa5])+"                 // Chinese regular expression
	RegExpText      = "([\u4e00-\u9fa5a-zA-Z0-9+#&._%-])+" // text regular expression
	RegExpNumber    = "[a-zA-Z0-9]+(\\.\\d+)?%?"           // numeric regular expression
	RegExpDelimiter = "[\\r\\n\\s\\t]"                     // delimiter regular expression

	DefaultWordsLen = 32 // default slice size of the word segmentation result
)

var (
	reEnglish, _   = regexp.Compile(RegExpEnglish)   // precompiled English regular expression
	reChinese, _   = regexp.Compile(RegExpChinese)   // precompiled Chinese regular expression
	reText, _      = regexp.Compile(RegExpText)      // precompiled text regular expression
	reNumber, _    = regexp.Compile(RegExpNumber)    // precompiled numeric regular expression
	reDelimiter, _ = regexp.Compile(RegExpDelimiter) // precompiled delimiter regular expression

	dictPath string // dictionary directory, default is current work directory
)

// IsEnglishChars reports whether s contains at least one ASCII letter or
// digit (regexp.Match searches anywhere in the string).
func IsEnglishChars(s string) bool {
	return reEnglish.Match([]byte(s))
}

// IsChineseChars reports whether s contains at least one CJK character.
func IsChineseChars(s string) bool {
	return reChinese.Match([]byte(s))
}

// IsTextChars reports whether s contains any text character: CJK, ASCII
// alphanumeric, or the in-word punctuation +#&._%-.
func IsTextChars(s string) bool {
	return reText.Match([]byte(s))
}

// SplitTextSeg splits a sentence into alternating text / non-text segments.
func SplitTextSeg(s string) []string {
	return splitRegExp(s, reText)
}
according to Chinese 61 | func SplitChineseSeg(s string) []string { 62 | return splitRegExp(s, reChinese) 63 | } 64 | 65 | // Split sentence according to number 66 | func SplitNumberSeg(s string) []string { 67 | return splitRegExp(s, reNumber) 68 | } 69 | 70 | // Get the dictionary file directory 71 | func GetDictFile(file string) (string, error) { 72 | errFileNotFound := errors.New("unable to load the dictionary file") 73 | 74 | dictPath := "" 75 | if GetDictPath() != "" { 76 | dictPath = filepath.Join(GetDictPath(), file) 77 | if !fileExist(dictPath) { 78 | return "", errFileNotFound 79 | } 80 | return dictPath, nil 81 | } 82 | 83 | dictFile := fmt.Sprintf("%cdictionary%c%s", os.PathSeparator, os.PathSeparator, file) 84 | 85 | // check exe file directory 86 | path, err := filepath.Abs(filepath.Dir(os.Args[0])) 87 | if err != nil { 88 | log.Println(err) 89 | return "", errFileNotFound 90 | } 91 | 92 | dictPath = path + dictFile 93 | if !fileExist(dictPath) { 94 | path, err = os.Getwd() 95 | if err != nil { 96 | log.Println(err) 97 | return "", errFileNotFound 98 | } 99 | } 100 | 101 | // check work directory 102 | dictPath = path + dictFile 103 | if !fileExist(dictPath) { 104 | path = getParentPath(path) 105 | if path == "" { 106 | return "", errFileNotFound 107 | } 108 | } 109 | 110 | // check parent of work directory 111 | dictPath = path + dictFile 112 | if !fileExist(dictPath) { 113 | return "", errFileNotFound 114 | } 115 | 116 | return dictPath, nil 117 | } 118 | 119 | // Split sentence according to the specified regular expression 120 | func splitRegExp(s string, re *regexp.Regexp) []string { 121 | sentences := make([]string, 0) 122 | 123 | n := len(s) 124 | prePos := 0 125 | for { 126 | loc := re.FindStringIndex(s[prePos:]) 127 | if loc == nil { 128 | sentences = append(sentences, s[prePos:]) 129 | return sentences 130 | } 131 | loc[0] += prePos 132 | loc[1] += prePos 133 | if loc[0] > prePos { 134 | sentences = append(sentences, s[prePos:loc[0]]) 135 | } 
136 | sentences = append(sentences, s[loc[0]:loc[1]]) 137 | prePos = loc[1] 138 | if prePos == n { 139 | break 140 | } 141 | } 142 | return sentences 143 | } 144 | 145 | func fileExist(path string) bool { 146 | _, err := os.Lstat(path) 147 | return !os.IsNotExist(err) 148 | } 149 | 150 | func getParentPath(path string) string { 151 | return substrRune(path, 0, strings.LastIndex(path, string(os.PathSeparator))) 152 | } 153 | 154 | func substrRune(s string, pos, length int) string { 155 | runes := []rune(s) 156 | l := pos + length 157 | if l > len(runes) { 158 | l = len(runes) 159 | } 160 | return string(runes[pos:l]) 161 | } 162 | 163 | func GetDictPath() string { 164 | return dictPath 165 | } 166 | 167 | func SetDictPath(path string) { 168 | dictPath = path 169 | } 170 | -------------------------------------------------------------------------------- /tokenizer/cutword.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package tokenizer 6 | 7 | func CutFullW(s string, words *[]string) { 8 | bufEnglish := "" 9 | pos := -1 10 | 11 | sentence := NewSentence(s) 12 | dag := sentence.GetDAG() 13 | for k, listPos := range dag { 14 | if len(bufEnglish) > 0 && !IsEnglishChars(sentence.GetChar(k)) { 15 | *words = append(*words, bufEnglish) 16 | bufEnglish = "" 17 | } 18 | 19 | if len(listPos) == 1 && k > pos { 20 | word := sentence.GetWord(k, listPos[0]+1) 21 | if IsEnglishChars(word) { 22 | bufEnglish += word 23 | } 24 | if len(bufEnglish) == 0 { 25 | *words = append(*words, word) 26 | } 27 | pos = listPos[0] 28 | } else { 29 | for _, j := range listPos { 30 | if j > k { 31 | *words = append(*words, sentence.GetWord(k, j+1)) 32 | pos = j 33 | } 34 | } 35 | } 36 | } 37 | 38 | if len(bufEnglish) > 0 { 39 | *words = append(*words, bufEnglish) 40 | } 41 | } 42 | 43 | func CutAccurateW(s string, words *[]string) { 44 | sentence := NewSentence(s) 45 | route := sentence.CalcDAG() 46 | dictionary := GetDictionary() 47 | buf := "" 48 | for i := 0; i < sentence.Len(); { 49 | y := route[i].Y + 1 50 | leftWord := sentence.GetWord(i, y) 51 | if y-i == 1 { 52 | buf += leftWord 53 | i = y 54 | continue 55 | } 56 | 57 | if len(buf) > 0 { 58 | if len([]rune(buf)) == 1 { 59 | *words = append(*words, buf) 60 | } else { 61 | if !dictionary.Exist(buf) { 62 | wordsRecognized := GetFinalSeg().Cut(buf) 63 | for _, w := range wordsRecognized { 64 | *words = append(*words, w) 65 | } 66 | } else { 67 | for _, v := range buf { 68 | *words = append(*words, string(v)) 69 | } 70 | } 71 | } 72 | buf = "" 73 | } 74 | *words = append(*words, leftWord) 75 | i = y 76 | } 77 | 78 | if len(buf) > 0 { 79 | if len([]rune(buf)) == 1 { 80 | *words = append(*words, buf) 81 | } else { 82 | if !dictionary.Exist(buf) { 83 | wordsRecognized := GetFinalSeg().Cut(buf) 84 | for _, w := range wordsRecognized { 85 | *words = append(*words, w) 86 | } 87 | } else { 88 | for _, v := range buf { 89 | *words = append(*words, string(v)) 
90 | } 91 | } 92 | } 93 | } 94 | } 95 | 96 | func CutNoHMMW(s string, words *[]string) { 97 | sentence := NewSentence(s) 98 | route := sentence.CalcDAG() 99 | 100 | bufEnglish := "" 101 | for i := 0; i < sentence.Len(); { 102 | y := route[i].Y + 1 103 | leftWord := sentence.GetWord(i, y) 104 | if IsEnglishChars(leftWord) && len(leftWord) == 1 { 105 | bufEnglish += leftWord 106 | i = y 107 | continue 108 | } 109 | 110 | if len(bufEnglish) > 0 { 111 | *words = append(*words, bufEnglish) 112 | bufEnglish = "" 113 | } 114 | *words = append(*words, leftWord) 115 | i = y 116 | } 117 | 118 | if len(bufEnglish) > 0 { 119 | *words = append(*words, bufEnglish) 120 | } 121 | } 122 | 123 | func CutSymbolW(s string, words *[]string) { 124 | n := len(s) 125 | if n == 0 { 126 | return 127 | } 128 | 129 | buf := "" 130 | word := "" 131 | prePos := 0 132 | for { 133 | loc := reDelimiter.FindStringIndex(s[prePos:]) 134 | if loc == nil { 135 | word = s[prePos:] 136 | prePos = n 137 | } else { 138 | loc[0] += prePos 139 | loc[1] += prePos 140 | if loc[0] > prePos { 141 | buf = s[prePos:loc[0]] 142 | } 143 | 144 | word = s[loc[0]:loc[1]] 145 | prePos = loc[1] 146 | } 147 | 148 | if buf == "\r" && word == "\n" { 149 | *words = append(*words, "\r\n") 150 | buf = "" 151 | } else { 152 | if buf != "" { 153 | *words = append(*words, buf) 154 | } 155 | if word != "" { 156 | buf = word 157 | } 158 | } 159 | 160 | if prePos == n { 161 | if buf != "" { 162 | *words = append(*words, buf) 163 | } 164 | return 165 | } 166 | } 167 | } 168 | 169 | func Init(dictPath string) { 170 | SetDictPath(dictPath) 171 | InitDictionary() 172 | InitTFIDF() 173 | InitFSToken() 174 | } 175 | -------------------------------------------------------------------------------- /tokenizer/dictionary.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 
2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package tokenizer 6 | 7 | import ( 8 | "bufio" 9 | "errors" 10 | "io" 11 | "log" 12 | "os" 13 | "path/filepath" 14 | "strconv" 15 | "strings" 16 | "sync" 17 | "time" 18 | ) 19 | 20 | var dictionary = &Dictionary{ 21 | dict: make(map[string]int), 22 | } 23 | 24 | type Dictionary struct { 25 | dict map[string]int 26 | mu sync.RWMutex 27 | tf int // total freq 28 | } 29 | 30 | func (d *Dictionary) Exist(word string) bool { 31 | d.mu.RLock() 32 | defer d.mu.RUnlock() 33 | _, ok := d.dict[strings.ToLower(word)] 34 | return ok 35 | } 36 | 37 | func (d *Dictionary) GetWord(word string) (int, bool) { 38 | d.mu.RLock() 39 | defer d.mu.RUnlock() 40 | v, ok := d.dict[strings.ToLower(word)] 41 | return v, ok 42 | } 43 | 44 | func (d *Dictionary) GetTotalFreq() float64 { 45 | d.mu.RLock() 46 | defer d.mu.RUnlock() 47 | return float64(d.tf) 48 | } 49 | 50 | func (d *Dictionary) AddWord(word string, freq int, prop string) (exist bool, err error) { 51 | d.mu.Lock() 52 | defer d.mu.Unlock() 53 | 54 | if _, ok := d.dict[strings.ToLower(word)]; ok { 55 | exist = true 56 | return 57 | } 58 | 59 | dictStdFile, err := GetDictFile(DictStdFile) 60 | if err != nil { 61 | return 62 | } 63 | 64 | dictUserFile := filepath.Dir(dictStdFile) 65 | dictUserFile += string(filepath.Separator) + DictUserFile 66 | 67 | f, err := os.OpenFile(dictUserFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0666) 68 | if err != nil { 69 | return 70 | } 71 | defer func() { 72 | _ = f.Close() 73 | }() 74 | 75 | stat, err := f.Stat() 76 | if err != nil { 77 | return 78 | } 79 | 80 | line := "" 81 | n := stat.Size() 82 | if n > 0 { 83 | buf := make([]byte, 1, 1) 84 | _, err = f.ReadAt(buf, n-1) 85 | if err != nil { 86 | return 87 | } 88 | if buf[0] != '\n' { 89 | line += "\n" 90 | } 91 | } 92 | line += word + " " + strconv.Itoa(freq) + " " + prop + "\n" 93 | _, err = f.Write([]byte(line)) 94 | if err != 
nil { 95 | log.Println(err) 96 | return 97 | } 98 | 99 | d.dict[strings.ToLower(word)] = freq 100 | d.tf += freq 101 | return 102 | } 103 | 104 | func (d *Dictionary) load(fileDict string) error { 105 | d.mu.Lock() 106 | defer d.mu.Unlock() 107 | 108 | timeStart := time.Now() 109 | 110 | f, err := os.Open(fileDict) 111 | if err != nil { 112 | log.Println(err) 113 | return errors.New("unable to load the dictionary library:" + filepath.Base(fileDict)) 114 | } 115 | defer func() { 116 | _ = f.Close() 117 | }() 118 | 119 | itemCount := 0 120 | reader := bufio.NewReader(f) 121 | for { 122 | line, err := reader.ReadString('\n') 123 | if err != nil && err != io.EOF { 124 | break 125 | } 126 | 127 | elem := strings.Fields(line) 128 | if len(elem) != 3 { 129 | if err == io.EOF { 130 | break 131 | } 132 | continue 133 | } 134 | 135 | itemCount++ 136 | nFreq, err := strconv.Atoi(elem[1]) 137 | if err != nil { 138 | nFreq = 0 139 | } 140 | d.tf += nFreq 141 | d.dict[strings.ToLower(elem[0])] = nFreq 142 | 143 | runeWord := []rune(elem[0]) 144 | for i := range runeWord { 145 | s := strings.ToLower(string(runeWord[:i+1])) 146 | if _, ok := d.dict[s]; !ok { 147 | d.dict[s] = 0 148 | } 149 | } 150 | 151 | if err == io.EOF { 152 | break 153 | } 154 | } 155 | if len(d.dict) == 0 || d.tf <= 0 { 156 | return errors.New("unable to load the dictionary library:" + filepath.Base(fileDict)) 157 | } 158 | 159 | log.Printf("%v words are loaded in dictionary "+filepath.Base(fileDict)+", and take %v\n", 160 | itemCount, time.Now().Sub(timeStart)) 161 | return nil 162 | } 163 | 164 | func InitDictionary() { 165 | // load the standard dictionary 166 | dictStdFile, err := GetDictFile(DictStdFile) 167 | if err != nil { 168 | log.Panic(err) 169 | } 170 | err = dictionary.load(dictStdFile) 171 | if err != nil { 172 | log.Panic(err) 173 | } 174 | 175 | // load the user-defined dictionary 176 | dictUserFile, err := GetDictFile(DictUserFile) 177 | if err == nil { 178 | dictionary.load(dictUserFile) 179 
| } 180 | } 181 | 182 | func GetDictionary() *Dictionary { 183 | return dictionary 184 | } 185 | -------------------------------------------------------------------------------- /tokenizer/fstokenizer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package tokenizer 6 | 7 | import ( 8 | "encoding/json" 9 | "io/ioutil" 10 | "log" 11 | "sync" 12 | ) 13 | 14 | const ( 15 | minFloat = -3.14e100 16 | finalSegProbStart = "fs_pbstart.json" 17 | finalSegProbTrans = "fs_pbtrans.json" 18 | finalSegProbEmit = "fs_pbemit.json" 19 | ) 20 | 21 | var ( 22 | fsTokenizer = &FinalSeg{ 23 | start: make(map[string]float64), 24 | trans: make(map[string]map[string]float64), 25 | emit: make(map[string]map[string]float64), 26 | 27 | forceSplitWords: &forceSplitWords{ 28 | dict: make(map[string]struct{}), 29 | }, 30 | } 31 | 32 | prevStatus = map[string][]string{ 33 | "B": {"E", "S"}, 34 | "M": {"M", "B"}, 35 | "S": {"S", "E"}, 36 | "E": {"B", "M"}, 37 | } 38 | states = []string{"B", "M", "E", "S"} 39 | ) 40 | 41 | type forceSplitWords struct { 42 | dict map[string]struct{} 43 | mutex sync.RWMutex 44 | } 45 | 46 | func (s *forceSplitWords) exist(word string) bool { 47 | s.mutex.RLock() 48 | defer s.mutex.RUnlock() 49 | _, ok := s.dict[word] 50 | return ok 51 | } 52 | 53 | func (s *forceSplitWords) addForceSplit(word string) { 54 | s.mutex.Lock() 55 | defer s.mutex.Unlock() 56 | s.dict[word] = struct{}{} 57 | } 58 | 59 | type FinalSeg struct { 60 | start map[string]float64 61 | trans map[string]map[string]float64 62 | emit map[string]map[string]float64 63 | 64 | forceSplitWords *forceSplitWords 65 | } 66 | 67 | func (fs *FinalSeg) Cut(sentence string) []string { 68 | wordsRet := make([]string, 0, DefaultWordsLen) 69 | 70 | segments := SplitChineseSeg(sentence) 71 | for _, segment 
:= range segments { 72 | if IsChineseChars(segment) { 73 | words := fs.cut(segment) 74 | for _, v := range words { 75 | if fs.exist(v) { 76 | for _, c := range v { 77 | wordsRet = append(wordsRet, string(c)) 78 | } 79 | } else { 80 | wordsRet = append(wordsRet, v) 81 | } 82 | } 83 | } else { 84 | words := SplitNumberSeg(segment) 85 | wordsRet = append(wordsRet, words...) 86 | } 87 | } 88 | return wordsRet 89 | } 90 | 91 | func (fs *FinalSeg) getMatrixVal(name, key, word string) float64 { 92 | var m map[string]map[string]float64 93 | if name == "emit" { 94 | m = fs.emit 95 | } else if name == "trans" { 96 | m = fs.trans 97 | } else { 98 | return minFloat 99 | } 100 | val, ok := m[key][word] 101 | if !ok { 102 | val = minFloat 103 | } 104 | return val 105 | } 106 | 107 | func (fs *FinalSeg) viterbi(sentence string) []string { 108 | rs := []rune(sentence) 109 | n := len(rs) 110 | if n == 0 { 111 | return nil 112 | } 113 | 114 | v := make([]map[string]float64, n) 115 | for i := 0; i < n; i++ { 116 | v[i] = make(map[string]float64) 117 | } 118 | path := make(map[string][]string, 0) 119 | 120 | word := string(rs[0]) 121 | for _, y := range states { 122 | v[0][y] = fs.start[y] + fs.getMatrixVal("emit", y, word) 123 | path[y] = []string{y} 124 | } 125 | 126 | for i := 1; i < len(rs); i++ { 127 | word = string(rs[i]) 128 | 129 | pathNew := make(map[string][]string, 0) 130 | for _, y := range states { 131 | st := "" 132 | pb := minFloat 133 | for _, y0 := range prevStatus[y] { 134 | m := v[i-1][y0] + fs.getMatrixVal("trans", y0, y) + fs.getMatrixVal("emit", y, word) 135 | if st == "" { 136 | st = y0 137 | pb = m 138 | } else if m > pb { 139 | st = y0 140 | pb = m 141 | } 142 | } 143 | v[i][y] = pb 144 | pathNew[y] = append(path[st], y) 145 | } 146 | path = pathNew 147 | } 148 | 149 | state := "E" 150 | prob := v[len(rs)-1]["E"] 151 | if v[len(rs)-1]["S"] > prob { 152 | prob = v[len(rs)-1]["S"] 153 | state = "S" 154 | } 155 | return path[state] 156 | } 157 | 158 | func (fs 
*FinalSeg) cut(sentence string) []string { 159 | rs := []rune(sentence) 160 | wordsRet := make([]string, 0) 161 | posList := fs.viterbi(sentence) 162 | 163 | begin, next := 0, 0 164 | for i, word := range rs { 165 | pos := posList[i] 166 | if pos == "B" { 167 | begin = i 168 | } else if pos == "E" { 169 | wordsRet = append(wordsRet, string(rs[begin:i+1])) 170 | next = i + 1 171 | } else if pos == "S" { 172 | wordsRet = append(wordsRet, string(word)) 173 | next = i + 1 174 | } 175 | } 176 | if next < len(rs) { 177 | wordsRet = append(wordsRet, string(rs[next:])) 178 | } 179 | 180 | return wordsRet 181 | } 182 | 183 | func (fs *FinalSeg) exist(word string) bool { 184 | return fs.forceSplitWords.exist(word) 185 | } 186 | 187 | func readJsonFromFile(fn string, fs interface{}) { 188 | fileProbeStart, err := GetDictFile(fn) 189 | if err != nil { 190 | log.Panic(err) 191 | } 192 | data, err := ioutil.ReadFile(fileProbeStart) 193 | if err != nil { 194 | log.Panic(err) 195 | } 196 | err = json.Unmarshal(data, fs) 197 | if err != nil { 198 | log.Panic(err) 199 | } 200 | } 201 | 202 | func GetFinalSeg() *FinalSeg { 203 | return fsTokenizer 204 | } 205 | 206 | func InitFSToken() { 207 | readJsonFromFile(finalSegProbStart, &fsTokenizer.start) 208 | readJsonFromFile(finalSegProbTrans, &fsTokenizer.trans) 209 | readJsonFromFile(finalSegProbEmit, &fsTokenizer.emit) 210 | } 211 | -------------------------------------------------------------------------------- /tokenizer/sentence.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
package tokenizer

import "math"

// NodeDAG is one entry of the max-probability route: X is the accumulated
// log probability and Y the inclusive end index of the chosen word.
type NodeDAG struct {
	X float64
	Y int
}

// Sentence wraps a sentence as a rune slice for index-based access.
type Sentence struct {
	sentenceRune []rune
}

// NewSentence builds a Sentence from s.
func NewSentence(s string) *Sentence {
	return &Sentence{
		sentenceRune: []rune(s),
	}
}

// Len returns the sentence length in runes.
func (s *Sentence) Len() int {
	if s.sentenceRune == nil {
		return 0
	}
	return len(s.sentenceRune)
}

// GetWord returns the runes in [start, end), or "" when out of range.
func (s *Sentence) GetWord(start, end int) string {
	if (start < 0 || start >= s.Len()) || (end <= 0 || end > s.Len()) {
		return ""
	}
	return string(s.sentenceRune[start:end])
}

// GetChar returns the i-th rune, or "" when out of range.
func (s *Sentence) GetChar(i int) string {
	if i < 0 || i >= s.Len() {
		return ""
	}
	return string(s.sentenceRune[i])
}

// GetDAG builds the word DAG: dag[k] lists every inclusive end index i such
// that runes k..i form a dictionary word with freq > 0. The inner scan
// stops at the first prefix missing from the dictionary (prefixes are
// stored with freq 0 by Dictionary.load). A rune with no word keeps [k].
func (s *Sentence) GetDAG() [][]int {
	dag := make([][]int, 0)

	dictionary := GetDictionary()
	n := s.Len()
	for k := 0; k < n; k++ {
		l := make([]int, 0)

		i := k
		for i < n {
			word := string(s.sentenceRune[k : i+1])
			if freq, ok := dictionary.GetWord(word); ok {
				if freq > 0 {
					l = append(l, i)
				}
				i++
				continue
			}
			break
		}

		if len(l) == 0 {
			l = append(l, k)
		}
		dag = append(dag, l)
	}
	return dag
}

// CalcDAG computes, by dynamic programming from right to left, the
// max-log-probability route through the DAG. route[k].Y is the inclusive
// end index of the best word starting at rune k.
func (s *Sentence) CalcDAG() []NodeDAG {
	n := s.Len()
	route := make([]NodeDAG, n+1)
	route[n] = NodeDAG{0, 0}

	dictionary := GetDictionary()
	logTotal := math.Log(dictionary.GetTotalFreq())

	dag := s.GetDAG()
	for k := n - 1; k >= 0; k-- {
		score := float64(0)
		idx := -1
		for _, x := range dag[k] {
			word := s.GetWord(k, x+1)
			freq := 0
			if v, ok := dictionary.GetWord(word); ok {
				freq = v
			}
			if freq == 0 {
				// Smoothing: unseen words count as frequency 1.
				freq = 1
			}
			// log P(word) + best score of the remainder of the sentence.
			val := math.Log(float64(freq)) - logTotal + route[x+1].X
			if idx == -1 {
				score = val
				idx = x
			} else if val >= score {
				// >= keeps the longest word on ties (larger x wins).
				score = val
				idx = x
			}
		}
		route[k] = NodeDAG{score, idx}
	}
	return route
}