├── .gitignore ├── LICENSE ├── README.md ├── api ├── Dockerfile ├── Makefile ├── jiebago.go └── jiebago_test.go ├── dictionary ├── dict_std_utf8.txt ├── dict_user_utf8.txt ├── fs_pbemit.json ├── fs_pbstart.json ├── fs_pbtrans.json ├── idf_std_utf8.txt ├── stop_words_std_utf8.txt └── stop_words_user_utf8.txt ├── dll ├── Makefile └── jiebago.go ├── example ├── main.go └── main_api.go ├── go.mod ├── go.sum ├── jiebago.go ├── jiebago_test.go └── tokenizer ├── analyzer.go ├── common.go ├── cutword.go ├── dictionary.go ├── fstokenizer.go └── sentence.go /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .idea 3 | 4 | 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2022 Ze-Bin Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![logo](http://static.codebaoku.com/images/blogo.png)](http://www.codebaoku.com) 2 | 3 | JiebaGo 是 jieba 中文分词的 Go 语言版本。 4 | 5 | ## 功能特点 6 | 7 | + 支持多种分词方式,包括: 最大概率模式, HMM新词发现模式, 搜索引擎模式, 全模式 8 | + 支持抽取关键词,包括: 无权重关键词, 权重关键词 9 | + 支持多种使用方式,包括: Go语言包, Windows Dll, Web API, Docker 10 | + 支持在线并行添加字典词库和停止词 11 | + 全部代码使用 go 语言实现,全面兼容 jieba python 词库 12 | 13 | ## 引用方法 14 | 15 | 不使用包管理工具: 16 | ```bash 17 | go get github.com/wangshizebin/jiebago 18 | ``` 19 | 20 | 使用 go mod 管理: 21 | 代码中直接引用 github.com/wangshizebin/jiebago 即可。 22 | 23 | ## 特别注意 24 | 25 | 由于分词和提取关键词使用了中文预置词库和TF-IDF统计库,所以使用 jiebago,需要先下载项目中词库 dictionary 目录,并将 dictionary 放入项目的工作目录中。 26 | 我们也可以自己指定字典库的位置,不过需要在初始化 jiebago 对象的时候进行设置: 27 | 28 | ```golang 29 | jieBaGo := jiebago.NewJieBaGo("/data/mydict") 30 | ``` 31 | 32 | ## 功能示例 33 | 34 | ```golang 35 | package main 36 | 37 | import ( 38 | "fmt" 39 | "strings" 40 | 41 | "github.com/wangshizebin/jiebago" 42 | ) 43 | 44 | func main() { 45 | jieBaGo := jiebago.NewJieBaGo() 46 | // 可以指定字典库的位置 47 | // jieBaGo := jiebago.NewJieBaGo("/data/mydict") 48 | 49 | sentence := "Shell 位于用户与系统之间,用来帮助用户与操作系统进行沟通。通常都是文字模式的 Shell。" 50 | fmt.Println("原始语句:", sentence) 51 | fmt.Println() 52 | 53 | // 默认模式分词 54 | words := jieBaGo.Cut(sentence) 55 | fmt.Println("默认模式分词:", strings.Join(words,"/")) 56 | 57 | // 精确模式分词 58 | words = jieBaGo.CutAccurate(sentence) 59 | fmt.Println("精确模式分词:", strings.Join(words,"/")) 60 | 61 | // 全模式分词 62 | words = jieBaGo.CutFull(sentence) 63 | fmt.Println("全模式分词:", strings.Join(words,"/")) 64 | 65 | // 
NoHMM模式分词 66 | words = jieBaGo.CutNoHMM(sentence) 67 | fmt.Println("NoHMM模式分词:", strings.Join(words,"/")) 68 | 69 | // 搜索引擎模式分词 70 | words = jieBaGo.CutForSearch(sentence) 71 | fmt.Println("搜索引擎模式分词:", strings.Join(words,"/")) 72 | fmt.Println() 73 | 74 | // 提取关键词,即Tag标签 75 | keywords := jieBaGo.ExtractKeywords(sentence, 20) 76 | fmt.Println("提取关键词:", strings.Join(keywords,"/")) 77 | 78 | // 提取带权重的关键词,即Tag标签 79 | keywordsWeight := jieBaGo.ExtractKeywordsWeight(sentence, 20) 80 | fmt.Println("提取带权重的关键词:", keywordsWeight) 81 | fmt.Println() 82 | 83 | // 向字典加入单词 84 | exist, err := jieBaGo.AddDictWord("编程宝库", 3, "n") 85 | if err != nil { 86 | fmt.Println(err) 87 | } else { 88 | fmt.Println("向字典加入单词:编程宝库") 89 | if exist { 90 | fmt.Println("单词已经存在") 91 | } 92 | } 93 | 94 | // 向字典加入停止词 95 | exist, err = jieBaGo.AddStopWord("the") 96 | if err != nil { 97 | fmt.Println(err) 98 | } else { 99 | fmt.Println("向字典加入停止词:the") 100 | if exist { 101 | fmt.Println("单词已经存在") 102 | } 103 | } 104 | } 105 | ``` 106 | 107 | ``` 108 | 原始语句: Shell位于用户与系统之间,用来帮助用户与操作系统进行沟通。 109 | 110 | 默认模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作系统/进行/沟通/。 111 | 精确模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作系统/进行/沟通/。 112 | 全模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作/操作系统/系统/进行/沟通/。 113 | NoHMM模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作系统/进行/沟通/。 114 | 搜索引擎模式分词: Shell/位于/用户/与/系统/之间/,/用来/帮助/用户/与/操作/系统/操作系/操作系统/进行/沟通/。 115 | 116 | 提取关键词: 用户/Shell/操作系统/沟通/帮助/位于/系统/之间/进行 117 | 提取带权重的关键词: [{用户 1.364467214484} {Shell 1.19547675029} {操作系统 0.9265948663750001} {沟通 0.694890548758} {帮助 0.5809050240370001} {位于 0.496609078159} {系统 0.49601794343199995} {之间 0.446152979906} {进行 0.372712479502}] 118 | 119 | 向字典加入单词:编程宝库 120 | 向字典加入停止词:the 121 | ``` 122 | 123 | 更详细的例子参照 example/main.go, jiebago_test.go, api/iebago_test.go 中的代码。 124 | 125 | ## 单元测试 126 | go 包 127 | 128 | ```bash 129 | go test 130 | ``` 131 | 132 | Web API 133 | 134 | ```bash 135 | cd api 136 | go test 137 | ``` 138 | 139 | ## Contact 140 | 141 | + Email: 
`wangzebin@vip.163.com` 142 | + weixin: `bkra50` -------------------------------------------------------------------------------- /api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine 2 | ENV TZ=Asia/Shanghai 3 | RUN mkdir -p /jiebago/dictionary 4 | COPY ./jiebago /jiebago 5 | COPY ./dictionary /jiebago/dictionary 6 | WORKDIR /jiebago 7 | CMD ["./jiebago","-http_addr=:8118"] 8 | -------------------------------------------------------------------------------- /api/Makefile: -------------------------------------------------------------------------------- 1 | PROJECT = jiebago 2 | 3 | .PHONY: all clean build docker 4 | all:clean build 5 | @echo "Done!" 6 | 7 | build: 8 | go build -o $(PROJECT) $(PROJECT).go 9 | 10 | clean: 11 | rm -rf $(PROJECT) 12 | rm -rf dictionary 13 | 14 | docker: clean 15 | CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o $(PROJECT) $(PROJECT).go 16 | mkdir dictionary 17 | cp -r ../dictionary/ dictionary/ 18 | docker build . -t $(PROJECT):v1 19 | rm -rf $(PROJECT) 20 | rm -rf dictionary 21 | 22 | -------------------------------------------------------------------------------- /api/jiebago.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package main 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "log" 11 | "net/http" 12 | "strconv" 13 | "strings" 14 | 15 | "github.com/gin-gonic/gin" 16 | "github.com/wangshizebin/jiebago" 17 | "github.com/wangshizebin/jiebago/tokenizer" 18 | ) 19 | 20 | const ( 21 | Success = iota 22 | ErrorFail 23 | ErrorRequestMethod 24 | ErrorJsonData 25 | ErrorWordEmpty 26 | ErrorWeightInteger 27 | ErrorWeightRange 28 | ErrorCountInteger 29 | ) 30 | 31 | var ( 32 | jieBaGo = &jiebago.JieBaGo{} 33 | ) 34 | 35 | func main() { 36 | httpAddr := flag.String("http_addr", ":8118", 37 | "http_addr specifies the listening ip and port, for example: -http_addr 1.2.3.4:8888") 38 | 39 | dictPath := flag.String("dict_path", "", 40 | "dict_path specifies the path of dictionary, for example: -dict_path /data/dictionary") 41 | 42 | flag.Parse() 43 | 44 | jieBaGo = jiebago.NewJieBaGo(*dictPath) 45 | 46 | engine := gin.Default() 47 | 48 | engine.Any("/cut_words", cutWordsHandler) 49 | engine.Any("/extract_keywords", extractKeywordsHandler) 50 | engine.Any("/add_dict_word", addDictWordHandler) 51 | engine.Any("/add_stop_word", addStopWordHandler) 52 | 53 | if err := engine.Run(*httpAddr); err != nil { 54 | log.Print(err) 55 | } 56 | } 57 | 58 | type RequestCutWord struct { 59 | Sentence string `json:"s"` 60 | Mode string `json:"mode"` 61 | } 62 | 63 | type RequestExtractWord struct { 64 | Sentence string `json:"s"` 65 | Mode string `json:"mode"` 66 | Count int `json:"count"` 67 | } 68 | 69 | type RequestAddWord struct { 70 | Word string `json:"s"` 71 | Weight int `json:"weight"` 72 | Prop string `json:"prop"` 73 | } 74 | 75 | type Response struct { 76 | ErrCode int `json:"errcode"` 77 | ErrMsg string `json:"errmsg"` 78 | } 79 | 80 | func cutWordsHandler(c *gin.Context) { 81 | sentence := "" 82 | mode := "" 83 | if c.Request.Method == "GET" { 84 | mode = strings.ToLower(c.DefaultQuery("mode", "")) 85 | sentence = c.DefaultQuery("s", "") 86 | } else if c.Request.Method == "POST" { 87 | var 
request RequestCutWord 88 | err := c.BindJSON(&request) 89 | if err != nil { 90 | c.JSON(http.StatusOK, struct { 91 | Response 92 | Words []string `json:"words"` 93 | }{ 94 | Response: Response{ 95 | ErrCode: ErrorJsonData, 96 | ErrMsg: fmt.Sprintf(`invalid json data, the proper data format is {"s":"xx","mode":"xx"}`), 97 | }, 98 | Words: []string{}, 99 | }) 100 | return 101 | } 102 | mode = request.Mode 103 | sentence = request.Sentence 104 | } else { 105 | c.JSON(http.StatusOK, struct { 106 | Response 107 | Words []string `json:"words"` 108 | }{ 109 | Response: Response{ 110 | ErrCode: ErrorRequestMethod, 111 | ErrMsg: fmt.Sprintf(`iinvalid request method, only GET and POST methods are supported`), 112 | }, 113 | Words: []string{}, 114 | }) 115 | return 116 | } 117 | 118 | var words []string 119 | if mode == "full" { 120 | words = jieBaGo.CutFull(sentence) 121 | } else if mode == "accurate" { 122 | words = jieBaGo.CutAccurate(sentence) 123 | } else if mode == "nohmm" { 124 | words = jieBaGo.CutNoHMM(sentence) 125 | } else if mode == "search" { 126 | words = jieBaGo.CutForSearch(sentence) 127 | } else { 128 | words = jieBaGo.Cut(sentence) 129 | } 130 | 131 | c.JSON(http.StatusOK, struct { 132 | Response 133 | Words []string `json:"words"` 134 | }{ 135 | Response: Response{ 136 | ErrCode: Success, 137 | ErrMsg: "success", 138 | }, 139 | Words: words, 140 | }) 141 | } 142 | 143 | func extractKeywordsHandler(c *gin.Context) { 144 | sentence := "" 145 | count := 0 146 | mode := "" 147 | if c.Request.Method == "GET" { 148 | sentence = c.DefaultQuery("s", "") 149 | mode = c.DefaultQuery("mode", "") 150 | w := c.DefaultQuery("count", "0") 151 | var err error 152 | count, err = strconv.Atoi(w) 153 | if err != nil { 154 | c.JSON(http.StatusOK, struct { 155 | Response 156 | Tags []string `json:"tags"` 157 | }{ 158 | Response: Response{ 159 | ErrCode: ErrorCountInteger, 160 | ErrMsg: "the count must be an integer", 161 | }, 162 | Tags: []string{}, 163 | }) 164 | return 165 | 
} 166 | } else if c.Request.Method == "POST" { 167 | var request RequestExtractWord 168 | err := c.BindJSON(&request) 169 | if err != nil { 170 | c.JSON(http.StatusOK, struct { 171 | Response 172 | Tags []string `json:"tags"` 173 | }{ 174 | Response: Response{ 175 | ErrCode: ErrorJsonData, 176 | ErrMsg: fmt.Sprintf(`invalid json data, the proper data format is {"s":"xx","count":xx,"mode":"xx"}`), 177 | }, 178 | Tags: []string{}, 179 | }) 180 | return 181 | } 182 | sentence = request.Sentence 183 | mode = request.Mode 184 | count = request.Count 185 | } else { 186 | c.JSON(http.StatusOK, struct { 187 | Response 188 | Tags []string `json:"tags"` 189 | }{ 190 | Response: Response{ 191 | ErrCode: ErrorRequestMethod, 192 | ErrMsg: fmt.Sprintf(`iinvalid request method, only GET and POST methods are supported`), 193 | }, 194 | Tags: []string{}, 195 | }) 196 | return 197 | } 198 | if count <= 0 { 199 | count = 20 200 | } 201 | 202 | if mode == "weight" { 203 | tags := jieBaGo.ExtractKeywordsWeight(sentence, count) 204 | c.JSON(http.StatusOK, struct { 205 | Response 206 | Tags []tokenizer.Keyword `json:"tags"` 207 | }{ 208 | Response: Response{ 209 | ErrCode: Success, 210 | ErrMsg: "success", 211 | }, 212 | Tags: tags, 213 | }) 214 | } else { 215 | tags := jieBaGo.ExtractKeywords(sentence, count) 216 | c.JSON(http.StatusOK, struct { 217 | Response 218 | Tags []string `json:"tags"` 219 | }{ 220 | Response: Response{ 221 | ErrCode: Success, 222 | ErrMsg: "success", 223 | }, 224 | Tags: tags, 225 | }) 226 | } 227 | } 228 | 229 | func addDictWordHandler(c *gin.Context) { 230 | word := "" 231 | weight := 0 232 | prop := "" 233 | if c.Request.Method == "GET" { 234 | word = c.DefaultQuery("s", "") 235 | w := c.DefaultQuery("weight", "0") 236 | var err error 237 | weight, err = strconv.Atoi(w) 238 | if err != nil { 239 | c.JSON(http.StatusOK, Response{ 240 | ErrCode: ErrorWeightInteger, 241 | ErrMsg: "the weight must be an integer", 242 | }) 243 | return 244 | } 245 | prop = 
c.DefaultQuery("prop", "") 246 | } else if c.Request.Method == "POST" { 247 | var request RequestAddWord 248 | err := c.BindJSON(&request) 249 | if err != nil { 250 | c.JSON(http.StatusOK, Response{ 251 | ErrCode: ErrorJsonData, 252 | ErrMsg: fmt.Sprintf(`invalid json data, the proper data format is {"s":"xx","weight":xx,"prop":"xx"}`), 253 | }) 254 | return 255 | } 256 | word = request.Word 257 | weight = request.Weight 258 | prop = request.Prop 259 | } 260 | 261 | word = strings.TrimSpace(word) 262 | if len(word) == 0 { 263 | c.JSON(http.StatusOK, Response{ 264 | ErrCode: ErrorWordEmpty, 265 | ErrMsg: "the word is empty", 266 | }) 267 | return 268 | } 269 | 270 | if weight < 0 || weight > 5000 { 271 | c.JSON(http.StatusOK, Response{ 272 | ErrCode: ErrorWeightRange, 273 | ErrMsg: "the weight must be between 0 and 5000", 274 | }) 275 | return 276 | } 277 | 278 | if prop == "" { 279 | prop = "n" 280 | } 281 | 282 | exist, err := jieBaGo.AddDictWord(word, weight, prop) 283 | if err != nil { 284 | c.JSON(http.StatusOK, Response{ 285 | ErrCode: ErrorFail, 286 | ErrMsg: err.Error(), 287 | }) 288 | return 289 | } 290 | 291 | message := "success" 292 | if exist { 293 | message = "the word already exists" 294 | } 295 | c.JSON(http.StatusOK, Response{ 296 | ErrCode: Success, 297 | ErrMsg: message, 298 | }) 299 | } 300 | 301 | func addStopWordHandler(c *gin.Context) { 302 | word := "" 303 | if c.Request.Method == "GET" { 304 | word = c.DefaultQuery("s", "") 305 | } else if c.Request.Method == "POST" { 306 | var request RequestAddWord 307 | err := c.BindJSON(&request) 308 | if err != nil { 309 | c.JSON(http.StatusOK, Response{ 310 | ErrCode: ErrorJsonData, 311 | ErrMsg: fmt.Sprintf(`invalid json data, the proper data format is {"s":"xx"}`), 312 | }) 313 | return 314 | } 315 | word = request.Word 316 | } 317 | 318 | word = strings.TrimSpace(word) 319 | if len(word) == 0 { 320 | c.JSON(http.StatusOK, Response{ 321 | ErrCode: ErrorWordEmpty, 322 | ErrMsg: "the word is empty", 
323 | }) 324 | return 325 | } 326 | 327 | exist, err := jieBaGo.AddStopWord(word) 328 | if err != nil { 329 | c.JSON(http.StatusOK, Response{ 330 | ErrCode: ErrorFail, 331 | ErrMsg: err.Error(), 332 | }) 333 | return 334 | } 335 | 336 | message := "success" 337 | if exist { 338 | message = "the word already exists" 339 | } 340 | c.JSON(http.StatusOK, Response{ 341 | ErrCode: Success, 342 | ErrMsg: message, 343 | }) 344 | } 345 | -------------------------------------------------------------------------------- /api/jiebago_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "bytes" 9 | "encoding/json" 10 | "errors" 11 | "fmt" 12 | "io" 13 | "io/ioutil" 14 | "net/http" 15 | "strconv" 16 | "strings" 17 | "testing" 18 | "time" 19 | 20 | "github.com/wangshizebin/jiebago/tokenizer" 21 | ) 22 | 23 | var ( 24 | sentence = "Shell位于用户与系统之间,用来帮助用户与操作系统进行沟通。" 25 | resultTest = []string{"Shell", "操作系统", "用户"} 26 | ) 27 | 28 | func TestCutWordsGet(t *testing.T) { 29 | t.Log(sentence) 30 | 31 | url := "http://localhost:8118/cut_words?s=" + sentence 32 | modes := []string{"", "accurate", "full", "nohmm", "search"} 33 | for _, mode := range modes { 34 | t.Log("=== mode: " + mode) 35 | result, err := Get(url + "&mode=" + mode) 36 | if err != nil { 37 | t.Error(err) 38 | return 39 | } 40 | var w struct { 41 | Words []string `json:"words"` 42 | } 43 | err = json.Unmarshal([]byte(result), &w) 44 | if err != nil { 45 | t.Error(err) 46 | return 47 | } 48 | t.Log("结果:", strings.Join(w.Words, "/")) 49 | 50 | for _, word := range resultTest { 51 | ok := false 52 | for _, v := range w.Words { 53 | if word == v { 54 | ok = true 55 | } 56 | } 57 | if !ok { 58 | t.Error(word + " not pass") 59 | } else { 60 | t.Log(word + " OK") 61 | } 62 | } 63 | } 64 
| } 65 | 66 | func TestCutWordsPost(t *testing.T) { 67 | t.Log(sentence) 68 | 69 | url := "http://localhost:8118/cut_words" 70 | modes := []string{"", "accurate", "full", "nohmm", "search"} 71 | for _, mode := range modes { 72 | t.Log("=== mode: " + mode) 73 | data := fmt.Sprintf(`{"s":"%s", "mode":"%s"}`, sentence, mode) 74 | result, err := Post(url, data, "application/json") 75 | if err != nil { 76 | t.Error(err) 77 | } 78 | var w struct { 79 | Words []string `json:"words"` 80 | } 81 | err = json.Unmarshal([]byte(result), &w) 82 | if err != nil { 83 | t.Error(err) 84 | return 85 | } 86 | t.Log("结果:", strings.Join(w.Words, "/")) 87 | 88 | for _, word := range resultTest { 89 | ok := false 90 | for _, v := range w.Words { 91 | if word == v { 92 | ok = true 93 | } 94 | } 95 | if !ok { 96 | t.Error(word + " not pass") 97 | } else { 98 | t.Log(word + " OK") 99 | } 100 | } 101 | } 102 | } 103 | 104 | func TestExtractKeywordsGet(t *testing.T) { 105 | t.Log(sentence) 106 | 107 | url := "http://localhost:8118/extract_keywords?s=" + sentence + "&count=3" 108 | result, err := Get(url) 109 | if err != nil { 110 | t.Error(err) 111 | return 112 | } 113 | 114 | var w struct { 115 | Tags []string `json:"tags"` 116 | } 117 | 118 | err = json.Unmarshal([]byte(result), &w) 119 | if err != nil { 120 | t.Error(err) 121 | return 122 | } 123 | t.Log("提取关键字:", strings.Join(w.Tags, "/")) 124 | 125 | for _, word := range resultTest { 126 | ok := false 127 | for _, v := range w.Tags { 128 | if word == v { 129 | ok = true 130 | } 131 | } 132 | if !ok { 133 | t.Error(word + " not pass") 134 | } else { 135 | t.Log(word + " OK") 136 | } 137 | } 138 | } 139 | 140 | func TestExtractKeywordsPost(t *testing.T) { 141 | t.Log(sentence) 142 | 143 | url := "http://localhost:8118/extract_keywords" 144 | data := fmt.Sprintf(`{"s":"%s", "count":%d}`, sentence, 3) 145 | result, err := Post(url, data, "application/json") 146 | if err != nil { 147 | t.Error(err) 148 | return 149 | } 150 | 151 | var w struct 
{ 152 | Tags []string `json:"tags"` 153 | } 154 | 155 | err = json.Unmarshal([]byte(result), &w) 156 | if err != nil { 157 | t.Error(err) 158 | return 159 | } 160 | t.Log("提取关键字:", strings.Join(w.Tags, "/")) 161 | 162 | for _, word := range resultTest { 163 | ok := false 164 | for _, v := range w.Tags { 165 | if word == v { 166 | ok = true 167 | } 168 | } 169 | if !ok { 170 | t.Error(word + " not pass") 171 | } else { 172 | t.Log(word + " OK") 173 | } 174 | } 175 | } 176 | 177 | func TestExtractKeywordsWeightGet(t *testing.T) { 178 | t.Log(sentence) 179 | 180 | url := "http://localhost:8118/extract_keywords?s=" + sentence + "&mode=weight&count=3" 181 | result, err := Get(url) 182 | if err != nil { 183 | t.Error(err) 184 | return 185 | } 186 | 187 | var w struct { 188 | Tags []tokenizer.Keyword `json:"tags"` 189 | } 190 | 191 | err = json.Unmarshal([]byte(result), &w) 192 | if err != nil { 193 | t.Error(err) 194 | return 195 | } 196 | t.Log("提取关键字:", w) 197 | 198 | for _, word := range resultTest { 199 | ok := false 200 | for _, v := range w.Tags { 201 | if word == v.Word { 202 | ok = true 203 | } 204 | } 205 | if !ok { 206 | t.Error(word + " not pass") 207 | } else { 208 | t.Log(word + " OK") 209 | } 210 | } 211 | } 212 | 213 | func TestExtractKeywordsWeightPost(t *testing.T) { 214 | t.Log(sentence) 215 | 216 | url := "http://localhost:8118/extract_keywords" 217 | data := fmt.Sprintf(`{"s":"%s", "mode":"%s","count":%d}`, sentence, "weight", 3) 218 | result, err := Post(url, data, "application/json") 219 | if err != nil { 220 | t.Error(err) 221 | return 222 | } 223 | 224 | var w struct { 225 | Tags []tokenizer.Keyword `json:"tags"` 226 | } 227 | 228 | err = json.Unmarshal([]byte(result), &w) 229 | if err != nil { 230 | t.Error(err) 231 | return 232 | } 233 | t.Log("提取关键字:", w) 234 | 235 | for _, word := range resultTest { 236 | ok := false 237 | for _, v := range w.Tags { 238 | if word == v.Word { 239 | ok = true 240 | } 241 | } 242 | if !ok { 243 | t.Error(word + " 
not pass") 244 | } else { 245 | t.Log(word + " OK") 246 | } 247 | } 248 | } 249 | 250 | func TestAddDictWordsGet(t *testing.T) { 251 | word := "编程宝库" 252 | t.Log("=== 添加字典单词: " + word) 253 | url := fmt.Sprintf(`http://localhost:8118/add_dict_word?s=%s&weight=%d&prop=%s`, word, 3, "n") 254 | result, err := Get(url) 255 | if err != nil { 256 | t.Error(err) 257 | return 258 | } 259 | var response struct { 260 | ErrCode int `json:"errcode"` 261 | ErrMsg string `json:"errmsg"` 262 | } 263 | err = json.Unmarshal([]byte(result), &response) 264 | if err != nil { 265 | t.Error(err) 266 | return 267 | } 268 | if response.ErrMsg != "" { 269 | t.Log(response.ErrMsg) 270 | } 271 | } 272 | 273 | func TestAddDictWordsPost(t *testing.T) { 274 | url := "http://localhost:8118/add_dict_word" 275 | 276 | word := "编程宝库" 277 | t.Log("=== 添加字典单词: " + word) 278 | data := fmt.Sprintf(`{"s":"%s", "weight":%d,"prop":"%s"}`, word, 3, "n") 279 | result, err := Post(url, data, "application/json") 280 | if err != nil { 281 | t.Error(err) 282 | return 283 | } 284 | var response struct { 285 | ErrCode int `json:"errcode"` 286 | ErrMsg string `json:"errmsg"` 287 | } 288 | err = json.Unmarshal([]byte(result), &response) 289 | if err != nil { 290 | t.Error(err) 291 | return 292 | } 293 | if response.ErrMsg != "" { 294 | t.Log(response.ErrMsg) 295 | } 296 | } 297 | 298 | func TestAddStopWordsGet(t *testing.T) { 299 | word := "the" 300 | t.Log("=== 添加停止词: " + word) 301 | url := fmt.Sprintf(`http://localhost:8118/add_stop_word?s=%s`, word) 302 | result, err := Get(url) 303 | if err != nil { 304 | t.Error(err) 305 | return 306 | } 307 | var response struct { 308 | ErrCode int `json:"errcode"` 309 | ErrMsg string `json:"errmsg"` 310 | } 311 | err = json.Unmarshal([]byte(result), &response) 312 | if err != nil { 313 | t.Error(err) 314 | return 315 | } 316 | if response.ErrMsg != "" { 317 | t.Log(response.ErrMsg) 318 | } 319 | } 320 | 321 | func TestAddStopWordsPost(t *testing.T) { 322 | url := 
"http://localhost:8118/add_stop_word" 323 | 324 | word := "the" 325 | t.Log("=== 添加停止词: " + word) 326 | data := fmt.Sprintf(`{"s":"%s"}`, word) 327 | result, err := Post(url, data, "application/json") 328 | if err != nil { 329 | t.Error(err) 330 | return 331 | } 332 | var response struct { 333 | ErrCode int `json:"errcode"` 334 | ErrMsg string `json:"errmsg"` 335 | } 336 | err = json.Unmarshal([]byte(result), &response) 337 | if err != nil { 338 | t.Error(err) 339 | return 340 | } 341 | if response.ErrMsg != "" { 342 | t.Log(response.ErrMsg) 343 | } 344 | } 345 | 346 | // 发送GET请求 347 | // url: 请求地址 348 | // response: 请求返回的内容 349 | func Get(url string) (string, error) { 350 | // 超时时间:5秒 351 | client := &http.Client{Timeout: 5 * time.Second} 352 | resp, err := client.Get(url) 353 | if err != nil { 354 | return "", err 355 | } 356 | defer resp.Body.Close() 357 | if resp.StatusCode != http.StatusOK { 358 | return "", errors.New("status code:" + strconv.Itoa(resp.StatusCode)) 359 | } 360 | var buffer [256]byte 361 | result := bytes.NewBuffer(nil) 362 | for { 363 | n, err := resp.Body.Read(buffer[0:]) 364 | result.Write(buffer[0:n]) 365 | if err != nil && err == io.EOF { 366 | break 367 | } else if err != nil { 368 | return "", err 369 | } 370 | } 371 | 372 | return result.String(), nil 373 | } 374 | 375 | // 发送POST请求 376 | // url: 请求地址 377 | // data: POST请求提交的数据 378 | // contentType: 请求体格式,如:application/json 379 | // content: 请求放回的内容 380 | func Post(url string, data string, contentType string) (string, error) { 381 | // 超时时间:5秒 382 | client := &http.Client{Timeout: 5 * time.Second} 383 | resp, err := client.Post(url, contentType, bytes.NewBuffer([]byte(data))) 384 | if err != nil { 385 | return "", err 386 | } 387 | defer resp.Body.Close() 388 | 389 | result, err := ioutil.ReadAll(resp.Body) 390 | if err != nil { 391 | return "", err 392 | } 393 | return string(result), nil 394 | } 395 | -------------------------------------------------------------------------------- 
/dictionary/dict_user_utf8.txt: -------------------------------------------------------------------------------- 1 | 编程宝库 3 n 2 | 王泽宾 3 n 3 | codebaoku 3 n 4 | -------------------------------------------------------------------------------- /dictionary/fs_pbstart.json: -------------------------------------------------------------------------------- 1 | {"B":-0.26268660809250016,"E":-3.14e+100,"M":-3.14e+100,"S":-1.4652633398537678} -------------------------------------------------------------------------------- /dictionary/fs_pbtrans.json: -------------------------------------------------------------------------------- 1 | {"B": {"E": -0.510825623765990, "M": -0.916290731874155}, 2 | "E": {"B": -0.5897149736854513, "S": -0.8085250474669937}, 3 | "M": {"E": -0.33344856811948514, "M": -1.2603623820268226}, 4 | "S": {"B": -0.7211965654669841, "S": -0.6658631448798212}} -------------------------------------------------------------------------------- /dictionary/stop_words_std_utf8.txt: -------------------------------------------------------------------------------- 1 | " 2 | . 3 | 。 4 | , 5 | 、 6 | ! 7 | ? 8 | : 9 | ; 10 | ` 11 | ﹑ 12 | • 13 | " 14 | ^ 15 | … 16 | ‘ 17 | ’ 18 | “ 19 | ” 20 | 〝 21 | 〞 22 | ~ 23 | \ 24 | ∕ 25 | | 26 | ¦ 27 | ‖ 28 | —  29 | ( 30 | ) 31 | 〈 32 | 〉 33 | ﹞ 34 | ﹝ 35 | 「 36 | 」 37 | ‹ 38 | › 39 | 〖 40 | 〗 41 | 】 42 | 【 43 | » 44 | « 45 | 』 46 | 『 47 | 〕 48 | 〔 49 | 》 50 | 《 51 | } 52 | { 53 | ] 54 | [ 55 | ﹐ 56 | ¸ 57 | ﹕ 58 | ︰ 59 | ﹔ 60 | ; 61 | ! 62 | ¡ 63 | ? 
64 | ¿ 65 | ﹖ 66 | ﹌ 67 | ﹏ 68 | ﹋ 69 | ' 70 | ´ 71 | ˊ 72 | ˋ 73 | - 74 | ― 75 | ﹫ 76 | @ 77 | ︳ 78 | ︴ 79 | _ 80 | ¯ 81 | _ 82 |  ̄ 83 | ﹢ 84 | + 85 | ﹦ 86 | = 87 | ﹤ 88 | ‐ 89 | < 90 | ­ 91 | ˜ 92 | ~ 93 | ﹟ 94 | # 95 | ﹩ 96 | $ 97 | ﹠ 98 | & 99 | ﹪ 100 | % 101 | ﹡ 102 | * 103 | ﹨ 104 | \ 105 | ﹍ 106 | ﹉ 107 | ﹎ 108 | ﹊ 109 | ˇ 110 | ︵ 111 | ︶ 112 | ︷ 113 | ︸ 114 | ︹ 115 | ︿ 116 | ﹀ 117 | ︺ 118 | ︽ 119 | ︾ 120 | _ 121 | ˉ 122 | ﹁ 123 | ﹂ 124 | ﹃ 125 | ﹄ 126 | ︻ 127 | ︼ 128 | 的 129 | 了 130 | the 131 | a 132 | an 133 | that 134 | those 135 | this 136 | that 137 | $ 138 | 0 139 | 1 140 | 2 141 | 3 142 | 4 143 | 5 144 | 6 145 | 7 146 | 8 147 | 9 148 | ? 149 | _ 150 | “ 151 | ” 152 | 、 153 | 。 154 | 《 155 | 》 156 | 一 157 | 一些 158 | 一何 159 | 一切 160 | 一则 161 | 一方面 162 | 一旦 163 | 一来 164 | 一样 165 | 一般 166 | 一转眼 167 | 万一 168 | 上 169 | 上下 170 | 下 171 | 不 172 | 不仅 173 | 不但 174 | 不光 175 | 不单 176 | 不只 177 | 不外乎 178 | 不如 179 | 不妨 180 | 不尽 181 | 不尽然 182 | 不得 183 | 不怕 184 | 不惟 185 | 不成 186 | 不拘 187 | 不料 188 | 不是 189 | 不比 190 | 不然 191 | 不特 192 | 不独 193 | 不管 194 | 不至于 195 | 不若 196 | 不论 197 | 不过 198 | 不问 199 | 与 200 | 与其 201 | 与其说 202 | 与否 203 | 与此同时 204 | 且 205 | 且不说 206 | 且说 207 | 两者 208 | 个 209 | 个别 210 | 临 211 | 为 212 | 为了 213 | 为什么 214 | 为何 215 | 为止 216 | 为此 217 | 为着 218 | 乃 219 | 乃至 220 | 乃至于 221 | 么 222 | 之 223 | 之一 224 | 之所以 225 | 之类 226 | 乌乎 227 | 乎 228 | 乘 229 | 也 230 | 也好 231 | 也罢 232 | 了 233 | 二来 234 | 于 235 | 于是 236 | 于是乎 237 | 云云 238 | 云尔 239 | 些 240 | 亦 241 | 人 242 | 人们 243 | 人家 244 | 什么 245 | 什么样 246 | 今 247 | 介于 248 | 仍 249 | 仍旧 250 | 从 251 | 从此 252 | 从而 253 | 他 254 | 他人 255 | 他们 256 | 以 257 | 以上 258 | 以为 259 | 以便 260 | 以免 261 | 以及 262 | 以故 263 | 以期 264 | 以来 265 | 以至 266 | 以至于 267 | 以致 268 | 们 269 | 任 270 | 任何 271 | 任凭 272 | 似的 273 | 但 274 | 但凡 275 | 但是 276 | 何 277 | 何以 278 | 何况 279 | 何处 280 | 何时 281 | 余外 282 | 作为 283 | 你 284 | 你们 285 | 使 286 | 使得 287 | 例如 288 | 依 289 | 依据 290 | 依照 291 | 便于 292 | 俺 293 | 俺们 294 | 倘 295 | 倘使 296 | 倘或 297 | 倘然 298 | 倘若 299 | 借 300 | 
假使 301 | 假如 302 | 假若 303 | 傥然 304 | 像 305 | 儿 306 | 先不先 307 | 光是 308 | 全体 309 | 全部 310 | 兮 311 | 关于 312 | 其 313 | 其一 314 | 其中 315 | 其二 316 | 其他 317 | 其余 318 | 其它 319 | 其次 320 | 具体地说 321 | 具体说来 322 | 兼之 323 | 内 324 | 再 325 | 再其次 326 | 再则 327 | 再有 328 | 再者 329 | 再者说 330 | 再说 331 | 冒 332 | 冲 333 | 况且 334 | 几 335 | 几时 336 | 凡 337 | 凡是 338 | 凭 339 | 凭借 340 | 出于 341 | 出来 342 | 分别 343 | 则 344 | 则甚 345 | 别 346 | 别人 347 | 别处 348 | 别是 349 | 别的 350 | 别管 351 | 别说 352 | 到 353 | 前后 354 | 前此 355 | 前者 356 | 加之 357 | 加以 358 | 即 359 | 即令 360 | 即使 361 | 即便 362 | 即如 363 | 即或 364 | 即若 365 | 却 366 | 去 367 | 又 368 | 又及 369 | 及 370 | 及其 371 | 及至 372 | 反之 373 | 反而 374 | 反过来 375 | 反过来说 376 | 受到 377 | 另 378 | 另一方面 379 | 另外 380 | 另悉 381 | 只 382 | 只当 383 | 只怕 384 | 只是 385 | 只有 386 | 只消 387 | 只要 388 | 只限 389 | 叫 390 | 叮咚 391 | 可 392 | 可以 393 | 可是 394 | 可见 395 | 各 396 | 各个 397 | 各位 398 | 各种 399 | 各自 400 | 同 401 | 同时 402 | 后 403 | 后者 404 | 向 405 | 向使 406 | 向着 407 | 吓 408 | 吗 409 | 否则 410 | 吧 411 | 吧哒 412 | 吱 413 | 呀 414 | 呃 415 | 呕 416 | 呗 417 | 呜 418 | 呜呼 419 | 呢 420 | 呵 421 | 呵呵 422 | 呸 423 | 呼哧 424 | 咋 425 | 和 426 | 咚 427 | 咦 428 | 咧 429 | 咱 430 | 咱们 431 | 咳 432 | 哇 433 | 哈 434 | 哈哈 435 | 哉 436 | 哎 437 | 哎呀 438 | 哎哟 439 | 哗 440 | 哟 441 | 哦 442 | 哩 443 | 哪 444 | 哪个 445 | 哪些 446 | 哪儿 447 | 哪天 448 | 哪年 449 | 哪怕 450 | 哪样 451 | 哪边 452 | 哪里 453 | 哼 454 | 哼唷 455 | 唉 456 | 唯有 457 | 啊 458 | 啐 459 | 啥 460 | 啦 461 | 啪达 462 | 啷当 463 | 喂 464 | 喏 465 | 喔唷 466 | 喽 467 | 嗡 468 | 嗡嗡 469 | 嗬 470 | 嗯 471 | 嗳 472 | 嘎 473 | 嘎登 474 | 嘘 475 | 嘛 476 | 嘻 477 | 嘿 478 | 嘿嘿 479 | 因 480 | 因为 481 | 因了 482 | 因此 483 | 因着 484 | 因而 485 | 固然 486 | 在 487 | 在下 488 | 在于 489 | 地 490 | 基于 491 | 处在 492 | 多 493 | 多么 494 | 多少 495 | 大 496 | 大家 497 | 她 498 | 她们 499 | 好 500 | 如 501 | 如上 502 | 如上所述 503 | 如下 504 | 如何 505 | 如其 506 | 如同 507 | 如是 508 | 如果 509 | 如此 510 | 如若 511 | 始而 512 | 孰料 513 | 孰知 514 | 宁 515 | 宁可 516 | 宁愿 517 | 宁肯 518 | 它 519 | 它们 520 | 对 521 | 对于 522 | 对待 523 | 对方 524 | 对比 525 | 将 526 | 小 527 | 尔 528 | 尔后 529 | 尔尔 530 | 尚且 
531 | 就 532 | 就是 533 | 就是了 534 | 就是说 535 | 就算 536 | 就要 537 | 尽 538 | 尽管 539 | 尽管如此 540 | 岂但 541 | 己 542 | 已 543 | 已矣 544 | 巴 545 | 巴巴 546 | 并 547 | 并且 548 | 并非 549 | 庶乎 550 | 庶几 551 | 开外 552 | 开始 553 | 归 554 | 归齐 555 | 当 556 | 当地 557 | 当然 558 | 当着 559 | 彼 560 | 彼时 561 | 彼此 562 | 往 563 | 待 564 | 很 565 | 得 566 | 得了 567 | 怎 568 | 怎么 569 | 怎么办 570 | 怎么样 571 | 怎奈 572 | 怎样 573 | 总之 574 | 总的来看 575 | 总的来说 576 | 总的说来 577 | 总而言之 578 | 恰恰相反 579 | 您 580 | 惟其 581 | 慢说 582 | 我 583 | 我们 584 | 或 585 | 或则 586 | 或是 587 | 或曰 588 | 或者 589 | 截至 590 | 所 591 | 所以 592 | 所在 593 | 所幸 594 | 所有 595 | 才 596 | 才能 597 | 打 598 | 打从 599 | 把 600 | 抑或 601 | 拿 602 | 按 603 | 按照 604 | 换句话说 605 | 换言之 606 | 据 607 | 据此 608 | 接着 609 | 故 610 | 故此 611 | 故而 612 | 旁人 613 | 无 614 | 无宁 615 | 无论 616 | 既 617 | 既往 618 | 既是 619 | 既然 620 | 时候 621 | 是 622 | 是以 623 | 是的 624 | 曾 625 | 替 626 | 替代 627 | 最 628 | 有 629 | 有些 630 | 有关 631 | 有及 632 | 有时 633 | 有的 634 | 望 635 | 朝 636 | 朝着 637 | 本 638 | 本人 639 | 本地 640 | 本着 641 | 本身 642 | 来 643 | 来着 644 | 来自 645 | 来说 646 | 极了 647 | 果然 648 | 果真 649 | 某 650 | 某个 651 | 某些 652 | 某某 653 | 根据 654 | 欤 655 | 正值 656 | 正如 657 | 正巧 658 | 正是 659 | 此 660 | 此地 661 | 此处 662 | 此外 663 | 此时 664 | 此次 665 | 此间 666 | 毋宁 667 | 每 668 | 每当 669 | 比 670 | 比及 671 | 比如 672 | 比方 673 | 没奈何 674 | 沿 675 | 沿着 676 | 漫说 677 | 焉 678 | 然则 679 | 然后 680 | 然而 681 | 照 682 | 照着 683 | 犹且 684 | 犹自 685 | 甚且 686 | 甚么 687 | 甚或 688 | 甚而 689 | 甚至 690 | 甚至于 691 | 用 692 | 用来 693 | 由 694 | 由于 695 | 由是 696 | 由此 697 | 由此可见 698 | 的 699 | 的确 700 | 的话 701 | 直到 702 | 相对而言 703 | 省得 704 | 看 705 | 眨眼 706 | 着 707 | 着呢 708 | 矣 709 | 矣乎 710 | 矣哉 711 | 离 712 | 竟而 713 | 第 714 | 等 715 | 等到 716 | 等等 717 | 简言之 718 | 管 719 | 类如 720 | 紧接着 721 | 纵 722 | 纵令 723 | 纵使 724 | 纵然 725 | 经 726 | 经过 727 | 结果 728 | 给 729 | 继之 730 | 继后 731 | 继而 732 | 综上所述 733 | 罢了 734 | 者 735 | 而 736 | 而且 737 | 而况 738 | 而后 739 | 而外 740 | 而已 741 | 而是 742 | 而言 743 | 能 744 | 能否 745 | 腾 746 | 自 747 | 自个儿 748 | 自从 749 | 自各儿 750 | 自后 751 | 自家 752 | 自己 753 | 自打 754 | 自身 755 | 至 756 | 
至于 757 | 至今 758 | 至若 759 | 致 760 | 般的 761 | 若 762 | 若夫 763 | 若是 764 | 若果 765 | 若非 766 | 莫不然 767 | 莫如 768 | 莫若 769 | 虽 770 | 虽则 771 | 虽然 772 | 虽说 773 | 被 774 | 要 775 | 要不 776 | 要不是 777 | 要不然 778 | 要么 779 | 要是 780 | 譬喻 781 | 譬如 782 | 让 783 | 许多 784 | 论 785 | 设使 786 | 设或 787 | 设若 788 | 诚如 789 | 诚然 790 | 该 791 | 说来 792 | 诸 793 | 诸位 794 | 诸如 795 | 谁 796 | 谁人 797 | 谁料 798 | 谁知 799 | 贼死 800 | 赖以 801 | 赶 802 | 起 803 | 起见 804 | 趁 805 | 趁着 806 | 越是 807 | 距 808 | 跟 809 | 较 810 | 较之 811 | 边 812 | 过 813 | 还 814 | 还是 815 | 还有 816 | 还要 817 | 这 818 | 这一来 819 | 这个 820 | 这么 821 | 这么些 822 | 这么样 823 | 这么点儿 824 | 这些 825 | 这会儿 826 | 这儿 827 | 这就是说 828 | 这时 829 | 这样 830 | 这次 831 | 这般 832 | 这边 833 | 这里 834 | 进而 835 | 连 836 | 连同 837 | 逐步 838 | 通过 839 | 遵循 840 | 遵照 841 | 那 842 | 那个 843 | 那么 844 | 那么些 845 | 那么样 846 | 那些 847 | 那会儿 848 | 那儿 849 | 那时 850 | 那样 851 | 那般 852 | 那边 853 | 那里 854 | 都 855 | 鄙人 856 | 鉴于 857 | 针对 858 | 阿 859 | 除 860 | 除了 861 | 除外 862 | 除开 863 | 除此之外 864 | 除非 865 | 随 866 | 随后 867 | 随时 868 | 随着 869 | 难道说 870 | 非但 871 | 非徒 872 | 非特 873 | 非独 874 | 靠 875 | 顺 876 | 顺着 877 | 首先 878 | ! 879 | , 880 | : 881 | ; 882 | ? 
883 | to 884 | can 885 | could 886 | dare 887 | do 888 | did 889 | does 890 | may 891 | might 892 | would 893 | should 894 | must 895 | will 896 | ought 897 | shall 898 | need 899 | is 900 | a 901 | am 902 | are 903 | about 904 | according 905 | after 906 | against 907 | all 908 | almost 909 | also 910 | although 911 | among 912 | an 913 | and 914 | another 915 | any 916 | anything 917 | approximately 918 | as 919 | asked 920 | at 921 | back 922 | because 923 | before 924 | besides 925 | between 926 | both 927 | but 928 | by 929 | call 930 | called 931 | currently 932 | despite 933 | did 934 | do 935 | dr 936 | during 937 | each 938 | earlier 939 | eight 940 | even 941 | eventually 942 | every 943 | everything 944 | five 945 | for 946 | four 947 | from 948 | he 949 | her 950 | here 951 | his 952 | how 953 | however 954 | i 955 | if 956 | in 957 | indeed 958 | instead 959 | it 960 | its 961 | just 962 | last 963 | like 964 | major 965 | many 966 | may 967 | maybe 968 | meanwhile 969 | more 970 | moreover 971 | most 972 | mr 973 | mrs 974 | ms 975 | much 976 | my 977 | neither 978 | net 979 | never 980 | nevertheless 981 | nine 982 | no 983 | none 984 | not 985 | nothing 986 | now 987 | of 988 | on 989 | once 990 | one 991 | only 992 | or 993 | other 994 | our 995 | over 996 | partly 997 | perhaps 998 | prior 999 | regarding 1000 | separately 1001 | seven 1002 | several 1003 | she 1004 | should 1005 | similarly 1006 | since 1007 | six 1008 | so 1009 | some 1010 | somehow 1011 | still 1012 | such 1013 | ten 1014 | that 1015 | the 1016 | their 1017 | then 1018 | there 1019 | therefore 1020 | these 1021 | they 1022 | this 1023 | those 1024 | though 1025 | three 1026 | to 1027 | two 1028 | under 1029 | unless 1030 | unlike 1031 | until 1032 | volume 1033 | we 1034 | what 1035 | whatever 1036 | whats 1037 | when 1038 | where 1039 | which 1040 | while 1041 | why 1042 | with 1043 | without 1044 | yesterday 1045 | yet 1046 | you 1047 | your 1048 | aboard 1049 | about 1050 | 
above 1051 | according to 1052 | across 1053 | afore 1054 | after 1055 | against 1056 | agin 1057 | along 1058 | alongside 1059 | amid 1060 | amidst 1061 | among 1062 | amongst 1063 | anent 1064 | around 1065 | as 1066 | aslant 1067 | astride 1068 | at 1069 | athwart 1070 | bar 1071 | because of 1072 | before 1073 | behind 1074 | below 1075 | beneath 1076 | beside 1077 | besides 1078 | between 1079 | betwixt 1080 | beyond 1081 | but 1082 | by 1083 | circa 1084 | despite 1085 | down 1086 | during 1087 | due to 1088 | ere 1089 | except 1090 | for 1091 | from 1092 | in 1093 | inside 1094 | into 1095 | less 1096 | like 1097 | mid 1098 | midst 1099 | minus 1100 | near 1101 | next 1102 | nigh 1103 | nigher 1104 | nighest 1105 | notwithstanding 1106 | of 1107 | off 1108 | on 1109 | on to 1110 | onto 1111 | out 1112 | out of 1113 | outside 1114 | over 1115 | past 1116 | pending 1117 | per 1118 | plus 1119 | qua 1120 | re 1121 | round 1122 | sans 1123 | save 1124 | since 1125 | through 1126 | throughout 1127 | thru 1128 | till 1129 | to 1130 | toward 1131 | towards 1132 | under 1133 | underneath 1134 | unlike 1135 | until 1136 | unto 1137 | up 1138 | upon 1139 | versus 1140 | via 1141 | vice 1142 | with 1143 | within 1144 | without 1145 | he 1146 | her 1147 | herself 1148 | hers 1149 | him 1150 | himself 1151 | his 1152 | I 1153 | it 1154 | its 1155 | itself 1156 | me 1157 | mine 1158 | my 1159 | myself 1160 | ours 1161 | she 1162 | their 1163 | theirs 1164 | them 1165 | themselves 1166 | they 1167 | us 1168 | we 1169 | our 1170 | ourselves 1171 | you 1172 | your 1173 | yours 1174 | yourselves 1175 | yourself 1176 | this 1177 | that 1178 | these 1179 | those 1180 | " 1181 | ' 1182 | '' 1183 | ( 1184 | ) 1185 | *LRB* 1186 | *RRB* 1187 | 1188 | 1189 | 1190 | 1191 | 1192 | @ 1193 | & 1194 | [ 1195 | ] 1196 | ` 1197 | `` 1198 | e.g., 1199 | { 1200 | } 1201 | " 1202 | “ 1203 | ” 1204 | -RRB- 1205 | -LRB- 1206 | -- 1207 | a 1208 | about 1209 | above 1210 | across 1211 | after 
1212 | afterwards 1213 | again 1214 | against 1215 | all 1216 | almost 1217 | alone 1218 | along 1219 | already 1220 | also 1221 | although 1222 | always 1223 | am 1224 | among 1225 | amongst 1226 | amoungst 1227 | amount 1228 | an 1229 | and 1230 | another 1231 | any 1232 | anyhow 1233 | anyone 1234 | anything 1235 | anyway 1236 | anywhere 1237 | are 1238 | around 1239 | as 1240 | at 1241 | back 1242 | be 1243 | became 1244 | because 1245 | become 1246 | becomes 1247 | becoming 1248 | been 1249 | before 1250 | beforehand 1251 | behind 1252 | being 1253 | below 1254 | beside 1255 | besides 1256 | between 1257 | beyond 1258 | bill 1259 | both 1260 | bottom 1261 | but 1262 | by 1263 | call 1264 | can 1265 | cannot 1266 | cant 1267 | co 1268 | computer 1269 | con 1270 | could 1271 | couldnt 1272 | cry 1273 | de 1274 | describe 1275 | detail 1276 | do 1277 | done 1278 | down 1279 | due 1280 | during 1281 | each 1282 | eg 1283 | eight 1284 | either 1285 | eleven 1286 | else 1287 | elsewhere 1288 | empty 1289 | enough 1290 | etc 1291 | even 1292 | ever 1293 | every 1294 | everyone 1295 | everything 1296 | everywhere 1297 | except 1298 | few 1299 | fifteen 1300 | fify 1301 | fill 1302 | find 1303 | fire 1304 | first 1305 | five 1306 | for 1307 | former 1308 | formerly 1309 | forty 1310 | found 1311 | four 1312 | from 1313 | front 1314 | full 1315 | further 1316 | get 1317 | give 1318 | go 1319 | had 1320 | has 1321 | hasnt 1322 | have 1323 | he 1324 | hence 1325 | her 1326 | here 1327 | hereafter 1328 | hereby 1329 | herein 1330 | hereupon 1331 | hers 1332 | herself 1333 | him 1334 | himself 1335 | his 1336 | how 1337 | however 1338 | hundred 1339 | i 1340 | ie 1341 | if 1342 | in 1343 | inc 1344 | indeed 1345 | interest 1346 | into 1347 | is 1348 | it 1349 | its 1350 | itself 1351 | keep 1352 | last 1353 | latter 1354 | latterly 1355 | least 1356 | less 1357 | ltd 1358 | made 1359 | many 1360 | may 1361 | me 1362 | meanwhile 1363 | might 1364 | mill 1365 | mine 1366 | 
more 1367 | moreover 1368 | most 1369 | mostly 1370 | move 1371 | much 1372 | must 1373 | my 1374 | myself 1375 | name 1376 | namely 1377 | neither 1378 | never 1379 | nevertheless 1380 | next 1381 | nine 1382 | no 1383 | nobody 1384 | none 1385 | noone 1386 | nor 1387 | not 1388 | nothing 1389 | now 1390 | nowhere 1391 | of 1392 | off 1393 | often 1394 | on 1395 | once 1396 | one 1397 | only 1398 | onto 1399 | or 1400 | other 1401 | others 1402 | otherwise 1403 | our 1404 | ours 1405 | ourselves 1406 | out 1407 | over 1408 | own 1409 | part 1410 | per 1411 | perhaps 1412 | please 1413 | put 1414 | rather 1415 | re 1416 | same 1417 | see 1418 | seem 1419 | seemed 1420 | seeming 1421 | seems 1422 | serious 1423 | several 1424 | she 1425 | should 1426 | show 1427 | side 1428 | since 1429 | sincere 1430 | six 1431 | sixty 1432 | so 1433 | some 1434 | somehow 1435 | someone 1436 | something 1437 | sometime 1438 | sometimes 1439 | somewhere 1440 | still 1441 | such 1442 | system 1443 | take 1444 | ten 1445 | than 1446 | that 1447 | the 1448 | their 1449 | them 1450 | themselves 1451 | then 1452 | thence 1453 | there 1454 | thereafter 1455 | thereby 1456 | therefore 1457 | therein 1458 | thereupon 1459 | these 1460 | they 1461 | thick 1462 | thin 1463 | third 1464 | this 1465 | those 1466 | though 1467 | three 1468 | through 1469 | throughout 1470 | thru 1471 | thus 1472 | to 1473 | together 1474 | too 1475 | top 1476 | toward 1477 | towards 1478 | twelve 1479 | twenty 1480 | two 1481 | un 1482 | under 1483 | until 1484 | up 1485 | upon 1486 | us 1487 | very 1488 | via 1489 | was 1490 | we 1491 | well 1492 | were 1493 | what 1494 | whatever 1495 | when 1496 | whence 1497 | whenever 1498 | where 1499 | whereafter 1500 | whereas 1501 | whereby 1502 | wherein 1503 | whereupon 1504 | wherever 1505 | whether 1506 | which 1507 | while 1508 | whither 1509 | who 1510 | whoever 1511 | whole 1512 | whom 1513 | whose 1514 | why 1515 | will 1516 | with 1517 | within 1518 | without 
1519 | would 1520 | yet 1521 | you 1522 | your 1523 | yours 1524 | yourself 1525 | yourselves 1526 | 1527 | 1528 | : 1529 | / 1530 | ( 1531 | > 1532 | ) 1533 | < 1534 | ! 1535 | -------------------------------------------------------------------------------- /dictionary/stop_words_user_utf8.txt: -------------------------------------------------------------------------------- 1 | the 2 | of 3 | is 4 | and 5 | to 6 | in 7 | that 8 | we 9 | for 10 | an 11 | are 12 | by 13 | be 14 | as 15 | on 16 | with 17 | can 18 | if 19 | from 20 | which 21 | you 22 | it 23 | this 24 | then 25 | at 26 | have 27 | all 28 | not 29 | one 30 | has 31 | or 32 | that 33 | -------------------------------------------------------------------------------- /dll/Makefile: -------------------------------------------------------------------------------- 1 | PROJECT = jiebago 2 | 3 | .PHONY: all clean build 4 | all:clean build 5 | @echo "Done!" 6 | 7 | build: 8 | go build -ldflags "-s -w" -buildmode=c-shared -o $(PROJECT).dll $(PROJECT).go 9 | 10 | clean: 11 | rm -rf $(PROJECT).dll $(PROJECT).h 12 | -------------------------------------------------------------------------------- /dll/jiebago.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package main 6 | 7 | import ( 8 | "C" 9 | "encoding/json" 10 | 11 | "github.com/wangshizebin/jiebago" 12 | "github.com/wangshizebin/jiebago/tokenizer" 13 | ) 14 | 15 | var ( 16 | jieBaGo *jiebago.JieBaGo 17 | ) 18 | 19 | //export Init 20 | func Init(path string) { 21 | jieBaGo = jiebago.NewJieBaGo(path) 22 | } 23 | 24 | //export Cut 25 | func Cut(sentence string) string { 26 | if jieBaGo == nil { 27 | return "" 28 | } 29 | words := jieBaGo.Cut(sentence) 30 | return wordsToJson(&words) 31 | } 32 | 33 | //export CutFull 34 | func CutFull(sentence string) string { 35 | if jieBaGo == nil { 36 | return "" 37 | } 38 | words := jieBaGo.Cut(sentence) 39 | return wordsToJson(&words) 40 | } 41 | 42 | //export CutAccurate 43 | func CutAccurate(sentence string) string { 44 | if jieBaGo == nil { 45 | return "" 46 | } 47 | words := jieBaGo.Cut(sentence) 48 | return wordsToJson(&words) 49 | } 50 | 51 | //export CutNoHMM 52 | func CutNoHMM(sentence string) string { 53 | if jieBaGo == nil { 54 | return "" 55 | } 56 | words := jieBaGo.Cut(sentence) 57 | return wordsToJson(&words) 58 | } 59 | 60 | //export CutForSearch 61 | func CutForSearch(sentence string) string { 62 | if jieBaGo == nil { 63 | return "" 64 | } 65 | words := jieBaGo.Cut(sentence) 66 | return wordsToJson(&words) 67 | } 68 | 69 | //export ExtractKeywords 70 | func ExtractKeywords(s string, count int) string { 71 | if jieBaGo == nil { 72 | return "" 73 | } 74 | words := jieBaGo.ExtractKeywords(s, count) 75 | return wordsToJson(&words) 76 | } 77 | 78 | //export ExtractKeywordsWeight 79 | func ExtractKeywordsWeight(s string, count int) string { 80 | if jieBaGo == nil { 81 | return "" 82 | } 83 | tags := jieBaGo.ExtractKeywordsWeight(s, count) 84 | return wordsWeightToJson(&tags) 85 | } 86 | 87 | //export AddDictWord 88 | func AddDictWord(word string, freq int, prop string) bool { 89 | if jieBaGo == nil { 90 | return false 91 | } 92 | _, err := jieBaGo.AddDictWord(word, freq, prop) 93 | if err != nil { 94 | 
return false 95 | } 96 | return true 97 | } 98 | 99 | //export AddStopWord 100 | func AddStopWord(word string) bool { 101 | if jieBaGo == nil { 102 | return false 103 | } 104 | _, err := jieBaGo.AddStopWord(word) 105 | if err != nil { 106 | return false 107 | } 108 | return true 109 | } 110 | 111 | func wordsToJson(words *[]string) string { 112 | w := struct { 113 | Words *[]string `json:"words"` 114 | }{ 115 | Words: words, 116 | } 117 | v, _ := json.Marshal(w) 118 | return string(v) 119 | } 120 | 121 | func wordsWeightToJson(tags *[]tokenizer.Keyword) string { 122 | w := struct { 123 | Tags *[]tokenizer.Keyword `json:"tags"` 124 | }{ 125 | Tags: tags, 126 | } 127 | v, _ := json.Marshal(w) 128 | return string(v) 129 | } 130 | 131 | func main() { 132 | // Need a main function to make CGO compile package as C shared library 133 | } 134 | -------------------------------------------------------------------------------- /example/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package main 6 | 7 | import ( 8 | "fmt" 9 | "strings" 10 | 11 | "github.com/wangshizebin/jiebago" 12 | ) 13 | 14 | func main() { 15 | jieBaGo := jiebago.NewJieBaGo() 16 | // 可以指定字典库的位置 17 | // jieBaGo := jiebago.NewJieBaGo("/data/mydict") 18 | 19 | sentence := "Shell 位于用户与系统之间,用来帮助用户与操作系统进行沟通。通常都是文字模式的 Shell。" 20 | fmt.Println("原始语句:", sentence) 21 | fmt.Println() 22 | 23 | // 默认模式分词 24 | words := jieBaGo.Cut(sentence) 25 | fmt.Println("默认模式分词:", strings.Join(words, "/")) 26 | 27 | // 精确模式分词 28 | words = jieBaGo.CutAccurate(sentence) 29 | fmt.Println("精确模式分词:", strings.Join(words, "/")) 30 | 31 | // 全模式分词 32 | words = jieBaGo.CutFull(sentence) 33 | fmt.Println("全模式分词:", strings.Join(words, "/")) 34 | 35 | // NoHMM模式分词 36 | words = jieBaGo.CutNoHMM(sentence) 37 | fmt.Println("NoHMM模式分词:", strings.Join(words, "/")) 38 | 39 | // 搜索引擎模式分词 40 | words = jieBaGo.CutForSearch(sentence) 41 | fmt.Println("搜索引擎模式分词:", strings.Join(words, "/")) 42 | fmt.Println() 43 | 44 | // 提取关键词,即Tag标签 45 | keywords := jieBaGo.ExtractKeywords(sentence, 20) 46 | fmt.Println("提取关键词:", strings.Join(keywords, "/")) 47 | 48 | // 提取带权重的关键词,即Tag标签 49 | keywordsWeight := jieBaGo.ExtractKeywordsWeight(sentence, 20) 50 | fmt.Println("提取带权重的关键词:", keywordsWeight) 51 | fmt.Println() 52 | 53 | // 向字典加入单词 54 | exist, err := jieBaGo.AddDictWord("编程宝库", 3, "n") 55 | if err != nil { 56 | fmt.Println(err) 57 | } else { 58 | fmt.Println("向字典加入单词:编程宝库") 59 | if exist { 60 | fmt.Println("单词已经存在") 61 | } 62 | } 63 | 64 | // 向字典加入停止词 65 | exist, err = jieBaGo.AddStopWord("the") 66 | if err != nil { 67 | fmt.Println(err) 68 | } else { 69 | fmt.Println("向字典加入停止词:the") 70 | if exist { 71 | fmt.Println("单词已经存在") 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /example/main_api.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 
2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "fmt" 9 | 10 | "github.com/wangshizebin/jiebago" 11 | ) 12 | 13 | func main() { 14 | sentence := "Shell 位于用户与系统之间,用来帮助用户与操作系统进行沟通。我们通常所说的 Shell 都指的是文字模式的 Shell。" 15 | fmt.Println("原始语句:", sentence) 16 | fmt.Println() 17 | 18 | // 默认模式分词 19 | words := jiebago.Cut(sentence) 20 | fmt.Println("默认模式分词:", words) 21 | 22 | // 精确模式分词 23 | words = jiebago.CutAccurate(sentence) 24 | fmt.Println("精确模式分词:", words) 25 | 26 | // 全模式分词 27 | words = jiebago.CutFull(sentence) 28 | fmt.Println("全模式分词:", words) 29 | 30 | // NoHMM模式分词 31 | words = jiebago.CutNoHMM(sentence) 32 | fmt.Println("NoHMM模式分词:", words) 33 | 34 | // 搜索引擎模式分词 35 | words = jiebago.CutForSearch(sentence) 36 | fmt.Println("搜索引擎模式分词:", words) 37 | fmt.Println() 38 | 39 | // 提取关键词,即Tag标签 40 | keywords := jiebago.ExtractKeywords(sentence, 20) 41 | fmt.Println("提取关键词:", keywords) 42 | 43 | // 提取带权重的关键词,即Tag标签 44 | keywordsWeight := jiebago.ExtractKeywordsWeight(sentence, 20) 45 | fmt.Println("提取带权重的关键词:", keywordsWeight) 46 | fmt.Println() 47 | 48 | // 向字典加入单词 49 | exist, err := jiebago.AddDictWord("编程宝库", 3, "n") 50 | if err != nil { 51 | fmt.Println(err) 52 | } else { 53 | fmt.Println("向字典加入单词:编程宝库") 54 | if exist { 55 | fmt.Println("单词已经存在") 56 | } 57 | } 58 | 59 | // 向字典加入停止词 60 | exist, err = jiebago.AddStopWord("the") 61 | if err != nil { 62 | fmt.Println(err) 63 | } else { 64 | fmt.Println("向字典加入停止词:the") 65 | if exist { 66 | fmt.Println("单词已经存在") 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/wangshizebin/jiebago 2 | 3 | go 1.16 4 | 5 | require github.com/gin-gonic/gin v1.7.7 6 | -------------------------------------------------------------------------------- /go.sum: 
-------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= 5 | github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= 6 | github.com/gin-gonic/gin v1.7.7 h1:3DoBmSbJbZAWqXJC3SLjAPfutPJJRN1U5pALB7EeTTs= 7 | github.com/gin-gonic/gin v1.7.7/go.mod h1:axIBovoeJpVj8S3BwE0uPMTeReE4+AfFtqpqaZ1qq1U= 8 | github.com/go-playground/assert/v2 v2.0.1 h1:MsBgLAaY856+nPRTKrp3/OZK38U/wa0CcBYNjji3q3A= 9 | github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= 10 | github.com/go-playground/locales v0.13.0 h1:HyWk6mgj5qFqCT5fjGBuRArbVDfE4hi8+e8ceBS/t7Q= 11 | github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= 12 | github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD876Lmtgy7VtROAbHHXk8no= 13 | github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= 14 | github.com/go-playground/validator/v10 v10.4.1 h1:pH2c5ADXtd66mxoE0Zm9SUhxE20r7aM3F26W0hOn+GE= 15 | github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= 16 | github.com/golang/protobuf v1.3.3 h1:gyjaxf+svBWX08ZjK86iN9geUJF0H6gp2IRKX6Nf6/I= 17 | github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= 18 | github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 19 | github.com/json-iterator/go v1.1.9 h1:9yzud/Ht36ygwatGx56VwCZtlI/2AD15T1X2sjSuGns= 20 | github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= 21 | 
github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= 22 | github.com/leodido/go-urn v1.2.0/go.mod h1:+8+nEpDfqqsY+g338gtMEUOtuK+4dEMhiQEgxpxOKII= 23 | github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= 24 | github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 25 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc= 26 | github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 27 | github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742 h1:Esafd1046DLDQ0W1YjYsBW+p8U2u7vzgW2SQVmlNazg= 28 | github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= 29 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 30 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 31 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 32 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 33 | github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= 34 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 35 | github.com/ugorji/go v1.1.7 h1:/68gy2h+1mWMrwZFeD1kQialdSzAb432dtpeJ42ovdo= 36 | github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= 37 | github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs= 38 | github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= 39 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 40 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9 h1:psW17arqaxU48Z5kZ0CQnkZWQJsqcURM6tKiBApRjXI= 41 | 
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 42 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 43 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 44 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 45 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42 h1:vEOn+mP2zCOVzKckCZy6YsCtDblrpj/w7B9nxGNELpg= 46 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 47 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 48 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= 49 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 50 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 51 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 52 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 53 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 54 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 55 | -------------------------------------------------------------------------------- /jiebago.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package jiebago 6 | 7 | import ( 8 | "strings" 9 | 10 | "github.com/wangshizebin/jiebago/tokenizer" 11 | ) 12 | 13 | type JieBaGo struct { 14 | } 15 | 16 | func NewJieBaGo(path ...string) *JieBaGo { 17 | configPath := "" 18 | if len(path) > 0 { 19 | configPath = path[0] 20 | } 21 | tokenizer.Init(configPath) 22 | jieBaGo := &JieBaGo{} 23 | return jieBaGo 24 | } 25 | 26 | func (g *JieBaGo) Cut(sentence string) []string { 27 | return g.CutAccurate(sentence) 28 | } 29 | 30 | func (g *JieBaGo) CutFull(s string) []string { 31 | wordsRet := make([]string, 0, tokenizer.DefaultWordsLen) 32 | 33 | segments := tokenizer.SplitTextSeg(s) 34 | for _, segment := range segments { 35 | if strings.Trim(segment, " ") == "" { 36 | continue 37 | } 38 | if tokenizer.IsTextChars(segment) { 39 | tokenizer.CutFullW(segment, &wordsRet) 40 | } else { 41 | tokenizer.CutSymbolW(segment, &wordsRet) 42 | } 43 | } 44 | return wordsRet 45 | } 46 | 47 | func (g *JieBaGo) CutAccurate(s string) []string { 48 | wordsRet := make([]string, 0, tokenizer.DefaultWordsLen) 49 | 50 | segments := tokenizer.SplitTextSeg(s) 51 | for _, segment := range segments { 52 | if strings.Trim(segment, " ") == "" { 53 | continue 54 | } 55 | if tokenizer.IsTextChars(segment) { 56 | tokenizer.CutAccurateW(segment, &wordsRet) 57 | } else { 58 | tokenizer.CutSymbolW(segment, &wordsRet) 59 | } 60 | } 61 | 62 | return wordsRet 63 | } 64 | 65 | func (g *JieBaGo) CutNoHMM(s string) []string { 66 | wordsRet := make([]string, 0, tokenizer.DefaultWordsLen) 67 | 68 | segments := tokenizer.SplitTextSeg(s) 69 | for _, segment := range segments { 70 | if strings.Trim(segment, " ") == "" { 71 | continue 72 | } 73 | if tokenizer.IsTextChars(segment) { 74 | tokenizer.CutNoHMMW(segment, &wordsRet) 75 | } else { 76 | tokenizer.CutSymbolW(segment, &wordsRet) 77 | } 78 | } 79 | 80 | return wordsRet 81 | } 82 | 83 | func (g *JieBaGo) CutForSearch(s string) []string { 84 | wordsRet := make([]string, 0, tokenizer.DefaultWordsLen) 85 | 
86 | segments := tokenizer.SplitTextSeg(s) 87 | for _, segment := range segments { 88 | if strings.Trim(segment, " ") == "" { 89 | continue 90 | } 91 | if tokenizer.IsTextChars(segment) { 92 | g.cutForSearchW(segment, &wordsRet) 93 | } else { 94 | tokenizer.CutSymbolW(segment, &wordsRet) 95 | } 96 | } 97 | 98 | return wordsRet 99 | } 100 | 101 | func (g *JieBaGo) cutForSearchW(s string, words *[]string) { 102 | dictionary := tokenizer.GetDictionary() 103 | 104 | for _, word := range g.CutAccurate(s) { 105 | wordRune := []rune(word) 106 | if len(wordRune) > 2 { 107 | for i := 0; i < len(wordRune)-1; i++ { 108 | s := string(wordRune[i : i+2]) 109 | if dictionary.Exist(s) { 110 | *words = append(*words, s) 111 | } 112 | } 113 | } 114 | if len(wordRune) > 3 { 115 | for i := 0; i < len(wordRune)-2; i++ { 116 | s := string(wordRune[i : i+3]) 117 | if dictionary.Exist(s) { 118 | *words = append(*words, s) 119 | } 120 | } 121 | } 122 | *words = append(*words, word) 123 | } 124 | } 125 | 126 | func (g *JieBaGo) ExtractKeywords(s string, count int) []string { 127 | keywords := tokenizer.GetTFIDF().ExtractKeywords(s, count, false) 128 | return keywords.([]string) 129 | } 130 | 131 | func (g *JieBaGo) ExtractKeywordsWeight(s string, count int) []tokenizer.Keyword { 132 | keywords := tokenizer.GetTFIDF().ExtractKeywords(s, count, true) 133 | return []tokenizer.Keyword(keywords.(tokenizer.Keywords)) 134 | } 135 | 136 | func (g *JieBaGo) AddDictWord(word string, freq int, prop string) (exist bool, err error) { 137 | return tokenizer.GetDictionary().AddWord(word, freq, prop) 138 | } 139 | 140 | func (g *JieBaGo) AddStopWord(word string) (exist bool, err error) { 141 | return tokenizer.GetTFIDF().AddStopWord(word) 142 | } 143 | -------------------------------------------------------------------------------- /jiebago_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 
2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package jiebago 6 | 7 | import ( 8 | "strings" 9 | "testing" 10 | ) 11 | 12 | var ( 13 | sentence = "Shell位于用户与系统之间,用来帮助用户与操作系统进行沟通。" 14 | resultTest = []string{"Shell", "操作系统", "沟通"} 15 | jieBaGo = NewJieBaGo() 16 | ) 17 | 18 | func TestCut(t *testing.T) { 19 | testCutWords(jieBaGo.Cut, t) 20 | } 21 | 22 | func TestCutFull(t *testing.T) { 23 | testCutWords(jieBaGo.CutFull, t) 24 | } 25 | 26 | func TestCutAccurate(t *testing.T) { 27 | testCutWords(jieBaGo.CutAccurate, t) 28 | } 29 | 30 | func TestCutNoHMM(t *testing.T) { 31 | testCutWords(jieBaGo.CutNoHMM, t) 32 | } 33 | 34 | func TestCutForSearch(t *testing.T) { 35 | testCutWords(jieBaGo.CutForSearch, t) 36 | } 37 | 38 | func TestExtractKeywords(t *testing.T) { 39 | t.Log("原始语句: " + sentence) 40 | 41 | words := jieBaGo.ExtractKeywords(sentence, 20) 42 | t.Log("提取关键字:", strings.Join(words, "/")) 43 | for _, word := range resultTest { 44 | ok := false 45 | for _, v := range words { 46 | if word == v { 47 | ok = true 48 | } 49 | } 50 | if !ok { 51 | t.Error(word + " not pass") 52 | } else { 53 | t.Log(word + " OK") 54 | } 55 | } 56 | } 57 | 58 | func TestExtractKeywordsWeight(t *testing.T) { 59 | t.Log("原始语句: " + sentence) 60 | 61 | words := jieBaGo.ExtractKeywordsWeight(sentence, 20) 62 | t.Log("提取关键字:", words) 63 | for _, word := range resultTest { 64 | ok := false 65 | for _, v := range words { 66 | if word == v.Word { 67 | ok = true 68 | } 69 | } 70 | if !ok { 71 | t.Error(word + " not pass") 72 | } else { 73 | t.Log(word + " OK") 74 | } 75 | } 76 | } 77 | 78 | func TestAddWordToDict(t *testing.T) { 79 | words := []string{"编程宝库", "王泽宾", "codebaoku"} 80 | t.Log("加入词典:", words) 81 | for _, word := range words { 82 | exist, err := jieBaGo.AddDictWord(word, 3, "n") 83 | if err != nil { 84 | t.Error(err) 85 | } else { 86 | if exist { 87 | t.Log(word + " 已经存在") 88 | } else { 89 | t.Log(word + " 添加入库") 
90 | } 91 | } 92 | } 93 | } 94 | 95 | func TestAddStopWord(t *testing.T) { 96 | words := []string{"the", "of", "is"} 97 | t.Log("加入停止词:", words) 98 | for _, word := range words { 99 | exist, err := jieBaGo.AddStopWord(word) 100 | if err != nil { 101 | t.Error(err) 102 | } else { 103 | if exist { 104 | t.Log(word + " 已经存在") 105 | } else { 106 | t.Log(word + " 添加入库") 107 | } 108 | } 109 | } 110 | } 111 | 112 | func testCutWords(f func(string) []string, t *testing.T) { 113 | t.Log("原始语句: " + sentence) 114 | 115 | wordsResult := f(sentence) 116 | t.Log("分词结果:", strings.Join(wordsResult, "/")) 117 | for _, word := range resultTest { 118 | ok := false 119 | for _, v := range wordsResult { 120 | if word == v { 121 | ok = true 122 | } 123 | } 124 | if !ok { 125 | t.Error(word + " not pass") 126 | } else { 127 | t.Log(word + " OK") 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /tokenizer/analyzer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
package tokenizer

import (
	"bufio"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"
)

const (
	// DefaultIDFSize is the initial capacity used when collecting IDF
	// frequencies from the dictionary file.
	DefaultIDFSize = 300000
)

// tfIDF is the package-wide TF-IDF extractor singleton; its frequency
// and stop-word tables are empty until the dictionary files are loaded.
var tfIDF = &TFIDF{
	idfLoader: &IDFLoader{
		idfFreq: make(map[string]float64),
	},
	stopWords: &StopWords{
		dictMap: make(map[string]struct{}),
	},
}

// Keyword is one extracted keyword together with its TF-IDF weight.
type Keyword struct {
	Word   string  `json:"word"`
	Weight float64 `json:"weight"`
}

// Keywords implements sort.Interface, ordering by descending weight.
type Keywords []Keyword

func (k Keywords) Len() int {
	return len(k)
}

// Less sorts higher weights first (descending order for sort.Sort).
func (k Keywords) Less(i, j int) bool {
	if k[i].Weight > k[j].Weight {
		return true
	}
	return false
}

func (k Keywords) Swap(i, j int) {
	k[i], k[j] = k[j], k[i]
}

// StopWords is a lowercase stop-word set guarded by an RWMutex so it
// can be read during extraction while words are added concurrently.
type StopWords struct {
	dictMap map[string]struct{}
	mu      sync.RWMutex
}

// load reads whitespace-separated stop words from fileStopWords into
// the set (lowercased), holding the write lock for the whole load.
func (d *StopWords) load(fileStopWords string) error {
	d.mu.Lock()
	defer d.mu.Unlock()

	timeStart := time.Now()

	f, err := os.Open(fileStopWords)
	if err != nil {
		log.Println(err)
		return errors.New("unable to load the stop words library:" + filepath.Base(fileStopWords))
	}
	defer func() {
		_ = f.Close()
	}()

	itemCount := 0
	reader := bufio.NewReader(f)
	for {
		// ReadString returns the partial final line together with
		// io.EOF, so EOF is handled after processing the line.
		line, err := reader.ReadString('\n')
		if err != nil && err != io.EOF {
			break
		}

		elem := strings.Fields(line)
		if len(elem) == 0 {
			if err == io.EOF {
				break
			}
			continue
		}

		// A line may carry several words; each is stored lowercased.
		for _, v := range elem {
			if v == "" {
				continue
			}

			itemCount++
			d.dictMap[strings.ToLower(v)] = struct{}{}
		}

		if err == io.EOF {
			break
		}
	}

	log.Printf("%v stop words are loaded, and take %v\n",
		itemCount, time.Now().Sub(timeStart))
	return nil
}

// exist reports whether s (case-insensitive) is a stop word.
func (d *StopWords) exist(s string) bool {
	d.mu.RLock()
	defer d.mu.RUnlock()
	_, ok := d.dictMap[strings.ToLower(s)]
	return ok
}

// add inserts s into the in-memory set and appends it to the user
// stop-word file so the addition survives restarts. exist reports that
// the word was already present (in which case nothing is written).
func (d *StopWords) add(s string) (exist bool, err error) {
	d.mu.Lock()
	defer d.mu.Unlock()

	s = strings.ToLower(strings.TrimSpace(s))
	if s == "" {
		return
	}
	if _, ok := d.dictMap[s]; ok {
		exist = true
		return
	}

	// The user file lives next to the standard stop-word file.
	stopWordsStdFile, err := GetDictFile(StopWordsStdFile)
	if err != nil {
		return
	}

	stopWordsUserFile := filepath.Dir(stopWordsStdFile)
	stopWordsUserFile += string(filepath.Separator) + StopWordsUserFile
	f, err := os.OpenFile(stopWordsUserFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0666)
	if err != nil {
		return
	}
	defer func() {
		_ = f.Close()
	}()

	stat, err := f.Stat()
	if err != nil {
		return
	}

	// If the file does not already end in a newline, prepend one so the
	// new word starts on its own line.
	line := ""
	n := stat.Size()
	if n > 0 {
		buf := make([]byte, 1, 1)
		_, err = f.ReadAt(buf, n-1)
		if err != nil {
			return
		}
		if buf[0] != '\n' {
			line += "\n"
		}
	}
	line += s + "\n"
	_, err = f.Write([]byte(line))
	if err != nil {
		log.Println(err)
		return
	}

	// Only update the in-memory set after the file write succeeded.
	d.dictMap[s] = struct{}{}
	return
}

// IDFLoader holds per-word inverse document frequencies plus their
// median, which is used as the fallback weight for unknown words.
type IDFLoader struct {
	idfFreq   map[string]float64
	idfMedian float64
}

// load parses an IDF dictionary of "word frequency" lines into
// idfFreq (keys lowercased) and records the median frequency.
func (d *IDFLoader) load(idfFile string) error {
	timeStart := time.Now()

	f, err := os.Open(idfFile)
	if err != nil {
		log.Println(err)
		return errors.New("unable to load the IDF library")
	}
	defer func() {
		_ = f.Close()
	}()

	itemCount := 0
	freqSlice := make([]float64, 0, DefaultIDFSize)
	reader := bufio.NewReader(f)
	for {
		line, err := reader.ReadString('\n')
		if err != nil && err != io.EOF {
			break
		}

		// Only well-formed "word frequency" pairs are accepted.
		elem := strings.Fields(line)
		if len(elem) != 2 {
			if err == io.EOF {
				break
			}
			continue
		}

		itemCount++
		// An unparsable frequency is logged and treated as zero rather
		// than aborting the whole load.
		freq, err := strconv.ParseFloat(elem[1], 64)
		if err != nil {
			log.Println(err)
			freq = float64(0)
		}

		d.idfFreq[strings.ToLower(elem[0])] = freq
		freqSlice = append(freqSlice, freq)

		if err == io.EOF {
			break
		}
	}

	sort.Float64s(freqSlice)
	// NOTE(review): this indexes freqSlice without checking itemCount,
	// so an empty IDF file would panic here — confirm that callers
	// guarantee a non-empty dictionary.
	d.idfMedian = freqSlice[itemCount/2]

	log.Printf("%v idfs are loaded, and take %v\n",
		itemCount, time.Now().Sub(timeStart))
	return nil
}

// TFIDF combines the IDF table and the stop-word set to extract
// keywords from text.
type TFIDF struct {
	idfLoader *IDFLoader
	stopWords *StopWords
}

// ExtractKeywords returns up to count keywords of s ranked by TF-IDF
// weight. Callers elsewhere in the package type-assert the result to
// Keywords when withWeight is true and []string otherwise.
func (t *TFIDF) ExtractKeywords(s string, count int, withWeight bool) interface{} {
	// Segment the text: accurate cutting for textual segments, symbol
	// cutting for the rest.
	words := make([]string, 0, DefaultWordsLen)
	segments := SplitTextSeg(s)
	for _, segment := range segments {
		if IsTextChars(segment) {
			CutAccurateW(segment, &words)
		} else {
			CutSymbolW(segment, &words)
		}
	}

	freqMap, freqMedian := t.idfLoader.idfFreq, t.idfLoader.idfMedian

	// Count term frequencies, skipping single-rune words and stop
	// words (ExistStopWord is defined elsewhere in the package —
	// presumably a wrapper over stopWords.exist).
	freqTotal := 0
	freqWords := make(map[string]int)
	for _, word := range words {
		if len([]rune(word)) < 2 || t.ExistStopWord(word) {
			continue
		}
		if val, ok := freqWords[word]; ok {
			freqWords[word] = val + 1
			freqTotal++
			continue
		}
		freqTotal++
		freqWords[word] = 1
	}

	// Weight = term frequency * (IDF / total terms); the median IDF is
	// the fallback for words absent from the IDF table.
	i := 0
	wordsRet := make(Keywords, len(freqWords))
	for word, s := range freqWords {
		val := freqMedian
		if v, ok := freqMap[word]; ok {
			val = v
		}
		wordsRet[i] = Keyword{
			Word:   word,
			Weight: float64(s) * (val / float64(freqTotal)),
		}
		i++
	}

	// Sort by descending weight and truncate; count == 0 means the
	// default of 20 keywords.
	sort.Sort(wordsRet)
	if count == 0 {
		count = 20
	}
	if count < len(wordsRet) {
		wordsRet = wordsRet[:count]
	}
	if
withWeight { 286 | return wordsRet 287 | } 288 | stringSet := make([]string, len(wordsRet)) 289 | for i, v := range wordsRet { 290 | stringSet[i] = v.Word 291 | } 292 | return stringSet 293 | } 294 | 295 | func (t *TFIDF) ExistStopWord(word string) bool { 296 | return t.stopWords.exist(word) 297 | } 298 | 299 | func (t *TFIDF) AddStopWord(word string) (exist bool, err error) { 300 | return t.stopWords.add(word) 301 | } 302 | 303 | func InitTFIDF() { 304 | // load the tf-idf library 305 | idfFile, err := GetDictFile(IDFStdFile) 306 | if err != nil { 307 | log.Panic(err) 308 | } 309 | 310 | err = tfIDF.idfLoader.load(idfFile) 311 | if err != nil { 312 | log.Panic(err) 313 | } 314 | 315 | // load the standard stop words library 316 | stopWordsStdFile, err := GetDictFile(StopWordsStdFile) 317 | if err == nil { 318 | tfIDF.stopWords.load(stopWordsStdFile) 319 | } 320 | 321 | // load the user-defined stop words library 322 | stopWordsUserFile, err := GetDictFile(StopWordsUserFile) 323 | if err == nil { 324 | tfIDF.stopWords.load(stopWordsUserFile) 325 | } 326 | } 327 | 328 | func GetTFIDF() *TFIDF { 329 | return tfIDF 330 | } 331 | -------------------------------------------------------------------------------- /tokenizer/common.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
package tokenizer

import (
	"errors"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strings"
)

const (
	DictStdFile       = "dict_std_utf8.txt"        // standard dictionary file
	DictUserFile      = "dict_user_utf8.txt"       // user-defined dictionary file
	IDFStdFile        = "idf_std_utf8.txt"         // standard IDF file
	StopWordsStdFile  = "stop_words_std_utf8.txt"  // standard stop words file
	StopWordsUserFile = "stop_words_user_utf8.txt" // user-defined stop words file

	RegExpEnglish   = "([a-zA-Z0-9])+"                     // English regular expression
	RegExpChinese   = "([\u4e00-\u9fa5])+"                 // Chinese regular expression
	RegExpText      = "([\u4e00-\u9fa5a-zA-Z0-9+#&._%-])+" // text regular expression
	RegExpNumber    = "[a-zA-Z0-9]+(\\.\\d+)?%?"           // numeric regular expression
	RegExpDelimiter = "[\\r\\n\\s\\t]"                     // delimiter regular expression

	DefaultWordsLen = 32 // default slice size of the word segmentation result
)

var (
	reEnglish, _   = regexp.Compile(RegExpEnglish)   // precompiled English regular expression
	reChinese, _   = regexp.Compile(RegExpChinese)   // precompiled Chinese regular expression
	reText, _      = regexp.Compile(RegExpText)      // precompiled text regular expression
	reNumber, _    = regexp.Compile(RegExpNumber)    // precompiled numeric regular expression
	reDelimiter, _ = regexp.Compile(RegExpDelimiter) // precompiled delimiter regular expression

	dictPath string // dictionary directory, default is current work directory
)

// IsEnglishChars reports whether s contains at least one ASCII letter or
// digit (regexp.Match searches anywhere in the string).
func IsEnglishChars(s string) bool {
	return reEnglish.Match([]byte(s))
}

// IsChineseChars reports whether s contains at least one CJK character.
func IsChineseChars(s string) bool {
	return reChinese.Match([]byte(s))
}

// IsTextChars reports whether s contains any text character: CJK, ASCII
// alphanumeric, or the in-word punctuation +#&._%-.
func IsTextChars(s string) bool {
	return reText.Match([]byte(s))
}

// SplitTextSeg splits a sentence into alternating text / non-text segments.
func SplitTextSeg(s string) []string {
	return splitRegExp(s, reText)
}
according to Chinese 61 | func SplitChineseSeg(s string) []string { 62 | return splitRegExp(s, reChinese) 63 | } 64 | 65 | // Split sentence according to number 66 | func SplitNumberSeg(s string) []string { 67 | return splitRegExp(s, reNumber) 68 | } 69 | 70 | // Get the dictionary file directory 71 | func GetDictFile(file string) (string, error) { 72 | errFileNotFound := errors.New("unable to load the dictionary file") 73 | 74 | dictPath := "" 75 | if GetDictPath() != "" { 76 | dictPath = filepath.Join(GetDictPath(), file) 77 | if !fileExist(dictPath) { 78 | return "", errFileNotFound 79 | } 80 | return dictPath, nil 81 | } 82 | 83 | dictFile := fmt.Sprintf("%cdictionary%c%s", os.PathSeparator, os.PathSeparator, file) 84 | 85 | // check exe file directory 86 | path, err := filepath.Abs(filepath.Dir(os.Args[0])) 87 | if err != nil { 88 | log.Println(err) 89 | return "", errFileNotFound 90 | } 91 | 92 | dictPath = path + dictFile 93 | if !fileExist(dictPath) { 94 | path, err = os.Getwd() 95 | if err != nil { 96 | log.Println(err) 97 | return "", errFileNotFound 98 | } 99 | } 100 | 101 | // check work directory 102 | dictPath = path + dictFile 103 | if !fileExist(dictPath) { 104 | path = getParentPath(path) 105 | if path == "" { 106 | return "", errFileNotFound 107 | } 108 | } 109 | 110 | // check parent of work directory 111 | dictPath = path + dictFile 112 | if !fileExist(dictPath) { 113 | return "", errFileNotFound 114 | } 115 | 116 | return dictPath, nil 117 | } 118 | 119 | // Split sentence according to the specified regular expression 120 | func splitRegExp(s string, re *regexp.Regexp) []string { 121 | sentences := make([]string, 0) 122 | 123 | n := len(s) 124 | prePos := 0 125 | for { 126 | loc := re.FindStringIndex(s[prePos:]) 127 | if loc == nil { 128 | sentences = append(sentences, s[prePos:]) 129 | return sentences 130 | } 131 | loc[0] += prePos 132 | loc[1] += prePos 133 | if loc[0] > prePos { 134 | sentences = append(sentences, s[prePos:loc[0]]) 135 | } 
136 | sentences = append(sentences, s[loc[0]:loc[1]]) 137 | prePos = loc[1] 138 | if prePos == n { 139 | break 140 | } 141 | } 142 | return sentences 143 | } 144 | 145 | func fileExist(path string) bool { 146 | _, err := os.Lstat(path) 147 | return !os.IsNotExist(err) 148 | } 149 | 150 | func getParentPath(path string) string { 151 | return substrRune(path, 0, strings.LastIndex(path, string(os.PathSeparator))) 152 | } 153 | 154 | func substrRune(s string, pos, length int) string { 155 | runes := []rune(s) 156 | l := pos + length 157 | if l > len(runes) { 158 | l = len(runes) 159 | } 160 | return string(runes[pos:l]) 161 | } 162 | 163 | func GetDictPath() string { 164 | return dictPath 165 | } 166 | 167 | func SetDictPath(path string) { 168 | dictPath = path 169 | } 170 | -------------------------------------------------------------------------------- /tokenizer/cutword.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package tokenizer 6 | 7 | func CutFullW(s string, words *[]string) { 8 | bufEnglish := "" 9 | pos := -1 10 | 11 | sentence := NewSentence(s) 12 | dag := sentence.GetDAG() 13 | for k, listPos := range dag { 14 | if len(bufEnglish) > 0 && !IsEnglishChars(sentence.GetChar(k)) { 15 | *words = append(*words, bufEnglish) 16 | bufEnglish = "" 17 | } 18 | 19 | if len(listPos) == 1 && k > pos { 20 | word := sentence.GetWord(k, listPos[0]+1) 21 | if IsEnglishChars(word) { 22 | bufEnglish += word 23 | } 24 | if len(bufEnglish) == 0 { 25 | *words = append(*words, word) 26 | } 27 | pos = listPos[0] 28 | } else { 29 | for _, j := range listPos { 30 | if j > k { 31 | *words = append(*words, sentence.GetWord(k, j+1)) 32 | pos = j 33 | } 34 | } 35 | } 36 | } 37 | 38 | if len(bufEnglish) > 0 { 39 | *words = append(*words, bufEnglish) 40 | } 41 | } 42 | 43 | func CutAccurateW(s string, words *[]string) { 44 | sentence := NewSentence(s) 45 | route := sentence.CalcDAG() 46 | dictionary := GetDictionary() 47 | buf := "" 48 | for i := 0; i < sentence.Len(); { 49 | y := route[i].Y + 1 50 | leftWord := sentence.GetWord(i, y) 51 | if y-i == 1 { 52 | buf += leftWord 53 | i = y 54 | continue 55 | } 56 | 57 | if len(buf) > 0 { 58 | if len([]rune(buf)) == 1 { 59 | *words = append(*words, buf) 60 | } else { 61 | if !dictionary.Exist(buf) { 62 | wordsRecognized := GetFinalSeg().Cut(buf) 63 | for _, w := range wordsRecognized { 64 | *words = append(*words, w) 65 | } 66 | } else { 67 | for _, v := range buf { 68 | *words = append(*words, string(v)) 69 | } 70 | } 71 | } 72 | buf = "" 73 | } 74 | *words = append(*words, leftWord) 75 | i = y 76 | } 77 | 78 | if len(buf) > 0 { 79 | if len([]rune(buf)) == 1 { 80 | *words = append(*words, buf) 81 | } else { 82 | if !dictionary.Exist(buf) { 83 | wordsRecognized := GetFinalSeg().Cut(buf) 84 | for _, w := range wordsRecognized { 85 | *words = append(*words, w) 86 | } 87 | } else { 88 | for _, v := range buf { 89 | *words = append(*words, string(v)) 
90 | } 91 | } 92 | } 93 | } 94 | } 95 | 96 | func CutNoHMMW(s string, words *[]string) { 97 | sentence := NewSentence(s) 98 | route := sentence.CalcDAG() 99 | 100 | bufEnglish := "" 101 | for i := 0; i < sentence.Len(); { 102 | y := route[i].Y + 1 103 | leftWord := sentence.GetWord(i, y) 104 | if IsEnglishChars(leftWord) && len(leftWord) == 1 { 105 | bufEnglish += leftWord 106 | i = y 107 | continue 108 | } 109 | 110 | if len(bufEnglish) > 0 { 111 | *words = append(*words, bufEnglish) 112 | bufEnglish = "" 113 | } 114 | *words = append(*words, leftWord) 115 | i = y 116 | } 117 | 118 | if len(bufEnglish) > 0 { 119 | *words = append(*words, bufEnglish) 120 | } 121 | } 122 | 123 | func CutSymbolW(s string, words *[]string) { 124 | n := len(s) 125 | if n == 0 { 126 | return 127 | } 128 | 129 | buf := "" 130 | word := "" 131 | prePos := 0 132 | for { 133 | loc := reDelimiter.FindStringIndex(s[prePos:]) 134 | if loc == nil { 135 | word = s[prePos:] 136 | prePos = n 137 | } else { 138 | loc[0] += prePos 139 | loc[1] += prePos 140 | if loc[0] > prePos { 141 | buf = s[prePos:loc[0]] 142 | } 143 | 144 | word = s[loc[0]:loc[1]] 145 | prePos = loc[1] 146 | } 147 | 148 | if buf == "\r" && word == "\n" { 149 | *words = append(*words, "\r\n") 150 | buf = "" 151 | } else { 152 | if buf != "" { 153 | *words = append(*words, buf) 154 | } 155 | if word != "" { 156 | buf = word 157 | } 158 | } 159 | 160 | if prePos == n { 161 | if buf != "" { 162 | *words = append(*words, buf) 163 | } 164 | return 165 | } 166 | } 167 | } 168 | 169 | func Init(dictPath string) { 170 | SetDictPath(dictPath) 171 | InitDictionary() 172 | InitTFIDF() 173 | InitFSToken() 174 | } 175 | -------------------------------------------------------------------------------- /tokenizer/dictionary.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 
2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package tokenizer 6 | 7 | import ( 8 | "bufio" 9 | "errors" 10 | "io" 11 | "log" 12 | "os" 13 | "path/filepath" 14 | "strconv" 15 | "strings" 16 | "sync" 17 | "time" 18 | ) 19 | 20 | var dictionary = &Dictionary{ 21 | dict: make(map[string]int), 22 | } 23 | 24 | type Dictionary struct { 25 | dict map[string]int 26 | mu sync.RWMutex 27 | tf int // total freq 28 | } 29 | 30 | func (d *Dictionary) Exist(word string) bool { 31 | d.mu.RLock() 32 | defer d.mu.RUnlock() 33 | _, ok := d.dict[strings.ToLower(word)] 34 | return ok 35 | } 36 | 37 | func (d *Dictionary) GetWord(word string) (int, bool) { 38 | d.mu.RLock() 39 | defer d.mu.RUnlock() 40 | v, ok := d.dict[strings.ToLower(word)] 41 | return v, ok 42 | } 43 | 44 | func (d *Dictionary) GetTotalFreq() float64 { 45 | d.mu.RLock() 46 | defer d.mu.RUnlock() 47 | return float64(d.tf) 48 | } 49 | 50 | func (d *Dictionary) AddWord(word string, freq int, prop string) (exist bool, err error) { 51 | d.mu.Lock() 52 | defer d.mu.Unlock() 53 | 54 | if _, ok := d.dict[strings.ToLower(word)]; ok { 55 | exist = true 56 | return 57 | } 58 | 59 | dictStdFile, err := GetDictFile(DictStdFile) 60 | if err != nil { 61 | return 62 | } 63 | 64 | dictUserFile := filepath.Dir(dictStdFile) 65 | dictUserFile += string(filepath.Separator) + DictUserFile 66 | 67 | f, err := os.OpenFile(dictUserFile, os.O_CREATE|os.O_RDWR|os.O_APPEND, 0666) 68 | if err != nil { 69 | return 70 | } 71 | defer func() { 72 | _ = f.Close() 73 | }() 74 | 75 | stat, err := f.Stat() 76 | if err != nil { 77 | return 78 | } 79 | 80 | line := "" 81 | n := stat.Size() 82 | if n > 0 { 83 | buf := make([]byte, 1, 1) 84 | _, err = f.ReadAt(buf, n-1) 85 | if err != nil { 86 | return 87 | } 88 | if buf[0] != '\n' { 89 | line += "\n" 90 | } 91 | } 92 | line += word + " " + strconv.Itoa(freq) + " " + prop + "\n" 93 | _, err = f.Write([]byte(line)) 94 | if err != 
nil { 95 | log.Println(err) 96 | return 97 | } 98 | 99 | d.dict[strings.ToLower(word)] = freq 100 | d.tf += freq 101 | return 102 | } 103 | 104 | func (d *Dictionary) load(fileDict string) error { 105 | d.mu.Lock() 106 | defer d.mu.Unlock() 107 | 108 | timeStart := time.Now() 109 | 110 | f, err := os.Open(fileDict) 111 | if err != nil { 112 | log.Println(err) 113 | return errors.New("unable to load the dictionary library:" + filepath.Base(fileDict)) 114 | } 115 | defer func() { 116 | _ = f.Close() 117 | }() 118 | 119 | itemCount := 0 120 | reader := bufio.NewReader(f) 121 | for { 122 | line, err := reader.ReadString('\n') 123 | if err != nil && err != io.EOF { 124 | break 125 | } 126 | 127 | elem := strings.Fields(line) 128 | if len(elem) != 3 { 129 | if err == io.EOF { 130 | break 131 | } 132 | continue 133 | } 134 | 135 | itemCount++ 136 | nFreq, err := strconv.Atoi(elem[1]) 137 | if err != nil { 138 | nFreq = 0 139 | } 140 | d.tf += nFreq 141 | d.dict[strings.ToLower(elem[0])] = nFreq 142 | 143 | runeWord := []rune(elem[0]) 144 | for i := range runeWord { 145 | s := strings.ToLower(string(runeWord[:i+1])) 146 | if _, ok := d.dict[s]; !ok { 147 | d.dict[s] = 0 148 | } 149 | } 150 | 151 | if err == io.EOF { 152 | break 153 | } 154 | } 155 | if len(d.dict) == 0 || d.tf <= 0 { 156 | return errors.New("unable to load the dictionary library:" + filepath.Base(fileDict)) 157 | } 158 | 159 | log.Printf("%v words are loaded in dictionary "+filepath.Base(fileDict)+", and take %v\n", 160 | itemCount, time.Now().Sub(timeStart)) 161 | return nil 162 | } 163 | 164 | func InitDictionary() { 165 | // load the standard dictionary 166 | dictStdFile, err := GetDictFile(DictStdFile) 167 | if err != nil { 168 | log.Panic(err) 169 | } 170 | err = dictionary.load(dictStdFile) 171 | if err != nil { 172 | log.Panic(err) 173 | } 174 | 175 | // load the user-defined dictionary 176 | dictUserFile, err := GetDictFile(DictUserFile) 177 | if err == nil { 178 | dictionary.load(dictUserFile) 179 
| } 180 | } 181 | 182 | func GetDictionary() *Dictionary { 183 | return dictionary 184 | } 185 | -------------------------------------------------------------------------------- /tokenizer/fstokenizer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 4 | 5 | package tokenizer 6 | 7 | import ( 8 | "encoding/json" 9 | "io/ioutil" 10 | "log" 11 | "sync" 12 | ) 13 | 14 | const ( 15 | minFloat = -3.14e100 16 | finalSegProbStart = "fs_pbstart.json" 17 | finalSegProbTrans = "fs_pbtrans.json" 18 | finalSegProbEmit = "fs_pbemit.json" 19 | ) 20 | 21 | var ( 22 | fsTokenizer = &FinalSeg{ 23 | start: make(map[string]float64), 24 | trans: make(map[string]map[string]float64), 25 | emit: make(map[string]map[string]float64), 26 | 27 | forceSplitWords: &forceSplitWords{ 28 | dict: make(map[string]struct{}), 29 | }, 30 | } 31 | 32 | prevStatus = map[string][]string{ 33 | "B": {"E", "S"}, 34 | "M": {"M", "B"}, 35 | "S": {"S", "E"}, 36 | "E": {"B", "M"}, 37 | } 38 | states = []string{"B", "M", "E", "S"} 39 | ) 40 | 41 | type forceSplitWords struct { 42 | dict map[string]struct{} 43 | mutex sync.RWMutex 44 | } 45 | 46 | func (s *forceSplitWords) exist(word string) bool { 47 | s.mutex.RLock() 48 | defer s.mutex.RUnlock() 49 | _, ok := s.dict[word] 50 | return ok 51 | } 52 | 53 | func (s *forceSplitWords) addForceSplit(word string) { 54 | s.mutex.Lock() 55 | defer s.mutex.Unlock() 56 | s.dict[word] = struct{}{} 57 | } 58 | 59 | type FinalSeg struct { 60 | start map[string]float64 61 | trans map[string]map[string]float64 62 | emit map[string]map[string]float64 63 | 64 | forceSplitWords *forceSplitWords 65 | } 66 | 67 | func (fs *FinalSeg) Cut(sentence string) []string { 68 | wordsRet := make([]string, 0, DefaultWordsLen) 69 | 70 | segments := SplitChineseSeg(sentence) 71 | for _, segment 
:= range segments { 72 | if IsChineseChars(segment) { 73 | words := fs.cut(segment) 74 | for _, v := range words { 75 | if fs.exist(v) { 76 | for _, c := range v { 77 | wordsRet = append(wordsRet, string(c)) 78 | } 79 | } else { 80 | wordsRet = append(wordsRet, v) 81 | } 82 | } 83 | } else { 84 | words := SplitNumberSeg(segment) 85 | wordsRet = append(wordsRet, words...) 86 | } 87 | } 88 | return wordsRet 89 | } 90 | 91 | func (fs *FinalSeg) getMatrixVal(name, key, word string) float64 { 92 | var m map[string]map[string]float64 93 | if name == "emit" { 94 | m = fs.emit 95 | } else if name == "trans" { 96 | m = fs.trans 97 | } else { 98 | return minFloat 99 | } 100 | val, ok := m[key][word] 101 | if !ok { 102 | val = minFloat 103 | } 104 | return val 105 | } 106 | 107 | func (fs *FinalSeg) viterbi(sentence string) []string { 108 | rs := []rune(sentence) 109 | n := len(rs) 110 | if n == 0 { 111 | return nil 112 | } 113 | 114 | v := make([]map[string]float64, n) 115 | for i := 0; i < n; i++ { 116 | v[i] = make(map[string]float64) 117 | } 118 | path := make(map[string][]string, 0) 119 | 120 | word := string(rs[0]) 121 | for _, y := range states { 122 | v[0][y] = fs.start[y] + fs.getMatrixVal("emit", y, word) 123 | path[y] = []string{y} 124 | } 125 | 126 | for i := 1; i < len(rs); i++ { 127 | word = string(rs[i]) 128 | 129 | pathNew := make(map[string][]string, 0) 130 | for _, y := range states { 131 | st := "" 132 | pb := minFloat 133 | for _, y0 := range prevStatus[y] { 134 | m := v[i-1][y0] + fs.getMatrixVal("trans", y0, y) + fs.getMatrixVal("emit", y, word) 135 | if st == "" { 136 | st = y0 137 | pb = m 138 | } else if m > pb { 139 | st = y0 140 | pb = m 141 | } 142 | } 143 | v[i][y] = pb 144 | pathNew[y] = append(path[st], y) 145 | } 146 | path = pathNew 147 | } 148 | 149 | state := "E" 150 | prob := v[len(rs)-1]["E"] 151 | if v[len(rs)-1]["S"] > prob { 152 | prob = v[len(rs)-1]["S"] 153 | state = "S" 154 | } 155 | return path[state] 156 | } 157 | 158 | func (fs 
*FinalSeg) cut(sentence string) []string { 159 | rs := []rune(sentence) 160 | wordsRet := make([]string, 0) 161 | posList := fs.viterbi(sentence) 162 | 163 | begin, next := 0, 0 164 | for i, word := range rs { 165 | pos := posList[i] 166 | if pos == "B" { 167 | begin = i 168 | } else if pos == "E" { 169 | wordsRet = append(wordsRet, string(rs[begin:i+1])) 170 | next = i + 1 171 | } else if pos == "S" { 172 | wordsRet = append(wordsRet, string(word)) 173 | next = i + 1 174 | } 175 | } 176 | if next < len(rs) { 177 | wordsRet = append(wordsRet, string(rs[next:])) 178 | } 179 | 180 | return wordsRet 181 | } 182 | 183 | func (fs *FinalSeg) exist(word string) bool { 184 | return fs.forceSplitWords.exist(word) 185 | } 186 | 187 | func readJsonFromFile(fn string, fs interface{}) { 188 | fileProbeStart, err := GetDictFile(fn) 189 | if err != nil { 190 | log.Panic(err) 191 | } 192 | data, err := ioutil.ReadFile(fileProbeStart) 193 | if err != nil { 194 | log.Panic(err) 195 | } 196 | err = json.Unmarshal(data, fs) 197 | if err != nil { 198 | log.Panic(err) 199 | } 200 | } 201 | 202 | func GetFinalSeg() *FinalSeg { 203 | return fsTokenizer 204 | } 205 | 206 | func InitFSToken() { 207 | readJsonFromFile(finalSegProbStart, &fsTokenizer.start) 208 | readJsonFromFile(finalSegProbTrans, &fsTokenizer.trans) 209 | readJsonFromFile(finalSegProbEmit, &fsTokenizer.emit) 210 | } 211 | -------------------------------------------------------------------------------- /tokenizer/sentence.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Ze-Bin Wang. All rights reserved. 2 | // Use of this source code is governed by a MIT style 3 | // license that can be found in the LICENSE file. 
package tokenizer

import "math"

// NodeDAG is one entry of the max-probability route: X is the accumulated
// log probability and Y the inclusive end index of the chosen word.
type NodeDAG struct {
	X float64
	Y int
}

// Sentence wraps a sentence as a rune slice for index-based access.
type Sentence struct {
	sentenceRune []rune
}

// NewSentence builds a Sentence from s.
func NewSentence(s string) *Sentence {
	return &Sentence{
		sentenceRune: []rune(s),
	}
}

// Len returns the sentence length in runes.
func (s *Sentence) Len() int {
	if s.sentenceRune == nil {
		return 0
	}
	return len(s.sentenceRune)
}

// GetWord returns the runes in [start, end), or "" when out of range.
func (s *Sentence) GetWord(start, end int) string {
	if (start < 0 || start >= s.Len()) || (end <= 0 || end > s.Len()) {
		return ""
	}
	return string(s.sentenceRune[start:end])
}

// GetChar returns the i-th rune, or "" when out of range.
func (s *Sentence) GetChar(i int) string {
	if i < 0 || i >= s.Len() {
		return ""
	}
	return string(s.sentenceRune[i])
}

// GetDAG builds the word DAG: dag[k] lists every inclusive end index i such
// that runes k..i form a dictionary word with freq > 0. The inner scan
// stops at the first prefix missing from the dictionary (prefixes are
// stored with freq 0 by Dictionary.load). A rune with no word keeps [k].
func (s *Sentence) GetDAG() [][]int {
	dag := make([][]int, 0)

	dictionary := GetDictionary()
	n := s.Len()
	for k := 0; k < n; k++ {
		l := make([]int, 0)

		i := k
		for i < n {
			word := string(s.sentenceRune[k : i+1])
			if freq, ok := dictionary.GetWord(word); ok {
				if freq > 0 {
					l = append(l, i)
				}
				i++
				continue
			}
			break
		}

		if len(l) == 0 {
			l = append(l, k)
		}
		dag = append(dag, l)
	}
	return dag
}

// CalcDAG computes, by dynamic programming from right to left, the
// max-log-probability route through the DAG. route[k].Y is the inclusive
// end index of the best word starting at rune k.
func (s *Sentence) CalcDAG() []NodeDAG {
	n := s.Len()
	route := make([]NodeDAG, n+1)
	route[n] = NodeDAG{0, 0}

	dictionary := GetDictionary()
	logTotal := math.Log(dictionary.GetTotalFreq())

	dag := s.GetDAG()
	for k := n - 1; k >= 0; k-- {
		score := float64(0)
		idx := -1
		for _, x := range dag[k] {
			word := s.GetWord(k, x+1)
			freq := 0
			if v, ok := dictionary.GetWord(word); ok {
				freq = v
			}
			if freq == 0 {
				// Smoothing: unseen words count as frequency 1.
				freq = 1
			}
			// log P(word) + best score of the remainder of the sentence.
			val := math.Log(float64(freq)) - logTotal + route[x+1].X
			if idx == -1 {
				score = val
				idx = x
			} else if val >= score {
				// >= keeps the longest word on ties (larger x wins).
				score = val
				idx = x
			}
		}
		route[k] = NodeDAG{score, idx}
	}
	return route
}