├── .gitignore ├── examples ├── data.txt └── examples.go ├── go.mod ├── go.sum ├── LICENSE ├── flashtext_test.go ├── README.md └── KeywordProcessor.go /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/* 2 | .idea/* 3 | -------------------------------------------------------------------------------- /examples/data.txt: -------------------------------------------------------------------------------- 1 | abc => abc 2 | 中国 => 中文 -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/waltsmith88/go-flashtext 2 | 3 | go 1.11 4 | 5 | require github.com/stretchr/testify v1.3.0 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 2 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 4 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 5 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 6 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 7 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 The github.com/waltsmith88/go-flashtext Authors. 4 | All rights reserved. 
5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /examples/examples.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | gf "github.com/waltsmith88/go-flashtext" 6 | "log" 7 | "path/filepath" 8 | "runtime" 9 | ) 10 | 11 | func main() { 12 | 13 | file := "examples/data.txt" 14 | filePath, _ := filepath.Abs(file) 15 | 16 | keywordProcessor := gf.NewKeywordProcessor() 17 | 18 | // Example 1: add multiple keywords from file 19 | keywordProcessor.AddKeywordsFromFile(filePath) 20 | 21 | // Example 2: add a keyword 22 | keywordProcessor.AddKeyword("abc", "abc") 23 | 24 | // Example 3: add keywords from Map 25 | keywordMap := map[string]string{ 26 | "abcd": "abcd", 27 | "student": "stu", 28 | } 29 | keywordProcessor.AddKeywordsFromMap(keywordMap) 30 | 31 | // Example 4: add same keyword "abc" with different cleanName 32 | keywordProcessor.AddKeyword("abc", "abc1") 33 | 34 | allKeywords := keywordProcessor.GetAllKeywords() 35 | fmt.Println(allKeywords) 36 | 37 | // Example 5: get cleanName of keyword from TrieTree 38 | if cleanName, err := keywordProcessor.GetKeyword("abc"); err { 39 | fmt.Println("Success:", cleanName, err) 40 | } else { 41 | fmt.Println("Failed:", cleanName, err) 42 | } 43 | 44 | // Example 6: 添加中文关键词并提取 add Chinese keywords 45 | keywordProcessor.AddKeyword("中文", "中文") 46 | // Extract keywords from sentence by searching TrieTree 47 | sentence := "中文支持1bceAbcd支持中文student abc" 48 | cleanNameList := keywordProcessor.ExtractKeywords(sentence) 49 | fmt.Println(cleanNameList) 50 | 51 | // Example 7: set properties of keywordProcessor 52 | fmt.Println(keywordProcessor.GetCaseSensitive()) 53 | keywordProcessor.SetCaseSensitive(false) 54 | fmt.Println(keywordProcessor.GetCaseSensitive()) 55 | cleanNameList1 := keywordProcessor.ExtractKeywords(sentence) 56 | fmt.Println(cleanNameList1) 57 | 58 | // Example 8: Extract keywords from sentence by 
searching TrieTree and return keywords' span 59 | cleanNameRes := keywordProcessor.ExtractKeywordsWithSpanInfo(sentence) 60 | sentence1 := []rune(sentence) 61 | fmt.Println(cleanNameRes, sentence1) 62 | for _, resSpan := range cleanNameRes { 63 | fmt.Println(resSpan.CleanName, resSpan.StartPos, resSpan.EndPos, fmt.Sprintf("%c", sentence1[resSpan.StartPos:resSpan.EndPos])) 64 | } 65 | 66 | // Example 9: delete keyword 67 | keywordProcessor.RemoveKeyword("abc") 68 | 69 | // Example 10: delete keywords in list 70 | keywordProcessor.RemoveKeywordFromList([]string{"student", "abcd", "abc", "中文"}) 71 | fmt.Println(keywordProcessor.GetAllKeywords()) 72 | 73 | // Example 11: replace keywords in sentence with their cleanName 74 | sourceSentence := "hello中国helloabc" 75 | newSentence := keywordProcessor.ReplaceKeywords(sourceSentence) 76 | fmt.Println(fmt.Sprintf("source sentence: %s; \nnew sentence: %s", sourceSentence, newSentence)) 77 | 78 | printMemStats() 79 | } 80 | 81 | func printMemStats() { 82 | _, _, line, _ := runtime.Caller(1) 83 | var m runtime.MemStats 84 | runtime.ReadMemStats(&m) 85 | log.Printf("Line %v : Alloc = %v TotalAlloc = %v Sys = %v NumGC = %v\n", line, m.Alloc/1024, m.TotalAlloc/1024, m.Sys/1024, 86 | m.NumGC) 87 | } 88 | -------------------------------------------------------------------------------- /flashtext_test.go: -------------------------------------------------------------------------------- 1 | package flashtext 2 | 3 | import ( 4 | "github.com/stretchr/testify/assert" 5 | "testing" 6 | ) 7 | 8 | func TestAddKeywords(t *testing.T) { 9 | 10 | var testSet = []struct{ 11 | in map[string]string 12 | expected map[string]string 13 | } { 14 | {map[string]string{"teacher": "tea"}, map[string]string{"teacher": "tea"}}, 15 | {map[string]string{"student": "stu", "中国": "中文"}, map[string]string{"student": "stu", "中国": "中文"}}, 16 | } 17 | // Add keywords from map 18 | for _, testItem := range testSet { 19 | keywordProcessor := NewKeywordProcessor() 20 | 
keywordProcessor.AddKeywordsFromMap(testItem.in) 21 | assert.Equal(t, testItem.expected, keywordProcessor.GetAllKeywords()) 22 | } 23 | } 24 | 25 | func TestAddKeywordsFromFile(t *testing.T) { 26 | 27 | var testSet = []struct{ 28 | in string 29 | expected map[string]string 30 | } { 31 | {"examples/data.txt", map[string]string{"abc": "abc", "中国": "中文",}}, 32 | } 33 | // Extract keywords from sentence by searching TrieTree 34 | for _, testItem := range testSet { 35 | keywordProcessor := NewKeywordProcessor() 36 | keywordProcessor.AddKeywordsFromFile(testItem.in) 37 | assert.Equal(t, testItem.expected, keywordProcessor.GetAllKeywords()) 38 | } 39 | } 40 | 41 | func TestRemoveKeywordsFromList(t *testing.T) { 42 | 43 | // add keywords from Map 44 | keywordMap := map[string]string{ 45 | "teacher": "tea", 46 | "student": "stu", 47 | "中国": "中文", 48 | } 49 | 50 | var testSet = []struct{ 51 | in []string 52 | expected map[string]string 53 | } { 54 | {[]string{"teacher"}, map[string]string{"student": "stu", "中国": "中文"}}, 55 | {[]string{"student", "teacher"},map[string]string{"中国": "中文"}}, 56 | } 57 | // Remove keywords from list 58 | for _, testItem := range testSet { 59 | keywordProcessor := NewKeywordProcessor() 60 | keywordProcessor.AddKeywordsFromMap(keywordMap) 61 | keywordProcessor.RemoveKeywordFromList(testItem.in) 62 | assert.Equal(t, testItem.expected, keywordProcessor.GetAllKeywords()) 63 | } 64 | } 65 | 66 | func TestRemoveKeyword(t *testing.T) { 67 | 68 | // add keywords from Map 69 | keywordMap := map[string]string{ 70 | "teacher": "tea", 71 | "student": "stu", 72 | "中国": "中文", 73 | } 74 | 75 | var testSet = []struct{ 76 | in string 77 | expected map[string]string 78 | } { 79 | {"teacher", map[string]string{"student": "stu", "中国": "中文",}}, 80 | {"student", map[string]string{"teacher": "tea", "中国": "中文",}}, 81 | } 82 | // remove a keyword 83 | for _, testItem := range testSet { 84 | keywordProcessor := NewKeywordProcessor() 85 | 
keywordProcessor.AddKeywordsFromMap(keywordMap) 86 | keywordProcessor.RemoveKeyword(testItem.in) 87 | assert.Equal(t, testItem.expected, keywordProcessor.GetAllKeywords()) 88 | } 89 | } 90 | 91 | func TestExtractKeywords(t *testing.T) { 92 | keywordProcessor := NewKeywordProcessor() 93 | 94 | // add keywords from Map 95 | keywordMap := map[string]string{ 96 | "teacher": "tea", 97 | "student": "stu", 98 | } 99 | keywordProcessor.AddKeywordsFromMap(keywordMap) 100 | 101 | // 添加中文关键词 102 | keywordProcessor.AddKeyword("中文", "中文") 103 | keywordProcessor.AddKeyword("abc") 104 | 105 | var testSet = []struct{ 106 | in string 107 | expected []string 108 | } { 109 | {"hello abc, what up", []string{"abc"}}, 110 | {"hello, 你会说中文吗?", []string{"中文"}}, 111 | {"hello, abc 你会说中文吗? oHabc", []string{"abc", "中文", "abc"}}, 112 | } 113 | // Extract keywords from sentence by searching TrieTree 114 | for _, testItem := range testSet { 115 | cleanNameList := keywordProcessor.ExtractKeywords(testItem.in) 116 | assert.Equal(t, testItem.expected, cleanNameList) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-flashtext 2 | 3 | This module can be used to replace keywords in sentences or extract keywords from sentences. It is based on the [FlashText algorithm](https://arxiv.org/abs/1711.00046). 4 | 5 | Compared with standard [FlashText algorithm](https://arxiv.org/abs/1711.00046), there are some differences which make [go-flashtext](https://github.com/waltsmith88/go-flashtext) more powerful: 6 | 7 | * **Chinese** is support fully. [Python implement](https://github.com/vi3k6i5/flashtext#flashtext) supports Chinese not well. 8 | * We break **nonWordBoundaries** in FlashText algorithm to make it more powerful, which means that keyword could contains char not in [_0-9a-zA-Z]. 
9 | * We allow the **same keyword with different cleanNames** to exist, which means keywords are not unique. We found this very useful in industry environments. 10 | 11 | 12 | 13 | ## Installation 14 | 15 | To install the GoFlashText package, you need to install Go and set up your Go workspace first. 16 | 17 | 1. First, make sure [Go](https://golang.org/) is installed; then you can use the Go command below to install GoFlashText. 18 | 19 | ```shell 20 | $ go get -u github.com/waltsmith88/go-flashtext 21 | ``` 22 | 23 | 2. Import it in your code: 24 | 25 | ```go 26 | import gf "github.com/waltsmith88/go-flashtext" 27 | ``` 28 | 29 | 30 | 31 | ## Usage 32 | 33 | - Extract keywords 34 | 35 | ```go 36 | package main 37 | 38 | import ( 39 | "fmt" 40 | gf "github.com/waltsmith88/go-flashtext" 41 | ) 42 | 43 | func main() { 44 | // add keywords from Map 45 | keywordMap := map[string]string{ 46 | "love": "love", 47 | "hello": "hello", 48 | } 49 | keywordProcessor := gf.NewKeywordProcessor() 50 | keywordProcessor.AddKeywordsFromMap(keywordMap) 51 | foundList := keywordProcessor.ExtractKeywords("I love coding.") 52 | fmt.Println(foundList) 53 | } 54 | // [love] 55 | ``` 56 | 57 | - Extract keywords With Chinese Support 58 | 59 | ```go 60 | package main 61 | 62 | import ( 63 | "fmt" 64 | gf "github.com/waltsmith88/go-flashtext" 65 | ) 66 | 67 | func main() { 68 | // add keywords from Map 69 | keywordMap := map[string]string{ 70 | "love": "love", 71 | "中国": "中文", 72 | } 73 | keywordProcessor := gf.NewKeywordProcessor() 74 | keywordProcessor.AddKeywordsFromMap(keywordMap) 75 | keywordProcessor.AddKeyword("love", "ove") 76 | foundList := keywordProcessor.ExtractKeywords("I Love 中国.") 77 | fmt.Println(foundList) 78 | } 79 | // [中文] 80 | ``` 81 | 82 | - Case Sensitive example 83 | 84 | ```go 85 | package main 86 | 87 | import ( 88 | "fmt" 89 | gf "github.com/waltsmith88/go-flashtext" 90 | ) 91 | 92 | func main() { 93 | // add keywords from Map 94 | keywordMap := map[string]string{ 95 | "love": 
"love", 96 | "中国": "中文", 97 | } 98 | keywordProcessor := gf.NewKeywordProcessor() 99 | keywordProcessor.SetCaseSensitive(false) 100 | keywordProcessor.AddKeywordsFromMap(keywordMap) 101 | keywordProcessor.AddKeyword("love", "ove") 102 | foundList := keywordProcessor.ExtractKeywords("I Love 中国.") 103 | fmt.Println(foundList) 104 | } 105 | // [love|ove 中文] 106 | ``` 107 | 108 | - Unique Keywords example 109 | 110 | ```go 111 | func main() { 112 | // add keywords from Map 113 | keywordMap := map[string]string{ 114 | "love": "love", 115 | "中国": "中文", 116 | } 117 | keywordProcessor := gf.NewKeywordProcessor() 118 | keywordProcessor.SetUniqueKeyword(true) 119 | keywordProcessor.SetCaseSensitive(false) 120 | keywordProcessor.AddKeywordsFromMap(keywordMap) 121 | keywordProcessor.AddKeyword("love", "ove") 122 | foundList := keywordProcessor.ExtractKeywords("I Love 中国.") 123 | fmt.Println(foundList) 124 | } 125 | // [ove 中文] 126 | ``` 127 | 128 | - Span of keywords extracted 129 | 130 | ```go 131 | func main() { 132 | // add keywords from Map 133 | keywordMap := map[string]string{ 134 | "love": "love", 135 | "中国": "中文", 136 | } 137 | keywordProcessor := gf.NewKeywordProcessor() 138 | keywordProcessor.AddKeywordsFromMap(keywordMap) 139 | sentence := "I love 中国." 
140 | cleanNameRes := keywordProcessor.ExtractKeywordsWithSpanInfo(sentence) 141 | sentence1 := []rune(sentence) 142 | for _, resSpan := range cleanNameRes { 143 | fmt.Println(resSpan.CleanName, resSpan.StartPos, resSpan.EndPos, fmt.Sprintf("%c", sentence1[resSpan.StartPos:resSpan.EndPos])) 144 | } 145 | } 146 | // love 2 6 [l o v e] 147 | // 中文 7 9 [中 国] 148 | ``` 149 | 150 | - Add Multiple Keywords simultaneously 151 | 152 | ```go 153 | // way 1: from Map 154 | keywordMap := map[string]string{ 155 | "abcd": "abcd", 156 | "student": "stu", 157 | } 158 | keywordProcessor.AddKeywordsFromMap(keywordMap) 159 | // way 2: from Slice 160 | keywordProcessor.AddKeywordsFromList([]string{"student", "abcd", "abc", "中文"}) 161 | // way 3: from file. Line: keyword => cleanName 162 | keywordProcessor.AddKeywordsFromFile(filePath) 163 | ``` 164 | 165 | - To Remove keywords 166 | 167 | ```go 168 | keywordProcessor.RemoveKeyword("abc") 169 | keywordProcessor.RemoveKeywordFromList([]string{"student", "abcd", "abc", "中文"}) 170 | ``` 171 | 172 | - To Replace keywords 173 | 174 | ```go 175 | newSentence := keywordProcessor.ReplaceKeywords(sourceSentence) 176 | ``` 177 | 178 | - To check Number of terms in KeywordProcessor 179 | 180 | ```go 181 | keywordProcessor.Len() 182 | ``` 183 | 184 | - To check if term is present in KeywordProcessor 185 | 186 | ```go 187 | keywordProcessor.IsContains("abc") 188 | ``` 189 | 190 | - Get all keywords in dictionary 191 | 192 | ```go 193 | keywordProcessor.GetAllKeywords() 194 | ``` 195 | 196 | 197 | More usage examples are in go-flashtext/examples/examples.go; you can try them with the following command: 198 | 199 | ```shell 200 | $ go run examples/examples.go 201 | ``` 202 | 203 | 204 | 205 | ## Test 206 | 207 | ```shell 208 | $ git clone https://github.com/waltsmith88/go-flashtext 209 | $ cd go-flashtext 210 | $ go test -v 211 | ``` 212 | 213 | 214 | 215 | ## Why not Regex? 
216 | 217 | It's a custom algorithm based on [Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm) and [Trie Dictionary](https://en.wikipedia.org/wiki/TrieDictionary). 218 | 219 | ![Benchmark](https://github.com/vi3k6i5/flashtext/raw/master/benchmark.png) 220 | 221 | 222 | 223 | Time taken by FlashText to find terms in comparison to Regex. 224 | 225 | [![https://thepracticaldev.s3.amazonaws.com/i/xruf50n6z1r37ti8rd89.png](https://camo.githubusercontent.com/53e63b19336a7dfbe5d2874b70d73e37c4cd744d/68747470733a2f2f74686570726163746963616c6465762e73332e616d617a6f6e6177732e636f6d2f692f7872756635306e367a31723337746938726438392e706e67)](https://camo.githubusercontent.com/53e63b19336a7dfbe5d2874b70d73e37c4cd744d/68747470733a2f2f74686570726163746963616c6465762e73332e616d617a6f6e6177732e636f6d2f692f7872756635306e367a31723337746938726438392e706e67) 226 | 227 | Time taken by FlashText to replace terms in comparison to Regex. 228 | 229 | [![https://thepracticaldev.s3.amazonaws.com/i/k44ghwp8o712dm58debj.png](https://camo.githubusercontent.com/28e8b327359b6f93bf3ac4733b92c5dec0576851/68747470733a2f2f74686570726163746963616c6465762e73332e616d617a6f6e6177732e636f6d2f692f6b343467687770386f373132646d35386465626a2e706e67)](https://camo.githubusercontent.com/28e8b327359b6f93bf3ac4733b92c5dec0576851/68747470733a2f2f74686570726163746963616c6465762e73332e616d617a6f6e6177732e636f6d2f692f6b343467687770386f373132646d35386465626a2e706e67) 230 | 231 | Link to code for benchmarking the [Find Feature](https://gist.github.com/vi3k6i5/604eefd92866d081cfa19f862224e4a0) and [Replace Feature](https://gist.github.com/vi3k6i5/dc3335ee46ab9f650b19885e8ade6c7a). 232 | 233 | The idea for this library came from the following [StackOverflow question](https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster). 
234 | 235 | ## Citation 236 | 237 | The original paper published on [FlashText algorithm](https://arxiv.org/abs/1711.00046). 238 | 239 | ``` 240 | @ARTICLE{2017arXiv171100046S, 241 | author = {{Singh}, V.}, 242 | title = "{Replace or Retrieve Keywords In Documents at Scale}", 243 | journal = {ArXiv e-prints}, 244 | archivePrefix = "arXiv", 245 | eprint = {1711.00046}, 246 | primaryClass = "cs.DS", 247 | keywords = {Computer Science - Data Structures and Algorithms}, 248 | year = 2017, 249 | month = oct, 250 | adsurl = {http://adsabs.harvard.edu/abs/2017arXiv171100046S}, 251 | adsnote = {Provided by the SAO/NASA Astrophysics Data System} 252 | } 253 | ``` 254 | 255 | The article published on [Medium freeCodeCamp](https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f). 256 | 257 | 258 | 259 | ## Contribute 260 | 261 | - Issue Tracker: 262 | - Source Code: 263 | 264 | 265 | 266 | ## License 267 | 268 | The project is licensed under the MIT license. 
269 | -------------------------------------------------------------------------------- /KeywordProcessor.go: -------------------------------------------------------------------------------- 1 | package flashtext 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "os" 8 | "strings" 9 | ) 10 | 11 | const __key__ = int32(197) 12 | 13 | // Keyword Result of Extract 14 | type keywordRes struct { 15 | CleanName string 16 | StartPos int 17 | EndPos int 18 | } 19 | 20 | // Keyword Processor 21 | type keywordProcessor struct { 22 | caseSensitive bool 23 | uniqueKeyword bool 24 | _keyword string 25 | whiteSpaceChars []string 26 | nonWordBoundaries string 27 | keywordTrieDict map[int32]interface{} 28 | termsInTrie int 29 | delimiter string 30 | } 31 | 32 | func NewKeywordProcessor() *keywordProcessor { 33 | 34 | KeywordProcessor := &keywordProcessor{ 35 | caseSensitive: true, 36 | uniqueKeyword: false, 37 | _keyword: "_keyword_", 38 | whiteSpaceChars: []string{".", "\t", "\n", "\a", " ", ","}, 39 | nonWordBoundaries: "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz", 40 | keywordTrieDict: map[int32]interface{}{}, 41 | termsInTrie: 0, 42 | delimiter: "|", 43 | } 44 | return KeywordProcessor 45 | } 46 | 47 | // Default returns an keywordProcessor instance with default values. 
48 | func Default() *keywordProcessor { 49 | KeywordProcessor := NewKeywordProcessor() 50 | return KeywordProcessor 51 | } 52 | 53 | // Get keyword TrieTree 54 | func (KeywordProcessor *keywordProcessor) GetKeywordTrieDict() map[int32]interface{} { 55 | return KeywordProcessor.keywordTrieDict 56 | } 57 | 58 | // Get case sensitive 59 | func (KeywordProcessor *keywordProcessor) GetCaseSensitive() bool { 60 | return KeywordProcessor.caseSensitive 61 | } 62 | 63 | // Set case sensitive 64 | func (KeywordProcessor *keywordProcessor) SetCaseSensitive(sensitive bool) { 65 | KeywordProcessor.caseSensitive = sensitive 66 | } 67 | 68 | // Get case sensitive 69 | func (KeywordProcessor *keywordProcessor) GetUniqueKeyword() bool { 70 | return KeywordProcessor.uniqueKeyword 71 | } 72 | 73 | // Set case sensitive 74 | func (KeywordProcessor *keywordProcessor) SetUniqueKeyword(sensitive bool) { 75 | KeywordProcessor.uniqueKeyword = sensitive 76 | } 77 | 78 | // Get the delimiter which is used to joining two different cleanNames of same keyword. 79 | func (KeywordProcessor *keywordProcessor) GetDelimiter() string { 80 | return KeywordProcessor.delimiter 81 | } 82 | 83 | // Set the delimiter which is used to joining two different cleanNames of same keyword. 84 | // Be careful of setting the delimiter 85 | // and make sure the delimiter be the unique identifier of all cleanNames. 
86 | func (KeywordProcessor *keywordProcessor) SetDelimiter(delimiter string) { 87 | KeywordProcessor.delimiter = delimiter 88 | } 89 | 90 | // Return number of terms present in the keyword_trie_dict 91 | func (KeywordProcessor *keywordProcessor) Len() int { 92 | return KeywordProcessor.termsInTrie 93 | } 94 | 95 | // If TrieTree contains keyword 96 | func (KeywordProcessor *keywordProcessor) IsContains(keyword string) bool { 97 | currentDict := KeywordProcessor.keywordTrieDict 98 | for _, val := range keyword { 99 | 100 | if tmp_, err := currentDict[val]; !err { 101 | return false 102 | } else { 103 | currentDict = tmp_.(map[int32]interface{}) 104 | } 105 | } 106 | _, err := currentDict[__key__] 107 | return err 108 | } 109 | 110 | // Get clean Name of keyword from TrieTree. 111 | // Return (cleanName, bool). 112 | func (KeywordProcessor *keywordProcessor) GetKeyword(keyword string) (string, bool) { 113 | currentDict := KeywordProcessor.keywordTrieDict 114 | for _, val := range keyword { 115 | 116 | if tmp_, err := currentDict[val]; !err { 117 | return "nil", false 118 | } else { 119 | currentDict = tmp_.(map[int32]interface{}) 120 | } 121 | } 122 | if res, err := currentDict[__key__]; err { 123 | return res.(string), true 124 | } else { 125 | return "nil", false 126 | } 127 | } 128 | 129 | // Get all clean Name of keyword from TrieTree. 130 | // Return map(keyword, cleanName). 131 | func (KeywordProcessor *keywordProcessor) GetAllKeywords() map[string]string { 132 | return KeywordProcessor.__getAllKeywords__("", KeywordProcessor.keywordTrieDict) 133 | } 134 | 135 | // Get all clean Name of keyword from TrieTree. 136 | // Return map(keyword, cleanName). 
137 | func (KeywordProcessor *keywordProcessor) __getAllKeywords__(termSoFar string, currendDict map[int32]interface{}) map[string]string { 138 | var allKeywords = map[string]string{} 139 | for key := range currendDict { 140 | if key == __key__ { 141 | allKeywords[termSoFar] = currendDict[__key__].(string) 142 | } else { 143 | subValues := KeywordProcessor.__getAllKeywords__( 144 | termSoFar+fmt.Sprintf("%c", key), 145 | currendDict[key].(map[int32]interface{})) 146 | for key := range subValues { 147 | allKeywords[key] = subValues[key] 148 | } 149 | } 150 | } 151 | return allKeywords 152 | } 153 | 154 | // Build the map relation between keyword and cleanName in TrieTree 155 | func (KeywordProcessor *keywordProcessor) __setItem__(keyword string, cleanName string) map[int32]interface{} { 156 | keyReverse := __reverseString__(keyword) 157 | var resMap map[int32]interface{} 158 | for i, j := range keyReverse { 159 | if i == 0 { 160 | tmp := map[int32]interface{}{__key__: cleanName} 161 | resMap = map[int32]interface{}{j: tmp} 162 | } else { 163 | resMap = map[int32]interface{}{j: resMap} 164 | } 165 | } 166 | return resMap 167 | } 168 | 169 | // Add a keyword and its' cleanName to TrieTree 170 | func (KeywordProcessor *keywordProcessor) AddKeyword(keyword string, cleanNames ...string) bool { 171 | var ( 172 | cleanName string 173 | diffDict map[int32]interface{} 174 | commDict map[int32]interface{} 175 | ) 176 | if len(cleanNames) == 0 { 177 | cleanName = keyword 178 | } else { 179 | cleanName = cleanNames[0] 180 | } 181 | 182 | if !KeywordProcessor.caseSensitive { 183 | keyword = strings.ToLower(keyword) 184 | } 185 | currentDict := KeywordProcessor.keywordTrieDict 186 | 187 | for i, letter := range keyword { 188 | if currentDict_, err := currentDict[letter]; err { 189 | currentDict = currentDict_.(map[int32]interface{}) 190 | commDict = currentDict 191 | } else { 192 | diffDict = KeywordProcessor.__setItem__(keyword[i:], cleanName) 193 | break 194 | } 195 | } 196 | 
197 | if commDict == nil { 198 | if currentDict == nil { 199 | currentDict = diffDict 200 | } else { 201 | for k, v := range diffDict { 202 | currentDict[k] = v 203 | } 204 | } 205 | KeywordProcessor.keywordTrieDict = currentDict 206 | 207 | KeywordProcessor.termsInTrie++ 208 | return true 209 | } 210 | 211 | if diffDict == nil { 212 | if tmpCleanName, err := commDict[__key__]; err { 213 | if tmpCleanName != cleanName && !KeywordProcessor.uniqueKeyword { 214 | commDict[__key__] = strings.Join([]string{tmpCleanName.(string), cleanName}, "|") 215 | } else { // not unique keyword 216 | commDict[__key__] = cleanName 217 | } 218 | 219 | KeywordProcessor.termsInTrie++ 220 | return true 221 | } else { 222 | diffDict = map[int32]interface{}{__key__: cleanName} 223 | } 224 | } 225 | 226 | for k, v := range diffDict { 227 | commDict[k] = v 228 | } 229 | 230 | KeywordProcessor.termsInTrie++ 231 | return true 232 | } 233 | 234 | // Add multiple keywords and its' respective cleanName from keyword Map to TrieTree 235 | func (KeywordProcessor *keywordProcessor) AddKeywordsFromMap(keywordMap map[string]string) { 236 | for keyword, cleanName := range keywordMap { 237 | KeywordProcessor.AddKeyword(keyword, cleanName) 238 | } 239 | } 240 | 241 | // Add multiple keywords and cleanName same of keywords from keyword list to TrieTree 242 | // (keyword) -> (keyword, keyword) -> (keyword, cleanName) 243 | func (KeywordProcessor *keywordProcessor) AddKeywordsFromList(keywordList []string) { 244 | for _, keyword := range keywordList { 245 | KeywordProcessor.AddKeyword(keyword, keyword) 246 | } 247 | } 248 | 249 | // Add multiple keywords and cleanName same of keywords from keyword list to TrieTree 250 | // each line in file: 251 | // 1. keyword => cleanName 252 | // 2. 
keyword 253 | func (KeywordProcessor *keywordProcessor) AddKeywordsFromFile(filePath string) { 254 | 255 | file, err := os.Open(filePath) 256 | if err != nil { 257 | panic(err) 258 | } 259 | defer file.Close() 260 | 261 | w := bufio.NewReader(file) 262 | for { 263 | line, _, err := w.ReadLine() 264 | if err == io.EOF { 265 | break 266 | } 267 | if strings.Contains(string(line), "=>") { // 1. keyword => cleanName 268 | lineString := strings.Split(string(line), "=>") 269 | keyword := strings.TrimSpace(lineString[0]) 270 | cleanName := strings.TrimSpace(lineString[1]) 271 | KeywordProcessor.AddKeyword(keyword, cleanName) 272 | } else { // 2. keyword 273 | keyword := strings.TrimSpace(string(line)) 274 | KeywordProcessor.AddKeyword(keyword, keyword) 275 | } 276 | } 277 | } 278 | 279 | // Delete keywords and their cleanName from TrieTree 280 | func (KeywordProcessor *keywordProcessor) RemoveKeywordFromList(keywordList []string) { 281 | for _, keyword := range keywordList { 282 | KeywordProcessor.RemoveKeyword(keyword) 283 | } 284 | } 285 | 286 | // Delete a keyword and its' cleanName from TrieTree 287 | func (KeywordProcessor *keywordProcessor) RemoveKeyword(keyword string) bool { 288 | var ( 289 | commDictKey []int32 290 | commDictValue []map[int32]interface{} 291 | ) 292 | currentDict := KeywordProcessor.keywordTrieDict 293 | 294 | for _, letter := range keyword { 295 | commDictValue = append(commDictValue, currentDict) 296 | if tmp_, err := currentDict[letter]; !err { 297 | return false 298 | } else { 299 | commDictKey = append(commDictKey, letter) 300 | currentDict = tmp_.(map[int32]interface{}) 301 | } 302 | } 303 | 304 | if _, err := currentDict[__key__]; err { 305 | delete(currentDict, __key__) 306 | for i := len(commDictKey) - 1; i >= 0; i-- { 307 | tmpDict := commDictValue[i] 308 | delDict := tmpDict[commDictKey[i]].(map[int32]interface{}) 309 | if len(delDict) > 0 { 310 | return true 311 | } else { 312 | delete(tmpDict, commDictKey[i]) 313 | } 314 | } 315 | 
return true 316 | } 317 | return false 318 | } 319 | 320 | // Extract keywords from sentence by searching TrieTree. 321 | // And return the keywords' clean names. 322 | func (KeywordProcessor *keywordProcessor) ExtractKeywords(sentence string) []string { 323 | var keywordList []string 324 | if len(sentence) == 0 { 325 | return keywordList 326 | } 327 | if !KeywordProcessor.caseSensitive { 328 | sentence = strings.ToLower(sentence) 329 | } 330 | 331 | var ( 332 | start []int 333 | sentenceRune = []rune(sentence) 334 | idx = 0 335 | idy = 0 336 | sentenceLen = len(sentenceRune) 337 | cleanName = "" 338 | currentDict = KeywordProcessor.keywordTrieDict 339 | ) 340 | 341 | for idx < sentenceLen { 342 | char := sentenceRune[idx] 343 | tmpCurrent, err := currentDict[int32(char)].(map[int32]interface{}) 344 | if err { 345 | start = append(start, idx) 346 | idx++ 347 | idy++ 348 | if cleanNameTmp, err := tmpCurrent[__key__]; err { 349 | cleanName = cleanNameTmp.(string) 350 | } 351 | currentDict = tmpCurrent 352 | if idx < sentenceLen { 353 | continue 354 | } 355 | } else { 356 | idx++ 357 | idy++ 358 | currentDict = KeywordProcessor.keywordTrieDict 359 | } 360 | 361 | if cleanName != "" { 362 | keywordList = append(keywordList, cleanName) 363 | idx = start[len(start)-1] + 1 364 | idy = idx + 1 365 | cleanName = "" 366 | start = []int{} 367 | } else { 368 | if len(start) > 0 { 369 | idx = start[0] + 1 370 | idy = idx + 1 371 | } 372 | start = []int{} 373 | } 374 | } 375 | return keywordList 376 | } 377 | 378 | // Extract keywords from sentence by searching TrieTree. 379 | // And return the keywords' clean names, the start position and the end position of keyword in sentence. 
380 | func (KeywordProcessor *keywordProcessor) ExtractKeywordsWithSpanInfo(sentence string) []keywordRes { 381 | var keywordList []keywordRes 382 | if len(sentence) == 0 { 383 | return keywordList 384 | } 385 | if !KeywordProcessor.caseSensitive { 386 | sentence = strings.ToLower(sentence) 387 | } 388 | 389 | var ( 390 | start []int 391 | idx = 0 392 | idy = 0 393 | cleanName = "" 394 | sentenceRune = []rune(sentence) 395 | sentenceLen = len(sentenceRune) 396 | currentDict = KeywordProcessor.keywordTrieDict 397 | ) 398 | 399 | for idx < sentenceLen { 400 | char := sentenceRune[idx] 401 | tmpCurrent, err := currentDict[int32(char)].(map[int32]interface{}) 402 | if err { 403 | start = append(start, idx) 404 | idx++ 405 | idy++ 406 | if cleanNameTmp, err := tmpCurrent[__key__]; err { 407 | cleanName = cleanNameTmp.(string) 408 | } 409 | currentDict = tmpCurrent 410 | continue 411 | } else { 412 | idx++ 413 | idy++ 414 | currentDict = KeywordProcessor.keywordTrieDict 415 | } 416 | if cleanName != "" { 417 | startIndex := start[0] 418 | endIndex := start[0] + len(start) 419 | res := keywordRes{cleanName, startIndex, endIndex} 420 | keywordList = append(keywordList, res) 421 | idx = start[len(start)-1] + 1 422 | idy = idx 423 | cleanName = "" 424 | start = []int{} 425 | } else { 426 | if len(start) > 0 { 427 | idx = start[0] + 1 428 | idy = idx 429 | } 430 | start = []int{} 431 | } 432 | } 433 | return keywordList 434 | } 435 | 436 | // Replace keywords in sentence with their cleanName 437 | func (KeywordProcessor *keywordProcessor) ReplaceKeywords(sentence string) string { 438 | var ( 439 | newSentence = "" 440 | ) 441 | if len(sentence) == 0 { 442 | return newSentence 443 | } 444 | if !KeywordProcessor.caseSensitive { 445 | sentence = strings.ToLower(sentence) 446 | } 447 | 448 | var ( 449 | start []int 450 | replaceIndex int 451 | sentenceRune = []rune(sentence) 452 | idx = 0 453 | idy = 0 454 | sentenceLen = len(sentenceRune) 455 | cleanName = "" 456 | currentDict = 
KeywordProcessor.keywordTrieDict 457 | ) 458 | 459 | for idx < sentenceLen { 460 | char := sentenceRune[idx] 461 | tmpCurrent, err := currentDict[int32(char)].(map[int32]interface{}) 462 | if err { 463 | start = append(start, idx) 464 | idx++ 465 | idy++ 466 | if cleanNameTmp, err := tmpCurrent[__key__]; err { 467 | cleanName = cleanNameTmp.(string) 468 | } 469 | currentDict = tmpCurrent 470 | if idx < sentenceLen { 471 | continue 472 | } 473 | } else { 474 | replaceIndex = idx 475 | idx++ 476 | idy++ 477 | currentDict = KeywordProcessor.keywordTrieDict 478 | } 479 | if cleanName != "" { 480 | newSentence += cleanName 481 | idx = start[len(start)-1] + 1 482 | idy = idx + 1 483 | cleanName = "" 484 | start = []int{} 485 | } else { 486 | if len(start) > 0 { 487 | idx = start[0] + 1 488 | idy = idx + 1 489 | replaceIndex = start[0] 490 | } 491 | start = []int{} 492 | if replaceIndex < sentenceLen { 493 | newSentence += string(sentenceRune[replaceIndex]) 494 | } 495 | } 496 | } 497 | return newSentence 498 | } 499 | 500 | 501 | // Reverse the given string 502 | func __reverseString__(s string) string { 503 | runes := []rune(s) 504 | for from, to := 0, len(runes)-1; from < to; from, to = from+1, to-1 { 505 | runes[from], runes[to] = runes[to], runes[from] 506 | } 507 | return string(runes) 508 | } 509 | --------------------------------------------------------------------------------