├── keywords.txt ├── go.mod ├── .gitignore ├── LICENSE ├── README.md ├── trie_test.go └── trie.go /keywords.txt: -------------------------------------------------------------------------------- 1 | 这里 2 | 输入 3 | 关键词 -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/yihleego/trie 2 | 3 | go 1.18 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | *.exe 4 | *.iws 5 | *.iml 6 | *.ipr 7 | *.log 8 | tmp/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Leego Yih 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trie 2 | 3 | [![GoDoc](https://godoc.org/github.com/yihleego/trie?status.svg)](https://godoc.org/github.com/yihleego/trie) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/yihleego/trie)](https://goreportcard.com/report/github.com/yihleego/trie) 5 | 6 | An Aho-Corasick algorithm based string-searching utility for Go. It supports tokenization, case-insensitive matching, and text replacement, so you can use it to find keywords in an article, filter sensitive words, etc. 7 | 8 | Java implementation: [Trie4j](https://github.com/yihleego/trie4j) 9 | 10 | ## Introduction 11 | 12 | 判断一个字符串是否包含另一个字符串,我们通常使用`strings.Index()`或`strings.Contains()`进行判断,其底层实现基于RK、KMP、BM和Sunday等算法。如果要判断一个字符串是否包含多个字符串,比如在一篇文章中找几个敏感词,继续使用上述的字符串搜索算法显然是不合适的,这种场景就需要用到多模式匹配算法。 13 | 14 | [Aho–Corasick](http://cr.yp.to/bib/1975/aho.pdf) 算法是由贝尔实验室的 Alfred V. Aho 和 Margaret J. 
Corasick 在 1975 年发明的一种字符串搜索算法。它是一种字典匹配算法,可在输入文本中定位有限字符串集(字典)的元素。它同时匹配所有字符串。该算法的复杂性与字符串长度加上搜索文本的长度加上输出匹配的数量成线性关系。 15 | 16 | 该算法主要依靠构造一个有限状态机来实现,然后通过失配指针在查找字符串失败时进行回退,转向某前缀的其他分支,免于重复匹配前缀,提高算法效率。 17 | 18 | ## Usage 19 | 20 | ### FindAll 21 | 22 | ```go 23 | t := trie.New("雨疏", "风骤", "残酒", "卷帘人", "知否") 24 | emits := t.FindAll("昨夜雨疏风骤,浓睡不消残酒。试问卷帘人,却道海棠依旧。知否,知否?应是绿肥红瘦。", false) 25 | ``` 26 | 27 | ```text 28 | [2:4=雨疏, 4:6=风骤, 11:13=残酒, 16:19=卷帘人, 27:29=知否, 30:32=知否] 29 | ``` 30 | 31 | ### FindFirst 32 | 33 | ```go 34 | t := trie.New("雨疏", "风骤", "残酒", "卷帘人", "知否") 35 | emit := t.FindFirst("昨夜雨疏风骤,浓睡不消残酒。试问卷帘人,却道海棠依旧。知否,知否?应是绿肥红瘦。", false) 36 | ``` 37 | 38 | ```text 39 | 2:4=雨疏 40 | ``` 41 | 42 | ### FindAll (Case Insensitive) 43 | 44 | ```go 45 | t := trie.New("poetry", "TRANSLATION") 46 | emits := t.FindAll("Poetry is what gets lost in translation.", true) 47 | ``` 48 | 49 | ```text 50 | [0:6=poetry, 28:39=TRANSLATION] 51 | ``` 52 | 53 | ### FindFirst (Case Insensitive) 54 | 55 | ```go 56 | t := trie.New("poetry", "TRANSLATION") 57 | emit := t.FindFirst("Poetry is what gets lost in translation.", true) 58 | ``` 59 | 60 | ```text 61 | 0:6=poetry 62 | ``` 63 | 64 | ### Tokenize 65 | 66 | ```go 67 | s := "常记溪亭日暮,沉醉不知归路。兴尽晚回舟,误入藕花深处。争渡,争渡,惊起一滩鸥鹭。" 68 | t := trie.New("溪亭", "归路", "藕花", "争渡") 69 | emits := t.FindAll(s, false) 70 | tokens := trie.Tokenize(emits, s) 71 | ``` 72 | 73 | ```text 74 | ["常记", "溪亭(2:4=溪亭)", "日暮,沉醉不知", "归路(11:13=归路)", "。兴尽晚回舟,误入", "藕花(22:24=藕花)", "深处。", "争渡(27:29=争渡)", ",", "争渡(30:32=争渡)", ",惊起一滩鸥鹭。"] 75 | ``` 76 | 77 | ### Replace 78 | 79 | ```go 80 | s := "我正在参加砍价,砍到0元就可以免费拿啦。亲~帮我砍一刀呗,咱们一起免费领好货。" 81 | t := trie.New("0元", "砍一刀", "免费拿", "免费领") 82 | emits := t.FindAll(s, false) 83 | r1 := trie.Replace(emits, s, "*") 84 | r2 := trie.Replace(emits, s, "@#$%^&*") 85 | ``` 86 | 87 | ```text 88 | 我正在参加砍价,砍到**就可以***啦。亲~帮我***呗,咱们一起***好货。 89 | 我正在参加砍价,砍到%^就可以#$%啦。亲~帮我%^&呗,咱们一起&*@好货。 90 | ``` 91 | 92 | ## License 93 | 94 | This project is under the MIT license. 
See the [LICENSE](LICENSE) file for details. 95 | -------------------------------------------------------------------------------- /trie_test.go: -------------------------------------------------------------------------------- 1 | package trie 2 | 3 | import ( 4 | "bufio" 5 | "os" 6 | "testing" 7 | "unicode/utf8" 8 | ) 9 | 10 | func TestFindAll(t *testing.T) { 11 | text := "昨夜雨疏风骤,浓睡不消残酒。试问卷帘人,却道海棠依旧。知否,知否?应是绿肥红瘦。" 12 | trie := New("雨疏", "风骤", "残酒", "卷帘人", "知否") 13 | emits := trie.FindAll(text, false) 14 | t.Log(emits) 15 | EqualEmit(t, emits[0], 2, 4, "雨疏") 16 | EqualEmit(t, emits[1], 4, 6, "风骤") 17 | EqualEmit(t, emits[2], 11, 13, "残酒") 18 | EqualEmit(t, emits[3], 16, 19, "卷帘人") 19 | EqualEmit(t, emits[4], 27, 29, "知否") 20 | EqualEmit(t, emits[5], 30, 32, "知否") 21 | EqualInt(t, 6, len(emits)) 22 | } 23 | 24 | func TestFindFirst(t *testing.T) { 25 | text := "昨夜雨疏风骤,浓睡不消残酒。试问卷帘人,却道海棠依旧。知否,知否?应是绿肥红瘦。" 26 | trie := New("雨疏", "风骤", "残酒", "卷帘人", "知否") 27 | emit := trie.FindFirst(text, false) 28 | t.Log(emit) 29 | EqualEmit(t, emit, 2, 4, "雨疏") 30 | } 31 | 32 | func TestFindAllIgnoreCase(t *testing.T) { 33 | text := "Poetry is what gets lost in translation." 34 | trie := New("poetry", "TRANSLATION") 35 | emits := trie.FindAll(text, true) 36 | t.Log(emits) 37 | EqualEmit(t, emits[0], 0, 6, "poetry") 38 | EqualEmit(t, emits[1], 28, 39, "TRANSLATION") 39 | EqualInt(t, 2, len(emits)) 40 | } 41 | 42 | func TestFindFirstIgnoreCase(t *testing.T) { 43 | text := "Poetry is what gets lost in translation." 
44 | trie := New("poetry", "TRANSLATION") 45 | emit := trie.FindFirst(text, true) 46 | t.Log(emit) 47 | EqualEmit(t, emit, 0, 6, "poetry") 48 | } 49 | 50 | func TestIgnoreCase(t *testing.T) { 51 | text := "TurninG OnCe AgAiN BÖRKÜ" 52 | trie := New("turning", "once", "again", "börkü") 53 | emits := trie.FindAll(text, true) 54 | t.Log(emits) 55 | EqualEmit(t, emits[0], 0, 7, "turning") 56 | EqualEmit(t, emits[1], 8, 12, "once") 57 | EqualEmit(t, emits[2], 13, 18, "again") 58 | EqualEmit(t, emits[3], 19, 24, "börkü") 59 | EqualInt(t, 4, len(emits)) 60 | } 61 | 62 | func TestTokenize(t *testing.T) { 63 | text := "常记溪亭日暮,沉醉不知归路。兴尽晚回舟,误入藕花深处。争渡,争渡,惊起一滩鸥鹭。" 64 | trie := New("溪亭", "归路", "藕花", "争渡") 65 | emits := trie.FindAll(text, false) 66 | tokens := Tokenize(emits, text) 67 | t.Log(len(emits), emits) 68 | t.Log(len(tokens), tokens) 69 | EqualToken(t, tokens[0], -1, -1, "常记") 70 | EqualToken(t, tokens[1], 2, 4, "溪亭") 71 | EqualToken(t, tokens[2], -1, -1, "日暮,沉醉不知") 72 | EqualToken(t, tokens[3], 11, 13, "归路") 73 | EqualToken(t, tokens[4], -1, -1, "。兴尽晚回舟,误入") 74 | EqualToken(t, tokens[5], 22, 24, "藕花") 75 | EqualToken(t, tokens[6], -1, -1, "深处。") 76 | EqualToken(t, tokens[7], 27, 29, "争渡") 77 | EqualToken(t, tokens[8], -1, -1, ",") 78 | EqualToken(t, tokens[9], 30, 32, "争渡") 79 | EqualToken(t, tokens[10], -1, -1, ",惊起一滩鸥鹭。") 80 | EqualInt(t, 5, len(emits)) 81 | EqualInt(t, 11, len(tokens)) 82 | } 83 | 84 | func TestReplace(t *testing.T) { 85 | text := "我正在参加砍价,砍到0元就可以免费拿啦。亲~帮我砍一刀呗,咱们一起免费领好货。" 86 | trie := New("0元", "砍一刀", "免费拿", "免费领") 87 | emits := trie.FindAll(text, false) 88 | r1 := Replace(emits, text, "*") 89 | r2 := Replace(emits, text, "@#$%^&*") 90 | t.Log(emits) 91 | t.Log(r1) 92 | t.Log(r2) 93 | EqualString(t, "我正在参加砍价,砍到**就可以***啦。亲~帮我***呗,咱们一起***好货。", r1) 94 | EqualString(t, "我正在参加砍价,砍到%^就可以#$%啦。亲~帮我%^&呗,咱们一起&*@好货。", r2) 95 | EqualInt(t, 4, len(emits)) 96 | } 97 | 98 | func TestOverlaps(t *testing.T) { 99 | text := "a123,456b" 100 | trie := New("123", "12", 
"23", "45", "56") 101 | emits := trie.FindAll(text, false) 102 | t.Log(emits) 103 | removed := RemoveOverlaps(emits) 104 | t.Log(emits) 105 | t.Log(removed) 106 | EqualEmit(t, removed[0], 1, 4, "123") 107 | EqualEmit(t, removed[1], 5, 7, "45") 108 | EqualInt(t, 5, len(emits)) 109 | EqualInt(t, 2, len(removed)) 110 | } 111 | 112 | func TestContains(t *testing.T) { 113 | text := "a123,456b" 114 | trie := New("123", "12", "23", "45", "56") 115 | emits := trie.FindAll(text, false) 116 | t.Log(emits) 117 | removed := RemoveContains(emits) 118 | t.Log(emits) 119 | t.Log(removed) 120 | EqualEmit(t, removed[0], 1, 4, "123") 121 | EqualEmit(t, removed[1], 5, 7, "45") 122 | EqualEmit(t, removed[2], 6, 8, "56") 123 | EqualInt(t, 5, len(emits)) 124 | EqualInt(t, 3, len(removed)) 125 | } 126 | 127 | func TestDuplicate(t *testing.T) { 128 | text := "123456" 129 | trie := New("123", "123", "456", "456") 130 | emits := trie.FindAll(text, false) 131 | t.Log(emits) 132 | EqualEmit(t, emits[0], 0, 3, "123") 133 | EqualEmit(t, emits[1], 3, 6, "456") 134 | EqualInt(t, 2, len(emits)) 135 | } 136 | 137 | func TestAddKeywords(t *testing.T) { 138 | text := "ushers" 139 | trie1 := New("he", "she", "his", "hers") 140 | trie2 := New().AddKeywords("he", "she", "his", "hers") 141 | trie3 := New().AddKeywords("he").AddKeywords("she").AddKeywords("his").AddKeywords("hers") 142 | emits1 := trie1.FindAll(text, false) 143 | emits2 := trie2.FindAll(text, false) 144 | emits3 := trie3.FindAll(text, false) 145 | t.Log(emits1) 146 | t.Log(emits2) 147 | t.Log(emits3) 148 | EqualEmits(t, emits1, emits2) 149 | EqualEmits(t, emits1, emits3) 150 | EqualEmits(t, emits2, emits3) 151 | } 152 | 153 | func TestEmoji(t *testing.T) { 154 | t.Log("utf8.RuneCountInString(\"🐼\") >>", utf8.RuneCountInString("🐼")) 155 | t.Log("len(\"🐼\") >>", len("🐼")) 156 | EqualInt(t, 1, utf8.RuneCountInString("🐼")) 157 | EqualInt(t, 4, len("🐼")) 158 | text := "I love 🐼 very much." 
159 | trie := New("🐼", "🐻") 160 | emits := trie.FindAll(text, false) 161 | t.Log(emits) 162 | EqualEmit(t, emits[0], 7, 8, "🐼") 163 | EqualInt(t, 1, len(emits)) 164 | } 165 | 166 | func TestFile(t *testing.T) { 167 | keywords, err := readFile("keywords.txt") 168 | if err != nil { 169 | t.Error(err) 170 | return 171 | } 172 | text := "请在这里输入关键词" 173 | trie := New(keywords...) 174 | emits := trie.FindAll(text, false) 175 | t.Log(emits) 176 | } 177 | 178 | func readFile(filename string) ([]string, error) { 179 | f, err := os.Open(filename) 180 | if err != nil { 181 | return nil, err 182 | } 183 | lines := make([]string, 0, 15000) 184 | s := bufio.NewScanner(f) 185 | for s.Scan() { 186 | lines = append(lines, s.Text()) 187 | } 188 | err = s.Err() 189 | if err != nil { 190 | return nil, err 191 | } 192 | err = f.Close() 193 | if err != nil { 194 | return nil, err 195 | } 196 | return lines, nil 197 | } 198 | 199 | func EqualInt(t *testing.T, expected int, actual int) { 200 | if expected != actual { 201 | t.Error(expected, actual) 202 | } 203 | } 204 | 205 | func EqualString(t *testing.T, expected string, actual string) { 206 | if expected != actual { 207 | t.Error(expected, actual) 208 | } 209 | } 210 | 211 | func EqualEmit(t *testing.T, emit *Emit, begin int, end int, kw string) { 212 | if emit.Begin != begin || emit.End != end || emit.Keyword != kw { 213 | t.Error(emit) 214 | } 215 | } 216 | 217 | func EqualEmits(t *testing.T, emits1 []*Emit, emits2 []*Emit) { 218 | if len(emits1) != len(emits2) { 219 | t.Error(emits1, emits2) 220 | return 221 | } 222 | for i := 0; i < len(emits1); i++ { 223 | emit1, emit2 := emits1[i], emits2[i] 224 | if !emit1.Equals(emit2) { 225 | t.Error(emits1, emits2) 226 | return 227 | } 228 | } 229 | } 230 | 231 | func EqualToken(t *testing.T, token *Token, begin int, end int, kw string) { 232 | if token.Fragment != kw { 233 | t.Error(token) 234 | } 235 | if token.IsMatch() { 236 | EqualEmit(t, token.Emit, begin, end, kw) 237 | } 238 | } 239 | 
// ---- /trie.go ----
// package trie
// imports: container/list, fmt, sort, unicode, unicode/utf8

// Emit is one keyword occurrence found in a text. Begin and End are rune
// offsets (not byte offsets) into the searched text; End is exclusive.
type Emit struct {
	Begin, End int
	Keyword    string
}

// Length returns the number of runes covered by the emit.
func (e *Emit) Length() int {
	return e.End - e.Begin
}

// Equals reports whether both emits cover the same range with the same keyword.
func (e *Emit) Equals(o *Emit) bool {
	return e.Begin == o.Begin && e.End == o.End && e.Keyword == o.Keyword
}

// Overlaps reports whether the two half-open ranges share at least one rune.
func (e *Emit) Overlaps(o *Emit) bool {
	return e.Begin < o.End && e.End > o.Begin
}

// Contains reports whether o's range lies entirely inside e's range.
func (e *Emit) Contains(o *Emit) bool {
	return e.Begin <= o.Begin && e.End >= o.End
}

// String formats the emit as "begin:end=keyword".
func (e *Emit) String() string {
	return fmt.Sprintf("%d:%d=%s", e.Begin, e.End, e.Keyword)
}

// Token is a fragment of a tokenized text. Emit is nil for the text between
// matches and non-nil when the fragment is a keyword match.
type Token struct {
	Fragment string
	Emit     *Emit
}

// IsMatch reports whether the token corresponds to a keyword match.
func (t *Token) IsMatch() bool {
	return t.Emit != nil
}

// String returns the fragment, followed by "(emit)" for match tokens.
func (t *Token) String() string {
	if t.Emit == nil {
		return t.Fragment
	}
	return fmt.Sprintf("%s(%v)", t.Fragment, t.Emit)
}

// Keyword is a registered keyword together with its length in runes.
type Keyword struct {
	value  string
	length int
}

// State is a node of the Aho-Corasick automaton.
type State struct {
	depth    int             // distance from the root; 0 for the root itself
	success  map[rune]*State // goto transitions, allocated lazily
	failure  *State          // fallback state on a mismatch; nil for the root
	keywords []*Keyword      // keywords that end at this state
}

// NextState returns the state reached from s on c. On a miss the root returns
// itself, while any other state returns nil so the caller can follow the
// failure link.
func (s *State) NextState(c rune, ignoreCase bool) *State {
	if next := s.GetState(c, ignoreCase); next != nil {
		return next
	}
	if s.depth == 0 {
		return s
	}
	return nil
}

// GetState returns the direct transition for c, or nil if there is none.
// With ignoreCase set, a miss on c is retried with c's opposite letter case.
func (s *State) GetState(c rune, ignoreCase bool) *State {
	if s.success == nil {
		return nil
	}
	if state, exists := s.success[c]; exists {
		return state
	}
	if !ignoreCase {
		return nil
	}
	cc := c
	switch {
	case unicode.IsLower(c):
		cc = unicode.ToUpper(c)
	case unicode.IsUpper(c):
		cc = unicode.ToLower(c)
	}
	if cc == c {
		return nil
	}
	// A missing key yields the zero value, i.e. nil.
	return s.success[cc]
}

// AddState inserts the path spelled by str below s, creating states as
// needed, and returns the final state of that path.
func (s *State) AddState(str string) *State {
	state := s
	for _, r := range []rune(str) {
		state = state.addState(r)
	}
	return state
}

// addState returns the child of s for c, creating it if it does not exist.
func (s *State) addState(c rune) *State {
	if s.success == nil {
		s.success = make(map[rune]*State)
	}
	if state, exists := s.success[c]; exists {
		return state
	}
	ns := &State{depth: s.depth + 1}
	s.success[c] = ns
	return ns
}

// HasKeyword reports whether keyword is already attached to this state.
func (s *State) HasKeyword(keyword string) bool {
	for _, kw := range s.keywords {
		if kw.value == keyword {
			return true
		}
	}
	return false
}

// AddKeyword attaches keyword to this state unless it is already present.
func (s *State) AddKeyword(keyword string) {
	s.ensureKeywords()
	if !s.HasKeyword(keyword) {
		s.keywords = append(s.keywords, &Keyword{
			value:  keyword,
			length: utf8.RuneCountInString(keyword),
		})
	}
}

// AddKeywords attaches every keyword not already present on this state.
func (s *State) AddKeywords(keywords []*Keyword) {
	if len(keywords) == 0 {
		return
	}
	s.ensureKeywords()
	for _, keyword := range keywords {
		if !s.HasKeyword(keyword.value) {
			s.keywords = append(s.keywords, keyword)
		}
	}
}

// ensureKeywords allocates the keyword slice on first use.
func (s *State) ensureKeywords() {
	if s.keywords == nil {
		s.keywords = make([]*Keyword, 0, 2)
	}
}

// Trie is an Aho-Corasick automaton over a set of keywords.
type Trie struct {
	root *State
}

// New returns a Trie containing the given keywords.
func New(keywords ...string) *Trie {
	t := Trie{root: &State{depth: 0}}
	if len(keywords) > 0 {
		t.AddKeywords(keywords...)
	}
	return &t
}

// AddKeywords registers additional keywords (empty strings are skipped) and
// rebuilds every failure link. It returns t so calls can be chained.
func (t *Trie) AddKeywords(keywords ...string) *Trie {
	for _, keyword := range keywords {
		if len(keyword) > 0 {
			t.root.AddState(keyword).AddKeyword(keyword)
		}
	}

	// Breadth-first walk: a state's failure target is always shallower, so it
	// is fully resolved before the state itself is visited.
	queue := list.New()
	for _, state := range t.root.success {
		state.failure = t.root
		queue.PushBack(state)
	}
	for queue.Len() > 0 {
		state := queue.Remove(queue.Front()).(*State)
		for c, next := range state.success {
			// Follow failure links until some state accepts c; the root
			// always does, which terminates the loop.
			f := state.failure
			fn := f.NextState(c, false)
			for fn == nil {
				f = f.failure
				fn = f.NextState(c, false)
			}
			next.failure = fn
			// Keywords ending at the failure target also end here.
			next.AddKeywords(fn.keywords)
			queue.PushBack(next)
		}
	}
	return t
}

// FindAll returns every keyword occurrence in text, ordered by end position.
// Offsets are in runes. The result is empty but non-nil when nothing matches.
func (t *Trie) FindAll(text string, ignoreCase bool) []*Emit {
	emits := make([]*Emit, 0, 10)
	state := t.root
	end := 0 // rune offset just past the current rune
	for _, r := range text {
		end++
		state = t.nextState(state, r, ignoreCase)
		for _, kw := range state.keywords {
			emits = append(emits, &Emit{Begin: end - kw.length, End: end, Keyword: kw.value})
		}
	}
	return emits
}

// FindFirst returns the occurrence that ends earliest in text, or nil when no
// keyword occurs. Offsets are in runes.
func (t *Trie) FindFirst(text string, ignoreCase bool) *Emit {
	state := t.root
	end := 0 // rune offset just past the current rune
	for _, r := range text {
		end++
		state = t.nextState(state, r, ignoreCase)
		if len(state.keywords) > 0 {
			kw := state.keywords[0]
			return &Emit{Begin: end - kw.length, End: end, Keyword: kw.value}
		}
	}
	return nil
}

// nextState advances the automaton from state on c, following failure links
// until a transition exists.
func (t *Trie) nextState(state *State, c rune, ignoreCase bool) *State {
	next := state.NextState(c, ignoreCase)
	for next == nil {
		state = state.failure
		next = state.NextState(c, ignoreCase)
	}
	return next
}

// Tokenize splits source into match tokens (one per emit) and gap tokens for
// the text in between. Emits fully contained in another emit are dropped
// first; emits that merely overlap are all kept, so their fragments repeat the
// shared runes. The emits must have been produced from source.
func Tokenize(emits []*Emit, source string) []*Token {
	emits = RemoveContains(emits)
	el := len(emits)
	if el == 0 {
		return []*Token{{Fragment: source}}
	}
	index := 0
	runes := []rune(source)
	tokens := make([]*Token, 0, el*2+1)
	for _, emit := range emits {
		if index < emit.Begin {
			tokens = append(tokens, &Token{Fragment: string(runes[index:emit.Begin])})
		}
		tokens = append(tokens, &Token{Fragment: string(runes[emit.Begin:emit.End]), Emit: emit})
		index = emit.End
	}
	if last := emits[el-1]; last.End < len(runes) {
		tokens = append(tokens, &Token{Fragment: string(runes[last.End:])})
	}
	return tokens
}

// Replace masks every matched rune of source with a rune of replacement. The
// mask rune is chosen by the rune's position in source modulo the mask length,
// so a multi-rune replacement is applied cyclically. An empty replacement
// removes the matched runes instead (this case used to panic with a division
// by zero). The emits must have been produced from source.
func Replace(emits []*Emit, source string, replacement string) string {
	emits = RemoveContains(emits)
	if len(emits) == 0 {
		return source
	}
	runes := []rune(source)
	masks := []rune(replacement)
	ml := len(masks)
	if ml == 0 {
		return deleteEmits(runes, emits)
	}
	for _, emit := range emits {
		for j := emit.Begin; j < emit.End; j++ {
			runes[j] = masks[j%ml]
		}
	}
	return string(runes)
}

// deleteEmits returns runes with every range covered by emits removed.
// emits must be sorted by Begin, as RemoveContains returns them.
func deleteEmits(runes []rune, emits []*Emit) string {
	kept := make([]rune, 0, len(runes))
	index := 0
	for _, emit := range emits {
		if index < emit.Begin {
			kept = append(kept, runes[index:emit.Begin]...)
		}
		// Overlapping emits may start before index; never move backwards.
		if index < emit.End {
			index = emit.End
		}
	}
	kept = append(kept, runes[index:]...)
	return string(kept)
}

// RemoveOverlaps returns the emits sorted by position, dropping each emit that
// overlaps the previously kept one. The input slice is not modified.
func RemoveOverlaps(emits []*Emit) []*Emit {
	return removeEmits(emits, func(a, b *Emit) bool {
		return a.Overlaps(b)
	})
}

// RemoveContains returns the emits sorted by position, dropping each emit that
// is fully contained in the previously kept one. The input slice is not modified.
func RemoveContains(emits []*Emit) []*Emit {
	return removeEmits(emits, func(a, b *Emit) bool {
		return a.Contains(b)
	})
}

// removeEmits sorts a copy of emits and keeps an emit only when predicate is
// false for (last kept emit, candidate). It returns nil for empty input.
func removeEmits(emits []*Emit, predicate func(a, b *Emit) bool) []*Emit {
	el := len(emits)
	if el < 1 {
		return nil
	}
	if el == 1 {
		return []*Emit{emits[0]}
	}
	replica := make([]*Emit, el)
	copy(replica, emits)
	sortEmits(replica)
	emit := replica[0]
	sorted := make([]*Emit, 0, el)
	sorted = append(sorted, emit)
	for _, next := range replica[1:] {
		if !predicate(emit, next) {
			sorted = append(sorted, next)
			emit = next
		}
	}
	return sorted
}

// sortEmits orders emits by ascending Begin and, for equal Begin, by
// descending End, so the longest emit at a position comes first.
func sortEmits(emits []*Emit) {
	sort.Slice(emits, func(i, j int) bool {
		a, b := emits[i], emits[j]
		if a.Begin != b.Begin {
			return a.Begin < b.Begin
		}
		return a.End > b.End
	})
}