├── .travis.yml ├── words_test.txt ├── .gitignore ├── .editorconfig ├── words_filter_test.go ├── LICENSE ├── README.md ├── words_filter.go └── node.go /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.9.x -------------------------------------------------------------------------------- /words_test.txt: -------------------------------------------------------------------------------- 1 | Miyamoto Musashi 2 | 妲己 3 | アンジェラ 4 | ความรุ่งโรจน์ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.o 3 | *.a 4 | *.so 5 | *.exe 6 | *.test 7 | /out/ 8 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # top-most EditorConfig file 2 | root = true 3 | 4 | # all files 5 | [*] 6 | indent_style = tab 7 | indent_size = 4 -------------------------------------------------------------------------------- /words_filter_test.go: -------------------------------------------------------------------------------- 1 | package wordsfilter 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestWordsFilter(t *testing.T) { 8 | texts := []string{ 9 | "Miyamoto Musashi", 10 | "妲己", 11 | "アンジェラ", 12 | "ความรุ่งโรจน์", 13 | } 14 | wf := New() 15 | root := wf.Generate(texts) 16 | wf.Remove("shif", root) 17 | c1 := wf.Contains("アン", root) 18 | if c1 != false { 19 | t.Errorf("Test Contains expect false, get %T, %v", c1, c1) 20 | } 21 | c2 := wf.Contains("->アンジェラ2333", root) 22 | if c2 != true { 23 | t.Errorf("Test Contains expect true, get %T, %v", c2, c2) 24 | } 25 | r1 := wf.Replace("Game ความรุ่งโรจน์ i like 妲己 heroMiyamotoMusashi", root) 26 | if r1 != "Game*************ilike**hero***************" { 27 | t.Errorf("Test Replace expect 
Game*************ilike**hero***************,get %T,%v", r1, r1) 28 | } 29 | // Test generated with file. 30 | root, _ = wf.GenerateWithFile("./words_test.txt") 31 | if wf.Contains("アンジェラ", root) != true { 32 | t.Errorf("Test Contains expect true, get %T, %v", c2, c2) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Openset 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-wordsfilter 2 | A high performance text filter. 
3 | 4 | ## Download & Install 5 | ```shell 6 | go get github.com/syyongx/go-wordsfilter 7 | ``` 8 | 9 | ## Quick Start 10 | ```go 11 | import ( 12 | "github.com/syyongx/go-wordsfilter" 13 | ) 14 | 15 | func main() { 16 | texts := []string{ 17 | "Miyamoto Musashi", 18 | "妲己", 19 | "アンジェラ", 20 | "ความรุ่งโรจน์", 21 | } 22 | wf := wordsfilter.New() 23 | 24 | // Generate 25 | root := wf.Generate(texts) 26 | // Generate with file 27 | // root, err := wf.GenerateWithFile(path) 28 | 29 | // Contains 30 | c1 := wf.Contains("アン", root) 31 | // c1: false 32 | c2 := wf.Contains("アンジェラ", root) 33 | // c2: true 34 | 35 | // Remove 36 | wf.Remove("アンジェラ", root) 37 | c3 := wf.Contains("アンジェラ", root) 38 | // c3: false 39 | 40 | // Replace 41 | r1 := wf.Replace("Game ความรุ่งโรจน์ i like 妲己 heroMiyamotoMusashi", root) 42 | // r1: Game*************ilike**hero*************** 43 | } 44 | ``` 45 | 46 | ## APIs 47 | ```go 48 | New() *WordsFilter 49 | Generate(texts []string) map[string]*Node 50 | GenerateWithFile(path string) (map[string]*Node, error) 51 | Add(text string, root map[string]*Node) 52 | Replace(text string, root map[string]*Node) string 53 | Contains(text string, root map[string]*Node) bool 54 | Remove(text string, root map[string]*Node) 55 | ``` 56 | 57 | ## LICENSE 58 | go-wordsfilter source code is licensed under the [MIT](https://github.com/syyongx/go-wordsfilter/blob/master/LICENSE) License. 59 | -------------------------------------------------------------------------------- /words_filter.go: -------------------------------------------------------------------------------- 1 | package wordsfilter 2 | 3 | import ( 4 | "sync" 5 | "strings" 6 | "bytes" 7 | "os" 8 | "bufio" 9 | "io" 10 | ) 11 | 12 | var DefaultPlaceholder = "*" 13 | var DefaultStripSpace = true 14 | 15 | type WordsFilter struct { 16 | Placeholder string 17 | StripSpace bool 18 | node *Node 19 | mutex sync.RWMutex 20 | } 21 | 22 | // New creates a words filter.
23 | func New() *WordsFilter { 24 | return &WordsFilter{ 25 | Placeholder: DefaultPlaceholder, 26 | StripSpace: DefaultStripSpace, 27 | node: NewNode(make(map[string]*Node), ""), 28 | } 29 | } 30 | 31 | // Convert sensitive text lists into sensitive word tree nodes 32 | func (wf *WordsFilter) Generate(texts []string) map[string]*Node { 33 | root := make(map[string]*Node) 34 | for _, text := range texts { 35 | wf.Add(text, root) 36 | } 37 | return root 38 | } 39 | 40 | // Convert sensitive text from file into sensitive word tree nodes. 41 | // File content format, please wrap every sensitive word. 42 | func (wf *WordsFilter) GenerateWithFile(path string) (map[string]*Node, error) { 43 | fd, err := os.Open(path) 44 | if err != nil { 45 | return nil, err 46 | } 47 | defer fd.Close() 48 | buf := bufio.NewReader(fd) 49 | var texts []string 50 | for { 51 | line, _, err := buf.ReadLine() 52 | if err != nil { 53 | if err == io.EOF { 54 | break 55 | } else { 56 | return nil, err 57 | } 58 | } 59 | text := strings.TrimSpace(string(line)) 60 | if text == "" { 61 | continue 62 | } 63 | texts = append(texts, text) 64 | } 65 | 66 | root := wf.Generate(texts) 67 | return root, nil 68 | } 69 | 70 | // Add sensitive words to specified sensitive words Map. 71 | func (wf *WordsFilter) Add(text string, root map[string]*Node) { 72 | if wf.StripSpace { 73 | text = stripSpace(text) 74 | } 75 | wf.mutex.Lock() 76 | defer wf.mutex.Unlock() 77 | wf.node.add(text, root, wf.Placeholder) 78 | } 79 | 80 | // Replace sensitive words in strings and return new strings. 81 | func (wf *WordsFilter) Replace(text string, root map[string]*Node) string { 82 | if wf.StripSpace { 83 | text = stripSpace(text) 84 | } 85 | wf.mutex.RLock() 86 | defer wf.mutex.RUnlock() 87 | return wf.node.replace(text, root) 88 | } 89 | 90 | // Whether the string contains sensitive words. 
91 | func (wf *WordsFilter) Contains(text string, root map[string]*Node) bool { 92 | if wf.StripSpace { 93 | text = stripSpace(text) 94 | } 95 | wf.mutex.RLock() 96 | defer wf.mutex.RUnlock() 97 | return wf.node.contains(text, root) 98 | } 99 | 100 | // Remove deletes text from the sensitive word map root; whitespace is stripped from text first when StripSpace is set, and the write lock is held while the map is modified. 101 | func (wf *WordsFilter) Remove(text string, root map[string]*Node) { 102 | if wf.StripSpace { 103 | text = stripSpace(text) 104 | } 105 | wf.mutex.Lock() 106 | defer wf.mutex.Unlock() 107 | wf.node.remove(text, root) 108 | } 109 | 110 | // stripSpace returns str with all whitespace removed, by concatenating the fields produced by strings.Fields. 111 | func stripSpace(str string) string { 112 | fields := strings.Fields(str) 113 | var bf bytes.Buffer 114 | for _, field := range fields { 115 | bf.WriteString(field) 116 | } 117 | return bf.String() 118 | } 119 | -------------------------------------------------------------------------------- /node.go: -------------------------------------------------------------------------------- 1 | package wordsfilter 2 | 3 | import ( 4 | "bytes" 5 | "strings" 6 | ) 7 | // Node is one element of the sensitive word tree: Child maps a single character to the next node, and Placeholders holds the replacement string of a word ending at this node (empty when no word ends here). 8 | type Node struct { 9 | Child map[string]*Node 10 | Placeholders string 11 | } 12 | 13 | // NewNode creates a node with the given child map and placeholders string. 14 | func NewNode(child map[string]*Node, placeholders string) *Node { 15 | return &Node{ 16 | Child: child, 17 | Placeholders: placeholders, 18 | } 19 | } 20 | 21 | // Add sensitive words to specified sensitive words Map.
22 | func (node *Node) add(text string, root map[string]*Node, placeholder string) { 23 | if text == "" { 24 | return 25 | } 26 | textr := []rune(text) 27 | end := len(textr) - 1 28 | for i := 0; i <= end; i++ { 29 | word := string(textr[i]) 30 | if n, ok := root[word]; ok { // contains key 31 | if i == end { // the last 32 | n.Placeholders = strings.Repeat(placeholder, end+1) 33 | } else { 34 | if n.Child != nil { 35 | root = n.Child 36 | } else { 37 | root = make(map[string]*Node) 38 | n.Child = root 39 | } 40 | } 41 | } else { 42 | placeholders, child := "", make(map[string]*Node) 43 | if i == end { 44 | placeholders = strings.Repeat(placeholder, end+1) 45 | } 46 | root[word] = NewNode(child, placeholders) 47 | root = child 48 | } 49 | } 50 | } 51 | 52 | // Remove specified sensitive words from sensitive word map. 53 | func (node *Node) remove(text string, root map[string]*Node) { 54 | textr := []rune(text) 55 | end := len(textr) - 1 56 | for i := 0; i <= end; i++ { 57 | word := string(textr[i]) 58 | if n, ok := root[word]; ok { 59 | if i == end { 60 | n.Placeholders = "" 61 | } else { 62 | root = n.Child 63 | } 64 | } else { 65 | return 66 | } 67 | } 68 | } 69 | 70 | // Replace sensitive words in strings and return new strings. 71 | // Follow the principle of maximum matching. 
72 | func (node *Node) replace(text string, root map[string]*Node) string { 73 | if root == nil || text == "" { 74 | return text 75 | } 76 | textr := []rune(text) 77 | i, s, e, l := 0, 0, 0, len(textr) 78 | bf := bytes.Buffer{} 79 | words := make(map[string]*Node) 80 | var back []*Node 81 | loop: 82 | for e < l { 83 | words = root 84 | i = e 85 | // Maximum Matching Principle, Matching Backwards First 86 | for ; i < l; i ++ { 87 | word := string(textr[i]) 88 | if n, ok := words[word]; ok { 89 | back = append(back, n) 90 | if n.Child != nil { 91 | words = n.Child 92 | } else if n.Placeholders != "" { 93 | bf.WriteString(string(textr[s:e])) 94 | bf.WriteString(n.Placeholders) 95 | i++ 96 | s, e = i, i 97 | continue loop 98 | } else { 99 | break 100 | } 101 | } else if n != nil && n.Placeholders != "" { 102 | bf.WriteString(string(textr[s:e])) 103 | bf.WriteString(n.Placeholders) 104 | s, e = i, i 105 | continue loop 106 | } else { 107 | break 108 | } 109 | } 110 | // Backward match fails, backtracking. 111 | for ; i > e; i-- { 112 | bl := len(back) 113 | if bl == 0 { 114 | break 115 | } 116 | last := back[bl-1] 117 | back = back[:bl-1] 118 | if last.Placeholders != "" { 119 | bf.WriteString(string(textr[s:e])) 120 | bf.WriteString(last.Placeholders) 121 | s, e = i, i 122 | continue loop 123 | } 124 | } 125 | 126 | e++ 127 | back = back[:0] 128 | } 129 | bf.WriteString(string(textr[s:e])) 130 | 131 | return bf.String() 132 | } 133 | 134 | // Whether the string contains sensitive words. 
135 | func (node *Node) contains(text string, root map[string]*Node) bool { 136 | if root == nil || text == "" { 137 | return false 138 | } 139 | textr := []rune(text) 140 | end := len(textr) - 1 141 | for i := 0; i <= end; i++ { 142 | word := string(textr[i]) 143 | if n, ok := root[word]; ok { 144 | if i == end { 145 | return n.Placeholders != "" 146 | } else { 147 | if len(n.Child) == 0 { // last 148 | return true 149 | } 150 | root = n.Child 151 | } 152 | } else { 153 | continue 154 | } 155 | } 156 | return false 157 | } 158 | --------------------------------------------------------------------------------