├── .gitignore ├── LICENSE ├── README.md ├── ahocorasick.go ├── ahocorasick_test.go ├── benchmark ├── benchmark.go ├── cn │ ├── dictionary.txt │ └── text.txt └── en │ ├── dictionary.txt │ └── text.txt ├── test_keywords_chn └── test_keywords_eng /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 hanshinan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Aho–Corasick algorithm 2 | 3 | #### Intro 4 | 5 | A faster and more efficient implementation of the *Aho-Corasick algorithm* in Go that supports both Chinese and English. To improve performance and reduce memory usage, the program uses a *Double Array Trie* instead of the common *Linked List Trie*. In the benchmark, `it is 10 times faster than the most popular AC algorithm implementation in Go on GitHub and uses a tenth of its memory`. You can find more information in the Benchmark section. 6 | 7 | This project is inspired by [hankcs/AhoCorasickDoubleArrayTrie](https://github.com/hankcs/AhoCorasickDoubleArrayTrie). 8 | 9 | Besides multi-pattern search using the AC algorithm, the program also provides "exact match search" using the Double Array Trie. 10 | 11 | The Aho-Corasick algorithm was first presented in the paper below: 12 | 13 | > [Efficient string matching: an aid to bibliographic search](http://dl.acm.org/citation.cfm?id=360855) 14 | 15 | The Wikipedia article is: [Aho–Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm) 16 | 17 | #### Usage 18 | 19 | **Multi-Pattern Search Example** 20 | 21 | package main 22 | 23 | import ( 24 | "bufio" 25 | "bytes" 26 | "fmt" 27 | "io" 28 | "os" 29 | ) 30 | 31 | import ( 32 | "github.com/anknown/ahocorasick" 33 | ) 34 | 35 | func ReadRunes(filename string) ([][]rune, error) { 36 | dict := [][]rune{} 37 | 38 | f, err := os.OpenFile(filename, os.O_RDONLY, 0660) 39 | if err != nil { 40 | return nil, err 41 | } 42 | 43 | r := bufio.NewReader(f) 44 | for { 45 | l, err := r.ReadBytes('\n') 46 | if err != nil || err == io.EOF { 47 | break 48 | } 49 | l = bytes.TrimSpace(l) 50 | dict = append(dict, bytes.Runes(l)) 51 | } 52 | 53 | return dict, nil 54 | } 55 | 56 | func main() { 57 | dict, err := ReadRunes("your_dict_files") 58 | if err != nil { 59 | 
fmt.Println(err) 60 | return 61 | } 62 | 63 | content := []rune("your text") 64 | 65 | m := new(goahocorasick.Machine) 66 | if err := m.Build(dict); err != nil { 67 | fmt.Println(err) 68 | return 69 | } 70 | 71 | terms := m.MultiPatternSearch(content, false) 72 | for _, t := range terms { 73 | fmt.Printf("%d %s\n", t.Pos, string(t.Word)) 74 | } 75 | } 76 | 77 | This library does not provide a file-reading API because your dictionary may come from another source. 78 | 79 | #### Benchmark 80 | 81 | **Multi-Pattern Search** 82 | 83 | Compared with `cloudflare/ahocorasick`, which has the most stars and forks of all the implementations written in Go. 84 | 85 | To run the benchmark, go to the `benchmark` directory: 86 | 87 | go build benchmark.go 88 | 89 | ./benchmark 90 | 91 | 92 | * For Chinese Test 93 | 94 | *Dictionary* contains `153,151` words, *Text* contains `777,277` words 95 | 96 | ==================================================================== 97 | cost(milliseconds) memory usage(MBytes) 98 | cloudflare/ahocorasick 28926 1911 99 | anknown/ahocorasick 1814 155 100 | ==================================================================== 101 | 102 | * For English Test 103 | 104 | *Dictionary* contains `127,141` words, *Text* contains `674,669` words 105 | 106 | ==================================================================== 107 | time(milliseconds) memory usage(MBytes) 108 | cloudflare/ahocorasick 19835 1340 109 | anknown/ahocorasick 1619 203 110 | ==================================================================== 111 | 112 | #### License 113 | 114 | MIT License 115 | -------------------------------------------------------------------------------- /ahocorasick.go: -------------------------------------------------------------------------------- 1 | package goahocorasick 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | import ( 8 | "github.com/anknown/darts" 9 | ) 10 | 11 | const FAIL_STATE = -1 12 | const ROOT_STATE = 1 13 | 14 | type Machine struct { 15 | trie *godarts.DoubleArrayTrie 16 | failure []int 17 
| output map[int]([][]rune) 18 | } 19 | 20 | type Term struct { 21 | Pos int 22 | Word []rune 23 | } 24 | 25 | func (m *Machine) Build(keywords [][]rune) (err error) { 26 | if len(keywords) == 0 { 27 | return fmt.Errorf("empty keywords") 28 | } 29 | 30 | d := new(godarts.Darts) 31 | 32 | trie := new(godarts.LinkedListTrie) 33 | m.trie, trie, err = d.Build(keywords) 34 | if err != nil { 35 | return err 36 | } 37 | 38 | m.output = make(map[int]([][]rune), 0) 39 | for idx, val := range d.Output { 40 | m.output[idx] = append(m.output[idx], val) 41 | } 42 | 43 | queue := make([](*godarts.LinkedListTrieNode), 0) 44 | m.failure = make([]int, len(m.trie.Base)) 45 | for _, c := range trie.Root.Children { 46 | m.failure[c.Base] = godarts.ROOT_NODE_BASE 47 | } 48 | queue = append(queue, trie.Root.Children...) 49 | 50 | for { 51 | if len(queue) == 0 { 52 | break 53 | } 54 | 55 | node := queue[0] 56 | for _, n := range node.Children { 57 | if n.Base == godarts.END_NODE_BASE { 58 | continue 59 | } 60 | inState := m.f(node.Base) 61 | set_state: 62 | outState := m.g(inState, n.Code-godarts.ROOT_NODE_BASE) 63 | if outState == FAIL_STATE { 64 | inState = m.f(inState) 65 | goto set_state 66 | } 67 | if _, ok := m.output[outState]; ok != false { 68 | copyOutState := make([][]rune, 0) 69 | for _, o := range m.output[outState] { 70 | copyOutState = append(copyOutState, o) 71 | } 72 | m.output[n.Base] = append(copyOutState, m.output[n.Base]...) 73 | } 74 | m.setF(n.Base, outState) 75 | } 76 | queue = append(queue, node.Children...) 
77 | queue = queue[1:] 78 | } 79 | 80 | return nil 81 | } 82 | 83 | func (m *Machine) PrintFailure() { 84 | fmt.Printf("+-----+-----+\n") 85 | fmt.Printf("|%5s|%5s|\n", "index", "value") 86 | fmt.Printf("+-----+-----+\n") 87 | for i, v := range m.failure { 88 | fmt.Printf("|%5d|%5d|\n", i, v) 89 | } 90 | fmt.Printf("+-----+-----+\n") 91 | } 92 | 93 | func (m *Machine) PrintOutput() { 94 | fmt.Printf("+-----+----------+\n") 95 | fmt.Printf("|%5s|%10s|\n", "index", "value") 96 | fmt.Printf("+-----+----------+\n") 97 | for i, v := range m.output { 98 | var val string 99 | for _, o := range v { 100 | val = val + " " + string(o) 101 | } 102 | fmt.Printf("|%5d|%10s|\n", i, val) 103 | } 104 | fmt.Printf("+-----+----------+\n") 105 | } 106 | 107 | func (m *Machine) g(inState int, input rune) (outState int) { 108 | if inState == FAIL_STATE { 109 | return ROOT_STATE 110 | } 111 | 112 | t := inState + int(input) + godarts.ROOT_NODE_BASE 113 | if t >= len(m.trie.Base) { 114 | if inState == ROOT_STATE { 115 | return ROOT_STATE 116 | } 117 | return FAIL_STATE 118 | } 119 | if inState == m.trie.Check[t] { 120 | return m.trie.Base[t] 121 | } 122 | 123 | if inState == ROOT_STATE { 124 | return ROOT_STATE 125 | } 126 | 127 | return FAIL_STATE 128 | } 129 | 130 | func (m *Machine) f(index int) (state int) { 131 | return m.failure[index] 132 | } 133 | 134 | func (m *Machine) setF(inState, outState int) { 135 | m.failure[inState] = outState 136 | } 137 | 138 | func (m *Machine) MultiPatternSearch(content []rune, returnImmediately bool) [](*Term) { 139 | terms := make([](*Term), 0) 140 | 141 | state := ROOT_STATE 142 | for pos, c := range content { 143 | start: 144 | if m.g(state, c) == FAIL_STATE { 145 | state = m.f(state) 146 | goto start 147 | } else { 148 | state = m.g(state, c) 149 | if val, ok := m.output[state]; ok != false { 150 | for _, word := range val { 151 | term := new(Term) 152 | term.Pos = pos - len(word) + 1 153 | term.Word = word 154 | terms = append(terms, term) 155 | 
if returnImmediately { 156 | return terms 157 | } 158 | } 159 | } 160 | } 161 | } 162 | 163 | return terms 164 | } 165 | 166 | func (m *Machine) ExactSearch(content []rune) [](*Term) { 167 | if m.trie.ExactMatchSearch(content, 0) { 168 | t := new(Term) 169 | t.Word = content 170 | t.Pos = 0 171 | return [](*Term){t} 172 | } 173 | 174 | return nil 175 | } 176 | -------------------------------------------------------------------------------- /ahocorasick_test.go: -------------------------------------------------------------------------------- 1 | package goahocorasick 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "fmt" 7 | "io" 8 | "os" 9 | "testing" 10 | ) 11 | 12 | func Read(filename string) ([][]rune, error) { 13 | dict := [][]rune{} 14 | 15 | f, err := os.OpenFile(filename, os.O_RDONLY, 0660) 16 | if err != nil { 17 | return nil, err 18 | } 19 | 20 | r := bufio.NewReader(f) 21 | for { 22 | l, err := r.ReadBytes('\n') 23 | if err != nil || err == io.EOF { 24 | break 25 | } 26 | l = bytes.TrimSpace(l) 27 | dict = append(dict, bytes.Runes(l)) 28 | } 29 | 30 | return dict, nil 31 | } 32 | 33 | func TestBuild(t *testing.T) { 34 | keywords, err := Read("test_keywords_eng") 35 | if err != nil { 36 | t.Error(err) 37 | } 38 | 39 | m := new(Machine) 40 | m.Build(keywords) 41 | //m.PrintFailure() 42 | //m.PrintOutput() 43 | } 44 | 45 | func TestMultiPatternSearchEnglish(t *testing.T) { 46 | fmt.Printf("===> MultiPattern Search For English \n") 47 | keywords, err := Read("test_keywords_eng") 48 | if err != nil { 49 | t.Error(err) 50 | } 51 | m := new(Machine) 52 | m.Build(keywords) 53 | //m.PrintFailure() 54 | //m.PrintOutput() 55 | 56 | content := []rune("ushers") 57 | terms := m.MultiPatternSearch(content, false) 58 | for _, term := range terms { 59 | fmt.Printf("find %s @%d in %s\n", string(term.Word), term.Pos, string(content)) 60 | } 61 | fmt.Printf("\n") 62 | } 63 | 64 | func TestMultiPatternSearchChinese(t *testing.T) { 65 | fmt.Printf("===> MultiPattern Search For 
Chinese \n") 66 | keywords, err := Read("test_keywords_chn") 67 | if err != nil { 68 | t.Error(err) 69 | } 70 | m := new(Machine) 71 | m.Build(keywords) 72 | //m.PrintFailure() 73 | //m.PrintOutput() 74 | 75 | content := []rune("你不会想到阿拉伯人会踢出阿根廷风格的足球更何况是埃及风格") 76 | terms := m.MultiPatternSearch(content, false) 77 | for _, term := range terms { 78 | fmt.Printf("find %s @%d in %s\n", string(term.Word), term.Pos, string(content)) 79 | } 80 | fmt.Printf("\n") 81 | } 82 | 83 | func TestExactSearchEnglish(t *testing.T) { 84 | fmt.Printf("===> Exact Search For English\n") 85 | keywords, err := Read("test_keywords_eng") 86 | if err != nil { 87 | t.Error(err) 88 | } 89 | m := new(Machine) 90 | m.Build(keywords) 91 | 92 | for _, k := range keywords { 93 | if m.ExactSearch(k) == nil { 94 | t.Error("exact search chinese failed") 95 | } 96 | } 97 | fmt.Printf("Test total:%d words\n\n", len(keywords)) 98 | } 99 | 100 | func TestExactSearchChinese(t *testing.T) { 101 | fmt.Printf("===> Exact Search For Chinese\n") 102 | keywords, err := Read("test_keywords_chn") 103 | if err != nil { 104 | t.Error(err) 105 | } 106 | m := new(Machine) 107 | m.Build(keywords) 108 | 109 | for _, k := range keywords { 110 | if m.ExactSearch(k) == nil { 111 | t.Error("exact search chinese failed") 112 | } 113 | } 114 | fmt.Printf("Test total:%d words\n\n", len(keywords)) 115 | } 116 | -------------------------------------------------------------------------------- /benchmark/benchmark.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "fmt" 7 | "io" 8 | "io/ioutil" 9 | "os" 10 | "time" 11 | ) 12 | 13 | import ( 14 | "github.com/anknown/ahocorasick" 15 | "github.com/cloudflare/ahocorasick" 16 | ) 17 | 18 | const CHN_DICT_FILE = "./cn/dictionary.txt" 19 | const CHN_TEXT_FILE = "./cn/text.txt" 20 | const ENG_DICT_FILE = "./en/dictionary.txt" 21 | const ENG_TEXT_FILE = "./en/text.txt" 22 | 23 | func 
ReadBytes(filename string) ([][]byte, error) { 24 | dict := [][]byte{} 25 | 26 | f, err := os.OpenFile(filename, os.O_RDONLY, 0660) 27 | if err != nil { 28 | return nil, err 29 | } 30 | 31 | r := bufio.NewReader(f) 32 | for { 33 | l, err := r.ReadBytes('\n') 34 | if err != nil || err == io.EOF { 35 | break 36 | } 37 | l = bytes.TrimSpace(l) 38 | dict = append(dict, l) 39 | } 40 | 41 | return dict, nil 42 | } 43 | 44 | func ReadRunes(filename string) ([][]rune, error) { 45 | dict := [][]rune{} 46 | 47 | f, err := os.OpenFile(filename, os.O_RDONLY, 0660) 48 | if err != nil { 49 | return nil, err 50 | } 51 | 52 | r := bufio.NewReader(f) 53 | for { 54 | l, err := r.ReadBytes('\n') 55 | if err != nil || err == io.EOF { 56 | break 57 | } 58 | l = bytes.TrimSpace(l) 59 | dict = append(dict, bytes.Runes(l)) 60 | } 61 | 62 | return dict, nil 63 | } 64 | 65 | func TestAEnglish() { 66 | fmt.Println("** English Benchmark of cloudflare/ahocorasick **") 67 | fmt.Println("-------------------------------------------------") 68 | fmt.Println("=> Start to Load... ") 69 | start := time.Now() 70 | dict, err := ReadBytes(ENG_DICT_FILE) 71 | if err != nil { 72 | fmt.Println(err) 73 | return 74 | } 75 | 76 | content, err := ioutil.ReadFile(ENG_TEXT_FILE) 77 | if err != nil { 78 | fmt.Println(err) 79 | return 80 | } 81 | end := time.Now() 82 | fmt.Printf("load file cost:%d(ms)\n", (end.UnixNano()-start.UnixNano())/(1000*1000)) 83 | 84 | fmt.Println("=> Start to Search... 
") 85 | start = time.Now() 86 | m := ahocorasick.NewMatcher(dict) 87 | 88 | //res := m.Match(content) 89 | m.Match(content) 90 | end = time.Now() 91 | 92 | fmt.Printf("search cost:%d(ms)\n", (end.UnixNano()-start.UnixNano())/(1000*1000)) 93 | 94 | /* 95 | for _, v := range res { 96 | fmt.Printf("%d\n", v) 97 | } 98 | */ 99 | } 100 | 101 | func TestAChinese() { 102 | fmt.Println("\n** Chinese Benchmark of cloudflare/ahocorasick **") 103 | fmt.Println("---------------------------------------------------") 104 | fmt.Println("=> Start to Load... ") 105 | start := time.Now() 106 | dict, err := ReadBytes(CHN_DICT_FILE) 107 | if err != nil { 108 | fmt.Println(err) 109 | return 110 | } 111 | 112 | content, err := ioutil.ReadFile(CHN_TEXT_FILE) 113 | if err != nil { 114 | fmt.Println(err) 115 | return 116 | } 117 | end := time.Now() 118 | fmt.Printf("load file cost:%d(ms)\n", (end.UnixNano()-start.UnixNano())/(1000*1000)) 119 | 120 | fmt.Println("=> Start to Search... ") 121 | start = time.Now() 122 | m := ahocorasick.NewMatcher(dict) 123 | 124 | //res := m.Match(content) 125 | m.Match(content) 126 | end = time.Now() 127 | 128 | fmt.Printf("search cost:%d(ms)\n", (end.UnixNano()-start.UnixNano())/(1000*1000)) 129 | 130 | /* 131 | for _, v := range res { 132 | fmt.Printf("%d\n", v) 133 | } 134 | */ 135 | } 136 | 137 | func TestBEnglish() { 138 | fmt.Println("\n** English Benchmark of anknown/ahocorasick **") 139 | fmt.Println("------------------------------------------------") 140 | fmt.Println("=> Start to Load... 
") 141 | start := time.Now() 142 | dict, err := ReadRunes(ENG_DICT_FILE) 143 | if err != nil { 144 | fmt.Println(err) 145 | return 146 | } 147 | 148 | content, err := ioutil.ReadFile(ENG_TEXT_FILE) 149 | if err != nil { 150 | fmt.Println(err) 151 | return 152 | } 153 | 154 | contentRune := bytes.Runes([]byte(content)) 155 | end := time.Now() 156 | fmt.Printf("load file cost:%d(ms)\n", (end.UnixNano()-start.UnixNano())/(1000*1000)) 157 | 158 | fmt.Println("=> Start to Search... ") 159 | start = time.Now() 160 | m := new(goahocorasick.Machine) 161 | if err := m.Build(dict); err != nil { 162 | fmt.Println(err) 163 | return 164 | } 165 | //terms := m.Search(contentRune) 166 | m.MultiPatternSearch(contentRune, false) 167 | end = time.Now() 168 | fmt.Printf("search cost:%d(ms)\n", (end.UnixNano()-start.UnixNano())/(1000*1000)) 169 | /* 170 | for _, t := range terms { 171 | fmt.Printf("%d %s\n", t.Pos, string(t.Word)) 172 | } 173 | */ 174 | } 175 | 176 | func TestBChinese() { 177 | fmt.Println("\n** Chinese Benchmark of anknown/ahocorasick **") 178 | fmt.Println("------------------------------------------------") 179 | fmt.Println("=> Start to Load... ") 180 | start := time.Now() 181 | dict, err := ReadRunes(CHN_DICT_FILE) 182 | if err != nil { 183 | fmt.Println(err) 184 | return 185 | } 186 | 187 | content, err := ioutil.ReadFile(CHN_TEXT_FILE) 188 | if err != nil { 189 | fmt.Println(err) 190 | return 191 | } 192 | 193 | contentRune := bytes.Runes([]byte(content)) 194 | end := time.Now() 195 | fmt.Printf("load file cost:%d(ms)\n", (end.UnixNano()-start.UnixNano())/(1000*1000)) 196 | 197 | fmt.Println("=> Start to Search... 
") 198 | start = time.Now() 199 | m := new(goahocorasick.Machine) 200 | if err := m.Build(dict); err != nil { 201 | fmt.Println(err) 202 | return 203 | } 204 | //terms := m.Search(contentRune) 205 | m.MultiPatternSearch(contentRune, false) 206 | end = time.Now() 207 | fmt.Printf("search cost:%d(ms)\n", (end.UnixNano()-start.UnixNano())/(1000*1000)) 208 | /* 209 | for _, t := range terms { 210 | fmt.Printf("%d %s\n", t.Pos, string(t.Word)) 211 | } 212 | */ 213 | } 214 | 215 | func main() { 216 | TestAEnglish() 217 | TestBEnglish() 218 | TestAChinese() 219 | TestBChinese() 220 | } 221 | -------------------------------------------------------------------------------- /test_keywords_chn: -------------------------------------------------------------------------------- 1 | 啊 2 | 埃及 3 | 阿胶 4 | 阿根廷 5 | 阿拉伯 6 | 阿拉伯人 7 | 埃及风格 8 | 何况是埃及风格 9 | 更何况是埃及风格 10 | 你不会想到阿拉伯人会踢出阿根廷风格的足球更何况是埃及风格 11 | 我们不会想到阿拉伯人会踢出阿根廷风格的足球更何况是埃及风格 12 | -------------------------------------------------------------------------------- /test_keywords_eng: -------------------------------------------------------------------------------- 1 | he 2 | she 3 | his 4 | hers 5 | --------------------------------------------------------------------------------