├── .github └── workflows │ └── semgrep.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── ahocorasick.go └── ahocorasick_test.go /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | pull_request: {} 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | - master 9 | name: Semgrep config 10 | jobs: 11 | semgrep: 12 | name: semgrep/ci 13 | runs-on: ubuntu-20.04 14 | env: 15 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 16 | SEMGREP_URL: https://cloudflare.semgrep.dev 17 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 18 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 19 | container: 20 | image: returntocorp/semgrep 21 | steps: 22 | - uses: actions/checkout@v3 23 | - run: semgrep ci 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | 24 | *~ 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 CloudFlare, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of the CloudFlare, Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GCFLAGS := -B 2 | LDFLAGS := 3 | 4 | .PHONY: install 5 | install: 6 | @go install -v . 7 | 8 | .PHONY: test 9 | test: 10 | @go test -gcflags='$(GCFLAGS)' -race -ldflags='$(LDFLAGS)' . 11 | 12 | .PHONY: bench 13 | bench: 14 | @go test -gcflags='$(GCFLAGS)' -ldflags='$(LDFLAGS)' -benchmem -bench . 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ahocorasick 2 | =========== 3 | 4 | A Golang implementation of the Aho-Corasick string matching algorithm 5 | -------------------------------------------------------------------------------- /ahocorasick.go: -------------------------------------------------------------------------------- 1 | // ahocorasick.go: implementation of the Aho-Corasick string matching 2 | // algorithm. Actually implemented as matching against []byte rather 3 | // than the Go string type. Throughout this code []byte is referred to 4 | // as a blice. 5 | // 6 | // http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm 7 | // 8 | // Copyright (c) 2013 CloudFlare, Inc. 9 | 10 | package ahocorasick 11 | 12 | import ( 13 | "container/list" 14 | "sync" 15 | "sync/atomic" 16 | ) 17 | 18 | // A node in the trie structure used to implement Aho-Corasick 19 | type node struct { 20 | root bool // true if this is the root 21 | 22 | b []byte // The blice at this node 23 | 24 | output bool // True means this node represents a blice that should 25 | // be output when matching 26 | index int // index into original dictionary if output is true 27 | 28 | counter uint64 // Set to the value of the Matcher.counter when a 29 | // match is output to prevent duplicate output 30 | // The use of fixed size arrays is space-inefficient but fast for 31 | // lookups. 32 | 33 | child [256]*node // A non-nil entry in this array means that the 34 | // index represents a byte value which can be 35 | // appended to the current node. Blices in the 36 | // trie are built up byte by byte through these 37 | // child node pointers. 38 | 39 | fails [256]*node // Where to fail to (by following the fail 40 | // pointers) for each possible byte 41 | 42 | suffix *node // Pointer to the longest possible strict suffix of 43 | // this node 44 | 45 | fail *node // Pointer to the next node which is in the dictionary 46 | // which can be reached from here following suffixes. Called fail 47 | // because it is used to fallback in the trie when a match fails. 48 | } 49 | 50 | // Matcher is returned by NewMatcher and contains a list of blices to 51 | // match against 52 | type Matcher struct { 53 | counter uint64 // Counts the number of matches done, and is used to 54 | // prevent output of multiple matches of the same string 55 | trie []node // preallocated block of memory containing all the 56 | // nodes 57 | extent int // offset into trie that is currently free 58 | root *node // Points to trie[0] 59 | 60 | heap sync.Pool // a pool of haystacks to de-duplicate results in 61 | // a thread-safe manner 62 | } 63 | 64 | // findBlice looks for a blice in the trie starting from the root and 65 | // returns a pointer to the node representing the end of the blice. If 66 | // the blice is not found it returns nil. 67 | func (m *Matcher) findBlice(b []byte) *node { 68 | n := &m.trie[0] 69 | 70 | for n != nil && len(b) > 0 { 71 | n = n.child[int(b[0])] 72 | b = b[1:] 73 | } 74 | 75 | return n 76 | } 77 | 78 | // getFreeNode: gets a free node structure from the Matcher's trie 79 | // pool and updates the extent to point to the next free node. 80 | func (m *Matcher) getFreeNode() *node { 81 | m.extent += 1 82 | 83 | if m.extent == 1 { 84 | m.root = &m.trie[0] 85 | m.root.root = true 86 | } 87 | 88 | return &m.trie[m.extent-1] 89 | } 90 | 91 | // buildTrie builds the fundamental trie structure from a set of 92 | // blices. 93 | func (m *Matcher) buildTrie(dictionary [][]byte) { 94 | 95 | // Work out the maximum size for the trie (all dictionary entries 96 | // are distinct plus the root). This is used to preallocate memory 97 | // for it. 98 | 99 | max := 1 100 | for _, blice := range dictionary { 101 | max += len(blice) 102 | } 103 | m.trie = make([]node, max) 104 | 105 | // Calling this an ignoring its argument simply allocated 106 | // m.trie[0] which will be the root element 107 | 108 | m.getFreeNode() 109 | 110 | // This loop builds the nodes in the trie by following through 111 | // each dictionary entry building the children pointers. 112 | 113 | for i, blice := range dictionary { 114 | n := m.root 115 | var path []byte 116 | for _, b := range blice { 117 | path = append(path, b) 118 | 119 | c := n.child[int(b)] 120 | 121 | if c == nil { 122 | c = m.getFreeNode() 123 | n.child[int(b)] = c 124 | c.b = make([]byte, len(path)) 125 | copy(c.b, path) 126 | 127 | // Nodes directly under the root node will have the 128 | // root as their fail point as there are no suffixes 129 | // possible. 130 | 131 | if len(path) == 1 { 132 | c.fail = m.root 133 | } 134 | 135 | c.suffix = m.root 136 | } 137 | 138 | n = c 139 | } 140 | 141 | // The last value of n points to the node representing a 142 | // dictionary entry 143 | 144 | n.output = true 145 | n.index = i 146 | } 147 | 148 | l := new(list.List) 149 | l.PushBack(m.root) 150 | 151 | for l.Len() > 0 { 152 | n := l.Remove(l.Front()).(*node) 153 | 154 | for i := 0; i < 256; i++ { 155 | c := n.child[i] 156 | if c != nil { 157 | l.PushBack(c) 158 | 159 | for j := 1; j < len(c.b); j++ { 160 | c.fail = m.findBlice(c.b[j:]) 161 | if c.fail != nil { 162 | break 163 | } 164 | } 165 | 166 | if c.fail == nil { 167 | c.fail = m.root 168 | } 169 | 170 | for j := 1; j < len(c.b); j++ { 171 | s := m.findBlice(c.b[j:]) 172 | if s != nil && s.output { 173 | c.suffix = s 174 | break 175 | } 176 | } 177 | } 178 | } 179 | } 180 | 181 | for i := 0; i < m.extent; i++ { 182 | for c := 0; c < 256; c++ { 183 | n := &m.trie[i] 184 | for n.child[c] == nil && !n.root { 185 | n = n.fail 186 | } 187 | 188 | m.trie[i].fails[c] = n 189 | } 190 | } 191 | 192 | m.trie = m.trie[:m.extent] 193 | } 194 | 195 | // NewMatcher creates a new Matcher used to match against a set of 196 | // blices 197 | func NewMatcher(dictionary [][]byte) *Matcher { 198 | m := new(Matcher) 199 | 200 | m.buildTrie(dictionary) 201 | 202 | return m 203 | } 204 | 205 | // NewStringMatcher creates a new Matcher used to match against a set 206 | // of strings (this is a helper to make initialization easy) 207 | func NewStringMatcher(dictionary []string) *Matcher { 208 | m := new(Matcher) 209 | 210 | var d [][]byte 211 | for _, s := range dictionary { 212 | d = append(d, []byte(s)) 213 | } 214 | 215 | m.buildTrie(d) 216 | 217 | return m 218 | } 219 | 220 | // Match searches in for blices and returns all the blices found as indexes into 221 | // the original dictionary. 222 | // 223 | // This is not thread-safe method, seek for MatchThreadSafe() instead. 224 | func (m *Matcher) Match(in []byte) []int { 225 | m.counter++ 226 | 227 | return match(in, m.root, func(f *node) bool { 228 | if f.counter != m.counter { 229 | f.counter = m.counter 230 | return true 231 | } 232 | return false 233 | }) 234 | } 235 | 236 | // match is a core of matching logic. Accepts input byte slice, starting node 237 | // and a func to check whether should we include result into response or not 238 | func match(in []byte, n *node, unique func(f *node) bool) []int { 239 | var hits []int 240 | 241 | for _, b := range in { 242 | c := int(b) 243 | 244 | if !n.root && n.child[c] == nil { 245 | n = n.fails[c] 246 | } 247 | 248 | if n.child[c] != nil { 249 | f := n.child[c] 250 | n = f 251 | 252 | if f.output { 253 | if unique(f) { 254 | hits = append(hits, f.index) 255 | } 256 | } 257 | 258 | for !f.suffix.root { 259 | f = f.suffix 260 | if unique(f) { 261 | hits = append(hits, f.index) 262 | } else { 263 | 264 | // There's no point working our way up the 265 | // suffixes if it's been done before for this call 266 | // to Match. The matches are already in hits. 267 | 268 | break 269 | } 270 | } 271 | } 272 | } 273 | 274 | return hits 275 | } 276 | 277 | // MatchThreadSafe provides the same result as Match() but does it in a 278 | // thread-safe manner. Uses a sync.Pool of haystacks to track the uniqueness of 279 | // the result items. 280 | func (m *Matcher) MatchThreadSafe(in []byte) []int { 281 | var ( 282 | heap map[int]uint64 283 | ) 284 | 285 | generation := atomic.AddUint64(&m.counter, 1) 286 | n := m.root 287 | // read the matcher's heap 288 | item := m.heap.Get() 289 | if item == nil { 290 | heap = make(map[int]uint64, len(m.trie)) 291 | } else { 292 | heap = item.(map[int]uint64) 293 | } 294 | 295 | hits := match(in, n, func(f *node) bool { 296 | g := heap[f.index] 297 | if g != generation { 298 | heap[f.index] = generation 299 | return true 300 | } 301 | return false 302 | }) 303 | 304 | m.heap.Put(heap) 305 | return hits 306 | } 307 | 308 | // Contains returns true if any string matches. This can be faster 309 | // than Match() when you do not need to know which words matched. 310 | func (m *Matcher) Contains(in []byte) bool { 311 | n := m.root 312 | for _, b := range in { 313 | c := int(b) 314 | if !n.root { 315 | n = n.fails[c] 316 | } 317 | 318 | if n.child[c] != nil { 319 | f := n.child[c] 320 | n = f 321 | 322 | if f.output { 323 | return true 324 | } 325 | 326 | for !f.suffix.root { 327 | return true 328 | } 329 | } 330 | } 331 | return false 332 | } 333 | -------------------------------------------------------------------------------- /ahocorasick_test.go: -------------------------------------------------------------------------------- 1 | // ahocorasick_test.go: test suite for ahocorasick 2 | // 3 | // Copyright (c) 2013 CloudFlare, Inc. 4 | 5 | package ahocorasick 6 | 7 | import ( 8 | "regexp" 9 | "strings" 10 | "sync" 11 | "testing" 12 | ) 13 | 14 | func assert(t *testing.T, b bool) { 15 | if !b { 16 | t.Fail() 17 | } 18 | } 19 | 20 | func TestNoPatterns(t *testing.T) { 21 | m := NewStringMatcher([]string{}) 22 | hits := m.Match([]byte("foo bar baz")) 23 | assert(t, len(hits) == 0) 24 | 25 | hits = m.MatchThreadSafe([]byte("foo bar baz")) 26 | assert(t, len(hits) == 0) 27 | } 28 | 29 | func TestNoData(t *testing.T) { 30 | m := NewStringMatcher([]string{"foo", "baz", "bar"}) 31 | hits := m.Match([]byte("")) 32 | assert(t, len(hits) == 0) 33 | 34 | hits = m.MatchThreadSafe([]byte("")) 35 | assert(t, len(hits) == 0) 36 | } 37 | 38 | func TestSuffixes(t *testing.T) { 39 | m := NewStringMatcher([]string{"Superman", "uperman", "perman", "erman"}) 40 | hits := m.Match([]byte("The Man Of Steel: Superman")) 41 | assert(t, len(hits) == 4) 42 | assert(t, hits[0] == 0) 43 | assert(t, hits[1] == 1) 44 | assert(t, hits[2] == 2) 45 | assert(t, hits[3] == 3) 46 | 47 | hits = m.MatchThreadSafe([]byte("The Man Of Steel: Superman")) 48 | assert(t, len(hits) == 4) 49 | assert(t, hits[0] == 0) 50 | assert(t, hits[1] == 1) 51 | assert(t, hits[2] == 2) 52 | assert(t, hits[3] == 3) 53 | } 54 | 55 | func TestPrefixes(t *testing.T) { 56 | m := NewStringMatcher([]string{"Superman", "Superma", "Superm", "Super"}) 57 | hits := m.Match([]byte("The Man Of Steel: Superman")) 58 | assert(t, len(hits) == 4) 59 | assert(t, hits[0] == 3) 60 | assert(t, hits[1] == 2) 61 | assert(t, hits[2] == 1) 62 | assert(t, hits[3] == 0) 63 | 64 | hits = m.MatchThreadSafe([]byte("The Man Of Steel: Superman")) 65 | assert(t, len(hits) == 4) 66 | assert(t, hits[0] == 3) 67 | assert(t, hits[1] == 2) 68 | assert(t, hits[2] == 1) 69 | assert(t, hits[3] == 0) 70 | } 71 | 72 | func TestInterior(t *testing.T) { 73 | m := NewStringMatcher([]string{"Steel", "tee", "e"}) 74 | hits := m.Match([]byte("The Man Of Steel: Superman")) 75 | assert(t, len(hits) == 3) 76 | assert(t, hits[2] == 0) 77 | assert(t, hits[1] == 1) 78 | assert(t, hits[0] == 2) 79 | 80 | hits = m.MatchThreadSafe([]byte("The Man Of Steel: Superman")) 81 | assert(t, len(hits) == 3) 82 | assert(t, hits[2] == 0) 83 | assert(t, hits[1] == 1) 84 | assert(t, hits[0] == 2) 85 | } 86 | 87 | func TestMatchAtStart(t *testing.T) { 88 | m := NewStringMatcher([]string{"The", "Th", "he"}) 89 | hits := m.Match([]byte("The Man Of Steel: Superman")) 90 | assert(t, len(hits) == 3) 91 | assert(t, hits[0] == 1) 92 | assert(t, hits[1] == 0) 93 | assert(t, hits[2] == 2) 94 | 95 | hits = m.MatchThreadSafe([]byte("The Man Of Steel: Superman")) 96 | assert(t, len(hits) == 3) 97 | assert(t, hits[0] == 1) 98 | assert(t, hits[1] == 0) 99 | assert(t, hits[2] == 2) 100 | } 101 | 102 | func TestMatchAtEnd(t *testing.T) { 103 | m := NewStringMatcher([]string{"teel", "eel", "el"}) 104 | hits := m.Match([]byte("The Man Of Steel")) 105 | assert(t, len(hits) == 3) 106 | assert(t, hits[0] == 0) 107 | assert(t, hits[1] == 1) 108 | assert(t, hits[2] == 2) 109 | 110 | hits = m.MatchThreadSafe([]byte("The Man Of Steel")) 111 | assert(t, len(hits) == 3) 112 | assert(t, hits[0] == 0) 113 | assert(t, hits[1] == 1) 114 | assert(t, hits[2] == 2) 115 | } 116 | 117 | func TestOverlappingPatterns(t *testing.T) { 118 | m := NewStringMatcher([]string{"Man ", "n Of", "Of S"}) 119 | hits := m.Match([]byte("The Man Of Steel")) 120 | assert(t, len(hits) == 3) 121 | assert(t, hits[0] == 0) 122 | assert(t, hits[1] == 1) 123 | assert(t, hits[2] == 2) 124 | 125 | hits = m.MatchThreadSafe([]byte("The Man Of Steel")) 126 | assert(t, len(hits) == 3) 127 | assert(t, hits[0] == 0) 128 | assert(t, hits[1] == 1) 129 | assert(t, hits[2] == 2) 130 | } 131 | 132 | func TestMultipleMatches(t *testing.T) { 133 | m := NewStringMatcher([]string{"The", "Man", "an"}) 134 | hits := m.Match([]byte("A Man A Plan A Canal: Panama, which Man Planned The Canal")) 135 | assert(t, len(hits) == 3) 136 | assert(t, hits[0] == 1) 137 | assert(t, hits[1] == 2) 138 | assert(t, hits[2] == 0) 139 | 140 | hits = m.MatchThreadSafe([]byte("A Man A Plan A Canal: Panama, which Man Planned The Canal")) 141 | assert(t, len(hits) == 3) 142 | assert(t, hits[0] == 1) 143 | assert(t, hits[1] == 2) 144 | assert(t, hits[2] == 0) 145 | } 146 | 147 | func TestSingleCharacterMatches(t *testing.T) { 148 | m := NewStringMatcher([]string{"a", "M", "z"}) 149 | hits := m.Match([]byte("A Man A Plan A Canal: Panama, which Man Planned The Canal")) 150 | assert(t, len(hits) == 2) 151 | assert(t, hits[0] == 1) 152 | assert(t, hits[1] == 0) 153 | 154 | hits = m.MatchThreadSafe([]byte("A Man A Plan A Canal: Panama, which Man Planned The Canal")) 155 | assert(t, len(hits) == 2) 156 | assert(t, hits[0] == 1) 157 | assert(t, hits[1] == 0) 158 | } 159 | 160 | func TestNothingMatches(t *testing.T) { 161 | m := NewStringMatcher([]string{"baz", "bar", "foo"}) 162 | hits := m.Match([]byte("A Man A Plan A Canal: Panama, which Man Planned The Canal")) 163 | assert(t, len(hits) == 0) 164 | 165 | hits = m.MatchThreadSafe([]byte("A Man A Plan A Canal: Panama, which Man Planned The Canal")) 166 | assert(t, len(hits) == 0) 167 | } 168 | 169 | func TestWikipedia(t *testing.T) { 170 | m := NewStringMatcher([]string{"a", "ab", "bc", "bca", "c", "caa"}) 171 | hits := m.Match([]byte("abccab")) 172 | assert(t, len(hits) == 4) 173 | assert(t, hits[0] == 0) 174 | assert(t, hits[1] == 1) 175 | assert(t, hits[2] == 2) 176 | assert(t, hits[3] == 4) 177 | 178 | hits = m.Match([]byte("bccab")) 179 | assert(t, len(hits) == 4) 180 | assert(t, hits[0] == 2) 181 | assert(t, hits[1] == 4) 182 | assert(t, hits[2] == 0) 183 | assert(t, hits[3] == 1) 184 | 185 | hits = m.Match([]byte("bccb")) 186 | assert(t, len(hits) == 2) 187 | assert(t, hits[0] == 2) 188 | assert(t, hits[1] == 4) 189 | } 190 | 191 | func TestWikipediaConcurrently(t *testing.T) { 192 | m := NewStringMatcher([]string{"a", "ab", "bc", "bca", "c", "caa"}) 193 | 194 | wg := sync.WaitGroup{} 195 | wg.Add(3) 196 | go func() { 197 | defer wg.Done() 198 | hits := m.MatchThreadSafe([]byte("abccab")) 199 | assert(t, len(hits) == 4) 200 | assert(t, hits[0] == 0) 201 | assert(t, hits[1] == 1) 202 | assert(t, hits[2] == 2) 203 | assert(t, hits[3] == 4) 204 | }() 205 | 206 | go func() { 207 | defer wg.Done() 208 | hits := m.MatchThreadSafe([]byte("bccab")) 209 | assert(t, len(hits) == 4) 210 | assert(t, hits[0] == 2) 211 | assert(t, hits[1] == 4) 212 | assert(t, hits[2] == 0) 213 | assert(t, hits[3] == 1) 214 | }() 215 | 216 | go func() { 217 | defer wg.Done() 218 | hits := m.MatchThreadSafe([]byte("bccb")) 219 | assert(t, len(hits) == 2) 220 | assert(t, hits[0] == 2) 221 | assert(t, hits[1] == 4) 222 | }() 223 | 224 | wg.Wait() 225 | } 226 | 227 | func TestMatch(t *testing.T) { 228 | m := NewStringMatcher(dictionary) 229 | hits := m.Match([]byte("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")) 230 | assert(t, len(hits) == 4) 231 | assert(t, hits[0] == 0) 232 | assert(t, hits[1] == 1) 233 | assert(t, hits[2] == 2) 234 | assert(t, hits[3] == 3) 235 | 236 | hits = m.Match([]byte("Mozilla/5.0 (Mac; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")) 237 | assert(t, len(hits) == 3) 238 | assert(t, hits[0] == 0) 239 | assert(t, hits[1] == 1) 240 | assert(t, hits[2] == 3) 241 | 242 | hits = m.Match([]byte("Mozilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")) 243 | assert(t, len(hits) == 2) 244 | assert(t, hits[0] == 0) 245 | assert(t, hits[1] == 3) 246 | 247 | hits = m.Match([]byte("Mozilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36")) 248 | assert(t, len(hits) == 1) 249 | assert(t, hits[0] == 0) 250 | 251 | hits = m.Match([]byte("Mazilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36")) 252 | assert(t, len(hits) == 0) 253 | } 254 | 255 | func TestMatchThreadSafe(t *testing.T) { 256 | m := NewStringMatcher([]string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}) 257 | 258 | wg := sync.WaitGroup{} 259 | wg.Add(5) 260 | go func() { 261 | defer wg.Done() 262 | 263 | hits := m.MatchThreadSafe([]byte("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")) 264 | assert(t, len(hits) == 4) 265 | assert(t, hits[0] == 0) 266 | assert(t, hits[1] == 1) 267 | assert(t, hits[2] == 2) 268 | assert(t, hits[3] == 3) 269 | }() 270 | 271 | go func() { 272 | defer wg.Done() 273 | 274 | hits := m.MatchThreadSafe([]byte("Mozilla/5.0 (Mac; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")) 275 | assert(t, len(hits) == 3) 276 | assert(t, hits[0] == 0) 277 | assert(t, hits[1] == 1) 278 | assert(t, hits[2] == 3) 279 | }() 280 | 281 | go func() { 282 | defer wg.Done() 283 | 284 | hits := m.MatchThreadSafe([]byte("Mozilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")) 285 | assert(t, len(hits) == 2) 286 | assert(t, hits[0] == 0) 287 | assert(t, hits[1] == 3) 288 | }() 289 | 290 | go func() { 291 | defer wg.Done() 292 | 293 | hits := m.MatchThreadSafe([]byte("Mozilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36")) 294 | assert(t, len(hits) == 1) 295 | assert(t, hits[0] == 0) 296 | }() 297 | 298 | go func() { 299 | defer wg.Done() 300 | 301 | hits := m.MatchThreadSafe([]byte("Mazilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36")) 302 | assert(t, len(hits) == 0) 303 | }() 304 | 305 | wg.Wait() 306 | } 307 | 308 | func TestLargeDictionaryMatchThreadSafeWorks(t *testing.T) { 309 | /** 310 | * we have 105 unique words extracted from dictionary, therefore the result 311 | * is supposed to show 105 hits 312 | */ 313 | hits := precomputed6.MatchThreadSafe(bytes2) 314 | assert(t, len(hits) == 105) 315 | 316 | } 317 | 318 | func TestContains(t *testing.T) { 319 | m := NewStringMatcher(dictionary) 320 | contains := m.Contains([]byte("Mozilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36")) 321 | assert(t, contains) 322 | 323 | contains = m.Contains([]byte("Mazilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36")) 324 | assert(t, !contains) 325 | 326 | m = NewStringMatcher([]string{"SupermanX", "per"}) 327 | contains = m.Contains([]byte("The Man Of Steel: Superman")) 328 | assert(t, contains == true) 329 | } 330 | 331 | var bytes = []byte("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36") 332 | var sbytes = string(bytes) 333 | var dictionary = []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"} 334 | var precomputed = NewStringMatcher(dictionary) 335 | 336 | func BenchmarkMatchWorks(b *testing.B) { 337 | for i := 0; i < b.N; i++ { 338 | precomputed.Match(bytes) 339 | } 340 | } 341 | 342 | func BenchmarkMatchThreadSafeWorks(b *testing.B) { 343 | for i := 0; i < b.N; i++ { 344 | precomputed.MatchThreadSafe(bytes) 345 | } 346 | } 347 | 348 | func BenchmarkContainsWorks(b *testing.B) { 349 | for i := 0; i < b.N; i++ { 350 | hits := make([]int, 0) 351 | for i, s := range dictionary { 352 | if strings.Contains(sbytes, s) { 353 | hits = append(hits, i) 354 | } 355 | } 356 | } 357 | } 358 | 359 | var re = regexp.MustCompile("(" + strings.Join(dictionary, "|") + ")") 360 | 361 | func BenchmarkRegexpWorks(b *testing.B) { 362 | for i := 0; i < b.N; i++ { 363 | re.FindAllIndex(bytes, -1) 364 | } 365 | } 366 | 367 | var dictionary2 = []string{"Googlebot", "bingbot", "msnbot", "Yandex", "Baiduspider"} 368 | var precomputed2 = NewStringMatcher(dictionary2) 369 | 370 | func BenchmarkMatchFails(b *testing.B) { 371 | for i := 0; i < b.N; i++ { 372 | precomputed2.Match(bytes) 373 | } 374 | } 375 | 376 | func BenchmarkContainsFails(b *testing.B) { 377 | for i := 0; i < b.N; i++ { 378 | hits := make([]int, 0) 379 | for i, s := range dictionary2 { 380 | if strings.Contains(sbytes, s) { 381 | hits = append(hits, i) 382 | } 383 | } 384 | } 385 | } 386 | 387 | var re2 = regexp.MustCompile("(" + strings.Join(dictionary2, "|") + ")") 388 | 389 | func BenchmarkRegexpFails(b *testing.B) { 390 | for i := 0; i < b.N; i++ { 391 | re2.FindAllIndex(bytes, -1) 392 | } 393 | } 394 | 395 | var bytes2 = []byte("Firefox is a web browser, and is Mozilla's flagship software product. It is available in both desktop and mobile versions. Firefox uses the Gecko layout engine to render web pages, which implements current and anticipated web standards. As of April 2013, Firefox has approximately 20% of worldwide usage share of web browsers, making it the third most-used web browser. Firefox began as an experimental branch of the Mozilla codebase by Dave Hyatt, Joe Hewitt and Blake Ross. They believed the commercial requirements of Netscape's sponsorship and developer-driven feature creep compromised the utility of the Mozilla browser. To combat what they saw as the Mozilla Suite's software bloat, they created a stand-alone browser, with which they intended to replace the Mozilla Suite. Firefox was originally named Phoenix but the name was changed so as to avoid trademark conflicts with Phoenix Technologies. The initially-announced replacement, Firebird, provoked objections from the Firebird project community. The current name, Firefox, was chosen on February 9, 2004.") 396 | var sbytes2 = string(bytes2) 397 | 398 | var dictionary3 = []string{"Mozilla", "Mac", "Macintosh", "Safari", "Phoenix"} 399 | var precomputed3 = NewStringMatcher(dictionary3) 400 | 401 | func BenchmarkLongMatchWorks(b *testing.B) { 402 | for i := 0; i < b.N; i++ { 403 | precomputed3.Match(bytes2) 404 | } 405 | } 406 | func BenchmarkLongMatchThreadSafeWorks(b *testing.B) { 407 | for i := 0; i < b.N; i++ { 408 | precomputed3.MatchThreadSafe(bytes2) 409 | } 410 | } 411 | 412 | func BenchmarkLongContainsWorks(b *testing.B) { 413 | for i := 0; i < b.N; i++ { 414 | hits := make([]int, 0) 415 | for i, s := range dictionary3 { 416 | if strings.Contains(sbytes2, s) { 417 | hits = append(hits, i) 418 | } 419 | } 420 | } 421 | } 422 | 423 | var re3 = regexp.MustCompile("(" + strings.Join(dictionary3, "|") + ")") 424 | 425 | func BenchmarkLongRegexpWorks(b *testing.B) { 426 | for i := 0; i < b.N; i++ { 427 | re3.FindAllIndex(bytes2, -1) 428 | } 429 | } 430 | 431 | var dictionary4 = []string{"12343453", "34353", "234234523", "324234", "33333"} 432 | var precomputed4 = NewStringMatcher(dictionary4) 433 | 434 | func BenchmarkLongMatchFails(b *testing.B) { 435 | for i := 0; i < b.N; i++ { 436 | precomputed4.Match(bytes2) 437 | } 438 | } 439 | 440 | func BenchmarkLongContainsFails(b *testing.B) { 441 | for i := 0; i < b.N; i++ { 442 | hits := make([]int, 0) 443 | for i, s := range dictionary4 { 444 | if strings.Contains(sbytes2, s) { 445 | hits = append(hits, i) 446 | } 447 | } 448 | } 449 | } 450 | 451 | var re4 = regexp.MustCompile("(" + strings.Join(dictionary4, "|") + ")") 452 | 453 | func BenchmarkLongRegexpFails(b *testing.B) { 454 | for i := 0; i < b.N; i++ { 455 | re4.FindAllIndex(bytes2, -1) 456 | } 457 | } 458 | 459 | var dictionary5 = []string{"12343453", "34353", "234234523", "324234", "33333", "experimental", "branch", "of", "the", "Mozilla", "codebase", "by", "Dave", "Hyatt", "Joe", "Hewitt", "and", "Blake", "Ross", "mother", "frequently", "performed", "in", "concerts", "around", "the", "village", "uses", "the", "Gecko", "layout", "engine"} 460 | var precomputed5 = NewStringMatcher(dictionary5) 461 | 462 | func BenchmarkMatchMany(b *testing.B) { 463 | for i := 0; i < b.N; i++ { 464 | precomputed5.Match(bytes) 465 | } 466 | } 467 | 468 | func BenchmarkMatchThreadSafeMany(b *testing.B) { 469 | for i := 0; i < b.N; i++ { 470 | precomputed5.MatchThreadSafe(bytes) 471 | } 472 | } 473 | 474 | func BenchmarkContainsMany(b *testing.B) { 475 | for i := 0; i < b.N; i++ { 476 | hits := make([]int, 0) 477 | for i, s := range dictionary4 { 478 | if strings.Contains(sbytes, s) { 479 | hits = append(hits, i) 480 | } 481 | } 482 | } 483 | } 484 | 485 | var re5 = regexp.MustCompile("(" + strings.Join(dictionary5, "|") + ")") 486 | 487 | func BenchmarkRegexpMany(b *testing.B) { 488 | for i := 0; i < b.N; i++ { 489 | re5.FindAllIndex(bytes, -1) 490 | } 491 | } 492 | 493 | func BenchmarkLongMatchMany(b *testing.B) { 494 | for i := 0; i < b.N; i++ { 495 | precomputed5.Match(bytes2) 496 | } 497 | } 498 | 499 | func BenchmarkLongMatchThreadSafeMany(b *testing.B) { 500 | for i := 0; i < b.N; i++ { 501 | precomputed5.MatchThreadSafe(bytes2) 502 | } 503 | } 504 | 505 | func BenchmarkLongContainsMany(b *testing.B) { 506 | for i := 0; i < b.N; i++ { 507 | hits := make([]int, 0) 508 | for i, s := range dictionary4 { 509 | if strings.Contains(sbytes2, s) { 510 | hits = append(hits, i) 511 | } 512 | } 513 | } 514 | } 515 | 516 | func BenchmarkLongRegexpMany(b *testing.B) { 517 | for i := 0; i < b.N; i++ { 518 | re5.FindAllIndex(bytes2, -1) 519 | } 520 | } 521 | 522 | var dictionary6 = []string{"2004", "2013", "9", "a", "an", "and", "anticipated", "approximately", "April", "as", "available", "avoid", "began", "believed", "Blake", "bloat", "both", "branch", "browser", "browsers", "but", "by", "changed", "chosen", "codebase", "combat", "commercial", "community", "compromised", "conflicts", "created", "creep", "current", "Dave", "desktop", "developer-driven", "engine", "experimental", "feature", "February", "Firebird", "Firefox", "flagship", "from", "Gecko", "has", "Hewitt", "Hyatt", "implements", "in", "initially-announced", "intended", "is", "it", "Joe", "layout", "making", "mobile", "most-used", "Mozilla", "Mozilla's", "name", "named", "Netscape's", "objections", "of", "on", "originally", "pages", "Phoenix", "product", "project", "provoked", "render", "replace", "replacement", "requirements", "Ross", "saw", "share", "so", "software", "sponsorship", "stand-alone", "standards", "Suite", "Suite's", "Technologies", "the", "The", "they", "They", "third", "to", "trademark", "usage", "uses", "utility", "versions", "was", "web", "what", "which", "with", "worldwide"} 523 | var precomputed6 = NewStringMatcher(dictionary6) 524 | 525 | func BenchmarkLargeMatchWorks(b *testing.B) { 526 | for i := 0; i < b.N; i++ { 527 | precomputed6.Match(bytes2) 528 | } 529 | } 530 | 531 | func BenchmarkLargeMatchThreadSafeWorks(b *testing.B) { 532 | for i := 0; i < b.N; i++ { 533 | precomputed6.MatchThreadSafe(bytes2) 534 | } 535 | } 536 | --------------------------------------------------------------------------------