├── .github └── workflows │ └── lint.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── ac.go ├── ac_test.go ├── acascii ├── ac.go └── ac_test.go └── go.mod /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | on: 3 | push: 4 | pull_request: 5 | 6 | jobs: 7 | 8 | build: 9 | name: Build 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | govers: [ 1.14, 1.16 ] 14 | steps: 15 | 16 | - name: Set up Go version 17 | uses: actions/setup-go@v1 18 | with: 19 | go-version: ${{ matrix.govers }} 20 | id: go 21 | 22 | - name: Check out code into the Go module directory 23 | uses: actions/checkout@v1 24 | 25 | - name: Build 26 | run: | 27 | export PATH=$PATH:$(go env GOPATH)/bin 28 | GOBIN=$PWD/bin make all -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | bin 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | *.test 25 | *.prof 26 | 27 | # vim swapfiles 28 | *.swp 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | https://github.com/cloudflare/ahocorasick 3 | 4 | Copyright (c) 2013 CloudFlare, Inc. 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the documentation and/or 15 | other materials provided with the distribution. 16 | 17 | * Neither the name of the CloudFlare, Inc. nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: ./bin/golangci-lint 2 | go build ./... 3 | ./bin/golangci-lint run 4 | go test -cover ./... 5 | 6 | ./bin/golangci-lint: 7 | curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh| sh -s v1.40.1 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ac 2 | 3 | [![GoDoc](https://godoc.org/github.com/signalsciences/ac?status.svg)](https://godoc.org/github.com/signalsciences/ac) [![Actions Status](https://github.com/signalsciences/ac/workflows/lint/badge.svg)](https://github.com/signalsciences/ac/actions) 4 | 5 | Golang implementation of Aho-Corasick for rapid substring matching on either byte 6 | strings or ASCII strings. 7 | 8 | This is based on the excellent library 9 | [cloudflare/ahocorasick](https://github.com/cloudflare/ahocorasick) (BSD 10 | License). The fork/changes were needed for a specific application usages 11 | that are incomptabile with the original library. Some other minor optimizations 12 | around memory and setup were also done. 13 | 14 | 15 | ## Examples 16 | 17 | * FindAllString 18 | 19 | ``` 20 | m := ac.MustCompileString([]string{"Superman", "uperman", "perman", "erman"}) 21 | matches := m.FindAllString("The Man Of Steel: Superman") 22 | fmt.Println(matches) 23 | ``` 24 | 25 | Output: 26 | 27 | ``` 28 | [Superman uperman perman erman] 29 | ``` 30 | 31 | * MatchString 32 | 33 | ``` 34 | m := ac.MustCompileString([]string{"Superman", "uperman", "perman", "erman"}) 35 | contains := m.MatchString("The Man Of Steel: Superman") 36 | fmt.Println(contains) 37 | ``` 38 | 39 | Output: 40 | 41 | ``` 42 | true 43 | ``` 44 | 45 | ## ac/acascii for pure ASCII matching 46 | 47 | The `ac/acascii` package assumes the dictionary is all ASCII characters (1-127) without NULL bytes. This results in during setup: 48 | 49 | * 50% less memory allocations 50 | * 50% less memory users 51 | * 50% less CPU time 52 | 53 | as compared to the plain `ac` package. 54 | 55 | 56 | ## IN PROGRESS 57 | 58 | * Support for ASCII case-insensitive matching. 59 | -------------------------------------------------------------------------------- /ac.go: -------------------------------------------------------------------------------- 1 | // Package ac provides an implementation of the Aho-Corasick string matching 2 | // algorithm. Throughout this code []byte is referred to 3 | // as a blice. 4 | // 5 | // http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm 6 | // 7 | // Copyright (c) 2013 CloudFlare, Inc. 8 | // 9 | // Originally from https://github.com/cloudflare/ahocorasick 10 | package ac 11 | 12 | import ( 13 | "container/list" 14 | ) 15 | 16 | const maxchar = 256 17 | 18 | // A node in the trie structure used to implement Aho-Corasick 19 | type node struct { 20 | root bool // true if this is the root 21 | 22 | output bool // True means this node represents a blice that should 23 | // be output when matching 24 | 25 | b string // The path at this node 26 | 27 | index int // index into original dictionary if output is true 28 | 29 | counter int // Set to the value of the Matcher.counter when a 30 | // match is output to prevent duplicate output 31 | 32 | // The use of fixed size arrays is space-inefficient but fast for 33 | // lookups. 34 | 35 | child [maxchar]*node // A non-nil entry in this array means that the 36 | // index represents a byte value which can be 37 | // appended to the current node. Blices in the 38 | // trie are built up byte by byte through these 39 | // child node pointers. 40 | 41 | fails [maxchar]*node // Where to fail to (by following the fail 42 | // pointers) for each possible byte 43 | 44 | suffix *node // Pointer to the longest possible strict suffix of 45 | // this node 46 | 47 | fail *node // Pointer to the next node which is in the dictionary 48 | // which can be reached from here following suffixes. Called fail 49 | // because it is used to fallback in the trie when a match fails. 50 | } 51 | 52 | // Matcher contains a list of blices to match against 53 | type Matcher struct { 54 | counter int // Counts the number of matches done, and is used to 55 | // prevent output of multiple matches of the same string 56 | trie []node // preallocated block of memory containing all the 57 | // nodes 58 | extent int // offset into trie that is currently free 59 | root *node // Points to trie[0] 60 | } 61 | 62 | // findBlice looks for a blice in the trie starting from the root and 63 | // returns a pointer to the node representing the end of the blice. If 64 | // the blice is not found it returns nil. 65 | func (m *Matcher) findBlice(b string) *node { 66 | n := &m.trie[0] 67 | 68 | for n != nil && len(b) > 0 { 69 | n = n.child[int(b[0])] 70 | b = b[1:] 71 | } 72 | 73 | return n 74 | } 75 | 76 | // getFreeNode: gets a free node structure from the Matcher's trie 77 | // pool and updates the extent to point to the next free node. 78 | func (m *Matcher) getFreeNode() *node { 79 | m.extent++ 80 | 81 | if m.extent == 1 { 82 | m.root = &m.trie[0] 83 | m.root.root = true 84 | } 85 | 86 | return &m.trie[m.extent-1] 87 | } 88 | 89 | // buildTrie builds the fundamental trie structure from a set of 90 | // blices. 91 | func (m *Matcher) buildTrie(dictionary [][]byte) { 92 | 93 | // Work out the maximum size for the trie (all dictionary entries 94 | // are distinct plus the root). This is used to preallocate memory 95 | // for it. 96 | 97 | max := 1 98 | for _, blice := range dictionary { 99 | max += len(blice) 100 | } 101 | m.trie = make([]node, max) 102 | 103 | // Calling this an ignoring its argument simply allocated 104 | // m.trie[0] which will be the root element 105 | 106 | m.getFreeNode() 107 | 108 | // This loop builds the nodes in the trie by following through 109 | // each dictionary entry building the children pointers. 110 | 111 | for _, blice := range dictionary { 112 | n := m.root 113 | for i, b := range blice { 114 | 115 | c := n.child[int(b)] 116 | 117 | if c == nil { 118 | c = m.getFreeNode() 119 | n.child[int(b)] = c 120 | c.b = string(blice[0 : i+1]) 121 | 122 | // Nodes directly under the root node will have the 123 | // root as their fail point as there are no suffixes 124 | // possible. 125 | 126 | if i == 0 { 127 | c.fail = m.root 128 | } 129 | 130 | c.suffix = m.root 131 | } 132 | 133 | n = c 134 | } 135 | 136 | // The last value of n points to the node representing a 137 | // dictionary entry 138 | 139 | n.output = true 140 | n.index = len(blice) 141 | } 142 | 143 | l := new(list.List) 144 | l.PushBack(m.root) 145 | 146 | for l.Len() > 0 { 147 | n := l.Remove(l.Front()).(*node) 148 | 149 | for i := 0; i < maxchar; i++ { 150 | c := n.child[i] 151 | if c != nil { 152 | l.PushBack(c) 153 | 154 | for j := 1; j < len(c.b); j++ { 155 | c.fail = m.findBlice(c.b[j:]) 156 | if c.fail != nil { 157 | break 158 | } 159 | } 160 | 161 | if c.fail == nil { 162 | c.fail = m.root 163 | } 164 | 165 | for j := 1; j < len(c.b); j++ { 166 | s := m.findBlice(c.b[j:]) 167 | if s != nil && s.output { 168 | c.suffix = s 169 | break 170 | } 171 | } 172 | } 173 | } 174 | } 175 | 176 | for i := 0; i < m.extent; i++ { 177 | for c := 0; c < maxchar; c++ { 178 | n := &m.trie[i] 179 | for n.child[c] == nil && !n.root { 180 | n = n.fail 181 | } 182 | 183 | m.trie[i].fails[c] = n 184 | } 185 | } 186 | 187 | m.trie = m.trie[:m.extent] 188 | } 189 | 190 | // buildTrieString builds the fundamental trie structure from a []string 191 | func (m *Matcher) buildTrieString(dictionary []string) { 192 | 193 | // Work out the maximum size for the trie (all dictionary entries 194 | // are distinct plus the root). This is used to preallocate memory 195 | // for it. 196 | 197 | max := 1 198 | for _, blice := range dictionary { 199 | max += len(blice) 200 | 201 | } 202 | m.trie = make([]node, max) 203 | 204 | // Calling this an ignoring its argument simply allocated 205 | // m.trie[0] which will be the root element 206 | 207 | m.getFreeNode() 208 | 209 | // This loop builds the nodes in the trie by following through 210 | // each dictionary entry building the children pointers. 211 | 212 | for _, blice := range dictionary { 213 | n := m.root 214 | for i := 0; i < len(blice); i++ { 215 | b := int(blice[i]) 216 | c := n.child[b] 217 | if c == nil { 218 | c = m.getFreeNode() 219 | n.child[b] = c 220 | c.b = blice[0 : i+1] 221 | 222 | // Nodes directly under the root node will have the 223 | // root as their fail point as there are no suffixes 224 | // possible. 225 | 226 | if i == 0 { 227 | c.fail = m.root 228 | } 229 | 230 | c.suffix = m.root 231 | } 232 | 233 | n = c 234 | } 235 | 236 | // The last value of n points to the node representing a 237 | // dictionary entry 238 | 239 | n.output = true 240 | n.index = len(blice) 241 | } 242 | 243 | l := new(list.List) 244 | l.PushBack(m.root) 245 | 246 | for l.Len() > 0 { 247 | n := l.Remove(l.Front()).(*node) 248 | 249 | for i := 0; i < maxchar; i++ { 250 | c := n.child[i] 251 | if c != nil { 252 | l.PushBack(c) 253 | 254 | for j := 1; j < len(c.b); j++ { 255 | c.fail = m.findBlice(c.b[j:]) 256 | if c.fail != nil { 257 | break 258 | } 259 | } 260 | 261 | if c.fail == nil { 262 | c.fail = m.root 263 | } 264 | 265 | for j := 1; j < len(c.b); j++ { 266 | s := m.findBlice(c.b[j:]) 267 | if s != nil && s.output { 268 | c.suffix = s 269 | break 270 | } 271 | } 272 | } 273 | } 274 | } 275 | 276 | for i := 0; i < m.extent; i++ { 277 | for c := 0; c < maxchar; c++ { 278 | n := &m.trie[i] 279 | for n.child[c] == nil && !n.root { 280 | n = n.fail 281 | } 282 | 283 | m.trie[i].fails[c] = n 284 | } 285 | } 286 | 287 | m.trie = m.trie[:m.extent] 288 | } 289 | 290 | // Compile creates a new Matcher using a list of []byte 291 | func Compile(dictionary [][]byte) (*Matcher, error) { 292 | m := new(Matcher) 293 | m.buildTrie(dictionary) 294 | // no error for now 295 | return m, nil 296 | } 297 | 298 | // MustCompile returns a Matcher or panics 299 | func MustCompile(dictionary [][]byte) *Matcher { 300 | m, err := Compile(dictionary) 301 | if err != nil { 302 | panic(err) 303 | } 304 | return m 305 | } 306 | 307 | // CompileString creates a new Matcher used to match against a set 308 | // of strings (this is a helper to make initialization easy) 309 | func CompileString(dictionary []string) (*Matcher, error) { 310 | m := new(Matcher) 311 | m.buildTrieString(dictionary) 312 | return m, nil 313 | } 314 | 315 | // MustCompileString returns a Matcher or panics 316 | func MustCompileString(dictionary []string) *Matcher { 317 | m, err := CompileString(dictionary) 318 | if err != nil { 319 | panic(err) 320 | } 321 | return m 322 | } 323 | 324 | // FindAll searches in for blices and returns all the blices found 325 | // in the original dictionary 326 | func (m *Matcher) FindAll(in []byte) [][]byte { 327 | m.counter++ 328 | var hits [][]byte 329 | 330 | n := m.root 331 | 332 | for idx, b := range in { 333 | c := int(b) 334 | 335 | if !n.root && n.child[c] == nil { 336 | n = n.fails[c] 337 | } 338 | 339 | if n.child[c] != nil { 340 | f := n.child[c] 341 | n = f 342 | 343 | if f.output && f.counter != m.counter { 344 | hits = append(hits, in[idx-f.index+1:idx+1]) 345 | f.counter = m.counter 346 | } 347 | 348 | for !f.suffix.root { 349 | f = f.suffix 350 | if f.counter != m.counter { 351 | hits = append(hits, in[idx-f.index+1:idx+1]) 352 | f.counter = m.counter 353 | } else { 354 | // There's no point working our way up the 355 | // suffixes if it's been done before for this call 356 | // to Match. The matches are already in hits. 357 | break 358 | } 359 | } 360 | } 361 | } 362 | 363 | return hits 364 | } 365 | 366 | // FindAllString searches in for blices and returns all the blices (as strings) found as 367 | // in the original dictionary 368 | func (m *Matcher) FindAllString(in string) []string { 369 | m.counter++ 370 | var hits []string 371 | 372 | n := m.root 373 | slen := len(in) 374 | for idx := 0; idx < slen; idx++ { 375 | c := int(in[idx]) 376 | 377 | if !n.root && n.child[c] == nil { 378 | n = n.fails[c] 379 | } 380 | 381 | if n.child[c] != nil { 382 | f := n.child[c] 383 | n = f 384 | 385 | if f.output && f.counter != m.counter { 386 | hits = append(hits, in[idx-f.index+1:idx+1]) 387 | f.counter = m.counter 388 | } 389 | 390 | for !f.suffix.root { 391 | f = f.suffix 392 | if f.counter != m.counter { 393 | hits = append(hits, in[idx-f.index+1:idx+1]) 394 | f.counter = m.counter 395 | } else { 396 | // There's no point working our way up the 397 | // suffixes if it's been done before for this call 398 | // to Match. The matches are already in hits. 399 | break 400 | } 401 | } 402 | } 403 | } 404 | 405 | return hits 406 | } 407 | 408 | // Match returns true if the input slice contains any subslices 409 | func (m *Matcher) Match(in []byte) bool { 410 | n := m.root 411 | for _, b := range in { 412 | c := int(b) 413 | if !n.root && n.child[c] == nil { 414 | n = n.fails[c] 415 | } 416 | 417 | if n.child[c] != nil { 418 | n = n.child[c] 419 | 420 | if n.output { 421 | return true 422 | } 423 | 424 | for !n.suffix.root { 425 | return true 426 | } 427 | } 428 | } 429 | return false 430 | } 431 | 432 | // MatchString returns true if the input slice contains any subslices 433 | func (m *Matcher) MatchString(in string) bool { 434 | n := m.root 435 | slen := len(in) 436 | for idx := 0; idx < slen; idx++ { 437 | c := int(in[idx]) 438 | if !n.root && n.child[c] == nil { 439 | n = n.fails[c] 440 | } 441 | if n.child[c] != nil { 442 | n = n.child[c] 443 | 444 | if n.output { 445 | return true 446 | } 447 | 448 | for !n.suffix.root { 449 | return true 450 | } 451 | } 452 | } 453 | return false 454 | } 455 | -------------------------------------------------------------------------------- /ac_test.go: -------------------------------------------------------------------------------- 1 | package ac 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "testing" 7 | ) 8 | 9 | var cases = []struct { 10 | name string // matches original test name from cloudflare/ahocorasick 11 | dict []string 12 | input string 13 | matches []string 14 | }{ 15 | { 16 | "TestNoPatterns", 17 | []string{}, 18 | "", 19 | nil, 20 | }, 21 | { 22 | "TestNoData", 23 | []string{"foo", "baz", "bar"}, 24 | "", 25 | nil, 26 | }, 27 | { 28 | "TestSuffixes", 29 | []string{"Superman", "uperman", "perman", "erman"}, 30 | "The Man Of Steel: Superman", 31 | []string{"Superman", "uperman", "perman", "erman"}, 32 | }, 33 | { 34 | "TestPrefixes", 35 | []string{"Superman", "Superma", "Superm", "Super"}, 36 | "The Man Of Steel: Superman", 37 | []string{"Super", "Superm", "Superma", "Superman"}, 38 | }, 39 | { 40 | "TestInterior", 41 | []string{"Steel", "tee", "e"}, 42 | "The Man Of Steel: Superman", 43 | []string{"e", "tee", "Steel"}, 44 | }, 45 | { 46 | "TestMatchAtStart", 47 | []string{"The", "Th", "he"}, 48 | "The Man Of Steel: Superman", 49 | []string{"Th", "The", "he"}, 50 | }, 51 | { 52 | "TestMatchAtEnd", 53 | []string{"teel", "eel", "el"}, 54 | "The Man Of Steel", 55 | []string{"teel", "eel", "el"}, 56 | }, 57 | { 58 | "TestOverlappingPatterns", 59 | []string{"Man ", "n Of", "Of S"}, 60 | "The Man Of Steel", 61 | []string{"Man ", "n Of", "Of S"}, 62 | }, 63 | { 64 | "TestMultipleMatches", 65 | []string{"The", "Man", "an"}, 66 | "A Man A Plan A Canal: Panama, which Man Planned The Canal", 67 | []string{"Man", "an", "The"}, 68 | }, 69 | { 70 | "TestSingleCharacterMatches", 71 | []string{"a", "M", "z"}, 72 | "A Man A Plan A Canal: Panama, which Man Planned The Canal", 73 | []string{"M", "a"}}, 74 | { 75 | "TestNothingMatches", 76 | []string{"baz", "bar", "foo"}, 77 | "A Man A Plan A Canal: Panama, which Man Planned The Canal", 78 | nil, 79 | }, 80 | { 81 | "Wikipedia1", 82 | []string{"a", "ab", "bc", "bca", "c", "caa"}, 83 | "abccab", 84 | []string{"a", "ab", "bc", "c"}, 85 | }, 86 | { 87 | "Wikipedia2", 88 | []string{"a", "ab", "bc", "bca", "c", "caa"}, 89 | "bccab", 90 | []string{"bc", "c", "a", "ab"}, 91 | }, 92 | { 93 | "Wikipedia3", 94 | []string{"a", "ab", "bc", "bca", "c", "caa"}, 95 | "bccb", 96 | []string{"bc", "c"}, 97 | }, 98 | { 99 | "Browser1", 100 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 101 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", 102 | []string{"Mozilla", "Mac", "Macintosh", "Safari"}, 103 | }, 104 | { 105 | "Browser2", 106 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 107 | "Mozilla/5.0 (Mac; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", 108 | []string{"Mozilla", "Mac", "Safari"}, 109 | }, 110 | { 111 | "Browser3", 112 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 113 | "Mozilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", 114 | []string{"Mozilla", "Safari"}, 115 | }, 116 | { 117 | "Browser4", 118 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 119 | "Mozilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36", 120 | []string{"Mozilla"}, 121 | }, 122 | { 123 | "Browser5", 124 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 125 | "Mazilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36", 126 | nil, 127 | }, 128 | { 129 | // this is to make sure backtracking works. We get a partial 130 | // match of "Superwoman" with "Superman". Then we need to make 131 | // sure that we restart the search and find "per". Some implementations 132 | // had bugs that didn't backtrack (really start over) and didn't match 133 | // "per" 134 | "Backtrack", 135 | []string{"Superwoman", "per"}, 136 | "The Man Of Steel: Superman", 137 | []string{"per"}, 138 | }, 139 | { 140 | "NotAsciiInput", 141 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage", "Gecko"}, 142 | "Mazilla/5.0 \u0000 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 \uFFFF (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36", 143 | []string{"Gecko"}, 144 | }, 145 | } 146 | 147 | func TestAC(t *testing.T) { 148 | for _, tt := range cases { 149 | m, err := CompileString(tt.dict) 150 | if err != nil { 151 | t.Fatalf("%s:unable to compile %s", tt.name, err) 152 | } 153 | 154 | // 155 | matches := m.FindAllString(tt.input) 156 | if !reflect.DeepEqual(matches, tt.matches) { 157 | t.Errorf("%s: FindAllString want %v, got %v", tt.name, tt.matches, matches) 158 | } 159 | 160 | // 161 | contains := m.MatchString(tt.input) 162 | if contains { 163 | if len(tt.matches) == 0 { 164 | t.Errorf("%s: MatchString want false, but got true", tt.name) 165 | } 166 | } else { 167 | // does not contain, but got matches 168 | if len(tt.matches) != 0 { 169 | t.Errorf("%s: MatchString want true, but got false", tt.name) 170 | } 171 | } 172 | } 173 | } 174 | 175 | func TestACBlices(t *testing.T) { 176 | for _, tt := range cases { 177 | var dict [][]byte 178 | for _, d := range tt.dict { 179 | dict = append(dict, []byte(d)) 180 | } 181 | m := MustCompile(dict) 182 | 183 | matches := m.FindAll([]byte(tt.input)) 184 | var mb [][]byte 185 | for _, m := range matches { 186 | mb = append(mb, []byte(m)) 187 | } 188 | if !reflect.DeepEqual(matches, mb) { 189 | t.Errorf("%s: FindAll = %v, want %v", tt.name, mb, matches) 190 | } 191 | 192 | contains := m.Match([]byte(tt.input)) 193 | if contains { 194 | if len(tt.matches) == 0 { 195 | t.Errorf("%s: MatchString = true, want false", tt.name) 196 | } 197 | } else { 198 | // does not contain, but got matches 199 | if len(tt.matches) != 0 { 200 | t.Errorf("%s: Match = false, want true", tt.name) 201 | } 202 | } 203 | 204 | } 205 | } 206 | 207 | func TestNonASCIIDictionary(t *testing.T) { 208 | dict := []string{"hello world", "こんにちは世界"} 209 | _, err := CompileString(dict) 210 | if err != nil { 211 | t.Errorf("error compiling matcher: %s", err) 212 | } 213 | } 214 | 215 | var ( 216 | source1 = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36" 217 | source1b = []byte(source1) 218 | dict1 = []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"} 219 | dict2 = []string{"Googlebot", "bingbot", "msnbot", "Yandex", "Baiduspider"} 220 | re1 = MustCompileString(dict1) 221 | re2 = MustCompileString(dict2) 222 | ) 223 | 224 | // this is to prevent optimizer tricks 225 | var result1 bool 226 | 227 | func BenchmarkAC1(b *testing.B) { 228 | var result bool 229 | for i := 0; i < b.N; i++ { 230 | result = re1.MatchString(source1) 231 | } 232 | result1 = result 233 | } 234 | 235 | func ExampleMatcher_FindAllString() { 236 | m := MustCompileString([]string{"Superman", "uperman", "perman", "erman"}) 237 | matches := m.FindAllString("The Man Of Steel: Superman") 238 | fmt.Println(matches) 239 | // Output: [Superman uperman perman erman] 240 | } 241 | 242 | func ExampleMatcher_MatchString() { 243 | m := MustCompileString([]string{"Superman", "uperman", "perman", "erman"}) 244 | contains := m.MatchString("The Man Of Steel: Superman") 245 | fmt.Println(contains) 246 | // Output: true 247 | } 248 | 249 | func BenchmarkAC2(b *testing.B) { 250 | var result bool 251 | for i := 0; i < b.N; i++ { 252 | result = re2.MatchString(source1) 253 | } 254 | result1 = result 255 | } 256 | func BenchmarkAC2Byte(b *testing.B) { 257 | var result bool 258 | for i := 0; i < b.N; i++ { 259 | result = re2.Match(source1b) 260 | } 261 | result1 = result 262 | } 263 | -------------------------------------------------------------------------------- /acascii/ac.go: -------------------------------------------------------------------------------- 1 | // Package ac provides an implementation of the Aho-Corasick string matching 2 | // algorithm. Throughout this code []byte is referred to 3 | // as a blice. 4 | // 5 | // http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm 6 | // 7 | // Copyright (c) 2013 CloudFlare, Inc. 8 | // 9 | // Originally from https://github.com/cloudflare/ahocorasick 10 | package acascii 11 | 12 | import ( 13 | "container/list" 14 | "errors" 15 | ) 16 | 17 | const maxchar = 128 18 | 19 | // ErrNotASCII is returned when the dictionary input is not ASCII 20 | var ErrNotASCII = errors.New("non-ASCII input") 21 | 22 | // A node in the trie structure used to implement Aho-Corasick 23 | type node struct { 24 | root bool // true if this is the root 25 | 26 | output bool // True means this node represents a blice that should 27 | // be output when matching 28 | 29 | b string // The path at this node 30 | 31 | index int // index into original dictionary if output is true 32 | 33 | counter int // Set to the value of the Matcher.counter when a 34 | // match is output to prevent duplicate output 35 | 36 | // The use of fixed size arrays is space-inefficient but fast for 37 | // lookups. 38 | 39 | child [maxchar]*node // A non-nil entry in this array means that the 40 | // index represents a byte value which can be 41 | // appended to the current node. Blices in the 42 | // trie are built up byte by byte through these 43 | // child node pointers. 44 | 45 | fails [maxchar]*node // Where to fail to (by following the fail 46 | // pointers) for each possible byte 47 | 48 | suffix *node // Pointer to the longest possible strict suffix of 49 | // this node 50 | 51 | fail *node // Pointer to the next node which is in the dictionary 52 | // which can be reached from here following suffixes. Called fail 53 | // because it is used to fallback in the trie when a match fails. 54 | } 55 | 56 | // Matcher contains a list of blices to match against 57 | type Matcher struct { 58 | counter int // Counts the number of matches done, and is used to 59 | // prevent output of multiple matches of the same string 60 | trie []node // preallocated block of memory containing all the 61 | // nodes 62 | extent int // offset into trie that is currently free 63 | root *node // Points to trie[0] 64 | } 65 | 66 | // findBlice looks for a blice in the trie starting from the root and 67 | // returns a pointer to the node representing the end of the blice. If 68 | // the blice is not found it returns nil. 69 | func (m *Matcher) findBlice(b string) *node { 70 | n := &m.trie[0] 71 | 72 | for n != nil && len(b) > 0 { 73 | n = n.child[int(b[0])] 74 | b = b[1:] 75 | } 76 | 77 | return n 78 | } 79 | 80 | // getFreeNode: gets a free node structure from the Matcher's trie 81 | // pool and updates the extent to point to the next free node. 82 | func (m *Matcher) getFreeNode() *node { 83 | m.extent++ 84 | 85 | if m.extent == 1 { 86 | m.root = &m.trie[0] 87 | m.root.root = true 88 | } 89 | 90 | return &m.trie[m.extent-1] 91 | } 92 | 93 | // buildTrie builds the fundamental trie structure from a set of 94 | // blices. 95 | func (m *Matcher) buildTrie(dictionary [][]byte) error { 96 | 97 | // Work out the maximum size for the trie (all dictionary entries 98 | // are distinct plus the root). This is used to preallocate memory 99 | // for it. 100 | 101 | max := 1 102 | for _, blice := range dictionary { 103 | max += len(blice) 104 | } 105 | m.trie = make([]node, max) 106 | 107 | // Calling this an ignoring its argument simply allocated 108 | // m.trie[0] which will be the root element 109 | 110 | m.getFreeNode() 111 | 112 | // This loop builds the nodes in the trie by following through 113 | // each dictionary entry building the children pointers. 114 | 115 | for _, blice := range dictionary { 116 | n := m.root 117 | for i, b := range blice { 118 | idx := int(b) 119 | if idx >= maxchar { 120 | return ErrNotASCII 121 | } 122 | c := n.child[idx] 123 | 124 | if c == nil { 125 | c = m.getFreeNode() 126 | n.child[idx] = c 127 | c.b = string(blice[0 : i+1]) 128 | 129 | // Nodes directly under the root node will have the 130 | // root as their fail point as there are no suffixes 131 | // possible. 132 | 133 | if i == 0 { 134 | c.fail = m.root 135 | } 136 | 137 | c.suffix = m.root 138 | } 139 | 140 | n = c 141 | } 142 | 143 | // The last value of n points to the node representing a 144 | // dictionary entry 145 | 146 | n.output = true 147 | n.index = len(blice) 148 | } 149 | 150 | l := new(list.List) 151 | l.PushBack(m.root) 152 | 153 | for l.Len() > 0 { 154 | n := l.Remove(l.Front()).(*node) 155 | 156 | for i := 0; i < maxchar; i++ { 157 | c := n.child[i] 158 | if c != nil { 159 | l.PushBack(c) 160 | 161 | for j := 1; j < len(c.b); j++ { 162 | c.fail = m.findBlice(c.b[j:]) 163 | if c.fail != nil { 164 | break 165 | } 166 | } 167 | 168 | if c.fail == nil { 169 | c.fail = m.root 170 | } 171 | 172 | for j := 1; j < len(c.b); j++ { 173 | s := m.findBlice(c.b[j:]) 174 | if s != nil && s.output { 175 | c.suffix = s 176 | break 177 | } 178 | } 179 | } 180 | } 181 | } 182 | 183 | for i := 0; i < m.extent; i++ { 184 | for c := 0; c < maxchar; c++ { 185 | n := &m.trie[i] 186 | for n.child[c] == nil && !n.root { 187 | n = n.fail 188 | } 189 | 190 | m.trie[i].fails[c] = n 191 | } 192 | } 193 | 194 | m.trie = m.trie[:m.extent] 195 | return nil 196 | } 197 | 198 | // buildTrieString builds the fundamental trie structure from a []string 199 | func (m *Matcher) buildTrieString(dictionary []string) error { 200 | 201 | // Work out the maximum size for the trie (all dictionary entries 202 | // are distinct plus the root). This is used to preallocate memory 203 | // for it. 204 | 205 | max := 1 206 | for _, blice := range dictionary { 207 | max += len(blice) 208 | 209 | } 210 | m.trie = make([]node, max) 211 | 212 | // Calling this an ignoring its argument simply allocated 213 | // m.trie[0] which will be the root element 214 | 215 | m.getFreeNode() 216 | 217 | // This loop builds the nodes in the trie by following through 218 | // each dictionary entry building the children pointers. 219 | 220 | for _, blice := range dictionary { 221 | n := m.root 222 | for i := 0; i < len(blice); i++ { 223 | index := int(blice[i]) 224 | if index >= maxchar { 225 | return ErrNotASCII 226 | } 227 | b := int(blice[i]) 228 | c := n.child[b] 229 | if c == nil { 230 | c = m.getFreeNode() 231 | n.child[b] = c 232 | c.b = blice[0 : i+1] 233 | 234 | // Nodes directly under the root node will have the 235 | // root as their fail point as there are no suffixes 236 | // possible. 237 | 238 | if i == 0 { 239 | c.fail = m.root 240 | } 241 | 242 | c.suffix = m.root 243 | } 244 | 245 | n = c 246 | } 247 | 248 | // The last value of n points to the node representing a 249 | // dictionary entry 250 | 251 | n.output = true 252 | n.index = len(blice) 253 | } 254 | 255 | l := new(list.List) 256 | l.PushBack(m.root) 257 | 258 | for l.Len() > 0 { 259 | n := l.Remove(l.Front()).(*node) 260 | 261 | for i := 0; i < maxchar; i++ { 262 | c := n.child[i] 263 | if c != nil { 264 | l.PushBack(c) 265 | 266 | for j := 1; j < len(c.b); j++ { 267 | c.fail = m.findBlice(c.b[j:]) 268 | if c.fail != nil { 269 | break 270 | } 271 | } 272 | 273 | if c.fail == nil { 274 | c.fail = m.root 275 | } 276 | 277 | for j := 1; j < len(c.b); j++ { 278 | s := m.findBlice(c.b[j:]) 279 | if s != nil && s.output { 280 | c.suffix = s 281 | break 282 | } 283 | } 284 | } 285 | } 286 | } 287 | 288 | for i := 0; i < m.extent; i++ { 289 | for c := 0; c < maxchar; c++ { 290 | n := &m.trie[i] 291 | for n.child[c] == nil && !n.root { 292 | n = n.fail 293 | } 294 | 295 | m.trie[i].fails[c] = n 296 | } 297 | } 298 | 299 | m.trie = m.trie[:m.extent] 300 | return nil 301 | } 302 | 303 | // Compile creates a new Matcher using a list of []byte 304 | func Compile(dictionary [][]byte) (*Matcher, error) { 305 | m := new(Matcher) 306 | if err := m.buildTrie(dictionary); err != nil { 307 | return nil, err 308 | } 309 | return m, nil 310 | } 311 | 312 | // MustCompile returns a Matcher or panics 313 | func MustCompile(dictionary [][]byte) *Matcher { 314 | m, err := Compile(dictionary) 315 | if err != nil { 316 | panic(err) 317 | } 318 | return m 319 | } 320 | 321 | // CompileString creates a new Matcher used to match against a set 322 | // of strings (this is a helper to make initialization easy) 323 | func CompileString(dictionary []string) (*Matcher, error) { 324 | m := new(Matcher) 325 | if err := m.buildTrieString(dictionary); err != nil { 326 | return nil, err 327 | } 328 | return m, nil 329 | } 330 | 331 | // MustCompileString returns a Matcher or panics 332 | func MustCompileString(dictionary []string) *Matcher { 333 | m, err := CompileString(dictionary) 334 | if err != nil { 335 | panic(err) 336 | } 337 | return m 338 | } 339 | 340 | // FindAll searches in for blices and returns all the blices found 341 | // in the original dictionary 342 | func (m *Matcher) FindAll(in []byte) [][]byte { 343 | m.counter++ 344 | var hits [][]byte 345 | 346 | n := m.root 347 | 348 | for idx, b := range in { 349 | c := int(b) 350 | if c >= maxchar { 351 | c = 0 352 | } 353 | if !n.root && n.child[c] == nil { 354 | n = n.fails[c] 355 | } 356 | 357 | if n.child[c] != nil { 358 | f := n.child[c] 359 | n = f 360 | 361 | if f.output && f.counter != m.counter { 362 | hits = append(hits, in[idx-f.index+1:idx+1]) 363 | f.counter = m.counter 364 | } 365 | 366 | for !f.suffix.root { 367 | f = f.suffix 368 | if f.counter != m.counter { 369 | hits = append(hits, in[idx-f.index+1:idx+1]) 370 | f.counter = m.counter 371 | } else { 372 | // There's no point working our way up the 373 | // suffixes if it's been done before for this call 374 | // to Match. The matches are already in hits. 375 | break 376 | } 377 | } 378 | } 379 | } 380 | 381 | return hits 382 | } 383 | 384 | // FindAllString searches in for blices and returns all the blices (as strings) found as 385 | // in the original dictionary 386 | func (m *Matcher) FindAllString(in string) []string { 387 | m.counter++ 388 | var hits []string 389 | 390 | n := m.root 391 | slen := len(in) 392 | for idx := 0; idx < slen; idx++ { 393 | c := int(in[idx]) 394 | if c >= maxchar { 395 | c = 0 396 | } 397 | if !n.root && n.child[c] == nil { 398 | n = n.fails[c] 399 | } 400 | 401 | if n.child[c] != nil { 402 | f := n.child[c] 403 | n = f 404 | 405 | if f.output && f.counter != m.counter { 406 | hits = append(hits, in[idx-f.index+1:idx+1]) 407 | f.counter = m.counter 408 | } 409 | 410 | for !f.suffix.root { 411 | f = f.suffix 412 | if f.counter != m.counter { 413 | hits = append(hits, in[idx-f.index+1:idx+1]) 414 | f.counter = m.counter 415 | } else { 416 | // There's no point working our way up the 417 | // suffixes if it's been done before for this call 418 | // to Match. The matches are already in hits. 419 | break 420 | } 421 | } 422 | } 423 | } 424 | 425 | return hits 426 | } 427 | 428 | // Match returns true if the input slice contains any subslices 429 | func (m *Matcher) Match(in []byte) bool { 430 | n := m.root 431 | for _, b := range in { 432 | c := int(b) 433 | if c > maxchar { 434 | c = 0 435 | } 436 | if !n.root && n.child[c] == nil { 437 | n = n.fails[c] 438 | } 439 | 440 | if n.child[c] != nil { 441 | n = n.child[c] 442 | 443 | if n.output { 444 | return true 445 | } 446 | 447 | for !n.suffix.root { 448 | return true 449 | } 450 | } 451 | } 452 | return false 453 | } 454 | 455 | // MatchString returns true if the input slice contains any subslices 456 | func (m *Matcher) MatchString(in string) bool { 457 | n := m.root 458 | slen := len(in) 459 | for idx := 0; idx < slen; idx++ { 460 | c := int(in[idx]) 461 | if c >= maxchar { 462 | c = 0 463 | } 464 | if !n.root && n.child[c] == nil { 465 | n = n.fails[c] 466 | } 467 | if n.child[c] != nil { 468 | n = n.child[c] 469 | 470 | if n.output { 471 | return true 472 | } 473 | 474 | for !n.suffix.root { 475 | return true 476 | } 477 | } 478 | } 479 | return false 480 | } 481 | -------------------------------------------------------------------------------- /acascii/ac_test.go: -------------------------------------------------------------------------------- 1 | package acascii 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "testing" 7 | ) 8 | 9 | var cases = []struct { 10 | name string // matches original test name from cloudflare/ahocorasick 11 | dict []string 12 | input string 13 | matches []string 14 | }{ 15 | { 16 | "TestNoPatterns", 17 | []string{}, 18 | "", 19 | nil, 20 | }, 21 | { 22 | "TestNoData", 23 | []string{"foo", "baz", "bar"}, 24 | "", 25 | nil, 26 | }, 27 | { 28 | "TestSuffixes", 29 | []string{"Superman", "uperman", "perman", "erman"}, 30 | "The Man Of Steel: Superman", 31 | []string{"Superman", "uperman", "perman", "erman"}, 32 | }, 33 | { 34 | "TestPrefixes", 35 | []string{"Superman", "Superma", "Superm", "Super"}, 36 | "The Man Of Steel: Superman", 37 | []string{"Super", "Superm", "Superma", "Superman"}, 38 | }, 39 | { 40 | "TestInterior", 41 | []string{"Steel", "tee", "e"}, 42 | "The Man Of Steel: Superman", 43 | []string{"e", "tee", "Steel"}, 44 | }, 45 | { 46 | "TestMatchAtStart", 47 | []string{"The", "Th", "he"}, 48 | "The Man Of Steel: Superman", 49 | []string{"Th", "The", "he"}, 50 | }, 51 | { 52 | "TestMatchAtEnd", 53 | []string{"teel", "eel", "el"}, 54 | "The Man Of Steel", 55 | []string{"teel", "eel", "el"}, 56 | }, 57 | { 58 | "TestOverlappingPatterns", 59 | []string{"Man ", "n Of", "Of S"}, 60 | "The Man Of Steel", 61 | []string{"Man ", "n Of", "Of S"}, 62 | }, 63 | { 64 | "TestMultipleMatches", 65 | []string{"The", "Man", "an"}, 66 | "A Man A Plan A Canal: Panama, which Man Planned The Canal", 67 | []string{"Man", "an", "The"}, 68 | }, 69 | { 70 | "TestSingleCharacterMatches", 71 | []string{"a", "M", "z"}, 72 | "A Man A Plan A Canal: Panama, which Man Planned The Canal", 73 | []string{"M", "a"}}, 74 | { 75 | "TestNothingMatches", 76 | []string{"baz", "bar", "foo"}, 77 | "A Man A Plan A Canal: Panama, which Man Planned The Canal", 78 | nil, 79 | }, 80 | { 81 | "Wikipedia1", 82 | []string{"a", "ab", "bc", "bca", "c", "caa"}, 83 | "abccab", 84 | []string{"a", "ab", "bc", "c"}, 85 | }, 86 | { 87 | "Wikipedia2", 88 | []string{"a", "ab", "bc", "bca", "c", "caa"}, 89 | "bccab", 90 | []string{"bc", "c", "a", "ab"}, 91 | }, 92 | { 93 | "Wikipedia3", 94 | []string{"a", "ab", "bc", "bca", "c", "caa"}, 95 | "bccb", 96 | []string{"bc", "c"}, 97 | }, 98 | { 99 | "Browser1", 100 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 101 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", 102 | []string{"Mozilla", "Mac", "Macintosh", "Safari"}, 103 | }, 104 | { 105 | "Browser2", 106 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 107 | "Mozilla/5.0 (Mac; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", 108 | []string{"Mozilla", "Mac", "Safari"}, 109 | }, 110 | { 111 | "Browser3", 112 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 113 | "Mozilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", 114 | []string{"Mozilla", "Safari"}, 115 | }, 116 | { 117 | "Browser4", 118 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 119 | "Mozilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36", 120 | []string{"Mozilla"}, 121 | }, 122 | { 123 | "Browser5", 124 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"}, 125 | "Mazilla/5.0 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36", 126 | nil, 127 | }, 128 | { 129 | // this is to make sure backtracking works. We get a partial 130 | // match of "Superwoman" with "Superman". Then we need to make 131 | // sure that we restart the search and find "per". Some implementations 132 | // had bugs that didn't backtrack (really start over) and didn't match 133 | // "per" 134 | "Backtrack", 135 | []string{"Superwoman", "per"}, 136 | "The Man Of Steel: Superman", 137 | []string{"per"}, 138 | }, 139 | { 140 | "NotAsciiInput", 141 | []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage", "Gecko"}, 142 | "Mazilla/5.0 \u0000 (Moc; Intel Computer OS X 10_7_5) AppleWebKit/537.36 \uFFFF (KHTML, like Gecko) Chrome/30.0.1599.101 Sofari/537.36", 143 | []string{"Gecko"}, 144 | }, 145 | } 146 | 147 | func TestAC(t *testing.T) { 148 | for _, tt := range cases { 149 | m, err := CompileString(tt.dict) 150 | if err != nil { 151 | t.Fatalf("%s:unable to compile %s", tt.name, err) 152 | } 153 | 154 | // 155 | matches := m.FindAllString(tt.input) 156 | if !reflect.DeepEqual(matches, tt.matches) { 157 | t.Errorf("%s: FindAllString want %v, got %v", tt.name, tt.matches, matches) 158 | } 159 | 160 | // 161 | contains := m.MatchString(tt.input) 162 | if contains { 163 | if len(tt.matches) == 0 { 164 | t.Errorf("%s: MatchString want false, but got true", tt.name) 165 | } 166 | } else { 167 | // does not contain, but got matches 168 | if len(tt.matches) != 0 { 169 | t.Errorf("%s: MatchString want true, but got false", tt.name) 170 | } 171 | } 172 | } 173 | } 174 | 175 | func TestACBlices(t *testing.T) { 176 | for _, tt := range cases { 177 | var dict [][]byte 178 | for _, d := range tt.dict { 179 | dict = append(dict, []byte(d)) 180 | } 181 | m := MustCompile(dict) 182 | 183 | matches := m.FindAll([]byte(tt.input)) 184 | var mb [][]byte 185 | for _, m := range matches { 186 | mb = append(mb, []byte(m)) 187 | } 188 | if !reflect.DeepEqual(matches, mb) { 189 | t.Errorf("%s: FindAll = %v, want %v", tt.name, mb, matches) 190 | } 191 | 192 | contains := m.Match([]byte(tt.input)) 193 | if contains { 194 | if len(tt.matches) == 0 { 195 | t.Errorf("%s: MatchString = true, want false", tt.name) 196 | } 197 | } else { 198 | // does not contain, but got matches 199 | if len(tt.matches) != 0 { 200 | t.Errorf("%s: Match = false, want true", tt.name) 201 | } 202 | } 203 | 204 | } 205 | } 206 | 207 | func TestNonASCIIDictionary(t *testing.T) { 208 | dict := []string{"hello world", "こんにちは世界"} 209 | _, err := CompileString(dict) 210 | if err == nil { 211 | t.Errorf("expected error compiling ASCII matcher") 212 | } 213 | } 214 | 215 | var ( 216 | source1 = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36" 217 | source1b = []byte(source1) 218 | dict1 = []string{"Mozilla", "Mac", "Macintosh", "Safari", "Sausage"} 219 | dict2 = []string{"Googlebot", "bingbot", "msnbot", "Yandex", "Baiduspider"} 220 | re1 = MustCompileString(dict1) 221 | re2 = MustCompileString(dict2) 222 | ) 223 | 224 | // this is to prevent optimizer tricks 225 | var result1 bool 226 | 227 | func BenchmarkAC1(b *testing.B) { 228 | var result bool 229 | for i := 0; i < b.N; i++ { 230 | result = re1.MatchString(source1) 231 | } 232 | result1 = result 233 | } 234 | 235 | func ExampleMatcher_FindAllString() { 236 | m := MustCompileString([]string{"Superman", "uperman", "perman", "erman"}) 237 | matches := m.FindAllString("The Man Of Steel: Superman") 238 | fmt.Println(matches) 239 | // Output: [Superman uperman perman erman] 240 | } 241 | 242 | func ExampleMatcher_MatchString() { 243 | m := MustCompileString([]string{"Superman", "uperman", "perman", "erman"}) 244 | contains := m.MatchString("The Man Of Steel: Superman") 245 | fmt.Println(contains) 246 | // Output: true 247 | } 248 | 249 | func BenchmarkAC2(b *testing.B) { 250 | var result bool 251 | for i := 0; i < b.N; i++ { 252 | result = re2.MatchString(source1) 253 | } 254 | result1 = result 255 | } 256 | func BenchmarkAC2Byte(b *testing.B) { 257 | var result bool 258 | for i := 0; i < b.N; i++ { 259 | result = re2.Match(source1b) 260 | } 261 | result1 = result 262 | } 263 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/signalsciences/ac 2 | 3 | go 1.12 4 | --------------------------------------------------------------------------------