├── .gitignore ├── LICENSE ├── README.md ├── aho-corasick.lua └── test.lua /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 CloudFlare, Inc. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of CloudFlare, Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | aho-corasick-lua 2 | ================ 3 | 4 | Lua implementation of the Aho-Corasick string matching algorithm 5 | 6 | See http://en.wikipedia.org/wiki/Aho?Corasick_string_matching_algorithm 7 | 8 | -------------------------------------------------------------------------------- /aho-corasick.lua: -------------------------------------------------------------------------------- 1 | -- A Lua implementation of the Aho-Corasick string matching algorithm 2 | -- 3 | -- Copyright (c) 2013-2014 CloudFlare, Inc. 4 | -- 5 | -- Usage: 6 | -- 7 | -- local AC = require 'aho-corasick' 8 | -- 9 | -- t = AC.build({'words', 'to', 'find'}) 10 | -- r = AC.match(t, 'try to find in this string') 11 | -- r == {'to', 'find'} 12 | 13 | local M = {} 14 | local byte = string.byte 15 | local char = string.char 16 | 17 | local root = "" 18 | 19 | -- make: creates a new entry in t for the given string c with optional fail 20 | -- state 21 | local function make(t, c, f) 22 | t[c] = {} 23 | t[c].to = {} 24 | t[c].fail = f 25 | t[c].hit = root 26 | t[c].word = false 27 | end 28 | 29 | -- build: builds the Aho-Corasick data structure from an array of strings 30 | function M.build(m) 31 | local t = {} 32 | make(t, root, root) 33 | 34 | for i = 1, #m do 35 | local current = root 36 | 37 | -- Build the tos which capture the transitions within the tree 38 | 39 | for j = 1, m[i]:len() do 40 | local c = byte(m[i], j) 41 | local path = current .. char(c) 42 | 43 | if t[current].to[c] == nil then 44 | t[current].to[c] = path 45 | 46 | if current == root then 47 | make(t, path, root) 48 | else 49 | make(t, path) 50 | end 51 | end 52 | 53 | current = path 54 | end 55 | 56 | t[m[i]].word = true 57 | end 58 | 59 | -- Build the fails which show how to backtrack when a fail matches and 60 | -- build the hits which connect nodes to suffixes that are words 61 | 62 | local q = {root} 63 | 64 | while #q > 0 do 65 | local path = table.remove(q, 1) 66 | 67 | for c, p in pairs(t[path].to) do 68 | table.insert(q, p) 69 | 70 | local fail = p:sub(2) 71 | while fail ~= "" and t[fail] == nil do 72 | fail = fail:sub(2) 73 | end 74 | if fail == "" then fail = root end 75 | t[p].fail = fail 76 | 77 | local hit = p:sub(2) 78 | while hit ~= "" and (t[hit] == nil or not t[hit].word) do 79 | hit = hit:sub(2) 80 | end 81 | if hit == "" then hit = root end 82 | t[p].hit = hit 83 | end 84 | end 85 | 86 | return t 87 | end 88 | 89 | -- match: checks to see if the passed in string matches the passed in tree 90 | -- created with build. If all is true (the default) an array of all matches is 91 | -- returned. If all is false then only the first match is returned. 92 | function M.match(t, s, all) 93 | if all == nil then 94 | all = true 95 | end 96 | 97 | local path = root 98 | local hits = {} 99 | local hits_idx = 0 100 | 101 | for i = 1,s:len() do 102 | local c = byte(s, i) 103 | 104 | while t[path].to[c] == nil and path ~= root do 105 | path = t[path].fail 106 | end 107 | 108 | local n = t[path].to[c] 109 | 110 | if n ~= nil then 111 | path = n 112 | 113 | if t[n].word then 114 | hits_idx = hits_idx + 1 115 | hits[hits_idx] = n 116 | end 117 | 118 | while t[n].hit ~= root do 119 | n = t[n].hit 120 | hits_idx = hits_idx + 1 121 | hits[hits_idx] = n 122 | end 123 | 124 | if all == false and hits_idx > 0 then 125 | return hits 126 | end 127 | end 128 | end 129 | 130 | return hits 131 | end 132 | 133 | return M 134 | 135 | -------------------------------------------------------------------------------- /test.lua: -------------------------------------------------------------------------------- 1 | local AC = require 'aho-corasick' 2 | 3 | local count = 0 4 | 5 | function error(e, t) 6 | print(count .. " " .. e) 7 | os.exit() 8 | end 9 | 10 | function test(s, d, r) 11 | count = count + 1 12 | local t = AC.build(d) 13 | local f = AC.match(t, s) 14 | 15 | if #r ~= #f then 16 | error("Wrong number of results " .. #r .. ", " .. #f, t) 17 | end 18 | 19 | for i = 1,#r do 20 | if r[i] ~= f[i] then 21 | error("Non-matching result " .. r[i] .. ", " .. f[i], t) 22 | end 23 | end 24 | 25 | print(count .. " ok") 26 | end 27 | 28 | -- Example from Wikipedia page 29 | test("abccab", {"a", "ab", "bc", "bca", "c", "caa"}, 30 | {"a", "ab", "bc", "c", "c", "a", "ab"}) 31 | 32 | -- Simple test for finding a string 33 | test("The pot had a handle", {"poto"}, {}) 34 | test("The pot had a handle", {"The"}, {"The"}) 35 | test("The pot had a handle", {"pot"}, {"pot"}) 36 | test("The pot had a handle", {"pot "}, {"pot "}) 37 | test("The pot had a handle", {"ot h"}, {"ot h"}) 38 | test("The pot had a handle", {"andle"}, {"andle"}) 39 | 40 | -- Multiple non-overlapping patterns 41 | test("The pot had a handle", {"h"}, {"h", "h", "h"}) 42 | test("The pot had a handle", {"ha", "he"}, {"he", "ha", "ha"}) 43 | test("The pot had a handle", {"pot", "had"}, {"pot", "had"}) 44 | test("The pot had a handle", {"pot", "had", "hod"}, {"pot", "had"}) 45 | test("The pot had a handle", {"The", "pot", "had", "hod", "andle"}, 46 | {"The", "pot", "had", "andle"}) 47 | 48 | -- Overlapping patterns 49 | test("The pot had a handle", {"Th", "he pot", "The", "pot h"}, 50 | {"Th", "The", "he pot", "pot h"}) 51 | 52 | -- One pattern inside another 53 | test("The pot had a handle", {"handle", "hand", "and", "andle"}, 54 | {"hand", "and", "handle", "andle"}) 55 | test("The pot had a handle", {"handle", "hand", "an", "n"}, 56 | {"an", "n", "hand", "handle"}) 57 | test("The pot had a handle", {"dle", "l", "le"}, 58 | {"l", "dle", "le"}) 59 | 60 | -- Random example 61 | test("yasherhs", {"say", "she", "shr", "he", "her"}, 62 | {"she", "he", "her"}) 63 | 64 | -- Fail from partial match 65 | test("The pot had a handle", {"dlf", "l"}, {"l"}) 66 | 67 | -- Many suffixes and prefixes 68 | test("The pot had a handle", {"handle", "andle", "ndle", "dle", "le", "e"}, 69 | {"e", "handle", "andle", "ndle", "dle", "le", "e"}) 70 | test("The pot had a handle", {"handle", "handl", "hand", "han", "ha", "a"}, 71 | {"ha", "a", "a", "ha", "a", "han", "hand", "handl", "handle"}) 72 | 73 | -- Long word 74 | test("macintosh", {"acintosh", "in"}, {"in", "acintosh"}) 75 | test("macintosh", {"acintosh", "in", "tosh"}, {"in", "acintosh", "tosh"}) 76 | test("macintosh", {"acintosh", "into", "to", "in"}, 77 | {"in", "into", "to", "acintosh", }) 78 | --------------------------------------------------------------------------------