├── README.md ├── gokmp.go └── gokmp_test.go /README.md: -------------------------------------------------------------------------------- 1 | gokmp 2 | ===== 3 | 4 | String-matching in Golang using the Knuth–Morris–Pratt algorithm (KMP). 5 | 6 | ## Disclaimer 7 | 8 | This library was written as part of my Master's Thesis and should be used as a helpful implementation reference for people interested in the Knuth-Morris-Pratt algorithm than as a performance string searching library. 9 | 10 | I believe the compiler has since caught up to most of the gains that this library bought me back in the day. 11 | 12 | See [Documentation](http://godoc.org/github.com/paddie/gokmp) on [GoDoc](http://godoc.org/). 13 | 14 | Example: 15 | ======== 16 | ```Go 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "github.com/paddie/gokmp" 22 | ) 23 | 24 | const str = "aabaabaaaabbaabaabaaabbaabaabb" 25 | // " _ _ _ " 26 | // 8 19 26 27 | const pattern = "aabb" 28 | 29 | func main() { 30 | kmp, _ := gokmp.NewKMP(pattern) 31 | ints := kmp.FindAllStringIndex(str) 32 | 33 | fmt.Println(ints) 34 | } 35 | ``` 36 | Output: 37 | ======= 38 | ```Go 39 | [8 19 26] 40 | ``` 41 | 42 | Tests and Benchmarks: 43 | ===================== 44 | ``` 45 | go test -v -bench . 46 | ``` 47 | 48 | Output: 49 | ======= 50 | ``` 51 | === RUN TestFindAllStringIndex 52 | --- PASS: TestFindAllStringIndex (0.00 seconds) 53 | === RUN TestFindStringIndex 54 | --- PASS: TestFindStringIndex (0.00 seconds) 55 | === RUN TestContainedIn 56 | --- PASS: TestContainedIn (0.00 seconds) 57 | === RUN TestOccurrences 58 | --- PASS: TestOccurrences (0.00 seconds) 59 | === RUN TestOccurrencesFail 60 | --- PASS: TestOccurrencesFail (0.00 seconds) 61 | PASS 62 | BenchmarkKMPIndexComparison 10000000 178 ns/op 63 | BenchmarkStringsIndexComparison 10000000 359 ns/op 64 | ok github.com/paddie/gokmp 5.854s 65 | ``` 66 | Comparison: 67 | ============ 68 | ```bash 69 | gokmp.FindStringIndex / strings.Index = 178/359 = 0.4958 70 | ``` 71 | Almost a 2x improvement over the naive built-in method. 72 | -------------------------------------------------------------------------------- /gokmp.go: -------------------------------------------------------------------------------- 1 | // String-matching in Golang using the Knuth–Morris–Pratt algorithm (KMP) 2 | package gokmp 3 | 4 | import ( 5 | "errors" 6 | "fmt" 7 | ) 8 | 9 | type KMP struct { 10 | pattern string 11 | prefix []int 12 | size int 13 | } 14 | 15 | // For debugging 16 | func (kmp *KMP) String() string { 17 | return fmt.Sprintf("pattern: %v\nprefix: %v", kmp.pattern, kmp.prefix) 18 | } 19 | 20 | // compile new prefix-array given argument 21 | func NewKMP(pattern string) (*KMP, error) { 22 | prefix, err := computePrefix(pattern) 23 | if err != nil { 24 | return nil, err 25 | } 26 | return &KMP{ 27 | pattern: pattern, 28 | prefix: prefix, 29 | size: len(pattern)}, 30 | nil 31 | } 32 | 33 | // returns an array containing indexes of matches 34 | // - error if pattern argument is less than 1 char 35 | func computePrefix(pattern string) ([]int, error) { 36 | // sanity check 37 | len_p := len(pattern) 38 | if len_p < 2 { 39 | if len_p == 0 { 40 | return nil, errors.New("'pattern' must contain at least one character") 41 | } 42 | return []int{-1}, nil 43 | } 44 | t := make([]int, len_p) 45 | t[0], t[1] = -1, 0 46 | 47 | pos, count := 2, 0 48 | for pos < len_p { 49 | 50 | if pattern[pos-1] == pattern[count] { 51 | count++ 52 | t[pos] = count 53 | pos++ 54 | } else { 55 | if count > 0 { 56 | count = t[count] 57 | } else { 58 | t[pos] = 0 59 | pos++ 60 | } 61 | } 62 | } 63 | return t, nil 64 | } 65 | 66 | // return index of first occurence of kmp.pattern in argument 's' 67 | // - if not found, returns -1 68 | func (kmp *KMP) FindStringIndex(s string) int { 69 | // sanity check 70 | if len(s) < kmp.size { 71 | return -1 72 | } 73 | m, i := 0, 0 74 | for m+i < len(s) { 75 | if kmp.pattern[i] == s[m+i] { 76 | if i == kmp.size-1 { 77 | return m 78 | } 79 | i++ 80 | } else { 81 | m = m + i - kmp.prefix[i] 82 | if kmp.prefix[i] > -1 { 83 | i = kmp.prefix[i] 84 | } else { 85 | i = 0 86 | } 87 | } 88 | } 89 | return -1 90 | } 91 | 92 | // returns true if pattern i matched at least once 93 | func (kmp *KMP) ContainedIn(s string) bool { 94 | return kmp.FindStringIndex(s) >= 0 95 | } 96 | 97 | // returns the number of occurences of pattern in argument 98 | func (kmp *KMP) Occurrences(s string) int { 99 | return len(kmp.FindAllStringIndex(s)) 100 | } 101 | 102 | // for effeciency, define default array-size 103 | const startSize = 10 104 | 105 | // find every occurence of the kmp.pattern in 's' 106 | func (kmp *KMP) FindAllStringIndex(s string) []int { 107 | // precompute 108 | len_s := len(s) 109 | 110 | if len_s < kmp.size { 111 | return []int{} 112 | } 113 | 114 | match := make([]int, 0, startSize) 115 | m, i := 0, 0 116 | for m+i < len_s { 117 | if kmp.pattern[i] == s[m+i] { 118 | if i == kmp.size-1 { 119 | // the word was matched 120 | match = append(match, m) 121 | // simulate miss, and keep running 122 | m = m + i - kmp.prefix[i] 123 | if kmp.prefix[i] > -1 { 124 | i = kmp.prefix[i] 125 | } else { 126 | i = 0 127 | } 128 | } else { 129 | i++ 130 | } 131 | } else { 132 | m = m + i - kmp.prefix[i] 133 | if kmp.prefix[i] > -1 { 134 | i = kmp.prefix[i] 135 | } else { 136 | i = 0 137 | } 138 | } 139 | } 140 | return match 141 | } 142 | -------------------------------------------------------------------------------- /gokmp_test.go: -------------------------------------------------------------------------------- 1 | package gokmp 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | // TESTS 9 | 10 | // pretty much the worst case string for strings.Index 11 | // wrt. string and pattern 12 | const str = "aabaabaaaabbaabaabaaabbaabaabb" 13 | const pattern = "aabb" 14 | 15 | func TestFindAllStringIndex(t *testing.T) { 16 | kmp, _ := NewKMP(pattern) 17 | // fmt.Println(kmp) 18 | ints := kmp.FindAllStringIndex(str) 19 | test := []int{8, 19, 26} 20 | for i, v := range ints { 21 | if test[i] != v { 22 | t.Errorf("FindAllStringIndex:\t%v != %v, (exp: %v != act: %v)", test[i], v, ints, test) 23 | } 24 | } 25 | } 26 | 27 | func TestFindStringIndex(t *testing.T) { 28 | kmp, _ := NewKMP(pattern) 29 | ints := kmp.FindStringIndex(str) 30 | test := 8 31 | if ints != test { 32 | t.Errorf("FindStringIndex:\t%v != %v", ints, test) 33 | } 34 | } 35 | 36 | func TestContainedIn(t *testing.T) { 37 | kmp, _ := NewKMP(pattern) 38 | if !kmp.ContainedIn(str) { 39 | t.Errorf("ContainedIn:\tExpected: True != actual: False") 40 | } 41 | } 42 | 43 | func TestOccurrences(t *testing.T) { 44 | kmp, _ := NewKMP(pattern) 45 | nr := kmp.Occurrences(str) 46 | if nr != 3 { 47 | t.Errorf("Occurences:\texp: %v != act: %v)", 3, nr) 48 | } 49 | } 50 | 51 | func TestOccurrencesFail(t *testing.T) { 52 | kmp, _ := NewKMP(pattern) 53 | nr := kmp.Occurrences("pebble") 54 | if nr != 0 { 55 | t.Errorf("Occurences:\texp: %v != act: %v)", 0, nr) 56 | } 57 | } 58 | 59 | // BENCHMARKS 60 | 61 | func BenchmarkKMPIndexComparison(b *testing.B) { 62 | kmp, _ := NewKMP(pattern) 63 | for i := 0; i < b.N; i++ { 64 | _ = kmp.FindStringIndex(str) 65 | } 66 | } 67 | 68 | func BenchmarkStringsIndexComparison(b *testing.B) { 69 | for i := 0; i < b.N; i++ { 70 | _ = strings.Index(str, pattern) 71 | } 72 | } 73 | --------------------------------------------------------------------------------