├── go.mod ├── README.md ├── LICENSE.txt ├── murmur_test.go ├── murmur.go ├── mph.go └── mph_test.go /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/cespare/mph 2 | 3 | go 1.17 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mph 2 | 3 | 4 | [![Go Reference](https://pkg.go.dev/badge/github.com/cespare/mph.svg)](https://pkg.go.dev/github.com/cespare/mph) 5 | 6 | mph is a Go package that implements a [minimal perfect hash table][mph] over 7 | strings. It uses the ["Hash, displace, and compress" algorithm][algo] and the 8 | [Murmur3 hash function][murmur3]. 9 | 10 | Some quick benchmark results (this is on an i7-8700K): 11 | 12 | * `Build` constructs a minimal perfect hash table from a 102k word dictionary in 13 | 18ms (construction time is linear in the size of the input). 14 | * `Lookup`s on that dictionary take about 30ns and are 27% faster than a 15 | `map[string]uint32`: 16 | 17 | BenchmarkTable-12 199293806 29.99 ns/op 18 | BenchmarkTableMap-12 145449822 40.92 ns/op 19 | 20 | [mph]: https://en.wikipedia.org/wiki/Perfect_hash_function#Minimal_perfect_hash_function 21 | [algo]: http://cmph.sourceforge.net/papers/esa09.pdf 22 | [murmur3]: https://en.wikipedia.org/wiki/MurmurHash 23 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Caleb Spare 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /murmur_test.go: -------------------------------------------------------------------------------- 1 | package mph 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | var murmurTestCases = []struct { 10 | input string 11 | seed murmurSeed 12 | want uint32 13 | }{ 14 | {"", 0, 0}, 15 | {"", 1, 0x514e28b7}, 16 | {"", 0xffffffff, 0x81f16f39}, 17 | {"\xff\xff\xff\xff", 0, 0x76293b50}, 18 | {"!Ce\x87", 0, 0xf55b516b}, 19 | {"!Ce\x87", 0x5082edee, 0x2362f9de}, 20 | {"!Ce", 0, 0x7e4a8634}, 21 | {"!C", 0, 0xa0f7b07a}, 22 | {"!", 0, 0x72661cf4}, 23 | {"\x00\x00\x00\x00", 0, 0x2362f9de}, 24 | {"\x00\x00\x00", 0, 0x85f0b427}, 25 | {"\x00\x00", 0, 0x30f4c306}, 26 | {"Hello, world!", 0x9747b28c, 0x24884CBA}, 27 | {"ππππππππ", 0x9747b28c, 0xD58063C1}, 28 | {"abc", 0, 0xb3dd93fa}, 29 | {"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 0, 0xee925b90}, 30 | {"The quick brown fox jumps over the lazy dog", 0x9747b28c, 0x2fa826cd}, 31 | {strings.Repeat("a", 256), 0x9747b28c, 0x37405bdc}, 32 | } 33 | 34 | // TestMurmur checks hash against the expected value of every entry in murmurTestCases. 35 | func TestMurmur(t *testing.T) { 36 | for _, tt := range murmurTestCases { 37 | got := tt.seed.hash(tt.input) 38 | if got != tt.want { 39
| t.Errorf("hash(%q, seed=0x%x): got 0x%x; want 0x%x", 40 | tt.input, tt.seed, got, tt.want) 41 | } 42 | } 43 | } 44 | 45 | // BenchmarkMurmur measures hash on inputs of several lengths, reporting bytes processed per op. 46 | func BenchmarkMurmur(b *testing.B) { 47 | for _, size := range []int{1, 4, 8, 16, 32, 50, 500} { 48 | b.Run(fmt.Sprint(size), func(b *testing.B) { 49 | s := strings.Repeat("a", size) 50 | b.SetBytes(int64(size)) 51 | var seed murmurSeed 52 | b.ResetTimer() 53 | for i := 0; i < b.N; i++ { 54 | seed.hash(s) 55 | } 56 | }) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /murmur.go: -------------------------------------------------------------------------------- 1 | package mph 2 | 3 | import ( 4 | "reflect" 5 | "unsafe" 6 | ) 7 | 8 | // This file contains an optimized murmur3 32-bit implementation tailored for 9 | // our specific use case. See https://en.wikipedia.org/wiki/MurmurHash. 10 | 11 | // A murmurSeed is the initial state of a Murmur3 hash. 12 | type murmurSeed uint32 13 | 14 | const ( 15 | c1 = 0xcc9e2d51 16 | c2 = 0x1b873593 17 | r1Left = 15 18 | r1Right = 32 - r1Left 19 | r2Left = 13 20 | r2Right = 32 - r2Left 21 | m = 5 22 | n = 0xe6546b64 23 | ) 24 | 25 | // hash computes the 32-bit Murmur3 hash of s using ms as the seed. 
26 | func (ms murmurSeed) hash(s string) uint32 { 27 | h := uint32(ms) 28 | l := len(s) 29 | numBlocks := l / 4 // number of whole 4-byte blocks in s 30 | var blocks []uint32 // will alias the first numBlocks*4 bytes of s; nothing is copied 31 | header := (*reflect.SliceHeader)(unsafe.Pointer(&blocks)) 32 | header.Data = (*reflect.StringHeader)(unsafe.Pointer(&s)).Data // NOTE(review): blocks are then read in native byte order with no alignment guarantee; presumably only little-endian targets reproduce the test vectors - confirm 33 | header.Len = numBlocks 34 | header.Cap = numBlocks 35 | for _, k := range blocks { // mix each whole block into h 36 | k *= c1 37 | k = (k << r1Left) | (k >> r1Right) // rotate left by r1Left bits 38 | k *= c2 39 | h ^= k 40 | h = (h << r2Left) | (h >> r2Right) // rotate left by r2Left bits 41 | h = h*m + n 42 | } 43 | 44 | var k uint32 // gathers the 1-3 trailing bytes that do not fill a block 45 | ntail := l & 3 // l % 4 46 | itail := l - ntail // index of the first trailing byte 47 | switch ntail { 48 | case 3: 49 | k ^= uint32(s[itail+2]) << 16 50 | fallthrough 51 | case 2: 52 | k ^= uint32(s[itail+1]) << 8 53 | fallthrough 54 | case 1: 55 | k ^= uint32(s[itail]) 56 | k *= c1 57 | k = (k << r1Left) | (k >> r1Right) 58 | k *= c2 59 | h ^= k 60 | } 61 | 62 | h ^= uint32(l) // finalization: fold in the length, then alternate xor-shifts and multiplies 63 | h ^= h >> 16 64 | h *= 0x85ebca6b 65 | h ^= h >> 13 66 | h *= 0xc2b2ae35 67 | h ^= h >> 16 68 | return h 69 | } 70 | -------------------------------------------------------------------------------- /mph.go: -------------------------------------------------------------------------------- 1 | // Package mph implements a minimal perfect hash table over strings. 2 | package mph 3 | 4 | import "sort" 5 | 6 | // A Table is an immutable hash table that provides constant-time lookups of key 7 | // indices using a minimal perfect hash. 8 | type Table struct { 9 | keys []string // the keys given to Build, in their original order 10 | level0 []uint32 // per-bucket seeds; power of 2 size 11 | level0Mask int // len(level0) - 1 12 | level1 []uint32 // key indices; power of 2 size >= len(keys) 13 | level1Mask int // len(level1) - 1 14 | } 15 | 16 | // Build builds a Table from keys using the "Hash, displace, and compress" 17 | // algorithm described in http://cmph.sourceforge.net/papers/esa09.pdf. 
18 | func Build(keys []string) *Table { 19 | var ( 20 | level0 = make([]uint32, nextPow2(len(keys)/4)) // one seed per bucket; sized for roughly 4 keys per bucket 21 | level0Mask = len(level0) - 1 22 | level1 = make([]uint32, nextPow2(len(keys))) // final slots, each holding a key index 23 | level1Mask = len(level1) - 1 24 | sparseBuckets = make([][]int, len(level0)) // key indices grouped by seed-0 hash 25 | zeroSeed = murmurSeed(0) 26 | ) 27 | for i, s := range keys { 28 | n := int(zeroSeed.hash(s)) & level0Mask 29 | sparseBuckets[n] = append(sparseBuckets[n], i) 30 | } 31 | var buckets []indexBucket // only the non-empty buckets 32 | for n, vals := range sparseBuckets { 33 | if len(vals) > 0 { 34 | buckets = append(buckets, indexBucket{n, vals}) 35 | } 36 | } 37 | sort.Sort(bySize(buckets)) // place the largest buckets first 38 | 39 | occ := make([]bool, len(level1)) // level1 slots already taken 40 | var tmpOcc []int // slots claimed so far by the bucket being placed 41 | for _, bucket := range buckets { 42 | var seed murmurSeed 43 | trySeed: // find a seed that sends every key of this bucket to a free slot 44 | tmpOcc = tmpOcc[:0] 45 | for _, i := range bucket.vals { 46 | n := int(seed.hash(keys[i])) & level1Mask 47 | if occ[n] { // collision: release this bucket's slots and retry with the next seed 48 | for _, n := range tmpOcc { 49 | occ[n] = false 50 | } 51 | seed++ // NOTE(review): equal keys collide under every seed, so duplicates in keys would keep this loop retrying forever - confirm callers pass unique keys 52 | goto trySeed 53 | } 54 | occ[n] = true 55 | tmpOcc = append(tmpOcc, n) 56 | level1[n] = uint32(i) 57 | } 58 | level0[int(bucket.n)] = uint32(seed) 59 | } 60 | 61 | return &Table{ 62 | keys: keys, 63 | level0: level0, 64 | level0Mask: level0Mask, 65 | level1: level1, 66 | level1Mask: level1Mask, 67 | } 68 | } 69 | // nextPow2 returns the smallest power of two that is >= n; it returns 1 for n <= 1. 70 | func nextPow2(n int) int { 71 | for i := 1; ; i *= 2 { 72 | if i >= n { 73 | return i 74 | } 75 | } 76 | } 77 | 78 | // Lookup searches for s in t and returns its index and whether it was found. 
79 | func (t *Table) Lookup(s string) (n uint32, ok bool) { 80 | i0 := int(murmurSeed(0).hash(s)) & t.level0Mask 81 | seed := t.level0[i0] 82 | i1 := int(murmurSeed(seed).hash(s)) & t.level1Mask 83 | n = t.level1[i1] 84 | return n, s == t.keys[int(n)] 85 | } 86 | 87 | type indexBucket struct { 88 | n int 89 | vals []int 90 | } 91 | 92 | type bySize []indexBucket 93 | 94 | func (s bySize) Len() int { return len(s) } 95 | func (s bySize) Less(i, j int) bool { return len(s[i].vals) > len(s[j].vals) } 96 | func (s bySize) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 97 | -------------------------------------------------------------------------------- /mph_test.go: -------------------------------------------------------------------------------- 1 | package mph 2 | 3 | import ( 4 | "bufio" 5 | "os" 6 | "strconv" 7 | "sync" 8 | "testing" 9 | ) 10 | 11 | func TestBuild_simple(t *testing.T) { 12 | testTable(t, []string{"foo", "foo2", "bar", "baz"}, []string{"quux"}) 13 | } 14 | 15 | func TestBuild_stress(t *testing.T) { 16 | var keys, extra []string 17 | for i := 0; i < 20000; i++ { 18 | s := strconv.Itoa(i) 19 | if i < 10000 { 20 | keys = append(keys, s) 21 | } else { 22 | extra = append(extra, s) 23 | } 24 | } 25 | testTable(t, keys, extra) 26 | } 27 | 28 | func testTable(t *testing.T, keys []string, extra []string) { 29 | table := Build(keys) 30 | for i, key := range keys { 31 | n, ok := table.Lookup(key) 32 | if !ok { 33 | t.Errorf("Lookup(%s): got !ok; want ok", key) 34 | continue 35 | } 36 | if int(n) != i { 37 | t.Errorf("Lookup(%s): got n=%d; want %d", key, n, i) 38 | } 39 | } 40 | for _, key := range extra { 41 | if _, ok := table.Lookup(key); ok { 42 | t.Errorf("Lookup(%s): got ok; want !ok", key) 43 | } 44 | } 45 | } 46 | 47 | var ( 48 | words []string 49 | wordsOnce sync.Once 50 | benchTable *Table 51 | ) 52 | 53 | func BenchmarkBuild(b *testing.B) { 54 | wordsOnce.Do(loadBenchTable) 55 | if len(words) == 0 { 56 | b.Skip("unable to load dictionary file") 57 | } 58 
| for i := 0; i < b.N; i++ { 59 | Build(words) 60 | } 61 | } 62 | 63 | func BenchmarkTable(b *testing.B) { 64 | wordsOnce.Do(loadBenchTable) 65 | if len(words) == 0 { 66 | b.Skip("unable to load dictionary file") 67 | } 68 | b.ResetTimer() 69 | for i := 0; i < b.N; i++ { 70 | j := i % len(words) 71 | n, ok := benchTable.Lookup(words[j]) 72 | if !ok { 73 | b.Fatal("missing key") 74 | } 75 | if n != uint32(j) { 76 | b.Fatal("bad result index") 77 | } 78 | } 79 | } 80 | 81 | // For comparison against BenchmarkTable. 82 | func BenchmarkTableMap(b *testing.B) { 83 | wordsOnce.Do(loadBenchTable) 84 | if len(words) == 0 { 85 | b.Skip("unable to load dictionary file") 86 | } 87 | m := make(map[string]uint32) 88 | for i, word := range words { 89 | m[word] = uint32(i) 90 | } 91 | b.ResetTimer() 92 | for i := 0; i < b.N; i++ { 93 | j := i % len(words) 94 | n, ok := m[words[j]] 95 | if !ok { 96 | b.Fatal("missing key") 97 | } 98 | if n != uint32(j) { 99 | b.Fatal("bad result index") 100 | } 101 | } 102 | } 103 | 104 | func loadBenchTable() { 105 | for _, dict := range []string{"/usr/share/dict/words", "/usr/dict/words"} { 106 | var err error 107 | words, err = loadDict(dict) 108 | if err == nil { 109 | break 110 | } 111 | } 112 | if len(words) > 0 { 113 | benchTable = Build(words) 114 | } 115 | } 116 | 117 | func loadDict(dict string) ([]string, error) { 118 | f, err := os.Open(dict) 119 | if err != nil { 120 | return nil, err 121 | } 122 | defer f.Close() 123 | scanner := bufio.NewScanner(f) 124 | var words []string 125 | for scanner.Scan() { 126 | words = append(words, scanner.Text()) 127 | } 128 | if err := scanner.Err(); err != nil { 129 | return nil, err 130 | } 131 | return words, nil 132 | } 133 | --------------------------------------------------------------------------------