├── .travis.yml ├── bench ├── go.mod ├── main.go └── go.sum ├── go.mod ├── offheap ├── go.mod ├── intbank_test.go ├── intbank.go ├── LICENSE.txt ├── go.sum ├── symboltab_test.go └── symboltab.go ├── intbank_test.go ├── README.md ├── intbank.go ├── go.sum ├── naive.go ├── LICENSE.txt ├── symboltab_test.go └── symboltab.go /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go -------------------------------------------------------------------------------- /bench/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/philpearl/symboltab/bench 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/loov/hrtime v1.0.1 7 | github.com/philpearl/symboltab v1.1.1 8 | ) 9 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/philpearl/symboltab 2 | 3 | go 1.24 4 | 5 | require ( 6 | github.com/philpearl/stringbank v1.1.0 7 | github.com/stretchr/testify v1.3.0 8 | ) 9 | 10 | require ( 11 | github.com/davecgh/go-spew v1.1.0 // indirect 12 | github.com/pmezard/go-difflib v1.0.0 // indirect 13 | ) 14 | -------------------------------------------------------------------------------- /offheap/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/philpearl/symboltab/offheap 2 | 3 | go 1.25 4 | 5 | require ( 6 | github.com/philpearl/mmap v0.0.1 7 | github.com/philpearl/stringbank/offheap v1.0.3 8 | github.com/stretchr/testify v1.3.0 9 | ) 10 | 11 | require ( 12 | github.com/davecgh/go-spew v1.1.1 // indirect 13 | github.com/pmezard/go-difflib v1.0.0 // indirect 14 | ) 15 | -------------------------------------------------------------------------------- /intbank_test.go: -------------------------------------------------------------------------------- 1 | package symboltab 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestIntbank(t *testing.T) { 10 | ib := intbank{} 11 | ib.save(1, 37) 12 | ib.save(2, 43) 13 | 14 | assert.EqualValues(t, 37, ib.lookup(1)) 15 | assert.EqualValues(t, 43, ib.lookup(2)) 16 | assert.EqualValues(t, 37, ib.lookup(1)) 17 | } 18 | -------------------------------------------------------------------------------- /offheap/intbank_test.go: -------------------------------------------------------------------------------- 1 | package offheap 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestIntbank(t *testing.T) { 10 | ib := intbank{} 11 | ib.save(1, 37) 12 | ib.save(2, 43) 13 | 14 | assert.EqualValues(t, 37, ib.lookup(1)) 15 | assert.EqualValues(t, 43, ib.lookup(2)) 16 | assert.EqualValues(t, 37, ib.lookup(1)) 17 | } 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GoDoc](https://godoc.org/github.com/philpearl/symboltab?status.svg)](https://godoc.org/github.com/philpearl/symboltab) 2 | [![Build Status](https://travis-ci.org/philpearl/symboltab.svg)](https://travis-ci.org/philpearl/symboltab) 3 | 4 | 5 | I've called this a "symbol table". It converts a string ID to an integer sequence number. The integers start at 1 and increase by 1 for each new unique string. The intention is to store a very large number of strings, so the library is light on GC. 6 | 7 | The idea behind the symbol table is to convert string IDs into integer IDs that can then be used for fast comparison and array/slice lookups -------------------------------------------------------------------------------- /intbank.go: -------------------------------------------------------------------------------- 1 | package symboltab 2 | 3 | const intbanksize = 1 << 9 4 | 5 | type intbank struct { 6 | slabs [][]int 7 | } 8 | 9 | func (ib *intbank) save(sequence uint32, offset int) { 10 | sequence-- // externally sequence starts at 1 11 | slabNo := int(sequence / intbanksize) 12 | slabOffset := int(sequence % intbanksize) 13 | 14 | for len(ib.slabs) <= slabNo { 15 | ib.slabs = append(ib.slabs, make([]int, intbanksize)) 16 | } 17 | 18 | ib.slabs[slabNo][slabOffset] = offset 19 | } 20 | 21 | func (ib *intbank) lookup(sequence uint32) int { 22 | sequence-- // externally, sequence starts at 1 23 | slabNo := int(sequence / intbanksize) 24 | slabOffset := int(sequence % intbanksize) 25 | 26 | return ib.slabs[slabNo][slabOffset] 27 | } 28 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 2 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/philpearl/stringbank v1.1.0 h1:YY+DV72+w0MAIbjguu4dtNFiOgGtrwJ+hFPaKRkZV+4= 4 | github.com/philpearl/stringbank v1.1.0/go.mod h1:0V0f9Ba79DpIl4FTfotL+7IJ+etELdRQIcHJY2nX/+w= 5 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 6 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 7 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 8 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 9 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 10 | -------------------------------------------------------------------------------- /offheap/intbank.go: -------------------------------------------------------------------------------- 1 | package offheap 2 | 3 | import ( 4 | "github.com/philpearl/mmap" 5 | ) 6 | 7 | const intbanksize = 1 << 12 8 | 9 | type intbank struct { 10 | slabs [][]int 11 | } 12 | 13 | func (ib *intbank) close() { 14 | for _, s := range ib.slabs { 15 | mmap.Free(s) 16 | } 17 | ib.slabs = nil 18 | } 19 | 20 | func (ib *intbank) save(sequence uint32, offset int) { 21 | sequence-- // externally sequence starts at 1 22 | slabNo := int(sequence / intbanksize) 23 | slabOffset := int(sequence % intbanksize) 24 | 25 | for len(ib.slabs) <= slabNo { 26 | ns, _ := mmap.Alloc[int](intbanksize) 27 | ib.slabs = append(ib.slabs, ns) 28 | } 29 | 30 | ib.slabs[slabNo][slabOffset] = offset 31 | } 32 | 33 | func (ib *intbank) lookup(sequence uint32) int { 34 | sequence-- // externally, sequence starts at 1 35 | slabNo := int(sequence / intbanksize) 36 | slabOffset := int(sequence % intbanksize) 37 | 38 | return ib.slabs[slabNo][slabOffset] 39 | } 40 | -------------------------------------------------------------------------------- /naive.go: -------------------------------------------------------------------------------- 1 | package symboltab 2 | 3 | // Naive implementation of the same function. Really just intended to compare against 4 | type Naive struct { 5 | m map[string]int32 6 | i []string 7 | } 8 | 9 | // NewNaive creates a new, basic implementation of the symboltable function 10 | func NewNaive(cap int) *Naive { 11 | return &Naive{ 12 | m: make(map[string]int32, cap), 13 | i: make([]string, 0, cap), 14 | } 15 | } 16 | 17 | // StringToSequence converts a string to a sequence number 18 | func (n *Naive) StringToSequence(val string, addNew bool) (seq int32, found bool) { 19 | seq, ok := n.m[val] 20 | if ok { 21 | return seq, true 22 | } 23 | if addNew { 24 | seq := int32(len(n.m)) + 1 25 | n.i = append(n.i, val) 26 | n.m[val] = seq 27 | return seq, false 28 | } 29 | return 0, false 30 | } 31 | 32 | // SequenceToString retrieves the string for a sequence number 33 | func (n *Naive) SequenceToString(seq int32) string { 34 | return n.i[seq-1] 35 | } 36 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017-2025 Phil Pearl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /offheap/LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017-2025 Phil Pearl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /bench/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "runtime" 6 | "strconv" 7 | "time" 8 | 9 | "github.com/loov/hrtime" 10 | "github.com/philpearl/symboltab" 11 | ) 12 | 13 | const count = 1e7 14 | 15 | func main() { 16 | b := hrtime.NewBenchmarkTSC(count) 17 | 18 | symbols := make([]string, count) 19 | for i := range symbols { 20 | symbols[i] = strconv.Itoa(i) 21 | } 22 | 23 | st := symboltab.New(0) 24 | 25 | runtime.GC() 26 | 27 | for i := 0; b.Next(); i++ { 28 | if i >= count { 29 | i = 0 30 | } 31 | t := hrtime.TSC() 32 | st.StringToSequence(symbols[i], true) 33 | st.StringToSequence(symbols[i], true) 34 | dur := hrtime.TSC() - t 35 | if dur.ApproxDuration() > time.Millisecond*100 { 36 | // When we grow the table to larger sizes we see slow performance. It seems that just allocating 37 | // these very big slices takes > 100ms, presumably because they are zeroed 38 | fmt.Printf("big number at %d\n", i) 39 | } 40 | } 41 | 42 | opts := hrtime.HistogramOptions{ 43 | BinCount: 20, 44 | NiceRange: true, 45 | ClampMaximum: 0, 46 | ClampPercentile: 0.999999, 47 | } 48 | fmt.Println(hrtime.NewDurationHistogram(b.Laps(), &opts)) 49 | } 50 | -------------------------------------------------------------------------------- /bench/go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 2 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/loov/hrtime v1.0.1 h1:n6UINiq9nfyTmfNpLvgYN4O8d6Z0tZMoGd4QOolrxyc= 4 | github.com/loov/hrtime v1.0.1/go.mod h1:yDY3Pwv2izeY4sq7YcPX/dtLwzg5NU1AxWuWxKwd0p0= 5 | github.com/philpearl/aeshash v0.0.0-20180606163729-b8bbbadb7d42 h1:gKkaGDgFO0tAurwztl7ut8BlkQbdWuK0mr46U00JYw8= 6 | github.com/philpearl/aeshash v0.0.0-20180606163729-b8bbbadb7d42/go.mod h1:fp/ETJRQnc8o4We0k8oAYZtDISr6JfAzJREMlpni2R0= 7 | github.com/philpearl/stringbank v1.1.0 h1:YY+DV72+w0MAIbjguu4dtNFiOgGtrwJ+hFPaKRkZV+4= 8 | github.com/philpearl/stringbank v1.1.0/go.mod h1:0V0f9Ba79DpIl4FTfotL+7IJ+etELdRQIcHJY2nX/+w= 9 | github.com/philpearl/symboltab v1.1.1 h1:0IuRP+CZPekX92YKslHjCPNHyulFKMct5sJ2Md7NlKQ= 10 | github.com/philpearl/symboltab v1.1.1/go.mod h1:ulTJYx+8SBHfrfK3lR07tiJUu8t2jkORgSkEeANZmWM= 11 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 12 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 13 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 14 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 15 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 16 | -------------------------------------------------------------------------------- /offheap/go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/philpearl/mmap v0.0.0-20190501094812-b5dc52c98503 h1:MraCkgNEk6PSuBL+nyseA2LvOo+mcsYpAhEcixj7t7I= 5 | github.com/philpearl/mmap v0.0.0-20190501094812-b5dc52c98503/go.mod h1:U3YvJkR3bBTvF9794kZHNUbyHCMxErFJMvNhBBhqPMU= 6 | github.com/philpearl/mmap v0.0.1 h1:vPBpjN92UQNvDGAnovW79HS4OI9XR7TYp6XkkzJ7skg= 7 | github.com/philpearl/mmap v0.0.1/go.mod h1:QrP2HYBITgRn17ew4iLlkxpqwC0+anpv8FgqPAfDWQE= 8 | github.com/philpearl/stringbank/offheap v1.0.1 h1:TGNpfzszLkMecT3UEReaRg0ymXqjxj+7rQIB+QSCdQY= 9 | github.com/philpearl/stringbank/offheap v1.0.1/go.mod h1:JQruHVjqo7N44kjvueRPOFtxCfp68Bnq8Vw3eSBi1UY= 10 | github.com/philpearl/stringbank/offheap v1.0.2 h1:47KztAcDEKup+FeHNnPSylF0gpyuJGgdHzLIVKY+hFM= 11 | github.com/philpearl/stringbank/offheap v1.0.2/go.mod h1:JQruHVjqo7N44kjvueRPOFtxCfp68Bnq8Vw3eSBi1UY= 12 | github.com/philpearl/stringbank/offheap v1.0.3 h1:9NT/eUJRfdaDevHiyjZ9W4a747WcF0B85vCNNiQ8Ads= 13 | github.com/philpearl/stringbank/offheap v1.0.3/go.mod h1:OnP8kk6PjZqZszWom4TdO/f5gXNwzMaJ9D+h79ixPqA= 14 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 15 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 16 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 17 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 18 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 19 | -------------------------------------------------------------------------------- /symboltab_test.go: -------------------------------------------------------------------------------- 1 | package symboltab 2 | 3 | import ( 4 | "fmt" 5 | "runtime" 6 | "strconv" 7 | "testing" 8 | "time" 9 | 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | func TestBasic(t *testing.T) { 14 | st := New(16) 15 | 16 | assertStringToSequence := func(seq uint32, existing bool, val string) { 17 | t.Helper() 18 | seqa, existinga := st.StringToSequence(val, true) 19 | assert.Equal(t, existing, existinga) 20 | if existinga { 21 | assert.Equal(t, seq, seqa) 22 | } 23 | } 24 | 25 | assert.Zero(t, st.SymbolSize()) 26 | 27 | assertStringToSequence(1, false, "a1") 28 | assertStringToSequence(2, false, "a2") 29 | assertStringToSequence(3, false, "a3") 30 | assertStringToSequence(2, true, "a2") 31 | assertStringToSequence(3, true, "a3") 32 | 33 | assert.Equal(t, 1<<18, st.SymbolSize()) 34 | 35 | assert.Equal(t, "a1", st.SequenceToString(1)) 36 | assert.Equal(t, "a2", st.SequenceToString(2)) 37 | assert.Equal(t, "a3", st.SequenceToString(3)) 38 | } 39 | 40 | func TestGrowth(t *testing.T) { 41 | st := New(16) 42 | 43 | for i := range 10_000 { 44 | seq, found := st.StringToSequence(strconv.Itoa(i), true) 45 | assert.False(t, found) 46 | assert.Equal(t, uint32(i+1), seq) 47 | } 48 | 49 | for i := range 10_000 { 50 | seq, found := st.StringToSequence(strconv.Itoa(i), true) 51 | assert.True(t, found) 52 | assert.Equal(t, uint32(i+1), seq) 53 | } 54 | 55 | for i := range 10_000 { 56 | str := st.SequenceToString(uint32(i + 1)) 57 | assert.Equal(t, strconv.Itoa(i), str) 58 | } 59 | } 60 | 61 | func TestGrowth2(t *testing.T) { 62 | st := New(16) 63 | 64 | for i := range 10_000 { 65 | seq, found := st.StringToSequence(strconv.Itoa(i), true) 66 | assert.False(t, found) 67 | assert.Equal(t, uint32(i+1), seq) 68 | 69 | seq, found = st.StringToSequence(strconv.Itoa(i), true) 70 | assert.True(t, found) 71 | assert.Equal(t, uint32(i+1), seq) 72 | } 73 | } 74 | 75 | func TestAddNew(t *testing.T) { 76 | st := New(16) 77 | // Won't add entry if asked not to 78 | seq, existing := st.StringToSequence("hat", false) 79 | assert.False(t, existing) 80 | assert.Equal(t, uint32(0), seq) 81 | 82 | seq, existing = st.StringToSequence("hat", true) 83 | assert.False(t, existing) 84 | assert.Equal(t, uint32(1), seq) 85 | 86 | // Can find existing entry if not asked to add new 87 | seq, existing = st.StringToSequence("hat", false) 88 | assert.True(t, existing) 89 | assert.Equal(t, uint32(1), seq) 90 | } 91 | 92 | func TestLowGC(t *testing.T) { 93 | st := New(16) 94 | for i := 0; i < 1e7; i++ { 95 | st.StringToSequence(strconv.Itoa(i), true) 96 | } 97 | runtime.GC() 98 | start := time.Now() 99 | runtime.GC() 100 | assert.True(t, time.Since(start) < time.Millisecond*5) 101 | 102 | runtime.KeepAlive(st) 103 | } 104 | 105 | func BenchmarkSymbolTab(b *testing.B) { 106 | symbols := make([]string, b.N) 107 | for i := range symbols { 108 | symbols[i] = strconv.Itoa(i) 109 | } 110 | 111 | b.ReportAllocs() 112 | b.ResetTimer() 113 | st := New(b.N) 114 | for _, sym := range symbols { 115 | st.StringToSequence(sym, true) 116 | } 117 | 118 | if symbols[0] != st.SequenceToString(1) { 119 | b.Errorf("first symbol doesn't match - get %s", st.SequenceToString(1)) 120 | } 121 | } 122 | 123 | func BenchmarkSequenceToString(b *testing.B) { 124 | st := New(b.N) 125 | for i := 0; i < b.N; i++ { 126 | st.StringToSequence(strconv.Itoa(i), true) 127 | } 128 | 129 | b.ReportAllocs() 130 | b.ResetTimer() 131 | 132 | var str string 133 | for i := 1; i <= b.N; i++ { 134 | str = st.SequenceToString(uint32(i)) 135 | } 136 | 137 | if str != strconv.Itoa(b.N-1) { 138 | b.Errorf("last symbol doesn't match - get %s", str) 139 | } 140 | } 141 | 142 | func BenchmarkExisting(b *testing.B) { 143 | st := New(b.N) 144 | values := make([]string, b.N) 145 | for i := range values { 146 | values[i] = strconv.Itoa(i) 147 | } 148 | 149 | for _, val := range values { 150 | st.StringToSequence(val, true) 151 | } 152 | 153 | b.ReportAllocs() 154 | b.ResetTimer() 155 | 156 | var seq uint32 157 | for _, val := range values { 158 | seq, _ = st.StringToSequence(val, false) 159 | } 160 | 161 | if st.SequenceToString(seq) != strconv.Itoa(b.N-1) { 162 | b.Errorf("last symbol doesn't match - get %s", st.SequenceToString(seq)) 163 | } 164 | } 165 | 166 | func BenchmarkMiss(b *testing.B) { 167 | st := New(b.N) 168 | values := make([]string, b.N) 169 | for i := range values { 170 | values[i] = strconv.Itoa(i) 171 | } 172 | 173 | b.ReportAllocs() 174 | b.ResetTimer() 175 | 176 | for _, val := range values { 177 | _, found := st.StringToSequence(val, false) 178 | if found { 179 | b.Errorf("found value %s", val) 180 | } 181 | } 182 | } 183 | 184 | func ExampleSymbolTab() { 185 | st := SymbolTab{} 186 | seq, found := st.StringToSequence("10293-ahdb-28383-555", true) 187 | fmt.Println(found) 188 | fmt.Println(st.SequenceToString(seq)) 189 | // Output: false 190 | // 10293-ahdb-28383-555 191 | } 192 | 193 | func BenchmarkMakeBigSlice(b *testing.B) { 194 | for i := 0; i < b.N; i++ { 195 | sl := make([]int32, 1e8) 196 | runtime.KeepAlive(sl) 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /offheap/symboltab_test.go: -------------------------------------------------------------------------------- 1 | package offheap 2 | 3 | import ( 4 | "fmt" 5 | "runtime" 6 | "strconv" 7 | "testing" 8 | "time" 9 | 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | func TestBasic(t *testing.T) { 14 | st := New(16) 15 | defer st.Close() 16 | 17 | assertStringToSequence := func(seq uint32, existing bool, val string) { 18 | t.Helper() 19 | seqa, existinga := st.StringToSequence(val, true) 20 | assert.Equal(t, existing, existinga) 21 | if existinga { 22 | assert.Equal(t, seq, seqa) 23 | } 24 | } 25 | 26 | assert.Zero(t, st.SymbolSize()) 27 | 28 | assertStringToSequence(1, false, "a1") 29 | assertStringToSequence(2, false, "a2") 30 | assertStringToSequence(3, false, "a3") 31 | assertStringToSequence(2, true, "a2") 32 | assertStringToSequence(3, true, "a3") 33 | 34 | assert.Equal(t, 1<<18, st.SymbolSize()) 35 | 36 | assert.Equal(t, "a1", st.SequenceToString(1)) 37 | assert.Equal(t, "a2", st.SequenceToString(2)) 38 | assert.Equal(t, "a3", st.SequenceToString(3)) 39 | } 40 | 41 | func TestGrowth(t *testing.T) { 42 | st := New(16) 43 | defer st.Close() 44 | 45 | for i := range 10000 { 46 | seq, found := st.StringToSequence(strconv.Itoa(i), true) 47 | assert.False(t, found) 48 | assert.Equal(t, uint32(i+1), seq) 49 | } 50 | 51 | for i := range 10000 { 52 | seq, found := st.StringToSequence(strconv.Itoa(i), true) 53 | assert.True(t, found) 54 | assert.Equal(t, uint32(i+1), seq) 55 | } 56 | 57 | for i := range 10000 { 58 | str := st.SequenceToString(uint32(i + 1)) 59 | assert.Equal(t, strconv.Itoa(i), str) 60 | } 61 | } 62 | 63 | func TestGrowth2(t *testing.T) { 64 | st := New(16) 65 | defer st.Close() 66 | 67 | for i := range 10000 { 68 | seq, found := st.StringToSequence(strconv.Itoa(i), true) 69 | assert.False(t, found) 70 | assert.Equal(t, uint32(i+1), seq) 71 | 72 | seq, found = st.StringToSequence(strconv.Itoa(i), true) 73 | assert.True(t, found) 74 | assert.Equal(t, uint32(i+1), seq) 75 | } 76 | } 77 | 78 | func TestAddNew(t *testing.T) { 79 | st := New(16) 80 | defer st.Close() 81 | // Won't add entry if asked not to 82 | seq, existing := st.StringToSequence("hat", false) 83 | assert.False(t, existing) 84 | assert.Equal(t, uint32(0), seq) 85 | 86 | seq, existing = st.StringToSequence("hat", true) 87 | assert.False(t, existing) 88 | assert.Equal(t, uint32(1), seq) 89 | 90 | // Can find existing entry if not asked to add new 91 | seq, existing = st.StringToSequence("hat", false) 92 | assert.True(t, existing) 93 | assert.Equal(t, uint32(1), seq) 94 | } 95 | 96 | func TestLowGC(t *testing.T) { 97 | st := New(16) 98 | defer st.Close() 99 | for i := range 10000000 { 100 | st.StringToSequence(strconv.Itoa(i), true) 101 | } 102 | runtime.GC() 103 | start := time.Now() 104 | runtime.GC() 105 | assert.True(t, time.Since(start) < time.Millisecond*5) 106 | 107 | runtime.KeepAlive(st) 108 | } 109 | 110 | func BenchmarkSymbolTab(b *testing.B) { 111 | symbols := make([]string, b.N) 112 | for i := range symbols { 113 | symbols[i] = strconv.Itoa(i) 114 | } 115 | 116 | b.ReportAllocs() 117 | b.ResetTimer() 118 | st := New(b.N) 119 | defer st.Close() 120 | for _, sym := range symbols { 121 | st.StringToSequence(sym, true) 122 | } 123 | 124 | if symbols[0] != st.SequenceToString(1) { 125 | b.Errorf("first symbol doesn't match - get %s", st.SequenceToString(1)) 126 | } 127 | } 128 | 129 | func BenchmarkSequenceToString(b *testing.B) { 130 | // This benchmark can run very very slowly as the setup is slow, but the 131 | // execution phase is fast. b.N gets driven very large! 132 | st := New(b.N) 133 | defer st.Close() 134 | for i := range b.N { 135 | st.StringToSequence(strconv.Itoa(i), true) 136 | } 137 | 138 | b.ReportAllocs() 139 | b.ResetTimer() 140 | 141 | var str string 142 | for i := range b.N { 143 | str = st.SequenceToString(uint32(i + 1)) 144 | } 145 | 146 | if str != strconv.Itoa(b.N-1) { 147 | b.Errorf("last symbol doesn't match - get %s", str) 148 | } 149 | } 150 | 151 | func BenchmarkExisting(b *testing.B) { 152 | st := New(b.N) 153 | defer st.Close() 154 | values := make([]string, b.N) 155 | for i := range values { 156 | values[i] = strconv.Itoa(i) 157 | } 158 | 159 | for _, val := range values { 160 | st.StringToSequence(val, true) 161 | } 162 | 163 | b.ReportAllocs() 164 | b.ResetTimer() 165 | 166 | var seq uint32 167 | for _, val := range values { 168 | seq, _ = st.StringToSequence(val, false) 169 | } 170 | 171 | if st.SequenceToString(seq) != strconv.Itoa(b.N-1) { 172 | b.Errorf("last symbol doesn't match - get %s", st.SequenceToString(seq)) 173 | } 174 | } 175 | 176 | func BenchmarkMiss(b *testing.B) { 177 | st := New(b.N) 178 | defer st.Close() 179 | values := make([]string, b.N) 180 | for i := range values { 181 | values[i] = strconv.Itoa(i) 182 | } 183 | 184 | b.ReportAllocs() 185 | b.ResetTimer() 186 | 187 | for _, val := range values { 188 | _, found := st.StringToSequence(val, false) 189 | if found { 190 | b.Errorf("found value %s", val) 191 | } 192 | } 193 | } 194 | 195 | func ExampleSymbolTab() { 196 | st := SymbolTab{} 197 | defer st.Close() 198 | seq, found := st.StringToSequence("10293-ahdb-28383-555", true) 199 | fmt.Println(found) 200 | fmt.Println(st.SequenceToString(seq)) 201 | // Output: false 202 | // 10293-ahdb-28383-555 203 | } 204 | 205 | func BenchmarkMakeBigSlice(b *testing.B) { 206 | b.ReportAllocs() 207 | for b.Loop() { 208 | sl := make([]int32, 1e8) 209 | runtime.KeepAlive(sl) 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /symboltab.go: -------------------------------------------------------------------------------- 1 | // Package symboltab is a symbol table. It converts strings to sequence numbers. This is useful 2 | // for things like graph algorithms, where IDs are stored and compared a lot. 3 | // 4 | // symboltab is optimised for storing a lot of strings, so things are optimised for reducing 5 | // work for the GC 6 | package symboltab 7 | 8 | import ( 9 | "math/bits" 10 | "reflect" 11 | "unsafe" 12 | 13 | "github.com/philpearl/stringbank" 14 | ) 15 | 16 | // Our space costs are 8 bytes per entry. With a load factor of 0.5 (written as 2 here for reasons) that's 17 | // increased to at least 16 bytes per entry 18 | const loadFactor = 2 19 | 20 | // SymbolTab is the symbol table. Allocate it via New() 21 | type SymbolTab struct { 22 | sb stringbank.Stringbank 23 | table table 24 | oldTable table 25 | count int 26 | oldTableCursor int 27 | ib intbank 28 | } 29 | 30 | // New creates a new SymbolTab. cap is the initial capacity of the table - it will grow 31 | // automatically when needed 32 | func New(cap int) *SymbolTab { 33 | // want to allocate a table large enough to hold cap without growing 34 | cap = cap * loadFactor 35 | if cap < 16 { 36 | cap = 16 37 | } else { 38 | cap = 1 << uint(64-bits.LeadingZeros(uint(cap-1))) 39 | } 40 | return &SymbolTab{ 41 | table: table{ 42 | entries: make([]tableEntry, cap), 43 | }, 44 | } 45 | } 46 | 47 | // Len returns the number of unique strings stored 48 | func (i *SymbolTab) Len() int { 49 | return i.count 50 | } 51 | 52 | // Cap returns the size of the SymbolTab table 53 | func (i *SymbolTab) Cap() int { 54 | return i.table.len() 55 | } 56 | 57 | // SymbolSize contains the approximate size of string storage in the symboltable. This will be an over-estimate and 58 | // includes as yet unused and wasted space 59 | func (i *SymbolTab) SymbolSize() int { 60 | return i.sb.Size() 61 | } 62 | 63 | // SequenceToString looks up a string by its sequence number. Obtain the sequence number 64 | // for a string with StringToSequence 65 | func (i *SymbolTab) SequenceToString(seq uint32) string { 66 | // Look up the stringbank offset for this sequence number, then get the string 67 | offset := i.ib.lookup(seq) 68 | return i.sb.Get(offset) 69 | } 70 | 71 | // We use the runtime's map hash function without the overhead of using 72 | // hash/maphash 73 | // 74 | //go:linkname runtime_memhash runtime.memhash 75 | //go:noescape 76 | func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr 77 | 78 | // StringToSequence looks up the string val and returns its sequence number seq. If val does 79 | // not currently exist in the symbol table, it will add it if addNew is true. found indicates 80 | // whether val was already present in the SymbolTab 81 | func (i *SymbolTab) StringToSequence(val string, addNew bool) (seq uint32, found bool) { 82 | // we use a hashtable where the keys are stringbank offsets, but comparisons are done on 83 | // strings. There is no value to store 84 | 85 | hash := uint32(runtime_memhash( 86 | unsafe.Pointer((*reflect.StringHeader)(unsafe.Pointer(&val)).Data), 87 | 0, 88 | uintptr(len(val)), 89 | )) 90 | 91 | if addNew { 92 | // We're going to add to the table, make sure it is big enough 93 | i.resize() 94 | } 95 | 96 | if i.oldTable.len() != 0 { 97 | if addNew { 98 | // If we're resizing currently, then do some resizing work 99 | i.resizeWork() 100 | } 101 | 102 | // The data might still be only in the old table, so look there first. If we find the 103 | // data here then we can just go with that answer. But if not it may be in the new table 104 | // only. Certainly if we add we want to add to the new table 105 | _, sequence := i.findInTable(i.oldTable, val, hash) 106 | if sequence != 0 { 107 | return sequence, true 108 | } 109 | } 110 | 111 | cursor, sequence := i.findInTable(i.table, val, hash) 112 | if sequence != 0 { 113 | return sequence, true 114 | } 115 | 116 | if !addNew { 117 | return 0, false 118 | } 119 | 120 | // String was not found, so we want to store it. Cursor is the index where we should 121 | // store it 122 | i.count++ 123 | sequence = uint32(i.count) 124 | i.table.entries[cursor] = tableEntry{ 125 | hash: hash, 126 | sequence: sequence, 127 | } 128 | 129 | offset := i.sb.Save(val) 130 | i.ib.save(sequence, offset) 131 | 132 | return sequence, false 133 | } 134 | 135 | // findInTable find the string val in the hash table. If the string is present, it returns the 136 | // place in the table where it was found, plus the stringbank offset of the string + 1 137 | func (i *SymbolTab) findInTable(table table, val string, hashVal uint32) (cursor int, sequence uint32) { 138 | l := table.len() 139 | if l == 0 { 140 | return 0, 0 141 | } 142 | cursor = int(hashVal) & (l - 1) 143 | start := cursor 144 | for table.entries[cursor].sequence != 0 { 145 | if table.entries[cursor].hash == hashVal { 146 | if seq := table.entries[cursor].sequence; i.sb.Get(int(i.ib.lookup(seq))) == val { 147 | return cursor, table.entries[cursor].sequence 148 | } 149 | } 150 | cursor++ 151 | if cursor == l { 152 | cursor = 0 153 | } 154 | if cursor == start { 155 | panic("out of space!") 156 | } 157 | } 158 | return cursor, 0 159 | } 160 | 161 | func (i *SymbolTab) copyEntryToTable(table table, hash uint32, seq uint32) { 162 | l := table.len() 163 | cursor := int(hash) & (l - 1) 164 | start := cursor 165 | for table.entries[cursor].sequence != 0 { 166 | // the entry we're copying in is guaranteed not to be already 167 | // present, so we're just looking for an empty space 168 | cursor++ 169 | if cursor == l { 170 | cursor = 0 171 | } 172 | if cursor == start { 173 | panic("out of space (resize)!") 174 | } 175 | } 176 | table.entries[cursor] = tableEntry{ 177 | hash: hash, 178 | sequence: seq, 179 | } 180 | } 181 | 182 | func (i *SymbolTab) resizeWork() { 183 | // We copy items between tables 16 at a time. Since we do this every time 184 | // anyone writes to the table we won't run out of space in the new table 185 | // before this is complete 186 | l := i.oldTable.len() 187 | if l == 0 { 188 | return 189 | } 190 | for k := 0; k < 16; k++ { 191 | offset := k + i.oldTableCursor 192 | if seq := i.oldTable.entries[offset]; seq.sequence != 0 { 193 | i.copyEntryToTable(i.table, i.oldTable.entries[offset].hash, i.oldTable.entries[offset].sequence) 194 | // The entry can exist in the old and new versions of the table without 195 | // problems. If we did try to delete from the old table we'd have issues 196 | // searching forward from clashing entries. 197 | } 198 | } 199 | i.oldTableCursor += 16 200 | if i.oldTableCursor >= l { 201 | // resizing is complete - clear out the old table 202 | i.oldTable.entries = nil 203 | i.oldTableCursor = 0 204 | } 205 | } 206 | 207 | func (i *SymbolTab) resize() { 208 | if i.table.entries == nil { 209 | // Makes zero value of SymbolTab useful 210 | i.table.entries = make([]tableEntry, 16) 211 | } 212 | 213 | if i.count < i.table.len()/loadFactor { 214 | // Not full enough to grow the table 215 | return 216 | } 217 | 218 | if i.oldTable.entries == nil { 219 | // Not already resizing, so kick off the process. Note that despite all the work we do to try to be 220 | // clever, just allocating these slices can cause a considerable amount of work, presumably because 221 | // they are set to zero. 222 | i.oldTable, i.table = i.table, table{ 223 | entries: make([]tableEntry, len(i.table.entries)*2), 224 | } 225 | } 226 | } 227 | 228 | // table represents a hash table. We keep the strings and hashes separate in 229 | // case we want to use different size types in the future 230 | type table struct { 231 | // We keep hashes in the table to speed up resizing, and also stepping 232 | // through entries that have different hashes but hit the same bucket. 233 | // 234 | // Having entries with both the key and value together appears to speed up 235 | // the table when it's very large. I'd guess if the "value" of the table 236 | // (the sequence number) was larger this might not be the case. 237 | entries []tableEntry 238 | } 239 | 240 | type tableEntry struct { 241 | hash uint32 242 | sequence uint32 243 | } 244 | 245 | func (t table) len() int { 246 | return len(t.entries) 247 | } 248 | -------------------------------------------------------------------------------- /offheap/symboltab.go: -------------------------------------------------------------------------------- 1 | // Package offheap is an off-heap symbol table. It converts strings to sequence numbers. This is useful 2 | // for things like graph algorithms, where IDs are stored and compared a lot. 3 | // 4 | // symboltab is optimised for storing a lot of strings, so things are optimised for reducing 5 | // work for the GC 6 | package offheap 7 | 8 | import ( 9 | "math" 10 | "math/bits" 11 | "unsafe" 12 | 13 | "github.com/philpearl/mmap" 14 | stringbank "github.com/philpearl/stringbank/offheap" 15 | ) 16 | 17 | // Our space costs are 8 bytes per entry. With a load factor of 0.5 (written as 2 here for reasons) that's 18 | // increased to at least 16 bytes per entry 19 | const loadFactor = 2 20 | 21 | // SymbolTab is the symbol table. Allocate it via New() 22 | type SymbolTab struct { 23 | sb stringbank.Stringbank 24 | table table 25 | oldTable table 26 | count int 27 | oldTableCursor int 28 | ib intbank 29 | } 30 | 31 | // New creates a new SymbolTab. cap is the initial capacity of the table - it will grow 32 | // automatically when needed 33 | func New(cap int) *SymbolTab { 34 | // want to allocate a table large enough to hold cap without growing 35 | cap = cap * loadFactor 36 | if cap < 16 { 37 | cap = 16 38 | } else { 39 | cap = 1 << uint(64-bits.LeadingZeros(uint(cap-1))) 40 | } 41 | var t table 42 | t.init(cap) 43 | return &SymbolTab{ 44 | table: t, 45 | } 46 | } 47 | 48 | // Close releases resources associated with the SymbolTab 49 | func (i *SymbolTab) Close() { 50 | i.sb.Close() 51 | i.table.close() 52 | i.oldTable.close() 53 | i.oldTableCursor = 0 54 | i.count = 0 55 | i.ib.close() 56 | } 57 | 58 | // Len returns the number of unique strings stored 59 | func (i *SymbolTab) Len() int { 60 | return i.count 61 | } 62 | 63 | // Cap returns the size of the SymbolTab table 64 | func (i *SymbolTab) Cap() int { 65 | return i.table.len() 66 | } 67 | 68 | // SymbolSize contains the approximate size of string storage in the symboltable. This will be an over-estimate and 69 | // includes as yet unused and wasted space 70 | func (i *SymbolTab) SymbolSize() int { 71 | return i.sb.Size() 72 | } 73 | 74 | // SequenceToString looks up a string by its sequence number. Obtain the sequence number 75 | // for a string with StringToSequence 76 | func (i *SymbolTab) SequenceToString(seq uint32) string { 77 | // Look up the stringbank offset for this sequence number, then get the string 78 | offset := i.ib.lookup(seq) 79 | return i.sb.Get(offset) 80 | } 81 | 82 | // We use the runtime's map hash function without the overhead of using 83 | // hash/maphash 84 | // 85 | //go:linkname runtime_memhash runtime.memhash 86 | //go:noescape 87 | func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr 88 | 89 | // StringToSequence looks up the string val and returns its sequence number seq. If val does 90 | // not currently exist in the symbol table, it will add it if addNew is true. found indicates 91 | // whether val was already present in the SymbolTab 92 | func (i *SymbolTab) StringToSequence(val string, addNew bool) (seq uint32, found bool) { 93 | // we use a hashtable where the keys are stringbank offsets, but comparisons are done on 94 | // strings. There is no value to store 95 | 96 | hash := uint32(runtime_memhash( 97 | unsafe.Pointer(unsafe.StringData(val)), 98 | 0, 99 | uintptr(len(val)), 100 | )) 101 | 102 | if addNew { 103 | // We're going to add to the table, make sure it is big enough 104 | // We make sure we don't do any resizing work if we're not writing data as it will surprise folk who 105 | // might hold just a read lock while reading. 106 | i.resize() 107 | } 108 | 109 | if i.oldTable.len() != 0 { 110 | if addNew { 111 | // If we're resizing currently, then do some resizing work 112 | i.resizeWork() 113 | } 114 | 115 | // The data might still be only in the old table, so look there first. If we find the 116 | // data here then we can just go with that answer. But if not it may be in the new table 117 | // only. Certainly if we add we want to add to the new table 118 | _, sequence := i.findInTable(i.oldTable, val, hash) 119 | if sequence != 0 { 120 | return sequence, true 121 | } 122 | } 123 | 124 | cursor, sequence := i.findInTable(i.table, val, hash) 125 | if sequence != 0 { 126 | return sequence, true 127 | } 128 | 129 | if !addNew { 130 | return 0, false 131 | } 132 | 133 | // String was not found, so we want to store it. Cursor is the index where we should 134 | // store it 135 | i.count++ 136 | sequence = uint32(i.count) 137 | i.table.entries[cursor] = tableEntry{ 138 | hash: hash, 139 | sequence: sequence, 140 | } 141 | 142 | offset := i.sb.Save(val) 143 | i.ib.save(sequence, offset) 144 | 145 | return sequence, false 146 | } 147 | 148 | // findInTable find the string val in the hash table. If the string is present, it returns the 149 | // place in the table where it was found, plus the stringbank offset of the string + 1 150 | func (i *SymbolTab) findInTable(table table, val string, hashVal uint32) (cursor int, sequence uint32) { 151 | l := table.len() 152 | if l == 0 { 153 | return 0, 0 154 | } 155 | cursor = int(hashVal) % l 156 | start := cursor 157 | for table.entries[cursor].sequence != 0 { 158 | if table.entries[cursor].hash == hashVal { 159 | if seq := table.entries[cursor].sequence; i.sb.Get(int(i.ib.lookup(seq))) == val { 160 | return cursor, seq 161 | } 162 | } 163 | cursor++ 164 | cursor = cursor % l 165 | if cursor == start { 166 | panic("out of space!") 167 | } 168 | } 169 | return cursor, 0 170 | } 171 | 172 | func (i *SymbolTab) copyEntryToTable(table table, hash uint32, seq uint32) { 173 | l := table.len() 174 | cursor := int(hash) % l 175 | start := cursor 176 | for table.entries[cursor].sequence != 0 { 177 | // the entry we're copying in is guaranteed not to be already 178 | // present, so we're just looking for an empty space 179 | cursor++ 180 | cursor = cursor % l 181 | if cursor == start { 182 | panic("out of space (resize)!") 183 | } 184 | } 185 | table.entries[cursor] = tableEntry{ 186 | hash: hash, 187 | sequence: seq, 188 | } 189 | } 190 | 191 | func (i *SymbolTab) resizeWork() { 192 | // We copy items between tables 16 at a time. Since we do this every time 193 | // anyone writes to the table we won't run out of space in the new table 194 | // before this is complete 195 | l := i.oldTable.len() 196 | if l == 0 { 197 | return 198 | } 199 | // original size is 16, and we double to create new tables, so size should always be a multiple of 16 200 | for k, entry := range i.oldTable.entries[i.oldTableCursor : i.oldTableCursor+16] { 201 | if entry.sequence != 0 { 202 | offset := k + i.oldTableCursor 203 | i.copyEntryToTable(i.table, i.oldTable.entries[offset].hash, entry.sequence) 204 | // The entry can exist in the old and new versions of the table without 205 | // problems. If we did try to delete from the old table we'd have issues 206 | // searching forward from clashing entries. 207 | } 208 | } 209 | i.oldTableCursor += 16 210 | if i.oldTableCursor >= l { 211 | // resizing is complete - clear out the old table 212 | i.oldTable.close() 213 | i.oldTableCursor = 0 214 | } 215 | } 216 | 217 | func (i *SymbolTab) resize() { 218 | if i.table.entries == nil { 219 | // Makes zero value of SymbolTab useful 220 | i.table.init(16) 221 | } 222 | 223 | if i.count < i.table.len()/loadFactor { 224 | // Not full enough to grow the table 225 | return 226 | } 227 | 228 | if i.table.len() >= math.MaxUint32 { 229 | // We can't grow the table any more. We can let the table get fuller 230 | if i.count >= math.MaxUint32*3/4 { 231 | // Things will probably go wrong if we get this full. We have no 232 | // bits left to grow the table. This is the end. 233 | panic("out of space in symboltab!") 234 | } 235 | return 236 | } 237 | 238 | if i.oldTable.entries == nil { 239 | // Not already resizing, so kick off the process. Note that despite all the work we do to try to be 240 | // clever, just allocating these slices can cause a considerable amount of work, presumably because 241 | // they are set to zero. 242 | var newTable table 243 | newTable.init(i.table.len() * 2) 244 | i.oldTable, i.table = i.table, newTable 245 | } 246 | } 247 | 248 | // table represents a hash table. We keep the strings and hashes separate in 249 | // case we want to use different size types in the future 250 | type table struct { 251 | // We keep hashes in the table to speed up resizing, and also stepping 252 | // through entries that have different hashes but hit the same bucket. 253 | // 254 | // Having entries with both the key and value together appears to speed up 255 | // the table when it's very large. I'd guess if the "value" of the table 256 | // (the sequence number) was larger this might not be the case. 257 | entries []tableEntry 258 | } 259 | 260 | type tableEntry struct { 261 | hash uint32 262 | sequence uint32 263 | } 264 | 265 | func (t *table) init(cap int) { 266 | t.entries, _ = mmap.Alloc[tableEntry](cap) 267 | } 268 | 269 | func (t table) len() int { 270 | return len(t.entries) 271 | } 272 | 273 | func (t *table) close() { 274 | if t.entries != nil { 275 | mmap.Free(t.entries) 276 | t.entries = nil 277 | } 278 | } 279 | --------------------------------------------------------------------------------