├── .github └── workflows │ └── ci.yml ├── LICENSE ├── README.md ├── go.mod ├── uint32_store.go ├── uint32_store_example_test.go └── uint32_store_test.go /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | 8 | jobs: 9 | build: 10 | name: CI 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Log 15 | env: 16 | CI_EVENT_ACTION: ${{ github.event.action }} 17 | CI_PR_TITLE: ${{ github.event.pull_request.title }} 18 | CI_PR_PREV_TITLE: ${{ github.event.changes.title.from }} 19 | run: | 20 | echo github.event.action=$CI_EVENT_ACTION 21 | echo github.event.pull_request.title=$CI_PR_TITLE 22 | echo github.event.changes.title.from=$CI_PR_PREV_TITLE 23 | 24 | - name: Set up Go 25 | uses: actions/setup-go@v2 26 | with: 27 | go-version: '~1.17.9' 28 | id: go 29 | 30 | - name: Install utilities 31 | run: | 32 | go install golang.org/x/lint/golint@latest 33 | go install golang.org/x/tools/cmd/goimports@latest 34 | go install honnef.co/go/tools/cmd/staticcheck@latest 35 | # display Go environment for reference 36 | go env 37 | 38 | - name: Check out code 39 | uses: actions/checkout@v2 40 | 41 | - uses: actions/cache@v2 42 | with: 43 | path: ~/go/pkg/mod 44 | key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} 45 | restore-keys: | 46 | ${{ runner.os }}-go- 47 | 48 | - name: Get dependencies 49 | run: | 50 | go mod tidy 51 | /usr/bin/git diff --exit-code 52 | 53 | - name: Build 54 | run: | 55 | go build -v ./... 56 | 57 | - name: Check 58 | run: | 59 | go vet ./... 60 | golint ./... 61 | staticcheck ./... 62 | goimports -w . 63 | /usr/bin/git diff --exit-code 64 | 65 | - name: Test 66 | run: | 67 | go test -v ./... 68 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 The Sensible Code Company Ltd 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 6 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 7 | subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies 10 | or substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 14 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 15 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH 16 | THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # faststringmap 2 | 3 | `faststringmap` is a fast read-only string keyed map for Go (golang). 4 | For our use case it is approximately 5 times faster than using Go's 5 | built-in map type with a string key. It also has the following advantages: 6 | 7 | * look up strings and byte slices without use of the `unsafe` package 8 | * minimal impact on GC due to lack of pointers in the data structure 9 | * data structure can be trivially serialized to disk or network 10 | 11 | The code provided implements a map from string to `uint32` which fits our 12 | use case, but you can easily substitute other value types. 13 | 14 | `faststringmap` is a variant of a data structure called a [Trie](https://en.wikipedia.org/wiki/Trie). 15 | At each level we use a slice to hold the next possible byte values. 16 | This slice is of length one plus the difference between the lowest and highest 17 | possible next bytes of strings in the map. Not all the entries in the slice are 18 | valid next bytes. `faststringmap` is thus more space efficient for keys using a 19 | small set of nearby runes, for example those using a lot of digits. 20 | 21 | ## Example 22 | 23 | Example usage can be found in [``uint32_store_example_test.go``](uint32_store_example_test.go). 24 | 25 | ## Motivation 26 | 27 | I created `faststringmap` in order to improve the speed of parsing CSV 28 | where the fields were category codes from survey data. The majority of these 29 | were numeric (`"1"`, `"2"`, `"3"`...) plus a distinct code for "not applicable". 30 | I was struck that in the simplest possible cases (e.g. `"1"` ... `"5"`) the map 31 | should be a single slice lookup. 32 | 33 | Our fast CSV parser provides fields as byte slices into the read buffer to 34 | avoid creating string objects. So I also wanted to facilitate key lookup from a 35 | `[]byte` rather than a string. This is not possible using a built-in Go map without 36 | use of the `unsafe` package. 37 | 38 | ## Benchmarks 39 | 40 | Below are example benchmarks from my laptop which are for looking up every element 41 | in a map of size 1000. So approximate times are 25ns per lookup for the Go native map 42 | and 5ns per lookup for the ``faststringmap``. 43 | ``` 44 | cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz 45 | BenchmarkUint32Store 46 | BenchmarkUint32Store-8 218463 4959 ns/op 47 | BenchmarkGoStringToUint32 48 | BenchmarkGoStringToUint32-8 49279 24483 ns/op 49 | ``` 50 | 51 | ## Improvements 52 | 53 | You can improve the performance further by using a slice for the ``next`` fields. 54 | This avoids a bounds check when looking up the entry for a byte. However, it 55 | comes at the cost of easy serialization and introduces a lot of pointers which 56 | will have impact on GC. It is not possible to directly construct the slice version 57 | in the same way so that the whole store is one block of memory. Either create as in 58 | this code and then derive the slice version or create distinct slice objects at each level. -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/sensiblecodeio/faststringmap 2 | 3 | go 1.16 4 | -------------------------------------------------------------------------------- /uint32_store.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Sensible Code Company Ltd 2 | // Author: Duncan Harris 3 | 4 | package faststringmap 5 | 6 | import ( 7 | "sort" 8 | ) 9 | 10 | type ( 11 | // Uint32Store is a fast read only map from string to uint32 12 | // Lookups are about 5x faster than the built-in Go map type 13 | Uint32Store struct { 14 | store []byteValue 15 | } 16 | 17 | byteValue struct { 18 | nextLo uint32 // index in store of next byteValues 19 | nextLen byte // number of byteValues in store used for next possible bytes 20 | nextOffset byte // offset from zero byte value of first element of range of byteValues 21 | valid bool // is the byte sequence with no more bytes in the map? 22 | value uint32 // value for byte sequence with no more bytes 23 | } 24 | 25 | // Uint32Source is for supplying data to initialise Uint32Store 26 | Uint32Source interface { 27 | // AppendKeys should append the keys of the maps to the supplied slice and return the resulting slice 28 | AppendKeys([]string) []string 29 | // Get should return the value for the supplied key 30 | Get(string) uint32 31 | } 32 | 33 | // uint32Builder is used only during construction 34 | uint32Builder struct { 35 | all [][]byteValue 36 | src Uint32Source 37 | len int 38 | } 39 | ) 40 | 41 | // NewUint32Store creates from the data supplied in src 42 | func NewUint32Store(src Uint32Source) Uint32Store { 43 | if keys := src.AppendKeys([]string(nil)); len(keys) > 0 { 44 | sort.Strings(keys) 45 | return Uint32Store{store: uint32Build(keys, src)} 46 | } 47 | return Uint32Store{store: []byteValue{{}}} 48 | } 49 | 50 | // uint32Build constructs the map by allocating memory in blocks 51 | // and then copying into the eventual slice at the end. This is 52 | // more efficient than continually using append. 53 | func uint32Build(keys []string, src Uint32Source) []byteValue { 54 | b := uint32Builder{ 55 | all: [][]byteValue{make([]byteValue, 1, firstBufSize(len(keys)))}, 56 | src: src, 57 | len: 1, 58 | } 59 | b.makeByteValue(&b.all[0][0], keys, 0) 60 | // copy all blocks to one slice 61 | s := make([]byteValue, 0, b.len) 62 | for _, a := range b.all { 63 | s = append(s, a...) 64 | } 65 | return s 66 | } 67 | 68 | // makeByteValue will initialise the supplied byteValue for 69 | // the sorted strings in slice a considering bytes at byteIndex in the strings 70 | func (b *uint32Builder) makeByteValue(bv *byteValue, a []string, byteIndex int) { 71 | // if there is a string with no more bytes then it is always first because they are sorted 72 | if len(a[0]) == byteIndex { 73 | bv.valid = true 74 | bv.value = b.src.Get(a[0]) 75 | a = a[1:] 76 | } 77 | if len(a) == 0 { 78 | return 79 | } 80 | bv.nextOffset = a[0][byteIndex] // lowest value for next byte 81 | bv.nextLen = a[len(a)-1][byteIndex] - // highest value for next byte 82 | bv.nextOffset + 1 // minus lowest value +1 = number of possible next bytes 83 | bv.nextLo = uint32(b.len) // first byteValue struct in eventual built slice 84 | next := b.alloc(bv.nextLen) // new byteValues default to "not valid" 85 | 86 | for i, n := 0, len(a); i < n; { 87 | // find range of strings starting with the same byte 88 | iSameByteHi := i + 1 89 | for iSameByteHi < n && a[iSameByteHi][byteIndex] == a[i][byteIndex] { 90 | iSameByteHi++ 91 | } 92 | b.makeByteValue(&next[(a[i][byteIndex]-bv.nextOffset)], a[i:iSameByteHi], byteIndex+1) 93 | i = iSameByteHi 94 | } 95 | } 96 | 97 | const maxBuildBufSize = 1 << 20 98 | 99 | func firstBufSize(mapSize int) int { 100 | size := 1 << 4 101 | for size < mapSize && size < maxBuildBufSize { 102 | size <<= 1 103 | } 104 | return size 105 | } 106 | 107 | // alloc will grab space in the current block if available or allocate a new one if not 108 | func (b *uint32Builder) alloc(nByteValues byte) []byteValue { 109 | n := int(nByteValues) 110 | b.len += n 111 | cur := &b.all[len(b.all)-1] // current 112 | curCap, curLen := cap(*cur), len(*cur) 113 | if curCap-curLen >= n { // enough space in current 114 | *cur = (*cur)[: curLen+n : curCap] 115 | return (*cur)[curLen:] 116 | } 117 | newCap := curCap * 2 118 | for newCap < n { 119 | newCap *= 2 120 | } 121 | if newCap > maxBuildBufSize { 122 | newCap = maxBuildBufSize 123 | } 124 | a := make([]byteValue, n, newCap) 125 | b.all = append(b.all, a) 126 | return a 127 | } 128 | 129 | // LookupString looks up the supplied string in the map 130 | func (m *Uint32Store) LookupString(s string) (uint32, bool) { 131 | bv := &m.store[0] 132 | for i, n := 0, len(s); i < n; i++ { 133 | b := s[i] 134 | if b < bv.nextOffset { 135 | return 0, false 136 | } 137 | ni := b - bv.nextOffset 138 | if ni >= bv.nextLen { 139 | return 0, false 140 | } 141 | bv = &m.store[bv.nextLo+uint32(ni)] 142 | } 143 | return bv.value, bv.valid 144 | } 145 | 146 | // LookupBytes looks up the supplied byte slice in the map 147 | func (m *Uint32Store) LookupBytes(s []byte) (uint32, bool) { 148 | bv := &m.store[0] 149 | for _, b := range s { 150 | if b < bv.nextOffset { 151 | return 0, false 152 | } 153 | ni := b - bv.nextOffset 154 | if ni >= bv.nextLen { 155 | return 0, false 156 | } 157 | bv = &m.store[bv.nextLo+uint32(ni)] 158 | } 159 | return bv.value, bv.valid 160 | } 161 | -------------------------------------------------------------------------------- /uint32_store_example_test.go: -------------------------------------------------------------------------------- 1 | package faststringmap_test 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "strings" 7 | 8 | "github.com/sensiblecodeio/faststringmap" 9 | ) 10 | 11 | func Example() { 12 | m := exampleSource{ 13 | "key1": 42, 14 | "key2": 27644437, 15 | "l": 2, 16 | } 17 | 18 | fm := faststringmap.NewUint32Store(m) 19 | 20 | // add an entry that is not in the fast map 21 | m["m"] = 4 22 | 23 | // sort the keys so output is the same for each test run 24 | keys := make([]string, 0, len(m)) 25 | for k := range m { 26 | keys = append(keys, k) 27 | } 28 | sort.Strings(keys) 29 | 30 | // lookup every key in the fast map and print the corresponding value 31 | for _, k := range keys { 32 | v, ok := fm.LookupString(k) 33 | fmt.Printf("%q: %d, %v\n", k, v, ok) 34 | } 35 | 36 | // Dump out the store to aid in understanding the implementation 37 | fmt.Println() 38 | dump := fmt.Sprintf("%+v", fm) 39 | dump = strings.ReplaceAll(dump, "}", "}\n") 40 | dump = strings.ReplaceAll(dump, "[", "[\n ") 41 | fmt.Println(dump) 42 | 43 | // Output: 44 | // 45 | // "key1": 42, true 46 | // "key2": 27644437, true 47 | // "l": 2, true 48 | // "m": 0, false 49 | // 50 | // {store:[ 51 | // {nextLo:1 nextLen:2 nextOffset:107 valid:false value:0} 52 | // {nextLo:3 nextLen:1 nextOffset:101 valid:false value:0} 53 | // {nextLo:0 nextLen:0 nextOffset:0 valid:true value:2} 54 | // {nextLo:4 nextLen:1 nextOffset:121 valid:false value:0} 55 | // {nextLo:5 nextLen:2 nextOffset:49 valid:false value:0} 56 | // {nextLo:0 nextLen:0 nextOffset:0 valid:true value:42} 57 | // {nextLo:0 nextLen:0 nextOffset:0 valid:true value:27644437} 58 | // ]} 59 | } 60 | 61 | type exampleSource map[string]uint32 62 | 63 | func (s exampleSource) AppendKeys(a []string) []string { 64 | for k := range s { 65 | a = append(a, k) 66 | } 67 | return a 68 | } 69 | 70 | func (s exampleSource) Get(k string) uint32 { 71 | return s[k] 72 | } 73 | -------------------------------------------------------------------------------- /uint32_store_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Sensible Code Company Ltd 2 | // Author: Duncan Harris 3 | 4 | package faststringmap_test 5 | 6 | import ( 7 | "fmt" 8 | "math/rand" 9 | "strconv" 10 | "strings" 11 | "testing" 12 | 13 | "github.com/sensiblecodeio/faststringmap" 14 | ) 15 | 16 | func TestFastStringToUint32Empty(t *testing.T) { 17 | ms := mapSliceN(map[string]uint32{"": 1, "a": 2, "foo": 3, "ß": 4}, 0) 18 | checkWithMapSlice(t, ms) 19 | } 20 | 21 | func TestFastStringToUint32BigSpan(t *testing.T) { 22 | ms := mapSliceN(map[string]uint32{"a!": 1, "a~": 2}, 2) 23 | checkWithMapSlice(t, ms) 24 | } 25 | 26 | func TestFastStringToUint32(t *testing.T) { 27 | const nStrs = 8192 28 | m := randomSmallStrings(nStrs, 8) 29 | checkWithMapSlice(t, mapSliceN(m, len(m)/2)) 30 | } 31 | 32 | func checkWithMapSlice(t *testing.T, ms mapSlice) { 33 | fm := faststringmap.NewUint32Store(ms) 34 | 35 | for _, k := range ms.in { 36 | check := func(actV uint32, ok bool) { 37 | if !ok { 38 | t.Errorf("%q not present", k) 39 | } else if actV != ms.m[k] { 40 | t.Errorf("got %d want %d for %q", actV, ms.m[k], k) 41 | } 42 | } 43 | check(fm.LookupString(k)) 44 | check(fm.LookupBytes([]byte(k))) 45 | } 46 | 47 | for _, k := range ms.out { 48 | check := func(actV uint32, ok bool) { 49 | if ok { 50 | t.Errorf("%q present when not expected, got %d", k, actV) 51 | } 52 | } 53 | check(fm.LookupString(k)) 54 | check(fm.LookupBytes([]byte(k))) 55 | } 56 | } 57 | 58 | type mapSlice struct { 59 | m map[string]uint32 60 | in []string 61 | out []string 62 | } 63 | 64 | func mapSliceN(m map[string]uint32, n int) mapSlice { 65 | if n < 0 || n > len(m) { 66 | panic(fmt.Sprintf("n value %d out of range for map size %d", n, len(m))) 67 | } 68 | in := make([]string, 0, n) 69 | out := make([]string, 0, len(m)-n) 70 | nAdded := 0 71 | 72 | for k := range m { 73 | if nAdded < n { 74 | nAdded++ 75 | in = append(in, k) 76 | } else { 77 | out = append(out, k) 78 | } 79 | } 80 | return mapSlice{m: m, in: in, out: out} 81 | } 82 | 83 | func (m mapSlice) AppendKeys(a []string) []string { return append(a, m.in...) } 84 | func (m mapSlice) Get(s string) uint32 { return m.m[s] } 85 | 86 | func randomSmallStrings(nStrs int, maxLen uint8) map[string]uint32 { 87 | m := map[string]uint32{"": 0} 88 | for len(m) < nStrs { 89 | s := randomSmallString(maxLen) 90 | if _, ok := m[s]; !ok { 91 | m[s] = uint32(len(m)) 92 | } 93 | } 94 | return m 95 | } 96 | 97 | func randomSmallString(maxLen uint8) string { 98 | var sb strings.Builder 99 | n := rand.Intn(int(maxLen) + 1) 100 | for i := 0; i <= n; i++ { 101 | sb.WriteRune(rand.Int31n(94) + 33) 102 | } 103 | return sb.String() 104 | } 105 | 106 | func typicalCodeStrings(n int) mapSlice { 107 | m := make(map[string]uint32, n) 108 | keys := make([]string, 0, n) 109 | add := func(s string) { 110 | m[s] = uint32(len(m)) 111 | keys = append(keys, s) 112 | } 113 | for i := 1; i < n; i++ { 114 | add(strconv.Itoa(i)) 115 | } 116 | add("-9") 117 | return mapSlice{m: m, in: keys} 118 | } 119 | 120 | const nStrsBench = 1000 121 | 122 | func BenchmarkUint32Store(b *testing.B) { 123 | m := typicalCodeStrings(nStrsBench) 124 | fm := faststringmap.NewUint32Store(m) 125 | b.ResetTimer() 126 | for bi := 0; bi < b.N; bi++ { 127 | for si, n := uint32(0), uint32(len(m.in)); si < n; si++ { 128 | v, ok := fm.LookupString(m.in[si]) 129 | if !ok || v != si { 130 | b.Fatalf("ok=%v, value got %d want %d", ok, v, si) 131 | } 132 | } 133 | } 134 | } 135 | 136 | func BenchmarkGoStringToUint32(b *testing.B) { 137 | m := typicalCodeStrings(nStrsBench) 138 | b.ResetTimer() 139 | for bi := 0; bi < b.N; bi++ { 140 | for si, n := uint32(0), uint32(len(m.in)); si < n; si++ { 141 | v, ok := m.m[m.in[si]] 142 | if !ok || v != si { 143 | b.Fatalf("ok=%v, value got %d want %d", ok, v, si) 144 | } 145 | } 146 | } 147 | } 148 | --------------------------------------------------------------------------------