├── .gitignore ├── LICENSE ├── README.md ├── govarint_test.go └── govarint.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Stephen Merity 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Govarint 2 | 3 | This project aims to provide a simple API for the performant encoding and decoding of 32 and 64 bit integers using a variety of algorithms. 4 | 5 | [![](http://i.imgur.com/mpgC23U.jpg)](https://www.flickr.com/photos/tsevis/8648521649/) 6 | 7 | ## Usage 8 | 9 | Each integer encoding algorithm conforms to an encoding and decoding interface. 10 | The interfaces also specify the size of the unsigned integer, either 32 or 64 bits, and will be referred to as XX below. 11 | To create an encoder: 12 | 13 | NewU32Base128Encoder(w io.Writer) 14 | NewU64Base128Encoder(w io.Writer) 15 | NewU32GroupVarintEncoder(w io.Writer) 16 | 17 | For encoders, the only two commands are `PutUXX` and `Close`. 18 | `Close` must be called as some integer encoding algorithms write in multiples. 19 | 20 | var buf bytes.Buffer 21 | enc := NewU32Base128Encoder(&buf) 22 | enc.PutU32(117) 23 | enc.PutU32(343) 24 | enc.Close() 25 | 26 | To create a decoder: 27 | 28 | NewU32Base128Decoder(r io.ByteReader) 29 | NewU64Base128Decoder(r io.ByteReader) 30 | NewU32GroupVarintDecoder(r io.ByteReader) 31 | 32 | For decoders, the only command is `GetUXX`. 33 | `GetUXX` returns the value and any potential errors. 34 | When reading is complete, `GetUXX` will return an `EOF` (End Of File). 35 | 36 | dec := NewU32Base128Decoder(&buf) 37 | x, err := dec.GetU32() 38 | 39 | ## Use Cases 40 | 41 | Using fixed width integers, such as uint32 and uint64, usually wastes large amounts of space, especially when encoding small values. 42 | Optimally, smaller numbers should take less space to represent. 43 | 44 | Using integer encoding algorithms is especially common in specific applications, such as storing edge lists or indexes for search engines.
45 | In these situations, you have a sorted list of numbers that you want to keep as compactly as possible in memory. 46 | Additionally, by storing only the difference between the given number and the previous (delta encoding), the numbers are quite small, and thus compress well. 47 | 48 | For an explicit example, the Web Data Commons Hyperlink Graph contains 128 billion edges linking page A to page B, where each page is represented by a 32 bit integer. 49 | By converting all these edges to 64 bit integers (32 | 32), sorting them, and then using delta encoding, memory usage can be reduced from 64 bits per edge down to only 9 bits per edge using the Base128 integer encoding algorithm. 50 | This figure improves even further if compressed using conventional compression algorithms (3 bits per edge). 51 | 52 | ## Encodings supported 53 | 54 | `govarint` supports: 55 | 56 | + Base128 [32, 64] - each byte uses 7 bits for encoding the integer and 1 bit for indicating if the integer requires another byte 57 | + Group Varint [32] - integers are encoded in blocks of four - one byte encodes the size of the following four integers, then the values of the four integers follow 58 | 59 | Group Varint consistently beats Base128 in decompression speed but Base128 may offer improved compression ratios depending on the distribution of the supplied integers. 60 | 61 | ## Tests 62 | 63 | go test -v -bench=.
64 | 65 | ## License 66 | 67 | MIT License, as per `LICENSE` 68 | -------------------------------------------------------------------------------- /govarint_test.go: -------------------------------------------------------------------------------- 1 | package govarint 2 | 3 | import "bytes" 4 | import "io" 5 | import "math/rand" 6 | import "testing" 7 | 8 | var fourU32 = []uint32{ 9 | 0, 10 | 1, 11 | 0, 12 | 256, 13 | } 14 | 15 | var fiveU32 = []uint32{ 16 | 42, 17 | 4294967196, 18 | 384, 19 | 9716053, 20 | 1024 + 256 + 3, 21 | } 22 | 23 | var testU32 = []uint32{ 24 | 0, 25 | 1, 26 | 2, 27 | 10, 28 | 20, 29 | 63, 30 | 64, 31 | 65, 32 | 127, 33 | 128, 34 | 129, 35 | 255, 36 | 256, 37 | 257, 38 | } 39 | 40 | var testU64 = []uint64{ 41 | 0, 42 | 1, 43 | 2, 44 | 10, 45 | 20, 46 | 63, 47 | 64, 48 | 65, 49 | 127, 50 | 128, 51 | 129, 52 | 255, 53 | 256, 54 | 257, 55 | /// 56 | 1<<32 - 1, 57 | 1 << 32, 58 | 1 << 33, 59 | 1 << 42, 60 | 1<<63 - 1, 61 | 1 << 63, 62 | } 63 | 64 | func TestEncodeAndDecodeU32(t *testing.T) { 65 | for _, expected := range testU32 { 66 | var buf bytes.Buffer 67 | enc := NewU32Base128Encoder(&buf) 68 | enc.PutU32(expected) 69 | enc.Close() 70 | dec := NewU32Base128Decoder(&buf) 71 | x, err := dec.GetU32() 72 | if x != expected || err != nil { 73 | t.Errorf("ReadUvarint(%v): got x = %d, expected = %d, err = %s", buf, x, expected, err) 74 | } 75 | } 76 | var buf bytes.Buffer 77 | enc := NewU32Base128Encoder(&buf) 78 | for _, expected := range testU32 { 79 | enc.PutU32(expected) 80 | } 81 | enc.Close() 82 | dec := NewU32Base128Decoder(&buf) 83 | i := 0 84 | for { 85 | x, err := dec.GetU32() 86 | if err == io.EOF { 87 | break 88 | } 89 | if x != testU32[i] || err != nil { 90 | t.Errorf("ReadUvarint(%v): got x = %d, expected = %d, err = %s", buf, x, testU32[i], err) 91 | } 92 | i += 1 93 | } 94 | if i != len(testU32) { 95 | t.Errorf("Only %d integers were decoded when %d were encoded", i, len(testU32)) 96 | } 97 | } 98 | 99 | func 
TestEncodeAndDecodeU64(t *testing.T) { 100 | for _, expected := range testU64 { 101 | var buf bytes.Buffer 102 | enc := NewU64Base128Encoder(&buf) 103 | enc.PutU64(expected) 104 | enc.Close() 105 | dec := NewU64Base128Decoder(&buf) 106 | x, err := dec.GetU64() 107 | if x != expected || err != nil { 108 | t.Errorf("ReadUvarint(%v): got x = %d, expected = %d, err = %s", buf, x, expected, err) 109 | } 110 | } 111 | } 112 | 113 | func TestU32GroupVarintFour(t *testing.T) { 114 | var buf bytes.Buffer 115 | enc := NewU32GroupVarintEncoder(&buf) 116 | for _, expected := range fourU32 { 117 | enc.PutU32(expected) 118 | } 119 | enc.Close() 120 | dec := NewU32GroupVarintDecoder(&buf) 121 | i := 0 122 | for { 123 | x, err := dec.GetU32() 124 | if err == io.EOF { 125 | break 126 | } 127 | if err != nil && x != fourU32[i] { 128 | t.Errorf("ReadUvarint(%v): got x = %d, expected = %d, err = %s", buf, x, testU32[i], err) 129 | } 130 | i += 1 131 | } 132 | if i != len(fourU32) { 133 | t.Errorf("%d integers were decoded when %d were encoded", i, len(fourU32)) 134 | } 135 | } 136 | 137 | func TestU32GroupVarintFive(t *testing.T) { 138 | var buf bytes.Buffer 139 | enc := NewU32GroupVarintEncoder(&buf) 140 | for _, expected := range fiveU32 { 141 | enc.PutU32(expected) 142 | } 143 | enc.Close() 144 | dec := NewU32GroupVarintDecoder(&buf) 145 | i := 0 146 | for { 147 | x, err := dec.GetU32() 148 | if err == io.EOF { 149 | break 150 | } 151 | if err != nil && x != fiveU32[i] { 152 | t.Errorf("ReadUvarint(%v): got x = %d, expected = %d, err = %s", buf, x, testU32[i], err) 153 | } 154 | i += 1 155 | } 156 | if i != len(fiveU32) { 157 | t.Errorf("%d integers were decoded when %d were encoded", i, len(fiveU32)) 158 | } 159 | } 160 | 161 | func TestU32GroupVarint14(t *testing.T) { 162 | var buf bytes.Buffer 163 | for length := 0; length < len(testU32); length++ { 164 | subset := testU32[:length] 165 | enc := NewU32GroupVarintEncoder(&buf) 166 | for _, expected := range subset { 167 | 
enc.PutU32(expected) 168 | } 169 | enc.Close() 170 | dec := NewU32GroupVarintDecoder(&buf) 171 | i := 0 172 | for { 173 | x, err := dec.GetU32() 174 | if err == io.EOF { 175 | break 176 | } 177 | if err != nil && x != subset[i] { 178 | t.Errorf("ReadUvarint(%v): got x = %d, expected = %d, err = %s", buf, x, subset[i], err) 179 | } 180 | i += 1 181 | } 182 | if i != len(subset) { 183 | t.Errorf("%d integers were decoded when %d were encoded", i, len(subset)) 184 | } 185 | } 186 | } 187 | 188 | func generateRandomU14() (uint64, []uint32) { 189 | // Need to be aware to make it fair for Base128 190 | // Base128 has 7 usable bits per byte 191 | rand.Seed(42) 192 | testSize := 1000000 193 | data := make([]uint32, testSize, testSize) 194 | total := uint64(0) 195 | for i := range data { 196 | data[i] = rand.Uint32() % 16384 197 | total += uint64(data[i]) 198 | } 199 | return total, data 200 | } 201 | 202 | func speedTest(b *testing.B, dec U32VarintDecoder, readBuf *bytes.Reader, expectedTotal uint64) { 203 | total := uint64(0) 204 | idx := 0 205 | for { 206 | x, err := dec.GetU32() 207 | if err == io.EOF { 208 | break 209 | } 210 | if err != nil { 211 | b.Errorf("Hit err: %v", err) 212 | } 213 | total += uint64(x) 214 | idx += 1 215 | } 216 | if total != expectedTotal { 217 | b.Errorf("Total was %d when %d was expected, having read %d integers", total, expectedTotal, idx) 218 | } 219 | } 220 | 221 | func BenchmarkBase128(b *testing.B) { 222 | b.StopTimer() 223 | // 224 | var buf bytes.Buffer 225 | enc := NewU32Base128Encoder(&buf) 226 | expectedTotal, data := generateRandomU14() 227 | for _, expected := range data { 228 | enc.PutU32(expected) 229 | } 230 | enc.Close() 231 | // 232 | readBuf := bytes.NewReader(buf.Bytes()) 233 | b.StartTimer() 234 | for i := 0; i < b.N; i++ { 235 | readBuf.Seek(0, 0) 236 | dec := NewU32Base128Decoder(readBuf) 237 | speedTest(b, dec, readBuf, expectedTotal) 238 | } 239 | } 240 | 241 | func BenchmarkGroupVarint(b *testing.B) { 242 | 
b.StopTimer() 243 | // 244 | var buf bytes.Buffer 245 | enc := NewU32GroupVarintEncoder(&buf) 246 | expectedTotal, data := generateRandomU14() 247 | for _, expected := range data { 248 | enc.PutU32(expected) 249 | } 250 | enc.Close() 251 | // 252 | readBuf := bytes.NewReader(buf.Bytes()) 253 | b.StartTimer() 254 | for i := 0; i < b.N; i++ { 255 | readBuf.Seek(0, 0) 256 | dec := NewU32GroupVarintDecoder(readBuf) 257 | speedTest(b, dec, readBuf, expectedTotal) 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /govarint.go: -------------------------------------------------------------------------------- 1 | package govarint 2 | 3 | import "encoding/binary" 4 | import "io" 5 | 6 | type U32VarintEncoder interface { 7 | PutU32(x uint32) int 8 | Close() 9 | } 10 | 11 | type U32VarintDecoder interface { 12 | GetU32() (uint32, error) 13 | } 14 | 15 | /// 16 | 17 | type U64VarintEncoder interface { 18 | PutU64(x uint64) int 19 | Close() 20 | } 21 | 22 | type U64VarintDecoder interface { 23 | GetU64() (uint64, error) 24 | } 25 | 26 | /// 27 | 28 | type U32GroupVarintEncoder struct { 29 | w io.Writer 30 | index int 31 | store [4]uint32 32 | temp [17]byte 33 | } 34 | 35 | func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} } 36 | 37 | func (b *U32GroupVarintEncoder) Flush() (int, error) { 38 | // TODO: Is it more efficient to have a tailored version that's called only in Close()? 
39 | // If index is zero, there are no integers to flush 40 | if b.index == 0 { 41 | return 0, nil 42 | } 43 | // In the case we're flushing (the group isn't of size four), the non-values should be zero 44 | // This ensures the unused entries are all zero in the sizeByte 45 | for i := b.index; i < 4; i++ { 46 | b.store[i] = 0 47 | } 48 | length := 1 49 | // We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it 50 | b.temp[0] = 0 51 | for i, x := range b.store { 52 | size := byte(0) 53 | shifts := []byte{24, 16, 8, 0} 54 | for _, shift := range shifts { 55 | // Always writes at least one byte -- the first one (shift = 0) 56 | // Will write more bytes until the rest of the integer is all zeroes 57 | if (x>>shift) != 0 || shift == 0 { 58 | size += 1 59 | b.temp[length] = byte(x >> shift) 60 | length += 1 61 | } 62 | } 63 | // We store the size in two of the eight bits in the first byte (sizeByte) 64 | // 0 means there is one byte in total, hence why we subtract one from size 65 | b.temp[0] |= (size - 1) << (uint8(3-i) * 2) 66 | } 67 | // If we're flushing without a full group of four, remove the unused bytes we computed 68 | // This enables us to realize it's a partial group on decoding thanks to EOF 69 | if b.index != 4 { 70 | length -= 4 - b.index 71 | } 72 | _, err := b.w.Write(b.temp[:length]) 73 | return length, err 74 | } 75 | 76 | func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) { 77 | bytesWritten := 0 78 | b.store[b.index] = x 79 | b.index += 1 80 | if b.index == 4 { 81 | n, err := b.Flush() 82 | if err != nil { 83 | return n, err 84 | } 85 | bytesWritten += n 86 | b.index = 0 87 | } 88 | return bytesWritten, nil 89 | } 90 | 91 | func (b *U32GroupVarintEncoder) Close() { 92 | // On Close, we flush any remaining values that might not have been in a full group 93 | b.Flush() 94 | } 95 | 96 | /// 97 | 98 | type U32GroupVarintDecoder struct { 99 | r io.ByteReader 100 | group [4]uint32 101 | pos int 102 | finished 
bool 103 | capacity int 104 | } 105 | 106 | func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder { 107 | return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4} 108 | } 109 | 110 | func (b *U32GroupVarintDecoder) getGroup() error { 111 | // We should always receive a sizeByte if there are more values to read 112 | sizeByte, err := b.r.ReadByte() 113 | if err != nil { 114 | return err 115 | } 116 | // Calculate the size of the four incoming 32 bit integers 117 | // 0b00 means 1 byte to read, 0b01 = 2, etc 118 | b.group[0] = uint32((sizeByte >> 6) & 3) 119 | b.group[1] = uint32((sizeByte >> 4) & 3) 120 | b.group[2] = uint32((sizeByte >> 2) & 3) 121 | b.group[3] = uint32(sizeByte & 3) 122 | // 123 | for index, size := range b.group { 124 | b.group[index] = 0 125 | // Any error that occurs in earlier byte reads should be repeated at the end one 126 | // Hence we only catch and report the final ReadByte's error 127 | var err error 128 | switch size { 129 | case 0: 130 | var x byte 131 | x, err = b.r.ReadByte() 132 | b.group[index] = uint32(x) 133 | case 1: 134 | var x, y byte 135 | x, _ = b.r.ReadByte() 136 | y, err = b.r.ReadByte() 137 | b.group[index] = uint32(x)<<8 | uint32(y) 138 | case 2: 139 | var x, y, z byte 140 | x, _ = b.r.ReadByte() 141 | y, _ = b.r.ReadByte() 142 | z, err = b.r.ReadByte() 143 | b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z) 144 | case 3: 145 | var x, y, z, zz byte 146 | x, _ = b.r.ReadByte() 147 | y, _ = b.r.ReadByte() 148 | z, _ = b.r.ReadByte() 149 | zz, err = b.r.ReadByte() 150 | b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz) 151 | } 152 | if err != nil { 153 | if err == io.EOF { 154 | // If we hit EOF here, we have found a partial group 155 | // We've return any valid entries we have read and return EOF once we run out 156 | b.capacity = index 157 | b.finished = true 158 | break 159 | } else { 160 | return err 161 | } 162 | } 163 | } 164 | // Reset the pos pointer to the beginning 
of the read values 165 | b.pos = 0 166 | return nil 167 | } 168 | 169 | func (b *U32GroupVarintDecoder) GetU32() (uint32, error) { 170 | // Check if we have any more values to give out - if not, let's get them 171 | if b.pos == b.capacity { 172 | // If finished is set, there is nothing else to do 173 | if b.finished { 174 | return 0, io.EOF 175 | } 176 | err := b.getGroup() 177 | if err != nil { 178 | return 0, err 179 | } 180 | } 181 | // Increment pointer and return the value stored at that point 182 | b.pos += 1 183 | return b.group[b.pos-1], nil 184 | } 185 | 186 | /// 187 | 188 | type Base128Encoder struct { 189 | w io.Writer 190 | tmpBytes []byte 191 | } 192 | 193 | func NewU32Base128Encoder(w io.Writer) *Base128Encoder { 194 | return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)} 195 | } 196 | func NewU64Base128Encoder(w io.Writer) *Base128Encoder { 197 | return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)} 198 | } 199 | 200 | func (b *Base128Encoder) PutU32(x uint32) (int, error) { 201 | writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x)) 202 | return b.w.Write(b.tmpBytes[:writtenBytes]) 203 | } 204 | 205 | func (b *Base128Encoder) PutU64(x uint64) (int, error) { 206 | writtenBytes := binary.PutUvarint(b.tmpBytes, x) 207 | return b.w.Write(b.tmpBytes[:writtenBytes]) 208 | } 209 | 210 | func (b *Base128Encoder) Close() { 211 | } 212 | 213 | /// 214 | 215 | type Base128Decoder struct { 216 | r io.ByteReader 217 | } 218 | 219 | func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } 220 | func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } 221 | 222 | func (b *Base128Decoder) GetU32() (uint32, error) { 223 | v, err := binary.ReadUvarint(b.r) 224 | return uint32(v), err 225 | } 226 | 227 | func (b *Base128Decoder) GetU64() (uint64, error) { 228 | return binary.ReadUvarint(b.r) 229 | } 230 | 
--------------------------------------------------------------------------------