├── .gitignore ├── LICENSE ├── README.md ├── benchmark ├── benchmark.go ├── data │ ├── ip.txt.gz │ ├── lat.txt.gz │ └── ts.txt.gz └── results │ └── Benchmarking_Integer_Compression.xlsx ├── benchtools └── benchtools.go ├── bitlen.go ├── bitlen_386.s ├── bitlen_amd64.s ├── bitlen_arm.s ├── bitlen_decl.go ├── bitlen_gccgo.go ├── bitpacking ├── bitpacking.go └── delta_bitpacking.go ├── bp32 ├── bp32.go └── bp32_test.go ├── composition ├── composition.go └── composition_test.go ├── cursor └── cursor.go ├── delta ├── bp32 │ ├── bp32.go │ └── bp32_test.go ├── fastpfor │ ├── fastpfor.go │ └── fastpfor_test.go └── variablebyte │ ├── variablebyte.go │ └── variablebyte_test.go ├── fastpfor ├── fastpfor.go └── fastpfor_test.go ├── generators ├── generators.go └── generators_test.go ├── integer.go ├── util.go ├── variablebyte ├── variablebyte.go └── variablebyte_test.go └── zigzag ├── bp32 ├── bp32.go └── bp32_test.go └── fastpfor ├── fastpfor.go └── fastpfor_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | 24 | .idea 25 | *.iml 26 | 27 | *.swp 28 | *.un~ 29 | 30 | .DS_Store 31 | 32 | Java* 33 | CPP* 34 | 35 | *.pprof 36 | *.prof 37 | *.test 38 | *.out 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. 
For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 
70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 
111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 
145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. 
We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Encoding 2 | ======== 3 | 4 | This is a set of integer compression algorithms implemented in Go. It is an (incomplete) port of the JavaFastPFOR by Dr. Daniel Lemire. 5 | 6 | For more detailed benchmark results please see http://zhen.org/blog/benchmarking-integer-compression-in-go/ 7 | -------------------------------------------------------------------------------- /benchmark/benchmark.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 
4 | * 5 | */ 6 | 7 | package main 8 | 9 | import ( 10 | "bufio" 11 | "compress/gzip" 12 | "flag" 13 | "fmt" 14 | "io/ioutil" 15 | "log" 16 | "os" 17 | "runtime" 18 | "runtime/pprof" 19 | "strconv" 20 | "strings" 21 | "time" 22 | 23 | "github.com/dataence/encoding" 24 | "github.com/dataence/encoding/bp32" 25 | "github.com/dataence/encoding/composition" 26 | "github.com/dataence/encoding/cursor" 27 | dbp32 "github.com/dataence/encoding/delta/bp32" 28 | dfastpfor "github.com/dataence/encoding/delta/fastpfor" 29 | dvb "github.com/dataence/encoding/delta/variablebyte" 30 | "github.com/dataence/encoding/fastpfor" 31 | "github.com/dataence/encoding/variablebyte" 32 | zbp32 "github.com/dataence/encoding/zigzag/bp32" 33 | zfastpfor "github.com/dataence/encoding/zigzag/fastpfor" 34 | ) 35 | 36 | type paramList []string 37 | 38 | var ( 39 | filesParam, dirsParam, codecsParam paramList 40 | pprofParam bool 41 | files []string 42 | ) 43 | 44 | func (this *paramList) String() string { 45 | return fmt.Sprint(*this) 46 | } 47 | 48 | func (this *paramList) Set(value string) error { 49 | for _, f := range strings.Split(value, ",") { 50 | *this = append(*this, f) 51 | } 52 | 53 | return nil 54 | } 55 | 56 | func init() { 57 | flag.BoolVar(&pprofParam, "pprof", false, "Print result for individual files.") 58 | flag.Var(&filesParam, "file", "The file containing one integer per line to encode. There can be multiple of this, or comma separated list.") 59 | flag.Var(&dirsParam, "dir", "The directory containing a list of files with one integer per line. There can be multiple of this, or comma separated list.") 60 | flag.Var(&codecsParam, "codec", "The codec to use: bp32, fastpfor, variablebyte, deltabp32, deltafastpfor, deltavariablebyte, zigzagbp32. 
There can be multiple of this, or comma separated list.") 61 | } 62 | // scanIntegers parses one unsigned 32-bit decimal integer per scanned line and returns them as int32 values. 63 | func scanIntegers(s *bufio.Scanner) ([]int32, error) { 64 | result := make([]int32, 0, 1000000) 65 | for s.Scan() { 66 | i, err := strconv.ParseUint(s.Text(), 10, 32) 67 | if err != nil { 68 | return nil, err 69 | } 70 | result = append(result, int32(i)) 71 | } 72 | // Scan also returns false on read errors and over-long lines; report them instead of returning partial data 73 | if err := s.Err(); err != nil { 74 | return nil, err 75 | } 76 | 77 | // Collect the temporary strings allocated during the file read 78 | runtime.GC() 79 | return result, nil 80 | } 81 | 82 | func readIntegerFile(path string) ([]int32, error) { 83 | file, err := os.Open(path) 84 | if err != nil { 85 | return nil, err 86 | } 87 | defer file.Close() 88 | 89 | scanner := bufio.NewScanner(file) 90 | 91 | return scanIntegers(scanner) 92 | } 93 | 94 | func readGzippedIntegerFile(path string) ([]int32, error) { 95 | f, err := os.Open(path) 96 | if err != nil { 97 | return nil, err 98 | } 99 | defer f.Close() 100 | 101 | gunzip, err := gzip.NewReader(f) 102 | if err != nil { 103 | return nil, err 104 | } 105 | 106 | scanner := bufio.NewScanner(gunzip) 107 | 108 | return scanIntegers(scanner) 109 | } 110 | 111 | func getDirOfFiles(path string) ([]string, error) { 112 | filenames := make([]string, 0, 10) 113 | 114 | files, err := ioutil.ReadDir(path) 115 | if err != nil { 116 | return nil, err 117 | } 118 | 119 | for _, f := range files { 120 | filenames = append(filenames, path+"/"+f.Name()) 121 | } 122 | 123 | return filenames, nil 124 | } 125 | 126 | func loadIntegerFromFiles(files []string) ([][]int32, int, error) { 127 | max := 0 128 | data := make([][]int32, 0, len(files)) 129 | 130 | for _, f := range files { 131 | var ( 132 | res []int32 133 | err error 134 | ) 135 | 136 | log.Printf("Processing %s\n", f) 137 | 138 | if strings.HasPrefix(f, "gz-") { 139 | res, err = readGzippedIntegerFile(strings.TrimPrefix(f, "gz-")) 140 | } else if strings.HasSuffix(f, ".gz") { 141 | res, err = readGzippedIntegerFile(f) 142 | } else { 143 | 
res, err = readIntegerFile(f) 144 | } 145 | 146 | if err != nil { 147 | return nil, 0, err 148 | } 149 | 150 | data = append(data, res) 151 | 152 | if len(res) > max { 153 | max = len(res) 154 | } 155 | } 156 | 157 | return data, max, nil 158 | } 159 | 160 | func getListOfFiles() []string { 161 | files := make([]string, 0, 10) 162 | 163 | for _, d := range dirsParam { 164 | res, err := getDirOfFiles(d) 165 | if err != nil { 166 | log.Fatal(err) 167 | } 168 | 169 | files = append(files, res...) 170 | } 171 | 172 | files = append(files, filesParam...) 173 | 174 | return files 175 | } 176 | 177 | func getListOfCodecs() (map[string]encoding.Integer, error) { 178 | codecs := make(map[string]encoding.Integer, 10) 179 | 180 | for _, codec := range codecsParam { 181 | switch codec { 182 | case "bp32": 183 | codecs["bp32"] = composition.New(bp32.New(), variablebyte.New()) 184 | case "fastpfor": 185 | codecs["fastpfor"] = composition.New(fastpfor.New(), variablebyte.New()) 186 | case "variablebyte": 187 | codecs["variablebyte"] = variablebyte.New() 188 | case "deltabp32": 189 | codecs["delta bp32"] = composition.New(dbp32.New(), dvb.New()) 190 | case "deltafastpfor": 191 | codecs["delta fastpfor"] = composition.New(dfastpfor.New(), dvb.New()) 192 | case "deltavariablebyte": 193 | codecs["delta variablebyte"] = dvb.New() 194 | case "zigzagbp32": 195 | codecs["zigzag bp32"] = composition.New(zbp32.New(), dvb.New()) 196 | case "zigzagfastpfor": 197 | codecs["zigzag fastpfor"] = composition.New(zfastpfor.New(), dvb.New()) 198 | } 199 | } 200 | 201 | if len(codecs) < 1 { 202 | return nil, fmt.Errorf("benchmark/getListOfCodecs: No codecs defined") 203 | } 204 | 205 | return codecs, nil 206 | } 207 | 208 | func compress(codec encoding.Integer, in, out []int32, length int, prof bool) (duration int64, ret []int32, err error) { 209 | inpos := cursor.New() 210 | outpos := cursor.New() 211 | 212 | now := time.Now() 213 | if prof { 214 | f, e := os.Create("cpu.compress.pprof") 215 | if e 
!= nil { 216 | log.Fatal(e) 217 | } 218 | defer f.Close() 219 | 220 | pprof.StartCPUProfile(f) 221 | } 222 | 223 | if err = codec.Compress(in, inpos, len(in), out, outpos); err != nil { 224 | return 0, nil, err 225 | } 226 | since := time.Since(now).Nanoseconds() 227 | 228 | if prof { 229 | pprof.StopCPUProfile() 230 | } 231 | 232 | return since, out[:outpos.Get()], nil 233 | } 234 | 235 | func uncompress(codec encoding.Integer, in, out []int32, length int, prof bool) (duration int64, ret []int32, err error) { 236 | inpos := cursor.New() 237 | outpos := cursor.New() 238 | 239 | if prof { 240 | f, e := os.Create("cpu.uncompress.pprof") 241 | if e != nil { 242 | log.Fatal(e) 243 | } 244 | defer f.Close() 245 | 246 | pprof.StartCPUProfile(f) 247 | } 248 | 249 | now := time.Now() 250 | if err = codec.Uncompress(in, inpos, len(in), out, outpos); err != nil { 251 | return 0, nil, err 252 | } 253 | since := time.Since(now).Nanoseconds() 254 | 255 | if prof { 256 | pprof.StopCPUProfile() 257 | } 258 | 259 | return since, out[:outpos.Get()], nil 260 | } 261 | 262 | func testCodecs(codecs map[string]encoding.Integer, data [][]int32, max int, output bool) error { 263 | compdata := make([]int32, max+max/2) 264 | decompdata := make([]int32, max) 265 | 266 | for name, codec := range codecs { 267 | for i, in := range data { 268 | k := len(in) 269 | 270 | dur, out, err := compress(codec, in, compdata, k, pprofParam) 271 | if err != nil { 272 | return err 273 | } 274 | 275 | dur2, out2, err2 := uncompress(codec, out, decompdata, k, pprofParam) 276 | if err2 != nil { 277 | return err2 278 | } 279 | 280 | if output { 281 | fmt.Printf("% 20s % 20s: %5.2f %5.2f %5.2f\n", files[i], name, float64(len(out)*32)/float64(k), (float64(k) / (float64(dur) / 1000000000.0) / 1000000.0), (float64(k) / (float64(dur2) / 1000000000.0) / 1000000.0)) 282 | } 283 | 284 | for i := 0; i < k; i++ { 285 | if in[i] != decompdata[i] { 286 | return fmt.Errorf("benchmark/testCodecs: Problem recovering. 
index = %d, in = %d, recovered = %d, original length = %d, recovered length = %d\n", i, in[i], out2[i], k, len(out2)) 287 | } 288 | } 289 | 290 | runtime.GC() 291 | } 292 | } 293 | 294 | return nil 295 | } 296 | 297 | func main() { 298 | flag.Parse() 299 | files = getListOfFiles() 300 | 301 | codecs, err := getListOfCodecs() 302 | if err != nil { 303 | log.Fatal(err) 304 | } 305 | 306 | data, max, err := loadIntegerFromFiles(files) 307 | if err != nil { 308 | log.Fatal(err) 309 | } 310 | 311 | if err := testCodecs(codecs, data, max, false); err != nil { 312 | log.Fatal(err) 313 | } 314 | 315 | if err := testCodecs(codecs, data, max, false); err != nil { 316 | log.Fatal(err) 317 | } 318 | 319 | if err := testCodecs(codecs, data, max, true); err != nil { 320 | log.Fatal(err) 321 | } 322 | } 323 | -------------------------------------------------------------------------------- /benchmark/data/ip.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zentures/encoding/b90e310a0325f9b765b4be7220df3642ad93ad8d/benchmark/data/ip.txt.gz -------------------------------------------------------------------------------- /benchmark/data/lat.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zentures/encoding/b90e310a0325f9b765b4be7220df3642ad93ad8d/benchmark/data/lat.txt.gz -------------------------------------------------------------------------------- /benchmark/data/ts.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zentures/encoding/b90e310a0325f9b765b4be7220df3642ad93ad8d/benchmark/data/ts.txt.gz -------------------------------------------------------------------------------- /benchmark/results/Benchmarking_Integer_Compression.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zentures/encoding/b90e310a0325f9b765b4be7220df3642ad93ad8d/benchmark/results/Benchmarking_Integer_Compression.xlsx -------------------------------------------------------------------------------- /benchtools/benchtools.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package benchtools 8 | 9 | import ( 10 | "bytes" 11 | "compress/gzip" 12 | "compress/lzw" 13 | "fmt" 14 | "io" 15 | "log" 16 | "os" 17 | "runtime/pprof" 18 | "time" 19 | 20 | "code.google.com/p/snappy-go/snappy" 21 | "github.com/dataence/encoding" 22 | "github.com/dataence/encoding/cursor" 23 | ) 24 | 25 | func TestCodec(codec encoding.Integer, in []int32, sizes []int) { 26 | for _, k := range sizes { 27 | if k > len(in) { 28 | continue 29 | } 30 | 31 | dur, out, err := Compress(codec, in[:k], k) 32 | if err != nil { 33 | log.Fatal(err) 34 | } 35 | 36 | dur2, out2, err2 := Uncompress(codec, out, k) 37 | if err2 != nil { 38 | log.Fatal(err2) 39 | } 40 | 41 | //log.Printf("benchtools/TestCodec: %f %.2f %.2f\n", float64(len(out)*32)/float64(k), (float64(k)/(float64(dur)/1000000000.0)/1000000.0), (float64(k)/(float64(dur2)/1000000000.0)/1000000.0)) 42 | fmt.Printf("%f %.2f %.2f\n", float64(len(out)*32)/float64(k), (float64(k) / (float64(dur) / 1000000000.0) / 1000000.0), (float64(k) / (float64(dur2) / 1000000000.0) / 1000000.0)) 43 | 44 | for i := 0; i < k; i++ { 45 | if in[i] != out2[i] { 46 | log.Fatalf("benchtools/TestCodec: Problem recovering. 
index = %d, in = %d, recovered = %d, original length = %d, recovered length = %d\n", i, in[i], out2[i], k, len(out2)) 47 | } 48 | } 49 | } 50 | } 51 | 52 | func PprofCodec(codec encoding.Integer, in []int32, sizes []int) { 53 | for _, k := range sizes { 54 | if k > len(in) { 55 | continue 56 | } 57 | 58 | dur, out, err := PprofCompress(codec, in[:k], k) 59 | if err != nil { 60 | log.Fatal(err) 61 | } 62 | 63 | dur2, out2, err2 := PprofUncompress(codec, out, k) 64 | if err2 != nil { 65 | log.Fatal(err2) 66 | } 67 | 68 | log.Printf("benchtools/PprofCodec: %f %.2f %.2f\n", float64(len(out)*32)/float64(k), (float64(k) / (float64(dur) / 1000000000.0) / 1000000.0), (float64(k) / (float64(dur2) / 1000000000.0) / 1000000.0)) 69 | 70 | for i := 0; i < k; i++ { 71 | if in[i] != out2[i] { 72 | log.Fatalf("benchtools/PprofCodec: Problem recovering. index = %d, in = %d, recovered = %d, original length = %d, recovered length = %d\n", i, in[i], out2[i], k, len(out2)) 73 | } 74 | } 75 | } 76 | } 77 | 78 | func Compress(codec encoding.Integer, in []int32, length int) (duration int64, out []int32, err error) { 79 | return RunCompress(codec, in, length, false) 80 | } 81 | 82 | func Uncompress(codec encoding.Integer, in []int32, length int) (duration int64, out []int32, err error) { 83 | return RunUncompress(codec, in, length, false) 84 | } 85 | 86 | func PprofCompress(codec encoding.Integer, in []int32, length int) (duration int64, out []int32, err error) { 87 | return RunCompress(codec, in, length, true) 88 | } 89 | 90 | func PprofUncompress(codec encoding.Integer, in []int32, length int) (duration int64, out []int32, err error) { 91 | return RunUncompress(codec, in, length, true) 92 | } 93 | 94 | func RunCompress(codec encoding.Integer, in []int32, length int, prof bool) (duration int64, out []int32, err error) { 95 | out = make([]int32, length*2) 96 | inpos := cursor.New() 97 | outpos := cursor.New() 98 | 99 | now := time.Now() 100 | if prof { 101 | f, e := 
os.Create("cpu.compress.pprof") 102 | if e != nil { 103 | log.Fatal(e) 104 | } 105 | defer f.Close() 106 | 107 | pprof.StartCPUProfile(f) 108 | } 109 | 110 | if err = codec.Compress(in, inpos, len(in), out, outpos); err != nil { 111 | return 0, nil, err 112 | } 113 | since := time.Since(now).Nanoseconds() 114 | 115 | if prof { 116 | pprof.StopCPUProfile() 117 | } 118 | 119 | return since, out[:outpos.Get()], nil 120 | } 121 | 122 | func RunUncompress(codec encoding.Integer, in []int32, length int, prof bool) (duration int64, out []int32, err error) { 123 | out = make([]int32, length) 124 | inpos := cursor.New() 125 | outpos := cursor.New() 126 | 127 | if prof { 128 | f, e := os.Create("cpu.uncompress.pprof") 129 | if e != nil { 130 | log.Fatal(e) 131 | } 132 | defer f.Close() 133 | 134 | pprof.StartCPUProfile(f) 135 | } 136 | 137 | now := time.Now() 138 | if err = codec.Uncompress(in, inpos, len(in), out, outpos); err != nil { 139 | return 0, nil, err 140 | } 141 | since := time.Since(now).Nanoseconds() 142 | 143 | if prof { 144 | pprof.StopCPUProfile() 145 | } 146 | 147 | return since, out[:outpos.Get()], nil 148 | } 149 | // RunTestGzip round-trips data through compress/gzip and logs the compressed and recovered sizes with the elapsed time. 150 | func RunTestGzip(data []byte) { 151 | log.Printf("encoding/RunTestGzip: Testing comprssion Gzip\n") 152 | 153 | var compressed bytes.Buffer 154 | w := gzip.NewWriter(&compressed) 155 | now := time.Now() 156 | w.Write(data) 157 | // Close now rather than defer: the stream must be flushed and terminated before it is measured and read back 158 | w.Close() 159 | cl := compressed.Len() 160 | log.Printf("encoding/RunTestGzip: Compressed from %d bytes to %d bytes in %d ns\n", len(data), cl, time.Since(now).Nanoseconds()) 161 | 162 | recovered := make([]byte, len(data)) 163 | r, err := gzip.NewReader(&compressed) 164 | if err != nil { 165 | log.Fatalf("encoding/RunTestGzip: decoding error: %v\n", err) 166 | } 167 | defer r.Close() 168 | total, n := 0, 100 169 | for err != io.EOF && n != 0 { 170 | n, err = r.Read(recovered[total:]) 171 | total += n 172 | } 173 | log.Printf("encoding/RunTestGzip: Uncompressed from %d bytes to %d bytes in %d ns\n", cl, total, time.Since(now).Nanoseconds()) 174 | } 175 | 
176 | func RunTestLZW(data []byte) { 177 | log.Printf("encoding/RunTestLZW: Testing comprssion LZW\n") 178 | 179 | var compressed bytes.Buffer 180 | w := lzw.NewWriter(&compressed, lzw.MSB, 8) 181 | now := time.Now() 182 | w.Write(data) 183 | // Close now rather than defer: pending bits must be flushed before the stream is measured and read back 184 | w.Close() 185 | cl := compressed.Len() 186 | log.Printf("encoding/RunTestLZW: Compressed from %d bytes to %d bytes in %d ns\n", len(data), cl, time.Since(now).Nanoseconds()) 187 | 188 | recovered := make([]byte, len(data)) 189 | r := lzw.NewReader(&compressed, lzw.MSB, 8) 190 | defer r.Close() 191 | 192 | total := 0 193 | n := 100 194 | var err error = nil 195 | for err != io.EOF && n != 0 { 196 | n, err = r.Read(recovered[total:]) 197 | total += n 198 | } 199 | log.Printf("encoding/RunTestLZW: Uncompressed from %d bytes to %d bytes in %d ns\n", cl, total, time.Since(now).Nanoseconds()) 200 | } 201 | 202 | func RunTestSnappy(data []byte) { 203 | log.Printf("encoding/RunTestSnappy: Testing comprssion Snappy\n") 204 | 205 | now := time.Now() 206 | e, err := snappy.Encode(nil, data) 207 | if err != nil { 208 | log.Fatalf("encoding/RunTestSnappy: encoding error: %v\n", err) 209 | } 210 | log.Printf("encoding/RunTestSnappy: Compressed from %d bytes to %d bytes in %d ns\n", len(data), len(e), time.Since(now).Nanoseconds()) 211 | 212 | d, err := snappy.Decode(nil, e) 213 | if err != nil { 214 | log.Fatalf("encoding/RunTestSnappy: decoding error: %v\n", err) 215 | } 216 | log.Printf("encoding/RunTestSnappy: Uncompressed from %d bytes to %d bytes in %d ns\n", len(e), len(d), time.Since(now).Nanoseconds()) 217 | 218 | if !bytes.Equal(data, d) { 219 | log.Fatalf("encoding/RunTestSnappy: roundtrip mismatch\n") 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /bitlen.go: -------------------------------------------------------------------------------- 1 | // +build !gccgo,!amd64,!386,!arm 2 | 3 | // (NOT gccgo) AND (NOT amd64) AND (NOT 386) AND (NOT arm) 4 | package 
encoding 5 | 6 | func bitlen(x uint64) (n int) { 7 | return 32 - int(nlz1a(uint32(x))) 8 | } 9 | -------------------------------------------------------------------------------- /bitlen_386.s: -------------------------------------------------------------------------------- 1 | // +build !gccgo 2 | 3 | // func bitlen(x Word) (n int) 4 | TEXT ·bitlen(SB),4,$0 5 | BSRL x+0(FP), AX 6 | JZ Z1 7 | INCL AX 8 | MOVL AX, n+4(FP) 9 | RET 10 | 11 | Z1: MOVL $0, n+4(FP) 12 | RET 13 | -------------------------------------------------------------------------------- /bitlen_amd64.s: -------------------------------------------------------------------------------- 1 | // +build !gccgo 2 | 3 | // func bitlen(x Word) (n int) 4 | TEXT ·bitlen(SB),4,$0 5 | BSRQ x+0(FP), AX 6 | JZ Z1 7 | ADDQ $1, AX 8 | MOVQ AX, n+8(FP) 9 | RET 10 | 11 | Z1: MOVQ $0, n+8(FP) 12 | RET 13 | -------------------------------------------------------------------------------- /bitlen_arm.s: -------------------------------------------------------------------------------- 1 | // +build !gccgo 2 | 3 | // func bitlen(x Word) (n int) 4 | TEXT ·bitlen(SB),4,$0 5 | MOVW x+0(FP), R0 6 | CLZ R0, R0 7 | MOVW $32, R1 8 | SUB.S R0, R1 9 | MOVW R1, n+4(FP) 10 | RET 11 | -------------------------------------------------------------------------------- /bitlen_decl.go: -------------------------------------------------------------------------------- 1 | // +build !gccgo 2 | // +build amd64 386 arm 3 | 4 | package encoding 5 | 6 | // This is defined in bitlen_{amd64,386,arm}.s (amd64/386 copied from pkg/math/big/arith_{amd64,386}.s) 7 | func bitlen(x uint64) (n int) 8 | -------------------------------------------------------------------------------- /bitlen_gccgo.go: -------------------------------------------------------------------------------- 1 | // +build gccgo 2 | 3 | package encoding 4 | 5 | // this is apparently the old way -> func clz(uint64) uint64 __asm__("__clzdi2") 6 | 7 | //extern __clzdi2 8 | func clz(uint64) uint64 9 | 10 | func 
bitlen(x uint64) (n int) { 11 | if x == 0 { 12 | return 0 13 | } 14 | return 64 - int(clz(x)) 15 | } 16 | -------------------------------------------------------------------------------- /bp32/bp32.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | // Package bp32 is an implementation of the binary packing integer compression 8 | // algorithm in in Go (also known as PackedBinary) using 32-integer blocks. 9 | // It is mostly suitable for arrays containing small positive integers. 10 | // Given a list of sorted integers, you should first compute the successive 11 | // differences prior to compression. 12 | // For details, please see 13 | // Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second 14 | // through vectorization Software: Practice & Experience 15 | // http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract or 16 | // http://arxiv.org/abs/1209.2137 17 | package bp32 18 | 19 | import ( 20 | "errors" 21 | 22 | "github.com/dataence/encoding" 23 | "github.com/dataence/encoding/bitpacking" 24 | "github.com/dataence/encoding/cursor" 25 | ) 26 | 27 | const ( 28 | DefaultBlockSize = 128 29 | ) 30 | 31 | type BP32 struct { 32 | } 33 | 34 | var _ encoding.Integer = (*BP32)(nil) 35 | 36 | func New() encoding.Integer { 37 | return &BP32{} 38 | } 39 | 40 | func (this *BP32) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 41 | 42 | inlength = encoding.FloorBy(inlength, DefaultBlockSize) 43 | 44 | if inlength == 0 { 45 | return errors.New("BP32/Compress: block size less than 128. 
No work done.") 46 | } 47 | 48 | out[outpos.Get()] = int32(inlength) 49 | outpos.Increment() 50 | 51 | tmpoutpos := outpos.Get() 52 | s := inpos.Get() 53 | finalinpos := s + inlength 54 | 55 | for ; s < finalinpos; s += DefaultBlockSize { 56 | mbits1 := encoding.MaxBits(in[s : s+32]) 57 | mbits2 := encoding.MaxBits(in[s+32 : s+2*32]) 58 | mbits3 := encoding.MaxBits(in[s+2*32 : s+3*32]) 59 | mbits4 := encoding.MaxBits(in[s+3*32 : s+4*32]) 60 | 61 | out[tmpoutpos] = (mbits1 << 24) | (mbits2 << 16) | (mbits3 << 8) | mbits4 62 | tmpoutpos += 1 63 | bitpacking.FastPackWithoutMask(in, s, out, tmpoutpos, int(mbits1)) 64 | tmpoutpos += int(mbits1) 65 | bitpacking.FastPackWithoutMask(in, s+32, out, tmpoutpos, int(mbits2)) 66 | tmpoutpos += int(mbits2) 67 | bitpacking.FastPackWithoutMask(in, s+2*32, out, tmpoutpos, int(mbits3)) 68 | tmpoutpos += int(mbits3) 69 | bitpacking.FastPackWithoutMask(in, s+3*32, out, tmpoutpos, int(mbits4)) 70 | tmpoutpos += int(mbits4) 71 | } 72 | 73 | inpos.Add(inlength) 74 | outpos.Set(tmpoutpos) 75 | 76 | return nil 77 | } 78 | 79 | func (this *BP32) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 80 | if inlength == 0 { 81 | return errors.New("BP32/Uncompress: Length is 0. 
No work done.") 82 | } 83 | 84 | outlength := int(in[inpos.Get()]) 85 | inpos.Increment() 86 | 87 | tmpinpos := inpos.Get() 88 | 89 | for s := outpos.Get(); s < outpos.Get()+outlength; s += 32 * 4 { 90 | tmp := in[tmpinpos] 91 | mbits1 := tmp >> 24 92 | mbits2 := (tmp >> 16) & 0xFF 93 | mbits3 := (tmp >> 8) & 0xFF 94 | mbits4 := (tmp) & 0xFF 95 | 96 | tmpinpos += 1 97 | 98 | bitpacking.FastUnpack(in, tmpinpos, out, s, int(mbits1)) 99 | tmpinpos += int(mbits1) 100 | 101 | bitpacking.FastUnpack(in, tmpinpos, out, s+32, int(mbits2)) 102 | tmpinpos += int(mbits2) 103 | 104 | bitpacking.FastUnpack(in, tmpinpos, out, s+2*32, int(mbits3)) 105 | tmpinpos += int(mbits3) 106 | 107 | bitpacking.FastUnpack(in, tmpinpos, out, s+3*32, int(mbits4)) 108 | tmpinpos += int(mbits4) 109 | } 110 | 111 | outpos.Add(outlength) 112 | inpos.Set(tmpinpos) 113 | 114 | return nil 115 | } 116 | -------------------------------------------------------------------------------- /bp32/bp32_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 
4 | * 5 | */ 6 | 7 | package bp32 8 | 9 | import ( 10 | "log" 11 | "testing" 12 | 13 | "github.com/dataence/encoding/benchtools" 14 | "github.com/dataence/encoding/cursor" 15 | "github.com/dataence/encoding/generators" 16 | ) 17 | 18 | var ( 19 | data []int32 20 | size int = 128000 21 | ) 22 | 23 | func init() { 24 | log.Printf("bp32/init: generating %d int32s\n", size) 25 | data = generators.GenerateClustered(size, size*2) 26 | log.Printf("bp32/init: generated %d integers for test", size) 27 | } 28 | 29 | func TestCodec(t *testing.T) { 30 | sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000} 31 | benchtools.TestCodec(New(), data, sizes) 32 | } 33 | 34 | // go test -bench=Decode 35 | func BenchmarkDecode(b *testing.B) { 36 | b.StopTimer() 37 | length := 128 * 1024 38 | data := generators.GenerateClustered(length, 1<<24) 39 | compdata := make([]int32, 2*length) 40 | recov := make([]int32, length) 41 | inpos := cursor.New() 42 | outpos := cursor.New() 43 | codec := New() 44 | codec.Compress(data, inpos, len(data), compdata, outpos) 45 | b.StartTimer() 46 | for j := 0; j < b.N; j++ { 47 | newinpos := cursor.New() 48 | newoutpos := cursor.New() 49 | codec.Uncompress(compdata, newinpos, outpos.Get()-newinpos.Get(), recov, newoutpos) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /composition/composition.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 
4 | * 5 | */ 6 | 7 | package composition 8 | 9 | import ( 10 | "errors" 11 | 12 | "github.com/dataence/encoding" 13 | "github.com/dataence/encoding/cursor" 14 | ) 15 | 16 | type Composition struct { 17 | f1 encoding.Integer 18 | f2 encoding.Integer 19 | } 20 | 21 | var _ encoding.Integer = (*Composition)(nil) 22 | 23 | func New(f1 encoding.Integer, f2 encoding.Integer) encoding.Integer { 24 | return &Composition{ 25 | f1: f1, 26 | f2: f2, 27 | } 28 | } 29 | 30 | func (this *Composition) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 31 | if inlength == 0 { 32 | return errors.New("composition/Compress: inlength = 0. No work done.") 33 | } 34 | 35 | init := inpos.Get() 36 | this.f1.Compress(in, inpos, inlength, out, outpos) 37 | if outpos.Get() == 0 { 38 | out[0] = 0 39 | outpos.Increment() 40 | } 41 | //log.Printf("composition/Compress: f1 inpos = %d, outpos = %d, inlength = %d\n", inpos.Get(), outpos.Get(), inlength) 42 | 43 | inlength -= inpos.Get() - init 44 | this.f2.Compress(in, inpos, inlength, out, outpos) 45 | //log.Printf("composition/Compress: f2 inpos = %d, outpos = %d, inlength = %d\n", inpos.Get(), outpos.Get(), inlength) 46 | 47 | return nil 48 | } 49 | 50 | func (this *Composition) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 51 | if inlength == 0 { 52 | return errors.New("composition/Uncompress: inlength = 0. 
No work done.") 53 | } 54 | 55 | init := inpos.Get() 56 | this.f1.Uncompress(in, inpos, inlength, out, outpos) 57 | //log.Printf("composition/Uncompress: f1 inpos = %d, outpos = %d, inlength = %d\n", inpos.Get(), outpos.Get(), inlength) 58 | inlength -= inpos.Get() - init 59 | this.f2.Uncompress(in, inpos, inlength, out, outpos) 60 | //log.Printf("composition/Uncompress: f2 inpos = %d, outpos = %d, inlength = %d\n", inpos.Get(), outpos.Get(), inlength) 61 | 62 | return nil 63 | } 64 | -------------------------------------------------------------------------------- /composition/composition_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package composition 8 | 9 | import ( 10 | "log" 11 | "testing" 12 | 13 | "github.com/dataence/encoding" 14 | "github.com/dataence/encoding/benchtools" 15 | "github.com/dataence/encoding/bp32" 16 | dbp32 "github.com/dataence/encoding/delta/bp32" 17 | dvb "github.com/dataence/encoding/delta/variablebyte" 18 | "github.com/dataence/encoding/generators" 19 | "github.com/dataence/encoding/variablebyte" 20 | ) 21 | 22 | var ( 23 | codec encoding.Integer 24 | data []int32 25 | size int = 10000000 26 | ) 27 | 28 | func init() { 29 | log.Printf("composition_test/init: generating %d uint32s\n", size) 30 | data = generators.GenerateClustered(size, size*2) 31 | log.Printf("composition_test/init: generated %d integers for test", size) 32 | } 33 | 34 | func TestDeltaBP32andDeltaVariableByte(t *testing.T) { 35 | sizes := []int{100, 100 * 10, 100 * 100, 100 * 1000, 100 * 10000} 36 | benchtools.TestCodec(New(dbp32.New(), dvb.New()), data, sizes) 37 | } 38 | 39 | func TestBP32andVariableByte(t *testing.T) { 40 | sizes := []int{100, 100 * 10, 100 * 100, 100 * 1000, 100 * 10000} 41 | benchtools.TestCodec(New(bp32.New(), variablebyte.New()), data, 
// Cursor is a mutable integer position that the codecs share by pointer.
type Cursor struct {
	value int
}

// New returns a Cursor positioned at 0.
func New() *Cursor {
	return &Cursor{}
}

// Get returns the current position.
func (c *Cursor) Get() int {
	return c.value
}

// Set moves the cursor to position i.
func (c *Cursor) Set(i int) {
	c.value = i
}

// Add moves the cursor by i; i may be negative.
func (c *Cursor) Add(i int) {
	c.value = c.value + i
}

// Increment moves the cursor forward by one.
func (c *Cursor) Increment() {
	c.value++
}
No work done.") 38 | } 39 | 40 | //log.Printf("bp32/Compress: after inlength = %d, len(in) = %d\n", inlength, len(in)) 41 | 42 | out[outpos.Get()] = int32(inlength) 43 | outpos.Increment() 44 | 45 | tmpoutpos := outpos.Get() 46 | initoffset := int32(0) 47 | s := inpos.Get() 48 | finalinpos := s + inlength 49 | 50 | for ; s < finalinpos; s += DefaultBlockSize { 51 | mbits1 := encoding.DeltaMaxBits(initoffset, in[s:s+32]) 52 | initoffset2 := in[s+31] 53 | mbits2 := encoding.DeltaMaxBits(initoffset2, in[s+32:s+2*32]) 54 | initoffset3 := in[s+32+31] 55 | mbits3 := encoding.DeltaMaxBits(initoffset3, in[s+2*32:s+3*32]) 56 | initoffset4 := in[s+2*32+31] 57 | mbits4 := encoding.DeltaMaxBits(initoffset4, in[s+3*32:s+4*32]) 58 | 59 | //log.Printf("bp32/Compress: tmpoutpos = %d, s = %d\n", tmpoutpos, s) 60 | 61 | out[tmpoutpos] = (mbits1 << 24) | (mbits2 << 16) | (mbits3 << 8) | mbits4 62 | tmpoutpos += 1 63 | 64 | //log.Printf("bp32/Compress: mbits1 = %d, mbits2 = %d, mbits3 = %d, mbits4 = %d, s = %d\n", mbits1, mbits2, mbits3, mbits4, out[tmpoutpos-1]) 65 | 66 | bitpacking.DeltaPack(initoffset, in, s, out, tmpoutpos, int(mbits1)) 67 | //encoding.PrintUint32sInBits(in, s, 32) 68 | //encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits1)) 69 | tmpoutpos += int(mbits1) 70 | 71 | bitpacking.DeltaPack(initoffset2, in, s+32, out, tmpoutpos, int(mbits2)) 72 | //encoding.PrintUint32sInBits(in, s+32, 32) 73 | //encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits2)) 74 | tmpoutpos += int(mbits2) 75 | 76 | bitpacking.DeltaPack(initoffset3, in, s+2*32, out, tmpoutpos, int(mbits3)) 77 | //encoding.PrintUint32sInBits(in, s+2*32, 32) 78 | //encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits3)) 79 | tmpoutpos += int(mbits3) 80 | 81 | bitpacking.DeltaPack(initoffset4, in, s+3*32, out, tmpoutpos, int(mbits4)) 82 | //encoding.PrintUint32sInBits(in, s+3*32, 32) 83 | //encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits4)) 84 | tmpoutpos += int(mbits4) 85 | 86 | initoffset = 
in[s+3*32+31] 87 | } 88 | 89 | inpos.Add(inlength) 90 | outpos.Set(tmpoutpos) 91 | 92 | return nil 93 | } 94 | 95 | func (this *BP32) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 96 | if inlength == 0 { 97 | return errors.New("BP32/Uncompress: Length is 0. No work done.") 98 | } 99 | 100 | outlength := in[inpos.Get()] 101 | inpos.Increment() 102 | 103 | tmpinpos := inpos.Get() 104 | initoffset := int32(0) 105 | 106 | //log.Printf("bp32/Uncompress: outlength = %d, inpos = %d, outpos = %d\n", outlength, inpos.Get(), outpos.Get()) 107 | for s := outpos.Get(); s < outpos.Get()+int(outlength); s += 32 * 4 { 108 | tmp := in[tmpinpos] 109 | mbits1 := tmp >> 24 110 | mbits2 := (tmp >> 16) & 0xFF 111 | mbits3 := (tmp >> 8) & 0xFF 112 | mbits4 := (tmp) & 0xFF 113 | 114 | //log.Printf("bp32/Uncopmress: mbits1 = %d, mbits2 = %d, mbits3 = %d, mbits4 = %d, s = %d\n", mbits1, mbits2, mbits3, mbits4, s) 115 | tmpinpos += 1 116 | 117 | bitpacking.DeltaUnpack(initoffset, in, tmpinpos, out, s, int(mbits1)) 118 | tmpinpos += int(mbits1) 119 | initoffset = out[s+31] 120 | //log.Printf("bp32/Uncompress: out = %v\n", out) 121 | 122 | bitpacking.DeltaUnpack(initoffset, in, tmpinpos, out, s+32, int(mbits2)) 123 | tmpinpos += int(mbits2) 124 | initoffset = out[s+32+31] 125 | //log.Printf("bp32/Uncompress: out = %v\n", out) 126 | 127 | bitpacking.DeltaUnpack(initoffset, in, tmpinpos, out, s+2*32, int(mbits3)) 128 | tmpinpos += int(mbits3) 129 | initoffset = out[s+2*32+31] 130 | //log.Printf("bp32/Uncompress: out = %v\n", out) 131 | 132 | bitpacking.DeltaUnpack(initoffset, in, tmpinpos, out, s+3*32, int(mbits4)) 133 | tmpinpos += int(mbits4) 134 | initoffset = out[s+3*32+31] 135 | //log.Printf("bp32/Uncompress: out = %v\n", out) 136 | } 137 | 138 | outpos.Add(int(outlength)) 139 | inpos.Set(tmpinpos) 140 | 141 | return nil 142 | } 143 | -------------------------------------------------------------------------------- 
/delta/bp32/bp32_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package bp32 8 | 9 | import ( 10 | "log" 11 | "testing" 12 | 13 | "github.com/dataence/encoding/benchtools" 14 | "github.com/dataence/encoding/generators" 15 | ) 16 | 17 | var ( 18 | data []int32 19 | size int = 12800000 20 | ) 21 | 22 | func init() { 23 | log.Printf("bp32/init: generating %d int32s\n", size) 24 | data = generators.GenerateClustered(size, size*2) 25 | log.Printf("bp32/init: generated %d integers for test", size) 26 | } 27 | 28 | func TestCodec(t *testing.T) { 29 | sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000, 128 * 10000} 30 | benchtools.TestCodec(New(), data, sizes) 31 | } 32 | -------------------------------------------------------------------------------- /delta/fastpfor/fastpfor.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 
4 | * 5 | */ 6 | 7 | package fastpfor 8 | 9 | import ( 10 | "errors" 11 | "math" 12 | 13 | "github.com/dataence/bytebuffer" 14 | "github.com/dataence/encoding" 15 | "github.com/dataence/encoding/bitpacking" 16 | "github.com/dataence/encoding/cursor" 17 | ) 18 | 19 | const ( 20 | DefaultBlockSize = 128 21 | OverheadOfEachExcept = 8 22 | DefaultPageSize = 65536 23 | ) 24 | 25 | var ( 26 | zeroDataPointers []int32 27 | zeroFreqs []int32 28 | ) 29 | 30 | func init() { 31 | zeroDataPointers = make([]int32, 33) 32 | zeroFreqs = make([]int32, 33) 33 | } 34 | 35 | type FastPFOR struct { 36 | dataToBePacked [33][]int32 37 | byteContainer *bytebuffer.ByteBuffer 38 | pageSize int32 39 | 40 | // Working area 41 | dataPointers []int32 42 | freqs []int32 43 | } 44 | 45 | var _ encoding.Integer = (*FastPFOR)(nil) 46 | 47 | func New() encoding.Integer { 48 | f := &FastPFOR{ 49 | pageSize: DefaultPageSize, 50 | byteContainer: bytebuffer.NewByteBuffer(3*DefaultPageSize/DefaultBlockSize + DefaultPageSize), 51 | dataPointers: make([]int32, 33), 52 | freqs: make([]int32, 33), 53 | } 54 | 55 | for i := 1; i < 33; i++ { 56 | f.dataToBePacked[i] = make([]int32, DefaultPageSize/32*4) 57 | } 58 | 59 | return f 60 | } 61 | 62 | func (this *FastPFOR) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 63 | inlength = encoding.FloorBy(inlength, DefaultBlockSize) 64 | 65 | if inlength == 0 { 66 | return errors.New("fastpfor/Compress: inlength = 0. 
No work done.") 67 | } 68 | 69 | out[outpos.Get()] = int32(inlength) 70 | outpos.Increment() 71 | 72 | initoffset := cursor.New() 73 | 74 | copy(this.dataPointers, zeroDataPointers) 75 | copy(this.freqs, zeroFreqs) 76 | 77 | finalInpos := inpos.Get() + inlength 78 | 79 | for inpos.Get() != finalInpos { 80 | thissize := int(math.Min(float64(this.pageSize), float64(finalInpos-inpos.Get()))) 81 | 82 | if err := this.encodePage(in, inpos, thissize, out, outpos, initoffset); err != nil { 83 | return errors.New("fastpfor/Compress: " + err.Error()) 84 | } 85 | } 86 | 87 | return nil 88 | } 89 | 90 | func (this *FastPFOR) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 91 | if inlength == 0 { 92 | return errors.New("fastpfor/Uncompress: inlength = 0. No work done.") 93 | } 94 | 95 | mynvalue := in[inpos.Get()] 96 | inpos.Increment() 97 | 98 | initoffset := cursor.New() 99 | 100 | copy(this.dataPointers, zeroDataPointers) 101 | 102 | finalout := outpos.Get() + int(mynvalue) 103 | for outpos.Get() != finalout { 104 | thissize := int(math.Min(float64(this.pageSize), float64(finalout-outpos.Get()))) 105 | 106 | if err := this.decodePage(in, inpos, out, outpos, thissize, initoffset); err != nil { 107 | return errors.New("fastpfor/Uncompress: " + err.Error()) 108 | } 109 | } 110 | return nil 111 | } 112 | 113 | // getBestBFromData determins the best bit position with the best cost of exceptions, 114 | // and the max bit position of the array of int32s 115 | func (this *FastPFOR) getBestBFromData(in []int32) (bestb int32, bestc int32, maxb int32) { 116 | copy(this.freqs, zeroFreqs) 117 | 118 | // Get the count of all the leading bit positions for the slice 119 | // Mainly to figure out what's the best (most popular) bit position 120 | for _, v := range in { 121 | this.freqs[encoding.LeadingBitPosition(uint32(v))]++ 122 | } 123 | //encoding.FastLeadingBitFrequency128(in, this.freqs) 124 | 125 | bestb = 32 126 | 127 | for 
this.freqs[bestb] == 0 { 128 | bestb-- 129 | } 130 | 131 | maxb = bestb 132 | bestCost := bestb * DefaultBlockSize 133 | var cexcept int32 134 | bestc = cexcept 135 | 136 | // Find the cost of storing exceptions for each bit position 137 | for b := bestb - 1; b >= 0; b-- { 138 | cexcept += this.freqs[b+1] 139 | if cexcept < 0 { 140 | break 141 | } 142 | 143 | // the extra 8 is the cost of storing maxbits 144 | thisCost := cexcept*OverheadOfEachExcept + cexcept*(maxb-b) + b*DefaultBlockSize + 8 145 | 146 | if thisCost < bestCost { 147 | bestCost = thisCost 148 | bestb = b 149 | bestc = cexcept 150 | } 151 | } 152 | 153 | return 154 | } 155 | 156 | func (this *FastPFOR) encodePage(in []int32, inpos *cursor.Cursor, thissize int, out []int32, outpos *cursor.Cursor, initoffset *cursor.Cursor) error { 157 | headerpos := int32(outpos.Get()) 158 | outpos.Increment() 159 | tmpoutpos := int32(outpos.Get()) 160 | 161 | // Clear working area 162 | copy(this.dataPointers, zeroDataPointers) 163 | this.byteContainer.Clear() 164 | 165 | tmpinpos := int32(inpos.Get()) 166 | var delta [DefaultBlockSize]int32 167 | 168 | for finalInpos := tmpinpos + int32(thissize) - DefaultBlockSize; tmpinpos <= finalInpos; tmpinpos += DefaultBlockSize { 169 | 170 | // Calculate the deltas, inlining to gain a bit of performance 171 | offset := int32(initoffset.Get()) 172 | for i, v := range in[tmpinpos : tmpinpos+DefaultBlockSize] { 173 | delta[i] = v - offset 174 | offset = v 175 | } 176 | 177 | initoffset.Set(int(in[tmpinpos+DefaultBlockSize-1])) 178 | 179 | bestb, bestc, maxb := this.getBestBFromData(delta[:]) 180 | tmpbestb := bestb 181 | this.byteContainer.Put(byte(bestb)) 182 | this.byteContainer.Put(byte(bestc)) 183 | 184 | if bestc > 0 { 185 | this.byteContainer.Put(byte(maxb)) 186 | index := maxb - bestb 187 | if int(this.dataPointers[index]+bestc) >= len(this.dataToBePacked[index]) { 188 | newSize := int(2 * (this.dataPointers[index] + bestc)) 189 | 190 | // make sure it is a multiple of 
32. 191 | // there might be a better way to do this 192 | newSize = encoding.CeilBy(newSize, 32) 193 | newSlice := make([]int32, newSize) 194 | copy(newSlice, this.dataToBePacked[index]) 195 | this.dataToBePacked[index] = newSlice 196 | } 197 | 198 | for k := int32(0); k < DefaultBlockSize; k++ { 199 | if uint32(delta[k])>>uint(bestb) != 0 { 200 | // we have an exception 201 | this.byteContainer.Put(byte(k)) 202 | this.dataToBePacked[index][this.dataPointers[index]] = int32(uint32(delta[k]) >> uint(tmpbestb)) 203 | this.dataPointers[index] += 1 204 | } 205 | } 206 | } 207 | 208 | for k := int32(0); k < 128; k += 32 { 209 | bitpacking.FastPack(delta[:], int(k), out, int(tmpoutpos), int(tmpbestb)) 210 | tmpoutpos += tmpbestb 211 | } 212 | } 213 | 214 | inpos.Set(int(tmpinpos)) 215 | out[headerpos] = tmpoutpos - headerpos 216 | 217 | for this.byteContainer.Position()&3 != 0 { 218 | this.byteContainer.Put(0) 219 | } 220 | 221 | bytesize := int32(this.byteContainer.Position()) 222 | out[tmpoutpos] = bytesize 223 | tmpoutpos += 1 224 | howmanyints := bytesize / 4 225 | 226 | this.byteContainer.Flip() 227 | this.byteContainer.AsInt32Buffer().GetInt32s(out, int(tmpoutpos), int(howmanyints)) 228 | tmpoutpos += howmanyints 229 | 230 | bitmap := int32(0) 231 | for k := 1; k <= 32; k++ { 232 | v := this.dataPointers[k] 233 | if v != 0 { 234 | bitmap |= (1 << uint(k-1)) 235 | } 236 | } 237 | 238 | out[tmpoutpos] = bitmap 239 | tmpoutpos += 1 240 | 241 | for k := 1; k < 33; k++ { 242 | v := this.dataPointers[k] 243 | if v != 0 { 244 | out[tmpoutpos] = v // size 245 | tmpoutpos += 1 246 | for j := 0; j < int(v); j += 32 { 247 | bitpacking.FastPack(this.dataToBePacked[k], j, out, int(tmpoutpos), k) 248 | tmpoutpos += int32(k) 249 | } 250 | } 251 | } 252 | 253 | outpos.Set(int(tmpoutpos)) 254 | 255 | return nil 256 | } 257 | 258 | func (this *FastPFOR) decodePage(in []int32, inpos *cursor.Cursor, out []int32, outpos *cursor.Cursor, thissize int, initoffset *cursor.Cursor) error { 
259 | initpos := int32(inpos.Get()) 260 | wheremeta := in[initpos] 261 | inpos.Increment() 262 | 263 | inexcept := initpos + wheremeta 264 | bytesize := in[inexcept] 265 | inexcept += 1 266 | 267 | this.byteContainer.Clear() 268 | if err := this.byteContainer.AsInt32Buffer().PutInt32s(in, int(inexcept), int(bytesize/4)); err != nil { 269 | return err 270 | } 271 | 272 | inexcept += bytesize / 4 273 | bitmap := in[inexcept] 274 | inexcept += 1 275 | 276 | for k := int32(1); k < 33; k++ { 277 | if bitmap&(1< 0 { 319 | var maxbits int32 320 | if maxbits, err = this.byteContainer.GetAsInt32(); err != nil { 321 | return err 322 | } 323 | 324 | index := maxbits - bestb 325 | 326 | for k := int32(0); k < cexcept; k++ { 327 | var pos int32 328 | if pos, err = this.byteContainer.GetAsInt32(); err != nil { 329 | return err 330 | } 331 | 332 | exceptvalue := this.dataToBePacked[index][this.dataPointers[index]] 333 | this.dataPointers[index] += 1 334 | //out[pos + tmpoutpos] |= exceptvalue << uint(bestb) 335 | delta[pos] |= exceptvalue << uint(bestb) 336 | } 337 | } 338 | 339 | // Calculate the original from the deltas, inlining to gain a bit of performance 340 | offset := int32(initoffset.Get()) 341 | for i, v := range delta { 342 | out[int(tmpoutpos)+i] = v + offset 343 | offset += v 344 | } 345 | 346 | initoffset.Set(int(out[tmpoutpos+DefaultBlockSize-1])) 347 | 348 | run += 1 349 | tmpoutpos += DefaultBlockSize 350 | } 351 | 352 | outpos.Set(int(tmpoutpos)) 353 | inpos.Set(int(inexcept)) 354 | 355 | return nil 356 | } 357 | -------------------------------------------------------------------------------- /delta/fastpfor/fastpfor_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 
4 | * 5 | */ 6 | 7 | package fastpfor 8 | 9 | import ( 10 | "log" 11 | "testing" 12 | 13 | "github.com/dataence/encoding/benchtools" 14 | "github.com/dataence/encoding/generators" 15 | ) 16 | 17 | var ( 18 | data []int32 19 | size int = 12800000 20 | ) 21 | 22 | func init() { 23 | log.Printf("bp32/init: generating %d int32s\n", size) 24 | data = generators.GenerateClustered(size, size*2) 25 | log.Printf("bp32/init: generated %d integers for test", size) 26 | } 27 | 28 | func TestCodec(t *testing.T) { 29 | sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000, 128 * 10000} 30 | benchtools.TestCodec(New(), data, sizes) 31 | } 32 | -------------------------------------------------------------------------------- /delta/variablebyte/variablebyte.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package variablebyte 8 | 9 | import ( 10 | "errors" 11 | 12 | "github.com/dataence/bytebuffer" 13 | "github.com/dataence/encoding" 14 | "github.com/dataence/encoding/cursor" 15 | ) 16 | 17 | type VariableByte struct { 18 | } 19 | 20 | var _ encoding.Integer = (*VariableByte)(nil) 21 | 22 | func New() encoding.Integer { 23 | return &VariableByte{} 24 | } 25 | 26 | func (this *VariableByte) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 27 | if inlength == 0 { 28 | return errors.New("variablebyte/Compress: inlength = 0. 
No work done.") 29 | } 30 | 31 | //fmt.Printf("variablebyte/Compress: after inlength = %d\n", inlength) 32 | 33 | buf := bytebuffer.NewByteBuffer(inlength * 8) 34 | initoffset := int32(0) 35 | 36 | tmpinpos := inpos.Get() 37 | for _, v := range in[tmpinpos : tmpinpos+inlength] { 38 | val := uint32(v - initoffset) 39 | initoffset = v 40 | 41 | for val >= 0x80 { 42 | buf.Put(byte(val) | 0x80) 43 | val >>= 7 44 | } 45 | buf.Put(byte(val)) 46 | } 47 | 48 | for buf.Position()%4 != 0 { 49 | //fmt.Printf("variablebyte/Compress: putting 128\n") 50 | buf.Put(128) 51 | } 52 | 53 | length := buf.Position() 54 | buf.Flip() 55 | ibuf := buf.AsInt32Buffer() 56 | //fmt.Printf("variablebyte/Compress: l = %d, outpos = %d, ibuf = %v, buf = %v\n", length/4, outpos.Get(), ibuf, buf) 57 | err := ibuf.GetInt32s(out, outpos.Get(), length/4) 58 | if err != nil { 59 | //fmt.Printf("variablebyte/Compress: error with GetUint32s - %v\n", err) 60 | return err 61 | } 62 | outpos.Add(length / 4) 63 | inpos.Add(inlength) 64 | //fmt.Printf("variablebyte/Compress: out = %v\n", out) 65 | 66 | return nil 67 | } 68 | 69 | func (this *VariableByte) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 70 | if inlength == 0 { 71 | return errors.New("variablebyte/Uncompress: inlength = 0. 
No work done.") 72 | } 73 | 74 | //fmt.Printf("variablebyte/Uncompress: after inlength = %d\n", inlength) 75 | 76 | s := uint(0) 77 | p := inpos.Get() 78 | finalp := inpos.Get() + inlength 79 | tmpoutpos := outpos.Get() 80 | initoffset := int32(0) 81 | v := int32(0) 82 | shift := uint(0) 83 | 84 | for p < finalp { 85 | c := in[p] >> (24 - s) 86 | s += 8 87 | 88 | if s == 32 { 89 | s = 0 90 | p += 1 91 | } 92 | 93 | v += ((c & 127) << shift) 94 | if c&128 == 0 { 95 | out[tmpoutpos] = v + initoffset 96 | initoffset = out[tmpoutpos] 97 | tmpoutpos += 1 98 | v = 0 99 | shift = 0 100 | } else { 101 | shift += 7 102 | } 103 | 104 | outpos.Set(tmpoutpos) 105 | inpos.Add(inlength) 106 | } 107 | 108 | return nil 109 | } 110 | -------------------------------------------------------------------------------- /delta/variablebyte/variablebyte_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package variablebyte 8 | 9 | import ( 10 | "log" 11 | "testing" 12 | 13 | "github.com/dataence/encoding/benchtools" 14 | "github.com/dataence/encoding/generators" 15 | ) 16 | 17 | var ( 18 | data []int32 19 | size int = 12800000 20 | ) 21 | 22 | func init() { 23 | log.Printf("bp32/init: generating %d int32s\n", size) 24 | data = generators.GenerateClustered(size, size*2) 25 | log.Printf("bp32/init: generated %d integers for test", size) 26 | } 27 | 28 | func TestCodec(t *testing.T) { 29 | sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000, 128 * 10000} 30 | benchtools.TestCodec(New(), data, sizes) 31 | } 32 | -------------------------------------------------------------------------------- /fastpfor/fastpfor.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 
3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | // Package fastpfor is an implementation of the fastpfor integer compression 8 | // algorithm in in Go. 9 | // It is mostly suitable for arrays containing small positive integers. 10 | // Given a list of sorted integers, you should first compute the successive 11 | // differences prior to compression. 12 | // For details, please see 13 | // Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second 14 | // through vectorization Software: Practice & Experience 15 | // http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract or 16 | // http://arxiv.org/abs/1209.2137 17 | package fastpfor 18 | 19 | import ( 20 | "errors" 21 | "math" 22 | 23 | "github.com/dataence/bytebuffer" 24 | "github.com/dataence/encoding" 25 | "github.com/dataence/encoding/bitpacking" 26 | "github.com/dataence/encoding/cursor" 27 | ) 28 | 29 | const ( 30 | DefaultBlockSize = 128 31 | OverheadOfEachExcept = 8 32 | DefaultPageSize = 65536 33 | ) 34 | 35 | var ( 36 | zeroDataPointers []int32 37 | zeroFreqs []int32 38 | ) 39 | 40 | func init() { 41 | zeroDataPointers = make([]int32, 33) 42 | zeroFreqs = make([]int32, 33) 43 | } 44 | 45 | // FastPFOR codec structure: this is not thread-safe (need one per thread) 46 | type FastPFOR struct { 47 | dataToBePacked [33][]int32 48 | byteContainer *bytebuffer.ByteBuffer 49 | pageSize int32 50 | 51 | // Working area 52 | dataPointers []int32 53 | freqs []int32 54 | } 55 | 56 | var _ encoding.Integer = (*FastPFOR)(nil) 57 | 58 | func New() encoding.Integer { 59 | f := &FastPFOR{ 60 | pageSize: DefaultPageSize, 61 | byteContainer: bytebuffer.NewByteBuffer(3*DefaultPageSize/DefaultBlockSize + DefaultPageSize), 62 | dataPointers: make([]int32, 33), 63 | freqs: make([]int32, 33), 64 | } 65 | 66 | for i := 1; i < 33; i++ { 67 | f.dataToBePacked[i] = make([]int32, DefaultPageSize/32*4) 68 | } 69 | 70 | return f 71 | } 72 | 73 | func (this *FastPFOR) 
Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 74 | inlength = encoding.FloorBy(inlength, DefaultBlockSize) 75 | if inlength == 0 { 76 | return errors.New("fastpfor/Compress: inlength = 0. No work done.") 77 | } 78 | out[outpos.Get()] = int32(inlength) 79 | outpos.Increment() 80 | 81 | copy(this.dataPointers, zeroDataPointers) 82 | copy(this.freqs, zeroFreqs) 83 | 84 | finalInpos := inpos.Get() + inlength 85 | 86 | for inpos.Get() != finalInpos { 87 | thissize := int(math.Min(float64(this.pageSize), float64(finalInpos-inpos.Get()))) 88 | if err := this.encodePage(in, inpos, thissize, out, outpos); err != nil { 89 | return errors.New("fastpfor/Compress: " + err.Error()) 90 | } 91 | } 92 | 93 | return nil 94 | } 95 | 96 | func (this *FastPFOR) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 97 | if inlength == 0 { 98 | return errors.New("fastpfor/Uncompress: inlength = 0. No work done.") 99 | } 100 | 101 | mynvalue := in[inpos.Get()] 102 | inpos.Increment() 103 | 104 | copy(this.dataPointers, zeroDataPointers) 105 | 106 | finalout := outpos.Get() + int(mynvalue) 107 | for outpos.Get() != finalout { 108 | thissize := int(math.Min(float64(this.pageSize), float64(finalout-outpos.Get()))) 109 | if err := this.decodePage(in, inpos, out, outpos, thissize); err != nil { 110 | return errors.New("fastpfor/Uncompress: " + err.Error()) 111 | } 112 | } 113 | return nil 114 | } 115 | 116 | // getBestBFromData determins the best bit position with the best cost of exceptions, 117 | // and the max bit position of the array of int32s 118 | func (this *FastPFOR) getBestBFromData(in []int32) (bestb int32, bestc int32, maxb int32) { 119 | copy(this.freqs, zeroFreqs) 120 | // Get the count of all the leading bit positionsfor the slice 121 | // Mainly to figure out what's the best (most popular) bit position 122 | //for _, v := range in[k:kEnd] { 123 | for _, v := range in { 124 | 
this.freqs[encoding.LeadingBitPosition(uint32(v))]++ 125 | } 126 | bestb = 32 127 | for this.freqs[bestb] == 0 { 128 | bestb-- 129 | } 130 | maxb = bestb 131 | bestCost := bestb * DefaultBlockSize 132 | var cexcept int32 133 | bestc = cexcept 134 | // Find the cost of storing exceptions for each bit position 135 | for b := bestb - 1; b >= 0; b-- { 136 | cexcept += this.freqs[b+1] 137 | if cexcept < 0 { 138 | break 139 | } 140 | // the extra 8 is the cost of storing maxbits 141 | thisCost := cexcept*OverheadOfEachExcept + cexcept*(maxb-b) + b*DefaultBlockSize + 8 142 | if thisCost < bestCost { 143 | bestCost = thisCost 144 | bestb = b 145 | bestc = cexcept 146 | } 147 | } 148 | return 149 | } 150 | 151 | func (this *FastPFOR) encodePage(in []int32, inpos *cursor.Cursor, thissize int, out []int32, outpos *cursor.Cursor) error { 152 | headerpos := int32(outpos.Get()) 153 | outpos.Increment() 154 | tmpoutpos := int32(outpos.Get()) 155 | 156 | // Clear working area 157 | copy(this.dataPointers, zeroDataPointers) 158 | this.byteContainer.Clear() 159 | 160 | tmpinpos := int32(inpos.Get()) 161 | 162 | for finalInpos := tmpinpos + int32(thissize) - DefaultBlockSize; tmpinpos <= finalInpos; tmpinpos += DefaultBlockSize { 163 | bestb, bestc, maxb := this.getBestBFromData(in[tmpinpos : tmpinpos+DefaultBlockSize]) 164 | tmpbestb := bestb 165 | this.byteContainer.Put(byte(bestb)) 166 | this.byteContainer.Put(byte(bestc)) 167 | 168 | if bestc > 0 { 169 | this.byteContainer.Put(byte(maxb)) 170 | index := maxb - bestb 171 | if int(this.dataPointers[index]+bestc) >= len(this.dataToBePacked[index]) { 172 | newSize := int(2 * (this.dataPointers[index] + bestc)) 173 | // make sure it is a multiple of 32. 
174 | // there might be a better way to do this 175 | newSize = encoding.CeilBy(newSize, 32) 176 | newSlice := make([]int32, newSize) 177 | copy(newSlice, this.dataToBePacked[index]) 178 | this.dataToBePacked[index] = newSlice 179 | } 180 | 181 | for k := int32(0); k < DefaultBlockSize; k++ { 182 | if uint32(in[k+tmpinpos])>>uint(bestb) != 0 { 183 | // we have an exception 184 | this.byteContainer.Put(byte(k)) 185 | this.dataToBePacked[index][this.dataPointers[index]] = int32(uint32(in[k+tmpinpos]) >> uint(tmpbestb)) 186 | this.dataPointers[index] += 1 187 | } 188 | } 189 | } 190 | 191 | for k := int32(0); k < 128; k += 32 { 192 | bitpacking.FastPack(in, int(tmpinpos+k), out, int(tmpoutpos), int(tmpbestb)) 193 | tmpoutpos += tmpbestb 194 | } 195 | } 196 | 197 | inpos.Set(int(tmpinpos)) 198 | out[headerpos] = tmpoutpos - headerpos 199 | bytesize := int32(this.byteContainer.Position()) 200 | for this.byteContainer.Position()&3 != 0 { 201 | this.byteContainer.Put(0) 202 | } 203 | 204 | out[tmpoutpos] = bytesize 205 | tmpoutpos += 1 206 | howmanyints := (bytesize + 3) / 4 207 | this.byteContainer.Flip() 208 | this.byteContainer.AsInt32Buffer().GetInt32s(out, int(tmpoutpos), int(howmanyints)) 209 | tmpoutpos += howmanyints 210 | 211 | bitmap := int32(0) 212 | for k := 1; k <= 32; k++ { 213 | v := this.dataPointers[k] 214 | if v != 0 { 215 | bitmap |= (1 << uint(k-1)) 216 | } 217 | } 218 | 219 | out[tmpoutpos] = bitmap 220 | tmpoutpos += 1 221 | 222 | for k := 1; k < 33; k++ { 223 | v := this.dataPointers[k] 224 | if v != 0 { 225 | out[tmpoutpos] = v // size 226 | tmpoutpos += 1 227 | for j := 0; j < int(v); j += 32 { 228 | bitpacking.FastPack(this.dataToBePacked[k], j, out, int(tmpoutpos), k) 229 | tmpoutpos += int32(k) 230 | } 231 | } 232 | } 233 | 234 | outpos.Set(int(tmpoutpos)) 235 | 236 | return nil 237 | } 238 | 239 | func grapByte(in []int32, index uint) byte { 240 | return byte(in[index/4] >> (24 - (index%4)*8)) 241 | } 242 | 243 | func (this *FastPFOR) 
decodePage(in []int32, inpos *cursor.Cursor, out []int32, outpos *cursor.Cursor, thissize int) error { 244 | initpos := int32(inpos.Get()) 245 | wheremeta := in[initpos] 246 | inpos.Increment() 247 | 248 | inexcept := initpos + wheremeta 249 | bytesize := in[inexcept] 250 | inexcept += 1 251 | mybytearray := in[inexcept:] 252 | mybp := uint(0) 253 | 254 | inexcept += (bytesize + 3) / 4 255 | bitmap := in[inexcept] 256 | inexcept += 1 257 | 258 | for k := int32(1); k < 33; k++ { 259 | if bitmap&(1< 0 { 290 | maxbits := uint32(grapByte(mybytearray, mybp)) 291 | mybp++ 292 | index := maxbits - bestb 293 | // assuming that the Go compiler is bad, we move everything that is indexed outside the upcoming loop 294 | packedexceptions := this.dataToBePacked[index] 295 | myindex := this.dataPointers[index] 296 | 297 | for k := int32(0); k < cexcept; k++ { 298 | pos := uint32(grapByte(mybytearray, mybp)) 299 | mybp++ 300 | exceptvalue := packedexceptions[myindex] 301 | myindex++ 302 | out[pos+tmpoutpos] |= exceptvalue << bestb 303 | } 304 | this.dataPointers[index] = myindex 305 | } 306 | 307 | run += 1 308 | tmpoutpos += DefaultBlockSize 309 | } 310 | 311 | outpos.Set(int(tmpoutpos)) 312 | inpos.Set(int(inexcept)) 313 | 314 | return nil 315 | } 316 | -------------------------------------------------------------------------------- /fastpfor/fastpfor_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 
4 | * 5 | */ 6 | 7 | package fastpfor 8 | 9 | import ( 10 | "log" 11 | "testing" 12 | 13 | "github.com/dataence/encoding/benchtools" 14 | "github.com/dataence/encoding/cursor" 15 | "github.com/dataence/encoding/generators" 16 | ) 17 | 18 | var ( 19 | data []int32 20 | size int = 128000 21 | ) 22 | 23 | func init() { 24 | log.Printf("fastpfor/init: generating %d int32s\n", size) 25 | data = generators.GenerateClustered(size, size*2) 26 | log.Printf("fastpfor/init: generated %d integers for test", size) 27 | } 28 | 29 | func TestCodec(t *testing.T) { 30 | sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000} 31 | benchtools.TestCodec(New(), data, sizes) 32 | } 33 | 34 | // go test -bench=Decode 35 | func BenchmarkDecode(b *testing.B) { 36 | b.StopTimer() 37 | length := 128 * 1024 38 | data := generators.GenerateClustered(length, 1<<24) 39 | compdata := make([]int32, 2*length) 40 | recov := make([]int32, length) 41 | inpos := cursor.New() 42 | outpos := cursor.New() 43 | codec := New() 44 | codec.Compress(data, inpos, len(data), compdata, outpos) 45 | b.StartTimer() 46 | for j := 0; j < b.N; j++ { 47 | newinpos := cursor.New() 48 | newoutpos := cursor.New() 49 | codec.Uncompress(compdata, newinpos, outpos.Get()-newinpos.Get(), recov, newoutpos) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /generators/generators.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 
4 | * 5 | */ 6 | 7 | package generators 8 | 9 | import ( 10 | "bytes" 11 | "encoding/binary" 12 | "errors" 13 | "github.com/willf/bitset" 14 | "math/rand" 15 | "sort" 16 | ) 17 | 18 | const ( 19 | c1 int64 = 0xcc9e2d51 20 | c2 int64 = 0x1b873593 21 | ) 22 | 23 | func GenerateUniformInBytes(N, max int) *bytes.Buffer { 24 | data := GenerateUniform(N, max) 25 | b := make([]byte, N*4) 26 | for i := 0; i < N; i++ { 27 | binary.LittleEndian.PutUint32(b[i*4:], uint32(data[i])) 28 | } 29 | 30 | return bytes.NewBuffer(b) 31 | } 32 | 33 | func GenerateClusteredInBytes(N, max int) *bytes.Buffer { 34 | data := GenerateClustered(N, max) 35 | b := make([]byte, N*4) 36 | for i := 0; i < N; i++ { 37 | binary.LittleEndian.PutUint32(b[i*4:], uint32(data[i])) 38 | } 39 | 40 | return bytes.NewBuffer(b) 41 | } 42 | 43 | func GenerateUniform(N, max int) []int32 { 44 | if N*2 > max { 45 | return negate(GenerateUniform(max-N, max), max) 46 | } 47 | 48 | if 2048*N > max { 49 | r, _ := generateUniformBitmap(N, max) 50 | return r 51 | 52 | } 53 | 54 | r, _ := generateUniformHash(N, max) 55 | return r 56 | } 57 | 58 | func GenerateClustered(N, max int) []int32 { 59 | ans := make([]int32, N) 60 | fillClustered(ans, 0, N, 0, max) 61 | return ans 62 | } 63 | 64 | func fillUniform(ans []int32, offset, length, min, max int) { 65 | v := GenerateUniform(length, max-min) 66 | for k := 0; k < len(v); k++ { 67 | ans[k+offset] = int32(min) + v[k] 68 | } 69 | } 70 | 71 | func fillClustered(ans []int32, offset, length, min, max int) { 72 | btwn := max - min 73 | if btwn == length || length <= 10 { 74 | fillUniform(ans, offset, length, min, max) 75 | return 76 | } 77 | 78 | r := rand.New(rand.NewSource(c1)) 79 | cut := length / 2 80 | if btwn-length-1 > 0 { 81 | cut += int(r.Int31n(int32(btwn - length - 1))) 82 | } 83 | 84 | p := r.Float64() 85 | if p < 0.25 { 86 | fillUniform(ans, offset, length/2, min, min+cut) 87 | fillClustered(ans, offset+length/2, length-length/2, min+cut, max) 88 | } else if p < 0.5 
{ 89 | fillClustered(ans, offset, length/2, min, min+cut) 90 | fillUniform(ans, offset+length/2, length-length/2, min+cut, max) 91 | } else { 92 | fillClustered(ans, offset, length/2, min, min+cut) 93 | fillClustered(ans, offset+length/2, length-length/2, min+cut, max) 94 | } 95 | } 96 | 97 | func negate(x []int32, max int) []int32 { 98 | ans := make([]int32, max-len(x)) 99 | 100 | var i, c int32 101 | 102 | for j := 0; j < len(x); j++ { 103 | v := x[j] 104 | for ; i < v; i++ { 105 | ans[c] = i 106 | c += 1 107 | } 108 | i += 1 109 | } 110 | 111 | for int(c) < len(ans) { 112 | ans[c] = i 113 | c += 1 114 | i += 1 115 | } 116 | 117 | return ans 118 | } 119 | 120 | func generateUniformBitmap(N, max int) ([]int32, error) { 121 | if N > max { 122 | return nil, errors.New("encoding/generateUniformBitmap: N > max, not possible") 123 | } 124 | 125 | r := rand.New(rand.NewSource(c1)) 126 | ans := make([]int32, N) 127 | bs := bitset.New(uint(max)) 128 | cardinality := uint(0) 129 | 130 | for int(cardinality) < N { 131 | v := r.Int31n(int32(max)) 132 | if !bs.Test(uint(v)) { 133 | bs.Set(uint(v)) 134 | cardinality += 1 135 | } 136 | } 137 | 138 | for i, c := int32(0), 0; c < N; i++ { 139 | if bs.Test(uint(i)) { 140 | ans[c] = i 141 | c += 1 142 | } 143 | } 144 | 145 | return ans, nil 146 | } 147 | 148 | func generateUniformHash(N, max int) ([]int32, error) { 149 | if N > max { 150 | return nil, errors.New("encoding/generateUniformBitmap: N > max, not possible") 151 | } 152 | 153 | r := rand.New(rand.NewSource(c2)) 154 | ans := make([]int32, N) 155 | s := make(map[int]bool) 156 | 157 | for len(s) < N { 158 | s[int(r.Int31n(int32(max)))] = true 159 | } 160 | 161 | c := 0 162 | tmpans := make([]int, N) 163 | for k, _ := range s { 164 | tmpans[c] = k 165 | } 166 | 167 | sort.Ints(tmpans) 168 | 169 | for i := 0; i < len(tmpans); i++ { 170 | ans[i] = int32(tmpans[i]) 171 | } 172 | 173 | return ans, nil 174 | } 175 | 
-------------------------------------------------------------------------------- /generators/generators_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package generators 8 | 9 | import ( 10 | "fmt" 11 | "testing" 12 | ) 13 | 14 | func TestGenerateClustered(t *testing.T) { 15 | a := GenerateClustered(20, 1000) 16 | fmt.Println(a) 17 | } 18 | 19 | func TestGenerateUniform(t *testing.T) { 20 | a := GenerateUniform(20, 1000) 21 | fmt.Println(a) 22 | } 23 | -------------------------------------------------------------------------------- /integer.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package encoding 8 | 9 | import ( 10 | "github.com/dataence/encoding/cursor" 11 | ) 12 | 13 | type Integer interface { 14 | // Compress data from an array to another array. 15 | // 16 | // Both inpos and outpos are modified to represent how much data was read and written to 17 | // if 12 ints (inlength = 12) are compressed to 3 ints, then inpos will be incremented by 12 18 | // while outpos will be incremented by 3 we use IntWrapper to pass the values by reference. 19 | // @param in input array 20 | // @param inpos location in the input array 21 | // @param inlength how many integers to compress 22 | // @param out output array 23 | //* @param outpos where to write in the output array 24 | Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error 25 | 26 | /** 27 | * Uncompress data from an array to another array. 28 | * 29 | * Both inpos and outpos parameters are modified to indicate new positions after read/write. 
30 | * 31 | * @param in array containing data in compressed form 32 | * @param inpos where to start reading in the array 33 | * @param inlength length of the compressed data (ignored by some schemes) 34 | * @param out array where to write the compressed output 35 | * @param outpos where to write the compressed output in out 36 | */ 37 | Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error 38 | } 39 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package encoding 8 | 9 | import ( 10 | "fmt" 11 | ) 12 | 13 | func FloorBy(value, factor int) int { 14 | return value - value%factor 15 | } 16 | 17 | func CeilBy(value, factor int) int { 18 | return value + factor - value%factor 19 | } 20 | 21 | func LeadingBitPosition(x uint32) int32 { 22 | //return 32 - int32(nlz1a(x)) 23 | return int32(bitlen(uint64(x))) 24 | } 25 | 26 | func DeltaMaxBits(initoffset int32, buf []int32) int32 { 27 | var mask int32 28 | 29 | for _, v := range buf { 30 | mask |= v - initoffset 31 | initoffset = v 32 | } 33 | 34 | return LeadingBitPosition(uint32(mask)) 35 | } 36 | 37 | func MaxBits(buf []int32) int32 { 38 | var mask int32 39 | 40 | for _, v := range buf { 41 | mask |= v 42 | } 43 | 44 | return LeadingBitPosition(uint32(mask)) 45 | } 46 | 47 | func PrintInt32sInBits(buf []int32) { 48 | fmt.Println(" 10987654321098765432109876543210") 49 | for i, v := range buf { 50 | fmt.Printf("%4d: %20d %032b\n", i, v, uint32(v)) 51 | } 52 | } 53 | 54 | func Delta(in, out []int32, offset int32) { 55 | for i, v := range in { 56 | out[i] = v - offset 57 | offset = v 58 | } 59 | } 60 | 61 | func InverseDelta(in, out []int32, offset int32) { 62 | for i, v := range in { 63 
// InverseDelta undoes Delta: it writes the running sum of in to out, starting
// the sum at offset.
func InverseDelta(in, out []int32, offset int32) {
	running := offset
	for i := 0; i < len(in); i++ {
		running += in[i]
		out[i] = running
	}
}

// ZigZagDelta writes the zig-zag encoding of successive differences of in to
// out, with the first difference taken against 0. Zig-zag maps small negative
// and positive numbers to small non-negative ones (0, -1, 1, -2 -> 0, 1, 2, 3).
//
// https://developers.google.com/protocol-buffers/docs/encoding#types
func ZigZagDelta(in, out []int32) {
	var prev int32
	for i := 0; i < len(in); i++ {
		cur := in[i]
		diff := cur - prev
		out[i] = (diff << 1) ^ (diff >> 31)
		prev = cur
	}
}

// InverseZigZagDelta undoes ZigZagDelta: each value is zig-zag decoded and
// added to a running sum that starts at 0.
func InverseZigZagDelta(in, out []int32) {
	var running int32
	for i := 0; i < len(in); i++ {
		v := in[i]
		magnitude := int32(uint32(v) >> 1)
		signMask := (v << 31) >> 31 // 0 when the low bit is clear, -1 when set
		running += magnitude ^ signMask
		out[i] = running
	}
}

// nlz1a returns the number of leading zero bits in x (32 for x == 0) using a
// binary search over the shift amounts 16, 8, 4, 2 and a final adjustment.
//
// Copied from http://www.hackersdelight.org/hdcodetxt/nlz.c.txt - nlz1a
func nlz1a(x uint32) uint32 {
	if x == 0 {
		return 32
	}

	n := uint32(1)
	for _, step := range [...]uint32{16, 8, 4, 2} {
		// If the top `step` bits are all zero, count them and shift them out.
		if x>>(32-step) == 0 {
			n += step
			x <<= step
		}
	}
	return n - (x >> 31)
}

// nlz2 returns the number of leading zero bits in x (32 for x == 0) by
// repeatedly narrowing x to its upper half whenever that half is non-zero.
func nlz2(x uint32) uint32 {
	n := uint32(32)
	for _, step := range [...]uint32{16, 8, 4, 2} {
		if upper := x >> step; upper != 0 {
			n -= step
			x = upper
		}
	}
	if x>>1 != 0 {
		return n - 2
	}
	return n - x
}
// UnrolledDelta128 is a hand-unrolled Delta for exactly 128 values:
// out[0] = in[0] - offset and out[i] = in[i] - in[i-1] for i in 1..127.
// Both slices must hold at least 128 elements; shorter slices panic before
// anything is written.
func UnrolledDelta128(in, out []int32, offset int32) {
	// Bounds-check hints: touching the highest index once up front lets the
	// compiler prove every constant index below is in range, instead of
	// checking each of the 256 accesses individually.
	_ = in[127]
	_ = out[127]

	out[0] = in[0] - offset
	out[1] = in[1] - in[0]
	out[2] = in[2] - in[1]
	out[3] = in[3] - in[2]
	out[4] = in[4] - in[3]
	out[5] = in[5] - in[4]
	out[6] = in[6] - in[5]
	out[7] = in[7] - in[6]
	out[8] = in[8] - in[7]
	out[9] = in[9] - in[8]
	out[10] = in[10] - in[9]
	out[11] = in[11] - in[10]
	out[12] = in[12] - in[11]
	out[13] = in[13] - in[12]
	out[14] = in[14] - in[13]
	out[15] = in[15] - in[14]
	out[16] = in[16] - in[15]
	out[17] = in[17] - in[16]
	out[18] = in[18] - in[17]
	out[19] = in[19] - in[18]
	out[20] = in[20] - in[19]
	out[21] = in[21] - in[20]
	out[22] = in[22] - in[21]
	out[23] = in[23] - in[22]
	out[24] = in[24] - in[23]
	out[25] = in[25] - in[24]
	out[26] = in[26] - in[25]
	out[27] = in[27] - in[26]
	out[28] = in[28] - in[27]
	out[29] = in[29] - in[28]
	out[30] = in[30] - in[29]
	out[31] = in[31] - in[30]
	out[32] = in[32] - in[31]
	out[33] = in[33] - in[32]
	out[34] = in[34] - in[33]
	out[35] = in[35] - in[34]
	out[36] = in[36] - in[35]
	out[37] = in[37] - in[36]
	out[38] = in[38] - in[37]
	out[39] = in[39] - in[38]
	out[40] = in[40] - in[39]
	out[41] = in[41] - in[40]
	out[42] = in[42] - in[41]
	out[43] = in[43] - in[42]
	out[44] = in[44] - in[43]
	out[45] = in[45] - in[44]
	out[46] = in[46] - in[45]
	out[47] = in[47] - in[46]
	out[48] = in[48] - in[47]
	out[49] = in[49] - in[48]
	out[50] = in[50] - in[49]
	out[51] = in[51] - in[50]
	out[52] = in[52] - in[51]
	out[53] = in[53] - in[52]
	out[54] = in[54] - in[53]
	out[55] = in[55] - in[54]
	out[56] = in[56] - in[55]
	out[57] = in[57] - in[56]
	out[58] = in[58] - in[57]
	out[59] = in[59] - in[58]
	out[60] = in[60] - in[59]
	out[61] = in[61] - in[60]
	out[62] = in[62] - in[61]
	out[63] = in[63] - in[62]
	out[64] = in[64] - in[63]
	out[65] = in[65] - in[64]
	out[66] = in[66] - in[65]
	out[67] = in[67] - in[66]
	out[68] = in[68] - in[67]
	out[69] = in[69] - in[68]
	out[70] = in[70] - in[69]
	out[71] = in[71] - in[70]
	out[72] = in[72] - in[71]
	out[73] = in[73] - in[72]
	out[74] = in[74] - in[73]
	out[75] = in[75] - in[74]
	out[76] = in[76] - in[75]
	out[77] = in[77] - in[76]
	out[78] = in[78] - in[77]
	out[79] = in[79] - in[78]
	out[80] = in[80] - in[79]
	out[81] = in[81] - in[80]
	out[82] = in[82] - in[81]
	out[83] = in[83] - in[82]
	out[84] = in[84] - in[83]
	out[85] = in[85] - in[84]
	out[86] = in[86] - in[85]
	out[87] = in[87] - in[86]
	out[88] = in[88] - in[87]
	out[89] = in[89] - in[88]
	out[90] = in[90] - in[89]
	out[91] = in[91] - in[90]
	out[92] = in[92] - in[91]
	out[93] = in[93] - in[92]
	out[94] = in[94] - in[93]
	out[95] = in[95] - in[94]
	out[96] = in[96] - in[95]
	out[97] = in[97] - in[96]
	out[98] = in[98] - in[97]
	out[99] = in[99] - in[98]
	out[100] = in[100] - in[99]
	out[101] = in[101] - in[100]
	out[102] = in[102] - in[101]
	out[103] = in[103] - in[102]
	out[104] = in[104] - in[103]
	out[105] = in[105] - in[104]
	out[106] = in[106] - in[105]
	out[107] = in[107] - in[106]
	out[108] = in[108] - in[107]
	out[109] = in[109] - in[108]
	out[110] = in[110] - in[109]
	out[111] = in[111] - in[110]
	out[112] = in[112] - in[111]
	out[113] = in[113] - in[112]
	out[114] = in[114] - in[113]
	out[115] = in[115] - in[114]
	out[116] = in[116] - in[115]
	out[117] = in[117] - in[116]
	out[118] = in[118] - in[117]
	out[119] = in[119] - in[118]
	out[120] = in[120] - in[119]
	out[121] = in[121] - in[120]
	out[122] = in[122] - in[121]
	out[123] = in[123] - in[122]
	out[124] = in[124] - in[123]
	out[125] = in[125] - in[124]
	out[126] = in[126] - in[125]
	out[127] = in[127] - in[126]
}
// UnrolledInverseDelta128 is a hand-unrolled InverseDelta for exactly 128
// values: out[0] = in[0] + offset and out[i] = in[i] + out[i-1] for i in
// 1..127. Both slices must hold at least 128 elements; shorter slices panic
// before anything is written.
func UnrolledInverseDelta128(in, out []int32, offset int32) {
	// Bounds-check hints: touching the highest index once up front lets the
	// compiler prove every constant index below is in range, instead of
	// checking each access individually.
	_ = in[127]
	_ = out[127]

	out[0] = in[0] + offset
	out[1] = in[1] + out[0]
	out[2] = in[2] + out[1]
	out[3] = in[3] + out[2]
	out[4] = in[4] + out[3]
	out[5] = in[5] + out[4]
	out[6] = in[6] + out[5]
	out[7] = in[7] + out[6]
	out[8] = in[8] + out[7]
	out[9] = in[9] + out[8]
	out[10] = in[10] + out[9]
	out[11] = in[11] + out[10]
	out[12] = in[12] + out[11]
	out[13] = in[13] + out[12]
	out[14] = in[14] + out[13]
	out[15] = in[15] + out[14]
	out[16] = in[16] + out[15]
	out[17] = in[17] + out[16]
	out[18] = in[18] + out[17]
	out[19] = in[19] + out[18]
	out[20] = in[20] + out[19]
	out[21] = in[21] + out[20]
	out[22] = in[22] + out[21]
	out[23] = in[23] + out[22]
	out[24] = in[24] + out[23]
	out[25] = in[25] + out[24]
	out[26] = in[26] + out[25]
	out[27] = in[27] + out[26]
	out[28] = in[28] + out[27]
	out[29] = in[29] + out[28]
	out[30] = in[30] + out[29]
	out[31] = in[31] + out[30]
	out[32] = in[32] + out[31]
	out[33] = in[33] + out[32]
	out[34] = in[34] + out[33]
	out[35] = in[35] + out[34]
	out[36] = in[36] + out[35]
	out[37] = in[37] + out[36]
	out[38] = in[38] + out[37]
	out[39] = in[39] + out[38]
	out[40] = in[40] + out[39]
	out[41] = in[41] + out[40]
	out[42] = in[42] + out[41]
	out[43] = in[43] + out[42]
	out[44] = in[44] + out[43]
	out[45] = in[45] + out[44]
	out[46] = in[46] + out[45]
	out[47] = in[47] + out[46]
	out[48] = in[48] + out[47]
	out[49] = in[49] + out[48]
	out[50] = in[50] + out[49]
	out[51] = in[51] + out[50]
	out[52] = in[52] + out[51]
	out[53] = in[53] + out[52]
	out[54] = in[54] + out[53]
	out[55] = in[55] + out[54]
	out[56] = in[56] + out[55]
	out[57] = in[57] + out[56]
	out[58] = in[58] + out[57]
	out[59] = in[59] + out[58]
	out[60] = in[60] + out[59]
	out[61] = in[61] + out[60]
	out[62] = in[62] + out[61]
	out[63] = in[63] + out[62]
	out[64] = in[64] + out[63]
	out[65] = in[65] + out[64]
	out[66] = in[66] + out[65]
	out[67] = in[67] + out[66]
	out[68] = in[68] + out[67]
	out[69] = in[69] + out[68]
	out[70] = in[70] + out[69]
	out[71] = in[71] + out[70]
	out[72] = in[72] + out[71]
	out[73] = in[73] + out[72]
	out[74] = in[74] + out[73]
	out[75] = in[75] + out[74]
	out[76] = in[76] + out[75]
	out[77] = in[77] + out[76]
	out[78] = in[78] + out[77]
	out[79] = in[79] + out[78]
	out[80] = in[80] + out[79]
	out[81] = in[81] + out[80]
	out[82] = in[82] + out[81]
	out[83] = in[83] + out[82]
	out[84] = in[84] + out[83]
	out[85] = in[85] + out[84]
	out[86] = in[86] + out[85]
	out[87] = in[87] + out[86]
	out[88] = in[88] + out[87]
	out[89] = in[89] + out[88]
	out[90] = in[90] + out[89]
	out[91] = in[91] + out[90]
	out[92] = in[92] + out[91]
	out[93] = in[93] + out[92]
	out[94] = in[94] + out[93]
	out[95] = in[95] + out[94]
	out[96] = in[96] + out[95]
	out[97] = in[97] + out[96]
	out[98] = in[98] + out[97]
	out[99] = in[99] + out[98]
	out[100] = in[100] + out[99]
	out[101] = in[101] + out[100]
	out[102] = in[102] + out[101]
	out[103] = in[103] + out[102]
	out[104] = in[104] + out[103]
	out[105] = in[105] + out[104]
	out[106] = in[106] + out[105]
	out[107] = in[107] + out[106]
	out[108] = in[108] + out[107]
	out[109] = in[109] + out[108]
	out[110] = in[110] + out[109]
	out[111] = in[111] + out[110]
	out[112] = in[112] + out[111]
	out[113] = in[113] + out[112]
	out[114] = in[114] + out[113]
	out[115] = in[115] + out[114]
	out[116] = in[116] + out[115]
	out[117] = in[117] + out[116]
	out[118] = in[118] + out[117]
	out[119] = in[119] + out[118]
	out[120] = in[120] + out[119]
	out[121] = in[121] + out[120]
	out[122] = in[122] + out[121]
	out[123] = in[123] + out[122]
	out[124] = in[124] + out[123]
	out[125] = in[125] + out[124]
	out[126] = in[126] + out[125]
	out[127] = in[127] + out[126]
}
out[119] 404 | out[121] = in[121] + out[120] 405 | out[122] = in[122] + out[121] 406 | out[123] = in[123] + out[122] 407 | out[124] = in[124] + out[123] 408 | out[125] = in[125] + out[124] 409 | out[126] = in[126] + out[125] 410 | out[127] = in[127] + out[126] 411 | } 412 | 413 | func UnrolledLeadingBitFrequency128(in, freqs []int32) { 414 | freqs[LeadingBitPosition(uint32(in[0]))]++ 415 | freqs[LeadingBitPosition(uint32(in[1]))]++ 416 | freqs[LeadingBitPosition(uint32(in[2]))]++ 417 | freqs[LeadingBitPosition(uint32(in[3]))]++ 418 | freqs[LeadingBitPosition(uint32(in[4]))]++ 419 | freqs[LeadingBitPosition(uint32(in[5]))]++ 420 | freqs[LeadingBitPosition(uint32(in[6]))]++ 421 | freqs[LeadingBitPosition(uint32(in[7]))]++ 422 | freqs[LeadingBitPosition(uint32(in[8]))]++ 423 | freqs[LeadingBitPosition(uint32(in[9]))]++ 424 | freqs[LeadingBitPosition(uint32(in[10]))]++ 425 | freqs[LeadingBitPosition(uint32(in[11]))]++ 426 | freqs[LeadingBitPosition(uint32(in[12]))]++ 427 | freqs[LeadingBitPosition(uint32(in[13]))]++ 428 | freqs[LeadingBitPosition(uint32(in[14]))]++ 429 | freqs[LeadingBitPosition(uint32(in[15]))]++ 430 | freqs[LeadingBitPosition(uint32(in[16]))]++ 431 | freqs[LeadingBitPosition(uint32(in[17]))]++ 432 | freqs[LeadingBitPosition(uint32(in[18]))]++ 433 | freqs[LeadingBitPosition(uint32(in[19]))]++ 434 | freqs[LeadingBitPosition(uint32(in[20]))]++ 435 | freqs[LeadingBitPosition(uint32(in[21]))]++ 436 | freqs[LeadingBitPosition(uint32(in[22]))]++ 437 | freqs[LeadingBitPosition(uint32(in[23]))]++ 438 | freqs[LeadingBitPosition(uint32(in[24]))]++ 439 | freqs[LeadingBitPosition(uint32(in[25]))]++ 440 | freqs[LeadingBitPosition(uint32(in[26]))]++ 441 | freqs[LeadingBitPosition(uint32(in[27]))]++ 442 | freqs[LeadingBitPosition(uint32(in[28]))]++ 443 | freqs[LeadingBitPosition(uint32(in[29]))]++ 444 | freqs[LeadingBitPosition(uint32(in[30]))]++ 445 | freqs[LeadingBitPosition(uint32(in[31]))]++ 446 | freqs[LeadingBitPosition(uint32(in[32]))]++ 447 | 
freqs[LeadingBitPosition(uint32(in[33]))]++ 448 | freqs[LeadingBitPosition(uint32(in[34]))]++ 449 | freqs[LeadingBitPosition(uint32(in[35]))]++ 450 | freqs[LeadingBitPosition(uint32(in[36]))]++ 451 | freqs[LeadingBitPosition(uint32(in[37]))]++ 452 | freqs[LeadingBitPosition(uint32(in[38]))]++ 453 | freqs[LeadingBitPosition(uint32(in[39]))]++ 454 | freqs[LeadingBitPosition(uint32(in[40]))]++ 455 | freqs[LeadingBitPosition(uint32(in[41]))]++ 456 | freqs[LeadingBitPosition(uint32(in[42]))]++ 457 | freqs[LeadingBitPosition(uint32(in[43]))]++ 458 | freqs[LeadingBitPosition(uint32(in[44]))]++ 459 | freqs[LeadingBitPosition(uint32(in[45]))]++ 460 | freqs[LeadingBitPosition(uint32(in[46]))]++ 461 | freqs[LeadingBitPosition(uint32(in[47]))]++ 462 | freqs[LeadingBitPosition(uint32(in[48]))]++ 463 | freqs[LeadingBitPosition(uint32(in[49]))]++ 464 | freqs[LeadingBitPosition(uint32(in[50]))]++ 465 | freqs[LeadingBitPosition(uint32(in[51]))]++ 466 | freqs[LeadingBitPosition(uint32(in[52]))]++ 467 | freqs[LeadingBitPosition(uint32(in[53]))]++ 468 | freqs[LeadingBitPosition(uint32(in[54]))]++ 469 | freqs[LeadingBitPosition(uint32(in[55]))]++ 470 | freqs[LeadingBitPosition(uint32(in[56]))]++ 471 | freqs[LeadingBitPosition(uint32(in[57]))]++ 472 | freqs[LeadingBitPosition(uint32(in[58]))]++ 473 | freqs[LeadingBitPosition(uint32(in[59]))]++ 474 | freqs[LeadingBitPosition(uint32(in[60]))]++ 475 | freqs[LeadingBitPosition(uint32(in[61]))]++ 476 | freqs[LeadingBitPosition(uint32(in[62]))]++ 477 | freqs[LeadingBitPosition(uint32(in[63]))]++ 478 | freqs[LeadingBitPosition(uint32(in[64]))]++ 479 | freqs[LeadingBitPosition(uint32(in[65]))]++ 480 | freqs[LeadingBitPosition(uint32(in[66]))]++ 481 | freqs[LeadingBitPosition(uint32(in[67]))]++ 482 | freqs[LeadingBitPosition(uint32(in[68]))]++ 483 | freqs[LeadingBitPosition(uint32(in[69]))]++ 484 | freqs[LeadingBitPosition(uint32(in[70]))]++ 485 | freqs[LeadingBitPosition(uint32(in[71]))]++ 486 | freqs[LeadingBitPosition(uint32(in[72]))]++ 487 | 
freqs[LeadingBitPosition(uint32(in[73]))]++ 488 | freqs[LeadingBitPosition(uint32(in[74]))]++ 489 | freqs[LeadingBitPosition(uint32(in[75]))]++ 490 | freqs[LeadingBitPosition(uint32(in[76]))]++ 491 | freqs[LeadingBitPosition(uint32(in[77]))]++ 492 | freqs[LeadingBitPosition(uint32(in[78]))]++ 493 | freqs[LeadingBitPosition(uint32(in[79]))]++ 494 | freqs[LeadingBitPosition(uint32(in[80]))]++ 495 | freqs[LeadingBitPosition(uint32(in[81]))]++ 496 | freqs[LeadingBitPosition(uint32(in[82]))]++ 497 | freqs[LeadingBitPosition(uint32(in[83]))]++ 498 | freqs[LeadingBitPosition(uint32(in[84]))]++ 499 | freqs[LeadingBitPosition(uint32(in[85]))]++ 500 | freqs[LeadingBitPosition(uint32(in[86]))]++ 501 | freqs[LeadingBitPosition(uint32(in[87]))]++ 502 | freqs[LeadingBitPosition(uint32(in[88]))]++ 503 | freqs[LeadingBitPosition(uint32(in[89]))]++ 504 | freqs[LeadingBitPosition(uint32(in[90]))]++ 505 | freqs[LeadingBitPosition(uint32(in[91]))]++ 506 | freqs[LeadingBitPosition(uint32(in[92]))]++ 507 | freqs[LeadingBitPosition(uint32(in[93]))]++ 508 | freqs[LeadingBitPosition(uint32(in[94]))]++ 509 | freqs[LeadingBitPosition(uint32(in[95]))]++ 510 | freqs[LeadingBitPosition(uint32(in[96]))]++ 511 | freqs[LeadingBitPosition(uint32(in[97]))]++ 512 | freqs[LeadingBitPosition(uint32(in[98]))]++ 513 | freqs[LeadingBitPosition(uint32(in[99]))]++ 514 | freqs[LeadingBitPosition(uint32(in[100]))]++ 515 | freqs[LeadingBitPosition(uint32(in[101]))]++ 516 | freqs[LeadingBitPosition(uint32(in[102]))]++ 517 | freqs[LeadingBitPosition(uint32(in[103]))]++ 518 | freqs[LeadingBitPosition(uint32(in[104]))]++ 519 | freqs[LeadingBitPosition(uint32(in[105]))]++ 520 | freqs[LeadingBitPosition(uint32(in[106]))]++ 521 | freqs[LeadingBitPosition(uint32(in[107]))]++ 522 | freqs[LeadingBitPosition(uint32(in[108]))]++ 523 | freqs[LeadingBitPosition(uint32(in[109]))]++ 524 | freqs[LeadingBitPosition(uint32(in[110]))]++ 525 | freqs[LeadingBitPosition(uint32(in[111]))]++ 526 | 
freqs[LeadingBitPosition(uint32(in[112]))]++ 527 | freqs[LeadingBitPosition(uint32(in[113]))]++ 528 | freqs[LeadingBitPosition(uint32(in[114]))]++ 529 | freqs[LeadingBitPosition(uint32(in[115]))]++ 530 | freqs[LeadingBitPosition(uint32(in[116]))]++ 531 | freqs[LeadingBitPosition(uint32(in[117]))]++ 532 | freqs[LeadingBitPosition(uint32(in[118]))]++ 533 | freqs[LeadingBitPosition(uint32(in[119]))]++ 534 | freqs[LeadingBitPosition(uint32(in[120]))]++ 535 | freqs[LeadingBitPosition(uint32(in[121]))]++ 536 | freqs[LeadingBitPosition(uint32(in[122]))]++ 537 | freqs[LeadingBitPosition(uint32(in[123]))]++ 538 | freqs[LeadingBitPosition(uint32(in[124]))]++ 539 | freqs[LeadingBitPosition(uint32(in[125]))]++ 540 | freqs[LeadingBitPosition(uint32(in[126]))]++ 541 | freqs[LeadingBitPosition(uint32(in[127]))]++ 542 | } 543 | -------------------------------------------------------------------------------- /variablebyte/variablebyte.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package variablebyte 8 | 9 | import ( 10 | "errors" 11 | 12 | "github.com/dataence/bytebuffer" 13 | "github.com/dataence/encoding" 14 | "github.com/dataence/encoding/cursor" 15 | ) 16 | 17 | type VariableByte struct { 18 | } 19 | 20 | var _ encoding.Integer = (*VariableByte)(nil) 21 | 22 | func New() encoding.Integer { 23 | return &VariableByte{} 24 | } 25 | 26 | func (this *VariableByte) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 27 | if inlength == 0 { 28 | return errors.New("VariableByte/Compress: inlength = 0. 
No work done.") 29 | } 30 | 31 | //fmt.Printf("VariableByte/Compress: after inlength = %d\n", inlength) 32 | 33 | buf := bytebuffer.NewByteBuffer(inlength * 8) 34 | tmpinpos := inpos.Get() 35 | 36 | for _, v := range in[tmpinpos : tmpinpos+inlength] { 37 | val := uint32(v) 38 | 39 | for val >= 0x80 { 40 | buf.Put(byte(val) | 0x80) 41 | val >>= 7 42 | } 43 | buf.Put(byte(val)) 44 | } 45 | 46 | for buf.Position()%4 != 0 { 47 | //fmt.Printf("VariableByte/Compress: putting 128\n") 48 | buf.Put(128) 49 | } 50 | 51 | length := buf.Position() 52 | buf.Flip() 53 | ibuf := buf.AsInt32Buffer() 54 | //fmt.Printf("VariableByte/Compress: l = %d, outpos = %d, ibuf = %v, buf = %v\n", length/4, outpos.Get(), ibuf, buf) 55 | err := ibuf.GetInt32s(out, outpos.Get(), length/4) 56 | if err != nil { 57 | //fmt.Printf("VariableByte/Compress: error with GetUint32s - %v\n", err) 58 | return err 59 | } 60 | outpos.Add(length / 4) 61 | inpos.Add(inlength) 62 | //fmt.Printf("VariableByte/Compress: out = %v\n", out) 63 | 64 | return nil 65 | } 66 | 67 | func (this *VariableByte) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 68 | if inlength == 0 { 69 | return errors.New("VariableByte/Uncompress: inlength = 0. 
No work done.") 70 | } 71 | 72 | //fmt.Printf("VariableByte/Uncompress: after inlength = %d\n", inlength) 73 | 74 | s := uint(0) 75 | p := inpos.Get() 76 | finalp := p + inlength 77 | tmpoutpos := outpos.Get() 78 | v := int32(0) 79 | shift := uint(0) 80 | 81 | for p < finalp { 82 | c := in[p] >> (24 - s) 83 | s += 8 84 | 85 | if s == 32 { 86 | s = 0 87 | p += 1 88 | } 89 | 90 | v += ((c & 127) << shift) 91 | if c&128 == 0 { 92 | out[tmpoutpos] = v 93 | tmpoutpos += 1 94 | v = 0 95 | shift = 0 96 | } else { 97 | shift += 7 98 | } 99 | } 100 | 101 | outpos.Set(tmpoutpos) 102 | inpos.Add(inlength) 103 | 104 | return nil 105 | } 106 | -------------------------------------------------------------------------------- /variablebyte/variablebyte_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package variablebyte 8 | 9 | import ( 10 | "log" 11 | "testing" 12 | 13 | "github.com/dataence/encoding/benchtools" 14 | "github.com/dataence/encoding/cursor" 15 | "github.com/dataence/encoding/generators" 16 | ) 17 | 18 | var ( 19 | data []int32 20 | size int = 128000 21 | ) 22 | 23 | func init() { 24 | log.Printf("variablebyte/init: generating %d int32s\n", size) 25 | data = generators.GenerateClustered(size, size*2) 26 | log.Printf("variablebyte/init: generated %d integers for test", size) 27 | } 28 | 29 | func TestCodec(t *testing.T) { 30 | sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000} 31 | benchtools.TestCodec(New(), data, sizes) 32 | } 33 | 34 | // go test -bench=Decode 35 | func BenchmarkDecode(b *testing.B) { 36 | b.StopTimer() 37 | length := 128 * 1024 38 | data := generators.GenerateClustered(length, 1<<24) 39 | compdata := make([]int32, 2*length) 40 | recov := make([]int32, length) 41 | inpos := cursor.New() 42 | outpos := cursor.New() 43 | codec := New() 44 | 
codec.Compress(data, inpos, len(data), compdata, outpos) 45 | b.StartTimer() 46 | for j := 0; j < b.N; j++ { 47 | newinpos := cursor.New() 48 | newoutpos := cursor.New() 49 | codec.Uncompress(compdata, newinpos, outpos.Get()-newinpos.Get(), recov, newoutpos) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /zigzag/bp32/bp32.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package bp32 8 | 9 | import ( 10 | "errors" 11 | 12 | "github.com/dataence/encoding" 13 | "github.com/dataence/encoding/bitpacking" 14 | "github.com/dataence/encoding/cursor" 15 | ) 16 | 17 | const ( 18 | DefaultBlockSize = 128 19 | DefaultPageSize = 65536 20 | ) 21 | 22 | type BP32 struct { 23 | } 24 | 25 | var _ encoding.Integer = (*BP32)(nil) 26 | 27 | func New() encoding.Integer { 28 | return &BP32{} 29 | } 30 | 31 | func (this *BP32) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 32 | //log.Printf("zigzag_bp32/Compress: before inlength = %d\n", inlength) 33 | 34 | inlength = encoding.FloorBy(inlength, DefaultBlockSize) 35 | 36 | if inlength == 0 { 37 | return errors.New("zigzag_bp32/Compress: block size less than 128. 
No work done.") 38 | } 39 | 40 | //log.Printf("zigzag_bp32/Compress: after inlength = %d, len(in) = %d\n", inlength, len(in)) 41 | 42 | out[outpos.Get()] = int32(inlength) 43 | outpos.Increment() 44 | 45 | tmpoutpos := outpos.Get() 46 | s := inpos.Get() 47 | finalinpos := s + inlength 48 | delta := make([]int32, DefaultBlockSize) 49 | 50 | for ; s < finalinpos; s += DefaultBlockSize { 51 | encoding.ZigZagDelta(in[s:s+DefaultBlockSize], delta) 52 | //log.Printf("zigzag_bp32/Compress: in = %v\n", in[s:s+DefaultBlockSize]) 53 | //log.Printf("zigzag_bp32/Compress: delta = %v\n", delta) 54 | 55 | mbits1 := encoding.MaxBits(delta[0:32]) 56 | mbits2 := encoding.MaxBits(delta[32:64]) 57 | mbits3 := encoding.MaxBits(delta[64:96]) 58 | mbits4 := encoding.MaxBits(delta[96:128]) 59 | 60 | //log.Printf("zigzag_bp32/Compress: tmpoutpos = %d, s = %d\n", tmpoutpos, s) 61 | 62 | out[tmpoutpos] = (mbits1 << 24) | (mbits2 << 16) | (mbits3 << 8) | mbits4 63 | tmpoutpos += 1 64 | 65 | //log.Printf("zigzag_bp32/Compress: mbits1 = %d, mbits2 = %d, mbits3 = %d, mbits4 = %d, s = %d\n", mbits1, mbits2, mbits3, mbits4, out[tmpoutpos-1]) 66 | 67 | bitpacking.FastPackWithoutMask(delta, 0, out, tmpoutpos, int(mbits1)) 68 | //encoding.PrintUint32sInBits(in[s:s+32]) 69 | //encoding.PrintUint32sInBits(out[tmpoutpos:tmpoutpos+int(mbits1])) 70 | tmpoutpos += int(mbits1) 71 | 72 | bitpacking.FastPackWithoutMask(delta, 32, out, tmpoutpos, int(mbits2)) 73 | //encoding.PrintUint32sInBits(in, s+32, 32) 74 | //encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits2)) 75 | tmpoutpos += int(mbits2) 76 | 77 | bitpacking.FastPackWithoutMask(delta, 64, out, tmpoutpos, int(mbits3)) 78 | //encoding.PrintUint32sInBits(in, s+2*32, 32) 79 | //encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits3)) 80 | tmpoutpos += int(mbits3) 81 | 82 | bitpacking.FastPackWithoutMask(delta, 96, out, tmpoutpos, int(mbits4)) 83 | //encoding.PrintUint32sInBits(in, s+3*32, 32) 84 | //encoding.PrintUint32sInBits(out, tmpoutpos, 
int(mbits4)) 85 | tmpoutpos += int(mbits4) 86 | 87 | //log.Printf("zigzag_bp32/Compress: out = %v\n", out[s:s+DefaultBlockSize]) 88 | } 89 | 90 | inpos.Add(inlength) 91 | outpos.Set(tmpoutpos) 92 | 93 | return nil 94 | } 95 | 96 | func (this *BP32) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 97 | if inlength == 0 { 98 | return errors.New("zigzag_bp32/Uncompress: Length is 0. No work done.") 99 | } 100 | 101 | outlength := int(in[inpos.Get()]) 102 | inpos.Increment() 103 | 104 | tmpinpos := inpos.Get() 105 | s := outpos.Get() 106 | finalinpos := s + outlength 107 | delta := make([]int32, DefaultBlockSize) 108 | 109 | //log.Printf("zigzag_bp32/Uncompress: outlength = %d, inpos = %d, outpos = %d\n", outlength, inpos.Get(), outpos.Get()) 110 | for ; s < finalinpos; s += DefaultBlockSize { 111 | tmp := in[tmpinpos] 112 | mbits1 := tmp >> 24 113 | mbits2 := (tmp >> 16) & 0xFF 114 | mbits3 := (tmp >> 8) & 0xFF 115 | mbits4 := (tmp) & 0xFF 116 | 117 | //log.Printf("zigzag_bp32/Uncopmress: mbits1 = %d, mbits2 = %d, mbits3 = %d, mbits4 = %d, s = %d\n", mbits1, mbits2, mbits3, mbits4, s) 118 | tmpinpos += 1 119 | 120 | bitpacking.FastUnpack(in, tmpinpos, delta, 0, int(mbits1)) 121 | tmpinpos += int(mbits1) 122 | //log.Printf("zigzag_bp32/Uncompress: delta = %v\n", out) 123 | 124 | bitpacking.FastUnpack(in, tmpinpos, delta, 32, int(mbits2)) 125 | tmpinpos += int(mbits2) 126 | //log.Printf("zigzag_bp32/Uncompress: delta = %v\n", out) 127 | 128 | bitpacking.FastUnpack(in, tmpinpos, delta, 64, int(mbits3)) 129 | tmpinpos += int(mbits3) 130 | //log.Printf("zigzag_bp32/Uncompress: delta = %v\n", out) 131 | 132 | bitpacking.FastUnpack(in, tmpinpos, delta, 96, int(mbits4)) 133 | tmpinpos += int(mbits4) 134 | 135 | encoding.InverseZigZagDelta(delta, out[s:s+DefaultBlockSize]) 136 | 137 | //log.Printf("zigzag_bp32/Uncompress: delta = %v\n", delta) 138 | //log.Printf("zigzag_bp32/Uncompress: out = %v\n", 
out[s:s+DefaultBlockSize]) 139 | 140 | } 141 | 142 | outpos.Add(outlength) 143 | inpos.Set(tmpinpos) 144 | 145 | return nil 146 | } 147 | -------------------------------------------------------------------------------- /zigzag/bp32/bp32_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package bp32 8 | 9 | import ( 10 | "log" 11 | "testing" 12 | 13 | "github.com/dataence/encoding/benchtools" 14 | "github.com/dataence/encoding/generators" 15 | ) 16 | 17 | var ( 18 | data []int32 19 | size int = 12800000 20 | ) 21 | 22 | func init() { 23 | log.Printf("bp32/init: generating %d int32s\n", size) 24 | data = generators.GenerateClustered(size, size*2) 25 | log.Printf("bp32/init: generated %d integers for test", size) 26 | } 27 | 28 | func TestCodec(t *testing.T) { 29 | sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000, 128 * 10000} 30 | benchtools.TestCodec(New(), data, sizes) 31 | } 32 | -------------------------------------------------------------------------------- /zigzag/fastpfor/fastpfor.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 
4 | * 5 | */ 6 | 7 | package fastpfor 8 | 9 | import ( 10 | "errors" 11 | "math" 12 | 13 | "github.com/dataence/bytebuffer" 14 | "github.com/dataence/encoding" 15 | "github.com/dataence/encoding/bitpacking" 16 | "github.com/dataence/encoding/cursor" 17 | ) 18 | 19 | const ( 20 | DefaultBlockSize = 128 21 | OverheadOfEachExcept = 8 22 | DefaultPageSize = 65536 23 | ) 24 | 25 | var ( 26 | zeroDataPointers []int32 27 | zeroFreqs []int32 28 | ) 29 | 30 | func init() { 31 | zeroDataPointers = make([]int32, 33) 32 | zeroFreqs = make([]int32, 33) 33 | } 34 | 35 | type FastPFOR struct { 36 | dataToBePacked [33][]int32 37 | byteContainer *bytebuffer.ByteBuffer 38 | pageSize int32 39 | 40 | // Working area 41 | dataPointers []int32 42 | freqs []int32 43 | } 44 | 45 | var _ encoding.Integer = (*FastPFOR)(nil) 46 | 47 | func New() encoding.Integer { 48 | f := &FastPFOR{ 49 | pageSize: DefaultPageSize, 50 | byteContainer: bytebuffer.NewByteBuffer(3*DefaultPageSize/DefaultBlockSize + DefaultPageSize), 51 | dataPointers: make([]int32, 33), 52 | freqs: make([]int32, 33), 53 | } 54 | 55 | for i := 1; i < 33; i++ { 56 | f.dataToBePacked[i] = make([]int32, DefaultPageSize/32*4) 57 | } 58 | 59 | return f 60 | } 61 | 62 | func (this *FastPFOR) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 63 | inlength = encoding.FloorBy(inlength, DefaultBlockSize) 64 | 65 | if inlength == 0 { 66 | return errors.New("fastpfor/Compress: inlength = 0. 
No work done.") 67 | } 68 | 69 | out[outpos.Get()] = int32(inlength) 70 | outpos.Increment() 71 | 72 | initoffset := cursor.New() 73 | 74 | copy(this.dataPointers, zeroDataPointers) 75 | copy(this.freqs, zeroFreqs) 76 | 77 | finalInpos := inpos.Get() + inlength 78 | 79 | for inpos.Get() != finalInpos { 80 | thissize := int(math.Min(float64(this.pageSize), float64(finalInpos-inpos.Get()))) 81 | 82 | if err := this.encodePage(in, inpos, thissize, out, outpos, initoffset); err != nil { 83 | return errors.New("fastpfor/Compress: " + err.Error()) 84 | } 85 | } 86 | 87 | return nil 88 | } 89 | 90 | func (this *FastPFOR) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error { 91 | if inlength == 0 { 92 | return errors.New("fastpfor/Uncompress: inlength = 0. No work done.") 93 | } 94 | 95 | mynvalue := in[inpos.Get()] 96 | inpos.Increment() 97 | 98 | initoffset := cursor.New() 99 | 100 | copy(this.dataPointers, zeroDataPointers) 101 | 102 | finalout := outpos.Get() + int(mynvalue) 103 | for outpos.Get() != finalout { 104 | thissize := int(math.Min(float64(this.pageSize), float64(finalout-outpos.Get()))) 105 | 106 | if err := this.decodePage(in, inpos, out, outpos, thissize, initoffset); err != nil { 107 | return errors.New("fastpfor/Uncompress: " + err.Error()) 108 | } 109 | } 110 | return nil 111 | } 112 | 113 | // getBestBFromData determins the best bit position with the best cost of exceptions, 114 | // and the max bit position of the array of int32s 115 | func (this *FastPFOR) getBestBFromData(in []int32) (bestb int32, bestc int32, maxb int32) { 116 | copy(this.freqs, zeroFreqs) 117 | 118 | // Get the count of all the leading bit positions for the slice 119 | // Mainly to figure out what's the best (most popular) bit position 120 | for _, v := range in { 121 | this.freqs[encoding.LeadingBitPosition(uint32(v))]++ 122 | } 123 | //encoding.FastLeadingBitFrequency128(in, this.freqs) 124 | 125 | bestb = 32 126 | 127 | for 
this.freqs[bestb] == 0 { 128 | bestb-- 129 | } 130 | 131 | maxb = bestb 132 | bestCost := bestb * DefaultBlockSize 133 | var cexcept int32 134 | bestc = cexcept 135 | 136 | // Find the cost of storing exceptions for each bit position 137 | for b := bestb - 1; b >= 0; b-- { 138 | cexcept += this.freqs[b+1] 139 | if cexcept < 0 { 140 | break 141 | } 142 | 143 | // the extra 8 is the cost of storing maxbits 144 | thisCost := cexcept*OverheadOfEachExcept + cexcept*(maxb-b) + b*DefaultBlockSize + 8 145 | 146 | if thisCost < bestCost { 147 | bestCost = thisCost 148 | bestb = b 149 | bestc = cexcept 150 | } 151 | } 152 | 153 | return 154 | } 155 | 156 | func (this *FastPFOR) encodePage(in []int32, inpos *cursor.Cursor, thissize int, out []int32, outpos *cursor.Cursor, initoffset *cursor.Cursor) error { 157 | headerpos := int32(outpos.Get()) 158 | outpos.Increment() 159 | tmpoutpos := int32(outpos.Get()) 160 | 161 | // Clear working area 162 | copy(this.dataPointers, zeroDataPointers) 163 | this.byteContainer.Clear() 164 | 165 | tmpinpos := int32(inpos.Get()) 166 | var delta [DefaultBlockSize]int32 167 | 168 | for finalInpos := tmpinpos + int32(thissize) - DefaultBlockSize; tmpinpos <= finalInpos; tmpinpos += DefaultBlockSize { 169 | // Calculate the deltas, inlining to gain a bit of performance 170 | offset := int32(initoffset.Get()) 171 | for i, v := range in[tmpinpos : tmpinpos+DefaultBlockSize] { 172 | n := v - offset 173 | delta[i] = (n << 1) ^ (n >> 31) 174 | offset = v 175 | } 176 | 177 | initoffset.Set(int(in[tmpinpos+DefaultBlockSize-1])) 178 | 179 | //bestb, bestc, maxb := this.getBestBFromData(in[tmpinpos:tmpinpos+DefaultBlockSize]) 180 | bestb, bestc, maxb := this.getBestBFromData(delta[:]) 181 | tmpbestb := bestb 182 | this.byteContainer.Put(byte(bestb)) 183 | this.byteContainer.Put(byte(bestc)) 184 | 185 | if bestc > 0 { 186 | this.byteContainer.Put(byte(maxb)) 187 | index := maxb - bestb 188 | 189 | if int(this.dataPointers[index]+bestc) >= 
len(this.dataToBePacked[index]) { 190 | newSize := int(2 * (this.dataPointers[index] + bestc)) 191 | 192 | // make sure it is a multiple of 32. 193 | // there might be a better way to do this 194 | newSize = encoding.CeilBy(newSize, 32) 195 | newSlice := make([]int32, newSize) 196 | copy(newSlice, this.dataToBePacked[index]) 197 | this.dataToBePacked[index] = newSlice 198 | } 199 | 200 | for k := int32(0); k < DefaultBlockSize; k++ { 201 | if uint32(delta[k])>>uint(bestb) != 0 { 202 | // we have an exception 203 | this.byteContainer.Put(byte(k)) 204 | this.dataToBePacked[index][this.dataPointers[index]] = int32(uint32(delta[k]) >> uint(tmpbestb)) 205 | this.dataPointers[index] += 1 206 | } 207 | } 208 | } 209 | 210 | for k := int32(0); k < 128; k += 32 { 211 | bitpacking.FastPack(delta[:], int(k), out, int(tmpoutpos), int(tmpbestb)) 212 | tmpoutpos += tmpbestb 213 | } 214 | } 215 | 216 | inpos.Set(int(tmpinpos)) 217 | out[headerpos] = tmpoutpos - headerpos 218 | 219 | for this.byteContainer.Position()&3 != 0 { 220 | this.byteContainer.Put(0) 221 | } 222 | 223 | bytesize := int32(this.byteContainer.Position()) 224 | out[tmpoutpos] = bytesize 225 | tmpoutpos += 1 226 | howmanyints := bytesize / 4 227 | 228 | this.byteContainer.Flip() 229 | this.byteContainer.AsInt32Buffer().GetInt32s(out, int(tmpoutpos), int(howmanyints)) 230 | tmpoutpos += howmanyints 231 | 232 | bitmap := int32(0) 233 | for k := 1; k <= 32; k++ { 234 | v := this.dataPointers[k] 235 | if v != 0 { 236 | bitmap |= (1 << uint(k-1)) 237 | } 238 | } 239 | 240 | out[tmpoutpos] = bitmap 241 | tmpoutpos += 1 242 | 243 | for k := 1; k < 33; k++ { 244 | v := this.dataPointers[k] 245 | if v != 0 { 246 | out[tmpoutpos] = v // size 247 | tmpoutpos += 1 248 | for j := 0; j < int(v); j += 32 { 249 | bitpacking.FastPack(this.dataToBePacked[k], j, out, int(tmpoutpos), k) 250 | tmpoutpos += int32(k) 251 | } 252 | } 253 | } 254 | 255 | outpos.Set(int(tmpoutpos)) 256 | 257 | return nil 258 | } 259 | 260 | func (this 
*FastPFOR) decodePage(in []int32, inpos *cursor.Cursor, out []int32, outpos *cursor.Cursor, thissize int, initoffset *cursor.Cursor) error { 261 | initpos := int32(inpos.Get()) 262 | wheremeta := in[initpos] 263 | inpos.Increment() 264 | 265 | inexcept := initpos + wheremeta 266 | bytesize := in[inexcept] 267 | inexcept += 1 268 | 269 | this.byteContainer.Clear() 270 | if err := this.byteContainer.AsInt32Buffer().PutInt32s(in, int(inexcept), int(bytesize/4)); err != nil { 271 | return err 272 | } 273 | 274 | inexcept += bytesize / 4 275 | bitmap := in[inexcept] 276 | inexcept += 1 277 | 278 | for k := int32(1); k < 33; k++ { 279 | if bitmap&(1< 0 { 321 | var maxbits int32 322 | if maxbits, err = this.byteContainer.GetAsInt32(); err != nil { 323 | return err 324 | } 325 | 326 | index := maxbits - bestb 327 | 328 | for k := int32(0); k < cexcept; k++ { 329 | var pos int32 330 | if pos, err = this.byteContainer.GetAsInt32(); err != nil { 331 | return err 332 | } 333 | 334 | exceptvalue := this.dataToBePacked[index][this.dataPointers[index]] 335 | this.dataPointers[index] += 1 336 | //out[pos + tmpoutpos] |= exceptvalue << uint(bestb) 337 | delta[pos] |= exceptvalue << uint(bestb) 338 | } 339 | } 340 | 341 | // Calculate the original from the deltas, inlining to gain a bit of performance 342 | offset := int32(initoffset.Get()) 343 | for i, v := range delta { 344 | n := int32(uint32(v)>>1) ^ ((v << 31) >> 31) 345 | out[int(tmpoutpos)+i] = n + offset 346 | offset += n 347 | } 348 | initoffset.Set(int(out[tmpoutpos+DefaultBlockSize-1])) 349 | 350 | run += 1 351 | tmpoutpos += DefaultBlockSize 352 | } 353 | 354 | outpos.Set(int(tmpoutpos)) 355 | inpos.Set(int(inexcept)) 356 | 357 | return nil 358 | } 359 | -------------------------------------------------------------------------------- /zigzag/fastpfor/fastpfor_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Zhen, LLC. http://zhen.io. 
All rights reserved. 3 | * Use of this source code is governed by the Apache 2.0 license. 4 | * 5 | */ 6 | 7 | package fastpfor 8 | 9 | import ( 10 | "log" 11 | "testing" 12 | 13 | "github.com/dataence/encoding/benchtools" 14 | "github.com/dataence/encoding/generators" 15 | ) 16 | 17 | var ( 18 | data []int32 19 | size int = 12800000 20 | ) 21 | 22 | func init() { 23 | log.Printf("bp32/init: generating %d int32s\n", size) 24 | data = generators.GenerateClustered(size, size*2) 25 | log.Printf("bp32/init: generated %d integers for test", size) 26 | } 27 | 28 | func TestCodec(t *testing.T) { 29 | sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000, 128 * 10000} 30 | benchtools.TestCodec(New(), data, sizes) 31 | } 32 | --------------------------------------------------------------------------------