├── .gitignore
├── LICENSE
├── README.md
├── benchmark
    ├── benchmark.go
    ├── data
    │   ├── ip.txt.gz
    │   ├── lat.txt.gz
    │   └── ts.txt.gz
    └── results
    │   └── Benchmarking_Integer_Compression.xlsx
├── benchtools
    └── benchtools.go
├── bitlen.go
├── bitlen_386.s
├── bitlen_amd64.s
├── bitlen_arm.s
├── bitlen_decl.go
├── bitlen_gccgo.go
├── bitpacking
    ├── bitpacking.go
    └── delta_bitpacking.go
├── bp32
    ├── bp32.go
    └── bp32_test.go
├── composition
    ├── composition.go
    └── composition_test.go
├── cursor
    └── cursor.go
├── delta
    ├── bp32
    │   ├── bp32.go
    │   └── bp32_test.go
    ├── fastpfor
    │   ├── fastpfor.go
    │   └── fastpfor_test.go
    └── variablebyte
    │   ├── variablebyte.go
    │   └── variablebyte_test.go
├── fastpfor
    ├── fastpfor.go
    └── fastpfor_test.go
├── generators
    ├── generators.go
    └── generators_test.go
├── integer.go
├── util.go
├── variablebyte
    ├── variablebyte.go
    └── variablebyte_test.go
└── zigzag
    ├── bp32
        ├── bp32.go
        └── bp32_test.go
    └── fastpfor
        ├── fastpfor.go
        └── fastpfor_test.go


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
 2 | *.o
 3 | *.a
 4 | *.so
 5 | 
 6 | # Folders
 7 | _obj
 8 | _test
 9 | 
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 | 
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 | 
20 | _testmain.go
21 | 
22 | *.exe
23 | 
24 | .idea
25 | *.iml
26 | 
27 | *.swp
28 | *.un~
29 | 
30 | .DS_Store
31 | 
32 | Java*
33 | CPP*
34 | 
35 | *.pprof
36 | *.prof
37 | *.test
38 | *.out
39 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 | Version 2.0, January 2004
  3 | http://www.apache.org/licenses/
  4 | 
  5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 | 1. Definitions.
  8 | 
  9 | "License" shall mean the terms and conditions for use, reproduction, and
 10 | distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright
 13 | owner that is granting the License.
 14 | 
 15 | "Legal Entity" shall mean the union of the acting entity and all other entities
 16 | that control, are controlled by, or are under common control with that entity.
 17 | For the purposes of this definition, "control" means (i) the power, direct or
 18 | indirect, to cause the direction or management of such entity, whether by
 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
 20 | outstanding shares, or (iii) beneficial ownership of such entity.
 21 | 
 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising
 23 | permissions granted by this License.
 24 | 
 25 | "Source" form shall mean the preferred form for making modifications, including
 26 | but not limited to software source code, documentation source, and configuration
 27 | files.
 28 | 
 29 | "Object" form shall mean any form resulting from mechanical transformation or
 30 | translation of a Source form, including but not limited to compiled object code,
 31 | generated documentation, and conversions to other media types.
 32 | 
 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made
 34 | available under the License, as indicated by a copyright notice that is included
 35 | in or attached to the work (an example is provided in the Appendix below).
 36 | 
 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that
 38 | is based on (or derived from) the Work and for which the editorial revisions,
 39 | annotations, elaborations, or other modifications represent, as a whole, an
 40 | original work of authorship. For the purposes of this License, Derivative Works
 41 | shall not include works that remain separable from, or merely link (or bind by
 42 | name) to the interfaces of, the Work and Derivative Works thereof.
 43 | 
 44 | "Contribution" shall mean any work of authorship, including the original version
 45 | of the Work and any modifications or additions to that Work or Derivative Works
 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work
 47 | by the copyright owner or by an individual or Legal Entity authorized to submit
 48 | on behalf of the copyright owner. For the purposes of this definition,
 49 | "submitted" means any form of electronic, verbal, or written communication sent
 50 | to the Licensor or its representatives, including but not limited to
 51 | communication on electronic mailing lists, source code control systems, and
 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for
 53 | the purpose of discussing and improving the Work, but excluding communication
 54 | that is conspicuously marked or otherwise designated in writing by the copyright
 55 | owner as "Not a Contribution."
 56 | 
 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf
 58 | of whom a Contribution has been received by Licensor and subsequently
 59 | incorporated within the Work.
 60 | 
 61 | 2. Grant of Copyright License.
 62 | 
 63 | Subject to the terms and conditions of this License, each Contributor hereby
 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
 65 | irrevocable copyright license to reproduce, prepare Derivative Works of,
 66 | publicly display, publicly perform, sublicense, and distribute the Work and such
 67 | Derivative Works in Source or Object form.
 68 | 
 69 | 3. Grant of Patent License.
 70 | 
 71 | Subject to the terms and conditions of this License, each Contributor hereby
 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
 73 | irrevocable (except as stated in this section) patent license to make, have
 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where
 75 | such license applies only to those patent claims licensable by such Contributor
 76 | that are necessarily infringed by their Contribution(s) alone or by combination
 77 | of their Contribution(s) with the Work to which such Contribution(s) was
 78 | submitted. If You institute patent litigation against any entity (including a
 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a
 80 | Contribution incorporated within the Work constitutes direct or contributory
 81 | patent infringement, then any patent licenses granted to You under this License
 82 | for that Work shall terminate as of the date such litigation is filed.
 83 | 
 84 | 4. Redistribution.
 85 | 
 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof
 87 | in any medium, with or without modifications, and in Source or Object form,
 88 | provided that You meet the following conditions:
 89 | 
 90 | You must give any other recipients of the Work or Derivative Works a copy of
 91 | this License; and
 92 | You must cause any modified files to carry prominent notices stating that You
 93 | changed the files; and
 94 | You must retain, in the Source form of any Derivative Works that You distribute,
 95 | all copyright, patent, trademark, and attribution notices from the Source form
 96 | of the Work, excluding those notices that do not pertain to any part of the
 97 | Derivative Works; and
 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any
 99 | Derivative Works that You distribute must include a readable copy of the
100 | attribution notices contained within such NOTICE file, excluding those notices
101 | that do not pertain to any part of the Derivative Works, in at least one of the
102 | following places: within a NOTICE text file distributed as part of the
103 | Derivative Works; within the Source form or documentation, if provided along
104 | with the Derivative Works; or, within a display generated by the Derivative
105 | Works, if and wherever such third-party notices normally appear. The contents of
106 | the NOTICE file are for informational purposes only and do not modify the
107 | License. You may add Your own attribution notices within Derivative Works that
108 | You distribute, alongside or as an addendum to the NOTICE text from the Work,
109 | provided that such additional attribution notices cannot be construed as
110 | modifying the License.
111 | You may add Your own copyright statement to Your modifications and may provide
112 | additional or different license terms and conditions for use, reproduction, or
113 | distribution of Your modifications, or for any such Derivative Works as a whole,
114 | provided Your use, reproduction, and distribution of the Work otherwise complies
115 | with the conditions stated in this License.
116 | 
117 | 5. Submission of Contributions.
118 | 
119 | Unless You explicitly state otherwise, any Contribution intentionally submitted
120 | for inclusion in the Work by You to the Licensor shall be under the terms and
121 | conditions of this License, without any additional terms or conditions.
122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of
123 | any separate license agreement you may have executed with Licensor regarding
124 | such Contributions.
125 | 
126 | 6. Trademarks.
127 | 
128 | This License does not grant permission to use the trade names, trademarks,
129 | service marks, or product names of the Licensor, except as required for
130 | reasonable and customary use in describing the origin of the Work and
131 | reproducing the content of the NOTICE file.
132 | 
133 | 7. Disclaimer of Warranty.
134 | 
135 | Unless required by applicable law or agreed to in writing, Licensor provides the
136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
138 | including, without limitation, any warranties or conditions of TITLE,
139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
140 | solely responsible for determining the appropriateness of using or
141 | redistributing the Work and assume any risks associated with Your exercise of
142 | permissions under this License.
143 | 
144 | 8. Limitation of Liability.
145 | 
146 | In no event and under no legal theory, whether in tort (including negligence),
147 | contract, or otherwise, unless required by applicable law (such as deliberate
148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be
149 | liable to You for damages, including any direct, indirect, special, incidental,
150 | or consequential damages of any character arising as a result of this License or
151 | out of the use or inability to use the Work (including but not limited to
152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or
153 | any and all other commercial damages or losses), even if such Contributor has
154 | been advised of the possibility of such damages.
155 | 
156 | 9. Accepting Warranty or Additional Liability.
157 | 
158 | While redistributing the Work or Derivative Works thereof, You may choose to
159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or
160 | other liability obligations and/or rights consistent with this License. However,
161 | in accepting such obligations, You may act only on Your own behalf and on Your
162 | sole responsibility, not on behalf of any other Contributor, and only if You
163 | agree to indemnify, defend, and hold each Contributor harmless for any liability
164 | incurred by, or claims asserted against, such Contributor by reason of your
165 | accepting any such warranty or additional liability.
166 | 
167 | END OF TERMS AND CONDITIONS
168 | 
169 | APPENDIX: How to apply the Apache License to your work
170 | 
171 | To apply the Apache License to your work, attach the following boilerplate
172 | notice, with the fields enclosed by brackets "[]" replaced with your own
173 | identifying information. (Don't include the brackets!) The text should be
174 | enclosed in the appropriate comment syntax for the file format. We also
175 | recommend that a file or class name and description of purpose be included on
176 | the same "printed page" as the copyright notice for easier identification within
177 | third-party archives.
178 | 
179 |    Copyright [yyyy] [name of copyright owner]
180 | 
181 |    Licensed under the Apache License, Version 2.0 (the "License");
182 |    you may not use this file except in compliance with the License.
183 |    You may obtain a copy of the License at
184 | 
185 |      http://www.apache.org/licenses/LICENSE-2.0
186 | 
187 |    Unless required by applicable law or agreed to in writing, software
188 |    distributed under the License is distributed on an "AS IS" BASIS,
189 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 |    See the License for the specific language governing permissions and
191 |    limitations under the License.
192 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Encoding
2 | ========
3 | 
4 | This is a set of integer compression algorithms implemented in Go. It is an (incomplete) port of the JavaFastPFOR library by Dr. Daniel Lemire.
5 | 
6 | For more detailed benchmark results, please see http://zhen.org/blog/benchmarking-integer-compression-in-go/
7 | 


--------------------------------------------------------------------------------
/benchmark/benchmark.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package main
  8 | 
  9 | import (
 10 | 	"bufio"
 11 | 	"compress/gzip"
 12 | 	"flag"
 13 | 	"fmt"
 14 | 	"io/ioutil"
 15 | 	"log"
 16 | 	"os"
 17 | 	"runtime"
 18 | 	"runtime/pprof"
 19 | 	"strconv"
 20 | 	"strings"
 21 | 	"time"
 22 | 
 23 | 	"github.com/dataence/encoding"
 24 | 	"github.com/dataence/encoding/bp32"
 25 | 	"github.com/dataence/encoding/composition"
 26 | 	"github.com/dataence/encoding/cursor"
 27 | 	dbp32 "github.com/dataence/encoding/delta/bp32"
 28 | 	dfastpfor "github.com/dataence/encoding/delta/fastpfor"
 29 | 	dvb "github.com/dataence/encoding/delta/variablebyte"
 30 | 	"github.com/dataence/encoding/fastpfor"
 31 | 	"github.com/dataence/encoding/variablebyte"
 32 | 	zbp32 "github.com/dataence/encoding/zigzag/bp32"
 33 | 	zfastpfor "github.com/dataence/encoding/zigzag/fastpfor"
 34 | )
 35 | 
// paramList is a list of strings filled in by the flag package; it is the
// type behind the repeatable -file, -dir and -codec flags.
type paramList []string

var (
	// Raw values of the -file, -dir and -codec flags (registered in init).
	filesParam, dirsParam, codecsParam paramList
	// Value of the -pprof flag; passed to compress/uncompress as their prof argument.
	pprofParam                         bool
	// Input files to benchmark; assigned in main and indexed by testCodecs
	// when printing results.
	files                              []string
)
 43 | 
 44 | func (this *paramList) String() string {
 45 | 	return fmt.Sprint(*this)
 46 | }
 47 | 
 48 | func (this *paramList) Set(value string) error {
 49 | 	for _, f := range strings.Split(value, ",") {
 50 | 		*this = append(*this, f)
 51 | 	}
 52 | 
 53 | 	return nil
 54 | }
 55 | 
 56 | func init() {
 57 | 	flag.BoolVar(&pprofParam, "pprof", false, "Print result for individual files.")
 58 | 	flag.Var(&filesParam, "file", "The file containing one integer per line to encode. There can be multiple of this, or comma separated list.")
 59 | 	flag.Var(&dirsParam, "dir", "The directory containing a list of files with one integer per line. There can be multiple of this, or comma separated list.")
 60 | 	flag.Var(&codecsParam, "codec", "The codec to use: bp32, fastpfor, variablebyte, deltabp32, deltafastpfor, deltavariablebyte, zigzagbp32. There can be multiple of this, or comma separated list.")
 61 | }
 62 | 
 63 | func scanIntegers(s *bufio.Scanner) ([]int32, error) {
 64 | 	result := make([]int32, 0, 1000000)
 65 | 	for s.Scan() {
 66 | 		i, err := strconv.ParseUint(s.Text(), 10, 32)
 67 | 		if err != nil {
 68 | 			return nil, err
 69 | 		} else {
 70 | 			result = append(result, int32(i))
 71 | 		}
 72 | 	}
 73 | 
 74 | 	// Run the garbage collector to get rid of all the strings that's been allocated
 75 | 	// during the file read
 76 | 	runtime.GC()
 77 | 
 78 | 	return result, nil
 79 | 
 80 | }
 81 | 
 82 | func readIntegerFile(path string) ([]int32, error) {
 83 | 	file, err := os.Open(path)
 84 | 	if err != nil {
 85 | 		return nil, err
 86 | 	}
 87 | 	defer file.Close()
 88 | 
 89 | 	scanner := bufio.NewScanner(file)
 90 | 
 91 | 	return scanIntegers(scanner)
 92 | }
 93 | 
 94 | func readGzippedIntegerFile(path string) ([]int32, error) {
 95 | 	f, err := os.Open(path)
 96 | 	if err != nil {
 97 | 		return nil, err
 98 | 	}
 99 | 	defer f.Close()
100 | 
101 | 	gunzip, err := gzip.NewReader(f)
102 | 	if err != nil {
103 | 		return nil, err
104 | 	}
105 | 
106 | 	scanner := bufio.NewScanner(gunzip)
107 | 
108 | 	return scanIntegers(scanner)
109 | }
110 | 
// getDirOfFiles lists the entries of the directory at path and returns each
// entry name prefixed with path and a "/" separator. Entries are not filtered,
// so sub-directories are included as well.
func getDirOfFiles(path string) ([]string, error) {
	entries, err := ioutil.ReadDir(path)
	if err != nil {
		return nil, err
	}

	filenames := make([]string, 0, 10)
	for i := range entries {
		filenames = append(filenames, path+"/"+entries[i].Name())
	}

	return filenames, nil
}
125 | 
126 | func loadIntegerFromFiles(files []string) ([][]int32, int, error) {
127 | 	max := 0
128 | 	data := make([][]int32, 0, len(files))
129 | 
130 | 	for _, f := range files {
131 | 		var (
132 | 			res []int32
133 | 			err error
134 | 		)
135 | 
136 | 		log.Printf("Processing %s\n", f)
137 | 
138 | 		if strings.HasPrefix(f, "gz-") {
139 | 			res, err = readGzippedIntegerFile(strings.TrimPrefix(f, "gz-"))
140 | 		} else if strings.HasSuffix(f, ".gz") {
141 | 			res, err = readGzippedIntegerFile(f)
142 | 		} else {
143 | 			res, err = readIntegerFile(f)
144 | 		}
145 | 
146 | 		if err != nil {
147 | 			return nil, 0, err
148 | 		}
149 | 
150 | 		data = append(data, res)
151 | 
152 | 		if len(res) > max {
153 | 			max = len(res)
154 | 		}
155 | 	}
156 | 
157 | 	return data, max, nil
158 | }
159 | 
160 | func getListOfFiles() []string {
161 | 	files := make([]string, 0, 10)
162 | 
163 | 	for _, d := range dirsParam {
164 | 		res, err := getDirOfFiles(d)
165 | 		if err != nil {
166 | 			log.Fatal(err)
167 | 		}
168 | 
169 | 		files = append(files, res...)
170 | 	}
171 | 
172 | 	files = append(files, filesParam...)
173 | 
174 | 	return files
175 | }
176 | 
177 | func getListOfCodecs() (map[string]encoding.Integer, error) {
178 | 	codecs := make(map[string]encoding.Integer, 10)
179 | 
180 | 	for _, codec := range codecsParam {
181 | 		switch codec {
182 | 		case "bp32":
183 | 			codecs["bp32"] = composition.New(bp32.New(), variablebyte.New())
184 | 		case "fastpfor":
185 | 			codecs["fastpfor"] = composition.New(fastpfor.New(), variablebyte.New())
186 | 		case "variablebyte":
187 | 			codecs["variablebyte"] = variablebyte.New()
188 | 		case "deltabp32":
189 | 			codecs["delta bp32"] = composition.New(dbp32.New(), dvb.New())
190 | 		case "deltafastpfor":
191 | 			codecs["delta fastpfor"] = composition.New(dfastpfor.New(), dvb.New())
192 | 		case "deltavariablebyte":
193 | 			codecs["delta variablebyte"] = dvb.New()
194 | 		case "zigzagbp32":
195 | 			codecs["zigzag bp32"] = composition.New(zbp32.New(), dvb.New())
196 | 		case "zigzagfastpfor":
197 | 			codecs["zigzag fastpfor"] = composition.New(zfastpfor.New(), dvb.New())
198 | 		}
199 | 	}
200 | 
201 | 	if len(codecs) < 1 {
202 | 		return nil, fmt.Errorf("benchmark/getListOfCodecs: No codecs defined")
203 | 	}
204 | 
205 | 	return codecs, nil
206 | }
207 | 
208 | func compress(codec encoding.Integer, in, out []int32, length int, prof bool) (duration int64, ret []int32, err error) {
209 | 	inpos := cursor.New()
210 | 	outpos := cursor.New()
211 | 
212 | 	now := time.Now()
213 | 	if prof {
214 | 		f, e := os.Create("cpu.compress.pprof")
215 | 		if e != nil {
216 | 			log.Fatal(e)
217 | 		}
218 | 		defer f.Close()
219 | 
220 | 		pprof.StartCPUProfile(f)
221 | 	}
222 | 
223 | 	if err = codec.Compress(in, inpos, len(in), out, outpos); err != nil {
224 | 		return 0, nil, err
225 | 	}
226 | 	since := time.Since(now).Nanoseconds()
227 | 
228 | 	if prof {
229 | 		pprof.StopCPUProfile()
230 | 	}
231 | 
232 | 	return since, out[:outpos.Get()], nil
233 | }
234 | 
235 | func uncompress(codec encoding.Integer, in, out []int32, length int, prof bool) (duration int64, ret []int32, err error) {
236 | 	inpos := cursor.New()
237 | 	outpos := cursor.New()
238 | 
239 | 	if prof {
240 | 		f, e := os.Create("cpu.uncompress.pprof")
241 | 		if e != nil {
242 | 			log.Fatal(e)
243 | 		}
244 | 		defer f.Close()
245 | 
246 | 		pprof.StartCPUProfile(f)
247 | 	}
248 | 
249 | 	now := time.Now()
250 | 	if err = codec.Uncompress(in, inpos, len(in), out, outpos); err != nil {
251 | 		return 0, nil, err
252 | 	}
253 | 	since := time.Since(now).Nanoseconds()
254 | 
255 | 	if prof {
256 | 		pprof.StopCPUProfile()
257 | 	}
258 | 
259 | 	return since, out[:outpos.Get()], nil
260 | }
261 | 
262 | func testCodecs(codecs map[string]encoding.Integer, data [][]int32, max int, output bool) error {
263 | 	compdata := make([]int32, max+max/2)
264 | 	decompdata := make([]int32, max)
265 | 
266 | 	for name, codec := range codecs {
267 | 		for i, in := range data {
268 | 			k := len(in)
269 | 
270 | 			dur, out, err := compress(codec, in, compdata, k, pprofParam)
271 | 			if err != nil {
272 | 				return err
273 | 			}
274 | 
275 | 			dur2, out2, err2 := uncompress(codec, out, decompdata, k, pprofParam)
276 | 			if err2 != nil {
277 | 				return err2
278 | 			}
279 | 
280 | 			if output {
281 | 				fmt.Printf("% 20s % 20s: %5.2f %5.2f %5.2f\n", files[i], name, float64(len(out)*32)/float64(k), (float64(k) / (float64(dur) / 1000000000.0) / 1000000.0), (float64(k) / (float64(dur2) / 1000000000.0) / 1000000.0))
282 | 			}
283 | 
284 | 			for i := 0; i < k; i++ {
285 | 				if in[i] != decompdata[i] {
286 | 					return fmt.Errorf("benchmark/testCodecs: Problem recovering. index = %d, in = %d, recovered = %d, original length = %d, recovered length = %d\n", i, in[i], out2[i], k, len(out2))
287 | 				}
288 | 			}
289 | 
290 | 			runtime.GC()
291 | 		}
292 | 	}
293 | 
294 | 	return nil
295 | }
296 | 
297 | func main() {
298 | 	flag.Parse()
299 | 	files = getListOfFiles()
300 | 
301 | 	codecs, err := getListOfCodecs()
302 | 	if err != nil {
303 | 		log.Fatal(err)
304 | 	}
305 | 
306 | 	data, max, err := loadIntegerFromFiles(files)
307 | 	if err != nil {
308 | 		log.Fatal(err)
309 | 	}
310 | 
311 | 	if err := testCodecs(codecs, data, max, false); err != nil {
312 | 		log.Fatal(err)
313 | 	}
314 | 
315 | 	if err := testCodecs(codecs, data, max, false); err != nil {
316 | 		log.Fatal(err)
317 | 	}
318 | 
319 | 	if err := testCodecs(codecs, data, max, true); err != nil {
320 | 		log.Fatal(err)
321 | 	}
322 | }
323 | 


--------------------------------------------------------------------------------
/benchmark/data/ip.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zentures/encoding/b90e310a0325f9b765b4be7220df3642ad93ad8d/benchmark/data/ip.txt.gz


--------------------------------------------------------------------------------
/benchmark/data/lat.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zentures/encoding/b90e310a0325f9b765b4be7220df3642ad93ad8d/benchmark/data/lat.txt.gz


--------------------------------------------------------------------------------
/benchmark/data/ts.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zentures/encoding/b90e310a0325f9b765b4be7220df3642ad93ad8d/benchmark/data/ts.txt.gz


--------------------------------------------------------------------------------
/benchmark/results/Benchmarking_Integer_Compression.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zentures/encoding/b90e310a0325f9b765b4be7220df3642ad93ad8d/benchmark/results/Benchmarking_Integer_Compression.xlsx


--------------------------------------------------------------------------------
/benchtools/benchtools.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package benchtools
  8 | 
  9 | import (
 10 | 	"bytes"
 11 | 	"compress/gzip"
 12 | 	"compress/lzw"
 13 | 	"fmt"
 14 | 	"io"
 15 | 	"log"
 16 | 	"os"
 17 | 	"runtime/pprof"
 18 | 	"time"
 19 | 
 20 | 	"code.google.com/p/snappy-go/snappy"
 21 | 	"github.com/dataence/encoding"
 22 | 	"github.com/dataence/encoding/cursor"
 23 | )
 24 | 
 25 | func TestCodec(codec encoding.Integer, in []int32, sizes []int) {
 26 | 	for _, k := range sizes {
 27 | 		if k > len(in) {
 28 | 			continue
 29 | 		}
 30 | 
 31 | 		dur, out, err := Compress(codec, in[:k], k)
 32 | 		if err != nil {
 33 | 			log.Fatal(err)
 34 | 		}
 35 | 
 36 | 		dur2, out2, err2 := Uncompress(codec, out, k)
 37 | 		if err2 != nil {
 38 | 			log.Fatal(err2)
 39 | 		}
 40 | 
 41 | 		//log.Printf("benchtools/TestCodec: %f %.2f %.2f\n", float64(len(out)*32)/float64(k), (float64(k)/(float64(dur)/1000000000.0)/1000000.0), (float64(k)/(float64(dur2)/1000000000.0)/1000000.0))
 42 | 		fmt.Printf("%f %.2f %.2f\n", float64(len(out)*32)/float64(k), (float64(k) / (float64(dur) / 1000000000.0) / 1000000.0), (float64(k) / (float64(dur2) / 1000000000.0) / 1000000.0))
 43 | 
 44 | 		for i := 0; i < k; i++ {
 45 | 			if in[i] != out2[i] {
 46 | 				log.Fatalf("benchtools/TestCodec: Problem recovering. index = %d, in = %d, recovered = %d, original length = %d, recovered length = %d\n", i, in[i], out2[i], k, len(out2))
 47 | 			}
 48 | 		}
 49 | 	}
 50 | }
 51 | 
 52 | func PprofCodec(codec encoding.Integer, in []int32, sizes []int) {
 53 | 	for _, k := range sizes {
 54 | 		if k > len(in) {
 55 | 			continue
 56 | 		}
 57 | 
 58 | 		dur, out, err := PprofCompress(codec, in[:k], k)
 59 | 		if err != nil {
 60 | 			log.Fatal(err)
 61 | 		}
 62 | 
 63 | 		dur2, out2, err2 := PprofUncompress(codec, out, k)
 64 | 		if err2 != nil {
 65 | 			log.Fatal(err2)
 66 | 		}
 67 | 
 68 | 		log.Printf("benchtools/PprofCodec: %f %.2f %.2f\n", float64(len(out)*32)/float64(k), (float64(k) / (float64(dur) / 1000000000.0) / 1000000.0), (float64(k) / (float64(dur2) / 1000000000.0) / 1000000.0))
 69 | 
 70 | 		for i := 0; i < k; i++ {
 71 | 			if in[i] != out2[i] {
 72 | 				log.Fatalf("benchtools/PprofCodec: Problem recovering. index = %d, in = %d, recovered = %d, original length = %d, recovered length = %d\n", i, in[i], out2[i], k, len(out2))
 73 | 			}
 74 | 		}
 75 | 	}
 76 | }
 77 | 
 78 | func Compress(codec encoding.Integer, in []int32, length int) (duration int64, out []int32, err error) {
 79 | 	return RunCompress(codec, in, length, false)
 80 | }
 81 | 
 82 | func Uncompress(codec encoding.Integer, in []int32, length int) (duration int64, out []int32, err error) {
 83 | 	return RunUncompress(codec, in, length, false)
 84 | }
 85 | 
 86 | func PprofCompress(codec encoding.Integer, in []int32, length int) (duration int64, out []int32, err error) {
 87 | 	return RunCompress(codec, in, length, true)
 88 | }
 89 | 
 90 | func PprofUncompress(codec encoding.Integer, in []int32, length int) (duration int64, out []int32, err error) {
 91 | 	return RunUncompress(codec, in, length, true)
 92 | }
 93 | 
 94 | func RunCompress(codec encoding.Integer, in []int32, length int, prof bool) (duration int64, out []int32, err error) {
 95 | 	out = make([]int32, length*2)
 96 | 	inpos := cursor.New()
 97 | 	outpos := cursor.New()
 98 | 
 99 | 	now := time.Now()
100 | 	if prof {
101 | 		f, e := os.Create("cpu.compress.pprof")
102 | 		if e != nil {
103 | 			log.Fatal(e)
104 | 		}
105 | 		defer f.Close()
106 | 
107 | 		pprof.StartCPUProfile(f)
108 | 	}
109 | 
110 | 	if err = codec.Compress(in, inpos, len(in), out, outpos); err != nil {
111 | 		return 0, nil, err
112 | 	}
113 | 	since := time.Since(now).Nanoseconds()
114 | 
115 | 	if prof {
116 | 		pprof.StopCPUProfile()
117 | 	}
118 | 
119 | 	return since, out[:outpos.Get()], nil
120 | }
121 | 
122 | func RunUncompress(codec encoding.Integer, in []int32, length int, prof bool) (duration int64, out []int32, err error) {
123 | 	out = make([]int32, length)
124 | 	inpos := cursor.New()
125 | 	outpos := cursor.New()
126 | 
127 | 	if prof {
128 | 		f, e := os.Create("cpu.uncompress.pprof")
129 | 		if e != nil {
130 | 			log.Fatal(e)
131 | 		}
132 | 		defer f.Close()
133 | 
134 | 		pprof.StartCPUProfile(f)
135 | 	}
136 | 
137 | 	now := time.Now()
138 | 	if err = codec.Uncompress(in, inpos, len(in), out, outpos); err != nil {
139 | 		return 0, nil, err
140 | 	}
141 | 	since := time.Since(now).Nanoseconds()
142 | 
143 | 	if prof {
144 | 		pprof.StopCPUProfile()
145 | 	}
146 | 
147 | 	return since, out[:outpos.Get()], nil
148 | }
149 | 
// RunTestGzip compresses data with gzip, decompresses it again and logs the
// sizes and timings of both steps. The program is terminated if any step
// fails or the recovered bytes differ from data.
//
// Fixes over the previous version:
//   - the gzip writer is closed before the compressed size is taken and before
//     decoding; closing used to be deferred to function exit, so the stream
//     was incomplete when it was measured and read back;
//   - errors from Write, Close, NewReader and reading are no longer ignored;
//   - the log reports the number of bytes actually recovered, and the
//     decompression time no longer includes the compression time;
//   - the round trip is verified, as RunTestSnappy already does.
func RunTestGzip(data []byte) {
	log.Printf("encoding/RunTestGzip: Testing comprssion Gzip\n")

	var compressed bytes.Buffer
	w := gzip.NewWriter(&compressed)
	now := time.Now()
	if _, err := w.Write(data); err != nil {
		log.Fatalf("encoding/RunTestGzip: encoding error: %v\n", err)
	}
	// Close flushes buffered data and writes the gzip trailer.
	if err := w.Close(); err != nil {
		log.Fatalf("encoding/RunTestGzip: encoding error: %v\n", err)
	}

	cl := compressed.Len()
	log.Printf("encoding/RunTestGzip: Compressed from %d bytes to %d bytes in %d ns\n", len(data), cl, time.Since(now).Nanoseconds())

	recovered := make([]byte, len(data))
	now = time.Now()
	r, err := gzip.NewReader(&compressed)
	if err != nil {
		log.Fatalf("encoding/RunTestGzip: decoding error: %v\n", err)
	}
	defer r.Close()

	total, err := io.ReadFull(r, recovered)
	if err != nil {
		log.Fatalf("encoding/RunTestGzip: decoding error: %v\n", err)
	}
	log.Printf("encoding/RunTestGzip: Uncompressed from %d bytes to %d bytes in %d ns\n", cl, total, time.Since(now).Nanoseconds())

	if !bytes.Equal(data, recovered[:total]) {
		log.Fatalf("encoding/RunTestGzip: roundtrip mismatch\n")
	}
}
175 | 
// RunTestLZW compresses data with LZW (MSB order, 8-bit literals),
// decompresses it again and logs the sizes and timings of both steps. The
// program is terminated if any step fails or the recovered bytes differ from
// data.
//
// Fixes over the previous version:
//   - the LZW writer is closed before the compressed size is taken and before
//     decoding; closing used to be deferred to function exit, so buffered
//     output had not reached the buffer when it was measured and read back;
//   - errors from Write, Close and reading are no longer ignored;
//   - the log reports the number of bytes actually recovered, and the
//     decompression time no longer includes the compression time;
//   - the round trip is verified, as RunTestSnappy already does.
func RunTestLZW(data []byte) {
	log.Printf("encoding/RunTestLZW: Testing comprssion LZW\n")

	var compressed bytes.Buffer
	w := lzw.NewWriter(&compressed, lzw.MSB, 8)
	now := time.Now()
	if _, err := w.Write(data); err != nil {
		log.Fatalf("encoding/RunTestLZW: encoding error: %v\n", err)
	}
	// Close flushes any pending output to the buffer.
	if err := w.Close(); err != nil {
		log.Fatalf("encoding/RunTestLZW: encoding error: %v\n", err)
	}

	cl := compressed.Len()
	log.Printf("encoding/RunTestLZW: Compressed from %d bytes to %d bytes in %d ns\n", len(data), cl, time.Since(now).Nanoseconds())

	recovered := make([]byte, len(data))
	now = time.Now()
	r := lzw.NewReader(&compressed, lzw.MSB, 8)
	defer r.Close()

	total, err := io.ReadFull(r, recovered)
	if err != nil {
		log.Fatalf("encoding/RunTestLZW: decoding error: %v\n", err)
	}
	log.Printf("encoding/RunTestLZW: Uncompressed from %d bytes to %d bytes in %d ns\n", cl, total, time.Since(now).Nanoseconds())

	if !bytes.Equal(data, recovered[:total]) {
		log.Fatalf("encoding/RunTestLZW: roundtrip mismatch\n")
	}
}
201 | 
202 | func RunTestSnappy(data []byte) {
203 | 	log.Printf("encoding/RunTestSnappy: Testing comprssion Snappy\n")
204 | 
205 | 	now := time.Now()
206 | 	e, err := snappy.Encode(nil, data)
207 | 	if err != nil {
208 | 		log.Fatalf("encoding/RunTestSnappy: encoding error: %v\n", err)
209 | 	}
210 | 	log.Printf("encoding/RunTestSnappy: Compressed from %d bytes to %d bytes in %d ns\n", len(data), len(e), time.Since(now).Nanoseconds())
211 | 
212 | 	d, err := snappy.Decode(nil, e)
213 | 	if err != nil {
214 | 		log.Fatalf("encoding/RunTestSnappy: decoding error: %v\n", err)
215 | 	}
216 | 	log.Printf("encoding/RunTestSnappy: Uncompressed from %d bytes to %d bytes in %d ns\n", len(e), len(d), time.Since(now).Nanoseconds())
217 | 
218 | 	if !bytes.Equal(data, d) {
219 | 		log.Fatalf("encoding/RunTestSnappy: roundtrip mismatch\n")
220 | 	}
221 | }
222 | 


--------------------------------------------------------------------------------
/bitlen.go:
--------------------------------------------------------------------------------
1 | // +build !gccgo,!amd64,!386,!arm
2 | 
3 | // Built only when: (NOT gccgo) AND (NOT amd64) AND (NOT 386) AND (NOT arm)
4 | package encoding
5 | 
6 | func bitlen(x uint64) (n int) {
7 | 	return 32 - int(nlz1a(uint32(x)))
8 | }
9 | 


--------------------------------------------------------------------------------
/bitlen_386.s:
--------------------------------------------------------------------------------
 1 | // +build !gccgo
 2 | 
 3 | // func bitlen(x Word) (n int)
 4 | TEXT ·bitlen(SB),4,$0
 5 | 	BSRL x+0(FP), AX
 6 | 	JZ Z1
 7 | 	INCL AX
 8 | 	MOVL AX, n+4(FP)
 9 | 	RET
10 | 
11 | Z1:	MOVL $0, n+4(FP)
12 | 	RET
13 | 


--------------------------------------------------------------------------------
/bitlen_amd64.s:
--------------------------------------------------------------------------------
 1 | // +build !gccgo
 2 | 
// func bitlen(x uint64) (n int)
//
// Returns the number of bits needed to represent x, or 0 when x is 0.
TEXT ·bitlen(SB),4,$0
	// BSRQ yields the index of the highest set bit and sets ZF when the
	// source operand is zero.
	BSRQ x+0(FP), AX
	JZ Z1
	// Bit length = highest set bit index + 1.
	ADDQ $1, AX
	MOVQ AX, n+8(FP)
	RET

// Zero input: return 0 explicitly.
Z1:	MOVQ $0, n+8(FP)
	RET
13 | 


--------------------------------------------------------------------------------
/bitlen_arm.s:
--------------------------------------------------------------------------------
 1 | // +build !gccgo
 2 | 
 3 | // func bitlen(x Word) (n int)
 4 | TEXT ·bitlen(SB),4,$0
 5 |     MOVW    x+0(FP), R0
 6 |     CLZ     R0, R0
 7 |     MOVW    $32, R1
 8 |     SUB.S   R0, R1
 9 |     MOVW    R1, n+4(FP)
10 |     RET
11 | 


--------------------------------------------------------------------------------
/bitlen_decl.go:
--------------------------------------------------------------------------------
1 | // +build !gccgo
2 | // +build amd64 386 arm
3 | 
4 | package encoding
5 | 
// bitlen is implemented in assembly in bitlen_{amd64,386,arm}.s; the routines
// were adapted from pkg/math/big/arith_{amd64,386,arm}.s.
func bitlen(x uint64) (n int)
8 | 


--------------------------------------------------------------------------------
/bitlen_gccgo.go:
--------------------------------------------------------------------------------
 1 | // +build gccgo
 2 | 
 3 | package encoding
 4 | 
// clz is bound to the external symbol __clzdi2 through the gccgo //extern
// directive below. The directive replaces what is apparently the older
// spelling: func clz(uint64) uint64 __asm__("__clzdi2").
// bitlen treats the result as the number of leading zero bits of its argument.

//extern __clzdi2
func clz(uint64) uint64
 9 | 
10 | func bitlen(x uint64) (n int) {
11 | 	if x == 0 {
12 | 		return 0
13 | 	}
14 | 	return 64 - int(clz(x))
15 | }
16 | 


--------------------------------------------------------------------------------
/bp32/bp32.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | // Package bp32 is an implementation of the binary packing integer compression
  8 | // algorithm in Go (also known as PackedBinary) using 32-integer blocks.
  9 | // It is mostly suitable for arrays containing small positive integers.
 10 | // Given a list of sorted integers, you should first compute the successive
 11 | // differences prior to compression.
 12 | // For details, please see
 13 | // Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second
 14 | // through vectorization Software: Practice & Experience
 15 | // http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract or
 16 | //	http://arxiv.org/abs/1209.2137
 17 | package bp32
 18 | 
 19 | import (
 20 | 	"errors"
 21 | 
 22 | 	"github.com/dataence/encoding"
 23 | 	"github.com/dataence/encoding/bitpacking"
 24 | 	"github.com/dataence/encoding/cursor"
 25 | )
 26 | 
const (
	// DefaultBlockSize is the number of integers handled per iteration of
	// Compress/Uncompress: four 32-integer groups that share one header word.
	DefaultBlockSize = 128
)

// BP32 is the stateless binary-packing codec; all working state is passed in
// through the Compress/Uncompress arguments.
type BP32 struct {
}

// Compile-time check that *BP32 satisfies encoding.Integer.
var _ encoding.Integer = (*BP32)(nil)
 35 | 
 36 | func New() encoding.Integer {
 37 | 	return &BP32{}
 38 | }
 39 | 
 40 | func (this *BP32) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 41 | 
 42 | 	inlength = encoding.FloorBy(inlength, DefaultBlockSize)
 43 | 
 44 | 	if inlength == 0 {
 45 | 		return errors.New("BP32/Compress: block size less than 128. No work done.")
 46 | 	}
 47 | 
 48 | 	out[outpos.Get()] = int32(inlength)
 49 | 	outpos.Increment()
 50 | 
 51 | 	tmpoutpos := outpos.Get()
 52 | 	s := inpos.Get()
 53 | 	finalinpos := s + inlength
 54 | 
 55 | 	for ; s < finalinpos; s += DefaultBlockSize {
 56 | 		mbits1 := encoding.MaxBits(in[s : s+32])
 57 | 		mbits2 := encoding.MaxBits(in[s+32 : s+2*32])
 58 | 		mbits3 := encoding.MaxBits(in[s+2*32 : s+3*32])
 59 | 		mbits4 := encoding.MaxBits(in[s+3*32 : s+4*32])
 60 | 
 61 | 		out[tmpoutpos] = (mbits1 << 24) | (mbits2 << 16) | (mbits3 << 8) | mbits4
 62 | 		tmpoutpos += 1
 63 | 		bitpacking.FastPackWithoutMask(in, s, out, tmpoutpos, int(mbits1))
 64 | 		tmpoutpos += int(mbits1)
 65 | 		bitpacking.FastPackWithoutMask(in, s+32, out, tmpoutpos, int(mbits2))
 66 | 		tmpoutpos += int(mbits2)
 67 | 		bitpacking.FastPackWithoutMask(in, s+2*32, out, tmpoutpos, int(mbits3))
 68 | 		tmpoutpos += int(mbits3)
 69 | 		bitpacking.FastPackWithoutMask(in, s+3*32, out, tmpoutpos, int(mbits4))
 70 | 		tmpoutpos += int(mbits4)
 71 | 	}
 72 | 
 73 | 	inpos.Add(inlength)
 74 | 	outpos.Set(tmpoutpos)
 75 | 
 76 | 	return nil
 77 | }
 78 | 
 79 | func (this *BP32) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 80 | 	if inlength == 0 {
 81 | 		return errors.New("BP32/Uncompress: Length is 0. No work done.")
 82 | 	}
 83 | 
 84 | 	outlength := int(in[inpos.Get()])
 85 | 	inpos.Increment()
 86 | 
 87 | 	tmpinpos := inpos.Get()
 88 | 
 89 | 	for s := outpos.Get(); s < outpos.Get()+outlength; s += 32 * 4 {
 90 | 		tmp := in[tmpinpos]
 91 | 		mbits1 := tmp >> 24
 92 | 		mbits2 := (tmp >> 16) & 0xFF
 93 | 		mbits3 := (tmp >> 8) & 0xFF
 94 | 		mbits4 := (tmp) & 0xFF
 95 | 
 96 | 		tmpinpos += 1
 97 | 
 98 | 		bitpacking.FastUnpack(in, tmpinpos, out, s, int(mbits1))
 99 | 		tmpinpos += int(mbits1)
100 | 
101 | 		bitpacking.FastUnpack(in, tmpinpos, out, s+32, int(mbits2))
102 | 		tmpinpos += int(mbits2)
103 | 
104 | 		bitpacking.FastUnpack(in, tmpinpos, out, s+2*32, int(mbits3))
105 | 		tmpinpos += int(mbits3)
106 | 
107 | 		bitpacking.FastUnpack(in, tmpinpos, out, s+3*32, int(mbits4))
108 | 		tmpinpos += int(mbits4)
109 | 	}
110 | 
111 | 	outpos.Add(outlength)
112 | 	inpos.Set(tmpinpos)
113 | 
114 | 	return nil
115 | }
116 | 


--------------------------------------------------------------------------------
/bp32/bp32_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package bp32
 8 | 
 9 | import (
10 | 	"log"
11 | 	"testing"
12 | 
13 | 	"github.com/dataence/encoding/benchtools"
14 | 	"github.com/dataence/encoding/cursor"
15 | 	"github.com/dataence/encoding/generators"
16 | )
17 | 
var (
	// data is the shared test input, filled once by init.
	data []int32
	// size is the number of integers generated; 128000 equals the largest
	// size exercised by TestCodec (128 * 1000).
	size int = 128000
)

// init generates the shared clustered test data before any test runs,
// passing size as the count and size*2 as the second generator argument.
func init() {
	log.Printf("bp32/init: generating %d int32s\n", size)
	data = generators.GenerateClustered(size, size*2)
	log.Printf("bp32/init: generated %d integers for test", size)
}
28 | 
29 | func TestCodec(t *testing.T) {
30 | 	sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000}
31 | 	benchtools.TestCodec(New(), data, sizes)
32 | }
33 | 
34 | // go test -bench=Decode
35 | func BenchmarkDecode(b *testing.B) {
36 | 	b.StopTimer()
37 | 	length := 128 * 1024
38 | 	data := generators.GenerateClustered(length, 1<<24)
39 | 	compdata := make([]int32, 2*length)
40 | 	recov := make([]int32, length)
41 | 	inpos := cursor.New()
42 | 	outpos := cursor.New()
43 | 	codec := New()
44 | 	codec.Compress(data, inpos, len(data), compdata, outpos)
45 | 	b.StartTimer()
46 | 	for j := 0; j < b.N; j++ {
47 | 		newinpos := cursor.New()
48 | 		newoutpos := cursor.New()
49 | 		codec.Uncompress(compdata, newinpos, outpos.Get()-newinpos.Get(), recov, newoutpos)
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/composition/composition.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package composition
 8 | 
 9 | import (
10 | 	"errors"
11 | 
12 | 	"github.com/dataence/encoding"
13 | 	"github.com/dataence/encoding/cursor"
14 | )
15 | 
// Composition chains two integer codecs: f1 is run first and f2 is then run
// on whatever input f1 did not consume.
type Composition struct {
	f1 encoding.Integer
	f2 encoding.Integer
}

// Compile-time check that *Composition satisfies encoding.Integer.
var _ encoding.Integer = (*Composition)(nil)
22 | 
23 | func New(f1 encoding.Integer, f2 encoding.Integer) encoding.Integer {
24 | 	return &Composition{
25 | 		f1: f1,
26 | 		f2: f2,
27 | 	}
28 | }
29 | 
30 | func (this *Composition) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
31 | 	if inlength == 0 {
32 | 		return errors.New("composition/Compress: inlength = 0. No work done.")
33 | 	}
34 | 
35 | 	init := inpos.Get()
36 | 	this.f1.Compress(in, inpos, inlength, out, outpos)
37 | 	if outpos.Get() == 0 {
38 | 		out[0] = 0
39 | 		outpos.Increment()
40 | 	}
41 | 	//log.Printf("composition/Compress: f1 inpos = %d, outpos = %d, inlength = %d\n", inpos.Get(), outpos.Get(), inlength)
42 | 
43 | 	inlength -= inpos.Get() - init
44 | 	this.f2.Compress(in, inpos, inlength, out, outpos)
45 | 	//log.Printf("composition/Compress: f2 inpos = %d, outpos = %d, inlength = %d\n", inpos.Get(), outpos.Get(), inlength)
46 | 
47 | 	return nil
48 | }
49 | 
50 | func (this *Composition) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
51 | 	if inlength == 0 {
52 | 		return errors.New("composition/Uncompress: inlength = 0. No work done.")
53 | 	}
54 | 
55 | 	init := inpos.Get()
56 | 	this.f1.Uncompress(in, inpos, inlength, out, outpos)
57 | 	//log.Printf("composition/Uncompress: f1 inpos = %d, outpos = %d, inlength = %d\n", inpos.Get(), outpos.Get(), inlength)
58 | 	inlength -= inpos.Get() - init
59 | 	this.f2.Uncompress(in, inpos, inlength, out, outpos)
60 | 	//log.Printf("composition/Uncompress: f2 inpos = %d, outpos = %d, inlength = %d\n", inpos.Get(), outpos.Get(), inlength)
61 | 
62 | 	return nil
63 | }
64 | 


--------------------------------------------------------------------------------
/composition/composition_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package composition
 8 | 
 9 | import (
10 | 	"log"
11 | 	"testing"
12 | 
13 | 	"github.com/dataence/encoding"
14 | 	"github.com/dataence/encoding/benchtools"
15 | 	"github.com/dataence/encoding/bp32"
16 | 	dbp32 "github.com/dataence/encoding/delta/bp32"
17 | 	dvb "github.com/dataence/encoding/delta/variablebyte"
18 | 	"github.com/dataence/encoding/generators"
19 | 	"github.com/dataence/encoding/variablebyte"
20 | )
21 | 
var (
	// codec is declared for the tests in this package; the tests visible in
	// this file build their own codecs and do not assign it.
	codec encoding.Integer
	// data is the shared test input, filled once by init.
	data []int32
	// size is the number of integers generated; it equals the largest size
	// exercised by the tests (100 * 10000).
	size int = 10000000
)

// init generates the shared clustered test data before any test runs.
// The values produced are int32 (see the type of data), although the first
// log line says "uint32s".
func init() {
	log.Printf("composition_test/init: generating %d uint32s\n", size)
	data = generators.GenerateClustered(size, size*2)
	log.Printf("composition_test/init: generated %d integers for test", size)
}
33 | 
34 | func TestDeltaBP32andDeltaVariableByte(t *testing.T) {
35 | 	sizes := []int{100, 100 * 10, 100 * 100, 100 * 1000, 100 * 10000}
36 | 	benchtools.TestCodec(New(dbp32.New(), dvb.New()), data, sizes)
37 | }
38 | 
39 | func TestBP32andVariableByte(t *testing.T) {
40 | 	sizes := []int{100, 100 * 10, 100 * 100, 100 * 1000, 100 * 10000}
41 | 	benchtools.TestCodec(New(bp32.New(), variablebyte.New()), data, sizes)
42 | }
43 | 


--------------------------------------------------------------------------------
/cursor/cursor.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package cursor
 8 | 
 9 | type Cursor struct {
10 | 	value int
11 | }
12 | 
13 | func New() *Cursor {
14 | 	return &Cursor{
15 | 		value: 0,
16 | 	}
17 | }
18 | 
19 | func (this *Cursor) Get() int {
20 | 	return this.value
21 | }
22 | 
23 | func (this *Cursor) Set(i int) {
24 | 	this.value = i
25 | }
26 | 
27 | func (this *Cursor) Add(i int) {
28 | 	this.value += i
29 | }
30 | 
31 | func (this *Cursor) Increment() {
32 | 	this.value += 1
33 | }
34 | 


--------------------------------------------------------------------------------
/delta/bp32/bp32.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package bp32
  8 | 
  9 | import (
 10 | 	"errors"
 11 | 
 12 | 	"github.com/dataence/encoding"
 13 | 	"github.com/dataence/encoding/bitpacking"
 14 | 	"github.com/dataence/encoding/cursor"
 15 | )
 16 | 
const (
	// DefaultBlockSize is the number of integers handled per iteration of
	// Compress/Uncompress: four 32-integer groups that share one header word.
	DefaultBlockSize = 128
	// DefaultPageSize is not referenced by the code in this file.
	DefaultPageSize  = 65536
)

// BP32 is the stateless delta binary-packing codec; all working state is
// passed in through the Compress/Uncompress arguments.
type BP32 struct {
}

// Compile-time check that *BP32 satisfies encoding.Integer.
var _ encoding.Integer = (*BP32)(nil)
 26 | 
 27 | func New() encoding.Integer {
 28 | 	return &BP32{}
 29 | }
 30 | 
 31 | func (this *BP32) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 32 | 	//log.Printf("bp32/Compress: before inlength = %d\n", inlength)
 33 | 
 34 | 	inlength = encoding.FloorBy(inlength, DefaultBlockSize)
 35 | 
 36 | 	if inlength == 0 {
 37 | 		return errors.New("BP32/Compress: block size less than 128. No work done.")
 38 | 	}
 39 | 
 40 | 	//log.Printf("bp32/Compress: after inlength = %d, len(in) = %d\n", inlength, len(in))
 41 | 
 42 | 	out[outpos.Get()] = int32(inlength)
 43 | 	outpos.Increment()
 44 | 
 45 | 	tmpoutpos := outpos.Get()
 46 | 	initoffset := int32(0)
 47 | 	s := inpos.Get()
 48 | 	finalinpos := s + inlength
 49 | 
 50 | 	for ; s < finalinpos; s += DefaultBlockSize {
 51 | 		mbits1 := encoding.DeltaMaxBits(initoffset, in[s:s+32])
 52 | 		initoffset2 := in[s+31]
 53 | 		mbits2 := encoding.DeltaMaxBits(initoffset2, in[s+32:s+2*32])
 54 | 		initoffset3 := in[s+32+31]
 55 | 		mbits3 := encoding.DeltaMaxBits(initoffset3, in[s+2*32:s+3*32])
 56 | 		initoffset4 := in[s+2*32+31]
 57 | 		mbits4 := encoding.DeltaMaxBits(initoffset4, in[s+3*32:s+4*32])
 58 | 
 59 | 		//log.Printf("bp32/Compress: tmpoutpos = %d, s = %d\n", tmpoutpos, s)
 60 | 
 61 | 		out[tmpoutpos] = (mbits1 << 24) | (mbits2 << 16) | (mbits3 << 8) | mbits4
 62 | 		tmpoutpos += 1
 63 | 
 64 | 		//log.Printf("bp32/Compress: mbits1 = %d, mbits2 = %d, mbits3 = %d, mbits4 = %d, s = %d\n", mbits1, mbits2, mbits3, mbits4, out[tmpoutpos-1])
 65 | 
 66 | 		bitpacking.DeltaPack(initoffset, in, s, out, tmpoutpos, int(mbits1))
 67 | 		//encoding.PrintUint32sInBits(in, s, 32)
 68 | 		//encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits1))
 69 | 		tmpoutpos += int(mbits1)
 70 | 
 71 | 		bitpacking.DeltaPack(initoffset2, in, s+32, out, tmpoutpos, int(mbits2))
 72 | 		//encoding.PrintUint32sInBits(in, s+32, 32)
 73 | 		//encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits2))
 74 | 		tmpoutpos += int(mbits2)
 75 | 
 76 | 		bitpacking.DeltaPack(initoffset3, in, s+2*32, out, tmpoutpos, int(mbits3))
 77 | 		//encoding.PrintUint32sInBits(in, s+2*32, 32)
 78 | 		//encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits3))
 79 | 		tmpoutpos += int(mbits3)
 80 | 
 81 | 		bitpacking.DeltaPack(initoffset4, in, s+3*32, out, tmpoutpos, int(mbits4))
 82 | 		//encoding.PrintUint32sInBits(in, s+3*32, 32)
 83 | 		//encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits4))
 84 | 		tmpoutpos += int(mbits4)
 85 | 
 86 | 		initoffset = in[s+3*32+31]
 87 | 	}
 88 | 
 89 | 	inpos.Add(inlength)
 90 | 	outpos.Set(tmpoutpos)
 91 | 
 92 | 	return nil
 93 | }
 94 | 
 95 | func (this *BP32) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 96 | 	if inlength == 0 {
 97 | 		return errors.New("BP32/Uncompress: Length is 0. No work done.")
 98 | 	}
 99 | 
100 | 	outlength := in[inpos.Get()]
101 | 	inpos.Increment()
102 | 
103 | 	tmpinpos := inpos.Get()
104 | 	initoffset := int32(0)
105 | 
106 | 	//log.Printf("bp32/Uncompress: outlength = %d, inpos = %d, outpos = %d\n", outlength, inpos.Get(), outpos.Get())
107 | 	for s := outpos.Get(); s < outpos.Get()+int(outlength); s += 32 * 4 {
108 | 		tmp := in[tmpinpos]
109 | 		mbits1 := tmp >> 24
110 | 		mbits2 := (tmp >> 16) & 0xFF
111 | 		mbits3 := (tmp >> 8) & 0xFF
112 | 		mbits4 := (tmp) & 0xFF
113 | 
114 | 		//log.Printf("bp32/Uncopmress: mbits1 = %d, mbits2 = %d, mbits3 = %d, mbits4 = %d, s = %d\n", mbits1, mbits2, mbits3, mbits4, s)
115 | 		tmpinpos += 1
116 | 
117 | 		bitpacking.DeltaUnpack(initoffset, in, tmpinpos, out, s, int(mbits1))
118 | 		tmpinpos += int(mbits1)
119 | 		initoffset = out[s+31]
120 | 		//log.Printf("bp32/Uncompress: out = %v\n", out)
121 | 
122 | 		bitpacking.DeltaUnpack(initoffset, in, tmpinpos, out, s+32, int(mbits2))
123 | 		tmpinpos += int(mbits2)
124 | 		initoffset = out[s+32+31]
125 | 		//log.Printf("bp32/Uncompress: out = %v\n", out)
126 | 
127 | 		bitpacking.DeltaUnpack(initoffset, in, tmpinpos, out, s+2*32, int(mbits3))
128 | 		tmpinpos += int(mbits3)
129 | 		initoffset = out[s+2*32+31]
130 | 		//log.Printf("bp32/Uncompress: out = %v\n", out)
131 | 
132 | 		bitpacking.DeltaUnpack(initoffset, in, tmpinpos, out, s+3*32, int(mbits4))
133 | 		tmpinpos += int(mbits4)
134 | 		initoffset = out[s+3*32+31]
135 | 		//log.Printf("bp32/Uncompress: out = %v\n", out)
136 | 	}
137 | 
138 | 	outpos.Add(int(outlength))
139 | 	inpos.Set(tmpinpos)
140 | 
141 | 	return nil
142 | }
143 | 


--------------------------------------------------------------------------------
/delta/bp32/bp32_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package bp32
 8 | 
 9 | import (
10 | 	"log"
11 | 	"testing"
12 | 
13 | 	"github.com/dataence/encoding/benchtools"
14 | 	"github.com/dataence/encoding/generators"
15 | )
16 | 
var (
	// data is the shared test input, filled once by init.
	data []int32
	// size is the number of integers generated.
	size int = 12800000
)

// init generates the shared clustered test data before any test runs,
// passing size as the count and size*2 as the second generator argument.
func init() {
	log.Printf("bp32/init: generating %d int32s\n", size)
	data = generators.GenerateClustered(size, size*2)
	log.Printf("bp32/init: generated %d integers for test", size)
}
27 | 
28 | func TestCodec(t *testing.T) {
29 | 	sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000, 128 * 10000}
30 | 	benchtools.TestCodec(New(), data, sizes)
31 | }
32 | 


--------------------------------------------------------------------------------
/delta/fastpfor/fastpfor.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package fastpfor
  8 | 
  9 | import (
 10 | 	"errors"
 11 | 	"math"
 12 | 
 13 | 	"github.com/dataence/bytebuffer"
 14 | 	"github.com/dataence/encoding"
 15 | 	"github.com/dataence/encoding/bitpacking"
 16 | 	"github.com/dataence/encoding/cursor"
 17 | )
 18 | 
const (
	// DefaultBlockSize is the number of integers encoded per block.
	DefaultBlockSize     = 128
	// OverheadOfEachExcept is the per-exception cost term used by
	// getBestBFromData when pricing a candidate bit width.
	OverheadOfEachExcept = 8
	// DefaultPageSize is the page size New assigns to a codec; Compress and
	// Uncompress process at most this many integers per page.
	DefaultPageSize      = 65536
)

var (
	// zeroDataPointers and zeroFreqs are all-zero 33-entry slices that are
	// copied over the codec's working slices to reset them.
	zeroDataPointers []int32
	zeroFreqs        []int32
)

// init allocates the shared all-zero templates.
func init() {
	zeroDataPointers = make([]int32, 33)
	zeroFreqs = make([]int32, 33)
}
 34 | 
// FastPFOR is the delta FastPFOR codec. It carries reusable scratch buffers,
// which Compress and Uncompress overwrite on every call.
type FastPFOR struct {
	// dataToBePacked[i] collects the high bits of exception values whose
	// width difference (maxb - bestb) is i; index 0 is never allocated by New.
	dataToBePacked [33][]int32
	// byteContainer holds the per-block metadata bytes of the current page.
	byteContainer  *bytebuffer.ByteBuffer
	// pageSize is the maximum number of integers handled per page.
	pageSize       int32

	// Working area
	// dataPointers[i] is the number of entries used (encoding) or consumed
	// (decoding) in dataToBePacked[i].
	dataPointers []int32
	// freqs is the histogram filled by getBestBFromData.
	freqs        []int32
}

// Compile-time check that *FastPFOR satisfies encoding.Integer.
var _ encoding.Integer = (*FastPFOR)(nil)
 46 | 
 47 | func New() encoding.Integer {
 48 | 	f := &FastPFOR{
 49 | 		pageSize:      DefaultPageSize,
 50 | 		byteContainer: bytebuffer.NewByteBuffer(3*DefaultPageSize/DefaultBlockSize + DefaultPageSize),
 51 | 		dataPointers:  make([]int32, 33),
 52 | 		freqs:         make([]int32, 33),
 53 | 	}
 54 | 
 55 | 	for i := 1; i < 33; i++ {
 56 | 		f.dataToBePacked[i] = make([]int32, DefaultPageSize/32*4)
 57 | 	}
 58 | 
 59 | 	return f
 60 | }
 61 | 
 62 | func (this *FastPFOR) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 63 | 	inlength = encoding.FloorBy(inlength, DefaultBlockSize)
 64 | 
 65 | 	if inlength == 0 {
 66 | 		return errors.New("fastpfor/Compress: inlength = 0. No work done.")
 67 | 	}
 68 | 
 69 | 	out[outpos.Get()] = int32(inlength)
 70 | 	outpos.Increment()
 71 | 
 72 | 	initoffset := cursor.New()
 73 | 
 74 | 	copy(this.dataPointers, zeroDataPointers)
 75 | 	copy(this.freqs, zeroFreqs)
 76 | 
 77 | 	finalInpos := inpos.Get() + inlength
 78 | 
 79 | 	for inpos.Get() != finalInpos {
 80 | 		thissize := int(math.Min(float64(this.pageSize), float64(finalInpos-inpos.Get())))
 81 | 
 82 | 		if err := this.encodePage(in, inpos, thissize, out, outpos, initoffset); err != nil {
 83 | 			return errors.New("fastpfor/Compress: " + err.Error())
 84 | 		}
 85 | 	}
 86 | 
 87 | 	return nil
 88 | }
 89 | 
 90 | func (this *FastPFOR) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 91 | 	if inlength == 0 {
 92 | 		return errors.New("fastpfor/Uncompress: inlength = 0. No work done.")
 93 | 	}
 94 | 
 95 | 	mynvalue := in[inpos.Get()]
 96 | 	inpos.Increment()
 97 | 
 98 | 	initoffset := cursor.New()
 99 | 
100 | 	copy(this.dataPointers, zeroDataPointers)
101 | 
102 | 	finalout := outpos.Get() + int(mynvalue)
103 | 	for outpos.Get() != finalout {
104 | 		thissize := int(math.Min(float64(this.pageSize), float64(finalout-outpos.Get())))
105 | 
106 | 		if err := this.decodePage(in, inpos, out, outpos, thissize, initoffset); err != nil {
107 | 			return errors.New("fastpfor/Uncompress: " + err.Error())
108 | 		}
109 | 	}
110 | 	return nil
111 | }
112 | 
113 | // getBestBFromData determins the best bit position with the best cost of exceptions,
114 | // and the max bit position of the array of int32s
115 | func (this *FastPFOR) getBestBFromData(in []int32) (bestb int32, bestc int32, maxb int32) {
116 | 	copy(this.freqs, zeroFreqs)
117 | 
118 | 	// Get the count of all the leading bit positions for the slice
119 | 	// Mainly to figure out what's the best (most popular) bit position
120 | 	for _, v := range in {
121 | 		this.freqs[encoding.LeadingBitPosition(uint32(v))]++
122 | 	}
123 | 	//encoding.FastLeadingBitFrequency128(in, this.freqs)
124 | 
125 | 	bestb = 32
126 | 
127 | 	for this.freqs[bestb] == 0 {
128 | 		bestb--
129 | 	}
130 | 
131 | 	maxb = bestb
132 | 	bestCost := bestb * DefaultBlockSize
133 | 	var cexcept int32
134 | 	bestc = cexcept
135 | 
136 | 	// Find the cost of storing exceptions for each bit position
137 | 	for b := bestb - 1; b >= 0; b-- {
138 | 		cexcept += this.freqs[b+1]
139 | 		if cexcept < 0 {
140 | 			break
141 | 		}
142 | 
143 | 		// the extra 8 is the cost of storing maxbits
144 | 		thisCost := cexcept*OverheadOfEachExcept + cexcept*(maxb-b) + b*DefaultBlockSize + 8
145 | 
146 | 		if thisCost < bestCost {
147 | 			bestCost = thisCost
148 | 			bestb = b
149 | 			bestc = cexcept
150 | 		}
151 | 	}
152 | 
153 | 	return
154 | }
155 | 
// encodePage encodes one page of thissize values from in (starting at inpos)
// into out (starting at outpos).
//
// Page layout written to out:
//
//	[header]        distance from the header word to the metadata section
//	[packed blocks] for every 128-value block, 4 groups packed at bestb bits
//	[bytesize]      metadata length in bytes (padded to a multiple of 4)
//	[metadata]      per block: bestb, bestc and, if bestc > 0, maxb followed
//	                by the in-block position of every exception
//	[bitmap]        bit k-1 set when exception buffer k is non-empty
//	[exceptions]    for each non-empty buffer k: its entry count, then its
//	                contents packed at k bits in groups of 32
//
// Values are delta-encoded against initoffset, which is updated to the last
// raw value of each block so the chain continues into the next block/page.
// On return inpos is past the last block consumed and outpos is past the last
// word written. The function always returns nil.
func (this *FastPFOR) encodePage(in []int32, inpos *cursor.Cursor, thissize int, out []int32, outpos *cursor.Cursor, initoffset *cursor.Cursor) error {
	// Reserve the header word; it is filled in once the packed section's
	// length is known.
	headerpos := int32(outpos.Get())
	outpos.Increment()
	tmpoutpos := int32(outpos.Get())

	// Clear working area
	copy(this.dataPointers, zeroDataPointers)
	this.byteContainer.Clear()

	tmpinpos := int32(inpos.Get())
	var delta [DefaultBlockSize]int32

	// Only whole blocks are processed: the loop stops once fewer than
	// DefaultBlockSize values remain in the page.
	for finalInpos := tmpinpos + int32(thissize) - DefaultBlockSize; tmpinpos <= finalInpos; tmpinpos += DefaultBlockSize {

		// Calculate the deltas, inlining to gain a bit of performance
		offset := int32(initoffset.Get())
		for i, v := range in[tmpinpos : tmpinpos+DefaultBlockSize] {
			delta[i] = v - offset
			offset = v
		}

		// Seed for the next block: the last raw value of this one.
		initoffset.Set(int(in[tmpinpos+DefaultBlockSize-1]))

		bestb, bestc, maxb := this.getBestBFromData(delta[:])
		tmpbestb := bestb
		this.byteContainer.Put(byte(bestb))
		this.byteContainer.Put(byte(bestc))

		if bestc > 0 {
			this.byteContainer.Put(byte(maxb))
			// Exceptions are grouped by how many bits they need above bestb.
			index := maxb - bestb
			// Grow the exception buffer when this block's exceptions would
			// fill it.
			if int(this.dataPointers[index]+bestc) >= len(this.dataToBePacked[index]) {
				newSize := int(2 * (this.dataPointers[index] + bestc))

				// make sure it is a multiple of 32.
				// there might be a better way to do this
				newSize = encoding.CeilBy(newSize, 32)
				newSlice := make([]int32, newSize)
				copy(newSlice, this.dataToBePacked[index])
				this.dataToBePacked[index] = newSlice
			}

			for k := int32(0); k < DefaultBlockSize; k++ {
				if uint32(delta[k])>>uint(bestb) != 0 {
					// we have an exception
					// Record its position and stash the bits above bestb.
					this.byteContainer.Put(byte(k))
					this.dataToBePacked[index][this.dataPointers[index]] = int32(uint32(delta[k]) >> uint(tmpbestb))
					this.dataPointers[index] += 1
				}
			}
		}

		// Pack the block as four groups of 32 at bestb bits; each group
		// advances the output by bestb words.
		// NOTE(review): this relies on FastPack keeping only the low bestb
		// bits of each value - confirm against bitpacking.FastPack.
		for k := int32(0); k < 128; k += 32 {
			bitpacking.FastPack(delta[:], int(k), out, int(tmpoutpos), int(tmpbestb))
			tmpoutpos += tmpbestb
		}
	}

	inpos.Set(int(tmpinpos))
	// Header: offset from the header word to the metadata section.
	out[headerpos] = tmpoutpos - headerpos

	// Pad the metadata to a whole number of 32-bit words.
	for this.byteContainer.Position()&3 != 0 {
		this.byteContainer.Put(0)
	}

	bytesize := int32(this.byteContainer.Position())
	out[tmpoutpos] = bytesize
	tmpoutpos += 1
	howmanyints := bytesize / 4

	// Copy the metadata bytes into out as int32 words.
	// NOTE(review): the error returned by GetInt32s is discarded here.
	this.byteContainer.Flip()
	this.byteContainer.AsInt32Buffer().GetInt32s(out, int(tmpoutpos), int(howmanyints))
	tmpoutpos += howmanyints

	// Bitmap of which exception buffers hold data.
	bitmap := int32(0)
	for k := 1; k <= 32; k++ {
		v := this.dataPointers[k]
		if v != 0 {
			bitmap |= (1 << uint(k-1))
		}
	}

	out[tmpoutpos] = bitmap
	tmpoutpos += 1

	// Emit every non-empty exception buffer: count, then data packed at
	// k bits in groups of 32.
	for k := 1; k < 33; k++ {
		v := this.dataPointers[k]
		if v != 0 {
			out[tmpoutpos] = v // size
			tmpoutpos += 1
			for j := 0; j < int(v); j += 32 {
				bitpacking.FastPack(this.dataToBePacked[k], j, out, int(tmpoutpos), k)
				tmpoutpos += int32(k)
			}
		}
	}

	outpos.Set(int(tmpoutpos))

	return nil
}
257 | 
// decodePage decodes one page produced by encodePage, writing thissize values
// to out starting at outpos.
//
// It first follows the header word to the metadata section, loads the
// metadata bytes into byteContainer and unpacks every exception buffer
// flagged in the bitmap. It then walks the packed blocks: each block is
// unpacked at its bestb width, exceptions are OR-ed back in above bestb, and
// the deltas are summed (seeded by initoffset) to recover the original values.
//
// On return outpos has advanced past the decoded values and inpos points just
// past the page's exception data. Errors from the byte buffer are returned
// unchanged.
func (this *FastPFOR) decodePage(in []int32, inpos *cursor.Cursor, out []int32, outpos *cursor.Cursor, thissize int, initoffset *cursor.Cursor) error {
	// Header word: offset from here to the metadata section.
	initpos := int32(inpos.Get())
	wheremeta := in[initpos]
	inpos.Increment()

	inexcept := initpos + wheremeta
	bytesize := in[inexcept]
	inexcept += 1

	// Load the per-block metadata bytes.
	this.byteContainer.Clear()
	if err := this.byteContainer.AsInt32Buffer().PutInt32s(in, int(inexcept), int(bytesize/4)); err != nil {
		return err
	}

	inexcept += bytesize / 4
	bitmap := in[inexcept]
	inexcept += 1

	// Unpack every exception buffer flagged in the bitmap.
	for k := int32(1); k < 33; k++ {
		if bitmap&(1<<uint32(k-1)) != 0 {
			size := in[inexcept]
			inexcept += 1

			// Make room; lengths are kept at multiples of 32 because
			// unpacking works in groups of 32.
			if int32(len(this.dataToBePacked[k])) < size {
				this.dataToBePacked[k] = make([]int32, encoding.CeilBy(int(size), 32))
			}

			for j := int32(0); j < size; j += 32 {
				bitpacking.FastUnpack(in, int(inexcept), this.dataToBePacked[k], int(j), int(k))
				inexcept += k
			}
		}
	}

	// dataPointers now tracks how many exceptions have been consumed per buffer.
	copy(this.dataPointers, zeroDataPointers)
	tmpoutpos := int32(outpos.Get())
	tmpinpos := int32(inpos.Get())

	delta := make([]int32, DefaultBlockSize)

	run := 0
	run_end := thissize / DefaultBlockSize
	for run < run_end {
		var err error
		// Per-block metadata: packed width, then exception count.
		var bestb int32
		if bestb, err = this.byteContainer.GetAsInt32(); err != nil {
			return err
		}

		var cexcept int32
		if cexcept, err = this.byteContainer.GetAsInt32(); err != nil {
			return err
		}

		// Unpack the four 32-value groups of this block into delta.
		for k := int32(0); k < 128; k += 32 {
			//bitpacking.FastUnpack(in, int(tmpinpos), out, int(tmpoutpos+k), int(bestb))
			bitpacking.FastUnpack(in, int(tmpinpos), delta, int(k), int(bestb))
			tmpinpos += bestb
		}

		if cexcept > 0 {
			var maxbits int32
			if maxbits, err = this.byteContainer.GetAsInt32(); err != nil {
				return err
			}

			// Same grouping as the encoder: buffer index is maxb - bestb.
			index := maxbits - bestb

			for k := int32(0); k < cexcept; k++ {
				var pos int32
				if pos, err = this.byteContainer.GetAsInt32(); err != nil {
					return err
				}

				// Restore the high bits of the exception at position pos.
				exceptvalue := this.dataToBePacked[index][this.dataPointers[index]]
				this.dataPointers[index] += 1
				//out[pos + tmpoutpos] |= exceptvalue << uint(bestb)
				delta[pos] |= exceptvalue << uint(bestb)
			}
		}

		// Calculate the original from the deltas, inlining to gain a bit of performance
		offset := int32(initoffset.Get())
		for i, v := range delta {
			out[int(tmpoutpos)+i] = v + offset
			offset += v
		}

		// Seed for the next block: the last value just reconstructed.
		initoffset.Set(int(out[tmpoutpos+DefaultBlockSize-1]))

		run += 1
		tmpoutpos += DefaultBlockSize
	}

	outpos.Set(int(tmpoutpos))
	// The page ends after the exception data, not after the packed blocks.
	inpos.Set(int(inexcept))

	return nil
}
357 | 


--------------------------------------------------------------------------------
/delta/fastpfor/fastpfor_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package fastpfor
 8 | 
 9 | import (
10 | 	"log"
11 | 	"testing"
12 | 
13 | 	"github.com/dataence/encoding/benchtools"
14 | 	"github.com/dataence/encoding/generators"
15 | )
16 | 
var (
	// data is the shared test input, filled once by init.
	data []int32
	// size is the number of integers generated.
	size int = 12800000
)

// init generates the shared clustered test data before any test runs.
// The log messages carry a "bp32/init" prefix even though this file belongs
// to the fastpfor package.
func init() {
	log.Printf("bp32/init: generating %d int32s\n", size)
	data = generators.GenerateClustered(size, size*2)
	log.Printf("bp32/init: generated %d integers for test", size)
}
27 | 
28 | func TestCodec(t *testing.T) {
29 | 	sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000, 128 * 10000}
30 | 	benchtools.TestCodec(New(), data, sizes)
31 | }
32 | 


--------------------------------------------------------------------------------
/delta/variablebyte/variablebyte.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package variablebyte
  8 | 
  9 | import (
 10 | 	"errors"
 11 | 
 12 | 	"github.com/dataence/bytebuffer"
 13 | 	"github.com/dataence/encoding"
 14 | 	"github.com/dataence/encoding/cursor"
 15 | )
 16 | 
 17 | type VariableByte struct {
 18 | }
 19 | 
 20 | var _ encoding.Integer = (*VariableByte)(nil)
 21 | 
 22 | func New() encoding.Integer {
 23 | 	return &VariableByte{}
 24 | }
 25 | 
 26 | func (this *VariableByte) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 27 | 	if inlength == 0 {
 28 | 		return errors.New("variablebyte/Compress: inlength = 0. No work done.")
 29 | 	}
 30 | 
 31 | 	//fmt.Printf("variablebyte/Compress: after inlength = %d\n", inlength)
 32 | 
 33 | 	buf := bytebuffer.NewByteBuffer(inlength * 8)
 34 | 	initoffset := int32(0)
 35 | 
 36 | 	tmpinpos := inpos.Get()
 37 | 	for _, v := range in[tmpinpos : tmpinpos+inlength] {
 38 | 		val := uint32(v - initoffset)
 39 | 		initoffset = v
 40 | 
 41 | 		for val >= 0x80 {
 42 | 			buf.Put(byte(val) | 0x80)
 43 | 			val >>= 7
 44 | 		}
 45 | 		buf.Put(byte(val))
 46 | 	}
 47 | 
 48 | 	for buf.Position()%4 != 0 {
 49 | 		//fmt.Printf("variablebyte/Compress: putting 128\n")
 50 | 		buf.Put(128)
 51 | 	}
 52 | 
 53 | 	length := buf.Position()
 54 | 	buf.Flip()
 55 | 	ibuf := buf.AsInt32Buffer()
 56 | 	//fmt.Printf("variablebyte/Compress: l = %d, outpos = %d, ibuf = %v, buf = %v\n", length/4, outpos.Get(), ibuf, buf)
 57 | 	err := ibuf.GetInt32s(out, outpos.Get(), length/4)
 58 | 	if err != nil {
 59 | 		//fmt.Printf("variablebyte/Compress: error with GetUint32s - %v\n", err)
 60 | 		return err
 61 | 	}
 62 | 	outpos.Add(length / 4)
 63 | 	inpos.Add(inlength)
 64 | 	//fmt.Printf("variablebyte/Compress: out = %v\n", out)
 65 | 
 66 | 	return nil
 67 | }
 68 | 
 69 | func (this *VariableByte) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 70 | 	if inlength == 0 {
 71 | 		return errors.New("variablebyte/Uncompress: inlength = 0. No work done.")
 72 | 	}
 73 | 
 74 | 	//fmt.Printf("variablebyte/Uncompress: after inlength = %d\n", inlength)
 75 | 
 76 | 	s := uint(0)
 77 | 	p := inpos.Get()
 78 | 	finalp := inpos.Get() + inlength
 79 | 	tmpoutpos := outpos.Get()
 80 | 	initoffset := int32(0)
 81 | 	v := int32(0)
 82 | 	shift := uint(0)
 83 | 
 84 | 	for p < finalp {
 85 | 		c := in[p] >> (24 - s)
 86 | 		s += 8
 87 | 
 88 | 		if s == 32 {
 89 | 			s = 0
 90 | 			p += 1
 91 | 		}
 92 | 
 93 | 		v += ((c & 127) << shift)
 94 | 		if c&128 == 0 {
 95 | 			out[tmpoutpos] = v + initoffset
 96 | 			initoffset = out[tmpoutpos]
 97 | 			tmpoutpos += 1
 98 | 			v = 0
 99 | 			shift = 0
100 | 		} else {
101 | 			shift += 7
102 | 		}
103 | 
104 | 		outpos.Set(tmpoutpos)
105 | 		inpos.Add(inlength)
106 | 	}
107 | 
108 | 	return nil
109 | }
110 | 


--------------------------------------------------------------------------------
/delta/variablebyte/variablebyte_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package variablebyte
 8 | 
 9 | import (
10 | 	"log"
11 | 	"testing"
12 | 
13 | 	"github.com/dataence/encoding/benchtools"
14 | 	"github.com/dataence/encoding/generators"
15 | )
16 | 
17 | var (
18 | 	data []int32
19 | 	size int = 12800000
20 | )
21 | 
22 | func init() {
23 | 	log.Printf("bp32/init: generating %d int32s\n", size)
24 | 	data = generators.GenerateClustered(size, size*2)
25 | 	log.Printf("bp32/init: generated %d integers for test", size)
26 | }
27 | 
28 | func TestCodec(t *testing.T) {
29 | 	sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000, 128 * 10000}
30 | 	benchtools.TestCodec(New(), data, sizes)
31 | }
32 | 


--------------------------------------------------------------------------------
/fastpfor/fastpfor.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | // Package fastpfor is an implementation of the fastpfor integer compression
  8 | // algorithm in Go.
  9 | // It is mostly suitable for arrays containing small positive integers.
 10 | // Given a list of sorted integers, you should first compute the successive
 11 | // differences prior to compression.
 12 | // For details, please see
 13 | // Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second
 14 | // through vectorization Software: Practice & Experience
 15 | // http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract or
 16 | //	http://arxiv.org/abs/1209.2137
 17 | package fastpfor
 18 | 
 19 | import (
 20 | 	"errors"
 21 | 	"math"
 22 | 
 23 | 	"github.com/dataence/bytebuffer"
 24 | 	"github.com/dataence/encoding"
 25 | 	"github.com/dataence/encoding/bitpacking"
 26 | 	"github.com/dataence/encoding/cursor"
 27 | )
 28 | 
 29 | const (
 30 | 	DefaultBlockSize     = 128
 31 | 	OverheadOfEachExcept = 8
 32 | 	DefaultPageSize      = 65536
 33 | )
 34 | 
 35 | var (
 36 | 	zeroDataPointers []int32
 37 | 	zeroFreqs        []int32
 38 | )
 39 | 
 40 | func init() {
 41 | 	zeroDataPointers = make([]int32, 33)
 42 | 	zeroFreqs = make([]int32, 33)
 43 | }
 44 | 
 45 | // FastPFOR codec structure: this is not thread-safe (need one per thread)
 46 | type FastPFOR struct {
 47 | 	dataToBePacked [33][]int32
 48 | 	byteContainer  *bytebuffer.ByteBuffer
 49 | 	pageSize       int32
 50 | 
 51 | 	// Working area
 52 | 	dataPointers []int32
 53 | 	freqs        []int32
 54 | }
 55 | 
 56 | var _ encoding.Integer = (*FastPFOR)(nil)
 57 | 
 58 | func New() encoding.Integer {
 59 | 	f := &FastPFOR{
 60 | 		pageSize:      DefaultPageSize,
 61 | 		byteContainer: bytebuffer.NewByteBuffer(3*DefaultPageSize/DefaultBlockSize + DefaultPageSize),
 62 | 		dataPointers:  make([]int32, 33),
 63 | 		freqs:         make([]int32, 33),
 64 | 	}
 65 | 
 66 | 	for i := 1; i < 33; i++ {
 67 | 		f.dataToBePacked[i] = make([]int32, DefaultPageSize/32*4)
 68 | 	}
 69 | 
 70 | 	return f
 71 | }
 72 | 
 73 | func (this *FastPFOR) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 74 | 	inlength = encoding.FloorBy(inlength, DefaultBlockSize)
 75 | 	if inlength == 0 {
 76 | 		return errors.New("fastpfor/Compress: inlength = 0. No work done.")
 77 | 	}
 78 | 	out[outpos.Get()] = int32(inlength)
 79 | 	outpos.Increment()
 80 | 
 81 | 	copy(this.dataPointers, zeroDataPointers)
 82 | 	copy(this.freqs, zeroFreqs)
 83 | 
 84 | 	finalInpos := inpos.Get() + inlength
 85 | 
 86 | 	for inpos.Get() != finalInpos {
 87 | 		thissize := int(math.Min(float64(this.pageSize), float64(finalInpos-inpos.Get())))
 88 | 		if err := this.encodePage(in, inpos, thissize, out, outpos); err != nil {
 89 | 			return errors.New("fastpfor/Compress: " + err.Error())
 90 | 		}
 91 | 	}
 92 | 
 93 | 	return nil
 94 | }
 95 | 
 96 | func (this *FastPFOR) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 97 | 	if inlength == 0 {
 98 | 		return errors.New("fastpfor/Uncompress: inlength = 0. No work done.")
 99 | 	}
100 | 
101 | 	mynvalue := in[inpos.Get()]
102 | 	inpos.Increment()
103 | 
104 | 	copy(this.dataPointers, zeroDataPointers)
105 | 
106 | 	finalout := outpos.Get() + int(mynvalue)
107 | 	for outpos.Get() != finalout {
108 | 		thissize := int(math.Min(float64(this.pageSize), float64(finalout-outpos.Get())))
109 | 		if err := this.decodePage(in, inpos, out, outpos, thissize); err != nil {
110 | 			return errors.New("fastpfor/Uncompress: " + err.Error())
111 | 		}
112 | 	}
113 | 	return nil
114 | }
115 | 
116 | // getBestBFromData determins the best bit position with the best cost of exceptions,
117 | // and the max bit position of the array of int32s
118 | func (this *FastPFOR) getBestBFromData(in []int32) (bestb int32, bestc int32, maxb int32) {
119 | 	copy(this.freqs, zeroFreqs)
120 | 	// Get the count of all the leading bit positionsfor the slice
121 | 	// Mainly to figure out what's the best (most popular) bit position
122 | 	//for _, v := range in[k:kEnd] {
123 | 	for _, v := range in {
124 | 		this.freqs[encoding.LeadingBitPosition(uint32(v))]++
125 | 	}
126 | 	bestb = 32
127 | 	for this.freqs[bestb] == 0 {
128 | 		bestb--
129 | 	}
130 | 	maxb = bestb
131 | 	bestCost := bestb * DefaultBlockSize
132 | 	var cexcept int32
133 | 	bestc = cexcept
134 | 	// Find the cost of storing exceptions for each bit position
135 | 	for b := bestb - 1; b >= 0; b-- {
136 | 		cexcept += this.freqs[b+1]
137 | 		if cexcept < 0 {
138 | 			break
139 | 		}
140 | 		// the extra 8 is the cost of storing maxbits
141 | 		thisCost := cexcept*OverheadOfEachExcept + cexcept*(maxb-b) + b*DefaultBlockSize + 8
142 | 		if thisCost < bestCost {
143 | 			bestCost = thisCost
144 | 			bestb = b
145 | 			bestc = cexcept
146 | 		}
147 | 	}
148 | 	return
149 | }
150 | 
151 | func (this *FastPFOR) encodePage(in []int32, inpos *cursor.Cursor, thissize int, out []int32, outpos *cursor.Cursor) error {
152 | 	headerpos := int32(outpos.Get())
153 | 	outpos.Increment()
154 | 	tmpoutpos := int32(outpos.Get())
155 | 
156 | 	// Clear working area
157 | 	copy(this.dataPointers, zeroDataPointers)
158 | 	this.byteContainer.Clear()
159 | 
160 | 	tmpinpos := int32(inpos.Get())
161 | 
162 | 	for finalInpos := tmpinpos + int32(thissize) - DefaultBlockSize; tmpinpos <= finalInpos; tmpinpos += DefaultBlockSize {
163 | 		bestb, bestc, maxb := this.getBestBFromData(in[tmpinpos : tmpinpos+DefaultBlockSize])
164 | 		tmpbestb := bestb
165 | 		this.byteContainer.Put(byte(bestb))
166 | 		this.byteContainer.Put(byte(bestc))
167 | 
168 | 		if bestc > 0 {
169 | 			this.byteContainer.Put(byte(maxb))
170 | 			index := maxb - bestb
171 | 			if int(this.dataPointers[index]+bestc) >= len(this.dataToBePacked[index]) {
172 | 				newSize := int(2 * (this.dataPointers[index] + bestc))
173 | 				// make sure it is a multiple of 32.
174 | 				// there might be a better way to do this
175 | 				newSize = encoding.CeilBy(newSize, 32)
176 | 				newSlice := make([]int32, newSize)
177 | 				copy(newSlice, this.dataToBePacked[index])
178 | 				this.dataToBePacked[index] = newSlice
179 | 			}
180 | 
181 | 			for k := int32(0); k < DefaultBlockSize; k++ {
182 | 				if uint32(in[k+tmpinpos])>>uint(bestb) != 0 {
183 | 					// we have an exception
184 | 					this.byteContainer.Put(byte(k))
185 | 					this.dataToBePacked[index][this.dataPointers[index]] = int32(uint32(in[k+tmpinpos]) >> uint(tmpbestb))
186 | 					this.dataPointers[index] += 1
187 | 				}
188 | 			}
189 | 		}
190 | 
191 | 		for k := int32(0); k < 128; k += 32 {
192 | 			bitpacking.FastPack(in, int(tmpinpos+k), out, int(tmpoutpos), int(tmpbestb))
193 | 			tmpoutpos += tmpbestb
194 | 		}
195 | 	}
196 | 
197 | 	inpos.Set(int(tmpinpos))
198 | 	out[headerpos] = tmpoutpos - headerpos
199 | 	bytesize := int32(this.byteContainer.Position())
200 | 	for this.byteContainer.Position()&3 != 0 {
201 | 		this.byteContainer.Put(0)
202 | 	}
203 | 
204 | 	out[tmpoutpos] = bytesize
205 | 	tmpoutpos += 1
206 | 	howmanyints := (bytesize + 3) / 4
207 | 	this.byteContainer.Flip()
208 | 	this.byteContainer.AsInt32Buffer().GetInt32s(out, int(tmpoutpos), int(howmanyints))
209 | 	tmpoutpos += howmanyints
210 | 
211 | 	bitmap := int32(0)
212 | 	for k := 1; k <= 32; k++ {
213 | 		v := this.dataPointers[k]
214 | 		if v != 0 {
215 | 			bitmap |= (1 << uint(k-1))
216 | 		}
217 | 	}
218 | 
219 | 	out[tmpoutpos] = bitmap
220 | 	tmpoutpos += 1
221 | 
222 | 	for k := 1; k < 33; k++ {
223 | 		v := this.dataPointers[k]
224 | 		if v != 0 {
225 | 			out[tmpoutpos] = v // size
226 | 			tmpoutpos += 1
227 | 			for j := 0; j < int(v); j += 32 {
228 | 				bitpacking.FastPack(this.dataToBePacked[k], j, out, int(tmpoutpos), k)
229 | 				tmpoutpos += int32(k)
230 | 			}
231 | 		}
232 | 	}
233 | 
234 | 	outpos.Set(int(tmpoutpos))
235 | 
236 | 	return nil
237 | }
238 | 
// grapByte returns byte number index of in, treating in as a byte stream in
// which each int32 contributes four bytes, most significant byte first.
func grapByte(in []int32, index uint) byte {
	word := in[index/4]
	shift := 24 - 8*(index%4)
	return byte(word >> shift)
}
242 | 
243 | func (this *FastPFOR) decodePage(in []int32, inpos *cursor.Cursor, out []int32, outpos *cursor.Cursor, thissize int) error {
244 | 	initpos := int32(inpos.Get())
245 | 	wheremeta := in[initpos]
246 | 	inpos.Increment()
247 | 
248 | 	inexcept := initpos + wheremeta
249 | 	bytesize := in[inexcept]
250 | 	inexcept += 1
251 | 	mybytearray := in[inexcept:]
252 | 	mybp := uint(0)
253 | 
254 | 	inexcept += (bytesize + 3) / 4
255 | 	bitmap := in[inexcept]
256 | 	inexcept += 1
257 | 
258 | 	for k := int32(1); k < 33; k++ {
259 | 		if bitmap&(1<<uint32(k-1)) != 0 {
260 | 			size := in[inexcept]
261 | 			inexcept += 1
262 | 
263 | 			if int32(len(this.dataToBePacked[k])) < size {
264 | 				this.dataToBePacked[k] = make([]int32, encoding.CeilBy(int(size), 32))
265 | 			}
266 | 			for j := int32(0); j < size; j += 32 {
267 | 				bitpacking.FastUnpack(in, int(inexcept), this.dataToBePacked[k], int(j), int(k))
268 | 				inexcept += k
269 | 			}
270 | 		}
271 | 	}
272 | 
273 | 	copy(this.dataPointers, zeroDataPointers)
274 | 	tmpoutpos := uint32(outpos.Get())
275 | 	tmpinpos := uint32(inpos.Get())
276 | 
277 | 	run := 0
278 | 	run_end := thissize / DefaultBlockSize
279 | 	for run < run_end {
280 | 		bestb := uint32(grapByte(mybytearray, mybp))
281 | 		mybp++
282 | 		cexcept := int32(grapByte(mybytearray, mybp))
283 | 		mybp++
284 | 		for k := uint32(0); k < 128; k += 32 {
285 | 			bitpacking.FastUnpack(in, int(tmpinpos), out, int(tmpoutpos+k), int(bestb))
286 | 			tmpinpos += bestb
287 | 		}
288 | 
289 | 		if cexcept > 0 {
290 | 			maxbits := uint32(grapByte(mybytearray, mybp))
291 | 			mybp++
292 | 			index := maxbits - bestb
293 | 			// assuming that the Go compiler is bad, we move everything that is indexed outside the upcoming loop
294 | 			packedexceptions := this.dataToBePacked[index]
295 | 			myindex := this.dataPointers[index]
296 | 
297 | 			for k := int32(0); k < cexcept; k++ {
298 | 				pos := uint32(grapByte(mybytearray, mybp))
299 | 				mybp++
300 | 				exceptvalue := packedexceptions[myindex]
301 | 				myindex++
302 | 				out[pos+tmpoutpos] |= exceptvalue << bestb
303 | 			}
304 | 			this.dataPointers[index] = myindex
305 | 		}
306 | 
307 | 		run += 1
308 | 		tmpoutpos += DefaultBlockSize
309 | 	}
310 | 
311 | 	outpos.Set(int(tmpoutpos))
312 | 	inpos.Set(int(inexcept))
313 | 
314 | 	return nil
315 | }
316 | 


--------------------------------------------------------------------------------
/fastpfor/fastpfor_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package fastpfor
 8 | 
 9 | import (
10 | 	"log"
11 | 	"testing"
12 | 
13 | 	"github.com/dataence/encoding/benchtools"
14 | 	"github.com/dataence/encoding/cursor"
15 | 	"github.com/dataence/encoding/generators"
16 | )
17 | 
18 | var (
19 | 	data []int32
20 | 	size int = 128000
21 | )
22 | 
23 | func init() {
24 | 	log.Printf("fastpfor/init: generating %d int32s\n", size)
25 | 	data = generators.GenerateClustered(size, size*2)
26 | 	log.Printf("fastpfor/init: generated %d integers for test", size)
27 | }
28 | 
29 | func TestCodec(t *testing.T) {
30 | 	sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000}
31 | 	benchtools.TestCodec(New(), data, sizes)
32 | }
33 | 
34 | // go test -bench=Decode
35 | func BenchmarkDecode(b *testing.B) {
36 | 	b.StopTimer()
37 | 	length := 128 * 1024
38 | 	data := generators.GenerateClustered(length, 1<<24)
39 | 	compdata := make([]int32, 2*length)
40 | 	recov := make([]int32, length)
41 | 	inpos := cursor.New()
42 | 	outpos := cursor.New()
43 | 	codec := New()
44 | 	codec.Compress(data, inpos, len(data), compdata, outpos)
45 | 	b.StartTimer()
46 | 	for j := 0; j < b.N; j++ {
47 | 		newinpos := cursor.New()
48 | 		newoutpos := cursor.New()
49 | 		codec.Uncompress(compdata, newinpos, outpos.Get()-newinpos.Get(), recov, newoutpos)
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/generators/generators.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package generators
  8 | 
  9 | import (
 10 | 	"bytes"
 11 | 	"encoding/binary"
 12 | 	"errors"
 13 | 	"github.com/willf/bitset"
 14 | 	"math/rand"
 15 | 	"sort"
 16 | )
 17 | 
 18 | const (
 19 | 	c1 int64 = 0xcc9e2d51
 20 | 	c2 int64 = 0x1b873593
 21 | )
 22 | 
 23 | func GenerateUniformInBytes(N, max int) *bytes.Buffer {
 24 | 	data := GenerateUniform(N, max)
 25 | 	b := make([]byte, N*4)
 26 | 	for i := 0; i < N; i++ {
 27 | 		binary.LittleEndian.PutUint32(b[i*4:], uint32(data[i]))
 28 | 	}
 29 | 
 30 | 	return bytes.NewBuffer(b)
 31 | }
 32 | 
 33 | func GenerateClusteredInBytes(N, max int) *bytes.Buffer {
 34 | 	data := GenerateClustered(N, max)
 35 | 	b := make([]byte, N*4)
 36 | 	for i := 0; i < N; i++ {
 37 | 		binary.LittleEndian.PutUint32(b[i*4:], uint32(data[i]))
 38 | 	}
 39 | 
 40 | 	return bytes.NewBuffer(b)
 41 | }
 42 | 
 43 | func GenerateUniform(N, max int) []int32 {
 44 | 	if N*2 > max {
 45 | 		return negate(GenerateUniform(max-N, max), max)
 46 | 	}
 47 | 
 48 | 	if 2048*N > max {
 49 | 		r, _ := generateUniformBitmap(N, max)
 50 | 		return r
 51 | 
 52 | 	}
 53 | 
 54 | 	r, _ := generateUniformHash(N, max)
 55 | 	return r
 56 | }
 57 | 
 58 | func GenerateClustered(N, max int) []int32 {
 59 | 	ans := make([]int32, N)
 60 | 	fillClustered(ans, 0, N, 0, max)
 61 | 	return ans
 62 | }
 63 | 
 64 | func fillUniform(ans []int32, offset, length, min, max int) {
 65 | 	v := GenerateUniform(length, max-min)
 66 | 	for k := 0; k < len(v); k++ {
 67 | 		ans[k+offset] = int32(min) + v[k]
 68 | 	}
 69 | }
 70 | 
 71 | func fillClustered(ans []int32, offset, length, min, max int) {
 72 | 	btwn := max - min
 73 | 	if btwn == length || length <= 10 {
 74 | 		fillUniform(ans, offset, length, min, max)
 75 | 		return
 76 | 	}
 77 | 
 78 | 	r := rand.New(rand.NewSource(c1))
 79 | 	cut := length / 2
 80 | 	if btwn-length-1 > 0 {
 81 | 		cut += int(r.Int31n(int32(btwn - length - 1)))
 82 | 	}
 83 | 
 84 | 	p := r.Float64()
 85 | 	if p < 0.25 {
 86 | 		fillUniform(ans, offset, length/2, min, min+cut)
 87 | 		fillClustered(ans, offset+length/2, length-length/2, min+cut, max)
 88 | 	} else if p < 0.5 {
 89 | 		fillClustered(ans, offset, length/2, min, min+cut)
 90 | 		fillUniform(ans, offset+length/2, length-length/2, min+cut, max)
 91 | 	} else {
 92 | 		fillClustered(ans, offset, length/2, min, min+cut)
 93 | 		fillClustered(ans, offset+length/2, length-length/2, min+cut, max)
 94 | 	}
 95 | }
 96 | 
 97 | func negate(x []int32, max int) []int32 {
 98 | 	ans := make([]int32, max-len(x))
 99 | 
100 | 	var i, c int32
101 | 
102 | 	for j := 0; j < len(x); j++ {
103 | 		v := x[j]
104 | 		for ; i < v; i++ {
105 | 			ans[c] = i
106 | 			c += 1
107 | 		}
108 | 		i += 1
109 | 	}
110 | 
111 | 	for int(c) < len(ans) {
112 | 		ans[c] = i
113 | 		c += 1
114 | 		i += 1
115 | 	}
116 | 
117 | 	return ans
118 | }
119 | 
120 | func generateUniformBitmap(N, max int) ([]int32, error) {
121 | 	if N > max {
122 | 		return nil, errors.New("encoding/generateUniformBitmap: N > max, not possible")
123 | 	}
124 | 
125 | 	r := rand.New(rand.NewSource(c1))
126 | 	ans := make([]int32, N)
127 | 	bs := bitset.New(uint(max))
128 | 	cardinality := uint(0)
129 | 
130 | 	for int(cardinality) < N {
131 | 		v := r.Int31n(int32(max))
132 | 		if !bs.Test(uint(v)) {
133 | 			bs.Set(uint(v))
134 | 			cardinality += 1
135 | 		}
136 | 	}
137 | 
138 | 	for i, c := int32(0), 0; c < N; i++ {
139 | 		if bs.Test(uint(i)) {
140 | 			ans[c] = i
141 | 			c += 1
142 | 		}
143 | 	}
144 | 
145 | 	return ans, nil
146 | }
147 | 
148 | func generateUniformHash(N, max int) ([]int32, error) {
149 | 	if N > max {
150 | 		return nil, errors.New("encoding/generateUniformBitmap: N > max, not possible")
151 | 	}
152 | 
153 | 	r := rand.New(rand.NewSource(c2))
154 | 	ans := make([]int32, N)
155 | 	s := make(map[int]bool)
156 | 
157 | 	for len(s) < N {
158 | 		s[int(r.Int31n(int32(max)))] = true
159 | 	}
160 | 
161 | 	c := 0
162 | 	tmpans := make([]int, N)
163 | 	for k, _ := range s {
164 | 		tmpans[c] = k
165 | 	}
166 | 
167 | 	sort.Ints(tmpans)
168 | 
169 | 	for i := 0; i < len(tmpans); i++ {
170 | 		ans[i] = int32(tmpans[i])
171 | 	}
172 | 
173 | 	return ans, nil
174 | }
175 | 


--------------------------------------------------------------------------------
/generators/generators_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package generators
 8 | 
 9 | import (
10 | 	"fmt"
11 | 	"testing"
12 | )
13 | 
14 | func TestGenerateClustered(t *testing.T) {
15 | 	a := GenerateClustered(20, 1000)
16 | 	fmt.Println(a)
17 | }
18 | 
19 | func TestGenerateUniform(t *testing.T) {
20 | 	a := GenerateUniform(20, 1000)
21 | 	fmt.Println(a)
22 | }
23 | 


--------------------------------------------------------------------------------
/integer.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package encoding
 8 | 
 9 | import (
10 | 	"github.com/dataence/encoding/cursor"
11 | )
12 | 
13 | type Integer interface {
14 | 	// Compress data from an array to another array.
15 | 	//
16 | 	// Both inpos and outpos are modified to represent how much data was read and written to
17 | 	// if 12 ints (inlength = 12) are compressed to 3 ints, then inpos will be incremented by 12
18 | 	// while outpos will be incremented by 3 we use IntWrapper to pass the values by reference.
19 | 	// @param in  input array
20 | 	// @param inpos location in the input array
21 | 	// @param inlength how many integers to compress
22 | 	// @param out output array
23 | 	//* @param outpos  where to write in the output array
24 | 	Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error
25 | 
26 | 	/**
27 | 	 * Uncompress data from an array to another array.
28 | 	 *
29 | 	 * Both inpos and outpos parameters are modified to indicate new positions after read/write.
30 | 	 *
31 | 	 * @param in array containing data in compressed form
32 | 	 * @param inpos where to start reading in the array
33 | 	 * @param inlength length of the compressed data (ignored by some schemes)
34 | 	 * @param out array where to write the compressed output
35 | 	 * @param outpos where to write the compressed output in out
36 | 	 */
37 | 	Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error
38 | }
39 | 


--------------------------------------------------------------------------------
/util.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package encoding
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | )
 12 | 
 13 | func FloorBy(value, factor int) int {
 14 | 	return value - value%factor
 15 | }
 16 | 
 17 | func CeilBy(value, factor int) int {
 18 | 	return value + factor - value%factor
 19 | }
 20 | 
 21 | func LeadingBitPosition(x uint32) int32 {
 22 | 	//return 32 - int32(nlz1a(x))
 23 | 	return int32(bitlen(uint64(x)))
 24 | }
 25 | 
 26 | func DeltaMaxBits(initoffset int32, buf []int32) int32 {
 27 | 	var mask int32
 28 | 
 29 | 	for _, v := range buf {
 30 | 		mask |= v - initoffset
 31 | 		initoffset = v
 32 | 	}
 33 | 
 34 | 	return LeadingBitPosition(uint32(mask))
 35 | }
 36 | 
 37 | func MaxBits(buf []int32) int32 {
 38 | 	var mask int32
 39 | 
 40 | 	for _, v := range buf {
 41 | 		mask |= v
 42 | 	}
 43 | 
 44 | 	return LeadingBitPosition(uint32(mask))
 45 | }
 46 | 
 47 | func PrintInt32sInBits(buf []int32) {
 48 | 	fmt.Println("                           10987654321098765432109876543210")
 49 | 	for i, v := range buf {
 50 | 		fmt.Printf("%4d: %20d %032b\n", i, v, uint32(v))
 51 | 	}
 52 | }
 53 | 
 54 | func Delta(in, out []int32, offset int32) {
 55 | 	for i, v := range in {
 56 | 		out[i] = v - offset
 57 | 		offset = v
 58 | 	}
 59 | }
 60 | 
 61 | func InverseDelta(in, out []int32, offset int32) {
 62 | 	for i, v := range in {
 63 | 		out[i] = v + offset
 64 | 		offset = out[i]
 65 | 	}
 66 | }
 67 | 
 68 | // https://developers.google.com/protocol-buffers/docs/encoding#types
 69 | func ZigZagDelta(in, out []int32) {
 70 | 	offset := int32(0)
 71 | 
 72 | 	for i, v := range in {
 73 | 		n := v - offset
 74 | 		out[i] = (n << 1) ^ (n >> 31)
 75 | 		offset = v
 76 | 	}
 77 | }
 78 | 
 79 | func InverseZigZagDelta(in, out []int32) {
 80 | 	offset := int32(0)
 81 | 
 82 | 	for i, v := range in {
 83 | 		//n := int32(uint32(v) >> 1) ^ (-(v & 1))
 84 | 		n := int32(uint32(v)>>1) ^ ((v << 31) >> 31)
 85 | 		out[i] = n + offset
 86 | 		offset = out[i]
 87 | 	}
 88 | }
 89 | 
 90 | // Copied from http://www.hackersdelight.org/hdcodetxt/nlz.c.txt - nlz1a
 91 | func nlz1a(x uint32) uint32 {
 92 | 	var n uint32 = 0
 93 | 	if x <= 0 {
 94 | 		return (^x >> 26) & 32
 95 | 	}
 96 | 
 97 | 	n = 1
 98 | 
 99 | 	if (x >> 16) == 0 {
100 | 		n = n + 16
101 | 		x = x << 16
102 | 	}
103 | 	if (x >> 24) == 0 {
104 | 		n = n + 8
105 | 		x = x << 8
106 | 	}
107 | 	if (x >> 28) == 0 {
108 | 		n = n + 4
109 | 		x = x << 4
110 | 	}
111 | 	if (x >> 30) == 0 {
112 | 		n = n + 2
113 | 		x = x << 2
114 | 	}
115 | 	n = n - (x >> 31)
116 | 	return n
117 | }
118 | 
// nlz2 returns the number of leading zero bits in x (32 for x == 0). It
// narrows x from the top: whenever the upper part is non-zero, that part is
// kept and the count is reduced by the width of the discarded lower part.
func nlz2(x uint32) uint32 {
	n := uint32(32)

	for _, step := range [...]uint{16, 8, 4, 2} {
		if hi := x >> step; hi != 0 {
			n -= uint32(step)
			x = hi
		}
	}

	// x is now at most two bits wide.
	if x>>1 != 0 {
		return n - 2
	}
	return n - x
}
149 | 
150 | /* The following are unrolled versions, but they are probably slower due to range checks */
// UnrolledDelta128 is a fully unrolled Delta for exactly 128 elements:
// out[0] = in[0] - offset and out[i] = in[i] - in[i-1] for i in 1..127.
//
// Both slices must hold at least 128 elements. The two index expressions at
// the top check that once, which lets the compiler drop the per-element
// range checks below; a too-short slice therefore panics before anything is
// written to out.
func UnrolledDelta128(in, out []int32, offset int32) {
	_ = in[127]
	_ = out[127]

	out[0] = in[0] - offset
	out[1] = in[1] - in[0]
	out[2] = in[2] - in[1]
	out[3] = in[3] - in[2]
	out[4] = in[4] - in[3]
	out[5] = in[5] - in[4]
	out[6] = in[6] - in[5]
	out[7] = in[7] - in[6]
	out[8] = in[8] - in[7]
	out[9] = in[9] - in[8]
	out[10] = in[10] - in[9]
	out[11] = in[11] - in[10]
	out[12] = in[12] - in[11]
	out[13] = in[13] - in[12]
	out[14] = in[14] - in[13]
	out[15] = in[15] - in[14]
	out[16] = in[16] - in[15]
	out[17] = in[17] - in[16]
	out[18] = in[18] - in[17]
	out[19] = in[19] - in[18]
	out[20] = in[20] - in[19]
	out[21] = in[21] - in[20]
	out[22] = in[22] - in[21]
	out[23] = in[23] - in[22]
	out[24] = in[24] - in[23]
	out[25] = in[25] - in[24]
	out[26] = in[26] - in[25]
	out[27] = in[27] - in[26]
	out[28] = in[28] - in[27]
	out[29] = in[29] - in[28]
	out[30] = in[30] - in[29]
	out[31] = in[31] - in[30]
	out[32] = in[32] - in[31]
	out[33] = in[33] - in[32]
	out[34] = in[34] - in[33]
	out[35] = in[35] - in[34]
	out[36] = in[36] - in[35]
	out[37] = in[37] - in[36]
	out[38] = in[38] - in[37]
	out[39] = in[39] - in[38]
	out[40] = in[40] - in[39]
	out[41] = in[41] - in[40]
	out[42] = in[42] - in[41]
	out[43] = in[43] - in[42]
	out[44] = in[44] - in[43]
	out[45] = in[45] - in[44]
	out[46] = in[46] - in[45]
	out[47] = in[47] - in[46]
	out[48] = in[48] - in[47]
	out[49] = in[49] - in[48]
	out[50] = in[50] - in[49]
	out[51] = in[51] - in[50]
	out[52] = in[52] - in[51]
	out[53] = in[53] - in[52]
	out[54] = in[54] - in[53]
	out[55] = in[55] - in[54]
	out[56] = in[56] - in[55]
	out[57] = in[57] - in[56]
	out[58] = in[58] - in[57]
	out[59] = in[59] - in[58]
	out[60] = in[60] - in[59]
	out[61] = in[61] - in[60]
	out[62] = in[62] - in[61]
	out[63] = in[63] - in[62]
	out[64] = in[64] - in[63]
	out[65] = in[65] - in[64]
	out[66] = in[66] - in[65]
	out[67] = in[67] - in[66]
	out[68] = in[68] - in[67]
	out[69] = in[69] - in[68]
	out[70] = in[70] - in[69]
	out[71] = in[71] - in[70]
	out[72] = in[72] - in[71]
	out[73] = in[73] - in[72]
	out[74] = in[74] - in[73]
	out[75] = in[75] - in[74]
	out[76] = in[76] - in[75]
	out[77] = in[77] - in[76]
	out[78] = in[78] - in[77]
	out[79] = in[79] - in[78]
	out[80] = in[80] - in[79]
	out[81] = in[81] - in[80]
	out[82] = in[82] - in[81]
	out[83] = in[83] - in[82]
	out[84] = in[84] - in[83]
	out[85] = in[85] - in[84]
	out[86] = in[86] - in[85]
	out[87] = in[87] - in[86]
	out[88] = in[88] - in[87]
	out[89] = in[89] - in[88]
	out[90] = in[90] - in[89]
	out[91] = in[91] - in[90]
	out[92] = in[92] - in[91]
	out[93] = in[93] - in[92]
	out[94] = in[94] - in[93]
	out[95] = in[95] - in[94]
	out[96] = in[96] - in[95]
	out[97] = in[97] - in[96]
	out[98] = in[98] - in[97]
	out[99] = in[99] - in[98]
	out[100] = in[100] - in[99]
	out[101] = in[101] - in[100]
	out[102] = in[102] - in[101]
	out[103] = in[103] - in[102]
	out[104] = in[104] - in[103]
	out[105] = in[105] - in[104]
	out[106] = in[106] - in[105]
	out[107] = in[107] - in[106]
	out[108] = in[108] - in[107]
	out[109] = in[109] - in[108]
	out[110] = in[110] - in[109]
	out[111] = in[111] - in[110]
	out[112] = in[112] - in[111]
	out[113] = in[113] - in[112]
	out[114] = in[114] - in[113]
	out[115] = in[115] - in[114]
	out[116] = in[116] - in[115]
	out[117] = in[117] - in[116]
	out[118] = in[118] - in[117]
	out[119] = in[119] - in[118]
	out[120] = in[120] - in[119]
	out[121] = in[121] - in[120]
	out[122] = in[122] - in[121]
	out[123] = in[123] - in[122]
	out[124] = in[124] - in[123]
	out[125] = in[125] - in[124]
	out[126] = in[126] - in[125]
	out[127] = in[127] - in[126]
}
281 | 
// UnrolledInverseDelta128 rebuilds 128 values from their deltas (a running
// sum). out[0] is in[0] + offset and every following out[i] is in[i] plus the
// previously produced value. Both slices must hold at least 128 elements;
// shorter slices cause an index-out-of-range panic. Arithmetic wraps on
// int32 overflow.
func UnrolledInverseDelta128(in, out []int32, offset int32) {
	// acc always equals the value most recently stored in out.
	acc := offset
	// Four elements per iteration keeps the body unrolled while staying compact.
	for i := 0; i < 128; i += 4 {
		acc += in[i]
		out[i] = acc
		acc += in[i+1]
		out[i+1] = acc
		acc += in[i+2]
		out[i+2] = acc
		acc += in[i+3]
		out[i+3] = acc
	}
}
412 | 
413 | func UnrolledLeadingBitFrequency128(in, freqs []int32) {
414 | 	freqs[LeadingBitPosition(uint32(in[0]))]++
415 | 	freqs[LeadingBitPosition(uint32(in[1]))]++
416 | 	freqs[LeadingBitPosition(uint32(in[2]))]++
417 | 	freqs[LeadingBitPosition(uint32(in[3]))]++
418 | 	freqs[LeadingBitPosition(uint32(in[4]))]++
419 | 	freqs[LeadingBitPosition(uint32(in[5]))]++
420 | 	freqs[LeadingBitPosition(uint32(in[6]))]++
421 | 	freqs[LeadingBitPosition(uint32(in[7]))]++
422 | 	freqs[LeadingBitPosition(uint32(in[8]))]++
423 | 	freqs[LeadingBitPosition(uint32(in[9]))]++
424 | 	freqs[LeadingBitPosition(uint32(in[10]))]++
425 | 	freqs[LeadingBitPosition(uint32(in[11]))]++
426 | 	freqs[LeadingBitPosition(uint32(in[12]))]++
427 | 	freqs[LeadingBitPosition(uint32(in[13]))]++
428 | 	freqs[LeadingBitPosition(uint32(in[14]))]++
429 | 	freqs[LeadingBitPosition(uint32(in[15]))]++
430 | 	freqs[LeadingBitPosition(uint32(in[16]))]++
431 | 	freqs[LeadingBitPosition(uint32(in[17]))]++
432 | 	freqs[LeadingBitPosition(uint32(in[18]))]++
433 | 	freqs[LeadingBitPosition(uint32(in[19]))]++
434 | 	freqs[LeadingBitPosition(uint32(in[20]))]++
435 | 	freqs[LeadingBitPosition(uint32(in[21]))]++
436 | 	freqs[LeadingBitPosition(uint32(in[22]))]++
437 | 	freqs[LeadingBitPosition(uint32(in[23]))]++
438 | 	freqs[LeadingBitPosition(uint32(in[24]))]++
439 | 	freqs[LeadingBitPosition(uint32(in[25]))]++
440 | 	freqs[LeadingBitPosition(uint32(in[26]))]++
441 | 	freqs[LeadingBitPosition(uint32(in[27]))]++
442 | 	freqs[LeadingBitPosition(uint32(in[28]))]++
443 | 	freqs[LeadingBitPosition(uint32(in[29]))]++
444 | 	freqs[LeadingBitPosition(uint32(in[30]))]++
445 | 	freqs[LeadingBitPosition(uint32(in[31]))]++
446 | 	freqs[LeadingBitPosition(uint32(in[32]))]++
447 | 	freqs[LeadingBitPosition(uint32(in[33]))]++
448 | 	freqs[LeadingBitPosition(uint32(in[34]))]++
449 | 	freqs[LeadingBitPosition(uint32(in[35]))]++
450 | 	freqs[LeadingBitPosition(uint32(in[36]))]++
451 | 	freqs[LeadingBitPosition(uint32(in[37]))]++
452 | 	freqs[LeadingBitPosition(uint32(in[38]))]++
453 | 	freqs[LeadingBitPosition(uint32(in[39]))]++
454 | 	freqs[LeadingBitPosition(uint32(in[40]))]++
455 | 	freqs[LeadingBitPosition(uint32(in[41]))]++
456 | 	freqs[LeadingBitPosition(uint32(in[42]))]++
457 | 	freqs[LeadingBitPosition(uint32(in[43]))]++
458 | 	freqs[LeadingBitPosition(uint32(in[44]))]++
459 | 	freqs[LeadingBitPosition(uint32(in[45]))]++
460 | 	freqs[LeadingBitPosition(uint32(in[46]))]++
461 | 	freqs[LeadingBitPosition(uint32(in[47]))]++
462 | 	freqs[LeadingBitPosition(uint32(in[48]))]++
463 | 	freqs[LeadingBitPosition(uint32(in[49]))]++
464 | 	freqs[LeadingBitPosition(uint32(in[50]))]++
465 | 	freqs[LeadingBitPosition(uint32(in[51]))]++
466 | 	freqs[LeadingBitPosition(uint32(in[52]))]++
467 | 	freqs[LeadingBitPosition(uint32(in[53]))]++
468 | 	freqs[LeadingBitPosition(uint32(in[54]))]++
469 | 	freqs[LeadingBitPosition(uint32(in[55]))]++
470 | 	freqs[LeadingBitPosition(uint32(in[56]))]++
471 | 	freqs[LeadingBitPosition(uint32(in[57]))]++
472 | 	freqs[LeadingBitPosition(uint32(in[58]))]++
473 | 	freqs[LeadingBitPosition(uint32(in[59]))]++
474 | 	freqs[LeadingBitPosition(uint32(in[60]))]++
475 | 	freqs[LeadingBitPosition(uint32(in[61]))]++
476 | 	freqs[LeadingBitPosition(uint32(in[62]))]++
477 | 	freqs[LeadingBitPosition(uint32(in[63]))]++
478 | 	freqs[LeadingBitPosition(uint32(in[64]))]++
479 | 	freqs[LeadingBitPosition(uint32(in[65]))]++
480 | 	freqs[LeadingBitPosition(uint32(in[66]))]++
481 | 	freqs[LeadingBitPosition(uint32(in[67]))]++
482 | 	freqs[LeadingBitPosition(uint32(in[68]))]++
483 | 	freqs[LeadingBitPosition(uint32(in[69]))]++
484 | 	freqs[LeadingBitPosition(uint32(in[70]))]++
485 | 	freqs[LeadingBitPosition(uint32(in[71]))]++
486 | 	freqs[LeadingBitPosition(uint32(in[72]))]++
487 | 	freqs[LeadingBitPosition(uint32(in[73]))]++
488 | 	freqs[LeadingBitPosition(uint32(in[74]))]++
489 | 	freqs[LeadingBitPosition(uint32(in[75]))]++
490 | 	freqs[LeadingBitPosition(uint32(in[76]))]++
491 | 	freqs[LeadingBitPosition(uint32(in[77]))]++
492 | 	freqs[LeadingBitPosition(uint32(in[78]))]++
493 | 	freqs[LeadingBitPosition(uint32(in[79]))]++
494 | 	freqs[LeadingBitPosition(uint32(in[80]))]++
495 | 	freqs[LeadingBitPosition(uint32(in[81]))]++
496 | 	freqs[LeadingBitPosition(uint32(in[82]))]++
497 | 	freqs[LeadingBitPosition(uint32(in[83]))]++
498 | 	freqs[LeadingBitPosition(uint32(in[84]))]++
499 | 	freqs[LeadingBitPosition(uint32(in[85]))]++
500 | 	freqs[LeadingBitPosition(uint32(in[86]))]++
501 | 	freqs[LeadingBitPosition(uint32(in[87]))]++
502 | 	freqs[LeadingBitPosition(uint32(in[88]))]++
503 | 	freqs[LeadingBitPosition(uint32(in[89]))]++
504 | 	freqs[LeadingBitPosition(uint32(in[90]))]++
505 | 	freqs[LeadingBitPosition(uint32(in[91]))]++
506 | 	freqs[LeadingBitPosition(uint32(in[92]))]++
507 | 	freqs[LeadingBitPosition(uint32(in[93]))]++
508 | 	freqs[LeadingBitPosition(uint32(in[94]))]++
509 | 	freqs[LeadingBitPosition(uint32(in[95]))]++
510 | 	freqs[LeadingBitPosition(uint32(in[96]))]++
511 | 	freqs[LeadingBitPosition(uint32(in[97]))]++
512 | 	freqs[LeadingBitPosition(uint32(in[98]))]++
513 | 	freqs[LeadingBitPosition(uint32(in[99]))]++
514 | 	freqs[LeadingBitPosition(uint32(in[100]))]++
515 | 	freqs[LeadingBitPosition(uint32(in[101]))]++
516 | 	freqs[LeadingBitPosition(uint32(in[102]))]++
517 | 	freqs[LeadingBitPosition(uint32(in[103]))]++
518 | 	freqs[LeadingBitPosition(uint32(in[104]))]++
519 | 	freqs[LeadingBitPosition(uint32(in[105]))]++
520 | 	freqs[LeadingBitPosition(uint32(in[106]))]++
521 | 	freqs[LeadingBitPosition(uint32(in[107]))]++
522 | 	freqs[LeadingBitPosition(uint32(in[108]))]++
523 | 	freqs[LeadingBitPosition(uint32(in[109]))]++
524 | 	freqs[LeadingBitPosition(uint32(in[110]))]++
525 | 	freqs[LeadingBitPosition(uint32(in[111]))]++
526 | 	freqs[LeadingBitPosition(uint32(in[112]))]++
527 | 	freqs[LeadingBitPosition(uint32(in[113]))]++
528 | 	freqs[LeadingBitPosition(uint32(in[114]))]++
529 | 	freqs[LeadingBitPosition(uint32(in[115]))]++
530 | 	freqs[LeadingBitPosition(uint32(in[116]))]++
531 | 	freqs[LeadingBitPosition(uint32(in[117]))]++
532 | 	freqs[LeadingBitPosition(uint32(in[118]))]++
533 | 	freqs[LeadingBitPosition(uint32(in[119]))]++
534 | 	freqs[LeadingBitPosition(uint32(in[120]))]++
535 | 	freqs[LeadingBitPosition(uint32(in[121]))]++
536 | 	freqs[LeadingBitPosition(uint32(in[122]))]++
537 | 	freqs[LeadingBitPosition(uint32(in[123]))]++
538 | 	freqs[LeadingBitPosition(uint32(in[124]))]++
539 | 	freqs[LeadingBitPosition(uint32(in[125]))]++
540 | 	freqs[LeadingBitPosition(uint32(in[126]))]++
541 | 	freqs[LeadingBitPosition(uint32(in[127]))]++
542 | }
543 | 


--------------------------------------------------------------------------------
/variablebyte/variablebyte.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package variablebyte
  8 | 
  9 | import (
 10 | 	"errors"
 11 | 
 12 | 	"github.com/dataence/bytebuffer"
 13 | 	"github.com/dataence/encoding"
 14 | 	"github.com/dataence/encoding/cursor"
 15 | )
 16 | 
 17 | type VariableByte struct {
 18 | }
 19 | 
 20 | var _ encoding.Integer = (*VariableByte)(nil)
 21 | 
 22 | func New() encoding.Integer {
 23 | 	return &VariableByte{}
 24 | }
 25 | 
 26 | func (this *VariableByte) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 27 | 	if inlength == 0 {
 28 | 		return errors.New("VariableByte/Compress: inlength = 0. No work done.")
 29 | 	}
 30 | 
 31 | 	//fmt.Printf("VariableByte/Compress: after inlength = %d\n", inlength)
 32 | 
 33 | 	buf := bytebuffer.NewByteBuffer(inlength * 8)
 34 | 	tmpinpos := inpos.Get()
 35 | 
 36 | 	for _, v := range in[tmpinpos : tmpinpos+inlength] {
 37 | 		val := uint32(v)
 38 | 
 39 | 		for val >= 0x80 {
 40 | 			buf.Put(byte(val) | 0x80)
 41 | 			val >>= 7
 42 | 		}
 43 | 		buf.Put(byte(val))
 44 | 	}
 45 | 
 46 | 	for buf.Position()%4 != 0 {
 47 | 		//fmt.Printf("VariableByte/Compress: putting 128\n")
 48 | 		buf.Put(128)
 49 | 	}
 50 | 
 51 | 	length := buf.Position()
 52 | 	buf.Flip()
 53 | 	ibuf := buf.AsInt32Buffer()
 54 | 	//fmt.Printf("VariableByte/Compress: l = %d, outpos = %d, ibuf = %v, buf = %v\n", length/4, outpos.Get(), ibuf, buf)
 55 | 	err := ibuf.GetInt32s(out, outpos.Get(), length/4)
 56 | 	if err != nil {
 57 | 		//fmt.Printf("VariableByte/Compress: error with GetUint32s - %v\n", err)
 58 | 		return err
 59 | 	}
 60 | 	outpos.Add(length / 4)
 61 | 	inpos.Add(inlength)
 62 | 	//fmt.Printf("VariableByte/Compress: out = %v\n", out)
 63 | 
 64 | 	return nil
 65 | }
 66 | 
 67 | func (this *VariableByte) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 68 | 	if inlength == 0 {
 69 | 		return errors.New("VariableByte/Uncompress: inlength = 0. No work done.")
 70 | 	}
 71 | 
 72 | 	//fmt.Printf("VariableByte/Uncompress: after inlength = %d\n", inlength)
 73 | 
 74 | 	s := uint(0)
 75 | 	p := inpos.Get()
 76 | 	finalp := p + inlength
 77 | 	tmpoutpos := outpos.Get()
 78 | 	v := int32(0)
 79 | 	shift := uint(0)
 80 | 
 81 | 	for p < finalp {
 82 | 		c := in[p] >> (24 - s)
 83 | 		s += 8
 84 | 
 85 | 		if s == 32 {
 86 | 			s = 0
 87 | 			p += 1
 88 | 		}
 89 | 
 90 | 		v += ((c & 127) << shift)
 91 | 		if c&128 == 0 {
 92 | 			out[tmpoutpos] = v
 93 | 			tmpoutpos += 1
 94 | 			v = 0
 95 | 			shift = 0
 96 | 		} else {
 97 | 			shift += 7
 98 | 		}
 99 | 	}
100 | 
101 | 	outpos.Set(tmpoutpos)
102 | 	inpos.Add(inlength)
103 | 
104 | 	return nil
105 | }
106 | 


--------------------------------------------------------------------------------
/variablebyte/variablebyte_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package variablebyte
 8 | 
 9 | import (
10 | 	"log"
11 | 	"testing"
12 | 
13 | 	"github.com/dataence/encoding/benchtools"
14 | 	"github.com/dataence/encoding/cursor"
15 | 	"github.com/dataence/encoding/generators"
16 | )
17 | 
18 | var (
19 | 	data []int32
20 | 	size int = 128000
21 | )
22 | 
23 | func init() {
24 | 	log.Printf("variablebyte/init: generating %d int32s\n", size)
25 | 	data = generators.GenerateClustered(size, size*2)
26 | 	log.Printf("variablebyte/init: generated %d integers for test", size)
27 | }
28 | 
29 | func TestCodec(t *testing.T) {
30 | 	sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000}
31 | 	benchtools.TestCodec(New(), data, sizes)
32 | }
33 | 
34 | // go test -bench=Decode
35 | func BenchmarkDecode(b *testing.B) {
36 | 	b.StopTimer()
37 | 	length := 128 * 1024
38 | 	data := generators.GenerateClustered(length, 1<<24)
39 | 	compdata := make([]int32, 2*length)
40 | 	recov := make([]int32, length)
41 | 	inpos := cursor.New()
42 | 	outpos := cursor.New()
43 | 	codec := New()
44 | 	codec.Compress(data, inpos, len(data), compdata, outpos)
45 | 	b.StartTimer()
46 | 	for j := 0; j < b.N; j++ {
47 | 		newinpos := cursor.New()
48 | 		newoutpos := cursor.New()
49 | 		codec.Uncompress(compdata, newinpos, outpos.Get()-newinpos.Get(), recov, newoutpos)
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/zigzag/bp32/bp32.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package bp32
  8 | 
  9 | import (
 10 | 	"errors"
 11 | 
 12 | 	"github.com/dataence/encoding"
 13 | 	"github.com/dataence/encoding/bitpacking"
 14 | 	"github.com/dataence/encoding/cursor"
 15 | )
 16 | 
 17 | const (
 18 | 	DefaultBlockSize = 128
 19 | 	DefaultPageSize  = 65536
 20 | )
 21 | 
 22 | type BP32 struct {
 23 | }
 24 | 
 25 | var _ encoding.Integer = (*BP32)(nil)
 26 | 
 27 | func New() encoding.Integer {
 28 | 	return &BP32{}
 29 | }
 30 | 
 31 | func (this *BP32) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 32 | 	//log.Printf("zigzag_bp32/Compress: before inlength = %d\n", inlength)
 33 | 
 34 | 	inlength = encoding.FloorBy(inlength, DefaultBlockSize)
 35 | 
 36 | 	if inlength == 0 {
 37 | 		return errors.New("zigzag_bp32/Compress: block size less than 128. No work done.")
 38 | 	}
 39 | 
 40 | 	//log.Printf("zigzag_bp32/Compress: after inlength = %d, len(in) = %d\n", inlength, len(in))
 41 | 
 42 | 	out[outpos.Get()] = int32(inlength)
 43 | 	outpos.Increment()
 44 | 
 45 | 	tmpoutpos := outpos.Get()
 46 | 	s := inpos.Get()
 47 | 	finalinpos := s + inlength
 48 | 	delta := make([]int32, DefaultBlockSize)
 49 | 
 50 | 	for ; s < finalinpos; s += DefaultBlockSize {
 51 | 		encoding.ZigZagDelta(in[s:s+DefaultBlockSize], delta)
 52 | 		//log.Printf("zigzag_bp32/Compress: in = %v\n", in[s:s+DefaultBlockSize])
 53 | 		//log.Printf("zigzag_bp32/Compress: delta = %v\n", delta)
 54 | 
 55 | 		mbits1 := encoding.MaxBits(delta[0:32])
 56 | 		mbits2 := encoding.MaxBits(delta[32:64])
 57 | 		mbits3 := encoding.MaxBits(delta[64:96])
 58 | 		mbits4 := encoding.MaxBits(delta[96:128])
 59 | 
 60 | 		//log.Printf("zigzag_bp32/Compress: tmpoutpos = %d, s = %d\n", tmpoutpos, s)
 61 | 
 62 | 		out[tmpoutpos] = (mbits1 << 24) | (mbits2 << 16) | (mbits3 << 8) | mbits4
 63 | 		tmpoutpos += 1
 64 | 
 65 | 		//log.Printf("zigzag_bp32/Compress: mbits1 = %d, mbits2 = %d, mbits3 = %d, mbits4 = %d, s = %d\n", mbits1, mbits2, mbits3, mbits4, out[tmpoutpos-1])
 66 | 
 67 | 		bitpacking.FastPackWithoutMask(delta, 0, out, tmpoutpos, int(mbits1))
 68 | 		//encoding.PrintUint32sInBits(in[s:s+32])
 69 | 		//encoding.PrintUint32sInBits(out[tmpoutpos:tmpoutpos+int(mbits1]))
 70 | 		tmpoutpos += int(mbits1)
 71 | 
 72 | 		bitpacking.FastPackWithoutMask(delta, 32, out, tmpoutpos, int(mbits2))
 73 | 		//encoding.PrintUint32sInBits(in, s+32, 32)
 74 | 		//encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits2))
 75 | 		tmpoutpos += int(mbits2)
 76 | 
 77 | 		bitpacking.FastPackWithoutMask(delta, 64, out, tmpoutpos, int(mbits3))
 78 | 		//encoding.PrintUint32sInBits(in, s+2*32, 32)
 79 | 		//encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits3))
 80 | 		tmpoutpos += int(mbits3)
 81 | 
 82 | 		bitpacking.FastPackWithoutMask(delta, 96, out, tmpoutpos, int(mbits4))
 83 | 		//encoding.PrintUint32sInBits(in, s+3*32, 32)
 84 | 		//encoding.PrintUint32sInBits(out, tmpoutpos, int(mbits4))
 85 | 		tmpoutpos += int(mbits4)
 86 | 
 87 | 		//log.Printf("zigzag_bp32/Compress: out = %v\n", out[s:s+DefaultBlockSize])
 88 | 	}
 89 | 
 90 | 	inpos.Add(inlength)
 91 | 	outpos.Set(tmpoutpos)
 92 | 
 93 | 	return nil
 94 | }
 95 | 
 96 | func (this *BP32) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 97 | 	if inlength == 0 {
 98 | 		return errors.New("zigzag_bp32/Uncompress: Length is 0. No work done.")
 99 | 	}
100 | 
101 | 	outlength := int(in[inpos.Get()])
102 | 	inpos.Increment()
103 | 
104 | 	tmpinpos := inpos.Get()
105 | 	s := outpos.Get()
106 | 	finalinpos := s + outlength
107 | 	delta := make([]int32, DefaultBlockSize)
108 | 
109 | 	//log.Printf("zigzag_bp32/Uncompress: outlength = %d, inpos = %d, outpos = %d\n", outlength, inpos.Get(), outpos.Get())
110 | 	for ; s < finalinpos; s += DefaultBlockSize {
111 | 		tmp := in[tmpinpos]
112 | 		mbits1 := tmp >> 24
113 | 		mbits2 := (tmp >> 16) & 0xFF
114 | 		mbits3 := (tmp >> 8) & 0xFF
115 | 		mbits4 := (tmp) & 0xFF
116 | 
117 | 		//log.Printf("zigzag_bp32/Uncopmress: mbits1 = %d, mbits2 = %d, mbits3 = %d, mbits4 = %d, s = %d\n", mbits1, mbits2, mbits3, mbits4, s)
118 | 		tmpinpos += 1
119 | 
120 | 		bitpacking.FastUnpack(in, tmpinpos, delta, 0, int(mbits1))
121 | 		tmpinpos += int(mbits1)
122 | 		//log.Printf("zigzag_bp32/Uncompress: delta = %v\n", out)
123 | 
124 | 		bitpacking.FastUnpack(in, tmpinpos, delta, 32, int(mbits2))
125 | 		tmpinpos += int(mbits2)
126 | 		//log.Printf("zigzag_bp32/Uncompress: delta = %v\n", out)
127 | 
128 | 		bitpacking.FastUnpack(in, tmpinpos, delta, 64, int(mbits3))
129 | 		tmpinpos += int(mbits3)
130 | 		//log.Printf("zigzag_bp32/Uncompress: delta = %v\n", out)
131 | 
132 | 		bitpacking.FastUnpack(in, tmpinpos, delta, 96, int(mbits4))
133 | 		tmpinpos += int(mbits4)
134 | 
135 | 		encoding.InverseZigZagDelta(delta, out[s:s+DefaultBlockSize])
136 | 
137 | 		//log.Printf("zigzag_bp32/Uncompress: delta = %v\n", delta)
138 | 		//log.Printf("zigzag_bp32/Uncompress: out = %v\n", out[s:s+DefaultBlockSize])
139 | 
140 | 	}
141 | 
142 | 	outpos.Add(outlength)
143 | 	inpos.Set(tmpinpos)
144 | 
145 | 	return nil
146 | }
147 | 


--------------------------------------------------------------------------------
/zigzag/bp32/bp32_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package bp32
 8 | 
 9 | import (
10 | 	"log"
11 | 	"testing"
12 | 
13 | 	"github.com/dataence/encoding/benchtools"
14 | 	"github.com/dataence/encoding/generators"
15 | )
16 | 
17 | var (
18 | 	data []int32
19 | 	size int = 12800000
20 | )
21 | 
22 | func init() {
23 | 	log.Printf("bp32/init: generating %d int32s\n", size)
24 | 	data = generators.GenerateClustered(size, size*2)
25 | 	log.Printf("bp32/init: generated %d integers for test", size)
26 | }
27 | 
28 | func TestCodec(t *testing.T) {
29 | 	sizes := []int{128, 128 * 10, 128 * 100, 128 * 1000, 128 * 10000}
30 | 	benchtools.TestCodec(New(), data, sizes)
31 | }
32 | 


--------------------------------------------------------------------------------
/zigzag/fastpfor/fastpfor.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
  3 |  * Use of this source code is governed by the Apache 2.0 license.
  4 |  *
  5 |  */
  6 | 
  7 | package fastpfor
  8 | 
  9 | import (
 10 | 	"errors"
 11 | 	"math"
 12 | 
 13 | 	"github.com/dataence/bytebuffer"
 14 | 	"github.com/dataence/encoding"
 15 | 	"github.com/dataence/encoding/bitpacking"
 16 | 	"github.com/dataence/encoding/cursor"
 17 | )
 18 | 
 19 | const (
 20 | 	DefaultBlockSize     = 128
 21 | 	OverheadOfEachExcept = 8
 22 | 	DefaultPageSize      = 65536
 23 | )
 24 | 
 25 | var (
 26 | 	zeroDataPointers []int32
 27 | 	zeroFreqs        []int32
 28 | )
 29 | 
 30 | func init() {
 31 | 	zeroDataPointers = make([]int32, 33)
 32 | 	zeroFreqs = make([]int32, 33)
 33 | }
 34 | 
 35 | type FastPFOR struct {
 36 | 	dataToBePacked [33][]int32
 37 | 	byteContainer  *bytebuffer.ByteBuffer
 38 | 	pageSize       int32
 39 | 
 40 | 	// Working area
 41 | 	dataPointers []int32
 42 | 	freqs        []int32
 43 | }
 44 | 
 45 | var _ encoding.Integer = (*FastPFOR)(nil)
 46 | 
 47 | func New() encoding.Integer {
 48 | 	f := &FastPFOR{
 49 | 		pageSize:      DefaultPageSize,
 50 | 		byteContainer: bytebuffer.NewByteBuffer(3*DefaultPageSize/DefaultBlockSize + DefaultPageSize),
 51 | 		dataPointers:  make([]int32, 33),
 52 | 		freqs:         make([]int32, 33),
 53 | 	}
 54 | 
 55 | 	for i := 1; i < 33; i++ {
 56 | 		f.dataToBePacked[i] = make([]int32, DefaultPageSize/32*4)
 57 | 	}
 58 | 
 59 | 	return f
 60 | }
 61 | 
 62 | func (this *FastPFOR) Compress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 63 | 	inlength = encoding.FloorBy(inlength, DefaultBlockSize)
 64 | 
 65 | 	if inlength == 0 {
 66 | 		return errors.New("fastpfor/Compress: inlength = 0. No work done.")
 67 | 	}
 68 | 
 69 | 	out[outpos.Get()] = int32(inlength)
 70 | 	outpos.Increment()
 71 | 
 72 | 	initoffset := cursor.New()
 73 | 
 74 | 	copy(this.dataPointers, zeroDataPointers)
 75 | 	copy(this.freqs, zeroFreqs)
 76 | 
 77 | 	finalInpos := inpos.Get() + inlength
 78 | 
 79 | 	for inpos.Get() != finalInpos {
 80 | 		thissize := int(math.Min(float64(this.pageSize), float64(finalInpos-inpos.Get())))
 81 | 
 82 | 		if err := this.encodePage(in, inpos, thissize, out, outpos, initoffset); err != nil {
 83 | 			return errors.New("fastpfor/Compress: " + err.Error())
 84 | 		}
 85 | 	}
 86 | 
 87 | 	return nil
 88 | }
 89 | 
 90 | func (this *FastPFOR) Uncompress(in []int32, inpos *cursor.Cursor, inlength int, out []int32, outpos *cursor.Cursor) error {
 91 | 	if inlength == 0 {
 92 | 		return errors.New("fastpfor/Uncompress: inlength = 0. No work done.")
 93 | 	}
 94 | 
 95 | 	mynvalue := in[inpos.Get()]
 96 | 	inpos.Increment()
 97 | 
 98 | 	initoffset := cursor.New()
 99 | 
100 | 	copy(this.dataPointers, zeroDataPointers)
101 | 
102 | 	finalout := outpos.Get() + int(mynvalue)
103 | 	for outpos.Get() != finalout {
104 | 		thissize := int(math.Min(float64(this.pageSize), float64(finalout-outpos.Get())))
105 | 
106 | 		if err := this.decodePage(in, inpos, out, outpos, thissize, initoffset); err != nil {
107 | 			return errors.New("fastpfor/Uncompress: " + err.Error())
108 | 		}
109 | 	}
110 | 	return nil
111 | }
112 | 
113 | // getBestBFromData determins the best bit position with the best cost of exceptions,
114 | // and the max bit position of the array of int32s
115 | func (this *FastPFOR) getBestBFromData(in []int32) (bestb int32, bestc int32, maxb int32) {
116 | 	copy(this.freqs, zeroFreqs)
117 | 
118 | 	// Get the count of all the leading bit positions for the slice
119 | 	// Mainly to figure out what's the best (most popular) bit position
120 | 	for _, v := range in {
121 | 		this.freqs[encoding.LeadingBitPosition(uint32(v))]++
122 | 	}
123 | 	//encoding.FastLeadingBitFrequency128(in, this.freqs)
124 | 
125 | 	bestb = 32
126 | 
127 | 	for this.freqs[bestb] == 0 {
128 | 		bestb--
129 | 	}
130 | 
131 | 	maxb = bestb
132 | 	bestCost := bestb * DefaultBlockSize
133 | 	var cexcept int32
134 | 	bestc = cexcept
135 | 
136 | 	// Find the cost of storing exceptions for each bit position
137 | 	for b := bestb - 1; b >= 0; b-- {
138 | 		cexcept += this.freqs[b+1]
139 | 		if cexcept < 0 {
140 | 			break
141 | 		}
142 | 
143 | 		// the extra 8 is the cost of storing maxbits
144 | 		thisCost := cexcept*OverheadOfEachExcept + cexcept*(maxb-b) + b*DefaultBlockSize + 8
145 | 
146 | 		if thisCost < bestCost {
147 | 			bestCost = thisCost
148 | 			bestb = b
149 | 			bestc = cexcept
150 | 		}
151 | 	}
152 | 
153 | 	return
154 | }
155 | 
156 | func (this *FastPFOR) encodePage(in []int32, inpos *cursor.Cursor, thissize int, out []int32, outpos *cursor.Cursor, initoffset *cursor.Cursor) error {
157 | 	headerpos := int32(outpos.Get())
158 | 	outpos.Increment()
159 | 	tmpoutpos := int32(outpos.Get())
160 | 
161 | 	// Clear working area
162 | 	copy(this.dataPointers, zeroDataPointers)
163 | 	this.byteContainer.Clear()
164 | 
165 | 	tmpinpos := int32(inpos.Get())
166 | 	var delta [DefaultBlockSize]int32
167 | 
168 | 	for finalInpos := tmpinpos + int32(thissize) - DefaultBlockSize; tmpinpos <= finalInpos; tmpinpos += DefaultBlockSize {
169 | 		// Calculate the deltas, inlining to gain a bit of performance
170 | 		offset := int32(initoffset.Get())
171 | 		for i, v := range in[tmpinpos : tmpinpos+DefaultBlockSize] {
172 | 			n := v - offset
173 | 			delta[i] = (n << 1) ^ (n >> 31)
174 | 			offset = v
175 | 		}
176 | 
177 | 		initoffset.Set(int(in[tmpinpos+DefaultBlockSize-1]))
178 | 
179 | 		//bestb, bestc, maxb := this.getBestBFromData(in[tmpinpos:tmpinpos+DefaultBlockSize])
180 | 		bestb, bestc, maxb := this.getBestBFromData(delta[:])
181 | 		tmpbestb := bestb
182 | 		this.byteContainer.Put(byte(bestb))
183 | 		this.byteContainer.Put(byte(bestc))
184 | 
185 | 		if bestc > 0 {
186 | 			this.byteContainer.Put(byte(maxb))
187 | 			index := maxb - bestb
188 | 
189 | 			if int(this.dataPointers[index]+bestc) >= len(this.dataToBePacked[index]) {
190 | 				newSize := int(2 * (this.dataPointers[index] + bestc))
191 | 
192 | 				// make sure it is a multiple of 32.
193 | 				// there might be a better way to do this
194 | 				newSize = encoding.CeilBy(newSize, 32)
195 | 				newSlice := make([]int32, newSize)
196 | 				copy(newSlice, this.dataToBePacked[index])
197 | 				this.dataToBePacked[index] = newSlice
198 | 			}
199 | 
200 | 			for k := int32(0); k < DefaultBlockSize; k++ {
201 | 				if uint32(delta[k])>>uint(bestb) != 0 {
202 | 					// we have an exception
203 | 					this.byteContainer.Put(byte(k))
204 | 					this.dataToBePacked[index][this.dataPointers[index]] = int32(uint32(delta[k]) >> uint(tmpbestb))
205 | 					this.dataPointers[index] += 1
206 | 				}
207 | 			}
208 | 		}
209 | 
210 | 		for k := int32(0); k < 128; k += 32 {
211 | 			bitpacking.FastPack(delta[:], int(k), out, int(tmpoutpos), int(tmpbestb))
212 | 			tmpoutpos += tmpbestb
213 | 		}
214 | 	}
215 | 
216 | 	inpos.Set(int(tmpinpos))
217 | 	out[headerpos] = tmpoutpos - headerpos
218 | 
219 | 	for this.byteContainer.Position()&3 != 0 {
220 | 		this.byteContainer.Put(0)
221 | 	}
222 | 
223 | 	bytesize := int32(this.byteContainer.Position())
224 | 	out[tmpoutpos] = bytesize
225 | 	tmpoutpos += 1
226 | 	howmanyints := bytesize / 4
227 | 
228 | 	this.byteContainer.Flip()
229 | 	this.byteContainer.AsInt32Buffer().GetInt32s(out, int(tmpoutpos), int(howmanyints))
230 | 	tmpoutpos += howmanyints
231 | 
232 | 	bitmap := int32(0)
233 | 	for k := 1; k <= 32; k++ {
234 | 		v := this.dataPointers[k]
235 | 		if v != 0 {
236 | 			bitmap |= (1 << uint(k-1))
237 | 		}
238 | 	}
239 | 
240 | 	out[tmpoutpos] = bitmap
241 | 	tmpoutpos += 1
242 | 
243 | 	for k := 1; k < 33; k++ {
244 | 		v := this.dataPointers[k]
245 | 		if v != 0 {
246 | 			out[tmpoutpos] = v // size
247 | 			tmpoutpos += 1
248 | 			for j := 0; j < int(v); j += 32 {
249 | 				bitpacking.FastPack(this.dataToBePacked[k], j, out, int(tmpoutpos), k)
250 | 				tmpoutpos += int32(k)
251 | 			}
252 | 		}
253 | 	}
254 | 
255 | 	outpos.Set(int(tmpoutpos))
256 | 
257 | 	return nil
258 | }
259 | // decodePage decodes thissize/DefaultBlockSize blocks of the page at in[inpos] into out at outpos, undoing the zigzag and delta steps against initoffset; it then advances inpos and outpos, leaves the last decoded value in initoffset, and returns any error reported by byteContainer.
260 | func (this *FastPFOR) decodePage(in []int32, inpos *cursor.Cursor, out []int32, outpos *cursor.Cursor, thissize int, initoffset *cursor.Cursor) error {
261 | 	initpos := int32(inpos.Get())
262 | 	wheremeta := in[initpos] // first word of the page: offset, relative to initpos, of the metadata section
263 | 	inpos.Increment()
264 | 	// The metadata section starts with bytesize; the bytesize/4 words after it are the byte container's contents.
265 | 	inexcept := initpos + wheremeta
266 | 	bytesize := in[inexcept]
267 | 	inexcept += 1
268 | 	// Reload byteContainer with those bytesize/4 words of in.
269 | 	this.byteContainer.Clear()
270 | 	if err := this.byteContainer.AsInt32Buffer().PutInt32s(in, int(inexcept), int(bytesize/4)); err != nil {
271 | 		return err
272 | 	}
273 | 	// The word after the container is a bitmap: bit k-1 set means a list of exception values for index k follows.
274 | 	inexcept += bytesize / 4
275 | 	bitmap := in[inexcept]
276 | 	inexcept += 1
277 | 	// For every set bit, read the list's value count and unpack the list into dataToBePacked[k].
278 | 	for k := int32(1); k < 33; k++ {
279 | 		if bitmap&(1<<uint32(k-1)) != 0 {
280 | 			size := in[inexcept]
281 | 			inexcept += 1
282 | 			// Replace the buffer when it is shorter than size; CeilBy presumably rounds size up to a multiple of 32 (confirm in util.go).
283 | 			if int32(len(this.dataToBePacked[k])) < size {
284 | 				this.dataToBePacked[k] = make([]int32, encoding.CeilBy(int(size), 32))
285 | 			}
286 | 			// One FastUnpack call per 32 output slots; each call moves inexcept forward by k words.
287 | 			for j := int32(0); j < size; j += 32 {
288 | 				bitpacking.FastUnpack(in, int(inexcept), this.dataToBePacked[k], int(j), int(k))
289 | 				inexcept += k
290 | 			}
291 | 		}
292 | 	}
293 | 	// Reset the per-list read positions (zeroDataPointers is declared elsewhere; presumably all zeros).
294 | 	copy(this.dataPointers, zeroDataPointers)
295 | 	tmpoutpos := int32(outpos.Get())
296 | 	tmpinpos := int32(inpos.Get()) // inpos was incremented above, so packed block data starts right after the first word
297 | 	// Scratch buffer holding one block of still-encoded values; it is reused by every loop iteration below.
298 | 	delta := make([]int32, DefaultBlockSize)
299 | 	// Decode thissize/DefaultBlockSize blocks (integer division, so a trailing partial block is not decoded here).
300 | 	run := 0
301 | 	run_end := thissize / DefaultBlockSize
302 | 	for run < run_end {
303 | 		var err error
304 | 		var bestb int32 // passed to FastUnpack as this block's last argument and used as the exception shift below
305 | 		if bestb, err = this.byteContainer.GetAsInt32(); err != nil {
306 | 			return err
307 | 		}
308 | 		// cexcept is the number of exception entries patched into this block below.
309 | 		var cexcept int32
310 | 		if cexcept, err = this.byteContainer.GetAsInt32(); err != nil {
311 | 			return err
312 | 		}
313 | 		// Unpack the block into delta in four calls (offsets 0, 32, 64, 96), consuming bestb words of in per call.
314 | 		for k := int32(0); k < 128; k += 32 {
315 | 			//bitpacking.FastUnpack(in, int(tmpinpos), out, int(tmpoutpos+k), int(bestb))
316 | 			bitpacking.FastUnpack(in, int(tmpinpos), delta, int(k), int(bestb))
317 | 			tmpinpos += bestb
318 | 		}
319 | 		// Patch exceptions: OR each stored exception value, shifted left by bestb, into its slot of delta.
320 | 		if cexcept > 0 {
321 | 			var maxbits int32
322 | 			if maxbits, err = this.byteContainer.GetAsInt32(); err != nil {
323 | 				return err
324 | 			}
325 | 			// index selects which dataToBePacked list supplies this block's exception values.
326 | 			index := maxbits - bestb
327 | 			// Exception values are taken in order from dataToBePacked[index], tracked by dataPointers[index].
328 | 			for k := int32(0); k < cexcept; k++ {
329 | 				var pos int32
330 | 				if pos, err = this.byteContainer.GetAsInt32(); err != nil {
331 | 					return err
332 | 				}
333 | 				// pos is the exception's index within delta.
334 | 				exceptvalue := this.dataToBePacked[index][this.dataPointers[index]]
335 | 				this.dataPointers[index] += 1
336 | 				//out[pos + tmpoutpos] |= exceptvalue << uint(bestb)
337 | 				delta[pos] |= exceptvalue << uint(bestb)
338 | 			}
339 | 		}
340 | 
341 | 		// Calculate the original from the deltas, inlining to gain a bit of performance
342 | 		offset := int32(initoffset.Get())
343 | 		for i, v := range delta {
344 | 			n := int32(uint32(v)>>1) ^ ((v << 31) >> 31) // zigzag decode: (v >>> 1) ^ -(v & 1)
345 | 			out[int(tmpoutpos)+i] = n + offset // running sum: previous value plus this delta
346 | 			offset += n
347 | 		}
348 | 		initoffset.Set(int(out[tmpoutpos+DefaultBlockSize-1])) // carry the block's last decoded value into the next block
349 | 
350 | 		run += 1
351 | 		tmpoutpos += DefaultBlockSize
352 | 	}
353 | 	// Publish the new positions; inexcept now points just past this page's exception lists.
354 | 	outpos.Set(int(tmpoutpos))
355 | 	inpos.Set(int(inexcept))
356 | 
357 | 	return nil
358 | }
359 | 


--------------------------------------------------------------------------------
/zigzag/fastpfor/fastpfor_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2013 Zhen, LLC. http://zhen.io. All rights reserved.
 3 |  * Use of this source code is governed by the Apache 2.0 license.
 4 |  *
 5 |  */
 6 | 
 7 | package fastpfor
 8 | 
 9 | import (
10 | 	"log"
11 | 	"testing"
12 | 
13 | 	"github.com/dataence/encoding/benchtools"
14 | 	"github.com/dataence/encoding/generators"
15 | )
16 | 
17 | var (
18 | 	size = 12800000 // number of int32 values init asks the generator for
19 | 	data []int32    // shared input for the tests in this file; assigned by init
20 | )
21 | // init fills data with size values from generators.GenerateClustered(size, size*2), logging before and after.
22 | func init() {
23 | 	log.Printf("zigzag/fastpfor/init: generating %d int32s\n", size)
24 | 	data = generators.GenerateClustered(size, size*2)
25 | 	log.Printf("zigzag/fastpfor/init: generated %d integers for test", size)
26 | }
27 | // TestCodec hands a new codec, the shared data and the sizes 128 through 1,280,000 (each ten times the previous) to benchtools.TestCodec.
28 | func TestCodec(t *testing.T) {
29 | 	lengths := []int{128, 1280, 12800, 128000, 1280000}
30 | 	benchtools.TestCodec(New(), data, lengths)
31 | }
32 | 


--------------------------------------------------------------------------------