├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── benchmarks
    ├── benchmark.h
    └── decodebenchmark.cpp
├── scripts
    ├── avx512dict.py
    └── avxdict.py
├── src
    ├── avx512bpacking.h
    ├── avx512codec.h
    ├── avx512dict.h
    ├── avxbpacking.h
    ├── avxcodec.h
    ├── avxdict.h
    ├── bpacking.h
    ├── dict.h
    └── scalarcodec.h
└── tests
    ├── avxtest.cpp
    └── scalartest.cpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files
 2 | *.slo
 3 | *.lo
 4 | *.o
 5 | *.obj
 6 | 
 7 | # Precompiled Headers
 8 | *.gch
 9 | *.pch
10 | 
11 | # Compiled Dynamic libraries
12 | *.so
13 | *.dylib
14 | *.dll
15 | 
16 | # Fortran module files
17 | *.mod
18 | *.smod
19 | 
20 | # Compiled Static libraries
21 | *.lai
22 | *.la
23 | *.a
24 | *.lib
25 | 
26 | # Executables
27 | *.exe
28 | *.out
29 | *.app
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # minimalist makefile
 2 | .SUFFIXES:
 3 | #
 4 | .SUFFIXES: .cpp .o .c .h
 5 | ifeq ($(DEBUG),1)
 6 | GENFLAGS = -fPIC  -ggdb  -march=native -Wall -Wextra -pedantic -Wshadow -fsanitize=undefined  -fno-omit-frame-pointer -fsanitize=address -Wno-unused
 7 | else
 8 | GENFLAGS =  -fPIC -O3  -march=native -Wall -Wextra -pedantic -Wshadow -Wno-unused
 9 | #GENFLAGS =  -fPIC -O3 -mavx2 -Wall -Wextra -pedantic -Wshadow -Wno-unused
10 | #GENFLAGS =  -fPIC -O3 -msse2 -Wall -Wextra -pedantic -Wshadow -Wno-unused
11 | endif # debug
12 | CFLAGS =  -std=c99 $(GENFLAGS)
13 | CXXFLAGS = -std=c++11 $(GENFLAGS)
14 | 
15 | HEADERS=src/bpacking.h src/dict.h  src/scalarcodec.h src/avx512bpacking.h src/avx512codec.h  src/avx512dict.h  src/avxbpacking.h src/avxcodec.h  src/avxdict.h
16 | EXECUTABLES=scalartest avxtest decodebenchmark
17 | 
18 | all: $(EXECUTABLES)
19 | 
20 | test: $(EXECUTABLES)
21 | 	@(./scalartest && ./avxtest && echo "\033[0;32mAll tests ok\033[0m" ) || (echo "\033[0;31mSome tests failed\033[0m")
22 | 
23 | scalartest : ./tests/scalartest.cpp  $(HEADERS)
24 | 	$(CXX) $(CXXFLAGS) -o scalartest ./tests/scalartest.cpp -Isrc
25 | 
26 | avxtest : ./tests/avxtest.cpp  $(HEADERS)
27 | 	$(CXX) $(CXXFLAGS) -o avxtest ./tests/avxtest.cpp -Isrc
28 | 
29 | decodebenchmark : ./benchmarks/decodebenchmark.cpp ./benchmarks/benchmark.h  $(HEADERS)
30 | 	$(CXX) $(CXXFLAGS) -o decodebenchmark ./benchmarks/decodebenchmark.cpp -Isrc -Ibenchmarks
31 | 
32 | clean:
33 | 	rm -f $(EXECUTABLES)
34 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # dictionary
  2 | High-performance dictionary coding
  3 | 
  4 | Suppose you want to compress a large array of values with
  5 | (relatively) few distinct values. For example, maybe you have 16 distinct 64-bit
  6 | values. Only four bits are needed to store a value in the range [0,16) using
  7 | binary packing,  so if you have long arrays, it is possible to save 60 bits per value (compress
  8 | the data by a factor of 16).
  9 | 
 10 | 
 11 | We consider the following (simple) form of dictionary coding. We
 12 | have a dictionary of 64-bit values (could be pointers) stored
 13 | in an array. In the compression phase, we convert the values to indexes
 14 | and binary pack them. In the decompression phase, we
 15 | try to recover the dictionary-coded values as fast as possible.
 16 | 
 17 | Dictionary coding is in common use within database systems (e.g., Oracle, Parquet and so forth).
 18 | 
 19 | We are going to assume that one has a recent Intel processor
 20 | for the sake of this experiment.
 21 | 
 22 | ## Core Idea
 23 | 
 24 | In dictionary coding, it is tempting to first unpack the indexes to a temporary buffer
 25 | and then run through it and look-up the values in the dictionary. What if it were possible
 26 | to decode the indexes and look-up the values in the dictionary in one step?
 27 | It is possible with vector instructions as long as you have access to a ``gather``
 28 | instruction. Thankfully, recent commodity x64 processors have such an instruction.
 29 | 
 30 | ## A word on RAM access
 31 | 
 32 | There is no slower processor than an idle processor waiting for the memory
 33 | subsystem.
 34 | 
 35 | When working with large data sets, it is tempting to decompress them from RAM
 36 | to RAM, converting gigabytes of compressed data into (many more) gigabytes of
 37 | uncompressed data.
 38 | 
 39 | If the purpose of compression is to keep more of the data close to the CPU,
 40 | then this is wasteful.
 41 | 
 42 | One should engineer applications so as to work on cache-friendly blocks. For
 43 | example, if you have an array made of billions of values, instead of decoding them
 44 | all to RAM, and then reading them, it is much better to decode them in small blocks
 45 | at a time. In fact, ideally, one would prefer not to decode the data at all if possible:
 46 | working directly over the compressed data would be ideal.
 47 | 
 48 | If you must decode gigabytes of data to RAM or to disk, then you should expect
 49 | to be wasting enormous quantities of CPU cycles.
 50 | 
 51 | ## Usage
 52 | 
 53 | ```bash
 54 | make && make test
 55 | ./decodebenchmark
 56 | ```
 57 | 
 58 | ## Experimental results (Skylake, August 24th 2016)
 59 | 
 60 | We find that an AVX2 dictionary decoder can be more than twice as fast as a good scalar decoder
 61 | on a recent Intel processor (Skylake) for modest dictionary sizes. Even with large
 62 | dictionaries, the AVX2 gather approach is still remarkably faster. See results below. We expect results on older
 63 | Intel architectures to be less impressive because the ``vpgather`` instruction that we use was
 64 | quite slow in its early incarnations.
 65 | 
 66 | The case with a large dictionary as implemented here is somewhat pessimistic, as it assumes
 67 | that all values are equally likely. In most instances, a dictionary will have frequent
 68 | values, more likely to be repeated. This will reduce the number of cache misses.
 69 | 
 70 | Also, in practice one might limit the size of the dictionary by horizontal partitions.
 71 | 
 72 | ```bash
 73 | $ ./decodebenchmark
 74 | For this benchmark, use a recent (Skylake) Intel processor for best results.
 75 | Intel processor:  Skylake     compiler version: 5.3.0 20151204        AVX2 is available.
 76 | Using array sizes of 8388608 values or 65536 kiB.
 77 | testing with dictionary of size 2
 78 | Actual dict size: 2
 79 |         scalarcodec.uncompress(t,newbuf):  4.00 cycles per decoded value
 80 |    decodetocache(&sc, &t,newbuf,bufsize):  3.06 cycles per decoded value
 81 |            avxcodec.uncompress(t,newbuf):  3.45 cycles per decoded value
 82 |   AVXDictCODEC::fastuncompress(t,newbuf):  1.91 cycles per decoded value
 83 |      AVXdecodetocache(&t,newbuf,bufsize):  1.15 cycles per decoded value
 84 | 
 85 | testing with dictionary of size 4
 86 | Actual dict size: 4
 87 |         scalarcodec.uncompress(t,newbuf):  3.99 cycles per decoded value
 88 |    decodetocache(&sc, &t,newbuf,bufsize):  3.06 cycles per decoded value
 89 |            avxcodec.uncompress(t,newbuf):  3.46 cycles per decoded value
 90 |   AVXDictCODEC::fastuncompress(t,newbuf):  1.91 cycles per decoded value
 91 |      AVXdecodetocache(&t,newbuf,bufsize):  1.19 cycles per decoded value
 92 | 
 93 | testing with dictionary of size 8
 94 | Actual dict size: 8
 95 |         scalarcodec.uncompress(t,newbuf):  3.52 cycles per decoded value
 96 |    decodetocache(&sc, &t,newbuf,bufsize):  2.38 cycles per decoded value
 97 |            avxcodec.uncompress(t,newbuf):  3.49 cycles per decoded value
 98 |   AVXDictCODEC::fastuncompress(t,newbuf):  1.93 cycles per decoded value
 99 |      AVXdecodetocache(&t,newbuf,bufsize):  1.17 cycles per decoded value
100 | 
101 | testing with dictionary of size 16
102 | Actual dict size: 16
103 |         scalarcodec.uncompress(t,newbuf):  4.01 cycles per decoded value
104 |    decodetocache(&sc, &t,newbuf,bufsize):  3.08 cycles per decoded value
105 |            avxcodec.uncompress(t,newbuf):  3.50 cycles per decoded value
106 |   AVXDictCODEC::fastuncompress(t,newbuf):  1.95 cycles per decoded value
107 |      AVXdecodetocache(&t,newbuf,bufsize):  1.19 cycles per decoded value
108 | 
109 | testing with dictionary of size 32
110 | Actual dict size: 32
111 |         scalarcodec.uncompress(t,newbuf):  4.02 cycles per decoded value
112 |    decodetocache(&sc, &t,newbuf,bufsize):  3.06 cycles per decoded value
113 |            avxcodec.uncompress(t,newbuf):  3.51 cycles per decoded value
114 |   AVXDictCODEC::fastuncompress(t,newbuf):  1.96 cycles per decoded value
115 |      AVXdecodetocache(&t,newbuf,bufsize):  1.18 cycles per decoded value
116 | 
117 | testing with dictionary of size 64
118 | Actual dict size: 64
119 |         scalarcodec.uncompress(t,newbuf):  4.02 cycles per decoded value
120 |    decodetocache(&sc, &t,newbuf,bufsize):  3.08 cycles per decoded value
121 |            avxcodec.uncompress(t,newbuf):  3.54 cycles per decoded value
122 |   AVXDictCODEC::fastuncompress(t,newbuf):  1.98 cycles per decoded value
123 |      AVXdecodetocache(&t,newbuf,bufsize):  1.17 cycles per decoded value
124 | 
125 | testing with dictionary of size 128
126 | Actual dict size: 128
127 |         scalarcodec.uncompress(t,newbuf):  3.59 cycles per decoded value
128 |    decodetocache(&sc, &t,newbuf,bufsize):  2.35 cycles per decoded value
129 |            avxcodec.uncompress(t,newbuf):  3.55 cycles per decoded value
130 |   AVXDictCODEC::fastuncompress(t,newbuf):  1.99 cycles per decoded value
131 |      AVXdecodetocache(&t,newbuf,bufsize):  1.14 cycles per decoded value
132 | 
133 | testing with dictionary of size 256
134 | Actual dict size: 256
135 |         scalarcodec.uncompress(t,newbuf):  4.03 cycles per decoded value
136 |    decodetocache(&sc, &t,newbuf,bufsize):  3.10 cycles per decoded value
137 |            avxcodec.uncompress(t,newbuf):  3.55 cycles per decoded value
138 |   AVXDictCODEC::fastuncompress(t,newbuf):  2.00 cycles per decoded value
139 |      AVXdecodetocache(&t,newbuf,bufsize):  1.22 cycles per decoded value
140 | 
141 | testing with dictionary of size 512
142 | Actual dict size: 512
143 |         scalarcodec.uncompress(t,newbuf):  4.04 cycles per decoded value
144 |    decodetocache(&sc, &t,newbuf,bufsize):  3.11 cycles per decoded value
145 |            avxcodec.uncompress(t,newbuf):  3.55 cycles per decoded value
146 |   AVXDictCODEC::fastuncompress(t,newbuf):  2.01 cycles per decoded value
147 |      AVXdecodetocache(&t,newbuf,bufsize):  1.20 cycles per decoded value
148 | 
149 | testing with dictionary of size 1024
150 | Actual dict size: 1024
151 |         scalarcodec.uncompress(t,newbuf):  4.04 cycles per decoded value
152 |    decodetocache(&sc, &t,newbuf,bufsize):  3.11 cycles per decoded value
153 |            avxcodec.uncompress(t,newbuf):  3.57 cycles per decoded value
154 |   AVXDictCODEC::fastuncompress(t,newbuf):  2.04 cycles per decoded value
155 |      AVXdecodetocache(&t,newbuf,bufsize):  1.18 cycles per decoded value
156 | 
157 | testing with dictionary of size 2048
158 | Actual dict size: 2048
159 |         scalarcodec.uncompress(t,newbuf):  4.08 cycles per decoded value
160 |    decodetocache(&sc, &t,newbuf,bufsize):  3.15 cycles per decoded value
161 |            avxcodec.uncompress(t,newbuf):  3.67 cycles per decoded value
162 |   AVXDictCODEC::fastuncompress(t,newbuf):  2.05 cycles per decoded value
163 |      AVXdecodetocache(&t,newbuf,bufsize):  1.22 cycles per decoded value
164 | 
165 | testing with dictionary of size 4096
166 | Actual dict size: 4096
167 |         scalarcodec.uncompress(t,newbuf):  4.14 cycles per decoded value
168 |    decodetocache(&sc, &t,newbuf,bufsize):  3.33 cycles per decoded value
169 |            avxcodec.uncompress(t,newbuf):  3.69 cycles per decoded value
170 |   AVXDictCODEC::fastuncompress(t,newbuf):  2.12 cycles per decoded value
171 |      AVXdecodetocache(&t,newbuf,bufsize):  1.32 cycles per decoded value
172 | 
173 | testing with dictionary of size 8192
174 | Actual dict size: 8192
175 |         scalarcodec.uncompress(t,newbuf):  4.35 cycles per decoded value
176 |    decodetocache(&sc, &t,newbuf,bufsize):  3.65 cycles per decoded value
177 |            avxcodec.uncompress(t,newbuf):  3.85 cycles per decoded value
178 |   AVXDictCODEC::fastuncompress(t,newbuf):  2.28 cycles per decoded value
179 |      AVXdecodetocache(&t,newbuf,bufsize):  1.67 cycles per decoded value
180 | 
181 | testing with dictionary of size 16384
182 | Actual dict size: 16384
183 |         scalarcodec.uncompress(t,newbuf):  4.51 cycles per decoded value
184 |    decodetocache(&sc, &t,newbuf,bufsize):  3.95 cycles per decoded value
185 |            avxcodec.uncompress(t,newbuf):  4.07 cycles per decoded value
186 |   AVXDictCODEC::fastuncompress(t,newbuf):  2.55 cycles per decoded value
187 |      AVXdecodetocache(&t,newbuf,bufsize):  2.12 cycles per decoded value
188 | 
189 | testing with dictionary of size 32768
190 | Actual dict size: 32768
191 |         scalarcodec.uncompress(t,newbuf):  4.88 cycles per decoded value
192 |    decodetocache(&sc, &t,newbuf,bufsize):  3.84 cycles per decoded value
193 |            avxcodec.uncompress(t,newbuf):  4.89 cycles per decoded value
194 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.52 cycles per decoded value
195 |      AVXdecodetocache(&t,newbuf,bufsize):  3.02 cycles per decoded value
196 | 
197 | testing with dictionary of size 65536
198 | Actual dict size: 65536
199 |         scalarcodec.uncompress(t,newbuf):  7.14 cycles per decoded value
200 |    decodetocache(&sc, &t,newbuf,bufsize):  5.47 cycles per decoded value
201 |            avxcodec.uncompress(t,newbuf):  6.68 cycles per decoded value
202 |   AVXDictCODEC::fastuncompress(t,newbuf):  5.18 cycles per decoded value
203 |      AVXdecodetocache(&t,newbuf,bufsize):  4.53 cycles per decoded value
204 | 
205 | testing with dictionary of size 131072
206 | Actual dict size: 131072
207 |         scalarcodec.uncompress(t,newbuf):  7.96 cycles per decoded value
208 |    decodetocache(&sc, &t,newbuf,bufsize):  6.05 cycles per decoded value
209 |            avxcodec.uncompress(t,newbuf):  7.53 cycles per decoded value
210 |   AVXDictCODEC::fastuncompress(t,newbuf):  6.01 cycles per decoded value
211 |      AVXdecodetocache(&t,newbuf,bufsize):  5.43 cycles per decoded value
212 | 
213 | testing with dictionary of size 262144
214 | Actual dict size: 262144
215 |         scalarcodec.uncompress(t,newbuf):  8.30 cycles per decoded value
216 |    decodetocache(&sc, &t,newbuf,bufsize):  6.35 cycles per decoded value
217 |            avxcodec.uncompress(t,newbuf):  8.08 cycles per decoded value
218 |   AVXDictCODEC::fastuncompress(t,newbuf):  6.46 cycles per decoded value
219 |      AVXdecodetocache(&t,newbuf,bufsize):  5.66 cycles per decoded value
220 | 
221 | testing with dictionary of size 524288
222 | Actual dict size: 524288
223 |         scalarcodec.uncompress(t,newbuf):  8.48 cycles per decoded value
224 |    decodetocache(&sc, &t,newbuf,bufsize):  6.39 cycles per decoded value
225 |            avxcodec.uncompress(t,newbuf):  8.09 cycles per decoded value
226 |   AVXDictCODEC::fastuncompress(t,newbuf):  6.44 cycles per decoded value
227 |      AVXdecodetocache(&t,newbuf,bufsize):  5.83 cycles per decoded value
228 | 
229 | testing with dictionary of size 1048576
230 | Actual dict size: 1048235
231 |         scalarcodec.uncompress(t,newbuf):  11.85 cycles per decoded value
232 |    decodetocache(&sc, &t,newbuf,bufsize):  10.53 cycles per decoded value
233 |            avxcodec.uncompress(t,newbuf):  11.65 cycles per decoded value
234 |   AVXDictCODEC::fastuncompress(t,newbuf):  8.47 cycles per decoded value
235 |      AVXdecodetocache(&t,newbuf,bufsize):  8.07 cycles per decoded value
236 | ```
237 | 
238 | ## Experimental results (Knights Landing, August 24th 2016)
239 | 
240 | We find that an AVX-512 dictionary decoder can be more than twice as fast as an AVX dictionary
241 | decoder which is in turn twice as fast as a scalar decoder
242 | on a recent Intel processor (Knights Landing) for modest dictionary sizes. 
243 | The case with large dictionary as implemented here is somewhat pessimistic as it assumes
244 | that all values are equally likely.
245 | 
246 | 
247 | ```bash
248 | $ ./decodebenchmark
249 | For this benchmark, use a recent (Skylake) Intel processor for best results.
250 | Intel processor:  UNKNOWN     compiler version: 5.3.0        AVX2 is available.
251 | Using array sizes of 8388608 values or 65536 kiB.
252 | testing with dictionary of size 2
253 | Actual dict size: 2
254 |         scalarcodec.uncompress(t,newbuf):  7.75 cycles per decoded value
255 |    decodetocache(&sc, &t,newbuf,bufsize):  7.39 cycles per decoded value
256 |            avxcodec.uncompress(t,newbuf):  6.26 cycles per decoded value
257 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.22 cycles per decoded value
258 |      AVXdecodetocache(&t,newbuf,bufsize):  3.06 cycles per decoded value
259 | AVX512DictCODEC::fastuncompress(t,newbuf):  1.48 cycles per decoded value
260 |   AVX512decodetocache(&t,newbuf,bufsize):  1.14 cycles per decoded value
261 | 
262 | testing with dictionary of size 4
263 | Actual dict size: 4
264 |         scalarcodec.uncompress(t,newbuf):  7.83 cycles per decoded value
265 |    decodetocache(&sc, &t,newbuf,bufsize):  7.49 cycles per decoded value
266 |            avxcodec.uncompress(t,newbuf):  6.35 cycles per decoded value
267 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.23 cycles per decoded value
268 |      AVXdecodetocache(&t,newbuf,bufsize):  3.10 cycles per decoded value
269 | AVX512DictCODEC::fastuncompress(t,newbuf):  1.49 cycles per decoded value
270 |   AVX512decodetocache(&t,newbuf,bufsize):  1.21 cycles per decoded value
271 | 
272 | testing with dictionary of size 8
273 | Actual dict size: 8
274 |         scalarcodec.uncompress(t,newbuf):  7.27 cycles per decoded value
275 |    decodetocache(&sc, &t,newbuf,bufsize):  6.99 cycles per decoded value
276 |            avxcodec.uncompress(t,newbuf):  6.17 cycles per decoded value
277 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.23 cycles per decoded value
278 |      AVXdecodetocache(&t,newbuf,bufsize):  3.10 cycles per decoded value
279 | AVX512DictCODEC::fastuncompress(t,newbuf):  1.59 cycles per decoded value
280 |   AVX512decodetocache(&t,newbuf,bufsize):  1.25 cycles per decoded value
281 | 
282 | testing with dictionary of size 16
283 | Actual dict size: 16
284 |         scalarcodec.uncompress(t,newbuf):  7.98 cycles per decoded value
285 |    decodetocache(&sc, &t,newbuf,bufsize):  7.65 cycles per decoded value
286 |            avxcodec.uncompress(t,newbuf):  6.32 cycles per decoded value
287 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.23 cycles per decoded value
288 |      AVXdecodetocache(&t,newbuf,bufsize):  3.16 cycles per decoded value
289 | AVX512DictCODEC::fastuncompress(t,newbuf):  1.68 cycles per decoded value
290 |   AVX512decodetocache(&t,newbuf,bufsize):  1.34 cycles per decoded value
291 | 
292 | testing with dictionary of size 32
293 | Actual dict size: 32
294 |         scalarcodec.uncompress(t,newbuf):  7.92 cycles per decoded value
295 |    decodetocache(&sc, &t,newbuf,bufsize):  7.63 cycles per decoded value
296 |            avxcodec.uncompress(t,newbuf):  6.27 cycles per decoded value
297 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.23 cycles per decoded value
298 |      AVXdecodetocache(&t,newbuf,bufsize):  3.19 cycles per decoded value
299 | AVX512DictCODEC::fastuncompress(t,newbuf):  1.65 cycles per decoded value
300 |   AVX512decodetocache(&t,newbuf,bufsize):  1.43 cycles per decoded value
301 | 
302 | testing with dictionary of size 64
303 | Actual dict size: 64
304 |         scalarcodec.uncompress(t,newbuf):  8.05 cycles per decoded value
305 |    decodetocache(&sc, &t,newbuf,bufsize):  7.76 cycles per decoded value
306 |            avxcodec.uncompress(t,newbuf):  6.32 cycles per decoded value
307 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.31 cycles per decoded value
308 |      AVXdecodetocache(&t,newbuf,bufsize):  3.25 cycles per decoded value
309 | AVX512DictCODEC::fastuncompress(t,newbuf):  1.85 cycles per decoded value
310 |   AVX512decodetocache(&t,newbuf,bufsize):  1.66 cycles per decoded value
311 | 
312 | testing with dictionary of size 128
313 | Actual dict size: 128
314 |         scalarcodec.uncompress(t,newbuf):  6.64 cycles per decoded value
315 |    decodetocache(&sc, &t,newbuf,bufsize):  6.36 cycles per decoded value
316 |            avxcodec.uncompress(t,newbuf):  6.19 cycles per decoded value
317 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.34 cycles per decoded value
318 |      AVXdecodetocache(&t,newbuf,bufsize):  3.28 cycles per decoded value
319 | AVX512DictCODEC::fastuncompress(t,newbuf):  1.83 cycles per decoded value
320 |   AVX512decodetocache(&t,newbuf,bufsize):  1.57 cycles per decoded value
321 | 
322 | testing with dictionary of size 256
323 | Actual dict size: 256
324 |         scalarcodec.uncompress(t,newbuf):  8.07 cycles per decoded value
325 |    decodetocache(&sc, &t,newbuf,bufsize):  7.87 cycles per decoded value
326 |            avxcodec.uncompress(t,newbuf):  6.39 cycles per decoded value
327 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.39 cycles per decoded value
328 |      AVXdecodetocache(&t,newbuf,bufsize):  3.35 cycles per decoded value
329 | AVX512DictCODEC::fastuncompress(t,newbuf):  1.95 cycles per decoded value
330 |   AVX512decodetocache(&t,newbuf,bufsize):  1.69 cycles per decoded value
331 | 
332 | testing with dictionary of size 512
333 | Actual dict size: 512
334 |         scalarcodec.uncompress(t,newbuf):  8.07 cycles per decoded value
335 |    decodetocache(&sc, &t,newbuf,bufsize):  7.87 cycles per decoded value
336 |            avxcodec.uncompress(t,newbuf):  6.32 cycles per decoded value
337 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.52 cycles per decoded value
338 |      AVXdecodetocache(&t,newbuf,bufsize):  3.48 cycles per decoded value
339 | AVX512DictCODEC::fastuncompress(t,newbuf):  2.04 cycles per decoded value
340 |   AVX512decodetocache(&t,newbuf,bufsize):  1.76 cycles per decoded value
341 | 
342 | testing with dictionary of size 1024
343 | Actual dict size: 1024
344 |         scalarcodec.uncompress(t,newbuf):  8.22 cycles per decoded value
345 |    decodetocache(&sc, &t,newbuf,bufsize):  7.97 cycles per decoded value
346 |            avxcodec.uncompress(t,newbuf):  6.43 cycles per decoded value
347 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.63 cycles per decoded value
348 |      AVXdecodetocache(&t,newbuf,bufsize):  3.57 cycles per decoded value
349 | AVX512DictCODEC::fastuncompress(t,newbuf):  2.05 cycles per decoded value
350 |   AVX512decodetocache(&t,newbuf,bufsize):  1.83 cycles per decoded value
351 | 
352 | testing with dictionary of size 2048
353 | Actual dict size: 2048
354 |         scalarcodec.uncompress(t,newbuf):  7.97 cycles per decoded value
355 |    decodetocache(&sc, &t,newbuf,bufsize):  7.69 cycles per decoded value
356 |            avxcodec.uncompress(t,newbuf):  6.37 cycles per decoded value
357 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.76 cycles per decoded value
358 |      AVXdecodetocache(&t,newbuf,bufsize):  3.64 cycles per decoded value
359 | AVX512DictCODEC::fastuncompress(t,newbuf):  2.11 cycles per decoded value
360 |   AVX512decodetocache(&t,newbuf,bufsize):  1.91 cycles per decoded value
361 | 
362 | testing with dictionary of size 4096
363 | Actual dict size: 4096
364 |         scalarcodec.uncompress(t,newbuf):  8.53 cycles per decoded value
365 |    decodetocache(&sc, &t,newbuf,bufsize):  8.20 cycles per decoded value
366 |            avxcodec.uncompress(t,newbuf):  6.67 cycles per decoded value
367 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.58 cycles per decoded value
368 |      AVXdecodetocache(&t,newbuf,bufsize):  3.56 cycles per decoded value
369 | AVX512DictCODEC::fastuncompress(t,newbuf):  2.55 cycles per decoded value
370 |   AVX512decodetocache(&t,newbuf,bufsize):  2.35 cycles per decoded value
371 | 
372 | testing with dictionary of size 8192
373 | Actual dict size: 8192
374 |         scalarcodec.uncompress(t,newbuf):  8.66 cycles per decoded value
375 |    decodetocache(&sc, &t,newbuf,bufsize):  8.27 cycles per decoded value
376 |            avxcodec.uncompress(t,newbuf):  6.79 cycles per decoded value
377 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.92 cycles per decoded value
378 |      AVXdecodetocache(&t,newbuf,bufsize):  3.86 cycles per decoded value
379 | AVX512DictCODEC::fastuncompress(t,newbuf):  2.80 cycles per decoded value
380 |   AVX512decodetocache(&t,newbuf,bufsize):  2.54 cycles per decoded value
381 | 
382 | testing with dictionary of size 16384
383 | Actual dict size: 16384
384 |         scalarcodec.uncompress(t,newbuf):  8.85 cycles per decoded value
385 |    decodetocache(&sc, &t,newbuf,bufsize):  8.55 cycles per decoded value
386 |            avxcodec.uncompress(t,newbuf):  6.95 cycles per decoded value
387 |   AVXDictCODEC::fastuncompress(t,newbuf):  4.05 cycles per decoded value
388 |      AVXdecodetocache(&t,newbuf,bufsize):  3.87 cycles per decoded value
389 | AVX512DictCODEC::fastuncompress(t,newbuf):  3.14 cycles per decoded value
390 |   AVX512decodetocache(&t,newbuf,bufsize):  2.96 cycles per decoded value
391 | 
392 | testing with dictionary of size 32768
393 | Actual dict size: 32768
394 |         scalarcodec.uncompress(t,newbuf):  6.75 cycles per decoded value
395 |    decodetocache(&sc, &t,newbuf,bufsize):  6.81 cycles per decoded value
396 |            avxcodec.uncompress(t,newbuf):  6.94 cycles per decoded value
397 |   AVXDictCODEC::fastuncompress(t,newbuf):  3.68 cycles per decoded value
398 |      AVXdecodetocache(&t,newbuf,bufsize):  3.58 cycles per decoded value
399 | AVX512DictCODEC::fastuncompress(t,newbuf):  3.41 cycles per decoded value
400 |   AVX512decodetocache(&t,newbuf,bufsize):  3.24 cycles per decoded value
401 | 
402 | testing with dictionary of size 65536
403 | Actual dict size: 65536
404 |         scalarcodec.uncompress(t,newbuf):  11.75 cycles per decoded value
405 |    decodetocache(&sc, &t,newbuf,bufsize):  13.76 cycles per decoded value
406 |            avxcodec.uncompress(t,newbuf):  9.64 cycles per decoded value
407 |   AVXDictCODEC::fastuncompress(t,newbuf):  5.29 cycles per decoded value
408 |      AVXdecodetocache(&t,newbuf,bufsize):  5.50 cycles per decoded value
409 | AVX512DictCODEC::fastuncompress(t,newbuf):  4.54 cycles per decoded value
410 |   AVX512decodetocache(&t,newbuf,bufsize):  4.66 cycles per decoded value
411 | 
412 | testing with dictionary of size 131072
413 | Actual dict size: 131072
414 |         scalarcodec.uncompress(t,newbuf):  19.07 cycles per decoded value
415 |    decodetocache(&sc, &t,newbuf,bufsize):  19.53 cycles per decoded value
416 |            avxcodec.uncompress(t,newbuf):  17.02 cycles per decoded value
417 |   AVXDictCODEC::fastuncompress(t,newbuf):  11.02 cycles per decoded value
418 |      AVXdecodetocache(&t,newbuf,bufsize):  11.01 cycles per decoded value
419 | AVX512DictCODEC::fastuncompress(t,newbuf):  8.03 cycles per decoded value
420 |   AVX512decodetocache(&t,newbuf,bufsize):  8.01 cycles per decoded value
421 | 
422 | testing with dictionary of size 262144
423 | Actual dict size: 262144
424 |         scalarcodec.uncompress(t,newbuf):  22.84 cycles per decoded value
425 |    decodetocache(&sc, &t,newbuf,bufsize):  23.12 cycles per decoded value
426 |            avxcodec.uncompress(t,newbuf):  20.63 cycles per decoded value
427 |   AVXDictCODEC::fastuncompress(t,newbuf):  16.57 cycles per decoded value
428 |      AVXdecodetocache(&t,newbuf,bufsize):  16.45 cycles per decoded value
429 | AVX512DictCODEC::fastuncompress(t,newbuf):  13.68 cycles per decoded value
430 |   AVX512decodetocache(&t,newbuf,bufsize):  13.69 cycles per decoded value
431 | 
432 | testing with dictionary of size 524288
433 | Actual dict size: 524288
434 |         scalarcodec.uncompress(t,newbuf):  22.34 cycles per decoded value
435 |    decodetocache(&sc, &t,newbuf,bufsize):  22.54 cycles per decoded value
436 |            avxcodec.uncompress(t,newbuf):  20.36 cycles per decoded value
437 |   AVXDictCODEC::fastuncompress(t,newbuf):  16.30 cycles per decoded value
438 |      AVXdecodetocache(&t,newbuf,bufsize):  16.34 cycles per decoded value
439 | AVX512DictCODEC::fastuncompress(t,newbuf):  14.91 cycles per decoded value
440 |   AVX512decodetocache(&t,newbuf,bufsize):  14.94 cycles per decoded value
441 | 
442 | testing with dictionary of size 1048576
443 | Actual dict size: 1048235
444 |         scalarcodec.uncompress(t,newbuf):  21.93 cycles per decoded value
445 |    decodetocache(&sc, &t,newbuf,bufsize):  22.11 cycles per decoded value
446 |            avxcodec.uncompress(t,newbuf):  19.91 cycles per decoded value
447 |   AVXDictCODEC::fastuncompress(t,newbuf):  16.33 cycles per decoded value
448 |      AVXdecodetocache(&t,newbuf,bufsize):  16.30 cycles per decoded value
449 | AVX512DictCODEC::fastuncompress(t,newbuf):  15.32 cycles per decoded value
450 |   AVX512decodetocache(&t,newbuf,bufsize):  15.31 cycles per decoded value
451 | 
452 | ```
453 | 
454 | ## Limitations
455 | - We support just one dictionary. In practice, one might want to use horizontal partitions.
456 | - We do not have a realistic usage of the dictionary values (we use a uniform distribution).
457 | - For simplicity, we assume that the dictionary is made of 64-bit words. It is hard-coded in the code, but not a fundamental limitation: the code would be faster with smaller words.
458 | - This code is not meant to be used in production. It is a demo.
459 | - This code makes up its own convenient format. It is not meant to plug as-is into an existing framework.
460 | - We assume that the arrays are large. If you have tiny arrays... well...
461 | - We effectively measure steady-state throughput. So we ignore costs such as loading up the dictionary in CPU cache.
462 | 
463 | ## Authors
464 | Daniel Lemire and Eric Daniel (motivated by ``parquet-cpp``)
465 | 
466 | 
467 | ## Other relevant libraries
468 | 
469 | * SIMDCompressionAndIntersection: A C++ library to compress and intersect sorted lists of integers using SIMD instructions https://github.com/lemire/SIMDCompressionAndIntersection
470 | * The FastPFOR C++ library : Fast integer compression https://github.com/lemire/FastPFor
471 | * LittleIntPacker: C library to pack and unpack short arrays of integers as fast as possible https://github.com/lemire/LittleIntPacker
472 | * The SIMDComp library: A simple C library for compressing lists of integers using binary packing https://github.com/lemire/simdcomp
473 | * StreamVByte: Fast integer compression in C using the StreamVByte codec https://github.com/lemire/streamvbyte
474 | * MaskedVByte: Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
475 | * CSharpFastPFOR: A C#  integer compression library  https://github.com/Genbox/CSharpFastPFOR
476 | * JavaFastPFOR: A Java integer compression library https://github.com/lemire/JavaFastPFOR
477 | * Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding
478 | * FrameOfReference is a C++ library dedicated to frame-of-reference (FOR) compression: https://github.com/lemire/FrameOfReference
479 | * libvbyte: A fast implementation for varbyte 32bit/64bit integer compression https://github.com/cruppstahl/libvbyte
480 | * TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
481 | * Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
482 | 
483 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef DICT_BENCH_H
  3 | #define DICT_BENCH_H
  4 | 
  5 | #include <cstdio>
  6 | 
  7 | // useful for basic info (0)
  8 | static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
  9 |                                 unsigned int *ecx, unsigned int *edx) {
 10 |     __asm volatile("cpuid"
 11 |                    : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
 12 |                    : "0"(*eax), "2"(*ecx));
 13 | }
 14 | 
 15 | // this is quite imperfect, but can be handy
 16 | static inline const char *guessprocessor() {
 17 |     unsigned eax = 1, ebx = 0, ecx = 0, edx = 0;
 18 |     native_cpuid(&eax, &ebx, &ecx, &edx);
 19 |     const char *codename;
 20 |     switch (eax >> 4) {
 21 |     case 0x506E:
 22 |         codename = "Skylake";
 23 |         break;
 24 |     case 0x406C:
 25 |         codename = "CherryTrail";
 26 |         break;
 27 |     case 0x306D:
 28 |         codename = "Broadwell";
 29 |         break;
 30 |     case 0x306C:
 31 |         codename = "Haswell";
 32 |         break;
 33 |     case 0x306A:
 34 |         codename = "IvyBridge";
 35 |         break;
 36 |     case 0x206A:
 37 |     case 0x206D:
 38 |         codename = "SandyBridge";
 39 |         break;
 40 |     case 0x2065:
 41 |     case 0x206C:
 42 |     case 0x206F:
 43 |         codename = "Westmere";
 44 |         break;
 45 |     case 0x106E:
 46 |     case 0x106A:
 47 |     case 0x206E:
 48 |         codename = "Nehalem";
 49 |         break;
 50 |     case 0x1067:
 51 |     case 0x106D:
 52 |         codename = "Penryn";
 53 |         break;
 54 |     case 0x006F:
 55 |     case 0x1066:
 56 |         codename = "Merom";
 57 |         break;
 58 |     case 0x0066:
 59 |         codename = "Presler";
 60 |         break;
 61 |     case 0x0063:
 62 |     case 0x0064:
 63 |         codename = "Prescott";
 64 |         break;
 65 |     case 0x006D:
 66 |         codename = "Dothan";
 67 |         break;
 68 |     case 0x0366:
 69 |         codename = "Cedarview";
 70 |         break;
 71 |     case 0x0266:
 72 |         codename = "Lincroft";
 73 |         break;
 74 |     case 0x016C:
 75 |         codename = "Pineview";
 76 |         break;
 77 |     default:
 78 |         codename = "UNKNOWN";
 79 |         break;
 80 |     }
 81 |     return codename;
 82 | }
 83 | 
 84 | static inline void tellmeall() {
 85 | 
 86 | #ifdef __arm__
 87 |     printf("ARM processor detected\n");
 88 | #else
 89 |     printf("Intel processor:  %s\t", guessprocessor());
 90 | #endif
 91 | 
 92 | #ifdef __VERSION__
 93 |     printf(" compiler version: %s\t", __VERSION__);
 94 | #endif
 95 | #ifndef __AVX2__
 96 |     printf("AVX2 is NOT available.\n");
 97 | #else
 98 |     printf("\tAVX2 is available.");
 99 | #endif
100 |     printf("\n");
101 | 
102 | }
103 | 
104 | 
/*
 * Stores the current time-stamp counter into `cycles` (a uint64_t lvalue) at
 * the start of a timed region. CPUID is issued before RDTSC.
 * NOTE(review): presumably CPUID acts as a serializing barrier here, as in
 * Intel's benchmarking recipe - confirm. Requires x86-64 and GCC-style asm.
 */
#define RDTSC_START(cycles)                                                   \
    do {                                                                      \
        unsigned tsc_hi, tsc_lo;                                              \
        __asm volatile(                                                       \
            "cpuid\n\t"                                                       \
            "rdtsc\n\t"                                                       \
            "mov %%edx, %0\n\t"                                               \
            "mov %%eax, %1\n\t"                                               \
            : "=r"(tsc_hi), "=r"(tsc_lo)                                      \
            :                                                                 \
            : "%rax", "%rbx", "%rcx", "%rdx");                                \
        /* EDX holds the upper 32 bits of the counter, EAX the lower 32. */   \
        (cycles) = (static_cast<uint64_t>(tsc_hi) << 32) | tsc_lo;            \
    } while (0)
116 | 
/*
 * Stores the current time-stamp counter into `cycles` (a uint64_t lvalue) at
 * the end of a timed region. RDTSCP reads the counter, the value is copied
 * out, and CPUID is issued afterwards.
 * NOTE(review): presumably the trailing CPUID keeps later instructions from
 * being reordered into the timed region - confirm. Requires x86-64 and
 * GCC-style asm.
 */
#define RDTSC_FINAL(cycles)                                                   \
    do {                                                                      \
        unsigned tsc_hi, tsc_lo;                                              \
        __asm volatile(                                                       \
            "rdtscp\n\t"                                                      \
            "mov %%edx, %0\n\t"                                               \
            "mov %%eax, %1\n\t"                                               \
            "cpuid\n\t"                                                       \
            : "=r"(tsc_hi), "=r"(tsc_lo)                                      \
            :                                                                 \
            : "%rax", "%rbx", "%rcx", "%rdx");                                \
        /* EDX holds the upper 32 bits of the counter, EAX the lower 32. */   \
        (cycles) = (static_cast<uint64_t>(tsc_hi) << 32) | tsc_lo;            \
    } while (0)
128 | 
129 | 
130 | 
/*
 * Times `test` and prints the best (smallest) number of cycles per decoded
 * value observed over `repeat` runs.
 *
 *   test   - the expression to time; it is also printed verbatim as the label
 *   answer - the value `test` is expected to produce; a mismatch on any run
 *            appends " [ERROR]" to the printed line
 *   repeat - how many times `test` is evaluated
 *   size   - number of values processed by one evaluation of `test`
 *
 * Every argument is parenthesized on use so that compound expressions such
 * as `a + b` or `c ? x : y` keep their meaning, and the loop counter has a
 * reserved-looking name so that a caller's own `i` inside `test` is not
 * silently captured. Requires RDTSC_START / RDTSC_FINAL to be defined.
 */
#define BEST_TIME(test, answer, repeat, size)                         \
    do {                                                              \
        printf("%40s: ", #test);                                      \
        fflush(NULL);                                                 \
        uint64_t cycles_start, cycles_final, cycles_diff;             \
        uint64_t min_diff = (uint64_t)-1;                             \
        int wrong_answer = 0;                                         \
        for (int best_time_iter_ = 0; best_time_iter_ < (repeat);     \
             best_time_iter_++) {                                     \
            __asm volatile("" ::: /* pretend to clobber */ "memory"); \
            RDTSC_START(cycles_start);                                \
            if ((test) != (answer)) wrong_answer = 1;                 \
            RDTSC_FINAL(cycles_final);                                \
            cycles_diff = (cycles_final - cycles_start);              \
            if (cycles_diff < min_diff) min_diff = cycles_diff;       \
        }                                                             \
        uint64_t S = (uint64_t)(size);                                \
        float cycle_per_op = (min_diff) / (float)S;                   \
        printf(" %.2f cycles per decoded value", cycle_per_op);       \
        if (wrong_answer) printf(" [ERROR]");                         \
        printf("\n");                                                 \
        fflush(NULL);                                                 \
    } while (0)
159 | 
160 | #endif
161 | 


--------------------------------------------------------------------------------
/benchmarks/decodebenchmark.cpp:
--------------------------------------------------------------------------------
  1 | #include <cstdint>
  2 | #include <cassert>
  3 | #include <iostream>
  4 | 
  5 | #include <stdlib.h>
  6 | 
  7 | #include "benchmark.h"
  8 | 
  9 | #ifdef __AVX2__
 10 | #include "avxcodec.h"
 11 | #endif
 12 | 
 13 | #ifdef __AVX512F__
 14 | #include "avx512codec.h"
 15 | #endif
 16 | 
 17 | #include "scalarcodec.h"
 18 | 
 19 | 
 20 | void fill_buffer(uint64_t * buf, uint32_t length, uint32_t distinct)
 21 | {
 22 |     srand(1);
 23 |     for(size_t i = 0; i < length; i++) {
 24 |         buf[i] = rand() % distinct;
 25 |         //buf[i] = i % distinct;     // would produce streaming reads from the dictionary
 26 |     }
 27 | }
 28 | 
 29 | void scalartest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
 30 |     uint64_t * buf = new uint64_t[length];
 31 |     fill_buffer(buf, length, distinct);
 32 | 
 33 |     SimpleDictCODEC scalarcodec;
 34 |     dictionary_coded_t t = scalarcodec.compress(buf, length);
 35 |     std::cout << "Actual dict size: " << t.dictionary_size << std::endl;
 36 |     uint64_t * newbuf = new uint64_t[length];
 37 |     BEST_TIME(scalarcodec.uncompress(t,newbuf), length, repeat, length);
 38 |     for(size_t i = 0; i < length; i++) {
 39 |         assert(buf[i] == newbuf[i]);
 40 |     }
 41 |     delete[] newbuf;
 42 |     delete[] buf;
 43 | }
 44 | 
 45 | 
 46 | size_t decodetocache(SimpleDictCODEC * scalarcodec, dictionary_coded_t * t, uint64_t * newbuf, size_t blocksize) {
 47 |   size_t totaldecoded = 0;
 48 |   size_t leftover = t->array_length;
 49 |   for(size_t i = 0; i <  t->array_length; i += blocksize) {
 50 |     size_t todecode = leftover > blocksize ? blocksize : leftover;
 51 |     totaldecoded += todecode;
 52 |     leftover = scalarcodec->rangeuncompress(*t,newbuf, i , todecode);
 53 |   }
 54 |   return totaldecoded;
 55 | }
 56 | 
 57 | void scalarcachetest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
 58 |     uint64_t * buf = new uint64_t[length];
 59 |     fill_buffer(buf, length, distinct);
 60 |     SimpleDictCODEC sc;
 61 |     dictionary_coded_t t (sc.compress(buf, length) );
 62 |     size_t bufsize = 1 << 16;
 63 |     uint64_t * newbuf = new uint64_t[bufsize];
 64 |     BEST_TIME(decodetocache(&sc, &t,newbuf,bufsize), length, repeat, length);
 65 |     for(size_t i = length - bufsize; i < length; i++) {
 66 |         assert(buf[i] == newbuf[i - length + bufsize]);
 67 |     }
 68 |     delete[] newbuf;
 69 |     delete[] buf;
 70 | }
 71 | 
 72 | 
 73 | #ifdef __AVX2__
 74 | void mediumtest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
 75 |     uint64_t * buf = new uint64_t[length];
 76 |     fill_buffer(buf, length, distinct);
 77 | 
 78 |     AVXDictCODEC avxcodec;
 79 |     dictionary_coded_t t (avxcodec.compress(buf, length) );
 80 |     uint64_t * newbuf = new uint64_t[length];
 81 |     BEST_TIME(avxcodec.uncompress(t,newbuf), length, repeat, length);
 82 |     for(size_t i = 0; i < length; i++) {
 83 |         assert(buf[i] == newbuf[i]);
 84 |     }
 85 |     delete[] newbuf;
 86 |     delete[] buf;
 87 | }
 88 | 
 89 | 
 90 | void fasttest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
 91 |     uint64_t * buf = new uint64_t[length];
 92 |     fill_buffer(buf, length, distinct);
 93 |     dictionary_coded_t t (AVXDictCODEC().compress(buf, length) );
 94 |     uint64_t * newbuf = new uint64_t[length];
 95 |     BEST_TIME(AVXDictCODEC::fastuncompress(t,newbuf), length, repeat, length);
 96 |     for(size_t i = 0; i < length; i++) {
 97 |         assert(buf[i] == newbuf[i]);
 98 |     }
 99 |     delete[] newbuf;
100 |     delete[] buf;
101 | }
102 | 
103 | size_t AVXdecodetocache(dictionary_coded_t * t, uint64_t * newbuf, size_t blocksize) {
104 |   size_t totaldecoded = 0;
105 |   size_t leftover = t->array_length;
106 |   for(size_t i = 0; i <  t->array_length; i += blocksize) {
107 |     size_t todecode = leftover > blocksize ? blocksize : leftover;
108 |     totaldecoded += todecode;
109 |     leftover = AVXDictCODEC::fastrangeuncompress(*t,newbuf, i , todecode);
110 |   }
111 |   return totaldecoded;
112 | }
113 | 
114 | void fastcachetest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
115 |     uint64_t * buf = new uint64_t[length];
116 |     fill_buffer(buf, length, distinct);
117 | 
118 |     dictionary_coded_t t (AVXDictCODEC().compress(buf, length) );
119 |     size_t bufsize = 1 << 16;
120 |     uint64_t * newbuf = new uint64_t[bufsize];
121 |     BEST_TIME(AVXdecodetocache(&t,newbuf,bufsize), length, repeat, length);
122 |     for(size_t i = length - bufsize; i < length; i++) {
123 |         assert(buf[i] == newbuf[i - length + bufsize]);
124 |     }
125 |     delete[] newbuf;
126 |     delete[] buf;
127 | }
128 | #endif
129 | #ifdef __AVX512F__
130 | 
131 | void fastavx512test(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
132 |     uint64_t * buf = new uint64_t[length];
133 |     fill_buffer(buf, length, distinct);
134 |     dictionary_coded_t t (AVX512DictCODEC().compress(buf, length) );
135 |     uint64_t * newbuf = new uint64_t[length];
136 |     BEST_TIME(AVX512DictCODEC::fastuncompress(t,newbuf), length, repeat, length);
137 |     for(size_t i = 0; i < length; i++) {
138 |         assert(buf[i] == newbuf[i]);
139 |     }
140 |     delete[] newbuf;
141 |     delete[] buf;
142 | }
143 | 
144 | size_t AVX512decodetocache(dictionary_coded_t * t, uint64_t * newbuf, size_t blocksize) {
145 |   size_t totaldecoded = 0;
146 |   size_t leftover = t->array_length;
147 |   for(size_t i = 0; i <  t->array_length; i += blocksize) {
148 |     size_t todecode = leftover > blocksize ? blocksize : leftover;
149 |     totaldecoded += todecode;
150 |     leftover = AVX512DictCODEC::fastrangeuncompress(*t,newbuf, i , todecode);
151 |   }
152 |   return totaldecoded;
153 | }
154 | 
155 | void fastavx512cachetest(uint32_t distinct, uint32_t length = 1<<16, int repeat = 500) {
156 |     uint64_t * buf = new uint64_t[length];
157 |     fill_buffer(buf, length, distinct);
158 | 
159 |     dictionary_coded_t t (AVX512DictCODEC().compress(buf, length) );
160 |     size_t bufsize = 1 << 16;
161 |     uint64_t * newbuf = new uint64_t[bufsize];
162 |     BEST_TIME(AVX512decodetocache(&t,newbuf,bufsize), length, repeat, length);
163 |     for(size_t i = length - bufsize; i < length; i++) {
164 |         assert(buf[i] == newbuf[i - length + bufsize]);
165 |     }
166 |     delete[] newbuf;
167 |     delete[] buf;
168 | }
169 | #endif
170 | 
171 | 
172 | int main() {
173 |     printf("For this benchmark, use a recent (Skylake) Intel processor for best results.\n");
174 |     tellmeall();
175 |     uint32_t length = 1<<23;    // larger than L3 cache
176 |     printf("Using array sizes of %u values or %lu kiB.\n", length, length * sizeof(uint64_t) / 1024);
177 |     int repeat = 5;
178 |     for(uint32_t distinct = 2; distinct <= (1<<20); distinct *=2) {
179 |         std::cout << "testing with dictionary of size " << distinct << std::endl;
180 |         scalartest(distinct, length, repeat);
181 |         scalarcachetest(distinct, length, repeat);
182 | 
183 | #ifdef __AVX2__
184 |         mediumtest(distinct, length, repeat);
185 |         fasttest(distinct, length, repeat);
186 |         fastcachetest(distinct, length, repeat);
187 | #endif
188 | #ifdef __AVX512F__
189 |         fastavx512test(distinct, length, repeat);
190 |         fastavx512cachetest(distinct, length, repeat);
191 | #endif
192 |          std::cout<<std::endl;
193 |     }
194 |     return 0;
195 | }
196 | 


--------------------------------------------------------------------------------
/scripts/avx512dict.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | def howmany(bit):
 4 |     """ how many values are we going to pack? """
 5 |     return 512
 6 | 
 7 | def howmany64perwideword():
 8 |     return 512/64
 9 | 
10 | def howmanywords(bit):
11 |     return (howmany(bit) * bit + 511)/512
12 | 
13 | def howmanybytes(bit):
14 |     return howmanywords(bit) * 64
15 | 
16 | print("""
17 | /** avxdict512 **/
18 | 
19 | 
20 | typedef long long myint64;
21 | """)
22 | 
23 | print("""typedef void (*avx512unpackdictfnc)(const __m512i * compressed, const myint64 * dictionary, int64_t * pout);""")
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
30 | def plurial(number):
31 |     if(number <> 1):
32 |         return "s"
33 |     else :
34 |         return ""
35 | 
36 | print("static void avx512unpackdict0(const __m512i * compressed, const myint64 * dictionary, int64_t * pout) {");
37 | print("  (void) compressed;");
38 | print("  __m512i * out = (__m512i *) pout;");
39 | print("  const __m512i uniquew = _mm512_set1_epi64(dictionary[0]);");
40 | print("  for(int k = 0; k < {0}; k++) {{".format(howmany(0)/howmany64perwideword()));
41 | print("    _mm512_storeu_si512(out + k, uniquew);")
42 | print("  }");
43 | print("}");
44 | print("")
45 | 
46 | for bit in range(1,33):
47 |     print("")
48 |     print("/* we packed {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
49 |     print("static void avx512unpackdict{0}(const __m512i * compressed, const myint64 * dictionary, int64_t * pout) {{".format(bit));
50 |     print("  /* we are going to access  {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
51 |     if(howmanywords(bit) == 1):
52 |       print("  __m512i w0;")
53 |     else:
54 |       print("  __m512i w0, w1;")
55 |     print("  __m512i wout;")
56 |     print("  __m512i * out = (__m512i *) pout;");
57 |     if(bit < 32): print("  const __m512i mask = _mm512_set1_epi32({0});".format((1<<bit)-1));
58 |     maskstr = " _mm512_and_si512 ( mask, {0}) "
59 |     if (bit == 32) : maskstr = " {0} " # no need
60 |     oldword = 0
61 |     print("  w0 = _mm512_loadu_si512 (compressed);")
62 |     for j in range(howmany(bit)/16):
63 |       firstword = j * bit / 32
64 |       secondword = (j * bit + bit - 1)/32
65 |       if(secondword > oldword):
66 |         print("  w{0} = _mm512_loadu_si512 (compressed + {1});".format(secondword%2,secondword))
67 |         oldword = secondword
68 |       firstshift = (j*bit) % 32
69 |       firstshiftstr = "_mm512_srli_epi32( w{0} , "+str(firstshift)+") "
70 |       if(firstshift == 0):
71 |           firstshiftstr =" w{0} " # no need
72 |       wfirst = firstshiftstr.format(firstword%2)
73 |       if( firstword == secondword):
74 |           if(firstshift + bit <> 32):
75 |             wfirst  = maskstr.format(wfirst)
76 |           print("  wout = {0}; // 512-bit word to be output".format(wfirst));
77 |           print("  _mm512_storeu_si512(out + {0},_mm512_i32gather_epi64(_mm512_castsi512_si256(wout),dictionary, 8)); // load from dictionary and store".format(2*j))
78 |           print("  _mm512_storeu_si512(out + {0},_mm512_i32gather_epi64(_mm512_extracti64x4_epi64(wout,1),dictionary, 8)); // load from dictionary and store".format(2*j+1))
79 |       else:
80 |           secondshift = (32-firstshift)
81 |           wsecond = "_mm512_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
82 |           wfirstorsecond = " _mm512_or_si512 ({0},{1}) ".format(wfirst,wsecond)
83 |           wfirstorsecond = maskstr.format(wfirstorsecond)
84 |           print("  wout = {0}; // 512-bit word to be output".format(wfirstorsecond));
85 |           print("  _mm512_storeu_si512(out + {0},_mm512_i32gather_epi64(_mm512_castsi512_si256(wout),dictionary, 8)); // load from dictionary and store".format(2*j))
86 |           print("  _mm512_storeu_si512(out + {0},_mm512_i32gather_epi64(_mm512_extracti64x4_epi64(wout,1),dictionary, 8)); // load from dictionary and store".format(2*j+1))
87 |     print("}");
88 |     print("")
89 | 
90 | 
91 | 
92 | print("static avx512unpackdictfnc avx512funcUnpackDictArr[] = {")
93 | for bit in range(0,32):
94 |   print("&avx512unpackdict{0},".format(bit))
95 | print("&avx512unpackdict32")
96 | print("};")
97 | print("/** end of avxdict **/")
98 | 


--------------------------------------------------------------------------------
/scripts/avxdict.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | def howmany(bit):
 4 |     """ how many values are we going to pack? """
 5 |     return 256
 6 | 
 7 | def howmany64perwideword():
 8 |     return 256/64
 9 | 
10 | def howmanywords(bit):
11 |     return (howmany(bit) * bit + 255)/256
12 | 
13 | def howmanybytes(bit):
14 |     return howmanywords(bit) * 32
15 | 
16 | print("""
17 | /** avxdict **/
18 | 
19 | 
20 | typedef long long myint64;
21 | """)
22 | 
23 | print("""typedef void (*avxunpackdictfnc)(const __m256i * compressed, const myint64 * dictionary, int64_t * pout);""")
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
30 | def plurial(number):
31 |     if(number <> 1):
32 |         return "s"
33 |     else :
34 |         return ""
35 | 
36 | print("static void avxunpackdict0(const __m256i * compressed, const myint64 * dictionary, int64_t * pout) {");
37 | print("  (void) compressed;");
38 | print("  __m256i * out = (__m256i *) pout;");
39 | print("  const __m256i uniquew = _mm256_set1_epi64x(dictionary[0]);");
40 | print("  for(int k = 0; k < {0}; k++) {{".format(howmany(0)/howmany64perwideword()));
41 | print("    _mm256_storeu_si256(out + k, uniquew);")
42 | print("  }");
43 | print("}");
44 | print("")
45 | 
46 | for bit in range(1,33):
47 |     print("")
48 |     print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
49 |     print("static void avxunpackdict{0}(const __m256i * compressed, const myint64 * dictionary, int64_t * pout) {{".format(bit));
50 |     print("  /* we are going to access  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
51 |     if(howmanywords(bit) == 1):
52 |       print("  __m256i w0;")
53 |     else:
54 |       print("  __m256i w0, w1;")
55 |     print("  __m256i wout;")
56 |     print("  __m256i * out = (__m256i *) pout;");
57 |     if(bit < 32): print("  const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
58 |     maskstr = " _mm256_and_si256 ( mask, {0}) "
59 |     if (bit == 32) : maskstr = " {0} " # no need
60 |     oldword = 0
61 |     print("  w0 = _mm256_lddqu_si256 (compressed);")
62 |     for j in range(howmany(bit)/8):
63 |       firstword = j * bit / 32
64 |       secondword = (j * bit + bit - 1)/32
65 |       if(secondword > oldword):
66 |         print("  w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword))
67 |         oldword = secondword
68 |       firstshift = (j*bit) % 32
69 |       firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") "
70 |       if(firstshift == 0):
71 |           firstshiftstr =" w{0} " # no need
72 |       wfirst = firstshiftstr.format(firstword%2)
73 |       if( firstword == secondword):
74 |           if(firstshift + bit <> 32):
75 |             wfirst  = maskstr.format(wfirst)
76 |           print("  wout = {0}; // 256-bit word to be output".format(wfirst));
77 |           print("  _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_castsi256_si128(wout), 8)); // load from dictionary and store".format(2*j))
78 |           print("  _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_extractf128_si256(wout,1), 8)); // load from dictionary and store".format(2*j+1))
79 |       else:
80 |           secondshift = (32-firstshift)
81 |           wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
82 |           wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond)
83 |           wfirstorsecond = maskstr.format(wfirstorsecond)
84 |           print("  wout = {0}; // 256-bit word to be output".format(wfirstorsecond));
85 |           print("  _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_castsi256_si128(wout), 8)); // load from dictionary and store".format(2*j))
86 |           print("  _mm256_storeu_si256(out + {0},_mm256_i32gather_epi64(dictionary,_mm256_extractf128_si256(wout,1), 8)); // load from dictionary and store".format(2*j+1))
87 |     print("}");
88 |     print("")
89 | 
90 | 
91 | 
92 | print("static avxunpackdictfnc avxfuncUnpackDictArr[] = {")
93 | for bit in range(0,32):
94 |   print("&avxunpackdict{0},".format(bit))
95 | print("&avxunpackdict32")
96 | print("};")
97 | print("/** end of avxdict **/")
98 | 


--------------------------------------------------------------------------------
/src/avx512codec.h:
--------------------------------------------------------------------------------
  1 | #ifndef DICT_AVX512_H
  2 | #define DICT_AVX512_H
  3 | 
  4 | /**
  5 | * This is silly compression/decompression code. Meant to test the basics, not
  6 | * for production.
  7 | */
  8 | #include <cassert>
  9 | #include <cstdint>
 10 | #include <cstddef>
 11 | #include <unordered_map>
 12 | 
 13 | #ifndef __AVX512F__
 14 | #error This code requires AVX-512 support (available on some Intel processors made since 2016)
 15 | #endif
 16 | 
 17 | #ifdef _MSC_VER
 18 | /* Microsoft C/C++-compatible compiler */
 19 | #include <intrin.h>
 20 | #else
 21 | /* Pretty much anything else. */
 22 | #include <x86intrin.h>
 23 | #endif
 24 | 
 25 | #include "avx512bpacking.h"
 26 | #include "avx512dict.h"
 27 | #include "dict.h"
 28 | 
 29 | 
 30 | /**
 31 | * This class is *not* thread-safe, use one instance per thread.
 32 | */
 33 | class AVX512DictCODEC {
 34 | public:
 35 | 
 36 |     AVX512DictCODEC() : tmpbuffer(NULL), buffercapacity(0) {}
 37 | 
 38 |     virtual ~AVX512DictCODEC() {
 39 |         clearBuffer();
 40 |     }
 41 |     /**
 42 |     * Silly code that compresses an array of 64-bit integers to an array of char,
 43 |     * outputting a convenient data structure.
 44 |     *
 45 |     * This could be *greatly* optimized.
 46 |     *
 47 |     * For simplicity, array lengths are assumed to be multiples of 512.
 48 |     */
 49 |     inline dictionary_coded_t compress(const uint64_t * array, size_t length) {
 50 |         dictionary_coded_t out;
 51 |         out.array_length = length;
 52 |         ensureBufferCapacity(out.array_length);
 53 |         std::unordered_map<uint64_t,uint32_t> distinctvalues;
 54 |         out.dictionary_size = 0;
 55 |         for(size_t i = 0; i < out.array_length; ++i) {
 56 |             if(distinctvalues.find(array[i]) == distinctvalues.end()) {
 57 |                 distinctvalues.emplace(array[i],out.dictionary_size++);
 58 |                 assert(out.dictionary_size != 0); // should never happen unless dictionary gets humongous
 59 |             }
 60 |         }
 61 |         out.dictionary = new uint64_t[out.dictionary_size];
 62 |         for(auto i = distinctvalues.begin(); i != distinctvalues.end(); ++i)  out.dictionary[i->second] = i->first;
 63 |         for(size_t i = 0; i < out.array_length ; ++i) {
 64 |             tmpbuffer[i] = distinctvalues[array[i]];
 65 |         }
 66 |         assert(length % 512 == 0);
 67 |         out.bit_width = 32 - __builtin_clz(out.dictionary_size);
 68 |         out.compressed_data_size = sizeof(uint32_t) * out.bit_width * length / 32;
 69 |         assert(out.array_length * out.bit_width ==  out.compressed_data_size * 8);
 70 |         out.compressed_data = new char[out.compressed_data_size];
 71 |         avx512packwithoutmask(tmpbuffer,(__m512i *) out.compressed_data, out.array_length, out.bit_width);
 72 |         return out;
 73 |     }
 74 | 
 75 | 
 76 | 
 77 |     /**
 78 |     * Silly code that uncompresses an array of 64-bit integers.
 79 |     * The out array should have enough space.
 80 |     *
 81 |     * This could be optimized.
 82 |     *
 83 |     * For simplicity, array lengths are assumed to be multiples of 512.
 84 |     *
 85 |     * Return array size
 86 |     */
 87 |     inline uint32_t uncompress(const dictionary_coded_t & t, uint64_t * out) {
 88 |         ensureBufferCapacity(t.array_length);
 89 |         assert(t.array_length % 512 == 0);
 90 |         assert(t.array_length * t.bit_width ==  t.compressed_data_size * 8);
 91 |         avx512unpack((const __m512i*) t.compressed_data, tmpbuffer, t.array_length, t.bit_width);
 92 |         for(size_t i = 0; i < t.array_length; ++i) {
 93 |             out[i] = t.dictionary[tmpbuffer[i]];
 94 |         }
 95 |         return t.array_length;
 96 |     }
 97 | 
 98 |     /**
 99 |     * Prototype code that uncompresses an array of 64-bit integers.
100 |     * The out array should have enough space.
101 |     *
102 |     * If the size of the compressed data does not fit in fast CPU cache,
103 |     * consider using fastrangeuncompress instead, to decompress data to fast CPU cache
104 |     * in blocks. Pushing data back and forth from RAM can be slow.
105 |     *
106 |     * For simplicity, array lengths are assumed to be multiples of 512.
107 |     *
108 |     * Return array size
109 |     */
110 |     static inline uint32_t fastuncompress(const dictionary_coded_t & t, uint64_t * out) {
111 |         assert(t.array_length % 512 == 0);
112 |         assert(t.array_length * t.bit_width ==  t.compressed_data_size * 8);
113 |         avx512unpackdict((const __m512i*) t.compressed_data,
114 |                       (const myint64 *) t.dictionary,(int64_t *)  out, t.array_length, t.bit_width);
115 |         return t.array_length;
116 |     }
117 | 
118 |     /**
119 |     * Uncompresses size values from index start
120 |     * This can be used to uncompress data to cache.
121 |     *
122 |     * For simplicity, all indexes and lengths are are assumed to be multiples of 512.
123 |     * Return the number of remain values left to be decoded (starting at index start+length)
124 |     */
125 |     static inline uint32_t fastrangeuncompress(const dictionary_coded_t & t, uint64_t * out, size_t start, size_t length) {
126 |         assert(t.array_length % 512 == 0);
127 |         assert(start % 512 == 0);
128 |         assert(length % 512 == 0);
129 |         assert(t.array_length * t.bit_width ==  t.compressed_data_size * 8);
130 |         assert(start + length <= t.array_length);
131 |         avx512unpackdict((const __m512i*) ( t.compressed_data + start * t.bit_width / 8),
132 |                       (const myint64 *) t.dictionary,(int64_t *)  out, length, t.bit_width);
133 |         return t.array_length - start - length;
134 |     }
135 | 
136 | 
137 |     inline void clearBuffer() {
138 |         buffercapacity = 0;
139 |         delete[] tmpbuffer;
140 |         tmpbuffer = NULL;
141 |     }
142 | 
143 | private:
144 | 
145 |     // by design, does not copy
146 |     AVX512DictCODEC(const AVX512DictCODEC & ) : tmpbuffer(NULL), buffercapacity(0) {}
147 | 
148 | 
149 |     AVX512DictCODEC& operator=(const AVX512DictCODEC & ) {
150 |         // does nothing, by design
151 |         return *this;
152 |     }
153 | 
154 | 
155 |     inline void ensureBufferCapacity(size_t desiredcap) {
156 |         if(desiredcap > buffercapacity) {
157 |             delete[] tmpbuffer;
158 |             tmpbuffer = new uint32_t[desiredcap];
159 |             assert(tmpbuffer!= NULL);
160 |             buffercapacity = desiredcap;
161 |         }
162 |     }
163 |     uint32_t * tmpbuffer;
164 |     size_t buffercapacity;
165 | };
166 | 
167 | 
168 | #endif
169 | 


--------------------------------------------------------------------------------
/src/avxcodec.h:
--------------------------------------------------------------------------------
  1 | #ifndef DICT_AVX_H
  2 | #define DICT_AVX_H
  3 | 
  4 | /**
  5 | * This is silly compression/decompression code. Meant to test the basics, not
  6 | * for production.
  7 | */
  8 | #include <cassert>
  9 | #include <cstdint>
 10 | #include <cstddef>
 11 | #include <unordered_map>
 12 | 
 13 | #ifndef __AVX2__
 14 | #error This code requires AVX2 support (available on Intel processors made since ~2013)
 15 | #endif
 16 | 
 17 | #ifdef _MSC_VER
 18 | /* Microsoft C/C++-compatible compiler */
 19 | #include <intrin.h>
 20 | #else
 21 | /* Pretty much anything else. */
 22 | #include <x86intrin.h>
 23 | #endif
 24 | 
 25 | #include "avxbpacking.h"
 26 | #include "avxdict.h"
 27 | #include "dict.h"
 28 | 
 29 | 
 30 | /**
 31 | * This class is *not* thread-safe, use one instance per thread.
 32 | */
 33 | class AVXDictCODEC {
 34 | public:
 35 | 
 36 |     AVXDictCODEC() : tmpbuffer(NULL), buffercapacity(0) {}
 37 | 
 38 |     virtual ~AVXDictCODEC() {
 39 |         clearBuffer();
 40 |     }
 41 |     /**
 42 |     * Silly code that compresses an array of 64-bit integers to an array of char,
 43 |     * outputting a convenient data structure.
 44 |     *
 45 |     * This could be *greatly* optimized.
 46 |     *
 47 |     * For simplicity, array lengths are assumed to be multiples of 256.
 48 |     */
 49 |     inline dictionary_coded_t compress(const uint64_t * array, size_t length) {
 50 |         dictionary_coded_t out;
 51 |         out.array_length = length;
 52 |         ensureBufferCapacity(out.array_length);
 53 |         std::unordered_map<uint64_t,uint32_t> distinctvalues;
 54 |         out.dictionary_size = 0;
 55 |         for(size_t i = 0; i < out.array_length; ++i) {
 56 |             if(distinctvalues.find(array[i]) == distinctvalues.end()) {
 57 |                 distinctvalues.emplace(array[i],out.dictionary_size++);
 58 |                 assert(out.dictionary_size != 0); // should never happen unless dictionary gets humongous
 59 |             }
 60 |         }
 61 |         out.dictionary = new uint64_t[out.dictionary_size];
 62 |         for(auto i = distinctvalues.begin(); i != distinctvalues.end(); ++i)  out.dictionary[i->second] = i->first;
 63 |         for(size_t i = 0; i < out.array_length ; ++i) {
 64 |             tmpbuffer[i] = distinctvalues[array[i]];
 65 |         }
 66 |         assert(length % 256 == 0);
 67 |         out.bit_width = 32 - __builtin_clz(out.dictionary_size);
 68 |         out.compressed_data_size = sizeof(uint32_t) * out.bit_width * length / 32;
 69 |         assert(out.array_length * out.bit_width ==  out.compressed_data_size * 8);
 70 |         out.compressed_data = new char[out.compressed_data_size];
 71 |         avxpackwithoutmask(tmpbuffer,(__m256i *) out.compressed_data, out.array_length, out.bit_width);
 72 |         return out;
 73 |     }
 74 | 
 75 | 
 76 | 
 77 |     /**
 78 |     * Silly code that uncompresses an array of 64-bit integers.
 79 |     * The out array should have enough space.
 80 |     *
 81 |     * This could be optimized.
 82 |     *
 83 |     * For simplicity, array lengths are assumed to be multiples of 256.
 84 |     *
 85 |     * Return array size
 86 |     */
 87 |     inline uint32_t uncompress(const dictionary_coded_t & t, uint64_t * out) {
 88 |         ensureBufferCapacity(t.array_length);
 89 |         assert(t.array_length % 256 == 0);
 90 |         assert(t.array_length * t.bit_width ==  t.compressed_data_size * 8);
 91 |         avxunpack((const __m256i*) t.compressed_data, tmpbuffer, t.array_length, t.bit_width);
 92 |         for(size_t i = 0; i < t.array_length; ++i) {
 93 |             out[i] = t.dictionary[tmpbuffer[i]];
 94 |         }
 95 |         return t.array_length;
 96 |     }
 97 | 
 98 |     /**
 99 |     * Prototype code that uncompresses an array of 64-bit integers.
100 |     * The out array should have enough space.
101 |     *
102 |     * If the size of the compressed data does not fit in fast CPU cache,
103 |     * consider using fastrangeuncompress instead, to decompress data to fast CPU cache
104 |     * in blocks. Pushing data back and forth from RAM can be slow.
105 |     *
106 |     * For simplicity, array lengths are assumed to be multiples of 256.
107 |     *
108 |     * Return array size
109 |     */
110 |     static inline uint32_t fastuncompress(const dictionary_coded_t & t, uint64_t * out) {
111 |         assert(t.array_length % 256 == 0);
112 |         assert(t.array_length * t.bit_width ==  t.compressed_data_size * 8);
113 |         avxunpackdict((const __m256i*) t.compressed_data,
114 |                       (const myint64 *) t.dictionary,(int64_t *)  out, t.array_length, t.bit_width);
115 |         return t.array_length;
116 |     }
117 | 
118 |     /**
119 |     * Uncompresses size values from index start
120 |     * This can be used to uncompress data to cache.
121 |     *
122 |     * For simplicity, all indexes and lengths are are assumed to be multiples of 256.
123 |     * Return the number of remain values left to be decoded (starting at index start+length)
124 |     */
125 |     static inline uint32_t fastrangeuncompress(const dictionary_coded_t & t, uint64_t * out, size_t start, size_t length) {
126 |         assert(t.array_length % 256 == 0);
127 |         assert(start % 256 == 0);
128 |         assert(length % 256 == 0);
129 |         assert(t.array_length * t.bit_width ==  t.compressed_data_size * 8);
130 |         assert(start + length <= t.array_length);
131 |         avxunpackdict((const __m256i*) ( t.compressed_data + start * t.bit_width / 8),
132 |                       (const myint64 *) t.dictionary,(int64_t *)  out, length, t.bit_width);
133 |         return t.array_length - start - length;
134 |     }
135 | 
136 | 
137 |     inline void clearBuffer() {
138 |         buffercapacity = 0;
139 |         delete[] tmpbuffer;
140 |         tmpbuffer = NULL;
141 |     }
142 | 
143 | private:
144 | 
145 |     // by design, does not copy
146 |     AVXDictCODEC(const AVXDictCODEC & ) : tmpbuffer(NULL), buffercapacity(0) {}
147 | 
148 | 
149 |     AVXDictCODEC& operator=(const AVXDictCODEC & ) {
150 |         // does nothing, by design
151 |         return *this;
152 |     }
153 | 
154 | 
155 |     inline void ensureBufferCapacity(size_t desiredcap) {
156 |         if(desiredcap > buffercapacity) {
157 |             delete[] tmpbuffer;
158 |             tmpbuffer = new uint32_t[desiredcap];
159 |             assert(tmpbuffer!= NULL);
160 |             buffercapacity = desiredcap;
161 |         }
162 |     }
163 |     uint32_t * tmpbuffer;
164 |     size_t buffercapacity;
165 | };
166 | 
167 | 
168 | #endif
169 | 


--------------------------------------------------------------------------------
/src/dict.h:
--------------------------------------------------------------------------------
 1 | #ifndef DICT_DICT_H
 2 | #define DICT_DICT_H
 3 | 
 4 | #include <cstdint>
 5 | #include <cstddef>
 6 | #include <cstring>
 7 | 
 8 | class dictionary_coded_t {
 9 | public:
10 |     uint64_t * dictionary;
11 |     uint32_t dictionary_size;
12 | 
13 |     char *compressed_data;
14 |     size_t array_length;// uncompressed length in 64-bit words
15 |     uint32_t compressed_data_size;// compressed data in bytes
16 |     int bit_width;
17 | 
18 |     dictionary_coded_t() :
19 |         dictionary ( NULL),
20 |         dictionary_size (0),
21 |         compressed_data (NULL),
22 |         array_length (0),
23 |         compressed_data_size(0),
24 |         bit_width (0)
25 |     {
26 |     }
27 | 
28 | 
29 |     dictionary_coded_t(const dictionary_coded_t && s) :
30 |         dictionary (std::move(s.dictionary)),
31 |         dictionary_size (std::move(s.dictionary_size)),
32 |         compressed_data (std::move(s.compressed_data)),
33 |         array_length (std::move(s.array_length)),
34 |         compressed_data_size(std::move(s.compressed_data_size)),
35 |         bit_width (std::move(s.bit_width))
36 |     {
37 |     }
38 | 
39 | 
40 |     virtual ~dictionary_coded_t() {
41 |         delete[] dictionary;
42 |         delete[] compressed_data;
43 |         init();
44 |     }
45 | 
46 | private:
47 |     dictionary_coded_t(const dictionary_coded_t & s) :
48 |         dictionary ( NULL),
49 |         dictionary_size (0),
50 |         compressed_data (NULL),
51 |         array_length ( 0),
52 |         compressed_data_size(0),
53 |         bit_width ( 0)
54 |     {
55 |         *this = s; // does a deep copy
56 |     }
57 | 
58 |     // does a deep copy
59 |     dictionary_coded_t& operator=(const dictionary_coded_t & s) {
60 |         delete[] dictionary;
61 |         delete[] compressed_data;
62 |         dictionary = new uint64_t[s.dictionary_size];
63 |         memcpy(dictionary,s.dictionary,sizeof(uint64_t)*s.dictionary_size);
64 |         dictionary_size = s.dictionary_size;
65 |         compressed_data = new char[s.compressed_data_size];
66 |         memcpy(compressed_data,s.compressed_data,s.compressed_data_size);
67 | 
68 |         compressed_data_size = s.compressed_data_size;
69 |         array_length = s.array_length;
70 |         bit_width = s.bit_width;
71 |         return *this;
72 |     }
73 | 
74 |     void init() {
75 |         dictionary = NULL;
76 |         compressed_data = NULL;
77 |         dictionary_size = 0;
78 |         array_length = 0;
79 |         compressed_data_size = 0;
80 |         bit_width = 0;
81 |     }
82 | 
83 | 
84 | };
85 | 
86 | #endif
87 | 


--------------------------------------------------------------------------------
/src/scalarcodec.h:
--------------------------------------------------------------------------------
  1 | #ifndef DICT_SCALAR_H
  2 | #define DICT_SCALAR_H
  3 | 
  4 | /**
  5 | * This is silly compression/decompression code. Meant to test the basics, not
  6 | * for production.
  7 | */
  8 | #include <cassert>
  9 | #include <cstdint>
 10 | #include <cstddef>
 11 | #include <unordered_map>
 12 | #include "bpacking.h"
 13 | #include "dict.h"
 14 | 
 15 | 
 16 | /**
 17 | * This class is *not* thread-safe, use one instance per thread.
 18 | */
 19 | class SimpleDictCODEC {
 20 | public:
 21 |     SimpleDictCODEC() : tmpbuffer(NULL), buffercapacity(0) {}
 22 | 
 23 |     virtual ~SimpleDictCODEC() {
 24 |         clearBuffer();
 25 |     }
 26 | 
 27 |     /**
 28 |     * Silly code that compresses an array of 64-bit integers to an array of char,
 29 |     * outputting a convenient data structure.
 30 |     *
 31 |     * This could be *greatly* optimized.
 32 |     *
 33 |     * For simplicity, array lengths are assumed to be multiples of 32.
 34 |     */
 35 |     inline dictionary_coded_t compress(const uint64_t * array, size_t length) {
 36 |         dictionary_coded_t out;
 37 |         out.array_length = length;
 38 |         ensureBufferCapacity(out.array_length);
 39 |         std::unordered_map<uint64_t,uint32_t> distinctvalues;
 40 |         out.dictionary_size = 0;
 41 |         for(size_t i = 0; i < out.array_length; ++i) {
 42 |             if(distinctvalues.find(array[i]) == distinctvalues.end()) {
 43 |                 distinctvalues.emplace(array[i],out.dictionary_size++);
 44 |                 assert(out.dictionary_size != 0); // should never happen unless dictionary gets humongous
 45 |             }
 46 |         }
 47 |         out.dictionary = new uint64_t[out.dictionary_size];
 48 |         for(auto i = distinctvalues.begin(); i != distinctvalues.end(); ++i)  out.dictionary[i->second] = i->first;
 49 |         for(size_t i = 0; i < out.array_length ; ++i) {
 50 |             tmpbuffer[i] = distinctvalues[array[i]];
 51 |         }
 52 |         assert(length % 32 == 0);
 53 |         out.bit_width = 32 - __builtin_clz(out.dictionary_size);
 54 |         out.compressed_data_size = sizeof(uint32_t) * out.bit_width * length / 32;
 55 |         out.compressed_data = new char[out.compressed_data_size];
 56 |         packwithoutmask32(tmpbuffer,(uint32_t *) out.compressed_data, out.array_length, out.bit_width);
 57 |         return out;
 58 |     }
 59 | 
 60 | 
 61 | 
 62 |     /**
 63 |     * Silly code that uncompresses an array of 64-bit integers.
 64 |     * The out array should have enough space.
 65 |     *
 66 |     * This could be optimized.
 67 |     *
 68 |     * For simplicity, array lengths are assumed to be multiples of 32.
 69 |     *
 70 |     * Return array size
 71 |     */
 72 |     inline uint32_t uncompress(const dictionary_coded_t & t, uint64_t * out) {
 73 |         ensureBufferCapacity(t.array_length);
 74 |         assert(t.array_length % 32 == 0);
 75 |         unpack32((const uint32_t*) t.compressed_data, tmpbuffer, t.array_length, t.bit_width);
 76 |         for(size_t i = 0; i < t.array_length; ++i) {
 77 |             out[i] = t.dictionary[tmpbuffer[i]];
 78 |         }
 79 |         return t.array_length;
 80 |     }
 81 | 
 82 |     /**
 83 |     * Uncompresses size values from index start
 84 |     * This can be used to uncompress data to cache.
 85 |     *
 86 |     * For simplicity, all indexes and lengths are are assumed to be multiples of 32.
 87 |     * Return the number of remain values left to be decoded (starting at index start+length)
 88 |     */
 89 |     inline uint32_t rangeuncompress(const dictionary_coded_t & t, uint64_t * out, size_t start, size_t length) {
 90 |         assert(t.array_length % 32 == 0);
 91 |         assert(start % 32 == 0);
 92 |         assert(length % 32 == 0);
 93 |         assert(t.array_length * t.bit_width ==  t.compressed_data_size * 8);
 94 |         assert(start + length <= t.array_length);
 95 |         ensureBufferCapacity(t.array_length);
 96 |         unpack32((const uint32_t*) (t.compressed_data + start * t.bit_width / 8), tmpbuffer, length, t.bit_width);
 97 |         for(size_t i = 0; i < length; ++i) {
 98 |             out[i] = t.dictionary[tmpbuffer[i]];
 99 |         }
100 |         return t.array_length - start - length;
101 |     }
102 | 
103 | 
104 |     inline void clearBuffer() {
105 |         buffercapacity = 0;
106 |         delete[] tmpbuffer;
107 |         tmpbuffer = NULL;
108 |     }
109 | 
110 | private:
111 | 
112 | 
113 |     // by design, does not copy
114 |     SimpleDictCODEC(const SimpleDictCODEC & ) : tmpbuffer(NULL), buffercapacity(0) {}
115 | 
116 | 
117 |     SimpleDictCODEC& operator=(const SimpleDictCODEC & ) {
118 |         // does nothing, by design
119 |         return *this;
120 |     }
121 | 
122 | 
123 |     inline void ensureBufferCapacity(size_t desiredcap) {
124 |         if(desiredcap > buffercapacity) {
125 |             delete[] tmpbuffer;
126 |             tmpbuffer = new uint32_t[desiredcap];
127 |             assert(tmpbuffer!= NULL);
128 |             buffercapacity = desiredcap;
129 |         }
130 |     }
131 |     uint32_t * tmpbuffer;
132 |     size_t buffercapacity;
133 | 
134 | 
135 | };
136 | 
137 | 
138 | #endif
139 | 


--------------------------------------------------------------------------------
/tests/avxtest.cpp:
--------------------------------------------------------------------------------
 1 | #include <string.h>
 2 | #include <cassert>
 3 | #include <iostream>
 4 | 
 5 | #include "avxcodec.h"
 6 | 
 7 | 
 8 | void basictest(uint32_t distinct, uint32_t length) {
 9 |     uint64_t * buf = new uint64_t[length];
10 |     for(size_t i = 0; i < length; i++) {
11 |         buf[i] = (i % distinct) * UINT64_C(0xcb9fe8c7cff9982a) + 77777 ;// made up
12 |     }
13 |     AVXDictCODEC codec;
14 |     dictionary_coded_t t (codec.compress(buf, length));
15 |     uint64_t * newbuf = new uint64_t[length];
16 |     memset(newbuf,0,sizeof(uint64_t) * length);
17 | 
18 |     size_t newlength = codec.uncompress(t,newbuf);
19 |     assert(length == newlength);
20 |     for(size_t i = 0; i < length; i++) {
21 |         assert(buf[i] == newbuf[i]);
22 |     }
23 | }
24 | 
25 | void fasttest(uint32_t distinct, uint32_t length) {
26 |     uint64_t * buf = new uint64_t[length];
27 |     for(size_t i = 0; i < length; i++) {
28 |         buf[i] = (i % distinct) * UINT64_C(0xcb9fe8c7cff9982a) + 77777 ;// made up
29 |     }
30 |     AVXDictCODEC codec;
31 |     dictionary_coded_t t (codec.compress(buf, length) );
32 |     uint64_t * newbuf = new uint64_t[length];
33 |     memset(newbuf,0,sizeof(uint64_t) * length);
34 |     size_t newlength = AVXDictCODEC::fastuncompress(t,newbuf);
35 |     assert(length == newlength);
36 |     for(size_t i = 0; i < length; i++) {
37 |         assert(buf[i] == newbuf[i]);
38 |     }
39 |     delete[] newbuf;
40 |     delete[] buf;
41 | }
42 | void fastrangetest(uint32_t distinct, uint32_t length) {
43 |     uint64_t * buf = new uint64_t[length];
44 |     for(size_t i = 0; i < length; i++) {
45 |         buf[i] = (i % distinct) * UINT64_C(0xcb9fe8c7cff9982a) + 77777 ;// made up
46 |     }
47 |     AVXDictCODEC codec;
48 |     dictionary_coded_t t (codec.compress(buf, length) );
49 |     for(size_t blocksize = 256; blocksize <= length; blocksize += 256) {
50 |       uint64_t * newbuf = new uint64_t[length];
51 |       memset(newbuf,0,sizeof(uint64_t) * length);
52 |       size_t leftover = length;
53 |       for(size_t i = 0; i < length; i += blocksize) {
54 |         leftover = AVXDictCODEC::fastrangeuncompress(t,newbuf + i , i , leftover > blocksize ? blocksize : leftover);
55 |       }
56 |       for(size_t i = 0; i < length; i++) {
57 |         assert(buf[i] == newbuf[i]);
58 |       }
59 |       delete[] newbuf;
60 |     }
61 |     delete[] buf;
62 | }
63 | 
64 | int main() {
65 |     for(uint32_t length = 256; length <= 65536; length *=2) {
66 |         for(uint32_t distinct = 1; distinct <= 65536; distinct *=2) {
67 |             basictest(distinct, length);
68 |             fasttest(distinct, length);
69 |             fastrangetest(distinct, length);
70 |         }
71 |         std::cout << ".";
72 |         std::cout.flush();
73 |     }
74 |     std::cout << std::endl;
75 |     std::cout << "AVX code might be ok. " << std::endl;
76 |     return 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/tests/scalartest.cpp:
--------------------------------------------------------------------------------
 1 | #include <cassert>
 2 | #include <iostream>
 3 | 
 4 | #include "scalarcodec.h"
 5 | 
 6 | 
 7 | void basictest(uint32_t distinct, uint32_t length) {
 8 |     uint64_t * buf = new uint64_t[length];
 9 |     for(size_t i = 0; i < length; i++) {
10 |         buf[i] = (i % distinct) * UINT64_C(0xcb9fe8c7cff9982a) + 77777 ;// made up
11 |     }
12 |     SimpleDictCODEC codec;
13 |     dictionary_coded_t t = codec.compress(buf, length);
14 |     uint64_t * newbuf = new uint64_t[length];
15 | 
16 |     size_t newlength = codec.uncompress(t,newbuf);
17 |     assert(length == newlength);
18 |     for(size_t i = 0; i < length; i++) {
19 |         assert(buf[i] == newbuf[i]);
20 |     }
21 |     delete[] newbuf;
22 |     delete[] buf;
23 | }
24 | 
25 | void rangetest(uint32_t distinct, uint32_t length) {
26 |     uint64_t * buf = new uint64_t[length];
27 |     for(size_t i = 0; i < length; i++) {
28 |         buf[i] = (i % distinct) * UINT64_C(0xcb9fe8c7cff9982a) + 77777 ;// made up
29 |     }
30 |     SimpleDictCODEC codec;
31 |     dictionary_coded_t t (codec.compress(buf, length) );
32 |     for(size_t blocksize = 32; blocksize <= length; blocksize += 32) {
33 |       uint64_t * newbuf = new uint64_t[length];
34 |       memset(newbuf,0,sizeof(uint64_t) * length);
35 |       size_t leftover = length;
36 |       for(size_t i = 0; i < length; i += blocksize) {
37 |         leftover = codec.rangeuncompress(t,newbuf + i , i , leftover > blocksize ? blocksize : leftover);
38 |       }
39 |       for(size_t i = 0; i < length; i++) {
40 |         assert(buf[i] == newbuf[i]);
41 |       }
42 |       delete[] newbuf;
43 |     }
44 |     delete[] buf;
45 | }
46 | 
47 | 
48 | int main() {
49 |     for(uint32_t length = 256; length < 65536; length *=2) {
50 |         for(uint32_t distinct = 1; distinct < 65536; distinct *=2) {
51 |             basictest(distinct, length);
52 |             rangetest(distinct, length);
53 |         }
54 |         std::cout << ".";
55 |         std::cout.flush();
56 |     }
57 |     std::cout << std::endl;
58 |     std::cout << "Scalar code might be ok. " << std::endl;
59 |     return 0;
60 | }
61 | 


--------------------------------------------------------------------------------