├── .gitignore
├── .travis.yml
├── go
    ├── README.md
    └── test.go
├── CHANGELOG
├── include
    ├── simdcomp.h
    ├── avxbitpacking.h
    ├── avx512bitpacking.h
    ├── simdcomputil.h
    ├── portability.h
    ├── simdfor.h
    ├── simdbitpacking.h
    └── simdintegratedbitpacking.h
├── package.json
├── simdcomp.def.tpl
├── LICENSE
├── Makefile
├── tests
    ├── unit_chars.c
    └── unit.c
├── .appveyor.yml
├── makefile.vc
├── scripts
    ├── simdfor.py
    ├── avxpacking.py
    └── avx512packing.py
├── benchmarks
    ├── benchmark.c
    └── bitpackingbenchmark.c
├── example.c
├── README.md
└── src
    └── simdcomputil.c


/.gitignore:
--------------------------------------------------------------------------------
 1 | Makefile.in
 2 | lib*
 3 | unit*
 4 | *.o
 5 | src/*.lo
 6 | src/*.o
 7 | src/.deps
 8 | src/.dirstamp
 9 | src/.libs
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: c
 2 | sudo: false
 3 | compiler:
 4 |   - gcc
 5 |   - clang
 6 | 
 7 | branches:
 8 |   only:
 9 |     - master
10 | 
11 | script: make && ./unit && ./unit_chars && make clean  
12 | 


--------------------------------------------------------------------------------
/go/README.md:
--------------------------------------------------------------------------------
 1 | Simple Go demo
 2 | ==============
 3 | 
 4 | Setup
 5 | ======
 6 | 
 7 | Start by installing the simdcomp library (make && make install).
 8 | 
 9 | Then type:
10 | 
11 | go run test.go
12 | 
13 | 
14 | 


--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
 1 | Upcoming
 2 |   - added missing include
 3 |   - improved portability (MSVC)
 4 |   - implemented C89 compatibility
 5 | Version 0.0.3 (19 May 2014)
 6 |   - improved documentation
 7 | Version 0.0.2 (6 February 2014)
 8 |   - added go demo
 9 | Version 0.0.1  (5 February 2014)
10 | 


--------------------------------------------------------------------------------
/include/simdcomp.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under a BSD License.
 3 |  */
 4 | 
 5 | #ifndef SIMDCOMP_H_
 6 | #define SIMDCOMP_H_
 7 | 
 8 | #ifdef __cplusplus
 9 | extern "C" {
10 | #endif
11 | 
12 | #include "avx512bitpacking.h"
13 | #include "avxbitpacking.h"
14 | #include "simdbitpacking.h"
15 | #include "simdcomputil.h"
16 | #include "simdfor.h"
17 | #include "simdintegratedbitpacking.h"
18 | 
19 | #ifdef __cplusplus
20 | } // extern "C"
21 | #endif
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "simdcomp",
 3 |   "version": "0.0.3",
 4 |   "repo": "lemire/simdcomp",
 5 |   "description": "A simple C library for compressing lists of integers",
 6 |   "license": "BSD-3-Clause",
 7 |   "src": [
 8 |     "src/simdbitpacking.c",
 9 |     "src/simdcomputil.c",
10 |     "src/simdintegratedbitpacking.c",
11 |     "include/simdbitpacking.h",
12 |     "include/simdcomp.h",
13 |     "include/simdcomputil.h",
14 |     "include/simdintegratedbitpacking.h"
15 |   ]
16 | }
17 | 


--------------------------------------------------------------------------------
/simdcomp.def.tpl:
--------------------------------------------------------------------------------
 1 | EXPORTS
 2 | 	simdpack
 3 | 	simdpackwithoutmask
 4 | 	simdunpack
 5 | 	bits
 6 | 	maxbits
 7 | 	maxbits_length
 8 | 	simdmin
 9 | 	simdmin_length
10 | 	simdmaxmin
11 | 	simdmaxmin_length
12 | 	simdmaxbitsd1
13 | 	simdmaxbitsd1_length
14 | 	simdpackd1
15 | 	simdpackwithoutmaskd1
16 | 	simdunpackd1
17 | 	simdsearchd1
18 | 	simdsearchwithlengthd1
19 | 	simdselectd1
20 | 	simdpackFOR
21 | 	simdselectFOR
22 | 	simdsearchwithlengthFOR
23 | 	simdunpackFOR
24 | 	simdmin_length
25 | 	simdmaxmin
26 | 	simdmaxmin_length
27 | 	simdpack_length
28 | 	simdpackFOR_length
29 | 	simdunpackFOR_length
30 | 	simdpackFOR_compressedbytes
31 | 	simdpack_shortlength
32 | 	simdpack_compressedbytes
33 | 	simdfastsetFOR
34 | 	simdfastset
35 | 	simdfastsetd1
36 | 	simdunpack_length
37 | 	simdunpack_shortlength
38 | 	simdsearchwithlengthFOR
39 | 	simdscand1
40 | 	simdfastsetd1fromprevious
41 | 
42 | 


--------------------------------------------------------------------------------
/include/avxbitpacking.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under a BSD License.
 3 |  */
 4 | 
 5 | #ifndef INCLUDE_AVXBITPACKING_H_
 6 | #define INCLUDE_AVXBITPACKING_H_
 7 | 
 8 | #ifdef __AVX2__
 9 | 
10 | #include "portability.h"
11 | 
12 | /* AVX2 is required */
13 | #include <immintrin.h>
14 | /* for memset */
15 | #include <string.h>
16 | 
17 | #include "simdcomputil.h"
18 | 
19 | enum { AVXBlockSize = 256 };
20 | 
21 | /* max integer logarithm over a range of AVXBlockSize integers (256 integer) */
22 | uint32_t avxmaxbits(const uint32_t *begin);
23 | 
24 | /* reads 256 values from "in", writes  "bit" 256-bit vectors to "out" */
25 | void avxpack(const uint32_t *in, __m256i *out, const uint32_t bit);
26 | 
27 | /* reads 256 values from "in", writes  "bit" 256-bit vectors to "out" */
28 | void avxpackwithoutmask(const uint32_t *in, __m256i *out, const uint32_t bit);
29 | 
30 | /* reads  "bit" 256-bit vectors from "in", writes  256 values to "out" */
31 | void avxunpack(const __m256i *in, uint32_t *out, const uint32_t bit);
32 | 
33 | #endif /* __AVX2__ */
34 | 
35 | #endif /* INCLUDE_AVXBITPACKING_H_ */
36 | 


--------------------------------------------------------------------------------
/include/avx512bitpacking.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under a BSD License.
 3 |  */
 4 | 
 5 | #ifndef INCLUDE_AVX512BITPACKING_H_
 6 | #define INCLUDE_AVX512BITPACKING_H_
 7 | 
 8 | #ifdef __AVX512F__
 9 | 
10 | #include "portability.h"
11 | 
12 | /* AVX512 is required */
13 | #include <immintrin.h>
14 | /* for memset */
15 | #include <string.h>
16 | 
17 | #include "simdcomputil.h"
18 | 
19 | enum { AVX512BlockSize = 512 };
20 | 
21 | /* max integer logarithm over a range of AVX512BlockSize integers (512 integer)
22 |  */
23 | uint32_t avx512maxbits(const uint32_t *begin);
24 | 
25 | /* reads 512 values from "in", writes  "bit" 512-bit vectors to "out" */
26 | void avx512pack(const uint32_t *in, __m512i *out, const uint32_t bit);
27 | 
28 | /* reads 512 values from "in", writes  "bit" 512-bit vectors to "out" */
29 | void avx512packwithoutmask(const uint32_t *in, __m512i *out,
30 |                            const uint32_t bit);
31 | 
32 | /* reads  "bit" 512-bit vectors from "in", writes  512 values to "out" */
33 | void avx512unpack(const __m512i *in, uint32_t *out, const uint32_t bit);
34 | 
35 | #endif /* __AVX512F__ */
36 | 
37 | #endif /* INCLUDE_AVX512BITPACKING_H_ */
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014--, The authors
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without modification,
 5 | are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice, this
11 |   list of conditions and the following disclaimer in the documentation and/or
12 |   other materials provided with the distribution.
13 | 
14 | * Neither the name of the {organization} nor the names of its
15 |   contributors may be used to endorse or promote products derived from
16 |   this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/include/simdcomputil.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under a BSD License.
 3 |  */
 4 | 
 5 | #ifndef SIMDCOMPUTIL_H_
 6 | #define SIMDCOMPUTIL_H_
 7 | 
 8 | #include "portability.h"
 9 | 
10 | /* SSE2 is required */
11 | #include <emmintrin.h>
12 | 
13 | /* returns the integer logarithm of v (bit width) */
14 | uint32_t bits(const uint32_t v);
15 | 
16 | /* max integer logarithm over a range of SIMDBlockSize integers (128 integer) */
17 | uint32_t maxbits(const uint32_t *begin);
18 | 
19 | /* same as maxbits, but we specify the number of integers */
20 | uint32_t maxbits_length(const uint32_t *in, uint32_t length);
21 | 
22 | enum { SIMDBlockSize = 128 };
23 | 
24 | /* computes (quickly) the minimal value of 128 values */
25 | uint32_t simdmin(const uint32_t *in);
26 | 
27 | /* computes (quickly) the minimal value of the specified number of values */
28 | uint32_t simdmin_length(const uint32_t *in, uint32_t length);
29 | 
30 | #ifdef __SSE4_1__
31 | /* computes (quickly) the minimal and maximal value of the specified number of
32 |  * values */
33 | void simdmaxmin_length(const uint32_t *in, uint32_t length, uint32_t *getmin,
34 |                        uint32_t *getmax);
35 | 
36 | /* computes (quickly) the minimal and maximal value of the 128 values */
37 | void simdmaxmin(const uint32_t *in, uint32_t *getmin, uint32_t *getmax);
38 | 
39 | #endif
40 | 
41 | /* like maxbits over 128 integers (SIMDBlockSize) with provided initial value
42 |    and using differential coding */
43 | uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t *in);
44 | 
45 | /* like simdmaxbitsd1, but calculates maxbits over |length| integers
46 |    with provided initial value. |length| can be any arbitrary value. */
47 | uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t *in,
48 |                               uint32_t length);
49 | 
50 | #endif /* SIMDCOMPUTIL_H_ */
51 | 


--------------------------------------------------------------------------------
/go/test.go:
--------------------------------------------------------------------------------
 1 | /////////
 2 | // This particular file is in the public domain.
 3 | // Author: Daniel Lemire
 4 | ////////
 5 | 
 6 | package main 
 7 | 
 8 | /*
 9 | #cgo LDFLAGS: -lsimdcomp
10 | #include <simdcomp.h>
11 | */
12 | import "C"
13 | import "fmt"
14 | 
15 | //////////
16 | // For this demo, we pack and unpack blocks of 128 integers
17 | /////////
18 | func main() {
19 |         // I am going to use C types. Alternative might be to use unsafe.Pointer calls, see http://bit.ly/1ndw3W3
20 |         // this is our original data
21 |         var data [128]C.uint32_t
22 |         for i := C.uint32_t(0); i < C.uint32_t(128); i++ {
23 |             data[i] = i
24 |         }
25 | 
26 | 
27 | 
28 | 
29 | 
30 |         ////////////
31 |         // We first pack without differential coding
32 |         ///////////
33 |         // computing how many bits per int. is needed
34 |         b := C.maxbits(&data[0])
35 |         ratio := 32.0 / float64(b)
36 |         fmt.Println("Bit width  ", b)
37 |         fmt.Printf("Compression ratio %f \n", ratio)
38 |         // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
39 |         out := make([]C.__m128i, b)
40 |         C.simdpackwithoutmask(&data[0], &out[0], b)
41 |         var recovereddata [128]C.uint32_t
42 |         C.simdunpack(&out[0], &recovereddata[0], b)
43 |         for i := 0; i < 128; i++ {
44 |             if data[i] != recovereddata[i] {
45 |                   fmt.Println("Bug ")
46 |                   return
47 |             }
48 |         }
49 | 
50 |         ///////////
51 |         // Next, we use differential coding
52 |         //////////
53 |         offset := C.uint32_t(0) // if you pack data from K to K + 128, offset should be the value at K-1. When K = 0, choose a default
54 |         b1 := C.simdmaxbitsd1(offset, &data[0])
55 |         ratio1 := 32.0 / float64(b1)
56 |         fmt.Println("Bit width  ", b1)
57 |         fmt.Printf("Compression ratio %f \n", ratio1)
58 |         // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits)
59 |         out = make([]C.__m128i, b1)
60 |         C.simdpackwithoutmaskd1(offset, &data[0], &out[0], b1)
61 |         C.simdunpackd1(offset, &out[0], &recovereddata[0], b1)
62 |         for i := 0; i < 128; i++ {
63 |             if data[i] != recovereddata[i] {
64 |                   fmt.Println("Bug ")
65 |                   return
66 |             }
67 |         }
68 | 
69 |         fmt.Println("test succesful.")
70 | 
71 | }
72 | 


--------------------------------------------------------------------------------
/include/portability.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under a BSD License.
 3 |  */
 4 | #ifndef SIMDBITCOMPAT_H_
 5 | #define SIMDBITCOMPAT_H_
 6 | 
 7 | #include <iso646.h> /* mostly for Microsoft compilers */
 8 | #include <string.h>
 9 | 
10 | #ifdef SIMDCOMP_DEBUG
11 | #define SIMDCOMP_ALWAYS_INLINE inline
12 | #define SIMDCOMP_NEVER_INLINE
13 | #define SIMDCOMP_PURE
14 | #else
15 | #if defined(__GNUC__)
16 | #if __GNUC__ >= 3
17 | #define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
18 | #define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
19 | #define SIMDCOMP_PURE __attribute__((pure))
20 | #else
21 | #define SIMDCOMP_ALWAYS_INLINE inline
22 | #define SIMDCOMP_NEVER_INLINE
23 | #define SIMDCOMP_PURE
24 | #endif
25 | #elif defined(_MSC_VER)
26 | #define SIMDCOMP_ALWAYS_INLINE __forceinline
27 | #define SIMDCOMP_NEVER_INLINE
28 | #define SIMDCOMP_PURE
29 | #else
30 | #if __has_attribute(always_inline)
31 | #define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline))
32 | #else
33 | #define SIMDCOMP_ALWAYS_INLINE inline
34 | #endif
35 | #if __has_attribute(noinline)
36 | #define SIMDCOMP_NEVER_INLINE __attribute__((noinline))
37 | #else
38 | #define SIMDCOMP_NEVER_INLINE
39 | #endif
40 | #if __has_attribute(pure)
41 | #define SIMDCOMP_PURE __attribute__((pure))
42 | #else
43 | #define SIMDCOMP_PURE
44 | #endif
45 | #endif
46 | #endif
47 | 
48 | #if defined(_MSC_VER) && _MSC_VER < 1600
49 | typedef unsigned int uint32_t;
50 | typedef unsigned char uint8_t;
51 | typedef signed char int8_t;
52 | #else
53 | #include <stdint.h> /* part of Visual Studio 2010 and better, others likely anyway */
54 | #endif
55 | 
56 | #if defined(_MSC_VER)
57 | #define SIMDCOMP_ALIGNED(x) __declspec(align(x))
58 | #else
59 | #if defined(__GNUC__)
60 | #define SIMDCOMP_ALIGNED(x) __attribute__((aligned(x)))
61 | #endif
62 | #endif
63 | 
64 | #if defined(_MSC_VER)
65 | #include <intrin.h>
66 | /* 64-bit needs extending */
67 | #define SIMDCOMP_CTZ(result, mask)                                             \
68 |   do {                                                                         \
69 |     unsigned long index;                                                       \
70 |     if (!_BitScanForward(&(index), (mask))) {                                  \
71 |       (result) = 32U;                                                          \
72 |     } else {                                                                   \
73 |       (result) = (uint32_t)(index);                                            \
74 |     }                                                                          \
75 |   } while (0)
76 | #else
77 | #include <x86intrin.h>
78 | #define SIMDCOMP_CTZ(result, mask) result = __builtin_ctz(mask)
79 | #endif
80 | 
81 | #endif /* SIMDBITCOMPAT_H_ */
82 | 


--------------------------------------------------------------------------------
/include/simdfor.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under a BSD License.
 3 |  */
 4 | #ifndef INCLUDE_SIMDFOR_H_
 5 | #define INCLUDE_SIMDFOR_H_
 6 | 
 7 | #include "portability.h"
 8 | 
 9 | /* SSE2 is required */
10 | #include <emmintrin.h>
11 | 
12 | #include "simdbitpacking.h"
13 | #include "simdcomputil.h"
14 | 
15 | #ifdef __cplusplus
16 | extern "C" {
17 | #endif
18 | 
19 | /* reads 128 values from "in", writes  "bit" 128-bit vectors to "out" */
20 | void simdpackFOR(uint32_t initvalue, const uint32_t *in, __m128i *out,
21 |                  const uint32_t bit);
22 | 
23 | /* reads "bit" 128-bit vectors from "in", writes  128 values to "out" */
24 | void simdunpackFOR(uint32_t initvalue, const __m128i *in, uint32_t *out,
25 |                    const uint32_t bit);
26 | 
27 | /* how many compressed bytes are needed to compress "length" integers using a
28 | bit width of "bit" with the simdpackFOR_length function. */
29 | int simdpackFOR_compressedbytes(int length, const uint32_t bit);
30 | 
31 | /* like simdpackFOR, but supports an undetermined number of inputs.
32 | This is useful if you need to pack less than 128 integers. Note that this
33 | function is much slower. Compressed data is stored in the memory location
34 | between the provided (out) pointer and the returned pointer. */
35 | __m128i *simdpackFOR_length(uint32_t initvalue, const uint32_t *in, int length,
36 |                             __m128i *out, const uint32_t bit);
37 | 
38 | /* like simdunpackFOR, but supports an undetermined number of inputs.
39 | This is useful if you need to unpack less than 128 integers. Note that this
40 | function is much slower. The read compressed data is between the provided (in)
41 | pointer and the returned pointer.  */
42 | const __m128i *simdunpackFOR_length(uint32_t initvalue, const __m128i *in,
43 |                                     int length, uint32_t *out,
44 |                                     const uint32_t bit);
45 | 
46 | /* returns the value stored at the specified "slot".
47 |  * */
48 | uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
49 |                        int slot);
50 | 
51 | /* given a block of 128 packed values, this function sets the value at index
52 |  * "index" to "value" */
53 | void simdfastsetFOR(uint32_t initvalue, __m128i *in, uint32_t bit,
54 |                     uint32_t value, size_t index);
55 | 
56 | /* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for
57 |  * the first encoded uint32 value which is >= |key|, and returns its position.
58 |  * It is assumed that the values stored are in sorted order. The encoded key is
59 |  * stored in "*presult". Only the first "length" decoded integers are
60 |  * considered; the others are ignored. Length should be no larger than 128.
61 |  *
62 |  * If no value is larger or equal to the key,
63 |  * length is returned.
64 |  */
65 | int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit,
66 |                             int length, uint32_t key, uint32_t *presult);
67 | 
68 | #ifdef __cplusplus
69 | } // extern "C"
70 | #endif
71 | 
72 | #endif /* INCLUDE_SIMDFOR_H_ */
73 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # minimalist makefile
 2 | .SUFFIXES:
 3 | #
 4 | .SUFFIXES: .cpp .o .c .h
 5 | ifeq ($(DEBUG),1)
 6 | CFLAGS = -fPIC  -std=c89 -ggdb -march=native -Wall -Wextra -Wshadow -fsanitize=undefined  -fno-omit-frame-pointer -fsanitize=address
 7 | else
 8 | CFLAGS = -fPIC -std=c89 -O3  -march=native -Wall -Wextra -Wshadow
 9 | endif # debug
10 | LDFLAGS = -shared
11 | LIBNAME=libsimdcomp.so.0.0.3
12 | STATICLIBNAME=libsimdcomp.a
13 | all:  unit unit_chars bitpackingbenchmark $(LIBNAME) $(STATICLIBNAME)
14 | test:
15 | 	./unit
16 | 	./unit_chars
17 | # Prerequisites are expanded when the rule is read, so depend on $(LIBNAME)
18 | # (already defined above) rather than on $(OBJECTS), which is set further down.
19 | install: $(LIBNAME)
20 | 	cp $(LIBNAME) /usr/local/lib
21 | 	ln -sf /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so
22 | 	ldconfig
23 | 	cp $(HEADERS) /usr/local/include
24 | 
25 | HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h ./include/simdfor.h ./include/avxbitpacking.h ./include/avx512bitpacking.h ./include/portability.h
26 | 
27 | uninstall:
28 | 	for h in $(HEADERS) ; do rm  /usr/local/$$h; done
29 | 	rm  /usr/local/lib/$(LIBNAME)
30 | 	rm /usr/local/lib/libsimdcomp.so
31 | 	ldconfig
32 | 
33 | 
34 | OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o \
35 | 		 simdpackedsearch.o simdpackedselect.o simdfor.o avxbitpacking.o avx512bitpacking.o
36 | 
37 | $(LIBNAME): $(OBJECTS)
38 | 	$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS)  $(LDFLAGS)
39 | 
40 | $(STATICLIBNAME): $(OBJECTS)
41 | 	ar -qcs $@ $(OBJECTS)
42 | 	ranlib  $@
43 | 
44 | avx512bitpacking.o: ./src/avx512bitpacking.c $(HEADERS)
45 | 	$(CC) $(CFLAGS) -c ./src/avx512bitpacking.c -Iinclude
46 | 
47 | 
48 | 
49 | avxbitpacking.o: ./src/avxbitpacking.c $(HEADERS)
50 | 	$(CC) $(CFLAGS) -c ./src/avxbitpacking.c -Iinclude
51 | 
52 | 
53 | simdfor.o: ./src/simdfor.c $(HEADERS)
54 | 	$(CC) $(CFLAGS) -c ./src/simdfor.c -Iinclude
55 | 
56 | 
57 | simdcomputil.o: ./src/simdcomputil.c $(HEADERS)
58 | 	$(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude
59 | 
60 | simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS)
61 | 	$(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude
62 | 
63 | simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c  $(HEADERS)
64 | 	$(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude
65 | 
66 | simdpackedsearch.o: ./src/simdpackedsearch.c $(HEADERS)
67 | 	$(CC) $(CFLAGS) -c ./src/simdpackedsearch.c -Iinclude
68 | 
69 | simdpackedselect.o: ./src/simdpackedselect.c $(HEADERS)
70 | 	$(CC) $(CFLAGS) -c ./src/simdpackedselect.c -Iinclude
71 | 
72 | example: ./example.c    $(HEADERS) $(OBJECTS)
73 | 	$(CC) $(CFLAGS) -o example ./example.c -Iinclude  $(OBJECTS)
74 | 
75 | unit: ./tests/unit.c    $(HEADERS) $(OBJECTS)
76 | 	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude  $(OBJECTS)
77 | 
78 | bitpackingbenchmark: ./benchmarks/bitpackingbenchmark.c    $(HEADERS) $(OBJECTS)
79 | 	$(CC) $(CFLAGS) -o bitpackingbenchmark ./benchmarks/bitpackingbenchmark.c -Iinclude  $(OBJECTS)
80 | benchmark: ./benchmarks/benchmark.c    $(HEADERS) $(OBJECTS)
81 | 	$(CC) $(CFLAGS) -o benchmark ./benchmarks/benchmark.c -Iinclude  $(OBJECTS)
82 | dynunit: ./tests/unit.c    $(HEADERS) $(LIBNAME)
83 | 	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude  -lsimdcomp
84 | 
85 | unit_chars: ./tests/unit_chars.c    $(HEADERS) $(OBJECTS)
86 | 	$(CC) $(CFLAGS) -o unit_chars ./tests/unit_chars.c -Iinclude  $(OBJECTS)
87 | clean:
88 | 	rm -f unit *.o $(LIBNAME) $(STATICLIBNAME) example benchmark bitpackingbenchmark dynunit unit_chars
89 | 


--------------------------------------------------------------------------------
/include/simdbitpacking.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under a BSD License.
 3 |  */
 4 | #ifndef SIMDBITPACKING_H_
 5 | #define SIMDBITPACKING_H_
 6 | 
 7 | #include "portability.h"
 8 | 
 9 | /* SSE2 is required */
10 | #include <emmintrin.h>
11 | /* for memset */
12 | #include <string.h>
13 | 
14 | #include "simdcomputil.h"
15 | 
16 | /***
17 |  * Please see example.c for various examples on how to make good use
18 |  * of these functions.
19 |  */
20 | 
21 | /* reads 128 values from "in", writes  "bit" 128-bit vectors to "out".
22 |  * The input values are masked so that only the least significant "bit" bits are
23 |  * used. */
24 | void simdpack(const uint32_t *in, __m128i *out, const uint32_t bit);
25 | 
26 | /* reads 128 values from "in", writes  "bit" 128-bit vectors to "out".
27 |  * The input values are assumed to be less than 1<<bit. */
28 | void simdpackwithoutmask(const uint32_t *in, __m128i *out, const uint32_t bit);
29 | 
30 | /* reads  "bit" 128-bit vectors from "in", writes  128 values to "out" */
31 | void simdunpack(const __m128i *in, uint32_t *out, const uint32_t bit);
32 | 
33 | /* how many compressed bytes are needed to compress "length" integers using a
34 | bit width of "bit" with the simdpack_length function. */
35 | int simdpack_compressedbytes(int length, const uint32_t bit);
36 | 
37 | /* like simdpack, but supports an undetermined number of inputs.
38 |  * This is useful if you need to pack an array of integers whose length is
39 |  not divisible by 128.
40 |  * Returns a pointer to the (advanced) compressed array. Compressed data is
41 |  stored in the memory location between the provided (out) pointer and the
42 |  returned pointer. */
43 | __m128i *simdpack_length(const uint32_t *in, size_t length, __m128i *out,
44 |                          const uint32_t bit);
45 | 
46 | /* like simdunpack, but supports an undetermined number of inputs.
47 |  * This is useful if you need to unpack an array of integers that is not
48 |  divisible by 128 integers.
49 |  * Returns a pointer to the (advanced) compressed array. The read compressed
50 |  data is between the provided (in) pointer and the returned pointer. */
51 | const __m128i *simdunpack_length(const __m128i *in, size_t length,
52 |                                  uint32_t *out, const uint32_t bit);
53 | 
54 | /* like simdpack, but supports an undetermined small number of inputs. This is
55 | useful if you need to pack less than 128 integers.
56 |  * Note that this function is much slower.
57 |  * Returns a pointer to the (advanced) compressed array. Compressed data is
58 | stored in the memory location between the provided (out) pointer and the
59 | returned pointer. */
60 | __m128i *simdpack_shortlength(const uint32_t *in, int length, __m128i *out,
61 |                               const uint32_t bit);
62 | 
63 | /* like simdunpack, but supports an undetermined small number of inputs. This is
64 |  useful if you need to unpack less than 128 integers.
65 |  * Note that this function is much slower.
66 |  * Returns a pointer to the (advanced) compressed array. The read compressed
67 |  data is between the provided (in) pointer and the returned pointer. */
68 | const __m128i *simdunpack_shortlength(const __m128i *in, int length,
69 |                                       uint32_t *out, const uint32_t bit);
70 | 
71 | /* given a block of 128 packed values, this function sets the value at index
72 |  * "index" to "value" */
73 | void simdfastset(__m128i *in128, uint32_t b, uint32_t value, size_t index);
74 | 
75 | #endif /* SIMDBITPACKING_H_ */
76 | 


--------------------------------------------------------------------------------
/tests/unit_chars.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under a BSD License.
 3 |  */
 4 | #include "simdcomp.h"
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <time.h>
 8 | 
 9 | #define get_random_char() ((uint8_t)(rand() % 256))
10 | 
11 | int main(void) {
12 |   int N = 5000 * SIMDBlockSize, gap, status = 0;
13 |   __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
14 |   uint32_t *datain = malloc(N * sizeof(uint32_t));
15 |   uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
16 |   if (buffer == NULL || datain == NULL || backbuffer == NULL) {
17 |     printf("out of memory\n");
18 |     status = -1;
19 |     goto cleanup;
20 |   }
21 |   srand((unsigned)time(NULL));
22 |   for (gap = 1; gap <= 387420489; gap *= 3) {
23 |     int k;
24 |     uint32_t offset = 0; /* last value of the previous block */
25 |     printf(" gap = %d \n", gap);
26 |     /* simulate some random character string, don't care about endianness */
27 |     for (k = 0; k < N; ++k) {
28 |       uint8_t _tmp[4];
29 |       _tmp[0] = get_random_char();
30 |       _tmp[1] = get_random_char();
31 |       _tmp[2] = get_random_char();
32 |       _tmp[3] = get_random_char();
33 |       memmove(&datain[k], _tmp, 4);
34 |     }
35 |     for (k = 0; k * SIMDBlockSize < N; ++k) {
36 |       /* First part works for general arrays (sorted or unsorted) */
37 |       int j;
38 |       /* we compute the bit width */
39 |       const uint32_t b = maxbits(datain + k * SIMDBlockSize);
40 |       /* we read 128 integers at "datain + k * SIMDBlockSize" and
41 |          write b 128-bit vectors at "buffer" */
42 |       simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b);
43 |       /* we read back b 128-bit vectors at "buffer" and write 128 integers at
44 |        * backbuffer */
45 |       simdunpack(buffer, backbuffer, b); /* uncompressed */
46 |       for (j = 0; j < SIMDBlockSize; ++j) {
47 |         uint8_t chars_back[4];
48 |         uint8_t chars_in[4];
49 | 
50 |         memmove(chars_back, &backbuffer[j], 4);
51 |         memmove(chars_in, &datain[k * SIMDBlockSize + j], 4);
52 | 
53 |         if (chars_in[0] != chars_back[0] || chars_in[1] != chars_back[1] ||
54 |             chars_in[2] != chars_back[2] || chars_in[3] != chars_back[3]) {
55 |           printf("bug in simdpack\n");
56 |           status = -2; /* release the buffers before reporting failure */
57 |           goto cleanup;
58 |         }
59 |       }
60 | 
61 |       {
62 |         /* next part assumes that the data is sorted (uses differential
63 |            coding); offset is carried over from the previous block */
64 |         /* we compute the bit width */
65 |         const uint32_t b1 = simdmaxbitsd1(offset, datain + k * SIMDBlockSize);
66 |         /* we read 128 integers at "datain + k * SIMDBlockSize" and
67 |            write b1 128-bit vectors at "buffer" */
68 |         simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, b1);
69 |         /* we read back b1 128-bit vectors at "buffer" and write 128 integers at
70 |          * backbuffer */
71 |         simdunpackd1(offset, buffer, backbuffer, b1);
72 |         for (j = 0; j < SIMDBlockSize; ++j) {
73 |           uint8_t chars_back[4];
74 |           uint8_t chars_in[4];
75 | 
76 |           memmove(chars_back, &backbuffer[j], 4);
77 |           memmove(chars_in, &datain[k * SIMDBlockSize + j], 4);
78 | 
79 |           if (chars_in[0] != chars_back[0] || chars_in[1] != chars_back[1] ||
80 |               chars_in[2] != chars_back[2] || chars_in[3] != chars_back[3]) {
81 |             printf("bug in simdpack\n");
82 |             status = -3; /* release the buffers before reporting failure */
83 |             goto cleanup;
84 |           }
85 |         }
86 |         offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
87 |       }
88 |     }
89 |   }
90 |   printf("Code looks good.\n");
91 | cleanup:
92 |   free(buffer);
93 |   free(datain);
94 |   free(backbuffer);
95 |   return status;
96 | }
97 | 


--------------------------------------------------------------------------------
/include/simdintegratedbitpacking.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under a BSD License.
 3 |  */
 4 | 
 5 | #ifndef SIMD_INTEGRATED_BITPACKING_H
 6 | #define SIMD_INTEGRATED_BITPACKING_H
 7 | 
 8 | #include "portability.h"
 9 | 
10 | /* SSE2 is required */
11 | #include <emmintrin.h>
12 | 
13 | #include "simdbitpacking.h"
14 | #include "simdcomputil.h"
15 | 
16 | #ifdef __cplusplus
17 | extern "C" {
18 | #endif
19 | 
20 | /* reads 128 values from "in", writes  "bit" 128-bit vectors to "out"
21 |    integer values should be in sorted order (for best results).
22 |    The differences are masked so that only the least significant "bit" bits are
23 |    used. */
24 | void simdpackd1(uint32_t initvalue, const uint32_t *in, __m128i *out,
25 |                 const uint32_t bit);
26 | 
27 | /* reads 128 values from "in", writes  "bit" 128-bit vectors to "out"
28 |    integer values should be in sorted order (for best results).
29 |    The difference values are assumed to be less than 1<<bit. */
30 | void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t *in, __m128i *out,
31 |                            const uint32_t bit);
32 | 
33 | /* reads "bit" 128-bit vectors from "in", writes  128 values to "out" */
34 | void simdunpackd1(uint32_t initvalue, const __m128i *in, uint32_t *out,
35 |                   const uint32_t bit);
36 | 
37 | /* searches "bit" 128-bit vectors from "in" (= 128 encoded integers) for the
38 |  * first encoded uint32 value which is >= |key|, and returns its position. It is
39 |  * assumed that the values stored are in sorted order. The value found is stored
40 |  * in "*presult". If no value is larger or equal to the key, 128 is returned. The
41 |  * pointer initOffset is a pointer to the last four values decoded (when starting
42 |  * out, this can be a zero vector or initialized with _mm_set1_epi32(init)), and
43 |  * the vector gets updated.
44 |  */
45 | int simdsearchd1(__m128i *initOffset, const __m128i *in, uint32_t bit,
46 |                  uint32_t key, uint32_t *presult);
47 | 
48 | /* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for
49 |  * the first encoded uint32 value which is >= |key|, and returns its position.
50 |  * It is assumed that the values stored are in sorted order. The value found is
51 |  * stored in "*presult". Only the first "length" decoded integers are considered,
52 |  * the others are ignored. Length should be no larger than 128.
53 |  *
54 |  * If no value is larger or equal to the key,
55 |  * length is returned.
56 |  */
57 | int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
58 |                            int length, uint32_t key, uint32_t *presult);
59 | 
60 | /* returns the value stored at the specified "slot".
61 |  * */
62 | uint32_t simdselectd1(uint32_t initvalue, const __m128i *in, uint32_t bit,
63 |                       int slot);
64 | 
65 | /* given a block of 128 packed values, this function sets the value at index
66 |  * "index" to "value", you must somehow know the previous value. Because of
67 |  * differential coding, all following values are incremented by the offset
68 |  * between this new value and the old value... This function is useful if you
69 |  * want to modify the last value.
70 |  */
71 | void simdfastsetd1fromprevious(__m128i *in, uint32_t bit,
72 |                                uint32_t previousvalue, uint32_t value,
73 |                                size_t index);
74 | 
75 | /* given a block of 128 packed values, this function sets the value at index
76 |  * "index" to "value", This function computes the previous value if needed.
77 |  * Because of differential coding, all following values are incremented by the
78 |  * offset between this new value and the old value... This function is useful
79 |  * if you want to modify the last value.
80 |  */
81 | void simdfastsetd1(uint32_t initvalue, __m128i *in, uint32_t bit,
82 |                    uint32_t value, size_t index);
83 | 
84 | /* Simply scans the data.
85 |  * The pointer initOffset is a pointer to the last four values decoded
86 |  * (when starting out, this can be a zero vector or initialized with
87 |  * _mm_set1_epi32(init)), and the vector gets updated.
88 |  */
89 | 
90 | void simdscand1(__m128i *initOffset, const __m128i *in, uint32_t bit);
91 | 
92 | #ifdef __cplusplus
93 | } // extern "C"
94 | #endif
95 | 
96 | #endif
97 | 


--------------------------------------------------------------------------------
/.appveyor.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | version: "{branch}.build.{build}"
 3 | 
 4 | clone_folder:  c:\projects\simdcomp
 5 | 
 6 | #cache:
 7 | #        c:\build-cache -> .appveyor.yml
 8 | 
 9 | environment:
10 |         matrix:
11 |                 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
12 |                   ARCH: x64
13 | # looks like vc14 has trouble with code on x86, at least on the AppVeyor image
14 | #                - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
15 | #                  ARCH: x86
16 |                 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
17 |                   ARCH: x64
18 |                 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
19 |                   ARCH: x86
20 | 
21 | build_script:
22 |         ps: |
23 |                 cd c:\projects\simdcomp
24 |                 echo "" | Out-File -Encoding "ASCII" task.bat
25 |                 if ('Visual Studio 2015' -eq $env:APPVEYOR_BUILD_WORKER_IMAGE) {
26 |                         $VC = 14;
27 |                         $vs_shell_cmd = 'call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" ' + $env:ARCH + ' 2>&1'
28 |                 } elseif ('Visual Studio 2017' -eq $env:APPVEYOR_BUILD_WORKER_IMAGE) {
29 |                         $VC = 15;
30 |                         if ('x64' -eq $env:ARCH) {
31 |                                 $vs_shell_cmd = 'call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" 2>&1'
32 |                         } else {
33 |                                 $vs_shell_cmd = 'call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars32.bat" 2>&1'
34 |                         }
35 |                 }
36 |                 mkdir 'c:\tmp_pack'
37 |                 echo $vs_shell_cmd | Out-File -Encoding "ASCII" -Append task.bat
38 |                 $move_cmd = 'move *.zip c:\tmp_pack'
39 |                 if ($VC -gt 14) {
40 |                         # these won't be tested, just build and upload artifact, vc15 only
41 |                         $cmd = 'nmake /nologo /f makefile.vc AVX512=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' 2>&1'
42 |                         echo $cmd | Out-File -Encoding "ASCII" -Append task.bat
43 |                         $cmd = 'nmake /nologo /f makefile.vc AVX512=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' pack 2>&1'
44 |                         echo $cmd | Out-File -Encoding "ASCII" -Append task.bat
45 |                         echo $move_cmd | Out-File -Encoding "ASCII" -Append task.bat
46 |                         echo 'nmake /nologo /f makefile.vc clean' | Out-File -Encoding "ASCII" -Append task.bat
47 |                         $cmd = 'nmake /nologo /f makefile.vc AVX2=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' 2>&1'
48 |                         echo $cmd | Out-File -Encoding "ASCII" -Append task.bat
49 |                         $cmd = 'nmake /nologo /f makefile.vc AVX2=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' pack 2>&1'
50 |                         echo $cmd | Out-File -Encoding "ASCII" -Append task.bat
51 |                         echo $move_cmd | Out-File -Encoding "ASCII" -Append task.bat
52 |                         echo 'nmake /nologo /f makefile.vc clean' | Out-File -Encoding "ASCII" -Append task.bat
53 |                 }
54 |                 $cmd = 'nmake /nologo /f makefile.vc PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' 2>&1'
55 |                 echo $cmd | Out-File -Encoding "ASCII" -Append task.bat
56 |                 $cmd = 'nmake /nologo /f makefile.vc PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' pack 2>&1'
57 |                 echo $cmd | Out-File -Encoding "ASCII" -Append task.bat
58 |                 echo $move_cmd | Out-File -Encoding "ASCII" -Append task.bat
59 |                 $here = (Get-Item -Path "." -Verbose).FullName
60 |                 $task = $here + '\task.bat'
61 |                 & $task
62 | 
63 | after_build:
64 |         ps: |
65 |                 Get-ChildItem 'c:\tmp_pack' -Filter *.zip |
66 |                 Foreach-Object {
67 |                         Push-AppveyorArtifact $_.FullName
68 |                 }
69 | 
70 | test_script:
71 |         ps: |
72 |                 cd c:\projects\simdcomp
73 |                 echo "" | Out-File -Encoding "ASCII" task.bat
74 |                 $here = (Get-Item -Path "." -Verbose).FullName
75 |                 echo '.\unit.exe' | Out-File -Encoding "ASCII" -Append task.bat
76 |                 $task = $here + '\task.bat'
77 |                 & $task
78 | 
79 | 


--------------------------------------------------------------------------------
/makefile.vc:
--------------------------------------------------------------------------------
  1 | 
  2 | !IFNDEF MACHINE
  3 | !IF "$(PROCESSOR_ARCHITECTURE)"=="AMD64"
  4 | MACHINE=x64
  5 | !ELSE
  6 | MACHINE=x86
  7 | !ENDIF
  8 | !ENDIF
  9 | 
 10 | !IFNDEF VC
 11 | VC=vc%VisualStudioVersion:~0,-2%
 12 | !ENDIF
 13 | 
 14 | # catch up when there's a stronger versioning
 15 | !IFNDEF PKG_VERSION
 16 | PKG_VERSION=latest
 17 | !ENDIF
 18 | 
 19 | !IFNDEF DEBUG
 20 | DEBUG=no
 21 | !ENDIF
 22 | 
 23 | !IFNDEF CC
 24 | CC=cl.exe
 25 | !ENDIF
 26 | 
 27 | !IFNDEF AR
 28 | AR=lib.exe
 29 | !ENDIF
 30 | 
 31 | !IFNDEF LINK
 32 | LINK=link.exe
 33 | !ENDIF
 34 | 
 35 | !IFNDEF PGO
 36 | PGO=no
 37 | !ENDIF
 38 | 
 39 | !IFNDEF PGI
 40 | PGI=no
 41 | !ENDIF
 42 | 
 43 | INC = /Iinclude
 44 | 
 45 | !IF "$(DEBUG)"=="yes"
 46 | CFLAGS = /nologo /MDd /LDd /Od /Zi /D_DEBUG /RTC1 /W3 /GS /Gm /D __SSE4_1__=1
 47 | ARFLAGS = /nologo
 48 | LDFLAGS = /nologo /debug /nodefaultlib:msvcrt
 49 | !ELSE
 50 | CFLAGS = /nologo /MD /O2 /Zi /DNDEBUG /W3 /Gm- /GS /Gy /Oi /GL /MP /D __SSE4_1__=1
 51 | ARFLAGS = /nologo /LTCG
 52 | LDFLAGS = /nologo /LTCG /DYNAMICBASE /incremental:no /debug /opt:ref,icf
 53 | !ENDIF
 54 | 
 55 | !IF "$(PGI)"=="yes"
 56 | LDFLAGS = $(LDFLAGS) /ltcg:pgi
 57 | !ENDIF
 58 | 
 59 | !IF "$(PGO)"=="yes"
 60 | LDFLAGS = $(LDFLAGS) /ltcg:pgo
 61 | !ENDIF
 62 | 
 63 | # SSE4.1 is required
 64 | # VC++15.3 supports AVX512
 65 | !IF "$(AVX512)"=="yes"
 66 | CFLAGS = $(CFLAGS) /arch:AVX2 /D __AVX2__=1 /D __AVX512F__=1
 67 | AVX2=yes
 68 | !ELSEIF "$(AVX2)"=="yes"
 69 | CFLAGS = $(CFLAGS) /arch:AVX2 /D __AVX2__=1
 70 | !ENDIF
 71 | 
 72 | LIB_OBJS = simdbitpacking.obj simdintegratedbitpacking.obj simdcomputil.obj \
 73 | 	simdpackedsearch.obj simdpackedselect.obj simdfor.obj
 74 | 
 75 | LIB_SRCS = src/simdbitpacking.c src/simdintegratedbitpacking.c src/simdcomputil.c \
 76 | 	src/simdpackedsearch.c src/simdpackedselect.c src/simdfor.c
 77 | 
 78 | PKG_FEATURES=sse4.1
 79 | 
 80 | !IF "$(AVX2)"=="yes"
 81 | LIB_OBJS = $(LIB_OBJS) avxbitpacking.obj
 82 | LIB_SRCS = $(LIB_SRCS) src/avxbitpacking.c
 83 | PKG_FEATURES=avx2
 84 | !ENDIF
 85 | 
 86 | !IF "$(AVX512)"=="yes"
 87 | LIB_OBJS = $(LIB_OBJS) avx512bitpacking.obj
 88 | LIB_SRCS = $(LIB_SRCS) src/avx512bitpacking.c
 89 | PKG_FEATURES=avx512
 90 | !ENDIF
 91 | 
 92 | 
 93 | all: lib dll dynunit unit_chars example benchmarks
 94 | # need some good use case scenario to train the instrumented build
 95 | 	@if "$(PGI)"=="yes" echo Running PGO training
 96 | 	@if "$(PGI)"=="yes" benchmark.exe >nul 2>&1
 97 | #	@if "$(PGI)"=="yes" bitpackingbenchmark.exe >nul 2>&1
 98 | 	@if "$(PGI)"=="yes" example.exe >nul 2>&1
 99 | 
100 | 
101 | $(LIB_OBJS):
102 | 	$(CC) $(INC) $(CFLAGS) /c $(LIB_SRCS) 
103 | 
104 | lib: $(LIB_OBJS)
105 | 	@copy simdcomp.def.tpl simdcomp.def
106 | 	@if "$(AVX2)"=="yes" echo avxunpack >> simdcomp.def
107 | 	@if "$(AVX2)"=="yes" echo avxpackwithoutmask >> simdcomp.def
108 | 	@if "$(AVX2)"=="yes" echo avxpack >> simdcomp.def
109 | 	@if "$(AVX2)"=="yes" echo avxmaxbits >> simdcomp.def
110 | 	@if "$(AVX512)"=="yes" echo avx512unpack >> simdcomp.def
111 | 	@if "$(AVX512)"=="yes" echo avx512packwithoutmask >> simdcomp.def
112 | 	@if "$(AVX512)"=="yes" echo avx512pack >> simdcomp.def
113 | 	@if "$(AVX512)"=="yes" echo avx512maxbits >> simdcomp.def
114 | 	$(AR) $(ARFLAGS) /OUT:simdcomp_a.lib $(LIB_OBJS)
115 | # simdcomp.def is generated by the lib recipe, so the DLL link has to run after it
116 | dll: lib
117 | 	$(LINK) /DLL $(LDFLAGS) /OUT:simdcomp.dll /IMPLIB:simdcomp.lib /DEF:simdcomp.def $(LIB_OBJS)
118 | 
119 | unit: lib
120 | 	$(CC) $(INC) $(CFLAGS) /c tests/unit.c 
121 | 	$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp_a.lib
122 | 
123 | dynunit: dll
124 | 	$(CC) $(INC) $(CFLAGS) /c tests/unit.c 
125 | 	$(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp.lib
126 | # the targets below link simdcomp.lib, the import library written by the dll target
127 | unit_chars: dll
128 | 	$(CC) $(INC) $(CFLAGS) /c tests/unit_chars.c
129 | 	$(LINK) $(LDFLAGS) /OUT:unit_chars.exe unit_chars.obj simdcomp.lib
130 | 
131 | 
132 | example: dll
133 | 	$(CC) $(INC) $(CFLAGS) /c example.c
134 | 	$(LINK) $(LDFLAGS) /OUT:example.exe example.obj simdcomp.lib
135 | 
136 | benchmarks: dll
137 | 	$(CC) $(INC) $(CFLAGS) /c benchmarks/benchmark.c
138 | 	$(LINK) $(LDFLAGS) /OUT:benchmark.exe benchmark.obj simdcomp.lib
139 | #	$(CC) $(INC) $(CFLAGS) /c benchmarks/bitpackingbenchmark.c
140 | #	$(LINK) $(LDFLAGS) /OUT:bitpackingbenchmark.exe bitpackingbenchmark.obj simdcomp.lib
141 | 
142 | pack:
143 | 	mkdir .\package
144 | 	cd .\package
145 | 	mkdir .\include
146 | 	mkdir .\bin
147 | 	mkdir .\lib
148 | 	copy ..\include\*.h .\include
149 | 	copy ..\simdcomp.dll .\bin
150 | 	copy ..\simdcomp.pdb .\bin
151 | 	copy ..\simdcomp.lib .\lib
152 | 	copy ..\simdcomp_a.lib .\lib
153 | 	copy ..\LICENSE .
154 | 	copy ..\README.md .
155 | 	7z a ..\simdcomp-$(PKG_VERSION)-$(PKG_FEATURES)-$(VC)-$(MACHINE).zip .
156 | 	cd ..
157 | 	powershell -Command "Remove-Item -Recurse -Force .\package"
158 | 
159 | clean:
160 | 	powershell -Command "Remove-Item -Force *.obj"
161 | 	powershell -Command "Remove-Item -Force *.lib"
162 | 	powershell -Command "Remove-Item -Force *.exe"
163 | 	powershell -Command "Remove-Item -Force *.dll"
164 | 	powershell -Command "Remove-Item -Force *.pgc"
165 | 	powershell -Command "Remove-Item -Force *.pgd"
166 | 	powershell -Command "Remove-Item -Force *.pdb"
167 | 	powershell -Command "Remove-Item -Force *.def"
168 | 
169 | 


--------------------------------------------------------------------------------
/scripts/simdfor.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | 
  4 | from math import ceil
  5 | 
  6 | print("""
  7 | /**
  8 | * Blablabla
  9 | *
 10 | */
 11 | 
 12 | """);
 13 | 
 14 | def mask(bit):
 15 |   return str((1 << bit) - 1)
 16 | 
 17 | for length in [32]:  # length = number of __m128i input vectors each generated function handles
 18 |   print("""
 19 | static __m128i  iunpackFOR0(__m128i initOffset, const __m128i *   _in , uint32_t *    _out) {
 20 |     __m128i       *out = (__m128i*)(_out);
 21 |     int i;
 22 |     (void) _in;
 23 |     for (i = 0; i < 8; ++i) {
 24 |         _mm_store_si128(out++, initOffset);
 25 |     	_mm_store_si128(out++, initOffset);
 26 |         _mm_store_si128(out++, initOffset);
 27 |         _mm_store_si128(out++, initOffset);
 28 |     }
 29 | 
 30 |     return initOffset;
 31 | }
 32 | 
 33 |   """)  # bit width 0: the unpacker only replicates initOffset
 34 |   print("""
 35 | 
 36 | static void ipackFOR0(__m128i initOffset , const uint32_t *   _in , __m128i *  out  ) {
 37 |     (void) initOffset;
 38 |     (void) _in;
 39 |     (void) out;
 40 | }
 41 | """)  # bit width 0: the packer writes nothing
 42 |   for bit in range(1,33):  # emit one packer, ipackFOR<bit>, per bit width 1..32
 43 |     offsetVar = " initOffset";
 44 |     print("""  
 45 | static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t *   _in, __m128i *   out) {
 46 |     const __m128i       *in = (const __m128i*)(_in);
 47 |     __m128i    OutReg;
 48 | 
 49 |       """);
 50 |     
 51 |     if (bit != 32):  # narrower widths store the difference from initOffset
 52 |       print("    __m128i CurrIn = _mm_load_si128(in);");
 53 |       print("    __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);");
 54 |     else:  # 32-bit values are copied as they are
 55 |       print("    __m128i InReg = _mm_load_si128(in);");
 56 |       print("    (void) initOffset;");
 57 | 
 58 | 
 59 |     inwordpointer = 0  # bit position at which the next value starts in the output word
 60 |     valuecounter = 0  # input vectors consumed so far
 61 |     for k in range(ceil((length * bit) / 32)):  # one pass per output vector
 62 |       if(valuecounter == length): break
 63 |       for x in range(inwordpointer,32,bit):  # x = bit offset of this value in the output word
 64 |         if(x!=0) :
 65 |           print("    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));");
 66 |         else:
 67 |           print("    OutReg = InReg; ");
 68 |         if((x+bit>=32) ):  # the value reaches the end of the word: store it
 69 |           while(inwordpointer<32):
 70 |             inwordpointer += bit
 71 |           print("    _mm_store_si128(out, OutReg);");
 72 |           print("");
 73 | 
 74 |           if(valuecounter + 1 < length):
 75 |             print("    ++out;")
 76 |           inwordpointer -= 32;  # number of bits that did not fit in the stored word
 77 |           if(inwordpointer>0):  # carry those high bits into the next output word
 78 |             print("    OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");");
 79 |         if(valuecounter + 1 < length):  # load the next input vector unless this was the last
 80 |           print("    ++in;") 
 81 | 
 82 |           if (bit != 32):
 83 |             print("    CurrIn = _mm_load_si128(in);");
 84 |             print("    InReg = _mm_sub_epi32(CurrIn, initOffset);");
 85 |           else:
 86 |             print("    InReg = _mm_load_si128(in);");
 87 |           print("");
 88 |         valuecounter = valuecounter + 1
 89 |         if(valuecounter == length): break
 90 |     assert(valuecounter == length)  # every input vector must have been emitted
 91 |     print("\n}\n\n""")
 92 | 
 93 |   for bit in range(1,32):  # emit one unpacker per bit width 1..31; width 32 is printed after this loop
 94 |     offsetVar = " initOffset";
 95 |     print("""\n
 96 | static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const  __m128i*   in, uint32_t *   _out) {
 97 |       """);
 98 |     print("""    __m128i*   out = (__m128i*)(_out);
 99 |     __m128i    InReg = _mm_load_si128(in);
100 |     __m128i    OutReg;    
101 |     __m128i     tmp;
102 |     const __m128i mask =  _mm_set1_epi32((1U<<"""+str(bit)+""")-1);
103 | 
104 |     """);
105 | 
106 |     MainText = "";  # body of the generated function, accumulated as text
107 | 
108 |     MainText += "\n";
109 |     inwordpointer = 0  # bit position at which the next value starts in the input word
110 |     valuecounter = 0  # output vectors produced so far
111 |     for k in range(ceil((length * bit) / 32)):  # one pass per packed input vector
112 |       for x in range(inwordpointer,32,bit):  # x = bit offset of this value in the input word
113 |         if(valuecounter == length): break
114 |         if (x > 0):
115 |           MainText += "    tmp = _mm_srli_epi32(InReg," + str(x) +");\n"; 
116 |         else:
117 |           MainText += "    tmp = InReg;\n"; 
118 |         if(x+bit<32):  # value ends inside the word: mask off the bits above it
119 |           MainText += "    OutReg = _mm_and_si128(tmp, mask);\n";
120 |         else:
121 |           MainText += "    OutReg = tmp;\n";        
122 |         if((x+bit>=32) ):  # value reaches the end of the word: move to the next input vector
123 |           while(inwordpointer<32):
124 |             inwordpointer += bit
125 |           if(valuecounter + 1 < length):
126 |              MainText += "    ++in;"
127 |              MainText += "    InReg = _mm_load_si128(in);\n";
128 |           inwordpointer -= 32;  # number of bits of this value held by the next word
129 |           if(inwordpointer>0):  # merge in the high bits taken from the next word
130 |             MainText += "    OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n";
131 |         if (bit != 32):  # always true here because this loop stops at 31
132 |           MainText += "    OutReg = _mm_add_epi32(OutReg, initOffset);\n"; 
133 |         MainText += "    _mm_store_si128(out++, OutReg);\n\n"; 
134 |         MainText += "";
135 |         valuecounter = valuecounter + 1
136 |         if(valuecounter == length): break
137 |     assert(valuecounter == length)  # every output vector must have been emitted
138 |     print(MainText)
139 |     print("    return initOffset;");
140 |     print("\n}\n\n")
141 |   print("""
142 | static __m128i iunpackFOR32(__m128i initvalue , const  __m128i*   in, uint32_t *    _out) {
143 | 	__m128i * mout = (__m128i *)_out;
144 | 	__m128i invec;
145 | 	size_t k;
146 | 	for(k = 0; k < 128/4; ++k) {
147 | 		invec =  _mm_load_si128(in++);
148 | 	    _mm_store_si128(mout++, invec);
149 | 	}
150 | 	return invec;
151 | }
152 |   """)
153 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This code is released under a BSD License.
  3 |  */
  4 | #include <assert.h>
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <time.h>
  8 | 
  9 | #include "simdcomp.h"
 10 | 
 11 | #ifdef _MSC_VER
 12 | # include <windows.h>
 13 | 
 14 | __int64 freq;
 15 | 
 16 | typedef __int64 time_snap_t;
 17 | 
 18 | static time_snap_t time_snap(void)
 19 | {
 20 | 	/* current performance-counter reading, in ticks */
 21 | 	__int64 ticks;
 22 | 	QueryPerformanceCounter((LARGE_INTEGER *)&ticks);
 23 | 	/* freq is filled by QueryPerformanceFrequency in main; this scales ticks by 1000000/freq */
 24 | 	return (__int64)((ticks * 1000000) / freq);
 25 | }
 26 | # define TIME_SNAP_FMT "%I64d"
 27 | #else
 28 | # define time_snap clock
 29 | # define TIME_SNAP_FMT "%lu"
 30 | typedef clock_t time_snap_t;
 31 | #endif
 32 | 
 33 | /* Times simdselectd1 against a full simdunpackd1 for every bit width 0..32. */
 34 | void benchmarkSelect() {
 35 |     uint32_t buffer[128];
 36 |     uint32_t backbuffer[128];
 37 |     uint32_t initial = 33;
 38 |     uint32_t b;
 39 |     time_snap_t S1, S2, S3;
 40 |     int i;
 41 |     printf("benchmarking select \n");
 42 | 
 43 |     /* this test creates delta encoded buffers with different bits, then
 44 |      * reads every slot back, first with simdselectd1 and then by unpacking */
 45 |     for (b = 0; b <= 32; b++) {
 46 |         uint32_t prev = initial;
 47 |         uint32_t out[128];
 48 |         /* initialize the buffer */
 49 |         for (i = 0; i < 128; i++) {
 50 |             buffer[i] =  ((uint32_t)(1655765 * i )) ;
 51 |             if(b < 32) buffer[i] %= (1U<<b); /* unsigned: 1<<31 overflows a signed int */
 52 |         }
 53 |         for (i = 0; i < 128; i++) { /* turn the values into a running sum */
 54 |             buffer[i] = buffer[i] + prev;
 55 |             prev = buffer[i];
 56 |         }
 57 | 
 58 |         for (i = 1; i < 128; i++) { /* keep the data sorted if the sum wrapped around */
 59 |             if(buffer[i] < buffer[i-1] )
 60 |                 buffer[i] = buffer[i-1];
 61 |         }
 62 |         assert(simdmaxbitsd1(initial, buffer)<=b);
 63 | 
 64 |         for (i = 0; i < 128; i++) {
 65 |             out[i] = 0; /* memset would do too */
 66 |         }
 67 | 
 68 |         /* delta-encode to 'b' bits */
 69 |         simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
 70 | 
 71 |         S1 = time_snap();
 72 |         for (i = 0; i < 128 * 10; i++) {
 73 |             uint32_t valretrieved = simdselectd1(initial, (__m128i *)out, b, (uint32_t)i % 128);
 74 |             assert(valretrieved == buffer[i%128]);
 75 |         }
 76 |         S2 = time_snap();
 77 |         for (i = 0; i < 128 * 10; i++) {
 78 |             simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
 79 |             assert(backbuffer[i % 128] == buffer[i % 128]);
 80 |         }
 81 |         S3 = time_snap();
 82 |         printf("bit width = %d, fast select function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT "  \n", (int)b, (S2-S1), (S3-S2));
 83 |     }
 84 | }
 85 | 
 86 | /* qsort()-compatible comparator that orders uint32_t values ascending.
 87 |  * Returns -1, 0 or 1; the result is built from comparisons only, so no
 88 |  * subtraction of the operands is involved. */
 89 | int uint32_cmp(const void *a, const void *b)
 90 | {
 91 |     const uint32_t lhs = *(const uint32_t *)a;
 92 |     const uint32_t rhs = *(const uint32_t *)b;
 93 |     /* each comparison evaluates to 0 or 1 */
 94 |     return (lhs > rhs) - (lhs < rhs);
 95 | }
 96 | 
 97 | /* Binary search (adapted from wikipedia) over the sorted range A[imin..imax-1].
 98 |  * Returns an index holding key when present, else the last narrowed upper index. */
 99 | int binary_search(uint32_t * A, uint32_t key, int imin, int imax)
100 | {
101 |     int imid;
102 |     imax --; /* from here on imax is the last valid index */
103 |     while(imin + 1 < imax) {
104 |         imid = imin + ((imax - imin) / 2);
105 |         if (A[imid] == key) return imid;
106 |         if (A[imid] > key) imax = imid;
107 |         else imin = imid;
108 |     }
109 |     /* imid is always above imin, so A[imin] was never compared with key:
110 |      * check it here (the guard skips the read when the range is empty) */
111 |     if (imin <= imax && A[imin] == key)
112 |         return imin;
113 |     return imax;
114 | }
115 | 
116 | 
117 | /* Lower-bound search (adapted from wikipedia) over the sorted range
118 |  * A[imin..imax-1]: returns the first index whose value is >= key, or the
119 |  * last index of the range when every value is smaller than key. */
120 | int lower_bound(uint32_t * A, uint32_t key, int imin, int imax)
121 | {
122 |     int lo = imin;
123 |     int hi = imax - 1;
124 |     while (lo + 1 < hi) {
125 |         const int mid = lo + ((hi - lo) / 2);
126 |         if (A[mid] < key)
127 |             lo = mid;   /* mid is too small */
128 |         else
129 |             hi = mid;   /* A[mid] is a candidate answer */
130 |     }
131 |     /* the loop only probes indices above lo, so lo itself is tested here */
132 |     return (A[lo] >= key) ? lo : hi;
133 | }
134 | /* Times simdsearchd1, unpack + lower_bound, and simdsearchwithlengthd1 for every bit width 0..32. */
135 | void benchmarkSearch() {
136 |     uint32_t buffer[128];
137 |     uint32_t backbuffer[128];
138 |     uint32_t out[128];
139 |     uint32_t result, initial = 0;
140 |     uint32_t b, i;
141 |     time_snap_t S1, S2, S3, S4;
142 | 
143 |     printf("benchmarking search \n");
144 | 
145 |     /* this test creates delta encoded buffers with different bits, then
146 |      * performs lower bound searches for each key */
147 |     for (b = 0; b <= 32; b++) {
148 |         uint32_t prev = initial;
149 |         /* initialize the buffer */
150 |         for (i = 0; i < 128; i++) {
151 |             buffer[i] =  ((uint32_t)rand()) ;
152 |             if(b < 32) buffer[i] %= (1U<<b); /* unsigned: 1<<31 overflows a signed int */
153 |         }
154 | 
155 |         qsort(buffer,128, sizeof(uint32_t), uint32_cmp);
156 | 
157 |         for (i = 0; i < 128; i++) { /* turn the values into a running sum */
158 |             buffer[i] = buffer[i] + prev;
159 |             prev = buffer[i];
160 |         }
161 |         for (i = 1; i < 128; i++) { /* keep the data sorted if the sum wrapped around */
162 |             if(buffer[i] < buffer[i-1] )
163 |                 buffer[i] = buffer[i-1];
164 |         }
165 |         assert(simdmaxbitsd1(initial, buffer)<=b);
166 |         for (i = 0; i < 128; i++) {
167 |             out[i] = 0; /* memset would do too */
168 |         }
169 | 
170 |         /* delta-encode to 'b' bits */
171 |         simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
172 |         simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
173 | 
174 |         for (i = 0; i < 128; i++) {
175 |             assert(buffer[i] == backbuffer[i]);
176 |         }
177 |         S1 = time_snap();
178 |         for (i = 0; i < 128 * 10; i++) {
179 |             /* search directly in the packed data */
180 |             int pos;
181 |             uint32_t pseudorandomkey  =  buffer[i%128];
182 |             __m128i vecinitial = _mm_set1_epi32(initial);
183 |             pos = simdsearchd1(&vecinitial, (__m128i *)out, b,
184 |                                pseudorandomkey, &result);
185 |             if((result < pseudorandomkey) || (buffer[pos] != result)) {
186 |                 printf("bug A.\n");
187 |             } else if (pos > 0) {
188 |                 if(buffer[pos-1] >= pseudorandomkey)
189 |                     printf("bug B.\n");
190 |             }
191 |         }
192 |         S2 = time_snap();
193 |         for (i = 0; i < 128 * 10; i++) { /* baseline: unpack everything, then search */
194 |             int pos;
195 |             uint32_t pseudorandomkey  =  buffer[i%128];
196 |             simdunpackd1(initial,  (__m128i *)out, backbuffer, b);
197 |             pos =  lower_bound(backbuffer, pseudorandomkey, 0, 128);
198 |             result = backbuffer[pos];
199 | 
200 |             if((result < pseudorandomkey) || (buffer[pos] != result)) {
201 |                 printf("bug C.\n");
202 |             } else if (pos > 0) {
203 |                 if(buffer[pos-1] >= pseudorandomkey)
204 |                     printf("bug D.\n");
205 |             }
206 |         }
207 |         S3 = time_snap();
208 |         for (i = 0; i < 128 * 10; i++) {
209 |             /* packed search through the variant that takes an explicit length */
210 |             int pos;
211 |             uint32_t pseudorandomkey  =  buffer[i%128];
212 |             pos = simdsearchwithlengthd1(initial, (__m128i *)out, b, 128,
213 |                                pseudorandomkey, &result);
214 |             if((result < pseudorandomkey) || (buffer[pos] != result)) {
215 |                 printf("bug A.\n");
216 |             } else if (pos > 0) {
217 |                 if(buffer[pos-1] >= pseudorandomkey)
218 |                     printf("bug B.\n");
219 |             }
220 |         }
221 |         S4 = time_snap();
222 | 
223 |         printf("bit width = %d, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " , fast with length time = " TIME_SNAP_FMT "  \n", (int)b, (S2-S1), (S3-S2), (S4-S3) );
224 |     }
225 | }
226 | 
227 | /* Entry point: runs the search benchmark, then the select benchmark. */
228 | int main() {
229 | #if defined(_MSC_VER)
230 |     QueryPerformanceFrequency((LARGE_INTEGER *)&freq); /* time_snap() divides by freq */
231 | #endif
232 |     benchmarkSearch();
233 |     benchmarkSelect();
234 |     return EXIT_SUCCESS;
235 | }
236 | 


--------------------------------------------------------------------------------
/example.c:
--------------------------------------------------------------------------------
  1 | /* Type "make example" to build this example program. */
  2 | #include <stdio.h>
  3 | #include <time.h>
  4 | #include <stdlib.h>
  5 | #include "simdcomp.h"
  6 | 
  7 | /**
  8 | We provide several different code examples.
  9 | **/
 10 | 
 11 | 
 12 | /* very simple test: packs N integers, unpacks them, compares; returns 0 on success, -1 on failure */
 13 | int compress_decompress_demo() {
 14 |     size_t k, N = 9999;
 15 |     __m128i * endofbuf;
 16 |     int howmanybytes, status = 0;
 17 |     float compratio;
 18 |     uint32_t * datain = malloc(N * sizeof(uint32_t));
 19 |     uint8_t * buffer = NULL, * shrunk;
 20 |     uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
 21 |     uint32_t b;
 22 |     printf("== simple test\n");
 23 |     if (datain == NULL || backbuffer == NULL) { status = -1; goto done; }
 24 |     for (k = 0; k < N; ++k) {       /* start with k=0, not k=1! */
 25 |         datain[k] = (uint32_t)k;
 26 |     }
 27 |     b = maxbits_length(datain, N);
 28 |     buffer = malloc(simdpack_compressedbytes(N,b));
 29 |     if (buffer == NULL) { status = -1; goto done; }
 30 |     endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b); /* points to the end of the compressed data */
 31 |     howmanybytes = (endofbuf-(__m128i *)buffer)*sizeof(__m128i); /* number of compressed bytes */
 32 |     compratio = N*sizeof(uint32_t) * 1.0 / howmanybytes;
 33 |     shrunk = realloc(buffer, (size_t)howmanybytes); /* optional shrink; buffer stays valid if it fails */
 34 |     if (shrunk != NULL) buffer = shrunk;
 35 |     printf("Compressed %d integers down to %d bytes (comp. ratio = %f).\n",(int)N,howmanybytes,compratio);
 36 |     /* in actual applications b must be stored and retrieved: caller is responsible for that. */
 37 |     simdunpack_length((const __m128i *)buffer, N, backbuffer, b); /* will return a pointer to endofbuf */ 
 38 |     for (k = 0; k < N; ++k) {
 39 |         if(datain[k] != backbuffer[k]) {
 40 |             printf("bug at %lu \n",(unsigned long)k);
 41 |             status = -1; goto done; /* leave through the shared cleanup so nothing leaks */
 42 |         }
 43 |     }
 44 |     printf("Code works!\n");
 45 | done:
 46 |     free(datain);
 47 |     free(buffer);
 48 |     free(backbuffer);
 49 |     return status;
 50 | }
 51 | 
 52 | 
 53 | 
 54 | /* Packs datain into buffer one block of SIMDBlockSize values at a time; each
 55 |  * block is written as a one-byte bit width followed by its packed payload.
 56 |  * Returns how many bytes were written. Used below in simple_demo. */
 57 | size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
 58 |     uint8_t * const start = buffer;
 59 |     const size_t nblocks = length / SIMDBlockSize;
 60 |     uint32_t previous = 0; /* last value of the preceding block, 0 for the first */
 61 |     size_t blk;
 62 |     if(length % SIMDBlockSize != 0) {
 63 |         printf("Data length should be a multiple of %i \n",SIMDBlockSize);
 64 |     }
 65 |     for(blk = 0; blk < nblocks; ++blk) {
 66 |         uint32_t * const src = datain + blk * SIMDBlockSize;
 67 |         const uint32_t b = simdmaxbitsd1(previous, src);
 68 |         *buffer = b; /* block header: the bit width */
 69 |         ++buffer;
 70 |         simdpackwithoutmaskd1(previous, src, (__m128i *) buffer, b);
 71 |         previous = src[SIMDBlockSize - 1];
 72 |         buffer += b * sizeof(__m128i);
 73 |     }
 74 |     return buffer - start;
 75 | }
 76 | 
 77 | /* Benchmark illustration: for sorted data with growing gaps, compresses with compress() and times block-by-block d1 decoding against a plain memcpy of the raw data. */
 78 | void simple_demo() {
 79 |     size_t REPEAT = 10, gap;
 80 |     size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */
 81 |     uint32_t * datain = malloc(N * sizeof(uint32_t));
 82 |     size_t compsize;
 83 |     clock_t start, end;
 84 |     uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer: room for 32 bits per value plus one width byte per block */
 85 |     uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); /* receives one decoded block at a time */
 86 |     printf("== simple demo\n");
 87 |     for (gap = 1; gap <= 243; gap *= 3) {
 88 |         size_t k, repeat;
 89 |         uint32_t offset = 0;
 90 |         uint32_t bogus = 0; /* accumulates decoded values and is printed at the end; presumably keeps the work from being optimized away */
 91 |         double numberofseconds;
 92 | 
 93 |         printf("\n");
 94 |         printf(" gap = %lu \n", (unsigned long) gap);
 95 |         datain[0] = 0;
 96 |         for (k = 1; k < N; ++k)
 97 |             datain[k] = datain[k-1] + ( rand() % (gap + 1) ); /* non-decreasing input, successive differences in [0, gap] */
 98 |         compsize = compress(datain,N,buffer);
 99 |         printf("compression ratio = %f \n",  (N * sizeof(uint32_t))/ (compsize * 1.0 ));
100 |         start = clock();
101 |         for(repeat = 0; repeat < REPEAT; ++repeat) {
102 |             uint8_t * decbuffer = buffer;
103 |             for (k = 0, offset = 0; k * SIMDBlockSize < N; ++k) { /* every pass restarts from offset 0, exactly as compress() did */
104 |                 uint8_t b = *decbuffer++; /* one-byte header: bit width of this block */
105 |                 simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
106 |                 /* do something here with backbuffer */
107 |                 bogus += backbuffer[3];
108 |                 decbuffer += b * sizeof(__m128i);
109 |                 offset = backbuffer[SIMDBlockSize - 1]; /* last decoded value seeds the next block */
110 |             }
111 |         }
112 |         end = clock();
113 |         numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
114 |         printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
115 |         start = clock();
116 |         for(repeat = 0; repeat < REPEAT; ++repeat) {
117 |             const uint32_t * rawin = datain; /* baseline copies the raw input; the compressed buffer is shorter and partly uninitialized */
118 |             for (k = 0; k * SIMDBlockSize < N; ++k) {
119 |                 memcpy(backbuffer,rawin+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t));
120 |                 bogus += backbuffer[3] - backbuffer[100];
121 |             }
122 |         }
123 |         end = clock();
124 |         numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
125 |         printf("memcpy speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
126 |         printf("ignore me %u \n",(unsigned)bogus); /* bogus is unsigned: print it with a matching conversion */
127 |         printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n");
128 |     }
129 |     free(buffer);
130 |     free(datain);
131 |     free(backbuffer);
132 | }
133 | 
134 | /* Used below in varying_bit_width_demo: packs each full block of SIMDBlockSize integers with its own bit width (one width byte, then b 128-bit words); returns bytes written. */
135 | size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) {
136 |     uint8_t * initout;
137 |     size_t k;
138 |     if(length/SIMDBlockSize*SIMDBlockSize != length) { /* only warns: a trailing partial block is not encoded by the loop below */
139 |         printf("Data length should be a multiple of %i \n",SIMDBlockSize);
140 |     }
141 |     initout = buffer;
142 |     for(k = 0; k < length / SIMDBlockSize; ++k) {
143 |         uint32_t b = maxbits(datain); /* bit width needed by the current block */
144 |         *buffer++ = b; /* one-byte header: bit width of this block */
145 |         simdpackwithoutmask(datain, (__m128i *)buffer, b);
146 |         datain += SIMDBlockSize; /* move on to the next input block */
147 |         buffer += b * sizeof(__m128i); /* advance past the b 128-bit words of this block */
148 |     }
149 |     return buffer - initout;
150 | }
151 | 
152 | /* Here we compress the data in blocks of 128 integers with varying bit width, decode it again and check the round trip; returns 0 on success, -1 on a mismatch */
153 | int varying_bit_width_demo() {
154 |     size_t nn = 128 * 2;
155 |     uint32_t * datainn = malloc(nn * sizeof(uint32_t));
156 |     uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize); /* room for 32 bits per value plus one width byte per block */
157 |     uint8_t * initbuffern = buffern; /* buffern is advanced while decoding, so keep the start for free() */
158 |     uint32_t * backbuffern = malloc(nn * sizeof(uint32_t));
159 |     size_t k, compsize;
160 |     printf("== varying bit-width demo\n");
161 | 
162 |     for(k=0; k<nn; ++k) {
163 |         datainn[k] = rand() % (k + 1); /* value range grows with k */
164 |     }
165 | 
166 |     compsize = varying_bit_width_compress(datainn,nn,buffern);
167 |     printf("encoded size: %u (original size: %u)\n", (unsigned)compsize,
168 |            (unsigned)(nn * sizeof(uint32_t)));
169 | 
170 |     for (k = 0; k * SIMDBlockSize < nn; ++k) {
171 |         uint32_t b = *buffern; /* one-byte header: bit width of this block */
172 |         buffern++;
173 |         simdunpack((const __m128i *)buffern, backbuffern + k * SIMDBlockSize, b);
174 |         buffern += b * sizeof(__m128i);
175 |     }
176 | 
177 |     /* compare the round-tripped values; stop at the first mismatch but still fall through to the cleanup below */
178 |     for (k = 0; k < nn; ++k) {
179 |         if(backbuffern[k] != datainn[k]) {
180 |             break;
181 |         }
182 |     }
183 |     printf("%s", (k == nn) ? "Code works!\n" : "bug\n"); /* k == nn means every value matched */
184 |     free(datainn);
185 |     free(initbuffern);
186 |     free(backbuffern);
187 |     return (k == nn) ? 0 : -1;
188 | }
189 | /* Runs the three demos in turn; exits with -1 as soon as one of the self-checking demos reports a failure. */
190 | int main() {
191 |     if(compress_decompress_demo() != 0) return -1;
192 |     if(varying_bit_width_demo() != 0) return -1;
193 |     simple_demo(); /* returns void: it only prints timings */
194 |     return 0;
195 | }
196 | 


--------------------------------------------------------------------------------
/scripts/avxpacking.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys
  3 | def howmany(bit):
  4 |     """ how many 32-bit values go into one packed block (always 256, whatever `bit` is) """
  5 |     return 256
  6 | 
  7 | def howmanywords(bit):
  8 |     return (howmany(bit) * bit + 255)/256
  9 | 
 10 | def howmanybytes(bit):
 11 |     return howmanywords(bit) * 16
 12 | 
 13 | print("""
 14 | /** avxpacking **/
 15 | """)
 16 | 
 17 | print("""typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);""")  # pointer type shared by every generated pack routine
 18 | print("""typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);""")  # pointer type shared by every generated unpack routine
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | def plurial(number):
 26 |     if(number <> 1):
 27 |         return "s"
 28 |     else :
 29 |         return ""
 30 | 
 31 | print("")
 32 | print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {");
 33 | print("  (void)compressed;");
 34 | print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
 35 | print("}");
 36 | print("")
 37 | 
 38 | for bit in range(1,33):
 39 |     print("")
 40 |     print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
 41 |     print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
 42 |     print("  const __m256i * in = (const __m256i *)  pin;");
 43 |     print("  /* we are going to touch  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
 44 |     if(howmanywords(bit) == 1):
 45 |       print("  __m256i w0;")
 46 |     else:
 47 |       print("  __m256i w0, w1;")
 48 |     if( (bit & (bit-1)) <> 0) : print("  __m256i tmp; /* used to store inputs at word boundary */")
 49 |     oldword = 0
 50 |     for j in range(howmany(bit)/8):
 51 |       firstword = j * bit / 32
 52 |       if(firstword > oldword):
 53 |         print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
 54 |         oldword = firstword
 55 |       secondword = (j * bit + bit - 1)/32
 56 |       firstshift = (j*bit) % 32
 57 |       if( firstword == secondword):
 58 |           if(firstshift == 0):
 59 |             print("  w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j))
 60 |           else:
 61 |             print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift))
 62 |       else:
 63 |           print("  tmp = _mm256_lddqu_si256 (in + {0});".format(j))
 64 |           print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
 65 |           secondshift = 32-firstshift
 66 |           print("  w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
 67 |     print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
 68 |     print("}");
 69 |     print("")
 70 | 
 71 | 
 72 | print("")
 73 | print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {");
 74 | print("  (void)compressed;");
 75 | print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
 76 | print("}");
 77 | print("")
 78 | 
 79 | for bit in range(1,33):
 80 |     print("")
 81 |     print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
 82 |     print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
 83 |     print("  /* we are going to touch  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
 84 |     if(howmanywords(bit) == 1):
 85 |       print("  __m256i w0;")
 86 |     else:
 87 |       print("  __m256i w0, w1;")
 88 |     print("  const __m256i * in = (const __m256i *) pin;");
 89 |     if(bit < 32): print("  const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
 90 |     def maskfnc(x):
 91 |         if(bit == 32): return x
 92 |         return " _mm256_and_si256 ( mask, {0}) ".format(x)
 93 |     if( (bit & (bit-1)) <> 0) : print("  __m256i tmp; /* used to store inputs at word boundary */")
 94 |     oldword = 0
 95 |     for j in range(howmany(bit)/8):
 96 |       firstword = j * bit / 32
 97 |       if(firstword > oldword):
 98 |         print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
 99 |         oldword = firstword
100 |       secondword = (j * bit + bit - 1)/32
101 |       firstshift = (j*bit) % 32
102 |       loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j))
103 |       if( firstword == secondword):
104 |           if(firstshift == 0):
105 |             print("  w{0} = {1};".format(firstword%2,loadstr))
106 |           else:
107 |             print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift))
108 |       else:
109 |           print("  tmp = {0};".format(loadstr))
110 |           print("  w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
111 |           secondshift = 32-firstshift
112 |           print("  w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
113 |     print("  _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
114 |     print("}");
115 |     print("")
116 | 
117 | 
118 | print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {");  # bit width 0: every value is zero, so unpacking just clears the output
119 | print("  (void) compressed;");
120 | print("  memset(pout,0,{0});".format(howmany(0) * 4));  # memset takes a byte count: howmany(0) values of 4 bytes (uint32_t) each, not the value count
121 | print("}");
122 | print("")
123 | 
124 | for bit in range(1,33):
125 |     print("")
126 |     print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
127 |     print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit));
128 |     print("  /* we are going to access  {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
129 |     if(howmanywords(bit) == 1):
130 |       print("  __m256i w0;")
131 |     else:
132 |       print("  __m256i w0, w1;")
133 |     print("  __m256i * out = (__m256i *) pout;");
134 |     if(bit < 32): print("  const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
135 |     maskstr = " _mm256_and_si256 ( mask, {0}) "
136 |     if (bit == 32) : maskstr = " {0} " # no need
137 |     oldword = 0
138 |     print("  w0 = _mm256_lddqu_si256 (compressed);")
139 |     for j in range(howmany(bit)/8):
140 |       firstword = j * bit / 32
141 |       secondword = (j * bit + bit - 1)/32
142 |       if(secondword > oldword):
143 |         print("  w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword))
144 |         oldword = secondword
145 |       firstshift = (j*bit) % 32
146 |       firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") "
147 |       if(firstshift == 0):
148 |           firstshiftstr =" w{0} " # no need
149 |       wfirst = firstshiftstr.format(firstword%2)
150 |       if( firstword == secondword):
151 |           if(firstshift + bit <> 32):
152 |             wfirst  = maskstr.format(wfirst)
153 |           print("  _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst))
154 |       else:
155 |           secondshift = (32-firstshift)
156 |           wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
157 |           wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond)
158 |           wfirstorsecond = maskstr.format(wfirstorsecond)
159 |           print("  _mm256_storeu_si256(out + {0},\n    {1});".format(j,wfirstorsecond))
160 |     print("}");
161 |     print("")
162 | 
163 | 
164 | print("static avxpackblockfnc avxfuncPackArr[] = {")  # dispatch table: entry i is the unmasked pack routine for bit width i (0..32)
165 | for bit in range(0,32):
166 |   print("&avxpackblock{0},".format(bit))
167 | print("&avxpackblock32")  # last entry is printed separately so it carries no trailing comma
168 | print("};")
169 | 
170 | print("static avxpackblockfnc avxfuncPackMaskArr[] = {")  # dispatch table: entry i is the masked pack routine for bit width i
171 | for bit in range(0,32):
172 |   print("&avxpackblockmask{0},".format(bit))
173 | print("&avxpackblockmask32")
174 | print("};")
175 | 
176 | 
177 | print("static avxunpackblockfnc avxfuncUnpackArr[] = {")  # dispatch table: entry i is the unpack routine for bit width i
178 | for bit in range(0,32):
179 |   print("&avxunpackblock{0},".format(bit))
180 | print("&avxunpackblock32")
181 | print("};")
182 | print("/** avxpacking **/")  # closing banner matching the one printed at the top
183 | 


--------------------------------------------------------------------------------
/scripts/avx512packing.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys
  3 | def howmany(bit):
  4 |     """ how many 32-bit values go into one packed block (always 512, whatever `bit` is) """
  5 |     return 512
  6 | 
  7 | def howmanywords(bit):
  8 |     return (howmany(bit) * bit + 511)/512
  9 | 
 10 | def howmanybytes(bit):
 11 |     return howmanywords(bit) * 32
 12 | 
 13 | print("""
 14 | /** avx512packing **/
 15 | """)
 16 | 
 17 | print("""typedef void (*avx512packblockfnc)(const uint32_t * pin, __m512i * compressed);""")  # pointer type shared by every generated pack routine
 18 | print("""typedef void (*avx512unpackblockfnc)(const __m512i * compressed, uint32_t * pout);""")  # pointer type shared by every generated unpack routine
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | def plurial(number):
 26 |     if(number <> 1):
 27 |         return "s"
 28 |     else :
 29 |         return ""
 30 | 
 31 | print("")
 32 | print("static void avx512packblock0(const uint32_t * pin, __m512i * compressed) {");
 33 | print("  (void)compressed;");
 34 | print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
 35 | print("}");
 36 | print("")
 37 | 
 38 | for bit in range(1,33):
 39 |     print("")
 40 |     print("/* we are going to pack {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
 41 |     print("static void avx512packblock{0}(const uint32_t * pin, __m512i * compressed) {{".format(bit));
 42 |     print("  const __m512i * in = (const __m512i *)  pin;");
 43 |     print("  /* we are going to touch  {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
 44 |     if(howmanywords(bit) == 1):
 45 |       print("  __m512i w0;")
 46 |     else:
 47 |       print("  __m512i w0, w1;")
 48 |     if( (bit & (bit-1)) <> 0) : print("  __m512i tmp; /* used to store inputs at word boundary */")
 49 |     oldword = 0
 50 |     for j in range(howmany(bit)/16):
 51 |       firstword = j * bit / 32
 52 |       if(firstword > oldword):
 53 |         print("  _mm512_storeu_si512(compressed + {0}, w{1});".format(oldword,oldword%2))
 54 |         oldword = firstword
 55 |       secondword = (j * bit + bit - 1)/32
 56 |       firstshift = (j*bit) % 32
 57 |       if( firstword == secondword):
 58 |           if(firstshift == 0):
 59 |             print("  w{0} = _mm512_loadu_si512 (in + {1});".format(firstword%2,j))
 60 |           else:
 61 |             print("  w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32(_mm512_loadu_si512 (in + {1}) , {2}));".format(firstword%2,j,firstshift))
 62 |       else:
 63 |           print("  tmp = _mm512_loadu_si512 (in + {0});".format(j))
 64 |           print("  w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
 65 |           secondshift = 32-firstshift
 66 |           print("  w{0} = _mm512_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
 67 |     print("  _mm512_storeu_si512(compressed + {0}, w{1});".format(secondword,secondword%2))
 68 |     print("}");
 69 |     print("")
 70 | 
 71 | 
 72 | print("")
 73 | print("static void avx512packblockmask0(const uint32_t * pin, __m512i * compressed) {");
 74 | print("  (void)compressed;");
 75 | print("  (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
 76 | print("}");
 77 | print("")
 78 | 
 79 | for bit in range(1,33):
 80 |     print("")
 81 |     print("/* we are going to pack {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
 82 |     print("static void avx512packblockmask{0}(const uint32_t * pin, __m512i * compressed) {{".format(bit));
 83 |     print("  /* we are going to touch  {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
 84 |     if(howmanywords(bit) == 1):
 85 |       print("  __m512i w0;")
 86 |     else:
 87 |       print("  __m512i w0, w1;")
 88 |     print("  const __m512i * in = (const __m512i *) pin;");
 89 |     if(bit < 32): print("  const __m512i mask = _mm512_set1_epi32({0});".format((1<<bit)-1));
 90 |     def maskfnc(x):
 91 |         if(bit == 32): return x
 92 |         return " _mm512_and_si512 ( mask, {0}) ".format(x)
 93 |     if( (bit & (bit-1)) <> 0) : print("  __m512i tmp; /* used to store inputs at word boundary */")
 94 |     oldword = 0
 95 |     for j in range(howmany(bit)/16):
 96 |       firstword = j * bit / 32
 97 |       if(firstword > oldword):
 98 |         print("  _mm512_storeu_si512(compressed + {0}, w{1});".format(oldword,oldword%2))
 99 |         oldword = firstword
100 |       secondword = (j * bit + bit - 1)/32
101 |       firstshift = (j*bit) % 32
102 |       loadstr = maskfnc(" _mm512_loadu_si512 (in + {0}) ".format(j))
103 |       if( firstword == secondword):
104 |           if(firstshift == 0):
105 |             print("  w{0} = {1};".format(firstword%2,loadstr))
106 |           else:
107 |             print("  w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift))
108 |       else:
109 |           print("  tmp = {0};".format(loadstr))
110 |           print("  w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
111 |           secondshift = 32-firstshift
112 |           print("  w{0} = _mm512_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
113 |     print("  _mm512_storeu_si512(compressed + {0}, w{1});".format(secondword,secondword%2))
114 |     print("}");
115 |     print("")
116 | 
117 | 
118 | print("static void avx512unpackblock0(const __m512i * compressed, uint32_t * pout) {");  # bit width 0: every value is zero, so unpacking just clears the output
119 | print("  (void) compressed;");
120 | print("  memset(pout,0,{0});".format(howmany(0) * 4));  # memset takes a byte count: howmany(0) values of 4 bytes (uint32_t) each, not the value count
121 | print("}");
122 | print("")
123 | 
124 | for bit in range(1,33):
125 |     print("")
126 |     print("/* we packed {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
127 |     print("static void avx512unpackblock{0}(const __m512i * compressed, uint32_t * pout) {{".format(bit));
128 |     print("  /* we are going to access  {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
129 |     if(howmanywords(bit) == 1):
130 |       print("  __m512i w0;")
131 |     else:
132 |       print("  __m512i w0, w1;")
133 |     print("  __m512i * out = (__m512i *) pout;");
134 |     if(bit < 32): print("  const __m512i mask = _mm512_set1_epi32({0});".format((1<<bit)-1));
135 |     maskstr = " _mm512_and_si512 ( mask, {0}) "
136 |     if (bit == 32) : maskstr = " {0} " # no need
137 |     oldword = 0
138 |     print("  w0 = _mm512_loadu_si512 (compressed);")
139 |     for j in range(howmany(bit)/16):
140 |       firstword = j * bit / 32
141 |       secondword = (j * bit + bit - 1)/32
142 |       if(secondword > oldword):
143 |         print("  w{0} = _mm512_loadu_si512 (compressed + {1});".format(secondword%2,secondword))
144 |         oldword = secondword
145 |       firstshift = (j*bit) % 32
146 |       firstshiftstr = "_mm512_srli_epi32( w{0} , "+str(firstshift)+") "
147 |       if(firstshift == 0):
148 |           firstshiftstr =" w{0} " # no need
149 |       wfirst = firstshiftstr.format(firstword%2)
150 |       if( firstword == secondword):
151 |           if(firstshift + bit <> 32):
152 |             wfirst  = maskstr.format(wfirst)
153 |           print("  _mm512_storeu_si512(out + {0}, {1});".format(j,wfirst))
154 |       else:
155 |           secondshift = (32-firstshift)
156 |           wsecond = "_mm512_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
157 |           wfirstorsecond = " _mm512_or_si512 ({0},{1}) ".format(wfirst,wsecond)
158 |           wfirstorsecond = maskstr.format(wfirstorsecond)
159 |           print("  _mm512_storeu_si512(out + {0},\n    {1});".format(j,wfirstorsecond))
160 |     print("}");
161 |     print("")
162 | 
163 | 
164 | print("static avx512packblockfnc avx512funcPackArr[] = {")  # dispatch table: entry i is the unmasked pack routine for bit width i (0..32)
165 | for bit in range(0,32):
166 |   print("&avx512packblock{0},".format(bit))
167 | print("&avx512packblock32")  # last entry is printed separately so it carries no trailing comma
168 | print("};")
169 | 
170 | print("static avx512packblockfnc avx512funcPackMaskArr[] = {")  # dispatch table: entry i is the masked pack routine for bit width i
171 | for bit in range(0,32):
172 |   print("&avx512packblockmask{0},".format(bit))
173 | print("&avx512packblockmask32")
174 | print("};")
175 | 
176 | 
177 | print("static avx512unpackblockfnc avx512funcUnpackArr[] = {")  # dispatch table: entry i is the unpack routine for bit width i
178 | for bit in range(0,32):
179 |   print("&avx512unpackblock{0},".format(bit))
180 | print("&avx512unpackblock32")
181 | print("};")
182 | print("/**  avx512packing **/")  # closing banner
183 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | The SIMDComp library
  2 | ====================
  3 | [![Build Status](https://img.shields.io/appveyor/ci/lemire/simdcomp.svg)](https://ci.appveyor.com/project/lemire/simdcomp)
  4 | 
  5 | 
  6 | A simple C library for compressing lists of integers using binary packing and SIMD instructions.
  7 | The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers.
  8 | 
  9 | This library can decode at least 4 billion compressed integers per second on most
 10 | desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s.
 11 | This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4.
 12 | 
 13 | On a Skylake Intel processor, it can decode integers at a rate of 0.3 cycles per integer,
 14 | which can easily translate into more than 8 billion decoded integers per second.
 15 | 
 16 | This library is part of the [Awesome C](https://github.com/kozross/awesome-c) list of C resources.
 17 | 
 18 | Contributors: Daniel Lemire, Nathan Kurz, Christoph Rupp, Anatol Belski, Nick White and others
 19 | 
 20 | What is it for?
 21 | -------------
 22 | 
 23 | This is a low-level library for fast integer compression. By design it does not define a compressed
 24 | format. It is up to the (sophisticated) user to create a compressed format.
 25 | 
 26 | It is used by:
 27 | - [upscaledb](https://github.com/cruppstahl/upscaledb)
 28 | - [EventQL](https://github.com/eventql/eventql)
 29 | - [ManticoreSearch](https://manticoresearch.com)
 30 | 
 31 | 
 32 | 
 33 | Requirements
 34 | -------------
 35 | 
 36 | - Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.)
 37 | - It is possible to build the core part of the code if your processor supports SSE2 (Pentium 4 or better)
 38 | - C99 compliant compiler (GCC is assumed)
 39 | - A Linux-like distribution is assumed by the makefile
 40 | 
 41 | For a plain C version that does not use SIMD instructions, see https://github.com/lemire/LittleIntPacker
 42 | 
 43 | Usage
 44 | -------
 45 | 
 46 | Compression works over blocks of 128 integers.
 47 | 
 48 | For a complete working example, see example.c (you can build it and
 49 | run it with "make example; ./example").
 50 | 
 51 | 
 52 | 
 53 | 1) Lists of integers in random order.
 54 | 
 55 | ```C            
 56 | const uint32_t b = maxbits(datain);// computes bit width
 57 | simdpackwithoutmask(datain, buffer, b);//compressed to buffer, compressing 128 32-bit integers down to b*16 bytes (b 128-bit words)
 58 | simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer
 59 | ```
 60 | 
 61 | While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b.
 62 | 
 63 | 2) Sorted lists of integers.
 64 | 
 65 | We use differential coding: we store the difference between successive integers. For this purpose, we need an initial value (called offset).
 66 | 
 67 | ```C            
 68 | uint32_t offset = 0;
 69 | uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width
 70 | simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressing 128 32-bit integers down to b1*16 bytes (b1 128-bit words)
 71 | simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed
 72 | ```
 73 | 
 74 | General example for arrays of arbitrary length:
 75 | ```C
 76 | int compress_decompress_demo() {
 77 |   size_t k, N = 9999;
 78 |   __m128i * endofbuf;
 79 |   uint32_t * datain = malloc(N * sizeof(uint32_t));
 80 |   uint8_t * buffer;
 81 |   uint32_t * backbuffer = malloc(N * sizeof(uint32_t));
 82 |   uint32_t b;
 83 | 
 84 |   for (k = 0; k < N; ++k){        /* start with k=0, not k=1! */
 85 |     datain[k] = k;
 86 |   }
 87 | 
 88 |   b = maxbits_length(datain, N);
 89 |   buffer = malloc(simdpack_compressedbytes(N,b)); // allocate just enough memory
 90 |   endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b);
 91 |   /* compressed data is stored between buffer and endofbuf using (endofbuf-buffer)*sizeof(__m128i) bytes */
 92 |   /* would be safe to do : buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); */
 93 |   simdunpack_length((const __m128i *)buffer, N, backbuffer, b);
 94 | 
 95 |   for (k = 0; k < N; ++k){
 96 |     if(datain[k] != backbuffer[k]) {
 97 |       printf("bug\n");
 98 |       return -1;
 99 |     }
100 |   }
101 |   return 0;
102 | }
103 | ```
104 | 
105 | 
106 | 3) Frame-of-Reference 
107 | 
108 | We also have frame-of-reference (FOR) functions (see simdfor.h header). They work like the bit packing
109 | routines, but do not use differential coding so they allow faster search in some cases, at the expense
110 | of compression.
111 | 
112 | Setup
113 | ---------
114 | 
115 | 
116 | make
117 | make test
118 | 
119 | and if you are daring:
120 | 
121 | make install
122 | 
123 | Go
124 | --------
125 | 
126 | If you are a Go user, there is a "go" folder where you will find a simple demo.
127 | 
128 | Other libraries
129 | ----------------
130 | * Fast integer compression in Go: https://github.com/ronanh/intcomp
131 | * Fast Bitpacking algorithms: Rust port of simdcomp https://github.com/quickwit-oss/bitpacking
132 | * SIMDCompressionAndIntersection: A C++ library to compress and intersect sorted lists of integers using SIMD instructions https://github.com/lemire/SIMDCompressionAndIntersection
133 | * The FastPFOR C++ library : Fast integer compression https://github.com/lemire/FastPFor
134 | * High-performance dictionary coding https://github.com/lemire/dictionary
135 | * LittleIntPacker: C library to pack and unpack short arrays of integers as fast as possible https://github.com/lemire/LittleIntPacker
136 | * StreamVByte: Fast integer compression in C using the StreamVByte codec https://github.com/lemire/streamvbyte
137 | * MaskedVByte: Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
138 | * CSharpFastPFOR: A C#  integer compression library  https://github.com/Genbox/CSharpFastPFOR
139 | * JavaFastPFOR: A java integer compression library https://github.com/lemire/JavaFastPFOR
140 | * Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding
141 | * FrameOfReference is a C++ library dedicated to frame-of-reference (FOR) compression: https://github.com/lemire/FrameOfReference
142 | * libvbyte: A fast implementation for varbyte 32bit/64bit integer compression https://github.com/cruppstahl/libvbyte
143 | * TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
144 | * Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
145 | 
146 | 
147 | Other programming languages
148 | -------------
149 | 
150 | - [There is a wrapper for Julia](https://github.com/mcovalt/TinyInt.jl).
151 | - [There is a Rust port](https://github.com/tantivy-search/bitpacking/).
152 | 
153 | References
154 | ------------
155 | * Daniel Lemire, Nathan Kurz, Christoph Rupp, Stream VByte: Faster Byte-Oriented Integer Compression, Information Processing Letters 130, February 2018, Pages 1-6. https://arxiv.org/abs/1709.08990
156 | * Jianguo Wang, Chunbin Lin, Yannis Papakonstantinou, Steven Swanson, An Experimental Study of Bitmap Compression vs. Inverted List Compression, SIGMOD 2017 http://db.ucsd.edu/wp-content/uploads/2017/03/sidm338-wangA.pdf
157 | * P. Damme, D. Habich, J. Hildebrandt, W. Lehner, Lightweight Data Compression Algorithms: An Experimental Survey (Experiments and Analyses), EDBT 2017 http://openproceedings.org/2017/conf/edbt/paper-146.pdf
158 | * P. Damme, D. Habich, J. Hildebrandt, W. Lehner, Insights into the Comparative Evaluation of Lightweight Data Compression Algorithms, EDBT 2017 http://openproceedings.org/2017/conf/edbt/paper-414.pdf
159 | * Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience 46 (6) 2016. http://arxiv.org/abs/1401.6399
160 | * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015.  http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract
161 | * Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387
162 | * Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916
163 | * T. D. Wu, Bitpacking techniques for indexing genomes: I. Hash tables, Algorithms for Molecular Biology 11 (5), 2016. http://almob.biomedcentral.com/articles/10.1186/s13015-016-0069-5
164 | 


--------------------------------------------------------------------------------
/src/simdcomputil.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This code is released under a BSD License.
  3 |  */
  4 | 
  5 | #include "simdcomputil.h"
  6 | #ifdef __SSE4_1__
  7 | #include <smmintrin.h>
  8 | #endif
  9 | #include <assert.h>
 10 | /* Delta(curr, prev): each 32-bit lane of curr minus the lane before it; lane 0 uses the last lane of prev. Note that curr is expanded twice. */
 11 | #define Delta(curr, prev)                                                      \
 12 |   _mm_sub_epi32(                                                               \
 13 |       curr, _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12)))
 14 | 
 15 | /* returns the bit width of v: 0 when v is 0, otherwise one plus the index of its highest set bit (so 1..32) */
 16 | uint32_t bits(const uint32_t v) {
 17 | #ifdef _MSC_VER
 18 |   unsigned long answer;
 19 |   if (v == 0) { /* handled before the scan, which is only called for non-zero v */
 20 |     return 0;
 21 |   }
 22 |   _BitScanReverse(&answer, v); /* answer = index of the highest set bit */
 23 |   return answer + 1;
 24 | #else
 25 |   return v == 0 ? 0
 26 |                 : 32 - __builtin_clz(
 27 |                            v); /* assume GCC-like compiler if not microsoft; zero is tested first so clz never sees 0 */
 28 | #endif
 29 | }
 30 | 
 31 | static uint32_t maxbitas32int(const __m128i accumulator) {
 32 |   const __m128i _tmp1 = _mm_or_si128(
 33 |       _mm_srli_si128(accumulator, 8),
 34 |       accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/
 35 |   const __m128i _tmp2 =
 36 |       _mm_or_si128(_mm_srli_si128(_tmp1, 4),
 37 |                    _tmp1); /*  (A,B,C xor A,D xor B) xor  (0,0,0,C xor A)*/
 38 |   uint32_t ans = _mm_cvtsi128_si32(_tmp2);
 39 |   return bits(ans);
 40 | }
 41 | 
 42 | SIMDCOMP_PURE uint32_t maxbits(const uint32_t *begin) {
 43 |   const __m128i *pin = (const __m128i *)(begin);
 44 |   __m128i accumulator = _mm_loadu_si128(pin);
 45 |   uint32_t k = 1;
 46 |   for (; 4 * k < SIMDBlockSize; ++k) {
 47 |     __m128i newvec = _mm_loadu_si128(pin + k);
 48 |     accumulator = _mm_or_si128(accumulator, newvec);
 49 |   }
 50 |   return maxbitas32int(accumulator);
 51 | }
 52 | static uint32_t orasint(const __m128i accumulator) {
 53 |   const __m128i _tmp1 = _mm_or_si128(
 54 |       _mm_srli_si128(accumulator, 8),
 55 |       accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/
 56 |   const __m128i _tmp2 =
 57 |       _mm_or_si128(_mm_srli_si128(_tmp1, 4),
 58 |                    _tmp1); /*  (A,B,C xor A,D xor B) xor  (0,0,0,C xor A)*/
 59 |   return _mm_cvtsi128_si32(_tmp2);
 60 | }
 61 | 
 62 | #ifdef __SSE4_1__
 63 | 
 64 | static uint32_t minasint(const __m128i accumulator) {
 65 |   const __m128i _tmp1 = _mm_min_epu32(
 66 |       _mm_srli_si128(accumulator, 8),
 67 |       accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/
 68 |   const __m128i _tmp2 =
 69 |       _mm_min_epu32(_mm_srli_si128(_tmp1, 4),
 70 |                     _tmp1); /*  (A,B,C xor A,D xor B) xor  (0,0,0,C xor A)*/
 71 |   return _mm_cvtsi128_si32(_tmp2);
 72 | }
 73 | 
 74 | static uint32_t maxasint(const __m128i accumulator) {
 75 |   const __m128i _tmp1 = _mm_max_epu32(
 76 |       _mm_srli_si128(accumulator, 8),
 77 |       accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/
 78 |   const __m128i _tmp2 =
 79 |       _mm_max_epu32(_mm_srli_si128(_tmp1, 4),
 80 |                     _tmp1); /*  (A,B,C xor A,D xor B) xor  (0,0,0,C xor A)*/
 81 |   return _mm_cvtsi128_si32(_tmp2);
 82 | }
 83 | 
 84 | uint32_t simdmin(const uint32_t *in) {
 85 |   const __m128i *pin = (const __m128i *)(in);
 86 |   __m128i accumulator = _mm_loadu_si128(pin);
 87 |   uint32_t k = 1;
 88 |   for (; 4 * k < SIMDBlockSize; ++k) {
 89 |     __m128i newvec = _mm_loadu_si128(pin + k);
 90 |     accumulator = _mm_min_epu32(accumulator, newvec);
 91 |   }
 92 |   return minasint(accumulator);
 93 | }
 94 | 
 95 | void simdmaxmin(const uint32_t *in, uint32_t *getmin, uint32_t *getmax) {
 96 |   const __m128i *pin = (const __m128i *)(in);
 97 |   __m128i minaccumulator = _mm_loadu_si128(pin);
 98 |   __m128i maxaccumulator = minaccumulator;
 99 |   uint32_t k = 1;
100 |   for (; 4 * k < SIMDBlockSize; ++k) {
101 |     __m128i newvec = _mm_loadu_si128(pin + k);
102 |     minaccumulator = _mm_min_epu32(minaccumulator, newvec);
103 |     maxaccumulator = _mm_max_epu32(maxaccumulator, newvec);
104 |   }
105 |   *getmin = minasint(minaccumulator);
106 |   *getmax = maxasint(maxaccumulator);
107 | }
108 | 
/*
 * Unsigned minimum of the first length integers at in. Whole groups of four
 * are reduced with SSE4.1 vector minima; the remaining (length % 4) values
 * are handled one by one. Returns 0xFFFFFFFF when length is 0.
 */
uint32_t simdmin_length(const uint32_t *in, uint32_t length) {
  const uint32_t nvec = length / 4;
  uint32_t smallest = 0xFFFFFFFF;
  uint32_t i;
  if (nvec > 0) {
    const __m128i *vecs = (const __m128i *)in;
    __m128i lanes = _mm_loadu_si128(vecs);
    for (i = 1; i < nvec; ++i) {
      lanes = _mm_min_epu32(lanes, _mm_loadu_si128(vecs + i));
    }
    smallest = minasint(lanes);
  }
  /* scalar tail: elements past the last complete vector */
  for (i = 4 * nvec; i < length; ++i) {
    if (in[i] < smallest) {
      smallest = in[i];
    }
  }
  return smallest;
}
129 | 
/*
 * Computes the unsigned minimum and maximum of the first length integers at
 * in and stores them in *getmin and *getmax. Whole groups of four are reduced
 * with SSE4.1 vector operations; the remaining (length % 4) values are
 * handled one by one. When length is 0 the outputs are left at 0xFFFFFFFF
 * (min) and 0 (max).
 */
void simdmaxmin_length(const uint32_t *in, uint32_t length, uint32_t *getmin,
                       uint32_t *getmax) {
  const uint32_t nvec = length / 4;
  uint32_t i;
  *getmin = 0xFFFFFFFF;
  *getmax = 0;
  if (nvec > 0) {
    const __m128i *vecs = (const __m128i *)in;
    __m128i lowest = _mm_loadu_si128(vecs);
    __m128i highest = lowest;
    for (i = 1; i < nvec; ++i) {
      const __m128i current = _mm_loadu_si128(vecs + i);
      lowest = _mm_min_epu32(lowest, current);
      highest = _mm_max_epu32(highest, current);
    }
    *getmin = minasint(lowest);
    *getmax = maxasint(highest);
  }
  /* scalar tail: elements past the last complete vector */
  for (i = 4 * nvec; i < length; ++i) {
    if (in[i] < *getmin) {
      *getmin = in[i];
    }
    if (in[i] > *getmax) {
      *getmax = in[i];
    }
  }
}
157 | 
158 | #endif
159 | 
160 | SIMDCOMP_PURE uint32_t maxbits_length(const uint32_t *in, uint32_t length) {
161 |   uint32_t k;
162 |   uint32_t lengthdividedby4 = length / 4;
163 |   uint32_t offset = lengthdividedby4 * 4;
164 |   uint32_t bigxor = 0;
165 |   if (lengthdividedby4 > 0) {
166 |     const __m128i *pin = (const __m128i *)(in);
167 |     __m128i accumulator = _mm_loadu_si128(pin);
168 |     k = 1;
169 |     for (; 4 * k < 4 * lengthdividedby4; ++k) {
170 |       __m128i newvec = _mm_loadu_si128(pin + k);
171 |       accumulator = _mm_or_si128(accumulator, newvec);
172 |     }
173 |     bigxor = orasint(accumulator);
174 |   }
175 |   for (k = offset; k < length; ++k)
176 |     bigxor |= in[k];
177 |   return bits(bigxor);
178 | }
179 | 
180 | /* maxbit over 128 integers (SIMDBlockSize) with provided initial value */
181 | uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t *in) {
182 |   __m128i initoffset = _mm_set1_epi32(initvalue);
183 |   const __m128i *pin = (const __m128i *)(in);
184 |   __m128i newvec = _mm_loadu_si128(pin);
185 |   __m128i accumulator = Delta(newvec, initoffset);
186 |   __m128i oldvec = newvec;
187 |   uint32_t k = 1;
188 |   for (; 4 * k < SIMDBlockSize; ++k) {
189 |     newvec = _mm_loadu_si128(pin + k);
190 |     accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec));
191 |     oldvec = newvec;
192 |   }
193 |   initoffset = oldvec;
194 |   return maxbitas32int(accumulator);
195 | }
196 | 
197 | /* maxbit over |length| integers with provided initial value */
198 | uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t *in,
199 |                               uint32_t length) {
200 |   __m128i newvec;
201 |   __m128i oldvec;
202 |   __m128i initoffset;
203 |   __m128i accumulator;
204 |   const __m128i *pin;
205 |   uint32_t tmparray[4];
206 |   uint32_t k = 1;
207 |   uint32_t acc;
208 | 
209 |   assert(length > 0);
210 | 
211 |   pin = (const __m128i *)(in);
212 |   initoffset = _mm_set1_epi32(initvalue);
213 |   switch (length) {
214 |   case 1:
215 |     newvec = _mm_set1_epi32(in[0]);
216 |     break;
217 |   case 2:
218 |     newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]);
219 |     break;
220 |   case 3:
221 |     newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]);
222 |     break;
223 |   default:
224 |     newvec = _mm_loadu_si128(pin);
225 |     break;
226 |   }
227 |   accumulator = Delta(newvec, initoffset);
228 |   oldvec = newvec;
229 | 
230 |   /* process 4 integers and build an accumulator */
231 |   while (k * 4 + 4 <= length) {
232 |     newvec = _mm_loadu_si128(pin + k);
233 |     accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec));
234 |     oldvec = newvec;
235 |     k++;
236 |   }
237 | 
238 |   /* extract the accumulator as an integer */
239 |   _mm_storeu_si128((__m128i *)(tmparray), accumulator);
240 |   acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3];
241 | 
242 |   /* now process the remaining integers */
243 |   for (k *= 4; k < length; k++)
244 |     acc |= in[k] - (k == 0 ? initvalue : in[k - 1]);
245 | 
246 |   /* return the number of bits */
247 |   return bits(acc);
248 | }
249 | 


--------------------------------------------------------------------------------
/benchmarks/bitpackingbenchmark.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <assert.h>
  3 | 
  4 | #include "simdcomp.h"
  5 | 
  6 | 
/*
 * RDTSC_START(cycles): reads the time-stamp counter at the start of a timed
 * region and stores the 64-bit value in the lvalue `cycles`.
 * The asm executes cpuid and then rdtsc, and copies edx/eax into the two
 * 32-bit halves that are combined below. rax, rbx, rcx and rdx are declared
 * clobbered.
 * NOTE(review): cpuid is presumably there as a serializing barrier so that
 * earlier instructions retire before the counter is read -- confirm.
 */
#define RDTSC_START(cycles)                                                   \
    do {                                                                      \
        register unsigned cyc_high, cyc_low;                                  \
        __asm volatile(                                                       \
            "cpuid\n\t"                                                       \
            "rdtsc\n\t"                                                       \
            "mov %%edx, %0\n\t"                                               \
            "mov %%eax, %1\n\t"                                               \
            : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                      \
    } while (0)
 18 | 
/*
 * RDTSC_FINAL(cycles): reads the time-stamp counter at the end of a timed
 * region and stores the 64-bit value in the lvalue `cycles`.
 * The asm executes rdtscp, copies edx/eax into the two 32-bit halves that
 * are combined below, and only then executes cpuid. rax, rbx, rcx and rdx
 * are declared clobbered.
 * NOTE(review): the trailing cpuid is presumably a barrier keeping later
 * instructions from starting before the counter is read -- confirm.
 */
#define RDTSC_FINAL(cycles)                                                   \
    do {                                                                      \
        register unsigned cyc_high, cyc_low;                                  \
        __asm volatile(                                                       \
            "rdtscp\n\t"                                                      \
            "mov %%edx, %0\n\t"                                               \
            "mov %%eax, %1\n\t"                                               \
            "cpuid\n\t"                                                       \
            : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \
        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                      \
    } while (0)
 30 | 
 31 | 
 32 | 
 33 | 
 34 | uint32_t * get_random_array_from_bit_width(uint32_t length, uint32_t bit) {
 35 |     uint32_t * answer = malloc(sizeof(uint32_t) * length);
 36 |     uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1);
 37 |     uint32_t i;
 38 |     for(i = 0; i < length; ++i) {
 39 |         answer[i] = rand() & mask;
 40 |     }
 41 |     return answer;
 42 | }
 43 | 
 44 | uint32_t * get_random_array_from_bit_width_d1(uint32_t length, uint32_t bit) {
 45 |     uint32_t * answer = malloc(sizeof(uint32_t) * length);
 46 |     uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1);
 47 |     uint32_t i;
 48 |     answer[0] = rand() & mask;
 49 |     for(i = 1; i < length; ++i) {
 50 |         answer[i] = answer[i-1] + (rand() & mask);
 51 |     }
 52 |     return answer;
 53 | }
 54 | 
 55 | 
 56 | void demo128() {
 57 |     const uint32_t length = 128;
 58 |     uint32_t bit;
 59 |     printf("# --- %s\n", __func__);
 60 |     printf("# compressing %d integers\n",length);
 61 |     printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
 62 |     for(bit = 1; bit <= 32; ++bit) {
 63 |         uint32_t i;
 64 | 
 65 |         uint32_t * data = get_random_array_from_bit_width(length, bit);
 66 |         __m128i * buffer = malloc(length * sizeof(uint32_t));
 67 |         uint32_t * backdata = malloc(length * sizeof(uint32_t));
 68 |         uint32_t repeat = 500;
 69 |         uint64_t min_diff;
 70 |         printf("%d\t",bit);
 71 |         min_diff = (uint64_t)-1;
 72 |         for (i = 0; i < repeat; i++) {
 73 |             uint64_t cycles_start, cycles_final, cycles_diff;
 74 |             __asm volatile("" ::: /* pretend to clobber */ "memory");
 75 |             RDTSC_START(cycles_start);
 76 |             simdpackwithoutmask(data,buffer, bit);
 77 |             RDTSC_FINAL(cycles_final);
 78 |             cycles_diff = (cycles_final - cycles_start);
 79 |             if (cycles_diff < min_diff) min_diff = cycles_diff;
 80 |         }
 81 |         printf("%.2f\t",min_diff*1.0/length);
 82 |         min_diff = (uint64_t)-1;
 83 |         for (i = 0; i < repeat; i++) {
 84 |             uint64_t cycles_start, cycles_final, cycles_diff;
 85 |             __asm volatile("" ::: /* pretend to clobber */ "memory");
 86 |             RDTSC_START(cycles_start);
 87 |             simdunpack(buffer, backdata,bit);
 88 |             RDTSC_FINAL(cycles_final);
 89 |             cycles_diff = (cycles_final - cycles_start);
 90 |             if (cycles_diff < min_diff) min_diff = cycles_diff;
 91 |         }
 92 |         printf("%.2f\t",min_diff*1.0/length);
 93 | 
 94 |         free(data);
 95 |         free(buffer);
 96 |         free(backdata);
 97 |         printf("\n");
 98 |     }
 99 |     printf("\n\n"); /* two blank lines are required by gnuplot */
100 | }
101 | 
/*
 * Benchmarks simdpackwithoutmaskd1 / simdunpackd1 (both called with an
 * initial value of 0) on an array of 128 integers whose successive
 * differences fit in `bit` bits, for every bit width from 1 to 32. For each
 * width one line is printed: the width, then the best (minimum over 500
 * runs) cycle count per integer for packing and for unpacking, tab-separated.
 */
void demo128_d1() {
    const uint32_t length = 128;
    uint32_t bit;
    printf("# --- %s\n", __func__);
    printf("# compressing %u integers\n", (unsigned)length);
    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
    for(bit = 1; bit <= 32; ++bit) {
        uint32_t i;

        uint32_t * data = get_random_array_from_bit_width_d1(length, bit);
        __m128i * buffer = malloc(length * sizeof(uint32_t));
        uint32_t * backdata = malloc(length * sizeof(uint32_t));
        uint32_t repeat = 500;
        uint64_t min_diff;
        if (data == NULL || buffer == NULL || backdata == NULL) {
            fprintf(stderr, "%s: allocation failure\n", __func__);
            free(data);
            free(buffer);
            free(backdata);
            return;
        }
        printf("%u\t", (unsigned)bit);
        /* keep the smallest observed cycle count to filter out noise */
        min_diff = (uint64_t)-1;
        for (i = 0; i < repeat; i++) {
            uint64_t cycles_start, cycles_final, cycles_diff;
            __asm volatile("" ::: /* pretend to clobber */ "memory");
            RDTSC_START(cycles_start);
            simdpackwithoutmaskd1(0,data,buffer, bit);
            RDTSC_FINAL(cycles_final);
            cycles_diff = (cycles_final - cycles_start);
            if (cycles_diff < min_diff) min_diff = cycles_diff;
        }
        printf("%.2f\t",min_diff*1.0/length);
        min_diff = (uint64_t)-1;
        for (i = 0; i < repeat; i++) {
            uint64_t cycles_start, cycles_final, cycles_diff;
            __asm volatile("" ::: /* pretend to clobber */ "memory");
            RDTSC_START(cycles_start);
            simdunpackd1(0,buffer, backdata,bit);
            RDTSC_FINAL(cycles_final);
            cycles_diff = (cycles_final - cycles_start);
            if (cycles_diff < min_diff) min_diff = cycles_diff;
        }
        printf("%.2f\t",min_diff*1.0/length);

        free(data);
        free(buffer);
        free(backdata);
        printf("\n");
    }
    printf("\n\n"); /* two blank lines are required by gnuplot */
}
147 | 
148 | #ifdef __AVX2__
/*
 * Benchmarks avxpackwithoutmask / avxunpack on an array of 256 random
 * integers for every bit width from 1 to 32. For each width one line is
 * printed: the width, then the best (minimum over 500 runs) cycle count per
 * integer for packing and for unpacking, tab-separated.
 */
void demo256() {
    const uint32_t length = 256;
    uint32_t bit;
    printf("# --- %s\n", __func__);
    printf("# compressing %u integers\n", (unsigned)length);
    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
    for(bit = 1; bit <= 32; ++bit) {
        uint32_t i;

        uint32_t * data = get_random_array_from_bit_width(length, bit);
        __m256i * buffer = malloc(length * sizeof(uint32_t));
        uint32_t * backdata = malloc(length * sizeof(uint32_t));
        uint32_t repeat = 500;
        uint64_t min_diff;
        if (data == NULL || buffer == NULL || backdata == NULL) {
            fprintf(stderr, "%s: allocation failure\n", __func__);
            free(data);
            free(buffer);
            free(backdata);
            return;
        }
        printf("%u\t", (unsigned)bit);
        /* keep the smallest observed cycle count to filter out noise */
        min_diff = (uint64_t)-1;
        for (i = 0; i < repeat; i++) {
            uint64_t cycles_start, cycles_final, cycles_diff;
            __asm volatile("" ::: /* pretend to clobber */ "memory");
            RDTSC_START(cycles_start);
            avxpackwithoutmask(data,buffer, bit);
            RDTSC_FINAL(cycles_final);
            cycles_diff = (cycles_final - cycles_start);
            if (cycles_diff < min_diff) min_diff = cycles_diff;
        }
        printf("%.2f\t",min_diff*1.0/length);
        min_diff = (uint64_t)-1;
        for (i = 0; i < repeat; i++) {
            uint64_t cycles_start, cycles_final, cycles_diff;
            __asm volatile("" ::: /* pretend to clobber */ "memory");
            RDTSC_START(cycles_start);
            avxunpack(buffer, backdata,bit);
            RDTSC_FINAL(cycles_final);
            cycles_diff = (cycles_final - cycles_start);
            if (cycles_diff < min_diff) min_diff = cycles_diff;
        }
        printf("%.2f\t",min_diff*1.0/length);

        free(data);
        free(buffer);
        free(backdata);
        printf("\n");
    }
    printf("\n\n"); /* two blank lines are required by gnuplot */
}
194 | #endif /* avx 2 */
195 | 
#ifdef __AVX512F__
/*
 * Benchmarks avx512packwithoutmask / avx512unpack on an array of 512 random
 * integers for every bit width from 1 to 32. For each width one line is
 * printed: the width, then the best (minimum over 500 runs) cycle count per
 * integer for packing and for unpacking, tab-separated. After timing, the
 * unpacked data is asserted to be equal to the input.
 */
void demo512() {
    const uint32_t length = 512;
    uint32_t bit;
    size_t z;
    printf("# --- %s\n", __func__);
    printf("# compressing %u integers\n", (unsigned)length);
    printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n");
    for(bit = 1; bit <= 32; ++bit) {
        uint32_t i;

        uint32_t * data = get_random_array_from_bit_width(length, bit);
        __m512i * buffer = malloc(length * sizeof(uint32_t));
        uint32_t * backdata = malloc(length * sizeof(uint32_t));
        uint32_t repeat = 500;
        uint64_t min_diff;
        if (data == NULL || buffer == NULL || backdata == NULL) {
            fprintf(stderr, "%s: allocation failure\n", __func__);
            free(data);
            free(buffer);
            free(backdata);
            return;
        }
        printf("%u\t", (unsigned)bit);
        /* keep the smallest observed cycle count to filter out noise */
        min_diff = (uint64_t)-1;
        for (i = 0; i < repeat; i++) {
            uint64_t cycles_start, cycles_final, cycles_diff;
            __asm volatile("" ::: /* pretend to clobber */ "memory");
            RDTSC_START(cycles_start);
            avx512packwithoutmask(data,buffer, bit);
            RDTSC_FINAL(cycles_final);
            cycles_diff = (cycles_final - cycles_start);
            if (cycles_diff < min_diff) min_diff = cycles_diff;
        }
        printf("%.2f\t",min_diff*1.0/length);
        min_diff = (uint64_t)-1;
        for (i = 0; i < repeat; i++) {
            uint64_t cycles_start, cycles_final, cycles_diff;
            __asm volatile("" ::: /* pretend to clobber */ "memory");
            RDTSC_START(cycles_start);
            avx512unpack(buffer, backdata,bit);
            RDTSC_FINAL(cycles_final);
            cycles_diff = (cycles_final - cycles_start);
            if (cycles_diff < min_diff) min_diff = cycles_diff;
        }
        printf("%.2f\t",min_diff*1.0/length);
        /* sanity check: the round trip must reproduce the input */
        for(z = 0 ; z < length ; ++z) assert(backdata[z] == data[z]);
        free(data);
        free(buffer);
        free(backdata);
        printf("\n");
    }
    printf("\n\n"); /* two blank lines are required by gnuplot */
}
#endif /* avx 512 */
244 | 
245 | 
246 | 
/*
 * Runs the 128-integer benchmarks, then the AVX2 and AVX-512 ones when the
 * corresponding instruction sets are enabled at compile time.
 */
int main() {
    demo128();
    demo128_d1();
#ifdef __AVX2__
    demo256();
#endif
#ifdef __AVX512F__
    demo512();
#endif
    return 0;
}
260 | 


--------------------------------------------------------------------------------
/tests/unit.c:
--------------------------------------------------------------------------------
   1 | /**
   2 |  * This code is released under a BSD License.
   3 |  */
   4 | #include "simdcomp.h"
   5 | #include <assert.h>
   6 | #include <stdio.h>
   7 | #include <stdlib.h>
   8 | 
   9 | int issue21() {
  10 |   uint32_t bw, sz;
  11 |   printf("issue21");
  12 |   fflush(stdout);
  13 |   for (bw = 0; bw < 30; bw++) {
  14 |     printf(".");
  15 |     fflush(stdout);
  16 |     for (sz = 1; sz < 4096; sz++) {
  17 | 
  18 |       size_t i;
  19 |       uint32_t *in = malloc(sz * sizeof(uint32_t));
  20 |       uint32_t *out = malloc(sz * sizeof(uint32_t));
  21 |       for (i = 0; i < sz; ++i)
  22 |         in[i] = (1 << bw) - 1;
  23 |       uint32_t b = maxbits_length(in, sz);
  24 |       uint8_t *buf = malloc(simdpack_compressedbytes(sz, b));
  25 |       __m128i *end = simdpack_length(in, sz, (__m128i *)buf, b);
  26 |       if ((uint8_t *)end - buf != simdpack_compressedbytes(sz, b)) {
  27 |         printf("bad mem usage\n");
  28 |         return -1;
  29 |       }
  30 |       simdunpack_length((const __m128i *)buf, sz, out, b);
  31 |       for (i = 0; i < sz; ++i) {
  32 |         if (in[i] != out[i]) {
  33 |           printf("bug\n");
  34 |           return -1;
  35 |         }
  36 |       }
  37 |       free(in);
  38 |       free(out);
  39 |       free(buf);
  40 |     }
  41 |   }
  42 |   printf("\n");
  43 |   return 0;
  44 | }
  45 | 
  46 | int issue21FOR() {
  47 |   uint32_t bw, sz;
  48 |   size_t i, j;
  49 |   printf("issue21for");
  50 |   fflush(stdout);
  51 |   for (bw = 0; bw < 30; bw++) {
  52 |     printf(".");
  53 |     fflush(stdout);
  54 |     for (sz = 1; sz < 4096; sz++) {
  55 | 
  56 |       uint32_t *in = malloc(sz * sizeof(uint32_t));
  57 |       uint32_t *out = malloc(sz * sizeof(uint32_t));
  58 |       in[0] = 0;
  59 |       for (i = 1; i < sz; ++i)
  60 |         in[i] = (1 << bw) - 1;
  61 |       uint32_t b = maxbits_length(in, sz);
  62 |       uint8_t *buf = malloc(simdpackFOR_compressedbytes(sz, b));
  63 |       __m128i *end = simdpackFOR_length(0, in, sz, (__m128i *)buf, b);
  64 |       if ((uint8_t *)end - buf != simdpackFOR_compressedbytes(sz, b)) {
  65 |         printf("bad mem usage\n");
  66 |         return -1;
  67 |       }
  68 |       simdunpackFOR_length(0, (const __m128i *)buf, sz, out, b);
  69 |       for (i = 0; i < sz; ++i) {
  70 |         if (in[i] != out[i]) {
  71 |           for (j = 0; j < sz; ++j) {
  72 |             printf("%zu : %u %u \n", j, in[j], out[j]);
  73 |           }
  74 |           printf("bug\n");
  75 |           return -1;
  76 |         }
  77 |       }
  78 |       free(in);
  79 |       free(out);
  80 |       free(buf);
  81 |     }
  82 |   }
  83 |   printf("\n");
  84 |   return 0;
  85 | }
  86 | 
  87 | int testshortpack() {
  88 |   int bit;
  89 |   size_t i;
  90 |   size_t length;
  91 |   __m128i *bb;
  92 |   srand(0);
  93 |   printf("[%s]\n", __func__);
  94 |   for (bit = 0; bit < 32; ++bit) {
  95 |     printf(" %d ", bit);
  96 |     fflush(stdout);
  97 |     const size_t N = 128;
  98 |     uint32_t *data = malloc(N * sizeof(uint32_t));
  99 |     uint32_t *backdata = malloc(N * sizeof(uint32_t));
 100 |     uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
 101 | 
 102 |     for (i = 0; i < N; ++i) {
 103 |       data[i] = rand() & ((1U << bit) - 1);
 104 |     }
 105 |     for (length = 0; length <= N; ++length) {
 106 |       for (i = 0; i < N; ++i) {
 107 |         backdata[i] = 0;
 108 |       }
 109 |       bb = simdpack_shortlength(data, length, (__m128i *)buffer, bit);
 110 |       if ((bb - (__m128i *)buffer) * sizeof(__m128i) !=
 111 |           (unsigned)simdpack_compressedbytes(length, bit)) {
 112 |         printf("bug\n");
 113 |         return -1;
 114 |       }
 115 |       simdunpack_shortlength((__m128i *)buffer, length, backdata, bit);
 116 |       for (i = 0; i < length; ++i) {
 117 | 
 118 |         if (data[i] != backdata[i]) {
 119 |           printf("bug\n");
 120 |           return -1;
 121 |         }
 122 |       }
 123 |     }
 124 |     free(data);
 125 |     free(backdata);
 126 |     free(buffer);
 127 |   }
 128 |   return 0;
 129 | }
 130 | 
 131 | int testlongpack() {
 132 |   int bit;
 133 |   size_t i;
 134 |   size_t length;
 135 |   __m128i *bb;
 136 |   srand(0);
 137 |   printf("[%s]\n", __func__);
 138 |   for (bit = 0; bit < 32; ++bit) {
 139 |     const size_t N = 2048;
 140 |     uint32_t *data = malloc(N * sizeof(uint32_t));
 141 |     uint32_t *backdata = malloc(N * sizeof(uint32_t));
 142 |     uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
 143 | 
 144 |     for (i = 0; i < N; ++i) {
 145 |       data[i] = rand() & ((1U << bit) - 1);
 146 |     }
 147 |     for (length = 0; length <= N; ++length) {
 148 |       for (i = 0; i < N; ++i) {
 149 |         backdata[i] = 0;
 150 |       }
 151 |       bb = simdpack_length(data, length, (__m128i *)buffer, bit);
 152 |       if ((bb - (__m128i *)buffer) * sizeof(__m128i) !=
 153 |           (unsigned)simdpack_compressedbytes(length, bit)) {
 154 |         printf("bug\n");
 155 |         return -1;
 156 |       }
 157 |       simdunpack_length((__m128i *)buffer, length, backdata, bit);
 158 |       for (i = 0; i < length; ++i) {
 159 | 
 160 |         if (data[i] != backdata[i]) {
 161 |           printf("bug\n");
 162 |           return -1;
 163 |         }
 164 |       }
 165 |     }
 166 |     free(data);
 167 |     free(backdata);
 168 |     free(buffer);
 169 |   }
 170 |   return 0;
 171 | }
 172 | 
 173 | int testset() {
 174 |   int bit;
 175 |   size_t i;
 176 |   const size_t N = 128;
 177 |   uint32_t *data = malloc(N * sizeof(uint32_t));
 178 |   uint32_t *backdata = malloc(N * sizeof(uint32_t));
 179 |   uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
 180 | 
 181 |   srand(0);
 182 |   printf("[%s]\n", __func__);
 183 |   for (bit = 0; bit < 32; ++bit) {
 184 |     printf("simple set %d \n", bit);
 185 | 
 186 |     for (i = 0; i < N; ++i) {
 187 |       data[i] = rand() & ((1U << bit) - 1);
 188 |     }
 189 |     for (i = 0; i < N; ++i) {
 190 |       backdata[i] = 0;
 191 |     }
 192 |     simdpack(data, (__m128i *)buffer, bit);
 193 |     simdunpack((__m128i *)buffer, backdata, bit);
 194 |     for (i = 0; i < N; ++i) {
 195 |       if (data[i] != backdata[i]) {
 196 |         printf("bug\n");
 197 |         return -1;
 198 |       }
 199 |     }
 200 | 
 201 |     for (i = N; i > 0; i--) {
 202 |       simdfastset((__m128i *)buffer, bit, data[N - i], i - 1);
 203 |     }
 204 |     simdunpack((__m128i *)buffer, backdata, bit);
 205 |     for (i = 0; i < N; ++i) {
 206 |       if (data[i] != backdata[N - i - 1]) {
 207 |         printf("bug\n");
 208 |         return -1;
 209 |       }
 210 |     }
 211 |     simdpack(data, (__m128i *)buffer, bit);
 212 |     for (i = 1; i <= N; i++) {
 213 |       simdfastset((__m128i *)buffer, bit, data[i - 1], i - 1);
 214 |     }
 215 |     simdunpack((__m128i *)buffer, backdata, bit);
 216 |     for (i = 0; i < N; ++i) {
 217 |       if (data[i] != backdata[i]) {
 218 |         printf("bug\n");
 219 |         return -1;
 220 |       }
 221 |     }
 222 |   }
 223 |   free(data);
 224 |   free(backdata);
 225 |   free(buffer);
 226 | 
 227 |   return 0;
 228 | }
 229 | 
 230 | #ifdef __SSE4_1__
 231 | 
 232 | int testsetd1() {
 233 |   int bit;
 234 |   size_t i;
 235 |   uint32_t newvalue;
 236 |   const size_t N = 128;
 237 |   uint32_t *data = malloc(N * sizeof(uint32_t));
 238 |   uint32_t *datazeroes = malloc(N * sizeof(uint32_t));
 239 | 
 240 |   uint32_t *backdata = malloc(N * sizeof(uint32_t));
 241 |   uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
 242 | 
 243 |   srand(0);
 244 |   printf("[%s]\n", __func__);
 245 |   for (bit = 0; bit < 32; ++bit) {
 246 |     printf("simple set d1 %d \n", bit);
 247 |     data[0] = rand() & ((1U << bit) - 1);
 248 |     datazeroes[0] = 0;
 249 | 
 250 |     for (i = 1; i < N; ++i) {
 251 |       data[i] = data[i - 1] + (rand() & ((1U << bit) - 1));
 252 |       datazeroes[i] = 0;
 253 |     }
 254 |     for (i = 0; i < N; ++i) {
 255 |       backdata[i] = 0;
 256 |     }
 257 |     simdpackd1(0, datazeroes, (__m128i *)buffer, bit);
 258 |     for (i = 1; i <= N; i++) {
 259 |       simdfastsetd1(0, (__m128i *)buffer, bit, data[i - 1], i - 1);
 260 |       newvalue = simdselectd1(0, (const __m128i *)buffer, bit, i - 1);
 261 |       if (newvalue != data[i - 1]) {
 262 |         printf("bad set-select\n");
 263 |         return -1;
 264 |       }
 265 |     }
 266 |     simdunpackd1(0, (__m128i *)buffer, backdata, bit);
 267 |     for (i = 0; i < N; ++i) {
 268 |       if (data[i] != backdata[i])
 269 |         return -1;
 270 |     }
 271 |   }
 272 |   free(data);
 273 |   free(backdata);
 274 |   free(buffer);
 275 |   free(datazeroes);
 276 |   return 0;
 277 | }
 278 | #endif
 279 | 
 280 | int testsetFOR() {
 281 |   int bit;
 282 |   size_t i;
 283 |   uint32_t newvalue;
 284 |   const size_t N = 128;
 285 |   uint32_t *data = malloc(N * sizeof(uint32_t));
 286 |   uint32_t *datazeroes = malloc(N * sizeof(uint32_t));
 287 | 
 288 |   uint32_t *backdata = malloc(N * sizeof(uint32_t));
 289 |   uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
 290 | 
 291 |   srand(0);
 292 |   printf("[%s]\n", __func__);
 293 |   for (bit = 0; bit < 32; ++bit) {
 294 |     printf("simple set FOR %d \n", bit);
 295 |     for (i = 0; i < N; ++i) {
 296 |       data[i] = (rand() & ((1U << bit) - 1));
 297 |       datazeroes[i] = 0;
 298 |     }
 299 |     for (i = 0; i < N; ++i) {
 300 |       backdata[i] = 0;
 301 |     }
 302 |     simdpackFOR(0, datazeroes, (__m128i *)buffer, bit);
 303 |     for (i = 1; i <= N; i++) {
 304 |       simdfastsetFOR(0, (__m128i *)buffer, bit, data[i - 1], i - 1);
 305 |       newvalue = simdselectFOR(0, (const __m128i *)buffer, bit, i - 1);
 306 |       if (newvalue != data[i - 1]) {
 307 |         printf("bad set-select\n");
 308 |         return -1;
 309 |       }
 310 |     }
 311 |     simdunpackFOR(0, (__m128i *)buffer, backdata, bit);
 312 |     for (i = 0; i < N; ++i) {
 313 |       if (data[i] != backdata[i])
 314 |         return -1;
 315 |     }
 316 |   }
 317 |   free(data);
 318 |   free(backdata);
 319 |   free(buffer);
 320 |   free(datazeroes);
 321 |   return 0;
 322 | }
 323 | 
 324 | int testshortFORpack() {
 325 |   int bit;
 326 |   size_t i;
 327 |   __m128i *rb;
 328 |   size_t length;
 329 |   uint32_t offset = 7;
 330 |   srand(0);
 331 |   printf("[%s]\n", __func__);
 332 |   for (bit = 0; bit < 32; ++bit) {
 333 |     printf(" %d ", bit);
 334 |     fflush(stdout);
 335 |     const size_t N = 128;
 336 |     uint32_t *data = malloc(N * sizeof(uint32_t));
 337 |     uint32_t *backdata = malloc(N * sizeof(uint32_t));
 338 |     uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t));
 339 | 
 340 |     for (i = 0; i < N; ++i) {
 341 |       data[i] = (rand() & ((1U << bit) - 1)) + offset;
 342 |     }
 343 |     for (length = 0; length <= N; ++length) {
 344 |       for (i = 0; i < N; ++i) {
 345 |         backdata[i] = 0;
 346 |       }
 347 |       rb = simdpackFOR_length(offset, data, length, (__m128i *)buffer, bit);
 348 |       if (((rb - (__m128i *)buffer) * sizeof(__m128i)) !=
 349 |           (unsigned)simdpackFOR_compressedbytes(length, bit)) {
 350 |         return -1;
 351 |       }
 352 |       simdunpackFOR_length(offset, (__m128i *)buffer, length, backdata, bit);
 353 |       for (i = 0; i < length; ++i) {
 354 | 
 355 |         if (data[i] != backdata[i])
 356 |           return -1;
 357 |       }
 358 |     }
 359 |     free(data);
 360 |     free(backdata);
 361 |     free(buffer);
 362 |   }
 363 |   return 0;
 364 | }
 365 | 
 366 | #ifdef __AVX2__
 367 | 
 368 | int testbabyavx() {
 369 |   int bit;
 370 |   int trial;
 371 |   unsigned int i, j;
 372 |   const size_t N = AVXBlockSize;
 373 |   srand(0);
 374 |   printf("[%s]\n", __func__);
 375 |   printf("bit = ");
 376 |   for (bit = 0; bit < 32; ++bit) {
 377 |     printf(" %d ", bit);
 378 |     fflush(stdout);
 379 |     for (trial = 0; trial < 100; ++trial) {
 380 |       uint32_t *data = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t));
 381 |       uint32_t *backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t));
 382 |       __m256i *buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32);
 383 | 
 384 |       for (i = 0; i < N; ++i) {
 385 |         data[i] = rand() & ((uint32_t)(1U << bit) - 1);
 386 |       }
 387 |       for (i = 0; i < N; ++i) {
 388 |         backdata[i] = 0;
 389 |       }
 390 |       if (avxmaxbits(data) != maxbits_length(data, N)) {
 391 |         printf("avxmaxbits is buggy\n");
 392 |         return -1;
 393 |       }
 394 | 
 395 |       avxpackwithoutmask(data, buffer, bit);
 396 |       avxunpack(buffer, backdata, bit);
 397 |       for (i = 0; i < AVXBlockSize; ++i) {
 398 |         if (data[i] != backdata[i]) {
 399 |           printf("bug\n");
 400 |           for (j = 0; j < N; ++j) {
 401 |             if (data[j] != backdata[j]) {
 402 |               printf("data[%d]=%d v.s. backdata[%d]=%d\n", j, data[j], j,
 403 |                      backdata[j]);
 404 |             } else {
 405 |               printf("data[%d]=%d\n", j, data[j]);
 406 |             }
 407 |           }
 408 |           return -1;
 409 |         }
 410 |       }
 411 |       free(data);
 412 |       free(backdata);
 413 |       free(buffer);
 414 |     }
 415 |   }
 416 |   printf("\n");
 417 |   return 0;
 418 | }
 419 | 
 420 | int testavx2() {
 421 |   int N = 5000 * AVXBlockSize, gap;
 422 |   __m256i *buffer = malloc(AVXBlockSize * sizeof(uint32_t));
 423 |   uint32_t *datain = malloc(N * sizeof(uint32_t));
 424 |   uint32_t *backbuffer = malloc(AVXBlockSize * sizeof(uint32_t));
 425 |   printf("[%s]\n", __func__);
 426 |   for (gap = 1; gap <= 387420489; gap *= 3) {
 427 |     int k;
 428 |     printf(" gap = %u \n", gap);
 429 |     for (k = 0; k < N; ++k)
 430 |       datain[k] = (uint32_t)(((uint64_t)k * gap) & 0xFFFFFFFF);
 431 |     for (k = 0; k * AVXBlockSize < N; ++k) {
 432 |       /*
 433 |          First part works for general arrays (sorted or unsorted)
 434 |       */
 435 |       int j;
 436 |       /* we compute the bit width */
 437 |       const uint32_t b = avxmaxbits(datain + k * AVXBlockSize);
 438 |       if (avxmaxbits(datain + k * AVXBlockSize) !=
 439 |           maxbits_length(datain + k * AVXBlockSize, AVXBlockSize)) {
 440 |         printf("avxmaxbits is buggy %d %d \n",
 441 |                avxmaxbits(datain + k * AVXBlockSize),
 442 |                maxbits_length(datain + k * AVXBlockSize, AVXBlockSize));
 443 |         return -1;
 444 |       }
 445 | 
 446 |       /* we read 256 integers at "datain + k * AVXBlockSize" and
 447 |          write b 256-bit vectors at "buffer" */
 448 |       avxpackwithoutmask(datain + k * AVXBlockSize, buffer, b);
 449 |       /* we read back b1 128-bit vectors at "buffer" and write 128 integers at
 450 |        * backbuffer */
 451 |       avxunpack(buffer, backbuffer, b); /* uncompressed */
 452 |       for (j = 0; j < AVXBlockSize; ++j) {
 453 |         if (backbuffer[j] != datain[k * AVXBlockSize + j]) {
 454 |           int i;
 455 |           printf("bug in avxpack\n");
 456 |           for (i = 0; i < AVXBlockSize; ++i) {
 457 |             printf("data[%d]=%d got back %d %s\n", i,
 458 |                    datain[k * AVXBlockSize + i], backbuffer[i],
 459 |                    datain[k * AVXBlockSize + i] != backbuffer[i] ? "bug" : "");
 460 |           }
 461 |           return -2;
 462 |         }
 463 |       }
 464 |     }
 465 |   }
 466 |   free(buffer);
 467 |   free(datain);
 468 |   free(backbuffer);
 469 |   printf("Code looks good.\n");
 470 |   return 0;
 471 | }
 472 | #endif /* avx2 */
 473 | 
 474 | #ifdef __AVX512F__
 475 | 
 476 | int testbabyavx512() {
 477 |   int bit;
 478 |   int trial;
 479 |   unsigned int i, j;
 480 |   const size_t N = AVX512BlockSize;
 481 |   srand(0);
 482 |   printf("[%s]\n", __func__);
 483 |   printf("bit = ");
 484 |   for (bit = 0; bit < 32; ++bit) {
 485 |     printf(" %d ", bit);
 486 |     fflush(stdout);
 487 |     for (trial = 0; trial < 100; ++trial) {
 488 |       uint32_t *data = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t));
 489 |       uint32_t *backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t));
 490 |       __m512i *buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32);
 491 | 
 492 |       for (i = 0; i < N; ++i) {
 493 |         data[i] = rand() & ((uint32_t)(1U << bit) - 1);
 494 |       }
 495 |       for (i = 0; i < N; ++i) {
 496 |         backdata[i] = 0;
 497 |       }
 498 |       if (avx512maxbits(data) != maxbits_length(data, N)) {
 499 |         printf("avx512maxbits is buggy\n");
 500 |         return -1;
 501 |       }
 502 | 
 503 |       avx512packwithoutmask(data, buffer, bit);
 504 |       avx512unpack(buffer, backdata, bit);
 505 |       for (i = 0; i < AVX512BlockSize; ++i) {
 506 |         if (data[i] != backdata[i]) {
 507 |           printf("bug\n");
 508 |           for (j = 0; j < N; ++j) {
 509 |             if (data[j] != backdata[j]) {
 510 |               printf("data[%d]=%d v.s. backdata[%d]=%d\n", j, data[j], j,
 511 |                      backdata[j]);
 512 |             } else {
 513 |               printf("data[%d]=%d\n", j, data[j]);
 514 |             }
 515 |           }
 516 |           return -1;
 517 |         }
 518 |       }
 519 |       free(data);
 520 |       free(backdata);
 521 |       free(buffer);
 522 |     }
 523 |   }
 524 |   printf("\n");
 525 |   return 0;
 526 | }
 527 | 
 528 | int testavx512_2() {
 529 |   int N = 5000 * AVX512BlockSize, gap;
 530 |   __m512i *buffer = malloc(AVX512BlockSize * sizeof(uint32_t));
 531 |   uint32_t *datain = malloc(N * sizeof(uint32_t));
 532 |   uint32_t *backbuffer = malloc(AVX512BlockSize * sizeof(uint32_t));
 533 |   printf("[%s]\n", __func__);
 534 |   for (gap = 1; gap <= 387420489; gap *= 3) {
 535 |     int k;
 536 |     printf(" gap = %u \n", gap);
 537 |     for (k = 0; k < N; ++k) {
 538 |       datain[k] = k * gap;
 539 |     }
 540 |     for (k = 0; k * AVX512BlockSize < N; ++k) {
 541 |       /*
 542 |        *                First part works for general arrays (sorted or unsorted)
 543 |        *                            */
 544 |       int j;
 545 |       /* we compute the bit width */
 546 |       const uint32_t b = avx512maxbits(datain + k * AVX512BlockSize);
 547 |       if (b != maxbits_length(datain + k * AVX512BlockSize, AVX512BlockSize)) {
 548 |         printf("avx512maxbits is buggy %d %d \n",
 549 |                avx512maxbits(datain + k * AVX512BlockSize),
 550 |                maxbits_length(datain + k * AVX512BlockSize, AVX512BlockSize));
 551 |         return -1;
 552 |       }
 553 | 
 554 |       /* we read 512 integers at "datain + k * AVX512BlockSize" and
 555 |        *                write b 512-bit vectors at "buffer" */
 556 |       avx512packwithoutmask(datain + k * AVX512BlockSize, buffer, b);
 557 |       /* we read back b1 512-bit vectors at "buffer" and write 512 integers at
 558 |        * backbuffer */
 559 |       avx512unpack(buffer, backbuffer, b); /* uncompressed */
 560 |       for (j = 0; j < AVX512BlockSize; ++j) {
 561 |         if (backbuffer[j] != datain[k * AVX512BlockSize + j]) {
 562 |           int i;
 563 |           printf("bug in avx512pack\n");
 564 |           for (i = 0; i < AVX512BlockSize; ++i) {
 565 |             printf("data[%d]=%d got back %d %s\n", i,
 566 |                    datain[k * AVX512BlockSize + i], backbuffer[i],
 567 |                    datain[k * AVX512BlockSize + i] != backbuffer[i] ? "bug"
 568 |                                                                     : "");
 569 |           }
 570 |           return -2;
 571 |         }
 572 |       }
 573 |     }
 574 |   }
 575 |   free(buffer);
 576 |   free(datain);
 577 |   free(backbuffer);
 578 |   printf("Code looks good.\n");
 579 |   return 0;
 580 | }
 581 | #endif /* avx512 */
 582 | 
 583 | int test() {
 584 |   int N = 5000 * SIMDBlockSize, gap;
 585 |   __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
 586 |   uint32_t *datain = malloc(N * sizeof(uint32_t));
 587 |   uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
 588 |   printf("[%s]\n", __func__);
 589 |   for (gap = 1; gap <= 387420489; gap *= 3) {
 590 |     int k;
 591 |     printf(" gap = %u \n", gap);
 592 |     for (k = 0; k < N; ++k)
 593 |       datain[k] = (uint32_t)(((uint64_t)k * gap) & 0xFFFFFFFF);
 594 |     for (k = 0; k * SIMDBlockSize < N; ++k) {
 595 |       /*
 596 |          First part works for general arrays (sorted or unsorted)
 597 |       */
 598 |       int j;
 599 |       /* we compute the bit width */
 600 |       const uint32_t b = maxbits(datain + k * SIMDBlockSize);
 601 |       /* we read 128 integers at "datain + k * SIMDBlockSize" and
 602 |          write b 128-bit vectors at "buffer" */
 603 |       simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b);
 604 |       /* we read back b1 128-bit vectors at "buffer" and write 128 integers at
 605 |        * backbuffer */
 606 |       simdunpack(buffer, backbuffer, b); /* uncompressed */
 607 |       for (j = 0; j < SIMDBlockSize; ++j) {
 608 |         if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
 609 |           printf("bug in simdpack\n");
 610 |           return -2;
 611 |         }
 612 |       }
 613 | 
 614 |       {
 615 |         /*
 616 |          next part assumes that the data is sorted (uses differential coding)
 617 |         */
 618 |         uint32_t offset = 0;
 619 |         /* we compute the bit width */
 620 |         const uint32_t b1 = simdmaxbitsd1(offset, datain + k * SIMDBlockSize);
 621 |         /* we read 128 integers at "datain + k * SIMDBlockSize" and
 622 |            write b1 128-bit vectors at "buffer" */
 623 |         simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, b1);
 624 |         /* we read back b1 128-bit vectors at "buffer" and write 128 integers at
 625 |          * backbuffer */
 626 |         simdunpackd1(offset, buffer, backbuffer, b1);
 627 |         for (j = 0; j < SIMDBlockSize; ++j) {
 628 |           if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
 629 |             printf("bug in simdpack d1\n");
 630 |             return -3;
 631 |           }
 632 |         }
 633 |         offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
 634 |       }
 635 |     }
 636 |   }
 637 |   free(buffer);
 638 |   free(datain);
 639 |   free(backbuffer);
 640 |   printf("Code looks good.\n");
 641 |   return 0;
 642 | }
 643 | 
 644 | #ifdef __SSE4_1__
 645 | int testFOR() {
 646 |   int N = 5000 * SIMDBlockSize, gap;
 647 |   __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
 648 |   uint32_t *datain = malloc(N * sizeof(uint32_t));
 649 |   uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
 650 |   uint32_t tmax, tmin, tb;
 651 |   printf("[%s]\n", __func__);
 652 |   for (gap = 1; gap <= 387420489; gap *= 2) {
 653 |     int k;
 654 |     printf(" gap = %u \n", gap);
 655 |     for (k = 0; k < N; ++k)
 656 |       datain[k] = (uint32_t)(((uint64_t)k * gap) & 0xFFFFFFFF);
 657 |     for (k = 0; k * SIMDBlockSize < N; ++k) {
 658 |       int j;
 659 |       simdmaxmin_length(datain + k * SIMDBlockSize, SIMDBlockSize, &tmin,
 660 |                         &tmax);
 661 |       /* we compute the bit width */
 662 |       tb = bits(tmax - tmin);
 663 | 
 664 |       /* we read 128 integers at "datain + k * SIMDBlockSize" and
 665 |          write b 128-bit vectors at "buffer" */
 666 |       simdpackFOR(tmin, datain + k * SIMDBlockSize, buffer, tb);
 667 | 
 668 |       for (j = 0; j < SIMDBlockSize; ++j) {
 669 |         uint32_t selectedvalue = simdselectFOR(tmin, buffer, tb, j);
 670 |         if (selectedvalue != datain[k * SIMDBlockSize + j]) {
 671 |           printf("bug in simdselectFOR\n");
 672 |           return -3;
 673 |         }
 674 |       }
 675 |       /* we read back b1 128-bit vectors at "buffer" and write 128 integers at
 676 |        * backbuffer */
 677 |       simdunpackFOR(tmin, buffer, backbuffer, tb); /* uncompressed */
 678 |       for (j = 0; j < SIMDBlockSize; ++j) {
 679 |         if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
 680 |           printf("bug in simdpackFOR\n");
 681 |           return -2;
 682 |         }
 683 |       }
 684 |     }
 685 |   }
 686 |   free(buffer);
 687 |   free(datain);
 688 |   free(backbuffer);
 689 |   printf("Code looks good.\n");
 690 |   return 0;
 691 | }
 692 | #endif
 693 | 
 694 | #define MAX 300
 695 | int test_simdmaxbitsd1_length() {
 696 |   uint32_t result, buffer[MAX + 1];
 697 |   int i, j;
 698 | 
 699 |   memset(&buffer[0], 0xff, sizeof(buffer));
 700 |   printf("[%s]\n", __func__);
 701 |   /* this test creates buffers of different length; each buffer is
 702 |    * initialized to result in the following deltas:
 703 |    * length 1: 2
 704 |    * length 2: 1 2
 705 |    * length 3: 1 1 2
 706 |    * length 4: 1 1 1 2
 707 |    * length 5: 1 1 1 1 2
 708 |    * etc. Each sequence's "maxbits" is 2. */
 709 |   for (i = 0; i < MAX; i++) {
 710 |     for (j = 0; j < i; j++)
 711 |       buffer[j] = j + 1;
 712 |     buffer[i] = i + 2;
 713 | 
 714 |     result = simdmaxbitsd1_length(0, &buffer[0], i + 1);
 715 |     if (result != 2) {
 716 |       printf("simdmaxbitsd1_length: unexpected result %u in loop %d\n", result,
 717 |              i);
 718 |       return -1;
 719 |     }
 720 |   }
 721 |   printf("simdmaxbitsd1_length: ok\n");
 722 |   return 0;
 723 | }
 724 | 
 725 | int uint32_cmp(const void *a, const void *b) {
 726 |   const uint32_t *ia = (const uint32_t *)a;
 727 |   const uint32_t *ib = (const uint32_t *)b;
 728 |   if (*ia < *ib)
 729 |     return -1;
 730 |   else if (*ia > *ib)
 731 |     return 1;
 732 |   return 0;
 733 | }
 734 | 
 735 | #ifdef __SSE4_1__
 736 | int test_simdpackedsearch() {
 737 |   uint32_t buffer[128];
 738 |   uint32_t result = 0;
 739 |   int b, i;
 740 |   uint32_t init = 0;
 741 |   __m128i initial = _mm_set1_epi32(init);
 742 |   printf("[%s]\n", __func__);
 743 |   /* initialize the buffer */
 744 |   for (i = 0; i < 128; i++)
 745 |     buffer[i] = (uint32_t)(i + 1);
 746 | 
 747 |   /* this test creates delta encoded buffers with different bits, then
 748 |    * performs lower bound searches for each key */
 749 |   for (b = 1; b <= 32; b++) {
 750 |     uint32_t out[128];
 751 |     /* delta-encode to 'i' bits */
 752 |     simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
 753 |     initial = _mm_setzero_si128();
 754 |     printf("simdsearchd1: %d bits\n", b);
 755 | 
 756 |     /* now perform the searches */
 757 |     initial = _mm_set1_epi32(init);
 758 |     assert(simdsearchd1(&initial, (__m128i *)out, b, 0, &result) == 0);
 759 |     assert(result > 0);
 760 | 
 761 |     for (i = 1; i <= 128; i++) {
 762 |       initial = _mm_set1_epi32(init);
 763 |       assert(simdsearchd1(&initial, (__m128i *)out, b, (uint32_t)i, &result) ==
 764 |              i - 1);
 765 |       assert(result == (unsigned)i);
 766 |     }
 767 |     initial = _mm_set1_epi32(init);
 768 |     assert(simdsearchd1(&initial, (__m128i *)out, b, 200, &result) == 128);
 769 |     assert(result > 200);
 770 |   }
 771 |   printf("simdsearchd1: ok\n");
 772 |   return 0;
 773 | }
 774 | 
 775 | int test_simdpackedsearchFOR() {
 776 |   uint32_t buffer[128];
 777 |   uint32_t result = 0;
 778 |   int b;
 779 |   uint32_t i;
 780 |   uint32_t maxv, tmin, tmax, tb;
 781 |   uint32_t out[128];
 782 |   printf("[%s]\n", __func__);
 783 |   /* this test creates delta encoded buffers with different bits, then
 784 |    * performs lower bound searches for each key */
 785 |   for (b = 1; b <= 32; b++) {
 786 |     /* initialize the buffer */
 787 |     maxv = (b == 32) ? 0xFFFFFFFF : ((1U << b) - 1);
 788 |     for (i = 0; i < 128; i++)
 789 |       buffer[i] = maxv * (i + 1) / 128;
 790 |     simdmaxmin_length(buffer, SIMDBlockSize, &tmin, &tmax);
 791 |     /* we compute the bit width */
 792 |     tb = bits(tmax - tmin);
 793 |     /* delta-encode to 'i' bits */
 794 |     simdpackFOR(tmin, buffer, (__m128i *)out, tb);
 795 |     printf("simdsearchd1: %d bits\n", b);
 796 | 
 797 |     /* now perform the searches */
 798 |     for (i = 0; i < 128; i++) {
 799 |       assert(buffer[i] == simdselectFOR(tmin, (__m128i *)out, tb, i));
 800 |     }
 801 |     for (i = 0; i < 128; i++) {
 802 |       int x = simdsearchwithlengthFOR(tmin, (__m128i *)out, tb, 128, buffer[i],
 803 |                                       &result);
 804 |       assert(simdselectFOR(tmin, (__m128i *)out, tb, x) == buffer[x]);
 805 |       assert(simdselectFOR(tmin, (__m128i *)out, tb, x) == result);
 806 |       assert(buffer[x] == result);
 807 |       assert(result == buffer[i]);
 808 |       assert(buffer[x] == buffer[i]);
 809 |     }
 810 |   }
 811 |   printf("simdsearchFOR: ok\n");
 812 |   return 0;
 813 | }
 814 | 
 815 | int test_simdpackedsearch_advanced() {
 816 |   uint32_t buffer[128];
 817 |   uint32_t backbuffer[128];
 818 |   uint32_t out[128];
 819 |   uint32_t result = 0;
 820 |   uint32_t b, i;
 821 |   uint32_t init = 0;
 822 |   __m128i initial = _mm_set1_epi32(init);
 823 | 
 824 |   printf("[%s]\n", __func__);
 825 |   /* this test creates delta encoded buffers with different bits, then
 826 |    * performs lower bound searches for each key */
 827 |   for (b = 0; b <= 32; b++) {
 828 |     uint32_t prev = init;
 829 |     /* initialize the buffer */
 830 |     for (i = 0; i < 128; i++) {
 831 |       buffer[i] = ((uint32_t)(1431655765 * i + 0xFFFFFFFF));
 832 |       if (b < 32)
 833 |         buffer[i] %= (1U << b);
 834 |     }
 835 | 
 836 |     qsort(buffer, 128, sizeof(uint32_t), uint32_cmp);
 837 | 
 838 |     for (i = 0; i < 128; i++) {
 839 |       buffer[i] = buffer[i] + prev;
 840 |       prev = buffer[i];
 841 |     }
 842 |     for (i = 1; i < 128; i++) {
 843 |       if (buffer[i] < buffer[i - 1])
 844 |         buffer[i] = buffer[i - 1];
 845 |     }
 846 |     assert(simdmaxbitsd1(init, buffer) <= b);
 847 |     for (i = 0; i < 128; i++) {
 848 |       out[i] = 0; /* memset would do too */
 849 |     }
 850 | 
 851 |     /* delta-encode to 'i' bits */
 852 |     simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b);
 853 |     simdunpackd1(init, (__m128i *)out, backbuffer, b);
 854 | 
 855 |     for (i = 0; i < 128; i++) {
 856 |       assert(buffer[i] == backbuffer[i]);
 857 |     }
 858 | 
 859 |     printf("advanced simdsearchd1: %d bits\n", b);
 860 | 
 861 |     for (i = 0; i < 128; i++) {
 862 |       int pos;
 863 |       initial = _mm_set1_epi32(init);
 864 |       pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i], &result);
 865 |       assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
 866 |                                            buffer[i], &result));
 867 |       assert(buffer[pos] == buffer[i]);
 868 |       if (pos > 0)
 869 |         assert(buffer[pos - 1] < buffer[i]);
 870 |       assert(result == buffer[i]);
 871 |     }
 872 |     for (i = 0; i < 128; i++) {
 873 |       int pos;
 874 |       if (buffer[i] == 0)
 875 |         continue;
 876 |       initial = _mm_set1_epi32(init);
 877 |       pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i] - 1, &result);
 878 |       assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
 879 |                                            buffer[i] - 1, &result));
 880 |       assert(buffer[pos] >= buffer[i] - 1);
 881 |       if (pos > 0)
 882 |         assert(buffer[pos - 1] < buffer[i] - 1);
 883 |       assert(result == buffer[pos]);
 884 |     }
 885 |     for (i = 0; i < 128; i++) {
 886 |       int pos;
 887 |       if (buffer[i] + 1 == 0)
 888 |         continue;
 889 |       initial = _mm_set1_epi32(init);
 890 |       pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i] + 1, &result);
 891 |       assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128,
 892 |                                            buffer[i] + 1, &result));
 893 |       if (pos == 128) {
 894 |         assert(buffer[i] == buffer[127]);
 895 |       } else {
 896 |         assert(buffer[pos] >= buffer[i] + 1);
 897 |         if (pos > 0)
 898 |           assert(buffer[pos - 1] < buffer[i] + 1);
 899 |         assert(result == buffer[pos]);
 900 |       }
 901 |     }
 902 |   }
 903 |   printf("advanced simdsearchd1: ok\n");
 904 |   return 0;
 905 | }
 906 | 
 907 | int test_simdpackedselect() {
 908 |   uint32_t buffer[128];
 909 |   uint32_t initial = 33;
 910 |   int b, i;
 911 |   printf("[%s]\n", __func__);
 912 |   /* initialize the buffer */
 913 |   for (i = 0; i < 128; i++)
 914 |     buffer[i] = (uint32_t)(initial + i);
 915 | 
 916 |   /* this test creates delta encoded buffers with different bits, then
 917 |    * performs lower bound searches for each key */
 918 |   for (b = 1; b <= 32; b++) {
 919 |     uint32_t out[128];
 920 |     /* delta-encode to 'i' bits */
 921 |     simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
 922 | 
 923 |     printf("simdselectd1: %d bits\n", b);
 924 | 
 925 |     /* now perform the searches */
 926 |     for (i = 0; i < 128; i++) {
 927 |       assert(simdselectd1(initial, (__m128i *)out, b, (uint32_t)i) ==
 928 |              initial + i);
 929 |     }
 930 |   }
 931 |   printf("simdselectd1: ok\n");
 932 |   return 0;
 933 | }
 934 | 
 935 | int test_simdpackedselect_advanced() {
 936 |   uint32_t buffer[128];
 937 |   uint32_t initial = 33;
 938 |   uint32_t b;
 939 |   int i;
 940 |   printf("[%s]\n", __func__);
 941 |   /* this test creates delta encoded buffers with different bits, then
 942 |    * performs lower bound searches for each key */
 943 |   for (b = 0; b <= 32; b++) {
 944 |     uint32_t prev = initial;
 945 |     uint32_t out[128];
 946 |     /* initialize the buffer */
 947 |     for (i = 0; i < 128; i++) {
 948 |       buffer[i] = ((uint32_t)(165576 * i));
 949 |       if (b < 32)
 950 |         buffer[i] %= (1U << b);
 951 |     }
 952 |     for (i = 0; i < 128; i++) {
 953 |       buffer[i] = buffer[i] + prev;
 954 |       prev = buffer[i];
 955 |     }
 956 | 
 957 |     for (i = 1; i < 128; i++) {
 958 |       if (buffer[i] < buffer[i - 1])
 959 |         buffer[i] = buffer[i - 1];
 960 |     }
 961 |     assert(simdmaxbitsd1(initial, buffer) <= b);
 962 | 
 963 |     for (i = 0; i < 128; i++) {
 964 |       out[i] = 0; /* memset would do too */
 965 |     }
 966 | 
 967 |     /* delta-encode to 'i' bits */
 968 |     simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b);
 969 | 
 970 |     printf("simdselectd1: %d bits\n", b);
 971 | 
 972 |     /* now perform the searches */
 973 |     for (i = 0; i < 128; i++) {
 974 |       uint32_t valretrieved =
 975 |           simdselectd1(initial, (__m128i *)out, b, (uint32_t)i);
 976 |       assert(valretrieved == buffer[i]);
 977 |     }
 978 |   }
 979 |   printf("advanced simdselectd1: ok\n");
 980 |   return 0;
 981 | }
 982 | #endif
 983 | 
 984 | int main() {
 985 |   int r;
 986 |   r = issue21();
 987 |   if (r) {
 988 |     printf("test failure issue21\n");
 989 |     return r;
 990 |   }
 991 |   r = issue21FOR();
 992 |   if (r) {
 993 |     printf("test failure issue21FOR\n");
 994 |     return r;
 995 |   }
 996 | #ifdef __AVX512F__
 997 |   r = testbabyavx512();
 998 |   if (r) {
 999 |     printf("test failure baby avx512\n");
1000 |     return r;
1001 |   }
1002 | 
1003 |   r = testavx512_2();
1004 |   if (r) {
1005 |     printf("test failure 9 avx512\n");
1006 |     return r;
1007 |   }
1008 | #endif
1009 | 
1010 |   r = testsetFOR();
1011 |   if (r) {
1012 |     printf("test failure 1\n");
1013 |     return r;
1014 |   }
1015 | 
1016 | #ifdef __SSE4_1__
1017 |   r = testsetd1();
1018 |   if (r) {
1019 |     printf("test failure 2\n");
1020 |     return r;
1021 |   }
1022 | #endif
1023 |   r = testset();
1024 |   if (r) {
1025 |     printf("test failure 3\n");
1026 |     return r;
1027 |   }
1028 | 
1029 |   r = testshortFORpack();
1030 |   if (r) {
1031 |     printf("test failure 4\n");
1032 |     return r;
1033 |   }
1034 |   r = testshortpack();
1035 |   if (r) {
1036 |     printf("test failure 5\n");
1037 |     return r;
1038 |   }
1039 |   r = testlongpack();
1040 |   if (r) {
1041 |     printf("test failure 6\n");
1042 |     return r;
1043 |   }
1044 | #ifdef __SSE4_1__
1045 |   r = test_simdpackedsearchFOR();
1046 |   if (r) {
1047 |     printf("test failure 7\n");
1048 |     return r;
1049 |   }
1050 | 
1051 |   r = testFOR();
1052 |   if (r) {
1053 |     printf("test failure 8\n");
1054 |     return r;
1055 |   }
1056 | #endif
1057 | #ifdef __AVX2__
1058 |   r = testbabyavx();
1059 |   if (r) {
1060 |     printf("test failure baby avx\n");
1061 |     return r;
1062 |   }
1063 | 
1064 |   r = testavx2();
1065 |   if (r) {
1066 |     printf("test failure 9 avx\n");
1067 |     return r;
1068 |   }
1069 | #endif
1070 |   r = test();
1071 |   if (r) {
1072 |     printf("test failure 9\n");
1073 |     return r;
1074 |   }
1075 | 
1076 |   r = test_simdmaxbitsd1_length();
1077 |   if (r) {
1078 |     printf("test failure 10\n");
1079 |     return r;
1080 |   }
1081 | #ifdef __SSE4_1__
1082 |   r = test_simdpackedsearch();
1083 |   if (r) {
1084 |     printf("test failure 11\n");
1085 |     return r;
1086 |   }
1087 | 
1088 |   r = test_simdpackedsearch_advanced();
1089 |   if (r) {
1090 |     printf("test failure 12\n");
1091 |     return r;
1092 |   }
1093 | 
1094 |   r = test_simdpackedselect();
1095 |   if (r) {
1096 |     printf("test failure 13\n");
1097 |     return r;
1098 |   }
1099 | 
1100 |   r = test_simdpackedselect_advanced();
1101 |   if (r) {
1102 |     printf("test failure 14\n");
1103 |     return r;
1104 |   }
1105 | #endif
1106 |   printf("All tests OK!\n");
1107 | 
1108 |   return 0;
1109 | }
1110 | 


--------------------------------------------------------------------------------