├── .gitignore ├── .travis.yml ├── go ├── README.md └── test.go ├── CHANGELOG ├── include ├── simdcomp.h ├── avxbitpacking.h ├── avx512bitpacking.h ├── simdcomputil.h ├── portability.h ├── simdfor.h ├── simdbitpacking.h └── simdintegratedbitpacking.h ├── package.json ├── simdcomp.def.tpl ├── LICENSE ├── Makefile ├── tests ├── unit_chars.c └── unit.c ├── .appveyor.yml ├── makefile.vc ├── scripts ├── simdfor.py ├── avxpacking.py └── avx512packing.py ├── benchmarks ├── benchmark.c └── bitpackingbenchmark.c ├── example.c ├── README.md └── src └── simdcomputil.c /.gitignore: -------------------------------------------------------------------------------- 1 | Makefile.in 2 | lib* 3 | unit* 4 | *.o 5 | src/*.lo 6 | src/*.o 7 | src/.deps 8 | src/.dirstamp 9 | src/.libs 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | sudo: false 3 | compiler: 4 | - gcc 5 | - clang 6 | 7 | branches: 8 | only: 9 | - master 10 | 11 | script: make && ./unit && ./unit_chars && make clean 12 | -------------------------------------------------------------------------------- /go/README.md: -------------------------------------------------------------------------------- 1 | Simple Go demo 2 | ============== 3 | 4 | Setup 5 | ====== 6 | 7 | Start by installing the simdcomp library (make && make install). 
8 | 9 | Then type: 10 | 11 | go run test.go 12 | 13 | 14 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | Upcoming 2 | - added missing include 3 | - improved portability (MSVC) 4 | - implemented C89 compatibility 5 | Version 0.0.3 (19 May 2014) 6 | - improved documentation 7 | Version 0.0.2 (6 February 2014) 8 | - added go demo 9 | Version 0.0.1 (5 February 2014) 10 | -------------------------------------------------------------------------------- /include/simdcomp.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 3 | */ 4 | 5 | #ifndef SIMDCOMP_H_ 6 | #define SIMDCOMP_H_ 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | #include "avx512bitpacking.h" 13 | #include "avxbitpacking.h" 14 | #include "simdbitpacking.h" 15 | #include "simdcomputil.h" 16 | #include "simdfor.h" 17 | #include "simdintegratedbitpacking.h" 18 | 19 | #ifdef __cplusplus 20 | } // extern "C" 21 | #endif 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "simdcomp", 3 | "version": "0.0.3", 4 | "repo": "lemire/simdcomp", 5 | "description": "A simple C library for compressing lists of integers", 6 | "license": "BSD-3-Clause", 7 | "src": [ 8 | "src/simdbitpacking.c", 9 | "src/simdcomputil.c", 10 | "src/simdintegratedbitpacking.c", 11 | "include/simdbitpacking.h", 12 | "include/simdcomp.h", 13 | "include/simdcomputil.h", 14 | "include/simdintegratedbitpacking.h" 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /simdcomp.def.tpl: -------------------------------------------------------------------------------- 1 | EXPORTS 2 | simdpack 3 | 
simdpackwithoutmask 4 | simdunpack 5 | bits 6 | maxbits 7 | maxbits_length 8 | simdmin 9 | simdmin_length 10 | simdmaxmin 11 | simdmaxmin_length 12 | simdmaxbitsd1 13 | simdmaxbitsd1_length 14 | simdpackd1 15 | simdpackwithoutmaskd1 16 | simdunpackd1 17 | simdsearchd1 18 | simdsearchwithlengthd1 19 | simdselectd1 20 | simdpackFOR 21 | simdselectFOR 22 | simdsearchwithlengthFOR 23 | simdunpackFOR 24 | simdpack_length 25 | simdpackFOR_length 26 | simdunpackFOR_length 27 | simdpackFOR_compressedbytes 28 | simdpack_shortlength 29 | simdpack_compressedbytes 30 | simdfastsetFOR 31 | simdfastset 32 | simdfastsetd1 33 | simdunpack_length 34 | simdunpack_shortlength 35 | simdscand1 36 | simdfastsetd1fromprevious 37 | 38 | -------------------------------------------------------------------------------- /include/avxbitpacking.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License.
3 | */ 4 | 5 | #ifndef INCLUDE_AVXBITPACKING_H_ 6 | #define INCLUDE_AVXBITPACKING_H_ 7 | 8 | #ifdef __AVX2__ 9 | 10 | #include "portability.h" 11 | 12 | /* AVX2 is required */ 13 | #include 14 | /* for memset */ 15 | #include 16 | 17 | #include "simdcomputil.h" 18 | 19 | enum { AVXBlockSize = 256 }; 20 | 21 | /* max integer logarithm over a range of AVXBlockSize integers (256 integer) */ 22 | uint32_t avxmaxbits(const uint32_t *begin); 23 | 24 | /* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ 25 | void avxpack(const uint32_t *in, __m256i *out, const uint32_t bit); 26 | 27 | /* reads 256 values from "in", writes "bit" 256-bit vectors to "out" */ 28 | void avxpackwithoutmask(const uint32_t *in, __m256i *out, const uint32_t bit); 29 | 30 | /* reads "bit" 256-bit vectors from "in", writes 256 values to "out" */ 31 | void avxunpack(const __m256i *in, uint32_t *out, const uint32_t bit); 32 | 33 | #endif /* __AVX2__ */ 34 | 35 | #endif /* INCLUDE_AVXBITPACKING_H_ */ 36 | -------------------------------------------------------------------------------- /include/avx512bitpacking.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 
3 | */ 4 | 5 | #ifndef INCLUDE_AVX512BITPACKING_H_ 6 | #define INCLUDE_AVX512BITPACKING_H_ 7 | 8 | #ifdef __AVX512F__ 9 | 10 | #include "portability.h" 11 | 12 | /* AVX512 is required */ 13 | #include 14 | /* for memset */ 15 | #include 16 | 17 | #include "simdcomputil.h" 18 | 19 | enum { AVX512BlockSize = 512 }; 20 | 21 | /* max integer logarithm over a range of AVX512BlockSize integers (512 integer) 22 | */ 23 | uint32_t avx512maxbits(const uint32_t *begin); 24 | 25 | /* reads 512 values from "in", writes "bit" 512-bit vectors to "out" */ 26 | void avx512pack(const uint32_t *in, __m512i *out, const uint32_t bit); 27 | 28 | /* reads 512 values from "in", writes "bit" 512-bit vectors to "out" */ 29 | void avx512packwithoutmask(const uint32_t *in, __m512i *out, 30 | const uint32_t bit); 31 | 32 | /* reads "bit" 512-bit vectors from "in", writes 512 values to "out" */ 33 | void avx512unpack(const __m512i *in, uint32_t *out, const uint32_t bit); 34 | 35 | #endif /* __AVX512F__ */ 36 | 37 | #endif /* INCLUDE_AVX512BITPACKING_H_ */ 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014--, The authors 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 
13 | 14 | * Neither the name of the {organization} nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /include/simdcomputil.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 
3 | */ 4 | 5 | #ifndef SIMDCOMPUTIL_H_ 6 | #define SIMDCOMPUTIL_H_ 7 | 8 | #include "portability.h" 9 | 10 | /* SSE2 is required */ 11 | #include 12 | 13 | /* returns the integer logarithm of v (bit width) */ 14 | uint32_t bits(const uint32_t v); 15 | 16 | /* max integer logarithm over a range of SIMDBlockSize integers (128 integer) */ 17 | uint32_t maxbits(const uint32_t *begin); 18 | 19 | /* same as maxbits, but we specify the number of integers */ 20 | uint32_t maxbits_length(const uint32_t *in, uint32_t length); 21 | 22 | enum { SIMDBlockSize = 128 }; 23 | 24 | /* computes (quickly) the minimal value of 128 values */ 25 | uint32_t simdmin(const uint32_t *in); 26 | 27 | /* computes (quickly) the minimal value of the specified number of values */ 28 | uint32_t simdmin_length(const uint32_t *in, uint32_t length); 29 | 30 | #ifdef __SSE4_1__ 31 | /* computes (quickly) the minimal and maximal value of the specified number of 32 | * values */ 33 | void simdmaxmin_length(const uint32_t *in, uint32_t length, uint32_t *getmin, 34 | uint32_t *getmax); 35 | 36 | /* computes (quickly) the minimal and maximal value of the 128 values */ 37 | void simdmaxmin(const uint32_t *in, uint32_t *getmin, uint32_t *getmax); 38 | 39 | #endif 40 | 41 | /* like maxbit over 128 integers (SIMDBlockSize) with provided initial value 42 | and using differential coding */ 43 | uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t *in); 44 | 45 | /* like simdmaxbitsd1, but calculates maxbits over |length| integers 46 | with provided initial value. |length| can be any arbitrary value. */ 47 | uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t *in, 48 | uint32_t length); 49 | 50 | #endif /* SIMDCOMPUTIL_H_ */ 51 | -------------------------------------------------------------------------------- /go/test.go: -------------------------------------------------------------------------------- 1 | ///////// 2 | // This particular file is in the public domain. 
3 | // Author: Daniel Lemire 4 | //////// 5 | 6 | package main 7 | 8 | /* 9 | #cgo LDFLAGS: -lsimdcomp 10 | #include <simdcomp.h> 11 | */ 12 | import "C" 13 | import "fmt" 14 | 15 | ////////// 16 | // For this demo, we pack and unpack blocks of 128 integers 17 | ///////// 18 | func main() { 19 | // I am going to use C types. Alternative might be to use unsafe.Pointer calls, see http://bit.ly/1ndw3W3 20 | // this is our original data 21 | var data [128]C.uint32_t 22 | for i := C.uint32_t(0); i < C.uint32_t(128); i++ { 23 | data[i] = i 24 | } 25 | 26 | 27 | 28 | 29 | 30 | //////////// 31 | // We first pack without differential coding 32 | /////////// 33 | // computing how many bits per int. is needed 34 | b := C.maxbits(&data[0]) 35 | ratio := 32.0/float64(b) 36 | fmt.Println("Bit width ", b) 37 | fmt.Printf("Compression ratio %f \n", ratio) 38 | // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits) 39 | out := make([] C.__m128i,b) 40 | C.simdpackwithoutmask( &data[0],&out[0],b); 41 | var recovereddata [128]C.uint32_t 42 | C.simdunpack(&out[0],&recovereddata[0],b) 43 | for i := 0; i < 128; i++ { 44 | if data[i] != recovereddata[i] { 45 | fmt.Println("Bug ") 46 | return 47 | } 48 | } 49 | 50 | /////////// 51 | // Next, we use differential coding 52 | ////////// 53 | offset := C.uint32_t(0) // if you pack data from K to K + 128, offset should be the value at K-1.
When K = 0, choose a default 54 | b1 := C.simdmaxbitsd1(offset,&data[0]) 55 | ratio1 := 32.0/float64(b1) 56 | fmt.Println("Bit width ", b1) 57 | fmt.Printf("Compression ratio %f \n", ratio1) 58 | // we are now going to create a buffer to receive the packed data (each __m128i uses 128 bits) 59 | out = make([] C.__m128i,b1) 60 | C.simdpackwithoutmaskd1(offset, &data[0],&out[0],b1); 61 | C.simdunpackd1(offset,&out[0],&recovereddata[0],b1) 62 | for i := 0; i < 128; i++ { 63 | if data[i] != recovereddata[i] { 64 | fmt.Println("Bug ") 65 | return 66 | } 67 | } 68 | 69 | fmt.Println("test succesful.") 70 | 71 | } 72 | -------------------------------------------------------------------------------- /include/portability.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 3 | */ 4 | #ifndef SIMDBITCOMPAT_H_ 5 | #define SIMDBITCOMPAT_H_ 6 | 7 | #include /* mostly for Microsoft compilers */ 8 | #include 9 | 10 | #ifdef SIMDCOMP_DEBUG 11 | #define SIMDCOMP_ALWAYS_INLINE inline 12 | #define SIMDCOMP_NEVER_INLINE 13 | #define SIMDCOMP_PURE 14 | #else 15 | #if defined(__GNUC__) 16 | #if __GNUC__ >= 3 17 | #define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline)) 18 | #define SIMDCOMP_NEVER_INLINE __attribute__((noinline)) 19 | #define SIMDCOMP_PURE __attribute__((pure)) 20 | #else 21 | #define SIMDCOMP_ALWAYS_INLINE inline 22 | #define SIMDCOMP_NEVER_INLINE 23 | #define SIMDCOMP_PURE 24 | #endif 25 | #elif defined(_MSC_VER) 26 | #define SIMDCOMP_ALWAYS_INLINE __forceinline 27 | #define SIMDCOMP_NEVER_INLINE 28 | #define SIMDCOMP_PURE 29 | #else 30 | #if __has_attribute(always_inline) 31 | #define SIMDCOMP_ALWAYS_INLINE inline __attribute__((always_inline)) 32 | #else 33 | #define SIMDCOMP_ALWAYS_INLINE inline 34 | #endif 35 | #if __has_attribute(noinline) 36 | #define SIMDCOMP_NEVER_INLINE __attribute__((noinline)) 37 | #else 38 | #define SIMDCOMP_NEVER_INLINE 39 |
#endif 40 | #if __has_attribute(pure) 41 | #define SIMDCOMP_PURE __attribute__((pure)) 42 | #else 43 | #define SIMDCOMP_PURE 44 | #endif 45 | #endif 46 | #endif 47 | 48 | #if defined(_MSC_VER) && _MSC_VER < 1600 49 | typedef unsigned int uint32_t; 50 | typedef unsigned char uint8_t; 51 | typedef signed char int8_t; 52 | #else 53 | #include /* part of Visual Studio 2010 and better, others likely anyway */ 54 | #endif 55 | 56 | #if defined(_MSC_VER) 57 | #define SIMDCOMP_ALIGNED(x) __declspec(align(x)) 58 | #else 59 | #if defined(__GNUC__) 60 | #define SIMDCOMP_ALIGNED(x) __attribute__((aligned(x))) 61 | #endif 62 | #endif 63 | 64 | #if defined(_MSC_VER) 65 | #include 66 | /* 64-bit needs extending */ 67 | #define SIMDCOMP_CTZ(result, mask) \ 68 | do { \ 69 | unsigned long index; \ 70 | if (!_BitScanForward(&(index), (mask))) { \ 71 | (result) = 32U; \ 72 | } else { \ 73 | (result) = (uint32_t)(index); \ 74 | } \ 75 | } while (0) 76 | #else 77 | #include 78 | #define SIMDCOMP_CTZ(result, mask) result = __builtin_ctz(mask) 79 | #endif 80 | 81 | #endif /* SIMDBITCOMPAT_H_ */ 82 | -------------------------------------------------------------------------------- /include/simdfor.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 
3 | */ 4 | #ifndef INCLUDE_SIMDFOR_H_ 5 | #define INCLUDE_SIMDFOR_H_ 6 | 7 | #include "portability.h" 8 | 9 | /* SSE2 is required */ 10 | #include 11 | 12 | #include "simdbitpacking.h" 13 | #include "simdcomputil.h" 14 | 15 | #ifdef __cplusplus 16 | extern "C" { 17 | #endif 18 | 19 | /* reads 128 values from "in", writes "bit" 128-bit vectors to "out" */ 20 | void simdpackFOR(uint32_t initvalue, const uint32_t *in, __m128i *out, 21 | const uint32_t bit); 22 | 23 | /* reads "bit" 128-bit vectors from "in", writes 128 values to "out" */ 24 | void simdunpackFOR(uint32_t initvalue, const __m128i *in, uint32_t *out, 25 | const uint32_t bit); 26 | 27 | /* how many compressed bytes are needed to compress length integers using a 28 | bit width of bit with the simdpackFOR_length function. */ 29 | int simdpackFOR_compressedbytes(int length, const uint32_t bit); 30 | 31 | /* like simdpackFOR, but supports an undetermined number of inputs. 32 | This is useful if you need to pack less than 128 integers. Note that this 33 | function is much slower. Compressed data is stored in the memory location 34 | between the provided (out) pointer and the returned pointer. */ 35 | __m128i *simdpackFOR_length(uint32_t initvalue, const uint32_t *in, int length, 36 | __m128i *out, const uint32_t bit); 37 | 38 | /* like simdunpackFOR, but supports an undetermined number of inputs. 39 | This is useful if you need to unpack less than 128 integers. Note that this 40 | function is much slower. The read compressed data is between the provided (in) 41 | pointer and the returned pointer. */ 42 | const __m128i *simdunpackFOR_length(uint32_t initvalue, const __m128i *in, 43 | int length, uint32_t *out, 44 | const uint32_t bit); 45 | 46 | /* returns the value stored at the specified "slot".
47 | * */ 48 | uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, 49 | int slot); 50 | 51 | /* given a block of 128 packed values, this function sets the value at index 52 | * "index" to "value" */ 53 | void simdfastsetFOR(uint32_t initvalue, __m128i *in, uint32_t bit, 54 | uint32_t value, size_t index); 55 | 56 | /* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for 57 | * the first encoded uint32 value which is >= |key|, and returns its position. 58 | * It is assumed that the values stored are in sorted order. The encoded key is 59 | * stored in "*presult". The first length decoded integers, ignoring others. If 60 | * no value is larger or equal to the key, length is returned. Length should be 61 | * no larger than 128. 62 | * 63 | * If no value is larger or equal to the key, 64 | * length is returned */ 65 | int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, 66 | int length, uint32_t key, uint32_t *presult); 67 | 68 | #ifdef __cplusplus 69 | } // extern "C" 70 | #endif 71 | 72 | #endif /* INCLUDE_SIMDFOR_H_ */ 73 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # minimalist makefile 2 | .SUFFIXES: 3 | # 4 | .SUFFIXES: .cpp .o .c .h 5 | ifeq ($(DEBUG),1) 6 | CFLAGS = -fPIC -std=c89 -ggdb -march=native -Wall -Wextra -Wshadow -fsanitize=undefined -fno-omit-frame-pointer -fsanitize=address 7 | else 8 | CFLAGS = -fPIC -std=c89 -O3 -march=native -Wall -Wextra -Wshadow 9 | endif # debug 10 | LDFLAGS = -shared 11 | LIBNAME=libsimdcomp.so.0.0.3 12 | STATICLIBNAME=libsimdcomp.a 13 | all: unit unit_chars bitpackingbenchmark $(LIBNAME) $(STATICLIBNAME) 14 | test: unit unit_chars 15 | ./unit 16 | ./unit_chars 17 | install: $(LIBNAME) 18 | cp $(LIBNAME) /usr/local/lib 19 | ln -sf /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so 20 | ldconfig 21 | cp $(HEADERS)
/usr/local/include 22 | 23 | 24 | 25 | HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h ./include/simdfor.h ./include/avxbitpacking.h ./include/avx512bitpacking.h ./include/portability.h 26 | 27 | uninstall: 28 | for h in $(HEADERS) ; do rm /usr/local/$$h; done 29 | rm /usr/local/lib/$(LIBNAME) 30 | rm /usr/local/lib/libsimdcomp.so 31 | ldconfig 32 | 33 | 34 | OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o \ 35 | simdpackedsearch.o simdpackedselect.o simdfor.o avxbitpacking.o avx512bitpacking.o 36 | 37 | $(LIBNAME): $(OBJECTS) 38 | $(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS) 39 | 40 | $(STATICLIBNAME): $(OBJECTS) 41 | ar -qcs $@ $(OBJECTS) 42 | ranlib $@ 43 | 44 | avx512bitpacking.o: ./src/avx512bitpacking.c $(HEADERS) 45 | $(CC) $(CFLAGS) -c ./src/avx512bitpacking.c -Iinclude 46 | 47 | 48 | 49 | avxbitpacking.o: ./src/avxbitpacking.c $(HEADERS) 50 | $(CC) $(CFLAGS) -c ./src/avxbitpacking.c -Iinclude 51 | 52 | 53 | simdfor.o: ./src/simdfor.c $(HEADERS) 54 | $(CC) $(CFLAGS) -c ./src/simdfor.c -Iinclude 55 | 56 | 57 | simdcomputil.o: ./src/simdcomputil.c $(HEADERS) 58 | $(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude 59 | 60 | simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS) 61 | $(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude 62 | 63 | simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS) 64 | $(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude 65 | 66 | simdpackedsearch.o: ./src/simdpackedsearch.c $(HEADERS) 67 | $(CC) $(CFLAGS) -c ./src/simdpackedsearch.c -Iinclude 68 | 69 | simdpackedselect.o: ./src/simdpackedselect.c $(HEADERS) 70 | $(CC) $(CFLAGS) -c ./src/simdpackedselect.c -Iinclude 71 | 72 | example: ./example.c $(HEADERS) $(OBJECTS) 73 | $(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS) 74 | 75 | unit: ./tests/unit.c $(HEADERS) $(OBJECTS) 76 | $(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude $(OBJECTS) 77 | 78 | bitpackingbenchmark:
./benchmarks/bitpackingbenchmark.c $(HEADERS) $(OBJECTS) 79 | $(CC) $(CFLAGS) -o bitpackingbenchmark ./benchmarks/bitpackingbenchmark.c -Iinclude $(OBJECTS) 80 | benchmark: ./benchmarks/benchmark.c $(HEADERS) $(OBJECTS) 81 | $(CC) $(CFLAGS) -o benchmark ./benchmarks/benchmark.c -Iinclude $(OBJECTS) 82 | dynunit: ./tests/unit.c $(HEADERS) $(LIBNAME) 83 | $(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude -lsimdcomp 84 | 85 | unit_chars: ./tests/unit_chars.c $(HEADERS) $(OBJECTS) 86 | $(CC) $(CFLAGS) -o unit_chars ./tests/unit_chars.c -Iinclude $(OBJECTS) 87 | clean: 88 | rm -f unit *.o $(LIBNAME) $(STATICLIBNAME) example benchmark bitpackingbenchmark dynunit unit_chars 89 | -------------------------------------------------------------------------------- /include/simdbitpacking.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 3 | */ 4 | #ifndef SIMDBITPACKING_H_ 5 | #define SIMDBITPACKING_H_ 6 | 7 | #include "portability.h" 8 | 9 | /* SSE2 is required */ 10 | #include 11 | /* for memset */ 12 | #include 13 | 14 | #include "simdcomputil.h" 15 | 16 | /*** 17 | * Please see example.c for various examples on how to make good use 18 | * of these functions. 19 | */ 20 | 21 | /* reads 128 values from "in", writes "bit" 128-bit vectors to "out". 22 | * The input values are masked so that only the least significant "bit" bits are 23 | * used. */ 24 | void simdpack(const uint32_t *in, __m128i *out, const uint32_t bit); 25 | 26 | /* reads 128 values from "in", writes "bit" 128-bit vectors to "out". 
27 | * The input values are assumed to be less than 1< 6 | #include 7 | #include 8 | 9 | #define get_random_char() ((uint8_t)(rand() % 256)) 10 | 11 | int main(void) { 12 | int N = 5000 * SIMDBlockSize, gap; 13 | __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); 14 | uint32_t *datain = malloc(N * sizeof(uint32_t)); 15 | uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); 16 | if (buffer == NULL || datain == NULL || backbuffer == NULL) { printf("out of memory\n"); free(buffer); free(datain); free(backbuffer); return -1; } 17 | srand(time(NULL)); 18 | 19 | for (gap = 1; gap <= 387420489; gap *= 3) { 20 | int k; 21 | printf(" gap = %d \n", gap); 22 | 23 | /* simulate some random character string, don't care about endianness */ 24 | for (k = 0; k < N; ++k) { 25 | uint8_t _tmp[4]; 26 | 27 | _tmp[0] = get_random_char(); 28 | _tmp[1] = get_random_char(); 29 | _tmp[2] = get_random_char(); 30 | _tmp[3] = get_random_char(); 31 | 32 | memmove(&datain[k], _tmp, 4); 33 | } 34 | for (k = 0; k * SIMDBlockSize < N; ++k) { 35 | /* 36 | First part works for general arrays (sorted or unsorted) 37 | */ 38 | int j; 39 | /* we compute the bit width */ 40 | const uint32_t b = maxbits(datain + k * SIMDBlockSize); 41 | /* we read 128 integers at "datain + k * SIMDBlockSize" and 42 | write b 128-bit vectors at "buffer" */ 43 | simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); 44 | /* we read back b 128-bit vectors at "buffer" and write 128 integers at 45 | * backbuffer */ 46 | simdunpack(buffer, backbuffer, b); /* uncompressed */ 47 | for (j = 0; j < SIMDBlockSize; ++j) { 48 | uint8_t chars_back[4]; 49 | uint8_t chars_in[4]; 50 | 51 | memmove(chars_back, &backbuffer[j], 4); 52 | memmove(chars_in, &datain[k * SIMDBlockSize + j], 4); 53 | 54 | if (chars_in[0] != chars_back[0] || chars_in[1] != chars_back[1] || 55 | chars_in[2] != chars_back[2] || chars_in[3] != chars_back[3]) { 56 | printf("bug in simdpack\n"); 57 | return -2; 58 | } 59 | } 60 | 61 | { 62 | /* 63 | next part assumes that the data is sorted (uses differential coding) 64 | */ 65 | uint32_t offset = 0; 66 | /* we compute the bit width */ 67 |
const uint32_t b1 = simdmaxbitsd1(offset, datain + k * SIMDBlockSize); 68 | /* we read 128 integers at "datain + k * SIMDBlockSize" and 69 | write b1 128-bit vectors at "buffer" */ 70 | simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, b1); 71 | /* we read back b1 128-bit vectors at "buffer" and write 128 integers at 72 | * backbuffer */ 73 | simdunpackd1(offset, buffer, backbuffer, b1); 74 | for (j = 0; j < SIMDBlockSize; ++j) { 75 | uint8_t chars_back[4]; 76 | uint8_t chars_in[4]; 77 | 78 | memmove(chars_back, &backbuffer[j], 4); 79 | memmove(chars_in, &datain[k * SIMDBlockSize + j], 4); 80 | 81 | if (chars_in[0] != chars_back[0] || chars_in[1] != chars_back[1] || 82 | chars_in[2] != chars_back[2] || chars_in[3] != chars_back[3]) { 83 | printf("bug in simdpackd1\n"); 84 | return -3; 85 | } 86 | } 87 | offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; /* NOTE(review): dead store -- offset is re-declared as 0 for every block */ 88 | } 89 | } 90 | } 91 | free(buffer); 92 | free(datain); 93 | free(backbuffer); 94 | printf("Code looks good.\n"); 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /include/simdintegratedbitpacking.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 3 | */ 4 | 5 | #ifndef SIMD_INTEGRATED_BITPACKING_H 6 | #define SIMD_INTEGRATED_BITPACKING_H 7 | 8 | #include "portability.h" 9 | 10 | /* SSE2 is required */ 11 | #include 12 | 13 | #include "simdbitpacking.h" 14 | #include "simdcomputil.h" 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | /* reads 128 values from "in", writes "bit" 128-bit vectors to "out" 21 | integer values should be in sorted order (for best results). 22 | The differences are masked so that only the least significant "bit" bits are 23 | used.
*/ 24 | void simdpackd1(uint32_t initvalue, const uint32_t *in, __m128i *out, 25 | const uint32_t bit); 26 | 27 | /* reads 128 values from "in", writes "bit" 128-bit vectors to "out" 28 | integer values should be in sorted order (for best results). 29 | The difference values are assumed to be less than 1<= |key|, and returns its position. It is 39 | *assumed that the values stored are in sorted order. The encoded key is stored 40 | *in "*presult". If no value is larger or equal to the key, 128 is returned. The 41 | *pointer initOffset is a pointer to the last four value decoded (when starting 42 | *out, this can be a zero vector or initialized with _mm_set1_epi32(init)), and 43 | *the vector gets updated. 44 | **/ 45 | int simdsearchd1(__m128i *initOffset, const __m128i *in, uint32_t bit, 46 | uint32_t key, uint32_t *presult); 47 | 48 | /* searches "bit" 128-bit vectors from "in" (= length<=128 encoded integers) for 49 | * the first encoded uint32 value which is >= |key|, and returns its position. 50 | * It is assumed that the values stored are in sorted order. The encoded key is 51 | * stored in "*presult". The first length decoded integers, ignoring others. If 52 | * no value is larger or equal to the key, length is returned. Length should be 53 | * no larger than 128. 54 | * 55 | * If no value is larger or equal to the key, 56 | * length is returned */ 57 | int simdsearchwithlengthd1(uint32_t initvalue, const __m128i *in, uint32_t bit, 58 | int length, uint32_t key, uint32_t *presult); 59 | 60 | /* returns the value stored at the specified "slot". 61 | * */ 62 | uint32_t simdselectd1(uint32_t initvalue, const __m128i *in, uint32_t bit, 63 | int slot); 64 | 65 | /* given a block of 128 packed values, this function sets the value at index 66 | * "index" to "value", you must somehow know the previous value. Because of 67 | * differential coding, all following values are incremented by the offset 68 | * between this new value and the old value... 
This functions is useful if you 69 | * want to modify the last value. 70 | */ 71 | void simdfastsetd1fromprevious(__m128i *in, uint32_t bit, 72 | uint32_t previousvalue, uint32_t value, 73 | size_t index); 74 | 75 | /* given a block of 128 packed values, this function sets the value at index 76 | * "index" to "value", This function computes the previous value if needed. 77 | * Because of differential coding, all following values are incremented by the 78 | * offset between this new value and the old value... This functions is useful 79 | * if you want to modify the last value. 80 | */ 81 | void simdfastsetd1(uint32_t initvalue, __m128i *in, uint32_t bit, 82 | uint32_t value, size_t index); 83 | 84 | /*Simply scan the data 85 | * The pointer initOffset is a pointer to the last four value decoded 86 | * (when starting out, this can be a zero vector or initialized with 87 | * _mm_set1_epi32(init);), and the vector gets updated. 88 | * */ 89 | 90 | void simdscand1(__m128i *initOffset, const __m128i *in, uint32_t bit); 91 | 92 | #ifdef __cplusplus 93 | } // extern "C" 94 | #endif 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | 2 | version: "{branch}.build.{build}" 3 | 4 | clone_folder: c:\projects\simdcomp 5 | 6 | #cache: 7 | # c:\build-cache -> .appveyor.yml 8 | 9 | environment: 10 | matrix: 11 | - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 12 | ARCH: x64 13 | # looks like vc14 has trouble with code on x86, at least on the AppVeyor image 14 | # - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 15 | # ARCH: x86 16 | - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 17 | ARCH: x64 18 | - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 19 | ARCH: x86 20 | 21 | build_script: 22 | ps: | 23 | cd c:\projects\simdcomp 24 | echo "" | Out-File -Encoding "ASCII" task.bat 25 | if ('Visual Studio 2015' -eq 
$env:APPVEYOR_BUILD_WORKER_IMAGE) { 26 | $VC = 14; 27 | $vs_shell_cmd = 'call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" ' + $env:ARCH + ' 2>&1' 28 | } elseif ('Visual Studio 2017' -eq $env:APPVEYOR_BUILD_WORKER_IMAGE) { 29 | $VC = 15; 30 | if ('x64' -eq $env:ARCH) { 31 | $vs_shell_cmd = 'call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" 2>&1' 32 | } else { 33 | $vs_shell_cmd = 'call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars32.bat" 2>&1' 34 | } 35 | } 36 | mkdir 'c:\tmp_pack' 37 | echo $vs_shell_cmd | Out-File -Encoding "ASCII" -Append task.bat 38 | $move_cmd = 'move *.zip c:\tmp_pack' 39 | if ($VC -gt 14) { 40 | # these won't be tested, just build and upload artifact, vc15 only 41 | $cmd = 'nmake /nologo /f makefile.vc AVX512=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' 2>&1' 42 | echo $cmd | Out-File -Encoding "ASCII" -Append task.bat 43 | $cmd = 'nmake /nologo /f makefile.vc AVX512=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' pack 2>&1' 44 | echo $cmd | Out-File -Encoding "ASCII" -Append task.bat 45 | echo $move_cmd | Out-File -Encoding "ASCII" -Append task.bat 46 | echo 'nmake /nologo /f makefile.vc clean' | Out-File -Encoding "ASCII" -Append task.bat 47 | $cmd = 'nmake /nologo /f makefile.vc AVX2=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' 2>&1' 48 | echo $cmd | Out-File -Encoding "ASCII" -Append task.bat 49 | $cmd = 'nmake /nologo /f makefile.vc AVX2=yes PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' pack 2>&1' 50 | echo $cmd | Out-File -Encoding "ASCII" -Append task.bat 51 | echo $move_cmd | Out-File -Encoding "ASCII" -Append task.bat 52 | echo 'nmake /nologo /f makefile.vc clean' | Out-File -Encoding "ASCII" -Append task.bat 53 | } 54 | $cmd = 'nmake 
/nologo /f makefile.vc PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' 2>&1' 55 | echo $cmd | Out-File -Encoding "ASCII" -Append task.bat 56 | $cmd = 'nmake /nologo /f makefile.vc PKG_VERSION=' + $env:APPVEYOR_REPO_COMMIT.substring(0, 8) + ' MACHINE=' + $env:ARCH + ' pack 2>&1' 57 | echo $cmd | Out-File -Encoding "ASCII" -Append task.bat 58 | echo $move_cmd | Out-File -Encoding "ASCII" -Append task.bat 59 | $here = (Get-Item -Path "." -Verbose).FullName 60 | $task = $here + '\task.bat' 61 | & $task 62 | 63 | after_build: 64 | ps: | 65 | Get-ChildItem 'c:\tmp_pack' -Filter *.zip | 66 | Foreach-Object { 67 | Push-AppveyorArtifact $_.FullName 68 | } 69 | 70 | test_script: 71 | ps: | 72 | cd c:\projects\simdcomp 73 | echo "" | Out-File -Encoding "ASCII" task.bat 74 | $here = (Get-Item -Path "." -Verbose).FullName 75 | echo '.\unit.exe' | Out-File -Encoding "ASCII" -Append task.bat 76 | $task = $here + '\task.bat' 77 | & $task 78 | 79 | -------------------------------------------------------------------------------- /makefile.vc: -------------------------------------------------------------------------------- 1 | 2 | !IFNDEF MACHINE 3 | !IF "$(PROCESSOR_ARCHITECTURE)"=="AMD64" 4 | MACHINE=x64 5 | !ELSE 6 | MACHINE=x86 7 | !ENDIF 8 | !ENDIF 9 | 10 | !IFNDEF VC 11 | VC=vc%VisualStudioVersion:~0,-2% 12 | !ENDIF 13 | 14 | # catch up when there's a stronger versioning 15 | !IFNDEF PKG_VERSION 16 | PKG_VERSION=latest 17 | !ENDIF 18 | 19 | !IFNDEF DEBUG 20 | DEBUG=no 21 | !ENDIF 22 | 23 | !IFNDEF CC 24 | CC=cl.exe 25 | !ENDIF 26 | 27 | !IFNDEF AR 28 | AR=lib.exe 29 | !ENDIF 30 | 31 | !IFNDEF LINK 32 | LINK=link.exe 33 | !ENDIF 34 | 35 | !IFNDEF PGO 36 | PGO=no 37 | !ENDIF 38 | 39 | !IFNDEF PGI 40 | PGI=no 41 | !ENDIF 42 | 43 | INC = /Iinclude 44 | 45 | !IF "$(DEBUG)"=="yes" 46 | CFLAGS = /nologo /MDd /LDd /Od /Zi /D_DEBUG /RTC1 /W3 /GS /Gm /D __SSE4_1__=1 47 | ARFLAGS = /nologo 48 | LDFLAGS = /nologo /debug /nodefaultlib:msvcrt 49 | 
!ELSE 50 | CFLAGS = /nologo /MD /O2 /Zi /DNDEBUG /W3 /Gm- /GS /Gy /Oi /GL /MP /D __SSE4_1__=1 51 | ARFLAGS = /nologo /LTCG 52 | LDFLAGS = /nologo /LTCG /DYNAMICBASE /incremental:no /debug /opt:ref,icf 53 | !ENDIF 54 | 55 | !IF "$(PGI)"=="yes" 56 | LDFLAGS = $(LDFLAGS) /ltcg:pgi 57 | !ENDIF 58 | 59 | !IF "$(PGO)"=="yes" 60 | LDFLAGS = $(LDFLAGS) /ltcg:pgo 61 | !ENDIF 62 | 63 | # SSE4.1 is required 64 | # VC++15.3 supports AVX512 65 | !IF "$(AVX512)"=="yes" 66 | CFLAGS = $(CFLAGS) /arch:AVX2 /D __AVX2__=1 /D __AVX512F__=1 67 | AVX2=yes 68 | !ELSEIF "$(AVX2)"=="yes" 69 | CFLAGS = $(CFLAGS) /arch:AVX2 /D __AVX2__=1 70 | !ENDIF 71 | 72 | LIB_OBJS = simdbitpacking.obj simdintegratedbitpacking.obj simdcomputil.obj \ 73 | simdpackedsearch.obj simdpackedselect.obj simdfor.obj 74 | 75 | LIB_SRCS = src/simdbitpacking.c src/simdintegratedbitpacking.c src/simdcomputil.c \ 76 | src/simdpackedsearch.c src/simdpackedselect.c src/simdfor.c 77 | 78 | PKG_FEATURES=sse4.1 79 | 80 | !IF "$(AVX2)"=="yes" 81 | LIB_OBJS = $(LIB_OBJS) avxbitpacking.obj 82 | LIB_SRCS = $(LIB_SRCS) src/avxbitpacking.c 83 | PKG_FEATURES=avx2 84 | !ENDIF 85 | 86 | !IF "$(AVX512)"=="yes" 87 | LIB_OBJS = $(LIB_OBJS) avx512bitpacking.obj 88 | LIB_SRCS = $(LIB_SRCS) src/avx512bitpacking.c 89 | PKG_FEATURES=avx512 90 | !ENDIF 91 | 92 | 93 | all: lib dll dynunit unit_chars example benchmarks 94 | # need some good use case scenario to train the instrumented build 95 | @if "$(PGI)"=="yes" echo Running PGO training 96 | @if "$(PGI)"=="yes" benchmark.exe >nul 2>&1 97 | # @if "$(PGI)"=="yes" bitpackingbenchmark.exe >nul 2>&1 98 | @if "$(PGI)"=="yes" example.exe >nul 2>&1 99 | 100 | 101 | $(LIB_OBJS): 102 | $(CC) $(INC) $(CFLAGS) /c $(LIB_SRCS) 103 | 104 | lib: $(LIB_OBJS) 105 | @copy simdcomp.def.tpl simdcomp.def 106 | @if "$(AVX2)"=="yes" echo avxunpack >> simdcomp.def 107 | @if "$(AVX2)"=="yes" echo avxpackwithoutmask >> simdcomp.def 108 | @if "$(AVX2)"=="yes" echo avxpack >> simdcomp.def 109 | @if "$(AVX2)"=="yes" 
echo avxmaxbits >> simdcomp.def 110 | @if "$(AVX512)"=="yes" echo avx512unpack >> simdcomp.def 111 | @if "$(AVX512)"=="yes" echo avx512packwithoutmask >> simdcomp.def 112 | @if "$(AVX512)"=="yes" echo avx512pack >> simdcomp.def 113 | @if "$(AVX512)"=="yes" echo avx512maxbits >> simdcomp.def 114 | $(AR) $(ARFLAGS) /OUT:simdcomp_a.lib $(LIB_OBJS) 115 | 116 | dll: $(LIB_OBJS) 117 | $(LINK) /DLL $(LDFLAGS) /OUT:simdcomp.dll /IMPLIB:simdcomp.lib /DEF:simdcomp.def $(LIB_OBJS) 118 | 119 | unit: lib 120 | $(CC) $(INC) $(CFLAGS) /c tests/unit.c 121 | $(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp_a.lib 122 | 123 | dynunit: dll 124 | $(CC) $(INC) $(CFLAGS) /c tests/unit.c 125 | $(LINK) $(LDFLAGS) /OUT:unit.exe unit.obj simdcomp.lib 126 | 127 | unit_chars: lib 128 | $(CC) $(INC) $(CFLAGS) /c tests/unit_chars.c 129 | $(LINK) $(LDFLAGS) /OUT:unit_chars.exe unit_chars.obj simdcomp.lib 130 | 131 | 132 | example: lib 133 | $(CC) $(INC) $(CFLAGS) /c example.c 134 | $(LINK) $(LDFLAGS) /OUT:example.exe example.obj simdcomp.lib 135 | 136 | benchmarks: lib 137 | $(CC) $(INC) $(CFLAGS) /c benchmarks/benchmark.c 138 | $(LINK) $(LDFLAGS) /OUT:benchmark.exe benchmark.obj simdcomp.lib 139 | # $(CC) $(INC) $(CFLAGS) /c benchmarks/bitpackingbenchmark.c 140 | # $(LINK) $(LDFLAGS) /OUT:bitpackingbenchmark.exe bitpackingbenchmark.obj simdcomp.lib 141 | 142 | pack: 143 | mkdir .\package 144 | cd .\package 145 | mkdir .\include 146 | mkdir .\bin 147 | mkdir .\lib 148 | copy ..\include\*.h .\include 149 | copy ..\simdcomp.dll .\bin 150 | copy ..\simdcomp.pdb .\bin 151 | copy ..\simdcomp.lib .\lib 152 | copy ..\simdcomp_a.lib .\lib 153 | copy ..\LICENSE . 154 | copy ..\README.md . 155 | 7z a ..\simdcomp-$(PKG_VERSION)-$(PKG_FEATURES)-$(VC)-$(MACHINE).zip . 156 | cd .. 
157 | powershell -Command "Remove-Item -Recurse -Force .\package" 158 | 159 | clean: 160 | powershell -Command "Remove-Item -Force *.obj" 161 | powershell -Command "Remove-Item -Force *.lib" 162 | powershell -Command "Remove-Item -Force *.exe" 163 | powershell -Command "Remove-Item -Force *.dll" 164 | powershell -Command "Remove-Item -Force *.pgc" 165 | powershell -Command "Remove-Item -Force *.pgd" 166 | powershell -Command "Remove-Item -Force *.pdb" 167 | powershell -Command "Remove-Item -Force *.def" 168 | 169 | -------------------------------------------------------------------------------- /scripts/simdfor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | from math import ceil 5 | 6 | print(""" 7 | /** 8 | * Blablabla 9 | * 10 | */ 11 | 12 | """); 13 | 14 | def mask(bit): 15 | return str((1 << bit) - 1) 16 | 17 | for length in [32]: 18 | print(""" 19 | static __m128i iunpackFOR0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { 20 | __m128i *out = (__m128i*)(_out); 21 | int i; 22 | (void) _in; 23 | for (i = 0; i < 8; ++i) { 24 | _mm_store_si128(out++, initOffset); 25 | _mm_store_si128(out++, initOffset); 26 | _mm_store_si128(out++, initOffset); 27 | _mm_store_si128(out++, initOffset); 28 | } 29 | 30 | return initOffset; 31 | } 32 | 33 | """) 34 | print(""" 35 | 36 | static void ipackFOR0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { 37 | (void) initOffset; 38 | (void) _in; 39 | (void) out; 40 | } 41 | """) 42 | for bit in range(1,33): 43 | offsetVar = " initOffset"; 44 | print(""" 45 | static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t * _in, __m128i * out) { 46 | const __m128i *in = (const __m128i*)(_in); 47 | __m128i OutReg; 48 | 49 | """); 50 | 51 | if (bit != 32): 52 | print(" __m128i CurrIn = _mm_load_si128(in);"); 53 | print(" __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);"); 54 | else: 55 | print(" __m128i InReg = 
_mm_load_si128(in);"); 56 | print(" (void) initOffset;"); 57 | 58 | 59 | inwordpointer = 0 60 | valuecounter = 0 61 | for k in range(ceil((length * bit) / 32)): 62 | if(valuecounter == length): break 63 | for x in range(inwordpointer,32,bit): 64 | if(x!=0) : 65 | print(" OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));"); 66 | else: 67 | print(" OutReg = InReg; "); 68 | if((x+bit>=32) ): 69 | while(inwordpointer<32): 70 | inwordpointer += bit 71 | print(" _mm_store_si128(out, OutReg);"); 72 | print(""); 73 | 74 | if(valuecounter + 1 < length): 75 | print(" ++out;") 76 | inwordpointer -= 32; 77 | if(inwordpointer>0): 78 | print(" OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");"); 79 | if(valuecounter + 1 < length): 80 | print(" ++in;") 81 | 82 | if (bit != 32): 83 | print(" CurrIn = _mm_load_si128(in);"); 84 | print(" InReg = _mm_sub_epi32(CurrIn, initOffset);"); 85 | else: 86 | print(" InReg = _mm_load_si128(in);"); 87 | print(""); 88 | valuecounter = valuecounter + 1 89 | if(valuecounter == length): break 90 | assert(valuecounter == length) 91 | print("\n}\n\n""") 92 | 93 | for bit in range(1,32): 94 | offsetVar = " initOffset"; 95 | print("""\n 96 | static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const __m128i* in, uint32_t * _out) { 97 | """); 98 | print(""" __m128i* out = (__m128i*)(_out); 99 | __m128i InReg = _mm_load_si128(in); 100 | __m128i OutReg; 101 | __m128i tmp; 102 | const __m128i mask = _mm_set1_epi32((1U<<"""+str(bit)+""")-1); 103 | 104 | """); 105 | 106 | MainText = ""; 107 | 108 | MainText += "\n"; 109 | inwordpointer = 0 110 | valuecounter = 0 111 | for k in range(ceil((length * bit) / 32)): 112 | for x in range(inwordpointer,32,bit): 113 | if(valuecounter == length): break 114 | if (x > 0): 115 | MainText += " tmp = _mm_srli_epi32(InReg," + str(x) +");\n"; 116 | else: 117 | MainText += " tmp = InReg;\n"; 118 | if(x+bit<32): 119 | MainText += " OutReg = _mm_and_si128(tmp, 
mask);\n"; 120 | else: 121 | MainText += " OutReg = tmp;\n"; 122 | if((x+bit>=32) ): 123 | while(inwordpointer<32): 124 | inwordpointer += bit 125 | if(valuecounter + 1 < length): 126 | MainText += " ++in;" 127 | MainText += " InReg = _mm_load_si128(in);\n"; 128 | inwordpointer -= 32; 129 | if(inwordpointer>0): 130 | MainText += " OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n"; 131 | if (bit != 32): 132 | MainText += " OutReg = _mm_add_epi32(OutReg, initOffset);\n"; 133 | MainText += " _mm_store_si128(out++, OutReg);\n\n"; 134 | MainText += ""; 135 | valuecounter = valuecounter + 1 136 | if(valuecounter == length): break 137 | assert(valuecounter == length) 138 | print(MainText) 139 | print(" return initOffset;"); 140 | print("\n}\n\n") 141 | print(""" 142 | static __m128i iunpackFOR32(__m128i initvalue , const __m128i* in, uint32_t * _out) { 143 | __m128i * mout = (__m128i *)_out; 144 | __m128i invec; 145 | size_t k; 146 | for(k = 0; k < 128/4; ++k) { 147 | invec = _mm_load_si128(in++); 148 | _mm_store_si128(mout++, invec); 149 | } 150 | return invec; 151 | } 152 | """) 153 | -------------------------------------------------------------------------------- /benchmarks/benchmark.c: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 
3 | */ 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "simdcomp.h" 10 | 11 | #ifdef _MSC_VER 12 | # include 13 | 14 | __int64 freq; 15 | 16 | typedef __int64 time_snap_t; 17 | 18 | static time_snap_t time_snap(void) 19 | { 20 | __int64 now; 21 | 22 | QueryPerformanceCounter((LARGE_INTEGER *)&now); 23 | 24 | return (__int64)((now*1000000)/freq); 25 | } 26 | # define TIME_SNAP_FMT "%I64d" 27 | #else 28 | # define time_snap clock 29 | # define TIME_SNAP_FMT "%lu" 30 | typedef clock_t time_snap_t; 31 | #endif 32 | 33 | 34 | void benchmarkSelect() { 35 | uint32_t buffer[128]; 36 | uint32_t backbuffer[128]; 37 | uint32_t initial = 33; 38 | uint32_t b; 39 | time_snap_t S1, S2, S3; 40 | int i; 41 | printf("benchmarking select \n"); 42 | 43 | /* this test creates delta encoded buffers with different bits, then 44 | * performs lower bound searches for each key */ 45 | for (b = 0; b <= 32; b++) { 46 | uint32_t prev = initial; 47 | uint32_t out[128]; 48 | /* initialize the buffer */ 49 | for (i = 0; i < 128; i++) { 50 | buffer[i] = ((uint32_t)(1655765 * i )) ; 51 | if(b < 32) buffer[i] %= (1< *ib) 93 | return 1; 94 | return 0; 95 | } 96 | 97 | /* adapted from wikipedia */ 98 | int binary_search(uint32_t * A, uint32_t key, int imin, int imax) 99 | { 100 | int imid; 101 | imax --; 102 | while(imin + 1 < imax) { 103 | imid = imin + ((imax - imin) / 2); 104 | 105 | if (A[imid] > key) { 106 | imax = imid; 107 | } else if (A[imid] < key) { 108 | imin = imid; 109 | } else { 110 | return imid; 111 | } 112 | } 113 | return imax; 114 | } 115 | 116 | 117 | /* adapted from wikipedia */ 118 | int lower_bound(uint32_t * A, uint32_t key, int imin, int imax) 119 | { 120 | int imid; 121 | imax --; 122 | while(imin + 1 < imax) { 123 | imid = imin + ((imax - imin) / 2); 124 | 125 | if (A[imid] >= key) { 126 | imax = imid; 127 | } else if (A[imid] < key) { 128 | imin = imid; 129 | } 130 | } 131 | if(A[imin] >= key) return imin; 132 | return imax; 133 | } 134 | 135 | void 
benchmarkSearch() { 136 | uint32_t buffer[128]; 137 | uint32_t backbuffer[128]; 138 | uint32_t out[128]; 139 | uint32_t result, initial = 0; 140 | uint32_t b, i; 141 | time_snap_t S1, S2, S3, S4; 142 | 143 | printf("benchmarking search \n"); 144 | 145 | /* this test creates delta encoded buffers with different bits, then 146 | * performs lower bound searches for each key */ 147 | for (b = 0; b <= 32; b++) { 148 | uint32_t prev = initial; 149 | /* initialize the buffer */ 150 | for (i = 0; i < 128; i++) { 151 | buffer[i] = ((uint32_t)rand()) ; 152 | if(b < 32) buffer[i] %= (1< 0) { 188 | if(buffer[pos-1] >= pseudorandomkey) 189 | printf("bug B.\n"); 190 | } 191 | } 192 | S2 = time_snap(); 193 | for (i = 0; i < 128 * 10; i++) { 194 | int pos; 195 | uint32_t pseudorandomkey = buffer[i%128]; 196 | simdunpackd1(initial, (__m128i *)out, backbuffer, b); 197 | pos = lower_bound(backbuffer, pseudorandomkey, 0, 128); 198 | result = backbuffer[pos]; 199 | 200 | if((result < pseudorandomkey) || (buffer[pos] != result)) { 201 | printf("bug C.\n"); 202 | } else if (pos > 0) { 203 | if(buffer[pos-1] >= pseudorandomkey) 204 | printf("bug D.\n"); 205 | } 206 | } 207 | S3 = time_snap(); 208 | for (i = 0; i < 128 * 10; i++) { 209 | 210 | int pos; 211 | uint32_t pseudorandomkey = buffer[i%128]; 212 | pos = simdsearchwithlengthd1(initial, (__m128i *)out, b, 128, 213 | pseudorandomkey, &result); 214 | if((result < pseudorandomkey) || (buffer[pos] != result)) { 215 | printf("bug A.\n"); 216 | } else if (pos > 0) { 217 | if(buffer[pos-1] >= pseudorandomkey) 218 | printf("bug B.\n"); 219 | } 220 | } 221 | S4 = time_snap(); 222 | 223 | printf("bit width = %d, fast search function time = " TIME_SNAP_FMT ", naive time = " TIME_SNAP_FMT " , fast with length time = " TIME_SNAP_FMT " \n", b, (S2-S1), (S3-S2), (S4-S3) ); 224 | } 225 | } 226 | 227 | 228 | int main() { 229 | #ifdef _MSC_VER 230 | QueryPerformanceFrequency((LARGE_INTEGER *)&freq); 231 | #endif 232 | benchmarkSearch(); 233 | 
benchmarkSelect(); 234 | return 0; 235 | } 236 | -------------------------------------------------------------------------------- /example.c: -------------------------------------------------------------------------------- 1 | /* Type "make example" to build this example program. */ 2 | #include 3 | #include 4 | #include 5 | #include "simdcomp.h" 6 | 7 | /** 8 | We provide several different code examples. 9 | **/ 10 | 11 | 12 | /* very simple test to illustrate a simple application */ 13 | int compress_decompress_demo() { 14 | size_t k, N = 9999; 15 | __m128i * endofbuf; 16 | int howmanybytes; 17 | float compratio; 18 | uint32_t * datain = malloc(N * sizeof(uint32_t)); 19 | uint8_t * buffer; 20 | uint32_t * backbuffer = malloc(N * sizeof(uint32_t)); 21 | uint32_t b; 22 | printf("== simple test\n"); 23 | 24 | for (k = 0; k < N; ++k) { /* start with k=0, not k=1! */ 25 | datain[k] = k; 26 | } 27 | 28 | b = maxbits_length(datain, N); 29 | buffer = malloc(simdpack_compressedbytes(N,b)); 30 | endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b); 31 | howmanybytes = (endofbuf-(__m128i *)buffer)*sizeof(__m128i); /* number of compressed bytes */ 32 | compratio = N*sizeof(uint32_t) * 1.0 / howmanybytes; 33 | /* endofbuf points to the end of the compressed data */ 34 | buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); /* optional but safe. */ 35 | printf("Compressed %d integers down to %d bytes (comp. ratio = %f).\n",(int)N,howmanybytes,compratio); 36 | /* in actual applications b must be stored and retrieved: caller is responsible for that. 
*/ 37 | simdunpack_length((const __m128i *)buffer, N, backbuffer, b); /* will return a pointer to endofbuf */ 38 | 39 | for (k = 0; k < N; ++k) { 40 | if(datain[k] != backbuffer[k]) { 41 | printf("bug at %lu \n",(unsigned long)k); 42 | return -1; 43 | } 44 | } 45 | printf("Code works!\n"); 46 | free(datain); 47 | free(buffer); 48 | free(backbuffer); 49 | return 0; 50 | } 51 | 52 | 53 | 54 | /* compresses data from datain to buffer, returns how many bytes written 55 | used below in simple_demo */ 56 | size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) { 57 | uint32_t offset; 58 | uint8_t * initout; 59 | size_t k; 60 | if(length/SIMDBlockSize*SIMDBlockSize != length) { 61 | printf("Data length should be a multiple of %i \n",SIMDBlockSize); 62 | } 63 | offset = 0; 64 | initout = buffer; 65 | for(k = 0; k < length / SIMDBlockSize; ++k) { 66 | uint32_t b = simdmaxbitsd1(offset, 67 | datain + k * SIMDBlockSize); 68 | *buffer++ = b; 69 | simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer, 70 | b); 71 | offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; 72 | buffer += b * sizeof(__m128i); 73 | } 74 | return buffer - initout; 75 | } 76 | 77 | /* Another illustration ... 
*/ 78 | void simple_demo() { 79 | size_t REPEAT = 10, gap; 80 | size_t N = 1000 * SIMDBlockSize;/* SIMDBlockSize is 128 */ 81 | uint32_t * datain = malloc(N * sizeof(uint32_t)); 82 | size_t compsize; 83 | clock_t start, end; 84 | uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); /* output buffer */ 85 | uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); 86 | printf("== simple demo\n"); 87 | for (gap = 1; gap <= 243; gap *= 3) { 88 | size_t k, repeat; 89 | uint32_t offset = 0; 90 | uint32_t bogus = 0; 91 | double numberofseconds; 92 | 93 | printf("\n"); 94 | printf(" gap = %lu \n", (unsigned long) gap); 95 | datain[0] = 0; 96 | for (k = 1; k < N; ++k) 97 | datain[k] = datain[k-1] + ( rand() % (gap + 1) ); 98 | compsize = compress(datain,N,buffer); 99 | printf("compression ratio = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 )); 100 | start = clock(); 101 | for(repeat = 0; repeat < REPEAT; ++repeat) { 102 | uint8_t * decbuffer = buffer; 103 | for (k = 0; k * SIMDBlockSize < N; ++k) { 104 | uint8_t b = *decbuffer++; 105 | simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b); 106 | /* do something here with backbuffer */ 107 | bogus += backbuffer[3]; 108 | decbuffer += b * sizeof(__m128i); 109 | offset = backbuffer[SIMDBlockSize - 1]; 110 | } 111 | } 112 | end = clock(); 113 | numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; 114 | printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); 115 | start = clock(); 116 | for(repeat = 0; repeat < REPEAT; ++repeat) { 117 | uint8_t * decbuffer = buffer; 118 | for (k = 0; k * SIMDBlockSize < N; ++k) { 119 | memcpy(backbuffer,decbuffer+k*SIMDBlockSize,SIMDBlockSize*sizeof(uint32_t)); 120 | bogus += backbuffer[3] - backbuffer[100]; 121 | } 122 | } 123 | end = clock(); 124 | numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; 125 | printf("memcpy speed in million of integers per second %f 
\n",N*REPEAT/(numberofseconds*1000.0*1000.0)); 126 | printf("ignore me %i \n",bogus); 127 | printf("All tests are in CPU cache. Avoid out-of-cache decoding in applications.\n"); 128 | } 129 | free(buffer); 130 | free(datain); 131 | free(backbuffer); 132 | } 133 | 134 | /* Used below in more_sophisticated_demo ... */ 135 | size_t varying_bit_width_compress(uint32_t * datain, size_t length, uint8_t * buffer) { 136 | uint8_t * initout; 137 | size_t k; 138 | if(length/SIMDBlockSize*SIMDBlockSize != length) { 139 | printf("Data length should be a multiple of %i \n",SIMDBlockSize); 140 | } 141 | initout = buffer; 142 | for(k = 0; k < length / SIMDBlockSize; ++k) { 143 | uint32_t b = maxbits(datain); 144 | *buffer++ = b; 145 | simdpackwithoutmask(datain, (__m128i *)buffer, b); 146 | datain += SIMDBlockSize; 147 | buffer += b * sizeof(__m128i); 148 | } 149 | return buffer - initout; 150 | } 151 | 152 | /* Here we compress the data in blocks of 128 integers with varying bit width */ 153 | int varying_bit_width_demo() { 154 | size_t nn = 128 * 2; 155 | uint32_t * datainn = malloc(nn * sizeof(uint32_t)); 156 | uint8_t * buffern = malloc(nn * sizeof(uint32_t) + nn / SIMDBlockSize); 157 | uint8_t * initbuffern = buffern; 158 | uint32_t * backbuffern = malloc(nn * sizeof(uint32_t)); 159 | size_t k, compsize; 160 | printf("== varying bit-width demo\n"); 161 | 162 | for(k=0; k 1): 27 | return "s" 28 | else : 29 | return "" 30 | 31 | print("") 32 | print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {"); 33 | print(" (void)compressed;"); 34 | print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); 35 | print("}"); 36 | print("") 37 | 38 | for bit in range(1,33): 39 | print("") 40 | print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) 41 | print("static void avxpackblock{0}(const uint32_t * pin, __m256i * 
compressed) {{".format(bit)); 42 | print(" const __m256i * in = (const __m256i *) pin;"); 43 | print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); 44 | if(howmanywords(bit) == 1): 45 | print(" __m256i w0;") 46 | else: 47 | print(" __m256i w0, w1;") 48 | if( (bit & (bit-1)) <> 0) : print(" __m256i tmp; /* used to store inputs at word boundary */") 49 | oldword = 0 50 | for j in range(howmany(bit)/8): 51 | firstword = j * bit / 32 52 | if(firstword > oldword): 53 | print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2)) 54 | oldword = firstword 55 | secondword = (j * bit + bit - 1)/32 56 | firstshift = (j*bit) % 32 57 | if( firstword == secondword): 58 | if(firstshift == 0): 59 | print(" w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j)) 60 | else: 61 | print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift)) 62 | else: 63 | print(" tmp = _mm256_lddqu_si256 (in + {0});".format(j)) 64 | print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift)) 65 | secondshift = 32-firstshift 66 | print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) 67 | print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2)) 68 | print("}"); 69 | print("") 70 | 71 | 72 | print("") 73 | print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {"); 74 | print(" (void)compressed;"); 75 | print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); 76 | print("}"); 77 | print("") 78 | 79 | for bit in range(1,33): 80 | print("") 81 | print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) 82 | print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * 
compressed) {{".format(bit)); 83 | print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); 84 | if(howmanywords(bit) == 1): 85 | print(" __m256i w0;") 86 | else: 87 | print(" __m256i w0, w1;") 88 | print(" const __m256i * in = (const __m256i *) pin;"); 89 | if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1< 0) : print(" __m256i tmp; /* used to store inputs at word boundary */") 94 | oldword = 0 95 | for j in range(howmany(bit)/8): 96 | firstword = j * bit / 32 97 | if(firstword > oldword): 98 | print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2)) 99 | oldword = firstword 100 | secondword = (j * bit + bit - 1)/32 101 | firstshift = (j*bit) % 32 102 | loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j)) 103 | if( firstword == secondword): 104 | if(firstshift == 0): 105 | print(" w{0} = {1};".format(firstword%2,loadstr)) 106 | else: 107 | print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift)) 108 | else: 109 | print(" tmp = {0};".format(loadstr)) 110 | print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift)) 111 | secondshift = 32-firstshift 112 | print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) 113 | print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2)) 114 | print("}"); 115 | print("") 116 | 117 | 118 | print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {"); 119 | print(" (void) compressed;"); 120 | print(" memset(pout,0,{0});".format(howmany(0))); 121 | print("}"); 122 | print("") 123 | 124 | for bit in range(1,33): 125 | print("") 126 | print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) 127 | print("static void avxunpackblock{0}(const __m256i * compressed, 
uint32_t * pout) {{".format(bit)); 128 | print(" /* we are going to access {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); 129 | if(howmanywords(bit) == 1): 130 | print(" __m256i w0;") 131 | else: 132 | print(" __m256i w0, w1;") 133 | print(" __m256i * out = (__m256i *) pout;"); 134 | if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1< oldword): 143 | print(" w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword)) 144 | oldword = secondword 145 | firstshift = (j*bit) % 32 146 | firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") " 147 | if(firstshift == 0): 148 | firstshiftstr =" w{0} " # no need 149 | wfirst = firstshiftstr.format(firstword%2) 150 | if( firstword == secondword): 151 | if(firstshift + bit <> 32): 152 | wfirst = maskstr.format(wfirst) 153 | print(" _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst)) 154 | else: 155 | secondshift = (32-firstshift) 156 | wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift) 157 | wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond) 158 | wfirstorsecond = maskstr.format(wfirstorsecond) 159 | print(" _mm256_storeu_si256(out + {0},\n {1});".format(j,wfirstorsecond)) 160 | print("}"); 161 | print("") 162 | 163 | 164 | print("static avxpackblockfnc avxfuncPackArr[] = {") 165 | for bit in range(0,32): 166 | print("&avxpackblock{0},".format(bit)) 167 | print("&avxpackblock32") 168 | print("};") 169 | 170 | print("static avxpackblockfnc avxfuncPackMaskArr[] = {") 171 | for bit in range(0,32): 172 | print("&avxpackblockmask{0},".format(bit)) 173 | print("&avxpackblockmask32") 174 | print("};") 175 | 176 | 177 | print("static avxunpackblockfnc avxfuncUnpackArr[] = {") 178 | for bit in range(0,32): 179 | print("&avxunpackblock{0},".format(bit)) 180 | print("&avxunpackblock32") 181 | print("};") 182 | print("/** avxpacking **/") 183 | 
-------------------------------------------------------------------------------- /scripts/avx512packing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | def howmany(bit): 4 | """ how many values are we going to pack? """ 5 | return 512 6 | 7 | def howmanywords(bit): 8 | return (howmany(bit) * bit + 511)/512 9 | 10 | def howmanybytes(bit): 11 | return howmanywords(bit) * 32 12 | 13 | print(""" 14 | /** avx512packing **/ 15 | """) 16 | 17 | print("""typedef void (*avx512packblockfnc)(const uint32_t * pin, __m512i * compressed);""") 18 | print("""typedef void (*avx512unpackblockfnc)(const __m512i * compressed, uint32_t * pout);""") 19 | 20 | 21 | 22 | 23 | 24 | 25 | def plurial(number): 26 | if(number <> 1): 27 | return "s" 28 | else : 29 | return "" 30 | 31 | print("") 32 | print("static void avx512packblock0(const uint32_t * pin, __m512i * compressed) {"); 33 | print(" (void)compressed;"); 34 | print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); 35 | print("}"); 36 | print("") 37 | 38 | for bit in range(1,33): 39 | print("") 40 | print("/* we are going to pack {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) 41 | print("static void avx512packblock{0}(const uint32_t * pin, __m512i * compressed) {{".format(bit)); 42 | print(" const __m512i * in = (const __m512i *) pin;"); 43 | print(" /* we are going to touch {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); 44 | if(howmanywords(bit) == 1): 45 | print(" __m512i w0;") 46 | else: 47 | print(" __m512i w0, w1;") 48 | if( (bit & (bit-1)) <> 0) : print(" __m512i tmp; /* used to store inputs at word boundary */") 49 | oldword = 0 50 | for j in range(howmany(bit)/16): 51 | firstword = j * bit / 32 52 | if(firstword > oldword): 53 | print(" _mm512_storeu_si512(compressed + {0}, 
w{1});".format(oldword,oldword%2)) 54 | oldword = firstword 55 | secondword = (j * bit + bit - 1)/32 56 | firstshift = (j*bit) % 32 57 | if( firstword == secondword): 58 | if(firstshift == 0): 59 | print(" w{0} = _mm512_loadu_si512 (in + {1});".format(firstword%2,j)) 60 | else: 61 | print(" w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32(_mm512_loadu_si512 (in + {1}) , {2}));".format(firstword%2,j,firstshift)) 62 | else: 63 | print(" tmp = _mm512_loadu_si512 (in + {0});".format(j)) 64 | print(" w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift)) 65 | secondshift = 32-firstshift 66 | print(" w{0} = _mm512_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) 67 | print(" _mm512_storeu_si512(compressed + {0}, w{1});".format(secondword,secondword%2)) 68 | print("}"); 69 | print("") 70 | 71 | 72 | print("") 73 | print("static void avx512packblockmask0(const uint32_t * pin, __m512i * compressed) {"); 74 | print(" (void)compressed;"); 75 | print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0)))); 76 | print("}"); 77 | print("") 78 | 79 | for bit in range(1,33): 80 | print("") 81 | print("/* we are going to pack {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) 82 | print("static void avx512packblockmask{0}(const uint32_t * pin, __m512i * compressed) {{".format(bit)); 83 | print(" /* we are going to touch {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); 84 | if(howmanywords(bit) == 1): 85 | print(" __m512i w0;") 86 | else: 87 | print(" __m512i w0, w1;") 88 | print(" const __m512i * in = (const __m512i *) pin;"); 89 | if(bit < 32): print(" const __m512i mask = _mm512_set1_epi32({0});".format((1< 0) : print(" __m512i tmp; /* used to store inputs at word boundary */") 94 | oldword = 0 95 | for j in range(howmany(bit)/16): 96 | firstword = j * bit / 32 97 | if(firstword > 
oldword): 98 | print(" _mm512_storeu_si512(compressed + {0}, w{1});".format(oldword,oldword%2)) 99 | oldword = firstword 100 | secondword = (j * bit + bit - 1)/32 101 | firstshift = (j*bit) % 32 102 | loadstr = maskfnc(" _mm512_loadu_si512 (in + {0}) ".format(j)) 103 | if( firstword == secondword): 104 | if(firstshift == 0): 105 | print(" w{0} = {1};".format(firstword%2,loadstr)) 106 | else: 107 | print(" w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift)) 108 | else: 109 | print(" tmp = {0};".format(loadstr)) 110 | print(" w{0} = _mm512_or_si512(w{0},_mm512_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift)) 111 | secondshift = 32-firstshift 112 | print(" w{0} = _mm512_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift)) 113 | print(" _mm512_storeu_si512(compressed + {0}, w{1});".format(secondword,secondword%2)) 114 | print("}"); 115 | print("") 116 | 117 | 118 | print("static void avx512unpackblock0(const __m512i * compressed, uint32_t * pout) {"); 119 | print(" (void) compressed;"); 120 | print(" memset(pout,0,{0});".format(howmany(0))); 121 | print("}"); 122 | print("") 123 | 124 | for bit in range(1,33): 125 | print("") 126 | print("/* we packed {0} {1}-bit values, touching {2} 512-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit))) 127 | print("static void avx512unpackblock{0}(const __m512i * compressed, uint32_t * pout) {{".format(bit)); 128 | print(" /* we are going to access {0} 512-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit)))); 129 | if(howmanywords(bit) == 1): 130 | print(" __m512i w0;") 131 | else: 132 | print(" __m512i w0, w1;") 133 | print(" __m512i * out = (__m512i *) pout;"); 134 | if(bit < 32): print(" const __m512i mask = _mm512_set1_epi32({0});".format((1< oldword): 143 | print(" w{0} = _mm512_loadu_si512 (compressed + {1});".format(secondword%2,secondword)) 144 | oldword = secondword 145 | firstshift = (j*bit) % 32 146 | 
firstshiftstr = "_mm512_srli_epi32( w{0} , "+str(firstshift)+") " 147 | if(firstshift == 0): 148 | firstshiftstr =" w{0} " # no need 149 | wfirst = firstshiftstr.format(firstword%2) 150 | if( firstword == secondword): 151 | if(firstshift + bit <> 32): 152 | wfirst = maskstr.format(wfirst) 153 | print(" _mm512_storeu_si512(out + {0}, {1});".format(j,wfirst)) 154 | else: 155 | secondshift = (32-firstshift) 156 | wsecond = "_mm512_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift) 157 | wfirstorsecond = " _mm512_or_si512 ({0},{1}) ".format(wfirst,wsecond) 158 | wfirstorsecond = maskstr.format(wfirstorsecond) 159 | print(" _mm512_storeu_si512(out + {0},\n {1});".format(j,wfirstorsecond)) 160 | print("}"); 161 | print("") 162 | 163 | 164 | print("static avx512packblockfnc avx512funcPackArr[] = {") 165 | for bit in range(0,32): 166 | print("&avx512packblock{0},".format(bit)) 167 | print("&avx512packblock32") 168 | print("};") 169 | 170 | print("static avx512packblockfnc avx512funcPackMaskArr[] = {") 171 | for bit in range(0,32): 172 | print("&avx512packblockmask{0},".format(bit)) 173 | print("&avx512packblockmask32") 174 | print("};") 175 | 176 | 177 | print("static avx512unpackblockfnc avx512funcUnpackArr[] = {") 178 | for bit in range(0,32): 179 | print("&avx512unpackblock{0},".format(bit)) 180 | print("&avx512unpackblock32") 181 | print("};") 182 | print("/** avx512packing **/") 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The SIMDComp library 2 | ==================== 3 | [![Build Status](https://img.shields.io/appveyor/ci/lemire/simdcomp.svg)](https://ci.appveyor.com/project/lemire/simdcomp) 4 | 5 | 6 | A simple C library for compressing lists of integers using binary packing and SIMD instructions. 
7 | The assumption is either that you have a list of 32-bit integers where most of them are small, or a list of 32-bit integers where differences between successive integers are small. No software is able to reliably compress an array of 32-bit random numbers. 8 | 9 | This library can decode at least 4 billion compressed integers per second on most 10 | desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s. 11 | This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4. 12 | 13 | On a Skylake Intel processor, it can decode integers at a rate of 0.3 cycles per integer, 14 | which can easily translate into more than 8 billion decoded integers per second. 15 | 16 | This library is part of the [Awesome C](https://github.com/kozross/awesome-c) list of C resources. 17 | 18 | Contributors: Daniel Lemire, Nathan Kurz, Christoph Rupp, Anatol Belski, Nick White and others 19 | 20 | What is it for? 21 | ------------- 22 | 23 | This is a low-level library for fast integer compression. By design it does not define a compressed 24 | format. It is up to the (sophisticated) user to create a compressed format. 25 | 26 | It is used by: 27 | - [upscaledb](https://github.com/cruppstahl/upscaledb) 28 | - [EventQL](https://github.com/eventql/eventql) 29 | - [ManticoreSearch](https://manticoresearch.com) 30 | 31 | 32 | 33 | Requirements 34 | ------------- 35 | 36 | - Your processor should support SSE4.1 (It is supported by most Intel and AMD processors released since 2008.) 37 | - It is possible to build the core part of the code if your processor supports SSE2 (Pentium4 or better) 38 | - C99 compliant compiler (GCC is assumed) 39 | - A Linux-like distribution is assumed by the makefile 40 | 41 | For a plain C version that does not use SIMD instructions, see https://github.com/lemire/LittleIntPacker 42 | 43 | Usage 44 | ------- 45 | 46 | Compression works over blocks of 128 integers.
47 | 48 | For a complete working example, see example.c (you can build it and 49 | run it with "make example; ./example"). 50 | 51 | 52 | 53 | 1) Lists of integers in random order. 54 | 55 | ```C 56 | const uint32_t b = maxbits(datain);// computes bit width 57 | simdpackwithoutmask(datain, buffer, b);//compressed to buffer, compressing 128 32-bit integers down to b*16 bytes 58 | simdunpack(buffer, backbuffer, b);//uncompressed to backbuffer 59 | ``` 60 | 61 | While 128 32-bit integers are read, only b 128-bit words are written. Thus, the compression ratio is 32/b. 62 | 63 | 2) Sorted lists of integers. 64 | 65 | We use differential coding: we store the difference between successive integers. For this purpose, we need an initial value (called offset). 66 | 67 | ```C 68 | uint32_t offset = 0; 69 | uint32_t b1 = simdmaxbitsd1(offset,datain); // bit width 70 | simdpackwithoutmaskd1(offset, datain, buffer, b1);//compressing 128 32-bit integers down to b1*16 bytes 71 | simdunpackd1(offset, buffer, backbuffer, b1);//uncompressed 72 | ``` 73 | 74 | General example for arrays of arbitrary length: 75 | ```C 76 | int compress_decompress_demo() { 77 | size_t k, N = 9999; 78 | __m128i * endofbuf; 79 | uint32_t * datain = malloc(N * sizeof(uint32_t)); 80 | uint8_t * buffer; 81 | uint32_t * backbuffer = malloc(N * sizeof(uint32_t)); 82 | uint32_t b; 83 | 84 | for (k = 0; k < N; ++k){ /* start with k=0, not k=1! 
*/ 85 | datain[k] = k; 86 | } 87 | 88 | b = maxbits_length(datain, N); 89 | buffer = malloc(simdpack_compressedbytes(N,b)); // allocate just enough memory 90 | endofbuf = simdpack_length(datain, N, (__m128i *)buffer, b); 91 | /* compressed data is stored between buffer and endofbuf using (endofbuf-buffer)*sizeof(__m128i) bytes */ 92 | /* would be safe to do : buffer = realloc(buffer,(endofbuf-(__m128i *)buffer)*sizeof(__m128i)); */ 93 | simdunpack_length((const __m128i *)buffer, N, backbuffer, b); 94 | 95 | for (k = 0; k < N; ++k){ 96 | if(datain[k] != backbuffer[k]) { 97 | printf("bug\n"); 98 | return -1; 99 | } 100 | } 101 | return 0; 102 | } 103 | ``` 104 | 105 | 106 | 3) Frame-of-Reference 107 | 108 | We also have frame-of-reference (FOR) functions (see simdfor.h header). They work like the bit packing 109 | routines, but do not use differential coding so they allow faster search in some cases, at the expense 110 | of compression. 111 | 112 | Setup 113 | --------- 114 | 115 | 116 | make 117 | make test 118 | 119 | and if you are daring: 120 | 121 | make install 122 | 123 | Go 124 | -------- 125 | 126 | If you are a go user, there is a "go" folder where you will find a simple demo. 
127 | 128 | Other libraries 129 | ---------------- 130 | * Fast integer compression in Go: https://github.com/ronanh/intcomp 131 | * Fast Bitpacking algorithms: Rust port of simdcomp https://github.com/quickwit-oss/bitpacking 132 | * SIMDCompressionAndIntersection: A C++ library to compress and intersect sorted lists of integers using SIMD instructions https://github.com/lemire/SIMDCompressionAndIntersection 133 | * The FastPFOR C++ library : Fast integer compression https://github.com/lemire/FastPFor 134 | * High-performance dictionary coding https://github.com/lemire/dictionary 135 | * LittleIntPacker: C library to pack and unpack short arrays of integers as fast as possible https://github.com/lemire/LittleIntPacker 136 | * StreamVByte: Fast integer compression in C using the StreamVByte codec https://github.com/lemire/streamvbyte 137 | * MaskedVByte: Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte 138 | * CSharpFastPFOR: A C# integer compression library https://github.com/Genbox/CSharpFastPFOR 139 | * JavaFastPFOR: A java integer compression library https://github.com/lemire/JavaFastPFOR 140 | * Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding 141 | * FrameOfReference is a C++ library dedicated to frame-of-reference (FOR) compression: https://github.com/lemire/FrameOfReference 142 | * libvbyte: A fast implementation for varbyte 32bit/64bit integer compression https://github.com/cruppstahl/libvbyte 143 | * TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor 144 | * Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch 145 | 146 | 147 | Other programming languages 148 | ------------- 149 | 150 | - [There is a wrapper for Julia](https://github.com/mcovalt/TinyInt.jl). 151 | - [There is a Rust port](https://github.com/tantivy-search/bitpacking/). 
152 | 153 | References 154 | ------------ 155 | * Daniel Lemire, Nathan Kurz, Christoph Rupp, Stream VByte: Faster Byte-Oriented Integer Compression, Information Processing Letters 130, February 2018, Pages 1-6 https://arxiv.org/abs/1709.08990 156 | * Jianguo Wang, Chunbin Lin, Yannis Papakonstantinou, Steven Swanson, An Experimental Study of Bitmap Compression vs. Inverted List Compression, SIGMOD 2017 http://db.ucsd.edu/wp-content/uploads/2017/03/sidm338-wangA.pdf 157 | * P. Damme, D. Habich, J. Hildebrandt, W. Lehner, Lightweight Data Compression Algorithms: An Experimental Survey (Experiments and Analyses), EDBT 2017 http://openproceedings.org/2017/conf/edbt/paper-146.pdf 158 | * P. Damme, D. Habich, J. Hildebrandt, W. Lehner, Insights into the Comparative Evaluation of Lightweight Data Compression Algorithms, EDBT 2017 http://openproceedings.org/2017/conf/edbt/paper-414.pdf 159 | * Daniel Lemire, Leonid Boytsov, Nathan Kurz, SIMD Compression and the Intersection of Sorted Integers, Software Practice & Experience 46 (6) 2016. http://arxiv.org/abs/1401.6399 160 | * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization, Software Practice & Experience 45 (1), 2015. http://arxiv.org/abs/1209.2137 http://onlinelibrary.wiley.com/doi/10.1002/spe.2203/abstract 161 | * Jeff Plaisance, Nathan Kurz, Daniel Lemire, Vectorized VByte Decoding, International Symposium on Web Algorithms 2015, 2015. http://arxiv.org/abs/1503.07387 162 | * Wayne Xin Zhao, Xudong Zhang, Daniel Lemire, Dongdong Shan, Jian-Yun Nie, Hongfei Yan, Ji-Rong Wen, A General SIMD-based Approach to Accelerating Compression Algorithms, ACM Transactions on Information Systems 33 (3), 2015. http://arxiv.org/abs/1502.01916 163 | * T. D. Wu, Bitpacking techniques for indexing genomes: I. Hash tables, Algorithms for Molecular Biology 11 (5), 2016. 
http://almob.biomedcentral.com/articles/10.1186/s13015-016-0069-5 164 | -------------------------------------------------------------------------------- /src/simdcomputil.c: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 3 | */ 4 | 5 | #include "simdcomputil.h" 6 | #ifdef __SSE4_1__ 7 | #include 8 | #endif 9 | #include 10 | 11 | #define Delta(curr, prev) \ 12 | _mm_sub_epi32( \ 13 | curr, _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))) 14 | 15 | /* returns the integer logarithm of v (bit width) */ 16 | uint32_t bits(const uint32_t v) { 17 | #ifdef _MSC_VER 18 | unsigned long answer; 19 | if (v == 0) { 20 | return 0; 21 | } 22 | _BitScanReverse(&answer, v); 23 | return answer + 1; 24 | #else 25 | return v == 0 ? 0 26 | : 32 - __builtin_clz( 27 | v); /* assume GCC-like compiler if not microsoft */ 28 | #endif 29 | } 30 | 31 | static uint32_t maxbitas32int(const __m128i accumulator) { 32 | const __m128i _tmp1 = _mm_or_si128( 33 | _mm_srli_si128(accumulator, 8), 34 | accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ 35 | const __m128i _tmp2 = 36 | _mm_or_si128(_mm_srli_si128(_tmp1, 4), 37 | _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ 38 | uint32_t ans = _mm_cvtsi128_si32(_tmp2); 39 | return bits(ans); 40 | } 41 | 42 | SIMDCOMP_PURE uint32_t maxbits(const uint32_t *begin) { 43 | const __m128i *pin = (const __m128i *)(begin); 44 | __m128i accumulator = _mm_loadu_si128(pin); 45 | uint32_t k = 1; 46 | for (; 4 * k < SIMDBlockSize; ++k) { 47 | __m128i newvec = _mm_loadu_si128(pin + k); 48 | accumulator = _mm_or_si128(accumulator, newvec); 49 | } 50 | return maxbitas32int(accumulator); 51 | } 52 | static uint32_t orasint(const __m128i accumulator) { 53 | const __m128i _tmp1 = _mm_or_si128( 54 | _mm_srli_si128(accumulator, 8), 55 | accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ 56 | const __m128i _tmp2 = 57 | 
_mm_or_si128(_mm_srli_si128(_tmp1, 4), 58 | _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ 59 | return _mm_cvtsi128_si32(_tmp2); 60 | } 61 | 62 | #ifdef __SSE4_1__ 63 | 64 | static uint32_t minasint(const __m128i accumulator) { 65 | const __m128i _tmp1 = _mm_min_epu32( 66 | _mm_srli_si128(accumulator, 8), 67 | accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ 68 | const __m128i _tmp2 = 69 | _mm_min_epu32(_mm_srli_si128(_tmp1, 4), 70 | _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ 71 | return _mm_cvtsi128_si32(_tmp2); 72 | } 73 | 74 | static uint32_t maxasint(const __m128i accumulator) { 75 | const __m128i _tmp1 = _mm_max_epu32( 76 | _mm_srli_si128(accumulator, 8), 77 | accumulator); /* (A,B,C,D) xor (0,0,A,B) = (A,B,C xor A,D xor B)*/ 78 | const __m128i _tmp2 = 79 | _mm_max_epu32(_mm_srli_si128(_tmp1, 4), 80 | _tmp1); /* (A,B,C xor A,D xor B) xor (0,0,0,C xor A)*/ 81 | return _mm_cvtsi128_si32(_tmp2); 82 | } 83 | 84 | uint32_t simdmin(const uint32_t *in) { 85 | const __m128i *pin = (const __m128i *)(in); 86 | __m128i accumulator = _mm_loadu_si128(pin); 87 | uint32_t k = 1; 88 | for (; 4 * k < SIMDBlockSize; ++k) { 89 | __m128i newvec = _mm_loadu_si128(pin + k); 90 | accumulator = _mm_min_epu32(accumulator, newvec); 91 | } 92 | return minasint(accumulator); 93 | } 94 | 95 | void simdmaxmin(const uint32_t *in, uint32_t *getmin, uint32_t *getmax) { 96 | const __m128i *pin = (const __m128i *)(in); 97 | __m128i minaccumulator = _mm_loadu_si128(pin); 98 | __m128i maxaccumulator = minaccumulator; 99 | uint32_t k = 1; 100 | for (; 4 * k < SIMDBlockSize; ++k) { 101 | __m128i newvec = _mm_loadu_si128(pin + k); 102 | minaccumulator = _mm_min_epu32(minaccumulator, newvec); 103 | maxaccumulator = _mm_max_epu32(maxaccumulator, newvec); 104 | } 105 | *getmin = minasint(minaccumulator); 106 | *getmax = maxasint(maxaccumulator); 107 | } 108 | 109 | uint32_t simdmin_length(const uint32_t *in, uint32_t length) { 110 | uint32_t currentmin = 
0xFFFFFFFF; 111 | uint32_t lengthdividedby4 = length / 4; 112 | uint32_t offset = lengthdividedby4 * 4; 113 | uint32_t k; 114 | if (lengthdividedby4 > 0) { 115 | const __m128i *pin = (const __m128i *)(in); 116 | __m128i accumulator = _mm_loadu_si128(pin); 117 | k = 1; 118 | for (; 4 * k < lengthdividedby4 * 4; ++k) { 119 | __m128i newvec = _mm_loadu_si128(pin + k); 120 | accumulator = _mm_min_epu32(accumulator, newvec); 121 | } 122 | currentmin = minasint(accumulator); 123 | } 124 | for (k = offset; k < length; ++k) 125 | if (in[k] < currentmin) 126 | currentmin = in[k]; 127 | return currentmin; 128 | } 129 | 130 | void simdmaxmin_length(const uint32_t *in, uint32_t length, uint32_t *getmin, 131 | uint32_t *getmax) { 132 | uint32_t lengthdividedby4 = length / 4; 133 | uint32_t offset = lengthdividedby4 * 4; 134 | uint32_t k; 135 | *getmin = 0xFFFFFFFF; 136 | *getmax = 0; 137 | if (lengthdividedby4 > 0) { 138 | const __m128i *pin = (const __m128i *)(in); 139 | __m128i minaccumulator = _mm_loadu_si128(pin); 140 | __m128i maxaccumulator = minaccumulator; 141 | k = 1; 142 | for (; 4 * k < lengthdividedby4 * 4; ++k) { 143 | __m128i newvec = _mm_loadu_si128(pin + k); 144 | minaccumulator = _mm_min_epu32(minaccumulator, newvec); 145 | maxaccumulator = _mm_max_epu32(maxaccumulator, newvec); 146 | } 147 | *getmin = minasint(minaccumulator); 148 | *getmax = maxasint(maxaccumulator); 149 | } 150 | for (k = offset; k < length; ++k) { 151 | if (in[k] < *getmin) 152 | *getmin = in[k]; 153 | if (in[k] > *getmax) 154 | *getmax = in[k]; 155 | } 156 | } 157 | 158 | #endif 159 | 160 | SIMDCOMP_PURE uint32_t maxbits_length(const uint32_t *in, uint32_t length) { 161 | uint32_t k; 162 | uint32_t lengthdividedby4 = length / 4; 163 | uint32_t offset = lengthdividedby4 * 4; 164 | uint32_t bigxor = 0; 165 | if (lengthdividedby4 > 0) { 166 | const __m128i *pin = (const __m128i *)(in); 167 | __m128i accumulator = _mm_loadu_si128(pin); 168 | k = 1; 169 | for (; 4 * k < 4 * lengthdividedby4; 
++k) { 170 | __m128i newvec = _mm_loadu_si128(pin + k); 171 | accumulator = _mm_or_si128(accumulator, newvec); 172 | } 173 | bigxor = orasint(accumulator); 174 | } 175 | for (k = offset; k < length; ++k) 176 | bigxor |= in[k]; 177 | return bits(bigxor); 178 | } 179 | 180 | /* maxbit over 128 integers (SIMDBlockSize) with provided initial value */ 181 | uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t *in) { 182 | __m128i initoffset = _mm_set1_epi32(initvalue); 183 | const __m128i *pin = (const __m128i *)(in); 184 | __m128i newvec = _mm_loadu_si128(pin); 185 | __m128i accumulator = Delta(newvec, initoffset); 186 | __m128i oldvec = newvec; 187 | uint32_t k = 1; 188 | for (; 4 * k < SIMDBlockSize; ++k) { 189 | newvec = _mm_loadu_si128(pin + k); 190 | accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec)); 191 | oldvec = newvec; 192 | } 193 | initoffset = oldvec; 194 | return maxbitas32int(accumulator); 195 | } 196 | 197 | /* maxbit over |length| integers with provided initial value */ 198 | uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t *in, 199 | uint32_t length) { 200 | __m128i newvec; 201 | __m128i oldvec; 202 | __m128i initoffset; 203 | __m128i accumulator; 204 | const __m128i *pin; 205 | uint32_t tmparray[4]; 206 | uint32_t k = 1; 207 | uint32_t acc; 208 | 209 | assert(length > 0); 210 | 211 | pin = (const __m128i *)(in); 212 | initoffset = _mm_set1_epi32(initvalue); 213 | switch (length) { 214 | case 1: 215 | newvec = _mm_set1_epi32(in[0]); 216 | break; 217 | case 2: 218 | newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]); 219 | break; 220 | case 3: 221 | newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]); 222 | break; 223 | default: 224 | newvec = _mm_loadu_si128(pin); 225 | break; 226 | } 227 | accumulator = Delta(newvec, initoffset); 228 | oldvec = newvec; 229 | 230 | /* process 4 integers and build an accumulator */ 231 | while (k * 4 + 4 <= length) { 232 | newvec = _mm_loadu_si128(pin + k); 233 | accumulator = 
_mm_or_si128(accumulator, Delta(newvec, oldvec)); 234 | oldvec = newvec; 235 | k++; 236 | } 237 | 238 | /* extract the accumulator as an integer */ 239 | _mm_storeu_si128((__m128i *)(tmparray), accumulator); 240 | acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]; 241 | 242 | /* now process the remaining integers */ 243 | for (k *= 4; k < length; k++) 244 | acc |= in[k] - (k == 0 ? initvalue : in[k - 1]); 245 | 246 | /* return the number of bits */ 247 | return bits(acc); 248 | } 249 | -------------------------------------------------------------------------------- /benchmarks/bitpackingbenchmark.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "simdcomp.h" 5 | 6 | 7 | #define RDTSC_START(cycles) \ 8 | do { \ 9 | register unsigned cyc_high, cyc_low; \ 10 | __asm volatile( \ 11 | "cpuid\n\t" \ 12 | "rdtsc\n\t" \ 13 | "mov %%edx, %0\n\t" \ 14 | "mov %%eax, %1\n\t" \ 15 | : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ 16 | (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ 17 | } while (0) 18 | 19 | #define RDTSC_FINAL(cycles) \ 20 | do { \ 21 | register unsigned cyc_high, cyc_low; \ 22 | __asm volatile( \ 23 | "rdtscp\n\t" \ 24 | "mov %%edx, %0\n\t" \ 25 | "mov %%eax, %1\n\t" \ 26 | "cpuid\n\t" \ 27 | : "=r"(cyc_high), "=r"(cyc_low)::"%rax", "%rbx", "%rcx", "%rdx"); \ 28 | (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ 29 | } while (0) 30 | 31 | 32 | 33 | 34 | uint32_t * get_random_array_from_bit_width(uint32_t length, uint32_t bit) { 35 | uint32_t * answer = malloc(sizeof(uint32_t) * length); 36 | uint32_t mask = (uint32_t) ((UINT64_C(1) << bit) - 1); 37 | uint32_t i; 38 | for(i = 0; i < length; ++i) { 39 | answer[i] = rand() & mask; 40 | } 41 | return answer; 42 | } 43 | 44 | uint32_t * get_random_array_from_bit_width_d1(uint32_t length, uint32_t bit) { 45 | uint32_t * answer = malloc(sizeof(uint32_t) * length); 46 | uint32_t mask = (uint32_t) 
((UINT64_C(1) << bit) - 1); 47 | uint32_t i; 48 | answer[0] = rand() & mask; 49 | for(i = 1; i < length; ++i) { 50 | answer[i] = answer[i-1] + (rand() & mask); 51 | } 52 | return answer; 53 | } 54 | 55 | 56 | void demo128() { 57 | const uint32_t length = 128; 58 | uint32_t bit; 59 | printf("# --- %s\n", __func__); 60 | printf("# compressing %d integers\n",length); 61 | printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); 62 | for(bit = 1; bit <= 32; ++bit) { 63 | uint32_t i; 64 | 65 | uint32_t * data = get_random_array_from_bit_width(length, bit); 66 | __m128i * buffer = malloc(length * sizeof(uint32_t)); 67 | uint32_t * backdata = malloc(length * sizeof(uint32_t)); 68 | uint32_t repeat = 500; 69 | uint64_t min_diff; 70 | printf("%d\t",bit); 71 | min_diff = (uint64_t)-1; 72 | for (i = 0; i < repeat; i++) { 73 | uint64_t cycles_start, cycles_final, cycles_diff; 74 | __asm volatile("" ::: /* pretend to clobber */ "memory"); 75 | RDTSC_START(cycles_start); 76 | simdpackwithoutmask(data,buffer, bit); 77 | RDTSC_FINAL(cycles_final); 78 | cycles_diff = (cycles_final - cycles_start); 79 | if (cycles_diff < min_diff) min_diff = cycles_diff; 80 | } 81 | printf("%.2f\t",min_diff*1.0/length); 82 | min_diff = (uint64_t)-1; 83 | for (i = 0; i < repeat; i++) { 84 | uint64_t cycles_start, cycles_final, cycles_diff; 85 | __asm volatile("" ::: /* pretend to clobber */ "memory"); 86 | RDTSC_START(cycles_start); 87 | simdunpack(buffer, backdata,bit); 88 | RDTSC_FINAL(cycles_final); 89 | cycles_diff = (cycles_final - cycles_start); 90 | if (cycles_diff < min_diff) min_diff = cycles_diff; 91 | } 92 | printf("%.2f\t",min_diff*1.0/length); 93 | 94 | free(data); 95 | free(buffer); 96 | free(backdata); 97 | printf("\n"); 98 | } 99 | printf("\n\n"); /* two blank lines are required by gnuplot */ 100 | } 101 | 102 | void demo128_d1() { 103 | const uint32_t length = 128; 104 | uint32_t bit; 105 | printf("# --- %s\n", __func__); 106 | printf("# compressing %d 
integers\n",length); 107 | printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); 108 | for(bit = 1; bit <= 32; ++bit) { 109 | uint32_t i; 110 | 111 | uint32_t * data = get_random_array_from_bit_width_d1(length, bit); 112 | __m128i * buffer = malloc(length * sizeof(uint32_t)); 113 | uint32_t * backdata = malloc(length * sizeof(uint32_t)); 114 | uint32_t repeat = 500; 115 | uint64_t min_diff; 116 | printf("%d\t",bit); 117 | min_diff = (uint64_t)-1; 118 | for (i = 0; i < repeat; i++) { 119 | uint64_t cycles_start, cycles_final, cycles_diff; 120 | __asm volatile("" ::: /* pretend to clobber */ "memory"); 121 | RDTSC_START(cycles_start); 122 | simdpackwithoutmaskd1(0,data,buffer, bit); 123 | RDTSC_FINAL(cycles_final); 124 | cycles_diff = (cycles_final - cycles_start); 125 | if (cycles_diff < min_diff) min_diff = cycles_diff; 126 | } 127 | printf("%.2f\t",min_diff*1.0/length); 128 | min_diff = (uint64_t)-1; 129 | for (i = 0; i < repeat; i++) { 130 | uint64_t cycles_start, cycles_final, cycles_diff; 131 | __asm volatile("" ::: /* pretend to clobber */ "memory"); 132 | RDTSC_START(cycles_start); 133 | simdunpackd1(0,buffer, backdata,bit); 134 | RDTSC_FINAL(cycles_final); 135 | cycles_diff = (cycles_final - cycles_start); 136 | if (cycles_diff < min_diff) min_diff = cycles_diff; 137 | } 138 | printf("%.2f\t",min_diff*1.0/length); 139 | 140 | free(data); 141 | free(buffer); 142 | free(backdata); 143 | printf("\n"); 144 | } 145 | printf("\n\n"); /* two blank lines are required by gnuplot */ 146 | } 147 | 148 | #ifdef __AVX2__ 149 | void demo256() { 150 | const uint32_t length = 256; 151 | uint32_t bit; 152 | printf("# --- %s\n", __func__); 153 | printf("# compressing %d integers\n",length); 154 | printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); 155 | for(bit = 1; bit <= 32; ++bit) { 156 | uint32_t i; 157 | 158 | uint32_t * data = get_random_array_from_bit_width(length, bit); 159 | __m256i * buffer = malloc(length 
* sizeof(uint32_t)); 160 | uint32_t * backdata = malloc(length * sizeof(uint32_t)); 161 | uint32_t repeat = 500; 162 | uint64_t min_diff; 163 | printf("%d\t",bit); 164 | min_diff = (uint64_t)-1; 165 | for (i = 0; i < repeat; i++) { 166 | uint64_t cycles_start, cycles_final, cycles_diff; 167 | __asm volatile("" ::: /* pretend to clobber */ "memory"); 168 | RDTSC_START(cycles_start); 169 | avxpackwithoutmask(data,buffer, bit); 170 | RDTSC_FINAL(cycles_final); 171 | cycles_diff = (cycles_final - cycles_start); 172 | if (cycles_diff < min_diff) min_diff = cycles_diff; 173 | } 174 | printf("%.2f\t",min_diff*1.0/length); 175 | min_diff = (uint64_t)-1; 176 | for (i = 0; i < repeat; i++) { 177 | uint64_t cycles_start, cycles_final, cycles_diff; 178 | __asm volatile("" ::: /* pretend to clobber */ "memory"); 179 | RDTSC_START(cycles_start); 180 | avxunpack(buffer, backdata,bit); 181 | RDTSC_FINAL(cycles_final); 182 | cycles_diff = (cycles_final - cycles_start); 183 | if (cycles_diff < min_diff) min_diff = cycles_diff; 184 | } 185 | printf("%.2f\t",min_diff*1.0/length); 186 | 187 | free(data); 188 | free(buffer); 189 | free(backdata); 190 | printf("\n"); 191 | } 192 | printf("\n\n"); /* two blank lines are required by gnuplot */ 193 | } 194 | #endif /* avx 2 */ 195 | 196 | #ifdef __AVX512F__ 197 | void demo512() { 198 | const uint32_t length = 512; 199 | uint32_t bit; 200 | size_t z; 201 | printf("# --- %s\n", __func__); 202 | printf("# compressing %d integers\n",length); 203 | printf("# format: bit width, pack in cycles per int, unpack in cycles per int\n"); 204 | for(bit = 1; bit <= 32; ++bit) { 205 | uint32_t i; 206 | 207 | uint32_t * data = get_random_array_from_bit_width(length, bit); 208 | __m512i * buffer = malloc(length * sizeof(uint32_t)); 209 | uint32_t * backdata = malloc(length * sizeof(uint32_t)); 210 | uint32_t repeat = 500; 211 | uint64_t min_diff; 212 | printf("%d\t",bit); 213 | min_diff = (uint64_t)-1; 214 | for (i = 0; i < repeat; i++) { 215 | uint64_t 
cycles_start, cycles_final, cycles_diff; 216 | __asm volatile("" ::: /* pretend to clobber */ "memory"); 217 | RDTSC_START(cycles_start); 218 | avx512packwithoutmask(data,buffer, bit); 219 | RDTSC_FINAL(cycles_final); 220 | cycles_diff = (cycles_final - cycles_start); 221 | if (cycles_diff < min_diff) min_diff = cycles_diff; 222 | } 223 | printf("%.2f\t",min_diff*1.0/length); 224 | min_diff = (uint64_t)-1; 225 | for (i = 0; i < repeat; i++) { 226 | uint64_t cycles_start, cycles_final, cycles_diff; 227 | __asm volatile("" ::: /* pretend to clobber */ "memory"); 228 | RDTSC_START(cycles_start); 229 | avx512unpack(buffer, backdata,bit); 230 | RDTSC_FINAL(cycles_final); 231 | cycles_diff = (cycles_final - cycles_start); 232 | if (cycles_diff < min_diff) min_diff = cycles_diff; 233 | } 234 | printf("%.2f\t",min_diff*1.0/length); 235 | for(z = 0 ; z < length ; ++z) assert(backdata[z] == data[z]); 236 | free(data); 237 | free(buffer); 238 | free(backdata); 239 | printf("\n"); 240 | } 241 | printf("\n\n"); /* two blank lines are required by gnuplot */ 242 | } 243 | #endif /* avx 2 */ 244 | 245 | 246 | 247 | int main() { 248 | demo128(); 249 | demo128_d1(); 250 | #ifdef __AVX2__ 251 | demo256(); 252 | #endif 253 | #ifdef __AVX512F__ 254 | demo512(); 255 | #endif 256 | return 0; 257 | 258 | 259 | } 260 | -------------------------------------------------------------------------------- /tests/unit.c: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under a BSD License. 
3 | */ 4 | #include "simdcomp.h" 5 | #include 6 | #include 7 | #include 8 | 9 | int issue21() { 10 | uint32_t bw, sz; 11 | printf("issue21"); 12 | fflush(stdout); 13 | for (bw = 0; bw < 30; bw++) { 14 | printf("."); 15 | fflush(stdout); 16 | for (sz = 1; sz < 4096; sz++) { 17 | 18 | size_t i; 19 | uint32_t *in = malloc(sz * sizeof(uint32_t)); 20 | uint32_t *out = malloc(sz * sizeof(uint32_t)); 21 | for (i = 0; i < sz; ++i) 22 | in[i] = (1 << bw) - 1; 23 | uint32_t b = maxbits_length(in, sz); 24 | uint8_t *buf = malloc(simdpack_compressedbytes(sz, b)); 25 | __m128i *end = simdpack_length(in, sz, (__m128i *)buf, b); 26 | if ((uint8_t *)end - buf != simdpack_compressedbytes(sz, b)) { 27 | printf("bad mem usage\n"); 28 | return -1; 29 | } 30 | simdunpack_length((const __m128i *)buf, sz, out, b); 31 | for (i = 0; i < sz; ++i) { 32 | if (in[i] != out[i]) { 33 | printf("bug\n"); 34 | return -1; 35 | } 36 | } 37 | free(in); 38 | free(out); 39 | free(buf); 40 | } 41 | } 42 | printf("\n"); 43 | return 0; 44 | } 45 | 46 | int issue21FOR() { 47 | uint32_t bw, sz; 48 | size_t i, j; 49 | printf("issue21for"); 50 | fflush(stdout); 51 | for (bw = 0; bw < 30; bw++) { 52 | printf("."); 53 | fflush(stdout); 54 | for (sz = 1; sz < 4096; sz++) { 55 | 56 | uint32_t *in = malloc(sz * sizeof(uint32_t)); 57 | uint32_t *out = malloc(sz * sizeof(uint32_t)); 58 | in[0] = 0; 59 | for (i = 1; i < sz; ++i) 60 | in[i] = (1 << bw) - 1; 61 | uint32_t b = maxbits_length(in, sz); 62 | uint8_t *buf = malloc(simdpackFOR_compressedbytes(sz, b)); 63 | __m128i *end = simdpackFOR_length(0, in, sz, (__m128i *)buf, b); 64 | if ((uint8_t *)end - buf != simdpackFOR_compressedbytes(sz, b)) { 65 | printf("bad mem usage\n"); 66 | return -1; 67 | } 68 | simdunpackFOR_length(0, (const __m128i *)buf, sz, out, b); 69 | for (i = 0; i < sz; ++i) { 70 | if (in[i] != out[i]) { 71 | for (j = 0; j < sz; ++j) { 72 | printf("%zu : %u %u \n", j, in[j], out[j]); 73 | } 74 | printf("bug\n"); 75 | return -1; 76 | } 77 | } 78 | 
free(in); 79 | free(out); 80 | free(buf); 81 | } 82 | } 83 | printf("\n"); 84 | return 0; 85 | } 86 | 87 | int testshortpack() { 88 | int bit; 89 | size_t i; 90 | size_t length; 91 | __m128i *bb; 92 | srand(0); 93 | printf("[%s]\n", __func__); 94 | for (bit = 0; bit < 32; ++bit) { 95 | printf(" %d ", bit); 96 | fflush(stdout); 97 | const size_t N = 128; 98 | uint32_t *data = malloc(N * sizeof(uint32_t)); 99 | uint32_t *backdata = malloc(N * sizeof(uint32_t)); 100 | uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); 101 | 102 | for (i = 0; i < N; ++i) { 103 | data[i] = rand() & ((1U << bit) - 1); 104 | } 105 | for (length = 0; length <= N; ++length) { 106 | for (i = 0; i < N; ++i) { 107 | backdata[i] = 0; 108 | } 109 | bb = simdpack_shortlength(data, length, (__m128i *)buffer, bit); 110 | if ((bb - (__m128i *)buffer) * sizeof(__m128i) != 111 | (unsigned)simdpack_compressedbytes(length, bit)) { 112 | printf("bug\n"); 113 | return -1; 114 | } 115 | simdunpack_shortlength((__m128i *)buffer, length, backdata, bit); 116 | for (i = 0; i < length; ++i) { 117 | 118 | if (data[i] != backdata[i]) { 119 | printf("bug\n"); 120 | return -1; 121 | } 122 | } 123 | } 124 | free(data); 125 | free(backdata); 126 | free(buffer); 127 | } 128 | return 0; 129 | } 130 | 131 | int testlongpack() { 132 | int bit; 133 | size_t i; 134 | size_t length; 135 | __m128i *bb; 136 | srand(0); 137 | printf("[%s]\n", __func__); 138 | for (bit = 0; bit < 32; ++bit) { 139 | const size_t N = 2048; 140 | uint32_t *data = malloc(N * sizeof(uint32_t)); 141 | uint32_t *backdata = malloc(N * sizeof(uint32_t)); 142 | uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); 143 | 144 | for (i = 0; i < N; ++i) { 145 | data[i] = rand() & ((1U << bit) - 1); 146 | } 147 | for (length = 0; length <= N; ++length) { 148 | for (i = 0; i < N; ++i) { 149 | backdata[i] = 0; 150 | } 151 | bb = simdpack_length(data, length, (__m128i *)buffer, bit); 152 | if ((bb - (__m128i *)buffer) * sizeof(__m128i) != 153 
| (unsigned)simdpack_compressedbytes(length, bit)) { 154 | printf("bug\n"); 155 | return -1; 156 | } 157 | simdunpack_length((__m128i *)buffer, length, backdata, bit); 158 | for (i = 0; i < length; ++i) { 159 | 160 | if (data[i] != backdata[i]) { 161 | printf("bug\n"); 162 | return -1; 163 | } 164 | } 165 | } 166 | free(data); 167 | free(backdata); 168 | free(buffer); 169 | } 170 | return 0; 171 | } 172 | 173 | int testset() { 174 | int bit; 175 | size_t i; 176 | const size_t N = 128; 177 | uint32_t *data = malloc(N * sizeof(uint32_t)); 178 | uint32_t *backdata = malloc(N * sizeof(uint32_t)); 179 | uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); 180 | 181 | srand(0); 182 | printf("[%s]\n", __func__); 183 | for (bit = 0; bit < 32; ++bit) { 184 | printf("simple set %d \n", bit); 185 | 186 | for (i = 0; i < N; ++i) { 187 | data[i] = rand() & ((1U << bit) - 1); 188 | } 189 | for (i = 0; i < N; ++i) { 190 | backdata[i] = 0; 191 | } 192 | simdpack(data, (__m128i *)buffer, bit); 193 | simdunpack((__m128i *)buffer, backdata, bit); 194 | for (i = 0; i < N; ++i) { 195 | if (data[i] != backdata[i]) { 196 | printf("bug\n"); 197 | return -1; 198 | } 199 | } 200 | 201 | for (i = N; i > 0; i--) { 202 | simdfastset((__m128i *)buffer, bit, data[N - i], i - 1); 203 | } 204 | simdunpack((__m128i *)buffer, backdata, bit); 205 | for (i = 0; i < N; ++i) { 206 | if (data[i] != backdata[N - i - 1]) { 207 | printf("bug\n"); 208 | return -1; 209 | } 210 | } 211 | simdpack(data, (__m128i *)buffer, bit); 212 | for (i = 1; i <= N; i++) { 213 | simdfastset((__m128i *)buffer, bit, data[i - 1], i - 1); 214 | } 215 | simdunpack((__m128i *)buffer, backdata, bit); 216 | for (i = 0; i < N; ++i) { 217 | if (data[i] != backdata[i]) { 218 | printf("bug\n"); 219 | return -1; 220 | } 221 | } 222 | } 223 | free(data); 224 | free(backdata); 225 | free(buffer); 226 | 227 | return 0; 228 | } 229 | 230 | #ifdef __SSE4_1__ 231 | 232 | int testsetd1() { 233 | int bit; 234 | size_t i; 235 | uint32_t 
newvalue; 236 | const size_t N = 128; 237 | uint32_t *data = malloc(N * sizeof(uint32_t)); 238 | uint32_t *datazeroes = malloc(N * sizeof(uint32_t)); 239 | 240 | uint32_t *backdata = malloc(N * sizeof(uint32_t)); 241 | uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); 242 | 243 | srand(0); 244 | printf("[%s]\n", __func__); 245 | for (bit = 0; bit < 32; ++bit) { 246 | printf("simple set d1 %d \n", bit); 247 | data[0] = rand() & ((1U << bit) - 1); 248 | datazeroes[0] = 0; 249 | 250 | for (i = 1; i < N; ++i) { 251 | data[i] = data[i - 1] + (rand() & ((1U << bit) - 1)); 252 | datazeroes[i] = 0; 253 | } 254 | for (i = 0; i < N; ++i) { 255 | backdata[i] = 0; 256 | } 257 | simdpackd1(0, datazeroes, (__m128i *)buffer, bit); 258 | for (i = 1; i <= N; i++) { 259 | simdfastsetd1(0, (__m128i *)buffer, bit, data[i - 1], i - 1); 260 | newvalue = simdselectd1(0, (const __m128i *)buffer, bit, i - 1); 261 | if (newvalue != data[i - 1]) { 262 | printf("bad set-select\n"); 263 | return -1; 264 | } 265 | } 266 | simdunpackd1(0, (__m128i *)buffer, backdata, bit); 267 | for (i = 0; i < N; ++i) { 268 | if (data[i] != backdata[i]) 269 | return -1; 270 | } 271 | } 272 | free(data); 273 | free(backdata); 274 | free(buffer); 275 | free(datazeroes); 276 | return 0; 277 | } 278 | #endif 279 | 280 | int testsetFOR() { 281 | int bit; 282 | size_t i; 283 | uint32_t newvalue; 284 | const size_t N = 128; 285 | uint32_t *data = malloc(N * sizeof(uint32_t)); 286 | uint32_t *datazeroes = malloc(N * sizeof(uint32_t)); 287 | 288 | uint32_t *backdata = malloc(N * sizeof(uint32_t)); 289 | uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); 290 | 291 | srand(0); 292 | printf("[%s]\n", __func__); 293 | for (bit = 0; bit < 32; ++bit) { 294 | printf("simple set FOR %d \n", bit); 295 | for (i = 0; i < N; ++i) { 296 | data[i] = (rand() & ((1U << bit) - 1)); 297 | datazeroes[i] = 0; 298 | } 299 | for (i = 0; i < N; ++i) { 300 | backdata[i] = 0; 301 | } 302 | simdpackFOR(0, datazeroes, 
(__m128i *)buffer, bit); 303 | for (i = 1; i <= N; i++) { 304 | simdfastsetFOR(0, (__m128i *)buffer, bit, data[i - 1], i - 1); 305 | newvalue = simdselectFOR(0, (const __m128i *)buffer, bit, i - 1); 306 | if (newvalue != data[i - 1]) { 307 | printf("bad set-select\n"); 308 | return -1; 309 | } 310 | } 311 | simdunpackFOR(0, (__m128i *)buffer, backdata, bit); 312 | for (i = 0; i < N; ++i) { 313 | if (data[i] != backdata[i]) 314 | return -1; 315 | } 316 | } 317 | free(data); 318 | free(backdata); 319 | free(buffer); 320 | free(datazeroes); 321 | return 0; 322 | } 323 | 324 | int testshortFORpack() { 325 | int bit; 326 | size_t i; 327 | __m128i *rb; 328 | size_t length; 329 | uint32_t offset = 7; 330 | srand(0); 331 | printf("[%s]\n", __func__); 332 | for (bit = 0; bit < 32; ++bit) { 333 | printf(" %d ", bit); 334 | fflush(stdout); 335 | const size_t N = 128; 336 | uint32_t *data = malloc(N * sizeof(uint32_t)); 337 | uint32_t *backdata = malloc(N * sizeof(uint32_t)); 338 | uint32_t *buffer = malloc((2 * N + 1024) * sizeof(uint32_t)); 339 | 340 | for (i = 0; i < N; ++i) { 341 | data[i] = (rand() & ((1U << bit) - 1)) + offset; 342 | } 343 | for (length = 0; length <= N; ++length) { 344 | for (i = 0; i < N; ++i) { 345 | backdata[i] = 0; 346 | } 347 | rb = simdpackFOR_length(offset, data, length, (__m128i *)buffer, bit); 348 | if (((rb - (__m128i *)buffer) * sizeof(__m128i)) != 349 | (unsigned)simdpackFOR_compressedbytes(length, bit)) { 350 | return -1; 351 | } 352 | simdunpackFOR_length(offset, (__m128i *)buffer, length, backdata, bit); 353 | for (i = 0; i < length; ++i) { 354 | 355 | if (data[i] != backdata[i]) 356 | return -1; 357 | } 358 | } 359 | free(data); 360 | free(backdata); 361 | free(buffer); 362 | } 363 | return 0; 364 | } 365 | 366 | #ifdef __AVX2__ 367 | 368 | int testbabyavx() { 369 | int bit; 370 | int trial; 371 | unsigned int i, j; 372 | const size_t N = AVXBlockSize; 373 | srand(0); 374 | printf("[%s]\n", __func__); 375 | printf("bit = "); 376 | for 
(bit = 0; bit < 32; ++bit) { 377 | printf(" %d ", bit); 378 | fflush(stdout); 379 | for (trial = 0; trial < 100; ++trial) { 380 | uint32_t *data = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t)); 381 | uint32_t *backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t)); 382 | __m256i *buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32); 383 | 384 | for (i = 0; i < N; ++i) { 385 | data[i] = rand() & ((uint32_t)(1U << bit) - 1); 386 | } 387 | for (i = 0; i < N; ++i) { 388 | backdata[i] = 0; 389 | } 390 | if (avxmaxbits(data) != maxbits_length(data, N)) { 391 | printf("avxmaxbits is buggy\n"); 392 | return -1; 393 | } 394 | 395 | avxpackwithoutmask(data, buffer, bit); 396 | avxunpack(buffer, backdata, bit); 397 | for (i = 0; i < AVXBlockSize; ++i) { 398 | if (data[i] != backdata[i]) { 399 | printf("bug\n"); 400 | for (j = 0; j < N; ++j) { 401 | if (data[j] != backdata[j]) { 402 | printf("data[%d]=%d v.s. backdata[%d]=%d\n", j, data[j], j, 403 | backdata[j]); 404 | } else { 405 | printf("data[%d]=%d\n", j, data[j]); 406 | } 407 | } 408 | return -1; 409 | } 410 | } 411 | free(data); 412 | free(backdata); 413 | free(buffer); 414 | } 415 | } 416 | printf("\n"); 417 | return 0; 418 | } 419 | 420 | int testavx2() { 421 | int N = 5000 * AVXBlockSize, gap; 422 | __m256i *buffer = malloc(AVXBlockSize * sizeof(uint32_t)); 423 | uint32_t *datain = malloc(N * sizeof(uint32_t)); 424 | uint32_t *backbuffer = malloc(AVXBlockSize * sizeof(uint32_t)); 425 | printf("[%s]\n", __func__); 426 | for (gap = 1; gap <= 387420489; gap *= 3) { 427 | int k; 428 | printf(" gap = %u \n", gap); 429 | for (k = 0; k < N; ++k) 430 | datain[k] = (uint32_t)(((uint64_t)k * gap) & 0xFFFFFFFF); 431 | for (k = 0; k * AVXBlockSize < N; ++k) { 432 | /* 433 | First part works for general arrays (sorted or unsorted) 434 | */ 435 | int j; 436 | /* we compute the bit width */ 437 | const uint32_t b = avxmaxbits(datain + k * AVXBlockSize); 438 | if (avxmaxbits(datain + k * AVXBlockSize) != 439 | 
maxbits_length(datain + k * AVXBlockSize, AVXBlockSize)) { 440 | printf("avxmaxbits is buggy %d %d \n", 441 | avxmaxbits(datain + k * AVXBlockSize), 442 | maxbits_length(datain + k * AVXBlockSize, AVXBlockSize)); 443 | return -1; 444 | } 445 | 446 | /* we read 256 integers at "datain + k * AVXBlockSize" and 447 | write b 256-bit vectors at "buffer" */ 448 | avxpackwithoutmask(datain + k * AVXBlockSize, buffer, b); 449 | /* we read back b1 128-bit vectors at "buffer" and write 128 integers at 450 | * backbuffer */ 451 | avxunpack(buffer, backbuffer, b); /* uncompressed */ 452 | for (j = 0; j < AVXBlockSize; ++j) { 453 | if (backbuffer[j] != datain[k * AVXBlockSize + j]) { 454 | int i; 455 | printf("bug in avxpack\n"); 456 | for (i = 0; i < AVXBlockSize; ++i) { 457 | printf("data[%d]=%d got back %d %s\n", i, 458 | datain[k * AVXBlockSize + i], backbuffer[i], 459 | datain[k * AVXBlockSize + i] != backbuffer[i] ? "bug" : ""); 460 | } 461 | return -2; 462 | } 463 | } 464 | } 465 | } 466 | free(buffer); 467 | free(datain); 468 | free(backbuffer); 469 | printf("Code looks good.\n"); 470 | return 0; 471 | } 472 | #endif /* avx2 */ 473 | 474 | #ifdef __AVX512F__ 475 | 476 | int testbabyavx512() { 477 | int bit; 478 | int trial; 479 | unsigned int i, j; 480 | const size_t N = AVX512BlockSize; 481 | srand(0); 482 | printf("[%s]\n", __func__); 483 | printf("bit = "); 484 | for (bit = 0; bit < 32; ++bit) { 485 | printf(" %d ", bit); 486 | fflush(stdout); 487 | for (trial = 0; trial < 100; ++trial) { 488 | uint32_t *data = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t)); 489 | uint32_t *backdata = malloc(N * sizeof(uint32_t) + 64 * sizeof(uint32_t)); 490 | __m512i *buffer = malloc((2 * N + 1024) * sizeof(uint32_t) + 32); 491 | 492 | for (i = 0; i < N; ++i) { 493 | data[i] = rand() & ((uint32_t)(1U << bit) - 1); 494 | } 495 | for (i = 0; i < N; ++i) { 496 | backdata[i] = 0; 497 | } 498 | if (avx512maxbits(data) != maxbits_length(data, N)) { 499 | printf("avx512maxbits is 
buggy\n"); 500 | return -1; 501 | } 502 | 503 | avx512packwithoutmask(data, buffer, bit); 504 | avx512unpack(buffer, backdata, bit); 505 | for (i = 0; i < AVX512BlockSize; ++i) { 506 | if (data[i] != backdata[i]) { 507 | printf("bug\n"); 508 | for (j = 0; j < N; ++j) { 509 | if (data[j] != backdata[j]) { 510 | printf("data[%d]=%d v.s. backdata[%d]=%d\n", j, data[j], j, 511 | backdata[j]); 512 | } else { 513 | printf("data[%d]=%d\n", j, data[j]); 514 | } 515 | } 516 | return -1; 517 | } 518 | } 519 | free(data); 520 | free(backdata); 521 | free(buffer); 522 | } 523 | } 524 | printf("\n"); 525 | return 0; 526 | } 527 | 528 | int testavx512_2() { 529 | int N = 5000 * AVX512BlockSize, gap; 530 | __m512i *buffer = malloc(AVX512BlockSize * sizeof(uint32_t)); 531 | uint32_t *datain = malloc(N * sizeof(uint32_t)); 532 | uint32_t *backbuffer = malloc(AVX512BlockSize * sizeof(uint32_t)); 533 | printf("[%s]\n", __func__); 534 | for (gap = 1; gap <= 387420489; gap *= 3) { 535 | int k; 536 | printf(" gap = %u \n", gap); 537 | for (k = 0; k < N; ++k) { 538 | datain[k] = k * gap; 539 | } 540 | for (k = 0; k * AVX512BlockSize < N; ++k) { 541 | /* 542 | * First part works for general arrays (sorted or unsorted) 543 | * */ 544 | int j; 545 | /* we compute the bit width */ 546 | const uint32_t b = avx512maxbits(datain + k * AVX512BlockSize); 547 | if (b != maxbits_length(datain + k * AVX512BlockSize, AVX512BlockSize)) { 548 | printf("avx512maxbits is buggy %d %d \n", 549 | avx512maxbits(datain + k * AVX512BlockSize), 550 | maxbits_length(datain + k * AVX512BlockSize, AVX512BlockSize)); 551 | return -1; 552 | } 553 | 554 | /* we read 512 integers at "datain + k * AVX512BlockSize" and 555 | * write b 512-bit vectors at "buffer" */ 556 | avx512packwithoutmask(datain + k * AVX512BlockSize, buffer, b); 557 | /* we read back b1 512-bit vectors at "buffer" and write 512 integers at 558 | * backbuffer */ 559 | avx512unpack(buffer, backbuffer, b); /* uncompressed */ 560 | for (j = 0; j < 
AVX512BlockSize; ++j) { 561 | if (backbuffer[j] != datain[k * AVX512BlockSize + j]) { 562 | int i; 563 | printf("bug in avx512pack\n"); 564 | for (i = 0; i < AVX512BlockSize; ++i) { 565 | printf("data[%d]=%d got back %d %s\n", i, 566 | datain[k * AVX512BlockSize + i], backbuffer[i], 567 | datain[k * AVX512BlockSize + i] != backbuffer[i] ? "bug" 568 | : ""); 569 | } 570 | return -2; 571 | } 572 | } 573 | } 574 | } 575 | free(buffer); 576 | free(datain); 577 | free(backbuffer); 578 | printf("Code looks good.\n"); 579 | return 0; 580 | } 581 | #endif /* avx512 */ 582 | 583 | int test() { 584 | int N = 5000 * SIMDBlockSize, gap; 585 | __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); 586 | uint32_t *datain = malloc(N * sizeof(uint32_t)); 587 | uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); 588 | printf("[%s]\n", __func__); 589 | for (gap = 1; gap <= 387420489; gap *= 3) { 590 | int k; 591 | printf(" gap = %u \n", gap); 592 | for (k = 0; k < N; ++k) 593 | datain[k] = (uint32_t)(((uint64_t)k * gap) & 0xFFFFFFFF); 594 | for (k = 0; k * SIMDBlockSize < N; ++k) { 595 | /* 596 | First part works for general arrays (sorted or unsorted) 597 | */ 598 | int j; 599 | /* we compute the bit width */ 600 | const uint32_t b = maxbits(datain + k * SIMDBlockSize); 601 | /* we read 128 integers at "datain + k * SIMDBlockSize" and 602 | write b 128-bit vectors at "buffer" */ 603 | simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); 604 | /* we read back b1 128-bit vectors at "buffer" and write 128 integers at 605 | * backbuffer */ 606 | simdunpack(buffer, backbuffer, b); /* uncompressed */ 607 | for (j = 0; j < SIMDBlockSize; ++j) { 608 | if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { 609 | printf("bug in simdpack\n"); 610 | return -2; 611 | } 612 | } 613 | 614 | { 615 | /* 616 | next part assumes that the data is sorted (uses differential coding) 617 | */ 618 | uint32_t offset = 0; 619 | /* we compute the bit width */ 620 | const uint32_t b1 = 
simdmaxbitsd1(offset, datain + k * SIMDBlockSize); 621 | /* we read 128 integers at "datain + k * SIMDBlockSize" and 622 | write b1 128-bit vectors at "buffer" */ 623 | simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, b1); 624 | /* we read back b1 128-bit vectors at "buffer" and write 128 integers at 625 | * backbuffer */ 626 | simdunpackd1(offset, buffer, backbuffer, b1); 627 | for (j = 0; j < SIMDBlockSize; ++j) { 628 | if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { 629 | printf("bug in simdpack d1\n"); 630 | return -3; 631 | } 632 | } 633 | offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; 634 | } 635 | } 636 | } 637 | free(buffer); 638 | free(datain); 639 | free(backbuffer); 640 | printf("Code looks good.\n"); 641 | return 0; 642 | } 643 | 644 | #ifdef __SSE4_1__ 645 | int testFOR() { 646 | int N = 5000 * SIMDBlockSize, gap; 647 | __m128i *buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); 648 | uint32_t *datain = malloc(N * sizeof(uint32_t)); 649 | uint32_t *backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); 650 | uint32_t tmax, tmin, tb; 651 | printf("[%s]\n", __func__); 652 | for (gap = 1; gap <= 387420489; gap *= 2) { 653 | int k; 654 | printf(" gap = %u \n", gap); 655 | for (k = 0; k < N; ++k) 656 | datain[k] = (uint32_t)(((uint64_t)k * gap) & 0xFFFFFFFF); 657 | for (k = 0; k * SIMDBlockSize < N; ++k) { 658 | int j; 659 | simdmaxmin_length(datain + k * SIMDBlockSize, SIMDBlockSize, &tmin, 660 | &tmax); 661 | /* we compute the bit width */ 662 | tb = bits(tmax - tmin); 663 | 664 | /* we read 128 integers at "datain + k * SIMDBlockSize" and 665 | write b 128-bit vectors at "buffer" */ 666 | simdpackFOR(tmin, datain + k * SIMDBlockSize, buffer, tb); 667 | 668 | for (j = 0; j < SIMDBlockSize; ++j) { 669 | uint32_t selectedvalue = simdselectFOR(tmin, buffer, tb, j); 670 | if (selectedvalue != datain[k * SIMDBlockSize + j]) { 671 | printf("bug in simdselectFOR\n"); 672 | return -3; 673 | } 674 | } 675 | /* we read back b1 
128-bit vectors at "buffer" and write 128 integers at 676 | * backbuffer */ 677 | simdunpackFOR(tmin, buffer, backbuffer, tb); /* uncompressed */ 678 | for (j = 0; j < SIMDBlockSize; ++j) { 679 | if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { 680 | printf("bug in simdpackFOR\n"); 681 | return -2; 682 | } 683 | } 684 | } 685 | } 686 | free(buffer); 687 | free(datain); 688 | free(backbuffer); 689 | printf("Code looks good.\n"); 690 | return 0; 691 | } 692 | #endif 693 | 694 | #define MAX 300 695 | int test_simdmaxbitsd1_length() { 696 | uint32_t result, buffer[MAX + 1]; 697 | int i, j; 698 | 699 | memset(&buffer[0], 0xff, sizeof(buffer)); 700 | printf("[%s]\n", __func__); 701 | /* this test creates buffers of different length; each buffer is 702 | * initialized to result in the following deltas: 703 | * length 1: 2 704 | * length 2: 1 2 705 | * length 3: 1 1 2 706 | * length 4: 1 1 1 2 707 | * length 5: 1 1 1 1 2 708 | * etc. Each sequence's "maxbits" is 2. */ 709 | for (i = 0; i < MAX; i++) { 710 | for (j = 0; j < i; j++) 711 | buffer[j] = j + 1; 712 | buffer[i] = i + 2; 713 | 714 | result = simdmaxbitsd1_length(0, &buffer[0], i + 1); 715 | if (result != 2) { 716 | printf("simdmaxbitsd1_length: unexpected result %u in loop %d\n", result, 717 | i); 718 | return -1; 719 | } 720 | } 721 | printf("simdmaxbitsd1_length: ok\n"); 722 | return 0; 723 | } 724 | 725 | int uint32_cmp(const void *a, const void *b) { 726 | const uint32_t *ia = (const uint32_t *)a; 727 | const uint32_t *ib = (const uint32_t *)b; 728 | if (*ia < *ib) 729 | return -1; 730 | else if (*ia > *ib) 731 | return 1; 732 | return 0; 733 | } 734 | 735 | #ifdef __SSE4_1__ 736 | int test_simdpackedsearch() { 737 | uint32_t buffer[128]; 738 | uint32_t result = 0; 739 | int b, i; 740 | uint32_t init = 0; 741 | __m128i initial = _mm_set1_epi32(init); 742 | printf("[%s]\n", __func__); 743 | /* initialize the buffer */ 744 | for (i = 0; i < 128; i++) 745 | buffer[i] = (uint32_t)(i + 1); 746 | 747 | /* this 
test creates delta encoded buffers with different bits, then 748 | * performs lower bound searches for each key */ 749 | for (b = 1; b <= 32; b++) { 750 | uint32_t out[128]; 751 | /* delta-encode to 'i' bits */ 752 | simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b); 753 | initial = _mm_setzero_si128(); 754 | printf("simdsearchd1: %d bits\n", b); 755 | 756 | /* now perform the searches */ 757 | initial = _mm_set1_epi32(init); 758 | assert(simdsearchd1(&initial, (__m128i *)out, b, 0, &result) == 0); 759 | assert(result > 0); 760 | 761 | for (i = 1; i <= 128; i++) { 762 | initial = _mm_set1_epi32(init); 763 | assert(simdsearchd1(&initial, (__m128i *)out, b, (uint32_t)i, &result) == 764 | i - 1); 765 | assert(result == (unsigned)i); 766 | } 767 | initial = _mm_set1_epi32(init); 768 | assert(simdsearchd1(&initial, (__m128i *)out, b, 200, &result) == 128); 769 | assert(result > 200); 770 | } 771 | printf("simdsearchd1: ok\n"); 772 | return 0; 773 | } 774 | 775 | int test_simdpackedsearchFOR() { 776 | uint32_t buffer[128]; 777 | uint32_t result = 0; 778 | int b; 779 | uint32_t i; 780 | uint32_t maxv, tmin, tmax, tb; 781 | uint32_t out[128]; 782 | printf("[%s]\n", __func__); 783 | /* this test creates delta encoded buffers with different bits, then 784 | * performs lower bound searches for each key */ 785 | for (b = 1; b <= 32; b++) { 786 | /* initialize the buffer */ 787 | maxv = (b == 32) ? 
0xFFFFFFFF : ((1U << b) - 1); 788 | for (i = 0; i < 128; i++) 789 | buffer[i] = maxv * (i + 1) / 128; 790 | simdmaxmin_length(buffer, SIMDBlockSize, &tmin, &tmax); 791 | /* we compute the bit width */ 792 | tb = bits(tmax - tmin); 793 | /* delta-encode to 'i' bits */ 794 | simdpackFOR(tmin, buffer, (__m128i *)out, tb); 795 | printf("simdsearchd1: %d bits\n", b); 796 | 797 | /* now perform the searches */ 798 | for (i = 0; i < 128; i++) { 799 | assert(buffer[i] == simdselectFOR(tmin, (__m128i *)out, tb, i)); 800 | } 801 | for (i = 0; i < 128; i++) { 802 | int x = simdsearchwithlengthFOR(tmin, (__m128i *)out, tb, 128, buffer[i], 803 | &result); 804 | assert(simdselectFOR(tmin, (__m128i *)out, tb, x) == buffer[x]); 805 | assert(simdselectFOR(tmin, (__m128i *)out, tb, x) == result); 806 | assert(buffer[x] == result); 807 | assert(result == buffer[i]); 808 | assert(buffer[x] == buffer[i]); 809 | } 810 | } 811 | printf("simdsearchFOR: ok\n"); 812 | return 0; 813 | } 814 | 815 | int test_simdpackedsearch_advanced() { 816 | uint32_t buffer[128]; 817 | uint32_t backbuffer[128]; 818 | uint32_t out[128]; 819 | uint32_t result = 0; 820 | uint32_t b, i; 821 | uint32_t init = 0; 822 | __m128i initial = _mm_set1_epi32(init); 823 | 824 | printf("[%s]\n", __func__); 825 | /* this test creates delta encoded buffers with different bits, then 826 | * performs lower bound searches for each key */ 827 | for (b = 0; b <= 32; b++) { 828 | uint32_t prev = init; 829 | /* initialize the buffer */ 830 | for (i = 0; i < 128; i++) { 831 | buffer[i] = ((uint32_t)(1431655765 * i + 0xFFFFFFFF)); 832 | if (b < 32) 833 | buffer[i] %= (1U << b); 834 | } 835 | 836 | qsort(buffer, 128, sizeof(uint32_t), uint32_cmp); 837 | 838 | for (i = 0; i < 128; i++) { 839 | buffer[i] = buffer[i] + prev; 840 | prev = buffer[i]; 841 | } 842 | for (i = 1; i < 128; i++) { 843 | if (buffer[i] < buffer[i - 1]) 844 | buffer[i] = buffer[i - 1]; 845 | } 846 | assert(simdmaxbitsd1(init, buffer) <= b); 847 | for (i = 0; i < 
128; i++) { 848 | out[i] = 0; /* memset would do too */ 849 | } 850 | 851 | /* delta-encode to 'i' bits */ 852 | simdpackwithoutmaskd1(init, buffer, (__m128i *)out, b); 853 | simdunpackd1(init, (__m128i *)out, backbuffer, b); 854 | 855 | for (i = 0; i < 128; i++) { 856 | assert(buffer[i] == backbuffer[i]); 857 | } 858 | 859 | printf("advanced simdsearchd1: %d bits\n", b); 860 | 861 | for (i = 0; i < 128; i++) { 862 | int pos; 863 | initial = _mm_set1_epi32(init); 864 | pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i], &result); 865 | assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, 866 | buffer[i], &result)); 867 | assert(buffer[pos] == buffer[i]); 868 | if (pos > 0) 869 | assert(buffer[pos - 1] < buffer[i]); 870 | assert(result == buffer[i]); 871 | } 872 | for (i = 0; i < 128; i++) { 873 | int pos; 874 | if (buffer[i] == 0) 875 | continue; 876 | initial = _mm_set1_epi32(init); 877 | pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i] - 1, &result); 878 | assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, 879 | buffer[i] - 1, &result)); 880 | assert(buffer[pos] >= buffer[i] - 1); 881 | if (pos > 0) 882 | assert(buffer[pos - 1] < buffer[i] - 1); 883 | assert(result == buffer[pos]); 884 | } 885 | for (i = 0; i < 128; i++) { 886 | int pos; 887 | if (buffer[i] + 1 == 0) 888 | continue; 889 | initial = _mm_set1_epi32(init); 890 | pos = simdsearchd1(&initial, (__m128i *)out, b, buffer[i] + 1, &result); 891 | assert(pos == simdsearchwithlengthd1(init, (__m128i *)out, b, 128, 892 | buffer[i] + 1, &result)); 893 | if (pos == 128) { 894 | assert(buffer[i] == buffer[127]); 895 | } else { 896 | assert(buffer[pos] >= buffer[i] + 1); 897 | if (pos > 0) 898 | assert(buffer[pos - 1] < buffer[i] + 1); 899 | assert(result == buffer[pos]); 900 | } 901 | } 902 | } 903 | printf("advanced simdsearchd1: ok\n"); 904 | return 0; 905 | } 906 | 907 | int test_simdpackedselect() { 908 | uint32_t buffer[128]; 909 | uint32_t initial = 33; 910 | 
int b, i; 911 | printf("[%s]\n", __func__); 912 | /* initialize the buffer */ 913 | for (i = 0; i < 128; i++) 914 | buffer[i] = (uint32_t)(initial + i); 915 | 916 | /* this test creates delta encoded buffers with different bits, then 917 | * performs lower bound searches for each key */ 918 | for (b = 1; b <= 32; b++) { 919 | uint32_t out[128]; 920 | /* delta-encode to 'i' bits */ 921 | simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b); 922 | 923 | printf("simdselectd1: %d bits\n", b); 924 | 925 | /* now perform the searches */ 926 | for (i = 0; i < 128; i++) { 927 | assert(simdselectd1(initial, (__m128i *)out, b, (uint32_t)i) == 928 | initial + i); 929 | } 930 | } 931 | printf("simdselectd1: ok\n"); 932 | return 0; 933 | } 934 | 935 | int test_simdpackedselect_advanced() { 936 | uint32_t buffer[128]; 937 | uint32_t initial = 33; 938 | uint32_t b; 939 | int i; 940 | printf("[%s]\n", __func__); 941 | /* this test creates delta encoded buffers with different bits, then 942 | * performs lower bound searches for each key */ 943 | for (b = 0; b <= 32; b++) { 944 | uint32_t prev = initial; 945 | uint32_t out[128]; 946 | /* initialize the buffer */ 947 | for (i = 0; i < 128; i++) { 948 | buffer[i] = ((uint32_t)(165576 * i)); 949 | if (b < 32) 950 | buffer[i] %= (1U << b); 951 | } 952 | for (i = 0; i < 128; i++) { 953 | buffer[i] = buffer[i] + prev; 954 | prev = buffer[i]; 955 | } 956 | 957 | for (i = 1; i < 128; i++) { 958 | if (buffer[i] < buffer[i - 1]) 959 | buffer[i] = buffer[i - 1]; 960 | } 961 | assert(simdmaxbitsd1(initial, buffer) <= b); 962 | 963 | for (i = 0; i < 128; i++) { 964 | out[i] = 0; /* memset would do too */ 965 | } 966 | 967 | /* delta-encode to 'i' bits */ 968 | simdpackwithoutmaskd1(initial, buffer, (__m128i *)out, b); 969 | 970 | printf("simdselectd1: %d bits\n", b); 971 | 972 | /* now perform the searches */ 973 | for (i = 0; i < 128; i++) { 974 | uint32_t valretrieved = 975 | simdselectd1(initial, (__m128i *)out, b, (uint32_t)i); 976 | 
assert(valretrieved == buffer[i]); 977 | } 978 | } 979 | printf("advanced simdselectd1: ok\n"); 980 | return 0; 981 | } 982 | #endif 983 | 984 | int main() { 985 | int r; 986 | r = issue21(); 987 | if (r) { 988 | printf("test failure issue21\n"); 989 | return r; 990 | } 991 | r = issue21FOR(); 992 | if (r) { 993 | printf("test failure issue21FOR\n"); 994 | return r; 995 | } 996 | #ifdef __AVX512F__ 997 | r = testbabyavx512(); 998 | if (r) { 999 | printf("test failure baby avx512\n"); 1000 | return r; 1001 | } 1002 | 1003 | r = testavx512_2(); 1004 | if (r) { 1005 | printf("test failure 9 avx512\n"); 1006 | return r; 1007 | } 1008 | #endif 1009 | 1010 | r = testsetFOR(); 1011 | if (r) { 1012 | printf("test failure 1\n"); 1013 | return r; 1014 | } 1015 | 1016 | #ifdef __SSE4_1__ 1017 | r = testsetd1(); 1018 | if (r) { 1019 | printf("test failure 2\n"); 1020 | return r; 1021 | } 1022 | #endif 1023 | r = testset(); 1024 | if (r) { 1025 | printf("test failure 3\n"); 1026 | return r; 1027 | } 1028 | 1029 | r = testshortFORpack(); 1030 | if (r) { 1031 | printf("test failure 4\n"); 1032 | return r; 1033 | } 1034 | r = testshortpack(); 1035 | if (r) { 1036 | printf("test failure 5\n"); 1037 | return r; 1038 | } 1039 | r = testlongpack(); 1040 | if (r) { 1041 | printf("test failure 6\n"); 1042 | return r; 1043 | } 1044 | #ifdef __SSE4_1__ 1045 | r = test_simdpackedsearchFOR(); 1046 | if (r) { 1047 | printf("test failure 7\n"); 1048 | return r; 1049 | } 1050 | 1051 | r = testFOR(); 1052 | if (r) { 1053 | printf("test failure 8\n"); 1054 | return r; 1055 | } 1056 | #endif 1057 | #ifdef __AVX2__ 1058 | r = testbabyavx(); 1059 | if (r) { 1060 | printf("test failure baby avx\n"); 1061 | return r; 1062 | } 1063 | 1064 | r = testavx2(); 1065 | if (r) { 1066 | printf("test failure 9 avx\n"); 1067 | return r; 1068 | } 1069 | #endif 1070 | r = test(); 1071 | if (r) { 1072 | printf("test failure 9\n"); 1073 | return r; 1074 | } 1075 | 1076 | r = test_simdmaxbitsd1_length(); 1077 | if 
(r) { 1078 | printf("test failure 10\n"); 1079 | return r; 1080 | } 1081 | #ifdef __SSE4_1__ 1082 | r = test_simdpackedsearch(); 1083 | if (r) { 1084 | printf("test failure 11\n"); 1085 | return r; 1086 | } 1087 | 1088 | r = test_simdpackedsearch_advanced(); 1089 | if (r) { 1090 | printf("test failure 12\n"); 1091 | return r; 1092 | } 1093 | 1094 | r = test_simdpackedselect(); 1095 | if (r) { 1096 | printf("test failure 13\n"); 1097 | return r; 1098 | } 1099 | 1100 | r = test_simdpackedselect_advanced(); 1101 | if (r) { 1102 | printf("test failure 14\n"); 1103 | return r; 1104 | } 1105 | #endif 1106 | printf("All tests OK!\n"); 1107 | 1108 | return 0; 1109 | } 1110 | --------------------------------------------------------------------------------