├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── include ├── vqf_filter.h ├── vqf_precompute.h └── vqf_wrapper.h ├── scripts ├── Performance.gnumeric ├── merge_into_develop.sh └── merge_into_master.sh └── src ├── bm.cc ├── generate_shuffle_matrix.cc ├── main.cc ├── main_id.cc ├── main_tx.cc ├── shuffle_matrix_256.c ├── shuffle_matrix_512.c ├── shuffle_matrix_512_16.c └── vqf_filter.c /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | main 35 | bm 36 | 37 | obj/* 38 | data/* 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TARGETS= main main_tx main_id bm 2 | 3 | OPT=-Ofast -g 4 | 5 | ARCH=-msse4.2 6 | 7 | ifeq ($(P),1) 8 | OPT=-g -no-pie 9 | endif 10 | 11 | HAVE_AVX512=$(filter-out 0,$(shell lscpu | grep avx512bw | wc -l)) 12 | 13 | ifeq ($(THREAD),1) 14 | OPT +=-DENABLE_THREADS 15 | endif 16 | 17 | CXX = g++ -std=c++11 -fgnu-tm -frename-registers -march=native 18 | CC = gcc -std=gnu11 -fgnu-tm -frename-registers -march=native 19 | LD= g++ -std=c++11 20 | 21 | LOC_INCLUDE=include 22 | LOC_SRC=src 23 | OBJDIR=obj 24 | 25 | CXXFLAGS += -Wall $(DEBUG) $(PROFILE) $(OPT) $(ARCH) -m64 -I. -I$(LOC_INCLUDE) 26 | 27 | CFLAGS += -Wall $(DEBUG) $(PROFILE) $(OPT) $(ARCH) -m64 -I. 
-I$(LOC_INCLUDE) 28 | 29 | LDFLAGS += $(DEBUG) $(PROFILE) $(OPT) -lpthread -lssl -lcrypto -lm -litm 30 | 31 | # 32 | # declaration of dependencies 33 | # 34 | 35 | all: $(TARGETS) 36 | 37 | # dependencies between programs and .o files 38 | ifeq ($(HAVE_AVX512),1) 39 | main: $(OBJDIR)/main.o $(OBJDIR)/vqf_filter.o $(OBJDIR)/shuffle_matrix_512.o $(OBJDIR)/shuffle_matrix_512_16.o 40 | main_id: $(OBJDIR)/main_id.o $(OBJDIR)/vqf_filter.o $(OBJDIR)/shuffle_matrix_512.o $(OBJDIR)/shuffle_matrix_512_16.o 41 | main_tx: $(OBJDIR)/main_tx.o $(OBJDIR)/vqf_filter.o $(OBJDIR)/shuffle_matrix_512.o $(OBJDIR)/shuffle_matrix_512_16.o 42 | bm: $(OBJDIR)/bm.o $(OBJDIR)/vqf_filter.o $(OBJDIR)/shuffle_matrix_512.o $(OBJDIR)/shuffle_matrix_512_16.o 43 | else 44 | main: $(OBJDIR)/main.o $(OBJDIR)/vqf_filter.o 45 | main_id: $(OBJDIR)/main_id.o $(OBJDIR)/vqf_filter.o 46 | main_tx: $(OBJDIR)/main_tx.o $(OBJDIR)/vqf_filter.o 47 | bm: $(OBJDIR)/bm.o $(OBJDIR)/vqf_filter.o 48 | endif 49 | 50 | # dependencies between .o files and .cc (or .c) files 51 | $(OBJDIR)/shuffle_matrix_512_16.o: $(LOC_SRC)/shuffle_matrix_512_16.c 52 | $(OBJDIR)/shuffle_matrix_512.o: $(LOC_SRC)/shuffle_matrix_512.c 53 | $(OBJDIR)/main.o: $(LOC_SRC)/main.cc 54 | $(OBJDIR)/main_id.o: $(LOC_SRC)/main_id.cc 55 | $(OBJDIR)/main_tx.o: $(LOC_SRC)/main_tx.cc 56 | $(OBJDIR)/bm.o: $(LOC_SRC)/bm.cc 57 | 58 | $(OBJDIR)/vqf_filter.o: $(LOC_SRC)/vqf_filter.c 59 | 60 | # 61 | # generic build rules 62 | # 63 | 64 | $(TARGETS): 65 | $(LD) $^ $(LDFLAGS) -o $@ 66 | 67 | $(OBJDIR)/%.o: $(LOC_SRC)/%.cc | $(OBJDIR) 68 | $(CXX) $(CXXFLAGS) $(INCLUDE) -c -o $@ $< 69 | 70 | $(OBJDIR)/%.o: $(LOC_SRC)/%.c | $(OBJDIR) 71 | $(CXX) $(CFLAGS) $(INCLUDE) -c -o $@ $< 72 | 73 | $(OBJDIR): 74 | @mkdir -p $(OBJDIR) 75 | 76 | clean: 77 | rm -rf $(OBJDIR) core $(TARGETS) 78 | 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 
vqf 2 | Vector Quotient Filters: Overcoming the Time/Space Trade-Off in Filter Design 3 | 4 | This work appeared at SIGMOD 2021. If you use this software please cite us: 5 | ``` 6 | @inproceedings{PandeyCDB21, 7 | author = {Prashant Pandey and 8 | Alex Conway and 9 | Joe Durie and 10 | Michael A. Bender and 11 | Martin Farach-Colton and 12 | Rob Johnson}, 13 | title = {Vector Quotient Filters: Overcoming the Time/Space Trade-Off in Filter Design}, 14 | booktitle={Proceedings of the 2021 ACM international conference on Management of Data}, 15 | year = {2021}, 16 | } 17 | ``` 18 | 19 | Overview 20 | -------- 21 | The VQF supports approximate membership testing of 22 | items in a data set. The VQF is based on Robin Hood hashing, like the quotient 23 | filter, but uses power-of-two-choices hashing to reduce the variance of 24 | runs, and thus offers consistent, high throughput across load factors. 25 | Power-of-two-choices hashing also makes it more amenable to concurrent updates. 26 | 27 | API 28 | -------- 29 | * 'vqf_insert(item)': insert an item into the filter 30 | * 'vqf_is_present(item)': return the existence of the item. Note that this 31 | method may return false positive results like Bloom filters. 32 | * 'vqf_remove(item)': remove the item. 33 | 34 | Build 35 | ------- 36 | This library depends on libssl. 37 | 38 | The code uses AVX512 instructions to speed up operations. However, there is also 39 | an alternate implementation based on AVX2. 40 | 41 | ```bash 42 | $ make main 43 | $ ./main 24 44 | ``` 45 | 46 | To build the code with thread-safe insertions: 47 | ```bash 48 | $ make THREAD=1 main_tx 49 | $ ./main_tx 24 4 50 | ``` 51 | 52 | The argument to main is the base-2 logarithm of the number of slots in the VQF. For example, 53 | to create a VQF with 2^30 slots, the argument will be 30. 54 | 55 | Contributing 56 | ------------ 57 | Contributions via GitHub pull requests are welcome. 
58 | 59 | 60 | Authors 61 | ------- 62 | - Prashant Pandey 63 | - Alex Conway 64 | - Rob Johnson 65 | -------------------------------------------------------------------------------- /include/vqf_filter.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================ 3 | * 4 | * Filename: vqf_filter.h 5 | * 6 | * Author: Prashant Pandey (), ppandey@berkeley.edu 7 | * Organization: LBNL/UCB 8 | * 9 | * ============================================================================ 10 | */ 11 | 12 | #ifndef _VQF_FILTER_H_ 13 | #define _VQF_FILTER_H_ 14 | 15 | #include 16 | #include 17 | 18 | #ifdef __cplusplus 19 | #define restrict __restrict__ 20 | extern "C" { 21 | #endif 22 | 23 | // NOTE: Currently the code only works for TAG_BITS 8 and 16. 24 | #define TAG_BITS 8 25 | 26 | // metadata: 1 --> end of the run 27 | // Each 1 is preceded by k 0s, where k is the number of remainders in that 28 | // run. 29 | 30 | #if TAG_BITS == 8 31 | // We are using 8-bit tags. 32 | // One block consists of 48 8-bit slots covering 80 buckets, and 80+48 = 128 33 | // bits of metadata. 34 | typedef struct __attribute__ ((__packed__)) vqf_block { 35 | uint64_t md[2]; 36 | uint8_t tags[48]; 37 | } vqf_block; 38 | #elif TAG_BITS == 12 39 | // We are using 12-bit tags. 40 | // One block consists of 32 12-bit slots covering 96 buckets, and 96+32 = 128 41 | // bits of metadata. 42 | // NOTE: not supported yet. 43 | typedef struct __attribute__ ((__packed__)) vqf_block { 44 | uint64_t md[2]; 45 | uint8_t tags[32]; // 32 12-bit tags 46 | } vqf_block; 47 | #elif TAG_BITS == 16 48 | // We are using 16-bit tags. 49 | // One block consists of 28 16-bit slots covering 36 buckets, and 36+28 = 64 50 | // bits of metadata. 
51 | typedef struct __attribute__ ((__packed__)) vqf_block { 52 | uint64_t md; 53 | uint16_t tags[28]; 54 | } vqf_block; 55 | #endif 56 | 57 | typedef struct vqf_metadata { 58 | uint64_t total_size_in_bytes; 59 | uint64_t key_remainder_bits; 60 | uint64_t range; 61 | uint64_t nblocks; 62 | uint64_t nelts; 63 | uint64_t nslots; 64 | } vqf_metadata; 65 | 66 | typedef struct vqf_filter { 67 | vqf_metadata metadata; 68 | vqf_block blocks[]; 69 | } vqf_filter; 70 | 71 | vqf_filter * vqf_init(uint64_t nslots); 72 | 73 | bool vqf_insert(vqf_filter * restrict filter, uint64_t hash); 74 | 75 | bool vqf_remove(vqf_filter * restrict filter, uint64_t hash); 76 | 77 | bool vqf_is_present(vqf_filter * restrict filter, uint64_t hash); 78 | 79 | #ifdef __cplusplus 80 | } 81 | #endif 82 | 83 | #endif // _VQF_FILTER_H_ 84 | 85 | 86 | -------------------------------------------------------------------------------- /include/vqf_precompute.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================ 3 | * 4 | * Filename: vqf_precompute.h 5 | * 6 | * Author: Prashant Pandey (), ppandey@berkeley.edu 7 | * Organization: LBNL/UCB 8 | * 9 | * ============================================================================ 10 | */ 11 | 12 | 13 | static uint64_t pre_one[64 + 128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 | 1ULL << 0, 1ULL << 1, 1ULL << 2, 1ULL << 3, 1ULL << 4, 1ULL << 5, 1ULL << 6, 1ULL << 7, 1ULL << 8, 1ULL << 9, 15 | 1ULL << 10, 1ULL << 11, 1ULL << 12, 1ULL << 13, 1ULL << 14, 1ULL << 15, 1ULL << 16, 1ULL << 17, 1ULL << 18, 1ULL << 19, 16 | 1ULL << 20, 1ULL << 21, 1ULL << 22, 1ULL << 23, 1ULL << 24, 1ULL << 25, 1ULL << 26, 1ULL << 27, 1ULL << 28, 1ULL << 29, 17 | 1ULL << 30, 1ULL << 31, 1ULL << 32, 1ULL << 33, 
1ULL << 34, 1ULL << 35, 1ULL << 36, 1ULL << 37, 1ULL << 38, 1ULL << 39, 18 | 1ULL << 40, 1ULL << 41, 1ULL << 42, 1ULL << 43, 1ULL << 44, 1ULL << 45, 1ULL << 46, 1ULL << 47, 1ULL << 48, 1ULL << 49, 19 | 1ULL << 50, 1ULL << 51, 1ULL << 52, 1ULL << 53, 1ULL << 54, 1ULL << 55, 1ULL << 56, 1ULL << 57, 1ULL << 58, 1ULL << 59, 20 | 1ULL << 60, 1ULL << 61, 1ULL << 62, 1ULL << 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 21 | 22 | static uint64_t *one = pre_one + 64; 23 | 24 | const static uint64_t carry_pdep_table[128] { 25 | 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 26 | 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 27 | 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 28 | 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 29 | 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 30 | 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 31 | 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 32 | 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 1ULL, 33 | 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 34 | 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 35 | 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 36 | 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 37 | 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 38 | 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 39 | 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 40 | 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL, 0ULL 41 | }; 42 | 43 | const static uint64_t high_order_pdep_table[128] { 44 | ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), 45 | ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), 46 | ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), 47 | ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), 
~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), 48 | ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), 49 | ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), 50 | ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), 51 | ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), ~(1ULL << 0), 52 | ~(1ULL << 0), ~(1ULL << 1), ~(1ULL << 2), ~(1ULL << 3), ~(1ULL << 4), ~(1ULL << 5), ~(1ULL << 6), ~(1ULL << 7), 53 | ~(1ULL << 8), ~(1ULL << 9), ~(1ULL << 10), ~(1ULL << 11), ~(1ULL << 12), ~(1ULL << 13), ~(1ULL << 14), ~(1ULL << 15), 54 | ~(1ULL << 16), ~(1ULL << 17), ~(1ULL << 18), ~(1ULL << 19), ~(1ULL << 20), ~(1ULL << 21), ~(1ULL << 22), ~(1ULL << 23), 55 | ~(1ULL << 24), ~(1ULL << 25), ~(1ULL << 26), ~(1ULL << 27), ~(1ULL << 28), ~(1ULL << 29), ~(1ULL << 30), ~(1ULL << 31), 56 | ~(1ULL << 32), ~(1ULL << 33), ~(1ULL << 34), ~(1ULL << 35), ~(1ULL << 36), ~(1ULL << 37), ~(1ULL << 38), ~(1ULL << 39), 57 | ~(1ULL << 40), ~(1ULL << 41), ~(1ULL << 42), ~(1ULL << 43), ~(1ULL << 44), ~(1ULL << 45), ~(1ULL << 46), ~(1ULL << 47), 58 | ~(1ULL << 48), ~(1ULL << 49), ~(1ULL << 50), ~(1ULL << 51), ~(1ULL << 52), ~(1ULL << 53), ~(1ULL << 54), ~(1ULL << 55), 59 | ~(1ULL << 56), ~(1ULL << 57), ~(1ULL << 58), ~(1ULL << 59), ~(1ULL << 60), ~(1ULL << 61), ~(1ULL << 62), ~(1ULL << 63) 60 | }; 61 | 62 | const static uint64_t low_order_pdep_table[128] { 63 | ~(1ULL << 0), ~(1ULL << 1), ~(1ULL << 2), ~(1ULL << 3), ~(1ULL << 4), ~(1ULL << 5), ~(1ULL << 6), ~(1ULL << 7), 64 | ~(1ULL << 8), ~(1ULL << 9), ~(1ULL << 10), ~(1ULL << 11), ~(1ULL << 12), ~(1ULL << 13), ~(1ULL << 14), ~(1ULL << 15), 65 | ~(1ULL << 16), ~(1ULL << 17), ~(1ULL << 18), ~(1ULL << 19), ~(1ULL << 20), ~(1ULL << 21), ~(1ULL << 22), ~(1ULL << 23), 66 | ~(1ULL << 24), ~(1ULL << 25), ~(1ULL << 26), 
~(1ULL << 27), ~(1ULL << 28), ~(1ULL << 29), ~(1ULL << 30), ~(1ULL << 31), 67 | ~(1ULL << 32), ~(1ULL << 33), ~(1ULL << 34), ~(1ULL << 35), ~(1ULL << 36), ~(1ULL << 37), ~(1ULL << 38), ~(1ULL << 39), 68 | ~(1ULL << 40), ~(1ULL << 41), ~(1ULL << 42), ~(1ULL << 43), ~(1ULL << 44), ~(1ULL << 45), ~(1ULL << 46), ~(1ULL << 47), 69 | ~(1ULL << 48), ~(1ULL << 49), ~(1ULL << 50), ~(1ULL << 51), ~(1ULL << 52), ~(1ULL << 53), ~(1ULL << 54), ~(1ULL << 55), 70 | ~(1ULL << 56), ~(1ULL << 57), ~(1ULL << 58), ~(1ULL << 59), ~(1ULL << 60), ~(1ULL << 61), ~(1ULL << 62), ~(1ULL << 63), 71 | ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, 72 | ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, 73 | ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, 74 | ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, 75 | ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, 76 | ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, 77 | ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, 78 | ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL, ~0ULL 79 | }; 80 | 81 | const __m256i K0 = _mm256_setr_epi8( 82 | 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 83 | 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0); 84 | 85 | const __m256i K1 = _mm256_setr_epi8( 86 | 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 87 | 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70); 88 | 89 | const __m256i K[] = {K0, K1}; 90 | 91 | -------------------------------------------------------------------------------- /include/vqf_wrapper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================ 3 | * 4 | * Filename: vqf_wrapper.h 5 | * 6 | * Author: Prashant Pandey (), ppandey2@cs.cmu.edu 7 | * 
Organization: Carnegie Mellon University 8 | * 9 | * ============================================================================ 10 | */ 11 | 12 | #ifndef VQF_WRAPPER_H 13 | #define VQF_WRAPPER_H 14 | 15 | #include "vqf_filter.h" 16 | 17 | vqf_filter *q_filter; 18 | 19 | 20 | inline int q_init(uint64_t nbits) 21 | { 22 | uint64_t nslots = (1ULL << nbits); 23 | q_filter = vqf_init(nslots); 24 | return 0; 25 | } 26 | 27 | inline int q_insert(__uint128_t val) 28 | { 29 | if (!vqf_insert(q_filter, val)) 30 | return 0; 31 | return 1; 32 | } 33 | 34 | inline int q_lookup(__uint128_t val) 35 | { 36 | if (!vqf_is_present(q_filter, val)) 37 | return 0; 38 | return 1; 39 | } 40 | 41 | inline int q_remove(__uint128_t val) 42 | { 43 | if (!vqf_remove(q_filter, val)) 44 | return 0; 45 | return 1; 46 | } 47 | 48 | inline __uint128_t q_range() 49 | { 50 | //return q_filter->metadata.range; 51 | return UINT64_MAX; 52 | } 53 | 54 | inline int q_destroy() 55 | { 56 | return 0; 57 | } 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /scripts/Performance.gnumeric: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splatlab/vqf/62accaf20c0f6014f98fc8d9b995e244b6318ce3/scripts/Performance.gnumeric -------------------------------------------------------------------------------- /scripts/merge_into_develop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -eq 0 ] 4 | then 5 | echo "No input arguments provided. Usage is merge_into_develop.sh " 6 | exit 1 7 | fi 8 | 9 | feature=$1 10 | 11 | # from https://stackoverflow.com/questions/173919/is-there-a-theirs-version-of-git-merge-s-ours 12 | # in case branchA is not our current branch 13 | git checkout develop 14 | 15 | # make merge commit but without conflicts!! 
16 | # the contents of 'ours' will be discarded later 17 | git merge -s ours ${feature} 18 | 19 | # make temporary branch to merged commit 20 | git branch branchTEMP 21 | 22 | # get contents of working tree and index to the one of branchB 23 | git reset --hard ${feature} 24 | 25 | # reset to our merged commit but 26 | # keep contents of working tree and index 27 | git reset --soft branchTEMP 28 | 29 | # change the contents of the merged commit 30 | # with the contents of branchB 31 | git commit --amend 32 | 33 | # get rid of our temporary branch 34 | git branch -D branchTEMP 35 | 36 | # verify that the merge commit contains only contents of branchB 37 | git diff HEAD ${feature} 38 | -------------------------------------------------------------------------------- /scripts/merge_into_master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -eq 0 ] 4 | then 5 | echo "No input arguments provided. Usage is merge_into_master.sh " 6 | exit 1 7 | fi 8 | 9 | feature=$1 10 | 11 | # from https://stackoverflow.com/questions/173919/is-there-a-theirs-version-of-git-merge-s-ours 12 | # in case branchA is not our current branch 13 | git checkout master 14 | 15 | # make merge commit but without conflicts!! 
16 | # the contents of 'ours' will be discarded later 17 | git merge -s ours ${feature} 18 | 19 | # make temporary branch to merged commit 20 | git branch branchTEMP 21 | 22 | # get contents of working tree and index to the one of branchB 23 | git reset --hard ${feature} 24 | 25 | # reset to our merged commit but 26 | # keep contents of working tree and index 27 | git reset --soft branchTEMP 28 | 29 | # change the contents of the merged commit 30 | # with the contents of branchB 31 | git commit --amend 32 | 33 | # get rid off our temporary branch 34 | git branch -D branchTEMP 35 | 36 | # verify that the merge commit contains only contents of branchB 37 | git diff HEAD ${feature} 38 | -------------------------------------------------------------------------------- /src/bm.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: bm.c 5 | * 6 | * Description: 7 | * 8 | * Version: 1.0 9 | * Created: 05/18/2015 08:54:53 PM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: Prashant Pandey (ppandey@cs.stonybrook.edu), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "vqf_wrapper.h" 31 | 32 | typedef void *(*rand_init)(uint64_t maxoutputs, __uint128_t maxvalue, 33 | void *params); 34 | typedef int (*gen_rand)(void *state, uint64_t noutputs, __uint128_t *outputs); 35 | typedef void *(*duplicate_rand)(void *state); 36 | 37 | typedef int (*init_op)(uint64_t nvals); 38 | typedef int (*insert_op)(__uint128_t val); 39 | typedef int (*lookup_op)(__uint128_t val); 40 | typedef int (*remove_op)(__uint128_t val); 41 | typedef __uint128_t (*get_range_op)(); 42 | typedef int 
(*destroy_op)(); 43 | 44 | typedef struct rand_generator { 45 | rand_init init; 46 | gen_rand gen; 47 | duplicate_rand dup; 48 | } rand_generator; 49 | 50 | typedef struct filter { 51 | init_op init; 52 | insert_op insert; 53 | lookup_op lookup; 54 | remove_op remove; 55 | get_range_op range; 56 | destroy_op destroy; 57 | } filter; 58 | 59 | typedef struct uniform_pregen_state { 60 | uint64_t maxoutputs; 61 | uint64_t nextoutput; 62 | __uint128_t *outputs; 63 | } uniform_pregen_state; 64 | 65 | typedef struct uniform_online_state { 66 | uint64_t maxoutputs; 67 | __uint128_t maxvalue; 68 | unsigned int seed; 69 | char *buf; 70 | int STATELEN; 71 | struct random_data *rand_state; 72 | } uniform_online_state; 73 | 74 | void *uniform_pregen_init(uint64_t maxoutputs, __uint128_t maxvalue, 75 | void *params) { 76 | uint32_t i; 77 | uniform_pregen_state *state = 78 | (uniform_pregen_state *)malloc(sizeof(uniform_pregen_state)); 79 | assert(state != NULL); 80 | 81 | state->nextoutput = 0; 82 | 83 | state->maxoutputs = maxoutputs; 84 | state->outputs = 85 | (__uint128_t *)malloc(state->maxoutputs * sizeof(state->outputs[0])); 86 | assert(state->outputs != NULL); 87 | uint64_t nbytes = sizeof(*state->outputs) * state->maxoutputs; 88 | uint8_t *ptr = (unsigned char *)state->outputs; 89 | while (nbytes > (1ULL << 30)) { 90 | RAND_bytes(ptr, 1ULL << 30); 91 | ptr += (1ULL << 30); 92 | nbytes -= (1ULL << 30); 93 | } 94 | RAND_bytes(ptr, nbytes); 95 | for (i = 0; i < state->maxoutputs; i++) 96 | state->outputs[i] = (1 * state->outputs[i]) % maxvalue; 97 | 98 | return (void *)state; 99 | } 100 | 101 | int uniform_pregen_gen_rand(void *_state, uint64_t noutputs, 102 | __uint128_t *outputs) { 103 | uniform_pregen_state *state = (uniform_pregen_state *)_state; 104 | assert(state->nextoutput + noutputs <= state->maxoutputs); 105 | memcpy(outputs, state->outputs + state->nextoutput, 106 | noutputs * sizeof(*state->outputs)); 107 | state->nextoutput += noutputs; 108 | return noutputs; 
109 | } 110 | 111 | void *uniform_pregen_duplicate(void *state) { 112 | uniform_pregen_state *newstate = 113 | (uniform_pregen_state *)malloc(sizeof(*newstate)); 114 | assert(newstate); 115 | memcpy(newstate, state, sizeof(*newstate)); 116 | return newstate; 117 | } 118 | 119 | void *uniform_online_init(uint64_t maxoutputs, __uint128_t maxvalue, 120 | void *params) { 121 | uniform_online_state *state = 122 | (uniform_online_state *)malloc(sizeof(uniform_online_state)); 123 | assert(state != NULL); 124 | 125 | state->maxoutputs = maxoutputs; 126 | state->maxvalue = maxvalue; 127 | state->seed = time(NULL); 128 | state->STATELEN = 256; 129 | state->buf = (char *)calloc(256, sizeof(char)); 130 | state->rand_state = 131 | (struct random_data *)calloc(1, sizeof(struct random_data)); 132 | 133 | initstate_r(state->seed, state->buf, state->STATELEN, state->rand_state); 134 | return (void *)state; 135 | } 136 | 137 | int uniform_online_gen_rand(void *_state, uint64_t noutputs, 138 | __uint128_t *outputs) { 139 | uint32_t i, j; 140 | uniform_online_state *state = (uniform_online_state *)_state; 141 | assert(state->rand_state != NULL); 142 | memset(outputs, 0, noutputs * sizeof(__uint128_t)); 143 | for (i = 0; i < noutputs; i++) { 144 | int32_t result; 145 | for (j = 0; j < 4; j++) { 146 | random_r(state->rand_state, &result); 147 | outputs[i] = (outputs[i] * RAND_MAX) + result; 148 | } 149 | outputs[i] = (1 * outputs[i]) % state->maxvalue; 150 | } 151 | return noutputs; 152 | } 153 | 154 | void *uniform_online_duplicate(void *_state) { 155 | uniform_online_state *newstate = 156 | (uniform_online_state *)malloc(sizeof(uniform_online_state)); 157 | assert(newstate != NULL); 158 | uniform_online_state *oldstate = (uniform_online_state *)_state; 159 | 160 | newstate->maxvalue = oldstate->maxvalue; 161 | newstate->seed = oldstate->seed; 162 | newstate->STATELEN = oldstate->STATELEN; 163 | 164 | newstate->buf = (char *)calloc(256, sizeof(char)); 165 | memcpy(newstate->buf, 
oldstate->buf, newstate->STATELEN); 166 | newstate->rand_state = 167 | (struct random_data *)calloc(1, sizeof(struct random_data)); 168 | 169 | initstate_r(newstate->seed, newstate->buf, newstate->STATELEN, 170 | newstate->rand_state); 171 | return newstate; 172 | } 173 | 174 | rand_generator uniform_pregen = {uniform_pregen_init, uniform_pregen_gen_rand, 175 | uniform_pregen_duplicate}; 176 | 177 | rand_generator uniform_online = {uniform_online_init, uniform_online_gen_rand, 178 | uniform_online_duplicate}; 179 | 180 | filter cf = {q_init, q_insert, q_lookup, q_remove, q_range, q_destroy}; 181 | 182 | uint64_t tv2usec(struct timeval tv) { 183 | return 1000000 * tv.tv_sec + tv.tv_usec; 184 | } 185 | 186 | uint64_t tv2msec(struct timeval tv) { 187 | return tv.tv_sec * 1000 + tv.tv_usec / 1000; 188 | } 189 | 190 | int cmp_uint64_t(const void *a, const void *b) { 191 | const uint64_t *ua = (const uint64_t *)a, *ub = (const uint64_t *)b; 192 | return *ua < *ub ? -1 : *ua == *ub ? 0 : 1; 193 | } 194 | 195 | void usage(char *name) { 196 | printf( 197 | "%s [OPTIONS]\n" 198 | "Options are:\n" 199 | " -n nslots [ log_2 of filter capacity. Default 24 ]\n" 200 | " -r nruns [ number of runs. Default 1 ]\n" 201 | " -p npoints [ number of points on the graph. Default 20 ]\n" 202 | " -m randmode [ Data distribution, one of \n" 203 | " uniform_pregen\n" 204 | " uniform_online\n" 205 | " zipfian_pregen\n" 206 | " Default uniform_pregen ]\n" 207 | " -d datastruct [ Default qf. ]\n" 208 | " -f outputfile [ Default qf. 
]\n", 209 | name); 210 | } 211 | 212 | int main(int argc, char **argv) { 213 | uint32_t nbits = 0, nruns = 0; 214 | unsigned int npoints = 0; 215 | uint64_t nslots = 0, nvals = 0; 216 | char *randmode = "uniform_pregen"; 217 | char *datastruct = "qf"; 218 | char *outputfile = "qf"; 219 | 220 | filter filter_ds; 221 | rand_generator *vals_gen; 222 | void *vals_gen_state; 223 | void *old_vals_gen_state; 224 | void *remove_vals_gen_state; 225 | rand_generator *othervals_gen; 226 | void *othervals_gen_state; 227 | 228 | // __uint128_t *vals; 229 | // __uint128_t *othervals; 230 | 231 | unsigned int i, j, exp, run; 232 | struct timeval tv_insert[100][1]; 233 | struct timeval tv_exit_lookup[100][1]; 234 | struct timeval tv_false_lookup[100][1]; 235 | struct timeval tv_remove[100][1]; 236 | uint64_t fps = 0; 237 | 238 | FILE *fp_insert; 239 | FILE *fp_exit_lookup; 240 | FILE *fp_false_lookup; 241 | FILE *fp_remove; 242 | const char *dir = "./"; 243 | const char *insert_op = "-insert.txt\0"; 244 | const char *exit_lookup_op = "-exists-lookup.txt\0"; 245 | const char *false_lookup_op = "-false-lookup.txt\0"; 246 | const char *remove_op = "-remove.txt\0"; 247 | char filename_insert[256]; 248 | char filename_exit_lookup[256]; 249 | char filename_false_lookup[256]; 250 | char filename_remove[256]; 251 | 252 | /* Argument parsing */ 253 | int opt; 254 | char *term; 255 | 256 | while ((opt = getopt(argc, argv, "n:r:p:m:d:f:")) != -1) { 257 | switch (opt) { 258 | case 'n': 259 | nbits = strtol(optarg, &term, 10); 260 | if (*term) { 261 | fprintf(stderr, "Argument to -n must be an integer\n"); 262 | usage(argv[0]); 263 | exit(1); 264 | } 265 | nslots = (1ULL << nbits); 266 | nvals = 950 * nslots / 1000; 267 | break; 268 | case 'r': 269 | nruns = strtol(optarg, &term, 10); 270 | if (*term) { 271 | fprintf(stderr, "Argument to -r must be an integer\n"); 272 | usage(argv[0]); 273 | exit(1); 274 | } 275 | break; 276 | case 'p': 277 | npoints = strtol(optarg, &term, 10); 278 | if 
(*term) { 279 | fprintf(stderr, "Argument to -p must be an integer\n"); 280 | usage(argv[0]); 281 | exit(1); 282 | } 283 | break; 284 | case 'm': 285 | randmode = optarg; 286 | break; 287 | case 'd': 288 | datastruct = optarg; 289 | break; 290 | case 'f': 291 | outputfile = optarg; 292 | break; 293 | default: 294 | fprintf(stderr, "Unknown option\n"); 295 | usage(argv[0]); 296 | exit(1); 297 | break; 298 | } 299 | } 300 | 301 | if (strcmp(randmode, "uniform_pregen") == 0) { 302 | vals_gen = &uniform_pregen; 303 | othervals_gen = &uniform_pregen; 304 | } else if (strcmp(randmode, "uniform_online") == 0) { 305 | vals_gen = &uniform_online; 306 | othervals_gen = &uniform_online; 307 | } else { 308 | fprintf(stderr, "Unknown randmode.\n"); 309 | usage(argv[0]); 310 | exit(1); 311 | } 312 | 313 | if (strcmp(datastruct, "cf") == 0) { 314 | filter_ds = cf; 315 | // } else if (strcmp(datastruct, "gqf") == 0) { 316 | // filter_ds = gqf; 317 | // } else if (strcmp(datastruct, "qf") == 0) { 318 | // filter_ds = qf; 319 | // } else if (strcmp(datastruct, "bf") == 0) { 320 | // filter_ds = bf; 321 | } else { 322 | fprintf(stderr, "Unknown randmode.\n"); 323 | usage(argv[0]); 324 | exit(1); 325 | } 326 | 327 | snprintf(filename_insert, 328 | strlen(dir) + strlen(outputfile) + strlen(insert_op) + 1, "%s%s%s", 329 | dir, outputfile, insert_op); 330 | snprintf(filename_exit_lookup, 331 | strlen(dir) + strlen(outputfile) + strlen(exit_lookup_op) + 1, 332 | "%s%s%s", dir, outputfile, exit_lookup_op); 333 | 334 | snprintf(filename_false_lookup, 335 | strlen(dir) + strlen(outputfile) + strlen(false_lookup_op) + 1, 336 | "%s%s%s", dir, outputfile, false_lookup_op); 337 | snprintf(filename_remove, 338 | strlen(dir) + strlen(outputfile) + strlen(remove_op) + 1, "%s%s%s", 339 | dir, outputfile, remove_op); 340 | 341 | fp_insert = fopen(filename_insert, "w"); 342 | fp_exit_lookup = fopen(filename_exit_lookup, "w"); 343 | fp_false_lookup = fopen(filename_false_lookup, "w"); 344 | fp_remove = 
fopen(filename_remove, "w"); 345 | 346 | if (fp_insert == NULL || fp_exit_lookup == NULL || fp_false_lookup == NULL 347 | || fp_remove == NULL) { 348 | printf("Can't open the data file"); 349 | exit(1); 350 | } 351 | 352 | fprintf(fp_insert, "x_0"); 353 | for (run = 0; run < nruns; run++) { 354 | fprintf(fp_insert, " y_%d", run); 355 | } 356 | fprintf(fp_insert, "\n"); 357 | 358 | fprintf(fp_exit_lookup, "x_0"); 359 | for (run = 0; run < nruns; run++) { 360 | fprintf(fp_exit_lookup, " y_%d", run); 361 | } 362 | fprintf(fp_exit_lookup, "\n"); 363 | 364 | fprintf(fp_false_lookup, "x_0"); 365 | for (run = 0; run < nruns; run++) { 366 | fprintf(fp_false_lookup, " y_%d", run); 367 | } 368 | fprintf(fp_false_lookup, "\n"); 369 | 370 | fprintf(fp_remove, "x_0"); 371 | for (run = 0; run < nruns; run++) { 372 | fprintf(fp_remove, " y_%d", run); 373 | } 374 | fprintf(fp_remove, "\n"); 375 | 376 | fclose(fp_insert); 377 | fclose(fp_exit_lookup); 378 | fclose(fp_false_lookup); 379 | fclose(fp_remove); 380 | 381 | for (run = 0; run < nruns; run++) { 382 | fps = 0; 383 | filter_ds.init(nbits); 384 | 385 | vals_gen_state = vals_gen->init(nvals, filter_ds.range(), NULL); 386 | old_vals_gen_state = vals_gen->dup(vals_gen_state); 387 | remove_vals_gen_state = vals_gen->dup(vals_gen_state); 388 | sleep(5); 389 | othervals_gen_state = othervals_gen->init(nvals, filter_ds.range(), NULL); 390 | 391 | for (exp = 0; exp < 2 * npoints; exp += 2) { 392 | fp_insert = fopen(filename_insert, "a"); 393 | fp_exit_lookup = fopen(filename_exit_lookup, "a"); 394 | fp_false_lookup = fopen(filename_false_lookup, "a"); 395 | 396 | i = (exp / 2) * (nvals / npoints); 397 | j = ((exp / 2) + 1) * (nvals / npoints); 398 | printf("Round: %d\n", exp / 2); 399 | 400 | gettimeofday(&tv_insert[exp][run], NULL); 401 | for (; i < j; i += 1 << 16) { 402 | int nitems = j - i < 1 << 16 ? 
j - i : 1 << 16; 403 | __uint128_t vals[1 << 16]; 404 | int m; 405 | assert(vals_gen->gen(vals_gen_state, nitems, vals) == nitems); 406 | 407 | for (m = 0; m < nitems; m++) { 408 | filter_ds.insert(vals[m]); 409 | } 410 | } 411 | gettimeofday(&tv_insert[exp + 1][run], NULL); 412 | fprintf(fp_insert, "%d", ((exp / 2) * (100 / npoints))); 413 | fprintf(fp_insert, " %f\n", 414 | 1.0 * (nvals / npoints) / 415 | (tv2usec(tv_insert[exp + 1][run]) - 416 | tv2usec(tv_insert[exp][run]))); 417 | 418 | i = (exp / 2) * (nvals / 20); 419 | gettimeofday(&tv_exit_lookup[exp][run], NULL); 420 | for (; i < j; i += 1 << 16) { 421 | int nitems = j - i < 1 << 16 ? j - i : 1 << 16; 422 | __uint128_t vals[1 << 16]; 423 | int m; 424 | assert(vals_gen->gen(old_vals_gen_state, nitems, vals) == nitems); 425 | for (m = 0; m < nitems; m++) { 426 | if (!filter_ds.lookup(vals[m])) { 427 | // fprintf(stderr, "Failed lookup for 0x%lx%016lx\n", 428 | //(uint64_t)(vals[m] >> 64), 429 | //(uint64_t)(vals[m] & 0xffffffffffffffff)); 430 | // abort(); 431 | } 432 | } 433 | } 434 | gettimeofday(&tv_exit_lookup[exp + 1][run], NULL); 435 | fprintf(fp_exit_lookup, "%d", ((exp / 2) * (100 / npoints))); 436 | fprintf(fp_exit_lookup, " %f\n", 437 | 1.0 * (nvals / npoints) / 438 | (tv2usec(tv_exit_lookup[exp + 1][run]) - 439 | tv2usec(tv_exit_lookup[exp][run]))); 440 | 441 | i = (exp / 2) * (nvals / 20); 442 | gettimeofday(&tv_false_lookup[exp][run], NULL); 443 | for (; i < j; i += 1 << 16) { 444 | int nitems = j - i < 1 << 16 ? 
j - i : 1 << 16; 445 | __uint128_t othervals[1 << 16]; 446 | int m; 447 | assert(othervals_gen->gen(othervals_gen_state, nitems, othervals) == 448 | nitems); 449 | for (m = 0; m < nitems; m++) { 450 | fps += filter_ds.lookup(othervals[m]); 451 | } 452 | } 453 | gettimeofday(&tv_false_lookup[exp + 1][run], NULL); 454 | fprintf(fp_false_lookup, "%d", ((exp / 2) * (100 / npoints))); 455 | fprintf(fp_false_lookup, " %f\n", 456 | 1.0 * (nvals / npoints) / 457 | (tv2usec(tv_false_lookup[exp + 1][run]) - 458 | tv2usec(tv_false_lookup[exp][run]))); 459 | 460 | fclose(fp_insert); 461 | fclose(fp_exit_lookup); 462 | fclose(fp_false_lookup); 463 | } 464 | 465 | for (exp = 0; exp < 2 * npoints; exp += 2) { 466 | fp_remove = fopen(filename_remove, "a"); 467 | i = (exp / 2) * (nvals / npoints); 468 | j = ((exp / 2) + 1) * (nvals / npoints); 469 | printf("Round: %d\n", exp / 2); 470 | 471 | gettimeofday(&tv_remove[exp][run], NULL); 472 | for (; i < j; i += 1 << 16) { 473 | int nitems = j - i < 1 << 16 ? 
j - i : 1 << 16; 474 | __uint128_t vals[1 << 16]; 475 | int m; 476 | assert(vals_gen->gen(remove_vals_gen_state, nitems, vals) == nitems); 477 | 478 | for (m = 0; m < nitems; m++) { 479 | filter_ds.remove(vals[m]); 480 | } 481 | } 482 | gettimeofday(&tv_remove[exp + 1][run], NULL); 483 | fprintf(fp_remove, "%d", ((exp / 2) * (100 / npoints))); 484 | fprintf(fp_remove, " %f\n", 485 | 1.0 * (nvals / npoints) / 486 | (tv2usec(tv_remove[exp + 1][run]) - 487 | tv2usec(tv_remove[exp][run]))); 488 | 489 | fclose(fp_remove); 490 | } 491 | 492 | filter_ds.destroy(); 493 | } 494 | printf("Insert Performance written to file: %s\n", filename_insert); 495 | printf("Exist lookup Performance written to file: %s\n", filename_exit_lookup); 496 | printf("False lookup Performance written to file: %s\n", filename_false_lookup); 497 | printf("Remove Performance written to file: %s\n", filename_remove); 498 | 499 | printf("FP rate: %f (%lu/%lu)\n", 1.0 * fps / nvals, fps, nvals); 500 | 501 | return 0; 502 | } 503 | -------------------------------------------------------------------------------- /src/generate_shuffle_matrix.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================ 3 | * 4 | * Filename: generate_shuffle_matrix.cc 5 | * 6 | * Author: Prashant Pandey (), ppandey@berkeley.edu 7 | * Organization: LBNL/UCB 8 | * 9 | * ============================================================================ 10 | */ 11 | 12 | 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | /* 21 | #define SHUFFLE_SIZE 32 22 | 23 | void generate_shuffle_256(void) { 24 | std::ofstream shuffle_matrix("src/shuffle_matrix_256.c"); 25 | 26 | // generate right shuffle 27 | for (uint64_t index = 0; index < SHUFFLE_SIZE; index++) { 28 | shuffle_matrix << "const __m256i RM" << std::to_string(index) << " = _mm256_setr_epi8(\n"; 29 | for (uint8_t i = 0, j = 0; i < 
/*
 * Write one family of 512-bit permutation-index tables to `out`.
 *
 * The arguments of the _mm512_set_* intrinsics are listed from the highest
 * lane down to lane 0, so the values below are emitted in that order.
 *
 * For every `index` in [0, lanes) two constants are produced:
 *
 *   <insert_prefix><index>  lanes above `index` select lane - 1, lane `index`
 *                           selects the top lane (lanes - 1), lanes below
 *                           `index` select themselves.  Used as a permute
 *                           control this opens a hole at `index` and drops the
 *                           old top lane into it.
 *
 *   <remove_prefix><index>  the top lane selects itself, lanes in
 *                           [index, lanes - 2] select lane + 1, lanes below
 *                           `index` select themselves, i.e. lane `index` is
 *                           squeezed out.
 *
 * After each family an array collecting the constants is emitted;
 * `*_array_decl` is the full text up to and including the opening brace.
 *
 * The emitted file starts with the two x86 intrinsic headers it needs.  The
 * previous version wrote `#include` lines without a header name, which the
 * compiler rejects.  NOTE(review): <immintrin.h>/<tmmintrin.h> are the headers
 * that declare __m512i and the set intrinsics - confirm they match the
 * checked-in generated files.
 */
static void emit_shuffle_tables(std::ofstream &out, uint64_t lanes,
																const char *set_intrinsic,
																const char *insert_prefix,
																const char *insert_array_decl,
																const char *remove_prefix,
																const char *remove_array_decl)
{
	const uint64_t top = lanes - 1;

	out << "#include <immintrin.h>\n#include <tmmintrin.h>\n\n";

	// insert (right shuffle) tables
	for (uint64_t index = 0; index < lanes; index++) {
		out << "const __m512i " << insert_prefix << std::to_string(index)
			<< " = " << set_intrinsic << "(\n";
		uint64_t next = top;	// pre-decremented: first value written is top - 1
		for (uint64_t pos = 0; pos < lanes; pos++) {
			if (pos == top - index) {
				out << std::to_string(top);
			} else {
				out << std::to_string(--next);
			}
			if (pos < top)
				out << ", ";
		}
		out << ");\n";
	}
	out << '\n';
	out << insert_array_decl;
	for (uint64_t i = 0; i < lanes; i++) {
		out << insert_prefix << std::to_string(i);
		if (i < top)
			out << ", ";
	}
	out << "};\n";

	out << "\n";
	// remove (left shift) tables
	for (uint64_t index = 0; index < lanes; index++) {
		out << "const __m512i " << remove_prefix << std::to_string(index)
			<< " = " << set_intrinsic << "(\n";
		out << std::to_string(top) << ", "; // always overwrite the last item
		uint64_t next = lanes;	// pre-decremented: first value written is top
		for (uint64_t pos = 0; pos < top; pos++) {
			if (pos == top - index)
				--next;		// skip the lane that is being removed
			out << std::to_string(--next);
			if (pos + 1 < top)
				out << ", ";
		}
		out << ");\n";
	}
	out << '\n';
	out << remove_array_decl;
	for (uint64_t i = 0; i < lanes; i++) {
		out << remove_prefix << std::to_string(i);
		if (i < top)
			out << ", ";
	}
	out << "};\n";
}

/*
 * Generate src/shuffle_matrix_512.c: insert/remove tables for 64 one-byte
 * lanes (S0..S63 / SHUFFLE and R0..R63 / SHUFFLE_REMOVE).
 */
void generate_shuffle_512(void) {
	std::ofstream shuffle_matrix("src/shuffle_matrix_512.c");

	emit_shuffle_tables(shuffle_matrix, 64, "_mm512_set_epi8",
											"S", "const __m512i SHUFFLE [] = {",
											"R", "const __m512i SHUFFLE_REMOVE [] = {");
}

/*
 * Generate src/shuffle_matrix_512_16.c: insert/remove tables for 32 two-byte
 * lanes (S16_0..S16_31 / SHUFFLE16 and R16_0..R16_31 / SHUFFLE_REMOVE16).
 * The collecting arrays are intentionally emitted without `const`, exactly as
 * before.
 */
void generate_shuffle_512_16(void) {
	std::ofstream shuffle_matrix("src/shuffle_matrix_512_16.c");

	emit_shuffle_tables(shuffle_matrix, 32, "_mm512_set_epi16",
											"S16_", "__m512i SHUFFLE16 [] = {",
											"R16_", "__m512i SHUFFLE_REMOVE16 [] = {");
}
generate_shuffle_512_16(); 207 | return EXIT_SUCCESS; 208 | } /* ---------- end of function main ---------- */ 209 | -------------------------------------------------------------------------------- /src/main.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================ 3 | * 4 | * Filename: main.cc 5 | * 6 | * Author: Prashant Pandey (), ppandey2@cs.cmu.edu 7 | * Organization: Carnegie Mellon University 8 | * 9 | * ============================================================================ 10 | */ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include // portable to all x86 compilers 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "vqf_filter.h" 25 | 26 | #ifdef __AVX512BW__ 27 | extern __m512i SHUFFLE []; 28 | extern __m512i SHUFFLE_REMOVE []; 29 | extern __m512i SHUFFLE16 []; 30 | extern __m512i SHUFFLE_REMOVE16 []; 31 | #endif 32 | 33 | uint64_t tv2usec(struct timeval *tv) { 34 | return 1000000 * tv->tv_sec + tv->tv_usec; 35 | } 36 | 37 | /* Print elapsed time using the start and end timeval */ 38 | void print_time_elapsed(const char* desc, struct timeval* start, struct 39 | timeval* end, uint64_t ops, const char *opname) 40 | { 41 | uint64_t elapsed_usecs = tv2usec(end) - tv2usec(start); 42 | printf("%s Total Time Elapsed: %f seconds", desc, 1.0*elapsed_usecs / 1000000); 43 | if (ops) { 44 | printf(" (%f nanoseconds/%s)", 1000.0 * elapsed_usecs / ops, opname); 45 | } 46 | printf("\n"); 47 | } 48 | 49 | int main(int argc, char **argv) 50 | { 51 | if (argc < 2) { 52 | fprintf(stderr, "Please specify the log of the number of slots in the CQF.\n"); 53 | exit(1); 54 | } 55 | uint64_t qbits = atoi(argv[1]); 56 | uint64_t nslots = (1ULL << qbits); 57 | uint64_t nvals = 85*nslots/100; 58 | uint64_t *vals; 59 | uint64_t *other_vals; 60 | 61 | vqf_filter *filter; 62 | 63 | /* initialize vqf 
filter */ 64 | if ((filter = vqf_init(nslots)) == NULL) { 65 | fprintf(stderr, "Can't allocate vqf filter."); 66 | exit(EXIT_FAILURE); 67 | } 68 | 69 | /* Generate random values */ 70 | vals = (uint64_t*)malloc(nvals*sizeof(vals[0])); 71 | other_vals = (uint64_t*)malloc(nvals*sizeof(other_vals[0])); 72 | RAND_bytes((unsigned char *)vals, sizeof(*vals) * nvals); 73 | for (uint64_t i = 0; i < nvals; i++) { 74 | vals[i] = (1 * vals[i]); 75 | //vals[i] = (1 * rand()) % filter->metadata.range; 76 | } 77 | RAND_bytes((unsigned char *)other_vals, sizeof(*other_vals) * nvals); 78 | for (uint64_t i = 0; i < nvals; i++) { 79 | other_vals[i] = (1 * other_vals[i]); 80 | //other_vals[i] = (1 * other_vals[i]) % filter->metadata.range; 81 | } 82 | 83 | struct timeval start, end; 84 | struct timezone tzp; 85 | 86 | gettimeofday(&start, &tzp); 87 | /* Insert hashes in the vqf filter */ 88 | for (uint64_t i = 0; i < nvals; i++) { 89 | if (!vqf_insert(filter, vals[i])) { 90 | fprintf(stderr, "Insertion failed. 
LF: %f\n", i/(nslots*1.0)); 91 | exit(EXIT_FAILURE); 92 | } 93 | } 94 | gettimeofday(&end, &tzp); 95 | print_time_elapsed("Insertion time", &start, &end, nvals, "insert"); 96 | gettimeofday(&start, &tzp); 97 | for (uint64_t i = 0; i < nvals; i++) { 98 | if (!vqf_is_present(filter, vals[i])) { 99 | fprintf(stderr, "Lookup failed for %ld index: %ld\n", vals[i], i); 100 | exit(EXIT_FAILURE); 101 | } 102 | } 103 | gettimeofday(&end, &tzp); 104 | print_time_elapsed("Lookup time", &start, &end, nvals, "successful lookup"); 105 | gettimeofday(&start, &tzp); 106 | uint64_t nfps = 0; 107 | /* Lookup hashes in the vqf filter */ 108 | for (uint64_t i = 0; i < nvals; i++) { 109 | if (vqf_is_present(filter, other_vals[i])) { 110 | nfps++; 111 | } 112 | } 113 | gettimeofday(&end, &tzp); 114 | print_time_elapsed("Random lookup:", &start, &end, nvals, "random lookup"); 115 | printf("%lu/%lu positives\n" 116 | "FP rate: 1/%f\n", 117 | nfps, nvals, 118 | 1.0 * nvals / nfps); 119 | 120 | gettimeofday(&start, &tzp); 121 | for (uint64_t i = 0; i < nvals; i++) { 122 | if (!vqf_remove(filter, vals[i])) { 123 | fprintf(stderr, "Remove failed for %ld and index %ld\n", vals[i], i); 124 | exit(EXIT_FAILURE); 125 | } 126 | } 127 | gettimeofday(&end, &tzp); 128 | print_time_elapsed("Remove time", &start, &end, nvals, "remove"); 129 | 130 | return 0; 131 | } 132 | -------------------------------------------------------------------------------- /src/main_id.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================ 3 | * 4 | * Filename: main.cc 5 | * 6 | * Author: Prashant Pandey (), ppandey2@cs.cmu.edu 7 | * Organization: Carnegie Mellon University 8 | * 9 | * ============================================================================ 10 | */ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include // portable to all x86 compilers 17 | #include 18 | #include 19 
| #include 20 | 21 | #include 22 | 23 | #include "vqf_filter.h" 24 | 25 | #define ITR 100000000 26 | 27 | uint64_t tv2usec(struct timeval *tv) { 28 | return 1000000 * tv->tv_sec + tv->tv_usec; 29 | } 30 | 31 | /* Print elapsed time using the start and end timeval */ 32 | void print_time_elapsed(const char* desc, struct timeval* start, struct 33 | timeval* end, uint64_t ops, const char *opname) 34 | { 35 | uint64_t elapsed_usecs = tv2usec(end) - tv2usec(start); 36 | printf("%s Total Time Elapsed: %f seconds", desc, 1.0*elapsed_usecs / 1000000); 37 | if (ops) { 38 | printf(" (%f nanoseconds/%s)", 1000.0 * elapsed_usecs / ops, opname); 39 | } 40 | printf("\n"); 41 | } 42 | 43 | int main(int argc, char **argv) 44 | { 45 | if (argc < 2) { 46 | fprintf(stderr, "Please specify the log of the number of slots in the CQF.\n"); 47 | exit(1); 48 | } 49 | uint64_t qbits = atoi(argv[1]); 50 | uint64_t nslots = (1ULL << qbits); 51 | uint64_t nvals = 85*nslots/100; 52 | uint64_t *vals; 53 | uint64_t *other_vals; 54 | 55 | vqf_filter *filter; 56 | 57 | /* initialize vqf filter */ 58 | if ((filter = vqf_init(nslots)) == NULL) { 59 | fprintf(stderr, "Can't allocate vqf filter."); 60 | exit(EXIT_FAILURE); 61 | } 62 | 63 | /* Generate random values */ 64 | vals = (uint64_t*)malloc(nvals*sizeof(vals[0])); 65 | RAND_bytes((unsigned char *)vals, sizeof(*vals) * nvals); 66 | other_vals = (uint64_t*)malloc(nvals*sizeof(other_vals[0])); 67 | RAND_bytes((unsigned char *)other_vals, sizeof(*other_vals) * nvals); 68 | for (uint64_t i = 0; i < nvals; i++) { 69 | vals[i] = (1 * vals[i]); 70 | other_vals[i] = (1 * other_vals[i]); 71 | //vals[i] = (1 * vals[i]) % filter->metadata.range; 72 | //other_vals[i] = (1 * other_vals[i]) % filter->metadata.range; 73 | } 74 | 75 | struct timeval start, end; 76 | struct timezone tzp; 77 | 78 | gettimeofday(&start, &tzp); 79 | /* Insert hashes in the vqf filter */ 80 | for (uint64_t i = 0; i < nvals; i++) { 81 | if (!vqf_insert(filter, vals[i])) { 82 | 
fprintf(stderr, "Insertion failed"); 83 | exit(EXIT_FAILURE); 84 | } 85 | } 86 | gettimeofday(&end, &tzp); 87 | print_time_elapsed("Insertion time", &start, &end, nvals, "insert"); 88 | 89 | std::cout << "Starting workload\n"; 90 | srand(time(NULL)); 91 | uint64_t ret; 92 | uint8_t *oprs = (uint8_t *)malloc(ITR*sizeof(uint8_t)); 93 | uint64_t *opr_vals = (uint64_t *)malloc(ITR*sizeof(uint64_t)); 94 | 95 | for (uint64_t i = 0; i < ITR; i++) { 96 | oprs[i] = rand() % 3; 97 | if (oprs[i] == 0) { // delete 98 | opr_vals[i] = vals[rand() % nvals]; 99 | } else if (oprs[i] == 1) { // query 100 | opr_vals[i] = vals[rand() % nvals]; 101 | } else if (oprs[i] == 2) { // insert 102 | opr_vals[i] = other_vals[rand() % nvals]; 103 | } 104 | } 105 | gettimeofday(&start, &tzp); 106 | for (uint64_t i = 0; i < ITR; i++) { 107 | if (oprs[i] == 0) { // delete 108 | ret = vqf_remove(filter, opr_vals[i]); 109 | } else if (oprs[i] == 1) { // query 110 | ret = vqf_is_present(filter, opr_vals[i]); 111 | } else if (oprs[i] == 2) { // insert 112 | if (!vqf_insert(filter, opr_vals[i])) { 113 | fprintf(stderr, "Insertion failed"); 114 | exit(EXIT_FAILURE); 115 | } 116 | } 117 | } 118 | gettimeofday(&end, &tzp); 119 | std::cout << "ret: " << ret << '\n'; 120 | print_time_elapsed("Workload time", &start, &end, ITR, "operations"); 121 | 122 | return 0; 123 | } 124 | -------------------------------------------------------------------------------- /src/main_tx.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================ 3 | * 4 | * Authors: Prashant Pandey 5 | * Rob Johnson 6 | * 7 | * ============================================================================ 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "vqf_filter.h" 21 | 22 | uint64_t tv2usec(struct timeval *tv) { 23 | 
return 1000000 * tv->tv_sec + tv->tv_usec; 24 | } 25 | 26 | /* Print elapsed time using the start and end timeval */ 27 | void print_time_elapsed(const char* desc, struct timeval* start, struct 28 | timeval* end, uint64_t ops, const char *opname) 29 | { 30 | uint64_t elapsed_usecs = tv2usec(end) - tv2usec(start); 31 | printf("%s Total Time Elapsed: %f seconds", desc, 1.0*elapsed_usecs / 1000000); 32 | if (ops) { 33 | printf(" (%f nanoseconds/%s)", 1000.0 * elapsed_usecs / ops, opname); 34 | } 35 | printf("\n"); 36 | } 37 | 38 | 39 | typedef struct args { 40 | vqf_filter *cf; 41 | uint64_t *vals; 42 | uint64_t start; 43 | uint64_t end; 44 | } args; 45 | 46 | void *insert_bm(void *arg) 47 | { 48 | args *a = (args *)arg; 49 | for (uint32_t i = a->start; i <= a->end; i++) { 50 | int ret = vqf_insert(a->cf, a->vals[i]); 51 | if (ret < 0) { 52 | fprintf(stderr, "failed insertion for key: %lx.\n", a->vals[i]); 53 | abort(); 54 | } 55 | } 56 | return NULL; 57 | } 58 | 59 | void *query_bm(void *arg) 60 | { 61 | args *a = (args *)arg; 62 | for (uint32_t i = a->start; i <= a->end; i++) { 63 | int ret = vqf_is_present(a->cf, a->vals[i]); 64 | if (ret < 0) { 65 | fprintf(stderr, "failed insertion for key: %lx.\n", a->vals[i]); 66 | abort(); 67 | } 68 | } 69 | return NULL; 70 | } 71 | 72 | void multi_threaded_insertion(args args[], int tcnt) 73 | { 74 | pthread_t threads[tcnt]; 75 | 76 | for (int i = 0; i < tcnt; i++) { 77 | fprintf(stdout, "Thread %d bounds %ld %ld\n", i, args[i].start, args[i].end); 78 | if (pthread_create(&threads[i], NULL, &insert_bm, &args[i])) { 79 | fprintf(stderr, "Error creating thread\n"); 80 | exit(0); 81 | } 82 | } 83 | 84 | for (int i = 0; i < tcnt; i++) { 85 | if (pthread_join(threads[i], NULL)) { 86 | fprintf(stderr, "Error joining thread\n"); 87 | exit(0); 88 | } 89 | } 90 | } 91 | 92 | int main(int argc, char **argv) 93 | { 94 | if (argc < 3) { 95 | fprintf(stderr, "Please specify three arguments: \n \ 96 | 1. 
log of the number of slots in the CQF.\n \ 97 | 2. number of threads.\n"); 98 | exit(1); 99 | } 100 | uint64_t qbits = atoi(argv[1]); 101 | uint32_t tcnt = atoi(argv[2]); 102 | uint64_t nhashbits = qbits + 8; 103 | uint64_t nslots = (1ULL << qbits); 104 | uint64_t nvals = 85*nslots/100; 105 | 106 | uint64_t *vals; 107 | vqf_filter *filter; 108 | 109 | /* initialize vqf filter */ 110 | if ((filter = vqf_init(nslots)) == NULL) { 111 | fprintf(stderr, "Can't allocate vqf filter."); 112 | exit(EXIT_FAILURE); 113 | } 114 | 115 | /* Generate random values */ 116 | vals = (uint64_t*)calloc(nvals, sizeof(vals[0])); 117 | RAND_bytes((unsigned char *)vals, sizeof(*vals) * nvals); 118 | for (uint32_t i = 0; i < nvals; i++) { 119 | vals[i] = (1 * vals[i]); 120 | //vals[i] = (1 * vals[i]) % filter->metadata.range; 121 | } 122 | 123 | args *arg = (args*)malloc(tcnt * sizeof(args)); 124 | for (uint32_t i = 0; i < tcnt; i++) { 125 | arg[i].cf = filter; 126 | arg[i].vals = vals; 127 | arg[i].start = (nvals/tcnt) * i; 128 | arg[i].end = (nvals/tcnt) * (i + 1) - 1; 129 | } 130 | //fprintf(stdout, "Total number of items: %ld\n", arg[tcnt-1].end); 131 | 132 | struct timeval start, end; 133 | struct timezone tzp; 134 | 135 | gettimeofday(&start, &tzp); 136 | multi_threaded_insertion(arg, tcnt); 137 | gettimeofday(&end, &tzp); 138 | print_time_elapsed("Insertion time", &start, &end, nvals, "insert"); 139 | 140 | //fprintf(stdout, "Inserted all items: %ld\n", arg[tcnt-1].end); 141 | 142 | for (uint64_t i = 0; i < arg[tcnt-1].end; i++) { 143 | if (!vqf_is_present(filter, vals[i])) { 144 | fprintf(stderr, "Lookup failed for %ld", vals[i]); 145 | exit(EXIT_FAILURE); 146 | } 147 | } 148 | 149 | return 0; 150 | } 151 | -------------------------------------------------------------------------------- /src/shuffle_matrix_256.c: -------------------------------------------------------------------------------- 1 | #include // portable to all x86 compilers 2 | #include 3 | 4 | const __m256i RM0 = 
_mm256_setr_epi8( 5 | 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 6 | const __m256i RM1 = _mm256_setr_epi8( 7 | 0, 31, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 8 | const __m256i RM2 = _mm256_setr_epi8( 9 | 0, 1, 31, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 10 | const __m256i RM3 = _mm256_setr_epi8( 11 | 0, 1, 2, 31, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 12 | const __m256i RM4 = _mm256_setr_epi8( 13 | 0, 1, 2, 3, 31, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 14 | const __m256i RM5 = _mm256_setr_epi8( 15 | 0, 1, 2, 3, 4, 31, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 16 | const __m256i RM6 = _mm256_setr_epi8( 17 | 0, 1, 2, 3, 4, 5, 31, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 18 | const __m256i RM7 = _mm256_setr_epi8( 19 | 0, 1, 2, 3, 4, 5, 6, 31, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 20 | const __m256i RM8 = _mm256_setr_epi8( 21 | 0, 1, 2, 3, 4, 5, 6, 7, 31, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 22 | const __m256i RM9 = _mm256_setr_epi8( 23 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 31, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 24 | const __m256i RM10 = _mm256_setr_epi8( 25 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 31, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 26 | const __m256i RM11 = _mm256_setr_epi8( 27 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 31, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 28 | const __m256i RM12 
= _mm256_setr_epi8( 29 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 31, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 30 | const __m256i RM13 = _mm256_setr_epi8( 31 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 31, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 32 | const __m256i RM14 = _mm256_setr_epi8( 33 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 31, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 34 | const __m256i RM15 = _mm256_setr_epi8( 35 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 36 | const __m256i RM16 = _mm256_setr_epi8( 37 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 31, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 38 | const __m256i RM17 = _mm256_setr_epi8( 39 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 40 | const __m256i RM18 = _mm256_setr_epi8( 41 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 42 | const __m256i RM19 = _mm256_setr_epi8( 43 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 31, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 44 | const __m256i RM20 = _mm256_setr_epi8( 45 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 31, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 46 | const __m256i RM21 = _mm256_setr_epi8( 47 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 31, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 48 | const __m256i RM22 = _mm256_setr_epi8( 49 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 31, 22, 23, 24, 25, 26, 27, 28, 29, 30); 50 | const __m256i RM23 = _mm256_setr_epi8( 51 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 31, 23, 24, 25, 26, 27, 28, 29, 30); 52 | 
const __m256i RM24 = _mm256_setr_epi8( 53 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 31, 24, 25, 26, 27, 28, 29, 30); 54 | const __m256i RM25 = _mm256_setr_epi8( 55 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 31, 25, 26, 27, 28, 29, 30); 56 | const __m256i RM26 = _mm256_setr_epi8( 57 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 31, 26, 27, 28, 29, 30); 58 | const __m256i RM27 = _mm256_setr_epi8( 59 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 31, 27, 28, 29, 30); 60 | const __m256i RM28 = _mm256_setr_epi8( 61 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 31, 28, 29, 30); 62 | const __m256i RM29 = _mm256_setr_epi8( 63 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 29, 30); 64 | const __m256i RM30 = _mm256_setr_epi8( 65 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 30); 66 | const __m256i RM31 = _mm256_setr_epi8( 67 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 68 | 69 | __m256i RM [] = {RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM0, RM1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, RM31}; 70 | 71 | 72 | 73 | const __m256i LM0 = _mm256_setr_epi8( 74 | 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 75 | const __m256i LM1 = _mm256_setr_epi8( 76 | 0, 31, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 77 | const __m256i LM2 = _mm256_setr_epi8( 78 | 0, 1, 31, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 79 | const __m256i LM3 = _mm256_setr_epi8( 80 | 0, 1, 2, 31, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 81 | const __m256i LM4 = _mm256_setr_epi8( 82 | 0, 1, 2, 3, 31, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 83 | const __m256i LM5 = _mm256_setr_epi8( 84 | 0, 1, 2, 3, 4, 31, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 85 | const __m256i LM6 = _mm256_setr_epi8( 86 | 0, 1, 2, 3, 4, 5, 31, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 87 | const __m256i LM7 = _mm256_setr_epi8( 88 | 0, 1, 2, 3, 4, 5, 6, 31, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 89 | const __m256i LM8 = _mm256_setr_epi8( 90 | 0, 1, 2, 3, 4, 5, 6, 7, 31, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 91 | const __m256i LM9 = _mm256_setr_epi8( 92 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 31, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 93 | const __m256i LM10 = _mm256_setr_epi8( 94 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 31, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 95 | const __m256i LM11 = _mm256_setr_epi8( 96 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 31, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 97 | const __m256i LM12 = _mm256_setr_epi8( 98 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 31, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 99 | const __m256i LM13 = _mm256_setr_epi8( 100 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 31, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 101 | const __m256i LM14 = _mm256_setr_epi8( 102 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 31, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 103 | const __m256i LM15 = _mm256_setr_epi8( 104 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 105 | const __m256i LM16 = _mm256_setr_epi8( 106 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 31, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 107 | const __m256i LM17 = _mm256_setr_epi8( 108 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 109 | const __m256i LM18 = _mm256_setr_epi8( 110 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 31, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 111 | const __m256i LM19 = _mm256_setr_epi8( 112 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 31, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 113 | const __m256i LM20 = _mm256_setr_epi8( 114 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 31, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 115 | const __m256i LM21 = _mm256_setr_epi8( 116 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 31, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); 117 | const __m256i LM22 = _mm256_setr_epi8( 118 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 31, 22, 23, 24, 25, 26, 27, 28, 29, 30); 119 | const __m256i LM23 = _mm256_setr_epi8( 120 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 31, 23, 24, 25, 26, 27, 28, 29, 30); 121 | const __m256i LM24 = _mm256_setr_epi8( 122 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 31, 24, 25, 26, 27, 28, 29, 30); 123 | const __m256i LM25 = _mm256_setr_epi8( 124 | 0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 31, 25, 26, 27, 28, 29, 30); 125 | const __m256i LM26 = _mm256_setr_epi8( 126 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 31, 26, 27, 28, 29, 30); 127 | const __m256i LM27 = _mm256_setr_epi8( 128 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 31, 27, 28, 29, 30); 129 | const __m256i LM28 = _mm256_setr_epi8( 130 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 31, 28, 29, 30); 131 | const __m256i LM29 = _mm256_setr_epi8( 132 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 29, 30); 133 | const __m256i LM30 = _mm256_setr_epi8( 134 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 30); 135 | const __m256i LM31 = _mm256_setr_epi8( 136 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 137 | 138 | __m256i LM [] = {LM0, LM1, LM2, LM3, LM4, LM5, LM6, LM7, LM8, LM9, LM10, LM11, LM12, LM13, LM14, LM15, LM16, LM17, LM18, LM19, LM20, LM21, LM22, LM23, LM24, LM25, LM26, LM27, LM28, LM29, LM30, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31, LM31}; 139 | -------------------------------------------------------------------------------- /src/shuffle_matrix_512.c: -------------------------------------------------------------------------------- 1 | #include // portable to all x86 compilers 2 | #include 3 | 4 | const __m512i S0 = _mm512_set_epi8( 5 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 
22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63); 6 | const __m512i S1 = _mm512_set_epi8( 7 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 63, 0); 8 | const __m512i S2 = _mm512_set_epi8( 9 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 63, 1, 0); 10 | const __m512i S3 = _mm512_set_epi8( 11 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 63, 2, 1, 0); 12 | const __m512i S4 = _mm512_set_epi8( 13 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 63, 3, 2, 1, 0); 14 | const __m512i S5 = _mm512_set_epi8( 15 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 63, 4, 3, 2, 1, 0); 16 | const __m512i S6 = _mm512_set_epi8( 17 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 63, 5, 4, 3, 2, 1, 0); 18 | const __m512i S7 = _mm512_set_epi8( 19 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 
34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 63, 6, 5, 4, 3, 2, 1, 0); 20 | const __m512i S8 = _mm512_set_epi8( 21 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 63, 7, 6, 5, 4, 3, 2, 1, 0); 22 | const __m512i S9 = _mm512_set_epi8( 23 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 63, 8, 7, 6, 5, 4, 3, 2, 1, 0); 24 | const __m512i S10 = _mm512_set_epi8( 25 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 63, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 26 | const __m512i S11 = _mm512_set_epi8( 27 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 63, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 28 | const __m512i S12 = _mm512_set_epi8( 29 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 63, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 30 | const __m512i S13 = _mm512_set_epi8( 31 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 63, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 32 | const __m512i S14 = _mm512_set_epi8( 33 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 63, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 34 | const __m512i S15 = _mm512_set_epi8( 35 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 63, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 36 | const __m512i S16 = _mm512_set_epi8( 37 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 63, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 38 | const __m512i S17 = _mm512_set_epi8( 39 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 63, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 40 | const __m512i S18 = _mm512_set_epi8( 41 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 63, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 42 | const __m512i S19 = _mm512_set_epi8( 43 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 63, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 44 | const __m512i S20 = _mm512_set_epi8( 45 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 63, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 46 | const __m512i S21 = _mm512_set_epi8( 47 
| 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 63, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 48 | const __m512i S22 = _mm512_set_epi8( 49 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 63, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 50 | const __m512i S23 = _mm512_set_epi8( 51 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 63, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 52 | const __m512i S24 = _mm512_set_epi8( 53 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 63, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 54 | const __m512i S25 = _mm512_set_epi8( 55 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 63, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 56 | const __m512i S26 = _mm512_set_epi8( 57 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 63, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 58 | const __m512i S27 = _mm512_set_epi8( 59 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 63, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 
3, 2, 1, 0); 60 | const __m512i S28 = _mm512_set_epi8( 61 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 63, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 62 | const __m512i S29 = _mm512_set_epi8( 63 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 63, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 64 | const __m512i S30 = _mm512_set_epi8( 65 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 63, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 66 | const __m512i S31 = _mm512_set_epi8( 67 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 63, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 68 | const __m512i S32 = _mm512_set_epi8( 69 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 63, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 70 | const __m512i S33 = _mm512_set_epi8( 71 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 63, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 72 | const __m512i S34 = _mm512_set_epi8( 73 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 63, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 
19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 74 | const __m512i S35 = _mm512_set_epi8( 75 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 63, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 76 | const __m512i S36 = _mm512_set_epi8( 77 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 63, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 78 | const __m512i S37 = _mm512_set_epi8( 79 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 63, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 80 | const __m512i S38 = _mm512_set_epi8( 81 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 63, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 82 | const __m512i S39 = _mm512_set_epi8( 83 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 63, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 84 | const __m512i S40 = _mm512_set_epi8( 85 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 63, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 86 | const __m512i S41 = _mm512_set_epi8( 87 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 63, 40, 39, 38, 37, 36, 35, 
34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 88 | const __m512i S42 = _mm512_set_epi8( 89 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 63, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 90 | const __m512i S43 = _mm512_set_epi8( 91 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 63, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 92 | const __m512i S44 = _mm512_set_epi8( 93 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 63, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 94 | const __m512i S45 = _mm512_set_epi8( 95 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 63, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 96 | const __m512i S46 = _mm512_set_epi8( 97 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 63, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 98 | const __m512i S47 = _mm512_set_epi8( 99 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 63, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 100 | const __m512i S48 = _mm512_set_epi8( 101 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 
48, 63, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 102 | const __m512i S49 = _mm512_set_epi8( 103 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 63, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 104 | const __m512i S50 = _mm512_set_epi8( 105 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 63, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 106 | const __m512i S51 = _mm512_set_epi8( 107 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 63, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 108 | const __m512i S52 = _mm512_set_epi8( 109 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 63, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 110 | const __m512i S53 = _mm512_set_epi8( 111 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 63, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 112 | const __m512i S54 = _mm512_set_epi8( 113 | 62, 61, 60, 59, 58, 57, 56, 55, 54, 63, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 114 | const __m512i S55 = 
_mm512_set_epi8( 115 | 62, 61, 60, 59, 58, 57, 56, 55, 63, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 116 | const __m512i S56 = _mm512_set_epi8( 117 | 62, 61, 60, 59, 58, 57, 56, 63, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 118 | const __m512i S57 = _mm512_set_epi8( 119 | 62, 61, 60, 59, 58, 57, 63, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 120 | const __m512i S58 = _mm512_set_epi8( 121 | 62, 61, 60, 59, 58, 63, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 122 | const __m512i S59 = _mm512_set_epi8( 123 | 62, 61, 60, 59, 63, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 124 | const __m512i S60 = _mm512_set_epi8( 125 | 62, 61, 60, 63, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 126 | const __m512i S61 = _mm512_set_epi8( 127 | 62, 61, 63, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 
13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 128 | const __m512i S62 = _mm512_set_epi8( 129 | 62, 63, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 130 | const __m512i S63 = _mm512_set_epi8( 131 | 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 132 | 133 | __m512i SHUFFLE [] = {S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63}; 134 | 135 | const __m512i R0 = _mm512_set_epi8( 136 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); 137 | const __m512i R1 = _mm512_set_epi8( 138 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 0); 139 | const __m512i R2 = _mm512_set_epi8( 140 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 1, 0); 141 | const __m512i R3 = _mm512_set_epi8( 142 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 
46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 2, 1, 0); 143 | const __m512i R4 = _mm512_set_epi8( 144 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 3, 2, 1, 0); 145 | const __m512i R5 = _mm512_set_epi8( 146 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1, 0); 147 | const __m512i R6 = _mm512_set_epi8( 148 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 3, 2, 1, 0); 149 | const __m512i R7 = _mm512_set_epi8( 150 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0); 151 | const __m512i R8 = _mm512_set_epi8( 152 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 4, 3, 2, 1, 0); 153 | const __m512i R9 = _mm512_set_epi8( 154 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0); 155 | const __m512i R10 = _mm512_set_epi8( 156 
| 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 157 | const __m512i R11 = _mm512_set_epi8( 158 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 159 | const __m512i R12 = _mm512_set_epi8( 160 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 161 | const __m512i R13 = _mm512_set_epi8( 162 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 163 | const __m512i R14 = _mm512_set_epi8( 164 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 165 | const __m512i R15 = _mm512_set_epi8( 166 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 167 | const __m512i R16 = _mm512_set_epi8( 168 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 15, 14, 13, 12, 11, 10, 9, 8, 
7, 6, 5, 4, 3, 2, 1, 0); 169 | const __m512i R17 = _mm512_set_epi8( 170 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 171 | const __m512i R18 = _mm512_set_epi8( 172 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 173 | const __m512i R19 = _mm512_set_epi8( 174 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 175 | const __m512i R20 = _mm512_set_epi8( 176 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 177 | const __m512i R21 = _mm512_set_epi8( 178 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 179 | const __m512i R22 = _mm512_set_epi8( 180 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 181 | const __m512i R23 = _mm512_set_epi8( 182 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 
27, 26, 25, 24, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 183 | const __m512i R24 = _mm512_set_epi8( 184 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 185 | const __m512i R25 = _mm512_set_epi8( 186 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 187 | const __m512i R26 = _mm512_set_epi8( 188 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 189 | const __m512i R27 = _mm512_set_epi8( 190 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 191 | const __m512i R28 = _mm512_set_epi8( 192 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 193 | const __m512i R29 = _mm512_set_epi8( 194 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 195 | const __m512i R30 = _mm512_set_epi8( 196 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 
45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 197 | const __m512i R31 = _mm512_set_epi8( 198 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 199 | const __m512i R32 = _mm512_set_epi8( 200 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 201 | const __m512i R33 = _mm512_set_epi8( 202 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 203 | const __m512i R34 = _mm512_set_epi8( 204 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 205 | const __m512i R35 = _mm512_set_epi8( 206 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 207 | const __m512i R36 = _mm512_set_epi8( 208 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 209 | const __m512i R37 = _mm512_set_epi8( 210 | 63, 
63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 211 | const __m512i R38 = _mm512_set_epi8( 212 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 213 | const __m512i R39 = _mm512_set_epi8( 214 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 215 | const __m512i R40 = _mm512_set_epi8( 216 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 217 | const __m512i R41 = _mm512_set_epi8( 218 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 219 | const __m512i R42 = _mm512_set_epi8( 220 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 221 | const __m512i R43 = _mm512_set_epi8( 222 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 
5, 4, 3, 2, 1, 0); 223 | const __m512i R44 = _mm512_set_epi8( 224 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 225 | const __m512i R45 = _mm512_set_epi8( 226 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 227 | const __m512i R46 = _mm512_set_epi8( 228 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 229 | const __m512i R47 = _mm512_set_epi8( 230 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 231 | const __m512i R48 = _mm512_set_epi8( 232 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 233 | const __m512i R49 = _mm512_set_epi8( 234 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 235 | const __m512i R50 = _mm512_set_epi8( 236 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 
24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 237 | const __m512i R51 = _mm512_set_epi8( 238 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 239 | const __m512i R52 = _mm512_set_epi8( 240 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 241 | const __m512i R53 = _mm512_set_epi8( 242 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 243 | const __m512i R54 = _mm512_set_epi8( 244 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 55, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 245 | const __m512i R55 = _mm512_set_epi8( 246 | 63, 63, 62, 61, 60, 59, 58, 57, 56, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 247 | const __m512i R56 = _mm512_set_epi8( 248 | 63, 63, 62, 61, 60, 59, 58, 57, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 249 | const __m512i R57 = _mm512_set_epi8( 250 | 63, 63, 62, 61, 60, 59, 58, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 
42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 251 | const __m512i R58 = _mm512_set_epi8( 252 | 63, 63, 62, 61, 60, 59, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 253 | const __m512i R59 = _mm512_set_epi8( 254 | 63, 63, 62, 61, 60, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 255 | const __m512i R60 = _mm512_set_epi8( 256 | 63, 63, 62, 61, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 257 | const __m512i R61 = _mm512_set_epi8( 258 | 63, 63, 62, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 259 | const __m512i R62 = _mm512_set_epi8( 260 | 63, 63, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 261 | const __m512i R63 = _mm512_set_epi8( 262 | 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 263 | 264 | __m512i SHUFFLE_REMOVE [] = {R0, R1, R2, R3, R4, 
R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63}; 265 | -------------------------------------------------------------------------------- /src/shuffle_matrix_512_16.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | const __m512i S16_0 = _mm512_set_epi16( 5 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31); 6 | const __m512i S16_1 = _mm512_set_epi16( 7 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 31, 0); 8 | const __m512i S16_2 = _mm512_set_epi16( 9 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 31, 1, 0); 10 | const __m512i S16_3 = _mm512_set_epi16( 11 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 31, 2, 1, 0); 12 | const __m512i S16_4 = _mm512_set_epi16( 13 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 31, 3, 2, 1, 0); 14 | const __m512i S16_5 = _mm512_set_epi16( 15 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 31, 4, 3, 2, 1, 0); 16 | const __m512i S16_6 = _mm512_set_epi16( 17 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 31, 5, 4, 3, 2, 1, 0); 18 | const __m512i S16_7 = _mm512_set_epi16( 19 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 31, 6, 5, 4, 3, 2, 1, 0); 20 | const __m512i S16_8 = _mm512_set_epi16( 21 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 
9, 8, 31, 7, 6, 5, 4, 3, 2, 1, 0); 22 | const __m512i S16_9 = _mm512_set_epi16( 23 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 31, 8, 7, 6, 5, 4, 3, 2, 1, 0); 24 | const __m512i S16_10 = _mm512_set_epi16( 25 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 31, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 26 | const __m512i S16_11 = _mm512_set_epi16( 27 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 31, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 28 | const __m512i S16_12 = _mm512_set_epi16( 29 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 31, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 30 | const __m512i S16_13 = _mm512_set_epi16( 31 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 31, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 32 | const __m512i S16_14 = _mm512_set_epi16( 33 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 31, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 34 | const __m512i S16_15 = _mm512_set_epi16( 35 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 31, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 36 | const __m512i S16_16 = _mm512_set_epi16( 37 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 31, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 38 | const __m512i S16_17 = _mm512_set_epi16( 39 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 31, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 40 | const __m512i S16_18 = _mm512_set_epi16( 41 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 31, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 42 | const __m512i S16_19 = _mm512_set_epi16( 43 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 31, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 44 | const __m512i S16_20 = _mm512_set_epi16( 45 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 
31, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 46 | const __m512i S16_21 = _mm512_set_epi16( 47 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 31, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 48 | const __m512i S16_22 = _mm512_set_epi16( 49 | 30, 29, 28, 27, 26, 25, 24, 23, 22, 31, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 50 | const __m512i S16_23 = _mm512_set_epi16( 51 | 30, 29, 28, 27, 26, 25, 24, 23, 31, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 52 | const __m512i S16_24 = _mm512_set_epi16( 53 | 30, 29, 28, 27, 26, 25, 24, 31, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 54 | const __m512i S16_25 = _mm512_set_epi16( 55 | 30, 29, 28, 27, 26, 25, 31, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 56 | const __m512i S16_26 = _mm512_set_epi16( 57 | 30, 29, 28, 27, 26, 31, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 58 | const __m512i S16_27 = _mm512_set_epi16( 59 | 30, 29, 28, 27, 31, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 60 | const __m512i S16_28 = _mm512_set_epi16( 61 | 30, 29, 28, 31, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 62 | const __m512i S16_29 = _mm512_set_epi16( 63 | 30, 29, 31, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 64 | const __m512i S16_30 = _mm512_set_epi16( 65 | 30, 31, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 66 | const __m512i S16_31 = _mm512_set_epi16( 67 | 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 68 | 69 | __m512i SHUFFLE16 [] = {S16_0, S16_1, S16_2, 
S16_3, S16_4, S16_5, S16_6, S16_7, S16_8, S16_9, S16_10, S16_11, S16_12, S16_13, S16_14, S16_15, S16_16, S16_17, S16_18, S16_19, S16_20, S16_21, S16_22, S16_23, S16_24, S16_25, S16_26, S16_27, S16_28, S16_29, S16_30, S16_31}; 70 | 71 | const __m512i R16_0 = _mm512_set_epi16( 72 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); 73 | const __m512i R16_1 = _mm512_set_epi16( 74 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 0); 75 | const __m512i R16_2 = _mm512_set_epi16( 76 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 1, 0); 77 | const __m512i R16_3 = _mm512_set_epi16( 78 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 2, 1, 0); 79 | const __m512i R16_4 = _mm512_set_epi16( 80 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 3, 2, 1, 0); 81 | const __m512i R16_5 = _mm512_set_epi16( 82 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1, 0); 83 | const __m512i R16_6 = _mm512_set_epi16( 84 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 3, 2, 1, 0); 85 | const __m512i R16_7 = _mm512_set_epi16( 86 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0); 87 | const __m512i R16_8 = _mm512_set_epi16( 88 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 4, 3, 2, 1, 0); 89 | const __m512i R16_9 = _mm512_set_epi16( 90 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 8, 7, 6, 5, 4, 3, 2, 1, 0); 91 | const __m512i R16_10 = _mm512_set_epi16( 92 | 31, 31, 30, 29, 
28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 93 | const __m512i R16_11 = _mm512_set_epi16( 94 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 95 | const __m512i R16_12 = _mm512_set_epi16( 96 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 97 | const __m512i R16_13 = _mm512_set_epi16( 98 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 99 | const __m512i R16_14 = _mm512_set_epi16( 100 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 101 | const __m512i R16_15 = _mm512_set_epi16( 102 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 103 | const __m512i R16_16 = _mm512_set_epi16( 104 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 105 | const __m512i R16_17 = _mm512_set_epi16( 106 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 107 | const __m512i R16_18 = _mm512_set_epi16( 108 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 109 | const __m512i R16_19 = _mm512_set_epi16( 110 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 111 | const __m512i R16_20 = _mm512_set_epi16( 112 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 113 | const __m512i R16_21 = _mm512_set_epi16( 114 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 115 | const 
__m512i R16_22 = _mm512_set_epi16( 116 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 23, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 117 | const __m512i R16_23 = _mm512_set_epi16( 118 | 31, 31, 30, 29, 28, 27, 26, 25, 24, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 119 | const __m512i R16_24 = _mm512_set_epi16( 120 | 31, 31, 30, 29, 28, 27, 26, 25, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 121 | const __m512i R16_25 = _mm512_set_epi16( 122 | 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 123 | const __m512i R16_26 = _mm512_set_epi16( 124 | 31, 31, 30, 29, 28, 27, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 125 | const __m512i R16_27 = _mm512_set_epi16( 126 | 31, 31, 30, 29, 28, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 127 | const __m512i R16_28 = _mm512_set_epi16( 128 | 31, 31, 30, 29, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 129 | const __m512i R16_29 = _mm512_set_epi16( 130 | 31, 31, 30, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 131 | const __m512i R16_30 = _mm512_set_epi16( 132 | 31, 31, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 133 | const __m512i R16_31 = _mm512_set_epi16( 134 | 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 135 | 136 | __m512i SHUFFLE_REMOVE16 [] = {R16_0, R16_1, R16_2, R16_3, R16_4, R16_5, R16_6, R16_7, R16_8, R16_9, R16_10, R16_11, R16_12, R16_13, R16_14, R16_15, R16_16, R16_17, R16_18, R16_19, R16_20, R16_21, R16_22, R16_23, R16_24, R16_25, R16_26, R16_27, R16_28, R16_29, R16_30, R16_31}; 
137 | -------------------------------------------------------------------------------- /src/vqf_filter.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================ 3 | * 4 | * Filename: vqf_filter.c 5 | * 6 | * Author: Prashant Pandey (), ppandey@berkeley.edu 7 | * Organization: LBNL/UCB 8 | * 9 | * ============================================================================ 10 | */ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include // portable to all x86 compilers 19 | #include 20 | 21 | #include "vqf_filter.h" 22 | #include "vqf_precompute.h" 23 | 24 | // ALT block check is set of 75% of the number of slots 25 | #if TAG_BITS == 8 26 | #define TAG_MASK 0xff 27 | #define QUQU_SLOTS_PER_BLOCK 48 28 | #define QUQU_BUCKETS_PER_BLOCK 80 29 | #define QUQU_CHECK_ALT 92 30 | #elif TAG_BITS == 12 31 | #define TAG_MASK 0xfff 32 | #define QUQU_SLOTS_PER_BLOCK 32 33 | #define QUQU_BUCKETS_PER_BLOCK 96 34 | #define QUQU_CHECK_ALT 104 35 | #elif TAG_BITS == 16 36 | #define TAG_MASK 0xffff 37 | #define QUQU_SLOTS_PER_BLOCK 28 38 | #define QUQU_BUCKETS_PER_BLOCK 36 39 | #define QUQU_CHECK_ALT 43 40 | #endif 41 | 42 | #ifdef __AVX512BW__ 43 | extern __m512i SHUFFLE []; 44 | extern __m512i SHUFFLE_REMOVE []; 45 | extern __m512i SHUFFLE16 []; 46 | extern __m512i SHUFFLE_REMOVE16 []; 47 | #endif 48 | 49 | #define LOCK_MASK (1ULL << 63) 50 | #define UNLOCK_MASK ~(1ULL << 63) 51 | 52 | static inline void lock(vqf_block& block) 53 | { 54 | #ifdef ENABLE_THREADS 55 | uint64_t *data; 56 | #if TAG_BITS == 8 57 | data = block.md + 1; 58 | #elif TAG_BITS == 16 59 | data = &block.md; 60 | #endif 61 | while ((__sync_fetch_and_or(data, LOCK_MASK) & (1ULL << 63)) != 0) {} 62 | #endif 63 | } 64 | 65 | static inline void unlock(vqf_block& block) 66 | { 67 | #ifdef ENABLE_THREADS 68 | uint64_t *data; 69 | #if TAG_BITS == 8 70 | data = 
block.md + 1; 71 | #elif TAG_BITS == 16 72 | data = &block.md; 73 | #endif 74 | __sync_fetch_and_and(data, UNLOCK_MASK); 75 | #endif 76 | } 77 | 78 | static inline void lock_blocks(vqf_filter * restrict filter, uint64_t index1, uint64_t index2) { 79 | #ifdef ENABLE_THREADS 80 | if (index1 < index2) { 81 | lock(filter->blocks[index1/QUQU_BUCKETS_PER_BLOCK]); 82 | lock(filter->blocks[index2/QUQU_BUCKETS_PER_BLOCK]); 83 | } else { 84 | lock(filter->blocks[index2/QUQU_BUCKETS_PER_BLOCK]); 85 | lock(filter->blocks[index1/QUQU_BUCKETS_PER_BLOCK]); 86 | } 87 | #endif 88 | } 89 | 90 | static inline void unlock_blocks(vqf_filter * restrict filter, uint64_t index1, uint64_t index2) { 91 | #ifdef ENABLE_THREADS 92 | if (index1 < index2) { 93 | unlock(filter->blocks[index1/QUQU_BUCKETS_PER_BLOCK]); 94 | unlock(filter->blocks[index2/QUQU_BUCKETS_PER_BLOCK]); 95 | } else { 96 | unlock(filter->blocks[index2/QUQU_BUCKETS_PER_BLOCK]); 97 | unlock(filter->blocks[index1/QUQU_BUCKETS_PER_BLOCK]); 98 | } 99 | #endif 100 | } 101 | 102 | static inline int word_rank(uint64_t val) { 103 | return __builtin_popcountll(val); 104 | } 105 | 106 | // Returns the position of the rank'th 1. (rank = 0 returns the 1st 1) 107 | // Returns 64 if there are fewer than rank+1 1s. 
108 | static inline uint64_t word_select(uint64_t val, int rank) { 109 | val = _pdep_u64(one[rank], val); 110 | return _tzcnt_u64(val); 111 | } 112 | 113 | // select(vec, 0) -> -1 114 | // select(vec, i) -> 128, if i > popcnt(vec) 115 | static inline int64_t select_128_old(__uint128_t vector, uint64_t rank) { 116 | uint64_t lower_word = vector & 0xffffffffffffffff; 117 | uint64_t lower_pdep = _pdep_u64(one[rank], lower_word); 118 | //uint64_t lower_select = word_select(lower_word, rank); 119 | if (lower_pdep != 0) { 120 | //assert(rank < word_rank(lower_word)); 121 | return _tzcnt_u64(lower_pdep); 122 | } 123 | rank = rank - word_rank(lower_word); 124 | uint64_t higher_word = vector >> 64; 125 | return word_select(higher_word, rank) + 64; 126 | } 127 | 128 | static inline uint64_t lookup_64(uint64_t vector, uint64_t rank) { 129 | uint64_t lower_return = _pdep_u64(one[rank], vector) >> rank << (sizeof(uint64_t)/2); 130 | return lower_return; 131 | } 132 | 133 | static inline uint64_t lookup_128(uint64_t *vector, uint64_t rank) { 134 | uint64_t lower_word = vector[0]; 135 | uint64_t lower_rank = word_rank(lower_word); 136 | uint64_t lower_return = _pdep_u64(one[rank], lower_word) >> rank << sizeof(__uint128_t); 137 | int64_t higher_rank = (int64_t)rank - lower_rank; 138 | uint64_t higher_word = vector[1]; 139 | uint64_t higher_return = _pdep_u64(one[higher_rank], higher_word); 140 | higher_return <<= (64 + sizeof(__uint128_t) - rank); 141 | return lower_return + higher_return; 142 | } 143 | 144 | static inline int64_t select_64(uint64_t vector, uint64_t rank) { 145 | return _tzcnt_u64(lookup_64(vector, rank)); 146 | } 147 | 148 | static inline int64_t select_128(uint64_t *vector, uint64_t rank) { 149 | return _tzcnt_u64(lookup_128(vector, rank)); 150 | } 151 | 152 | //assumes little endian 153 | #if TAG_BITS == 8 154 | void print_bits(__uint128_t num, int numbits) 155 | { 156 | int i; 157 | for (i = 0 ; i < numbits; i++) { 158 | if (i != 0 && i % 8 == 0) { 159 | 
printf(":"); 160 | } 161 | printf("%d", ((num >> i) & 1) == 1); 162 | } 163 | puts(""); 164 | } 165 | 166 | void print_tags(uint8_t *tags, uint32_t size) { 167 | for (uint8_t i = 0; i < size; i++) 168 | printf("%d ", (uint32_t)tags[i]); 169 | printf("\n"); 170 | } 171 | 172 | void print_block(vqf_filter *filter, uint64_t block_index) { 173 | printf("block index: %ld\n", block_index); 174 | printf("metadata: "); 175 | uint64_t *md = filter->blocks[block_index].md; 176 | print_bits(*(__uint128_t *)md, QUQU_BUCKETS_PER_BLOCK + 177 | QUQU_SLOTS_PER_BLOCK); 178 | printf("tags: "); 179 | print_tags(filter->blocks[block_index].tags, QUQU_SLOTS_PER_BLOCK); 180 | } 181 | #elif TAG_BITS == 16 182 | void print_bits(uint64_t num, int numbits) 183 | { 184 | int i; 185 | for (i = 0 ; i < numbits; i++) { 186 | if (i != 0 && i % 8 == 0) { 187 | printf(":"); 188 | } 189 | printf("%d", ((num >> i) & 1) == 1); 190 | } 191 | puts(""); 192 | } 193 | void print_tags(uint16_t *tags, uint32_t size) { 194 | for (uint8_t i = 0; i < size; i++) 195 | printf("%d ", (uint32_t)tags[i]); 196 | printf("\n"); 197 | } 198 | void print_block(vqf_filter *filter, uint64_t block_index) { 199 | printf("block index: %ld\n", block_index); 200 | printf("metadata: "); 201 | uint64_t md = filter->blocks[block_index].md; 202 | print_bits(md, QUQU_BUCKETS_PER_BLOCK + QUQU_SLOTS_PER_BLOCK); 203 | printf("tags: "); 204 | print_tags(filter->blocks[block_index].tags, QUQU_SLOTS_PER_BLOCK); 205 | } 206 | #endif 207 | 208 | #ifdef __AVX512BW__ 209 | #if TAG_BITS == 8 210 | static inline void update_tags_512(vqf_block * restrict block, uint8_t index, uint8_t tag) { 211 | block->tags[47] = tag; // add tag at the end 212 | 213 | __m512i vector = _mm512_loadu_si512(reinterpret_cast<__m512i*>(block)); 214 | vector = _mm512_permutexvar_epi8(SHUFFLE[index], vector); 215 | _mm512_storeu_si512(reinterpret_cast<__m512i*>(block), vector); 216 | } 217 | 218 | static inline void remove_tags_512(vqf_block * restrict block, uint8_t 
index) { 219 | __m512i vector = _mm512_loadu_si512(reinterpret_cast<__m512i*>(block)); 220 | vector = _mm512_permutexvar_epi8(SHUFFLE_REMOVE[index], vector); 221 | _mm512_storeu_si512(reinterpret_cast<__m512i*>(block), vector); 222 | } 223 | #elif TAG_BITS == 16 224 | static inline void update_tags_512(vqf_block * restrict block, uint8_t index, uint16_t tag) { 225 | block->tags[27] = tag; // add tag at the end 226 | 227 | __m512i vector = _mm512_loadu_si512(reinterpret_cast<__m512i*>(block)); 228 | vector = _mm512_permutexvar_epi16(SHUFFLE16[index], vector); 229 | _mm512_storeu_si512(reinterpret_cast<__m512i*>(block), vector); 230 | } 231 | 232 | static inline void remove_tags_512(vqf_block * restrict block, uint8_t index) { 233 | __m512i vector = _mm512_loadu_si512(reinterpret_cast<__m512i*>(block)); 234 | vector = _mm512_permutexvar_epi16(SHUFFLE_REMOVE16[index], vector); 235 | _mm512_storeu_si512(reinterpret_cast<__m512i*>(block), vector); 236 | } 237 | #endif 238 | #else 239 | #if TAG_BITS == 8 240 | static inline void update_tags_512(vqf_block * restrict block, uint8_t index, uint8_t tag) { 241 | index -= 16; 242 | memmove(&block->tags[index + 1], &block->tags[index], sizeof(block->tags) / sizeof(block->tags[0]) - index - 1); 243 | block->tags[index] = tag; 244 | } 245 | 246 | static inline void remove_tags_512(vqf_block * restrict block, uint8_t index) { 247 | index -= 16; 248 | memmove(&block->tags[index], &block->tags[index+1], sizeof(block->tags) / sizeof(block->tags[0]) - index); 249 | } 250 | #elif TAG_BITS == 16 251 | static inline void update_tags_512(vqf_block * restrict block, uint8_t index, uint16_t tag) { 252 | index -= 4; 253 | memmove(&block->tags[index + 1], &block->tags[index], (sizeof(block->tags) / sizeof(block->tags[0]) - index - 1) * 2); 254 | block->tags[index] = tag; 255 | } 256 | 257 | static inline void remove_tags_512(vqf_block * restrict block, uint8_t index) { 258 | index -= 4; 259 | memmove(&block->tags[index], 
&block->tags[index+1], (sizeof(block->tags) / sizeof(block->tags[0]) - index) * 2); 260 | } 261 | #endif 262 | #endif 263 | 264 | #if 0 265 | // Shuffle using AVX2 vector instruction. It turns out memmove is faster compared to AVX2. 266 | inline __m256i cross_lane_shuffle(const __m256i & value, const __m256i & 267 | shuffle) 268 | { 269 | return _mm256_or_si256(_mm256_shuffle_epi8(value, _mm256_add_epi8(shuffle, 270 | K[0])), 271 | _mm256_shuffle_epi8(_mm256_permute4x64_epi64(value, 0x4E), 272 | _mm256_add_epi8(shuffle, K[1]))); 273 | } 274 | 275 | #define SHUFFLE_SIZE 32 276 | void shuffle_256(uint8_t * restrict source, __m256i shuffle) { 277 | __m256i vector = _mm256_loadu_si256(reinterpret_cast<__m256i*>(source)); 278 | 279 | vector = cross_lane_shuffle(vector, shuffle); 280 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(source), vector); 281 | } 282 | 283 | static inline void update_tags_256(uint8_t * restrict block, uint8_t index, 284 | uint8_t tag) { 285 | index = index + sizeof(__uint128_t); // offset index based on md field. 
286 | block[63] = tag; // add tag at the end 287 | shuffle_256(block + SHUFFLE_SIZE, RM[index]); // right block shuffle 288 | if (index < SHUFFLE_SIZE) { // if index lies in the left block 289 | std::swap(block[31], block[32]); // move tag to the end of left block 290 | shuffle_256(block, LM[index]); // shuffle left block 291 | } 292 | } 293 | #endif 294 | 295 | #if TAG_BITS == 8 296 | static inline void update_md(uint64_t *md, uint8_t index) { 297 | uint64_t carry = (md[0] >> 63) & carry_pdep_table[index]; 298 | md[1] = _pdep_u64(md[1], high_order_pdep_table[index]) | carry; 299 | md[0] = _pdep_u64(md[0], low_order_pdep_table[index]); 300 | } 301 | 302 | static inline void remove_md(uint64_t *md, uint8_t index) { 303 | uint64_t carry = (md[1] & carry_pdep_table[index]) << 63; 304 | md[1] = _pext_u64(md[1], high_order_pdep_table[index]) | (1ULL << 63); 305 | md[0] = _pext_u64(md[0], low_order_pdep_table[index]) | carry; 306 | } 307 | 308 | // number of 0s in the metadata is the number of tags. 309 | static inline uint64_t get_block_free_space(uint64_t *vector) { 310 | uint64_t lower_word = vector[0]; 311 | uint64_t higher_word = vector[1]; 312 | return word_rank(lower_word) + word_rank(higher_word); 313 | } 314 | #elif TAG_BITS == 16 315 | static inline void update_md(uint64_t *md, uint8_t index) { 316 | *md = _pdep_u64(*md, low_order_pdep_table[index]); 317 | } 318 | 319 | static inline void remove_md(uint64_t *md, uint8_t index) { 320 | *md = _pext_u64(*md, low_order_pdep_table[index]) | (1ULL << 63); 321 | } 322 | 323 | // number of 0s in the metadata is the number of tags. 324 | static inline uint64_t get_block_free_space(uint64_t vector) { 325 | return word_rank(vector); 326 | } 327 | #endif 328 | 329 | // Create n/log(n) blocks of log(n) slots. 330 | // log(n) is 51 given a cache line size. 331 | // n/51 blocks. 
332 | vqf_filter * vqf_init(uint64_t nslots) { 333 | vqf_filter *filter; 334 | 335 | uint64_t total_blocks = (nslots + QUQU_SLOTS_PER_BLOCK)/QUQU_SLOTS_PER_BLOCK; 336 | uint64_t total_size_in_bytes = sizeof(vqf_block) * total_blocks; 337 | 338 | filter = (vqf_filter *)malloc(sizeof(*filter) + total_size_in_bytes); 339 | printf("Size: %ld\n",total_size_in_bytes); 340 | assert(filter); 341 | 342 | filter->metadata.total_size_in_bytes = total_size_in_bytes; 343 | filter->metadata.nslots = total_blocks * QUQU_SLOTS_PER_BLOCK; 344 | #if TAG_BITS == 8 345 | filter->metadata.key_remainder_bits = 8; 346 | #elif TAG_BITS == 16 347 | filter->metadata.key_remainder_bits = 16; 348 | #endif 349 | filter->metadata.range = total_blocks * QUQU_BUCKETS_PER_BLOCK; 350 | //filter->metadata.range = total_blocks * QUQU_BUCKETS_PER_BLOCK * (1ULL << filter->metadata.key_remainder_bits); 351 | filter->metadata.nblocks = total_blocks; 352 | filter->metadata.nelts = 0; 353 | //printf("Range: %ld\n", filter->metadata.range); 354 | 355 | // memset to 1 356 | #if TAG_BITS == 8 357 | for (uint64_t i = 0; i < total_blocks; i++) { 358 | filter->blocks[i].md[0] = UINT64_MAX; 359 | filter->blocks[i].md[1] = UINT64_MAX; 360 | // reset the most significant bit of metadata for locking. 361 | filter->blocks[i].md[1] = filter->blocks[i].md[1] & ~(1ULL << 63); 362 | } 363 | #elif TAG_BITS == 16 364 | for (uint64_t i = 0; i < total_blocks; i++) { 365 | filter->blocks[i].md = UINT64_MAX; 366 | filter->blocks[i].md = filter->blocks[i].md & ~(1ULL << 63); 367 | } 368 | #endif 369 | 370 | return filter; 371 | } 372 | 373 | uint64_t alt_index(uint64_t index, uint64_t tag, uint64_t range) { 374 | return (uint64_t)(range - index + (tag * 0x5bd1e995)) % range; 375 | } 376 | 377 | // If the item goes in the i'th slot (starting from 0) in the block then 378 | // find the i'th 0 in the metadata, insert a 1 after that and shift the rest 379 | // by 1 bit. 
380 | // Insert the new tag at the end of its run and shift the rest by 1 slot. 381 | bool vqf_insert(vqf_filter * restrict filter, uint64_t hash) { 382 | vqf_metadata * restrict metadata = &filter->metadata; 383 | vqf_block * restrict blocks = filter->blocks; 384 | uint64_t key_remainder_bits = metadata->key_remainder_bits; 385 | uint64_t range = metadata->range; 386 | 387 | uint64_t block_index = hash % range; 388 | lock(blocks[block_index/QUQU_BUCKETS_PER_BLOCK]); 389 | #if TAG_BITS == 8 390 | uint64_t *block_md = blocks[block_index/QUQU_BUCKETS_PER_BLOCK].md; 391 | uint64_t block_free = get_block_free_space(block_md); 392 | #elif TAG_BITS == 16 393 | uint64_t *block_md = &blocks[block_index/QUQU_BUCKETS_PER_BLOCK].md; 394 | uint64_t block_free = get_block_free_space(*block_md); 395 | #endif 396 | uint64_t tag = (hash >> 32) & TAG_MASK; tag += (tag == 0); 397 | uint64_t alt_block_index = alt_index(block_index, tag, range); 398 | 399 | //printf("Insertion: Hash: %llu Tag: %ld Prm: %ld Alt: %ld\n", hash, tag, block_index, alt_block_index); 400 | //assert(alt_index(alt_block_index, tag, range) == block_index); 401 | 402 | __builtin_prefetch(&blocks[alt_block_index/QUQU_BUCKETS_PER_BLOCK]); 403 | 404 | if (block_free < QUQU_CHECK_ALT && block_index/QUQU_BUCKETS_PER_BLOCK != alt_block_index/QUQU_BUCKETS_PER_BLOCK) { 405 | unlock(blocks[block_index/QUQU_BUCKETS_PER_BLOCK]); 406 | lock_blocks(filter, block_index, alt_block_index); 407 | #if TAG_BITS == 8 408 | uint64_t *alt_block_md = blocks[alt_block_index/QUQU_BUCKETS_PER_BLOCK].md; 409 | uint64_t alt_block_free = get_block_free_space(alt_block_md); 410 | #elif TAG_BITS == 16 411 | uint64_t *alt_block_md = &blocks[alt_block_index/QUQU_BUCKETS_PER_BLOCK].md; 412 | uint64_t alt_block_free = get_block_free_space(*alt_block_md); 413 | #endif 414 | // pick the least loaded block 415 | if (alt_block_free > block_free) { 416 | unlock(blocks[block_index/QUQU_BUCKETS_PER_BLOCK]); 417 | block_index = alt_block_index; 418 | 
block_md = alt_block_md; 419 | } else if (block_free == QUQU_BUCKETS_PER_BLOCK) { 420 | unlock_blocks(filter, block_index, alt_block_index); 421 | fprintf(stderr, "vqf filter is full."); 422 | return false; 423 | //exit(EXIT_FAILURE); 424 | } else { 425 | unlock(blocks[alt_block_index/QUQU_BUCKETS_PER_BLOCK]); 426 | } 427 | 428 | } 429 | 430 | uint64_t index = block_index / QUQU_BUCKETS_PER_BLOCK; 431 | uint64_t offset = block_index % QUQU_BUCKETS_PER_BLOCK; 432 | 433 | #if TAG_BITS == 8 434 | uint64_t slot_index = select_128(block_md, offset); 435 | uint64_t select_index = slot_index + offset - sizeof(__uint128_t); 436 | #elif TAG_BITS == 16 437 | uint64_t slot_index = select_64(*block_md, offset); 438 | uint64_t select_index = slot_index + offset - (sizeof(uint64_t)/2); 439 | #endif 440 | /*printf("index: %ld tag: %ld offset: %ld\n", index, tag, offset);*/ 441 | /*print_block(filter, index);*/ 442 | 443 | update_tags_512(&blocks[index], slot_index,tag); 444 | update_md(block_md, select_index); 445 | /*print_block(filter, index);*/ 446 | unlock(blocks[block_index/QUQU_BUCKETS_PER_BLOCK]); 447 | return true; 448 | } 449 | 450 | static inline bool remove_tags(vqf_filter * restrict filter, uint64_t tag, 451 | uint64_t block_index) { 452 | uint64_t index = block_index / QUQU_BUCKETS_PER_BLOCK; 453 | uint64_t offset = block_index % QUQU_BUCKETS_PER_BLOCK; 454 | 455 | #ifdef __AVX512BW__ 456 | #if TAG_BITS == 8 457 | __m512i bcast = _mm512_set1_epi8(tag); 458 | __m512i block = 459 | _mm512_loadu_si512(reinterpret_cast<__m512i*>(&filter->blocks[index])); 460 | volatile __mmask64 result = _mm512_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ); 461 | #elif TAG_BITS == 16 462 | __m512i bcast = _mm512_set1_epi16(tag); 463 | __m512i block = 464 | _mm512_loadu_si512(reinterpret_cast<__m512i*>(&filter->blocks[index])); 465 | volatile __mmask64 result = _mm512_cmp_epi16_mask(bcast, block, _MM_CMPINT_EQ); 466 | #endif 467 | #else 468 | #if TAG_BITS == 8 469 | __m256i bcast = 
_mm256_set1_epi8(tag); 470 | __m256i block = _mm256_loadu_si256(reinterpret_cast<__m256i*>(&filter->blocks[index])); 471 | __m256i result1t = _mm256_cmpeq_epi8(bcast, block); 472 | __mmask32 result1 = _mm256_movemask_epi8(result1t); 473 | /*__mmask32 result1 = _mm256_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ);*/ 474 | block = _mm256_loadu_si256(reinterpret_cast<__m256i*>((uint8_t*)&filter->blocks[index]+32)); 475 | __m256i result2t = _mm256_cmpeq_epi8(bcast, block); 476 | __mmask32 result2 = _mm256_movemask_epi8(result2t); 477 | /*__mmask32 result2 = _mm256_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ);*/ 478 | uint64_t result = (uint64_t)result2 << 32 | (uint64_t)result1; 479 | #elif TAG_BITS == 16 480 | uint64_t alt_mask = 0x55555555; 481 | __m256i bcast = _mm256_set1_epi16(tag); 482 | __m256i block = _mm256_loadu_si256(reinterpret_cast<__m256i*>(&filter->blocks[index])); 483 | __m256i result1t = _mm256_cmpeq_epi16(bcast, block); 484 | __mmask32 result1 = _mm256_movemask_epi8(result1t); 485 | result1 = _pext_u32(result1, alt_mask); 486 | /*__mmask32 result1 = _mm256_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ);*/ 487 | block = _mm256_loadu_si256(reinterpret_cast<__m256i*>((uint8_t*)&filter->blocks[index]+32)); 488 | __m256i result2t = _mm256_cmpeq_epi16(bcast, block); 489 | __mmask32 result2 = _mm256_movemask_epi8(result2t); 490 | result2 = _pext_u32(result2, alt_mask); 491 | /*__mmask32 result2 = _mm256_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ);*/ 492 | uint64_t result = (uint64_t)result2 << 16 | (uint64_t)result1; 493 | #endif 494 | #endif 495 | 496 | if (result == 0) { 497 | // no matching tags, can bail 498 | return false; 499 | } 500 | 501 | #if TAG_BITS == 8 502 | uint64_t start = offset != 0 ? lookup_128(filter->blocks[index].md, offset - 503 | 1) : one[0] << 2 * sizeof(uint64_t); 504 | uint64_t end = lookup_128(filter->blocks[index].md, offset); 505 | #elif TAG_BITS == 16 506 | uint64_t start = offset != 0 ? 
lookup_64(filter->blocks[index].md, offset - 507 | 1) : one[0] << (sizeof(uint64_t)/2); 508 | uint64_t end = lookup_64(filter->blocks[index].md, offset); 509 | #endif 510 | uint64_t mask = end - start; 511 | 512 | uint64_t check_indexes = mask & result; 513 | if (check_indexes != 0) { // remove the first available tag 514 | vqf_block * restrict blocks = filter->blocks; 515 | uint64_t remove_index = __builtin_ctzll(check_indexes); 516 | remove_tags_512(&blocks[index], remove_index); 517 | #if TAG_BITS == 8 518 | remove_index = remove_index + offset - sizeof(__uint128_t); 519 | uint64_t *block_md = blocks[block_index / QUQU_BUCKETS_PER_BLOCK].md; 520 | remove_md(block_md, remove_index); 521 | #elif TAG_BITS == 16 522 | remove_index = remove_index + offset - sizeof(uint64_t); 523 | uint64_t *block_md = &blocks[block_index / QUQU_BUCKETS_PER_BLOCK].md; 524 | remove_md(block_md, remove_index); 525 | #endif 526 | return true; 527 | } else 528 | return false; 529 | } 530 | 531 | bool vqf_remove(vqf_filter * restrict filter, uint64_t hash) { 532 | vqf_metadata * restrict metadata = &filter->metadata; 533 | uint64_t key_remainder_bits = metadata->key_remainder_bits; 534 | uint64_t range = metadata->range; 535 | 536 | uint64_t block_index = hash % range; 537 | uint64_t tag = (hash >> 32) & TAG_MASK; tag += (tag == 0); 538 | uint64_t alt_block_index = alt_index(block_index, tag, range); 539 | //uint64_t alt_block_index = ((block_index ^ (tag * 0x5bd1e995)) % range); 540 | //printf("Removal: Hash: %llu Tag: %ld Prm: %ld Alt: %ld\n", hash, tag, block_index, alt_block_index); 541 | 542 | __builtin_prefetch(&filter->blocks[alt_block_index / QUQU_BUCKETS_PER_BLOCK]); 543 | 544 | return remove_tags(filter, tag, block_index) || remove_tags(filter, tag, alt_block_index); 545 | } 546 | 547 | static inline bool check_tags(vqf_filter * restrict filter, uint64_t tag, 548 | uint64_t block_index) { 549 | uint64_t index = block_index / QUQU_BUCKETS_PER_BLOCK; 550 | uint64_t offset = 
block_index % QUQU_BUCKETS_PER_BLOCK; 551 | 552 | #ifdef __AVX512BW__ 553 | #if TAG_BITS == 8 554 | __m512i bcast = _mm512_set1_epi8(tag); 555 | __m512i block = 556 | _mm512_loadu_si512(reinterpret_cast<__m512i*>(&filter->blocks[index])); 557 | volatile __mmask64 result = _mm512_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ); 558 | #elif TAG_BITS == 16 559 | __m512i bcast = _mm512_set1_epi16(tag); 560 | __m512i block = 561 | _mm512_loadu_si512(reinterpret_cast<__m512i*>(&filter->blocks[index])); 562 | volatile __mmask64 result = _mm512_cmp_epi16_mask(bcast, block, _MM_CMPINT_EQ); 563 | #endif 564 | #else 565 | #if TAG_BITS == 8 566 | __m256i bcast = _mm256_set1_epi8(tag); 567 | __m256i block = _mm256_loadu_si256(reinterpret_cast<__m256i*>(&filter->blocks[index])); 568 | __m256i result1t = _mm256_cmpeq_epi8(bcast, block); 569 | __mmask32 result1 = _mm256_movemask_epi8(result1t); 570 | /*__mmask32 result1 = _mm256_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ);*/ 571 | block = _mm256_loadu_si256(reinterpret_cast<__m256i*>((uint8_t*)&filter->blocks[index]+32)); 572 | __m256i result2t = _mm256_cmpeq_epi8(bcast, block); 573 | __mmask32 result2 = _mm256_movemask_epi8(result2t); 574 | /*__mmask32 result2 = _mm256_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ);*/ 575 | uint64_t result = (uint64_t)result2 << 32 | (uint64_t)result1; 576 | #elif TAG_BITS == 16 577 | uint64_t alt_mask = 0x55555555; 578 | __m256i bcast = _mm256_set1_epi16(tag); 579 | __m256i block = _mm256_loadu_si256(reinterpret_cast<__m256i*>(&filter->blocks[index])); 580 | __m256i result1t = _mm256_cmpeq_epi16(bcast, block); 581 | __mmask32 result1 = _mm256_movemask_epi8(result1t); 582 | result1 = _pext_u32(result1, alt_mask); 583 | /*__mmask32 result1 = _mm256_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ);*/ 584 | block = _mm256_loadu_si256(reinterpret_cast<__m256i*>((uint8_t*)&filter->blocks[index]+32)); 585 | __m256i result2t = _mm256_cmpeq_epi16(bcast, block); 586 | __mmask32 result2 = _mm256_movemask_epi8(result2t); 
587 | result2 = _pext_u32(result2, alt_mask); 588 | /*__mmask32 result2 = _mm256_cmp_epi8_mask(bcast, block, _MM_CMPINT_EQ);*/ 589 | uint64_t result = (uint64_t)result2 << 16 | (uint64_t)result1; 590 | #endif 591 | #endif 592 | 593 | if (result == 0) { 594 | // no matching tags, can bail 595 | return false; 596 | } 597 | 598 | #if TAG_BITS == 8 599 | uint64_t start = offset != 0 ? lookup_128(filter->blocks[index].md, offset - 600 | 1) : one[0] << 2 * sizeof(uint64_t); 601 | uint64_t end = lookup_128(filter->blocks[index].md, offset); 602 | #elif TAG_BITS == 16 603 | uint64_t start = offset != 0 ? lookup_64(filter->blocks[index].md, offset - 604 | 1) : one[0] << (sizeof(uint64_t)/2); 605 | uint64_t end = lookup_64(filter->blocks[index].md, offset); 606 | #endif 607 | uint64_t mask = end - start; 608 | return (mask & result) != 0; 609 | } 610 | 611 | // If the item goes in the i'th slot (starting from 0) in the block then 612 | // select(i) - i is the slot index for the end of the run. 613 | bool vqf_is_present(vqf_filter * restrict filter, uint64_t hash) { 614 | vqf_metadata * restrict metadata = &filter->metadata; 615 | //vqf_block * restrict blocks = filter->blocks; 616 | uint64_t key_remainder_bits = metadata->key_remainder_bits; 617 | uint64_t range = metadata->range; 618 | 619 | uint64_t block_index = hash % range; 620 | uint64_t tag = (hash >> 32) & TAG_MASK; tag += (tag == 0); 621 | //uint64_t alt_block_index = ((block_index ^ (tag * 0x5bd1e995)) % range); 622 | uint64_t alt_block_index = alt_index(block_index, tag, range); 623 | //printf("Query: Hash: %llu Tag: %ld Prm: %ld Alt: %ld\n", hash, tag, block_index, alt_block_index); 624 | 625 | __builtin_prefetch(&filter->blocks[alt_block_index / QUQU_BUCKETS_PER_BLOCK]); 626 | 627 | return check_tags(filter, tag, block_index) || check_tags(filter, tag, alt_block_index); 628 | 629 | /*if (!ret) {*/ 630 | /*printf("tag: %ld offset: %ld\n", tag, block_index % QUQU_SLOTS_PER_BLOCK);*/ 631 | /*print_block(filter, 
block_index / QUQU_SLOTS_PER_BLOCK);*/ 632 | /*print_block(filter, alt_block_index / QUQU_SLOTS_PER_BLOCK);*/ 633 | /*}*/ 634 | } 635 | 636 | --------------------------------------------------------------------------------