├── pic ├── hayai.png ├── nuc.png └── kakizome.png ├── .travis.yml ├── fig ├── rec_len.hg38.png ├── rec_prec.dm6.png └── rec_prec.hg38.png ├── .gitignore ├── sassert.h ├── universal.c ├── LICENSE.minimap.txt ├── LICENSE.txt ├── Makefile ├── Makefile.core ├── arch ├── arch.h ├── x86_64_sse41 │ ├── vector.h │ ├── v2i64.h │ ├── v4i32.h │ ├── v16i16.h │ ├── v2i32.h │ ├── v16i8.h │ ├── v32i8.h │ ├── v32i16.h │ ├── arch_util.h │ ├── v64i8.h │ └── v64i16.h ├── x86_64_avx2 │ ├── vector.h │ ├── v2i64.h │ ├── v16i16.h │ ├── v4i32.h │ ├── v32i16.h │ ├── v2i32.h │ ├── v16i8.h │ ├── v32i8.h │ ├── arch_util.h │ ├── v64i16.h │ └── v64i8.h └── vector_alias.h ├── log.h ├── README.minimap.md ├── ksort.h ├── kvec.h └── gaba.h /pic/hayai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ocxtal/minialign/HEAD/pic/hayai.png -------------------------------------------------------------------------------- /pic/nuc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ocxtal/minialign/HEAD/pic/nuc.png -------------------------------------------------------------------------------- /pic/kakizome.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ocxtal/minialign/HEAD/pic/kakizome.png -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: C 2 | script: 3 | - make clean && make 4 | - ./minialign -v 5 | -------------------------------------------------------------------------------- /fig/rec_len.hg38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ocxtal/minialign/HEAD/fig/rec_len.hg38.png -------------------------------------------------------------------------------- /fig/rec_prec.dm6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ocxtal/minialign/HEAD/fig/rec_prec.dm6.png -------------------------------------------------------------------------------- /fig/rec_prec.hg38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ocxtal/minialign/HEAD/fig/rec_prec.hg38.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | *.o 3 | *.a 4 | minialign* 5 | samsplit* 6 | *.txt 7 | *.fa 8 | *.sam 9 | *.bam 10 | *.bai 11 | stuff 12 | .DS_Store 13 | .gdb_history 14 | -------------------------------------------------------------------------------- /sassert.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file sassert.h 4 | * 5 | * @brief static assertion 6 | */ 7 | #ifndef _SASSERT_H_INCLUDED 8 | #define _SASSERT_H_INCLUDED 9 | 10 | #include 11 | 12 | /** 13 | * assert macros 14 | */ 15 | #define _sa_cat_intl(x, y) x##y 16 | #define _sa_cat(x, y) _sa_cat_intl(x, y) 17 | /* static assert */ 18 | #define _static_assert(expr) typedef char _sa_cat(_st, __LINE__)[(expr) ? 
1 : -1] 19 | /* check offset equality of elements in two structs */ 20 | #define _static_assert_offset(st1, mb1, st2, mb2, ofs) \ 21 | _static_assert(offsetof(st1, mb1) == offsetof(st2, mb2) + ofs) 22 | 23 | 24 | #endif /* #ifndef _SASSERT_H_INCLUDED */ 25 | /** 26 | * end of sassert.h 27 | */ 28 | -------------------------------------------------------------------------------- /universal.c: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file universal.c 4 | * 5 | * @brief main function dispatcher 6 | */ 7 | #define ARCH_CAP 8 | 9 | #include 10 | #include "arch/arch.h" 11 | 12 | #if defined(__x86_64__) 13 | int main_sse41(int argc, char *argv[]); 14 | int main_avx2(int argc, char *argv[]); 15 | #elif defined(AARCH64) 16 | #elif defined(PPC64) 17 | #endif 18 | 19 | int main(int argc, char *argv[], char *envp[]) 20 | { 21 | #if defined(__x86_64__) 22 | if((arch_cap() & ARCH_CAP_AVX2) != 0) { 23 | return(main_avx2(argc, argv)); 24 | } 25 | if((arch_cap() & ARCH_CAP_SSE41) != 0) { 26 | return(main_sse41(argc, argv)); 27 | } 28 | #elif defined(AARCH64) 29 | #elif defined(PPC64) 30 | #endif 31 | 32 | fprintf(stderr, "[E::main] no main function found.\n"); 33 | return(2); 34 | } 35 | 36 | /** 37 | * end of universal.c 38 | */ 39 | -------------------------------------------------------------------------------- /LICENSE.minimap.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2015 Broad Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2016 Hajime Suzuki 4 | Copyright (c) 2015 Broad Institute 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # compilers 3 | CC = gcc 4 | GIT = git 5 | 6 | # compiler flags 7 | OFLAGS = -O3 8 | WFLAGS = -Wall -Wno-unused-function 9 | CFLAGS = $(OFLAGS) $(WFLAGS) -std=c99 -pipe -DMM_VERSION=\"$(VERSION)\" 10 | LDFLAGS = -lm -lz -lpthread 11 | 12 | # default version string is parsed from git tags, otherwise extracted from the source 13 | VERSION = $(shell $(GIT) describe --tags || grep "define MM_VERSION" minialign.c | grep -o '".*"' | sed 's/"//g') 14 | 15 | # install directory and binary name 16 | PREFIX = /usr/local 17 | TARGET = minialign 18 | 19 | all: native 20 | 21 | native: 22 | $(MAKE) -f Makefile.core CC=$(CC) CFLAGS='$(CFLAGS)' all 23 | $(CC) -o $(TARGET) $(CFLAGS) minialign.o gaba.*.o $(LDFLAGS) 24 | 25 | sse41 avx2: 26 | $(MAKE) -f Makefile.core CC=$(CC) CFLAGS='$(CFLAGS) -DUNITTEST=0' ARCH=`echo $@ | tr a-z A-Z` NAMESPACE=$@ all 27 | 28 | universal: sse41 avx2 29 | $(CC) -o $(TARGET) $(CFLAGS) -mtune=generic universal.c minialign.*.o gaba.*.o $(LDFLAGS) 30 | 31 | clean: 32 | rm -fr gmon.out *.o a.out $(TARGET) *~ *.a *.dSYM session* 33 | 34 | install: 35 | mkdir -p $(PREFIX)/bin 36 | cp $(TARGET) $(PREFIX)/bin/$(TARGET) 37 | 38 | uninstall: 39 | rm -f $(PREFIX)/bin/$(TARGET) 40 | 41 | gaba.c: gaba.h log.h unittest.h sassert.h 42 | gaba_wrap.h: gaba.h log.h unittest.h sassert.h 43 | minialign.c: kvec.h ksort.h gaba_wrap.h lmm.h unittest.h sassert.h 44 | -------------------------------------------------------------------------------- /Makefile.core: -------------------------------------------------------------------------------- 1 | 2 | # load architecture-dependent optimization flags, the default is `-march=native' 3 | ARCH = NATIVE 4 | NATIVE_FLAGS = $(shell bash -c "if [[ $(CC) = icc* ]]; then echo '-march=native' ; else echo '-march=native'; fi") 5 | SSE41_FLAGS = $(shell bash -c "if [[ $(CC) = icc* ]]; then echo '-msse4.2'; else echo '-msse4.2 -mpopcnt'; fi") 6 | AVX2_FLAGS = $(shell bash -c "if [[ $(CC) = icc* ]]; then echo '-march=core-avx2'; else echo '-mavx2 -mbmi -mbmi2 -mlzcnt -mpopcnt'; fi") 7 | ARCHFLAGS = $($(ARCH)_FLAGS) 8 | 9 | # add suffix if namespace is specified 10 | SUFFIX = $(NAMESPACE:$(NAMESPACE)=.$(NAMESPACE)) 11 | SUFFIXFLAGS = $(NAMESPACE:$(NAMESPACE)=-DNAMESPACE=$(NAMESPACE)) 12 | 13 | # compose flags 14 | CFLAGS_NOWARN = $(shell bash -c "if [[ $(CC) = icc* ]]; then echo '-Wno-unused-variable'; else echo '-Wno-unused-variable -Wno-unused-label'; fi") 15 | CFLAGS_INTL = $(CFLAGS) $(CFLAGS_NOWARN) $(ARCHFLAGS) $(SUFFIXFLAGS) 16 | 17 | # expand objects 18 | MINIALIGN_OBJS = minialign$(SUFFIX).o 19 | GABA_OBJS = $(shell bash -c "echo gaba.{linear,affine,combined}.{16,32,64}$(SUFFIX).o") 20 | 21 | 22 | all: $(MINIALIGN_OBJS) $(GABA_OBJS) 23 | 24 | $(MINIALIGN_OBJS): minialign.c 25 | $(CC) -c -o $@ $(CFLAGS_INTL) $< 26 | 27 | $(GABA_OBJS): gaba.c 28 | $(CC) -c -o $@ $(CFLAGS_INTL) 
-DMODEL=`echo $@ | cut -d'.' -f2 | tr a-z A-Z` -DBW=`echo $@ | cut -d'.' -f3` -DSUFFIX -DBIT=2 $< 29 | 30 | nowrap: minialign.c gaba.c 31 | $(CC) -c -o minialign.o $(CFLAGS_INTL) -DGABA_NOWRAP minialign.c 32 | $(CC) -c -o gaba.o $(CFLAGS_INTL) -DMODEL=COMBINED -DBW=64 -DBIT=2 gaba.c 33 | 34 | gaba.c: gaba.h log.h unittest.h sassert.h 35 | minialign.c: kvec.h ksort.h gaba_wrap.h gaba.h lmm.h unittest.h sassert.h 36 | gaba_wrap.h: gaba.h log.h unittest.h sassert.h 37 | -------------------------------------------------------------------------------- /arch/arch.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file arch.h 4 | */ 5 | #ifndef _ARCH_H_INCLUDED 6 | #define _ARCH_H_INCLUDED 7 | 8 | #include 9 | 10 | /** 11 | * define capability flags for all processors 12 | */ 13 | #define ARCH_CAP_SSE41 ( 0x01 ) 14 | #define ARCH_CAP_AVX2 ( 0x04 ) 15 | #define ARCH_CAP_NEON ( 0x10 ) 16 | #define ARCH_CAP_ALTIVEC ( 0x20 ) 17 | 18 | 19 | #ifdef __x86_64__ 20 | # if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) 21 | /* the compiler is gcc, not clang nor icc */ 22 | # define _ARCH_GCC_VERSION ( __GNUC__ * 100 + __GNUC_MINOR__ * 10 + __GNUC_PATCHLEVEL__ ) 23 | # elif defined(__GNUC__) && (defined(__clang__) || defined(__INTEL_COMPILER)) 24 | # define _ARCH_GCC_COMPAT ( __GNUC__ * 100 + __GNUC_MINOR__ * 10 + __GNUC_PATCHLEVEL__ ) 25 | # endif 26 | 27 | /* import architectured depentent stuffs and SIMD vectors */ 28 | # if defined(__AVX2__) 29 | # include "x86_64_avx2/arch_util.h" 30 | # include "x86_64_avx2/vector.h" 31 | # elif defined(__SSE4_1__) 32 | # include "x86_64_sse41/arch_util.h" 33 | # include "x86_64_sse41/vector.h" 34 | # elif !defined(ARCH_CAP) /* arch.h will be included without SIMD flag to check capability */ 35 | # error "No SIMD instruction set enabled. Check if SSE4.1 or AVX2 instructions are available and add `-msse4.1' or `-mavx2' to CFLAGS." 36 | # endif 37 | 38 | /* map reverse-complement sequence out of the canonical-formed address */ 39 | #define GREF_SEQ_LIM ( (uint8_t const *)0x800000000000 ) 40 | 41 | /** 42 | * @macro arch_cap 43 | * @brief returns architecture capability flag 44 | * 45 | * CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 && 46 | * CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1 && 47 | * CPUID.(EAX=80000001H):ECX.LZCNT[bit 5]==1 48 | */ 49 | #define arch_cap() ({ \ 50 | uint32_t eax, ebx, ecx, edx; \ 51 | __asm__ volatile("cpuid" \ 52 | : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) \ 53 | : "a"(0x01), "c"(0x00)); \ 54 | uint64_t sse4_cap = (ecx & 0x180000) != 0; \ 55 | __asm__ volatile("cpuid" \ 56 | : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) \ 57 | : "a"(0x07), "c"(0x00)); \ 58 | uint64_t avx2_cap = (ebx & (0x01<<5)) != 0; \ 59 | uint64_t bmi1_cap = (ebx & (0x01<<3)) != 0; \ 60 | __asm__ volatile("cpuid" \ 61 | : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) \ 62 | : "a"(0x80000001), "c"(0x00)); \ 63 | uint64_t lzcnt_cap = (ecx & (0x01<<5)) != 0; \ 64 | ((sse4_cap != 0) ? ARCH_CAP_SSE41 : 0) | ((avx2_cap && bmi1_cap && lzcnt_cap) ? 
ARCH_CAP_AVX2 : 0); \ 65 | }) 66 | 67 | #endif 68 | 69 | #ifdef AARCH64 70 | 71 | /* use x86_64 default */ 72 | #define GREF_SEQ_LIM ( (uint8_t const *)0x800000000000 ) 73 | 74 | #endif 75 | 76 | #ifdef PPC64 77 | 78 | /* use x86_64 default */ 79 | #define GREF_SEQ_LIM ( (uint8_t const *)0x800000000000 ) 80 | 81 | #endif 82 | 83 | /* 84 | * tests 85 | */ 86 | #if !defined(ARCH_CAP) && (!defined(_ARCH_UTIL_H_INCLUDED) || !defined(_VECTOR_H_INCLUDED)) 87 | # error "No SIMD environment detected. Check CFLAGS." 88 | #endif 89 | 90 | #ifndef GREF_SEQ_LIM 91 | # error "No architecuture detected. Check CFLAGS." 92 | #endif 93 | 94 | 95 | #endif /* #ifndef _ARCH_H_INCLUDED */ 96 | /** 97 | * end of arch.h 98 | */ 99 | -------------------------------------------------------------------------------- /log.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file log.h 4 | * 5 | * @brief log handler 6 | */ 7 | #ifndef _LOG_H_INCLUDED 8 | #define _LOG_H_INCLUDED 9 | 10 | #include 11 | #include 12 | 13 | /** 14 | * color outputs 15 | */ 16 | #define RED(x) "\x1b[31m" \ 17 | x \ 18 | "\x1b[39m" 19 | #define GREEN(x) "\x1b[32m" \ 20 | x \ 21 | "\x1b[39m" 22 | #define YELLOW(x) "\x1b[33m" \ 23 | x \ 24 | "\x1b[39m" 25 | #define BLUE(x) "\x1b[34m" \ 26 | x \ 27 | "\x1b[39m" 28 | #define MAGENTA(x) "\x1b[35m" \ 29 | x \ 30 | "\x1b[39m" 31 | #define CYAN(x) "\x1b[36m" \ 32 | x \ 33 | "\x1b[39m" 34 | #define WHITE(x) "\x1b[37m" \ 35 | x \ 36 | "\x1b[39m" 37 | 38 | /** 39 | * @macro DEBUG 40 | */ 41 | // #define DEBUG ( 1 ) 42 | 43 | /** 44 | * @macro dbprintf 45 | */ 46 | #ifdef DEBUG 47 | #undef trap 48 | #undef debug 49 | #undef debug_impl 50 | #undef dbprintf 51 | #undef print_lane 52 | 53 | #define trap() { \ 54 | *((volatile uint8_t *)NULL); \ 55 | } 56 | #define debug(...) { \ 57 | debug_impl(__VA_ARGS__, ""); \ 58 | } 59 | #define debug_impl(fmt, ...) { \ 60 | dbprintf("[%s: %s(%d)] " fmt "%s\n", __FILE__, __func__, __LINE__, __VA_ARGS__); \ 61 | } 62 | #define dbprintf(fmt, ...) { \ 63 | fprintf(stderr, fmt, __VA_ARGS__); \ 64 | } 65 | #define print_lane(p1, p2) { \ 66 | cell_t *p = p1, *t = p2; \ 67 | char *str = NULL; \ 68 | int len = 0, size = 256; \ 69 | str = malloc(size); \ 70 | len += sprintf(str+len, "["); \ 71 | while(p != t) { \ 72 | if(*--t <= CELL_MIN) { \ 73 | len += sprintf(str+len, "-oo,"); \ 74 | } else if(*t >= CELL_MAX) { \ 75 | len += sprintf(str+len, "oo,"); \ 76 | } else { \ 77 | len += sprintf(str+len, "%d,", *t); \ 78 | } \ 79 | if(len > (size - 20)) { \ 80 | size *= 2; \ 81 | str = realloc(str, size); \ 82 | } \ 83 | } \ 84 | str[len == 1 ? 1 : len-1] = ']'; \ 85 | str[len == 1 ? 
2 : len] = '\0'; \ 86 | debug("lane(%s)", str); \ 87 | free(str); \ 88 | } 89 | /* compatible with dump in unittest.h */ 90 | #ifndef dump 91 | #define dump(ptr, len) ({ \ 92 | uint64_t size = (((len) + 15) / 16 + 1) * \ 93 | (strlen("0x0123456789abcdef:") + 16 * strlen(" 00a") + strlen(" \n+ margin")) \ 94 | + strlen(#ptr) + strlen("\n`' len: 100000000"); \ 95 | uint8_t *_ptr = (uint8_t *)(ptr); \ 96 | char *_str = alloca(size); \ 97 | char *_s = _str; \ 98 | /* make header */ \ 99 | _s += sprintf(_s, "\n`%s' len: %" PRId64 "\n", #ptr, (int64_t)len); \ 100 | _s += sprintf(_s, " "); \ 101 | for(int64_t i = 0; i < 16; i++) { \ 102 | _s += sprintf(_s, " %02x", (uint8_t)i); \ 103 | } \ 104 | _s += sprintf(_s, "\n"); \ 105 | for(int64_t i = 0; i < ((len) + 15) / 16; i++) { \ 106 | _s += sprintf(_s, "0x%016" PRIx64 ":", (uint64_t)_ptr); \ 107 | for(int64_t j = 0; j < 16; j++) { \ 108 | _s += sprintf(_s, " %02x", (uint8_t)_ptr[j]); \ 109 | } \ 110 | _s += sprintf(_s, " "); \ 111 | for(int64_t j = 0; j < 16; j++) { \ 112 | _s += sprintf(_s, "%c", isprint(_ptr[j]) ? _ptr[j] : ' '); \ 113 | } \ 114 | _s += sprintf(_s, "\n"); \ 115 | _ptr += 16; \ 116 | } \ 117 | (char const *)_str; \ 118 | }) 119 | #endif 120 | #else 121 | #undef trap 122 | #undef debug 123 | #undef debug_impl 124 | #undef dbprintf 125 | #undef print_lane 126 | 127 | // #define trap() { fprintf(stderr, "[%s: %s(%d)] segmentation fault (trapped)\n", __FILE__, __func__, __LINE__); *((volatile uint8_t *)NULL); } 128 | #define trap() {} 129 | #define debug(...) {} 130 | #define dbprintf(fmt, ...) {} 131 | #define print_lane(p1, p2) {} 132 | #ifndef dump 133 | #define dump(ptr, len) ; 134 | #endif 135 | #endif 136 | 137 | #endif /* #ifndef _LOG_H_INCLUDED */ 138 | /** 139 | * end of log.h 140 | */ 141 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/vector.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file vector.h 4 | * 5 | * @brief header for various vector (SIMD) macros 6 | */ 7 | #ifndef _VECTOR_H_INCLUDED 8 | #define _VECTOR_H_INCLUDED 9 | 10 | /** 11 | * @struct v64_mask_s 12 | * 13 | * @brief common 64cell-wide mask type 14 | */ 15 | typedef struct v64_mask_s { 16 | uint16_t m1; 17 | uint16_t m2; 18 | uint16_t m3; 19 | uint16_t m4; 20 | } v64_mask_t; 21 | typedef struct v64_mask_s v64i8_mask_t; 22 | 23 | /** 24 | * @union v64_mask_u 25 | */ 26 | typedef union v64_mask_u { 27 | v64_mask_t mask; 28 | uint64_t all; 29 | } v64_masku_t; 30 | typedef union v64_mask_u v64i8_masku_t; 31 | 32 | /** 33 | * @struct v32_mask_s 34 | * 35 | * @brief common 32cell-wide mask type 36 | */ 37 | typedef struct v32_mask_s { 38 | uint16_t m1; 39 | uint16_t m2; 40 | } v32_mask_t; 41 | typedef struct v32_mask_s v32i8_mask_t; 42 | 43 | /** 44 | * @union v32_mask_u 45 | */ 46 | typedef union v32_mask_u { 47 | v32_mask_t mask; 48 | uint32_t all; 49 | } v32_masku_t; 50 | typedef union v32_mask_u v32i8_masku_t; 51 | 52 | /** 53 | * @struct v16_mask_s 54 | * 55 | * @brief common 16cell-wide mask type 56 | */ 57 | typedef struct v16_mask_s { 58 | uint16_t m1; 59 | } v16_mask_t; 60 | typedef struct v16_mask_s v16i8_mask_t; 61 | 62 | /** 63 | * @union v16_mask_u 64 | */ 65 | typedef union v16_mask_u { 66 | v16_mask_t mask; 67 | uint16_t all; 68 | } v16_masku_t; 69 | typedef union v16_mask_u v16i8_masku_t; 70 | 71 | /** 72 | * abstract vector types 73 | * 74 | * v2i32_t, v2i64_t for pair of 32-bit, 64-bit signed integers. 
Mainly for 75 | * a pair of coordinates. Conversion between the two types are provided. 76 | * 77 | * v16i8_t is a unit vector for substitution matrices and gap vectors. 78 | * Broadcast to v16i8_t and v32i8_t are provided. 79 | * 80 | * v32i8_t is a unit vector for small differences in banded alignment. v16i8_t 81 | * vector can be broadcasted to high and low 16 elements of v32i8_t. It can 82 | * also expanded to v32i16_t. 83 | * 84 | * v32i16_t is for middle differences in banded alignment. It can be converted 85 | * from v32i8_t 86 | */ 87 | #include "v2i32.h" 88 | #include "v4i32.h" 89 | #include "v2i64.h" 90 | #include "v16i8.h" 91 | #include "v16i16.h" 92 | #include "v32i8.h" 93 | #include "v32i16.h" 94 | #include "v64i8.h" 95 | #include "v64i16.h" 96 | 97 | /* conversion and cast between vector types */ 98 | #define _from_v16i8_v64i8(x) (v64i8_t){ (x).v1, (x).v1, (x).v1, (x).v1 } 99 | #define _from_v32i8_v64i8(x) (v64i8_t){ (x).v1, (x).v2, (x).v1, (x).v2 } 100 | #define _from_v64i8_v64i8(x) (v64i8_t){ (x).v1, (x).v2, (x).v3, (x).v4 } 101 | 102 | #define _from_v16i8_v32i8(x) (v32i8_t){ (x).v1, (x).v1 } 103 | #define _from_v32i8_v32i8(x) (v32i8_t){ (x).v1, (x).v2 } 104 | #define _from_v64i8_v32i8(x) (v32i8_t){ (x).v1, (x).v2 } 105 | 106 | #define _from_v16i8_v16i8(x) (v16i8_t){ (x).v1 } 107 | #define _from_v32i8_v16i8(x) (v16i8_t){ (x).v1 } 108 | #define _from_v64i8_v16i8(x) (v16i8_t){ (x).v1 } 109 | 110 | /* reversed alias */ 111 | #define _to_v64i8_v16i8(x) (v32i8_t){ (x).v1, (x).v1, (x).v1, (x).v1 } 112 | #define _to_v64i8_v32i8(x) (v32i8_t){ (x).v1, (x).v2, (x).v1, (x).v2 } 113 | #define _to_v64i8_v64i8(x) (v32i8_t){ (x).v1, (x).v2, (x).v3, (x).v4 } 114 | 115 | #define _to_v32i8_v16i8(x) (v32i8_t){ (x).v1, (x).v1 } 116 | #define _to_v32i8_v32i8(x) (v32i8_t){ (x).v1, (x).v2 } 117 | #define _to_v32i8_v64i8(x) (v32i8_t){ (x).v1, (x).v2 } 118 | 119 | #define _to_v16i8_v16i8(x) (v16i8_t){ (x).v1 } 120 | #define _to_v16i8_v32i8(x) (v16i8_t){ (x).v1 } 121 | #define _to_v16i8_v64i8(x) (v16i8_t){ (x).v1 } 122 | 123 | #define _cast_v2i64_v2i32(x) (v2i32_t){ (x).v1 } 124 | #define _cast_v2i32_v2i64(x) (v2i64_t){ (x).v1 } 125 | 126 | #endif /* _VECTOR_H_INCLUDED */ 127 | /** 128 | * end of vector.h 129 | */ 130 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/vector.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file vector.h 4 | * 5 | * @brief header for various vector (SIMD) macros 6 | */ 7 | #ifndef _VECTOR_H_INCLUDED 8 | #define _VECTOR_H_INCLUDED 9 | 10 | /** 11 | * @struct v64_mask_s 12 | * 13 | * @brief common 32cell-wide mask type 14 | */ 15 | typedef struct v64_mask_s { 16 | uint32_t m1; 17 | uint32_t m2; 18 | } v64_mask_t; 19 | typedef struct v64_mask_s v64i8_mask_t; 20 | 21 | /** 22 | * @union v64_mask_u 23 | */ 24 | typedef union v64_mask_u { 25 | v64_mask_t mask; 26 | uint64_t all; 27 | } v64_masku_t; 28 | typedef union v64_mask_u v64i8_masku_t; 29 | 30 | /** 31 | * @struct v32_mask_s 32 | * 33 | * @brief common 32cell-wide mask type 34 | */ 35 | typedef struct v32_mask_s { 36 | uint32_t m1; 37 | } v32_mask_t; 38 | typedef struct v32_mask_s v32i8_mask_t; 39 | 40 | /** 41 | * @union v32_mask_u 42 | */ 43 | typedef union v32_mask_u { 44 | v32_mask_t mask; 45 | uint32_t all; 46 | } v32_masku_t; 47 | typedef union v32_mask_u v32i8_masku_t; 48 | 49 | /** 50 | * @struct v16_mask_s 51 | * 52 | * @brief common 16cell-wide mask type 53 | */ 54 | typedef struct v16_mask_s { 55 | 
uint16_t m1; 56 | } v16_mask_t; 57 | typedef struct v16_mask_s v16i8_mask_t; 58 | 59 | /** 60 | * @union v16_mask_u 61 | */ 62 | typedef union v16_mask_u { 63 | v16_mask_t mask; 64 | uint16_t all; 65 | } v16_masku_t; 66 | typedef union v16_mask_u v16i8_masku_t; 67 | 68 | /** 69 | * abstract vector types 70 | * 71 | * v2i32_t, v2i64_t for pair of 32-bit, 64-bit signed integers. Mainly for 72 | * a pair of coordinates. Conversion between the two types are provided. 73 | * 74 | * v16i8_t is a unit vector for substitution matrices and gap vectors. 75 | * Broadcast to v16i8_t and v32i8_t are provided. 76 | * 77 | * v32i8_t is a unit vector for small differences in banded alignment. v16i8_t 78 | * vector can be broadcasted to high and low 16 elements of v32i8_t. It can 79 | * also expanded to v32i16_t. 80 | * 81 | * v32i16_t is for middle differences in banded alignment. It can be converted 82 | * from v32i8_t 83 | */ 84 | #include "v2i32.h" 85 | #include "v4i32.h" 86 | #include "v2i64.h" 87 | #include "v16i8.h" 88 | #include "v16i16.h" 89 | #include "v32i8.h" 90 | #include "v32i16.h" 91 | #include "v64i8.h" 92 | #include "v64i16.h" 93 | 94 | #if defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 480 95 | # define _mm256_broadcastsi128_si256 _mm_broadcastsi128_si256 96 | #endif 97 | 98 | /* conversion and cast between vector types */ 99 | #define _from_v16i8_v64i8(x) (v64i8_t){ _mm256_broadcastsi128_si256((x).v1), _mm256_broadcastsi128_si256((x).v1) } 100 | #define _from_v32i8_v64i8(x) (v64i8_t){ (x).v1, (x).v1 } 101 | #define _from_v64i8_v64i8(x) (v64i8_t){ (x).v1, (x).v2 } 102 | 103 | #define _from_v16i8_v32i8(x) (v32i8_t){ _mm256_broadcastsi128_si256((x).v1) } 104 | #define _from_v32i8_v32i8(x) (v32i8_t){ (x).v1 } 105 | #define _from_v32i8_v32i8(x) (v32i8_t){ (x).v1 } 106 | 107 | #define _from_v16i8_v16i8(x) (v16i8_t){ (x).v1 } 108 | #define _from_v32i8_v16i8(x) (v16i8_t){ _mm256_castsi256_si128((x).v1) } 109 | #define _from_v64i8_v16i8(x) (v16i8_t){ _mm256_castsi256_si128((x).v1) } 110 | 111 | /* inversed alias */ 112 | #define _to_v64i8_v16i8(x) (v64i8_t){ _mm256_broadcastsi128_si256((x).v1), _mm256_broadcastsi128_si256((x).v1) } 113 | #define _to_v64i8_v32i8(x) (v64i8_t){ (x).v1, (x).v1 } 114 | #define _to_v64i8_v64i8(x) (v64i8_t){ (x).v1, (x).v2 } 115 | 116 | #define _to_v32i8_v16i8(x) (v32i8_t){ _mm256_broadcastsi128_si256((x).v1) } 117 | #define _to_v32i8_v32i8(x) (v32i8_t){ (x).v1 } 118 | #define _to_v32i8_v64i8(x) (v32i8_t){ (x).v1 } 119 | 120 | #define _to_v16i8_v16i8(x) (v16i8_t){ (x).v1 } 121 | #define _to_v16i8_v32i8(x) (v16i8_t){ _mm256_castsi256_si128((x).v1) } 122 | #define _to_v16i8_v64i8(x) (v16i8_t){ _mm256_castsi256_si128((x).v1) } 123 | 124 | #define _cast_v2i64_v2i32(x) (v2i32_t){ (x).v1 } 125 | #define _cast_v2i32_v2i64(x) (v2i64_t){ (x).v1 } 126 | 127 | #endif /* _VECTOR_H_INCLUDED */ 128 | /** 129 | * end of vector.h 130 | */ 131 | -------------------------------------------------------------------------------- /README.minimap.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | Minimap is an *experimental* tool to efficiently find multiple approximate 4 | mapping positions between two sets of long sequences, such as between reads and 5 | reference genomes, between genomes and between long noisy reads. By default, it 6 | is tuned to have high sensitivity to 2kb matches around 20% divergence but with 7 | low specificity. 
Minimap does not generate alignments as of now and because of 8 | this, it is usually tens of times faster than mainstream *aligners*. With four 9 | CPU cores, minimap can map 1.6Gbp PacBio reads to human in 2.5 minutes, 1Gbp 10 | PacBio E. coli reads to pre-indexed 9.6Gbp bacterial genomes in 3 minutes, to 11 | pre-indexed >100Gbp nt database in ~1 hour (of which ~20 minutes are spent on 12 | loading the index from the network filesystem; peak RAM: 10GB), map 2800 bacteria 13 | to themselves in 1 hour, and map 1Gbp E. coli reads against themselves in a 14 | couple of minutes. 15 | 16 | Minimap does not replace mainstream aligners, but it can be useful when you 17 | want to quickly identify long approximate matches at moderate divergence among 18 | a huge collection of sequences. For this task, it is much faster than most 19 | existing tools. 20 | 21 | ## Usage 22 | 23 | * Map two sets of long sequences: 24 | ```sh 25 | minimap target.fa.gz query.fa.gz > out.mini 26 | ``` 27 | The output is TAB-delimited with each line consisting of query name, length, 28 | 0-based start, end, strand, target name, length, start, end, the number of 29 | matching bases, the number of co-linear minimizers in the match and the 30 | fraction of matching bases. 31 | 32 | * All-vs-all PacBio read self-mapping for [miniasm][miniasm]: 33 | ```sh 34 | minimap -Sw5 -L100 -m0 reads.fa reads.fa | gzip -1 > reads.paf.gz 35 | ``` 36 | 37 | * Prebuild the index and then map: 38 | ```sh 39 | minimap -d target.mmi target.fa.gz 40 | minimap -l target.mmi query.fa.gz > out.mini 41 | ``` 42 | Minimap indexing is very fast (1 minute for human genome; 50 minutes for >100Gbp 43 | nt database retrieved on 2015-09-30), but for huge, 44 | repeatedly used databases, prebuilding the index is still preferred. 45 | 46 | * Map sequences against themselves without diagonal matches: 47 | ```sh 48 | minimap -S sequences.fa sequences.fa > self-match.mini 49 | ``` 50 | The output may still contain overlapping matches in repetitive regions. 51 | 52 | ## Algorithm Overview 53 | 54 | 1. Indexing. Collect all [(*w*,*k*)-minimizers][mini] in a batch (**-I**=4 55 | billion bp) of target sequences and store them in a hash table. Mark the top 56 | **-f**=0.1% most frequent minimizers as repeats. Minimap 57 | uses an [invertible hash function][invhash] to avoid taking poly-A as 58 | minimizers. 59 | 60 | 2. For each query, collect all (*w*,*k*)-minimizers and look up the hash table for 61 | matches (*qi*,*ti*,*si*), where 62 | *qi* is the query position, *ti* the target position 63 | and *si* indicates whether the minimizer match is on the same 64 | strand. 65 | 66 | 3. For matches on the same strand, sort by {*qi*-*ti*} 67 | and then cluster matches within a **-r**=500bp window. Minimap merges 68 | two windows if **-m**=50% of minimizer matches overlap. For matches on different 69 | strands, sort by {*qi*+*ti*} and apply a similar 70 | clustering procedure. This is inspired by the [Hough transformation][hough]. 71 | 72 | 4. For each cluster, sort (*qi*,*ti*) by *qi* 73 | and solve a [longest increasing subsequence problem][lis] for *ti*. This 74 | finds the longest co-linear matching chain. Break the chain whenever there 75 | is a gap longer than **-g**=10000 (see the sketch after this list). 76 | 77 | 5. Output the start and end of the chain if it contains **-c**=4 or more 78 | minimizer matches and the matching length is no less than **-L**=40. 79 | 80 | 6. Go to 1 and rewind to the first record of the query if there are more target 81 | sequences; otherwise stop.
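The chaining in step 4 is a plain longest-increasing-subsequence computation over the target positions of a cluster once its matches have been sorted by query position. The sketch below only illustrates that idea; it is not code from minimap or minialign, and the names (`match_t`, `chain_lis`) are made up for this example.

```c
/* Illustration only: LIS-style chaining of (q, t) minimizer matches.
 * The struct and function names are hypothetical, not from the sources. */
#include <stdint.h>
#include <stdlib.h>

typedef struct { uint32_t q, t; } match_t;    /* one minimizer match: query pos, target pos */

static int cmp_q(const void *a, const void *b)
{
	const match_t *x = a, *y = b;
	return (x->q > y->q) - (x->q < y->q);
}

/* length of the longest chain with strictly increasing t after sorting by q;
 * O(n log n) by binary-searching the array of smallest chain tails */
static size_t chain_lis(match_t *m, size_t n)
{
	qsort(m, n, sizeof(*m), cmp_q);
	uint32_t *tail = malloc(n * sizeof(*tail)); /* tail[k]: smallest t ending a chain of length k+1 */
	size_t len = 0;
	for (size_t i = 0; i < n; i++) {
		size_t lo = 0, hi = len;
		while (lo < hi) {                       /* lower bound over the tails */
			size_t mid = (lo + hi) / 2;
			if (tail[mid] < m[i].t) lo = mid + 1; else hi = mid;
		}
		tail[lo] = m[i].t;
		if (lo == len) len++;
	}
	free(tail);
	return len;
}
```

A full implementation additionally breaks the chain at gaps longer than **-g** and records the chain's start and end coordinates for the filtering in step 5; those parts are omitted here for brevity.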
82 | 83 | To increase sensitivity, we may decrease **-w** to index more minimizers; 84 | we may also decrease **-k**, though this may greatly impact performance for 85 | mammalian genomes. 86 | 87 | Also note that by default, if the total length of target sequences is less than 88 | 4Gbp (1G=1 billion; controlled by **-I**), minimap creates one index and streams 89 | all the query sequences in one go. The multiple hits of a query sequence are 90 | adjacent to each other in the output. If the total length is greater than 91 | 4Gbp, minimap needs to read query sequences multiple times. The multiple hits 92 | of a query may not be adjacent. 93 | 94 | [mini]: http://bioinformatics.oxfordjournals.org/content/20/18/3363.abstract 95 | [lis]: https://en.wikipedia.org/wiki/Longest_increasing_subsequence 96 | [hough]: https://en.wikipedia.org/wiki/Hough_transform 97 | [invhash]: https://gist.github.com/lh3/974ced188be2f90422cc 98 | [miniasm]: https://github.com/lh3/miniasm 99 | -------------------------------------------------------------------------------- /ksort.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE.
24 | */ 25 | 26 | // This is a simplified version of ksort.h 27 | 28 | #ifndef AC_KSORT_H 29 | #define AC_KSORT_H 30 | 31 | #include 32 | #include 33 | 34 | typedef struct { 35 | void *left, *right; 36 | int depth; 37 | } ks_isort_stack_t; 38 | 39 | #define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } 40 | 41 | #define KSORT_INIT(name, type_t, __sort_lt) \ 42 | static type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ 43 | { \ 44 | type_t *low, *high, *k, *ll, *hh, *mid; \ 45 | low = arr; high = arr + n - 1; k = arr + kk; \ 46 | for (;;) { \ 47 | if (high <= low) return *k; \ 48 | if (high == low + 1) { \ 49 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ 50 | return *k; \ 51 | } \ 52 | mid = low + (high - low) / 2; \ 53 | if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ 54 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ 55 | if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ 56 | KSORT_SWAP(type_t, *mid, *(low+1)); \ 57 | ll = low + 1; hh = high; \ 58 | for (;;) { \ 59 | do ++ll; while (__sort_lt(*ll, *low)); \ 60 | do --hh; while (__sort_lt(*low, *hh)); \ 61 | if (hh < ll) break; \ 62 | KSORT_SWAP(type_t, *ll, *hh); \ 63 | } \ 64 | KSORT_SWAP(type_t, *low, *hh); \ 65 | if (hh <= k) low = ll; \ 66 | if (hh >= k) high = hh - 1; \ 67 | } \ 68 | } \ 69 | 70 | #define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) 71 | 72 | #define ks_lt_generic(a, b) ((a) < (b)) 73 | #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) 74 | 75 | typedef const char *ksstr_t; 76 | 77 | #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) 78 | #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) 79 | 80 | #define RS_MIN_SIZE 64 81 | 82 | #define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \ 83 | typedef struct { \ 84 | rstype_t *b, *e; \ 85 | } rsbucket_##name##_t; \ 86 | static void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \ 87 | { \ 88 | rstype_t *i; \ 89 | for (i = beg + 1; i < end; ++i) \ 90 | if (rskey(*i) < rskey(*(i - 1))) { \ 91 | rstype_t *j, tmp = *i; \ 92 | for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \ 93 | *j = *(j - 1); \ 94 | *j = tmp; \ 95 | } \ 96 | } \ 97 | static void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \ 98 | { \ 99 | rstype_t *i; \ 100 | int size = 1<b = k->e = beg; \ 103 | for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \ 104 | for (k = b + 1; k != be; ++k) \ 105 | k->e += (k-1)->e - beg, k->b = (k-1)->e; \ 106 | for (k = b; k != be;) { \ 107 | if (k->b != k->e) { \ 108 | rsbucket_##name##_t *l; \ 109 | if ((l = b + (rskey(*k->b)>>s&m)) != k) { \ 110 | rstype_t tmp = *k->b, swap; \ 111 | do { \ 112 | swap = tmp; tmp = *l->b; *l->b++ = swap; \ 113 | l = b + (rskey(tmp)>>s&m); \ 114 | } while (l != k); \ 115 | *k->b++ = tmp; \ 116 | } else ++k->b; \ 117 | } else ++k; \ 118 | } \ 119 | for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \ 120 | if (s) { \ 121 | s = s > n_bits? 
s - n_bits : 0; \ 122 | for (k = b; k != be; ++k) \ 123 | if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \ 124 | else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \ 125 | } \ 126 | } \ 127 | static void radix_sort_##name(rstype_t *p, size_t l) \ 128 | { \ 129 | if (l <= RS_MIN_SIZE) rs_insertsort_##name(p, p + l); \ 130 | else rs_sort_##name(p, p + l, 8, sizeof_key * 8 - 8); \ 131 | } 132 | 133 | #endif 134 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/v2i64.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v2i64.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V2I64_H_INCLUDED 8 | #define _V2I64_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v2i64_s { 15 | __m128i v1; 16 | } v2i64_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v2i64_1(u) 20 | #define _e_x_v2i64_2(u) 21 | 22 | /* expanders (without immediate) */ 23 | #define _e_v_v2i64_1(a) (a).v1 24 | #define _e_v_v2i64_2(a) (a).v1 25 | #define _e_vv_v2i64_1(a, b) (a).v1, (b).v1 26 | #define _e_vv_v2i64_2(a, b) (a).v1, (b).v1 27 | #define _e_vvv_v2i64_1(a, b, c) (a).v1, (b).v1, (c).v1 28 | #define _e_vvv_v2i64_2(a, b, c) (a).v1, (b).v1, (c).v1 29 | 30 | /* expanders with immediate */ 31 | #define _e_i_v2i64_1(imm) (imm) 32 | #define _e_i_v2i64_2(imm) (imm) 33 | #define _e_vi_v2i64_1(a, imm) (a).v1, (imm) 34 | #define _e_vi_v2i64_2(a, imm) (a).v1, (imm) 35 | #define _e_vvi_v2i64_1(a, b, imm) (a).v1, (b).v1, (imm) 36 | #define _e_vvi_v2i64_2(a, b, imm) (a).v1, (b).v1, (imm) 37 | 38 | /* address calculation macros */ 39 | #define _addr_v2i64_1(imm) ( (__m128i *)(imm) ) 40 | #define _addr_v2i64_2(imm) ( (__m128i *)(imm) ) 41 | #define _pv_v2i64(ptr) ( _addr_v2i64_1(ptr) ) 42 | /* expanders with pointers */ 43 | #define _e_p_v2i64_1(ptr) _addr_v2i64_1(ptr) 44 | #define _e_p_v2i64_2(ptr) _addr_v2i64_2(ptr) 45 | #define _e_pv_v2i64_1(ptr, a) _addr_v2i64_1(ptr), (a).v1 46 | #define _e_pv_v2i64_2(ptr, a) _addr_v2i64_2(ptr), (a).v1 47 | 48 | /* expand intrinsic name */ 49 | #define _i_v2i64(intrin) _mm_##intrin##_epi64 50 | #define _i_v2i64x(intrin) _mm_##intrin##_si128 51 | 52 | /* apply */ 53 | #define _a_v2i64(intrin, expander, ...) ( \ 54 | (v2i64_t) { \ 55 | _i_v2i64(intrin)(expander##_v2i64_1(__VA_ARGS__)) \ 56 | } \ 57 | ) 58 | #define _a_v2i64x(intrin, expander, ...) ( \ 59 | (v2i64_t) { \ 60 | _i_v2i64x(intrin)(expander##_v2i64_1(__VA_ARGS__)) \ 61 | } \ 62 | ) 63 | #define _a_v2i64xv(intrin, expander, ...) { \ 64 | _i_v2i64x(intrin)(expander##_v2i64_1(__VA_ARGS__)); \ 65 | } 66 | 67 | /* load and store */ 68 | #define _load_v2i64(...) _a_v2i64x(load, _e_p, __VA_ARGS__) 69 | #define _loadu_v2i64(...) _a_v2i64x(loadu, _e_p, __VA_ARGS__) 70 | #define _store_v2i64(...) _a_v2i64xv(store, _e_pv, __VA_ARGS__) 71 | #define _storeu_v2i64(...) _a_v2i64xv(storeu, _e_pv, __VA_ARGS__) 72 | 73 | /* broadcast */ 74 | // #define _set_v2i64(...) _a_v2i64(set1, _e_i, __VA_ARGS__) 75 | #define _set_v2i64(x) ( (v2i64_t) { _mm_set1_epi64x(x) } ) 76 | #define _zero_v2i64() _a_v2i64x(setzero, _e_x, _unused) 77 | #define _seta_v2i64(x, y) ( (v2i64_t) { _mm_set_epi64x(x, y) } ) 78 | #define _swap_v2i64(x) ( \ 79 | (v2i64_t) { \ 80 | _mm_shuffle_epi32((x).v1, 0x4e) \ 81 | } \ 82 | ) 83 | 84 | /* logics */ 85 | #define _not_v2i64(...) 
_a_v2i64x(not, _e_v, __VA_ARGS__) 86 | #define _and_v2i64(...) _a_v2i64x(and, _e_vv, __VA_ARGS__) 87 | #define _or_v2i64(...) _a_v2i64x(or, _e_vv, __VA_ARGS__) 88 | #define _xor_v2i64(...) _a_v2i64x(xor, _e_vv, __VA_ARGS__) 89 | #define _andn_v2i64(...) _a_v2i64x(andnot, _e_vv, __VA_ARGS__) 90 | 91 | /* arithmetics */ 92 | #define _add_v2i64(...) _a_v2i64(add, _e_vv, __VA_ARGS__) 93 | #define _sub_v2i64(...) _a_v2i64(sub, _e_vv, __VA_ARGS__) 94 | // #define _max_v2i64(...) _a_v2i64(max, _e_vv, __VA_ARGS__) 95 | // #define _min_v2i64(...) _a_v2i64(min, _e_vv, __VA_ARGS__) 96 | #define _max_v2i64(a, b) ( (v2i64_t) { _mm_max_epi32(a.v1, b.v1) } ) 97 | #define _min_v2i64(a, b) ( (v2i64_t) { _mm_min_epi32(a.v1, b.v1) } ) 98 | 99 | /* shuffle */ 100 | // #define _shuf_v2i64(...) _a_v2i64(shuffle, _e_vv, __VA_ARGS__) 101 | 102 | /* blend */ 103 | #define _sel_v2i64(mask, a, b) ( \ 104 | (v2i64_t) { \ 105 | _mm_blendv_epi8((b).v1, (a).v1, (mask).v1) \ 106 | } \ 107 | ) 108 | 109 | /* compare */ 110 | #define _eq_v2i64(...) _a_v2i64(cmpeq, _e_vv, __VA_ARGS__) 111 | #define _gt_v2i64(...) _a_v2i64(cmpgt, _e_vv, __VA_ARGS__) 112 | 113 | /* test: take mask and test if all zero */ 114 | #define _test_v2i64(x, y) _mm_test_all_zeros((x).v1, (y).v1) 115 | 116 | /* insert and extract */ 117 | #define _ins_v2i64(a, val, imm) { \ 118 | (a).v1 = _i_v2i64((a).v1, (val), (imm)); \ 119 | } 120 | #define _ext_v2i64(a, imm) ( \ 121 | (int64_t)_i_v2i64(extract)((a).v1, (imm)) \ 122 | ) 123 | 124 | /* shift */ 125 | #define _shl_v2i64(a, n) ( \ 126 | (v2i64_t) {_i_v2i64(slli)((a).v1, (n))} \ 127 | ) 128 | #define _shr_v2i64(a, n) ( \ 129 | (v2i64_t) {_i_v2i64(srli)((a).v1, (n))} \ 130 | ) 131 | #define _shlv_v2i64(a, n) ( \ 132 | (v2i64_t) {_i_v2i64(sll)((a).v1, (n).v1)} \ 133 | ) 134 | #define _shrv_v2i64(a, n) ( \ 135 | (v2i64_t) {_i_v2i64(srl)((a).v1, (n).v1)} \ 136 | ) 137 | 138 | /* mask */ 139 | #define _mask_v2i64(a) ( \ 140 | (uint32_t) (_mm_movemask_epi8((a).v1)) \ 141 | ) 142 | #define V2I64_MASK_00 ( 0x0000 ) 143 | #define V2I64_MASK_01 ( 0x00ff ) 144 | #define V2I64_MASK_10 ( 0xff00 ) 145 | #define V2I64_MASK_11 ( 0xffff ) 146 | 147 | /* convert */ 148 | #define _cvt_v2i32_v2i64(a) ( \ 149 | (v2i64_t) { \ 150 | _mm_cvtepi32_epi64((a).v1) \ 151 | } \ 152 | ) 153 | 154 | /* transpose */ 155 | #define _lo_v2i64(a, b) ( \ 156 | (v2i64_t) { \ 157 | _mm_unpacklo_epi64((a).v1, (b).v1) \ 158 | } \ 159 | ) 160 | #define _hi_v2i64(a, b) ( \ 161 | (v2i64_t) { \ 162 | _mm_unpackhi_epi64((a).v1, (b).v1) \ 163 | } \ 164 | ) 165 | 166 | /* debug print */ 167 | #ifdef _LOG_H_INCLUDED 168 | #define _print_v2i64(a) { \ 169 | debug("(v2i64_t) %s(%lx, %lx)", #a, _ext_v2i64(a, 1), _ext_v2i64(a, 0)); \ 170 | } 171 | #else 172 | #define _print_v2i64(x) ; 173 | #endif 174 | 175 | #endif /* _V2I64_H_INCLUDED */ 176 | /** 177 | * end of v2i64.h 178 | */ 179 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/v2i64.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v2i64.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V2I64_H_INCLUDED 8 | #define _V2I64_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v2i64_s { 15 | __m128i v1; 16 | } v2i64_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v2i64_1(u) 20 | #define _e_x_v2i64_2(u) 21 | 22 | /* expanders (without immediate) */ 
23 | #define _e_v_v2i64_1(a) (a).v1 24 | #define _e_v_v2i64_2(a) (a).v1 25 | #define _e_vv_v2i64_1(a, b) (a).v1, (b).v1 26 | #define _e_vv_v2i64_2(a, b) (a).v1, (b).v1 27 | #define _e_vvv_v2i64_1(a, b, c) (a).v1, (b).v1, (c).v1 28 | #define _e_vvv_v2i64_2(a, b, c) (a).v1, (b).v1, (c).v1 29 | 30 | /* expanders with immediate */ 31 | #define _e_i_v2i64_1(imm) (imm) 32 | #define _e_i_v2i64_2(imm) (imm) 33 | #define _e_vi_v2i64_1(a, imm) (a).v1, (imm) 34 | #define _e_vi_v2i64_2(a, imm) (a).v1, (imm) 35 | #define _e_vvi_v2i64_1(a, b, imm) (a).v1, (b).v1, (imm) 36 | #define _e_vvi_v2i64_2(a, b, imm) (a).v1, (b).v1, (imm) 37 | 38 | /* address calculation macros */ 39 | #define _addr_v2i64_1(imm) ( (__m128i *)(imm) ) 40 | #define _addr_v2i64_2(imm) ( (__m128i *)(imm) ) 41 | #define _pv_v2i64(ptr) ( _addr_v2i64_1(ptr) ) 42 | /* expanders with pointers */ 43 | #define _e_p_v2i64_1(ptr) _addr_v2i64_1(ptr) 44 | #define _e_p_v2i64_2(ptr) _addr_v2i64_2(ptr) 45 | #define _e_pv_v2i64_1(ptr, a) _addr_v2i64_1(ptr), (a).v1 46 | #define _e_pv_v2i64_2(ptr, a) _addr_v2i64_2(ptr), (a).v1 47 | 48 | /* expand intrinsic name */ 49 | #define _i_v2i64(intrin) _mm_##intrin##_epi64 50 | #define _i_v2i64x(intrin) _mm_##intrin##_si128 51 | 52 | /* apply */ 53 | #define _a_v2i64(intrin, expander, ...) ( \ 54 | (v2i64_t) { \ 55 | _i_v2i64(intrin)(expander##_v2i64_1(__VA_ARGS__)) \ 56 | } \ 57 | ) 58 | #define _a_v2i64x(intrin, expander, ...) ( \ 59 | (v2i64_t) { \ 60 | _i_v2i64x(intrin)(expander##_v2i64_1(__VA_ARGS__)) \ 61 | } \ 62 | ) 63 | #define _a_v2i64xv(intrin, expander, ...) { \ 64 | _i_v2i64x(intrin)(expander##_v2i64_1(__VA_ARGS__)); \ 65 | } 66 | 67 | /* load and store */ 68 | #define _load_v2i64(...) _a_v2i64x(load, _e_p, __VA_ARGS__) 69 | #define _loadu_v2i64(...) _a_v2i64x(loadu, _e_p, __VA_ARGS__) 70 | #define _store_v2i64(...) _a_v2i64xv(store, _e_pv, __VA_ARGS__) 71 | #define _storeu_v2i64(...) _a_v2i64xv(storeu, _e_pv, __VA_ARGS__) 72 | 73 | /* broadcast */ 74 | // #define _set_v2i64(...) _a_v2i64(set1, _e_i, __VA_ARGS__) 75 | #define _set_v2i64(x) ( (v2i64_t) { _mm_set1_epi64x(x) } ) 76 | #define _zero_v2i64() _a_v2i64x(setzero, _e_x, _unused) 77 | #define _seta_v2i64(x, y) ( (v2i64_t) { _mm_set_epi64x(x, y) } ) 78 | #define _swap_v2i64(x) ( \ 79 | (v2i64_t) { \ 80 | _mm_shuffle_epi32((x).v1, 0x4e) \ 81 | } \ 82 | ) 83 | 84 | /* logics */ 85 | #define _not_v2i64(...) _a_v2i64x(not, _e_v, __VA_ARGS__) 86 | #define _and_v2i64(...) _a_v2i64x(and, _e_vv, __VA_ARGS__) 87 | #define _or_v2i64(...) _a_v2i64x(or, _e_vv, __VA_ARGS__) 88 | #define _xor_v2i64(...) _a_v2i64x(xor, _e_vv, __VA_ARGS__) 89 | #define _andn_v2i64(...) _a_v2i64x(andnot, _e_vv, __VA_ARGS__) 90 | 91 | /* arithmetics */ 92 | #define _add_v2i64(...) _a_v2i64(add, _e_vv, __VA_ARGS__) 93 | #define _sub_v2i64(...) _a_v2i64(sub, _e_vv, __VA_ARGS__) 94 | // #define _max_v2i64(...) _a_v2i64(max, _e_vv, __VA_ARGS__) 95 | // #define _min_v2i64(...) _a_v2i64(min, _e_vv, __VA_ARGS__) 96 | #define _max_v2i64(a, b) ( (v2i64_t) { _mm_max_epi32(a.v1, b.v1) } ) 97 | #define _min_v2i64(a, b) ( (v2i64_t) { _mm_min_epi32(a.v1, b.v1) } ) 98 | 99 | /* shuffle */ 100 | // #define _shuf_v2i64(...) _a_v2i64(shuffle, _e_vv, __VA_ARGS__) 101 | 102 | /* blend */ 103 | #define _sel_v2i64(mask, a, b) ( \ 104 | (v2i64_t) { \ 105 | _mm_blendv_epi8((b).v1, (a).v1, (mask).v1) \ 106 | } \ 107 | ) 108 | 109 | /* compare */ 110 | #define _eq_v2i64(...) _a_v2i64(cmpeq, _e_vv, __VA_ARGS__) 111 | #define _gt_v2i64(...) 
_a_v2i64(cmpgt, _e_vv, __VA_ARGS__) 112 | 113 | /* test: take mask and test if all zero */ 114 | #define _test_v2i64(x, y) _mm_test_all_zeros((x).v1, (y).v1) 115 | 116 | /* insert and extract */ 117 | #define _ins_v2i64(a, val, imm) { \ 118 | (a).v1 = _i_v2i64((a).v1, (val), (imm)); \ 119 | } 120 | #define _ext_v2i64(a, imm) ( \ 121 | (int64_t)_i_v2i64(extract)((a).v1, (imm)) \ 122 | ) 123 | 124 | /* shift */ 125 | #define _shl_v2i64(a, n) ( \ 126 | (v2i64_t) {_i_v2i64(slli)((a).v1, (n))} \ 127 | ) 128 | #define _shr_v2i64(a, n) ( \ 129 | (v2i64_t) {_i_v2i64(srli)((a).v1, (n))} \ 130 | ) 131 | #define _shlv_v2i64(a, n) ( \ 132 | (v2i64_t) {_i_v2i64(sll)((a).v1, (n).v1)} \ 133 | ) 134 | #define _shrv_v2i64(a, n) ( \ 135 | (v2i64_t) {_i_v2i64(srl)((a).v1, (n).v1)} \ 136 | ) 137 | 138 | /* mask */ 139 | #define _mask_v2i64(a) ( \ 140 | (uint32_t) (_mm_movemask_epi8((a).v1)) \ 141 | ) 142 | #define V2I64_MASK_00 ( 0x0000 ) 143 | #define V2I64_MASK_01 ( 0x00ff ) 144 | #define V2I64_MASK_10 ( 0xff00 ) 145 | #define V2I64_MASK_11 ( 0xffff ) 146 | 147 | /* convert */ 148 | #define _cvt_v2i32_v2i64(a) ( \ 149 | (v2i64_t) { \ 150 | _mm_cvtepi32_epi64((a).v1) \ 151 | } \ 152 | ) 153 | 154 | /* transpose */ 155 | #define _lo_v2i64(a, b) ( \ 156 | (v2i64_t) { \ 157 | _mm_unpacklo_epi64((a).v1, (b).v1) \ 158 | } \ 159 | ) 160 | #define _hi_v2i64(a, b) ( \ 161 | (v2i64_t) { \ 162 | _mm_unpackhi_epi64((a).v1, (b).v1) \ 163 | } \ 164 | ) 165 | 166 | /* debug print */ 167 | // #ifdef _LOG_H_INCLUDED 168 | #define _print_v2i64(a) { \ 169 | debug("(v2i64_t) %s(%lx, %lx)", #a, _ext_v2i64(a, 1), _ext_v2i64(a, 0)); \ 170 | } 171 | // #else 172 | // #define _print_v2i64(x) ; 173 | // #endif 174 | 175 | #endif /* _V2I64_H_INCLUDED */ 176 | /** 177 | * end of v2i64.h 178 | */ 179 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/v16i16.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v16i16.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V16I16_H_INCLUDED 8 | #define _V16I16_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v16i16_s { 15 | __m256i v1; 16 | } v16i16_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v16i16_1(u) 20 | #define _e_x_v16i16_2(u) 21 | 22 | /* expanders (without immediate) */ 23 | #define _e_v_v16i16_1(a) (a).v1 24 | #define _e_v_v16i16_2(a) (a).v1 25 | #define _e_vv_v16i16_1(a, b) (a).v1, (b).v1 26 | #define _e_vv_v16i16_2(a, b) (a).v1, (b).v1 27 | #define _e_vvv_v16i16_1(a, b, c) (a).v1, (b).v1, (c).v1 28 | #define _e_vvv_v16i16_2(a, b, c) (a).v1, (b).v1, (c).v1 29 | 30 | /* expanders with immediate */ 31 | #define _e_i_v16i16_1(imm) (imm) 32 | #define _e_i_v16i16_2(imm) (imm) 33 | #define _e_vi_v16i16_1(a, imm) (a).v1, (imm) 34 | #define _e_vi_v16i16_2(a, imm) (a).v1, (imm) 35 | #define _e_vvi_v16i16_1(a, b, imm) (a).v1, (b).v1, (imm) 36 | #define _e_vvi_v16i16_2(a, b, imm) (a).v1, (b).v1, (imm) 37 | 38 | /* address calculation macros */ 39 | #define _addr_v16i16_1(imm) ( (__m256i *)(imm) ) 40 | #define _addr_v16i16_2(imm) ( (__m256i *)(imm) ) 41 | #define _pv_v16i16(ptr) ( _addr_v16i16_1(ptr) ) 42 | /* expanders with pointers */ 43 | #define _e_p_v16i16_1(ptr) _addr_v16i16_1(ptr) 44 | #define _e_p_v16i16_2(ptr) _addr_v16i16_2(ptr) 45 | #define _e_pv_v16i16_1(ptr, a) _addr_v16i16_1(ptr), (a).v1 46 | #define _e_pv_v16i16_2(ptr, 
a) _addr_v16i16_2(ptr), (a).v1 47 | 48 | /* expand intrinsic name */ 49 | #define _i_v16i16(intrin) _mm256_##intrin##_epi16 50 | #define _i_v16i16x(intrin) _mm256_##intrin##_si256 51 | 52 | /* apply */ 53 | #define _a_v16i16(intrin, expander, ...) ( \ 54 | (v16i16_t) { \ 55 | _i_v16i16(intrin)(expander##_v16i16_1(__VA_ARGS__)) \ 56 | } \ 57 | ) 58 | #define _a_v16i16x(intrin, expander, ...) ( \ 59 | (v16i16_t) { \ 60 | _i_v16i16x(intrin)(expander##_v16i16_1(__VA_ARGS__)) \ 61 | } \ 62 | ) 63 | #define _a_v16i16xv(intrin, expander, ...) { \ 64 | _i_v16i16x(intrin)(expander##_v16i16_1(__VA_ARGS__)); \ 65 | } 66 | 67 | /* load and store */ 68 | #define _load_v16i16(...) _a_v16i16x(load, _e_p, __VA_ARGS__) 69 | #define _loadu_v16i16(...) _a_v16i16x(loadu, _e_p, __VA_ARGS__) 70 | #define _store_v16i16(...) _a_v16i16xv(store, _e_pv, __VA_ARGS__) 71 | #define _storeu_v16i16(...) _a_v16i16xv(storeu, _e_pv, __VA_ARGS__) 72 | 73 | /* broadcast */ 74 | #define _set_v16i16(...) _a_v16i16(set1, _e_i, __VA_ARGS__) 75 | #define _zero_v16i16() _a_v16i16x(setzero, _e_x, _unused) 76 | #define _seta_v16i16(...) ( \ 77 | (v16i16_t) { \ 78 | _mm256_set_epi16(__VA_ARGS__) \ 79 | } \ 80 | ) 81 | 82 | /* swap (reverse) */ 83 | #define _swap_idx_v16i16() ( \ 84 | _mm256_broadcastsi128_si256(_mm_set_epi8( \ 85 | 1, 0, 3, 2, 5, 4, 7, 6, \ 86 | 9, 8, 11, 10, 13, 12, 15, 14)) \ 87 | ) 88 | #define _swap_v16i16(a) ( \ 89 | (v16i16_t) { \ 90 | _mm256_permute2x128_si256( \ 91 | _mm256_shuffle_epi8((a).v1, _swap_idx_v16i16()), \ 92 | _mm256_shuffle_epi8((a).v1, _swap_idx_v16i16()), \ 93 | 0x01) \ 94 | } \ 95 | ) 96 | 97 | /* logics */ 98 | #define _not_v16i16(...) _a_v16i16x(not, _e_v, __VA_ARGS__) 99 | #define _and_v16i16(...) _a_v16i16x(and, _e_vv, __VA_ARGS__) 100 | #define _or_v16i16(...) _a_v16i16x(or, _e_vv, __VA_ARGS__) 101 | #define _xor_v16i16(...) _a_v16i16x(xor, _e_vv, __VA_ARGS__) 102 | #define _andn_v16i16(...) _a_v16i16x(andnot, _e_vv, __VA_ARGS__) 103 | 104 | /* arithmetics */ 105 | #define _add_v16i16(...) _a_v16i16(add, _e_vv, __VA_ARGS__) 106 | #define _sub_v16i16(...) _a_v16i16(sub, _e_vv, __VA_ARGS__) 107 | #define _max_v16i16(...) _a_v16i16(max, _e_vv, __VA_ARGS__) 108 | #define _min_v16i16(...) _a_v16i16(min, _e_vv, __VA_ARGS__) 109 | 110 | /* shuffle */ 111 | #define _shuf_v16i16(...) _a_v16i16(shuffle, _e_vv, __VA_ARGS__) 112 | 113 | /* blend */ 114 | // #define _sel_v16i16(...) _a_v16i16(blendv, _e_vvv, __VA_ARGS__) 115 | 116 | /* compare */ 117 | #define _eq_v16i16(...) _a_v16i16(cmpeq, _e_vv, __VA_ARGS__) 118 | #define _gt_v16i16(...) 
_a_v16i16(cmpgt, _e_vv, __VA_ARGS__) 119 | 120 | /* insert and extract */ 121 | #define _ins_v16i16(a, val, imm) { \ 122 | (a).v1 = _i_v16i16(insert)((a).v1, (val), (imm)); \ 123 | } 124 | #define _ext_v16i16(a, imm) ( \ 125 | (int16_t)_i_v16i16(extract)((a).v1, (imm)) \ 126 | ) 127 | 128 | /* mask */ 129 | #define _mask_v16i16(a) ( \ 130 | (v16_mask_t) { \ 131 | .m1 = _mm256_movemask_epi8( \ 132 | _mm256_packs_epi16((a).v1, \ 133 | _mm256_castsi128_si256(_mm256_extracti128_si256((a).v1, 1)))) \ 134 | } \ 135 | ) 136 | 137 | /* horizontal max (reduction max) */ 138 | #define _hmax_v16i16(a) ({ \ 139 | __m128i _t = _mm_max_epi16( \ 140 | _mm256_castsi256_si128((a).v1), \ 141 | _mm256_extracti128_si256((a).v1, 1) \ 142 | ); \ 143 | _t = _mm_max_epi16(_t, _mm_srli_si128(_t, 8)); \ 144 | _t = _mm_max_epi16(_t, _mm_srli_si128(_t, 4)); \ 145 | _t = _mm_max_epi16(_t, _mm_srli_si128(_t, 2)); \ 146 | (int16_t)_mm_extract_epi16(_t, 0); \ 147 | }) 148 | 149 | /* convert */ 150 | #define _cvt_v16i8_v16i16(a) ( \ 151 | (v16i16_t) { \ 152 | _mm256_cvtepi8_epi16((a).v1) \ 153 | } \ 154 | ) 155 | 156 | /* debug print */ 157 | // #ifdef _LOG_H_INCLUDED 158 | #define _print_v16i16(a) { \ 159 | debug("(v16i16_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 160 | #a, \ 161 | _ext_v16i16(a, 15), \ 162 | _ext_v16i16(a, 14), \ 163 | _ext_v16i16(a, 13), \ 164 | _ext_v16i16(a, 12), \ 165 | _ext_v16i16(a, 11), \ 166 | _ext_v16i16(a, 10), \ 167 | _ext_v16i16(a, 9), \ 168 | _ext_v16i16(a, 8), \ 169 | _ext_v16i16(a, 7), \ 170 | _ext_v16i16(a, 6), \ 171 | _ext_v16i16(a, 5), \ 172 | _ext_v16i16(a, 4), \ 173 | _ext_v16i16(a, 3), \ 174 | _ext_v16i16(a, 2), \ 175 | _ext_v16i16(a, 1), \ 176 | _ext_v16i16(a, 0)); \ 177 | } 178 | // #else 179 | // #define _print_v16i16(x) ; 180 | // #endif 181 | 182 | #endif /* _V16I16_H_INCLUDED */ 183 | /** 184 | * end of v16i16.h 185 | */ 186 | -------------------------------------------------------------------------------- /kvec.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "kvec.h" 30 | int main() { 31 | kvec_t(int) array; 32 | kv_init(array); 33 | kv_push(int, array, 10); // append 34 | kv_a(int, array, 20) = 5; // dynamic 35 | kv_A(array, 20) = 4; // static 36 | kv_destroy(array); 37 | return 0; 38 | } 39 | */ 40 | 41 | /* 42 | 2008-09-22 (0.1.0): 43 | 44 | * The initial version. 45 | 46 | */ 47 | 48 | #ifndef AC_KVEC_H 49 | #define AC_KVEC_H 50 | 51 | #include 52 | 53 | #define kv_roundup(x, base) ( (((x) + (base) - 1) / (base)) * (base) ) 54 | #define kv_roundup32(x) kv_roundup(x, 32) 55 | #define kv_max2(a, b) ( ((a) < (b)) ? (b) : (a) ) 56 | #define kv_min2(a, b) ( ((a) < (b)) ? (a) : (b) ) 57 | 58 | /* gcc builtin hints */ 59 | #define kv_likely(x) __builtin_expect(!!(x), 1) 60 | #define kv_unlikely(x) __builtin_expect(!!(x), 0) 61 | 62 | #define kvec_t(type) struct { size_t n, m; type *a; } 63 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) 64 | #define kv_inits(type) ((kvec_t(type)){ .n = 0, .m = 0, .a = 0 }) 65 | #define kv_destroy(v) free((v).a) 66 | #define kv_A(v, i) ((v).a[(i)]) 67 | #define kv_pop(v) ((v).a[--(v).n]) 68 | #define kv_size(v) ((v).n) 69 | #define kv_max(v) ((v).m) 70 | 71 | #define kv_resize(type, v, s) do { \ 72 | if ((v).m < (s)) { \ 73 | (v).m = (s); \ 74 | kv_roundup32((v).m); \ 75 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ 76 | } \ 77 | } while (0) 78 | 79 | #define kv_reserve(type, v, s) ( \ 80 | kv_likely((v).m > (s)) ? 0 : ((v).m = kv_max2(2*(v).m, (s)), (v).a = realloc((v).a, sizeof(type) * (v).m), 0) ) 81 | 82 | #define kv_copy(type, v1, v0) do { \ 83 | if (kv_unlikely((v1).m < (v0).n)) { kv_resize(type, v1, (v0).n); }\ 84 | (v1).n = (v0).n; \ 85 | memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ 86 | } while (0) \ 87 | 88 | #define kv_push(type, v, x) ({ \ 89 | if (kv_unlikely((v).n == (v).m)) { \ 90 | (v).m = (v).m? (v).m<<1 : 2; \ 91 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ 92 | } \ 93 | (v).a[(v).n++] = (x); \ 94 | (v).n - 1; \ 95 | }) 96 | 97 | #define kv_pushp(type, v) ({ \ 98 | if (kv_unlikely((v).n == (v).m)) { \ 99 | (v).m = (v).m? (v).m<<1 : 2; \ 100 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ 101 | } \ 102 | &(v).a[(v).n++]; \ 103 | }) 104 | 105 | #define kv_pushm(type, v, arr, size) ({ \ 106 | if(kv_unlikely(((v).m - (v).n) < (uint64_t)(size))) { \ 107 | (v).m = kv_max2((v).m * 2, (v).n + (size)); \ 108 | (v).a = (type*)realloc((v).a, sizeof(*(v).a) * (v).m); \ 109 | } \ 110 | for(uint64_t _i = 0; _i < (uint64_t)(size); _i++) { \ 111 | (v).a[(v).n + _i] = (arr)[_i]; \ 112 | } \ 113 | (v).n += (uint64_t)(size); \ 114 | (v).n - (uint64_t)(size); \ 115 | }) 116 | 117 | #define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ 118 | ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ 119 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ 120 | : (v).n <= (size_t)(i)? 
(v).n = (i) \ 121 | : 0), (v).a[(i)] 122 | 123 | #define kv_reverse(type, v, start) do { \ 124 | if ((v).m > 0 && (v).n > (start)) { \ 125 | size_t __i, __end = (v).n - (start); \ 126 | type *__a = (v).a + (start); \ 127 | for (__i = 0; __i < __end>>1; ++__i) { \ 128 | type __t = __a[__end - 1 - __i]; \ 129 | __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \ 130 | } \ 131 | } \ 132 | } while (0) 133 | 134 | #define kv_foreach(type, v, _body) { \ 135 | type *p = (type *)(v).a - 1; \ 136 | type *_t = (type *)(v).a + (v).n; \ 137 | while(++p < _t) { _body; } \ 138 | } 139 | 140 | /** heap queue : elements in v must be orderd in heap */ 141 | #define kv_hq_init(v) { (v).n = (v).m = 1; (v).a = NULL; } 142 | #define kv_hq_inits(type) ((kvec_t(type)){ .n = 1, .m = 1, .a = NULL }) 143 | #define kv_hq_destroy(v) kv_destroy(v) 144 | #define kv_hq_clear(v) ( (v).n = 1 ) 145 | 146 | #define kv_hq_n(v, i) ( *((int64_t *)&v.a[i]) ) 147 | #define kv_hq_push(type, __comp, v, x) { \ 148 | kv_push(type, v, x); \ 149 | uint64_t i = (v).n - 1; \ 150 | while(i > 1 && __comp((v).a[i>>1], (v).a[i]) > 0) { \ 151 | (v).a[0] = (v).a[i>>1]; \ 152 | (v).a[i>>1] = (v).a[i]; \ 153 | (v).a[i] = (v).a[0]; \ 154 | i >>= 1; \ 155 | } \ 156 | } 157 | #define kv_hq_pop(type, __comp, v) ({ \ 158 | uint64_t i = 1, j = 2; \ 159 | (v).a[0] = (v).a[i]; \ 160 | (v).a[i] = (v).a[--(v).n]; \ 161 | (v).a[(v).n] = (v).a[0]; \ 162 | while(j < (v).n) { \ 163 | uint64_t k; \ 164 | /*k = (j + 1 < (v).n && kv_hq_n(v, j + 1) < kv_hq_n(v, j)) ? (j + 1) : j; */ \ 165 | k = (j + 1 < (v).n && __comp((v).a[j + 1], (v).a[j]) < 0) ? j + 1 : j; \ 166 | /*k = (kv_hq_n(v, k) < kv_hq_n(v, i)) ? k : 0; */ \ 167 | k = __comp((v).a[k], (v).a[i]) < 0 ? k : 0; \ 168 | if(k == 0) { break; } \ 169 | (v).a[0] = (v).a[k]; \ 170 | (v).a[k] = (v).a[i]; \ 171 | (v).a[i] = (v).a[0]; \ 172 | i = k; j = k<<1; \ 173 | } \ 174 | v.a[v.n]; \ 175 | }) 176 | #endif 177 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/v4i32.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v4i32.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V4I32_H_INCLUDED 8 | #define _V4I32_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v4i32_s { 15 | __m128i v1; 16 | } v4i32_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v4i32_1(u) 20 | #define _e_x_v4i32_2(u) 21 | 22 | /* expanders (without immediate) */ 23 | #define _e_v_v4i32_1(a) (a).v1 24 | #define _e_v_v4i32_2(a) (a).v1 25 | #define _e_vv_v4i32_1(a, b) (a).v1, (b).v1 26 | #define _e_vv_v4i32_2(a, b) (a).v1, (b).v1 27 | #define _e_vvv_v4i32_1(a, b, c) (a).v1, (b).v1, (c).v1 28 | #define _e_vvv_v4i32_2(a, b, c) (a).v1, (b).v1, (c).v1 29 | 30 | /* expanders with immediate */ 31 | #define _e_i_v4i32_1(imm) (imm) 32 | #define _e_i_v4i32_2(imm) (imm) 33 | #define _e_vi_v4i32_1(a, imm) (a).v1, (imm) 34 | #define _e_vi_v4i32_2(a, imm) (a).v1, (imm) 35 | #define _e_vvi_v4i32_1(a, b, imm) (a).v1, (b).v1, (imm) 36 | #define _e_vvi_v4i32_2(a, b, imm) (a).v1, (b).v1, (imm) 37 | 38 | /* address calculation macros */ 39 | #define _addr_v4i32_1(imm) ( (__m128i *)(imm) ) 40 | #define _addr_v4i32_2(imm) ( (__m128i *)(imm) ) 41 | #define _pv_v4i32(ptr) ( _addr_v4i32_1(ptr) ) 42 | /* expanders with pointers */ 43 | #define _e_p_v4i32_1(ptr) _addr_v4i32_1(ptr) 44 | #define _e_p_v4i32_2(ptr) 
_addr_v4i32_2(ptr) 45 | #define _e_pv_v4i32_1(ptr, a) _addr_v4i32_1(ptr), (a).v1 46 | #define _e_pv_v4i32_2(ptr, a) _addr_v4i32_2(ptr), (a).v1 47 | 48 | /* expand intrinsic name */ 49 | #define _i_v4i32(intrin) _mm_##intrin##_epi32 50 | #define _i_v4i32x(intrin) _mm_##intrin##_si128 51 | 52 | /* apply */ 53 | #define _a_v4i32(intrin, expander, ...) ( \ 54 | (v4i32_t) { \ 55 | _i_v4i32(intrin)(expander##_v4i32_1(__VA_ARGS__)) \ 56 | } \ 57 | ) 58 | #define _a_v4i32e(intrin, expander, ...) ( \ 59 | (v4i32_t) { \ 60 | _i_v4i32x(intrin)(expander##_v4i32_1(__VA_ARGS__)) \ 61 | } \ 62 | ) 63 | #define _a_v4i32ev(intrin, expander, ...) { \ 64 | _i_v4i32x(intrin)(expander##_v4i32_1(__VA_ARGS__)); \ 65 | } 66 | #define _a_v4i32x(intrin, expander, ...) ( \ 67 | (v4i32_t) { \ 68 | _i_v4i32x(intrin)(expander##_v4i32_1(__VA_ARGS__)) \ 69 | } \ 70 | ) 71 | #define _a_v4i32xv(intrin, expander, ...) { \ 72 | _i_v4i32x(intrin)(expander##_v4i32_1(__VA_ARGS__)); \ 73 | } 74 | 75 | /* load and store */ 76 | #define _load_v4i32(...) _a_v4i32e(load, _e_p, __VA_ARGS__) 77 | #define _loadu_v4i32(...) _a_v4i32e(load, _e_p, __VA_ARGS__) 78 | #define _store_v4i32(...) _a_v4i32ev(store, _e_pv, __VA_ARGS__) 79 | #define _storeu_v4i32(...) _a_v4i32ev(store, _e_pv, __VA_ARGS__) 80 | 81 | /* broadcast */ 82 | #define _set_v4i32(...) _a_v4i32(set1, _e_i, __VA_ARGS__) 83 | #define _zero_v4i32() _a_v4i32x(setzero, _e_x, _unused) 84 | #define _seta_v4i32(i4, i3, i2, i1) ( \ 85 | (v4i32_t) { \ 86 | _mm_set_epi32(i4, i3, i2, i1) \ 87 | } \ 88 | ) 89 | #define _swap_v4i32(x) ( \ 90 | (v4i32_t) { \ 91 | _mm_shuffle_epi32((x).v1, 0xe1) \ 92 | } \ 93 | ) 94 | 95 | /* logics */ 96 | #define _not_v4i32(...) _a_v4i32x(not, _e_v, __VA_ARGS__) 97 | #define _and_v4i32(...) _a_v4i32x(and, _e_vv, __VA_ARGS__) 98 | #define _or_v4i32(...) _a_v4i32x(or, _e_vv, __VA_ARGS__) 99 | #define _xor_v4i32(...) _a_v4i32x(xor, _e_vv, __VA_ARGS__) 100 | #define _andn_v4i32(...) _a_v4i32x(andnot, _e_vv, __VA_ARGS__) 101 | 102 | /* arithmetics */ 103 | #define _add_v4i32(...) _a_v4i32(add, _e_vv, __VA_ARGS__) 104 | #define _sub_v4i32(...) _a_v4i32(sub, _e_vv, __VA_ARGS__) 105 | #define _max_v4i32(...) _a_v4i32(max, _e_vv, __VA_ARGS__) 106 | #define _min_v4i32(...) _a_v4i32(min, _e_vv, __VA_ARGS__) 107 | 108 | /* blend: mask == 1 ? a : b */ 109 | #define _sel_v4i32(mask, a, b) ( \ 110 | (v4i32_t) { \ 111 | _mm_blendv_epi8((b).v1, (a).v1, (mask).v1) \ 112 | } \ 113 | ) 114 | 115 | /* compare */ 116 | #define _eq_v4i32(...) _a_v4i32(cmpeq, _e_vv, __VA_ARGS__) 117 | #define _gt_v4i32(...) 
_a_v4i32(cmpgt, _e_vv, __VA_ARGS__) 118 | 119 | /* test: take mask and test if all zero */ 120 | #define _test_v4i32(x, y) _mm_test_all_zeros((x).v1, (y).v1) 121 | 122 | /* insert and extract */ 123 | #define _ins_v4i32(a, val, imm) { \ 124 | (a).v1 = _i_v4i32((a).v1, (val), (imm)); \ 125 | } 126 | #define _ext_v4i32(a, imm) ( \ 127 | (int32_t)_i_v4i32(extract)((a).v1, (imm)) \ 128 | ) 129 | 130 | /* element shift */ 131 | #define _bsl_v4i32(a, imm) ( \ 132 | (v4i32_t) { \ 133 | _mm_slli_si128((a).v1, sizeof(int32_t) * (imm)) \ 134 | } \ 135 | ) 136 | #define _bsr_v4i32(a, imm) ( \ 137 | (v4i32_t) { \ 138 | _mm_srli_si128((a).v1, sizeof(int32_t) * (imm)) \ 139 | } \ 140 | ) 141 | 142 | /* double shift (palignr) */ 143 | #define _bsld_v4i32(a, b, imm) ( \ 144 | (v4i32_t) { \ 145 | _mm_alignr_epi8((a).v1, (b).v1, sizeof(__m128i) - sizeof(int32_t) * (imm)) \ 146 | } \ 147 | ) 148 | #define _bsrd_v4i32(a, b, imm) ( \ 149 | (v4i32_t) { \ 150 | _mm_alignr_epi8((a).v1, (b).v1, sizeof(int32_t) * (imm)) \ 151 | } \ 152 | ) 153 | 154 | /* bit shift */ 155 | #define _sal_v4i32(a, imm) ( \ 156 | (v4i32_t) {_i_v4i32(slli)((a).v1, (imm))} \ 157 | ) 158 | #define _sar_v4i32(a, imm) ( \ 159 | (v4i32_t) {_i_v4i32(srai)((a).v1, (imm))} \ 160 | ) 161 | #define _shl_v4i32(a, imm) ( \ 162 | (v4i32_t) {_i_v4i32(slli)((a).v1, (imm))} \ 163 | ) 164 | #define _shr_v4i32(a, imm) ( \ 165 | (v4i32_t) {_i_v4i32(srli)((a).v1, (imm))} \ 166 | ) 167 | 168 | /* mask */ 169 | #define _mask_v4i32(a) ( \ 170 | (uint32_t)(_mm_movemask_epi8((a).v1)) \ 171 | ) 172 | 173 | /* transpose */ 174 | #define _lo_v4i32(a, b) ( \ 175 | (v4i32_t) { \ 176 | _mm_unpacklo_epi32((a).v1, (b).v1) \ 177 | } \ 178 | ) 179 | #define _hi_v4i32(a, b) ( \ 180 | (v4i32_t) { \ 181 | _mm_shuffle_epi32(_mm_unpacklo_epi32((a).v1, (b).v1), 0x0e) \ 182 | } \ 183 | ) 184 | #define _shuf_v4i32(a, i) ( \ 185 | (v4i32_t) { \ 186 | _mm_shuffle_epi32((a).v1, i) \ 187 | } \ 188 | ) 189 | #define _km_v4i32(i4, i3, i2, i1) ( (i1) | ((i2)<<2) | ((i3)<<4) | ((i4)<<6) ) 190 | 191 | /* debug print */ 192 | #ifdef _LOG_H_INCLUDED 193 | #define _print_v4i32(a) { \ 194 | debug("(v4i32_t) %s(%d, %d, %d, %d)", #a, _ext_v4i32(a, 3), _ext_v4i32(a, 2), _ext_v4i32(a, 1), _ext_v4i32(a, 0)); \ 195 | } 196 | #else 197 | #define _print_v4i32(x) ; 198 | #endif 199 | 200 | #endif /* _V4I32_H_INCLUDED */ 201 | /** 202 | * end of v4i32.h 203 | */ 204 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/v16i16.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v16i16.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V16I16_H_INCLUDED 8 | #define _V16I16_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v16i16_s { 15 | __m128i v1; 16 | __m128i v2; 17 | } v16i16_t; 18 | 19 | /* expanders (without argument) */ 20 | #define _e_x_v16i16_1(u) 21 | #define _e_x_v16i16_2(u) 22 | 23 | /* expanders (without immediate) */ 24 | #define _e_v_v16i16_1(a) (a).v1 25 | #define _e_v_v16i16_2(a) (a).v2 26 | #define _e_vv_v16i16_1(a, b) (a).v1, (b).v1 27 | #define _e_vv_v16i16_2(a, b) (a).v2, (b).v2 28 | #define _e_vvv_v16i16_1(a, b, c) (a).v1, (b).v1, (c).v1 29 | #define _e_vvv_v16i16_2(a, b, c) (a).v2, (b).v2, (c).v2 30 | 31 | /* expanders with immediate */ 32 | #define _e_i_v16i16_1(imm) (imm) 33 | #define _e_i_v16i16_2(imm) (imm) 34 | #define _e_vi_v16i16_1(a, imm) 
(a).v1, (imm) 35 | #define _e_vi_v16i16_2(a, imm) (a).v2, (imm) 36 | #define _e_vvi_v16i16_1(a, b, imm) (a).v1, (b).v1, (imm) 37 | #define _e_vvi_v16i16_2(a, b, imm) (a).v2, (b).v2, (imm) 38 | 39 | /* address calculation macros */ 40 | #define _addr_v16i16_1(imm) ( (__m128i *)(imm) ) 41 | #define _addr_v16i16_2(imm) ( (__m128i *)(imm) + 1 ) 42 | #define _pv_v16i16(ptr) ( _addr_v16i16_1(ptr) ) 43 | /* expanders with pointers */ 44 | #define _e_p_v16i16_1(ptr) _addr_v16i16_1(ptr) 45 | #define _e_p_v16i16_2(ptr) _addr_v16i16_2(ptr) 46 | #define _e_pv_v16i16_1(ptr, a) _addr_v16i16_1(ptr), (a).v1 47 | #define _e_pv_v16i16_2(ptr, a) _addr_v16i16_2(ptr), (a).v2 48 | 49 | /* expand intrinsic name */ 50 | #define _i_v16i16(intrin) _mm_##intrin##_epi16 51 | #define _i_v16i16x(intrin) _mm_##intrin##_si128 52 | 53 | /* apply */ 54 | #define _a_v16i16(intrin, expander, ...) ( \ 55 | (v16i16_t) { \ 56 | _i_v16i16(intrin)(expander##_v16i16_1(__VA_ARGS__)), \ 57 | _i_v16i16(intrin)(expander##_v16i16_2(__VA_ARGS__)) \ 58 | } \ 59 | ) 60 | #define _a_v16i16x(intrin, expander, ...) ( \ 61 | (v16i16_t) { \ 62 | _i_v16i16x(intrin)(expander##_v16i16_1(__VA_ARGS__)), \ 63 | _i_v16i16x(intrin)(expander##_v16i16_2(__VA_ARGS__)) \ 64 | } \ 65 | ) 66 | #define _a_v16i16xv(intrin, expander, ...) { \ 67 | _i_v16i16x(intrin)(expander##_v16i16_1(__VA_ARGS__)); \ 68 | _i_v16i16x(intrin)(expander##_v16i16_2(__VA_ARGS__)); \ 69 | } 70 | 71 | /* load and store */ 72 | #define _load_v16i16(...) _a_v16i16x(load, _e_p, __VA_ARGS__) 73 | #define _loadu_v16i16(...) _a_v16i16x(loadu, _e_p, __VA_ARGS__) 74 | #define _store_v16i16(...) _a_v16i16xv(store, _e_pv, __VA_ARGS__) 75 | #define _storeu_v16i16(...) _a_v16i16xv(storeu, _e_pv, __VA_ARGS__) 76 | 77 | /* broadcast */ 78 | #define _set_v16i16(...) _a_v16i16(set1, _e_i, __VA_ARGS__) 79 | #define _zero_v16i16() _a_v16i16x(setzero, _e_x, _unused) 80 | #define _seta_v16i16(...) ( \ 81 | (v16i16_t) { \ 82 | _mm_set_epi16(__VA_ARGS__) \ 83 | } \ 84 | ) 85 | 86 | /* swap (reverse) */ 87 | #define _swap_idx_v16i16() ( \ 88 | _mm_set_epi8( \ 89 | 1, 0, 3, 2, 5, 4, 7, 6, \ 90 | 9, 8, 11, 10, 13, 12, 15, 14) \ 91 | ) 92 | #define _swap_v16i16(a) ( \ 93 | (v16i16_t) { \ 94 | _mm_shuffle_epi8((a).v2, _swap_idx_v16i16()), \ 95 | _mm_shuffle_epi8((a).v1, _swap_idx_v16i16()) \ 96 | } \ 97 | ) 98 | 99 | /* logics */ 100 | #define _not_v16i16(...) _a_v16i16x(not, _e_v, __VA_ARGS__) 101 | #define _and_v16i16(...) _a_v16i16x(and, _e_vv, __VA_ARGS__) 102 | #define _or_v16i16(...) _a_v16i16x(or, _e_vv, __VA_ARGS__) 103 | #define _xor_v16i16(...) _a_v16i16x(xor, _e_vv, __VA_ARGS__) 104 | #define _andn_v16i16(...) _a_v16i16x(andnot, _e_vv, __VA_ARGS__) 105 | 106 | /* arithmetics */ 107 | #define _add_v16i16(...) _a_v16i16(add, _e_vv, __VA_ARGS__) 108 | #define _sub_v16i16(...) _a_v16i16(sub, _e_vv, __VA_ARGS__) 109 | #define _max_v16i16(...) _a_v16i16(max, _e_vv, __VA_ARGS__) 110 | #define _min_v16i16(...) _a_v16i16(min, _e_vv, __VA_ARGS__) 111 | 112 | /* shuffle */ 113 | #define _shuf_v16i16(...) _a_v16i16(shuffle, _e_vv, __VA_ARGS__) 114 | 115 | /* blend */ 116 | // #define _sel_v16i16(...) _a_v16i16(blendv, _e_vvv, __VA_ARGS__) 117 | 118 | /* compare */ 119 | #define _eq_v16i16(...) _a_v16i16(cmpeq, _e_vv, __VA_ARGS__) 120 | #define _gt_v16i16(...) 
_a_v16i16(cmpgt, _e_vv, __VA_ARGS__) 121 | 122 | /* insert and extract */ 123 | #define _ins_v16i16(a, val, imm) { \ 124 | if((imm) < sizeof(__m128i)/sizeof(int16_t)) { \ 125 | (a).v1 = _i_v16i16(insert)((a).v1, (val), (imm)); \ 126 | } else { \ 127 | (a).v2 = _i_v16i16(insert)((a).v2, (val), (imm) - sizeof(__m128i)/sizeof(int16_t)); \ 128 | } \ 129 | } 130 | #define _ext_v16i16(a, imm) ( \ 131 | (int16_t)(((imm) < sizeof(__m128i)/sizeof(int16_t)) \ 132 | ? _i_v16i16(extract)((a).v1, (imm)) \ 133 | : _i_v16i16(extract)((a).v2, (imm) - sizeof(__m128i)/sizeof(int16_t))) \ 134 | ) 135 | 136 | /* mask */ 137 | #define _mask_v16i16(a) ( \ 138 | (v16_mask_t) { \ 139 | .m1 = _mm_movemask_epi8( \ 140 | _mm_packs_epi16((a).v1, (a).v2)) \ 141 | } \ 142 | ) 143 | 144 | /* horizontal max */ 145 | #define _hmax_v16i16(a) ({ \ 146 | __m128i _vmax = _mm_max_epi16((a).v1, (a).v2); \ 147 | _vmax = _mm_max_epi16(_vmax, \ 148 | _mm_srli_si128(_vmax, 8)); \ 149 | _vmax = _mm_max_epi16(_vmax, \ 150 | _mm_srli_si128(_vmax, 4)); \ 151 | _vmax = _mm_max_epi16(_vmax, \ 152 | _mm_srli_si128(_vmax, 2)); \ 153 | (int16_t)_mm_extract_epi16(_vmax, 0); \ 154 | }) 155 | 156 | /* convert */ 157 | #define _cvt_v16i8_v16i16(a) ( \ 158 | (v16i16_t) { \ 159 | _mm_cvtepi8_epi16((a).v1), \ 160 | _mm_cvtepi8_epi16(_mm_srli_si128((a).v1, 8)) \ 161 | } \ 162 | ) 163 | 164 | /* debug print */ 165 | #ifdef _LOG_H_INCLUDED 166 | #define _print_v16i16(a) { \ 167 | debug("(v16i16_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 168 | #a, \ 169 | _ext_v16i16(a, 15), \ 170 | _ext_v16i16(a, 14), \ 171 | _ext_v16i16(a, 13), \ 172 | _ext_v16i16(a, 12), \ 173 | _ext_v16i16(a, 11), \ 174 | _ext_v16i16(a, 10), \ 175 | _ext_v16i16(a, 9), \ 176 | _ext_v16i16(a, 8), \ 177 | _ext_v16i16(a, 7), \ 178 | _ext_v16i16(a, 6), \ 179 | _ext_v16i16(a, 5), \ 180 | _ext_v16i16(a, 4), \ 181 | _ext_v16i16(a, 3), \ 182 | _ext_v16i16(a, 2), \ 183 | _ext_v16i16(a, 1), \ 184 | _ext_v16i16(a, 0)); \ 185 | } 186 | #else 187 | #define _print_v16i16(x) ; 188 | #endif 189 | 190 | #endif /* _V16I16_H_INCLUDED */ 191 | /** 192 | * end of v16i16.h 193 | */ 194 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/v4i32.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v4i32.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V4I32_H_INCLUDED 8 | #define _V4I32_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v4i32_s { 15 | __m128i v1; 16 | } v4i32_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v4i32_1(u) 20 | #define _e_x_v4i32_2(u) 21 | 22 | /* expanders (without immediate) */ 23 | #define _e_v_v4i32_1(a) (a).v1 24 | #define _e_v_v4i32_2(a) (a).v1 25 | #define _e_vv_v4i32_1(a, b) (a).v1, (b).v1 26 | #define _e_vv_v4i32_2(a, b) (a).v1, (b).v1 27 | #define _e_vvv_v4i32_1(a, b, c) (a).v1, (b).v1, (c).v1 28 | #define _e_vvv_v4i32_2(a, b, c) (a).v1, (b).v1, (c).v1 29 | 30 | /* expanders with immediate */ 31 | #define _e_i_v4i32_1(imm) (imm) 32 | #define _e_i_v4i32_2(imm) (imm) 33 | #define _e_vi_v4i32_1(a, imm) (a).v1, (imm) 34 | #define _e_vi_v4i32_2(a, imm) (a).v1, (imm) 35 | #define _e_vvi_v4i32_1(a, b, imm) (a).v1, (b).v1, (imm) 36 | #define _e_vvi_v4i32_2(a, b, imm) (a).v1, (b).v1, (imm) 37 | 38 | /* address calculation macros */ 39 | #define _addr_v4i32_1(imm) ( (__m128i *)(imm) ) 40 | 
#define _addr_v4i32_2(imm) ( (__m128i *)(imm) ) 41 | #define _pv_v4i32(ptr) ( _addr_v4i32_1(ptr) ) 42 | /* expanders with pointers */ 43 | #define _e_p_v4i32_1(ptr) _addr_v4i32_1(ptr) 44 | #define _e_p_v4i32_2(ptr) _addr_v4i32_2(ptr) 45 | #define _e_pv_v4i32_1(ptr, a) _addr_v4i32_1(ptr), (a).v1 46 | #define _e_pv_v4i32_2(ptr, a) _addr_v4i32_2(ptr), (a).v1 47 | 48 | /* expand intrinsic name */ 49 | #define _i_v4i32(intrin) _mm_##intrin##_epi32 50 | #define _i_v4u32(intrin) _mm_##intrin##_epu32 51 | #define _i_v4i32x(intrin) _mm_##intrin##_si128 52 | 53 | /* apply */ 54 | #define _a_v4i32(intrin, expander, ...) ( \ 55 | (v4i32_t) { \ 56 | _i_v4i32(intrin)(expander##_v4i32_1(__VA_ARGS__)) \ 57 | } \ 58 | ) 59 | #define _a_v4u32(intrin, expander, ...) ( \ 60 | (v4i32_t) { \ 61 | _i_v4u32(intrin)(expander##_v4i32_1(__VA_ARGS__)) \ 62 | } \ 63 | ) 64 | #define _a_v4i32x(intrin, expander, ...) ( \ 65 | (v4i32_t) { \ 66 | _i_v4i32x(intrin)(expander##_v4i32_1(__VA_ARGS__)) \ 67 | } \ 68 | ) 69 | #define _a_v4i32xv(intrin, expander, ...) { \ 70 | _i_v4i32x(intrin)(expander##_v4i32_1(__VA_ARGS__)); \ 71 | } 72 | 73 | /* load and store */ 74 | #define _load_v4i32(...) _a_v4i32x(load, _e_p, __VA_ARGS__) 75 | #define _loadu_v4i32(...) _a_v4i32x(load, _e_p, __VA_ARGS__) 76 | #define _store_v4i32(...) _a_v4i32xv(store, _e_pv, __VA_ARGS__) 77 | #define _storeu_v4i32(...) _a_v4i32xv(store, _e_pv, __VA_ARGS__) 78 | 79 | /* broadcast */ 80 | #define _set_v4i32(...) _a_v4i32(set1, _e_i, __VA_ARGS__) 81 | #define _zero_v4i32() _a_v4i32x(setzero, _e_x, _unused) 82 | #define _seta_v4i32(i4, i3, i2, i1) ( \ 83 | (v4i32_t) { \ 84 | _mm_set_epi32(i4, i3, i2, i1) \ 85 | } \ 86 | ) 87 | #define _swap_v4i32(x) ( \ 88 | (v4i32_t) { \ 89 | _mm_shuffle_epi32((x).v1, 0xe1) \ 90 | } \ 91 | ) 92 | 93 | /* logics */ 94 | #define _not_v4i32(...) _a_v4i32x(not, _e_v, __VA_ARGS__) 95 | #define _and_v4i32(...) _a_v4i32x(and, _e_vv, __VA_ARGS__) 96 | #define _or_v4i32(...) _a_v4i32x(or, _e_vv, __VA_ARGS__) 97 | #define _xor_v4i32(...) _a_v4i32x(xor, _e_vv, __VA_ARGS__) 98 | #define _andn_v4i32(...) _a_v4i32x(andnot, _e_vv, __VA_ARGS__) 99 | 100 | /* arithmetics */ 101 | #define _add_v4i32(...) _a_v4i32(add, _e_vv, __VA_ARGS__) 102 | #define _sub_v4i32(...) _a_v4i32(sub, _e_vv, __VA_ARGS__) 103 | #define _max_v4i32(...) _a_v4i32(max, _e_vv, __VA_ARGS__) 104 | #define _min_v4i32(...) _a_v4i32(min, _e_vv, __VA_ARGS__) 105 | #define _maxu_v4i32(...) _a_v4u32(max, _e_vv, __VA_ARGS__) 106 | #define _minu_v4i32(...) _a_v4u32(min, _e_vv, __VA_ARGS__) 107 | 108 | /* blend: mask == 1 ? a : b */ 109 | #define _sel_v4i32(mask, a, b) ( \ 110 | (v4i32_t) { \ 111 | _mm_blendv_epi8((b).v1, (a).v1, (mask).v1) \ 112 | } \ 113 | ) 114 | 115 | /* compare */ 116 | #define _eq_v4i32(...) _a_v4i32(cmpeq, _e_vv, __VA_ARGS__) 117 | #define _gt_v4i32(...) 
_a_v4i32(cmpgt, _e_vv, __VA_ARGS__) 118 | 119 | /* test: take mask and test if all zero */ 120 | #define _test_v4i32(x, y) _mm_test_all_zeros((x).v1, (y).v1) 121 | 122 | /* insert and extract */ 123 | #define _ins_v4i32(a, val, imm) { \ 124 | (a).v1 = _i_v4i32((a).v1, (val), (imm)); \ 125 | } 126 | #define _ext_v4i32(a, imm) ( \ 127 | (int32_t)_i_v4i32(extract)((a).v1, (imm)) \ 128 | ) 129 | 130 | /* element shift */ 131 | #define _bsl_v4i32(a, imm) ( \ 132 | (v4i32_t) { \ 133 | _mm_slli_si128((a).v1, sizeof(int32_t) * (imm)) \ 134 | } \ 135 | ) 136 | #define _bsr_v4i32(a, imm) ( \ 137 | (v4i32_t) { \ 138 | _mm_srli_si128((a).v1, sizeof(int32_t) * (imm)) \ 139 | } \ 140 | ) 141 | 142 | /* double shift (palignr) */ 143 | #define _bsld_v4i32(a, b, imm) ( \ 144 | (v4i32_t) { \ 145 | _mm_alignr_epi8((a).v1, (b).v1, sizeof(__m128i) - sizeof(int32_t) * (imm)) \ 146 | } \ 147 | ) 148 | #define _bsrd_v4i32(a, b, imm) ( \ 149 | (v4i32_t) { \ 150 | _mm_alignr_epi8((a).v1, (b).v1, sizeof(int32_t) * (imm)) \ 151 | } \ 152 | ) 153 | 154 | /* bit shift */ 155 | #define _sal_v4i32(a, imm) ( \ 156 | (v4i32_t) {_i_v4i32(slli)((a).v1, (imm))} \ 157 | ) 158 | #define _sar_v4i32(a, imm) ( \ 159 | (v4i32_t) {_i_v4i32(srai)((a).v1, (imm))} \ 160 | ) 161 | #define _shl_v4i32(a, imm) ( \ 162 | (v4i32_t) {_i_v4i32(slli)((a).v1, (imm))} \ 163 | ) 164 | #define _shr_v4i32(a, imm) ( \ 165 | (v4i32_t) {_i_v4i32(srli)((a).v1, (imm))} \ 166 | ) 167 | 168 | /* mask */ 169 | #define _mask_v4i32(a) ( \ 170 | (uint32_t)(_mm_movemask_epi8((a).v1)) \ 171 | ) 172 | 173 | /* transpose */ 174 | #define _lo_v4i32(a, b) ( \ 175 | (v4i32_t) { \ 176 | _mm_unpacklo_epi32((a).v1, (b).v1) \ 177 | } \ 178 | ) 179 | #define _hi_v4i32(a, b) ( \ 180 | (v4i32_t) { \ 181 | _mm_shuffle_epi32(_mm_unpacklo_epi32((a).v1, (b).v1), 0x0e) \ 182 | } \ 183 | ) 184 | #define _shuf_v4i32(a, i) ( \ 185 | (v4i32_t) { \ 186 | _mm_shuffle_epi32((a).v1, i) \ 187 | } \ 188 | ) 189 | #define _km_v4i32(i4, i3, i2, i1) ( (i1) | ((i2)<<2) | ((i3)<<4) | ((i4)<<6) ) 190 | 191 | /* debug print */ 192 | #ifdef _LOG_H_INCLUDED 193 | #define _print_v4i32(a) { \ 194 | debug("(v4i32_t) %s(%d, %d, %d, %d)", #a, _ext_v4i32(a, 3), _ext_v4i32(a, 2), _ext_v4i32(a, 1), _ext_v4i32(a, 0)); \ 195 | } 196 | #else 197 | #define _print_v4i32(x) ; 198 | #endif 199 | 200 | #endif /* _V4I32_H_INCLUDED */ 201 | /** 202 | * end of v4i32.h 203 | */ 204 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/v32i16.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v32i16.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V32I16_H_INCLUDED 8 | #define _V32I16_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 16bit 32cell */ 14 | typedef struct v32i16_s { 15 | __m256i v1; 16 | __m256i v2; 17 | } v32i16_t; 18 | 19 | /* expanders (without argument) */ 20 | #define _e_x_v32i16_1(u) 21 | #define _e_x_v32i16_2(u) 22 | 23 | /* expanders (without immediate) */ 24 | #define _e_v_v32i16_1(a) (a).v1 25 | #define _e_v_v32i16_2(a) (a).v2 26 | #define _e_vv_v32i16_1(a, b) (a).v1, (b).v1 27 | #define _e_vv_v32i16_2(a, b) (a).v2, (b).v2 28 | #define _e_vvv_v32i16_1(a, b, c) (a).v1, (b).v1, (c).v1 29 | #define _e_vvv_v32i16_2(a, b, c) (a).v2, (b).v2, (c).v2 30 | 31 | /* expanders with immediate */ 32 | #define _e_i_v32i16_1(imm) (imm) 33 | #define _e_i_v32i16_2(imm) (imm) 34 | #define _e_vi_v32i16_1(a, imm) 
(a).v1, (imm) 35 | #define _e_vi_v32i16_2(a, imm) (a).v2, (imm) 36 | #define _e_vvi_v32i16_1(a, b, imm) (a).v1, (b).v1, (imm) 37 | #define _e_vvi_v32i16_2(a, b, imm) (a).v2, (b).v2, (imm) 38 | 39 | /* address calculation macros */ 40 | #define _addr_v32i16_1(imm) ( (__m256i *)(imm) ) 41 | #define _addr_v32i16_2(imm) ( (__m256i *)(imm) + 1 ) 42 | #define _pv_v32i16(ptr) ( _addr_v32i16_1(ptr) ) 43 | /* expanders with pointers */ 44 | #define _e_p_v32i16_1(ptr) _addr_v32i16_1(ptr) 45 | #define _e_p_v32i16_2(ptr) _addr_v32i16_2(ptr) 46 | #define _e_pv_v32i16_1(ptr, a) _addr_v32i16_1(ptr), (a).v1 47 | #define _e_pv_v32i16_2(ptr, a) _addr_v32i16_2(ptr), (a).v2 48 | 49 | /* expand intrinsic name */ 50 | #define _i_v32i16(intrin) _mm256_##intrin##_epi16 51 | #define _i_v32i16x(intrin) _mm256_##intrin##_si256 52 | 53 | /* apply */ 54 | #define _a_v32i16(intrin, expander, ...) ( \ 55 | (v32i16_t) { \ 56 | _i_v32i16(intrin)(expander##_v32i16_1(__VA_ARGS__)), \ 57 | _i_v32i16(intrin)(expander##_v32i16_2(__VA_ARGS__)) \ 58 | } \ 59 | ) 60 | #define _a_v32i16x(intrin, expander, ...) ( \ 61 | (v32i16_t) { \ 62 | _i_v32i16x(intrin)(expander##_v32i16_1(__VA_ARGS__)), \ 63 | _i_v32i16x(intrin)(expander##_v32i16_2(__VA_ARGS__)) \ 64 | } \ 65 | ) 66 | #define _a_v32i16xv(intrin, expander, ...) { \ 67 | _i_v32i16x(intrin)(expander##_v32i16_1(__VA_ARGS__)); \ 68 | _i_v32i16x(intrin)(expander##_v32i16_2(__VA_ARGS__)); \ 69 | } 70 | 71 | /* load and store */ 72 | #define _load_v32i16(...) _a_v32i16x(load, _e_p, __VA_ARGS__) 73 | #define _loadu_v32i16(...) _a_v32i16x(loadu, _e_p, __VA_ARGS__) 74 | #define _store_v32i16(...) _a_v32i16xv(store, _e_pv, __VA_ARGS__) 75 | #define _storeu_v32i16(...) _a_v32i16xv(storeu, _e_pv, __VA_ARGS__) 76 | 77 | /* broadcast */ 78 | #define _set_v32i16(...) _a_v32i16(set1, _e_i, __VA_ARGS__) 79 | #define _zero_v32i16() _a_v32i16x(setzero, _e_x, _unused) 80 | 81 | /* logics */ 82 | #define _not_v32i16(...) _a_v32i16x(not, _e_v, __VA_ARGS__) 83 | #define _and_v32i16(...) _a_v32i16x(and, _e_vv, __VA_ARGS__) 84 | #define _or_v32i16(...) _a_v32i16x(or, _e_vv, __VA_ARGS__) 85 | #define _xor_v32i16(...) _a_v32i16x(xor, _e_vv, __VA_ARGS__) 86 | #define _andn_v32i16(...) _a_v32i16x(andnot, _e_vv, __VA_ARGS__) 87 | 88 | /* arithmetics */ 89 | #define _add_v32i16(...) _a_v32i16(add, _e_vv, __VA_ARGS__) 90 | #define _sub_v32i16(...) _a_v32i16(sub, _e_vv, __VA_ARGS__) 91 | #define _max_v32i16(...) _a_v32i16(max, _e_vv, __VA_ARGS__) 92 | #define _min_v32i16(...) _a_v32i16(min, _e_vv, __VA_ARGS__) 93 | 94 | /* compare */ 95 | #define _eq_v32i16(...) _a_v32i16(cmpeq, _e_vv, __VA_ARGS__) 96 | #define _gt_v32i16(...) _a_v32i16(cmpgt, _e_vv, __VA_ARGS__) 97 | 98 | /* insert and extract */ 99 | #define _ins_v32i16(a, val, imm) { \ 100 | if((imm) < sizeof(__m256i)/sizeof(int16_t)) { \ 101 | (a).v1 = _i_v32i16(insert)((a).v1, (val), (imm)); \ 102 | } else if((imm) < 2*sizeof(__m256i)/sizeof(int16_t)) { \ 103 | (a).v2 = _i_v32i16(insert)((a).v2, (val), (imm) - sizeof(__m256i)/sizeof(int16_t)); \ 104 | } \ 105 | } 106 | #define _ext_v32i16(a, imm) ( \ 107 | (int16_t)(((imm) < sizeof(__m256i)/sizeof(int16_t)) \ 108 | ? 
_i_v32i16(extract)((a).v1, (imm)) \ 109 | : _i_v32i16(extract)((a).v2, (imm) - sizeof(__m256i)/sizeof(int16_t))) \ 110 | ) 111 | 112 | /* mask */ 113 | #define _mask_v32i16(a) ( \ 114 | (v32_mask_t) { \ 115 | .m1 = _mm256_movemask_epi8( \ 116 | _mm256_permute4x64_epi64( \ 117 | _mm256_packs_epi16((a).v1, (a).v2), 0xd8)) \ 118 | } \ 119 | ) 120 | 121 | /* horizontal max (reduction max) */ 122 | #define _hmax_v32i16(a) ({ \ 123 | __m256i _s = _mm256_max_epi16((a).v1, (a).v2); \ 124 | __m128i _t = _mm_max_epi16( \ 125 | _mm256_castsi256_si128(_s), \ 126 | _mm256_extracti128_si256(_s, 1) \ 127 | ); \ 128 | _t = _mm_max_epi16(_t, _mm_srli_si128(_t, 8)); \ 129 | _t = _mm_max_epi16(_t, _mm_srli_si128(_t, 4)); \ 130 | _t = _mm_max_epi16(_t, _mm_srli_si128(_t, 2)); \ 131 | (int16_t)_mm_extract_epi16(_t, 0); \ 132 | }) 133 | 134 | #define _cvt_v32i8_v32i16(a) ( \ 135 | (v32i16_t) { \ 136 | _mm256_cvtepi8_epi16(_mm256_castsi256_si128((a).v1)), \ 137 | _mm256_cvtepi8_epi16(_mm256_extracti128_si256((a).v1, 1)) \ 138 | } \ 139 | ) 140 | 141 | /* debug print */ 142 | // #ifdef _LOG_H_INCLUDED 143 | #define _print_v32i16(a) { \ 144 | debug("(v32i16_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 145 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 146 | #a, \ 147 | _ext_v32i16(a, 31), \ 148 | _ext_v32i16(a, 30), \ 149 | _ext_v32i16(a, 29), \ 150 | _ext_v32i16(a, 28), \ 151 | _ext_v32i16(a, 27), \ 152 | _ext_v32i16(a, 26), \ 153 | _ext_v32i16(a, 25), \ 154 | _ext_v32i16(a, 24), \ 155 | _ext_v32i16(a, 23), \ 156 | _ext_v32i16(a, 22), \ 157 | _ext_v32i16(a, 21), \ 158 | _ext_v32i16(a, 20), \ 159 | _ext_v32i16(a, 19), \ 160 | _ext_v32i16(a, 18), \ 161 | _ext_v32i16(a, 17), \ 162 | _ext_v32i16(a, 16), \ 163 | _ext_v32i16(a, 15), \ 164 | _ext_v32i16(a, 14), \ 165 | _ext_v32i16(a, 13), \ 166 | _ext_v32i16(a, 12), \ 167 | _ext_v32i16(a, 11), \ 168 | _ext_v32i16(a, 10), \ 169 | _ext_v32i16(a, 9), \ 170 | _ext_v32i16(a, 8), \ 171 | _ext_v32i16(a, 7), \ 172 | _ext_v32i16(a, 6), \ 173 | _ext_v32i16(a, 5), \ 174 | _ext_v32i16(a, 4), \ 175 | _ext_v32i16(a, 3), \ 176 | _ext_v32i16(a, 2), \ 177 | _ext_v32i16(a, 1), \ 178 | _ext_v32i16(a, 0)); \ 179 | } 180 | // #else 181 | // #define _print_v32i16(x) ; 182 | // #endif 183 | 184 | #endif /* _V32I16_H_INCLUDED */ 185 | /** 186 | * end of v32i16.h 187 | */ 188 | -------------------------------------------------------------------------------- /arch/vector_alias.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file vector_alias.h 4 | * 5 | * @brief make alias to vector macros 6 | */ 7 | #if !defined(_NVEC_ALIAS_PREFIX) || !defined(_WVEC_ALIAS_PREFIX) 8 | #error "_NVEC_ALIAS_PREFIX and _WVEC_ALIAS_PREFIX must be defined when alias.h is included." 
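/*
 * usage sketch (assumed, for illustration): the _n / _w aliases below are
 * produced by token pasting, so an including translation unit is expected to
 * pick its narrow / wide vector pair before including this header, e.g.
 *
 *   #define _NVEC_ALIAS_PREFIX   v16i8    // narrow lane: 16 x 8bit
 *   #define _WVEC_ALIAS_PREFIX   v16i16   // wide lane:   16 x 16bit
 *   #include "vector_alias.h"
 *
 *   // with these prefixes, _add_n(a, b) expands to _add_v16i8(a, b),
 *   // _add_w(a, b) to _add_v16i16(a, b), and _cvt_n_w(a) to
 *   // _cvt_v16i8_v16i16(a); the actual prefix pair is chosen by the caller.
 */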
9 | #else /* _VECTOR_ALIAS_PREFIX */ 10 | 11 | #define nvec_prefix _NVEC_ALIAS_PREFIX /* prefix for narrow variables */ 12 | #define wvec_prefix _WVEC_ALIAS_PREFIX /* prefix for wide variables */ 13 | 14 | /* join macros */ 15 | #define _vec_alias_join2_intl(a,b) a##b 16 | #define _vec_alias_join2(a,b) _vec_alias_join2_intl(a,b) 17 | 18 | #define _vec_alias_join3_intl(a,b,c) a##b##_##c 19 | #define _vec_alias_join3(a,b,c) _vec_alias_join3_intl(a,b,c) 20 | 21 | 22 | /* types */ 23 | #define nvec_t _vec_alias_join2(nvec_prefix, _t) 24 | #define nvec_mask_t _vec_alias_join2(nvec_prefix, _mask_t) 25 | #define nvec_masku_t _vec_alias_join2(nvec_prefix, _masku_t) 26 | 27 | #define wvec_t _vec_alias_join2(wvec_prefix, _t) 28 | #define wvec_mask_t _vec_alias_join2(wvec_prefix, _mask_t) 29 | #define wvec_masku_t _vec_alias_join2(wvec_prefix, _masku_t) 30 | 31 | /* address */ 32 | #define _pv_n _vec_alias_join2(_pv_, nvec_prefix) 33 | #define _pv_w _vec_alias_join2(_pv_, wvec_prefix) 34 | 35 | /* load and store */ 36 | #define _load_n _vec_alias_join2(_load_, nvec_prefix) 37 | #define _loadu_n _vec_alias_join2(_loadu_, nvec_prefix) 38 | #define _store_n _vec_alias_join2(_store_, nvec_prefix) 39 | #define _storeu_n _vec_alias_join2(_storeu_, nvec_prefix) 40 | 41 | #define _load_w _vec_alias_join2(_load_, wvec_prefix) 42 | #define _loadu_w _vec_alias_join2(_loadu_, wvec_prefix) 43 | #define _store_w _vec_alias_join2(_store_, wvec_prefix) 44 | #define _storeu_w _vec_alias_join2(_storeu_, wvec_prefix) 45 | 46 | /* broadcast */ 47 | #define _set_n _vec_alias_join2(_set_, nvec_prefix) 48 | #define _zero_n _vec_alias_join2(_zero_, nvec_prefix) 49 | #define _seta_n _vec_alias_join2(_seta_, nvec_prefix) 50 | #define _swap_n _vec_alias_join2(_swap_, nvec_prefix) 51 | 52 | #define _set_w _vec_alias_join2(_set_, wvec_prefix) 53 | #define _zero_w _vec_alias_join2(_zero_, wvec_prefix) 54 | #define _seta_w _vec_alias_join2(_seta_, wvec_prefix) 55 | #define _swap_w _vec_alias_join2(_swap_, wvec_prefix) 56 | 57 | /* logics */ 58 | #define _not_n _vec_alias_join2(_not_, nvec_prefix) 59 | #define _and_n _vec_alias_join2(_and_, nvec_prefix) 60 | #define _or_n _vec_alias_join2(_or_, nvec_prefix) 61 | #define _xor_n _vec_alias_join2(_xor_, nvec_prefix) 62 | #define _andn_n _vec_alias_join2(_andn_, nvec_prefix) 63 | 64 | #define _not_w _vec_alias_join2(_not_, wvec_prefix) 65 | #define _and_w _vec_alias_join2(_and_, wvec_prefix) 66 | #define _or_w _vec_alias_join2(_or_, wvec_prefix) 67 | #define _xor_w _vec_alias_join2(_xor_, wvec_prefix) 68 | #define _andn_w _vec_alias_join2(_andn_, wvec_prefix) 69 | 70 | /* arithmetics */ 71 | #define _add_n _vec_alias_join2(_add_, nvec_prefix) 72 | #define _sub_n _vec_alias_join2(_sub_, nvec_prefix) 73 | #define _adds_n _vec_alias_join2(_adds_, nvec_prefix) 74 | #define _subs_n _vec_alias_join2(_subs_, nvec_prefix) 75 | #define _addus_n _vec_alias_join2(_addus_, nvec_prefix) 76 | #define _subus_n _vec_alias_join2(_subus_, nvec_prefix) 77 | #define _max_n _vec_alias_join2(_max_, nvec_prefix) 78 | #define _min_n _vec_alias_join2(_min_, nvec_prefix) 79 | 80 | #define _add_w _vec_alias_join2(_add_, wvec_prefix) 81 | #define _sub_w _vec_alias_join2(_sub_, wvec_prefix) 82 | #define _adds_w _vec_alias_join2(_adds_, wvec_prefix) 83 | #define _subs_w _vec_alias_join2(_subs_, wvec_prefix) 84 | #define _max_w _vec_alias_join2(_max_, wvec_prefix) 85 | #define _min_w _vec_alias_join2(_min_, wvec_prefix) 86 | 87 | /* shuffle */ 88 | #define _shuf_n _vec_alias_join2(_shuf_, nvec_prefix) 89 | #define 
_shuf_w _vec_alias_join2(_shuf_, wvec_prefix) 90 | 91 | /* blend */ 92 | #define _sel_n _vec_alias_join2(_sel_, nvec_prefix) 93 | #define _sel_w _vec_alias_join2(_sel_, wvec_prefix) 94 | 95 | /* compare */ 96 | #define _eq_n _vec_alias_join2(_eq_, nvec_prefix) 97 | #define _lt_n _vec_alias_join2(_lt_, nvec_prefix) 98 | #define _gt_n _vec_alias_join2(_gt_, nvec_prefix) 99 | 100 | #define _eq_w _vec_alias_join2(_eq_, wvec_prefix) 101 | #define _lt_w _vec_alias_join2(_lt_, wvec_prefix) 102 | #define _gt_w _vec_alias_join2(_gt_, wvec_prefix) 103 | 104 | /* insert and extract */ 105 | #define _ins_n _vec_alias_join2(_ins_, nvec_prefix) 106 | #define _ext_n _vec_alias_join2(_ext_, nvec_prefix) 107 | 108 | #define _ins_w _vec_alias_join2(_ins_, wvec_prefix) 109 | #define _ext_w _vec_alias_join2(_ext_, wvec_prefix) 110 | 111 | /* shift */ 112 | #define _bsl_n _vec_alias_join2(_bsl_, nvec_prefix) 113 | #define _bsr_n _vec_alias_join2(_bsr_, nvec_prefix) 114 | #define _bsld_n _vec_alias_join2(_bsld_, nvec_prefix) 115 | #define _bsrd_n _vec_alias_join2(_bsrd_, nvec_prefix) 116 | #define _shl_n _vec_alias_join2(_shl_, nvec_prefix) 117 | #define _shr_n _vec_alias_join2(_shr_, nvec_prefix) 118 | #define _sal_n _vec_alias_join2(_sal_, nvec_prefix) 119 | #define _sar_n _vec_alias_join2(_sar_, nvec_prefix) 120 | 121 | #define _bsl_w _vec_alias_join2(_bsl_, wvec_prefix) 122 | #define _bsr_w _vec_alias_join2(_bsr_, wvec_prefix) 123 | #define _bsld_w _vec_alias_join2(_bsld_, wvec_prefix) 124 | #define _bsrd_w _vec_alias_join2(_bsrd_, wvec_prefix) 125 | #define _shl_w _vec_alias_join2(_shl_, wvec_prefix) 126 | #define _shr_w _vec_alias_join2(_shr_, wvec_prefix) 127 | #define _sal_w _vec_alias_join2(_sal_, wvec_prefix) 128 | #define _sar_w _vec_alias_join2(_sar_, wvec_prefix) 129 | 130 | /* mask */ 131 | #define _mask_n _vec_alias_join2(_mask_, nvec_prefix) 132 | #define _mask_w _vec_alias_join2(_mask_, wvec_prefix) 133 | 134 | /* horizontal max */ 135 | #define _hmax_n _vec_alias_join2(_hmax_, nvec_prefix) 136 | #define _hmax_w _vec_alias_join2(_hmax_, wvec_prefix) 137 | 138 | /* broadcast */ 139 | #define _from_v16i8_n _vec_alias_join2(_from_v16i8_, nvec_prefix) 140 | #define _from_v32i8_n _vec_alias_join2(_from_v32i8_, nvec_prefix) 141 | #define _to_v16i8_n _vec_alias_join2(_to_v16i8_, nvec_prefix) 142 | #define _to_v32i8_n _vec_alias_join2(_to_v32i8_, nvec_prefix) 143 | 144 | /* convert */ 145 | #define _cvt_n_w _vec_alias_join3(_cvt_, nvec_prefix, wvec_prefix) 146 | #define _cvt_w_n _vec_alias_join3(_cvt_, wvec_prefix, nvec_prefix) 147 | 148 | /* print */ 149 | #define _print_n _vec_alias_join2(_print_, nvec_prefix) 150 | #define _print_w _vec_alias_join2(_print_, wvec_prefix) 151 | 152 | #endif /* _VECTOR_ALIAS_PREFIX */ 153 | /** 154 | * end of vector_alias.h 155 | */ 156 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/v2i32.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v2i32.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V2I32_H_INCLUDED 8 | #define _V2I32_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v2i32_s { 15 | __m128i v1; 16 | } v2i32_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v2i32_1(u) 20 | #define _e_x_v2i32_2(u) 21 | 22 | /* expanders (without immediate) */ 23 | #define _e_v_v2i32_1(a) (a).v1 24 | #define 
_e_v_v2i32_2(a) (a).v1 25 | #define _e_vv_v2i32_1(a, b) (a).v1, (b).v1 26 | #define _e_vv_v2i32_2(a, b) (a).v1, (b).v1 27 | #define _e_vvv_v2i32_1(a, b, c) (a).v1, (b).v1, (c).v1 28 | #define _e_vvv_v2i32_2(a, b, c) (a).v1, (b).v1, (c).v1 29 | 30 | /* expanders with immediate */ 31 | #define _e_i_v2i32_1(imm) (imm) 32 | #define _e_i_v2i32_2(imm) (imm) 33 | #define _e_vi_v2i32_1(a, imm) (a).v1, (imm) 34 | #define _e_vi_v2i32_2(a, imm) (a).v1, (imm) 35 | #define _e_vvi_v2i32_1(a, b, imm) (a).v1, (b).v1, (imm) 36 | #define _e_vvi_v2i32_2(a, b, imm) (a).v1, (b).v1, (imm) 37 | 38 | /* address calculation macros */ 39 | #define _addr_v2i32_1(imm) ( (__m128i *)(imm) ) 40 | #define _addr_v2i32_2(imm) ( (__m128i *)(imm) ) 41 | #define _pv_v2i32(ptr) ( _addr_v2i32_1(ptr) ) 42 | /* expanders with pointers */ 43 | #define _e_p_v2i32_1(ptr) _addr_v2i32_1(ptr) 44 | #define _e_p_v2i32_2(ptr) _addr_v2i32_2(ptr) 45 | #define _e_pv_v2i32_1(ptr, a) _addr_v2i32_1(ptr), (a).v1 46 | #define _e_pv_v2i32_2(ptr, a) _addr_v2i32_2(ptr), (a).v1 47 | 48 | /* expand intrinsic name */ 49 | #define _i_v2i32(intrin) _mm_##intrin##_epi32 50 | #define _i_v2i32e(intrin) _mm_##intrin##_epi64 51 | #define _i_v2i32x(intrin) _mm_##intrin##_si128 52 | 53 | /* apply */ 54 | #define _a_v2i32(intrin, expander, ...) ( \ 55 | (v2i32_t) { \ 56 | _i_v2i32(intrin)(expander##_v2i32_1(__VA_ARGS__)) \ 57 | } \ 58 | ) 59 | #define _a_v2i32e(intrin, expander, ...) ( \ 60 | (v2i32_t) { \ 61 | _i_v2i32e(intrin)(expander##_v2i32_1(__VA_ARGS__)) \ 62 | } \ 63 | ) 64 | #define _a_v2i32ev(intrin, expander, ...) { \ 65 | _i_v2i32e(intrin)(expander##_v2i32_1(__VA_ARGS__)); \ 66 | } 67 | #define _a_v2i32x(intrin, expander, ...) ( \ 68 | (v2i32_t) { \ 69 | _i_v2i32x(intrin)(expander##_v2i32_1(__VA_ARGS__)) \ 70 | } \ 71 | ) 72 | #define _a_v2i32xv(intrin, expander, ...) { \ 73 | _i_v2i32x(intrin)(expander##_v2i32_1(__VA_ARGS__)); \ 74 | } 75 | 76 | /* load and store */ 77 | #define _load_v2i32(...) _a_v2i32e(loadl, _e_p, __VA_ARGS__) 78 | #define _loadu_v2i32(...) _a_v2i32e(loadl, _e_p, __VA_ARGS__) 79 | #define _store_v2i32(...) _a_v2i32ev(storel, _e_pv, __VA_ARGS__) 80 | #define _storeu_v2i32(...) _a_v2i32ev(storel, _e_pv, __VA_ARGS__) 81 | 82 | /* broadcast */ 83 | #define _set_v2i32(...) _a_v2i32(set1, _e_i, __VA_ARGS__) 84 | #define _zero_v2i32() _a_v2i32x(setzero, _e_x, _unused) 85 | #define _seta_v2i32(x, y) ( \ 86 | (v2i32_t) { \ 87 | _mm_cvtsi64_si128((((uint64_t)(x))<<32) | ((uint32_t)(y))) \ 88 | } \ 89 | ) 90 | #define _swap_v2i32(x) ( \ 91 | (v2i32_t) { \ 92 | _mm_shuffle_epi32((x).v1, 0xe1) \ 93 | } \ 94 | ) 95 | 96 | /* logics */ 97 | #define _not_v2i32(...) _a_v2i32x(not, _e_v, __VA_ARGS__) 98 | #define _and_v2i32(...) _a_v2i32x(and, _e_vv, __VA_ARGS__) 99 | #define _or_v2i32(...) _a_v2i32x(or, _e_vv, __VA_ARGS__) 100 | #define _xor_v2i32(...) _a_v2i32x(xor, _e_vv, __VA_ARGS__) 101 | #define _andn_v2i32(...) _a_v2i32x(andnot, _e_vv, __VA_ARGS__) 102 | 103 | /* arithmetics */ 104 | #define _add_v2i32(...) _a_v2i32(add, _e_vv, __VA_ARGS__) 105 | #define _sub_v2i32(...) _a_v2i32(sub, _e_vv, __VA_ARGS__) 106 | #define _mul_v2i32(...) _a_v2i32(mul, _e_vv, __VA_ARGS__) 107 | #define _max_v2i32(...) _a_v2i32(max, _e_vv, __VA_ARGS__) 108 | #define _min_v2i32(...) _a_v2i32(min, _e_vv, __VA_ARGS__) 109 | 110 | /* blend: (mask & b) | (~mask & a) */ 111 | #define _sel_v2i32(mask, a, b) ( \ 112 | (v2i32_t) { \ 113 | _mm_blendv_epi8((b).v1, (a).v1, (mask).v1) \ 114 | } \ 115 | ) 116 | 117 | /* compare */ 118 | #define _eq_v2i32(...) 
_a_v2i32(cmpeq, _e_vv, __VA_ARGS__) 119 | #define _gt_v2i32(...) _a_v2i32(cmpgt, _e_vv, __VA_ARGS__) 120 | 121 | /* test: take mask and test if all zero */ 122 | #define _test_v2i32(x, y) _mm_test_all_zeros((x).v1, (y).v1) 123 | 124 | /* insert and extract */ 125 | #define _ins_v2i32(a, val, imm) { \ 126 | (a).v1 = _i_v2i32((a).v1, (val), (imm)); \ 127 | } 128 | #define _ext_v2i32(a, imm) ( \ 129 | (int32_t)_i_v2i32(extract)((a).v1, (imm)) \ 130 | ) 131 | 132 | /* shift */ 133 | #define _sal_v2i32(a, imm) ( \ 134 | (v2i32_t) {_i_v2i32(slli)((a).v1, (imm))} \ 135 | ) 136 | #define _sar_v2i32(a, imm) ( \ 137 | (v2i32_t) {_i_v2i32(srai)((a).v1, (imm))} \ 138 | ) 139 | #define _shl_v2i32(a, imm) ( \ 140 | (v2i32_t) {_i_v2i32(slli)((a).v1, (imm))} \ 141 | ) 142 | #define _shr_v2i32(a, imm) ( \ 143 | (v2i32_t) {_i_v2i32(srli)((a).v1, (imm))} \ 144 | ) 145 | 146 | /* mask */ 147 | #define _mask_v2i32(a) ( \ 148 | (uint32_t) (0xff & _mm_movemask_epi8((a).v1)) \ 149 | ) 150 | #define V2I32_MASK_00 ( 0x00 ) 151 | #define V2I32_MASK_01 ( 0x0f ) 152 | #define V2I32_MASK_10 ( 0xf0 ) 153 | #define V2I32_MASK_11 ( 0xff ) 154 | 155 | /* convert */ 156 | typedef uint64_t v2i8_t; 157 | #define _load_v2i8(p) ({ \ 158 | uint8_t const *_p = (uint8_t const *)(p); \ 159 | *((uint16_t const *)(_p)); \ 160 | }) 161 | #define _store_v2i8(p, v) { \ 162 | uint8_t *_p = (uint8_t *)(p); \ 163 | *((uint16_t *)_p) = (v); \ 164 | } 165 | #define _cvt_v2i8_v2i32(a) ( \ 166 | (v2i32_t) { _mm_cvtepi8_epi32(_mm_cvtsi64_si128(a)) } \ 167 | ) 168 | #define _cvt_v2i32_v2i8(a) ( \ 169 | (uint16_t)_mm_cvtsi128_si64(_mm_shuffle_epi8((a).v1, _mm_cvtsi64_si128(0x0400))) \ 170 | ) 171 | #define _cvt_u64_v2i32(a) ( \ 172 | (v2i32_t){ _mm_cvtsi64_si128(a) } \ 173 | ) 174 | #define _cvt_v2i32_u64(a) ( \ 175 | (uint64_t)_mm_cvtsi128_si64((a).v1) \ 176 | ) 177 | #define _cvt_v2i64_v2i32(a) ( \ 178 | (v2i32_t) { \ 179 | _mm_shuffle_epi32((a).v1, 0xd8) \ 180 | } \ 181 | ) 182 | #define _cvth_v2i64_v2i32(a) ( \ 183 | (v2i32_t) { \ 184 | _mm_shuffle_epi32((a).v1, 0x8d) \ 185 | } \ 186 | ) 187 | 188 | /* transpose */ 189 | #define _lo_v2i32(a, b) ( \ 190 | (v2i32_t) { \ 191 | _mm_unpacklo_epi32((a).v1, (b).v1) \ 192 | } \ 193 | ) 194 | #define _hi_v2i32(a, b) ( \ 195 | (v2i32_t) { \ 196 | _mm_shuffle_epi32(_mm_unpacklo_epi32((a).v1, (b).v1), 0x0e) \ 197 | } \ 198 | ) 199 | 200 | /* debug print */ 201 | // #ifdef _LOG_H_INCLUDED 202 | #define _print_v2i32(a) { \ 203 | debug("(v2i32_t) %s(%d, %d)", #a, _ext_v2i32(a, 1), _ext_v2i32(a, 0)); \ 204 | } 205 | // #else 206 | // #define _print_v2i32(x) ; 207 | // #endif 208 | 209 | #endif /* _V2I32_H_INCLUDED */ 210 | /** 211 | * end of v2i32.h 212 | */ 213 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/v2i32.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v2i32.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V2I32_H_INCLUDED 8 | #define _V2I32_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v2i32_s { 15 | __m128i v1; 16 | } v2i32_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v2i32_1(u) 20 | #define _e_x_v2i32_2(u) 21 | 22 | /* expanders (without immediate) */ 23 | #define _e_v_v2i32_1(a) (a).v1 24 | #define _e_v_v2i32_2(a) (a).v1 25 | #define _e_vv_v2i32_1(a, b) (a).v1, (b).v1 26 | #define _e_vv_v2i32_2(a, b) (a).v1, (b).v1 27 | 
#define _e_vvv_v2i32_1(a, b, c) (a).v1, (b).v1, (c).v1 28 | #define _e_vvv_v2i32_2(a, b, c) (a).v1, (b).v1, (c).v1 29 | 30 | /* expanders with immediate */ 31 | #define _e_i_v2i32_1(imm) (imm) 32 | #define _e_i_v2i32_2(imm) (imm) 33 | #define _e_vi_v2i32_1(a, imm) (a).v1, (imm) 34 | #define _e_vi_v2i32_2(a, imm) (a).v1, (imm) 35 | #define _e_vvi_v2i32_1(a, b, imm) (a).v1, (b).v1, (imm) 36 | #define _e_vvi_v2i32_2(a, b, imm) (a).v1, (b).v1, (imm) 37 | 38 | /* address calculation macros */ 39 | #define _addr_v2i32_1(imm) ( (__m128i *)(imm) ) 40 | #define _addr_v2i32_2(imm) ( (__m128i *)(imm) ) 41 | #define _pv_v2i32(ptr) ( _addr_v2i32_1(ptr) ) 42 | /* expanders with pointers */ 43 | #define _e_p_v2i32_1(ptr) _addr_v2i32_1(ptr) 44 | #define _e_p_v2i32_2(ptr) _addr_v2i32_2(ptr) 45 | #define _e_pv_v2i32_1(ptr, a) _addr_v2i32_1(ptr), (a).v1 46 | #define _e_pv_v2i32_2(ptr, a) _addr_v2i32_2(ptr), (a).v1 47 | 48 | /* expand intrinsic name */ 49 | #define _i_v2i32(intrin) _mm_##intrin##_epi32 50 | #define _i_v2i32e(intrin) _mm_##intrin##_epi64 51 | #define _i_v2i32x(intrin) _mm_##intrin##_si128 52 | 53 | /* apply */ 54 | #define _a_v2i32(intrin, expander, ...) ( \ 55 | (v2i32_t) { \ 56 | _i_v2i32(intrin)(expander##_v2i32_1(__VA_ARGS__)) \ 57 | } \ 58 | ) 59 | #define _a_v2i32e(intrin, expander, ...) ( \ 60 | (v2i32_t) { \ 61 | _i_v2i32e(intrin)(expander##_v2i32_1(__VA_ARGS__)) \ 62 | } \ 63 | ) 64 | #define _a_v2i32ev(intrin, expander, ...) { \ 65 | _i_v2i32e(intrin)(expander##_v2i32_1(__VA_ARGS__)); \ 66 | } 67 | #define _a_v2i32x(intrin, expander, ...) ( \ 68 | (v2i32_t) { \ 69 | _i_v2i32x(intrin)(expander##_v2i32_1(__VA_ARGS__)) \ 70 | } \ 71 | ) 72 | #define _a_v2i32xv(intrin, expander, ...) { \ 73 | _i_v2i32x(intrin)(expander##_v2i32_1(__VA_ARGS__)); \ 74 | } 75 | 76 | /* load and store */ 77 | #define _load_v2i32(...) _a_v2i32e(loadl, _e_p, __VA_ARGS__) 78 | #define _loadu_v2i32(...) _a_v2i32e(loadl, _e_p, __VA_ARGS__) 79 | #define _store_v2i32(...) _a_v2i32ev(storel, _e_pv, __VA_ARGS__) 80 | #define _storeu_v2i32(...) _a_v2i32ev(storel, _e_pv, __VA_ARGS__) 81 | 82 | /* broadcast */ 83 | #define _set_v2i32(...) _a_v2i32(set1, _e_i, __VA_ARGS__) 84 | #define _zero_v2i32() _a_v2i32x(setzero, _e_x, _unused) 85 | #define _seta_v2i32(x, y) ( \ 86 | (v2i32_t) { \ 87 | _mm_cvtsi64_si128((((uint64_t)(x))<<32) | ((uint32_t)(y))) \ 88 | } \ 89 | ) 90 | #define _swap_v2i32(x) ( \ 91 | (v2i32_t) { \ 92 | _mm_shuffle_epi32((x).v1, 0xe1) \ 93 | } \ 94 | ) 95 | 96 | /* logics */ 97 | #define _not_v2i32(...) _a_v2i32x(not, _e_v, __VA_ARGS__) 98 | #define _and_v2i32(...) _a_v2i32x(and, _e_vv, __VA_ARGS__) 99 | #define _or_v2i32(...) _a_v2i32x(or, _e_vv, __VA_ARGS__) 100 | #define _xor_v2i32(...) _a_v2i32x(xor, _e_vv, __VA_ARGS__) 101 | #define _andn_v2i32(...) _a_v2i32x(andnot, _e_vv, __VA_ARGS__) 102 | 103 | /* arithmetics */ 104 | #define _add_v2i32(...) _a_v2i32(add, _e_vv, __VA_ARGS__) 105 | #define _sub_v2i32(...) _a_v2i32(sub, _e_vv, __VA_ARGS__) 106 | #define _mul_v2i32(...) _a_v2i32(mul, _e_vv, __VA_ARGS__) 107 | #define _max_v2i32(...) _a_v2i32(max, _e_vv, __VA_ARGS__) 108 | #define _min_v2i32(...) _a_v2i32(min, _e_vv, __VA_ARGS__) 109 | 110 | /* blend: mask == 1 ? a : b */ 111 | #define _sel_v2i32(mask, a, b) ( \ 112 | (v2i32_t) { \ 113 | _mm_blendv_epi8((b).v1, (a).v1, (mask).v1) \ 114 | } \ 115 | ) 116 | 117 | /* compare */ 118 | #define _eq_v2i32(...) _a_v2i32(cmpeq, _e_vv, __VA_ARGS__) 119 | #define _gt_v2i32(...) 
_a_v2i32(cmpgt, _e_vv, __VA_ARGS__) 120 | 121 | /* test: take mask and test if all zero */ 122 | #define _test_v2i32(x, y) _mm_test_all_zeros((x).v1, (y).v1) 123 | 124 | /* insert and extract */ 125 | #define _ins_v2i32(a, val, imm) { \ 126 | (a).v1 = _i_v2i32((a).v1, (val), (imm)); \ 127 | } 128 | #define _ext_v2i32(a, imm) ( \ 129 | (int32_t)_i_v2i32(extract)((a).v1, (imm)) \ 130 | ) 131 | 132 | /* shift */ 133 | #define _sal_v2i32(a, imm) ( \ 134 | (v2i32_t) {_i_v2i32(slli)((a).v1, (imm))} \ 135 | ) 136 | #define _sar_v2i32(a, imm) ( \ 137 | (v2i32_t) {_i_v2i32(srai)((a).v1, (imm))} \ 138 | ) 139 | #define _shl_v2i32(a, imm) ( \ 140 | (v2i32_t) {_i_v2i32(slli)((a).v1, (imm))} \ 141 | ) 142 | #define _shr_v2i32(a, imm) ( \ 143 | (v2i32_t) {_i_v2i32(srli)((a).v1, (imm))} \ 144 | ) 145 | 146 | /* mask */ 147 | #define _mask_v2i32(a) ( \ 148 | (uint32_t) (0xff & _mm_movemask_epi8((a).v1)) \ 149 | ) 150 | #define V2I32_MASK_00 ( 0x00 ) 151 | #define V2I32_MASK_01 ( 0x0f ) 152 | #define V2I32_MASK_10 ( 0xf0 ) 153 | #define V2I32_MASK_11 ( 0xff ) 154 | 155 | /* convert */ 156 | typedef uint64_t v2i8_t; 157 | #define _load_v2i8(p) ({ \ 158 | uint8_t const *_p = (uint8_t const *)(p); \ 159 | *((uint16_t const *)(_p)); \ 160 | }) 161 | #define _store_v2i8(p, v) { \ 162 | uint8_t *_p = (uint8_t *)(p); \ 163 | *((uint16_t *)_p) = (v); \ 164 | } 165 | #define _cvt_v2i8_v2i32(a) ( \ 166 | (v2i32_t) { \ 167 | _mm_cvtepi8_epi32(_mm_cvtsi64_si128(a)) \ 168 | } \ 169 | ) 170 | #define _cvt_v2i32_v2i8(a) ( \ 171 | (uint16_t)_mm_cvtsi128_si64(_mm_shuffle_epi8((a).v1, _mm_cvtsi64_si128(0x0400))) \ 172 | ) 173 | #define _cvt_u64_v2i32(a) ( \ 174 | (v2i32_t){ _mm_cvtsi64_si128(a) } \ 175 | ) 176 | #define _cvt_v2i32_u64(a) ( \ 177 | (uint64_t)_mm_cvtsi128_si64((a).v1) \ 178 | ) 179 | #define _cvt_v2i64_v2i32(a) ( \ 180 | (v2i32_t) { \ 181 | _mm_shuffle_epi32((a).v1, 0xd8) \ 182 | } \ 183 | ) 184 | #define _cvth_v2i64_v2i32(a) ( \ 185 | (v2i32_t) { \ 186 | _mm_shuffle_epi32((a).v1, 0x8d) \ 187 | } \ 188 | ) 189 | 190 | /* transpose */ 191 | #define _lo_v2i32(a, b) ( \ 192 | (v2i32_t) { \ 193 | _mm_unpacklo_epi32((a).v1, (b).v1) \ 194 | } \ 195 | ) 196 | #define _hi_v2i32(a, b) ( \ 197 | (v2i32_t) { \ 198 | _mm_shuffle_epi32(_mm_unpacklo_epi32((a).v1, (b).v1), 0x0e) \ 199 | } \ 200 | ) 201 | 202 | /* debug print */ 203 | // #ifdef _LOG_H_INCLUDED 204 | #define _print_v2i32(a) { \ 205 | debug("(v2i32_t) %s(%d, %d)", #a, _ext_v2i32(a, 1), _ext_v2i32(a, 0)); \ 206 | } 207 | // #else 208 | // #define _print_v2i32(x) ; 209 | // #endif 210 | 211 | #endif /* _V2I32_H_INCLUDED */ 212 | /** 213 | * end of v2i32.h 214 | */ 215 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/v16i8.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v16i8.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V16I8_H_INCLUDED 8 | #define _V16I8_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v16i8_s { 15 | __m128i v1; 16 | } v16i8_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v16i8_1(u) 20 | #define _e_x_v16i8_2(u) 21 | 22 | /* expanders (without immediate) */ 23 | #define _e_v_v16i8_1(a) (a).v1 24 | #define _e_v_v16i8_2(a) (a).v1 25 | #define _e_vv_v16i8_1(a, b) (a).v1, (b).v1 26 | #define _e_vv_v16i8_2(a, b) (a).v1, (b).v1 27 | #define _e_vvv_v16i8_1(a, b, c) (a).v1, (b).v1, 
(c).v1 28 | #define _e_vvv_v16i8_2(a, b, c) (a).v1, (b).v1, (c).v1 29 | 30 | /* expanders with immediate */ 31 | #define _e_i_v16i8_1(imm) (imm) 32 | #define _e_i_v16i8_2(imm) (imm) 33 | #define _e_vi_v16i8_1(a, imm) (a).v1, (imm) 34 | #define _e_vi_v16i8_2(a, imm) (a).v1, (imm) 35 | #define _e_vvi_v16i8_1(a, b, imm) (a).v1, (b).v1, (imm) 36 | #define _e_vvi_v16i8_2(a, b, imm) (a).v1, (b).v1, (imm) 37 | 38 | /* address calculation macros */ 39 | #define _addr_v16i8_1(imm) ( (__m128i *)(imm) ) 40 | #define _addr_v16i8_2(imm) ( (__m128i *)(imm) ) 41 | #define _pv_v16i8(ptr) ( _addr_v16i8_1(ptr) ) 42 | /* expanders with pointers */ 43 | #define _e_p_v16i8_1(ptr) _addr_v16i8_1(ptr) 44 | #define _e_p_v16i8_2(ptr) _addr_v16i8_2(ptr) 45 | #define _e_pv_v16i8_1(ptr, a) _addr_v16i8_1(ptr), (a).v1 46 | #define _e_pv_v16i8_2(ptr, a) _addr_v16i8_2(ptr), (a).v1 47 | 48 | /* expand intrinsic name */ 49 | #define _i_v16i8(intrin) _mm_##intrin##_epi8 50 | #define _i_v16u8(intrin) _mm_##intrin##_epu8 51 | #define _i_v16i8x(intrin) _mm_##intrin##_si128 52 | 53 | /* apply */ 54 | #define _a_v16i8(intrin, expander, ...) ( \ 55 | (v16i8_t) { \ 56 | _i_v16i8(intrin)(expander##_v16i8_1(__VA_ARGS__)) \ 57 | } \ 58 | ) 59 | #define _a_v16u8(intrin, expander, ...) ( \ 60 | (v16i8_t) { \ 61 | _i_v16u8(intrin)(expander##_v16i8_1(__VA_ARGS__)) \ 62 | } \ 63 | ) 64 | #define _a_v16i8x(intrin, expander, ...) ( \ 65 | (v16i8_t) { \ 66 | _i_v16i8x(intrin)(expander##_v16i8_1(__VA_ARGS__)) \ 67 | } \ 68 | ) 69 | #define _a_v16i8xv(intrin, expander, ...) { \ 70 | _i_v16i8x(intrin)(expander##_v16i8_1(__VA_ARGS__)); \ 71 | } 72 | 73 | /* load and store */ 74 | #define _load_v16i8(...) _a_v16i8x(load, _e_p, __VA_ARGS__) 75 | #define _loadu_v16i8(...) _a_v16i8x(loadu, _e_p, __VA_ARGS__) 76 | #define _store_v16i8(...) _a_v16i8xv(store, _e_pv, __VA_ARGS__) 77 | #define _storeu_v16i8(...) _a_v16i8xv(storeu, _e_pv, __VA_ARGS__) 78 | 79 | /* broadcast */ 80 | #define _set_v16i8(...) _a_v16i8(set1, _e_i, __VA_ARGS__) 81 | #define _zero_v16i8() _a_v16i8x(setzero, _e_x, _unused) 82 | #define _seta_v16i8(...) ( \ 83 | (v16i8_t) { \ 84 | _mm_set_epi8(__VA_ARGS__) \ 85 | } \ 86 | ) 87 | 88 | /* swap (reverse) */ 89 | #define _swap_idx_v16i8() ( \ 90 | _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) \ 91 | ) 92 | #define _swap_v16i8(a) ( \ 93 | (v16i8_t) { \ 94 | _mm_shuffle_epi8((a).v1, _swap_idx_v16i8()) \ 95 | } \ 96 | ) 97 | #define _swapn_idx_v16i8() ( \ 98 | _mm_set_epi8(-16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1) \ 99 | ) 100 | #define _swapn_v16i8(a, l) ( \ 101 | (v16i8_t) { \ 102 | _mm_shuffle_epi8((a).v1, _mm_add_epi8(_swapn_idx_v16i8(), _mm_set1_epi8(l))) \ 103 | } \ 104 | ) 105 | 106 | /* logics */ 107 | #define _not_v16i8(...) _a_v16i8x(not, _e_v, __VA_ARGS__) 108 | #define _and_v16i8(...) _a_v16i8x(and, _e_vv, __VA_ARGS__) 109 | #define _or_v16i8(...) _a_v16i8x(or, _e_vv, __VA_ARGS__) 110 | #define _xor_v16i8(...) _a_v16i8x(xor, _e_vv, __VA_ARGS__) 111 | #define _andn_v16i8(...) _a_v16i8x(andnot, _e_vv, __VA_ARGS__) 112 | 113 | /* arithmetics */ 114 | #define _add_v16i8(...) _a_v16i8(add, _e_vv, __VA_ARGS__) 115 | #define _sub_v16i8(...) _a_v16i8(sub, _e_vv, __VA_ARGS__) 116 | #define _adds_v16i8(...) _a_v16i8(adds, _e_vv, __VA_ARGS__) 117 | #define _subs_v16i8(...) _a_v16i8(subs, _e_vv, __VA_ARGS__) 118 | #define _addus_v16i8(...) _a_v16u8(adds, _e_vv, __VA_ARGS__) 119 | #define _subus_v16i8(...) _a_v16u8(subs, _e_vv, __VA_ARGS__) 120 | #define _max_v16i8(...) 
_a_v16i8(max, _e_vv, __VA_ARGS__) 121 | #define _min_v16i8(...) _a_v16i8(min, _e_vv, __VA_ARGS__) 122 | 123 | /* shuffle */ 124 | #define _shuf_v16i8(...) _a_v16i8(shuffle, _e_vv, __VA_ARGS__) 125 | 126 | /* blend */ 127 | #define _sel_v16i8(...) _a_v16i8(blendv, _e_vvv, __VA_ARGS__) 128 | 129 | /* compare */ 130 | #define _eq_v16i8(...) _a_v16i8(cmpeq, _e_vv, __VA_ARGS__) 131 | #define _gt_v16i8(...) _a_v16i8(cmpgt, _e_vv, __VA_ARGS__) 132 | 133 | /* insert and extract */ 134 | #define _ins_v16i8(a, val, imm) { \ 135 | (a).v1 = _i_v16i8(insert)((a).v1, (val), (imm)); \ 136 | } 137 | #define _ext_v16i8(a, imm) ( \ 138 | (int8_t)_i_v16i8(extract)((a).v1, (imm)) \ 139 | ) 140 | 141 | /* byte shift */ 142 | #define _bsl_v16i8(a, imm) ( \ 143 | (v16i8_t) { \ 144 | _i_v16i8x(slli)((a).v1, (imm)) \ 145 | } \ 146 | ) 147 | #define _bsr_v16i8(a, imm) ( \ 148 | (v16i8_t) { \ 149 | _i_v16i8x(srli)((a).v1, (imm)) \ 150 | } \ 151 | ) 152 | 153 | /* double shift (palignr) */ 154 | #define _bsld_v16i8(a, b, imm) ( \ 155 | (v16i8_t) { \ 156 | _mm_alignr_epi8((a).v1, (b).v1, sizeof(__m128i) - (imm)) \ 157 | } \ 158 | ) 159 | #define _bsrd_v16i8(a, b, imm) ( \ 160 | (v16i8_t) { \ 161 | _mm_alignr_epi8((a).v1, (b).v1, (imm)) \ 162 | } \ 163 | ) 164 | 165 | /* bit shift */ 166 | #define _shl_v16i8(a, imm) ( \ 167 | (v16i8_t) { \ 168 | _mm_slli_epi32((a).v1, (imm)) \ 169 | } \ 170 | ) 171 | #define _shr_v16i8(a, imm) ( \ 172 | (v16i8_t) { \ 173 | _mm_srli_epi32((a).v1, (imm)) \ 174 | } \ 175 | ) 176 | #define _sal_v16i8(a, imm) ( \ 177 | (v16i8_t) { \ 178 | _mm_slai_epi32((a).v1, (imm)) \ 179 | } \ 180 | ) 181 | #define _sar_v16i8(a, imm) ( \ 182 | (v16i8_t) { \ 183 | _mm_srai_epi32((a).v1, (imm)) \ 184 | } \ 185 | ) 186 | 187 | /* mask */ 188 | #define _mask_v16i8(a) ( \ 189 | (v16_mask_t) { \ 190 | .m1 = _i_v16i8(movemask)((a).v1) \ 191 | } \ 192 | ) 193 | 194 | /* horizontal max */ 195 | #define _hmax_v16i8(a) ({ \ 196 | __m128i _vmax = _mm_max_epi8((a).v1, \ 197 | _mm_srli_si128((a).v1, 8)); \ 198 | _vmax = _mm_max_epi8(_vmax, \ 199 | _mm_srli_si128(_vmax, 4)); \ 200 | _vmax = _mm_max_epi8(_vmax, \ 201 | _mm_srli_si128(_vmax, 2)); \ 202 | _vmax = _mm_max_epi8(_vmax, \ 203 | _mm_srli_si128(_vmax, 1)); \ 204 | (int8_t)_mm_extract_epi8(_vmax, 0); \ 205 | }) 206 | 207 | /* convert */ 208 | #define _cvt_v16i16_v16i8(a) ( \ 209 | (v16i8_t) { \ 210 | _mm_packs_epi16((a).v1, (a).v2) \ 211 | } \ 212 | ) 213 | 214 | /* debug print */ 215 | // #ifdef _LOG_H_INCLUDED 216 | #define _print_v16i8(a) { \ 217 | debug("(v16i8_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 218 | #a, \ 219 | _ext_v16i8(a, 15), \ 220 | _ext_v16i8(a, 14), \ 221 | _ext_v16i8(a, 13), \ 222 | _ext_v16i8(a, 12), \ 223 | _ext_v16i8(a, 11), \ 224 | _ext_v16i8(a, 10), \ 225 | _ext_v16i8(a, 9), \ 226 | _ext_v16i8(a, 8), \ 227 | _ext_v16i8(a, 7), \ 228 | _ext_v16i8(a, 6), \ 229 | _ext_v16i8(a, 5), \ 230 | _ext_v16i8(a, 4), \ 231 | _ext_v16i8(a, 3), \ 232 | _ext_v16i8(a, 2), \ 233 | _ext_v16i8(a, 1), \ 234 | _ext_v16i8(a, 0)); \ 235 | } 236 | // #else 237 | // #define _print_v16i8(x) ; 238 | // #endif 239 | 240 | #endif /* _V16I8_H_INCLUDED */ 241 | /** 242 | * end of v16i8.h 243 | */ 244 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/v16i8.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v16i8.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V16I8_H_INCLUDED 8 | #define 
_V16I8_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v16i8_s { 15 | __m128i v1; 16 | } v16i8_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v16i8_1(u) 20 | #define _e_x_v16i8_2(u) 21 | 22 | /* expanders (without immediate) */ 23 | #define _e_v_v16i8_1(a) (a).v1 24 | #define _e_v_v16i8_2(a) (a).v1 25 | #define _e_vv_v16i8_1(a, b) (a).v1, (b).v1 26 | #define _e_vv_v16i8_2(a, b) (a).v1, (b).v1 27 | #define _e_vvv_v16i8_1(a, b, c) (a).v1, (b).v1, (c).v1 28 | #define _e_vvv_v16i8_2(a, b, c) (a).v1, (b).v1, (c).v1 29 | 30 | /* expanders with immediate */ 31 | #define _e_i_v16i8_1(imm) (imm) 32 | #define _e_i_v16i8_2(imm) (imm) 33 | #define _e_vi_v16i8_1(a, imm) (a).v1, (imm) 34 | #define _e_vi_v16i8_2(a, imm) (a).v1, (imm) 35 | #define _e_vvi_v16i8_1(a, b, imm) (a).v1, (b).v1, (imm) 36 | #define _e_vvi_v16i8_2(a, b, imm) (a).v1, (b).v1, (imm) 37 | 38 | /* address calculation macros */ 39 | #define _addr_v16i8_1(imm) ( (__m128i *)(imm) ) 40 | #define _addr_v16i8_2(imm) ( (__m128i *)(imm) ) 41 | #define _pv_v16i8(ptr) ( _addr_v16i8_1(ptr) ) 42 | /* expanders with pointers */ 43 | #define _e_p_v16i8_1(ptr) _addr_v16i8_1(ptr) 44 | #define _e_p_v16i8_2(ptr) _addr_v16i8_2(ptr) 45 | #define _e_pv_v16i8_1(ptr, a) _addr_v16i8_1(ptr), (a).v1 46 | #define _e_pv_v16i8_2(ptr, a) _addr_v16i8_2(ptr), (a).v1 47 | 48 | /* expand intrinsic name */ 49 | #define _i_v16i8(intrin) _mm_##intrin##_epi8 50 | #define _i_v16u8(intrin) _mm_##intrin##_epu8 51 | #define _i_v16i8x(intrin) _mm_##intrin##_si128 52 | 53 | /* apply */ 54 | #define _a_v16i8(intrin, expander, ...) ( \ 55 | (v16i8_t) { \ 56 | _i_v16i8(intrin)(expander##_v16i8_1(__VA_ARGS__)) \ 57 | } \ 58 | ) 59 | #define _a_v16u8(intrin, expander, ...) ( \ 60 | (v16i8_t) { \ 61 | _i_v16u8(intrin)(expander##_v16i8_1(__VA_ARGS__)) \ 62 | } \ 63 | ) 64 | #define _a_v16i8x(intrin, expander, ...) ( \ 65 | (v16i8_t) { \ 66 | _i_v16i8x(intrin)(expander##_v16i8_1(__VA_ARGS__)) \ 67 | } \ 68 | ) 69 | #define _a_v16i8xv(intrin, expander, ...) { \ 70 | _i_v16i8x(intrin)(expander##_v16i8_1(__VA_ARGS__)); \ 71 | } 72 | 73 | /* load and store */ 74 | #define _load_v16i8(...) _a_v16i8x(load, _e_p, __VA_ARGS__) 75 | #define _loadu_v16i8(...) _a_v16i8x(loadu, _e_p, __VA_ARGS__) 76 | #define _store_v16i8(...) _a_v16i8xv(store, _e_pv, __VA_ARGS__) 77 | #define _storeu_v16i8(...) _a_v16i8xv(storeu, _e_pv, __VA_ARGS__) 78 | 79 | /* broadcast */ 80 | #define _set_v16i8(...) _a_v16i8(set1, _e_i, __VA_ARGS__) 81 | #define _zero_v16i8() _a_v16i8x(setzero, _e_x, _unused) 82 | #define _seta_v16i8(...) ( \ 83 | (v16i8_t) { \ 84 | _mm_set_epi8(__VA_ARGS__) \ 85 | } \ 86 | ) 87 | 88 | /* swap (reverse) */ 89 | #define _swap_idx_v16i8() ( \ 90 | _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) \ 91 | ) 92 | #define _swap_v16i8(a) ( \ 93 | (v16i8_t) { \ 94 | _mm_shuffle_epi8((a).v1, _swap_idx_v16i8()) \ 95 | } \ 96 | ) 97 | #define _swapn_idx_v16i8() ( \ 98 | _mm_set_epi8(-16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1) \ 99 | ) 100 | #define _swapn_v16i8(a, l) ( \ 101 | (v16i8_t) { \ 102 | _mm_shuffle_epi8((a).v1, _mm_add_epi8(_swapn_idx_v16i8(), _mm_set1_epi8(l))) \ 103 | } \ 104 | ) 105 | 106 | /* logics */ 107 | #define _not_v16i8(...) _a_v16i8x(not, _e_v, __VA_ARGS__) 108 | #define _and_v16i8(...) _a_v16i8x(and, _e_vv, __VA_ARGS__) 109 | #define _or_v16i8(...) _a_v16i8x(or, _e_vv, __VA_ARGS__) 110 | #define _xor_v16i8(...) 
_a_v16i8x(xor, _e_vv, __VA_ARGS__) 111 | #define _andn_v16i8(...) _a_v16i8x(andnot, _e_vv, __VA_ARGS__) 112 | 113 | /* arithmetics */ 114 | #define _add_v16i8(...) _a_v16i8(add, _e_vv, __VA_ARGS__) 115 | #define _sub_v16i8(...) _a_v16i8(sub, _e_vv, __VA_ARGS__) 116 | #define _adds_v16i8(...) _a_v16i8(adds, _e_vv, __VA_ARGS__) 117 | #define _subs_v16i8(...) _a_v16i8(subs, _e_vv, __VA_ARGS__) 118 | #define _addus_v16i8(...) _a_v16i8(adds, _e_vv, __VA_ARGS__) 119 | #define _subus_v16i8(...) _a_v16i8(subs, _e_vv, __VA_ARGS__) 120 | #define _max_v16i8(...) _a_v16i8(max, _e_vv, __VA_ARGS__) 121 | #define _min_v16i8(...) _a_v16i8(min, _e_vv, __VA_ARGS__) 122 | 123 | /* shuffle */ 124 | #define _shuf_v16i8(...) _a_v16i8(shuffle, _e_vv, __VA_ARGS__) 125 | 126 | /* blend */ 127 | #define _sel_v16i8(...) _a_v16i8(blendv, _e_vvv, __VA_ARGS__) 128 | 129 | /* compare */ 130 | #define _eq_v16i8(...) _a_v16i8(cmpeq, _e_vv, __VA_ARGS__) 131 | #define _gt_v16i8(...) _a_v16i8(cmpgt, _e_vv, __VA_ARGS__) 132 | 133 | /* insert and extract */ 134 | #define _ins_v16i8(a, val, imm) { \ 135 | (a).v1 = _i_v16i8(insert)((a).v1, (val), (imm)); \ 136 | } 137 | #define _ext_v16i8(a, imm) ( \ 138 | (int8_t)_i_v16i8(extract)((a).v1, (imm)) \ 139 | ) 140 | 141 | /* byte shift */ 142 | #define _bsl_v16i8(a, imm) ( \ 143 | (v16i8_t) { \ 144 | _i_v16i8x(slli)((a).v1, (imm)) \ 145 | } \ 146 | ) 147 | #define _bsr_v16i8(a, imm) ( \ 148 | (v16i8_t) { \ 149 | _i_v16i8x(srli)((a).v1, (imm)) \ 150 | } \ 151 | ) 152 | 153 | /* double shift (palignr) */ 154 | #define _bsld_v16i8(a, b, imm) ( \ 155 | (v16i8_t) { \ 156 | _mm_alignr_epi8((a).v1, (b).v1, sizeof(__m128i) - (imm)) \ 157 | } \ 158 | ) 159 | #define _bsrd_v16i8(a, b, imm) ( \ 160 | (v16i8_t) { \ 161 | _mm_alignr_epi8((a).v1, (b).v1, (imm)) \ 162 | } \ 163 | ) 164 | 165 | /* bit shift */ 166 | #define _shl_v16i8(a, imm) ( \ 167 | (v16i8_t) { \ 168 | _mm_slli_epi32((a).v1, (imm)) \ 169 | } \ 170 | ) 171 | #define _shr_v16i8(a, imm) ( \ 172 | (v16i8_t) { \ 173 | _mm_srli_epi32((a).v1, (imm)) \ 174 | } \ 175 | ) 176 | #define _sal_v16i8(a, imm) ( \ 177 | (v16i8_t) { \ 178 | _mm_slai_epi32((a).v1, (imm)) \ 179 | } \ 180 | ) 181 | #define _sar_v16i8(a, imm) ( \ 182 | (v16i8_t) { \ 183 | _mm_srai_epi32((a).v1, (imm)) \ 184 | } \ 185 | ) 186 | 187 | /* mask */ 188 | #define _mask_v16i8(a) ( \ 189 | (v16_mask_t) { \ 190 | .m1 = _i_v16i8(movemask)((a).v1) \ 191 | } \ 192 | ) 193 | 194 | /* horizontal max */ 195 | #define _hmax_v16i8(a) ({ \ 196 | __m128i _vmax = _mm_max_epi8((a).v1, \ 197 | _mm_srli_si128((a).v1, 8)); \ 198 | _vmax = _mm_max_epi8(_vmax, \ 199 | _mm_srli_si128(_vmax, 4)); \ 200 | _vmax = _mm_max_epi8(_vmax, \ 201 | _mm_srli_si128(_vmax, 2)); \ 202 | _vmax = _mm_max_epi8(_vmax, \ 203 | _mm_srli_si128(_vmax, 1)); \ 204 | (int8_t)_mm_extract_epi8(_vmax, 0); \ 205 | }) 206 | 207 | /* convert */ 208 | #define _cvt_v16i16_v16i8(a) ( \ 209 | (v16i8_t) { \ 210 | _mm_packs_epi16(_mm256_castsi256_si128((a).v1), _mm256_extracti128_si256((a).v1, 1)) \ 211 | } \ 212 | ) 213 | 214 | /* debug print */ 215 | // #ifdef _LOG_H_INCLUDED 216 | #define _print_v16i8(a) { \ 217 | debug("(v16i8_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 218 | #a, \ 219 | _ext_v16i8(a, 15), \ 220 | _ext_v16i8(a, 14), \ 221 | _ext_v16i8(a, 13), \ 222 | _ext_v16i8(a, 12), \ 223 | _ext_v16i8(a, 11), \ 224 | _ext_v16i8(a, 10), \ 225 | _ext_v16i8(a, 9), \ 226 | _ext_v16i8(a, 8), \ 227 | _ext_v16i8(a, 7), \ 228 | _ext_v16i8(a, 6), \ 229 | _ext_v16i8(a, 5), \ 230 | _ext_v16i8(a, 4), \ 
231 | _ext_v16i8(a, 3), \ 232 | _ext_v16i8(a, 2), \ 233 | _ext_v16i8(a, 1), \ 234 | _ext_v16i8(a, 0)); \ 235 | } 236 | // #else 237 | // #define _print_v16i8(x) ; 238 | // #endif 239 | 240 | #endif /* _V16I8_H_INCLUDED */ 241 | /** 242 | * end of v16i8.h 243 | */ 244 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/v32i8.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v32i8.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V32I8_H_INCLUDED 8 | #define _V32I8_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v32i8_s { 15 | __m256i v1; 16 | } v32i8_t; 17 | 18 | /* expanders (without argument) */ 19 | #define _e_x_v32i8_1(u) 20 | 21 | /* expanders (without immediate) */ 22 | #define _e_v_v32i8_1(a) (a).v1 23 | #define _e_vv_v32i8_1(a, b) (a).v1, (b).v1 24 | #define _e_vvv_v32i8_1(a, b, c) (a).v1, (b).v1, (c).v1 25 | 26 | /* expanders with immediate */ 27 | #define _e_i_v32i8_1(imm) (imm) 28 | #define _e_vi_v32i8_1(a, imm) (a).v1, (imm) 29 | #define _e_vvi_v32i8_1(a, b, imm) (a).v1, (b).v1, (imm) 30 | 31 | /* address calculation macros */ 32 | #define _addr_v32i8_1(imm) ( (__m256i *)(imm) ) 33 | #define _pv_v32i8(ptr) ( _addr_v32i8_1(ptr) ) 34 | /* expanders with pointers */ 35 | #define _e_p_v32i8_1(ptr) _addr_v32i8_1(ptr) 36 | #define _e_pv_v32i8_1(ptr, a) _addr_v32i8_1(ptr), (a).v1 37 | 38 | /* expand intrinsic name */ 39 | #define _i_v32i8(intrin) _mm256_##intrin##_epi8 40 | #define _i_v32u8(intrin) _mm256_##intrin##_epu8 41 | #define _i_v32i8x(intrin) _mm256_##intrin##_si256 42 | 43 | /* apply */ 44 | #define _a_v32i8(intrin, expander, ...) ( \ 45 | (v32i8_t) { \ 46 | _i_v32i8(intrin)(expander##_v32i8_1(__VA_ARGS__)) \ 47 | } \ 48 | ) 49 | #define _a_v32u8(intrin, expander, ...) ( \ 50 | (v32i8_t) { \ 51 | _i_v32u8(intrin)(expander##_v32i8_1(__VA_ARGS__)) \ 52 | } \ 53 | ) 54 | #define _a_v32i8x(intrin, expander, ...) ( \ 55 | (v32i8_t) { \ 56 | _i_v32i8x(intrin)(expander##_v32i8_1(__VA_ARGS__)) \ 57 | } \ 58 | ) 59 | #define _a_v32i8xv(intrin, expander, ...) { \ 60 | _i_v32i8x(intrin)(expander##_v32i8_1(__VA_ARGS__)); \ 61 | } 62 | 63 | /* load and store */ 64 | #define _load_v32i8(...) _a_v32i8x(load, _e_p, __VA_ARGS__) 65 | #define _loadu_v32i8(...) _a_v32i8x(loadu, _e_p, __VA_ARGS__) 66 | #define _store_v32i8(...) _a_v32i8xv(store, _e_pv, __VA_ARGS__) 67 | #define _storeu_v32i8(...) _a_v32i8xv(storeu, _e_pv, __VA_ARGS__) 68 | 69 | /* broadcast */ 70 | #define _set_v32i8(...) _a_v32i8(set1, _e_i, __VA_ARGS__) 71 | #define _zero_v32i8() _a_v32i8x(setzero, _e_x, _unused) 72 | 73 | /* swap (reverse) */ 74 | #define _swap_idx_v32i8() ( \ 75 | _mm256_broadcastsi128_si256(_mm_set_epi8( \ 76 | 0, 1, 2, 3, 4, 5, 6, 7, \ 77 | 8, 9, 10, 11, 12, 13, 14, 15)) \ 78 | ) 79 | #define _swap_v32i8(a) ( \ 80 | (v32i8_t) { \ 81 | _mm256_permute2x128_si256( \ 82 | _mm256_shuffle_epi8((a).v1, _swap_idx_v32i8()), \ 83 | _mm256_shuffle_epi8((a).v1, _swap_idx_v32i8()), \ 84 | 0x01) \ 85 | } \ 86 | ) 87 | 88 | /* logics */ 89 | #define _not_v32i8(...) _a_v32i8x(not, _e_v, __VA_ARGS__) 90 | #define _and_v32i8(...) _a_v32i8x(and, _e_vv, __VA_ARGS__) 91 | #define _or_v32i8(...) _a_v32i8x(or, _e_vv, __VA_ARGS__) 92 | #define _xor_v32i8(...) _a_v32i8x(xor, _e_vv, __VA_ARGS__) 93 | #define _andn_v32i8(...) 
_a_v32i8x(andnot, _e_vv, __VA_ARGS__) 94 | 95 | /* arithmetics */ 96 | #define _add_v32i8(...) _a_v32i8(add, _e_vv, __VA_ARGS__) 97 | #define _sub_v32i8(...) _a_v32i8(sub, _e_vv, __VA_ARGS__) 98 | #define _adds_v32i8(...) _a_v32i8(adds, _e_vv, __VA_ARGS__) 99 | #define _subs_v32i8(...) _a_v32i8(subs, _e_vv, __VA_ARGS__) 100 | #define _addus_v32i8(...) _a_v32u8(adds, _e_vv, __VA_ARGS__) 101 | #define _subus_v32i8(...) _a_v32u8(subs, _e_vv, __VA_ARGS__) 102 | #define _max_v32i8(...) _a_v32i8(max, _e_vv, __VA_ARGS__) 103 | #define _min_v32i8(...) _a_v32i8(min, _e_vv, __VA_ARGS__) 104 | 105 | /* shuffle */ 106 | #define _shuf_v32i8(...) _a_v32i8(shuffle, _e_vv, __VA_ARGS__) 107 | 108 | /* blend */ 109 | #define _sel_v32i8(...) _a_v32i8(blendv, _e_vvv, __VA_ARGS__) 110 | 111 | /* compare */ 112 | #define _eq_v32i8(...) _a_v32i8(cmpeq, _e_vv, __VA_ARGS__) 113 | #define _gt_v32i8(...) _a_v32i8(cmpgt, _e_vv, __VA_ARGS__) 114 | 115 | /* insert and extract */ 116 | #define _ins_v32i8(a, val, imm) { \ 117 | (a).v1 = _i_v32i8(insert)((a).v1, (val), (imm)); \ 118 | } 119 | #define _ext_v32i8(a, imm) ( \ 120 | (int8_t)_i_v32i8(extract)((a).v1, (imm)) \ 121 | ) 122 | 123 | /* byte shift */ 124 | #define _bsl_v32i8(a, imm) ( \ 125 | (v32i8_t) { _mm256_alignr_epi8( \ 126 | (a).v1, \ 127 | _mm256_permute2x128_si256((a).v1, (a).v1, 0x08), \ 128 | 15 \ 129 | )} \ 130 | ) 131 | #define _bsr_v32i8(a, imm) ( \ 132 | (v32i8_t) { _mm256_alignr_epi8( \ 133 | _mm256_castsi128_si256( \ 134 | _mm256_extracti128_si256((a).v1, 1)), \ 135 | (a).v1, \ 136 | 1 \ 137 | )} \ 138 | ) 139 | 140 | /* double shift (palignr) */ 141 | #define _bsld_v32i8(a, b, imm) ( \ 142 | (v32i8_t) { _mm256_alignr_epi8( \ 143 | (a).v1, \ 144 | _mm256_permute2x128_si256((a).v1, (b).v1, 0x03), \ 145 | sizeof(__m128i) - (imm) \ 146 | )} \ 147 | ) 148 | #define _bsrd_v32i8(a, b, imm) ( \ 149 | (v32i8_t) { _mm256_alignr_epi8( \ 150 | _mm256_permute2x128_si256((a).v1, (b).v1, 0x03), \ 151 | (b).v1, \ 152 | (imm) \ 153 | )} \ 154 | ) 155 | 156 | /* bit shift */ 157 | #define _shl_v32i8(a, imm) ( \ 158 | (v32i8_t) { \ 159 | _mm256_slli_epi32((a).v1, (imm)) \ 160 | } \ 161 | ) 162 | #define _shr_v32i8(a, imm) ( \ 163 | (v32i8_t) { \ 164 | _mm256_srli_epi32((a).v1, (imm)) \ 165 | } \ 166 | ) 167 | #define _sal_v32i8(a, imm) ( \ 168 | (v32i8_t) { \ 169 | _mm256_slai_epi32((a).v1, (imm)) \ 170 | } \ 171 | ) 172 | #define _sar_v32i8(a, imm) ( \ 173 | (v32i8_t) { \ 174 | _mm256_srai_epi32((a).v1, (imm)) \ 175 | } \ 176 | ) 177 | 178 | /* mask */ 179 | #define _mask_v32i8(a) ( \ 180 | (v32_mask_t) { \ 181 | .m1 = _i_v32i8(movemask)((a).v1) \ 182 | } \ 183 | ) 184 | 185 | /* horizontal max (reduction max) */ 186 | #define _hmax_v32i8(a) ({ \ 187 | __m128i _t = _mm_max_epi8( \ 188 | _mm256_castsi256_si128((a).v1), \ 189 | _mm256_extracti128_si256((a).v1, 1) \ 190 | ); \ 191 | _t = _mm_max_epi8(_t, _mm_srli_si128(_t, 8)); \ 192 | _t = _mm_max_epi8(_t, _mm_srli_si128(_t, 4)); \ 193 | _t = _mm_max_epi8(_t, _mm_srli_si128(_t, 2)); \ 194 | _t = _mm_max_epi8(_t, _mm_srli_si128(_t, 1)); \ 195 | (int8_t)_mm_extract_epi8(_t, 0); \ 196 | }) 197 | 198 | /* convert */ 199 | #define _cvt_v32i16_v32i8(a) ( \ 200 | (v32i8_t) { \ 201 | _mm256_permute4x64_epi64(_mm256_packs_epi16((a).v1, (a).v2), 0xd8) \ 202 | } \ 203 | ) 204 | 205 | /* debug print */ 206 | // #ifdef _LOG_H_INCLUDED 207 | #define _print_v32i8(a) { \ 208 | debug("(v32i8_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 209 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, 
%d)", \ 210 | #a, \ 211 | _ext_v32i8(a, 31), \ 212 | _ext_v32i8(a, 30), \ 213 | _ext_v32i8(a, 29), \ 214 | _ext_v32i8(a, 28), \ 215 | _ext_v32i8(a, 27), \ 216 | _ext_v32i8(a, 26), \ 217 | _ext_v32i8(a, 25), \ 218 | _ext_v32i8(a, 24), \ 219 | _ext_v32i8(a, 23), \ 220 | _ext_v32i8(a, 22), \ 221 | _ext_v32i8(a, 21), \ 222 | _ext_v32i8(a, 20), \ 223 | _ext_v32i8(a, 19), \ 224 | _ext_v32i8(a, 18), \ 225 | _ext_v32i8(a, 17), \ 226 | _ext_v32i8(a, 16), \ 227 | _ext_v32i8(a, 15), \ 228 | _ext_v32i8(a, 14), \ 229 | _ext_v32i8(a, 13), \ 230 | _ext_v32i8(a, 12), \ 231 | _ext_v32i8(a, 11), \ 232 | _ext_v32i8(a, 10), \ 233 | _ext_v32i8(a, 9), \ 234 | _ext_v32i8(a, 8), \ 235 | _ext_v32i8(a, 7), \ 236 | _ext_v32i8(a, 6), \ 237 | _ext_v32i8(a, 5), \ 238 | _ext_v32i8(a, 4), \ 239 | _ext_v32i8(a, 3), \ 240 | _ext_v32i8(a, 2), \ 241 | _ext_v32i8(a, 1), \ 242 | _ext_v32i8(a, 0)); \ 243 | } 244 | // #else 245 | // #define _print_v32i8(x) ; 246 | // #endif 247 | 248 | #endif /* _V32I8_H_INCLUDED */ 249 | /** 250 | * end of v32i8.h 251 | */ 252 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/v32i8.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v32i8.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V32I8_H_INCLUDED 8 | #define _V32I8_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 32cell */ 14 | typedef struct v32i8_s { 15 | __m128i v1; 16 | __m128i v2; 17 | } v32i8_t; 18 | 19 | /* expanders (without argument) */ 20 | #define _e_x_v32i8_1(u) 21 | #define _e_x_v32i8_2(u) 22 | 23 | /* expanders (without immediate) */ 24 | #define _e_v_v32i8_1(a) (a).v1 25 | #define _e_v_v32i8_2(a) (a).v2 26 | #define _e_vv_v32i8_1(a, b) (a).v1, (b).v1 27 | #define _e_vv_v32i8_2(a, b) (a).v2, (b).v2 28 | #define _e_vvv_v32i8_1(a, b, c) (a).v1, (b).v1, (c).v1 29 | #define _e_vvv_v32i8_2(a, b, c) (a).v2, (b).v2, (c).v2 30 | 31 | /* expanders with immediate */ 32 | #define _e_i_v32i8_1(imm) (imm) 33 | #define _e_i_v32i8_2(imm) (imm) 34 | #define _e_vi_v32i8_1(a, imm) (a).v1, (imm) 35 | #define _e_vi_v32i8_2(a, imm) (a).v2, (imm) 36 | #define _e_vvi_v32i8_1(a, b, imm) (a).v1, (b).v1, (imm) 37 | #define _e_vvi_v32i8_2(a, b, imm) (a).v2, (b).v2, (imm) 38 | 39 | /* address calculation macros */ 40 | #define _addr_v32i8_1(imm) ( (__m128i *)(imm) ) 41 | #define _addr_v32i8_2(imm) ( (__m128i *)(imm) + 1 ) 42 | #define _pv_v32i8(ptr) ( _addr_v32i8_1(ptr) ) 43 | /* expanders with pointers */ 44 | #define _e_p_v32i8_1(ptr) _addr_v32i8_1(ptr) 45 | #define _e_p_v32i8_2(ptr) _addr_v32i8_2(ptr) 46 | #define _e_pv_v32i8_1(ptr, a) _addr_v32i8_1(ptr), (a).v1 47 | #define _e_pv_v32i8_2(ptr, a) _addr_v32i8_2(ptr), (a).v2 48 | 49 | /* expand intrinsic name */ 50 | #define _i_v32i8(intrin) _mm_##intrin##_epi8 51 | #define _i_v32u8(intrin) _mm_##intrin##_epu8 52 | #define _i_v32i8x(intrin) _mm_##intrin##_si128 53 | 54 | /* apply */ 55 | #define _a_v32i8(intrin, expander, ...) ( \ 56 | (v32i8_t) { \ 57 | _i_v32i8(intrin)(expander##_v32i8_1(__VA_ARGS__)), \ 58 | _i_v32i8(intrin)(expander##_v32i8_2(__VA_ARGS__)) \ 59 | } \ 60 | ) 61 | #define _a_v32u8(intrin, expander, ...) ( \ 62 | (v32i8_t) { \ 63 | _i_v32u8(intrin)(expander##_v32i8_1(__VA_ARGS__)), \ 64 | _i_v32u8(intrin)(expander##_v32i8_2(__VA_ARGS__)) \ 65 | } \ 66 | ) 67 | #define _a_v32i8x(intrin, expander, ...) 
( \ 68 | (v32i8_t) { \ 69 | _i_v32i8x(intrin)(expander##_v32i8_1(__VA_ARGS__)), \ 70 | _i_v32i8x(intrin)(expander##_v32i8_2(__VA_ARGS__)) \ 71 | } \ 72 | ) 73 | #define _a_v32i8xv(intrin, expander, ...) { \ 74 | _i_v32i8x(intrin)(expander##_v32i8_1(__VA_ARGS__)); \ 75 | _i_v32i8x(intrin)(expander##_v32i8_2(__VA_ARGS__)); \ 76 | } 77 | 78 | /* load and store */ 79 | #define _load_v32i8(...) _a_v32i8x(load, _e_p, __VA_ARGS__) 80 | #define _loadu_v32i8(...) _a_v32i8x(loadu, _e_p, __VA_ARGS__) 81 | #define _store_v32i8(...) _a_v32i8xv(store, _e_pv, __VA_ARGS__) 82 | #define _storeu_v32i8(...) _a_v32i8xv(storeu, _e_pv, __VA_ARGS__) 83 | 84 | /* broadcast */ 85 | #define _set_v32i8(...) _a_v32i8(set1, _e_i, __VA_ARGS__) 86 | #define _zero_v32i8() _a_v32i8x(setzero, _e_x, _unused) 87 | 88 | /* swap (reverse) */ 89 | #define _swap_idx_v32i8() ( \ 90 | _mm_set_epi8( \ 91 | 0, 1, 2, 3, 4, 5, 6, 7, \ 92 | 8, 9, 10, 11, 12, 13, 14, 15) \ 93 | ) 94 | #define _swap_v32i8(a) ( \ 95 | (v32i8_t) { \ 96 | _mm_shuffle_epi8((a).v2, _swap_idx_v32i8()), \ 97 | _mm_shuffle_epi8((a).v1, _swap_idx_v32i8()) \ 98 | } \ 99 | ) 100 | 101 | /* logics */ 102 | #define _not_v32i8(...) _a_v32i8x(not, _e_v, __VA_ARGS__) 103 | #define _and_v32i8(...) _a_v32i8x(and, _e_vv, __VA_ARGS__) 104 | #define _or_v32i8(...) _a_v32i8x(or, _e_vv, __VA_ARGS__) 105 | #define _xor_v32i8(...) _a_v32i8x(xor, _e_vv, __VA_ARGS__) 106 | #define _andn_v32i8(...) _a_v32i8x(andnot, _e_vv, __VA_ARGS__) 107 | 108 | /* arithmetics */ 109 | #define _add_v32i8(...) _a_v32i8(add, _e_vv, __VA_ARGS__) 110 | #define _sub_v32i8(...) _a_v32i8(sub, _e_vv, __VA_ARGS__) 111 | #define _adds_v32i8(...) _a_v32i8(adds, _e_vv, __VA_ARGS__) 112 | #define _subs_v32i8(...) _a_v32i8(subs, _e_vv, __VA_ARGS__) 113 | #define _addus_v32i8(...) _a_v32u8(adds, _e_vv, __VA_ARGS__) 114 | #define _subus_v32i8(...) _a_v32u8(subs, _e_vv, __VA_ARGS__) 115 | #define _max_v32i8(...) _a_v32i8(max, _e_vv, __VA_ARGS__) 116 | #define _min_v32i8(...) _a_v32i8(min, _e_vv, __VA_ARGS__) 117 | 118 | /* shuffle */ 119 | #define _shuf_v32i8(...) _a_v32i8(shuffle, _e_vv, __VA_ARGS__) 120 | 121 | /* blend */ 122 | #define _sel_v32i8(...) _a_v32i8(blendv, _e_vvv, __VA_ARGS__) 123 | 124 | /* compare */ 125 | #define _eq_v32i8(...) _a_v32i8(cmpeq, _e_vv, __VA_ARGS__) 126 | #define _gt_v32i8(...) _a_v32i8(cmpgt, _e_vv, __VA_ARGS__) 127 | 128 | /* insert and extract */ 129 | #define _ins_v32i8(a, val, imm) { \ 130 | if((imm) < sizeof(__m128i)) { \ 131 | (a).v1 = _i_v32i8(insert)((a).v1, (val), (imm)); \ 132 | } else { \ 133 | (a).v2 = _i_v32i8(insert)((a).v2, (val), (imm) - sizeof(__m128i)); \ 134 | } \ 135 | } 136 | #define _ext_v32i8(a, imm) ( \ 137 | (int8_t)(((imm) < sizeof(__m128i)) ? 
( \ 138 | _i_v32i8(extract)((a).v1, (imm)) \ 139 | ) : ( \ 140 | _i_v32i8(extract)((a).v2, (imm) - sizeof(__m128i)) \ 141 | )) \ 142 | ) 143 | 144 | /* shift */ 145 | #define _bsl_v32i8(a, imm) ( \ 146 | (v32i8_t) { \ 147 | _i_v32i8x(slli)((a).v1, (imm)), \ 148 | _i_v32i8(alignr)((a).v2, (a).v1, sizeof(__m128i) - (imm)) \ 149 | } \ 150 | ) 151 | #define _bsr_v32i8(a, imm) ( \ 152 | (v32i8_t) { \ 153 | _i_v32i8(alignr)((a).v2, (a).v1, (imm)), \ 154 | _i_v32i8x(srli)((a).v2, (imm)) \ 155 | } \ 156 | ) 157 | 158 | /* double shift (palignr) */ 159 | #define _bsld_v32i8(a, b, imm) ( \ 160 | (v32i8_t) { \ 161 | _i_v32i8(alignr)((a).v1, (b).v2, sizeof(__m128i) - (imm)), \ 162 | _i_v32i8(alignr)((a).v2, (a).v1, sizeof(__m128i) - (imm)) \ 163 | } \ 164 | ) 165 | #define _bsrd_v32i8(a, b, imm) ( \ 166 | (v32i8_t) { \ 167 | _i_v32i8(alignr)((b).v2, (b).v1, (imm)), \ 168 | _i_v32i8(alignr)((a).v1, (b).v2, (imm)) \ 169 | } \ 170 | ) 171 | 172 | /* bit shift */ 173 | #define _shl_v32i8(a, imm) ( \ 174 | (v32i8_t) { \ 175 | _mm_slli_epi32((a).v1, (imm)), \ 176 | _mm_slli_epi32((a).v2, (imm)) \ 177 | } \ 178 | ) 179 | #define _shr_v32i8(a, imm) ( \ 180 | (v32i8_t) { \ 181 | _mm_srli_epi32((a).v1, (imm)), \ 182 | _mm_srli_epi32((a).v2, (imm)) \ 183 | } \ 184 | ) 185 | #define _sal_v32i8(a, imm) ( \ 186 | (v32i8_t) { \ 187 | _mm_slai_epi32((a).v1, (imm)), \ 188 | _mm_slai_epi32((a).v2, (imm)) \ 189 | } \ 190 | ) 191 | #define _sar_v32i8(a, imm) ( \ 192 | (v32i8_t) { \ 193 | _mm_srai_epi32((a).v1, (imm)), \ 194 | _mm_srai_epi32((a).v2, (imm)) \ 195 | } \ 196 | ) 197 | 198 | /* mask */ 199 | #define _mask_v32i8(a) ( \ 200 | (v32_mask_t) { \ 201 | .m1 = _i_v32i8(movemask)((a).v1), \ 202 | .m2 = _i_v32i8(movemask)((a).v2) \ 203 | } \ 204 | ) 205 | 206 | /* convert */ 207 | #define _cvt_v32i16_v32i8(a) ( \ 208 | (v32i8_t) { \ 209 | _mm_packs_epi16((a).v1, (a).v2), \ 210 | _mm_packs_epi16((a).v3, (a).v4) \ 211 | } \ 212 | ) 213 | 214 | /* debug print */ 215 | #ifdef _LOG_H_INCLUDED 216 | #define _print_v32i8(a) { \ 217 | debug("(v32i8_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 218 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 219 | #a, \ 220 | _ext_v32i8(a, 31), \ 221 | _ext_v32i8(a, 30), \ 222 | _ext_v32i8(a, 29), \ 223 | _ext_v32i8(a, 28), \ 224 | _ext_v32i8(a, 27), \ 225 | _ext_v32i8(a, 26), \ 226 | _ext_v32i8(a, 25), \ 227 | _ext_v32i8(a, 24), \ 228 | _ext_v32i8(a, 23), \ 229 | _ext_v32i8(a, 22), \ 230 | _ext_v32i8(a, 21), \ 231 | _ext_v32i8(a, 20), \ 232 | _ext_v32i8(a, 19), \ 233 | _ext_v32i8(a, 18), \ 234 | _ext_v32i8(a, 17), \ 235 | _ext_v32i8(a, 16), \ 236 | _ext_v32i8(a, 15), \ 237 | _ext_v32i8(a, 14), \ 238 | _ext_v32i8(a, 13), \ 239 | _ext_v32i8(a, 12), \ 240 | _ext_v32i8(a, 11), \ 241 | _ext_v32i8(a, 10), \ 242 | _ext_v32i8(a, 9), \ 243 | _ext_v32i8(a, 8), \ 244 | _ext_v32i8(a, 7), \ 245 | _ext_v32i8(a, 6), \ 246 | _ext_v32i8(a, 5), \ 247 | _ext_v32i8(a, 4), \ 248 | _ext_v32i8(a, 3), \ 249 | _ext_v32i8(a, 2), \ 250 | _ext_v32i8(a, 1), \ 251 | _ext_v32i8(a, 0)); \ 252 | } 253 | #else 254 | #define _print_v32i8(x) ; 255 | #endif 256 | 257 | #endif /* _V32I8_H_INCLUDED */ 258 | /** 259 | * end of v32i8.h 260 | */ 261 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/v32i16.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v32i16.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V32I16_H_INCLUDED 
8 | #define _V32I16_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 16bit 32cell */ 14 | typedef struct v32i16_s { 15 | __m128i v1; 16 | __m128i v2; 17 | __m128i v3; 18 | __m128i v4; 19 | } v32i16_t; 20 | 21 | /* expanders (without argument) */ 22 | #define _e_x_v32i16_1(u) 23 | #define _e_x_v32i16_2(u) 24 | #define _e_x_v32i16_3(u) 25 | #define _e_x_v32i16_4(u) 26 | 27 | /* expanders (without immediate) */ 28 | #define _e_v_v32i16_1(a) (a).v1 29 | #define _e_v_v32i16_2(a) (a).v2 30 | #define _e_v_v32i16_3(a) (a).v3 31 | #define _e_v_v32i16_4(a) (a).v4 32 | #define _e_vv_v32i16_1(a, b) (a).v1, (b).v1 33 | #define _e_vv_v32i16_2(a, b) (a).v2, (b).v2 34 | #define _e_vv_v32i16_3(a, b) (a).v3, (b).v3 35 | #define _e_vv_v32i16_4(a, b) (a).v4, (b).v4 36 | #define _e_vvv_v32i16_1(a, b, c) (a).v1, (b).v1, (c).v1 37 | #define _e_vvv_v32i16_2(a, b, c) (a).v2, (b).v2, (c).v2 38 | #define _e_vvv_v32i16_3(a, b, c) (a).v3, (b).v3, (c).v3 39 | #define _e_vvv_v32i16_4(a, b, c) (a).v4, (b).v4, (c).v4 40 | 41 | /* expanders with immediate */ 42 | #define _e_i_v32i16_1(imm) (imm) 43 | #define _e_i_v32i16_2(imm) (imm) 44 | #define _e_i_v32i16_3(imm) (imm) 45 | #define _e_i_v32i16_4(imm) (imm) 46 | #define _e_vi_v32i16_1(a, imm) (a).v1, (imm) 47 | #define _e_vi_v32i16_2(a, imm) (a).v2, (imm) 48 | #define _e_vi_v32i16_3(a, imm) (a).v3, (imm) 49 | #define _e_vi_v32i16_4(a, imm) (a).v4, (imm) 50 | #define _e_vvi_v32i16_1(a, b, imm) (a).v1, (b).v1, (imm) 51 | #define _e_vvi_v32i16_2(a, b, imm) (a).v2, (b).v2, (imm) 52 | #define _e_vvi_v32i16_3(a, b, imm) (a).v3, (b).v3, (imm) 53 | #define _e_vvi_v32i16_4(a, b, imm) (a).v4, (b).v4, (imm) 54 | 55 | /* address calculation macros */ 56 | #define _addr_v32i16_1(imm) ( (__m128i *)(imm) ) 57 | #define _addr_v32i16_2(imm) ( (__m128i *)(imm) + 1 ) 58 | #define _addr_v32i16_3(imm) ( (__m128i *)(imm) + 2 ) 59 | #define _addr_v32i16_4(imm) ( (__m128i *)(imm) + 3 ) 60 | #define _pv_v32i16(ptr) ( _addr_v32i16_1(ptr) ) 61 | /* expanders with pointers */ 62 | #define _e_p_v32i16_1(ptr) _addr_v32i16_1(ptr) 63 | #define _e_p_v32i16_2(ptr) _addr_v32i16_2(ptr) 64 | #define _e_p_v32i16_3(ptr) _addr_v32i16_3(ptr) 65 | #define _e_p_v32i16_4(ptr) _addr_v32i16_4(ptr) 66 | #define _e_pv_v32i16_1(ptr, a) _addr_v32i16_1(ptr), (a).v1 67 | #define _e_pv_v32i16_2(ptr, a) _addr_v32i16_2(ptr), (a).v2 68 | #define _e_pv_v32i16_3(ptr, a) _addr_v32i16_3(ptr), (a).v3 69 | #define _e_pv_v32i16_4(ptr, a) _addr_v32i16_4(ptr), (a).v4 70 | 71 | /* expand intrinsic name */ 72 | #define _i_v32i16(intrin) _mm_##intrin##_epi16 73 | #define _i_v32i16x(intrin) _mm_##intrin##_si128 74 | 75 | /* apply */ 76 | #define _a_v32i16(intrin, expander, ...) ( \ 77 | (v32i16_t) { \ 78 | _i_v32i16(intrin)(expander##_v32i16_1(__VA_ARGS__)), \ 79 | _i_v32i16(intrin)(expander##_v32i16_2(__VA_ARGS__)), \ 80 | _i_v32i16(intrin)(expander##_v32i16_3(__VA_ARGS__)), \ 81 | _i_v32i16(intrin)(expander##_v32i16_4(__VA_ARGS__)) \ 82 | } \ 83 | ) 84 | #define _a_v32i16x(intrin, expander, ...) ( \ 85 | (v32i16_t) { \ 86 | _i_v32i16x(intrin)(expander##_v32i16_1(__VA_ARGS__)), \ 87 | _i_v32i16x(intrin)(expander##_v32i16_2(__VA_ARGS__)), \ 88 | _i_v32i16x(intrin)(expander##_v32i16_3(__VA_ARGS__)), \ 89 | _i_v32i16x(intrin)(expander##_v32i16_4(__VA_ARGS__)) \ 90 | } \ 91 | ) 92 | #define _a_v32i16xv(intrin, expander, ...) 
{ \ 93 | _i_v32i16x(intrin)(expander##_v32i16_1(__VA_ARGS__)); \ 94 | _i_v32i16x(intrin)(expander##_v32i16_2(__VA_ARGS__)); \ 95 | _i_v32i16x(intrin)(expander##_v32i16_3(__VA_ARGS__)); \ 96 | _i_v32i16x(intrin)(expander##_v32i16_4(__VA_ARGS__)); \ 97 | } 98 | 99 | /* load and store */ 100 | #define _load_v32i16(...) _a_v32i16x(load, _e_p, __VA_ARGS__) 101 | #define _loadu_v32i16(...) _a_v32i16x(loadu, _e_p, __VA_ARGS__) 102 | #define _store_v32i16(...) _a_v32i16xv(store, _e_pv, __VA_ARGS__) 103 | #define _storeu_v32i16(...) _a_v32i16xv(storeu, _e_pv, __VA_ARGS__) 104 | 105 | /* broadcast */ 106 | #define _set_v32i16(...) _a_v32i16(set1, _e_i, __VA_ARGS__) 107 | #define _zero_v32i16() _a_v32i16x(setzero, _e_x, _unused) 108 | 109 | /* logics */ 110 | #define _not_v32i16(...) _a_v32i16x(not, _e_v, __VA_ARGS__) 111 | #define _and_v32i16(...) _a_v32i16x(and, _e_vv, __VA_ARGS__) 112 | #define _or_v32i16(...) _a_v32i16x(or, _e_vv, __VA_ARGS__) 113 | #define _xor_v32i16(...) _a_v32i16x(xor, _e_vv, __VA_ARGS__) 114 | #define _andn_v32i16(...) _a_v32i16x(andnot, _e_vv, __VA_ARGS__) 115 | 116 | /* arithmetics */ 117 | #define _add_v32i16(...) _a_v32i16(add, _e_vv, __VA_ARGS__) 118 | #define _sub_v32i16(...) _a_v32i16(sub, _e_vv, __VA_ARGS__) 119 | #define _max_v32i16(...) _a_v32i16(max, _e_vv, __VA_ARGS__) 120 | #define _min_v32i16(...) _a_v32i16(min, _e_vv, __VA_ARGS__) 121 | 122 | /* compare */ 123 | #define _eq_v32i16(...) _a_v32i16(cmpeq, _e_vv, __VA_ARGS__) 124 | #define _gt_v32i16(...) _a_v32i16(cmpgt, _e_vv, __VA_ARGS__) 125 | 126 | /* insert and extract */ 127 | #define _ins_v32i16(a, val, imm) { \ 128 | if((imm) < sizeof(__m128i)/sizeof(int16_t)) { \ 129 | (a).v1 = _i_v32i16(insert)((a).v1, (val), (imm)); \ 130 | } else if((imm) < 2*sizeof(__m128i)/sizeof(int16_t)) { \ 131 | (a).v2 = _i_v32i16(insert)((a).v2, (val), (imm) - sizeof(__m128i)/sizeof(int16_t)); \ 132 | } else if((imm) < 3*sizeof(__m128i)/sizeof(int16_t)) { \ 133 | (a).v3 = _i_v32i16(insert)((a).v3, (val), (imm) - 2*sizeof(__m128i)/sizeof(int16_t)); \ 134 | } else { \ 135 | (a).v4 = _i_v32i16(insert)((a).v4, (val), (imm) - 3*sizeof(__m128i)/sizeof(int16_t)); \ 136 | } \ 137 | } 138 | #define _ext_v32i16(a, imm) ( \ 139 | (int16_t)(((imm) < sizeof(__m128i)/sizeof(int16_t)) \ 140 | ? _i_v32i16(extract)((a).v1, (imm)) \ 141 | : (((imm) < 2*sizeof(__m128i)/sizeof(int16_t)) \ 142 | ? _i_v32i16(extract)((a).v2, (imm) - sizeof(__m128i)/sizeof(int16_t)) \ 143 | : (((imm) < 3*sizeof(__m128i)/sizeof(int16_t)) \ 144 | ? 
_i_v32i16(extract)((a).v3, (imm) - 2*sizeof(__m128i)/sizeof(int16_t)) \ 145 | : _i_v32i16(extract)((a).v4, (imm) - 3*sizeof(__m128i)/sizeof(int16_t))))) \ 146 | ) 147 | 148 | /* mask */ 149 | #define _mask_v32i16(a) ( \ 150 | (v32_mask_t) { \ 151 | .m1 = _mm_movemask_epi8( \ 152 | _mm_packs_epi16((a).v1, (a).v2)), \ 153 | .m2 = _mm_movemask_epi8( \ 154 | _mm_packs_epi16((a).v3, (a).v4)) \ 155 | } \ 156 | ) 157 | 158 | /* horizontal max (reduction max) */ 159 | #define _hmax_v32i16(a) ({ \ 160 | __m128i _vmax = _mm_max_epi16( \ 161 | _mm_max_epi16((a).v1, (a).v2), \ 162 | _mm_max_epi16((a).v3, (a).v4)); \ 163 | _vmax = _mm_max_epi16(_vmax, \ 164 | _mm_srli_si128(_vmax, 8)); \ 165 | _vmax = _mm_max_epi16(_vmax, \ 166 | _mm_srli_si128(_vmax, 4)); \ 167 | _vmax = _mm_max_epi16(_vmax, \ 168 | _mm_srli_si128(_vmax, 2)); \ 169 | (int16_t)_mm_extract_epi16(_vmax, 0); \ 170 | }) 171 | 172 | #define _cvt_v32i8_v32i16(a) ( \ 173 | (v32i16_t) { \ 174 | _mm_cvtepi8_epi16((a).v1), \ 175 | _mm_cvtepi8_epi16(_mm_srli_si128((a).v1, 8)), \ 176 | _mm_cvtepi8_epi16((a).v2), \ 177 | _mm_cvtepi8_epi16(_mm_srli_si128((a).v2, 8)) \ 178 | } \ 179 | ) 180 | 181 | /* debug print */ 182 | #ifdef _LOG_H_INCLUDED 183 | #define _print_v32i16(a) { \ 184 | debug("(v32i16_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 185 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 186 | #a, \ 187 | _ext_v32i16(a, 31), \ 188 | _ext_v32i16(a, 30), \ 189 | _ext_v32i16(a, 29), \ 190 | _ext_v32i16(a, 28), \ 191 | _ext_v32i16(a, 27), \ 192 | _ext_v32i16(a, 26), \ 193 | _ext_v32i16(a, 25), \ 194 | _ext_v32i16(a, 24), \ 195 | _ext_v32i16(a, 23), \ 196 | _ext_v32i16(a, 22), \ 197 | _ext_v32i16(a, 21), \ 198 | _ext_v32i16(a, 20), \ 199 | _ext_v32i16(a, 19), \ 200 | _ext_v32i16(a, 18), \ 201 | _ext_v32i16(a, 17), \ 202 | _ext_v32i16(a, 16), \ 203 | _ext_v32i16(a, 15), \ 204 | _ext_v32i16(a, 14), \ 205 | _ext_v32i16(a, 13), \ 206 | _ext_v32i16(a, 12), \ 207 | _ext_v32i16(a, 11), \ 208 | _ext_v32i16(a, 10), \ 209 | _ext_v32i16(a, 9), \ 210 | _ext_v32i16(a, 8), \ 211 | _ext_v32i16(a, 7), \ 212 | _ext_v32i16(a, 6), \ 213 | _ext_v32i16(a, 5), \ 214 | _ext_v32i16(a, 4), \ 215 | _ext_v32i16(a, 3), \ 216 | _ext_v32i16(a, 2), \ 217 | _ext_v32i16(a, 1), \ 218 | _ext_v32i16(a, 0)); \ 219 | } 220 | #else 221 | #define _print_v32i16(x) ; 222 | #endif 223 | 224 | #endif /* _V32I16_H_INCLUDED */ 225 | /** 226 | * end of v32i16.h 227 | */ 228 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/arch_util.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file arch_util.h 4 | * 5 | * @brief architecture-dependent utilities devided from util.h 6 | */ 7 | #ifndef _ARCH_UTIL_H_INCLUDED 8 | #define _ARCH_UTIL_H_INCLUDED 9 | #define MM_ARCH "AVX2" 10 | 11 | #include "vector.h" 12 | #include 13 | #include 14 | 15 | /** 16 | * misc bit operations (popcnt, tzcnt, and lzcnt) 17 | */ 18 | 19 | /** 20 | * @macro popcnt 21 | */ 22 | #define popcnt(x) ( (uint64_t)_mm_popcnt_u64(x) ) 23 | 24 | /** 25 | * @macro ZCNT_RESULT 26 | * @brief workaround for a bug in gcc (<= 5), all the results of tzcnt / lzcnt macros must be modified by this label 27 | */ 28 | #ifndef ZCNT_RESULT 29 | # if defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 600 30 | # define ZCNT_RESULT volatile 31 | # else 32 | # define ZCNT_RESULT 33 | # endif 34 | #endif 35 | 36 | /** 37 | * @macro tzcnt 38 | * @brief trailing zero count (count #continuous zeros from 
LSb) 39 | */ 40 | /** immintrin.h is already included */ 41 | #if defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 490 42 | # define tzcnt(x) ( (uint64_t)__tzcnt_u64(x) ) 43 | #else 44 | # define tzcnt(x) ( (uint64_t)_tzcnt_u64(x) ) 45 | #endif 46 | 47 | /** 48 | * @macro lzcnt 49 | * @brief leading zero count (count #continuous zeros from MSb) 50 | */ 51 | /* __lzcnt_u64 in bmiintrin.h gcc-4.6, _lzcnt_u64 in lzcntintrin.h from gcc-4.7 */ 52 | #if defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 470 53 | # define lzcnt(x) ( (uint64_t)__lzcnt_u64(x) ) 54 | #else 55 | # define lzcnt(x) ( (uint64_t)_lzcnt_u64(x) ) 56 | #endif 57 | 58 | /** 59 | * @macro _swap_u64 60 | */ 61 | #if defined(__clang__) || (defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 470) 62 | # define _swap_u64(x) ({ uint64_t _x = (x); __asm__( "bswapq %0" : "+r"(_x) ); _x; }) 63 | #else 64 | # define _swap_u64(x) ( (uint64_t)_bswap64(x) ) 65 | #endif 66 | 67 | /** 68 | * @macro _loadu_u64, _storeu_u64 69 | */ 70 | #define _loadu_u64(p) ({ uint8_t const *_p = (uint8_t const *)(p); *((uint64_t const *)_p); }) 71 | #define _storeu_u64(p, e) { uint8_t *_p = (uint8_t *)(p); *((uint64_t *)(_p)) = (e); } 72 | #define _loadu_u32(p) ({ uint8_t const *_p = (uint8_t const *)(p); *((uint32_t const *)_p); }) 73 | #define _storeu_u32(p, e) { uint8_t *_p = (uint8_t *)(p); *((uint32_t *)(_p)) = (e); } 74 | 75 | /** 76 | * @macro _aligned_block_memcpy 77 | * 78 | * @brief copy size bytes from src to dst. 79 | * 80 | * @detail 81 | * src and dst must be aligned to 16-byte boundary. 82 | * copy must be multipe of 16. 83 | */ 84 | #define _ymm_rd_a(src, n) (ymm##n) = _mm256_load_si256((__m256i *)(src) + (n)) 85 | #define _ymm_rd_u(src, n) (ymm##n) = _mm256_loadu_si256((__m256i *)(src) + (n)) 86 | #define _ymm_wr_a(dst, n) _mm256_store_si256((__m256i *)(dst) + (n), (ymm##n)) 87 | #define _ymm_wr_u(dst, n) _mm256_storeu_si256((__m256i *)(dst) + (n), (ymm##n)) 88 | #define _memcpy_blk_intl(dst, src, size, _wr, _rd) { \ 89 | /** duff's device */ \ 90 | uint8_t *_src = (uint8_t *)(src), *_dst = (uint8_t *)(dst); \ 91 | uint64_t const _nreg = 16; /** #ymm registers == 16 */ \ 92 | uint64_t const _tcnt = (size) / sizeof(__m256i); \ 93 | uint64_t const _offset = ((_tcnt - 1) & (_nreg - 1)) - (_nreg - 1); \ 94 | uint64_t _jmp = _tcnt & (_nreg - 1); \ 95 | uint64_t _lcnt = (_tcnt + _nreg - 1) / _nreg; \ 96 | register __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; \ 97 | register __m256i ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; \ 98 | _src += _offset * sizeof(__m256i); \ 99 | _dst += _offset * sizeof(__m256i); \ 100 | switch(_jmp) { \ 101 | case 0: do { _rd(_src, 0); \ 102 | case 15: _rd(_src, 1); \ 103 | case 14: _rd(_src, 2); \ 104 | case 13: _rd(_src, 3); \ 105 | case 12: _rd(_src, 4); \ 106 | case 11: _rd(_src, 5); \ 107 | case 10: _rd(_src, 6); \ 108 | case 9: _rd(_src, 7); \ 109 | case 8: _rd(_src, 8); \ 110 | case 7: _rd(_src, 9); \ 111 | case 6: _rd(_src, 10); \ 112 | case 5: _rd(_src, 11); \ 113 | case 4: _rd(_src, 12); \ 114 | case 3: _rd(_src, 13); \ 115 | case 2: _rd(_src, 14); \ 116 | case 1: _rd(_src, 15); \ 117 | switch(_jmp) { \ 118 | case 0: _wr(_dst, 0); \ 119 | case 15: _wr(_dst, 1); \ 120 | case 14: _wr(_dst, 2); \ 121 | case 13: _wr(_dst, 3); \ 122 | case 12: _wr(_dst, 4); \ 123 | case 11: _wr(_dst, 5); \ 124 | case 10: _wr(_dst, 6); \ 125 | case 9: _wr(_dst, 7); \ 126 | case 8: _wr(_dst, 8); \ 127 | case 7: _wr(_dst, 9); \ 128 | case 6: _wr(_dst, 10); \ 129 | case 5: _wr(_dst, 11); \ 130 | case 4: _wr(_dst, 
12); \ 131 | case 3: _wr(_dst, 13); \ 132 | case 2: _wr(_dst, 14); \ 133 | case 1: _wr(_dst, 15); \ 134 | } \ 135 | _src += _nreg * sizeof(__m256i); \ 136 | _dst += _nreg * sizeof(__m256i); \ 137 | _jmp = 0; \ 138 | } while(--_lcnt > 0); \ 139 | } \ 140 | } 141 | #define _memcpy_blk_aa(dst, src, len) _memcpy_blk_intl(dst, src, len, _ymm_wr_a, _ymm_rd_a) 142 | #define _memcpy_blk_au(dst, src, len) _memcpy_blk_intl(dst, src, len, _ymm_wr_a, _ymm_rd_u) 143 | #define _memcpy_blk_ua(dst, src, len) _memcpy_blk_intl(dst, src, len, _ymm_wr_u, _ymm_rd_a) 144 | #define _memcpy_blk_uu(dst, src, len) _memcpy_blk_intl(dst, src, len, _ymm_wr_u, _ymm_rd_u) 145 | #define _memset_blk_intl(dst, a, size, _wr) { \ 146 | uint8_t *_dst = (uint8_t *)(dst); \ 147 | __m256i const ymm0 = _mm256_set1_epi8((int8_t)a); \ 148 | uint64_t i; \ 149 | for(i = 0; i < size / sizeof(__m256i); i++) { \ 150 | _wr(_dst, 0); _dst += sizeof(__m256i); \ 151 | } \ 152 | } 153 | #define _memset_blk_a(dst, a, size) _memset_blk_intl(dst, a, size, _ymm_wr_a) 154 | #define _memset_blk_u(dst, a, size) _memset_blk_intl(dst, a, size, _ymm_wr_u) 155 | 156 | 157 | /** 158 | * substitution matrix abstraction 159 | */ 160 | /* store */ 161 | #define _store_sb(_scv, sv16) { _store_v32i8((_scv).v1, _from_v16i8_v32i8(sv16)); } 162 | 163 | /* load */ 164 | #define _load_sb(scv) ( _from_v32i8_n(_load_v32i8((scv).v1)) ) 165 | 166 | /** 167 | * gap penalty vector abstraction macros 168 | */ 169 | /* store */ 170 | #define _make_gap(_e1, _e2, _e3, _e4) ( \ 171 | (v16i8_t){ _mm_set_epi8( \ 172 | (_e4), (_e4), (_e4), (_e4), \ 173 | (_e3), (_e3), (_e3), (_e3), \ 174 | (_e2), (_e2), (_e2), (_e2), \ 175 | (_e1), (_e1), (_e1), (_e1)) \ 176 | } \ 177 | ) 178 | #define _store_adjh(_scv, _adjh, _adjv, _ofsh, _ofsv) { \ 179 | _store_v32i8((_scv).v3, _from_v16i8_v32i8(_make_gap(_adjh, _adjv, _ofsh, _ofsv))) \ 180 | } 181 | #define _store_adjv(_scv, _adjh, _adjv, _ofsh, _ofsv) { \ 182 | /* nothing to do */ \ 183 | /*_store_v32i8((_scv).v3, _from_v16i8_v32i8(_make_gap(_adjh, _adjv, _ofsh, _ofsv)))*/ \ 184 | } 185 | #define _store_ofsh(_scv, _adjh, _adjv, _ofsh, _ofsv) { \ 186 | /* nothing to do */ \ 187 | /* _store_v32i8((_scv).v5, _from_v16i8_v32i8(_make_gap(_adjh, _adjv, _ofsh, _ofsv)))*/ \ 188 | } 189 | #define _store_ofsv(_scv, _adjh, _adjv, _ofsh, _ofsv) { \ 190 | /* nothing to do */ \ 191 | /*_store_v32i8((_scv).v5, _from_v16i8_v32i8(_make_gap(_adjh, _adjv, _ofsh, _ofsv)))*/ \ 192 | } 193 | 194 | /* load */ 195 | #define _load_gap(_ptr, _idx) ( \ 196 | (v32i8_t){ _mm256_shuffle_epi32(_mm256_load_si256((__m256i const *)(_ptr)), (_idx)) } \ 197 | ) 198 | 199 | #define _load_adjh(_scv) ( _from_v32i8_n(_load_gap((_scv).v3, 0x00)) ) 200 | #define _load_adjv(_scv) ( _from_v32i8_n(_load_gap((_scv).v3, 0x00)) ) 201 | #define _load_ofsh(_scv) ( _from_v32i8_n(_load_gap((_scv).v3, 0x55)) ) 202 | #define _load_ofsv(_scv) ( _from_v32i8_n(_load_gap((_scv).v3, 0x55)) ) 203 | #define _load_gfh(_scv) ( _from_v32i8_n(_load_gap((_scv).v3, 0xaa)) ) 204 | #define _load_gfv(_scv) ( _from_v32i8_n(_load_gap((_scv).v3, 0xff)) ) 205 | /* 206 | #define _load_adjv(_scv) ( _from_v32i8_n(_load_gap((_scv).v3, 0x55)) ) 207 | #define _load_ofsv(_scv) ( _from_v32i8_n(_load_gap((_scv).v3, 0xff)) ) 208 | */ 209 | 210 | 211 | /* cache line operation */ 212 | #define WCR_BUF_SIZE ( 128 ) /** two cache lines in x86_64 */ 213 | #define memcpy_buf(_dst, _src) { \ 214 | register __m256i *_s = (__m256i *)(_src); \ 215 | register __m256i *_d = (__m256i *)(_dst); \ 216 | __m256i ymm0 = 
_mm256_load_si256(_s); \ 217 | __m256i ymm1 = _mm256_load_si256(_s + 1); \ 218 | __m256i ymm2 = _mm256_load_si256(_s + 2); \ 219 | __m256i ymm3 = _mm256_load_si256(_s + 3); \ 220 | _mm256_stream_si256(_d, ymm0); \ 221 | _mm256_stream_si256(_d + 1, ymm1); \ 222 | _mm256_stream_si256(_d + 2, ymm2); \ 223 | _mm256_stream_si256(_d + 3, ymm3); \ 224 | } 225 | 226 | /* 128bit register operation */ 227 | #define elem_128_t __m128i 228 | #define rd_128(_ptr) ( _mm_load_si128((__m128i *)(_ptr)) ) 229 | #define wr_128(_ptr, _e) { _mm_store_si128((__m128i *)(_ptr), (_e)); } 230 | #define _ex_128(k, h) ( _mm_extract_epi64((elem_128_t)k, h) ) 231 | #define ex_128(k, p) ( ((((p)>>3) ? _ex_128(k, 1) : _ex_128(k, 0))>>(((p) & 0x07)<<3)) & (WCR_OCC_SIZE-1) ) 232 | #define p_128(v) ( _mm_cvtsi64_si128((uint64_t)(v)) ) 233 | #define e_128(v) ( (uint64_t)_mm_cvtsi128_si64((__m128i)(v)) ) 234 | 235 | 236 | 237 | /* compare and swap (cas) */ 238 | #if defined(__GNUC__) 239 | # if (defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 470) || (defined(__INTEL_COMPILER) && _ARCH_GCC_COMPAT < 470) 240 | # define cas(ptr, cmp, val) ({ \ 241 | uint8_t _res; \ 242 | __asm__ volatile ("lock cmpxchg %[src], %[dst]\n\tsete %[res]" \ 243 | : [dst]"+m"(*ptr), [res]"=a"(_res) \ 244 | : [src]"r"(val), "a"(*cmp) \ 245 | : "memory", "cc"); \ 246 | _res; \ 247 | }) 248 | # define fence() ({ \ 249 | __asm__ volatile ("mfence"); \ 250 | }) 251 | # else /* > 4.7 */ 252 | # define cas(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED) 253 | # define fence() __sync_synchronize() 254 | # endif 255 | #else 256 | # error "atomic compare-and-exchange is not supported in this version of compiler." 257 | #endif 258 | 259 | #endif /* #ifndef _ARCH_UTIL_H_INCLUDED */ 260 | /** 261 | * end of arch_util.h 262 | */ 263 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/v64i16.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v64i16.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V64I16_H_INCLUDED 8 | #define _V64I16_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 16bit 64cell */ 14 | typedef struct v64i16_s { 15 | __m256i v1; 16 | __m256i v2; 17 | __m256i v3; 18 | __m256i v4; 19 | } v64i16_t; 20 | 21 | /* expanders (without argument) */ 22 | #define _e_x_v64i16_1(u) 23 | #define _e_x_v64i16_2(u) 24 | #define _e_x_v64i16_3(u) 25 | #define _e_x_v64i16_4(u) 26 | 27 | /* expanders (without immediate) */ 28 | #define _e_v_v64i16_1(a) (a).v1 29 | #define _e_v_v64i16_2(a) (a).v2 30 | #define _e_v_v64i16_3(a) (a).v3 31 | #define _e_v_v64i16_4(a) (a).v4 32 | #define _e_vv_v64i16_1(a, b) (a).v1, (b).v1 33 | #define _e_vv_v64i16_2(a, b) (a).v2, (b).v2 34 | #define _e_vv_v64i16_3(a, b) (a).v3, (b).v3 35 | #define _e_vv_v64i16_4(a, b) (a).v4, (b).v4 36 | #define _e_vvv_v64i16_1(a, b, c) (a).v1, (b).v1, (c).v1 37 | #define _e_vvv_v64i16_2(a, b, c) (a).v2, (b).v2, (c).v2 38 | #define _e_vvv_v64i16_3(a, b, c) (a).v3, (b).v3, (c).v3 39 | #define _e_vvv_v64i16_4(a, b, c) (a).v4, (b).v4, (c).v4 40 | 41 | /* expanders with immediate */ 42 | #define _e_i_v64i16_1(imm) (imm) 43 | #define _e_i_v64i16_2(imm) (imm) 44 | #define _e_i_v64i16_3(imm) (imm) 45 | #define _e_i_v64i16_4(imm) (imm) 46 | #define _e_vi_v64i16_1(a, imm) (a).v1, (imm) 47 | #define _e_vi_v64i16_2(a, imm) (a).v2, (imm) 48 | #define 
_e_vi_v64i16_3(a, imm) (a).v3, (imm) 49 | #define _e_vi_v64i16_4(a, imm) (a).v4, (imm) 50 | #define _e_vvi_v64i16_1(a, b, imm) (a).v1, (b).v1, (imm) 51 | #define _e_vvi_v64i16_2(a, b, imm) (a).v2, (b).v2, (imm) 52 | #define _e_vvi_v64i16_3(a, b, imm) (a).v3, (b).v3, (imm) 53 | #define _e_vvi_v64i16_4(a, b, imm) (a).v4, (b).v4, (imm) 54 | 55 | /* address calculation macros */ 56 | #define _addr_v64i16_1(imm) ( (__m256i *)(imm) ) 57 | #define _addr_v64i16_2(imm) ( (__m256i *)(imm) + 1 ) 58 | #define _addr_v64i16_3(imm) ( (__m256i *)(imm) + 2 ) 59 | #define _addr_v64i16_4(imm) ( (__m256i *)(imm) + 3 ) 60 | #define _pv_v64i16(ptr) ( _addr_v64i16_1(ptr) ) 61 | 62 | /* expanders with pointers */ 63 | #define _e_p_v64i16_1(ptr) _addr_v64i16_1(ptr) 64 | #define _e_p_v64i16_2(ptr) _addr_v64i16_2(ptr) 65 | #define _e_p_v64i16_3(ptr) _addr_v64i16_3(ptr) 66 | #define _e_p_v64i16_4(ptr) _addr_v64i16_4(ptr) 67 | #define _e_pv_v64i16_1(ptr, a) _addr_v64i16_1(ptr), (a).v1 68 | #define _e_pv_v64i16_2(ptr, a) _addr_v64i16_2(ptr), (a).v2 69 | #define _e_pv_v64i16_3(ptr, a) _addr_v64i16_3(ptr), (a).v3 70 | #define _e_pv_v64i16_4(ptr, a) _addr_v64i16_4(ptr), (a).v4 71 | 72 | /* expand intrinsic name */ 73 | #define _i_v64i16(intrin) _mm256_##intrin##_epi16 74 | #define _i_v64i16x(intrin) _mm256_##intrin##_si256 75 | 76 | /* apply */ 77 | #define _a_v64i16(intrin, expander, ...) ( \ 78 | (v64i16_t) { \ 79 | _i_v64i16(intrin)(expander##_v64i16_1(__VA_ARGS__)), \ 80 | _i_v64i16(intrin)(expander##_v64i16_2(__VA_ARGS__)), \ 81 | _i_v64i16(intrin)(expander##_v64i16_3(__VA_ARGS__)), \ 82 | _i_v64i16(intrin)(expander##_v64i16_4(__VA_ARGS__)) \ 83 | } \ 84 | ) 85 | #define _a_v64i16x(intrin, expander, ...) ( \ 86 | (v64i16_t) { \ 87 | _i_v64i16x(intrin)(expander##_v64i16_1(__VA_ARGS__)), \ 88 | _i_v64i16x(intrin)(expander##_v64i16_2(__VA_ARGS__)), \ 89 | _i_v64i16x(intrin)(expander##_v64i16_3(__VA_ARGS__)), \ 90 | _i_v64i16x(intrin)(expander##_v64i16_4(__VA_ARGS__)) \ 91 | } \ 92 | ) 93 | #define _a_v64i16xv(intrin, expander, ...) { \ 94 | _i_v64i16x(intrin)(expander##_v64i16_1(__VA_ARGS__)); \ 95 | _i_v64i16x(intrin)(expander##_v64i16_2(__VA_ARGS__)); \ 96 | _i_v64i16x(intrin)(expander##_v64i16_3(__VA_ARGS__)); \ 97 | _i_v64i16x(intrin)(expander##_v64i16_4(__VA_ARGS__)); \ 98 | } 99 | 100 | /* load and store */ 101 | #define _load_v64i16(...) _a_v64i16x(load, _e_p, __VA_ARGS__) 102 | #define _loadu_v64i16(...) _a_v64i16x(loadu, _e_p, __VA_ARGS__) 103 | #define _store_v64i16(...) _a_v64i16xv(store, _e_pv, __VA_ARGS__) 104 | #define _storeu_v64i16(...) _a_v64i16xv(storeu, _e_pv, __VA_ARGS__) 105 | 106 | /* broadcast */ 107 | #define _set_v64i16(...) _a_v64i16(set1, _e_i, __VA_ARGS__) 108 | #define _zero_v64i16() _a_v64i16x(setzero, _e_x, _unused) 109 | 110 | /* logics */ 111 | #define _not_v64i16(...) _a_v64i16x(not, _e_v, __VA_ARGS__) 112 | #define _and_v64i16(...) _a_v64i16x(and, _e_vv, __VA_ARGS__) 113 | #define _or_v64i16(...) _a_v64i16x(or, _e_vv, __VA_ARGS__) 114 | #define _xor_v64i16(...) _a_v64i16x(xor, _e_vv, __VA_ARGS__) 115 | #define _andn_v64i16(...) _a_v64i16x(andnot, _e_vv, __VA_ARGS__) 116 | 117 | /* arithmetics */ 118 | #define _add_v64i16(...) _a_v64i16(add, _e_vv, __VA_ARGS__) 119 | #define _sub_v64i16(...) _a_v64i16(sub, _e_vv, __VA_ARGS__) 120 | #define _max_v64i16(...) _a_v64i16(max, _e_vv, __VA_ARGS__) 121 | #define _min_v64i16(...) _a_v64i16(min, _e_vv, __VA_ARGS__) 122 | 123 | /* compare */ 124 | #define _eq_v64i16(...) _a_v64i16(cmpeq, _e_vv, __VA_ARGS__) 125 | #define _gt_v64i16(...) 
_a_v64i16(cmpgt, _e_vv, __VA_ARGS__) 126 | 127 | 128 | /* insert and extract */ 129 | #define _V64I16_N ( sizeof(__m256i) / sizeof(int16_t) ) 130 | #define _ins_v64i16(a, val, imm) { \ 131 | if((imm) < _V64I16_N) { \ 132 | (a).v1 = _i_v64i16(insert)((a).v1, (val), (imm)); \ 133 | } else if((imm) < 2 * _V64I16_N) { \ 134 | (a).v2 = _i_v64i16(insert)((a).v2, (val), (imm) - _V64I16_N); \ 135 | } else if((imm) < 3 * _V64I16_N) { \ 136 | (a).v3 = _i_v64i16(insert)((a).v3, (val), (imm) - 2 * _V64I16_N); \ 137 | } else { \ 138 | (a).v4 = _i_v64i16(insert)((a).v4, (val), (imm) - 3 * _V64I16_N); \ 139 | } \ 140 | } 141 | #define _ext_v64i16(a, imm) ( \ 142 | (int16_t)(((imm) < _V64I16_N) ? ( \ 143 | _i_v64i16(extract)((a).v1, (imm)) \ 144 | ) : ((imm) < 2 * _V64I16_N) ? ( \ 145 | _i_v64i16(extract)((a).v2, (imm) - _V64I16_N) \ 146 | ) : ((imm) < 3 * _V64I16_N) ? ( \ 147 | _i_v64i16(extract)((a).v3, (imm) - 2 * _V64I16_N) \ 148 | ) : ( \ 149 | _i_v64i16(extract)((a).v4, (imm) - 3 * _V64I16_N) \ 150 | )) \ 151 | ) 152 | 153 | /* mask */ 154 | #define _mask_v64i16(a) ( \ 155 | (v64_mask_t) { \ 156 | .m1 = _mm256_movemask_epi8( \ 157 | _mm256_packs_epi16( \ 158 | _mm256_permute2x128_si256((a).v1, (a).v2, 0x20), \ 159 | _mm256_permute2x128_si256((a).v1, (a).v2, 0x31))), \ 160 | .m2 = _mm256_movemask_epi8( \ 161 | _mm256_packs_epi16( \ 162 | _mm256_permute2x128_si256((a).v3, (a).v4, 0x20), \ 163 | _mm256_permute2x128_si256((a).v3, (a).v4, 0x31))) \ 164 | } \ 165 | ) 166 | 167 | /* horizontal max (reduction max) */ 168 | #define _hmax_v64i16(a) ({ \ 169 | __m256i _s = _mm256_max_epi16( \ 170 | _mm256_max_epi16((a).v1, (a).v2), \ 171 | _mm256_max_epi16((a).v3, (a).v4) \ 172 | ); \ 173 | __m128i _t = _mm_max_epi16( \ 174 | _mm256_castsi256_si128(_s), \ 175 | _mm256_extracti128_si256(_s, 1) \ 176 | ); \ 177 | _t = _mm_max_epi16(_t, _mm_srli_si128(_t, 8)); \ 178 | _t = _mm_max_epi16(_t, _mm_srli_si128(_t, 4)); \ 179 | _t = _mm_max_epi16(_t, _mm_srli_si128(_t, 2)); \ 180 | (int16_t)_mm_extract_epi16(_t, 0); \ 181 | }) 182 | 183 | #define _cvt_v64i8_v64i16(a) ( \ 184 | (v64i16_t) { \ 185 | _mm256_cvtepi8_epi16(_mm256_castsi256_si128((a).v1)), \ 186 | _mm256_cvtepi8_epi16(_mm256_extracti128_si256((a).v1, 1)), \ 187 | _mm256_cvtepi8_epi16(_mm256_castsi256_si128((a).v2)), \ 188 | _mm256_cvtepi8_epi16(_mm256_extracti128_si256((a).v2, 1)) \ 189 | } \ 190 | ) 191 | 192 | /* debug print */ 193 | // #ifdef _LOG_H_INCLUDED 194 | #define _print_v64i16(a) { \ 195 | debug("(v64i16_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 196 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 197 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 198 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 199 | #a, \ 200 | _ext_v64i16(a, 32 + 31), \ 201 | _ext_v64i16(a, 32 + 30), \ 202 | _ext_v64i16(a, 32 + 29), \ 203 | _ext_v64i16(a, 32 + 28), \ 204 | _ext_v64i16(a, 32 + 27), \ 205 | _ext_v64i16(a, 32 + 26), \ 206 | _ext_v64i16(a, 32 + 25), \ 207 | _ext_v64i16(a, 32 + 24), \ 208 | _ext_v64i16(a, 32 + 23), \ 209 | _ext_v64i16(a, 32 + 22), \ 210 | _ext_v64i16(a, 32 + 21), \ 211 | _ext_v64i16(a, 32 + 20), \ 212 | _ext_v64i16(a, 32 + 19), \ 213 | _ext_v64i16(a, 32 + 18), \ 214 | _ext_v64i16(a, 32 + 17), \ 215 | _ext_v64i16(a, 32 + 16), \ 216 | _ext_v64i16(a, 32 + 15), \ 217 | _ext_v64i16(a, 32 + 14), \ 218 | _ext_v64i16(a, 32 + 13), \ 219 | _ext_v64i16(a, 32 + 12), \ 220 | _ext_v64i16(a, 32 + 11), \ 221 | _ext_v64i16(a, 32 + 10), \ 222 | _ext_v64i16(a, 32 + 
9), \ 223 | _ext_v64i16(a, 32 + 8), \ 224 | _ext_v64i16(a, 32 + 7), \ 225 | _ext_v64i16(a, 32 + 6), \ 226 | _ext_v64i16(a, 32 + 5), \ 227 | _ext_v64i16(a, 32 + 4), \ 228 | _ext_v64i16(a, 32 + 3), \ 229 | _ext_v64i16(a, 32 + 2), \ 230 | _ext_v64i16(a, 32 + 1), \ 231 | _ext_v64i16(a, 32 + 0), \ 232 | _ext_v64i16(a, 31), \ 233 | _ext_v64i16(a, 30), \ 234 | _ext_v64i16(a, 29), \ 235 | _ext_v64i16(a, 28), \ 236 | _ext_v64i16(a, 27), \ 237 | _ext_v64i16(a, 26), \ 238 | _ext_v64i16(a, 25), \ 239 | _ext_v64i16(a, 24), \ 240 | _ext_v64i16(a, 23), \ 241 | _ext_v64i16(a, 22), \ 242 | _ext_v64i16(a, 21), \ 243 | _ext_v64i16(a, 20), \ 244 | _ext_v64i16(a, 19), \ 245 | _ext_v64i16(a, 18), \ 246 | _ext_v64i16(a, 17), \ 247 | _ext_v64i16(a, 16), \ 248 | _ext_v64i16(a, 15), \ 249 | _ext_v64i16(a, 14), \ 250 | _ext_v64i16(a, 13), \ 251 | _ext_v64i16(a, 12), \ 252 | _ext_v64i16(a, 11), \ 253 | _ext_v64i16(a, 10), \ 254 | _ext_v64i16(a, 9), \ 255 | _ext_v64i16(a, 8), \ 256 | _ext_v64i16(a, 7), \ 257 | _ext_v64i16(a, 6), \ 258 | _ext_v64i16(a, 5), \ 259 | _ext_v64i16(a, 4), \ 260 | _ext_v64i16(a, 3), \ 261 | _ext_v64i16(a, 2), \ 262 | _ext_v64i16(a, 1), \ 263 | _ext_v64i16(a, 0)); \ 264 | } 265 | // #else 266 | // #define _print_v64i16(x) ; 267 | // #endif 268 | 269 | #endif /* _V64I16_H_INCLUDED */ 270 | /** 271 | * end of v64i16.h 272 | */ 273 | -------------------------------------------------------------------------------- /arch/x86_64_avx2/v64i8.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v64i8.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V64I8_H_INCLUDED 8 | #define _V64I8_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 64cell */ 14 | typedef struct v64i8_s { 15 | __m256i v1; 16 | __m256i v2; 17 | } v64i8_t; 18 | 19 | /* expanders (without argument) */ 20 | #define _e_x_v64i8_1(u) 21 | #define _e_x_v64i8_2(u) 22 | 23 | /* expanders (without immediate) */ 24 | #define _e_v_v64i8_1(a) (a).v1 25 | #define _e_v_v64i8_2(a) (a).v2 26 | #define _e_vv_v64i8_1(a, b) (a).v1, (b).v1 27 | #define _e_vv_v64i8_2(a, b) (a).v2, (b).v2 28 | #define _e_vvv_v64i8_1(a, b, c) (a).v1, (b).v1, (c).v1 29 | #define _e_vvv_v64i8_2(a, b, c) (a).v2, (b).v2, (c).v2 30 | 31 | /* expanders with immediate */ 32 | #define _e_i_v64i8_1(imm) (imm) 33 | #define _e_i_v64i8_2(imm) (imm) 34 | #define _e_vi_v64i8_1(a, imm) (a).v1, (imm) 35 | #define _e_vi_v64i8_2(a, imm) (a).v2, (imm) 36 | #define _e_vvi_v64i8_1(a, b, imm) (a).v1, (b).v1, (imm) 37 | #define _e_vvi_v64i8_2(a, b, imm) (a).v2, (b).v2, (imm) 38 | 39 | /* address calculation macros */ 40 | #define _addr_v64i8_1(imm) ( (__m256i *)(imm) ) 41 | #define _addr_v64i8_2(imm) ( (__m256i *)(imm) + 1 ) 42 | #define _pv_v64i8(ptr) ( _addr_v64i8_1(ptr) ) 43 | 44 | /* expanders with pointers */ 45 | #define _e_p_v64i8_1(ptr) _addr_v64i8_1(ptr) 46 | #define _e_p_v64i8_2(ptr) _addr_v64i8_2(ptr) 47 | #define _e_pv_v64i8_1(ptr, a) _addr_v64i8_1(ptr), (a).v1 48 | #define _e_pv_v64i8_2(ptr, a) _addr_v64i8_2(ptr), (a).v2 49 | 50 | /* expand intrinsic name */ 51 | #define _i_v64i8(intrin) _mm256_##intrin##_epi8 52 | #define _i_v64u8(intrin) _mm256_##intrin##_epu8 53 | #define _i_v64i8x(intrin) _mm256_##intrin##_si256 54 | 55 | /* apply */ 56 | #define _a_v64i8(intrin, expander, ...) 
( \ 57 | (v64i8_t) { \ 58 | _i_v64i8(intrin)(expander##_v64i8_1(__VA_ARGS__)), \ 59 | _i_v64i8(intrin)(expander##_v64i8_2(__VA_ARGS__)) \ 60 | } \ 61 | ) 62 | #define _a_v64u8(intrin, expander, ...) ( \ 63 | (v64i8_t) { \ 64 | _i_v64u8(intrin)(expander##_v64i8_1(__VA_ARGS__)), \ 65 | _i_v64u8(intrin)(expander##_v64i8_2(__VA_ARGS__)) \ 66 | } \ 67 | ) 68 | #define _a_v64i8x(intrin, expander, ...) ( \ 69 | (v64i8_t) { \ 70 | _i_v64i8x(intrin)(expander##_v64i8_1(__VA_ARGS__)), \ 71 | _i_v64i8x(intrin)(expander##_v64i8_2(__VA_ARGS__)) \ 72 | } \ 73 | ) 74 | #define _a_v64i8xv(intrin, expander, ...) { \ 75 | _i_v64i8x(intrin)(expander##_v64i8_1(__VA_ARGS__)); \ 76 | _i_v64i8x(intrin)(expander##_v64i8_2(__VA_ARGS__)); \ 77 | } 78 | 79 | /* load and store */ 80 | #define _load_v64i8(...) _a_v64i8x(load, _e_p, __VA_ARGS__) 81 | #define _loadu_v64i8(...) _a_v64i8x(loadu, _e_p, __VA_ARGS__) 82 | #define _store_v64i8(...) _a_v64i8xv(store, _e_pv, __VA_ARGS__) 83 | #define _storeu_v64i8(...) _a_v64i8xv(storeu, _e_pv, __VA_ARGS__) 84 | 85 | /* broadcast */ 86 | #define _set_v64i8(...) _a_v64i8(set1, _e_i, __VA_ARGS__) 87 | #define _zero_v64i8() _a_v64i8x(setzero, _e_x, _unused) 88 | 89 | /* swap (reverse) */ 90 | #define _swap_idx_v64i8() ( \ 91 | _mm256_broadcastsi128_si256(_mm_set_epi8( \ 92 | 0, 1, 2, 3, 4, 5, 6, 7, \ 93 | 8, 9, 10, 11, 12, 13, 14, 15)) \ 94 | ) 95 | #define _swap_v64i8(a) ( \ 96 | (v64i8_t) { \ 97 | _mm256_permute2x128_si256( \ 98 | _mm256_shuffle_epi8((a).v2, _swap_idx_v64i8()), \ 99 | _mm256_shuffle_epi8((a).v2, _swap_idx_v64i8()), \ 100 | 0x01), \ 101 | _mm256_permute2x128_si256( \ 102 | _mm256_shuffle_epi8((a).v1, _swap_idx_v64i8()), \ 103 | _mm256_shuffle_epi8((a).v1, _swap_idx_v64i8()), \ 104 | 0x01) \ 105 | } \ 106 | ) 107 | 108 | /* logics */ 109 | #define _not_v64i8(...) _a_v64i8x(not, _e_v, __VA_ARGS__) 110 | #define _and_v64i8(...) _a_v64i8x(and, _e_vv, __VA_ARGS__) 111 | #define _or_v64i8(...) _a_v64i8x(or, _e_vv, __VA_ARGS__) 112 | #define _xor_v64i8(...) _a_v64i8x(xor, _e_vv, __VA_ARGS__) 113 | #define _andn_v64i8(...) _a_v64i8x(andnot, _e_vv, __VA_ARGS__) 114 | 115 | /* arithmetics */ 116 | #define _add_v64i8(...) _a_v64i8(add, _e_vv, __VA_ARGS__) 117 | #define _sub_v64i8(...) _a_v64i8(sub, _e_vv, __VA_ARGS__) 118 | #define _adds_v64i8(...) _a_v64i8(adds, _e_vv, __VA_ARGS__) 119 | #define _subs_v64i8(...) _a_v64i8(subs, _e_vv, __VA_ARGS__) 120 | #define _addus_v64i8(...) _a_v64u8(adds, _e_vv, __VA_ARGS__) 121 | #define _subus_v64i8(...) _a_v64u8(subs, _e_vv, __VA_ARGS__) 122 | #define _max_v64i8(...) _a_v64i8(max, _e_vv, __VA_ARGS__) 123 | #define _min_v64i8(...) _a_v64i8(min, _e_vv, __VA_ARGS__) 124 | 125 | /* shuffle */ 126 | #define _shuf_v64i8(...) _a_v64i8(shuffle, _e_vv, __VA_ARGS__) 127 | 128 | /* blend */ 129 | #define _sel_v64i8(...) _a_v64i8(blendv, _e_vvv, __VA_ARGS__) 130 | 131 | /* compare */ 132 | #define _eq_v64i8(...) _a_v64i8(cmpeq, _e_vv, __VA_ARGS__) 133 | #define _gt_v64i8(...) _a_v64i8(cmpgt, _e_vv, __VA_ARGS__) 134 | 135 | /* insert and extract */ 136 | #define _ins_v64i8(a, val, imm) { \ 137 | if((imm) < sizeof(__m256i)) { \ 138 | (a).v1 = _i_v64i8(insert)((a).v1, (val), (imm)); \ 139 | } else { \ 140 | (a).v2 = _i_v64i8(insert)((a).v2, (val), (imm) - sizeof(__m256i)); \ 141 | } \ 142 | } 143 | #define _ext_v64i8(a, imm) ( \ 144 | (int8_t)(((imm) < sizeof(__m256i)) ? 
( \ 145 | _i_v64i8(extract)((a).v1, (imm)) \ 146 | ) : ( \ 147 | _i_v64i8(extract)((a).v2, (imm) - sizeof(__m256i)) \ 148 | )) \ 149 | ) 150 | 151 | /* byte shift */ 152 | #define _bsl_v64i8(a, imm) ( \ 153 | (v64i8_t) { \ 154 | _mm256_alignr_epi8( \ 155 | (a).v1, \ 156 | _mm256_permute2x128_si256((a).v1, (a).v2, 0x08), \ 157 | 15), \ 158 | _mm256_alignr_epi8( \ 159 | (a).v2, \ 160 | _mm256_permute2x128_si256((a).v1, (a).v2, 0x21), \ 161 | 15) \ 162 | } \ 163 | ) 164 | #define _bsr_v64i8(a, imm) ( \ 165 | (v64i8_t) { \ 166 | _mm256_alignr_epi8( \ 167 | _mm256_permute2x128_si256((a).v1, (a).v2, 0x21), \ 168 | (a).v1, \ 169 | 1), \ 170 | _mm256_alignr_epi8( \ 171 | _mm256_castsi128_si256(_mm256_extracti128_si256((a).v2, 1)), \ 172 | (a).v2, \ 173 | 1) \ 174 | } \ 175 | ) 176 | 177 | /* double shift (palignr) */ 178 | #define _bsld_v64i8(a, b, imm) ( \ 179 | (v64i8_t) { \ 180 | _mm256_alignr_epi8( \ 181 | (a).v1, \ 182 | _mm256_permute2x128_si256((a).v1, (b).v2, 0x03), \ 183 | sizeof(__m128i) - (imm)), \ 184 | _mm256_alignr_epi8( \ 185 | (a).v2, \ 186 | _mm256_permute2x128_si256((a).v1, (a).v2, 0x21), \ 187 | sizeof(__m128i) - (imm)) \ 188 | } \ 189 | ) 190 | #define _bsrd_v64i8(a, b, imm) ( \ 191 | (v64i8_t) { \ 192 | _mm256_alignr_epi8( \ 193 | _mm256_permute2x128_si256((b).v1, (b).v2, 0x21), \ 194 | (b).v1, \ 195 | (imm)), \ 196 | _mm256_alignr_epi8( \ 197 | _mm256_permute2x128_si256((a).v1, (b).v2, 0x03), \ 198 | (a).v2, \ 199 | (imm)) \ 200 | } \ 201 | ) 202 | 203 | /* bit shift */ 204 | #define _shl_v64i8(a, imm) ( \ 205 | (v64i8_t) { \ 206 | _mm256_slli_epi32((a).v1, (imm)), \ 207 | _mm256_slli_epi32((a).v2, (imm)) \ 208 | } \ 209 | ) 210 | #define _shr_v64i8(a, imm) ( \ 211 | (v64i8_t) { \ 212 | _mm256_srli_epi32((a).v1, (imm)), \ 213 | _mm256_srli_epi32((a).v2, (imm)) \ 214 | } \ 215 | ) 216 | #define _sal_v64i8(a, imm) ( \ 217 | (v64i8_t) { \ 218 | _mm256_slai_epi32((a).v1, (imm)), \ 219 | _mm256_slai_epi32((a).v2, (imm)) \ 220 | } \ 221 | ) 222 | #define _sar_v64i8(a, imm) ( \ 223 | (v64i8_t) { \ 224 | _mm256_srai_epi32((a).v1, (imm)), \ 225 | _mm256_srai_epi32((a).v2, (imm)), \ 226 | } \ 227 | ) 228 | 229 | /* mask */ 230 | #define _mask_v64i8(a) ( \ 231 | (v64_mask_t) { \ 232 | .m1 = _i_v64i8(movemask)((a).v1), \ 233 | .m2 = _i_v64i8(movemask)((a).v2) \ 234 | } \ 235 | ) 236 | 237 | /* horizontal max (reduction max) */ 238 | #define _hmax_v64i8(a) ({ \ 239 | __m256i _s = _mm256_max_epi8((a).v1, (a).v2); \ 240 | __m128i _t = _mm_max_epi8( \ 241 | _mm256_castsi256_si128(_s), \ 242 | _mm256_extracti128_si256(_s, 1) \ 243 | ); \ 244 | _t = _mm_max_epi8(_t, _mm_srli_si128(_t, 8)); \ 245 | _t = _mm_max_epi8(_t, _mm_srli_si128(_t, 4)); \ 246 | _t = _mm_max_epi8(_t, _mm_srli_si128(_t, 2)); \ 247 | _t = _mm_max_epi8(_t, _mm_srli_si128(_t, 1)); \ 248 | (int8_t)_mm_extract_epi8(_t, 0); \ 249 | }) 250 | 251 | /* convert */ 252 | #define _cvt_v64i16_v64i8(a) ( \ 253 | (v64i8_t) { \ 254 | _mm256_packs_epi16( \ 255 | _mm256_permute2x128_si256((a).v1, (a).v2, 0x20), \ 256 | _mm256_permute2x128_si256((a).v1, (a).v2, 0x31)), \ 257 | _mm256_packs_epi16( \ 258 | _mm256_permute2x128_si256((a).v3, (a).v4, 0x20), \ 259 | _mm256_permute2x128_si256((a).v3, (a).v4, 0x31)) \ 260 | } \ 261 | ) 262 | 263 | /* debug print */ 264 | // #ifdef _LOG_H_INCLUDED 265 | #define _print_v64i8(a) { \ 266 | debug("(v64i8_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 267 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 268 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, 
%d, %d, %d, %d, " \ 269 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 270 | #a, \ 271 | _ext_v64i8(a, 32 + 31), \ 272 | _ext_v64i8(a, 32 + 30), \ 273 | _ext_v64i8(a, 32 + 29), \ 274 | _ext_v64i8(a, 32 + 28), \ 275 | _ext_v64i8(a, 32 + 27), \ 276 | _ext_v64i8(a, 32 + 26), \ 277 | _ext_v64i8(a, 32 + 25), \ 278 | _ext_v64i8(a, 32 + 24), \ 279 | _ext_v64i8(a, 32 + 23), \ 280 | _ext_v64i8(a, 32 + 22), \ 281 | _ext_v64i8(a, 32 + 21), \ 282 | _ext_v64i8(a, 32 + 20), \ 283 | _ext_v64i8(a, 32 + 19), \ 284 | _ext_v64i8(a, 32 + 18), \ 285 | _ext_v64i8(a, 32 + 17), \ 286 | _ext_v64i8(a, 32 + 16), \ 287 | _ext_v64i8(a, 32 + 15), \ 288 | _ext_v64i8(a, 32 + 14), \ 289 | _ext_v64i8(a, 32 + 13), \ 290 | _ext_v64i8(a, 32 + 12), \ 291 | _ext_v64i8(a, 32 + 11), \ 292 | _ext_v64i8(a, 32 + 10), \ 293 | _ext_v64i8(a, 32 + 9), \ 294 | _ext_v64i8(a, 32 + 8), \ 295 | _ext_v64i8(a, 32 + 7), \ 296 | _ext_v64i8(a, 32 + 6), \ 297 | _ext_v64i8(a, 32 + 5), \ 298 | _ext_v64i8(a, 32 + 4), \ 299 | _ext_v64i8(a, 32 + 3), \ 300 | _ext_v64i8(a, 32 + 2), \ 301 | _ext_v64i8(a, 32 + 1), \ 302 | _ext_v64i8(a, 32 + 0), \ 303 | _ext_v64i8(a, 31), \ 304 | _ext_v64i8(a, 30), \ 305 | _ext_v64i8(a, 29), \ 306 | _ext_v64i8(a, 28), \ 307 | _ext_v64i8(a, 27), \ 308 | _ext_v64i8(a, 26), \ 309 | _ext_v64i8(a, 25), \ 310 | _ext_v64i8(a, 24), \ 311 | _ext_v64i8(a, 23), \ 312 | _ext_v64i8(a, 22), \ 313 | _ext_v64i8(a, 21), \ 314 | _ext_v64i8(a, 20), \ 315 | _ext_v64i8(a, 19), \ 316 | _ext_v64i8(a, 18), \ 317 | _ext_v64i8(a, 17), \ 318 | _ext_v64i8(a, 16), \ 319 | _ext_v64i8(a, 15), \ 320 | _ext_v64i8(a, 14), \ 321 | _ext_v64i8(a, 13), \ 322 | _ext_v64i8(a, 12), \ 323 | _ext_v64i8(a, 11), \ 324 | _ext_v64i8(a, 10), \ 325 | _ext_v64i8(a, 9), \ 326 | _ext_v64i8(a, 8), \ 327 | _ext_v64i8(a, 7), \ 328 | _ext_v64i8(a, 6), \ 329 | _ext_v64i8(a, 5), \ 330 | _ext_v64i8(a, 4), \ 331 | _ext_v64i8(a, 3), \ 332 | _ext_v64i8(a, 2), \ 333 | _ext_v64i8(a, 1), \ 334 | _ext_v64i8(a, 0)); \ 335 | } 336 | // #else 337 | // #define _print_v64i8(x) ; 338 | // #endif 339 | 340 | #endif /* _V64I8_H_INCLUDED */ 341 | /** 342 | * end of v64i8.h 343 | */ 344 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/arch_util.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file arch_util.h 4 | * 5 | * @brief architecture-dependent utilities devided from util.h 6 | */ 7 | #ifndef _ARCH_UTIL_H_INCLUDED 8 | #define _ARCH_UTIL_H_INCLUDED 9 | #define MM_ARCH "SSE4.1" 10 | 11 | #include 12 | #include 13 | 14 | /** 15 | * misc bit operations (popcnt, tzcnt, and lzcnt) 16 | */ 17 | 18 | /** 19 | * @macro popcnt 20 | */ 21 | #ifdef __POPCNT__ 22 | # define popcnt(x) ( (uint64_t)_mm_popcnt_u64(x) ) 23 | #else 24 | // #warning "popcnt instruction is not enabled." 
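/*
 * note: the fallback popcnt below is a SWAR (SIMD-within-a-register)
 * reduction: each step folds adjacent bit-fields into wider partial sums
 * (1-bit pairs -> 2-bit sums -> 4-bit sums -> ...), so after six folds the
 * 64-bit word holds the total number of set bits (0..64).
 * e.g. popcnt(0xf0f0) == 8, popcnt(0) == 0.
 */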
25 | static inline 26 | uint64_t popcnt(uint64_t n) 27 | { 28 | uint64_t c = 0; 29 | c = (n & 0x5555555555555555) + ((n>>1) & 0x5555555555555555); 30 | c = (c & 0x3333333333333333) + ((c>>2) & 0x3333333333333333); 31 | c = (c & 0x0f0f0f0f0f0f0f0f) + ((c>>4) & 0x0f0f0f0f0f0f0f0f); 32 | c = (c & 0x00ff00ff00ff00ff) + ((c>>8) & 0x00ff00ff00ff00ff); 33 | c = (c & 0x0000ffff0000ffff) + ((c>>16) & 0x0000ffff0000ffff); 34 | c = (c & 0x00000000ffffffff) + ((c>>32) & 0x00000000ffffffff); 35 | return(c); 36 | } 37 | #endif 38 | 39 | /** 40 | * @macro ZCNT_RESULT 41 | * @brief workaround for a bug in gcc (<= 5), all the results of tzcnt / lzcnt macros must be modified by this label 42 | */ 43 | #ifndef ZCNT_RESULT 44 | # if defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 600 45 | # define ZCNT_RESULT volatile 46 | # else 47 | # define ZCNT_RESULT 48 | # endif 49 | #endif 50 | 51 | /** 52 | * @macro tzcnt 53 | * @brief trailing zero count (count #continuous zeros from LSb) 54 | */ 55 | #ifdef __BMI__ 56 | /** immintrin.h is already included */ 57 | # if defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 490 58 | # define tzcnt(x) ( (uint64_t)__tzcnt_u64(x) ) 59 | # else 60 | # define tzcnt(x) ( (uint64_t)_tzcnt_u64(x) ) 61 | # endif 62 | #else 63 | // #warning "tzcnt instruction is not enabled." 64 | static inline 65 | uint64_t tzcnt(uint64_t n) 66 | { 67 | #ifdef __POPCNT__ 68 | return(popcnt(~n & (n - 1))); 69 | #else 70 | if(n == 0) { 71 | return(64); 72 | } else { 73 | int64_t res; 74 | __asm__( "bsfq %1, %0" : "=r"(res) : "r"(n) ); 75 | return(res); 76 | } 77 | #endif 78 | } 79 | #endif 80 | 81 | /** 82 | * @macro lzcnt 83 | * @brief leading zero count (count #continuous zeros from MSb) 84 | */ 85 | #ifdef __LZCNT__ 86 | /* __lzcnt_u64 in bmiintrin.h gcc-4.6, _lzcnt_u64 in lzcntintrin.h from gcc-4.7 */ 87 | # if defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 470 88 | # define lzcnt(x) ( (uint64_t)__lzcnt_u64(x) ) 89 | # else 90 | # define lzcnt(x) ( (uint64_t)_lzcnt_u64(x) ) 91 | # endif 92 | #else 93 | // #warning "lzcnt instruction is not enabled." 94 | static inline 95 | uint64_t lzcnt(uint64_t n) 96 | { 97 | if(n == 0) { 98 | return(64); 99 | } else { 100 | int64_t res; 101 | __asm__( "bsrq %1, %0" : "=r"(res) : "r"(n) ); 102 | return(63 - res); 103 | } 104 | } 105 | #endif 106 | 107 | /** 108 | * @macro _swap_u64 109 | */ 110 | #if defined(__clang__) || (defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 470) 111 | # define _swap_u64(x) ({ uint64_t _x = (x); __asm__( "bswapq %0" : "+r"(_x) ); _x; }) 112 | #else 113 | # define _swap_u64(x) ( (uint64_t)_bswap64(x) ) 114 | #endif 115 | 116 | /** 117 | * @macro _loadu_u64, _storeu_u64 118 | */ 119 | #define _loadu_u64(p) ({ uint8_t const *_p = (uint8_t const *)(p); *((uint64_t const *)_p); }) 120 | #define _storeu_u64(p, e) { uint8_t *_p = (uint8_t *)(p); *((uint64_t *)(_p)) = (e); } 121 | #define _loadu_u32(p) ({ uint8_t const *_p = (uint8_t const *)(p); *((uint32_t const *)_p); }) 122 | #define _storeu_u32(p, e) { uint8_t *_p = (uint8_t *)(p); *((uint32_t *)(_p)) = (e); } 123 | 124 | /** 125 | * @macro _aligned_block_memcpy 126 | * 127 | * @brief copy size bytes from src to dst. 128 | * 129 | * @detail 130 | * src and dst must be aligned to 16-byte boundary. 131 | * copy must be multipe of 16. 
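 *
 * usage sketch (dst and src are caller buffers; sizes must be multiples of 16
 * as noted above):
 *   _memcpy_blk_aa(dst, src, 256);   (both pointers 16-byte aligned)
 *   _memcpy_blk_uu(dst, src, 256);   (unaligned source and destination)
 *   _memset_blk_a(dst, 0, 256);      (fill 256 bytes with the byte 0)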
132 | */ 133 | #define _xmm_rd_a(src, n) (xmm##n) = _mm_load_si128((__m128i *)(src) + (n)) 134 | #define _xmm_rd_u(src, n) (xmm##n) = _mm_loadu_si128((__m128i *)(src) + (n)) 135 | #define _xmm_wr_a(dst, n) _mm_store_si128((__m128i *)(dst) + (n), (xmm##n)) 136 | #define _xmm_wr_u(dst, n) _mm_storeu_si128((__m128i *)(dst) + (n), (xmm##n)) 137 | #define _memcpy_blk_intl(dst, src, size, _wr, _rd) { \ 138 | /** duff's device */ \ 139 | uint8_t *_src = (uint8_t *)(src), *_dst = (uint8_t *)(dst); \ 140 | uint64_t const _nreg = 16; /** #xmm registers == 16 */ \ 141 | uint64_t const _tcnt = (size) / sizeof(__m128i); \ 142 | uint64_t const _offset = ((_tcnt - 1) & (_nreg - 1)) - (_nreg - 1); \ 143 | uint64_t _jmp = _tcnt & (_nreg - 1); \ 144 | uint64_t _lcnt = (_tcnt + _nreg - 1) / _nreg; \ 145 | register __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ 146 | register __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; \ 147 | _src += _offset * sizeof(__m128i); \ 148 | _dst += _offset * sizeof(__m128i); \ 149 | switch(_jmp) { \ 150 | case 0: do { _rd(_src, 0); \ 151 | case 15: _rd(_src, 1); \ 152 | case 14: _rd(_src, 2); \ 153 | case 13: _rd(_src, 3); \ 154 | case 12: _rd(_src, 4); \ 155 | case 11: _rd(_src, 5); \ 156 | case 10: _rd(_src, 6); \ 157 | case 9: _rd(_src, 7); \ 158 | case 8: _rd(_src, 8); \ 159 | case 7: _rd(_src, 9); \ 160 | case 6: _rd(_src, 10); \ 161 | case 5: _rd(_src, 11); \ 162 | case 4: _rd(_src, 12); \ 163 | case 3: _rd(_src, 13); \ 164 | case 2: _rd(_src, 14); \ 165 | case 1: _rd(_src, 15); \ 166 | switch(_jmp) { \ 167 | case 0: _wr(_dst, 0); \ 168 | case 15: _wr(_dst, 1); \ 169 | case 14: _wr(_dst, 2); \ 170 | case 13: _wr(_dst, 3); \ 171 | case 12: _wr(_dst, 4); \ 172 | case 11: _wr(_dst, 5); \ 173 | case 10: _wr(_dst, 6); \ 174 | case 9: _wr(_dst, 7); \ 175 | case 8: _wr(_dst, 8); \ 176 | case 7: _wr(_dst, 9); \ 177 | case 6: _wr(_dst, 10); \ 178 | case 5: _wr(_dst, 11); \ 179 | case 4: _wr(_dst, 12); \ 180 | case 3: _wr(_dst, 13); \ 181 | case 2: _wr(_dst, 14); \ 182 | case 1: _wr(_dst, 15); \ 183 | } \ 184 | _src += _nreg * sizeof(__m128i); \ 185 | _dst += _nreg * sizeof(__m128i); \ 186 | _jmp = 0; \ 187 | } while(--_lcnt > 0); \ 188 | } \ 189 | } 190 | #define _memcpy_blk_aa(dst, src, len) _memcpy_blk_intl(dst, src, len, _xmm_wr_a, _xmm_rd_a) 191 | #define _memcpy_blk_au(dst, src, len) _memcpy_blk_intl(dst, src, len, _xmm_wr_a, _xmm_rd_u) 192 | #define _memcpy_blk_ua(dst, src, len) _memcpy_blk_intl(dst, src, len, _xmm_wr_u, _xmm_rd_a) 193 | #define _memcpy_blk_uu(dst, src, len) _memcpy_blk_intl(dst, src, len, _xmm_wr_u, _xmm_rd_u) 194 | #define _memset_blk_intl(dst, a, size, _wr) { \ 195 | uint8_t *_dst = (uint8_t *)(dst); \ 196 | __m128i const xmm0 = _mm_set1_epi8((int8_t)a); \ 197 | uint64_t i; \ 198 | for(i = 0; i < size / sizeof(__m128i); i++) { \ 199 | _wr(_dst, 0); _dst += sizeof(__m128i); \ 200 | } \ 201 | } 202 | #define _memset_blk_a(dst, a, size) _memset_blk_intl(dst, a, size, _xmm_wr_a) 203 | #define _memset_blk_u(dst, a, size) _memset_blk_intl(dst, a, size, _xmm_wr_u) 204 | 205 | /** 206 | * substitution matrix abstraction 207 | */ 208 | /* store */ 209 | #define _store_sb(_scv, sv16) { _store_v16i8((_scv).v1, (sv16)); } 210 | 211 | /* load */ 212 | #define _load_sb(scv) ( _from_v16i8_n(_load_v16i8((scv).v1)) ) 213 | 214 | 215 | /** 216 | * gap penalty vector abstraction macros 217 | */ 218 | /* store */ 219 | #define _make_gap(_e1, _e2, _e3, _e4) ( \ 220 | (v16i8_t){ _mm_set_epi8( \ 221 | (_e4), (_e4), (_e4), (_e4), \ 222 | (_e3), (_e3), 
(_e3), (_e3), \ 223 | (_e2), (_e2), (_e2), (_e2), \ 224 | (_e1), (_e1), (_e1), (_e1)) \ 225 | } \ 226 | ) 227 | #define _store_adjh(_scv, _adjh, _adjv, _ofsh, _ofsv) { \ 228 | _store_v16i8((_scv).v2, _make_gap(_adjh, _adjv, _ofsh, _ofsv)) \ 229 | } 230 | #define _store_adjv(_scv, _adjh, _adjv, _ofsh, _ofsv) { \ 231 | /* nothing to do; use v2 */ \ 232 | /* _store_v16i8((_scv).v3, _make_gap(_adjh, _adjv, _ofsh, _ofsv)) */ \ 233 | } 234 | #define _store_ofsh(_scv, _adjh, _adjv, _ofsh, _ofsv) { \ 235 | /* nothing to do; use v2 */ \ 236 | /* _store_v16i8((_scv).v4, _make_gap(_adjh, _adjv, _ofsh, _ofsv)) */ \ 237 | } 238 | #define _store_ofsv(_scv, _adjh, _adjv, _ofsh, _ofsv) { \ 239 | /* nothing to do; use v2 */ \ 240 | /* _store_v16i8((_scv).v5, _make_gap(_adjh, _adjv, _ofsh, _ofsv)) */ \ 241 | } 242 | 243 | /* load */ 244 | #define _load_gap(_ptr, _idx) ( \ 245 | (v16i8_t){ _mm_shuffle_epi32(_mm_load_si128((__m128i const *)(_ptr)), (_idx)) } \ 246 | ) 247 | 248 | #define _load_adjh(_scv) ( _from_v16i8_n(_load_gap((_scv).v2, 0x00)) ) 249 | #define _load_adjv(_scv) ( _from_v16i8_n(_load_gap((_scv).v2, 0x00)) ) 250 | #define _load_ofsh(_scv) ( _from_v16i8_n(_load_gap((_scv).v2, 0x55)) ) 251 | #define _load_ofsv(_scv) ( _from_v16i8_n(_load_gap((_scv).v2, 0x55)) ) 252 | #define _load_gfh(_scv) ( _from_v16i8_n(_load_gap((_scv).v2, 0xaa)) ) 253 | #define _load_gfv(_scv) ( _from_v16i8_n(_load_gap((_scv).v2, 0xff)) ) 254 | /* 255 | #define _load_adjv(_scv) ( _from_v16i8_n(_load_gap((_scv).v2, 0x55)) ) 256 | #define _load_ofsv(_scv) ( _from_v16i8_n(_load_gap((_scv).v2, 0xff)) ) 257 | */ 258 | 259 | 260 | /* cache line operation */ 261 | #define WCR_BUF_SIZE ( 128 ) /** two cache lines in x86_64 */ 262 | #define memcpy_buf(_dst, _src) { \ 263 | register __m128i *_s = (__m128i *)(_src); \ 264 | register __m128i *_d = (__m128i *)(_dst); \ 265 | __m128i xmm0 = _mm_load_si128(_s); \ 266 | __m128i xmm1 = _mm_load_si128(_s + 1); \ 267 | __m128i xmm2 = _mm_load_si128(_s + 2); \ 268 | __m128i xmm3 = _mm_load_si128(_s + 3); \ 269 | __m128i xmm4 = _mm_load_si128(_s + 4); \ 270 | __m128i xmm5 = _mm_load_si128(_s + 5); \ 271 | __m128i xmm6 = _mm_load_si128(_s + 6); \ 272 | __m128i xmm7 = _mm_load_si128(_s + 7); \ 273 | _mm_stream_si128(_d, xmm0); \ 274 | _mm_stream_si128(_d + 1, xmm1); \ 275 | _mm_stream_si128(_d + 2, xmm2); \ 276 | _mm_stream_si128(_d + 3, xmm3); \ 277 | _mm_stream_si128(_d + 4, xmm4); \ 278 | _mm_stream_si128(_d + 5, xmm5); \ 279 | _mm_stream_si128(_d + 6, xmm6); \ 280 | _mm_stream_si128(_d + 7, xmm7); \ 281 | } 282 | 283 | /* 128bit register operation */ 284 | #define elem_128_t __m128i 285 | #define rd_128(_ptr) ( _mm_load_si128((__m128i *)(_ptr)) ) 286 | #define wr_128(_ptr, _e) { _mm_store_si128((__m128i *)(_ptr), (_e)); } 287 | #define _ex_128(k, h) ( _mm_extract_epi64((elem_128_t)k, h) ) 288 | #define ex_128(k, p) ( ((((p)>>3) ? 
_ex_128(k, 1) : (_ex_128(k, 0))>>(((p) & 0x07)<<3)) & (WCR_OCC_SIZE-1)) ) 289 | #define p_128(v) ( _mm_cvtsi64_si128((uint64_t)(v)) ) 290 | #define e_128(v) ( (uint64_t)_mm_cvtsi128_si64((__m128i)(v)) ) 291 | 292 | 293 | 294 | /* compare and swap (cas) */ 295 | #if defined(__GNUC__) 296 | # if (defined(_ARCH_GCC_VERSION) && _ARCH_GCC_VERSION < 470) || (defined(__INTEL_COMPILER) && _ARCH_GCC_COMPAT < 470) 297 | # define cas(ptr, cmp, val) ({ \ 298 | uint8_t _res; \ 299 | __asm__ volatile ("lock cmpxchg %[src], %[dst]\n\tsete %[res]" \ 300 | : [dst]"+m"(*ptr), [res]"=a"(_res) \ 301 | : [src]"r"(val), "a"(*cmp) \ 302 | : "memory", "cc"); \ 303 | _res; \ 304 | }) 305 | # define fence() ({ \ 306 | __asm__ volatile ("mfence"); \ 307 | }) 308 | # else /* > 4.7 */ 309 | # define cas(ptr, cmp, val) __atomic_compare_exchange_n(ptr, cmp, val, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED) 310 | # define fence() __sync_synchronize() 311 | # endif 312 | #else 313 | # error "atomic compare-and-exchange is not supported in this version of compiler." 314 | #endif 315 | 316 | #endif /* #ifndef _ARCH_UTIL_H_INCLUDED */ 317 | /** 318 | * end of arch_util.h 319 | */ 320 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/v64i8.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v64i8.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V64I8_H_INCLUDED 8 | #define _V64I8_H_INCLUDED 9 | 10 | /* include header for intel / amd sse2 instruction sets */ 11 | #include 12 | 13 | /* 8bit 64cell */ 14 | typedef struct v64i8_s { 15 | __m128i v1; 16 | __m128i v2; 17 | __m128i v3; 18 | __m128i v4; 19 | } v64i8_t; 20 | 21 | /* expanders (without argument) */ 22 | #define _e_x_v64i8_1(u) 23 | #define _e_x_v64i8_2(u) 24 | #define _e_x_v64i8_3(u) 25 | #define _e_x_v64i8_4(u) 26 | 27 | /* expanders (without immediate) */ 28 | #define _e_v_v64i8_1(a) (a).v1 29 | #define _e_v_v64i8_2(a) (a).v2 30 | #define _e_v_v64i8_3(a) (a).v3 31 | #define _e_v_v64i8_4(a) (a).v4 32 | #define _e_vv_v64i8_1(a, b) (a).v1, (b).v1 33 | #define _e_vv_v64i8_2(a, b) (a).v2, (b).v2 34 | #define _e_vv_v64i8_3(a, b) (a).v3, (b).v3 35 | #define _e_vv_v64i8_4(a, b) (a).v4, (b).v4 36 | #define _e_vvv_v64i8_1(a, b, c) (a).v1, (b).v1, (c).v1 37 | #define _e_vvv_v64i8_2(a, b, c) (a).v2, (b).v2, (c).v2 38 | #define _e_vvv_v64i8_3(a, b, c) (a).v3, (b).v3, (c).v3 39 | #define _e_vvv_v64i8_4(a, b, c) (a).v4, (b).v4, (c).v4 40 | 41 | /* expanders with immediate */ 42 | #define _e_i_v64i8_1(imm) (imm) 43 | #define _e_i_v64i8_2(imm) (imm) 44 | #define _e_i_v64i8_3(imm) (imm) 45 | #define _e_i_v64i8_4(imm) (imm) 46 | #define _e_vi_v64i8_1(a, imm) (a).v1, (imm) 47 | #define _e_vi_v64i8_2(a, imm) (a).v2, (imm) 48 | #define _e_vi_v64i8_3(a, imm) (a).v3, (imm) 49 | #define _e_vi_v64i8_4(a, imm) (a).v4, (imm) 50 | #define _e_vvi_v64i8_1(a, b, imm) (a).v1, (b).v1, (imm) 51 | #define _e_vvi_v64i8_2(a, b, imm) (a).v2, (b).v2, (imm) 52 | #define _e_vvi_v64i8_3(a, b, imm) (a).v3, (b).v3, (imm) 53 | #define _e_vvi_v64i8_4(a, b, imm) (a).v4, (b).v4, (imm) 54 | 55 | /* address calculation macros */ 56 | #define _addr_v64i8_1(imm) ( (__m128i *)(imm) ) 57 | #define _addr_v64i8_2(imm) ( (__m128i *)(imm) + 1 ) 58 | #define _addr_v64i8_3(imm) ( (__m128i *)(imm) + 2 ) 59 | #define _addr_v64i8_4(imm) ( (__m128i *)(imm) + 3 ) 60 | #define _pv_v64i8(ptr) ( _addr_v64i8_1(ptr) ) 61 | 62 | /* expanders with pointers */ 63 | #define _e_p_v64i8_1(ptr) 
_addr_v64i8_1(ptr) 64 | #define _e_p_v64i8_2(ptr) _addr_v64i8_2(ptr) 65 | #define _e_p_v64i8_3(ptr) _addr_v64i8_3(ptr) 66 | #define _e_p_v64i8_4(ptr) _addr_v64i8_4(ptr) 67 | #define _e_pv_v64i8_1(ptr, a) _addr_v64i8_1(ptr), (a).v1 68 | #define _e_pv_v64i8_2(ptr, a) _addr_v64i8_2(ptr), (a).v2 69 | #define _e_pv_v64i8_3(ptr, a) _addr_v64i8_3(ptr), (a).v3 70 | #define _e_pv_v64i8_4(ptr, a) _addr_v64i8_4(ptr), (a).v4 71 | 72 | /* expand intrinsic name */ 73 | #define _i_v64i8(intrin) _mm_##intrin##_epi8 74 | #define _i_v64u8(intrin) _mm_##intrin##_epu8 75 | #define _i_v64i8x(intrin) _mm_##intrin##_si128 76 | 77 | /* apply */ 78 | #define _a_v64i8(intrin, expander, ...) ( \ 79 | (v64i8_t) { \ 80 | _i_v64i8(intrin)(expander##_v64i8_1(__VA_ARGS__)), \ 81 | _i_v64i8(intrin)(expander##_v64i8_2(__VA_ARGS__)), \ 82 | _i_v64i8(intrin)(expander##_v64i8_3(__VA_ARGS__)), \ 83 | _i_v64i8(intrin)(expander##_v64i8_4(__VA_ARGS__)) \ 84 | } \ 85 | ) 86 | #define _a_v64u8(intrin, expander, ...) ( \ 87 | (v64i8_t) { \ 88 | _i_v64u8(intrin)(expander##_v64i8_1(__VA_ARGS__)), \ 89 | _i_v64u8(intrin)(expander##_v64i8_2(__VA_ARGS__)), \ 90 | _i_v64u8(intrin)(expander##_v64i8_3(__VA_ARGS__)), \ 91 | _i_v64u8(intrin)(expander##_v64i8_4(__VA_ARGS__)) \ 92 | } \ 93 | ) 94 | #define _a_v64i8x(intrin, expander, ...) ( \ 95 | (v64i8_t) { \ 96 | _i_v64i8x(intrin)(expander##_v64i8_1(__VA_ARGS__)), \ 97 | _i_v64i8x(intrin)(expander##_v64i8_2(__VA_ARGS__)), \ 98 | _i_v64i8x(intrin)(expander##_v64i8_3(__VA_ARGS__)), \ 99 | _i_v64i8x(intrin)(expander##_v64i8_4(__VA_ARGS__)) \ 100 | } \ 101 | ) 102 | #define _a_v64i8xv(intrin, expander, ...) { \ 103 | _i_v64i8x(intrin)(expander##_v64i8_1(__VA_ARGS__)); \ 104 | _i_v64i8x(intrin)(expander##_v64i8_2(__VA_ARGS__)); \ 105 | _i_v64i8x(intrin)(expander##_v64i8_3(__VA_ARGS__)); \ 106 | _i_v64i8x(intrin)(expander##_v64i8_4(__VA_ARGS__)); \ 107 | } 108 | 109 | /* load and store */ 110 | #define _load_v64i8(...) _a_v64i8x(load, _e_p, __VA_ARGS__) 111 | #define _loadu_v64i8(...) _a_v64i8x(loadu, _e_p, __VA_ARGS__) 112 | #define _store_v64i8(...) _a_v64i8xv(store, _e_pv, __VA_ARGS__) 113 | #define _storeu_v64i8(...) _a_v64i8xv(storeu, _e_pv, __VA_ARGS__) 114 | 115 | /* broadcast */ 116 | #define _set_v64i8(...) _a_v64i8(set1, _e_i, __VA_ARGS__) 117 | #define _zero_v64i8() _a_v64i8x(setzero, _e_x, _unused) 118 | 119 | /* swap (reverse) */ 120 | #define _swap_idx_v64i8() ( \ 121 | _mm_set_epi8( \ 122 | 0, 1, 2, 3, 4, 5, 6, 7, \ 123 | 8, 9, 10, 11, 12, 13, 14, 15) \ 124 | ) 125 | #define _swap_v64i8(a) ( \ 126 | (v64i8_t) { \ 127 | _mm_shuffle_epi8((a).v4, _swap_idx_v64i8()), \ 128 | _mm_shuffle_epi8((a).v3, _swap_idx_v64i8()), \ 129 | _mm_shuffle_epi8((a).v2, _swap_idx_v64i8()), \ 130 | _mm_shuffle_epi8((a).v1, _swap_idx_v64i8()) \ 131 | } \ 132 | ) 133 | 134 | /* logics */ 135 | #define _not_v64i8(...) _a_v64i8x(not, _e_v, __VA_ARGS__) 136 | #define _and_v64i8(...) _a_v64i8x(and, _e_vv, __VA_ARGS__) 137 | #define _or_v64i8(...) _a_v64i8x(or, _e_vv, __VA_ARGS__) 138 | #define _xor_v64i8(...) _a_v64i8x(xor, _e_vv, __VA_ARGS__) 139 | #define _andn_v64i8(...) _a_v64i8x(andnot, _e_vv, __VA_ARGS__) 140 | 141 | /* arithmetics */ 142 | #define _add_v64i8(...) _a_v64i8(add, _e_vv, __VA_ARGS__) 143 | #define _sub_v64i8(...) _a_v64i8(sub, _e_vv, __VA_ARGS__) 144 | #define _adds_v64i8(...) _a_v64i8(adds, _e_vv, __VA_ARGS__) 145 | #define _subs_v64i8(...) _a_v64i8(subs, _e_vv, __VA_ARGS__) 146 | #define _addus_v64i8(...) _a_v64u8(adds, _e_vv, __VA_ARGS__) 147 | #define _subus_v64i8(...) 
_a_v64u8(subs, _e_vv, __VA_ARGS__) 148 | #define _max_v64i8(...) _a_v64i8(max, _e_vv, __VA_ARGS__) 149 | #define _min_v64i8(...) _a_v64i8(min, _e_vv, __VA_ARGS__) 150 | 151 | /* shuffle */ 152 | #define _shuf_v64i8(...) _a_v64i8(shuffle, _e_vv, __VA_ARGS__) 153 | 154 | /* blend */ 155 | #define _sel_v64i8(...) _a_v64i8(blendv, _e_vvv, __VA_ARGS__) 156 | 157 | /* compare */ 158 | #define _eq_v64i8(...) _a_v64i8(cmpeq, _e_vv, __VA_ARGS__) 159 | #define _gt_v64i8(...) _a_v64i8(cmpgt, _e_vv, __VA_ARGS__) 160 | 161 | /* insert and extract */ 162 | #define _ins_v64i8(a, val, imm) { \ 163 | if((imm) < sizeof(__m128i)) { \ 164 | (a).v1 = _i_v64i8(insert)((a).v1, (val), (imm)); \ 165 | } else if((imm) < 2 * sizeof(__m128i)) { \ 166 | (a).v2 = _i_v64i8(insert)((a).v2, (val), (imm) - sizeof(__m128i)); \ 167 | } else if((imm) < 3 * sizeof(__m128i)) { \ 168 | (a).v3 = _i_v64i8(insert)((a).v3, (val), (imm) - 2 * sizeof(__m128i)); \ 169 | } else { \ 170 | (a).v4 = _i_v64i8(insert)((a).v4, (val), (imm) - 3 * sizeof(__m128i)); \ 171 | } \ 172 | } 173 | #define _ext_v64i8(a, imm) ( \ 174 | (int8_t)(((imm) < sizeof(__m128i)) ? ( \ 175 | _i_v64i8(extract)((a).v1, (imm)) \ 176 | ) : ((imm) < 2 * sizeof(__m128i)) ? ( \ 177 | _i_v64i8(extract)((a).v2, (imm) - sizeof(__m128i)) \ 178 | ) : ((imm) < 3 * sizeof(__m128i)) ? ( \ 179 | _i_v64i8(extract)((a).v3, (imm) - 2 * sizeof(__m128i)) \ 180 | ) : ( \ 181 | _i_v64i8(extract)((a).v4, (imm) - 3 * sizeof(__m128i)) \ 182 | )) \ 183 | ) 184 | 185 | /* shift (imm must be smaller than 16) */ 186 | #define _bsl_v64i8(a, imm) ( \ 187 | (v64i8_t) { \ 188 | _i_v64i8x(slli)((a).v1, (imm)), \ 189 | _i_v64i8(alignr)((a).v2, (a).v1, sizeof(__m128i) - (imm)), \ 190 | _i_v64i8(alignr)((a).v3, (a).v2, sizeof(__m128i) - (imm)), \ 191 | _i_v64i8(alignr)((a).v4, (a).v3, sizeof(__m128i) - (imm)) \ 192 | } \ 193 | ) 194 | #define _bsr_v64i8(a, imm) ( \ 195 | (v64i8_t) { \ 196 | _i_v64i8(alignr)((a).v2, (a).v1, (imm)), \ 197 | _i_v64i8(alignr)((a).v3, (a).v2, (imm)), \ 198 | _i_v64i8(alignr)((a).v4, (a).v3, (imm)), \ 199 | _i_v64i8x(srli)((a).v4, (imm)) \ 200 | } \ 201 | ) 202 | 203 | /* double shift */ 204 | #define _bsld_v64i8(a, b, imm) ( \ 205 | (v64i8_t) { \ 206 | _i_v64i8(alignr)((a).v1, (b).v4, sizeof(__m128i) - (imm)), \ 207 | _i_v64i8(alignr)((a).v2, (a).v1, sizeof(__m128i) - (imm)), \ 208 | _i_v64i8(alignr)((a).v3, (a).v2, sizeof(__m128i) - (imm)), \ 209 | _i_v64i8(alignr)((a).v4, (a).v3, sizeof(__m128i) - (imm)) \ 210 | } \ 211 | ) 212 | #define _bsrd_v64i8(a, b, imm) ( \ 213 | (v64i8_t) { \ 214 | _i_v64i8(alignr)((b).v2, (b).v1, (imm)), \ 215 | _i_v64i8(alignr)((b).v3, (b).v2, (imm)), \ 216 | _i_v64i8(alignr)((b).v4, (b).v3, (imm)), \ 217 | _i_v64i8(alignr)((a).v1, (b).v4, (imm)) \ 218 | } \ 219 | ) 220 | 221 | /* bit shift */ 222 | #define _shl_v64i8(a, imm) ( \ 223 | (v64i8_t) { \ 224 | _mm_slli_epi32((a).v1, (imm)), \ 225 | _mm_slli_epi32((a).v2, (imm)), \ 226 | _mm_slli_epi32((a).v3, (imm)), \ 227 | _mm_slli_epi32((a).v4, (imm)) \ 228 | } \ 229 | ) 230 | #define _shr_v64i8(a, imm) ( \ 231 | (v64i8_t) { \ 232 | _mm_srli_epi32((a).v1, (imm)), \ 233 | _mm_srli_epi32((a).v2, (imm)), \ 234 | _mm_srli_epi32((a).v3, (imm)), \ 235 | _mm_srli_epi32((a).v4, (imm)) \ 236 | } \ 237 | ) 238 | #define _sal_v64i8(a, imm) ( \ 239 | (v64i8_t) { \ 240 | _mm_slai_epi32((a).v1, (imm)), \ 241 | _mm_slai_epi32((a).v2, (imm)), \ 242 | _mm_slai_epi32((a).v3, (imm)), \ 243 | _mm_slai_epi32((a).v4, (imm)) \ 244 | } \ 245 | ) 246 | #define _sar_v64i8(a, imm) ( \ 247 | (v64i8_t) { \ 248 | 
_mm_srai_epi32((a).v1, (imm)), \ 249 | _mm_srai_epi32((a).v2, (imm)), \ 250 | _mm_srai_epi32((a).v3, (imm)), \ 251 | _mm_srai_epi32((a).v4, (imm)) \ 252 | } \ 253 | ) 254 | 255 | /* mask */ 256 | #define _mask_v64i8(a) ( \ 257 | (v64_mask_t) { \ 258 | .m1 = _i_v64i8(movemask)((a).v1), \ 259 | .m2 = _i_v64i8(movemask)((a).v2), \ 260 | .m3 = _i_v64i8(movemask)((a).v3), \ 261 | .m4 = _i_v64i8(movemask)((a).v4) \ 262 | } \ 263 | ) 264 | 265 | /* convert */ 266 | #define _cvt_v64i16_v64i8(a) ( \ 267 | (v64i8_t) { \ 268 | _mm_packs_epi16((a).v1, (a).v2), \ 269 | _mm_packs_epi16((a).v3, (a).v4), \ 270 | _mm_packs_epi16((a).v5, (a).v6), \ 271 | _mm_packs_epi16((a).v7, (a).v8) \ 272 | } \ 273 | ) 274 | 275 | /* debug print */ 276 | #ifdef _LOG_H_INCLUDED 277 | #define _print_v64i8(a) { \ 278 | debug("(v64i8_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 279 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 280 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 281 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 282 | #a, \ 283 | _ext_v64i8(a, 32 + 31), \ 284 | _ext_v64i8(a, 32 + 30), \ 285 | _ext_v64i8(a, 32 + 29), \ 286 | _ext_v64i8(a, 32 + 28), \ 287 | _ext_v64i8(a, 32 + 27), \ 288 | _ext_v64i8(a, 32 + 26), \ 289 | _ext_v64i8(a, 32 + 25), \ 290 | _ext_v64i8(a, 32 + 24), \ 291 | _ext_v64i8(a, 32 + 23), \ 292 | _ext_v64i8(a, 32 + 22), \ 293 | _ext_v64i8(a, 32 + 21), \ 294 | _ext_v64i8(a, 32 + 20), \ 295 | _ext_v64i8(a, 32 + 19), \ 296 | _ext_v64i8(a, 32 + 18), \ 297 | _ext_v64i8(a, 32 + 17), \ 298 | _ext_v64i8(a, 32 + 16), \ 299 | _ext_v64i8(a, 32 + 15), \ 300 | _ext_v64i8(a, 32 + 14), \ 301 | _ext_v64i8(a, 32 + 13), \ 302 | _ext_v64i8(a, 32 + 12), \ 303 | _ext_v64i8(a, 32 + 11), \ 304 | _ext_v64i8(a, 32 + 10), \ 305 | _ext_v64i8(a, 32 + 9), \ 306 | _ext_v64i8(a, 32 + 8), \ 307 | _ext_v64i8(a, 32 + 7), \ 308 | _ext_v64i8(a, 32 + 6), \ 309 | _ext_v64i8(a, 32 + 5), \ 310 | _ext_v64i8(a, 32 + 4), \ 311 | _ext_v64i8(a, 32 + 3), \ 312 | _ext_v64i8(a, 32 + 2), \ 313 | _ext_v64i8(a, 32 + 1), \ 314 | _ext_v64i8(a, 32 + 0), \ 315 | _ext_v64i8(a, 31), \ 316 | _ext_v64i8(a, 30), \ 317 | _ext_v64i8(a, 29), \ 318 | _ext_v64i8(a, 28), \ 319 | _ext_v64i8(a, 27), \ 320 | _ext_v64i8(a, 26), \ 321 | _ext_v64i8(a, 25), \ 322 | _ext_v64i8(a, 24), \ 323 | _ext_v64i8(a, 23), \ 324 | _ext_v64i8(a, 22), \ 325 | _ext_v64i8(a, 21), \ 326 | _ext_v64i8(a, 20), \ 327 | _ext_v64i8(a, 19), \ 328 | _ext_v64i8(a, 18), \ 329 | _ext_v64i8(a, 17), \ 330 | _ext_v64i8(a, 16), \ 331 | _ext_v64i8(a, 15), \ 332 | _ext_v64i8(a, 14), \ 333 | _ext_v64i8(a, 13), \ 334 | _ext_v64i8(a, 12), \ 335 | _ext_v64i8(a, 11), \ 336 | _ext_v64i8(a, 10), \ 337 | _ext_v64i8(a, 9), \ 338 | _ext_v64i8(a, 8), \ 339 | _ext_v64i8(a, 7), \ 340 | _ext_v64i8(a, 6), \ 341 | _ext_v64i8(a, 5), \ 342 | _ext_v64i8(a, 4), \ 343 | _ext_v64i8(a, 3), \ 344 | _ext_v64i8(a, 2), \ 345 | _ext_v64i8(a, 1), \ 346 | _ext_v64i8(a, 0)); \ 347 | } 348 | #else 349 | #define _print_v64i8(x) ; 350 | #endif 351 | 352 | #endif /* _V64I8_H_INCLUDED */ 353 | /** 354 | * end of v64i8.h 355 | */ 356 | -------------------------------------------------------------------------------- /arch/x86_64_sse41/v64i16.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file v64i16.h 4 | * 5 | * @brief struct and _Generic based vector class implementation 6 | */ 7 | #ifndef _V64I16_H_INCLUDED 8 | #define _V64I16_H_INCLUDED 9 | 10 | /* include header for intel / amd 
sse2 instruction sets */ 11 | #include 12 | 13 | /* 16bit 64cell */ 14 | typedef struct v64i16_s { 15 | __m128i v1; 16 | __m128i v2; 17 | __m128i v3; 18 | __m128i v4; 19 | __m128i v5; 20 | __m128i v6; 21 | __m128i v7; 22 | __m128i v8; 23 | } v64i16_t; 24 | 25 | /* expanders (without argument) */ 26 | #define _e_x_v64i16_1(u) 27 | #define _e_x_v64i16_2(u) 28 | #define _e_x_v64i16_3(u) 29 | #define _e_x_v64i16_4(u) 30 | #define _e_x_v64i16_5(u) 31 | #define _e_x_v64i16_6(u) 32 | #define _e_x_v64i16_7(u) 33 | #define _e_x_v64i16_8(u) 34 | 35 | /* expanders (without immediate) */ 36 | #define _e_v_v64i16_1(a) (a).v1 37 | #define _e_v_v64i16_2(a) (a).v2 38 | #define _e_v_v64i16_3(a) (a).v3 39 | #define _e_v_v64i16_4(a) (a).v4 40 | #define _e_v_v64i16_5(a) (a).v5 41 | #define _e_v_v64i16_6(a) (a).v6 42 | #define _e_v_v64i16_7(a) (a).v7 43 | #define _e_v_v64i16_8(a) (a).v8 44 | #define _e_vv_v64i16_1(a, b) (a).v1, (b).v1 45 | #define _e_vv_v64i16_2(a, b) (a).v2, (b).v2 46 | #define _e_vv_v64i16_3(a, b) (a).v3, (b).v3 47 | #define _e_vv_v64i16_4(a, b) (a).v4, (b).v4 48 | #define _e_vv_v64i16_5(a, b) (a).v5, (b).v5 49 | #define _e_vv_v64i16_6(a, b) (a).v6, (b).v6 50 | #define _e_vv_v64i16_7(a, b) (a).v7, (b).v7 51 | #define _e_vv_v64i16_8(a, b) (a).v8, (b).v8 52 | #define _e_vvv_v64i16_1(a, b, c) (a).v1, (b).v1, (c).v1 53 | #define _e_vvv_v64i16_2(a, b, c) (a).v2, (b).v2, (c).v2 54 | #define _e_vvv_v64i16_3(a, b, c) (a).v3, (b).v3, (c).v3 55 | #define _e_vvv_v64i16_4(a, b, c) (a).v4, (b).v4, (c).v4 56 | #define _e_vvv_v64i16_5(a, b, c) (a).v5, (b).v5, (c).v5 57 | #define _e_vvv_v64i16_6(a, b, c) (a).v6, (b).v6, (c).v6 58 | #define _e_vvv_v64i16_7(a, b, c) (a).v7, (b).v7, (c).v7 59 | #define _e_vvv_v64i16_8(a, b, c) (a).v8, (b).v8, (c).v8 60 | 61 | /* expanders with immediate */ 62 | #define _e_i_v64i16_1(imm) (imm) 63 | #define _e_i_v64i16_2(imm) (imm) 64 | #define _e_i_v64i16_3(imm) (imm) 65 | #define _e_i_v64i16_4(imm) (imm) 66 | #define _e_i_v64i16_5(imm) (imm) 67 | #define _e_i_v64i16_6(imm) (imm) 68 | #define _e_i_v64i16_7(imm) (imm) 69 | #define _e_i_v64i16_8(imm) (imm) 70 | #define _e_vi_v64i16_1(a, imm) (a).v1, (imm) 71 | #define _e_vi_v64i16_2(a, imm) (a).v2, (imm) 72 | #define _e_vi_v64i16_3(a, imm) (a).v3, (imm) 73 | #define _e_vi_v64i16_4(a, imm) (a).v4, (imm) 74 | #define _e_vi_v64i16_5(a, imm) (a).v5, (imm) 75 | #define _e_vi_v64i16_6(a, imm) (a).v6, (imm) 76 | #define _e_vi_v64i16_7(a, imm) (a).v7, (imm) 77 | #define _e_vi_v64i16_8(a, imm) (a).v8, (imm) 78 | #define _e_vvi_v64i16_1(a, b, imm) (a).v1, (b).v1, (imm) 79 | #define _e_vvi_v64i16_2(a, b, imm) (a).v2, (b).v2, (imm) 80 | #define _e_vvi_v64i16_3(a, b, imm) (a).v3, (b).v3, (imm) 81 | #define _e_vvi_v64i16_4(a, b, imm) (a).v4, (b).v4, (imm) 82 | #define _e_vvi_v64i16_5(a, b, imm) (a).v5, (b).v5, (imm) 83 | #define _e_vvi_v64i16_6(a, b, imm) (a).v6, (b).v6, (imm) 84 | #define _e_vvi_v64i16_7(a, b, imm) (a).v7, (b).v7, (imm) 85 | #define _e_vvi_v64i16_8(a, b, imm) (a).v8, (b).v8, (imm) 86 | 87 | /* address calculation macros */ 88 | #define _addr_v64i16_1(imm) ( (__m128i *)(imm) ) 89 | #define _addr_v64i16_2(imm) ( (__m128i *)(imm) + 1 ) 90 | #define _addr_v64i16_3(imm) ( (__m128i *)(imm) + 2 ) 91 | #define _addr_v64i16_4(imm) ( (__m128i *)(imm) + 3 ) 92 | #define _addr_v64i16_5(imm) ( (__m128i *)(imm) + 4 ) 93 | #define _addr_v64i16_6(imm) ( (__m128i *)(imm) + 5 ) 94 | #define _addr_v64i16_7(imm) ( (__m128i *)(imm) + 6 ) 95 | #define _addr_v64i16_8(imm) ( (__m128i *)(imm) + 7 ) 96 | #define _pv_v64i16(ptr) ( _addr_v64i16_1(ptr) 
) 97 | 98 | /* expanders with pointers */ 99 | #define _e_p_v64i16_1(ptr) _addr_v64i16_1(ptr) 100 | #define _e_p_v64i16_2(ptr) _addr_v64i16_2(ptr) 101 | #define _e_p_v64i16_3(ptr) _addr_v64i16_3(ptr) 102 | #define _e_p_v64i16_4(ptr) _addr_v64i16_4(ptr) 103 | #define _e_p_v64i16_5(ptr) _addr_v64i16_5(ptr) 104 | #define _e_p_v64i16_6(ptr) _addr_v64i16_6(ptr) 105 | #define _e_p_v64i16_7(ptr) _addr_v64i16_7(ptr) 106 | #define _e_p_v64i16_8(ptr) _addr_v64i16_8(ptr) 107 | #define _e_pv_v64i16_1(ptr, a) _addr_v64i16_1(ptr), (a).v1 108 | #define _e_pv_v64i16_2(ptr, a) _addr_v64i16_2(ptr), (a).v2 109 | #define _e_pv_v64i16_3(ptr, a) _addr_v64i16_3(ptr), (a).v3 110 | #define _e_pv_v64i16_4(ptr, a) _addr_v64i16_4(ptr), (a).v4 111 | #define _e_pv_v64i16_5(ptr, a) _addr_v64i16_5(ptr), (a).v5 112 | #define _e_pv_v64i16_6(ptr, a) _addr_v64i16_6(ptr), (a).v6 113 | #define _e_pv_v64i16_7(ptr, a) _addr_v64i16_7(ptr), (a).v7 114 | #define _e_pv_v64i16_8(ptr, a) _addr_v64i16_8(ptr), (a).v8 115 | 116 | /* expand intrinsic name */ 117 | #define _i_v64i16(intrin) _mm_##intrin##_epi16 118 | #define _i_v64i16x(intrin) _mm_##intrin##_si128 119 | 120 | /* apply */ 121 | #define _a_v64i16(intrin, expander, ...) ( \ 122 | (v64i16_t) { \ 123 | _i_v64i16(intrin)(expander##_v64i16_1(__VA_ARGS__)), \ 124 | _i_v64i16(intrin)(expander##_v64i16_2(__VA_ARGS__)), \ 125 | _i_v64i16(intrin)(expander##_v64i16_3(__VA_ARGS__)), \ 126 | _i_v64i16(intrin)(expander##_v64i16_4(__VA_ARGS__)), \ 127 | _i_v64i16(intrin)(expander##_v64i16_5(__VA_ARGS__)), \ 128 | _i_v64i16(intrin)(expander##_v64i16_6(__VA_ARGS__)), \ 129 | _i_v64i16(intrin)(expander##_v64i16_7(__VA_ARGS__)), \ 130 | _i_v64i16(intrin)(expander##_v64i16_8(__VA_ARGS__)) \ 131 | } \ 132 | ) 133 | #define _a_v64i16x(intrin, expander, ...) ( \ 134 | (v64i16_t) { \ 135 | _i_v64i16x(intrin)(expander##_v64i16_1(__VA_ARGS__)), \ 136 | _i_v64i16x(intrin)(expander##_v64i16_2(__VA_ARGS__)), \ 137 | _i_v64i16x(intrin)(expander##_v64i16_3(__VA_ARGS__)), \ 138 | _i_v64i16x(intrin)(expander##_v64i16_4(__VA_ARGS__)), \ 139 | _i_v64i16x(intrin)(expander##_v64i16_5(__VA_ARGS__)), \ 140 | _i_v64i16x(intrin)(expander##_v64i16_6(__VA_ARGS__)), \ 141 | _i_v64i16x(intrin)(expander##_v64i16_7(__VA_ARGS__)), \ 142 | _i_v64i16x(intrin)(expander##_v64i16_8(__VA_ARGS__)) \ 143 | } \ 144 | ) 145 | #define _a_v64i16xv(intrin, expander, ...) { \ 146 | _i_v64i16x(intrin)(expander##_v64i16_1(__VA_ARGS__)); \ 147 | _i_v64i16x(intrin)(expander##_v64i16_2(__VA_ARGS__)); \ 148 | _i_v64i16x(intrin)(expander##_v64i16_3(__VA_ARGS__)); \ 149 | _i_v64i16x(intrin)(expander##_v64i16_4(__VA_ARGS__)); \ 150 | _i_v64i16x(intrin)(expander##_v64i16_5(__VA_ARGS__)); \ 151 | _i_v64i16x(intrin)(expander##_v64i16_6(__VA_ARGS__)); \ 152 | _i_v64i16x(intrin)(expander##_v64i16_7(__VA_ARGS__)); \ 153 | _i_v64i16x(intrin)(expander##_v64i16_8(__VA_ARGS__)); \ 154 | } 155 | 156 | /* load and store */ 157 | #define _load_v64i16(...) _a_v64i16x(load, _e_p, __VA_ARGS__) 158 | #define _loadu_v64i16(...) _a_v64i16x(loadu, _e_p, __VA_ARGS__) 159 | #define _store_v64i16(...) _a_v64i16xv(store, _e_pv, __VA_ARGS__) 160 | #define _storeu_v64i16(...) _a_v64i16xv(storeu, _e_pv, __VA_ARGS__) 161 | 162 | /* broadcast */ 163 | #define _set_v64i16(...) _a_v64i16(set1, _e_i, __VA_ARGS__) 164 | #define _zero_v64i16() _a_v64i16x(setzero, _e_x, _unused) 165 | 166 | /* logics */ 167 | #define _not_v64i16(...) _a_v64i16x(not, _e_v, __VA_ARGS__) 168 | #define _and_v64i16(...) _a_v64i16x(and, _e_vv, __VA_ARGS__) 169 | #define _or_v64i16(...) 
_a_v64i16x(or, _e_vv, __VA_ARGS__) 170 | #define _xor_v64i16(...) _a_v64i16x(xor, _e_vv, __VA_ARGS__) 171 | #define _andn_v64i16(...) _a_v64i16x(andnot, _e_vv, __VA_ARGS__) 172 | 173 | /* arithmetics */ 174 | #define _add_v64i16(...) _a_v64i16(add, _e_vv, __VA_ARGS__) 175 | #define _sub_v64i16(...) _a_v64i16(sub, _e_vv, __VA_ARGS__) 176 | #define _max_v64i16(...) _a_v64i16(max, _e_vv, __VA_ARGS__) 177 | #define _min_v64i16(...) _a_v64i16(min, _e_vv, __VA_ARGS__) 178 | 179 | /* compare */ 180 | #define _eq_v64i16(...) _a_v64i16(cmpeq, _e_vv, __VA_ARGS__) 181 | #define _gt_v64i16(...) _a_v64i16(cmpgt, _e_vv, __VA_ARGS__) 182 | 183 | /* insert and extract */ 184 | #define _V64I16_N ( sizeof(__m128i) / sizeof(int16_t) ) 185 | #define _ins_v64i16(a, val, imm) { \ 186 | if((imm) < _V64I16_N) { \ 187 | (a).v1 = _i_v64i8(insert)((a).v1, (val), (imm)); \ 188 | } else if((imm) < 2 * _V64I16_N) { \ 189 | (a).v2 = _i_v64i8(insert)((a).v2, (val), (imm) - _V64I16_N); \ 190 | } else if((imm) < 3 * _V64I16_N) { \ 191 | (a).v3 = _i_v64i8(insert)((a).v3, (val), (imm) - 2 * _V64I16_N); \ 192 | } else if((imm) < 4 * _V64I16_N) { \ 193 | (a).v4 = _i_v64i8(insert)((a).v4, (val), (imm) - 3 * _V64I16_N); \ 194 | } else if((imm) < 5 * _V64I16_N) { \ 195 | (a).v5 = _i_v64i8(insert)((a).v5, (val), (imm) - 4 * _V64I16_N); \ 196 | } else if((imm) < 6 * _V64I16_N) { \ 197 | (a).v6 = _i_v64i8(insert)((a).v6, (val), (imm) - 5 * _V64I16_N); \ 198 | } else if((imm) < 7 * _V64I16_N) { \ 199 | (a).v7 = _i_v64i8(insert)((a).v7, (val), (imm) - 6 * _V64I16_N); \ 200 | } else { \ 201 | (a).v8 = _i_v64i8(insert)((a).v8, (val), (imm) - 7 * _V64I16_N); \ 202 | } \ 203 | } 204 | #define _ext_v64i16(a, imm) ( \ 205 | (int16_t)(((imm) < _V64I16_N) ? ( \ 206 | _i_v64i16(extract)((a).v1, (imm)) \ 207 | ) : ((imm) < 2 * _V64I16_N) ? ( \ 208 | _i_v64i16(extract)((a).v2, (imm) - _V64I16_N) \ 209 | ) : ((imm) < 3 * _V64I16_N) ? ( \ 210 | _i_v64i16(extract)((a).v3, (imm) - 2 * _V64I16_N) \ 211 | ) : ((imm) < 4 * _V64I16_N) ? ( \ 212 | _i_v64i16(extract)((a).v4, (imm) - 3 * _V64I16_N) \ 213 | ) : ((imm) < 5 * _V64I16_N) ? ( \ 214 | _i_v64i16(extract)((a).v5, (imm) - 4 * _V64I16_N) \ 215 | ) : ((imm) < 6 * _V64I16_N) ? ( \ 216 | _i_v64i16(extract)((a).v6, (imm) - 5 * _V64I16_N) \ 217 | ) : ((imm) < 7 * _V64I16_N) ? 
( \ 218 | _i_v64i16(extract)((a).v7, (imm) - 6 * _V64I16_N) \ 219 | ) : ( \ 220 | _i_v64i16(extract)((a).v8, (imm) - 7 * _V64I16_N) \ 221 | )) \ 222 | ) 223 | 224 | /* mask */ 225 | #define _mask_v64i16(a) ( \ 226 | (v64_mask_t) { \ 227 | .m1 = _mm_movemask_epi8(_mm_packs_epi16((a).v1, (a).v2)), \ 228 | .m2 = _mm_movemask_epi8(_mm_packs_epi16((a).v3, (a).v4)), \ 229 | .m3 = _mm_movemask_epi8(_mm_packs_epi16((a).v5, (a).v6)), \ 230 | .m4 = _mm_movemask_epi8(_mm_packs_epi16((a).v7, (a).v8)) \ 231 | } \ 232 | ) 233 | 234 | /* horizontal max (reduction max) */ 235 | #define _hmax_v64i16(a) ({ \ 236 | __m128i _vmax = _mm_max_epi16( \ 237 | _mm_max_epi16(_mm_max_epi16((a).v1, (a).v2), _mm_max_epi16((a).v3, (a).v4)), \ 238 | _mm_max_epi16(_mm_max_epi16((a).v5, (a).v6), _mm_max_epi16((a).v7, (a).v8))); \ 239 | _vmax = _mm_max_epi16(_vmax, \ 240 | _mm_srli_si128(_vmax, 8)); \ 241 | _vmax = _mm_max_epi16(_vmax, \ 242 | _mm_srli_si128(_vmax, 4)); \ 243 | _vmax = _mm_max_epi16(_vmax, \ 244 | _mm_srli_si128(_vmax, 2)); \ 245 | (int16_t)_mm_extract_epi16(_vmax, 0); \ 246 | }) 247 | 248 | #define _cvt_v64i8_v64i16(a) ( \ 249 | (v64i16_t) { \ 250 | _mm_cvtepi8_epi16((a).v1), \ 251 | _mm_cvtepi8_epi16(_mm_srli_si128((a).v1, 8)), \ 252 | _mm_cvtepi8_epi16((a).v2), \ 253 | _mm_cvtepi8_epi16(_mm_srli_si128((a).v2, 8)), \ 254 | _mm_cvtepi8_epi16((a).v3), \ 255 | _mm_cvtepi8_epi16(_mm_srli_si128((a).v3, 8)), \ 256 | _mm_cvtepi8_epi16((a).v4), \ 257 | _mm_cvtepi8_epi16(_mm_srli_si128((a).v4, 8)) \ 258 | } \ 259 | ) 260 | 261 | /* debug print */ 262 | #ifdef _LOG_H_INCLUDED 263 | #define _print_v64i16(a) { \ 264 | debug("(v64i16_t) %s(%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, " \ 265 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, ", \ 266 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, ", \ 267 | "%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d)", \ 268 | #a, \ 269 | _ext_v64i16(a, 32 + 31), \ 270 | _ext_v64i16(a, 32 + 30), \ 271 | _ext_v64i16(a, 32 + 29), \ 272 | _ext_v64i16(a, 32 + 28), \ 273 | _ext_v64i16(a, 32 + 27), \ 274 | _ext_v64i16(a, 32 + 26), \ 275 | _ext_v64i16(a, 32 + 25), \ 276 | _ext_v64i16(a, 32 + 24), \ 277 | _ext_v64i16(a, 32 + 23), \ 278 | _ext_v64i16(a, 32 + 22), \ 279 | _ext_v64i16(a, 32 + 21), \ 280 | _ext_v64i16(a, 32 + 20), \ 281 | _ext_v64i16(a, 32 + 19), \ 282 | _ext_v64i16(a, 32 + 18), \ 283 | _ext_v64i16(a, 32 + 17), \ 284 | _ext_v64i16(a, 32 + 16), \ 285 | _ext_v64i16(a, 32 + 15), \ 286 | _ext_v64i16(a, 32 + 14), \ 287 | _ext_v64i16(a, 32 + 13), \ 288 | _ext_v64i16(a, 32 + 12), \ 289 | _ext_v64i16(a, 32 + 11), \ 290 | _ext_v64i16(a, 32 + 10), \ 291 | _ext_v64i16(a, 32 + 9), \ 292 | _ext_v64i16(a, 32 + 8), \ 293 | _ext_v64i16(a, 32 + 7), \ 294 | _ext_v64i16(a, 32 + 6), \ 295 | _ext_v64i16(a, 32 + 5), \ 296 | _ext_v64i16(a, 32 + 4), \ 297 | _ext_v64i16(a, 32 + 3), \ 298 | _ext_v64i16(a, 32 + 2), \ 299 | _ext_v64i16(a, 32 + 1), \ 300 | _ext_v64i16(a, 32 + 0), \ 301 | _ext_v64i16(a, 31), \ 302 | _ext_v64i16(a, 30), \ 303 | _ext_v64i16(a, 29), \ 304 | _ext_v64i16(a, 28), \ 305 | _ext_v64i16(a, 27), \ 306 | _ext_v64i16(a, 26), \ 307 | _ext_v64i16(a, 25), \ 308 | _ext_v64i16(a, 24), \ 309 | _ext_v64i16(a, 23), \ 310 | _ext_v64i16(a, 22), \ 311 | _ext_v64i16(a, 21), \ 312 | _ext_v64i16(a, 20), \ 313 | _ext_v64i16(a, 19), \ 314 | _ext_v64i16(a, 18), \ 315 | _ext_v64i16(a, 17), \ 316 | _ext_v64i16(a, 16), \ 317 | _ext_v64i16(a, 15), \ 318 | _ext_v64i16(a, 14), \ 319 | _ext_v64i16(a, 13), \ 320 | _ext_v64i16(a, 12), \ 
321 | _ext_v64i16(a, 11), \ 322 | _ext_v64i16(a, 10), \ 323 | _ext_v64i16(a, 9), \ 324 | _ext_v64i16(a, 8), \ 325 | _ext_v64i16(a, 7), \ 326 | _ext_v64i16(a, 6), \ 327 | _ext_v64i16(a, 5), \ 328 | _ext_v64i16(a, 4), \ 329 | _ext_v64i16(a, 3), \ 330 | _ext_v64i16(a, 2), \ 331 | _ext_v64i16(a, 1), \ 332 | _ext_v64i16(a, 0)); \ 333 | } 334 | #else 335 | #define _print_v64i16(x) ; 336 | #endif 337 | 338 | #endif /* _V64I16_H_INCLUDED */ 339 | /** 340 | * end of v64i16.h 341 | */ 342 | -------------------------------------------------------------------------------- /gaba.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file gaba.h 4 | * 5 | * @brief C header of the libgaba (libsea3) API 6 | * 7 | * @author Hajime Suzuki 8 | * @date 2014/12/29 9 | * @license Apache v2 10 | * 11 | * @detail 12 | * a header for libgaba (libsea3): a fast banded seed-and-extend alignment library. 13 | */ 14 | 15 | #ifndef _GABA_H_INCLUDED 16 | #define _GABA_H_INCLUDED 17 | 18 | #include /** NULL and size_t */ 19 | #include /** uint8_t, int32_t, int64_t */ 20 | 21 | /** 22 | * @macro GABA_EXPORT_LEVEL 23 | */ 24 | #if defined(_GABA_WRAP_H_INCLUDED) && !defined(_GABA_EXPORT_LEVEL) 25 | /* included from gaba_wrap.h */ 26 | # define _GABA_EXPORT_LEVEL static 27 | #else 28 | /* single, linked to an object compiled without -DSUFFIX */ 29 | # define _GABA_EXPORT_LEVEL 30 | #endif 31 | 32 | /* do not bare wrapper functions by default */ 33 | #if !defined(_GABA_PARSE_EXPORT_LEVEL) 34 | # define _GABA_PARSE_EXPORT_LEVEL 35 | # warning "export parse" 36 | #endif 37 | 38 | #if !defined(_GABA_WRAP_EXPORT_LEVEL) 39 | # define _GABA_WRAP_EXPORT_LEVEL 40 | #endif 41 | 42 | /** 43 | * @enum gaba_status 44 | */ 45 | enum gaba_status { 46 | GABA_CONT = 0, /* continue, call again the function with the same args (but rarely occurrs) */ 47 | GABA_UPDATE_A = 0x000f, /* update required on section a (always combined with GABA_UPDATE) */ 48 | GABA_UPDATE_B = 0x00f0, /* update required on section b (always combined with GABA_UPDATE) */ 49 | GABA_TERM = 0x8000, /* extension terminated by X-drop */ 50 | GABA_OOM = 0x0400 /* out of memory (indicates malloc returned NULL) */ 51 | }; 52 | 53 | /** 54 | * @type gaba_lmalloc_t, gaba_free_t 55 | * @brief external malloc can be passed, otherwise system malloc will be used 56 | */ 57 | typedef void *(*gaba_lmalloc_t)(void *opaque, size_t size); 58 | typedef void (*gaba_lfree_t)(void *opaque, void *ptr); 59 | 60 | /** 61 | * @struct gaba_alloc_s 62 | * @brief optional memory allocator, malloc and free pair must not be NULL. 
63 | * memory block must be freed when size == 0: 64 | * 65 | * void *alloc(void *opaque, void *ptr, size_t size) { 66 | * if(size == 0) { free(ptr); } 67 | * return(realloc(ptr, size)); 68 | * } 69 | */ 70 | struct gaba_alloc_s { 71 | void *opaque; /** local memory arena */ 72 | gaba_lmalloc_t lmalloc; /** local malloc; dedicated for alignment path generation */ 73 | gaba_lfree_t lfree; /** local free */ 74 | }; 75 | typedef struct gaba_alloc_s gaba_alloc_t; 76 | 77 | /** 78 | * @struct gaba_params_s 79 | * @brief input parameters of gaba_init 80 | */ 81 | struct gaba_params_s { 82 | /** scoring parameters */ 83 | int8_t score_matrix[16]; /** score matrix (substitution matrix) max must not exceed 7 */ 84 | int8_t gi; /** gap open penalty (0 for the linear-gap penalty; positive integer) */ 85 | int8_t ge; /** gap extension penalty (positive integer) */ 86 | int8_t gfa, gfb; /** linear-gap extension penalty for short indels (combined-gap penalty; gf > ge). gfa for gaps on sequence A, gfb for seq. B. */ 87 | 88 | /** score parameters */ 89 | int8_t xdrop; /** X-drop threshold, positive, less than 128 */ 90 | 91 | /** filtering parameters */ 92 | uint8_t filter_thresh; /** popcnt filter threshold, set zero if you want to disable it */ 93 | 94 | /* internal */ 95 | void *reserved; 96 | uint64_t _pad; 97 | }; 98 | typedef struct gaba_params_s gaba_params_t; 99 | 100 | /** 101 | * @macro GABA_PARAMS 102 | * @brief utility macro for gaba_init, see example on header. 103 | */ 104 | #define GABA_PARAMS(...) ( &((struct gaba_params_s const) { __VA_ARGS__ }) ) 105 | 106 | /** 107 | * @macro GABA_SCORE_SIMPLE 108 | * @brief utility macro for constructing score parameters. 109 | */ 110 | #define GABA_SCORE_SIMPLE(_m, _x, _gi, _ge) .score_matrix = { (_m),-(_x),-(_x),-(_x),-(_x),(_m),-(_x),-(_x),-(_x),-(_x),(_m),-(_x),-(_x),-(_x),-(_x),(_m) }, .gi = (_gi), .ge = (_ge) 111 | 112 | /** 113 | * @type gaba_t 114 | * 115 | * @brief (API) an alias to `struct gaba_context_s'. 116 | */ 117 | typedef struct gaba_context_s gaba_t; 118 | 119 | /** 120 | * @type gaba_stack_t 121 | * 122 | * @brief stack context container 123 | */ 124 | typedef struct gaba_stack_s gaba_stack_t; 125 | 126 | /** 127 | * @struct gaba_section_s 128 | * 129 | * @brief section container, a tuple of (id, length, head position). 130 | */ 131 | struct gaba_section_s { 132 | uint32_t id; /** (4) section id */ 133 | uint32_t len; /** (4) length of the seq */ 134 | uint8_t const *base; /** (8) pointer to the head of the sequence */ 135 | }; 136 | typedef struct gaba_section_s gaba_section_t; 137 | #define gaba_build_section(_id, _base, _len) ( \ 138 | (struct gaba_section_s){ \ 139 | .id = (_id), \ 140 | .base = (uint8_t const *)(_base), \ 141 | .len = (_len) \ 142 | } \ 143 | ) 144 | /** 145 | * @macro GABA_EOU 146 | * @brief end-of-userland pointer. Any input sequence pointer p that points to an address 147 | * after the end-of-userland is regarded as "phantom array". The actual sequence is fetched 148 | * from an array located at 2 * GABA_EQU - p (that is, the pointer p is mirrored at the 149 | * GABA_EOU) 150 | */ 151 | #define GABA_EOU ( (uint8_t const *)0x800000000000 ) 152 | #define gaba_mirror(base, len) ( GABA_EOU + (uint64_t)GABA_EOU - (uint64_t)(base) - (uint64_t)(len) ) 153 | 154 | /* gaba_rev is deprecated */ 155 | #define gaba_rev(pos, len) ( (len) + (uint64_t)(len) - (uint64_t)(pos) - 1 ) 156 | 157 | /** 158 | * @type gaba_dp_t 159 | * 160 | * @brief an alias to `struct gaba_dp_context_s`. 
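 *
 * a gaba_dp_t is the thread-local working context: it is created from a
 * gaba_t by gaba_dp_init() below and released with gaba_dp_clean(); its
 * internal stack can be reset with gaba_dp_flush() (see below).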
161 | */ 162 | #ifndef _GABA_WRAP_H_INCLUDED 163 | typedef struct gaba_dp_context_s gaba_dp_t; 164 | #endif 165 | 166 | /** 167 | * @struct gaba_fill_s 168 | */ 169 | struct gaba_fill_s { 170 | uint32_t aid, bid; /** (8) the last-filled section ids */ 171 | uint32_t ascnt, bscnt; /** (8) aligned section counts */ 172 | uint64_t apos, bpos; /** (16) #fetched bases from the head (ppos = apos + bpos) */ 173 | int64_t max; /** (8) max score in the entire band */ 174 | uint32_t status; /** (4) status (section update flags) */ 175 | // int32_t ppos; /** (8) #vectors from the head (FIXME: should be 64bit int) */ 176 | uint32_t reserved[5]; 177 | }; 178 | typedef struct gaba_fill_s gaba_fill_t; 179 | 180 | /** 181 | * @struct gaba_pos_pair_s 182 | */ 183 | struct gaba_pos_pair_s { 184 | uint32_t aid, bid; 185 | uint32_t apos, bpos; 186 | uint64_t plen; 187 | }; 188 | typedef struct gaba_pos_pair_s gaba_pos_pair_t; 189 | 190 | /** 191 | * @struct gaba_segment_s 192 | */ 193 | struct gaba_segment_s { 194 | uint32_t aid, bid; /** (8) id of the sections */ 195 | uint32_t apos, bpos; /** (8) pos in the sections */ 196 | uint32_t alen, blen; /** (8) lengths of the segments */ 197 | uint64_t ppos; /** (8) path string position (offset) */ 198 | }; 199 | typedef struct gaba_segment_s gaba_path_section_t; 200 | #define gaba_plen(seg) ( (seg)->alen + (seg)->blen ) 201 | 202 | /** 203 | * @struct gaba_alignment_s 204 | */ 205 | struct gaba_alignment_s { 206 | /* reserved for internal use */ 207 | void *reserved[2]; 208 | 209 | int64_t score; /** score */ 210 | double identity; /** estimated percent identity over the entire alignment, match_count / (match_count + mismatch_count) */ 211 | uint32_t agcnt, bgcnt; /** #gap bases on seq a and seq b */ 212 | uint32_t dcnt; /** #diagonals (match and mismatch) */ 213 | 214 | uint32_t slen; /* segment length */ 215 | struct gaba_segment_s const *seg; 216 | 217 | uint32_t plen, padding; /* path length (FIXME: uint64_t is better) */ 218 | uint32_t path[]; 219 | }; 220 | typedef struct gaba_alignment_s gaba_alignment_t; 221 | 222 | /** 223 | * @struct gaba_score_s 224 | */ 225 | struct gaba_score_s { 226 | /* score and identity */ 227 | int64_t score; 228 | double identity; 229 | 230 | /* match and gap counts */ 231 | uint32_t agcnt, bgcnt; /* a-side and b-side gap bases */ 232 | uint32_t mcnt, xcnt; /* match and mismatch counts */ 233 | uint32_t aicnt, bicnt; /* gap region counts */ 234 | 235 | /* short-gap counts */ 236 | uint32_t afgcnt, bfgcnt; 237 | uint32_t aficnt, bficnt; 238 | 239 | /* when the section starts with a gap, adj is set gap open penalty for the contiguous gap region */ 240 | int32_t adj; 241 | uint32_t reserved; 242 | }; 243 | typedef struct gaba_score_s gaba_score_t; 244 | 245 | /** 246 | * @fn gaba_init 247 | * @brief (API) gaba_init new API 248 | */ 249 | _GABA_EXPORT_LEVEL 250 | gaba_t *gaba_init(gaba_params_t const *params); 251 | 252 | /** 253 | * @fn gaba_clean 254 | * @brief (API) clean up the alignment context structure. 255 | */ 256 | _GABA_EXPORT_LEVEL 257 | void gaba_clean(gaba_t *ctx); 258 | 259 | /** 260 | * @fn gaba_dp_init 261 | * @brief create thread-local context deriving the global context (ctx) 262 | * with local memory arena and working buffers. alim and blim are respectively 263 | * the tails of sequence arrays. 
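 *
 * minimal usage sketch (illustrative parameters; aseq/alen, bseq/blen, the
 * section ids and the trailing pridx argument (0 here) are placeholders, and
 * the fill loop that checks f->status and feeds further sections until
 * GABA_TERM is omitted):
 *
 *   gaba_t *ctx = gaba_init(GABA_PARAMS(GABA_SCORE_SIMPLE(2, 3, 5, 1), .xdrop = 64));
 *   gaba_dp_t *dp = gaba_dp_init(ctx);
 *   gaba_section_t a = gaba_build_section(0, aseq, alen);
 *   gaba_section_t b = gaba_build_section(2, bseq, blen);
 *   gaba_fill_t *f = gaba_dp_fill_root(dp, &a, 0, &b, 0, 0);
 *   gaba_alignment_t *aln = gaba_dp_trace(dp, f, NULL);
 *   gaba_dp_res_free(dp, aln);
 *   gaba_dp_clean(dp);
 *   gaba_clean(ctx);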
264 | */ 265 | _GABA_EXPORT_LEVEL 266 | gaba_dp_t *gaba_dp_init(gaba_t const *ctx); 267 | 268 | /** 269 | * @fn gaba_dp_flush 270 | * @brief flush stack (flush all if NULL) 271 | */ 272 | _GABA_EXPORT_LEVEL 273 | void gaba_dp_flush( 274 | gaba_dp_t *dp); 275 | 276 | /** 277 | * @fn gaba_dp_save_stack 278 | */ 279 | _GABA_EXPORT_LEVEL 280 | gaba_stack_t const *gaba_dp_save_stack( 281 | gaba_dp_t *dp); 282 | 283 | /** 284 | * @fn gaba_dp_flush_stack 285 | */ 286 | _GABA_EXPORT_LEVEL 287 | void gaba_dp_flush_stack( 288 | gaba_dp_t *dp, 289 | gaba_stack_t const *stack); 290 | 291 | /** 292 | * @fn gaba_dp_clean 293 | */ 294 | _GABA_EXPORT_LEVEL 295 | void gaba_dp_clean( 296 | gaba_dp_t *dp); 297 | 298 | /** 299 | * @fn gaba_dp_fill_root 300 | */ 301 | _GABA_EXPORT_LEVEL 302 | gaba_fill_t *gaba_dp_fill_root( 303 | gaba_dp_t *dp, 304 | gaba_section_t const *a, 305 | uint32_t apos, 306 | gaba_section_t const *b, 307 | uint32_t bpos, 308 | uint32_t pridx); 309 | 310 | /** 311 | * @fn gaba_dp_fill 312 | * @brief fill dp matrix inside section pairs 313 | */ 314 | _GABA_EXPORT_LEVEL 315 | gaba_fill_t *gaba_dp_fill( 316 | gaba_dp_t *dp, 317 | gaba_fill_t const *prev_sec, 318 | gaba_section_t const *a, 319 | gaba_section_t const *b, 320 | uint32_t pridx); 321 | 322 | /** 323 | * @fn gaba_dp_merge 324 | * @brief merge multiple sections. all the vectors (tail objects) must be aligned on the same ppos, 325 | * and qofs are the q-distance of the two fill objects. 326 | */ 327 | #define MAX_MERGE_COUNT ( 14 ) 328 | _GABA_EXPORT_LEVEL 329 | gaba_fill_t *gaba_dp_merge( 330 | gaba_dp_t *dp, 331 | gaba_fill_t const *const *sec, 332 | uint8_t const *qofs, 333 | uint32_t cnt); 334 | 335 | /** 336 | * @fn gaba_dp_search_max 337 | */ 338 | _GABA_EXPORT_LEVEL 339 | gaba_pos_pair_t *gaba_dp_search_max( 340 | gaba_dp_t *dp, 341 | gaba_fill_t const *sec); 342 | 343 | /** 344 | * @fn gaba_dp_trace 345 | * @brief generate alignment result string, alloc->malloc and alloc->free must not be NULL if alloc is not NULL. 346 | */ 347 | _GABA_EXPORT_LEVEL 348 | gaba_alignment_t *gaba_dp_trace( 349 | gaba_dp_t *dp, 350 | gaba_fill_t const *tail, 351 | gaba_alloc_t const *alloc); 352 | 353 | /** 354 | * @fn gaba_dp_res_free 355 | */ 356 | _GABA_EXPORT_LEVEL 357 | void gaba_dp_res_free( 358 | gaba_dp_t *dp, 359 | gaba_alignment_t *aln); 360 | 361 | /** 362 | * @fn gaba_dp_calc_score 363 | * @brief calculate score, match count, mismatch count, and gap counts for the section 364 | */ 365 | _GABA_EXPORT_LEVEL 366 | gaba_score_t *gaba_dp_calc_score( 367 | gaba_dp_t *dp, 368 | uint32_t const *path, 369 | gaba_path_section_t const *s, 370 | gaba_section_t const *a, 371 | gaba_section_t const *b); 372 | 373 | /** 374 | * parser functions: the actual implementations are in gaba_parse.h 375 | */ 376 | 377 | /** 378 | * @type gaba_printer_t 379 | * @brief printer for print functions. simplest one to dump a CIGAR operation can be the following: 380 | * 381 | * int printer(FILE *fp, uint64_t len, char c) { return(fprintf(fp, "%c%lu", c, len)); } 382 | */ 383 | #ifndef _GABA_PRINTER_T_DEFINED 384 | #define _GABA_PRINTER_T_DEFINED 385 | typedef int (*gaba_printer_t)(void *, uint64_t, char); 386 | #endif 387 | 388 | /** 389 | * @fn gaba_print_cigar_forward, gaba_print_cigar_reverse 390 | * @brief dump CIGAR string (4M1I5M1D...) for a range specified by [offset, offset + len) on the path, 391 | * the range can be retireved from segment by [seg[i].ppos, gaba_plen(&seg[i])). 
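 *
 * e.g. dumping the CIGAR of segment i of an alignment returned by
 * gaba_dp_trace() (printer follows the gaba_printer_t example above;
 * aln is a gaba_alignment_t):
 *   gaba_print_cigar_forward(printer, stdout, aln->path, aln->seg[i].ppos, gaba_plen(&aln->seg[i]));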
392 | */ 393 | _GABA_PARSE_EXPORT_LEVEL 394 | uint64_t gaba_print_cigar_forward( 395 | gaba_printer_t printer, 396 | void *fp, 397 | uint32_t const *path, 398 | uint64_t offset, 399 | uint64_t len); 400 | _GABA_PARSE_EXPORT_LEVEL 401 | uint64_t gaba_print_cigar_reverse( 402 | gaba_printer_t printer, 403 | void *fp, 404 | uint32_t const *path, 405 | uint64_t offset, 406 | uint64_t len); 407 | 408 | /** 409 | * @fn gaba_dump_cigar_forward, gaba_dump_cigar_reverse 410 | * @brief dump to memory. see print functions for the details. 411 | */ 412 | _GABA_PARSE_EXPORT_LEVEL 413 | uint64_t gaba_dump_cigar_forward( 414 | char *buf, 415 | uint64_t buf_size, 416 | uint32_t const *path, 417 | uint64_t offset, 418 | uint64_t len); 419 | _GABA_PARSE_EXPORT_LEVEL 420 | uint64_t gaba_dump_cigar_reverse( 421 | char *buf, 422 | uint64_t buf_size, 423 | uint32_t const *path, 424 | uint64_t offset, 425 | uint64_t len); 426 | 427 | /** 428 | * @fn gaba_dump_xcigar_forward, gaba_dump_xcigar_reverse 429 | */ 430 | _GABA_PARSE_EXPORT_LEVEL 431 | uint64_t gaba_print_xcigar_forward( 432 | gaba_printer_t printer, 433 | void *fp, 434 | uint32_t const *path, 435 | gaba_path_section_t const *s, 436 | gaba_section_t const *a, 437 | gaba_section_t const *b); 438 | _GABA_PARSE_EXPORT_LEVEL 439 | uint64_t gaba_print_xcigar_reverse( 440 | gaba_printer_t printer, 441 | void *fp, 442 | uint32_t const *path, 443 | gaba_path_section_t const *s, 444 | gaba_section_t const *a, 445 | gaba_section_t const *b); 446 | _GABA_PARSE_EXPORT_LEVEL 447 | uint64_t gaba_dump_xcigar_forward( 448 | char *buf, 449 | uint64_t buf_size, 450 | uint32_t const *path, 451 | gaba_path_section_t const *s, 452 | gaba_section_t const *a, 453 | gaba_section_t const *b); 454 | _GABA_PARSE_EXPORT_LEVEL 455 | uint64_t gaba_dump_xcigar_reverse( 456 | char *buf, 457 | uint64_t buf_size, 458 | uint32_t const *path, 459 | gaba_path_section_t const *s, 460 | gaba_section_t const *a, 461 | gaba_section_t const *b); 462 | 463 | /** 464 | * @fn gaba_dump_seq_forward, gaba_dump_seq_reverse 465 | * @brief dump sequence in ASCII format (ACACTGG...) with gaps. 466 | */ 467 | _GABA_PARSE_EXPORT_LEVEL 468 | uint64_t gaba_dump_seq_forward( 469 | char *buf, 470 | uint64_t buf_size, 471 | uint32_t conf, /* { SEQ_A, SEQ_B } x { SEQ_FW, SEQ_RV } */ 472 | uint32_t const *path, 473 | uint64_t offset, 474 | uint64_t len, 475 | uint8_t const *seq, /* a->seq[s->alen] when SEQ_RV */ 476 | char gap); /* gap char, '-' */ 477 | _GABA_PARSE_EXPORT_LEVEL 478 | uint64_t gaba_dump_seq_reverse( 479 | char *buf, 480 | uint64_t buf_size, 481 | uint32_t conf, /* { SEQ_A, SEQ_B } x { SEQ_FW, SEQ_RV } */ 482 | uint32_t const *path, 483 | uint64_t offset, 484 | uint64_t len, 485 | uint8_t const *seq, /* a->seq[s->alen] when SEQ_RV */ 486 | char gap); /* gap char, '-' */ 487 | 488 | /** 489 | * @fn gaba_dump_seq_ref, gaba_dump_seq_query 490 | * @brief calling the pair dumps MAF-styled two column-aligned strings. 
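 *
 * e.g. (rbuf and qbuf are caller buffers; the last argument is assumed to be
 * the section that segment i refers to on the respective side):
 *   gaba_dump_seq_ref(rbuf, sizeof(rbuf), aln->path, &aln->seg[i], &a);
 *   gaba_dump_seq_query(qbuf, sizeof(qbuf), aln->path, &aln->seg[i], &b);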
491 | */ 492 | _GABA_PARSE_EXPORT_LEVEL 493 | uint64_t gaba_dump_seq_ref( 494 | char *buf, 495 | uint64_t buf_size, 496 | uint32_t const *path, 497 | gaba_path_section_t const *s, 498 | gaba_section_t const *a); 499 | _GABA_PARSE_EXPORT_LEVEL 500 | uint64_t gaba_dump_seq_query( 501 | char *buf, 502 | uint64_t buf_size, 503 | uint32_t const *path, 504 | gaba_path_section_t const *s, 505 | gaba_section_t const *a); 506 | 507 | #endif /* #ifndef _GABA_H_INCLUDED */ 508 | 509 | /* 510 | * end of gaba.h 511 | */ 512 | --------------------------------------------------------------------------------