├── .gitignore ├── Makefile ├── Arithmetic_Intrinsics └── src │ ├── hadds.c │ ├── hsubs.c │ ├── div.c │ ├── mulhrs.c │ ├── addsub.c │ ├── mullo.c │ ├── mulhi.c │ ├── mul.c │ ├── hadd.c │ ├── hsub.c │ ├── fmaddsub.c │ ├── fmsubadd.c │ ├── adds.c │ ├── subs.c │ ├── fmadd.c │ ├── fmsub.c │ ├── fnmadd.c │ ├── fnmsub.c │ ├── add.c │ ├── sub.c │ └── Makefile ├── Initialization_Intrinsics ├── include │ └── gcc_support.h └── src │ ├── setzero.c │ ├── loadu.c │ ├── load.c │ ├── Makefile │ ├── set1.c │ ├── set.c │ ├── setr.c │ └── maskload.c ├── Permuting_and_Shuffling ├── include │ └── gcc_support.h └── src │ ├── permute4x64.c │ ├── shufflehi.c │ ├── shufflelo.c │ ├── permutevar8x32.c │ ├── permute.c │ ├── Makefile │ ├── permutevar.c │ ├── permute2f128.c │ └── shuffle.c ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | obj 3 | *.o 4 | 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Author: TripleZ 2 | # Date: 2018-08-17 <-- This is my 21st birthday :tada: 3 | 4 | CC = gcc 5 | CFLAGS = -I$(INCDIR) -mavx -mavx2 -mfma -msse -msse2 -msse3 -Wall 6 | 7 | export CC 8 | export CFLAGS 9 | 10 | all: 11 | @$(MAKE) -C Initialization_Intrinsics/src all 12 | @$(MAKE) -C Arithmetic_Intrinsics/src all 13 | @$(MAKE) -C Permuting_and_Shuffling/src all 14 | 15 | .PHONY: clean run 16 | clean: 17 | @$(MAKE) -C Initialization_Intrinsics/src clean 18 | @$(MAKE) -C Arithmetic_Intrinsics/src clean 19 | @$(MAKE) -C Permuting_and_Shuffling/src clean 20 | 21 | run: 22 | @$(MAKE) -C Initialization_Intrinsics/src runall 23 | @$(MAKE) -C Arithmetic_Intrinsics/src runall 24 | @$(MAKE) -C Permuting_and_Shuffling/src runall 25 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/hadds.c: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // 16-bit integer horizontal addition with saturation (AVX2) 12 | __m256i epi16_vec_0 = _mm256_set1_epi16(32700); 13 | __m256i epi16_vec_1 = _mm256_set1_epi16(17); 14 | 15 | __m256i epi16_result = _mm256_hadds_epi16(epi16_vec_0, epi16_vec_1); 16 | 17 | short* sho = (short*) &epi16_result; 18 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 19 | 20 | 21 | return 0; 22 | } 23 | -------------------------------------------------------------------------------- /Initialization_Intrinsics/include/gcc_support.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-16 4 | */ 5 | 6 | #ifndef __GCC_SUPPORT_H 7 | #define __GCC_SUPPORT_H 8 | 9 | #define _mm256_set_m128(vh, vl) \ 10 | _mm256_insertf128_ps(_mm256_castps128_ps256(vl), vh, 1) 11 | 12 | #define _mm256_set_m128d(vh, vl) \ 13 | _mm256_insertf128_pd(_mm256_castpd128_pd256(vl), (vh), 1) 14 | 15 | #define _mm256_set_m128i(vh, vl) \ 16 | _mm256_insertf128_si256(_mm256_castsi128_si256(vl), (vh), 1) 17 | 18 | #define _mm256_setr_m128(vh, vl) \ 19 | _mm256_set_m128((vl), (vh)) 20 | 21 | #define _mm256_setr_m128d(vh, vl) \ 22 | _mm256_set_m128d((vl), (vh)) 23 | 24 | #define _mm256_setr_m128i(vh, vl) \ 25 | _mm256_set_m128i((vl), (vh)) 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /Permuting_and_Shuffling/include/gcc_support.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-16 4 | */ 5 | 6 | #ifndef __GCC_SUPPORT_H 7 
| #define __GCC_SUPPORT_H 8 | 9 | #define _mm256_set_m128(vh, vl) \ 10 | _mm256_insertf128_ps(_mm256_castps128_ps256(vl), vh, 1) 11 | 12 | #define _mm256_set_m128d(vh, vl) \ 13 | _mm256_insertf128_pd(_mm256_castpd128_pd256(vl), (vh), 1) 14 | 15 | #define _mm256_set_m128i(vh, vl) \ 16 | _mm256_insertf128_si256(_mm256_castsi128_si256(vl), (vh), 1) 17 | 18 | #define _mm256_setr_m128(vh, vl) \ 19 | _mm256_set_m128((vl), (vh)) 20 | 21 | #define _mm256_setr_m128d(vh, vl) \ 22 | _mm256_set_m128d((vl), (vh)) 23 | 24 | #define _mm256_setr_m128i(vh, vl) \ 25 | _mm256_set_m128i((vl), (vh)) 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/hsubs.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // 16-bit integer horizontal subtraction with saturation (AVX2) 12 | __m256i epi16_vec_0 = _mm256_setr_epi16(-32767, 100, -32767, 100, -32767, 100, -32767, 100, -32767, 100, -32767, 100, -32767, 100, -32767, 100); 13 | __m256i epi16_vec_1 = _mm256_setr_epi16(2, 7, 2, 7, 2, 7, 2, 7, 2, 7, 2, 7, 2, 7, 2, 7); 14 | 15 | __m256i epi16_result = _mm256_hsubs_epi16(epi16_vec_0, epi16_vec_1); 16 | 17 | short* sho = (short*) &epi16_result; 18 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 19 | 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /Initialization_Intrinsics/src/setzero.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-16 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | 
// Single-precision 12 | __m256 float_vec = _mm256_setzero_ps(); 13 | 14 | float* flo = (float*) &float_vec; 15 | printf("float:\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 16 | 17 | // Double-precision 18 | __m256d double_vec = _mm256_setzero_pd(); 19 | 20 | double* dou = (double*) &double_vec; 21 | printf("double:\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 22 | 23 | // Integers 24 | __m256i int_vec = _mm256_setzero_si256(); 25 | 26 | int* i = (int*) &int_vec; 27 | printf("int:\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 28 | 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/div.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-20 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision division 12 | __m256 float_vec_0 = _mm256_set1_ps(2.0); 13 | __m256 float_vec_1 = _mm256_set1_ps(8.0); 14 | 15 | __m256 float_result = _mm256_div_ps(float_vec_0, float_vec_1); 16 | 17 | float* flo = (float*) &float_result; 18 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 19 | 20 | // Double-precision division 21 | __m256d double_vec_0 = _mm256_set1_pd(2.0); 22 | __m256d double_vec_1 = _mm256_set1_pd(8.0); 23 | 24 | __m256d double_result = _mm256_div_pd(double_vec_0, double_vec_1); 25 | 26 | double* dou = (double*) &double_result; 27 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 28 | 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/mulhrs.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 
2018-08-20 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Signed 16-bit integer multiplication with rounding and scaling: stores bits [16:1] of the rounded 32-bit product (AVX2) 12 | __m256i epi16_vec_0 = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); 13 | __m256i epi16_vec_1 = _mm256_setr_epi16((unsigned short) 65535, (unsigned short) 65536, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 4, 5); 14 | 15 | __m256i epi16_result = _mm256_mulhrs_epi16(epi16_vec_0, epi16_vec_1); 16 | 17 | unsigned short *sho = (unsigned short *)&epi16_result; 18 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 19 | 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /Permuting_and_Shuffling/src/permute4x64.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-21 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Double-precision permutation with 256-bit vector and 8-bit control value (AVX2) 12 | __m256d double_256_vec_0 = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); 13 | 14 | __m256d double_256_result = _mm256_permute4x64_pd(double_256_vec_0, 0b10011000); 15 | 16 | double* dou = (double*) &double_256_result; 17 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 18 | 19 | // 64-bit integer permutation with 256-bit vector and 8-bit control value (AVX2) 20 | __m256i epi64_256_vec_0 = _mm256_set_epi64x(4, 3, 2, 1); 21 | 22 | __m256i epi64_256_result = _mm256_permute4x64_epi64(epi64_256_vec_0, 0b10011000); 23 | 24 | long long int* i = (long long int*) &epi64_256_result; 25 | printf("long long int:\t %lld, %lld, %lld, %lld\n", i[0], i[1], 
i[2], i[3]); 26 | 27 | return 0; 28 | } 29 | -------------------------------------------------------------------------------- /Permuting_and_Shuffling/src/shufflehi.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-09-07 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, const char *argv[]) { 10 | 11 | // 16-bit integer shuffle in the high 64 bits of 128-lanes using the control in imm8 (AVX2) 12 | __m256i epi16_256_vec_0 = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 13 | 14 | __m256i epi16_256_result = _mm256_shufflehi_epi16(epi16_256_vec_0, 0b00010111); 15 | // The result should be: (max -> least) 4, 3, 3, 1, 5, 6, 7, 8, 16 | // 12, 11, 11, 9, 13, 14, 15, 16 17 | // (least -> max) 16, 15, 14, 13, 9, 11, 11, 12, 18 | // 8, 7, 6, 5, 1, 3, 3, 4 19 | 20 | short* sho = (short*) &epi16_256_result; 21 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 22 | 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /Permuting_and_Shuffling/src/shufflelo.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-09-07 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, const char *argv[]) { 10 | 11 | // 16-bit integer shuffle in the low 64 bits of 128-lanes using the control in imm8 (AVX2) 12 | __m256i epi16_256_vec_0 = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 13 | 14 | __m256i epi16_256_result = _mm256_shufflelo_epi16(epi16_256_vec_0, 0b00010111); 15 | // The result should be: (max -> least) 1, 2, 3, 4, 8, 7, 7, 5, 16 | // 9, 10, 11, 12, 16, 15, 15, 13 17 | // (least -> max) 13, 15, 15, 16, 12, 11, 10, 9, 18 | // 5, 7, 
7, 8, 4, 3, 2, 1 19 | 20 | short* sho = (short*) &epi16_256_result; 21 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 22 | 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/addsub.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision addsub operation 12 | __m256 float_vec_0 = _mm256_set_ps(1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0); 13 | __m256 float_vec_1 = _mm256_set_ps(2.0, 7.0, 2.0, 7.0, 2.0, 7.0, 2.0, 7.0); 14 | 15 | __m256 float_result = _mm256_addsub_ps(float_vec_0, float_vec_1); // (3-7), (1+2) 16 | 17 | float* flo = (float*) &float_result; 18 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 19 | 20 | // Double-precision addsub operation 21 | __m256d double_vec_0 = _mm256_set_pd(1.0, 3.0, 1.0, 3.0); 22 | __m256d double_vec_1 = _mm256_set_pd(2.0, 7.0, 2.0, 7.0); 23 | 24 | __m256d double_result = _mm256_addsub_pd(double_vec_0, double_vec_1); // (3-7), (1+2) 25 | 26 | double* dou = (double*) &double_result; 27 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 28 | 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /Permuting_and_Shuffling/src/permutevar8x32.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-21 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision 8x32 permutation with 256-bit vector and 32-bit integers 
control vector (AVX2) 12 | __m256 float_256_vec_0 = _mm256_set_ps(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); 13 | __m256i epi32_256_ctl_0 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); 14 | 15 | __m256 float_256_result = _mm256_permutevar8x32_ps(float_256_vec_0, epi32_256_ctl_0); 16 | 17 | float* flo = (float*) &float_256_result; 18 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 19 | 20 | // 32-bit integer 8x32 permutation with 256-bit vector and 32-bit integers control vector (AVX2) 21 | __m256i epi32_256_vec_0 = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); 22 | __m256i epi32_256_ctl_1 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); 23 | 24 | __m256i epi32_256_result = _mm256_permutevar8x32_epi32(epi32_256_vec_0, epi32_256_ctl_1); 25 | 26 | int* i = (int*) &epi32_256_result; 27 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 28 | 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/mullo.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-18 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Signed 16-bit integer multiplication with low bits (AVX2) 12 | __m256i epi16_vec_0 = _mm256_setr_epi16(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 32767, 32767, 32767); 13 | __m256i epi16_vec_1 = _mm256_setr_epi16(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 3, 3); 14 | 15 | __m256i epi16_result = _mm256_mullo_epi16(epi16_vec_0, epi16_vec_1); 16 | 17 | short* sho = (short*) &epi16_result; 18 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 19 | 20 | // Signed 32-bit integer multiplication with 
low bits (AVX2) 21 | __m256i epi32_vec_0 = _mm256_setr_epi32(1, 3, 1, 3, 1, 2147483647, 2147483647, 2147483647); 22 | __m256i epi32_vec_1 = _mm256_setr_epi32(1, 3, 1, 3, 1, 3, 3, 3); 23 | 24 | __m256i epi32_result = _mm256_mullo_epi32(epi32_vec_0, epi32_vec_1); 25 | 26 | int* i = (int*) &epi32_result; 27 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 28 | 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /Initialization_Intrinsics/src/loadu.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision 12 | int i; 13 | float* floats = (float*) malloc(8 * 8 * sizeof(float)); 14 | for (i = 0; i < 8; i++) { 15 | floats[i] = (float)(i) + 1.0; 16 | } 17 | 18 | __m256 float_vec = _mm256_loadu_ps(floats); 19 | 20 | float* flo = (float*) &float_vec; 21 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 22 | 23 | // Double-precision 24 | double* doubles = (double*) malloc(4 * 8 * sizeof(double)); 25 | for (i = 0; i < 4; i++) { 26 | doubles[i] = (double)(i) + 1.0; 27 | } 28 | 29 | __m256d double_vec = _mm256_loadu_pd(doubles); 30 | 31 | double* dou = (double*) &double_vec; 32 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 33 | 34 | // 32-bit Integer 35 | int* _int = (int*) malloc(8 * 8 * sizeof(int)); 36 | for (i = 0; i < 8; i++) { 37 | _int[i] = i + 1; 38 | } 39 | 40 | __m256i int_vec = _mm256_loadu_si256((const __m256i*)_int); 41 | 42 | int* i_v = (int*) &int_vec; 43 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i_v[0], i_v[1], i_v[2], i_v[3], i_v[4], i_v[5], i_v[6], i_v[7]); 44 | 45 | return 0; 46 | } 47 | 48 | 
-------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/mulhi.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-19 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Signed 16-bit integer multiplication with high bits (AVX2) 12 | __m256i epi16_vec_0 = _mm256_setr_epi16(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 32767, 32767, 32767); 13 | __m256i epi16_vec_1 = _mm256_setr_epi16(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 3, 3); 14 | 15 | __m256i epi16_result = _mm256_mulhi_epi16(epi16_vec_0, epi16_vec_1); 16 | 17 | short *sho = (short *)&epi16_result; 18 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 19 | 20 | // Unsigned 16-bit integer multiplication with high bits (AVX2) 21 | __m256i epu16_vec_0 = _mm256_setr_epi16(1, 3, 1, 3, 1, (unsigned int)65535, (unsigned int)65535, (unsigned int)65535, 1, 3, 1, 3, 1, (unsigned int)65535, (unsigned int)65535, (unsigned int)65535); 22 | __m256i epu16_vec_1 = _mm256_setr_epi16(1, 3, 1, 3, 1, 3, 3, 3, 1, 3, 1, 3, 1, 3, 3, 3); 23 | 24 | __m256i epu16_result = _mm256_mulhi_epu16(epu16_vec_0, epu16_vec_1); 25 | 26 | int *i = (int *)&epu16_result; 27 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 28 | 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /Initialization_Intrinsics/src/load.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision 12 | int i; 13 | float* aligned_floats = 
(float*) aligned_alloc(32, 8 * 8 * sizeof(float)); 14 | for (i = 0; i < 8; i++) { 15 | aligned_floats[i] = (float)(i) + 1.0; 16 | } 17 | 18 | __m256 float_vec = _mm256_load_ps(aligned_floats); 19 | 20 | float* flo = (float*) &float_vec; 21 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 22 | 23 | // Double-precision 24 | double* aligned_doubles = (double*) aligned_alloc(32, 4 * 8 * sizeof(double)); 25 | for (i = 0; i < 4; i++) { 26 | aligned_doubles[i] = (double)(i) + 1.0; 27 | } 28 | 29 | __m256d double_vec = _mm256_load_pd(aligned_doubles); 30 | 31 | double* dou = (double*) &double_vec; 32 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 33 | 34 | // 32-bit Integer 35 | int* aligned_int = (int*) aligned_alloc(32, 8 * 8 * sizeof(int)); 36 | for (i = 0; i < 8; i++) { 37 | aligned_int[i] = i + 1; 38 | } 39 | 40 | __m256i int_vec = _mm256_load_si256((const __m256i*) aligned_int); 41 | 42 | int* i_v = (int*) &int_vec; 43 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i_v[0], i_v[1], i_v[2], i_v[3], i_v[4], i_v[5], i_v[6], i_v[7]); 44 | 45 | 46 | return 0; 47 | } 48 | 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Zhenzhen Zhao 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 
15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /Initialization_Intrinsics/src/Makefile: -------------------------------------------------------------------------------- 1 | # Author: TripleZ 2 | # Date: 2018-08-16 3 | 4 | CC = gcc 5 | CFLAGS = -I$(INCDIR) -mavx -mavx2 -mfma -msse -msse2 -msse3 -Wall 6 | 7 | INCDIR = ../include 8 | BINDIR = ../bin 9 | OBJDIR = ../obj 10 | 11 | ALL = setzero set1 set setr \ 12 | load loadu maskload 13 | 14 | $(OBJDIR)/%.o: %.c 15 | @[ -d $(OBJDIR) ] || mkdir $(OBJDIR) 16 | $(CC) -c -o $@ $< $(CFLAGS) 17 | 18 | all: $(ALL) 19 | 20 | setzero: $(patsubst %, $(OBJDIR)/%, setzero.o) 21 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 22 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 23 | 24 | set1: $(patsubst %, $(OBJDIR)/%, set1.o) 25 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 26 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 27 | 28 | set: $(patsubst %, $(OBJDIR)/%, set.o) 29 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 30 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 31 | 32 | setr: $(patsubst %, $(OBJDIR)/%, setr.o) 33 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 34 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 35 | 36 | load: $(patsubst %, $(OBJDIR)/%, load.o) 37 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 38 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 39 | 40 | loadu: $(patsubst %, $(OBJDIR)/%, loadu.o) 41 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 42 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 43 | 44 | maskload: $(patsubst %, $(OBJDIR)/%, maskload.o) 45 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 46 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 47 | 48 | run-%: $(BINDIR)/% 49 | $^ 50 | 51 | runall: all 52 | @echo -e 53 | @for exe in $(ALL); do \ 54 | echo "$(BINDIR)/$$exe"; \ 55 | $(BINDIR)/$$exe; \ 56 | echo -e; \ 57 | done 58 | 59 | 60 | .PHONY: clean 61 | clean: 62 | -@rm -rf $(BINDIR) $(OBJDIR) 63 | @echo "- Cleaned Directory 'Initialization_Intrinsics/'" 64 | -------------------------------------------------------------------------------- /Permuting_and_Shuffling/src/permute.c: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-21 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision permutation with 128-bit vector and 8-bit control value 12 | __m128 float_128_vec_0 = _mm_set_ps(4.0, 3.0, 2.0, 1.0); 13 | 14 | __m128 float_128_result = _mm_permute_ps(float_128_vec_0, 0b10011000); 15 | 16 | float* flo = (float*) &float_128_result; 17 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 18 | 19 | // Double-precision permutation with 128-bit vector and 2-bit control value 20 | __m128d double_128_vec_0 = _mm_set_pd(6.0, 5.0); 21 | 22 | __m128d double_128_result = _mm_permute_pd(double_128_vec_0, 0b01); 23 | 24 | double* dou = (double*) &double_128_result; 25 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 26 | 27 | // Single-precision permutation with 256-bit vector and 8-bit control value 28 | __m256 float_256_vec_0 = _mm256_set_ps(4.0, 3.0, 2.0, 1.0, 4.0, 3.0, 2.0, 1.0); 29 | 30 | __m256 float_256_result = _mm256_permute_ps(float_256_vec_0, 0b10011000); 31 | 32 | flo = (float*) &float_256_result; 33 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 34 | 35 | // Double-precision permutation with 256-bit vector and 4-bit control value 36 | __m256d double_256_vec_0 = _mm256_set_pd(6.0, 5.0, 6.0, 5.0); 37 | 38 | __m256d double_256_result = _mm256_permute_pd(double_256_vec_0, 0b0101); 39 | 40 | dou = (double*) &double_256_result; 41 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/mul.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-18 4 | */ 5 | 6 | #include 7 | 
#include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision multiplication 12 | __m256 float_vec_0 = _mm256_set1_ps(2.0); 13 | __m256 float_vec_1 = _mm256_set1_ps(8.0); 14 | 15 | __m256 float_result = _mm256_mul_ps(float_vec_0, float_vec_1); 16 | 17 | float* flo = (float*) &float_result; 18 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 19 | 20 | // Double-precision multiplication 21 | __m256d double_vec_0 = _mm256_set1_pd(2.0); 22 | __m256d double_vec_1 = _mm256_set1_pd(8.0); 23 | 24 | __m256d double_result = _mm256_mul_pd(double_vec_0, double_vec_1); 25 | 26 | double* dou = (double*) &double_result; 27 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 28 | 29 | // Signed 32-bit integer multiplication (AVX2) 30 | __m256i epi32_vec_0 = _mm256_setr_epi32(2, 2, 3, 3, 4, 4, 5, 5); 31 | __m256i epi32_vec_1 = _mm256_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11); 32 | 33 | __m256i epi32_result = _mm256_mul_epi32(epi32_vec_0, epi32_vec_1); 34 | 35 | long long int* i = (long long int*) &epi32_result; 36 | printf("signed int:\t%lld, %lld, %lld, %lld\n", i[0], i[1], i[2], i[3]); 37 | 38 | // Unsigned 32-bit integer multiplication (AVX2) 39 | __m256i epu32_vec_0 = _mm256_setr_epi32((unsigned int)-1, (unsigned int)-1, 3, 3, 4, 4, 5, 5); 40 | __m256i epu32_vec_1 = _mm256_setr_epi32(8, 1, 9, 9, 10, 10, 11, 11); 41 | 42 | __m256i epu32_result = _mm256_mul_epu32(epu32_vec_0, epu32_vec_1); 43 | 44 | long long int* u = (long long int*) &epu32_result; 45 | printf("unsigned int:\t%lld, %lld, %lld, %lld\n", u[0], u[1], u[2], u[3]); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/hadd.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int 
argc, char const *argv[]) { 10 | 11 | // Single-precision horizontal addition 12 | __m256 float_vec_0 = _mm256_set1_ps(8.0); 13 | __m256 float_vec_1 = _mm256_set1_ps(17.0); 14 | 15 | __m256 float_result = _mm256_hadd_ps(float_vec_0, float_vec_1); 16 | 17 | float* flo = (float*) &float_result; 18 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 19 | 20 | // Double-precision horizontal addition 21 | __m256d double_vec_0 = _mm256_set1_pd(8.0); 22 | __m256d double_vec_1 = _mm256_set1_pd(17.0); 23 | 24 | __m256d double_result = _mm256_hadd_pd(double_vec_0, double_vec_1); 25 | 26 | double* dou = (double*) &double_result; 27 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 28 | 29 | // 16-bit integer horizontal addition (AVX2) 30 | __m256i epi16_vec_0 = _mm256_set1_epi16(8); 31 | __m256i epi16_vec_1 = _mm256_set1_epi16(17); 32 | 33 | __m256i epi16_result = _mm256_hadd_epi16(epi16_vec_0, epi16_vec_1); 34 | 35 | short* sho = (short*) &epi16_result; 36 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 37 | 38 | // 32-bit integer horizontal addition (AVX2) 39 | __m256i epi32_vec_0 = _mm256_set1_epi32(8); 40 | __m256i epi32_vec_1 = _mm256_set1_epi32(17); 41 | 42 | __m256i epi32_result = _mm256_hadd_epi32(epi32_vec_0, epi32_vec_1); 43 | 44 | int* i = (int*) &epi32_result; 45 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /Permuting_and_Shuffling/src/Makefile: -------------------------------------------------------------------------------- 1 | # Author: TripleZ 2 | # Date: 2018-08-21 3 | 4 | CC = gcc 5 | CFLAGS = -I$(INCDIR) -mavx -mavx2 -mfma 
-msse -msse2 -msse3 -Wall 6 | 7 | INCDIR = ../include 8 | BINDIR = ../bin 9 | OBJDIR = ../obj 10 | 11 | ALL = permute permute4x64 permute2f128 permutevar permutevar8x32 \ 12 | shuffle shufflehi shufflelo 13 | 14 | $(OBJDIR)/%.o: %.c 15 | @mkdir -p $(OBJDIR) 16 | $(CC) -c -o $@ $< $(CFLAGS) 17 | 18 | all: $(ALL) 19 | 20 | shufflelo: $(patsubst %, $(OBJDIR)/%, shufflelo.o) 21 | @mkdir -p $(BINDIR) 22 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 23 | 24 | 25 | shufflehi: $(patsubst %, $(OBJDIR)/%, shufflehi.o) 26 | @mkdir -p $(BINDIR) 27 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 28 | 29 | shuffle: $(patsubst %, $(OBJDIR)/%, shuffle.o) 30 | @mkdir -p $(BINDIR) 31 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 32 | 33 | permutevar8x32: $(patsubst %, $(OBJDIR)/%, permutevar8x32.o) 34 | @mkdir -p $(BINDIR) 35 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 36 | 37 | permutevar: $(patsubst %, $(OBJDIR)/%, permutevar.o) 38 | @mkdir -p $(BINDIR) 39 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 40 | 41 | permute2f128: $(patsubst %, $(OBJDIR)/%, permute2f128.o) 42 | @mkdir -p $(BINDIR) 43 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 44 | 45 | permute4x64: $(patsubst %, $(OBJDIR)/%, permute4x64.o) 46 | @mkdir -p $(BINDIR) 47 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 48 | 49 | permute: $(patsubst %, $(OBJDIR)/%, permute.o) 50 | @mkdir -p $(BINDIR) 51 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 52 | 53 | run-%: $(BINDIR)/% 54 | $^ 55 | 56 | runall: all 57 | @echo 58 | @for exe in $(ALL); do \ 59 | echo "$(BINDIR)/$$exe"; \ 60 | $(BINDIR)/$$exe; \ 61 | echo; \ 62 | done 63 | 64 | 65 | .PHONY: all runall clean $(ALL) # these names are commands, not files the rules create 66 | clean: 67 | -@rm -rf $(BINDIR) $(OBJDIR) 68 | @echo "- Cleaned Directory 'Permuting_and_Shuffling/'" 69 | -------------------------------------------------------------------------------- /Initialization_Intrinsics/src/set1.c: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-16 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision 12 | __m256 float_vec = _mm256_set1_ps(1.0); 13 | 14 | float* flo = (float*) &float_vec; 15 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 16 | 17 | // Double-precision 18 | __m256d double_vec = _mm256_set1_pd(2.0); 19 | 20 | double* dou = (double*) &double_vec; 21 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 22 | 23 | // 32-bit integer 24 | __m256i int_vec = _mm256_set1_epi32(3); 25 | 26 | int* i = (int*) &int_vec; 27 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 28 | 29 | // 64-bit integer (the lanes are read as long long, so they must be printed with %lld) 30 | __m256i long_vec = _mm256_set1_epi64x(4); 31 | 32 | long long* lo = (long long*) &long_vec; 33 | printf("long long:\t%lld, %lld, %lld, %lld\n", lo[0], lo[1], lo[2], lo[3]); 34 | 35 | // 16-bit integer 36 | __m256i short_vec = _mm256_set1_epi16(5); 37 | 38 | short* sho = (short*) &short_vec; 39 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 40 | 41 | // 8-bit integer 42 | __m256i char_vec = _mm256_set1_epi8(6); 43 | 44 | char* c = (char*) &char_vec; 45 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10], c[11], c[12], c[13], c[14], c[15], c[16], c[17], c[18], c[19], c[20], c[21], c[22], c[23], c[24], c[25], c[26], c[27], c[28], c[29], c[30], c[31]); 46 | 47 | return 0; 48 | } 49 | 
-------------------------------------------------------------------------------- /Permuting_and_Shuffling/src/permutevar.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-21 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision permutation with 128-bit vector and 32-bit integers control value 12 | __m128 float_128_vec_0 = _mm_set_ps(4.0, 3.0, 2.0, 1.0); 13 | __m128i epi32_128_vec_0 = _mm_set_epi32(2, 1, 6, 0); 14 | 15 | __m128 float_128_result = _mm_permutevar_ps(float_128_vec_0, epi32_128_vec_0); 16 | 17 | float* flo = (float*) &float_128_result; 18 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 19 | 20 | // Double-precision permutation with 128-bit vector and 64-bit integers control value 21 | __m128d double_128_vec_0 = _mm_set_pd(6.0, 5.0); 22 | __m128i epi32_128_vec_1 = _mm_set_epi64x(2, 1); 23 | 24 | __m128d double_128_result = _mm_permutevar_pd(double_128_vec_0, epi32_128_vec_1); 25 | 26 | double* dou = (double*) &double_128_result; 27 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 28 | 29 | // Single-precision permutation with 256-bit vector and 32-bit integers control value 30 | __m256 float_256_vec_0 = _mm256_set_ps(4.0, 3.0, 2.0, 1.0, 4.0, 3.0, 2.0, 1.0); 31 | __m256i epi32_256_vec_0 = _mm256_set_epi32(2, 1, 2, 0, 2, 1, 2, 0); 32 | 33 | __m256 float_256_result = _mm256_permutevar_ps(float_256_vec_0, epi32_256_vec_0); 34 | 35 | flo = (float*) &float_256_result; 36 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 37 | 38 | // Double-precision permutation with 256-bit vector and 64-bit integers control value 39 | __m256d double_256_vec_0 = _mm256_set_pd(6.0, 5.0, 6.0, 5.0); 40 | __m256i epi32_256_vec_1 = _mm256_set_epi64x(2, 1, 2, 0); 41 | 42 | __m256d double_256_result = _mm256_permutevar_pd(double_256_vec_0, 
epi32_256_vec_1); 43 | 44 | dou = (double*) &double_256_result; 45 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/hsub.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision horizontal subtraction 12 | __m256 float_vec_0 = _mm256_set_ps(1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0); 13 | __m256 float_vec_1 = _mm256_set_ps(2.0, 7.0, 2.0, 7.0, 2.0, 7.0, 2.0, 7.0); 14 | 15 | __m256 float_result = _mm256_hsub_ps(float_vec_0, float_vec_1); 16 | 17 | float* flo = (float*) &float_result; 18 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 19 | 20 | // Double-precision horizontal subtraction 21 | __m256d double_vec_0 = _mm256_set_pd(1.0, 3.0, 1.0, 3.0); 22 | __m256d double_vec_1 = _mm256_set_pd(2.0, 7.0, 2.0, 7.0); 23 | 24 | __m256d double_result = _mm256_hsub_pd(double_vec_0, double_vec_1); 25 | 26 | double* dou = (double*) &double_result; 27 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 28 | 29 | // 16-bit integer horizontal subtraction (AVX2) 30 | __m256i epi16_vec_0 = _mm256_set_epi16(1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3); 31 | __m256i epi16_vec_1 = _mm256_set_epi16(2, 7, 2, 7, 2, 7, 2, 7, 2, 7, 2, 7, 2, 7, 2, 7); 32 | 33 | __m256i epi16_result = _mm256_hsub_epi16(epi16_vec_0, epi16_vec_1); 34 | 35 | short* sho = (short*) &epi16_result; 36 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 37 | 38 | // 32-bit integer horizontal 
subtraction (AVX2) 39 | __m256i epi32_vec_0 = _mm256_set_epi32(1, 3, 1, 3, 1, 3, 1, 3); 40 | __m256i epi32_vec_1 = _mm256_set_epi32(2, 7, 2, 7, 2, 7, 2, 7); 41 | 42 | __m256i epi32_result = _mm256_hsub_epi32(epi32_vec_0, epi32_vec_1); 43 | 44 | int* i = (int*) &epi32_result; 45 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/fmaddsub.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-21 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision multiply and add or sub with 128-bit vectors (FMA) 12 | __m128 float_128_vec_0 = _mm_set1_ps(8.0); 13 | __m128 float_128_vec_1 = _mm_set1_ps(20.0); 14 | __m128 float_128_vec_2 = _mm_set1_ps(2.0); 15 | 16 | __m128 float_128_result = _mm_fmaddsub_ps(float_128_vec_0, float_128_vec_1, float_128_vec_2); 17 | 18 | float* flo = (float*) &float_128_result; 19 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 20 | 21 | // Double-precision multiply and add or sub with 128-bit vectors (FMA) 22 | __m128d double_128_vec_0 = _mm_set1_pd(8.0); 23 | __m128d double_128_vec_1 = _mm_set1_pd(20.0); 24 | __m128d double_128_vec_2 = _mm_set1_pd(2.0); 25 | 26 | __m128d double_128_result = _mm_fmaddsub_pd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 27 | 28 | double* dou = (double*) &double_128_result; 29 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 30 | 31 | // Single-precision multiply and add or sub with 256-bit vectors (FMA) 32 | __m256 float_256_vec_0 = _mm256_set1_ps(8.0); 33 | __m256 float_256_vec_1 = _mm256_set1_ps(20.0); 34 | __m256 float_256_vec_2 = _mm256_set1_ps(2.0); 35 | 36 | __m256 float_256_result = _mm256_fmaddsub_ps(float_256_vec_0, float_256_vec_1, float_256_vec_2); 
37 | 38 | flo = (float*) &float_256_result; 39 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 40 | 41 | // Double-precision multiply and add or sub with 256-bit vectors (FMA) 42 | __m256d double_256_vec_0 = _mm256_set1_pd(8.0); 43 | __m256d double_256_vec_1 = _mm256_set1_pd(20.0); 44 | __m256d double_256_vec_2 = _mm256_set1_pd(2.0); 45 | 46 | __m256d double_256_result = _mm256_fmaddsub_pd(double_256_vec_0, double_256_vec_1, double_256_vec_2); 47 | 48 | dou = (double*) &double_256_result; 49 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 50 | 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/fmsubadd.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-21 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision multiply and sub or add with 128-bit vectors (FMA) 12 | __m128 float_128_vec_0 = _mm_set1_ps(8.0); 13 | __m128 float_128_vec_1 = _mm_set1_ps(20.0); 14 | __m128 float_128_vec_2 = _mm_set1_ps(2.0); 15 | 16 | __m128 float_128_result = _mm_fmsubadd_ps(float_128_vec_0, float_128_vec_1, float_128_vec_2); 17 | 18 | float* flo = (float*) &float_128_result; 19 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 20 | 21 | // Double-precision multiply and sub or add with 128-bit vectors (FMA) 22 | __m128d double_128_vec_0 = _mm_set1_pd(8.0); 23 | __m128d double_128_vec_1 = _mm_set1_pd(20.0); 24 | __m128d double_128_vec_2 = _mm_set1_pd(2.0); 25 | 26 | __m128d double_128_result = _mm_fmsubadd_pd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 27 | 28 | double* dou = (double*) &double_128_result; 29 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 30 | 31 | // Single-precision multiply and sub or add with 256-bit vectors 
(FMA) 32 | __m256 float_256_vec_0 = _mm256_set1_ps(8.0); 33 | __m256 float_256_vec_1 = _mm256_set1_ps(20.0); 34 | __m256 float_256_vec_2 = _mm256_set1_ps(2.0); 35 | 36 | __m256 float_256_result = _mm256_fmsubadd_ps(float_256_vec_0, float_256_vec_1, float_256_vec_2); 37 | 38 | flo = (float*) &float_256_result; 39 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 40 | 41 | // Double-precision multiply and sub or add with 256-bit vectors (FMA) 42 | __m256d double_256_vec_0 = _mm256_set1_pd(8.0); 43 | __m256d double_256_vec_1 = _mm256_set1_pd(20.0); 44 | __m256d double_256_vec_2 = _mm256_set1_pd(2.0); 45 | 46 | __m256d double_256_result = _mm256_fmsubadd_pd(double_256_vec_0, double_256_vec_1, double_256_vec_2); 47 | 48 | dou = (double*) &double_256_result; 49 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 50 | 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /Permuting_and_Shuffling/src/permute2f128.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-21 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | #include "gcc_support.h" 10 | 11 | int main(int argc, char const *argv[]) { 12 | 13 | // Single-precision permutation with two 256-bit vectors and 8-bit control value 14 | __m128 float_128_vec_0 = _mm_set1_ps(1.0); 15 | __m128 float_128_vec_1 = _mm_set1_ps(2.0); 16 | __m128 float_128_vec_2 = _mm_set1_ps(3.0); 17 | __m128 float_128_vec_3 = _mm_set1_ps(4.0); 18 | 19 | __m256 float_256_vec_0 = _mm256_set_m128(float_128_vec_1, float_128_vec_0); 20 | __m256 float_256_vec_1 = _mm256_set_m128(float_128_vec_3, float_128_vec_2); 21 | 22 | __m256 float_256_result = _mm256_permute2f128_ps(float_256_vec_0, float_256_vec_1, 0b11010010); 23 | 24 | float* flo = (float*) &float_256_result; 25 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], 
flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 26 | 27 | // Double-precision permutation with two 256-bit vectors and 8-bit control value 28 | __m128d double_128_vec_0 = _mm_set1_pd(1.0); 29 | __m128d double_128_vec_1 = _mm_set1_pd(2.0); 30 | __m128d double_128_vec_2 = _mm_set1_pd(3.0); 31 | __m128d double_128_vec_3 = _mm_set1_pd(4.0); 32 | 33 | __m256d double_256_vec_0 = _mm256_set_m128d(double_128_vec_1, double_128_vec_0); 34 | __m256d double_256_vec_1 = _mm256_set_m128d(double_128_vec_3, double_128_vec_2); 35 | 36 | __m256d double_256_result = _mm256_permute2f128_pd(double_256_vec_0, double_256_vec_1, 0b11010010); 37 | 38 | double* dou = (double*) &double_256_result; 39 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 40 | 41 | // 32-bit integer permutation with two 256-bit vectors and 8-bit control value 42 | __m128i epi32_128_vec_0 = _mm_set1_epi32(1); 43 | __m128i epi32_128_vec_1 = _mm_set1_epi32(2); 44 | __m128i epi32_128_vec_2 = _mm_set1_epi32(3); 45 | __m128i epi32_128_vec_3 = _mm_set1_epi32(4); 46 | 47 | __m256i epi32_256_vec_0 = _mm256_set_m128i(epi32_128_vec_1, epi32_128_vec_0); 48 | __m256i epi32_256_vec_1 = _mm256_set_m128i(epi32_128_vec_3, epi32_128_vec_2); 49 | 50 | __m256i epi32_256_result = _mm256_permute2f128_si256(epi32_256_vec_0, epi32_256_vec_1, 0b11010010); 51 | 52 | int* i = (int*) &epi32_256_result; 53 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 54 | 55 | return 0; 56 | } 57 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/adds.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // 8-bit signed integer addition with saturation (AVX2) 12 | __m256i epi8_vec_0 = _mm256_set1_epi8(126); 13 | __m256i epi8_vec_1 = 
_mm256_set1_epi8(100); 14 | 15 | __m256i epi8_result = _mm256_adds_epi8(epi8_vec_0, epi8_vec_1); 16 | 17 | char* c = (char*) &epi8_result; 18 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10], c[11], c[12], c[13], c[14], c[15], c[16], c[17], c[18], c[19], c[20], c[21], c[22], c[23], c[24], c[25], c[26], c[27], c[28], c[29], c[30], c[31]); 19 | 20 | // 16-bit signed integer addition with saturation (AVX2) 21 | __m256i epi16_vec_0 = _mm256_set1_epi16(32766); 22 | __m256i epi16_vec_1 = _mm256_set1_epi16(100); 23 | 24 | __m256i epi16_result = _mm256_adds_epi16(epi16_vec_0, epi16_vec_1); 25 | 26 | short* sho = (short*) &epi16_result; 27 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 28 | 29 | // 8-bit unsigned integer addition with saturation (AVX2) 30 | __m256i epu8_vec_0 = _mm256_set1_epi8(254); 31 | __m256i epu8_vec_1 = _mm256_set1_epi8(100); 32 | 33 | __m256i epu8_result = _mm256_adds_epu8(epu8_vec_0, epu8_vec_1); 34 | 35 | unsigned char* c_u = (unsigned char*) &epu8_result; 36 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c_u[0], c_u[1], c_u[2], c_u[3], c_u[4], c_u[5], c_u[6], c_u[7], c_u[8], c_u[9], c_u[10], c_u[11], c_u[12], c_u[13], c_u[14], c_u[15], c_u[16], c_u[17], c_u[18], c_u[19], c_u[20], c_u[21], c_u[22], c_u[23], c_u[24], c_u[25], c_u[26], c_u[27], c_u[28], c_u[29], c_u[30], c_u[31]); 37 | 38 | // 16-bit unsigned integer addition with saturation (AVX2) 39 | __m256i epu16_vec_0 = _mm256_set1_epi16(65534); 40 | __m256i epu16_vec_1 = _mm256_set1_epi16(100); 41 | 42 | __m256i epu16_result = _mm256_adds_epu16(epu16_vec_0, 
epu16_vec_1); 43 | 44 | unsigned short* sho_u = (unsigned short*) &epu16_result; 45 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho_u[0], sho_u[1], sho_u[2], sho_u[3], sho_u[4], sho_u[5], sho_u[6], sho_u[7], sho_u[8], sho_u[9], sho_u[10], sho_u[11], sho_u[12], sho_u[13], sho_u[14], sho_u[15]); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/subs.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // 8-bit signed integer subtraction with saturation (AVX2) 12 | __m256i epi8_vec_0 = _mm256_set1_epi8(-127); 13 | __m256i epi8_vec_1 = _mm256_set1_epi8(100); 14 | 15 | __m256i epi8_result = _mm256_subs_epi8(epi8_vec_0, epi8_vec_1); 16 | 17 | char* c = (char*) &epi8_result; 18 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10], c[11], c[12], c[13], c[14], c[15], c[16], c[17], c[18], c[19], c[20], c[21], c[22], c[23], c[24], c[25], c[26], c[27], c[28], c[29], c[30], c[31]); 19 | 20 | // 16-bit signed integer subtraction with saturation (AVX2) 21 | __m256i epi16_vec_0 = _mm256_set1_epi16(-32767); 22 | __m256i epi16_vec_1 = _mm256_set1_epi16(100); 23 | 24 | __m256i epi16_result = _mm256_subs_epi16(epi16_vec_0, epi16_vec_1); 25 | 26 | short* sho = (short*) &epi16_result; 27 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 28 | 29 | // 8-bit unsigned integer subtraction with saturation (AVX2) 30 | __m256i epu8_vec_0 = 
_mm256_set1_epi8(100); 31 | __m256i epu8_vec_1 = _mm256_set1_epi8(255); 32 | 33 | __m256i epu8_result = _mm256_subs_epu8(epu8_vec_0, epu8_vec_1); 34 | 35 | unsigned char* c_u = (unsigned char*) &epu8_result; 36 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c_u[0], c_u[1], c_u[2], c_u[3], c_u[4], c_u[5], c_u[6], c_u[7], c_u[8], c_u[9], c_u[10], c_u[11], c_u[12], c_u[13], c_u[14], c_u[15], c_u[16], c_u[17], c_u[18], c_u[19], c_u[20], c_u[21], c_u[22], c_u[23], c_u[24], c_u[25], c_u[26], c_u[27], c_u[28], c_u[29], c_u[30], c_u[31]); 37 | 38 | // 16-bit unsigned integer subtraction with saturation (AVX2) 39 | __m256i epu16_vec_0 = _mm256_set1_epi16(100); 40 | __m256i epu16_vec_1 = _mm256_set1_epi16(65535); 41 | 42 | __m256i epu16_result = _mm256_subs_epu16(epu16_vec_0, epu16_vec_1); 43 | 44 | unsigned short* sho_u = (unsigned short*) &epu16_result; 45 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho_u[0], sho_u[1], sho_u[2], sho_u[3], sho_u[4], sho_u[5], sho_u[6], sho_u[7], sho_u[8], sho_u[9], sho_u[10], sho_u[11], sho_u[12], sho_u[13], sho_u[14], sho_u[15]); 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/fmadd.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-20 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision multiply and add with 128-bit vectors (FMA) 12 | __m128 float_128_vec_0 = _mm_set1_ps(8.0); 13 | __m128 float_128_vec_1 = _mm_set1_ps(20.0); 14 | __m128 float_128_vec_2 = _mm_set1_ps(2.0); 15 | 16 | __m128 float_128_result = _mm_fmadd_ps(float_128_vec_0, float_128_vec_1, float_128_vec_2); 17 | 18 | float* flo = (float*) &float_128_result; 19 | printf("float:\t\t%f, %f, %f, %f\n", 
flo[0], flo[1], flo[2], flo[3]); 20 | 21 | // Double-precision multiply and add with 128-bit vectors (FMA) 22 | __m128d double_128_vec_0 = _mm_set1_pd(8.0); 23 | __m128d double_128_vec_1 = _mm_set1_pd(20.0); 24 | __m128d double_128_vec_2 = _mm_set1_pd(2.0); 25 | 26 | __m128d double_128_result = _mm_fmadd_pd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 27 | 28 | double* dou = (double*) &double_128_result; 29 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 30 | 31 | // Single-precision multiply and add with 256-bit vectors (FMA) 32 | __m256 float_256_vec_0 = _mm256_set1_ps(8.0); 33 | __m256 float_256_vec_1 = _mm256_set1_ps(20.0); 34 | __m256 float_256_vec_2 = _mm256_set1_ps(2.0); 35 | 36 | __m256 float_256_result = _mm256_fmadd_ps(float_256_vec_0, float_256_vec_1, float_256_vec_2); 37 | 38 | flo = (float*) &float_256_result; 39 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 40 | 41 | // Double-precision multiply and add with 256-bit vectors (FMA) 42 | __m256d double_256_vec_0 = _mm256_set1_pd(8.0); 43 | __m256d double_256_vec_1 = _mm256_set1_pd(20.0); 44 | __m256d double_256_vec_2 = _mm256_set1_pd(2.0); 45 | 46 | __m256d double_256_result = _mm256_fmadd_pd(double_256_vec_0, double_256_vec_1, double_256_vec_2); 47 | 48 | dou = (double*) &double_256_result; 49 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 50 | 51 | // Single-precision multiply and add the lowest element with 128-bit vectors (FMA) 52 | __m128 float_128_low_result = _mm_fmadd_ss(float_128_vec_0, float_128_vec_1, float_128_vec_2); 53 | 54 | flo = (float*) &float_128_low_result; 55 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 56 | 57 | // Double-precision multiply and add the lowest element with 128-bit vectors (FMA) 58 | __m128d double_128_low_result = _mm_fmadd_sd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 59 | 60 | dou = (double*) 
&double_128_low_result; 61 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 62 | 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/fmsub.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-20 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision multiply and subtract with 128-bit vectors (FMA) 12 | __m128 float_128_vec_0 = _mm_set1_ps(8.0); 13 | __m128 float_128_vec_1 = _mm_set1_ps(20.0); 14 | __m128 float_128_vec_2 = _mm_set1_ps(2.0); 15 | 16 | __m128 float_128_result = _mm_fmsub_ps(float_128_vec_0, float_128_vec_1, float_128_vec_2); 17 | 18 | float* flo = (float*) &float_128_result; 19 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 20 | 21 | // Double-precision multiply and subtract with 128-bit vectors (FMA) 22 | __m128d double_128_vec_0 = _mm_set1_pd(8.0); 23 | __m128d double_128_vec_1 = _mm_set1_pd(20.0); 24 | __m128d double_128_vec_2 = _mm_set1_pd(2.0); 25 | 26 | __m128d double_128_result = _mm_fmsub_pd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 27 | 28 | double* dou = (double*) &double_128_result; 29 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 30 | 31 | // Single-precision multiply and subtract with 256-bit vectors (FMA) 32 | __m256 float_256_vec_0 = _mm256_set1_ps(8.0); 33 | __m256 float_256_vec_1 = _mm256_set1_ps(20.0); 34 | __m256 float_256_vec_2 = _mm256_set1_ps(2.0); 35 | 36 | __m256 float_256_result = _mm256_fmsub_ps(float_256_vec_0, float_256_vec_1, float_256_vec_2); 37 | 38 | flo = (float*) &float_256_result; 39 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 40 | 41 | // Double-precision multiply and subtract with 256-bit vectors (FMA) 42 | __m256d double_256_vec_0 = _mm256_set1_pd(8.0); 43 | __m256d double_256_vec_1 = 
_mm256_set1_pd(20.0); 44 | __m256d double_256_vec_2 = _mm256_set1_pd(2.0); 45 | 46 | __m256d double_256_result = _mm256_fmsub_pd(double_256_vec_0, double_256_vec_1, double_256_vec_2); 47 | 48 | dou = (double*) &double_256_result; 49 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 50 | 51 | // Single-precision multiply and subtract the lowest element with 128-bit vectors (FMA) 52 | __m128 float_128_low_result = _mm_fmsub_ss(float_128_vec_0, float_128_vec_1, float_128_vec_2); 53 | 54 | flo = (float*) &float_128_low_result; 55 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 56 | 57 | // Double-precision multiply and subtract the lowest element with 128-bit vectors (FMA) 58 | __m128d double_128_low_result = _mm_fmsub_sd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 59 | 60 | dou = (double*) &double_128_low_result; 61 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 62 | 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/fnmadd.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-20 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision multiply negated and add with 128-bit vectors (FMA) 12 | __m128 float_128_vec_0 = _mm_set1_ps(8.0); 13 | __m128 float_128_vec_1 = _mm_set1_ps(20.0); 14 | __m128 float_128_vec_2 = _mm_set1_ps(2.0); 15 | 16 | __m128 float_128_result = _mm_fnmadd_ps(float_128_vec_0, float_128_vec_1, float_128_vec_2); 17 | 18 | float* flo = (float*) &float_128_result; 19 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 20 | 21 | // Double-precision multiply negated and add with 128-bit vectors (FMA) 22 | __m128d double_128_vec_0 = _mm_set1_pd(8.0); 23 | __m128d double_128_vec_1 = _mm_set1_pd(20.0); 24 | __m128d double_128_vec_2 = _mm_set1_pd(2.0); 25 | 26 | 
__m128d double_128_result = _mm_fnmadd_pd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 27 | 28 | double* dou = (double*) &double_128_result; 29 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 30 | 31 | // Single-precision multiply negated and add with 256-bit vectors (FMA) 32 | __m256 float_256_vec_0 = _mm256_set1_ps(8.0); 33 | __m256 float_256_vec_1 = _mm256_set1_ps(20.0); 34 | __m256 float_256_vec_2 = _mm256_set1_ps(2.0); 35 | 36 | __m256 float_256_result = _mm256_fnmadd_ps(float_256_vec_0, float_256_vec_1, float_256_vec_2); 37 | 38 | flo = (float*) &float_256_result; 39 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 40 | 41 | // Double-precision multiply negated and add with 256-bit vectors (FMA) 42 | __m256d double_256_vec_0 = _mm256_set1_pd(8.0); 43 | __m256d double_256_vec_1 = _mm256_set1_pd(20.0); 44 | __m256d double_256_vec_2 = _mm256_set1_pd(2.0); 45 | 46 | __m256d double_256_result = _mm256_fnmadd_pd(double_256_vec_0, double_256_vec_1, double_256_vec_2); 47 | 48 | dou = (double*) &double_256_result; 49 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 50 | 51 | // Single-precision multiply negated and add the lowest element with 128-bit vectors (FMA) 52 | __m128 float_128_low_result = _mm_fnmadd_ss(float_128_vec_0, float_128_vec_1, float_128_vec_2); 53 | 54 | flo = (float*) &float_128_low_result; 55 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 56 | 57 | // Double-precision multiply negated and add the lowest element with 128-bit vectors (FMA) 58 | __m128d double_128_low_result = _mm_fnmadd_sd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 59 | 60 | dou = (double*) &double_128_low_result; 61 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 62 | 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/fnmsub.c: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-20 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision multiply negated and subtract with 128-bit vectors (FMA) 12 | __m128 float_128_vec_0 = _mm_set1_ps(8.0); 13 | __m128 float_128_vec_1 = _mm_set1_ps(20.0); 14 | __m128 float_128_vec_2 = _mm_set1_ps(2.0); 15 | 16 | __m128 float_128_result = _mm_fnmsub_ps(float_128_vec_0, float_128_vec_1, float_128_vec_2); 17 | 18 | float* flo = (float*) &float_128_result; 19 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 20 | 21 | // Double-precision multiply negated and subtract with 128-bit vectors (FMA) 22 | __m128d double_128_vec_0 = _mm_set1_pd(8.0); 23 | __m128d double_128_vec_1 = _mm_set1_pd(20.0); 24 | __m128d double_128_vec_2 = _mm_set1_pd(2.0); 25 | 26 | __m128d double_128_result = _mm_fnmsub_pd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 27 | 28 | double* dou = (double*) &double_128_result; 29 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 30 | 31 | // Single-precision multiply negated and subtract with 256-bit vectors (FMA) 32 | __m256 float_256_vec_0 = _mm256_set1_ps(8.0); 33 | __m256 float_256_vec_1 = _mm256_set1_ps(20.0); 34 | __m256 float_256_vec_2 = _mm256_set1_ps(2.0); 35 | 36 | __m256 float_256_result = _mm256_fnmsub_ps(float_256_vec_0, float_256_vec_1, float_256_vec_2); 37 | 38 | flo = (float*) &float_256_result; 39 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 40 | 41 | // Double-precision multiply negated and subtract with 256-bit vectors (FMA) 42 | __m256d double_256_vec_0 = _mm256_set1_pd(8.0); 43 | __m256d double_256_vec_1 = _mm256_set1_pd(20.0); 44 | __m256d double_256_vec_2 = _mm256_set1_pd(2.0); 45 | 46 | __m256d double_256_result = _mm256_fnmsub_pd(double_256_vec_0, double_256_vec_1, double_256_vec_2); 47 | 48 | dou =
(double*) &double_256_result; 49 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 50 | 51 | // Single-precision multiply negated and subtract the lowest element with 128-bit vectors (FMA) 52 | __m128 float_128_low_result = _mm_fnmsub_ss(float_128_vec_0, float_128_vec_1, float_128_vec_2); 53 | 54 | flo = (float*) &float_128_low_result; 55 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 56 | 57 | // Double-precision multiply negated and subtract the lowest element with 128-bit vectors (FMA) 58 | __m128d double_128_low_result = _mm_fnmsub_sd(double_128_vec_0, double_128_vec_1, double_128_vec_2); 59 | 60 | dou = (double*) &double_128_low_result; 61 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 62 | 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/add.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision addition 12 | __m256 float_vec_0 = _mm256_set1_ps(8.0); 13 | __m256 float_vec_1 = _mm256_set1_ps(17.0); 14 | 15 | __m256 float_result = _mm256_add_ps(float_vec_0, float_vec_1); 16 | 17 | float* flo = (float*) &float_result; 18 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 19 | 20 | // Double-precision addition 21 | __m256d double_vec_0 = _mm256_set1_pd(8.0); 22 | __m256d double_vec_1 = _mm256_set1_pd(17.0); 23 | 24 | __m256d double_result = _mm256_add_pd(double_vec_0, double_vec_1); 25 | 26 | double* dou = (double*) &double_result; 27 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 28 | 29 | // 8-bit integer addition (AVX2) 30 | __m256i epi8_vec_0 = _mm256_set1_epi8(8); 31 | __m256i epi8_vec_1 = _mm256_set1_epi8(17); 32 | 33 | __m256i
epi8_result = _mm256_add_epi8(epi8_vec_0, epi8_vec_1); 34 | 35 | char* c = (char*) &epi8_result; 36 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10], c[11], c[12], c[13], c[14], c[15], c[16], c[17], c[18], c[19], c[20], c[21], c[22], c[23], c[24], c[25], c[26], c[27], c[28], c[29], c[30], c[31]); 37 | 38 | // 16-bit integer addition (AVX2) 39 | __m256i epi16_vec_0 = _mm256_set1_epi16(8); 40 | __m256i epi16_vec_1 = _mm256_set1_epi16(17); 41 | 42 | __m256i epi16_result = _mm256_add_epi16(epi16_vec_0, epi16_vec_1); 43 | 44 | short* sho = (short*) &epi16_result; 45 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 46 | 47 | // 32-bit integer addition (AVX2) 48 | __m256i epi32_vec_0 = _mm256_set1_epi32(8); 49 | __m256i epi32_vec_1 = _mm256_set1_epi32(17); 50 | 51 | __m256i epi32_result = _mm256_add_epi32(epi32_vec_0, epi32_vec_1); 52 | 53 | int* i = (int*) &epi32_result; 54 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 55 | 56 | // 64-bit integer addition (AVX2) 57 | __m256i epi64_vec_0 = _mm256_set1_epi64x(8); 58 | __m256i epi64_vec_1 = _mm256_set1_epi64x(17); 59 | 60 | __m256i epi64_result = _mm256_add_epi64(epi64_vec_0, epi64_vec_1); 61 | 62 | long long int* lo = (long long int*) &epi64_result; 63 | printf("long long:\t%lld, %lld, %lld, %lld\n", lo[0], lo[1], lo[2], lo[3]); 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/sub.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | 
#include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision subtraction 12 | __m256 float_vec_0 = _mm256_set1_ps(8.0); 13 | __m256 float_vec_1 = _mm256_set1_ps(17.0); 14 | 15 | __m256 float_result = _mm256_sub_ps(float_vec_0, float_vec_1); 16 | 17 | float* flo = (float*) &float_result; 18 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 19 | 20 | // Double-precision subtraction 21 | __m256d double_vec_0 = _mm256_set1_pd(8.0); 22 | __m256d double_vec_1 = _mm256_set1_pd(17.0); 23 | 24 | __m256d double_result = _mm256_sub_pd(double_vec_0, double_vec_1); 25 | 26 | double* dou = (double*) &double_result; 27 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 28 | 29 | // 8-bit integer subtraction (AVX2) 30 | __m256i epi8_vec_0 = _mm256_set1_epi8(8); 31 | __m256i epi8_vec_1 = _mm256_set1_epi8(17); 32 | 33 | __m256i epi8_result = _mm256_sub_epi8(epi8_vec_0, epi8_vec_1); 34 | 35 | char* c = (char*) &epi8_result; 36 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10], c[11], c[12], c[13], c[14], c[15], c[16], c[17], c[18], c[19], c[20], c[21], c[22], c[23], c[24], c[25], c[26], c[27], c[28], c[29], c[30], c[31]); 37 | 38 | // 16-bit integer subtraction (AVX2) 39 | __m256i epi16_vec_0 = _mm256_set1_epi16(8); 40 | __m256i epi16_vec_1 = _mm256_set1_epi16(17); 41 | 42 | __m256i epi16_result = _mm256_sub_epi16(epi16_vec_0, epi16_vec_1); 43 | 44 | short* sho = (short*) &epi16_result; 45 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 46 | 47 | // 32-bit integer subtraction (AVX2) 48 | __m256i epi32_vec_0 = _mm256_set1_epi32(8); 49 | __m256i
epi32_vec_1 = _mm256_set1_epi32(17); 50 | 51 | __m256i epi32_result = _mm256_sub_epi32(epi32_vec_0, epi32_vec_1); 52 | 53 | int* i = (int*) &epi32_result; 54 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 55 | 56 | // 64-bit integer subtraction (AVX2) 57 | __m256i epi64_vec_0 = _mm256_set1_epi64x(8); 58 | __m256i epi64_vec_1 = _mm256_set1_epi64x(17); 59 | 60 | __m256i epi64_result = _mm256_sub_epi64(epi64_vec_0, epi64_vec_1); 61 | 62 | long long int* lo = (long long int*) &epi64_result; 63 | printf("long long:\t%lld, %lld, %lld, %lld\n", lo[0], lo[1], lo[2], lo[3]); 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /Permuting_and_Shuffling/src/shuffle.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-09-04 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Single-precision shuffle with two 256-bit vectors and 8-bit control value 12 | __m256 float_256_vec_0 = _mm256_set_ps(1, 2, 3, 4, 5, 6, 7, 8); 13 | __m256 float_256_vec_1 = _mm256_set_ps(9, 10, 11, 12, 13 ,14, 15, 16); 14 | 15 | __m256 float_256_result = _mm256_shuffle_ps(float_256_vec_0, float_256_vec_1, 0b00010111); 16 | // The result should be: (max -> least) 12, 11, 3, 1 ,16, 15, 7, 5 17 | // (least -> max) 5, 7, 15, 16, 1, 3, 11, 12 18 | 19 | float* flo = (float*) &float_256_result; 20 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 21 | 22 | // Double-precision shuffle with two 256-bit vectors and 4-bit control value 23 | __m256d double_256_vec_0 = _mm256_set_pd(1, 2, 3, 4); 24 | __m256d double_256_vec_1 = _mm256_set_pd(5, 6, 7, 8); 25 | 26 | __m256d double_256_result = _mm256_shuffle_pd(double_256_vec_0, double_256_vec_1, 0b0110); 27 | // The result should be: (max -> least) 6, 1, 7, 4 28 | //
(least -> max) 4, 7, 1, 6 29 | 30 | double* dou = (double*) &double_256_result; 31 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 32 | 33 | // 32-bit integer shuffle with a 256-bit vector and 8-bit control value 34 | __m256i epi32_256_vec_0 = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); 35 | 36 | __m256i epi32_256_result = _mm256_shuffle_epi32(epi32_256_vec_0, 0b00010111); 37 | // The result should be: (max -> least) 4, 3, 3, 1, 8, 7, 7, 5 38 | // (least -> max) 5, 7, 7, 8, 1, 3, 3, 4 39 | 40 | int* i = (int*) &epi32_256_result; 41 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 42 | 43 | // 8-bit integer shuffle with 256-bit vector and another 256-bit control vector 44 | __m256i epi8_256_vec_0 = _mm256_set_epi8(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 45 | 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16); 46 | 47 | __m256i epi8_256_control_vec = _mm256_set_epi8(0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 48 | 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15); 49 | 50 | __m256i epi8_256_result = _mm256_shuffle_epi8(epi8_256_vec_0, epi8_256_control_vec); 51 | // The result should be: (max -> least) 8, 8, 8, 0, 7, 0, 7, 0, 6, 0, 6, 0, 5, 0, 5, 0, 52 | // 12, 0, 12, 0, 11, 0, 11, 0, 10, 0, 10, 0, 9, 0, 9, 0 53 | // (least -> max) 0, 9, 0, 9, 0, 10, 0, 10, 0, 11, 0, 11, 0, 12, 0, 12, 54 | // 0, 5, 0, 5, 0, 6, 0, 6, 0, 7, 0, 7, 0, 8, 8, 8 55 | 56 | char* c = (char*) &epi8_256_result; 57 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10], c[11], c[12], c[13], c[14], c[15], c[16], c[17], c[18], c[19], c[20], c[21], c[22], c[23], c[24], c[25], c[26], c[27], c[28], c[29], c[30], c[31]); 58 | 59 | return 0; 60 | 61 | } 62 | 
-------------------------------------------------------------------------------- /Arithmetic_Intrinsics/src/Makefile: -------------------------------------------------------------------------------- 1 | # Author: TripleZ 2 | # Date: 2018-08-16 3 | 4 | CC = gcc 5 | CFLAGS = -I$(INCDIR) -mavx -mavx2 -mfma -msse -msse2 -msse3 -Wall 6 | 7 | INCDIR = ../include 8 | BINDIR = ../bin 9 | OBJDIR = ../obj 10 | 11 | ALL = add sub adds subs hadd hsub hadds hsubs addsub \ 12 | mul mullo mulhi mulhrs div \ 13 | fmadd fmsub fnmadd fnmsub fmaddsub fmsubadd 14 | 15 | $(OBJDIR)/%.o: %.c 16 | @[ -d $(OBJDIR) ] || mkdir $(OBJDIR) 17 | $(CC) -c -o $@ $< $(CFLAGS) 18 | 19 | all: $(ALL) 20 | 21 | add: $(patsubst %, $(OBJDIR)/%, add.o) 22 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 23 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 24 | 25 | sub: $(patsubst %, $(OBJDIR)/%, sub.o) 26 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 27 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 28 | 29 | adds: $(patsubst %, $(OBJDIR)/%, adds.o) 30 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 31 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 32 | 33 | subs: $(patsubst %, $(OBJDIR)/%, subs.o) 34 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 35 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 36 | 37 | hadd: $(patsubst %, $(OBJDIR)/%, hadd.o) 38 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 39 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 40 | 41 | hsub: $(patsubst %, $(OBJDIR)/%, hsub.o) 42 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 43 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 44 | 45 | hadds: $(patsubst %, $(OBJDIR)/%, hadds.o) 46 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 47 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 48 | 49 | hsubs: $(patsubst %, $(OBJDIR)/%, hsubs.o) 50 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 51 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 52 | 53 | addsub: $(patsubst %, $(OBJDIR)/%, addsub.o) 54 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 55 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 56 | 57 | mul: $(patsubst %, $(OBJDIR)/%, mul.o) 58 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 59 | $(CC) -o $(BINDIR)/$@ $^ 
$(CFLAGS) 60 | 61 | mullo: $(patsubst %, $(OBJDIR)/%, mullo.o) 62 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 63 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 64 | 65 | mulhi: $(patsubst %, $(OBJDIR)/%, mulhi.o) 66 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 67 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 68 | 69 | mulhrs: $(patsubst %, $(OBJDIR)/%, mulhrs.o) 70 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 71 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 72 | 73 | div: $(patsubst %, $(OBJDIR)/%, div.o) 74 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 75 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 76 | 77 | fmadd: $(patsubst %, $(OBJDIR)/%, fmadd.o) 78 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 79 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 80 | 81 | fmsub: $(patsubst %, $(OBJDIR)/%, fmsub.o) 82 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 83 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 84 | 85 | fnmadd: $(patsubst %, $(OBJDIR)/%, fnmadd.o) 86 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 87 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 88 | 89 | fnmsub: $(patsubst %, $(OBJDIR)/%, fnmsub.o) 90 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 91 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 92 | 93 | fmaddsub: $(patsubst %, $(OBJDIR)/%, fmaddsub.o) 94 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 95 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 96 | 97 | fmsubadd: $(patsubst %, $(OBJDIR)/%, fmsubadd.o) 98 | @[ -d $(BINDIR) ] || mkdir $(BINDIR) 99 | $(CC) -o $(BINDIR)/$@ $^ $(CFLAGS) 100 | 101 | run-%: $(BINDIR)/% 102 | $^ 103 | 104 | runall: all 105 | @echo -e 106 | @for exe in $(ALL); do \ 107 | echo "$(BINDIR)/$$exe"; \ 108 | $(BINDIR)/$$exe; \ 109 | echo -e; \ 110 | done 111 | 112 | 113 | .PHONY: clean 114 | clean: 115 | -@rm -rf $(BINDIR) $(OBJDIR) 116 | @echo "- Cleaned Directory 'Arithmetic_Intrinsics/'" 117 | -------------------------------------------------------------------------------- /Initialization_Intrinsics/src/set.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-16 4 | */ 5 | 6 | #include // 
AVX/AVX2 7 | #include // SSE 8 | #include // SSE2 9 | #include 10 | 11 | #include "gcc_support.h" 12 | 13 | 14 | int main(int argc, char const *argv[]) { 15 | 16 | // Single-precision 17 | __m256 float_vec = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); 18 | 19 | float* flo = (float*) &float_vec; 20 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 21 | 22 | // Double-precision 23 | __m256d double_vec = _mm256_set_pd(9.0, 10.0, 11.0, 12.0); 24 | 25 | double* dou = (double*) &double_vec; 26 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 27 | 28 | // 32-bit integer 29 | __m256i int_vec = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); 30 | 31 | int* i = (int*) &int_vec; 32 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 33 | 34 | // 64-bit integer 35 | __m256i long_vec = _mm256_set_epi64x(9, 10, 11, 12); 36 | 37 | long long* lo = (long long*) &long_vec; 38 | printf("long long:\t%lld, %lld, %lld, %lld\n", lo[0], lo[1], lo[2], lo[3]); 39 | 40 | // 16-bit integer 41 | __m256i short_vec = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 42 | 43 | short* sho = (short*) &short_vec; 44 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 45 | 46 | // 8-bit integer 47 | __m256i char_vec = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); 48 | 49 | char* c = (char*) &char_vec; 50 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10], c[11], c[12], c[13], c[14], c[15], c[16], c[17], c[18], 
c[19], c[20], c[21], c[22], c[23], c[24], c[25], c[26], c[27], c[28], c[29], c[30], c[31]); 51 | 52 | // Set value from 128-bit single-precision vectors 53 | __m128 float_vec_128_0 = _mm_set_ps(1.0, 2.0, 3.0, 4.0); 54 | __m128 float_vec_128_1 = _mm_set_ps(5.0, 6.0, 7.0, 8.0); 55 | 56 | __m256 float_vec_256 = _mm256_set_m128(float_vec_128_1, float_vec_128_0); 57 | float* flo_256 = (float*) &float_vec_256; 58 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo_256[0], flo_256[1], flo_256[2], flo_256[3], flo_256[4], flo_256[5], flo_256[6], flo_256[7]); 59 | 60 | // Set value from 128-bit double-precision vectors 61 | __m128d double_vec_128_0 = _mm_set_pd(9.0, 10.0); 62 | __m128d double_vec_128_1 = _mm_set_pd(11.0, 12.0); 63 | 64 | __m256d double_vec_256 = _mm256_set_m128d(double_vec_128_1, double_vec_128_0); 65 | double* dou_256 = (double*) &double_vec_256; 66 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou_256[0], dou_256[1], dou_256[2], dou_256[3]); 67 | 68 | // Set value from 128-bit integer vectors 69 | __m128i int_vec_128_0 = _mm_set_epi32(1, 2, 3, 4); 70 | __m128i int_vec_128_1 = _mm_set_epi32(5, 6, 7, 8); 71 | 72 | __m256i int_vec_256 = _mm256_set_m128i(int_vec_128_1, int_vec_128_0); 73 | int* i_256 = (int*) &int_vec_256; 74 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i_256[0], i_256[1], i_256[2], i_256[3], i_256[4], i_256[5], i_256[6], i_256[7]); 75 | 76 | return 0; 77 | 78 | } 79 | -------------------------------------------------------------------------------- /Initialization_Intrinsics/src/setr.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-16 4 | */ 5 | 6 | #include // AVX/AVX2 7 | #include // SSE 8 | #include // SSE2 9 | #include 10 | 11 | #include "gcc_support.h" 12 | 13 | 14 | int main(int argc, char const *argv[]) { 15 | 16 | // Single-precision 17 | __m256 float_vec = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); 18 | 19 | float* flo = 
(float*) &float_vec; 20 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3], flo[4], flo[5], flo[6], flo[7]); 21 | 22 | // Double-precision 23 | __m256d double_vec = _mm256_setr_pd(9.0, 10.0, 11.0, 12.0); 24 | 25 | double* dou = (double*) &double_vec; 26 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou[0], dou[1], dou[2], dou[3]); 27 | 28 | // 32-bit integer 29 | __m256i int_vec = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); 30 | 31 | int* i = (int*) &int_vec; 32 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]); 33 | 34 | // 64-bit integer 35 | __m256i long_vec = _mm256_setr_epi64x(9, 10, 11, 12); 36 | 37 | long long* lo = (long long*) &long_vec; 38 | printf("long long:\t%lld, %lld, %lld, %lld\n", lo[0], lo[1], lo[2], lo[3]); 39 | 40 | // 16-bit integer 41 | __m256i short_vec = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 42 | 43 | short* sho = (short*) &short_vec; 44 | printf("short:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", sho[0], sho[1], sho[2], sho[3], sho[4], sho[5], sho[6], sho[7], sho[8], sho[9], sho[10], sho[11], sho[12], sho[13], sho[14], sho[15]); 45 | 46 | // 8-bit integer 47 | __m256i char_vec = _mm256_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); 48 | 49 | char* c = (char*) &char_vec; 50 | printf("char:\t\t%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10], c[11], c[12], c[13], c[14], c[15], c[16], c[17], c[18], c[19], c[20], c[21], c[22], c[23], c[24], c[25], c[26], c[27], c[28], c[29], c[30], c[31]); 51 | 52 | // Set value from 128-bit single-precision vectors 53 | __m128 float_vec_128_0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); 54 | __m128 float_vec_128_1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); 55 | 56 |
__m256 float_vec_256 = _mm256_setr_m128(float_vec_128_1, float_vec_128_0); 57 | float* flo_256 = (float*) &float_vec_256; 58 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo_256[0], flo_256[1], flo_256[2], flo_256[3], flo_256[4], flo_256[5], flo_256[6], flo_256[7]); 59 | 60 | // Set value from 128-bit double-precision vectors 61 | __m128d double_vec_128_0 = _mm_setr_pd(9.0, 10.0); 62 | __m128d double_vec_128_1 = _mm_setr_pd(11.0, 12.0); 63 | 64 | __m256d double_vec_256 = _mm256_setr_m128d(double_vec_128_1, double_vec_128_0); 65 | double* dou_256 = (double*) &double_vec_256; 66 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou_256[0], dou_256[1], dou_256[2], dou_256[3]); 67 | 68 | // Set value from 128-bit integer vectors 69 | __m128i int_vec_128_0 = _mm_setr_epi32(1, 2, 3, 4); 70 | __m128i int_vec_128_1 = _mm_setr_epi32(5, 6, 7, 8); 71 | 72 | __m256i int_vec_256 = _mm256_setr_m128i(int_vec_128_1, int_vec_128_0); 73 | int* i_256 = (int*) &int_vec_256; 74 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i_256[0], i_256[1], i_256[2], i_256[3], i_256[4], i_256[5], i_256[6], i_256[7]); 75 | 76 | return 0; 77 | 78 | } 79 | -------------------------------------------------------------------------------- /Initialization_Intrinsics/src/maskload.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: TripleZ 3 | * Date: 2018-08-17 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char const *argv[]) { 10 | 11 | // Mask-load 128-bit single-precision vector 12 | float floats_128[4] = {1.0, 2.0, 3.0, 4.0}; 13 | 14 | __m128i mask_128_ps = _mm_setr_epi32(-3, -2 ,-1, 0); 15 | 16 | __m128 float_vec = _mm_maskload_ps(floats_128, mask_128_ps); 17 | 18 | float* flo = (float*) &float_vec; 19 | printf("float:\t\t%f, %f, %f, %f\n", flo[0], flo[1], flo[2], flo[3]); 20 | 21 | 22 | // Mask-load 128-bit double-precision vector 23 | double doubles_128[2] = {5.0, 6.0}; 24 | 25 | __m128i mask_128_pd = _mm_setr_epi32(-1, 
-1, 1, 1); 26 | 27 | __m128d double_vec = _mm_maskload_pd(doubles_128, mask_128_pd); 28 | 29 | double* dou = (double*) &double_vec; 30 | printf("double:\t\t%lf, %lf\n", dou[0], dou[1]); 31 | 32 | 33 | // Mask-load 256-bit single-precision vector 34 | float floats_256[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; 35 | 36 | __m256i mask_256_ps = _mm256_setr_epi32(-3, -2 ,-1, 0, 1, 2, 3, 4); 37 | 38 | __m256 float_vec_256 = _mm256_maskload_ps(floats_256, mask_256_ps); 39 | 40 | float* flo_256 = (float*) &float_vec_256; 41 | printf("float:\t\t%f, %f, %f, %f, %f, %f, %f, %f\n", flo_256[0], flo_256[1], flo_256[2], flo_256[3], flo_256[4], flo_256[5], flo_256[6], flo_256[7]); 42 | 43 | 44 | // Mask-load 256-bit double-precision vector 45 | double doubles_256[4] = {5.0, 6.0, 7.0, 8.0}; 46 | 47 | __m256i mask_256_pd = _mm256_setr_epi64x(-1, 1, -1, 1); 48 | 49 | __m256d double_vec_256 = _mm256_maskload_pd(doubles_256, mask_256_pd); 50 | 51 | double* dou_256 = (double*) &double_vec_256; 52 | printf("double:\t\t%lf, %lf, %lf, %lf\n", dou_256[0], dou_256[1], dou_256[2], dou_256[3]); 53 | 54 | 55 | // Mask-load 128-bit 32-bit-integer vector (AVX2) 56 | int int32_128[4] = {1, 2, 3, 4}; 57 | 58 | __m128i mask_128_epi32 = _mm_setr_epi32(-3, -2 ,-1, 0); 59 | 60 | __m128i int32_vec_128 = _mm_maskload_epi32(int32_128, mask_128_epi32); 61 | 62 | int* i_v_32_128 = (int*) &int32_vec_128; 63 | printf("int:\t\t%d, %d, %d, %d\n", i_v_32_128[0], i_v_32_128[1], i_v_32_128[2], i_v_32_128[3]); 64 | 65 | 66 | // Mask-load 128-bit 64-bit-integer vector (AVX2) 67 | long long int int64_128[2] = {1, 2}; 68 | 69 | __m128i mask_128_epi64 = _mm_setr_epi32(-1, -1, 1, 1); 70 | 71 | __m128i int64_vec_128 = _mm_maskload_epi64(int64_128, mask_128_epi64); 72 | 73 | long long int* i_v_64_128 = (long long int*) &int64_vec_128; 74 | printf("long long:\t%lld, %lld\n", i_v_64_128[0], i_v_64_128[1]); 75 | 76 | 77 | // Mask-load 256-bit 32-bit-integer vector (AVX2) 78 | int int32_256[8] = {1, 2, 3, 4, 5, 6, 7, 8}; 
79 | 80 | __m256i mask_256_epi32 = _mm256_setr_epi32(-3, -2 ,-1, 0, 1, 2, -10, 4); 81 | 82 | __m256i int32_vec_256 = _mm256_maskload_epi32(int32_256, mask_256_epi32); 83 | 84 | int* i_v_32_256 = (int*) &int32_vec_256; 85 | printf("int:\t\t%d, %d, %d, %d, %d, %d, %d, %d\n", i_v_32_256[0], i_v_32_256[1], i_v_32_256[2], i_v_32_256[3], i_v_32_256[4], i_v_32_256[5], i_v_32_256[6], i_v_32_256[7]); 86 | 87 | 88 | // Mask-load 256-bit 64-bit-integer vector (AVX2) 89 | long long int int64_256[4] = {1, 2, 3, 4}; 90 | 91 | __m256i mask_256_epi64 = _mm256_setr_epi64x(1, -1, 1, -1); 92 | 93 | __m256i int64_vec_256 = _mm256_maskload_epi64(int64_256, mask_256_epi64); 94 | 95 | long long int* i_v_64_256 = (long long int*) &int64_vec_256; 96 | printf("long long:\t%lld, %lld, %lld, %lld\n", i_v_64_256[0], i_v_64_256[1], i_v_64_256[2], i_v_64_256[3]); 97 | 98 | 99 | return 0; 100 | } 101 | 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AVX / AVX2 Intrinsics Example Code 2 | 3 | 4 | 5 | - [AVX / AVX2 Intrinsics Example Code](#avx--avx2-intrinsics-example-code) 6 | - [Quick Start](#quick-start) 7 | - [Compile](#compile) 8 | - [Run](#run) 9 | - [Clean](#clean) 10 | - [Initialization Intrinsics](#initialization-intrinsics) 11 | - [Initialization with Scalar Values](#initialization-with-scalar-values) 12 | - [Loading Data to Memory](#loading-data-to-memory) 13 | - [Arithmetic Intrinsics](#arithmetic-intrinsics) 14 | - [Addition and Subtraction](#addition-and-subtraction) 15 | - [Multiplication and Division](#multiplication-and-division) 16 | - [Fused Multiply and Add (FMA)](#fused-multiply-and-add-fma) 17 | - [Permuting and Shuffling](#permuting-and-shuffling) 18 | - [Permuting](#permuting) 19 | - [Copyright](#copyright) 20 | 21 | 22 | 23 | ## Quick Start 24 | 25 | ### Compile 26 | 27 | ```bash 28 | $ make 29 | ``` 30 | 31 | All the source files in 
`src/` will be compiled, and the resulting binaries are written to the `bin/` directory of each subdirectory. 32 | 33 | ### Run 34 | 35 | Compile and run in a single step! 36 | 37 | Execute this command at the project root directory: 38 | 39 | ```bash 40 | $ make run 41 | ``` 42 | 43 | Then you can see all the program output on your terminal :tada: 44 | 45 | ### Clean 46 | 47 | It's easy to clean all the output files; just enter the following command at the project root directory: 48 | 49 | ```bash 50 | $ make clean 51 | ``` 52 | 53 | All the output files will then be removed! 54 | 55 | ## Initialization Intrinsics 56 | 57 | ### Initialization with Scalar Values 58 | 59 | - [setzero](Initialization_Intrinsics/src/setzero.c) 60 | > - [_mm256_setzero_ps](Initialization_Intrinsics/src/setzero.c#L12) 61 | > - [_mm256_setzero_pd](Initialization_Intrinsics/src/setzero.c#L18) 62 | > - [_mm256_setzero_si256](Initialization_Intrinsics/src/setzero.c#L24) 63 | 64 | - [set1](Initialization_Intrinsics/src/set1.c) 65 | > - [_mm256_set1_ps](Initialization_Intrinsics/src/set1.c#L12) 66 | > - [_mm256_set1_pd](Initialization_Intrinsics/src/set1.c#L18) 67 | > - [_mm256_set1_epi32](Initialization_Intrinsics/src/set1.c#L24) 68 | > - [_mm256_set1_epi64x](Initialization_Intrinsics/src/set1.c#L30) 69 | > - [_mm256_set1_epi16](Initialization_Intrinsics/src/set1.c#L36) 70 | > - [_mm256_set1_epi8](Initialization_Intrinsics/src/set1.c#L42) 71 | 72 | - [set](Initialization_Intrinsics/src/set.c) 73 | > - [_mm256_set_ps](Initialization_Intrinsics/src/set.c#L17) 74 | > - [_mm256_set_pd](Initialization_Intrinsics/src/set.c#L23) 75 | > - [_mm256_set_epi32](Initialization_Intrinsics/src/set.c#L29) 76 | > - [_mm256_set_epi64x](Initialization_Intrinsics/src/set.c#L35) 77 | > - [_mm256_set_epi16](Initialization_Intrinsics/src/set.c#L41) 78 | > - [_mm256_set_epi8](Initialization_Intrinsics/src/set.c#L47) 79 | > - [_mm256_set_m128](Initialization_Intrinsics/src/set.c#L56) 80 | > -
[_mm256_set_m128d](Initialization_Intrinsics/src/set.c#L64) 81 | > - [_mm256_set_m128i](Initialization_Intrinsics/src/set.c#L72) 82 | 83 | - [setr](Initialization_Intrinsics/src/setr.c) 84 | > - [_mm256_setr_ps](Initialization_Intrinsics/src/setr.c#L17) 85 | > - [_mm256_setr_pd](Initialization_Intrinsics/src/setr.c#L23) 86 | > - [_mm256_setr_epi32](Initialization_Intrinsics/src/setr.c#L29) 87 | > - [_mm256_setr_epi64x](Initialization_Intrinsics/src/setr.c#L35) 88 | > - [_mm256_setr_epi16](Initialization_Intrinsics/src/setr.c#L41) 89 | > - [_mm256_setr_epi8](Initialization_Intrinsics/src/setr.c#L47) 90 | > - [_mm256_setr_m128](Initialization_Intrinsics/src/setr.c#L56) 91 | > - [_mm256_setr_m128d](Initialization_Intrinsics/src/setr.c#L64) 92 | > - [_mm256_setr_m128i](Initialization_Intrinsics/src/setr.c#L72) 93 | 94 | ### Loading Data to Memory 95 | 96 | - [load](Initialization_Intrinsics/src/load.c) 97 | > - [_mm256_load_ps](Initialization_Intrinsics/src/load.c#L18) 98 | > - [_mm256_load_pd](Initialization_Intrinsics/src/load.c#L29) 99 | > - [_mm256_load_si256](Initialization_Intrinsics/src/load.c#L40) 100 | 101 | - [loadu](Initialization_Intrinsics/src/loadu.c) 102 | > - [_mm256_loadu_ps](Initialization_Intrinsics/src/loadu.c#L18) 103 | > - [_mm256_loadu_pd](Initialization_Intrinsics/src/loadu.c#L29) 104 | > - [_mm256_loadu_si256](Initialization_Intrinsics/src/loadu.c#L40) 105 | 106 | - [maskload](Initialization_Intrinsics/src/maskload.c) 107 | > - [_mm_maskload_ps](Initialization_Intrinsics/src/maskload.c#L16) 108 | > - [_mm_maskload_pd](Initialization_Intrinsics/src/maskload.c#L27) 109 | > - [_mm256_maskload_ps](Initialization_Intrinsics/src/maskload.c#L38) 110 | > - [_mm256_maskload_pd](Initialization_Intrinsics/src/maskload.c#L49) 111 | > - [_mm_maskload_epi32](Initialization_Intrinsics/src/maskload.c#L60) `AVX2` 112 | > - [_mm_maskload_epi64](Initialization_Intrinsics/src/maskload.c#L71) `AVX2` 113 | > - 
[_mm256_maskload_epi32](Initialization_Intrinsics/src/maskload.c#L82) `AVX2` 114 | > - [_mm256_maskload_epi64](Initialization_Intrinsics/src/maskload.c#L93) `AVX2` 115 | 116 | ## Arithmetic Intrinsics 117 | 118 | ### Addition and Subtraction 119 | 120 | - [add](Arithmetic_Intrinsics/src/add.c) 121 | > - [_mm256_add_ps](Arithmetic_Intrinsics/src/add.c#L15) 122 | > - [_mm256_add_pd](Arithmetic_Intrinsics/src/add.c#L24) 123 | > - [_mm256_add_epi8](Arithmetic_Intrinsics/src/add.c#L33) `AVX2` 124 | > - [_mm256_add_epi16](Arithmetic_Intrinsics/src/add.c#L42) `AVX2` 125 | > - [_mm256_add_epi32](Arithmetic_Intrinsics/src/add.c#L51) `AVX2` 126 | > - [_mm256_add_epi64](Arithmetic_Intrinsics/src/add.c#L60) `AVX2` 127 | 128 | - [sub](Arithmetic_Intrinsics/src/sub.c) 129 | > - [_mm256_sub_ps](Arithmetic_Intrinsics/src/sub.c#L15) 130 | > - [_mm256_sub_pd](Arithmetic_Intrinsics/src/sub.c#L24) 131 | > - [_mm256_sub_epi8](Arithmetic_Intrinsics/src/sub.c#L33) `AVX2` 132 | > - [_mm256_sub_epi16](Arithmetic_Intrinsics/src/sub.c#L42) `AVX2` 133 | > - [_mm256_sub_epi32](Arithmetic_Intrinsics/src/sub.c#L51) `AVX2` 134 | > - [_mm256_sub_epi64](Arithmetic_Intrinsics/src/sub.c#L60) `AVX2` 135 | 136 | - [adds](Arithmetic_Intrinsics/src/adds.c) 137 | > - [_mm256_adds_epi8](Arithmetic_Intrinsics/src/adds.c#L15) `AVX2` 138 | > - [_mm256_adds_epi16](Arithmetic_Intrinsics/src/adds.c#L24) `AVX2` 139 | > - [_mm256_adds_epu8](Arithmetic_Intrinsics/src/adds.c#L33) `AVX2` 140 | > - [_mm256_adds_epu16](Arithmetic_Intrinsics/src/adds.c#L42) `AVX2` 141 | 142 | - [subs](Arithmetic_Intrinsics/src/subs.c) 143 | > - [_mm256_subs_epi8](Arithmetic_Intrinsics/src/subs.c#L15) `AVX2` 144 | > - [_mm256_subs_epi16](Arithmetic_Intrinsics/src/subs.c#L24) `AVX2` 145 | > - [_mm256_subs_epu8](Arithmetic_Intrinsics/src/subs.c#L33) `AVX2` 146 | > - [_mm256_subs_epu16](Arithmetic_Intrinsics/src/subs.c#L42) `AVX2` 147 | 148 | - [hadd](Arithmetic_Intrinsics/src/hadd.c) 149 | > - 
[_mm256_hadd_ps](Arithmetic_Intrinsics/src/hadd.c#L15) 150 | > - [_mm256_hadd_pd](Arithmetic_Intrinsics/src/hadd.c#L24) 151 | > - [_mm256_hadd_epi16](Arithmetic_Intrinsics/src/hadd.c#L33) `AVX2` 152 | > - [_mm256_hadd_epi32](Arithmetic_Intrinsics/src/hadd.c#L42) `AVX2` 153 | 154 | - [hsub](Arithmetic_Intrinsics/src/hsub.c) 155 | > - [_mm256_hsub_ps](Arithmetic_Intrinsics/src/hsub.c#L15) 156 | > - [_mm256_hsub_pd](Arithmetic_Intrinsics/src/hsub.c#L24) 157 | > - [_mm256_hsub_epi16](Arithmetic_Intrinsics/src/hsub.c#L33) `AVX2` 158 | > - [_mm256_hsub_epi32](Arithmetic_Intrinsics/src/hsub.c#L42) `AVX2` 159 | 160 | - [hadds](Arithmetic_Intrinsics/src/hadds.c) 161 | > - [_mm256_hadds_epi16](Arithmetic_Intrinsics/src/hadds.c#L15) `AVX2` 162 | 163 | - [hsubs](Arithmetic_Intrinsics/src/hsubs.c) 164 | > - [_mm256_hsubs_epi16](Arithmetic_Intrinsics/src/hsubs.c#L15) `AVX2` 165 | 166 | - [addsub](Arithmetic_Intrinsics/src/addsub.c) 167 | > - [_mm256_addsub_ps](Arithmetic_Intrinsics/src/addsub.c#L15) 168 | > - [_mm256_addsub_pd](Arithmetic_Intrinsics/src/addsub.c#L24) 169 | 170 | ### Multiplication and Division 171 | 172 | - [mul](Arithmetic_Intrinsics/src/mul.c) 173 | > - [_mm256_mul_ps](Arithmetic_Intrinsics/src/mul.c#L15) 174 | > - [_mm256_mul_pd](Arithmetic_Intrinsics/src/mul.c#L24) 175 | > - [_mm256_mul_epi32](Arithmetic_Intrinsics/src/mul.c#L33) `AVX2` 176 | > - [_mm256_mul_epu32](Arithmetic_Intrinsics/src/mul.c#L42) `AVX2` 177 | 178 | - [mullo](Arithmetic_Intrinsics/src/mullo.c) 179 | > - [_mm256_mullo_epi16](Arithmetic_Intrinsics/src/mullo.c#L15) `AVX2` 180 | > - [_mm256_mullo_epi32](Arithmetic_Intrinsics/src/mullo.c#L24) `AVX2` 181 | 182 | - [mulhi](Arithmetic_Intrinsics/src/mulhi.c) 183 | > - [_mm256_mulhi_epi16](Arithmetic_Intrinsics/src/mulhi.c#L15) `AVX2` 184 | > - [_mm256_mulhi_epu16](Arithmetic_Intrinsics/src/mulhi.c#L24) `AVX2` 185 | 186 | - [mulhrs](Arithmetic_Intrinsics/src/mulhrs.c) 187 | > - [_mm256_mulhrs_epi16](Arithmetic_Intrinsics/src/mulhrs.c#L15) `AVX2` 188 | 
189 | - [div](Arithmetic_Intrinsics/src/div.c) 190 | > - [_mm256_div_ps](Arithmetic_Intrinsics/src/div.c#L15) 191 | > - [_mm256_div_pd](Arithmetic_Intrinsics/src/div.c#L24) 192 | 193 | ### Fused Multiply and Add (FMA) 194 | 195 | - [fmadd](Arithmetic_Intrinsics/src/fmadd.c) 196 | > - [_mm_fmadd_ps](Arithmetic_Intrinsics/src/fmadd.c#L16) `FMA` 197 | > - [_mm_fmadd_pd](Arithmetic_Intrinsics/src/fmadd.c#L26) `FMA` 198 | > - [_mm256_fmadd_ps](Arithmetic_Intrinsics/src/fmadd.c#L36) `FMA` 199 | > - [_mm256_fmadd_pd](Arithmetic_Intrinsics/src/fmadd.c#L46) `FMA` 200 | > - [_mm_fmadd_ss](Arithmetic_Intrinsics/src/fmadd.c#L52) `FMA` 201 | > - [_mm_fmadd_sd](Arithmetic_Intrinsics/src/fmadd.c#L58) `FMA` 202 | 203 | - [fmsub](Arithmetic_Intrinsics/src/fmsub.c) 204 | > - [_mm_fmsub_ps](Arithmetic_Intrinsics/src/fmsub.c#L16) `FMA` 205 | > - [_mm_fmsub_pd](Arithmetic_Intrinsics/src/fmsub.c#L26) `FMA` 206 | > - [_mm256_fmsub_ps](Arithmetic_Intrinsics/src/fmsub.c#L36) `FMA` 207 | > - [_mm256_fmsub_pd](Arithmetic_Intrinsics/src/fmsub.c#L46) `FMA` 208 | > - [_mm_fmsub_ss](Arithmetic_Intrinsics/src/fmsub.c#L52) `FMA` 209 | > - [_mm_fmsub_sd](Arithmetic_Intrinsics/src/fmsub.c#L58) `FMA` 210 | 211 | - [fnmadd](Arithmetic_Intrinsics/src/fnmadd.c) 212 | > - [_mm_fnmadd_ps](Arithmetic_Intrinsics/src/fnmadd.c#L16) `FMA` 213 | > - [_mm_fnmadd_pd](Arithmetic_Intrinsics/src/fnmadd.c#L26) `FMA` 214 | > - [_mm256_fnmadd_ps](Arithmetic_Intrinsics/src/fnmadd.c#L36) `FMA` 215 | > - [_mm256_fnmadd_pd](Arithmetic_Intrinsics/src/fnmadd.c#L46) `FMA` 216 | > - [_mm_fnmadd_ss](Arithmetic_Intrinsics/src/fnmadd.c#L52) `FMA` 217 | > - [_mm_fnmadd_sd](Arithmetic_Intrinsics/src/fnmadd.c#L58) `FMA` 218 | 219 | - [fnmsub](Arithmetic_Intrinsics/src/fnmsub.c) 220 | > - [_mm_fnmsub_ps](Arithmetic_Intrinsics/src/fnmsub.c#L16) `FMA` 221 | > - [_mm_fnmsub_pd](Arithmetic_Intrinsics/src/fnmsub.c#L26) `FMA` 222 | > - [_mm256_fnmsub_ps](Arithmetic_Intrinsics/src/fnmsub.c#L36) `FMA` 223 | > - 
[_mm256_fnmsub_pd](Arithmetic_Intrinsics/src/fnmsub.c#L46) `FMA` 224 | > - [_mm_fnmsub_ss](Arithmetic_Intrinsics/src/fnmsub.c#L52) `FMA` 225 | > - [_mm_fnmsub_sd](Arithmetic_Intrinsics/src/fnmsub.c#L58) `FMA` 226 | 227 | - [fmaddsub](Arithmetic_Intrinsics/src/fmaddsub.c) 228 | > - [_mm_fmaddsub_ps](Arithmetic_Intrinsics/src/fmaddsub.c#L16) `FMA` 229 | > - [_mm_fmaddsub_pd](Arithmetic_Intrinsics/src/fmaddsub.c#L26) `FMA` 230 | > - [_mm256_fmaddsub_ps](Arithmetic_Intrinsics/src/fmaddsub.c#L36) `FMA` 231 | > - [_mm256_fmaddsub_pd](Arithmetic_Intrinsics/src/fmaddsub.c#L46) `FMA` 232 | 233 | - [fmsubadd](Arithmetic_Intrinsics/src/fmsubadd.c) 234 | > - [_mm_fmsubadd_ps](Arithmetic_Intrinsics/src/fmsubadd.c#L16) `FMA` 235 | > - [_mm_fmsubadd_pd](Arithmetic_Intrinsics/src/fmsubadd.c#L26) `FMA` 236 | > - [_mm256_fmsubadd_ps](Arithmetic_Intrinsics/src/fmsubadd.c#L36) `FMA` 237 | > - [_mm256_fmsubadd_pd](Arithmetic_Intrinsics/src/fmsubadd.c#L46) `FMA` 238 | 239 | ## Permuting and Shuffling 240 | 241 | ### Permuting 242 | 243 | - [permute](Permuting_and_Shuffling/src/permute.c) 244 | > - [_mm_permute_ps](Permuting_and_Shuffling/src/permute.c#L14) 245 | > - [_mm_permute_pd](Permuting_and_Shuffling/src/permute.c#L22) 246 | > - [_mm256_permute_ps](Permuting_and_Shuffling/src/permute.c#L30) 247 | > - [_mm256_permute_pd](Permuting_and_Shuffling/src/permute.c#L38) 248 | 249 | - [permute4x64](Permuting_and_Shuffling/src/permute4x64.c) 250 | > - [_mm256_permute4x64_pd](Permuting_and_Shuffling/src/permute4x64.c#L14) `AVX2` 251 | > - [_mm256_permute4x64_epi64](Permuting_and_Shuffling/src/permute4x64.c#L22) `AVX2` 252 | 253 | - [permute2f128](Permuting_and_Shuffling/src/permute2f128.c) 254 | > - [_mm256_permute2f128_ps](Permuting_and_Shuffling/src/permute2f128.c#L22) 255 | > - [_mm256_permute2f128_pd](Permuting_and_Shuffling/src/permute2f128.c#L36) 256 | > - [_mm256_permute2f128_si256](Permuting_and_Shuffling/src/permute2f128.c#L50) 257 | 258 | - 
[permutevar](Permuting_and_Shuffling/src/permutevar.c) 259 | > - [_mm_permutevar_ps](Permuting_and_Shuffling/src/permutevar.c#L15) 260 | > - [_mm_permutevar_pd](Permuting_and_Shuffling/src/permutevar.c#L24) 261 | > - [_mm256_permutevar_ps](Permuting_and_Shuffling/src/permutevar.c#L33) 262 | > - [_mm256_permutevar_pd](Permuting_and_Shuffling/src/permutevar.c#L42) 263 | 264 | - [permutevar8x32](Permuting_and_Shuffling/src/permutevar8x32.c) 265 | > - [_mm256_permutevar8x32_ps](Permuting_and_Shuffling/src/permutevar8x32.c#L15) `AVX2` 266 | > - [_mm256_permutevar8x32_epi32](Permuting_and_Shuffling/src/permutevar8x32.c#L24) `AVX2` 267 | 268 | - [shuffle](Permuting_and_Shuffling/src/shuffle.c) 269 | > - [_mm256_shuffle_ps](Permuting_and_Shuffling/src/shuffle.c#L15) 270 | > - [_mm256_shuffle_pd](Permuting_and_Shuffling/src/shuffle.c#L26) 271 | > - [_mm256_shuffle_epi32](Permuting_and_Shuffling/src/shuffle.c#L36) `AVX2` 272 | > - [_mm256_shuffle_epi8](Permuting_and_Shuffling/src/shuffle.c#L50) `AVX2` 273 | 274 | - [shufflehi](Permuting_and_Shuffling/src/shufflehi.c) 275 | > - [_mm256_shufflehi_epi16](Permuting_and_Shuffling/src/shufflehi.c#L14) `AVX2` 276 | 277 | - [shufflelo](Permuting_and_Shuffling/src/shufflelo.c) 278 | > - [_mm256_shufflelo_epi16](Permuting_and_Shuffling/src/shufflelo.c#L14) `AVX2` 279 | 280 | 281 | ## Copyright 282 | 283 | This project is licensed under the [BSD 3-Clause](LICENSE) license. 284 | --------------------------------------------------------------------------------