├── PEXT_PDEP_Emu.h ├── stdafx.cpp ├── Compiler_Intrinsic_Test.h ├── GFNI_Demo.h ├── 512bFMA_DP_Ports.cpp ├── Results ├── GFNI_Output.png ├── OfficeDayTime.png ├── GFNI_PosPopcnt.png ├── PEXT_PDEP_TR3970X.png ├── GFNI_8x8_Explanation.png ├── GFNI_vs_VBMI2.txt ├── Byte2Byte_CNL.txt ├── Byte2Byte_RKL.txt ├── Zen4_expected.txt ├── HYBRID_Lakefield_CPUID806A1.txt ├── TZCNT_RKL.txt └── TZCNT_WLC.txt ├── 512bFMA_DP_Ports_Asm.asm ├── 512bFMA_DP_Ports.h ├── InstLatX64_Demo.vcxproj.user ├── KmovTest.h ├── FirstByte.h ├── AVX512_Reduce_Add.h ├── VPCLMULQDQ_Demo.h ├── targetver.h ├── .gitattributes ├── stdafx.h ├── Kmov_Test.cpp ├── AVX_VNNI_INT16_Saturated_AddSub.h ├── Zen4_Demo.h ├── AVX512_BGVSER.h ├── .gitignore ├── AVX512_Saturated_AddSub.h ├── Byte2Byte.h ├── Zen5_Demo.h ├── AVX512_DecimalPrint.h ├── Byte2Byte.cpp ├── AMX_Demo.h ├── HWBITPERM_Demo.h ├── InstLatX64_Demo.h ├── P06P1.h ├── VPCLMULQDQ_Demo.cpp ├── Misc.h ├── ConsoleColor.h ├── LZCNT_Demo.h ├── Zen5_Demo_Port.h ├── Zen4_Demo_Port.h ├── TZCNT_Demo.h ├── KmovTest_Asm.asm ├── InstLatX64_Demo.sln ├── VPCLMULQDQ_Demo_Test.cpp ├── Args.h ├── HWBITPERM_Demo_Asm.asm ├── AVX512_Reduce_Add.cpp ├── AMX_Demo.cpp ├── InstLatX64_Demo.cpp ├── FirstByte.cpp ├── AVX_VNNI_INT16_Saturated_AddSub.cpp ├── Byte2Byte_Asm.asm ├── Misc.cpp ├── README.md ├── TZCNT_Demo.cpp ├── Zen5_Demo_Imm8.h ├── TZCNT_Demo_Asm.asm ├── LZCNT_Demo.cpp ├── LZCNT_Demo_Asm.asm ├── HWBITPERM_Demo.cpp ├── Zen4_Demo_Imm8.h ├── AVX512_BGVSER.cpp ├── P06P1.cpp ├── InstLatX64_Demo.vcxproj.filters └── Args.cpp /PEXT_PDEP_Emu.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | -------------------------------------------------------------------------------- /stdafx.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | -------------------------------------------------------------------------------- /Compiler_Intrinsic_Test.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | void InstrincTest(void); 4 | -------------------------------------------------------------------------------- /GFNI_Demo.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/GFNI_Demo.h -------------------------------------------------------------------------------- /512bFMA_DP_Ports.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/512bFMA_DP_Ports.cpp -------------------------------------------------------------------------------- /Results/GFNI_Output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/GFNI_Output.png -------------------------------------------------------------------------------- /512bFMA_DP_Ports_Asm.asm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/512bFMA_DP_Ports_Asm.asm -------------------------------------------------------------------------------- /Results/OfficeDayTime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/OfficeDayTime.png -------------------------------------------------------------------------------- /Results/GFNI_PosPopcnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/GFNI_PosPopcnt.png -------------------------------------------------------------------------------- /Results/PEXT_PDEP_TR3970X.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/PEXT_PDEP_TR3970X.png -------------------------------------------------------------------------------- /Results/GFNI_8x8_Explanation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/GFNI_8x8_Explanation.png -------------------------------------------------------------------------------- /512bFMA_DP_Ports.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | extern "C" void fma_shuffle_tpt(int); 4 | extern "C" void fma_only_tpt(int); 5 | 6 | int Get_512bFMA_DP_Ports_FromOptimGuide(void); -------------------------------------------------------------------------------- /InstLatX64_Demo.vcxproj.user: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /KmovTest.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define KMOV_REPEATS 0x1000000 4 | 5 | extern "C" unsigned __int64 KmovTest01(void); 6 | extern "C" unsigned __int64 KmovTest02(void); 7 | extern "C" unsigned __int64 KmovTest03(void); 8 | 9 | -------------------------------------------------------------------------------- /FirstByte.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | __m256i _mm256_firstbyte_epu32(__m256i a, char c); 4 | __m256i _mm256_firstbyte_epu64(__m256i a, char c); 5 | __m512i _mm512_firstbyte_epu32(__m512i a, char c); 6 | __m512i _mm512_firstbyte_epu64(__m512i a, char c); -------------------------------------------------------------------------------- /AVX512_Reduce_Add.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | uint32_t 
_mm512_reduce2_add_epu8(__m512i z); 4 | uint32_t _mm512_reduce2_add_epu16(__m512i z); 5 | uint64_t _mm512_reduce2_add_epu32(__m512i z); 6 | uint64_t _mm512_reduce2_add_epu64(__m512i z); 7 | uint64_t _mm512_reduce2_add_epu128(__m512i z, uint64_t* hi); 8 | -------------------------------------------------------------------------------- /VPCLMULQDQ_Demo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* Prefix xor for entire vector register */ 4 | 5 | __m128i _mm_prefix_xor_clmul_si128(__m128i a); 6 | 7 | #if defined(__AVX2__) 8 | __m256i _mm256_prefix_xor_clmul_si256(__m256i a); 9 | #endif 10 | 11 | #if defined(__AVX512F__) 12 | __m512i _mm512_prefix_xor_clmul_si512(__m512i a); 13 | #endif 14 | -------------------------------------------------------------------------------- /targetver.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // Including SDKDDKVer.h defines the highest available Windows platform. 4 | 5 | // If you wish to build your application for a previous Windows platform, include WinSDKVer.h and 6 | // set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h. 
7 | 8 | #include 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /stdafx.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "targetver.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | #include "Misc.h" 18 | #include "CPU_Props.h" 19 | #include "InstLatX64_Demo.h" 20 | #include "512bFMA_DP_Ports.h" 21 | #include "Args.h" 22 | #include "ConsoleColor.h" 23 | 24 | -------------------------------------------------------------------------------- /Kmov_Test.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "KmovTest.h" 3 | 4 | extern CPU_Props cpu_props; 5 | 6 | using namespace std; 7 | 8 | void Kmov_Test(void) { 9 | cout << "--- KMOV + POPCNT ---" << dec << right << endl; 10 | cout << "4x KMOVW + 4x POPCNT + 3x ADD :" << (double)KmovTest01() / (double)KMOV_REPEATS << endl; 11 | cout << "3x KUNPCK + 1x KMOVQ + 1x POPCNT:" << (double)KmovTest02() / (double)KMOV_REPEATS << endl; 12 | cout << "4x KMOVW mem + POPCNT mem :" << (double)KmovTest03() / (double)KMOV_REPEATS << endl; 13 | } -------------------------------------------------------------------------------- 
/AVX_VNNI_INT16_Saturated_AddSub.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | __m128i _mm_adds_epi32(__m128i a, __m128i b); 8 | __m128i _mm_subs_epi32(__m128i a, __m128i b); 9 | 10 | __m128i _mm_adds_epu32(__m128i a, __m128i b); 11 | __m128i _mm_subs_epu32(__m128i a, __m128i b); 12 | 13 | __m256i _mm256_adds_epi32(__m256i a, __m256i b); 14 | __m256i _mm256_subs_epi32(__m256i a, __m256i b); 15 | 16 | __m256i _mm256_adds_epu32(__m256i a, __m256i b); 17 | __m256i _mm256_subs_epu32(__m256i a, __m256i b); 18 | 19 | #ifdef __cplusplus 20 | } 21 | #endif 22 | -------------------------------------------------------------------------------- /Zen4_Demo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define _ZEN4_DEMO_VERSION 0x0100 4 | 5 | #define ZEN4_REPEATS 100 6 | #define ZEN4_FUNCS 11 7 | 8 | #define ZEN4_FUNCDECL0(NAME) \ 9 | {#NAME, { \ 10 | nullptr, \ 11 | nullptr, \ 12 | nullptr, \ 13 | nullptr, \ 14 | nullptr, \ 15 | nullptr, \ 16 | nullptr, \ 17 | nullptr, \ 18 | nullptr,}}, 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | typedef unsigned __int64(__fastcall* TEST_PTR)(void); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | 30 | typedef struct { 31 | const char name[64]; 32 | TEST_PTR funcs[ZEN4_FUNCS]; 33 | } zen4_methods; 34 | -------------------------------------------------------------------------------- /AVX512_BGVSER.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | __m256i _mm256_bslli_epi256(__m256i a, int i); 4 | __m256i _mm256_bsrli_epi256(__m256i a, int i); 5 | __m256i _mm256_palignr_epi256(__m256i a, __m256i b, int i); 6 | __m256i _mm256_palignl_epi256(__m256i a, __m256i b, int i); 7 | __m256i _mm256_rotater_epi256(__m256i a, int i); 8 | __m256i _mm256_rotatel_epi256(__m256i a, int i); 9 | 10 
| __m512i _mm512_bslli_epi512(__m512i a, int i); 11 | __m512i _mm512_bsrli_epi512(__m512i a, int i); 12 | __m512i _mm512_palignr_epi512(__m512i a, __m512i b, int i); 13 | __m512i _mm512_palignl_epi512(__m512i a, __m512i b, int i); 14 | __m512i _mm512_rotater_epi512(__m512i a, int i); 15 | __m512i _mm512_rotatel_epi512(__m512i a, int i); -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # ========================= 35 | # Operating System Files 36 | # ========================= 37 | 38 | # Windows 39 | # ========================= 40 | 41 | # Windows thumbnail cache files 42 | Thumbs.db 43 | ehthumbs.db 44 | ehthumbs_vista.db 45 | 46 | # Folder config file 47 | Desktop.ini 48 | 49 | # Recycle Bin used on file shares 50 | $RECYCLE.BIN/ 51 | 52 | # Windows Installer files 53 | *.cab 54 | *.msi 55 | *.msm 56 | *.msp 57 | 58 | # Windows shortcuts 59 | *.lnk 60 | -------------------------------------------------------------------------------- /AVX512_Saturated_AddSub.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | __m512i _mm512_adds_epi32(__m512i a, __m512i b); 8 | __m512i _mm512_adds_epi64(__m512i a, __m512i b); 9 | 10 | __m512i _mm512_subs_epi32(__m512i a, __m512i b); 11 | __m512i _mm512_subs_epi64(__m512i a, __m512i b); 12 | 13 | __m512i _mm512_adds_epu32(__m512i a, __m512i b); 14 | __m512i _mm512_adds_epu64(__m512i 
a, __m512i b); 15 | 16 | __m512i _mm512_subs_epu32(__m512i a, __m512i b); 17 | __m512i _mm512_subs_epu64(__m512i a, __m512i b); 18 | 19 | __m512i _mm512_adds_Zen4_epi32(__m512i a, __m512i b); 20 | __m512i _mm512_adds_Zen4_epi64(__m512i a, __m512i b); 21 | 22 | __m512i _mm512_subs_Zen4_epi32(__m512i a, __m512i b); 23 | __m512i _mm512_subs_Zen4_epi64(__m512i a, __m512i b); 24 | 25 | __m512i _mm512_adds_Zen4_epu32(__m512i a, __m512i b); 26 | __m512i _mm512_adds_Zen4_epu64(__m512i a, __m512i b); 27 | 28 | __m512i _mm512_subs_Zen4_epu32(__m512i a, __m512i b); 29 | __m512i _mm512_subs_Zen4_epu64(__m512i a, __m512i b); 30 | 31 | #ifdef __cplusplus 32 | } 33 | #endif 34 | -------------------------------------------------------------------------------- /Byte2Byte.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define B2B_FUNCS 1 4 | #define B2B_REPEATS 1000000000.0 5 | 6 | #define B2B_FUNCDEF(METHOD) \ 7 | extern "C" unsigned __int64 __fastcall B2B_##METHOD##(void); 8 | 9 | #define B2B_FUNCDECL(NAME, METHOD, ISA) \ 10 | {#NAME, { \ 11 | B2B_##METHOD##}, \ 12 | ISA \ 13 | }, 14 | 15 | #define B2B_FUNCDECL0(NAME) \ 16 | {#NAME, {\ 17 | nullptr}, FEAT_AVX512F}, 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | typedef unsigned __int64(__fastcall* B2B_PTR)(void); 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | typedef struct { 30 | const char name[64]; 31 | B2B_PTR funcs[B2B_FUNCS]; 32 | Feats feats; 33 | } b2b_methods; 34 | 35 | B2B_FUNCDEF(MASKEDVPERMI2B_LAT) 36 | B2B_FUNCDEF(KREGROUNDTRIP_LAT) 37 | B2B_FUNCDEF(GFNI_LAT) 38 | B2B_FUNCDEF(SRLQ_LAT) 39 | B2B_FUNCDEF(BLENDMB_LAT) 40 | B2B_FUNCDEF(MINMAX_LAT) 41 | 42 | B2B_FUNCDEF(MASKEDVPERMI2B_TP) 43 | B2B_FUNCDEF(KREGROUNDTRIP_TP) 44 | B2B_FUNCDEF(GFNI_TP) 45 | B2B_FUNCDEF(SRLQ_TP) 46 | B2B_FUNCDEF(BLENDMB_TP) 47 | B2B_FUNCDEF(MINMAX_TP) 48 | -------------------------------------------------------------------------------- 
/Results/GFNI_vs_VBMI2.txt: -------------------------------------------------------------------------------- 1 | Intel AlderLake | AMD Zen4 | 2 | Core i9-12900K | Ryzen 9 7950X | 3 | VBMI2 GFNI | VBMI2 GFNI | 4 | _mm_ror_*_epi8(x128, 6) 398305 597569 | 438386 : 600854 | TSC clks 5 | _mm256_ror_*_epi8(x256, 6) 398288 597618 | 438426 : 600858 | TSC clks 6 | _mm512_ror_*_epi8(x512, 6) 497893 597594 | 500618 : 600854 | TSC clks 7 | _mm_mask_ror_*_epi8(x128, 6) 497893 995795 | 500618 : 1001294 | TSC clks 8 | _mm256_mask_ror_*_epi8(x256, 6) 748788 995832 | 710604 : 1001298 | TSC clks 9 | _mm512_mask_ror_*_epi8(x512, 6) 796543 995817 | 800950 : 1001300 | TSC clks 10 | -----------------------------------------------------|--------------------|---------- 11 | _mm_rorv_*_epi8(x128, y128) 447484 995817 | 459542 : 1001300 | TSC clks 12 | _mm256_rorv_*_epi8(x256, y256) 447295 1598831 | 459560 : 1610728 | TSC clks 13 | _mm512_rorv_*_epi8(x512, y512) 498006 1593224 | 500744 : 1607388 | TSC clks 14 | _mm_mask_rorv_*_epi8(x128, y128) 754516 2090947 | 755160 : 2107980 | TSC clks 15 | _mm256_mask_rorv_*_epi8(x256, y256) 754350 2091355 | 755204 : 2124702 | TSC clks 16 | _mm512_mask_rorv_*_epi8(x512, y512) 796692 2091043 | 801072 : 2107988 | TSC clks -------------------------------------------------------------------------------- /Zen5_Demo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define _ZEN5_DEMO_VERSION 0x0100 4 | 5 | #define ZEN5_REPEATS 100 6 | #define ZEN5_FUNCS 2 7 | #define ZEN5_FUNCS_X87 2 8 | 9 | #define ZEN5_FUNCDEF(INST, OPERANDS) \ 10 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_lat(void); \ 11 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_tp(void); 12 | 13 | #define ZEN5_FUNCDEF_X87(INST, OPERANDS) \ 14 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_lat(void); 15 | 16 | 17 | #define ZEN5_FUNCDECL(NAME, INST, OPERANDS) \ 18 | {#NAME, {\ 19 
| Zen5_##INST##_##OPERANDS##_lat, \ 20 | Zen5_##INST##_##OPERANDS##_tp \ 21 | }}, 22 | 23 | #define ZEN5_FUNCDECL_X87(NAME, INST, OPERANDS) \ 24 | {#NAME, {\ 25 | Zen5_##INST##_##OPERANDS##_lat, \ 26 | }}, 27 | 28 | #define ZEN5_FUNCDECL0(NAME) \ 29 | {#NAME, { \ 30 | nullptr, \ 31 | nullptr, \ 32 | }}, 33 | 34 | #define ZEN5_X87_FUNCDECL0(NAME) \ 35 | {#NAME, {\ 36 | nullptr, \ 37 | }}, 38 | 39 | ZEN5_FUNCDEF(empty, empty) 40 | 41 | #ifdef __cplusplus 42 | extern "C" { 43 | #endif 44 | 45 | typedef unsigned __int64(__fastcall* TEST_PTR)(void); 46 | 47 | #ifdef __cplusplus 48 | } 49 | #endif 50 | 51 | typedef struct { 52 | const char name[64]; 53 | TEST_PTR funcs[ZEN5_FUNCS]; 54 | } zen5_methods; 55 | 56 | typedef struct { 57 | const char name[64]; 58 | TEST_PTR funcs[ZEN5_FUNCS_X87]; 59 | } zen5_methods_x87; 60 | 61 | -------------------------------------------------------------------------------- /AVX512_DecimalPrint.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "stdafx.h" 3 | 4 | #define AVX512_STR_BUFF 32 5 | 6 | typedef void (*U32_PRINT_PTR)(uint32_t, char *); 7 | typedef void (*S32_PRINT_PTR)(int32_t, char *); 8 | typedef void (*U64_PRINT_PTR)(uint64_t, char *); 9 | typedef void (*S64_PRINT_PTR)(int64_t, char *); 10 | 11 | template 12 | struct AVX512_decimalprint_methods { 13 | const char name[32]; 14 | const char isaName[16]; 15 | T_FUNC func; 16 | Feats feats; 17 | bool refFlag; 18 | }; 19 | 20 | AVX512_decimalprint_methods decprints_u32[]; 21 | AVX512_decimalprint_methods decprints_s32[]; 22 | AVX512_decimalprint_methods decprints_u64[]; 23 | AVX512_decimalprint_methods decprints_s64[]; 24 | 25 | extern "C" void _ultoa_avx512ifma_asm(uint32_t, char *); 26 | extern "C" void _ultoa_avx512f_asm(uint32_t, char *); 27 | 28 | extern "C" void _ltoa_avx512ifma_asm(int32_t, char *); 29 | extern "C" void _ltoa_avx512f_asm(int32_t, char *); 30 | 31 | extern "C" void _ui64toa_avx512ifma_asm(uint64_t, 
char*); 32 | extern "C" void _ui64toa_avx512f_asm(uint64_t, char*); 33 | 34 | extern "C" void _i64toa_avx512ifma_asm(int64_t, char*); 35 | extern "C" void _i64toa_avx512f_asm(int64_t, char*); 36 | 37 | extern "C" void to_string_avx512ifma_asm(uint64_t, char*); 38 | extern "C" void to_string_avx512f_asm(uint64_t, char*); 39 | 40 | extern inline uint64_t serialized_tsc(void); 41 | 42 | -------------------------------------------------------------------------------- /Byte2Byte.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "Byte2Byte.h" 3 | 4 | extern CPU_Props cpu_props; 5 | 6 | using namespace std; 7 | 8 | b2b_methods b2b[] = { 9 | {"Masked VPERMI2B pair LAT", B2B_MASKEDVPERMI2B_LAT, FEAT_AVX512VBMI}, 10 | {"VPERMI2B pair + kreg + TERNLOG LAT", B2B_KREGROUNDTRIP_LAT, FEAT_AVX512VBMI}, 11 | {"VPERMI2B pair + GFNI + TERNLOG LAT", B2B_GFNI_LAT, FEAT_GFNI}, 12 | {"VPERMI2B pair + VPSHRQ + TERNLOG LAT", B2B_SRLQ_LAT, FEAT_AVX512VBMI}, 13 | {"VPERMI2B pair + VPBLENDMB LAT", B2B_BLENDMB_LAT, FEAT_AVX512VBMI}, 14 | {"VPERMI2B pair + VPMIN/MAXSB LAT", B2B_MINMAX_LAT, FEAT_AVX512VBMI}, 15 | 16 | {"Masked VPERMI2B pair TP ", B2B_MASKEDVPERMI2B_TP, FEAT_AVX512VBMI}, 17 | {"VPERMI2B pair + kreg + TERNLOG TP ", B2B_KREGROUNDTRIP_TP, FEAT_AVX512VBMI}, 18 | {"VPERMI2B pair + GFNI + TERNLOG TP ", B2B_GFNI_TP, FEAT_GFNI}, 19 | {"VPERMI2B pair + VPSHRQ + TERNLOG TP ", B2B_SRLQ_TP, FEAT_AVX512VBMI}, 20 | {"VPERMI2B pair + VPBLENDMB TP ", B2B_BLENDMB_TP, FEAT_AVX512VBMI}, 21 | {"VPERMI2B pair + VPMIN/MAXSB TP ", B2B_MINMAX_TP, FEAT_AVX512VBMI}, 22 | }; 23 | 24 | void Byte2ByteTest(void) { 25 | SetThread(3); 26 | cout << "--- AVX512VBMI Byte2Byte mapping ---" << dec << right << endl; 27 | for (int b = 0; b < sizeof(b2b) / sizeof(b2b_methods); b++) { 28 | if (cpu_props.IsFeat(b2b[b].feats)) 29 | cout << b2b[b].name << ':' << (double)((b2b[b].funcs[0])()) / B2B_REPEATS << endl; 30 | } 31 | } 32 | 
-------------------------------------------------------------------------------- /AMX_Demo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define _AMX_CONFIG_RESERVED_SIZE 14 4 | #define _AMX_VEX_MAX_REGISTERS 16 5 | 6 | // Intel Architecture Instruction Set Extensions and Future Features 7 | // Programming Reference May 2021 319433-044 p. 100 8 | // format of memory payload. each field is a byte. 9 | // 0: palette_id 10 | // 1: startRow (8b) 11 | // 2-15: reserved (must be zero) 12 | // 16-17: tile0.colsb -- bytes_per_row 13 | // 18-19: tile1.colsb 14 | // 20-21: tile2.colsb 15 | // ... 16 | // 46-47: tile15.colsb 17 | // 48: tile0.rows 18 | // 49: tile1.rows 19 | // 50: tile2.rows 20 | // ... 21 | // 63: tile15.rows 22 | 23 | class XTILECFG { 24 | unsigned char palette_id; 25 | unsigned char startRow; 26 | unsigned char reserved[_AMX_CONFIG_RESERVED_SIZE]; 27 | unsigned short tile_cols[_AMX_VEX_MAX_REGISTERS]; 28 | unsigned char tile_rows[_AMX_VEX_MAX_REGISTERS]; 29 | public: 30 | XTILECFG() { 31 | memset(this, 0, sizeof(XTILECFG)); 32 | }; 33 | XTILECFG(unsigned int c, unsigned int r, unsigned int tilesize, unsigned int maxreg, unsigned int p = 1) : palette_id(p), startRow(0) { 34 | memset(reserved, 0, _AMX_CONFIG_RESERVED_SIZE); 35 | for (unsigned int i = 0, ts = tilesize; (i < _AMX_VEX_MAX_REGISTERS); i++) { 36 | if ((i < maxreg) && (ts >= c * r)) { 37 | tile_cols[i] = c; 38 | tile_rows[i] = r; 39 | } else { 40 | tile_cols[i] = 0; 41 | tile_rows[i] = 0; 42 | } 43 | //std::cout << i << ' ' << ts << ' ' << c * r << std::endl; 44 | ts = ts >= (c * r) ? 
ts - (c * r) : 0; 45 | } 46 | }; 47 | void XTILECFG_reg(unsigned int c, unsigned int r, unsigned int reg) { 48 | tile_cols[reg] = c; 49 | tile_rows[reg] = r; 50 | } 51 | }; 52 | -------------------------------------------------------------------------------- /HWBITPERM_Demo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define DEPEXT219_RANDOM 10000 4 | #define DEPEXT219_RETRIES 10000 5 | #define DEPEXT219_REPEATS 5000 6 | 7 | enum BITPERM { 8 | BEXT, 9 | BDEP, 10 | BGRP 11 | }; 12 | 13 | typedef unsigned __int64 (*BITPERM_PTR)(void); 14 | 15 | typedef __m512i (__vectorcall *BITPERM_PTR2)(__m512i, __m512i); 16 | 17 | typedef struct { 18 | const char name[32]; 19 | const char isaName[16]; 20 | int bitness; 21 | BITPERM_PTR lat; 22 | BITPERM_PTR tp; 23 | BITPERM_PTR2 func; 24 | Feats feats; 25 | BITPERM type; 26 | int ref; 27 | } bitperm_methods; 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | unsigned __int64 BEXT32_HW_Lat(void); 34 | unsigned __int64 BEXT64_HW_Lat(void); 35 | unsigned __int64 BEXT32_HW_Tp(void); 36 | unsigned __int64 BEXT64_HW_Tp(void); 37 | __m512i __vectorcall BEXT32_HW(__m512i, __m512i); 38 | __m512i __vectorcall BEXT64_HW(__m512i, __m512i); 39 | 40 | unsigned __int64 BDEP32_HW_Lat(void); 41 | unsigned __int64 BDEP64_HW_Lat(void); 42 | unsigned __int64 BDEP32_HW_Tp(void); 43 | unsigned __int64 BDEP64_HW_Tp(void); 44 | __m512i __vectorcall BDEP32_HW(__m512i, __m512i); 45 | __m512i __vectorcall BDEP64_HW(__m512i, __m512i); 46 | 47 | unsigned __int64 BGRP32_HW_Lat(void); 48 | unsigned __int64 BGRP64_HW_Lat(void); 49 | unsigned __int64 BGRP32_HW_Tp(void); 50 | unsigned __int64 BGRP64_HW_Tp(void); 51 | __m512i __vectorcall BGRP32_HW(__m512i, __m512i); 52 | __m512i __vectorcall BGRP64_HW(__m512i, __m512i); 53 | 54 | unsigned int _pgrp_u32(unsigned int p, unsigned int m); 55 | unsigned __int64 _pgrp_u64(unsigned __int64 p, unsigned __int64 m); 56 | 57 | #ifdef 
__cplusplus 58 | } 59 | #endif 60 | -------------------------------------------------------------------------------- /InstLatX64_Demo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | enum demoType { 4 | DEMO_GFNI, 5 | DEMO_VPCLMLQDQ, 6 | #if defined (__AVX2__) 7 | DEMO_VNNI_SADD, 8 | #endif 9 | #if defined (_M_X64) 10 | #if defined (__AVX2__) 11 | DEMO_P06P1, 12 | DEMO_PEXT_PDEP_EMU, 13 | DEMO_FIRSTBBYTE, 14 | #endif 15 | #if defined(__AVX512F__) 16 | DEMO_RADD, 17 | DEMO_AVX512_SADD, 18 | DEMO_KMEMDST, 19 | DEMO_ZEN4, 20 | DEMO_ZEN5, 21 | DEMO_INTRINSICS, 22 | DEMO_VBMI2, 23 | DEMO_BYTE2BYTE, 24 | DEMO_LZCNT, 25 | DEMO_TZCNT, 26 | DEMO_HWBITPERM, 27 | DEMO_KMOV, 28 | DEMO_AMX, 29 | DEMO_AVX512_DECPRINT, 30 | DEMO_AVX512_BGVSER, 31 | #endif 32 | #if defined(__AVX512F__) 33 | DEMO_LAST = DEMO_AVX512_BGVSER, 34 | #elif defined (__AVX2__) 35 | DEMO_LAST = DEMO_FIRSTBBYTE, 36 | #else 37 | DEMO_LAST = DEMO_VPCLMLQDQ, 38 | #endif 39 | #else 40 | DEMO_LAST = DEMO_VPCLMLQDQ, 41 | #endif 42 | }; 43 | 44 | typedef struct { 45 | const char* demoName; 46 | const char* alias; 47 | uint64_t demoMask; 48 | Feats feats; 49 | bool publicFlag; 50 | void (*func)(void); 51 | const char* comment; 52 | } demoTypeList; 53 | 54 | void GFNI_Demo(void); 55 | void VPCLMULQDQ_Demo(void); 56 | 57 | void PEXT_PDEP_Emu_Test(void); 58 | void FirstByte_Demo(void); 59 | void AVX512_Reduce_Add_Demo(void); 60 | void AVX512_Saturated_AddSub_Demo(void); 61 | void AVX512_KMemDst_Demo(void); 62 | void Zen4_Demo(void); 63 | void Zen5_Demo(void); 64 | void AVX512_InstrincTest(void); 65 | void VBMI2_Demo(void); 66 | void Byte2ByteTest(void); 67 | void LZCNT_Test(void); 68 | void TZCNT_Test(void); 69 | void HWBITPERM_Test(void); 70 | void Kmov_Test(void); 71 | void AMX_Test(void); 72 | void AVX512_DecimalPrint_Test(void); 73 | void AVX512_BGVSER_Test(void); 74 | void AVX_VNNI_Saturated_AddSub_Demo(void); 75 | void P0601_Test(void); 76 | 
-------------------------------------------------------------------------------- /P06P1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define INSTNAMELEN 24 4 | #define INITLEN 16 5 | #define TESTCASE 8 6 | 7 | typedef unsigned __int64 (*MEASURR_PTR)(uint32_t * histo, uint64_t * data, uint64_t size); 8 | 9 | typedef unsigned __int64 (*MEASURE_PTR)(void); 10 | 11 | typedef struct { 12 | const char inst[INSTNAMELEN]; 13 | const char init[INITLEN]; 14 | MEASURE_PTR func[TESTCASE]; 15 | Feats feats; 16 | int uopscount; 17 | } measure_methods; 18 | 19 | #define P06P1_FUNCDEF(INST, INITREG) \ 20 | unsigned __int64 _##INST##_MOV_R##INITREG##_M1025_TIME(void); \ 21 | unsigned __int64 _##INST##_MOV_E##INITREG##_M1025_TIME(void); \ 22 | unsigned __int64 _##INST##_MOV_R##INITREG##_M1024_TIME(void); \ 23 | unsigned __int64 _##INST##_MOV_E##INITREG##_M1024_TIME(void); \ 24 | unsigned __int64 _##INST##_MOV_R##INITREG##_M513_TIME(void); \ 25 | unsigned __int64 _##INST##_MOV_E##INITREG##_M513_TIME(void); \ 26 | unsigned __int64 _##INST##_MOV_R##INITREG##_M512_TIME(void); \ 27 | unsigned __int64 _##INST##_MOV_E##INITREG##_M512_TIME(void); \ 28 | unsigned __int64 _##INST##_MOV_R##INITREG##_511_TIME(void); \ 29 | unsigned __int64 _##INST##_MOV_E##INITREG##_511_TIME(void); \ 30 | unsigned __int64 _##INST##_MOV_R##INITREG##_512_TIME(void); \ 31 | unsigned __int64 _##INST##_MOV_E##INITREG##_512_TIME(void); \ 32 | unsigned __int64 _##INST##_MOV_R##INITREG##_1023_TIME(void); \ 33 | unsigned __int64 _##INST##_MOV_E##INITREG##_1023_TIME(void); \ 34 | unsigned __int64 _##INST##_MOV_R##INITREG##_1024_TIME(void); \ 35 | unsigned __int64 _##INST##_MOV_E##INITREG##_1024_TIME(void); 36 | 37 | #define P06P1_FUNC(INST, FUNC, INITREG, ISA, UOPS) \ 38 | {INST, "MOV R"#INITREG", imm32", \ 39 | { \ 40 | _##FUNC##_MOV_R##INITREG##_M1025_TIME, \ 41 | _##FUNC##_MOV_R##INITREG##_M1024_TIME, \ 42 | _##FUNC##_MOV_R##INITREG##_M513_TIME, \ 43 | 
_##FUNC##_MOV_R##INITREG##_M512_TIME, \ 44 | _##FUNC##_MOV_R##INITREG##_511_TIME, \ 45 | _##FUNC##_MOV_R##INITREG##_512_TIME, \ 46 | _##FUNC##_MOV_R##INITREG##_1023_TIME, \ 47 | _##FUNC##_MOV_R##INITREG##_1024_TIME \ 48 | }, \ 49 | FEAT_##ISA##, UOPS}, 50 | -------------------------------------------------------------------------------- /VPCLMULQDQ_Demo.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "VPCLMULQDQ_Demo.h" 3 | 4 | __m128i _mm_prefix_xor_clmul_si128(__m128i a) { 5 | const __m128i full = _mm_set1_epi32(0xffffffff); 6 | __m128i clmul0_63 = _mm_clmulepi64_si128(a, full, 0x00); 7 | __m128i clmul64_127 = _mm_clmulepi64_si128(a, full, 0x01); 8 | #if !defined(__AVX512VL__) 9 | clmul64_127 = _mm_xor_si128(clmul64_127, _mm_shuffle_epi32(_mm_srai_epi32(clmul0_63, 31), 0x05)); 10 | #else 11 | clmul64_127 = _mm_xor_si128(clmul64_127, _mm_srai_epi64(clmul0_63, 63)); 12 | #endif 13 | return _mm_unpacklo_epi64(clmul0_63, clmul64_127); 14 | } 15 | 16 | #if defined(__AVX2__) 17 | __m256i _mm256_prefix_xor_clmul_si256(__m256i a) { 18 | const __m256i full = _mm256_set1_epi32(0xffffffff); 19 | __m256i clmul0_63 = _mm256_clmulepi64_epi128(a, full, 0x00); 20 | __m256i clmul64_127 = _mm256_clmulepi64_epi128(a, full, 0x01); 21 | #if !defined(__AVX512VL__) 22 | clmul64_127 = _mm256_xor_si256(clmul64_127, _mm256_shuffle_epi32(_mm256_srai_epi32(clmul0_63, 31), 0x05)); 23 | #else 24 | clmul64_127 = _mm256_xor_si256(clmul64_127, _mm256_srai_epi64(clmul0_63, 63)); 25 | #endif 26 | __m256i clmul0_127 = _mm256_unpacklo_epi64(clmul0_63, clmul64_127); 27 | #if !defined(__AVX512VL__) 28 | __m256i corr128_255 = _mm256_inserti128_si256(_mm256_setzero_si256(), _mm_shuffle_epi32(_mm_srai_epi32(_mm256_castsi256_si128(clmul0_127), 31), 0xff), 1); 29 | #else 30 | __m256i corr128_255 = _mm256_maskz_permutex_epi64(0xc, _mm256_srai_epi64(clmul64_127, 63), 0); 31 | #endif 32 | return _mm256_xor_si256(clmul0_127, 
corr128_255); 33 | } 34 | #endif 35 | 36 | #if defined(__AVX512F__) 37 | __m512i _mm512_prefix_xor_clmul_si512(__m512i a) { 38 | const __m512i full = _mm512_set1_epi32(0xffffffff); 39 | __m512i clmul0_63 = _mm512_clmulepi64_epi128(a, full, 0x00); 40 | __m512i clmul64_127 = _mm512_clmulepi64_epi128(a, full, 0x01); 41 | clmul64_127 = _mm512_xor_si512(clmul64_127, _mm512_srai_epi64(clmul0_63, 63)); 42 | __m512i clmul0_127 = _mm512_unpacklo_epi64(clmul0_63, clmul64_127); 43 | __m512i corr128_255 = _mm512_maskz_permutex_epi64(0xcc, _mm512_srai_epi64(clmul64_127, 63), 0); 44 | __m512i clmul0_255 = _mm512_xor_si512(clmul0_127, corr128_255); 45 | __m512i corr256_511 = _mm512_maskz_permutexvar_epi64(0xf0, _mm512_set1_epi64(3), _mm512_srai_epi64(clmul0_255, 63)); 46 | return _mm512_xor_si512(clmul0_255, corr256_511); 47 | } 48 | #endif 49 | -------------------------------------------------------------------------------- /Misc.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define _INSTLATX64_DEMO_TESTVECT_00 0xff7f3f1f0f070301 4 | #define _INSTLATX64_DEMO_TESTVECT_01 0x80c0e0f0f8fcfeff 5 | 6 | #define _INSTLATX64_DEMO_TESTVECT_02 0xaa55cc33a050c031 7 | #define _INSTLATX64_DEMO_TESTVECT_03 0x030c050a33cc55aa 8 | 9 | #define _INSTLATX64_DEMO_TESTVECT_04 0x080808ff08080808 10 | #define _INSTLATX64_DEMO_TESTVECT_05 0x8142241818244281 11 | 12 | #define _INSTLATX64_DEMO_TESTVECT_06 0x8040201008040201 13 | #define _INSTLATX64_DEMO_TESTVECT_07 0x0102040810204080 14 | 15 | #define _INSTLATX64_DEMO_TESTVECT_08 0x8040201008040000 16 | #define _INSTLATX64_DEMO_TESTVECT_09 0x0002040810204080 17 | 18 | #define _INSTLATX64_DEMO_TESTVECT_0A 0x8080808080808000 19 | #define _INSTLATX64_DEMO_TESTVECT_0B 0x0101010101010101 20 | 21 | #define _INSTLATX64_DEMO_TESTVECT_FE 0xfefefefefefefefe 22 | #define _INSTLATX64_DEMO_TESTVECT_FF 0xffffffffffffffff 23 | 24 | #define _INSTLATX64_DEMO_TESTMASK_8 0x5a 25 | #define 
_INSTLATX64_DEMO_TESTMASK_16 0x5a7e 26 | #define _INSTLATX64_DEMO_TESTMASK_32 0x5a7e3c18 27 | #define _INSTLATX64_DEMO_TESTMASK_64 0x07701ff13ff37ff7 28 | 29 | void printRes8(const char* name, __m128i res); 30 | void printRes8(const char* name, __m256i res); 31 | void printRes8(const char* name, __m512i res); 32 | 33 | void printRes16(const char * name, __m128i res); 34 | void printRes16(const char * name, __m256i res); 35 | void printRes16(const char * name, __m512i res); 36 | 37 | void printRes32(const char * name, __m128i res); 38 | void printRes32(const char * name, __m256i res); 39 | void printRes32(const char * name, __m512i res); 40 | 41 | void printRes(const char * name, __m128i res); 42 | void printRes(const char * name, __m256i res); 43 | void printRes(const char * name, __m512i res); 44 | 45 | void printRes(int r, const char * name, __m128i res); 46 | void printRes(int r, const char * name, __m256i res); 47 | void printRes(int r, const char * name, __m512i res); 48 | 49 | void printRes(int r, __m128i res); 50 | void printRes(int r, __m256i res); 51 | void printRes(int r, __m512i res); 52 | 53 | void random_wrap(unsigned int* random); 54 | void random_wrap(signed int* random); 55 | void random_wrap(unsigned long long* random); 56 | void random_wrap(signed long long* random); 57 | 58 | uint64_t serialized_tsc(void); 59 | 60 | void SetThread(size_t threadindex); 61 | 62 | #if defined (_M_X64) 63 | #define _ild_popcnt _mm_popcnt_u64 64 | #else 65 | #define _ild_popcnt _mm_popcnt_u32 66 | #endif 67 | 68 | -------------------------------------------------------------------------------- /ConsoleColor.h: -------------------------------------------------------------------------------- 1 | // ConsoleColor.h 2 | // Copyleft Vincent Godin 3 | // https://www.codeproject.com/articles/16431/add-color-to-your-std-cout 4 | 5 | #pragma once 6 | #include 7 | #include 8 | 9 | enum CharColor : WORD { 10 | //IRGB 11 | COLOR_BLACK = 0b0000, 12 | COLOR_DARKBLUE = 0b0001, 13 | 
COLOR_DARKGREEN = 0b0010, 14 | COLOR_LIGHTBLUE = 0b0011, 15 | COLOR_BRICK = 0b0100, 16 | COLOR_VIOLET = 0b0101, 17 | COLOR_GOLD = 0b0110, 18 | COLOR_GREY = 0b0111, 19 | COLOR_SILVER = 0b1000, 20 | COLOR_BLUE = 0b1001, 21 | COLOR_GREEN = 0b1010, 22 | COLOR_AQUA = 0b1011, 23 | COLOR_RED = 0b1100, 24 | COLOR_PURPLE = 0b1101, 25 | COLOR_YELLOW = 0b1110, 26 | COLOR_WHITE = 0b1111 27 | }; 28 | 29 | inline std::ostream& blue(std::ostream& s) 30 | { 31 | HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); 32 | SetConsoleTextAttribute(hStdout, FOREGROUND_BLUE | FOREGROUND_INTENSITY); 33 | return s; 34 | } 35 | 36 | inline std::ostream& red(std::ostream& s) 37 | { 38 | HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); 39 | SetConsoleTextAttribute(hStdout, FOREGROUND_RED | FOREGROUND_INTENSITY); 40 | return s; 41 | } 42 | 43 | inline std::ostream& green(std::ostream& s) 44 | { 45 | HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); 46 | SetConsoleTextAttribute(hStdout, FOREGROUND_GREEN | FOREGROUND_INTENSITY); 47 | return s; 48 | } 49 | 50 | inline std::ostream& yellow(std::ostream& s) 51 | { 52 | HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); 53 | SetConsoleTextAttribute(hStdout, FOREGROUND_GREEN | FOREGROUND_RED | FOREGROUND_INTENSITY); 54 | return s; 55 | } 56 | 57 | inline std::ostream& gold(std::ostream& s) 58 | { 59 | HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); 60 | SetConsoleTextAttribute(hStdout, FOREGROUND_GREEN | FOREGROUND_RED); 61 | return s; 62 | } 63 | 64 | inline std::ostream& white(std::ostream& s) 65 | { 66 | HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); 67 | SetConsoleTextAttribute(hStdout, FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE); 68 | return s; 69 | } 70 | 71 | inline std::ostream& magenta(std::ostream& s) 72 | { 73 | HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); 74 | SetConsoleTextAttribute(hStdout, FOREGROUND_RED | FOREGROUND_BLUE | FOREGROUND_INTENSITY); 75 | return s; 76 | } 77 | 78 | struct color { 79 | color(WORD attribute) 
:m_color(attribute) {}; 80 | WORD m_color; 81 | }; 82 | 83 | template 84 | std::basic_ostream<_Elem, _Traits>& 85 | operator<<(std::basic_ostream<_Elem, _Traits>& i, const color& c) 86 | { 87 | HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); 88 | SetConsoleTextAttribute(hStdout, c.m_color); 89 | return i; 90 | } 91 | 92 | // Copyleft Vincent Godin 93 | 94 | -------------------------------------------------------------------------------- /LZCNT_Demo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define LZCNT_REPEATS 0x1000000 4 | 5 | __m128i __vectorcall _mm_lzcnt_ild_epi8(__m128i a); 6 | __m256i __vectorcall _mm256_lzcnt_ild_epi8(__m256i a); 7 | __m512i __vectorcall _mm512_lzcnt_ild_epi8(__m512i a); 8 | 9 | __m128i __vectorcall _mm_lzcnt_ild_epi16(__m128i a); 10 | __m256i __vectorcall _mm256_lzcnt_ild_epi16(__m256i a); 11 | __m512i __vectorcall _mm512_lzcnt_ild_epi16(__m512i a); 12 | 13 | __m128i __vectorcall _mm_lzcnt_fp16_epi16(__m128i a); 14 | __m256i __vectorcall _mm256_lzcnt_fp16_epi16(__m256i a); 15 | __m512i __vectorcall _mm512_lzcnt_fp16_epi16(__m512i a); 16 | 17 | __m128i __vectorcall _mm_lzcnt_ild_epi8(__m128i a); 18 | __m256i __vectorcall _mm256_lzcnt_ild_epi8(__m256i a); 19 | __m512i __vectorcall _mm512_lzcnt_ild_epi8(__m512i a); 20 | 21 | __m128i __vectorcall _mm_lzcnt_ild_epi16(__m128i a); 22 | __m256i __vectorcall _mm256_lzcnt_ild_epi16(__m256i a); 23 | __m512i __vectorcall _mm512_lzcnt_ild_epi16(__m512i a); 24 | 25 | __m128i __vectorcall _mm_lzcnt_fp16_epi16(__m128i a); 26 | __m256i __vectorcall _mm256_lzcnt_fp16_epi16(__m256i a); 27 | __m512i __vectorcall _mm512_lzcnt_fp16_epi16(__m512i a); 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | unsigned __int64 _mm_lzcnt_epi8_asm_timed(void); 34 | unsigned __int64 _mm_lzcnt_gfni_epi8_asm_timed(void); 35 | unsigned __int64 _mm_lzcnt_epi16_asm_timed(void); 36 | unsigned __int64 _mm_lzcnt_fp16_epi16_asm_timed(void); 37 | 38 
; Instruction-sequence macros for the Zen 5 demo. Each macro expands to a
; straight-line run of instructions spread over eight registers (mm0-mm7,
; k0-k7 or zmm0-zmm7); none of them contains loop control of its own.
; NOTE(review): the "8clks" and "portNN" parts of the names presumably mean
; "about 8 clocks of work on execution ports NN" -- the port assignment is not
; visible in this file; confirm against the Zen 5 measurements.
; `memop1` is not defined here; it must be provided by the including file.

; 2 x 8 PMULLW, each reading and writing only its own MMX register, so the
; eight dependency chains are independent of each other.
zen5_8clks_port01_m macro
	REPEAT 2
		pmullw mm0, mm0
		pmullw mm1, mm1
		pmullw mm2, mm2
		pmullw mm3, mm3
		pmullw mm4, mm4
		pmullw mm5, mm5
		pmullw mm6, mm6
		pmullw mm7, mm7
	endm
endm

; 2 x 8 PSLLW by an immediate of 1, one per MMX register.
zen5_8clks_port23_m macro
	REPEAT 2
		psllw mm0, 1
		psllw mm1, 1
		psllw mm2, 1
		psllw mm3, 1
		psllw mm4, 1
		psllw mm5, 1
		psllw mm6, 1
		psllw mm7, 1
	endm
endm

; 2 x 8 PACKSSWB, each register packed with itself.
zen5_8clks_port12_m macro
	REPEAT 2
		packsswb mm0, mm0
		packsswb mm1, mm1
		packsswb mm2, mm2
		packsswb mm3, mm3
		packsswb mm4, mm4
		packsswb mm5, mm5
		packsswb mm6, mm6
		packsswb mm7, mm7
	endm
endm

; 2 x 8 KORB on the mask registers. Every instruction ORs a mask register with
; its successor (k7 wraps around to k0).
zen5_8clks_port03_m macro
	REPEAT 2
		korb k0, k0, k1
		korb k1, k1, k2
		korb k2, k2, k3
		korb k3, k3, k4
		korb k4, k4, k5
		korb k5, k5, k6
		korb k6, k6, k7
		korb k7, k7, k0
	endm
endm

; 4 x 8 PADDB, each register added to itself.
zen5_8clks_port0123_m macro
	REPEAT 4
		paddb mm0, mm0
		paddb mm1, mm1
		paddb mm2, mm2
		paddb mm3, mm3
		paddb mm4, mm4
		paddb mm5, mm5
		paddb mm6, mm6
		paddb mm7, mm7
	endm
endm

; 1 x 8 KMOVB from eax into each mask register (GPR -> mask move).
zen5_8clks_port1_m macro
	REPEAT 1
		kmovb k0, eax
		kmovb k1, eax
		kmovb k2, eax
		kmovb k3, eax
		kmovb k4, eax
		kmovb k5, eax
		kmovb k6, eax
		kmovb k7, eax
	endm
endm

; 2 x 8 MOVQ stores of mm0-mm7 to eight addresses spaced 20h apart, from
; memop1 - 80h up to memop1 + 60h.
zen5_8clks_port45_m macro
	REPEAT 2
		movq [memop1 - 080h], mm0
		movq [memop1 - 060h], mm1
		movq [memop1 - 040h], mm2
		movq [memop1 - 020h], mm3
		movq [memop1 + 000h], mm4
		movq [memop1 + 020h], mm5
		movq [memop1 + 040h], mm6
		movq [memop1 + 060h], mm7
	endm
endm

; 8 VPTERNLOGQ on zmm0-zmm7 (no REPEAT), with immediates 0..7. Each
; destination is also used as a source by later instructions in the sequence.
zen5_8clks_tern_m macro
	vpternlogq zmm0, zmm1, zmm2, 0
	vpternlogq zmm1, zmm2, zmm3, 1
	vpternlogq zmm2, zmm3, zmm4, 2
	vpternlogq zmm3, zmm4, zmm5, 3
	vpternlogq zmm4, zmm5, zmm6, 4
	vpternlogq zmm5, zmm6, zmm7, 5
	vpternlogq zmm6, zmm7, zmm0, 6
	vpternlogq zmm7, zmm0, zmm1, 7
endm

; 2 x 8 MOVQ loads into mm0-mm7 from the same eight addresses the store macro
; writes (memop1 - 80h .. memop1 + 60h, 20h apart).
zen5_8clks_LDs_m macro
	REPEAT 2
		movq mm0, [memop1 - 080h]
		movq mm1, [memop1 - 060h]
		movq mm2, [memop1 - 040h]
		movq mm3, [memop1 - 020h]
		movq mm4, [memop1 + 000h]
		movq mm5, [memop1 + 020h]
		movq mm6, [memop1 + 040h]
		movq mm7, [memop1 + 060h]
	endm
endm
; Instruction-sequence macros for the Zen 4 demo. Each macro expands to a
; straight-line run of instructions spread over eight registers (mm0-mm7 or
; zmm0-zmm7); none of them contains loop control of its own.
; NOTE(review): the "8clks" and "portNN" parts of the names presumably mean
; "about 8 clocks of work on execution ports NN" -- the port assignment is not
; visible in this file; confirm against the Zen 4 measurements.
; `memop1` is not defined here; it must be provided by the including file.

; 2 x 8 PADDSB, each register added to itself.
zen4_8clks_port01_m macro
	REPEAT 2
		paddsb mm0, mm0
		paddsb mm1, mm1
		paddsb mm2, mm2
		paddsb mm3, mm3
		paddsb mm4, mm4
		paddsb mm5, mm5
		paddsb mm6, mm6
		paddsb mm7, mm7
	endm
endm

; 2 x 8 PSLLW by an immediate of 1, one per MMX register.
zen4_8clks_port23_m macro
	REPEAT 2
		psllw mm0, 1
		psllw mm1, 1
		psllw mm2, 1
		psllw mm3, 1
		psllw mm4, 1
		psllw mm5, 1
		psllw mm6, 1
		psllw mm7, 1
	endm
endm

; 2 x 8 PACKSSWB, each register packed with itself.
zen4_8clks_port12_m macro
	REPEAT 2
		packsswb mm0, mm0
		packsswb mm1, mm1
		packsswb mm2, mm2
		packsswb mm3, mm3
		packsswb mm4, mm4
		packsswb mm5, mm5
		packsswb mm6, mm6
		packsswb mm7, mm7
	endm
endm

; 3 x 8 PUNPCKLBW, each register interleaved with itself.
zen4_8clks_port123_m macro
	REPEAT 3
		punpcklbw mm0, mm0
		punpcklbw mm1, mm1
		punpcklbw mm2, mm2
		punpcklbw mm3, mm3
		punpcklbw mm4, mm4
		punpcklbw mm5, mm5
		punpcklbw mm6, mm6
		punpcklbw mm7, mm7
	endm
endm

; 4 x 8 POR, each register OR-ed with itself.
zen4_8clks_port0123_m macro
	REPEAT 4
		por mm0, mm0
		por mm1, mm1
		por mm2, mm2
		por mm3, mm3
		por mm4, mm4
		por mm5, mm5
		por mm6, mm6
		por mm7, mm7
	endm
endm

; 1 x 8 MOVQ stores of mm0-mm7 to eight addresses spaced 20h apart, from
; memop1 - 80h up to memop1 + 60h.
zen4_8clks_port45_m macro
	REPEAT 1
		movq [memop1 - 080h], mm0
		movq [memop1 - 060h], mm1
		movq [memop1 - 040h], mm2
		movq [memop1 - 020h], mm3
		movq [memop1 + 000h], mm4
		movq [memop1 + 020h], mm5
		movq [memop1 + 040h], mm6
		movq [memop1 + 060h], mm7
	endm
endm

; 2 x 8 MOVQ loads into mm0-mm7 from the same eight addresses the store macro
; writes (memop1 - 80h .. memop1 + 60h, 20h apart).
zen4_8clks_LDs_m macro
	REPEAT 2
		movq mm0, [memop1 - 080h]
		movq mm1, [memop1 - 060h]
		movq mm2, [memop1 - 040h]
		movq mm3, [memop1 - 020h]
		movq mm4, [memop1 + 000h]
		movq mm5, [memop1 + 020h]
		movq mm6, [memop1 + 040h]
		movq mm7, [memop1 + 060h]
	endm
endm

; 1 x 8 PINSRW of eax into word 0 of each MMX register (GPR -> vector move).
zen4_8clks_port1_m macro
	REPEAT 1
		pinsrw mm0, eax, 0
		pinsrw mm1, eax, 0
		pinsrw mm2, eax, 0
		pinsrw mm3, eax, 0
		pinsrw mm4, eax, 0
		pinsrw mm5, eax, 0
		pinsrw mm6, eax, 0
		pinsrw mm7, eax, 0
	endm
endm

; 8 VPTERNLOGQ on zmm0-zmm7 (no REPEAT), with immediates 0..7. Each
; destination is also used as a source by later instructions in the sequence.
zen4_8clks_tern_m macro
	vpternlogq zmm0, zmm1, zmm2, 0
	vpternlogq zmm1, zmm2, zmm3, 1
	vpternlogq zmm2, zmm3, zmm4, 2
	vpternlogq zmm3, zmm4, zmm5, 3
	vpternlogq zmm4, zmm5, zmm6, 4
	vpternlogq zmm5, zmm6, zmm7, 5
	vpternlogq zmm6, zmm7, zmm0, 6
	vpternlogq zmm7, zmm0, zmm1, 7
endm
; Three timed loops that each turn the mask register k0 into a population
; count and write that count back into k0, using a different route from the
; mask register to the integer POPCNT each time.
; Every procedure takes no arguments and returns in rax the difference between
; two RDTSCP readings taken around `repeats` loop iterations.
; The ";Pn" notes on individual instructions are the author's execution-port
; annotations.

.data

; 8-byte scratch buffer used by KmovTest03 as four consecutive words.
tempmem	dq 00101010101010101h

; Iteration count of every timed loop.
repeats	equ 1000000h

.code

; k0 -> four 32-bit GPRs via KMOVW, POPCNT on each, the four counts summed
; with ADD, and the sum moved back into k0. Because k0 is rewritten at the end
; of the body, every iteration depends on the previous one.
KmovTest01 proc
	push rbx
	push rdi
	push rsi

	; All mask registers used below start as all-ones.
	kxnorq k0, k0, k0
	kxnorq k1, k1, k1
	kxnorq k2, k2, k2
	kxnorq k3, k3, k3

	; Start timestamp, fenced on both sides.
	mfence
	rdtscp
	lfence

	; Keep the start timestamp halves in esi (low) and edi (high).
	mov esi, eax
	mov edi, edx

	; Loaded after RDTSCP, which overwrites ecx.
	mov ecx, repeats

startlabel:
	kmovw ebx, k0	;P0
	kmovw edx, k0	;P0
	kmovw eax, k0	;P0
	kmovw r8d, k0	;P0

	popcnt ebx, ebx	;P1
	popcnt edx, edx	;P1
	popcnt eax, eax	;P1
	popcnt r8d, r8d	;P1

	add ebx, edx	;P0156
	add eax, r8d	;P0156
	add eax, ebx	;P0156

	kmovw k0, eax	;P5

	dec ecx
	jnz startlabel

	; End timestamp.
	mfence
	rdtscp
	lfence

	; Assemble both 64-bit timestamps: rax = end, rsi = start.
	shl rdx, 20h
	shl rdi, 20h
	or rax, rdx
	or rsi, rdi

	; Return value: elapsed TSC ticks.
	sub rax, rsi


	pop rsi
	pop rdi
	pop rbx
	ret
KmovTest01 endp

; The mask is widened inside the mask registers first: two KUNPCKWD plus one
; KUNPCKDQ build a 64-bit value in k3, which is moved to rax with one KMOVQ,
; counted with one 64-bit POPCNT and moved back into k0.
KmovTest02 proc
	push rbx
	push rdi
	push rsi

	; k0 starts as all-ones, k1-k3 start cleared.
	kxnorq k0, k0, k0
	kxorq k1, k1, k1
	kxorq k2, k2, k2
	kxorq k3, k3, k3

	; Start timestamp, fenced on both sides.
	mfence
	rdtscp
	lfence

	mov esi, eax
	mov edi, edx

	mov ecx, repeats

startlabel:
	kunpckwd k1, k0, k0	;P5
	kunpckwd k2, k0, k0	;P5
	kunpckdq k3, k1, k2	;P5

	kmovq rax, k3	;P0
	popcnt rax, rax	;P1
	kmovq k0, rax	;P5

	dec ecx
	jnz startlabel

	; End timestamp.
	mfence
	rdtscp
	lfence

	; Assemble both 64-bit timestamps: rax = end, rsi = start.
	shl rdx, 20h
	shl rdi, 20h
	or rax, rdx
	or rsi, rdi

	; Return value: elapsed TSC ticks.
	sub rax, rsi


	pop rsi
	pop rdi
	pop rbx
	ret
KmovTest02 endp

; The mask goes through memory: k0 is stored four times as a word into the
; four word slots of `tempmem`, POPCNT reads the whole qword back from memory,
; and the count is moved into k0.
KmovTest03 proc
	push rbx
	push rdi
	push rsi

	; All mask registers used below start as all-ones.
	kxnorq k0, k0, k0
	kxnorq k1, k1, k1
	kxnorq k2, k2, k2
	kxnorq k3, k3, k3

	; Start timestamp, fenced on both sides.
	mfence
	rdtscp
	lfence

	mov esi, eax
	mov edi, edx

	mov ecx, repeats

startlabel:
	kmovw word ptr [tempmem + 0], k0
	kmovw word ptr [tempmem + 2], k0
	kmovw word ptr [tempmem + 4], k0
	kmovw word ptr [tempmem + 6], k0

	; 8-byte load covering the four 2-byte stores above.
	popcnt rax, qword ptr [tempmem]

	kmovq k0, rax

	dec ecx
	jnz startlabel

	; End timestamp.
	mfence
	rdtscp
	lfence

	; Assemble both 64-bit timestamps: rax = end, rsi = start.
	shl rdx, 20h
	shl rdi, 20h
	or rax, rdx
	or rsi, rdi

	; Return value: elapsed TSC ticks.
	sub rax, rsi


	pop rsi
	pop rdi
	pop rbx
	ret
KmovTest03 endp

end
{AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX512|x64.ActiveCfg = Debug_AVX512|x64 31 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX512|x64.Build.0 = Debug_AVX512|x64 32 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_SSE|x32.ActiveCfg = Debug_SSE|Win32 33 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_SSE|x32.Build.0 = Debug_SSE|Win32 34 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_SSE|x64.ActiveCfg = Debug_SSE|x64 35 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_SSE|x64.Build.0 = Debug_SSE|x64 36 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX2|x32.ActiveCfg = Release_AVX2|Win32 37 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX2|x32.Build.0 = Release_AVX2|Win32 38 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX2|x64.ActiveCfg = Release_AVX2|x64 39 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX2|x64.Build.0 = Release_AVX2|x64 40 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX512|x32.ActiveCfg = Release_AVX512|Win32 41 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX512|x32.Build.0 = Release_AVX512|Win32 42 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX512|x64.ActiveCfg = Release_AVX512|x64 43 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX512|x64.Build.0 = Release_AVX512|x64 44 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_SSE|x32.ActiveCfg = Release_SSE|Win32 45 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_SSE|x32.Build.0 = Release_SSE|Win32 46 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_SSE|x64.ActiveCfg = Release_SSE|x64 47 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_SSE|x64.Build.0 = Release_SSE|x64 48 | EndGlobalSection 49 | GlobalSection(SolutionProperties) = preSolution 50 | HideSolutionNode = FALSE 51 | EndGlobalSection 52 | GlobalSection(ExtensibilityGlobals) = postSolution 53 | SolutionGuid = {AFC14262-C3E3-4F51-85F3-F1095E229AEE} 54 | EndGlobalSection 55 | EndGlobal 56 | -------------------------------------------------------------------------------- /Results/Byte2Byte_CNL.txt: 
-------------------------------------------------------------------------------- 1 | Vendor: "GenuineIntel" 2 | Family:6 Model:102 Stepping:3 (60663) 3 | Brand: " Intel(R) Core(TM) i3-8121U CPU @ 2.20GHz" 4 | 512b FPU DP ports : 1 5 | ---GPR---------- 6 | RDTSC : supported 7 | RDTSCP : supported 8 | CMOV : supported 9 | CMPX8 : supported 10 | CMPX16 : supported 11 | AMD64 : supported 12 | LAHF : supported 13 | MOVBE : supported 14 | ABM : supported 15 | POPCNT : supported 16 | RDRAND : supported 17 | RDSEED : supported 18 | ADX : supported 19 | BMI : supported 20 | BMI2 : supported 21 | MOVDIRI : unsupported 22 | MOVDIR64B : unsupported 23 | ---SIMD--------- 24 | SSE : supported 25 | SSE2 : supported 26 | SSE3 : supported 27 | SSSE3 : supported 28 | SSE41 : supported 29 | SSE42 : supported 30 | SSE4A : unsupported 31 | CLMUL : supported 32 | AES : supported 33 | SHA : supported 34 | AVX : supported, OS enabled 35 | AVX2 : supported, OS enabled 36 | FMA : supported, OS enabled 37 | F16C : supported, OS enabled 38 | GFNI : unsupported 39 | VAES : unsupported 40 | VPCLMULQDQ : unsupported 41 | KEYLOCK : unsupported 42 | AVX_VNNI : unsupported 43 | ---AVX512------- 44 | AVX512F : supported, OS enabled 45 | AVX512CD : supported, OS enabled 46 | AVX512ER : unsupported 47 | AVX512PF : unsupported 48 | AVX512BW : supported, OS enabled 49 | AVX512DQ : supported, OS enabled 50 | AVX512VL : supported, OS enabled 51 | AVX512VBMI : supported, OS enabled 52 | AVX512IFMA : supported, OS enabled 53 | AVX512VNNI : unsupported 54 | AVX512_4VNNIW : unsupported 55 | AVX512_4FMAPS : unsupported 56 | AVX512_VPOPCNTDQ : unsupported 57 | AVX512_BITALG : unsupported 58 | AVX512_VBMI2 : unsupported 59 | AVX512_BF16 : unsupported 60 | AVX512_VP2INTERSECT : unsupported 61 | AVX512_FP16 : unsupported 62 | ---AMX---------- 63 | AMX-BF16 : unsupported 64 | AMX-INT8 : unsupported 65 | AMX-TILE : unsupported 66 | ---CacheLine---- 67 | PREFETCHW : supported 68 | PREFETCHWT1 : unsupported 69 | 
CLFLUSH : supported 70 | CLFLUSHOPT : supported 71 | CLWB : unsupported 72 | CLZERO : unsupported 73 | CLDEMOTE : unsupported 74 | ---Misc--------- 75 | LNOP : supported 76 | SERIALIZE : unsupported 77 | HYBRID : unsupported 78 | ---Deprecated--- 79 | X87 : supported 80 | MMX : supported 81 | MMX+ : unsupported 82 | 3DNow! : unsupported 83 | 3DNow!+ : unsupported 84 | XOP : unsupported 85 | FMA4 : unsupported 86 | TBM : unsupported 87 | --- AVX512VBMI Byte2Byte mapping --- 88 | Masked VPERMI2B pair :10.1104 89 | VPERMI2B pair + kreg + TERNLOG :8.86203 90 | VPERMI2B pair + VPSHRQ + TERNLOG:8.20596 91 | =================================== 92 | -------------------------------------------------------------------------------- /Results/Byte2Byte_RKL.txt: -------------------------------------------------------------------------------- 1 | Vendor: "GenuineIntel" 2 | Family:6 Model:167 Stepping:1 (a0671) 3 | Brand: " 11th Gen Intel(R) Core(TM) i9-11900K @ 3.50GHz" 4 | 512b FPU DP ports : 1 5 | ---GPR---------- 6 | RDTSC : supported 7 | RDTSCP : supported 8 | CMOV : supported 9 | CMPX8 : supported 10 | CMPX16 : supported 11 | AMD64 : supported 12 | LAHF : supported 13 | MOVBE : supported 14 | ABM : supported 15 | POPCNT : supported 16 | RDRAND : supported 17 | RDSEED : supported 18 | ADX : supported 19 | BMI : supported 20 | BMI2 : supported 21 | MOVDIRI : unsupported 22 | MOVDIR64B : unsupported 23 | ---SIMD--------- 24 | SSE : supported 25 | SSE2 : supported 26 | SSE3 : supported 27 | SSSE3 : supported 28 | SSE41 : supported 29 | SSE42 : supported 30 | SSE4A : unsupported 31 | CLMUL : supported 32 | AES : supported 33 | SHA : supported 34 | AVX : supported, OS enabled 35 | AVX2 : supported, OS enabled 36 | FMA : supported, OS enabled 37 | F16C : supported, OS enabled 38 | GFNI : supported 39 | VAES : supported 40 | VPCLMULQDQ : supported 41 | KEYLOCK : unsupported 42 | AVX_VNNI : unsupported 43 | ---AVX512------- 44 | AVX512F : supported, OS enabled 45 | AVX512CD : 
supported, OS enabled 46 | AVX512ER : unsupported 47 | AVX512PF : unsupported 48 | AVX512BW : supported, OS enabled 49 | AVX512DQ : supported, OS enabled 50 | AVX512VL : supported, OS enabled 51 | AVX512VBMI : supported, OS enabled 52 | AVX512IFMA : supported, OS enabled 53 | AVX512VNNI : supported, OS enabled 54 | AVX512_4VNNIW : unsupported 55 | AVX512_4FMAPS : unsupported 56 | AVX512_VPOPCNTDQ : supported, OS enabled 57 | AVX512_BITALG : supported, OS enabled 58 | AVX512_VBMI2 : supported, OS enabled 59 | AVX512_BF16 : unsupported 60 | AVX512_VP2INTERSECT : unsupported 61 | AVX512_FP16 : unsupported 62 | ---AMX---------- 63 | AMX-BF16 : unsupported 64 | AMX-INT8 : unsupported 65 | AMX-TILE : unsupported 66 | ---CacheLine---- 67 | PREFETCHW : supported 68 | PREFETCHWT1 : unsupported 69 | CLFLUSH : supported 70 | CLFLUSHOPT : supported 71 | CLWB : unsupported 72 | CLZERO : unsupported 73 | CLDEMOTE : unsupported 74 | ---Misc--------- 75 | LNOP : supported 76 | SERIALIZE : unsupported 77 | HYBRID : unsupported 78 | ---Deprecated--- 79 | X87 : supported 80 | MMX : supported 81 | MMX+ : unsupported 82 | 3DNow! 
: unsupported 83 | 3DNow!+ : unsupported 84 | XOP : unsupported 85 | FMA4 : unsupported 86 | TBM : unsupported 87 | --- AVX512VBMI Byte2Byte mapping --- 88 | Masked VPERMI2B pair :10.0858 89 | VPERMI2B pair + kreg + TERNLOG :8.95898 90 | VPERMI2B pair + GFNI + TERNLOG :8.49166 91 | VPERMI2B pair + VPSHRQ + TERNLOG:8.67294 92 | =================================== 93 | -------------------------------------------------------------------------------- /VPCLMULQDQ_Demo_Test.cpp: -------------------------------------------------------------------------------- 1 | // VPCLMULQDQ_Demo.cpp 2 | 3 | #include "stdafx.h" 4 | #include "VPCLMULQDQ_Demo.h" 5 | 6 | extern CPU_Props cpu_props; 7 | 8 | using namespace std; 9 | 10 | void VPCLMULQDQ_Demo_prefix_xor(void) { 11 | //PS-XOR(x) ^ PS-XOR(y) == PS-XOR(x ^ y) 12 | unsigned long long q64_0 = 0, q64_1 = 0; 13 | #if !defined(_M_X64) 14 | while (!_rdrand32_step((unsigned int *)&q64_0)); 15 | while (!_rdrand32_step((unsigned int *)&q64_0 + 1)); 16 | while (!_rdrand32_step((unsigned int *)&q64_1)); 17 | while (!_rdrand32_step((unsigned int *)&q64_1 + 1)); 18 | #else 19 | while (!_rdrand64_step(&q64_0)); 20 | while (!_rdrand64_step(&q64_1)); 21 | #endif 22 | 23 | if (cpu_props.IsFeat(FEAT_CLMUL)) { 24 | __m128i x128 = _mm_set_epi64x(q64_0, _rotl64(q64_0, q64_0 & 0x3f)); 25 | __m128i y128 = _mm_set_epi64x(q64_1, _rotl64(q64_1, q64_1 & 0x3f)); 26 | 27 | __m128i test128 = _mm_xor_si128( 28 | _mm_xor_si128(_mm_prefix_xor_clmul_si128(x128), _mm_prefix_xor_clmul_si128(y128)), 29 | _mm_prefix_xor_clmul_si128(_mm_xor_si128(x128, y128))); 30 | assert(_mm_testz_si128(test128, test128)); 31 | printRes("x128 ", x128); 32 | printRes("_mm_prefix_xor_clmul_si128 ", _mm_prefix_xor_clmul_si128(x128)); 33 | } 34 | #if defined(__AVX2__) 35 | if (cpu_props.IsFeat(FEAT_AVX_VPCLMULQDQ)) { 36 | unsigned long long q64_2 = 0, q64_3 = 0; 37 | #if !defined(_M_X64) 38 | while (!_rdrand32_step((unsigned int *)&q64_2)); 39 | while (!_rdrand32_step((unsigned int 
*)&q64_2 + 1)); 40 | while (!_rdrand32_step((unsigned int *)&q64_3)); 41 | while (!_rdrand32_step((unsigned int *)&q64_3 + 1)); 42 | #else 43 | while (!_rdrand64_step(&q64_2)); 44 | while (!_rdrand64_step(&q64_3)); 45 | #endif 46 | __m256i x256 = _mm256_set_epi64x(q64_0, _rotl64(q64_0, q64_0 & 0x3f), q64_2, _rotl64(q64_2, q64_2 & 0x3f)); 47 | __m256i y256 = _mm256_set_epi64x(q64_1, _rotl64(q64_1, q64_1 & 0x3f), q64_3, _rotl64(q64_3, q64_3 & 0x3f)); 48 | 49 | __m256i test256 = _mm256_xor_si256( 50 | _mm256_xor_si256(_mm256_prefix_xor_clmul_si256(x256), _mm256_prefix_xor_clmul_si256(y256)), 51 | _mm256_prefix_xor_clmul_si256(_mm256_xor_si256(x256, y256))); 52 | assert(_mm256_testz_si256(test256, test256)); 53 | 54 | printRes("x256 ", x256); 55 | printRes("_mm256_prefix_xor_clmul_si256 ", _mm256_prefix_xor_clmul_si256(x256)); 56 | } 57 | #endif 58 | #if defined(__AVX512F__) 59 | if (cpu_props.IsFeat(FEAT_AVX512_VPCLMULQDQ)) { 60 | unsigned long long q64_2 = 0, q64_3 = 0; 61 | #if !defined(_M_X64) 62 | while (!_rdrand32_step((unsigned int *)&q64_2)); 63 | while (!_rdrand32_step((unsigned int *)&q64_2 + 1)); 64 | while (!_rdrand32_step((unsigned int *)&q64_3)); 65 | while (!_rdrand32_step((unsigned int *)&q64_3 + 1)); 66 | #else 67 | while (!_rdrand64_step(&q64_2)); 68 | while (!_rdrand64_step(&q64_3)); 69 | #endif 70 | __m512i x512 = _mm512_set_epi64(q64_0, _rotl64(q64_0, q64_0 & 0x3f), q64_2, _rotl64(q64_2, q64_2 & 0x3f), q64_1, _rotl64(q64_1, q64_0 & 0x3f), q64_3, _rotl64(q64_2, q64_2 & 0x3f)); 71 | __m512i y512 = _mm512_set_epi64(q64_1, _rotl64(q64_1, q64_1 & 0x3f), q64_3, _rotl64(q64_3, q64_3 & 0x3f), q64_0, _rotl64(q64_0, q64_1 & 0x3f), q64_2, _rotl64(q64_3, q64_3 & 0x3f)); 72 | 73 | __mmask64 test512 =_mm512_cmpeq_epi8_mask( 74 | _mm512_xor_si512(_mm512_prefix_xor_clmul_si512(x512), _mm512_prefix_xor_clmul_si512(y512)), 75 | _mm512_prefix_xor_clmul_si512(_mm512_xor_si512(x512, y512))); 76 | assert(test512); 77 | 78 | printRes("x512 ", x512); 79 | 
printRes("_mm512_prefix_xor_clmul_si512 ", _mm512_prefix_xor_clmul_si512(x512)); 80 | } 81 | #endif 82 | } 83 | 84 | void VPCLMULQDQ_Demo(void) { 85 | cout << "-----------------------------------" << endl; 86 | VPCLMULQDQ_Demo_prefix_xor(); 87 | } 88 | -------------------------------------------------------------------------------- /Args.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if defined(__AVX512F__) 4 | #define ISA_FILENAME "AVX512" 5 | #elif defined (__AVX2__) 6 | #define ISA_FILENAME "AVX2" 7 | #else 8 | #define ISA_FILENAME "SSE" 9 | #endif 10 | 11 | #if defined (_M_X64) 12 | #define SOLUTION_FILENAME "_X64" 13 | #else 14 | #define SOLUTION_FILENAME "_X32" 15 | #endif 16 | 17 | #if defined (_DEBUG) 18 | #define DEBUG_FILENAME "_Debug" 19 | #else 20 | #define DEBUG_FILENAME 21 | #endif 22 | 23 | #define DEMO_FILENAME "InstLatX64_Demo_" ISA_FILENAME SOLUTION_FILENAME DEBUG_FILENAME ".exe" 24 | 25 | #define STR_MAXLEN 256 26 | #define MAX_ARGERROR 8 27 | #define MAX_DEMOMASK 1 28 | #define MAX_TSCRATIO 2.0 29 | 30 | #define ARGERR_INV_CHAR "Invalid character: " 31 | #define ARGERR_INV_PARAM "Invalid parameter: " 32 | #define ARGERR_INV_SWITCH "Invalid switch: " 33 | #define ARGERR_INV_DEMO "Invalid demo type: " 34 | #define ARGERR_INV_CPUIDFILE "Invalid CPUID filename: " 35 | #define ARGERR_INV_XCR0 "Invalid XCR0 register value: " 36 | #define ARGERR_INV_TSCRATIO "Invalid TSC ratio value: " 37 | 38 | #define ARGERR_MISS_ARG "Missing argument: " 39 | #define ARGERR_MISS_DEMO "Missing demo type: " 40 | #define ARGERR_MISS_THREAD "Missing thread index: " 41 | #define ARGERR_MISS_CPUIDFILE "Missing CPUID filename: " 42 | #define ARGERR_MISS_XCR0 "Missing XCR0 register value: " 43 | #define ARGERR_MISS_TSCRATIO "Missing TSC ratio value: " 44 | 45 | #define DEFAULT_PCORE_INDEX ~0 46 | #define DEFAULT_ECORE_INDEX (DEFAULT_PCORE_INDEX - 1) 47 | #define DEFAULT_LPECORE_INDEX (DEFAULT_ECORE_INDEX - 1) 48 
| 49 | enum argType { 50 | ARG_HELP, 51 | ARG_VERSION, 52 | ARG_DEMOLIST, 53 | ARG_CPUPROPS, 54 | ARG_PCORE, 55 | ARG_ECORE, 56 | ARG_LPECORE, 57 | ARG_CPUIDDUMP, 58 | ARG_PROCMASK, 59 | #if defined (_M_X64) && defined(__AVX512F__) 60 | ARG_512BFMADP, 61 | #endif 62 | ARG_DEMOTYPE, 63 | ARG_THREADINDEX, 64 | ARG_CPUIDFILE, 65 | ARG_XCR0, 66 | ARG_TSCRATIO, 67 | ARG_NOTHING, 68 | }; 69 | 70 | typedef struct { 71 | bool arguments; 72 | const char * longName; 73 | char shortName; 74 | argType type; 75 | const char * missingErr; 76 | const char * description; 77 | } paramsType; 78 | 79 | class Args { 80 | private: 81 | static const paramsType params[]; 82 | uint64_t demoMask[MAX_DEMOMASK] = {0}; 83 | argType paramType; 84 | size_t demoCount; 85 | size_t paramCount; 86 | size_t threadIndex; 87 | UINT64 xcr0; 88 | double tscRatio; 89 | const demoTypeList* demoList; 90 | bool helpFlag; 91 | bool versionFlag; 92 | bool listFlag; 93 | bool cpuPropsFlag; 94 | bool procMaskFlag; 95 | #if defined (_M_X64) && defined(__AVX512F__) 96 | bool _512bFMA_DP_Flag; 97 | #endif 98 | bool errorFlag; 99 | bool dumpFlag; 100 | bool cpuidFileFlag; 101 | bool validFlag; 102 | void SetError(char* , char*, const char* ); 103 | void SetParam(argType, char*, char* , int* ); 104 | char * cpuidFileName; 105 | public: 106 | Args(const demoTypeList[], size_t, int argc, char** argv); 107 | bool Init(int argc, char** argv); 108 | bool IsVersion(void) const; 109 | bool IsHelp(void) const; 110 | bool IsDemoList(void) const; 111 | bool IsCPUProps(void) const; 112 | bool IsCPUIDDump(void) const; 113 | bool IsCPUIDFile(void) const; 114 | bool IsProcMask(void) const; 115 | #if defined (_M_X64) && defined(__AVX512F__) 116 | bool Is_512bFMA_DP_Ports(void) const; 117 | #endif 118 | bool IsValid(void) const; 119 | void PrintUsage(void) const; 120 | void PrintVersion(void) const; 121 | size_t GetMaxDemo(void) const; 122 | size_t GetThreadIndex(CPU_Props) const; 123 | char* GetCPUIDFileName() const; 124 | bool 
IsSelected(size_t) const; 125 | UINT64 GetXCR0(void) const; 126 | double GetTSCRatio(void) const; 127 | }; 128 | 129 | extern CPU_Props cpu_props; 130 | -------------------------------------------------------------------------------- /Results/Zen4_expected.txt: -------------------------------------------------------------------------------- 1 | Vendor: "AuthenticAMD" 2 | Family:25 Model:16 Stepping:0 (a10f00) 3 | ---GPR---------- 4 | RDTSC : supported 5 | RDTSCP : supported 6 | CMOV : supported 7 | CMPX8 : supported 8 | CMPX16 : supported 9 | AMD64 : supported 10 | LAHF : supported 11 | MOVBE : supported 12 | ABM : supported 13 | POPCNT : supported 14 | RDRAND : supported 15 | RDSEED : supported 16 | ADX : supported 17 | BMI : supported 18 | BMI2 : supported 19 | MOVDIRI : unsupported 20 | MOVDIR64B : unsupported 21 | ---SIMD--------- 22 | SSE : supported 23 | SSE2 : supported 24 | SSE3 : supported 25 | SSSE3 : supported 26 | SSE41 : supported 27 | SSE42 : supported 28 | SSE4A : supported 29 | CLMUL : supported 30 | AES : supported 31 | SHA : supported 32 | AVX : supported, OS enabled 33 | AVX2 : supported, OS enabled 34 | FMA : supported, OS enabled 35 | F16C : supported, OS enabled 36 | GFNI : supported 37 | VAES : supported 38 | VPCLMULQDQ : supported 39 | AVX_VNNI : unsupported 40 | ---AVX512------- 41 | AVX512F : supported, OS enabled 42 | AVX512CD : supported, OS enabled 43 | AVX512ER : unsupported 44 | AVX512PF : unsupported 45 | AVX512BW : supported, OS enabled 46 | AVX512DQ : supported, OS enabled 47 | AVX512VL : supported, OS enabled 48 | AVX512VBMI : supported, OS enabled 49 | AVX512IFMA : supported, OS enabled 50 | AVX512VNNI : supported, OS enabled 51 | AVX512_4VNNIW : unsupported 52 | AVX512_4FMAPS : unsupported 53 | AVX512_VPOPCNTDQ : supported, OS enabled 54 | AVX512_BITALG : supported, OS enabled 55 | AVX512_VBMI2 : supported, OS enabled 56 | AVX512_BF16 : supported, OS enabled 57 | AVX512_VP2INTERSECT : unsupported 58 | AVX512_FP16 : unsupported 
59 | ---AMX---------- 60 | AMX-BF16 : unsupported 61 | AMX-INT8 : unsupported 62 | AMX-TILE : unsupported 63 | ---CacheLine---- 64 | PREFETCHW : supported 65 | PREFETCHWT1 : unsupported 66 | CLFLUSH : supported 67 | CLFLUSHOPT : supported 68 | CLWB : supported 69 | CLZERO : supported 70 | CLDEMOTE : unsupported 71 | ---uCode-------- 72 | Enh REP MOVSB/STOSB : supported 73 | Fast short REP MOV : supported 74 | Fast zero-length MOVSB : unsupported 75 | Fast short STOSB : unsupported 76 | Fast short CMPSB, SCASB : unsupported 77 | ---Keylocker 78 | KEYLOCK : unsupported 79 | AESKLE : unsupported 80 | WIDE_KL : unsupported 81 | ---Uncategorized 82 | LNOP : supported 83 | SERIALIZE : unsupported 84 | HYBRID : unsupported 85 | RDPID : supported 86 | RDPRU : supported 87 | MCOMMIT : unsupported 88 | ---Deprecated--- 89 | X87 : supported 90 | MMX : supported 91 | MMX+ : supported 92 | 3DNow! : unsupported 93 | 3DNow!+ : unsupported 94 | XOP : unsupported 95 | FMA4 : unsupported 96 | TBM : unsupported 97 | MPX : unsupported 98 | HLE : unsupported 99 | PCOMMIT : unsupported -------------------------------------------------------------------------------- /Results/HYBRID_Lakefield_CPUID806A1.txt: -------------------------------------------------------------------------------- 1 | Vendor: "GenuineIntel" 2 | Family:6 Model:138 Stepping:1 (806a1) 3 | Brand: " Intel(R) Core(TM) i5-L16G7 CPU @ 1.40GHz" 4 | 512b FPU DP ports : 0 5 | ---GPR---------- 6 | RDTSC : supported 7 | RDTSCP : supported 8 | CMOV : supported 9 | CMPX8 : supported 10 | CMPX16 : supported 11 | AMD64 : supported 12 | LAHF : supported 13 | MOVBE : supported 14 | ABM : unsupported 15 | POPCNT : supported 16 | RDRAND : supported 17 | RDSEED : supported 18 | ADX : unsupported 19 | BMI : unsupported 20 | BMI2 : unsupported 21 | MOVDIRI : unsupported 22 | MOVDIR64B : unsupported 23 | ---SIMD--------- 24 | SSE : supported 25 | SSE2 : supported 26 | SSE3 : supported 27 | SSSE3 : supported 28 | SSE41 : supported 29 | 
SSE42 : supported 30 | SSE4A : unsupported 31 | CLMUL : supported 32 | AES : supported 33 | SHA : supported 34 | AVX : unsupported 35 | AVX2 : unsupported 36 | FMA : unsupported 37 | F16C : unsupported 38 | GFNI : supported 39 | VAES : unsupported 40 | VPCLMULQDQ : unsupported 41 | KEYLOCK : unsupported 42 | AVX_VNNI : unsupported 43 | ---AVX512------- 44 | AVX512F : unsupported 45 | AVX512CD : unsupported 46 | AVX512ER : unsupported 47 | AVX512PF : unsupported 48 | AVX512BW : unsupported 49 | AVX512DQ : unsupported 50 | AVX512VL : unsupported 51 | AVX512VBMI : unsupported 52 | AVX512IFMA : unsupported 53 | AVX512VNNI : unsupported 54 | AVX512_4VNNIW : unsupported 55 | AVX512_4FMAPS : unsupported 56 | AVX512_VPOPCNTDQ : unsupported 57 | AVX512_BITALG : unsupported 58 | AVX512_VBMI2 : unsupported 59 | AVX512_BF16 : unsupported 60 | AVX512_VP2INTERSECT : unsupported 61 | AVX512_FP16 : unsupported 62 | ---AMX---------- 63 | AMX-BF16 : unsupported 64 | AMX-INT8 : unsupported 65 | AMX-TILE : unsupported 66 | ---CacheLine---- 67 | PREFETCHW : supported 68 | PREFETCHWT1 : unsupported 69 | CLFLUSH : supported 70 | CLFLUSHOPT : supported 71 | CLWB : supported 72 | CLZERO : unsupported 73 | CLDEMOTE : unsupported 74 | ---uCode-------- 75 | Enh REP MOVSB/STOSB : supported 76 | Fast short REP MOV : unsupported 77 | Fast zero-length MOVSB : unsupported 78 | Fast short STOSB : unsupported 79 | Fast short CMPSB, SCASB : unsupported 80 | ---Uncategorized 81 | LNOP : supported 82 | SERIALIZE : unsupported 83 | HYBRID : supported 84 | RDPID : unsupported 85 | RDPRU : unsupported 86 | MCOMMIT : unsupported 87 | ---Deprecated--- 88 | X87 : supported 89 | MMX : supported 90 | MMX+ : unsupported 91 | 3DNow! 
: unsupported 92 | 3DNow!+ : unsupported 93 | XOP : unsupported 94 | FMA4 : unsupported 95 | TBM : unsupported 96 | MPX : unsupported 97 | HLE : unsupported 98 | PCOMMIT : unsupported 99 | --Hybrid info-- 100 | systemAffinityMask: 0x000000000000001f 101 | littleCoreMask : 0x000000000000000f 102 | bigCoreMask : 0x0000000000000010 103 | =================================== 104 | -------------------------------------------------------------------------------- /HWBITPERM_Demo_Asm.asm: -------------------------------------------------------------------------------- 1 | .data 2 | 3 | savereg dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h 4 | savereg2 dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h 5 | 6 | exch0 dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h 7 | exch1 dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h 8 | exch2 dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h 9 | 10 | .code 11 | 12 | EMPTY_Init macro EMPTYPAR, INST, CPU 13 | endm 14 | 15 | B64_HW_Core macro disp, INST ;one 64-bit element: exch2[disp] = INST(exch0[disp], mask exch1[disp]) 16 | mov rax, qword ptr [exch0 + disp] 17 | IFIDNI <INST>, <PGRP> ;PGRP is emulated below with two PEXTs. NOTE(review): operands reconstructed from the PEXT/PDEP/PGRP invocations - confirm 18 | mov r9, qword ptr [exch1 + disp] 19 | pext rdx, rax, r9 ;bits selected by the mask, packed at the low end 20 | popcnt r8, r9 ;number of selected bits 21 | not r9 22 | pext r9, rax, r9 ;bits not selected by the mask 23 | shlx r9, r9, r8 ;moved just above the selected group 24 | or rdx, r9 25 | ELSE 26 | INST rdx, rax, qword ptr [exch1 + disp] 27 | ENDIF 28 | mov qword ptr [exch2 + disp], rdx 29 | endm 30 | 31 | B32_HW_Core macro disp, INST ;one 32-bit element: exch2[disp] = INST(exch0[disp], mask exch1[disp])
32 | mov eax, dword ptr [exch0 + disp] 33 | IFIDNI <INST>, <PGRP> ;PGRP is emulated below with two PEXTs. NOTE(review): operands reconstructed from the PEXT/PDEP/PGRP invocations - confirm 34 | mov r9d, dword ptr [exch1 + disp] 35 | pext edx, eax, r9d ;bits selected by the mask, packed at the low end 36 | popcnt r8d, r9d ;number of selected bits 37 | not r9d 38 | pext r9d, eax, r9d ;bits not selected by the mask 39 | shlx r9d, r9d, r8d ;moved just above the selected group 40 | or edx, r9d 41 | ELSE 42 | INST edx, eax, dword ptr [exch1 + disp] 43 | ENDIF 44 | mov dword ptr [exch2 + disp], edx 45 | endm 46 | 47 | HW macro BITNESS, INST 48 | IF BITNESS EQ 64 49 | vmovdqu64 zmmword ptr [exch0], zmm0 50 | vmovdqu64 zmmword ptr [exch1], zmm1 51 | ELSE 52 | vmovdqu32 zmmword ptr [exch0], zmm0 53 | vmovdqu32 zmmword ptr [exch1], zmm1 54 | ENDIF 55 | 56 | disp = 00h 57 | REPEAT 512 / BITNESS 58 | IF BITNESS EQ 64 59 | B64_HW_Core disp, INST 60 | ELSE 61 | B32_HW_Core disp, INST 62 | ENDIF 63 | disp = disp + (BITNESS / 8) 64 | endm 65 | 66 | IF BITNESS EQ 64 67 | vmovdqu64 zmm0, zmmword ptr [exch2] 68 | ELSE 69 | vmovdqu32 zmm0, zmmword ptr [exch2] 70 | ENDIF 71 | endm 72 | 73 | TIMED macro PNAME, INIT, BITNESS, CORE, INST, CPU, TPLAT 74 | PNAME proc 75 | push rbx 76 | push rdi 77 | push rsi 78 | 79 | INIT BITNESS, INST, CPU 80 | 81 | mfence 82 | rdtscp 83 | lfence 84 | 85 | mov esi, eax 86 | mov edi, edx 87 | 88 | mov ecx, DEPEXT219_REPEATS 89 | 90 | align 16 91 | startlabel: 92 | CORE BITNESS, INST 93 | 94 | IFIDNI <TPLAT>, <LAT> ;result in zmm0 becomes the next iteration's mask (zmm1). NOTE(review): <TPLAT>/<LAT>/<TP> operands reconstructed from the parameter list and the LAT/TP invocations - confirm 95 | IF BITNESS EQ 64 96 | vmovdqa64 zmm1, zmm0 97 | ELSE 98 | vmovdqa32 zmm1, zmm0 99 | ENDIF 100 | ELSEIFIDNI <TPLAT>, <TP> ;both inputs are zeroed, so the next iteration does not read this iteration's result 101 | vpxor xmm0, xmm0, xmm0 102 | vpxor xmm1, xmm1, xmm1 103 | ENDIF 104 | 105 | dec ecx 106 | jnz startlabel 107 | 108 | mfence 109 | rdtscp 110 | lfence 111 | 112 | shl rdx, 20h 113 | shl rdi, 20h 114 | or rax, rdx 115 | or rsi, rdi 116 | 117 | sub rax, rsi 118 | 119 | 120 | pop rsi 121 | pop rdi 122 | pop rbx 123 | ret 124 | PNAME endp 125 | endm 126 | 127 | NAKED macro PNAME, INIT, BITNESS, CORE, INST, CPU 128 | PNAME proc 129 | 130 | INIT BITNESS, INST, CPU 131 | 132 | CORE BITNESS, INST 133 | 134 | 135 | ret 136 | PNAME endp 137 | endm 138 | 139 | ;Credit: Travis Downs 140 |
;https://twitter.com/trav_downs/status/1418616866080116742? 141 | 142 | TIMED BEXT64_HW_Lat, EMPTY_Init, 64, HW, PEXT, SKX, LAT 143 | TIMED BDEP64_HW_Lat, EMPTY_Init, 64, HW, PDEP, SKX, LAT 144 | TIMED BGRP64_HW_Lat, EMPTY_Init, 64, HW, PGRP, SKX, LAT 145 | TIMED BEXT32_HW_Lat, EMPTY_Init, 32, HW, PEXT, SKX, LAT 146 | TIMED BDEP32_HW_Lat, EMPTY_Init, 32, HW, PDEP, SKX, LAT 147 | TIMED BGRP32_HW_Lat, EMPTY_Init, 32, HW, PGRP, SKX, LAT 148 | 149 | TIMED BEXT64_HW_Tp, EMPTY_Init, 64, HW, PEXT, SKX, TP 150 | TIMED BDEP64_HW_Tp, EMPTY_Init, 64, HW, PDEP, SKX, TP 151 | TIMED BGRP64_HW_Tp, EMPTY_Init, 64, HW, PGRP, SKX, TP 152 | TIMED BEXT32_HW_Tp, EMPTY_Init, 32, HW, PEXT, SKX, TP 153 | TIMED BDEP32_HW_Tp, EMPTY_Init, 32, HW, PDEP, SKX, TP 154 | TIMED BGRP32_HW_Tp, EMPTY_Init, 32, HW, PGRP, SKX, TP 155 | 156 | NAKED BEXT64_HW@@128, EMPTY_Init, 64, HW, PEXT, SKX 157 | NAKED BDEP64_HW@@128, EMPTY_Init, 64, HW, PDEP, SKX 158 | NAKED BGRP64_HW@@128, EMPTY_Init, 64, HW, PGRP, SKX 159 | NAKED BEXT32_HW@@128, EMPTY_Init, 32, HW, PEXT, SKX 160 | NAKED BDEP32_HW@@128, EMPTY_Init, 32, HW, PDEP, SKX 161 | NAKED BGRP32_HW@@128, EMPTY_Init, 32, HW, PGRP, SKX 162 | 163 | end -------------------------------------------------------------------------------- /AVX512_Reduce_Add.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "AVX512_Reduce_Add.h" 3 | 4 | extern CPU_Props cpu_props; 5 | 6 | using namespace std; 7 | 8 | //credit: @geofflangdale https://twitter.com/geofflangdale/status/1609575574946865154 9 | 10 | uint32_t _mm512_reduce2_add_epu8(__m512i z) { 11 | __m128i permb_collect = _mm_setr_epi64x(0x3830282018100800, 0x3931292119110901); 12 | __m512i sad = _mm512_sad_epu8(_mm512_setzero_si512(), z); 13 | __m128i permb = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(permb_collect), sad)); 14 | __m128i sad2 = _mm_sad_epu8(_mm_setzero_si128(), permb); 15 | return 
_mm_cvtsi128_si32(_mm_add_epi32(sad2, _mm_srli_si128(sad2, 7))); 16 | } 17 | 18 | uint32_t _mm512_reduce2_add_epu16(__m512i z) { 19 | __m128i pshufb_collect = _mm_setr_epi32(0x06040200, 0x0e0c0a08, 0x07050301, 0x0f0d0b09); 20 | __m512i pshufb = _mm512_shuffle_epi8(z, _mm512_broadcast_i32x4(pshufb_collect)); 21 | __m512i sad = _mm512_sad_epu8(_mm512_setzero_si512(), pshufb); 22 | __m128i permb_collect = _mm_setr_epi32(0x30201000, 0x31211101, 0x38281808, 0x39291909); 23 | __m128i permb = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(permb_collect), sad)); 24 | __m128i dbsad = _mm_maskz_dbsad_epu8(0x55, permb, _mm_setzero_si128(), 0); 25 | __m128i add = _mm_add_epi32(_mm_srli_epi64(dbsad, 24), dbsad); 26 | return _mm_cvtsi128_si32(_mm_add_epi32(add, _mm_srli_si128(add, 7))); 27 | } 28 | 29 | uint64_t _mm512_reduce2_add_epu32(__m512i z) { 30 | __m128i transpose_4x4 = _mm_setr_epi32(0x0c080400, 0x0d090501, 0x0e0a0602, 0x0f0b0703); 31 | __m512i transpose = _mm512_shuffle_epi8(z, _mm512_broadcast_i32x4(transpose_4x4)); 32 | __m512i dbsad = _mm512_maskz_dbsad_epu8(0x55555555, transpose, _mm512_setzero_si512(), 0); 33 | __m512i permb_collect = _mm512_setr_epi32(0x30201000, 0x3f3f3f3f, 0x31211101, 0x34241404, 0x35251505, 0x38281808, 0x39291909, 0x3c2c1c0c, 0x3d2d1d0d, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f); 34 | __m512i permb = _mm512_permutexvar_epi8(permb_collect, dbsad); 35 | __m512i sad = _mm512_sad_epu8(permb, _mm512_setzero_si512()); 36 | __m128i permb2 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(_mm_setr_epi64x(0x3f3f3f2018100800, 0x3f3f21191109013f)), sad)); 37 | return _mm_cvtsi128_si64(_mm_add_epi64(permb2, _mm_unpackhi_epi64(permb2, permb2))); 38 | } 39 | 40 | uint64_t _mm512_reduce2_add_epu64(__m512i z) { 41 | __m512i transpose_8x8 = _mm512_setr_epi64(0x3830282018100800, 0x3931292119110901, 0x3a322a221a120a02, 0x3b332b231b130b03, 0x3c342c241c140c04, 
0x3d352d251d150d05, 0x3e362e261e160e06, 0x3f372f271f170f07); 42 | __m512i transpose = _mm512_permutexvar_epi8(transpose_8x8, z); 43 | __m512i sad = _mm512_sad_epu8(transpose, _mm512_setzero_si512()); 44 | __m128i collect = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(_mm_setr_epi64x(0x3830282018100800, 0x312921191109013f)), sad)); 45 | return _mm_cvtsi128_si64(_mm_add_epi64(collect, _mm_unpackhi_epi64(collect, collect))); 46 | } 47 | 48 | uint64_t _mm512_reduce2_add_epu128(__m512i z, uint64_t* hi) { 49 | __m512i transpose_8x8 = _mm512_setr_epi64(0x3830282018100800, 0x3931292119110901, 0x3a322a221a120a02, 0x3b332b231b130b03, 0x3c342c241c140c04, 0x3d352d251d150d05, 0x3e362e261e160e06, 0x3f372f271f170f07); 50 | __m512i transpose = _mm512_permutexvar_epi8(transpose_8x8, z); 51 | __m512i sad = _mm512_sad_epu8(transpose, _mm512_setzero_si512()); 52 | __m128i collect0 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(_mm_setr_epi64x(0x3830282018100800, 0x3f38302820181008)), sad)); 53 | __m128i collect1 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(_mm_setr_epi64x(0x312921191109013f, 0x3931292119110901)), sad)); 54 | __m128i add = _mm_add_epi64(collect0, collect1); 55 | *hi = _mm_extract_epi64(add, 1) >> 56; 56 | return _mm_cvtsi128_si64(add); 57 | } 58 | 59 | void AVX512_Reduce_Add_Demo(void) { 60 | __m512i u = _mm512_undefined_epi32(); 61 | 62 | cout << hex; 63 | /* Microsoft built-in reduce_add intrinsics */ 64 | //cout << setw(16) << right << _mm512_reduce_add_epu8(_mm512_set1_epi8(0xFE)) << endl; 65 | //cout << setw(16) << right << _mm512_reduce_add_epu16(_mm512_set1_epi16(0xFEDC)) << endl; 66 | //cout << setw(16) << right << _mm512_reduce_add_epi32(_mm512_set1_epi32(0xFEDCBA98)) << endl; 67 | //cout << setw(16) << right << _mm512_reduce_add_epi64(_mm512_set1_epi64(0xFEDCBA9876543210)) << endl; 68 | 69 | cout << setw(16) << right << _mm512_reduce2_add_epu8(_mm512_set1_epi8((char)0xFE)) << endl; 
70 | cout << setw(16) << right << _mm512_reduce2_add_epu16(_mm512_set1_epi16((short)0xFEDC)) << endl; 71 | cout << setw(16) << right << _mm512_reduce2_add_epu32(_mm512_set1_epi32(0xFEDCBA98)) << endl; 72 | cout << setw(16) << right << _mm512_reduce2_add_epu64(_mm512_set1_epi64(0xFEDCBA9876543210)) << endl; 73 | 74 | uint64_t hi, lo = _mm512_reduce2_add_epu128(_mm512_set1_epi64(0xFEDCBA9876543210), &hi); 75 | cout << setw(16) << right << hi << ':' << lo << endl; 76 | } -------------------------------------------------------------------------------- /AMX_Demo.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "AMX_Demo.h" 3 | 4 | extern CPU_Props cpu_props; 5 | 6 | using namespace std; 7 | 8 | void AMX_Test(void) { 9 | #if (_MSC_VER > 1927) 10 | #if defined (_M_X64) 11 | //AMX-TILE 12 | const unsigned int ttb = cpu_props.GetAMXPalette_TotalTileBytes(0); 13 | const unsigned int maxRegs = cpu_props.GetAMXPalette_MaxName(0); 14 | const unsigned int maxRegSize = ttb / max(1, maxRegs); 15 | const unsigned int amxCols = cpu_props.GetAMXCols(); 16 | const unsigned int amxRows = cpu_props.GetAMXRows(); 17 | cout << "ttb :" << dec << ttb << endl; 18 | cout << "maxRegs :" << maxRegs << endl; 19 | cout << "maxRegSize:" << maxRegSize << endl; 20 | cout << "AMXCols :" << amxCols << endl; 21 | cout << "AMXRows :" << amxRows << endl; 22 | XTILECFG load_tilecfg(amxCols, amxRows, ttb, maxRegs); 23 | 24 | XTILECFG store_tilecfg; 25 | unsigned char * tile0 = new unsigned char[maxRegSize]{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 26 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 27 | 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 28 | 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 29 | 0x40, 0x41, 0x42, 0x43, 0x44, 
0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 30 | 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 31 | 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 32 | 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f}; 33 | unsigned char * tile1 = new unsigned char[maxRegSize]{ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 34 | 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 35 | 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 36 | 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 37 | 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 38 | 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 39 | 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 40 | 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff}; 41 | unsigned short *tile2 = new unsigned short[maxRegSize / 2]{ 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x4000, 0x4000, 0x4000, 0x4000, 42 | 0x4040, 0x4040, 0x4040, 0x4040, 0x0000, 0x0000, 0x0000, 0x0000, 43 | 0xbf80, 0xbf80, 0xbf80, 0xbf80, 0xc000, 0xc000, 0xc000, 0xc000, 44 | 0xc040, 0xc040, 0xc040, 0xc040, 0x8000, 0x8000, 0x8000, 0x8000}; 45 | unsigned char * restile2 = new unsigned char[maxRegSize]; 46 | unsigned char * restile3 = new unsigned char[maxRegSize]; 47 | unsigned char * restile4 = new unsigned char[maxRegSize]; 48 | unsigned char * restile5 = new unsigned char[maxRegSize]; 49 | unsigned char * restile6 = new unsigned char[maxRegSize]; 50 | unsigned char * restile7 = new unsigned char[maxRegSize]; 51 | 52 | //AMX-TILE 53 | _tile_release(); 
//TILERELEASE 54 | _tile_loadconfig(&load_tilecfg); //LDTILECFG 55 | _tile_loadd(0, tile0, 1); //TILELOADD 56 | _tile_loadd(1, tile0, 4); //TILELOADD 57 | _tile_loadd(2, tile0, 16); //TILELOADD 58 | _tile_zero(3); //TILEZERO 59 | _tile_zero(4); //TILEZERO 60 | _tile_zero(5); //TILEZERO 61 | _tile_stream_loadd(6, tile2, 2); //TILELOADDT1 62 | _tile_stream_loadd(7, tile2, 4); //TILELOADDT1 63 | //AMX-INT8 64 | _tile_dpbssd(2, 1, 0); //TDPBSSD 65 | _tile_dpbsud(3, 2, 0); //TDPBSUD 66 | _tile_dpbusd(4, 1, 0); //TDPBUSD 67 | _tile_dpbuud(5, 1, 0); //TDPBUUD 68 | //AMX-BF16 69 | _tile_dpbf16ps(0, 6, 7); //TDPBF16PS 70 | //AMX-TILE 71 | _tile_stored(2, restile2, amxCols); //TILESTORED 72 | _tile_stored(3, restile3, amxCols); //TILESTORED 73 | _tile_stored(4, restile4, amxCols); //TILESTORED 74 | _tile_stored(5, restile5, amxCols); //TILESTORED 75 | _tile_stored(6, restile6, amxCols); //TILESTORED 76 | _tile_storeconfig(&store_tilecfg); //STTILECFG 77 | _tile_release(); //TILERELEASE 78 | //Intel SDE command line 79 | //sde -spr -debugtrace -start_extension AMX_TILE -stop_extension AVX512EVEX -- InstLatX64_Demo_AVX512_x64.exe 80 | _mm512_storeu_epi64(restile7, _mm512_loadu_epi64(&store_tilecfg)); 81 | delete [] tile0; delete [] tile1; delete [] tile2; //one delete[] per buffer: "delete [] a, b, c" is a comma expression that frees only a 82 | delete [] restile2; delete [] restile3; delete [] restile4; delete [] restile5; delete [] restile6; delete [] restile7; 83 | #endif // defined (_M_X64) 84 | #endif //_MSC_VER > 1927 85 | } -------------------------------------------------------------------------------- /InstLatX64_Demo.cpp: -------------------------------------------------------------------------------- 1 | // InstLatX64_Demo.cpp 2 | // 3 | 4 | #include "stdafx.h" 5 | 6 | using namespace std; 7 | 8 | const demoTypeList demos[] = { 9 | {"GFNI", "", DEMO_GFNI, FEAT_GFNI, true, GFNI_Demo, "SIMD byte granularity shifts/rotates, 8x8 bit, pospocnt, etc."}, 10 | {"VPCLMULQDQ", "CLMUL", DEMO_VPCLMLQDQ, FEAT_CLMUL, true, VPCLMULQDQ_Demo, "SIMD prefix xor / parity"}, 11 | #if defined (__AVX2__) 12 | {"AVX_VNNI_INT16_AddSubS", "VSAdd", DEMO_VNNI_SADD,
FEAT_AVX_VNNI_INT16, true, AVX_VNNI_Saturated_AddSub_Demo, "_mm256_adds/subs_epi/epu/32 implementation"}, 13 | #endif 14 | #if defined (_M_X64) 15 | #if defined (__AVX2__) 16 | {"P06P1", "P06P1", DEMO_P06P1, FEAT_BMI2, true, P0601_Test, "Golden Cove P06P1 anomaly"}, 17 | {"PEXT_PDEP", "PEXT", DEMO_PEXT_PDEP_EMU, FEAT_BMI2, true, PEXT_PDEP_Emu_Test, "Fast GPR PEXT/PDEP instruction emulation for AMDs"}, 18 | {"FirstByte", "", DEMO_FIRSTBBYTE, FEAT_AVX2, true, FirstByte_Demo, "Finding first byte in lanes"}, 19 | #endif 20 | #if defined(__AVX512F__) 21 | {"Reduce_Add", "RAdd", DEMO_RADD, FEAT_AVX512VBMI, true, AVX512_Reduce_Add_Demo, "_mm512_reduce_add_epu8/16/32/64 implementation"}, 22 | {"Saturated_AddSub", "SAdd", DEMO_AVX512_SADD, FEAT_AVX512F, true, AVX512_Saturated_AddSub_Demo, "_mm512_adds/subs_epi/epu/32/64 implementation"}, 23 | {"KMemDst", "KMem", DEMO_KMEMDST, FEAT_AVX512F, true, AVX512_KMemDst_Demo, "AVX512 insts with masked memory destination"}, 24 | {"Zen4", "Zen4", DEMO_ZEN4, FEAT_AVX512F, true, Zen4_Demo, "AMD Zen4 SIMD analysis"}, 25 | {"Zen5", "Zen5", DEMO_ZEN5, FEAT_AVX512F, true, Zen5_Demo, "AMD Zen5 SIMD analysis"}, 26 | {"Intrinsics", "Intrin", DEMO_INTRINSICS, FEAT_AVX512F, true, AVX512_InstrincTest, "Visual Studio Compiler Intrinsics Test"}, 27 | {"VBMI2", "", DEMO_VBMI2, FEAT_AVX512_VBMI2, true, VBMI2_Demo, "SIMD variable rots and shifts for words and bytes"}, 28 | {"Byte2Byte", "B2B", DEMO_BYTE2BYTE, FEAT_AVX512VBMI, true, Byte2ByteTest, "Fastest Byte2Byte SIMD replacemant"}, 29 | {"LZCNT", "", DEMO_LZCNT, FEAT_AVX512_BITALG, true, LZCNT_Test, "Missing SIMD VPLZCNTB/W emulation"}, 30 | {"TZCNT", "", DEMO_TZCNT, FEAT_AVX512_VPOPCNTDQ, true, TZCNT_Test, "Missing SIMD VPTZCNTB/W/D/Q emulation"}, 31 | {"HWBITPERM", "HWB", DEMO_HWBITPERM, FEAT_AVX512BW, true, HWBITPERM_Test, "SVE2 vector BITPERM (BEXT/BDEP/BGRP) emulation with HW scalar BMI2 PEXT/PDEP instructions"}, 32 | {"KMOV", "", DEMO_KMOV, FEAT_AVX512BW, true, Kmov_Test, "KMOV"}, 33 | {"AMX", 
"", DEMO_AMX, FEAT_AMX_BF16, true, AMX_Test, "AMX 101"}, 34 | {"AVX512_DecPrint", "Print", DEMO_AVX512_DECPRINT, FEAT_AVX512F, true, AVX512_DecimalPrint_Test, "AVX512F & AVX512_IFMA decimal print"}, 35 | {"ByteShift", "BGVSER", DEMO_AVX512_BGVSER, FEAT_AVX512VBMI, true, AVX512_BGVSER_Test, "Byte-Granularity Variable Shift on Entire Register"}, 36 | #endif 37 | #endif 38 | }; 39 | 40 | Args args(demos, sizeof(demos) / sizeof(demoTypeList), __argc, __argv); 41 | CPU_Props cpu_props(args.GetXCR0()); 42 | 43 | int main(void) 44 | { 45 | if (args.IsValid()) { 46 | 47 | if (args.IsVersion()) 48 | args.PrintVersion(); 49 | 50 | if (args.IsHelp()) 51 | args.PrintUsage(); 52 | 53 | if (args.IsDemoList()) { 54 | cout << endl << "Demo types:"; 55 | for (uint32_t demo = 0; demo < sizeof(demos) / sizeof(demoTypeList); demo++) { 56 | cout << endl << setw(24) << demos[demo].demoName; 57 | if (_stricmp(demos[demo].alias, "") != 0) 58 | cout << " (alias:" << setw(6) << demos[demo].alias << ')'; 59 | else 60 | cout << " "; 61 | cout << (demos[demo].publicFlag ? 
" [PUB] " : " "); 62 | if (_stricmp(demos[demo].comment, "") != 0) 63 | cout << '(' << demos[demo].comment << ')'; 64 | } 65 | cout << endl; 66 | } 67 | bool fileRead = true; 68 | if (args.IsCPUIDFile()) { 69 | fileRead = cpu_props.GetFileCPUID(args.GetCPUIDFileName(), args.GetXCR0()); 70 | } 71 | if (fileRead) { 72 | if (args.IsCPUProps()) { 73 | cpu_props.PrintFeats(); 74 | //cpu_props.ForcedAVX512(); 75 | cpu_props.PrintXCR0(); 76 | } 77 | 78 | #if defined (_M_X64) && defined(__AVX512F__) 79 | if (args.Is_512bFMA_DP_Ports() && cpu_props.IsFeat(FEAT_AVX512F)) 80 | cpu_props.Print_512bFMA_DP_Ports(); 81 | #endif 82 | if (args.IsCPUIDDump()) 83 | cpu_props.PrintCPUIDDump(); 84 | if (args.IsProcMask()) 85 | cpu_props.PrintHybridMasks(); 86 | 87 | for (uint32_t demo = 0; demo <= args.GetMaxDemo(); demo++) { 88 | if (args.IsSelected(demo)) { 89 | cout << "===================================" << endl; 90 | cout << demos[demo].demoName << endl; 91 | if (cpu_props.IsFeat(demos[demo].feats)) { 92 | (demos[demo].func)(); 93 | } else { 94 | cpu_props.PrintFeat(demos[demo].feats); 95 | cout << " unsupported." 
<< endl; 96 | } 97 | } 98 | } 99 | } else { 100 | cout << "CPUID file open error: " << args.GetCPUIDFileName(); 101 | } 102 | } 103 | return 0; 104 | } 105 | 106 | -------------------------------------------------------------------------------- /FirstByte.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "FirstByte.h" 3 | 4 | extern CPU_Props cpu_props; 5 | 6 | using namespace std; 7 | 8 | //original article: 9 | //http://0x80.pl/notesen/2023-02-06-avx512-find-first-byte-in-lane.html 10 | 11 | __m256i _mm256_firstbyte_epu32(__m256i a, char c) { 12 | __m256i one = _mm256_set1_epi8(0x01); 13 | __m256i check = _mm256_set1_epi8(c); 14 | __m256i vnnibase = _mm256_set1_epi32(0x80808000); 15 | __m256i vnnimul = _mm256_set1_epi32(0x08040201); 16 | __m256i shufb_const = _mm256_broadcastsi128_si256(_mm_setr_epi64x(0x0300010002000100, 0x0400010002000100)); //first zero index 17 | 18 | __m256i xorres = _mm256_xor_si256(check, a); 19 | __m256i minub = _mm256_min_epu8(one, xorres); 20 | 21 | /* VPDPBUSD collects LSBs into the b[3:0] bitfield: */ 22 | #if defined(__AVX512BW__) 23 | __m256i vnni = _mm256_dpbusd_epi32(vnnibase, vnnimul, minub); 24 | #else 25 | __m256i vnni = _mm256_dpbusd_avx_epi32(vnnibase, vnnimul, minub); 26 | #endif 27 | return _mm256_shuffle_epi8(shufb_const, vnni); 28 | } 29 | 30 | //credit: @dougallj 31 | //https://twitter.com/dougallj/status/1624663388856156160 32 | 33 | __m256i _mm256_firstbyte_epu64(__m256i a, char c) { 34 | __m256i one = _mm256_set1_epi8(0x01); 35 | __m256i check = _mm256_set1_epi8(c); 36 | __m256i mask = _mm256_cmpeq_epi8(check, a); 37 | __m256i lowmask = _mm256_andnot_si256(mask, _mm256_sub_epi64(mask, _mm256_set1_epi64x(1))); 38 | __m256i ones = _mm256_and_si256(lowmask, one); 39 | return _mm256_sad_epu8(_mm256_setzero_si256(), ones); 40 | } 41 | 42 | #if defined(__AVX512BW__) 43 | __m512i _mm512_firstbyte_epu32(__m512i a, char c) { 44 | __m512i one = 
_mm512_set1_epi8(0x01); 45 | __m512i check = _mm512_set1_epi8(c); 46 | __m512i vnnibase = _mm512_set1_epi32(0x80808000); 47 | __m512i vnnimul = _mm512_set1_epi32(0x08040201); 48 | __m512i shufb_const = _mm512_broadcast_i64x2(_mm_setr_epi64x(0x0300010002000100, 0x0400010002000100)); //first zero index 49 | 50 | __m512i xorres = _mm512_xor_epi64(check, a); 51 | __m512i minub = _mm512_min_epu8(one, xorres); 52 | 53 | /* VPDPBUSD collects LSBs into the b[3:0] bitfield: */ 54 | 55 | __m512i vnni = _mm512_dpbusd_epi32(vnnibase, vnnimul, minub); 56 | return _mm512_shuffle_epi8(shufb_const, vnni); 57 | } 58 | 59 | __m512i _mm512_firstbyte_epu64(__m512i a, char c) { 60 | __m512i one = _mm512_set1_epi8(0x01); 61 | __m512i check = _mm512_set1_epi8(c); 62 | __m512i mirror = _mm512_set1_epi64(0x0102040810204080); 63 | 64 | __m512i xorres = _mm512_xor_epi64(check, a); 65 | __m512i minub = _mm512_min_epu8(one, xorres); 66 | 67 | /* Mirror bits in qwords, through the */ 68 | /* 07-16-25-34-43-52-61-70 diagonal axis */ 69 | /* */ 70 | /* In[i,j] -> Out[7-j,7-i] */ 71 | /* */ 72 | /* In : MSB 77 76 75 74 73 72 71 70 */ 73 | /* 67 66 65 64 63 62 61 60 */ 74 | /* 57 56 55 54 53 52 51 50 */ 75 | /* 47 46 45 44 43 42 41 40 */ 76 | /* 37 36 35 34 33 32 31 30 */ 77 | /* 27 26 25 24 23 22 21 20 */ 78 | /* 17 16 15 14 13 12 11 10 */ 79 | /* 07 06 05 04 03 02 01 00 LSB */ 80 | /* */ 81 | /* Out : MSB 00 10 20 30 40 50 60 70 */ 82 | /* 01 11 21 31 41 51 61 71 */ 83 | /* 02 12 22 32 42 52 62 72 */ 84 | /* 03 13 23 33 43 53 63 73 */ 85 | /* 04 14 24 34 44 54 64 74 */ 86 | /* 05 15 25 35 45 55 65 75 */ 87 | /* 06 16 26 36 46 56 66 76 */ 88 | /* 07 17 27 37 47 57 67 77 LSB */ 89 | 90 | __m512i gfni = _mm512_gf2p8affine_epi64_epi8(mirror, minub, 0xff); //mirror & invert 70-60-50-40-30-20-10-00 into b[63:56] with imm8 0xff 91 | return _mm512_lzcnt_epi64(gfni); 92 | } 93 | #endif 94 | 95 | void FirstByte_Demo(void) { 96 | #if defined(__AVX2__) 97 | if (cpu_props.IsFeat(FEAT_AVX_VNNI)) { 98 | __m256i 
dword_testcase = _mm256_set_epi32(0xfedcba98, 0x76543210, 0xbd000000, 0xbd0000, 0xbd00, 0xbd, 0, 0xbdbd); 99 | printRes32("DWORD Testcase :", dword_testcase); 100 | printRes32("_mm256_firstbyte_epu32 :", _mm256_firstbyte_epu32(dword_testcase, (char)0xbd)); 101 | } 102 | __m256i qword_testcase = _mm256_set_epi64x(0, 0xbd00, 0xbdbdbd0000, 0xbdbdbdbd000000); 103 | printRes("QWORD Testcase :", qword_testcase); 104 | printRes("_mm256_firstbyte_epu64 :", _mm256_firstbyte_epu64(qword_testcase, (char)0xbd)); 105 | #if defined(__AVX512BW__) 106 | if (cpu_props.IsFeat(FEAT_AVX512VNNI)) { 107 | __m512i dword_testcase2 = _mm512_and_si512(_mm512_movm_epi8(0xfedcba9876543210), _mm512_set1_epi8((char)0xbd)); 108 | printRes32("DWORD Testcase2 :", dword_testcase2); 109 | printRes32("_mm512_firstbyte_epu32 :", _mm512_firstbyte_epu32(dword_testcase2, (char)0xbd)); 110 | } 111 | if (cpu_props.IsFeat(FEAT_GFNI)) { 112 | __m512i qword_testcase2 = _mm512_setr_epi64(0xbd, 0xbd00, 0xbd0000, 0xbd000000, 0xbd00000000, 0xbd0000000000, 0xbd000000000000, 0xbd00000000000000); 113 | printRes("QWORD Testcase2 :", qword_testcase2); 114 | printRes("_mm512_firstbyte_epu64 :", _mm512_firstbyte_epu64(qword_testcase2, (char)0xbd)); 115 | } 116 | #endif 117 | #endif 118 | } -------------------------------------------------------------------------------- /AVX_VNNI_INT16_Saturated_AddSub.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "AVX_VNNI_INT16_Saturated_AddSub.h" 3 | 4 | extern CPU_Props cpu_props; 5 | 6 | using namespace std; 7 | 8 | __m128i _mm_adds_epi32(__m128i a, __m128i b) { 9 | __m128i movwhdup = _mm_set_epi32(0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302); 10 | __m128i _2x32768 = _mm_set1_epi32(0x80008000); 11 | __m128i one = _mm_set1_epi32(0x1); 12 | __m128i b_high = _mm_shuffle_epi8(b, movwhdup); 13 | __m128i temp = _mm_dpwusds_epi32(a, _2x32768, b_high); 14 | __m128i zpn_one = _mm_sign_epi16(one, 
_mm_or_si128(b_high, one)); 15 | return _mm_dpwsuds_epi32(temp, zpn_one, b); 16 | } 17 | 18 | __m128i _mm_subs_epi32(__m128i a, __m128i b) { 19 | __m128i movwhdup = _mm_set_epi32(0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302); 20 | __m128i _m2x32768 = _mm_set1_epi32(0x80008000); 21 | __m128i one = _mm_set1_epi32(0x1); 22 | __m128i m_one = _mm_set1_epi32(0xffff); 23 | __m128i b_high = _mm_shuffle_epi8(b, movwhdup); 24 | __m128i temp = _mm_dpwssds_avx_epi32(a, _m2x32768, b_high); 25 | __m128i zpn_one = _mm_sign_epi16(m_one, _mm_or_si128(b_high, one)); 26 | return _mm_dpwsuds_epi32(temp, zpn_one, b); 27 | } 28 | 29 | __m128i _mm_adds_epu32(__m128i a, __m128i b) { 30 | return _mm_add_epi32(_mm_min_epu32(a, _mm_xor_si128(b, _mm_cmpeq_epi32(b, b))), b); 31 | } 32 | 33 | __m128i _mm_subs_epu32(__m128i a, __m128i b) { 34 | return _mm_sub_epi32(_mm_max_epu32(a, b), b); 35 | } 36 | 37 | __m256i _mm256_adds_epi32(__m256i a, __m256i b) { 38 | __m256i movwhdup = _mm256_set_epi32(0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302, 0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302); 39 | __m256i _2x32768 = _mm256_set1_epi32(0x80008000); 40 | __m256i one = _mm256_set1_epi32(0x1); 41 | __m256i b_high = _mm256_shuffle_epi8(b, movwhdup); 42 | __m256i temp = _mm256_dpwusds_epi32(a, _2x32768, b_high); 43 | __m256i zpn_one = _mm256_sign_epi16(one, _mm256_or_si256(b_high, one)); 44 | return _mm256_dpwsuds_epi32(temp, zpn_one, b); 45 | } 46 | 47 | __m256i _mm256_subs_epi32(__m256i a, __m256i b) { 48 | __m256i movwhdup = _mm256_set_epi32(0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302, 0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302); 49 | __m256i _m2x32768 = _mm256_set1_epi32(0x80008000); 50 | __m256i one = _mm256_set1_epi32(0x1); 51 | __m256i m_one = _mm256_set1_epi32(0xffff); 52 | __m256i b_high = _mm256_shuffle_epi8(b, movwhdup); 53 | __m256i temp = _mm256_dpwssds_avx_epi32(a, _m2x32768, b_high); 54 | __m256i zpn_one = _mm256_sign_epi16(m_one, _mm256_or_si256(b_high, one)); 55 | return 
_mm256_dpwsuds_epi32(temp, zpn_one, b); 56 | } 57 | 58 | __m256i _mm256_adds_epu32(__m256i a, __m256i b) { /* unsigned saturating add: min(a, ~b) + b cannot wrap because ~b == UINT32_MAX - b */ 59 | return _mm256_add_epi32(_mm256_min_epu32(a, _mm256_xor_si256(b, _mm256_cmpeq_epi32(b, b))), b); 60 | } 61 | 62 | __m256i _mm256_subs_epu32(__m256i a, __m256i b) { /* unsigned saturating sub: max(a, b) - b yields 0 whenever a <= b */ 63 | return _mm256_sub_epi32(_mm256_max_epu32(a, b), b); 64 | } 65 | 66 | void AVX_VNNI_Saturated_AddSub_Demo(void) { /* prints 16- and 32-bit saturated add/sub results for boundary values plus one TSC-derived value */ 67 | uint16_t x16 = __rdtsc() & 0xffff; 68 | uint32_t x32 = (uint32_t)x16 * 0x10001u; /* widen first: x16 promotes to int, and x16 * 0x10001 overflows int for x16 >= 0x8000 */ 69 | 70 | __m256i testcases_a16 = _mm256_set_epi16(0, 0, 0, 0, x16, x16, x16, x16, SHRT_MIN, SHRT_MIN, SHRT_MIN, SHRT_MIN, SHRT_MAX, SHRT_MAX, SHRT_MAX, SHRT_MAX); 71 | __m256i testcases_b16 = _mm256_set_epi16(0, x16, SHRT_MIN, SHRT_MAX, 0, x16, SHRT_MIN, SHRT_MAX, 0, x16, SHRT_MIN, SHRT_MAX, 0, x16, SHRT_MIN, SHRT_MAX); 72 | 73 | printRes16("Testcases_a16 :", testcases_a16); 74 | printRes16("Testcases_b16 :", testcases_b16); 75 | printRes16("Saturated signed add epi16:", _mm256_adds_epi16(testcases_a16, testcases_b16)); 76 | printRes16("Saturated signed sub epi16:", _mm256_subs_epi16(testcases_a16, testcases_b16)); 77 | printRes16("Saturated unsigned add epu16:", _mm256_adds_epu16(testcases_a16, testcases_b16)); 78 | printRes16("Saturated unsigned sub epu16:", _mm256_subs_epu16(testcases_a16, testcases_b16)); 79 | 80 | 81 | __m256i testcases_a32_0 = _mm256_set_epi32(0, 0, 0, 0, x32, x32, x32, x32); 82 | __m256i testcases_a32_1 = _mm256_set_epi32(INT_MIN, INT_MIN, INT_MIN, INT_MIN, INT_MAX, INT_MAX, INT_MAX, INT_MAX); /* INT_*: the lanes are 32-bit; LONG_* only matches where long is 32-bit */ 83 | __m256i testcases_b32_0 = _mm256_set_epi32(0, x32, INT_MIN, INT_MAX, 0, x32, INT_MIN, INT_MAX); 84 | __m256i testcases_b32_1 = _mm256_set_epi32(0, x32, INT_MIN, INT_MAX, 0, x32, INT_MIN, INT_MAX); 85 | 86 | printRes32("Testcases_a32_0 :", testcases_a32_0); 87 | printRes32("Testcases_a32_1 :", testcases_a32_1); 88 | printRes32("Testcases_b32_0 :", testcases_b32_0); 89 | printRes32("Testcases_b32_1 :", testcases_b32_1); 90 | printRes32("Saturated signed add epi32:", 
_mm256_adds_epi32(testcases_a32_0, testcases_b32_0)); 91 | printRes32("Saturated signed add epi32:", _mm256_adds_epi32(testcases_a32_1, testcases_b32_1)); 92 | printRes32("Saturated signed sub epi32:", _mm256_subs_epi32(testcases_a32_0, testcases_b32_0)); 93 | printRes32("Saturated signed sub epi32:", _mm256_subs_epi32(testcases_a32_1, testcases_b32_1)); 94 | printRes32("Saturated unsigned add epu32:", _mm256_adds_epu32(testcases_a32_0, testcases_b32_0)); 95 | printRes32("Saturated unsigned add epu32:", _mm256_adds_epu32(testcases_a32_1, testcases_b32_1)); 96 | printRes32("Saturated unsigned sub epu32:", _mm256_subs_epu32(testcases_a32_0, testcases_b32_0)); 97 | printRes32("Saturated unsigned sub epu32:", _mm256_subs_epu32(testcases_a32_1, testcases_b32_1)); 98 | } -------------------------------------------------------------------------------- /Byte2Byte_Asm.asm: -------------------------------------------------------------------------------- 1 | .data 2 | 3 | ;ident mapping 4 | ;byteconst_00_3f dq 00706050403020100h, 00f0e0d0c0b0a0908h, 01716151413121110h, 01f1e1d1c1b1a1918h, 02726252423222120h, 02f2e2d2c2b2a2928h, 03736353433323130h, 03f3e3d3c3b3a3938h 5 | ;byteconst_40_7f dq 04746454443424140h, 04f4e4d4c4b4a4948h, 05756555453525150h, 05f5e5d5c5b5a5958h, 06766656463626160h, 06f6e6d6c6b6a6968h, 07776757473727170h, 07f7e7d7c7b7a7978h 6 | ;byteconst_80_bf dq 08786858483828180h, 08f8e8d8c8b8a8988h, 09796959493929190h, 09f9e9d9c9b9a9998h, 0a7a6a5a4a3a2a1a0h, 0afaeadacabaaa9a8h, 0b7b6b5b4b3b2b1b0h, 0bfbebdbcbbbab9b8h 7 | ;byteconst_c0_ff dq 0c7c6c5c4c3c2c1c0h, 0cfcecdcccbcac9c8h, 0d7d6d5d4d3d2d1d0h, 0dfdedddcdbdad9d8h, 0e7e6e5e4e3e2e1e0h, 0efeeedecebeae9e8h, 0f7f6f5f4f3f2f1f0h, 0fffefdfcfbfaf9f8h 8 | 9 | ;+1 mapping 10 | byteconst_00_3f dq 00807060504030201h, 0100f0e0d0c0b0a09h, 01817161514131211h, 0201f1e1d1c1b1a19h, 02827262524232221h, 0302f2e2d2c2b2a29h, 03837363534333231h, 0403f3e3d3c3b3a39h 11 | byteconst_40_7f dq 04847464544434241h, 0504f4e4d4c4b4a49h, 
05857565554535251h, 0605f5e5d5c5b5a59h, 06867666564636261h, 0706f6e6d6c6b6a69h, 07877767574737271h, 0807f7e7d7c7b7a79h 12 | byteconst_80_bf dq 08887868584838281h, 0908f8e8d8c8b8a89h, 09897969594939291h, 0a09f9e9d9c9b9a99h, 0a8a7a6a5a4a3a2a1h, 0b0afaeadacabaaa9h, 0b8b7b6b5b4b3b2b1h, 0c0bfbebdbcbbbab9h 13 | byteconst_c0_ff dq 0c8c7c6c5c4c3c2c1h, 0d0cfcecdcccbcac9h, 0d8d7d6d5d4d3d2d1h, 0e0dfdedddcdbdad9h, 0e8e7e6e5e4e3e2e1h, 0f0efeeedecebeae9h, 0f8f7f6f5f4f3f2f1h, 000fffefdfcfbfaf9h 14 | 15 | inp0 dq 0f5e78b9234190de4h, 0b79b5e89124e4ca9h, 06549ba41bb976aa9h, 03566abb891220879h 16 | ;inp1 dq 04a2eff9876341568h, 03973abdeff67892ah, 0ce49735167564bdeh, 0b8790eff12537166h 17 | ; 18 | gfni_sra dq 08080808080808080h 19 | lsb dq 00101010101010101h 20 | ; 21 | repeats equ 1000000000 22 | 23 | .code 24 | 25 | MASKEDVPERMI2B MACRO LAT 26 | vpmovb2m k1, zmm0 27 | vmovdqa64 zmm2, zmm0 28 | vmovdqa64 zmm1, zmm0 29 | knotq k2, k1 30 | vpermi2b zmm2 {k1}{z}, zmm30, zmm31 31 | vpermi2b zmm1 {k2}{z}, zmm28, zmm29 32 | IF LAT EQ 0 33 | vporq zmm0, zmm1, zmm2 34 | ELSE 35 | vporq zmm3, zmm1, zmm2 36 | ENDIF 37 | ENDM 38 | 39 | KREGROUNDTRIP MACRO LAT 40 | vpmovb2m k0, zmm0 41 | vmovdqa64 zmm3, zmm0 42 | vmovdqa64 zmm1, zmm0 43 | vpermi2b zmm3, zmm28, zmm29 44 | vpermi2b zmm1, zmm30, zmm31 45 | vpmovm2b zmm2, k0 46 | vpternlogq zmm3, zmm1, zmm2, 0d8h ;c?b:a 47 | IF LAT EQ 0 48 | vmovdqa64 zmm0, zmm3 49 | ENDIF 50 | ENDM 51 | 52 | GFNI MACRO LAT 53 | vmovdqa64 zmm3, zmm0 54 | vmovdqa64 zmm1, zmm0 55 | vmovdqa64 zmm2, zmm0 56 | vpermi2b zmm3, zmm28, zmm29 57 | vpermi2b zmm1, zmm30, zmm31 58 | vgf2p8affineqb zmm2, zmm2, zmm27, 0 59 | vpternlogq zmm3, zmm1, zmm2, 0d8h ;c?b:a 60 | IF LAT EQ 0 61 | vmovdqa64 zmm0, zmm3 62 | ENDIF 63 | ENDM 64 | 65 | SRLQ MACRO LAT 66 | vmovdqa64 zmm1, zmm0 67 | vpermi2b zmm1, zmm30, zmm31 68 | vpsrlq zmm2, zmm0, 7 69 | vmovdqa64 zmm3, zmm0 70 | vpermi2b zmm3, zmm28, zmm29 71 | vpandq zmm2, zmm2, zmm27 72 | vpsubb zmm2, zmm26, zmm2 73 | vpternlogq zmm3, zmm1, 
zmm2, 0d8h ;c?b:a 74 | IF LAT EQ 0 75 | vmovdqa64 zmm0, zmm3 76 | ENDIF 77 | ENDM 78 | 79 | BLENDMB MACRO LAT 80 | vpmovb2m k1, zmm0 81 | vmovdqa64 zmm2, zmm0 82 | vmovdqa64 zmm1, zmm0 83 | vpermi2b zmm2, zmm30, zmm31 84 | vpermi2b zmm1, zmm28, zmm29 85 | IF LAT EQ 0 86 | vpblendmb zmm0{k1}, zmm1, zmm2 87 | ELSE 88 | vpblendmb zmm3{k1}, zmm1, zmm2 89 | ENDIF 90 | ENDM 91 | 92 | MINMAX MACRO LAT 93 | vmovdqa64 zmm1, zmm0 94 | vpermi2b zmm1, zmm30, zmm31 95 | vpmaxsb zmm2, zmm0, zmm27 96 | vmovdqa64 zmm3, zmm0 97 | vpermi2b zmm3, zmm28, zmm29 98 | vpminsb zmm2, zmm2, zmm26 99 | vpternlogq zmm3, zmm1, zmm2, 0d8h ;c?b:a 100 | IF LAT EQ 0 101 | vmovdqa64 zmm0, zmm3 102 | ENDIF 103 | ENDM 104 | 105 | B2B_WRAPPER MACRO FUNCNAME, M1, LAT 106 | FUNCNAME PROC 107 | push rbx 108 | push rdi 109 | push rsi 110 | 111 | IFIDNI , 112 | vpbroadcastq zmm27, qword ptr [gfni_sra] 113 | ELSEIFIDNI , 114 | vpxorq zmm26, zmm26, zmm26 115 | vpbroadcastq zmm27, qword ptr [lsb] 116 | ELSEIFIDNI , 117 | vpxorq zmm26, zmm26, zmm26 118 | vpternlogq zmm27, zmm27, zmm27, 0ffh 119 | ENDIF 120 | vmovdqu64 zmm28, zmmword ptr [byteconst_00_3f] 121 | vmovdqu64 zmm29, zmmword ptr [byteconst_40_7f] 122 | vmovdqu64 zmm30, zmmword ptr [byteconst_80_bf] 123 | vmovdqu64 zmm31, zmmword ptr [byteconst_c0_ff] 124 | vmovdqu64 zmm0, zmmword ptr [inp0] 125 | IF LAT EQ 0 126 | kxorq k0, k0, k0 127 | vmovdqu64 zmm4, zmm0 ;equ test 128 | ENDIF 129 | mfence 130 | rdtscp 131 | lfence 132 | 133 | mov esi, eax 134 | mov edi, edx 135 | 136 | mov ecx, repeats 137 | 138 | startlabel: 139 | M1 LAT 140 | dec ecx 141 | jnz startlabel 142 | 143 | mfence 144 | rdtscp 145 | lfence 146 | 147 | shl rdx, 20h 148 | shl rdi, 20h 149 | or rax, rdx 150 | or rsi, rdi 151 | 152 | sub rax, rsi 153 | 154 | IF LAT EQ 0 155 | vpcmpeqb k0, zmm0, zmm4 ;equ test 156 | ENDIF 157 | pop rsi 158 | pop rdi 159 | pop rbx 160 | ret 161 | FUNCNAME ENDP 162 | ENDM 163 | 164 | B2B_WRAPPER B2B_MASKEDVPERMI2B_LAT, MASKEDVPERMI2B, 0 165 | B2B_WRAPPER 
B2B_MASKEDVPERMI2B_TP, MASKEDVPERMI2B, 1 166 | 167 | B2B_WRAPPER B2B_KREGROUNDTRIP_LAT, KREGROUNDTRIP, 0 168 | B2B_WRAPPER B2B_KREGROUNDTRIP_TP, KREGROUNDTRIP, 1 169 | 170 | B2B_WRAPPER B2B_GFNI_LAT, GFNI, 0 171 | B2B_WRAPPER B2B_GFNI_TP, GFNI, 1 172 | 173 | B2B_WRAPPER B2B_SRLQ_LAT, SRLQ, 0 174 | B2B_WRAPPER B2B_SRLQ_TP, SRLQ, 1 175 | 176 | B2B_WRAPPER B2B_BLENDMB_LAT, BLENDMB, 0 177 | B2B_WRAPPER B2B_BLENDMB_TP, BLENDMB, 1 178 | 179 | B2B_WRAPPER B2B_MINMAX_LAT, MINMAX, 0 180 | B2B_WRAPPER B2B_MINMAX_TP, MINMAX, 1 181 | 182 | end -------------------------------------------------------------------------------- /Results/TZCNT_RKL.txt: -------------------------------------------------------------------------------- 1 | Vendor: "GenuineIntel" 2 | Family:6 Model:167 Stepping:1 (a0671) 3 | Brand: " 11th Gen Intel(R) Core(TM) i9-11900K @ 3.50GHz" 4 | 512b FPU DP ports : 1 5 | ---GPR---------- 6 | RDTSC : supported 7 | RDTSCP : supported 8 | CMOV : supported 9 | CMPX8 : supported 10 | CMPX16 : supported 11 | AMD64 : supported 12 | LAHF : supported 13 | MOVBE : supported 14 | ABM : supported 15 | POPCNT : supported 16 | RDRAND : supported 17 | RDSEED : supported 18 | ADX : supported 19 | BMI : supported 20 | BMI2 : supported 21 | MOVDIRI : unsupported 22 | MOVDIR64B : unsupported 23 | ---SIMD--------- 24 | SSE : supported 25 | SSE2 : supported 26 | SSE3 : supported 27 | SSSE3 : supported 28 | SSE41 : supported 29 | SSE42 : supported 30 | SSE4A : unsupported 31 | CLMUL : supported 32 | AES : supported 33 | SHA : supported 34 | AVX : supported, OS enabled 35 | AVX2 : supported, OS enabled 36 | FMA : supported, OS enabled 37 | F16C : supported, OS enabled 38 | GFNI : supported 39 | VAES : supported 40 | VPCLMULQDQ : supported 41 | KEYLOCK : unsupported 42 | AVX_VNNI : unsupported 43 | ---AVX512------- 44 | AVX512F : supported, OS enabled 45 | AVX512CD : supported, OS enabled 46 | AVX512ER : unsupported 47 | AVX512PF : unsupported 48 | AVX512BW : supported, OS enabled 49 | 
AVX512DQ : supported, OS enabled 50 | AVX512VL : supported, OS enabled 51 | AVX512VBMI : supported, OS enabled 52 | AVX512IFMA : supported, OS enabled 53 | AVX512VNNI : supported, OS enabled 54 | AVX512_4VNNIW : unsupported 55 | AVX512_4FMAPS : unsupported 56 | AVX512_VPOPCNTDQ : supported, OS enabled 57 | AVX512_BITALG : supported, OS enabled 58 | AVX512_VBMI2 : supported, OS enabled 59 | AVX512_BF16 : unsupported 60 | AVX512_VP2INTERSECT : unsupported 61 | AVX512_FP16 : unsupported 62 | ---AMX---------- 63 | AMX-BF16 : unsupported 64 | AMX-INT8 : unsupported 65 | AMX-TILE : unsupported 66 | ---CacheLine---- 67 | PREFETCHW : supported 68 | PREFETCHWT1 : unsupported 69 | CLFLUSH : supported 70 | CLFLUSHOPT : supported 71 | CLWB : unsupported 72 | CLZERO : unsupported 73 | CLDEMOTE : unsupported 74 | ---Misc--------- 75 | LNOP : supported 76 | SERIALIZE : unsupported 77 | HYBRID : unsupported 78 | ---Deprecated--- 79 | X87 : supported 80 | MMX : supported 81 | MMX+ : unsupported 82 | 3DNow! 
: unsupported 83 | 3DNow!+ : unsupported 84 | XOP : unsupported 85 | FMA4 : unsupported 86 | TBM : unsupported 87 | --- AVX512_BITALG & AVX512_VPOPCNTDQ SIMD TZCNT --- 88 | x128 :0000000100020004 1000200040008000 89 | _mm_tzcnt_epi8 :0808080008010802 0408050806080708 90 | _mm_tzcnt_epi16 :0010000000010002 000c000d000e000f 91 | _mm_tzcnt_epi32 :0000000000000002 0000000d0000000f 92 | _mm_tzcnt_epi64 :0000000000000002 000000000000000f 93 | x256 :0000000100020004 1000200040008000 7f007e007c007800 7000600040000000 94 | _mm256_tzcnt_epi8 :0808080008010802 0408050806080708 0008010802080308 0408050806080808 95 | _mm256_tzcnt_epi16 :0010000000010002 000c000d000e000f 00080009000a000b 000c000d000e0010 96 | _mm256_tzcnt_epi32 :0000000000000002 0000000d0000000f 000000090000000b 0000000d0000001e 97 | _mm256_tzcnt_epi64 :0000000000000002 000000000000000f 000000000000000b 000000000000001e 98 | x512 :0000000100020004 1000200040008000 7f007e007c007800 7000600000000000 fffffffefffcfff8 fff0ffe0ffc0ff80 ff00fe00fc00f800 f000e000c0008000 99 | _mm512_tzcnt_epi8 :0808080008010802 0408050806080708 0008010802080308 0408050808080808 0000000100020003 0004000500060007 0008010802080308 0408050806080708 100 | _mm512_tzcnt_epi16 :0010000000010002 000c000d000e000f 00080009000a000b 000c000d00100010 0000000100020003 0004000500060007 00080009000a000b 000c000d000e000f 101 | _mm512_tzcnt_epi32 :0000000000000002 0000000d0000000f 000000090000000b 0000000d00000020 0000000100000003 0000000500000007 000000090000000b 0000000d0000000f 102 | _mm512_tzcnt_epi64 :0000000000000002 000000000000000f 000000000000000b 000000000000002d 0000000000000003 0000000000000007 000000000000000b 000000000000000f 103 | TSC CLKs:---------------------- 104 | _mm_tzcnt_epi8_asm :5.01382 105 | _mm_tzcnt_epi16_asm :5.00762 106 | _mm_tzcnt_epi32_asm :5.00959 107 | _mm_tzcnt_epi64_asm :5.01166 108 | _mm256_tzcnt_epi8_asm :5.00704 109 | _mm256_tzcnt_epi16_asm :5.00701 110 | _mm256_tzcnt_epi32_asm :5.00662 111 | _mm256_tzcnt_epi64_asm 
:5.009 112 | _mm512_tzcnt_epi8_asm :5.01387 113 | _mm512_tzcnt_epi16_asm :5.01192 114 | _mm512_tzcnt_epi32_asm :5.00803 115 | _mm512_tzcnt_epi64_asm :5.00779 116 | _mm256_tzcnt_epi32_lzcnt_asm :8.80157 117 | _mm512_tzcnt_epi32_lzcnt_asm :8.07228 118 | _mm256_tzcnt_epi64_lzcnt_asm :8.82087 119 | _mm512_tzcnt_epi64_lzcnt_asm :8.05901 120 | =================================== 121 | -------------------------------------------------------------------------------- /Results/TZCNT_WLC.txt: -------------------------------------------------------------------------------- 1 | Vendor: "GenuineIntel" 2 | Family:6 Model:140 Stepping:1 (806c1) 3 | Brand: " 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz" 4 | 512b FPU DP ports : 1 5 | ---GPR---------- 6 | RDTSC : supported 7 | RDTSCP : supported 8 | CMOV : supported 9 | CMPX8 : supported 10 | CMPX16 : supported 11 | AMD64 : supported 12 | LAHF : supported 13 | MOVBE : supported 14 | ABM : supported 15 | POPCNT : supported 16 | RDRAND : supported 17 | RDSEED : supported 18 | ADX : supported 19 | BMI : supported 20 | BMI2 : supported 21 | MOVDIRI : supported 22 | MOVDIR64B : supported 23 | ---SIMD--------- 24 | SSE : supported 25 | SSE2 : supported 26 | SSE3 : supported 27 | SSSE3 : supported 28 | SSE41 : supported 29 | SSE42 : supported 30 | SSE4A : unsupported 31 | CLMUL : supported 32 | AES : supported 33 | SHA : supported 34 | AVX : supported, OS enabled 35 | AVX2 : supported, OS enabled 36 | FMA : supported, OS enabled 37 | F16C : supported, OS enabled 38 | GFNI : supported 39 | VAES : supported 40 | VPCLMULQDQ : supported 41 | KEYLOCK : supported, OS disabled 42 | AVX_VNNI : unsupported 43 | ---AVX512------- 44 | AVX512F : supported, OS enabled 45 | AVX512CD : supported, OS enabled 46 | AVX512ER : unsupported 47 | AVX512PF : unsupported 48 | AVX512BW : supported, OS enabled 49 | AVX512DQ : supported, OS enabled 50 | AVX512VL : supported, OS enabled 51 | AVX512VBMI : supported, OS enabled 52 | AVX512IFMA : supported, OS enabled 
53 | AVX512VNNI : supported, OS enabled 54 | AVX512_4VNNIW : unsupported 55 | AVX512_4FMAPS : unsupported 56 | AVX512_VPOPCNTDQ : supported, OS enabled 57 | AVX512_BITALG : supported, OS enabled 58 | AVX512_VBMI2 : supported, OS enabled 59 | AVX512_BF16 : unsupported 60 | AVX512_VP2INTERSECT : supported, OS enabled 61 | AVX512_FP16 : unsupported 62 | ---AMX---------- 63 | AMX-BF16 : unsupported 64 | AMX-INT8 : unsupported 65 | AMX-TILE : unsupported 66 | ---CacheLine---- 67 | PREFETCHW : supported 68 | PREFETCHWT1 : unsupported 69 | CLFLUSH : supported 70 | CLFLUSHOPT : supported 71 | CLWB : supported 72 | CLZERO : unsupported 73 | CLDEMOTE : unsupported 74 | ---Misc--------- 75 | LNOP : supported 76 | SERIALIZE : unsupported 77 | HYBRID : unsupported 78 | ---Deprecated--- 79 | X87 : supported 80 | MMX : supported 81 | MMX+ : unsupported 82 | 3DNow! : unsupported 83 | 3DNow!+ : unsupported 84 | XOP : unsupported 85 | FMA4 : unsupported 86 | TBM : unsupported 87 | --- AVX512_BITALG & AVX512_VPOPCNTDQ SIMD TZCNT --- 88 | x128 :0000000100020004 1000200040008000 89 | _mm_tzcnt_epi8 :0808080008010802 0408050806080708 90 | _mm_tzcnt_epi16 :0010000000010002 000c000d000e000f 91 | _mm_tzcnt_epi32 :0000000000000002 0000000d0000000f 92 | _mm_tzcnt_epi64 :0000000000000002 000000000000000f 93 | x256 :0000000100020004 1000200040008000 7f007e007c007800 7000600040000000 94 | _mm256_tzcnt_epi8 :0808080008010802 0408050806080708 0008010802080308 0408050806080808 95 | _mm256_tzcnt_epi16 :0010000000010002 000c000d000e000f 00080009000a000b 000c000d000e0010 96 | _mm256_tzcnt_epi32 :0000000000000002 0000000d0000000f 000000090000000b 0000000d0000001e 97 | _mm256_tzcnt_epi64 :0000000000000002 000000000000000f 000000000000000b 000000000000001e 98 | x512 :0000000100020004 1000200040008000 7f007e007c007800 7000600000000000 fffffffefffcfff8 fff0ffe0ffc0ff80 ff00fe00fc00f800 f000e000c0008000 99 | _mm512_tzcnt_epi8 :0808080008010802 0408050806080708 0008010802080308 0408050808080808 
0000000100020003 0004000500060007 0008010802080308 0408050806080708 100 | _mm512_tzcnt_epi16 :0010000000010002 000c000d000e000f 00080009000a000b 000c000d00100010 0000000100020003 0004000500060007 00080009000a000b 000c000d000e000f 101 | _mm512_tzcnt_epi32 :0000000000000002 0000000d0000000f 000000090000000b 0000000d00000020 0000000100000003 0000000500000007 000000090000000b 0000000d0000000f 102 | _mm512_tzcnt_epi64 :0000000000000002 000000000000000f 000000000000000b 000000000000002d 0000000000000003 0000000000000007 000000000000000b 000000000000000f 103 | TSC CLKs:---------------------- 104 | _mm_tzcnt_epi8_asm :5.07324 105 | _mm_tzcnt_epi16_asm :5.06268 106 | _mm_tzcnt_epi32_asm :5.06161 107 | _mm_tzcnt_epi64_asm :5.06192 108 | _mm256_tzcnt_epi8_asm :5.06735 109 | _mm256_tzcnt_epi16_asm :5.06211 110 | _mm256_tzcnt_epi32_asm :5.06211 111 | _mm256_tzcnt_epi64_asm :5.05998 112 | _mm512_tzcnt_epi8_asm :5.06633 113 | _mm512_tzcnt_epi16_asm :5.06351 114 | _mm512_tzcnt_epi32_asm :5.0629 115 | _mm512_tzcnt_epi64_asm :5.06812 116 | _mm256_tzcnt_epi32_lzcnt_asm :8.92335 117 | _mm512_tzcnt_epi32_lzcnt_asm :8.19745 118 | _mm256_tzcnt_epi64_lzcnt_asm :8.92455 119 | _mm512_tzcnt_epi64_lzcnt_asm :8.18332 120 | =================================== 121 | -------------------------------------------------------------------------------- /Misc.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | 3 | using namespace std; 4 | 5 | void printRes8(const char* name, __m128i res) { 6 | cout << setw(24) << left << setfill(' ') << name; 7 | for (int i = sizeof(__m128i) / sizeof(uint8_t) - 1; i >= 0; i--) 8 | cout << hex << setw(2) << setfill('0') << right << +*((uint8_t*)&res + i) << ' '; 9 | cout << endl; 10 | } 11 | 12 | void printRes8(const char* name, __m256i res) { 13 | cout << setw(24) << left << setfill(' ') << name; 14 | for (int i = sizeof(__m256i) / sizeof(uint8_t) - 1; i >= 0; i--) 15 | cout << hex << setw(2) << setfill('0') 
<< right << +*((uint8_t*)&res + i) << ' '; 16 | cout << endl; 17 | } 18 | 19 | void printRes8(const char* name, __m512i res) { 20 | cout << setw(24) << left << setfill(' ') << name; 21 | for (int i = sizeof(__m512i) / sizeof(uint8_t) - 1; i >= 0; i--) 22 | cout << hex << setw(2) << setfill('0') << right << +*((uint8_t*)&res + i) << ' '; 23 | cout << endl; 24 | } 25 | 26 | void printRes16(const char * name, __m128i res) { 27 | cout << setw(24) << left << setfill(' ') << name; 28 | for (int i = sizeof(__m128i) / sizeof(short) - 1; i >= 0; i--) 29 | cout << hex << setw(4) << setfill('0') << right << *((unsigned __int16*)&res + i) << ' '; 30 | cout << endl; 31 | } 32 | 33 | void printRes16(const char * name, __m256i res) { 34 | cout << setw(24) << left << setfill(' ') << name; 35 | for (int i = sizeof(__m256i) / sizeof(short) - 1; i >= 0; i--) 36 | cout << hex << setw(4) << setfill('0') << right << *((unsigned __int16*)&res + i) << ' '; 37 | cout << endl; 38 | } 39 | 40 | void printRes16(const char * name, __m512i res) { 41 | cout << setw(24) << left << setfill(' ') << name; 42 | for (int i = sizeof(__m512i) / sizeof(short) - 1; i >= 0; i--) 43 | cout << hex << setw(4) << setfill('0') << right << *((unsigned __int16*)&res + i) << ' '; 44 | cout << endl; 45 | } 46 | 47 | void printRes32(const char * name, __m128i res) { 48 | cout << setw(24) << left << setfill(' ') << name; 49 | for (int i = sizeof(__m128i) / sizeof(long) - 1; i >= 0; i--) 50 | cout << hex << setw(8) << setfill('0') << right << *((unsigned __int32*)&res + i) << ' '; 51 | cout << endl; 52 | } 53 | 54 | void printRes32(const char * name, __m256i res) { 55 | cout << setw(24) << left << setfill(' ') << name; 56 | for (int i = sizeof(__m256i) / sizeof(long) - 1; i >= 0; i--) 57 | cout << hex << setw(8) << setfill('0') << right << *((unsigned __int32*)&res + i) << ' '; 58 | cout << endl; 59 | } 60 | 61 | void printRes32(const char * name, __m512i res) { 62 | cout << setw(24) << left << setfill(' ') << name; 
63 | for (int i = sizeof(__m512i) / sizeof(long) - 1; i >= 0; i--) 64 | cout << hex << setw(8) << setfill('0') << right << *((unsigned __int32*)&res + i) << ' '; 65 | cout << endl; 66 | } 67 | 68 | void printRes(const char * name, __m128i res) { 69 | cout << setw(24) << left << setfill(' ') << name; 70 | for (int i = sizeof(__m128i) / sizeof(long long) - 1; i >= 0; i--) 71 | cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' '; 72 | cout << endl; 73 | } 74 | 75 | void printRes(const char * name, __m256i res) { 76 | cout << setw(24) << left << setfill(' ') << name; 77 | for (int i = sizeof(__m256i) / sizeof(long long) - 1; i >= 0; i--) 78 | cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' '; 79 | cout << endl; 80 | } 81 | 82 | void printRes(const char * name, __m512i res) { 83 | cout << setw(24) << left << setfill(' ') << name; 84 | for (int i = sizeof(__m512i) / sizeof(long long) - 1; i >= 0; i--) 85 | cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' '; 86 | cout << endl; 87 | } 88 | 89 | void printRes(int r, const char * name, __m128i res) { /* prints row index r, the label, then the 64-bit lanes from highest to lowest in hex */ 90 | cout << dec << setw(2) << r << ':' << setw(24) << left << setfill(' ') << name; /* dec: the hex flag left by earlier output is sticky; matches printRes(int r, __m128i) */ 91 | for (int i = sizeof(__m128i) / sizeof(long long) - 1; i >= 0; i--) 92 | cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' '; 93 | cout << endl; 94 | } 95 | 96 | void printRes(int r, const char * name, __m256i res) { /* 256-bit variant of the row-indexed, labelled printer */ 97 | cout << dec << setw(2) << r << ':' << setw(24) << left << setfill(' ') << name; /* dec: reset the sticky hex flag before printing the row index */ 98 | for (int i = sizeof(__m256i) / sizeof(long long) - 1; i >= 0; i--) 99 | cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' '; 100 | cout << endl; 101 | } 102 | 103 | void printRes(int r, const char * name, __m512i res) { /* 512-bit variant of the row-indexed, labelled printer */ 104 | cout << dec << setw(2) << r << ':' << setw(24) << left << setfill(' ') << name; /* dec: reset the sticky hex flag before printing the row index */ 105 | for (int i = sizeof(__m512i) / sizeof(long long) - 1; i >= 0; 
i--) 106 | cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' '; 107 | cout << endl; 108 | } 109 | 110 | void printRes(int r, __m128i res) { 111 | cout << dec << setw(2) << r << ':'; 112 | for (int i = sizeof(__m128i) / sizeof(long long) - 1; i >= 0; i--) 113 | cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' '; 114 | cout << endl; 115 | } 116 | 117 | void printRes(int r, __m256i res) { 118 | cout << dec << setw(2) << r << ':'; 119 | for (int i = sizeof(__m256i) / sizeof(long long) - 1; i >= 0; i--) 120 | cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' '; 121 | cout << endl; 122 | } 123 | 124 | void printRes(int r, __m512i res) { 125 | cout << dec << setw(2) << r << ':'; 126 | for (int i = sizeof(__m512i) / sizeof(long long) - 1; i >= 0; i--) 127 | cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' '; 128 | cout << endl; 129 | } 130 | 131 | // Intel SDM 32546276.pdf p.1739 132 | uint64_t serialized_tsc(void) { 133 | uint64_t tsc; 134 | uint32_t tsc_aux; 135 | _mm_mfence(); 136 | tsc = __rdtscp(&tsc_aux); 137 | _mm_lfence(); 138 | return tsc; 139 | } 140 | 141 | void random_wrap(unsigned int * random) { 142 | while (!_rdrand32_step(random)); 143 | } 144 | 145 | void random_wrap(signed int * random) { 146 | while (!_rdrand32_step((unsigned int *)random)); 147 | } 148 | 149 | #if defined (_M_X64) 150 | void random_wrap(unsigned long long * random) { 151 | while (!_rdrand64_step(random)); 152 | } 153 | 154 | void random_wrap(signed long long * random) { 155 | while (!_rdrand64_step((unsigned long long *)random)); 156 | } 157 | #endif 158 | 159 | void SetThread(size_t threadindex) { 160 | size_t t = (size_t)1 << threadindex; 161 | cout << "Procmask:0x" << hex << setfill('0') << setw(sizeof(size_t) * 2) << right << t << dec << setfill(' ') << endl; 162 | SetProcessAffinityMask(GetCurrentProcess(), t); 163 | 
SetThreadAffinityMask(GetCurrentThread(), t); 164 | Sleep(0); 165 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # InstLatX64_Demo 2 | 3 | A collection of some ideas in source-code form 4 | 5 | ## GFNI_Demo.h - 6 | wrapper header for non-cryptographic use of the (V)GF2P8AFFINEQB instruction in the style of Intel intrinsics: 7 | * emulating the missing byte-granularity shift and rotate instructions; 8 | ``` 9 | _(mm|mm256|mm512)(|_mask|_maskz)_(srli|srl|srai|sra|slli|sll|ror|rol)_gfni_epi8 10 | ``` 11 | * variable versions are also supported with the GF2P8MULB instruction: 12 | ``` 13 | _(mm|mm256|mm512)(|_mask|_maskz)_(srlv|sllv|rorv|rolv)_gfni_epi8 14 | ``` 15 | 16 | * revbit, bit-broadcast, prefix-xor operations for bytes 17 | ``` 18 | _(mm|mm256|mm512)(|_mask|_maskz)_(revbit|bcstbit|prefix_xor)_epi8 19 | ``` 20 | 21 | * rotate, mirror, multiplication operations for 8x8 bit matrices 22 | ``` 23 | _(mm|mm256|mm512)(|_mask|_maskz)_(mirror|rotate|multiplication)_8x8 24 | ``` 25 | 26 | * auxiliary: the imm8 operand of (V)GF2P8AFFINEQB XORs the result bytes, so it is useful e.g. 
for inverting all of the above functions, or for broadcasting a byte known at compile time without using GPRs, Port5 or memory 27 | ``` 28 | _(mm|mm256|mm512)(|_mask|_maskz)_(inverse|set1_gfni)_epi8 29 | ``` 30 | 31 | * entire register pospopcount (if AVX512_BITALG & AVX512_VPOPCNTDQ are also supported): 32 | ``` 33 | _(mm|mm256|mm512)_pospopcount_(u8|u16)_(si128|si256|si512)_epi8 34 | ``` 35 | 36 | * tzcnt, lzcnt for bytes (idea from https://gist.github.com/animetosho/6cb732ccb5ecd86675ca0a442b3c0622) 37 | ``` 38 | _(mm|mm256|mm512)(|_mask|_maskz)_(tzcnt|lzcnt)_gfni_epi8 39 | ``` 40 | 41 | # VBMI2_Demo.h 42 | wrapper header for the VPSHLDW/VPSHRDW/VPSHLDVW/VPSHRDVW instructions, substituting the missing VPROLW/VPRORW/VPROLVW/VPRORVW instructions with the good old shld r1, r1 = rol r1 trick 43 | ``` 44 | _(mm|mm256|mm512)(|_mask|_maskz)_(ror|rol)_vbmi2_epi16 45 | ``` 46 | wrapper header for emulating the missing byte-granularity shift and rotate instructions, in variable versions too 47 | ``` 48 | _(mm|mm256|mm512)(|_mask|_maskz)_(slli|srli|srai|ror|rol)_vbmi2_epi8 49 | _(mm|mm256|mm512)(|_mask|_maskz)_(sllv|srlv|srav|rorv|rolv)_vbmi2_epi8 50 | ``` 51 | 52 | # VPCLMULQDQ_Demo.h 53 | experimental implementation of an entire-register (128/256/512b, xmm/ymm/zmm) prefix-xor operation with the VPCLMULQDQ extension 54 | ``` 55 | _mm_prefix_xor_clmul_si128(__m128i a); 56 | _mm256_prefix_xor_clmul_si256(__m256i a); 57 | _mm512_prefix_xor_clmul_si512(__m512i a); 58 | ``` 59 | 60 | # Compiler_Intrinsic_Test.cpp 61 | for testing Visual Studio AVX512 capabilities 62 | # TZCNT_Demo.cpp 63 | Emulating the missing SIMD VPTZCNTB / VPTZCNTW / VPTZCNTD / VPTZCNTQ instructions 64 | # LZCNT_Demo.cpp 65 | Emulating the missing SIMD VPLZCNTB / VPLZCNTW instructions 66 | # PEXT_PDEP_Emu.cpp 67 | Faster PEXT and PDEP emulation for AMD Excavator/Zen/Zen+/Zen2 based on Zach Wegner's ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill) 68 | # CPU_Props.* 69 | detection of CPU properties for 
dispatching code paths 70 | # AVX512_DecimalPrint.* 71 | AVX512F, AVX512IFMA based implementation of _ultoa, _ltoa, _ui64toa, _i64toa functions. 72 | # AVX512_KMemDst.* 73 | code for examining the effect of the k mask register value on the EVEX-decoded instructions with a memory destination 74 | # Zen4_Demo.* 75 | code for examining instructions on the AMD Zen4/Raphael CPU (CPUID A60F12). It is based on ideas from uops.info. Output example: \Results\Zen4_Demo_Imm8.txt 76 | # B2B_Demo.* 77 | VPERMI2B based code for fast any-to-any byte replacement. It can be useful e.g. for tolower/toupper type conversions or isxdigit/isalnum type classifications. 78 | [Performance results:](https://gist.github.com/InstLatx64/a5c60b714ef04ebe77f0b63639b36fd0) 79 | # AVX512_Reduce_Add.* 80 | (DB)SAD based _mm512_reduce_add_epu8/16/32/64 implementation 81 | # AVX512_Saturated_AddSub.* 82 | _mm512_adds/subs_epi/epu/32/64 implementation 83 | # FirstByte.* 84 | Finding first byte in lanes 85 | _mm256|512_firstbyte_epu32/64 implementation 86 | # HWBITPERM.* 87 | SVE2 vector BITPERM (BEXT/BDEP/BGRP) emulation with HW scalar BMI2 PEXT/PDEP instructions 88 | # AVX512_BGVSER.* 89 | Byte-Granularity Variable Shift on Entire Register 90 | ``` 91 | _(mm256|mm512)_(bsll|bsrl)_epi(256|512) [placeholder] 92 | _(mm256|mm512)_palign(l|r)_epi(256|512) 93 | _(mm256|mm512)_rotate(l|r)_epi(256|512) 94 | ``` 95 | # AVX_VNNI_INT16_Saturated_AddSub.* 96 | AVX_VNNI_INT16 based _(mm|mm256)_(adds|subs)_epi32 emulation proposal 97 | # P06P1.* 98 | Test code for an Intel Golden Cove / Raptor Cove / Redwood Cove / Lion Cove imm64-related anomaly 99 | 100 | ## References 101 | * Geoff Langdale [Why Ice Lake is Important (a bit-basher’s perspective)](https://branchfree.org/2019/05/29/why-ice-lake-is-important-a-bit-bashers-perspective/) 102 | * Marcus D. R. 
Klarqvist, Wojciech Muła, Daniel Lemire [Efficient Computation of Positional Population Counts Using SIMD Instructions](https://arxiv.org/abs/1911.02696) 103 | * Wojciech Muła [AVX512VBMI — remove spaces from text](http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html) 104 | * Zach Wegner [ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)](https://github.com/zwegner/zp7) 105 | * Abel, Andreas and Reineke, Jan [uops.info: Characterizing Latency, Throughput, and Port Usage of Instructions on Intel Microarchitectures](https://arxiv.org/pdf/1810.04610.pdf) 106 | * PerforatedBlob [TZCNT - TERNLOG->ANDN](https://twitter.com/PerforatedBlob/status/1418421045447454724) 107 | * TravisDowns [Scalar/HW GPR PDEP/PEXT reference code](https://twitter.com/trav_downs/status/1418616866080116742) 108 | * Daniel Lemire [Converting integers to decimal strings faster with AVX-512](https://lemire.me/blog/2022/03/28/converting-integers-to-decimal-strings-faster-with-avx-512/) 109 | * KMemDst results: [Intel SKX/CNL/TGL/RKL/ADL, AMD RPH](https://gist.github.com/InstLatx64/c7efbc71706561706888d7aa0548c4c5) 110 | * [Geoff Langdale's Byte2Byte question](https://twitter.com/geofflangdale/status/1406084804613861379) 111 | * [Geoff Langdale's reduce_add inspiration](https://twitter.com/geofflangdale/status/1609575574946865154) 112 | * A list of “out-of-band” uses for the GF2P8AFFINEQB instruction I haven't seen documented elsewhere: [idea of tzcnt/lzcnt_gfni_epi8, sllv/srlv_gfni_epi8](https://gist.github.com/animetosho/6cb732ccb5ecd86675ca0a442b3c0622) 113 | * [FirstByte inspiration](http://0x80.pl/notesen/2023-02-06-avx512-find-first-byte-in-lane.html) 114 | * Robert Clausecker [BGVSER inspiration](https://twitter.com/FUZxxl/status/1696448029358801311) 115 | * Tavian Barnes: [The Alder Lake anomaly, explained](https://tavianator.com/2025/shlxplained.html) -------------------------------------------------------------------------------- /TZCNT_Demo.cpp: 
-------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "TZCNT_Demo.h" 3 | /* 4 | VPTZCNTB/W/D/Q proposal: 5 | - tzcnt(a) = popcnt(tzmsk(a)) = popcnt(~a & (a-1)) = popcnt(andn(a, a-1)) = popcnt(andn(a, a+(-1))) (thx, @PerforatedBlob! https://twitter.com/PerforatedBlob/status/1418421045447454724) 6 | - zero case handled 7 | POPCNT vs LZCNT: 8 | - Byte/Word support too, not just DWord/QWord 9 | - faster (const 5 vs 8/9 clks on TGL, RKL) 10 | - only 1 const 11 | */ 12 | 13 | extern CPU_Props cpu_props; 14 | 15 | using namespace std; 16 | 17 | __m128i __vectorcall _mm_tzcnt_epi8(__m128i a) { 18 | __m128i u = _mm_undefined_si128(); 19 | return _mm_popcnt_epi8(_mm_andnot_si128(a, _mm_add_epi8(a, _mm_cmpeq_epi8(u, u)))); 20 | } 21 | 22 | __m256i __vectorcall _mm256_tzcnt_epi8(__m256i a) { 23 | __m256i u = _mm256_undefined_si256(); 24 | return _mm256_popcnt_epi8(_mm256_andnot_si256(a, _mm256_add_epi8(a, _mm256_cmpeq_epi8(u, u)))); 25 | } 26 | 27 | __m512i __vectorcall _mm512_tzcnt_epi8(__m512i a) { 28 | __m512i u = _mm512_undefined_epi32(); 29 | return _mm512_popcnt_epi8(_mm512_andnot_si512(a, _mm512_add_epi8(a, _mm512_ternarylogic_epi32(u, u, u, 0xff)))); 30 | } 31 | 32 | __m128i __vectorcall _mm_tzcnt_epi16(__m128i a) { 33 | __m128i u = _mm_undefined_si128(); 34 | return _mm_popcnt_epi16(_mm_andnot_si128(a, _mm_add_epi16(a, _mm_cmpeq_epi16(u, u)))); 35 | } 36 | 37 | __m256i __vectorcall _mm256_tzcnt_epi16(__m256i a) { 38 | __m256i u = _mm256_undefined_si256(); 39 | return _mm256_popcnt_epi16(_mm256_andnot_si256(a,_mm256_add_epi16(a, _mm256_cmpeq_epi16(u, u)))); 40 | } 41 | 42 | __m512i __vectorcall _mm512_tzcnt_epi16(__m512i a) { 43 | __m512i u = _mm512_undefined_epi32(); 44 | return _mm512_popcnt_epi16(_mm512_andnot_si512(a, _mm512_add_epi16(a, _mm512_ternarylogic_epi32(u, u, u, 0xff)))); 45 | } 46 | 47 | __m128i __vectorcall _mm_tzcnt_epi32(__m128i a) { 48 | __m128i u = _mm_undefined_si128(); 49 | return 
_mm_popcnt_epi32(_mm_andnot_si128(a, _mm_add_epi32(a, _mm_cmpeq_epi32(u, u)))); 50 | } 51 | 52 | __m256i __vectorcall _mm256_tzcnt_epi32(__m256i a) { 53 | __m256i u = _mm256_undefined_si256(); 54 | return _mm256_popcnt_epi32(_mm256_andnot_si256(a, _mm256_add_epi32(a, _mm256_cmpeq_epi32(u, u)))); 55 | } 56 | 57 | __m512i __vectorcall _mm512_tzcnt_epi32(__m512i a) { 58 | __m512i u = _mm512_undefined_epi32(); 59 | return _mm512_popcnt_epi32(_mm512_andnot_si512(a, _mm512_add_epi32(a, _mm512_ternarylogic_epi32(u, u, u, 0xff)))); 60 | } 61 | 62 | __m128i __vectorcall _mm_tzcnt_epi64(__m128i a) { 63 | __m128i u = _mm_undefined_si128(); 64 | return _mm_popcnt_epi64(_mm_andnot_si128(a, _mm_add_epi64(a, _mm_cmpeq_epi64(u, u)))); 65 | } 66 | 67 | __m256i __vectorcall _mm256_tzcnt_epi64(__m256i a) { 68 | __m256i u = _mm256_undefined_si256(); 69 | return _mm256_popcnt_epi64(_mm256_andnot_si256(a, _mm256_add_epi64(a, _mm256_cmpeq_epi64(u, u)))); 70 | } 71 | 72 | __m512i __vectorcall _mm512_tzcnt_epi64(__m512i a) { 73 | __m512i u = _mm512_undefined_epi32(); 74 | return _mm512_popcnt_epi64(_mm512_andnot_si512(a, _mm512_add_epi64(a, _mm512_ternarylogic_epi64(u, u, u, 0xff)))); 75 | } 76 | 77 | void TZCNT_Test(void) { 78 | cout << "--- AVX512_BITALG & AVX512_VPOPCNTDQ SIMD TZCNT ---" << dec << right << endl; 79 | __m128i x128 = _mm_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768), y128 = _mm_set1_epi16(0x7f); 80 | __m256i x256 = _mm256_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768, 0x07f00, 0x07e00, 0x07c00, 0x07800, 0x07000, 0x06000, 0x04000, 0x00000), y256 = _mm256_set1_epi16(0x7f); 81 | __m512i x512 = _mm512_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768, 0x07f00, 0x07e00, 0x07c00, 0x07800, 0x07000, 0x06000, 0x00000, 0x00000, -1, -2, -4, -8, -16, -32, -64, -128, -256, -512, -1024, -2048, -4096, -8192, -16384, -32768), y512 = _mm512_set1_epi16(0x7f); 82 | __mmask8 m8 = _INSTLATX64_DEMO_TESTMASK_8; 83 | 
__mmask16 m16 = _INSTLATX64_DEMO_TESTMASK_16; 84 | 85 | printRes("x128 :", x128); 86 | printRes("_mm_tzcnt_epi8 :", _mm_tzcnt_epi8(x128)); 87 | printRes("_mm_tzcnt_epi16 :", _mm_tzcnt_epi16(x128)); 88 | printRes("_mm_tzcnt_epi32 :", _mm_tzcnt_epi32(x128)); 89 | printRes("_mm_tzcnt_epi64 :", _mm_tzcnt_epi64(x128)); 90 | 91 | printRes("x256 :", x256); 92 | printRes("_mm256_tzcnt_epi8 :", _mm256_tzcnt_epi8(x256)); 93 | printRes("_mm256_tzcnt_epi16 :", _mm256_tzcnt_epi16(x256)); 94 | printRes("_mm256_tzcnt_epi32 :", _mm256_tzcnt_epi32(x256)); 95 | printRes("_mm256_tzcnt_epi64 :", _mm256_tzcnt_epi64(x256)); 96 | 97 | printRes("x512 :", x512); 98 | printRes("_mm512_tzcnt_epi8 :", _mm512_tzcnt_epi8(x512)); 99 | printRes("_mm512_tzcnt_epi16 :", _mm512_tzcnt_epi16(x512)); 100 | printRes("_mm512_tzcnt_epi32 :", _mm512_tzcnt_epi32(x512)); 101 | printRes("_mm512_tzcnt_epi64 :", _mm512_tzcnt_epi64(x512)); 102 | 103 | cout << "TSC CLKs:----------------------" << endl; 104 | 105 | cout << "_mm_tzcnt_epi8_asm :" << (double)_mm_tzcnt_epi8_asm_timed() / (double)TZCNT_REPEATS << endl; 106 | cout << "_mm_tzcnt_epi16_asm :" << (double)_mm_tzcnt_epi16_asm_timed() / (double)TZCNT_REPEATS << endl; 107 | cout << "_mm_tzcnt_epi32_asm :" << (double)_mm_tzcnt_epi32_asm_timed() / (double)TZCNT_REPEATS << endl; 108 | cout << "_mm_tzcnt_epi64_asm :" << (double)_mm_tzcnt_epi64_asm_timed() / (double)TZCNT_REPEATS << endl; 109 | 110 | cout << "_mm256_tzcnt_epi8_asm :" << (double)_mm256_tzcnt_epi8_asm_timed() / (double)TZCNT_REPEATS << endl; 111 | cout << "_mm256_tzcnt_epi16_asm :" << (double)_mm256_tzcnt_epi16_asm_timed() / (double)TZCNT_REPEATS << endl; 112 | cout << "_mm256_tzcnt_epi32_asm :" << (double)_mm256_tzcnt_epi32_asm_timed() / (double)TZCNT_REPEATS << endl; 113 | cout << "_mm256_tzcnt_epi64_asm :" << (double)_mm256_tzcnt_epi64_asm_timed() / (double)TZCNT_REPEATS << endl; 114 | 115 | cout << "_mm512_tzcnt_epi8_asm :" << (double)_mm512_tzcnt_epi8_asm_timed() / (double)TZCNT_REPEATS << 
endl; 116 | cout << "_mm512_tzcnt_epi16_asm :" << (double)_mm512_tzcnt_epi16_asm_timed() / (double)TZCNT_REPEATS << endl; 117 | cout << "_mm512_tzcnt_epi32_asm :" << (double)_mm512_tzcnt_epi32_asm_timed() / (double)TZCNT_REPEATS << endl; 118 | cout << "_mm512_tzcnt_epi64_asm :" << (double)_mm512_tzcnt_epi64_asm_timed() / (double)TZCNT_REPEATS << endl; 119 | 120 | cout << "_mm_tzcnt_epi32_cd_asm :" << (double)_mm_tzcnt_epi32_cd_asm_timed() / (double)TZCNT_REPEATS << endl; 121 | cout << "_mm256_tzcnt_epi32_cd_asm :" << (double)_mm256_tzcnt_epi32_cd_asm_timed() / (double)TZCNT_REPEATS << endl; 122 | cout << "_mm512_tzcnt_epi32_cd_asm :" << (double)_mm512_tzcnt_epi32_cd_asm_timed() / (double)TZCNT_REPEATS << endl; 123 | cout << "_mm_tzcnt_epi64_cd_asm :" << (double)_mm_tzcnt_epi64_cd_asm_timed() / (double)TZCNT_REPEATS << endl; 124 | cout << "_mm256_tzcnt_epi64_cd_asm :" << (double)_mm256_tzcnt_epi64_cd_asm_timed() / (double)TZCNT_REPEATS << endl; 125 | cout << "_mm512_tzcnt_epi64_cd_asm :" << (double)_mm512_tzcnt_epi64_cd_asm_timed() / (double)TZCNT_REPEATS << endl; 126 | } 127 | -------------------------------------------------------------------------------- /Zen5_Demo_Imm8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define ZEN5_FUNCDEF_I8(INST, OPERANDS, I8) \ 4 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_##I8##_lat(void); \ 5 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_##I8##_tp(void); 6 | 7 | #define ZEN5_FUNCDECL_I8(NAME, INST, OPERANDS, I8) \ 8 | {#NAME, {\ 9 | Zen5_##INST##_##OPERANDS##_##I8##_lat, \ 10 | Zen5_##INST##_##OPERANDS##_##I8##_tp, \ 11 | }}, 12 | 13 | ZEN5_FUNCDEF_I8(vextracti128, ymmI82xmm, 000h) 14 | ZEN5_FUNCDEF_I8(vextractf128, ymmI82xmm, 000h) 15 | ZEN5_FUNCDEF_I8(vextracti128, ymmI82xmm, 001h) 16 | ZEN5_FUNCDEF_I8(vextractf128, ymmI82xmm, 001h) 17 | 18 | ZEN5_FUNCDEF_I8(vextracti32x4, ymmi82xmm, 000h) 19 | 
ZEN5_FUNCDEF_I8(vextractf32x4, ymmi82xmm, 000h) 20 | ZEN5_FUNCDEF_I8(vextracti32x4, ymmi82xmm, 001h) 21 | ZEN5_FUNCDEF_I8(vextractf32x4, ymmi82xmm, 001h) 22 | 23 | ZEN5_FUNCDEF_I8(vextracti64x2, ymmi82xmm, 000h) 24 | ZEN5_FUNCDEF_I8(vextractf64x2, ymmi82xmm, 000h) 25 | ZEN5_FUNCDEF_I8(vextracti64x2, ymmi82xmm, 001h) 26 | ZEN5_FUNCDEF_I8(vextractf64x2, ymmi82xmm, 001h) 27 | 28 | ZEN5_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 000h) 29 | ZEN5_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 000h) 30 | ZEN5_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 001h) 31 | ZEN5_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 001h) 32 | ZEN5_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 002h) 33 | ZEN5_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 002h) 34 | ZEN5_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 003h) 35 | ZEN5_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 003h) 36 | 37 | ZEN5_FUNCDEF_I8(vextracti32x8, zmmi82ymm, 000h) 38 | ZEN5_FUNCDEF_I8(vextractf32x8, zmmi82ymm, 000h) 39 | ZEN5_FUNCDEF_I8(vextracti32x8, zmmi82ymm, 001h) 40 | ZEN5_FUNCDEF_I8(vextractf32x8, zmmi82ymm, 001h) 41 | 42 | ZEN5_FUNCDEF_I8(vinserti128, xmmymmI82ymm, 000h) 43 | ZEN5_FUNCDEF_I8(vinsertf128, xmmymmI82ymm, 000h) 44 | ZEN5_FUNCDEF_I8(vinserti128, xmmymmI82ymm, 001h) 45 | ZEN5_FUNCDEF_I8(vinsertf128, xmmymmI82ymm, 001h) 46 | 47 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmymmI82ymm, 000h) 48 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmymmI82ymm, 000h) 49 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmymmI82ymm, 001h) 50 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmymmI82ymm, 001h) 51 | 52 | ZEN5_FUNCDEF_I8(vinserti64x2, xmmymmI82ymm, 000h) 53 | ZEN5_FUNCDEF_I8(vinsertf64x2, xmmymmI82ymm, 000h) 54 | ZEN5_FUNCDEF_I8(vinserti64x2, xmmymmI82ymm, 001h) 55 | ZEN5_FUNCDEF_I8(vinsertf64x2, xmmymmI82ymm, 001h) 56 | 57 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 000h) 58 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 000h) 59 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 001h) 60 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 001h) 61 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 002h) 62 | 
ZEN5_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 002h) 63 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 003h) 64 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 003h) 65 | 66 | ZEN5_FUNCDEF_I8(vinserti32x8, ymmzmmI82zmm, 000h) 67 | ZEN5_FUNCDEF_I8(vinsertf32x8, ymmzmmI82zmm, 000h) 68 | ZEN5_FUNCDEF_I8(vinserti32x8, ymmzmmI82zmm, 001h) 69 | ZEN5_FUNCDEF_I8(vinsertf32x8, ymmzmmI82zmm, 001h) 70 | 71 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 000h) 72 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 000h) 73 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 001h) 74 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 001h) 75 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 002h) 76 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 002h) 77 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 003h) 78 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 003h) 79 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 008h) 80 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 008h) 81 | 82 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 010h) 83 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 010h) 84 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 011h) 85 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 011h) 86 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 012h) 87 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 012h) 88 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 013h) 89 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 013h) 90 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 018h) 91 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 018h) 92 | 93 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 020h) 94 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 020h) 95 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 021h) 96 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 021h) 97 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 022h) 98 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 022h) 99 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 023h) 100 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 023h) 101 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 028h) 102 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 028h) 103 | 104 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 030h) 105 | 
ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 030h) 106 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 031h) 107 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 031h) 108 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 032h) 109 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 032h) 110 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 033h) 111 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 033h) 112 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 038h) 113 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 038h) 114 | 115 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 080h) 116 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 080h) 117 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 081h) 118 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 081h) 119 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 082h) 120 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 082h) 121 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 083h) 122 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 083h) 123 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 088h) 124 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 088h) 125 | 126 | ZEN5_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 000h) 127 | ZEN5_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 000h) 128 | ZEN5_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 001h) 129 | ZEN5_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 001h) 130 | ZEN5_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 002h) 131 | ZEN5_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 002h) 132 | ZEN5_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 003h) 133 | ZEN5_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 003h) 134 | ZEN5_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 000h) 135 | ZEN5_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 000h) 136 | ZEN5_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 001h) 137 | ZEN5_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 001h) 138 | ZEN5_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 002h) 139 | ZEN5_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 002h) 140 | ZEN5_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 003h) 141 | ZEN5_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 003h) 142 | 143 | ZEN5_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 000h) 144 | ZEN5_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 000h) 145 | ZEN5_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 044h) 146 | 
ZEN5_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 044h) 147 | ZEN5_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 0e4h) 148 | ZEN5_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 0e4h) 149 | ZEN5_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 0a5h) 150 | ZEN5_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 0a5h) 151 | 152 | ZEN5_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 000h) 153 | ZEN5_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 000h) 154 | ZEN5_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 044h) 155 | ZEN5_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 044h) 156 | ZEN5_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 0e4h) 157 | ZEN5_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 0e4h) 158 | ZEN5_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 0a5h) 159 | ZEN5_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 0a5h) -------------------------------------------------------------------------------- /TZCNT_Demo_Asm.asm: -------------------------------------------------------------------------------- 1 | .data 2 | 3 | bytetest_00 db 001h, 002h, 004h, 008h, 010h, 020h, 040h, 080h 4 | bytetest_01 db 0c0h, 0e0h, 0f0h, 0f8h, 0fch, 0feh, 0ffh, 07fh 5 | bytetest_02 db 03fh, 01fh, 00fh, 007h, 003h, 001h, 000h, 000h 6 | bytetest_03 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h 7 | bytetest_04 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h 8 | bytetest_05 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h 9 | bytetest_06 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h 10 | bytetest_07 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h 11 | 12 | wordtest_00 dw 00001h, 00002h, 00004h, 00008h, 00010h, 00020h, 00040h, 00080h 13 | wordtest_01 dw 00100h, 00200h, 00400h, 00800h, 01000h, 02000h, 04000h, 08000h 14 | wordtest_02 dw 0ffffh, 0fffeh, 0fffch, 0fff8h, 0fff0h, 0ffe0h, 0ffc0h, 0ff80h 15 | wordtest_03 dw 0ff00h, 0fe00h, 0fc00h, 0f800h, 0f000h, 0e000h, 0c000h, 08000h 16 | 17 | dwordtest_00 dd 00000000h, 00000001h, 00000002h, 00000004h, 00000008h, 00000010h, 00000020h, 00000040h 18 | dwordtest_01 dd 00000080h, 00000100h, 00000200h, 00000400h, 10000000h, 20000000h, 40000000h, 80000000h 19 | 20 | qwordtest_00 dq 00000000h, 
00000001h, 00000002h, 00000004h, 1000000000000000h, 2000000000000000h, 4000000000000000h, 8000000000000000h 21 | 22 | repeats equ 1000000h 23 | 24 | .code 25 | 26 | INIT macro VECREGSIZE, ISA 27 | IFIDNI , 28 | vpcmpeqw xmm1, xmm1, xmm1 29 | ELSEIFIDNI , 30 | vpcmpeqb ymm1, ymm1, ymm1 31 | ELSEIFIDNI , 32 | vpternlogd zmm1, zmm1, zmm1, 0ffh 33 | ENDIF 34 | IFIDNI , 35 | mov eax, 040h 36 | IFIDNI , 37 | vpbroadcastq xmm3, rax 38 | ELSEIFIDNI , 39 | vpbroadcastq ymm3, rax 40 | ELSEIFIDNI , 41 | vpbroadcastq zmm3, rax 42 | ENDIF 43 | ENDIF 44 | endm 45 | 46 | CORE macro DATA, VECREGSIZE, ISA 47 | IFIDNI , 48 | IFIDNI , 49 | IFIDNI , 50 | vpaddb xmm2, xmm0, xmm1 51 | vpandnd xmm0, xmm0, xmm2 52 | vpopcntb xmm0, xmm0 53 | ELSEIFIDNI , 54 | vpaddw xmm2, xmm0, xmm1 55 | vpandnd xmm0, xmm0, xmm2 56 | vpopcntw xmm0, xmm0 57 | ELSEIFIDNI , 58 | vpaddd xmm2, xmm0, xmm1 59 | vpandnd xmm0, xmm0, xmm2 60 | vpopcntd xmm0, xmm0 61 | ELSEIFIDNI , 62 | vpaddq xmm2, xmm0, xmm1 63 | vpandnq xmm0, xmm0, xmm2 64 | vpopcntq xmm0, xmm0 65 | ENDIF 66 | ELSEIFIDNI , 67 | IFIDNI , 68 | vpaddb ymm2, ymm0, ymm1 69 | vpandnd ymm0, ymm0, ymm2 70 | vpopcntb ymm0, ymm0 71 | ELSEIFIDNI , 72 | vpaddw ymm2, ymm0, ymm1 73 | vpandnd ymm0, ymm0, ymm2 74 | vpopcntw ymm0, ymm0 75 | ELSEIFIDNI , 76 | vpaddd ymm2, ymm0, ymm1 77 | vpandnd ymm0, ymm0, ymm2 78 | vpopcntd ymm0, ymm0 79 | ELSEIFIDNI , 80 | vpaddq ymm2, ymm0, ymm1 81 | vpandnq ymm0, ymm0, ymm2 82 | vpopcntq ymm0, ymm0 83 | ENDIF 84 | ELSEIFIDNI , 85 | IFIDNI , 86 | vpaddb zmm2, zmm0, zmm1 87 | vpandnd zmm0, zmm0, zmm2 88 | vpopcntb zmm0, zmm0 89 | ELSEIFIDNI , 90 | vpaddw zmm2, zmm0, zmm1 91 | vpandnd zmm0, zmm0, zmm2 92 | vpopcntw zmm0, zmm0 93 | ELSEIFIDNI , 94 | vpaddd zmm2, zmm0, zmm1 95 | vpandnd zmm0, zmm0, zmm2 96 | vpopcntd zmm0, zmm0 97 | ELSEIFIDNI , 98 | vpaddq zmm2, zmm0, zmm1 99 | vpandnq zmm0, zmm0, zmm2 100 | vpopcntq zmm0, zmm0 101 | ENDIF 102 | ENDIF 103 | ELSEIFIDNI , 104 | IFIDNI , 105 | IFIDNI , 106 | vpaddd xmm2, xmm0, xmm1 
107 | vpandnd xmm0, xmm0, xmm2 108 | vplzcntd xmm0, xmm0 109 | vpsubd xmm0, xmm3, xmm0 110 | ELSEIFIDNI , 111 | vpaddq xmm2, xmm0, xmm1 112 | vpandnq xmm0, xmm0, xmm2 113 | vplzcntq xmm0, xmm0 114 | vpsubq xmm0, xmm3, xmm0 115 | ENDIF 116 | ELSEIFIDNI , 117 | IFIDNI , 118 | vpaddd ymm2, ymm0, ymm1 119 | vpandnd ymm0, ymm0, ymm2 120 | vplzcntd ymm0, ymm0 121 | vpsubd ymm0, ymm3, ymm0 122 | ELSEIFIDNI , 123 | vpaddq ymm2, ymm0, ymm1 124 | vpandnq ymm0, ymm0, ymm2 125 | vplzcntq ymm0, ymm0 126 | vpsubq ymm0, ymm3, ymm0 127 | ENDIF 128 | ELSEIFIDNI , 129 | IFIDNI , 130 | vpaddd zmm2, zmm0, zmm1 131 | vpandnd zmm0, zmm0, zmm2 132 | vplzcntd zmm0, zmm0 133 | vpsubd zmm0, zmm3, zmm0 134 | ELSEIFIDNI , 135 | vpaddq zmm2, zmm0, zmm1 136 | vpandnq zmm0, zmm0, zmm2 137 | vplzcntq zmm0, zmm0 138 | vpsubq zmm0, zmm3, zmm0 139 | ENDIF 140 | ENDIF 141 | ENDIF 142 | endm 143 | 144 | TIMED macro PNAME, DATA, VECREGSIZE, ISA 145 | PNAME proc 146 | push rbx 147 | push rdi 148 | push rsi 149 | 150 | INIT VECREGSIZE, ISA 151 | 152 | mfence 153 | rdtscp 154 | lfence 155 | 156 | mov esi, eax 157 | mov edi, edx 158 | 159 | mov ecx, repeats 160 | 161 | align 16 162 | startlabel: 163 | CORE DATA, VECREGSIZE, ISA 164 | 165 | dec ecx 166 | jnz startlabel 167 | 168 | mfence 169 | rdtscp 170 | lfence 171 | 172 | shl rdx, 20h 173 | shl rdi, 20h 174 | or rax, rdx 175 | or rsi, rdi 176 | 177 | sub rax, rsi 178 | 179 | 180 | pop rsi 181 | pop rdi 182 | pop rbx 183 | ret 184 | PNAME endp 185 | endm 186 | 187 | NAKED macro PNAME, DATA, VECREGSIZE, ISA 188 | PNAME proc 189 | 190 | INIT VECREGSIZE, ISA 191 | 192 | CORE DATA, VECREGSIZE, ISA 193 | 194 | ret 195 | PNAME endp 196 | endm 197 | 198 | TIMED _mm_tzcnt_epi8_asm_timed, EPI8, XMM, BITALG 199 | TIMED _mm_tzcnt_epi16_asm_timed, EPI16, XMM, BITALG 200 | TIMED _mm_tzcnt_epi32_asm_timed, EPI32, XMM, BITALG 201 | TIMED _mm_tzcnt_epi64_asm_timed, EPI64, XMM, BITALG 202 | 203 | NAKED _mm_tzcnt_epi8_asm@@16, EPI8, XMM, BITALG 204 | NAKED 
_mm_tzcnt_epi16_asm@@16, EPI16, XMM, BITALG 205 | NAKED _mm_tzcnt_epi32_asm@@16, EPI32, XMM, BITALG 206 | NAKED _mm_tzcnt_epi64_asm@@16, EPI64, XMM, BITALG 207 | 208 | TIMED _mm256_tzcnt_epi8_asm_timed, EPI8, YMM, BITALG 209 | TIMED _mm256_tzcnt_epi16_asm_timed, EPI16, YMM, BITALG 210 | TIMED _mm256_tzcnt_epi32_asm_timed, EPI32, YMM, BITALG 211 | TIMED _mm256_tzcnt_epi64_asm_timed, EPI64, YMM, BITALG 212 | 213 | NAKED _mm256_tzcnt_epi8_asm@@16, EPI8, YMM, BITALG 214 | NAKED _mm256_tzcnt_epi16_asm@@16, EPI16, YMM, BITALG 215 | NAKED _mm256_tzcnt_epi32_asm@@16, EPI32, YMM, BITALG 216 | NAKED _mm256_tzcnt_epi64_asm@@16, EPI64, YMM, BITALG 217 | 218 | TIMED _mm512_tzcnt_epi8_asm_timed, EPI8, ZMM, BITALG 219 | TIMED _mm512_tzcnt_epi16_asm_timed, EPI16, ZMM, BITALG 220 | TIMED _mm512_tzcnt_epi32_asm_timed, EPI32, ZMM, BITALG 221 | TIMED _mm512_tzcnt_epi64_asm_timed, EPI64, ZMM, BITALG 222 | 223 | NAKED _mm512_tzcnt_epi8_asm@@16, EPI8, ZMM, BITALG 224 | NAKED _mm512_tzcnt_epi16_asm@@16, EPI16, ZMM, BITALG 225 | NAKED _mm512_tzcnt_epi32_asm@@16, EPI32, ZMM, BITALG 226 | NAKED _mm512_tzcnt_epi64_asm@@16, EPI64, ZMM, BITALG 227 | 228 | TIMED _mm_tzcnt_epi32_cd_asm_timed, EPI32, XMM, CD 229 | TIMED _mm_tzcnt_epi64_cd_asm_timed, EPI64, XMM, CD 230 | TIMED _mm256_tzcnt_epi32_cd_asm_timed, EPI32, YMM, CD 231 | TIMED _mm256_tzcnt_epi64_cd_asm_timed, EPI64, YMM, CD 232 | TIMED _mm512_tzcnt_epi32_cd_asm_timed, EPI32, ZMM, CD 233 | TIMED _mm512_tzcnt_epi64_cd_asm_timed, EPI64, ZMM, CD 234 | 235 | NAKED _mm_tzcnt_epi32_cd_asm@@16, EPI32, XMM, CD 236 | NAKED _mm_tzcnt_epi64_cd_asm@@16, EPI64, XMM, CD 237 | NAKED _mm256_tzcnt_epi32_cd_asm@@16, EPI32, YMM, CD 238 | NAKED _mm256_tzcnt_epi64_cd_asm@@16, EPI64, YMM, CD 239 | NAKED _mm512_tzcnt_epi32_cd_asm@@16, EPI32, ZMM, CD 240 | NAKED _mm512_tzcnt_epi64_cd_asm@@16, EPI64, ZMM, CD 241 | 242 | end -------------------------------------------------------------------------------- /LZCNT_Demo.cpp: 
-------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "LZCNT_Demo.h" 3 | #include "GFNI_Demo.h" 4 | #include "VBMI2_Demo.h" 5 | /* 6 | VPLZCNTB/W proposal: 7 | - Byte/Word support 8 | - BITALG lzcnt(a) = popcnt(tzmsk(bit_reverse(a))) 9 | - FP16 lzcnt(a) = min(16, 30 - (fp16)a.exp) 10 | - zero case handled 11 | */ 12 | 13 | extern CPU_Props cpu_props; 14 | 15 | using namespace std; 16 | 17 | __m128i __vectorcall _mm_lzcnt_ild_epi8(__m128i a) { 18 | __m128i u = _mm_undefined_si128(); 19 | __m128i r = _mm_revbit_epi8(a); 20 | return _mm_popcnt_epi8(_mm_andnot_si128(r, _mm_add_epi8(r, _mm_cmpeq_epi8(u, u)))); 21 | } 22 | 23 | __m256i __vectorcall _mm256_lzcnt_ild_epi8(__m256i a) { 24 | __m256i u = _mm256_undefined_si256(); 25 | __m256i r = _mm256_revbit_epi8(a); 26 | return _mm256_popcnt_epi8(_mm256_andnot_si256(r, _mm256_add_epi8(r, _mm256_cmpeq_epi8(u, u)))); 27 | } 28 | 29 | __m512i __vectorcall _mm512_lzcnt_ild_epi8(__m512i a) { 30 | __m512i u = _mm512_undefined_epi32(); 31 | __m512i r = _mm512_revbit_epi8(a); 32 | return _mm512_popcnt_epi8(_mm512_andnot_si512(r, _mm512_add_epi8(r, _mm512_ternarylogic_epi32(u, u, u, 0xff)))); 33 | } 34 | 35 | __m128i __vectorcall _mm_lzcnt_ild_epi16(__m128i a) { 36 | __m128i u = _mm_undefined_si128(); 37 | __m128i r = _mm_revbit_epi8(_mm_swaplh_epi8(a)); 38 | return _mm_popcnt_epi16(_mm_andnot_si128(r, _mm_add_epi16(r, _mm_cmpeq_epi16(u, u)))); 39 | } 40 | 41 | __m256i __vectorcall _mm256_lzcnt_ild_epi16(__m256i a) { 42 | __m256i u = _mm256_undefined_si256(); 43 | __m256i r = _mm256_revbit_epi8(_mm256_swaplh_epi8(a)); 44 | return _mm256_popcnt_epi16(_mm256_andnot_si256(r,_mm256_add_epi16(r, _mm256_cmpeq_epi16(u, u)))); 45 | } 46 | 47 | __m512i __vectorcall _mm512_lzcnt_ild_epi16(__m512i a) { 48 | __m512i u = _mm512_undefined_epi32(); 49 | __m512i r = _mm512_revbit_epi8(_mm512_swaplh_epi8(a)); 50 | return _mm512_popcnt_epi16(_mm512_andnot_si512(r, _mm512_add_epi16(r, 
_mm512_ternarylogic_epi32(u, u, u, 0xff)))); 51 | } 52 | 53 | __m128i __vectorcall _mm_lzcnt_fp16_epi16(__m128i a) { 54 | return _mm_min_epi16(_mm_sub_epi16(_mm_set1_epi16(0x1e), _mm_srli_epi16(_mm_cvtepu16_ph(a), 10)), _mm_set1_epi16(0x10)); 55 | } 56 | 57 | __m256i __vectorcall _mm256_lzcnt_fp16_epi16(__m256i a) { 58 | return _mm256_min_epi16(_mm256_sub_epi16(_mm256_set1_epi16(0x1e), _mm256_srli_epi16(_mm256_cvtepu16_ph(a), 10)), _mm256_set1_epi16(0x10)); 59 | } 60 | 61 | __m512i __vectorcall _mm512_lzcnt_fp16_epi16(__m512i a) { 62 | return _mm512_min_epi16(_mm512_sub_epi16(_mm512_set1_epi16(0x1e), _mm512_srli_epi16(_mm512_cvtepu16_ph(a), 10)), _mm512_set1_epi16(0x10)); 63 | } 64 | 65 | void LZCNT_Test(void) { 66 | cout << "--- AVX512_BITALG & AVX512_FP16 SIMD LZCNTB/W ---" << dec << right << endl; 67 | __m128i x128 = _mm_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768); 68 | __m256i x256 = _mm256_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768, 0x07f00, 0x07e00, 0x07c00, 0x07800, 0x07000, 0x06000, 0x04000, 0x00000); 69 | __m512i x512 = _mm512_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768, 0x07f00, 0x07e00, 0x07c00, 0x07800, 0x07000, 0x06000, 0x00000, 0x00000, (short)(1 << 15), 1 << 14, 1 << 13, 1 << 12, 1 << 11, 1 << 10, 1 << 9, 1 << 8, 1 << 7, 1 << 6, 1 << 5, 1 << 4, 1 << 3, 1 << 2, 1 << 1, 1 << 0); 70 | 71 | printRes("x128 :", x128); 72 | #if (_MSC_VER >= 1944) 73 | printRes("_mm_lzcnt_epi8 :", _mm_lzcnt_epi8(x128)); 74 | #endif 75 | printRes("_mm_lzcnt_ild_epi8 :", _mm_lzcnt_ild_epi8(x128)); 76 | printRes("_mm_lzcnt_epi8_asm :", _mm_lzcnt_epi8_asm(x128)); 77 | printRes("_mm_lzcnt_gfni_epi8 :", _mm_lzcnt_gfni_epi8(x128)); 78 | printRes("_mm_lzcnt_gfni_epi8_asm :", _mm_lzcnt_gfni_epi8_asm(x128)); 79 | #if (_MSC_VER >= 1944) 80 | printRes("_mm_lzcnt_epi16 :", _mm_lzcnt_epi16(x128)); 81 | #endif 82 | printRes("_mm_lzcnt_ild_epi16 :", _mm_lzcnt_ild_epi16(x128)); 83 | 
printRes("_mm_lzcnt_epi16_asm :", _mm_lzcnt_epi16_asm(x128)); 84 | if (cpu_props.IsFeat(FEAT_AVX512_FP16)) { 85 | printRes("_mm_lzcnt_fp16_epi16 :", _mm_lzcnt_fp16_epi16(x128)); 86 | printRes("_mm_lzcnt_fp16_epi16_asm :", _mm_lzcnt_fp16_epi16_asm(x128)); 87 | } 88 | 89 | printRes("x256 :", x256); 90 | #if (_MSC_VER >= 1944) 91 | printRes("_mm256_lzcnt_epi8 :", _mm256_lzcnt_epi8(x256)); 92 | #endif 93 | printRes("_mm256_lzcnt_ild_epi8 :", _mm256_lzcnt_ild_epi8(x256)); 94 | printRes("_mm256_lzcnt_epi8_asm :", _mm256_lzcnt_epi8_asm(x256)); 95 | printRes("_mm256_lzcnt_gfni_epi8 :", _mm256_lzcnt_gfni_epi8(x256)); 96 | printRes("_mm256_lzcnt_gfni_epi8_asm :", _mm256_lzcnt_gfni_epi8_asm(x256)); 97 | #if (_MSC_VER >= 1944) 98 | printRes("_mm256_lzcnt_epi16 :", _mm256_lzcnt_epi16(x256)); 99 | #endif 100 | printRes("_mm256_lzcnt_ild_epi16 :", _mm256_lzcnt_ild_epi16(x256)); 101 | printRes("_mm256_lzcnt_epi16_asm :", _mm256_lzcnt_epi16_asm(x256)); 102 | if (cpu_props.IsFeat(FEAT_AVX512_FP16)) { 103 | printRes("_mm256_lzcnt_fp16_epi16 :", _mm256_lzcnt_fp16_epi16(x256)); 104 | printRes("_mm256_lzcnt_fp16_epi16_asm :", _mm256_lzcnt_fp16_epi16_asm(x256)); 105 | } 106 | 107 | printRes("x512 :", x512); 108 | #if (_MSC_VER >= 1944) 109 | printRes("_mm512_lzcnt_epi8 :", _mm512_lzcnt_epi8(x512)); 110 | #endif 111 | printRes("_mm512_lzcnt_ild_epi8 :", _mm512_lzcnt_ild_epi8(x512)); 112 | printRes("_mm512_lzcnt_epi8_asm :", _mm512_lzcnt_epi8_asm(x512)); 113 | printRes("_mm512_lzcnt_gfni_epi8 :", _mm512_lzcnt_gfni_epi8(x512)); 114 | printRes("_mm512_lzcnt_gfni_epi8_asm :", _mm512_lzcnt_gfni_epi8_asm(x512)); 115 | #if (_MSC_VER >= 1944) 116 | printRes("_mm512_lzcnt_epi16 :", _mm512_lzcnt_epi16(x512)); 117 | #endif 118 | printRes("_mm512_lzcnt_ild_epi16 :", _mm512_lzcnt_ild_epi16(x512)); 119 | printRes("_mm512_lzcnt_epi16_asm :", _mm512_lzcnt_epi16_asm(x512)); 120 | if (cpu_props.IsFeat(FEAT_AVX512_FP16)) { 121 | printRes("_mm512_lzcnt_fp16_epi16 :", _mm512_lzcnt_fp16_epi16(x512)); 122 | 
printRes("_mm512_lzcnt_fp16_epi16_asm :", _mm512_lzcnt_fp16_epi16_asm(x512)); 123 | } 124 | 125 | cout << "TSC CLKs:----------------------" << endl; 126 | 127 | cout << "_mm_lzcnt_epi8_asm :" << (double)_mm_lzcnt_epi8_asm_timed() / (double)LZCNT_REPEATS << endl; 128 | cout << "_mm_lzcnt_gfni_epi8_asm :" << (double)_mm_lzcnt_gfni_epi8_asm_timed() / (double)LZCNT_REPEATS << endl; 129 | cout << "_mm256_lzcnt_epi8_asm :" << (double)_mm256_lzcnt_epi8_asm_timed() / (double)LZCNT_REPEATS << endl; 130 | cout << "_mm256_lzcnt_gfni_epi8_asm :" << (double)_mm256_lzcnt_gfni_epi8_asm_timed() / (double)LZCNT_REPEATS << endl; 131 | cout << "_mm512_lzcnt_epi8_asm :" << (double)_mm512_lzcnt_epi8_asm_timed() / (double)LZCNT_REPEATS << endl; 132 | cout << "_mm512_lzcnt_gfni_epi8_asm :" << (double)_mm512_lzcnt_gfni_epi8_asm_timed() / (double)LZCNT_REPEATS << endl; 133 | 134 | cout << "_mm_lzcnt_epi16_asm :" << (double)_mm_lzcnt_epi16_asm_timed() / (double)LZCNT_REPEATS << endl; 135 | if (cpu_props.IsFeat(FEAT_AVX512_FP16)) 136 | cout << "_mm_lzcnt_fp16_epi16_asm :" << (double)_mm_lzcnt_fp16_epi16_asm_timed() / (double)LZCNT_REPEATS << endl; 137 | 138 | cout << "_mm256_lzcnt_epi16_asm :" << (double)_mm256_lzcnt_epi16_asm_timed() / (double)LZCNT_REPEATS << endl; 139 | if (cpu_props.IsFeat(FEAT_AVX512_FP16)) 140 | cout << "_mm256_lzcnt_fp16_epi16_asm :" << (double)_mm256_lzcnt_fp16_epi16_asm_timed() / (double)LZCNT_REPEATS << endl; 141 | 142 | cout << "_mm512_lzcnt_epi16_asm :" << (double)_mm512_lzcnt_epi16_asm_timed() / (double)LZCNT_REPEATS << endl; 143 | if (cpu_props.IsFeat(FEAT_AVX512_FP16)) 144 | cout << "_mm512_lzcnt_fp16_epi16_asm :" << (double)_mm512_lzcnt_fp16_epi16_asm_timed() / (double)LZCNT_REPEATS << endl; 145 | } 146 | -------------------------------------------------------------------------------- /LZCNT_Demo_Asm.asm: -------------------------------------------------------------------------------- 1 | .data 2 | 3 | gfni_revbit dq 08040201008040201h 4 | gfni_tzcnt dq 
0aaccf0ff00000000h 5 | 6 | fp16_expbias dd 001e001eh 7 | max_lzcntw dd 00100010h 8 | 9 | repeats equ 1000000h 10 | 11 | .code 12 | 13 | INIT8 macro VECREGSIZE, ISA 14 | IFIDNI , 15 | IFIDNI , 16 | vpcmpeqb xmm1, xmm1, xmm1 17 | vmovddup xmm2, qword ptr [gfni_revbit] 18 | ELSEIFIDNI , 19 | vpcmpeqb ymm1, ymm1, ymm1 20 | vpbroadcastq ymm2, qword ptr [gfni_revbit] 21 | ELSEIFIDNI , 22 | vpternlogd zmm1, zmm1, zmm1, 0ffh 23 | vpbroadcastq zmm2, qword ptr [gfni_revbit] 24 | ENDIF 25 | ELSEIFIDNI , 26 | IFIDNI , 27 | vpcmpeqb xmm1, xmm1, xmm1 28 | vmovddup xmm2, qword ptr [gfni_revbit] 29 | vmovddup xmm3, qword ptr [gfni_tzcnt] 30 | ELSEIFIDNI , 31 | vpcmpeqb ymm1, ymm1, ymm1 32 | vpbroadcastq ymm2, qword ptr [gfni_revbit] 33 | vpbroadcastq ymm3, qword ptr [gfni_tzcnt] 34 | ELSEIFIDNI , 35 | vpternlogd zmm1, zmm1, zmm1, 0ffh 36 | vpbroadcastq zmm2, qword ptr [gfni_revbit] 37 | vpbroadcastq zmm3, qword ptr [gfni_tzcnt] 38 | ENDIF 39 | ENDIF 40 | endm 41 | 42 | CORE8 macro VECREGSIZE, ISA 43 | IFIDNI , 44 | IFIDNI , 45 | vgf2p8affineqb xmm0, xmm0, xmm2, 0 46 | vpaddb xmm3, xmm0, xmm1 47 | vpandn xmm0, xmm0, xmm3 48 | vpopcntb xmm0, xmm0 49 | ELSEIFIDNI , 50 | vgf2p8affineqb ymm0, ymm0, ymm2, 0 51 | vpaddb ymm3, ymm0, ymm1 52 | vpandn ymm0, ymm0, ymm3 53 | vpopcntb ymm0, ymm0 54 | ELSEIFIDNI , 55 | vgf2p8affineqb zmm0, zmm0, zmm2, 0 56 | vpaddb zmm3, zmm0, zmm1 57 | vpandnq zmm0, zmm0, zmm3 58 | vpopcntb zmm0, zmm0 59 | ENDIF 60 | ELSEIFIDNI , 61 | IFIDNI , 62 | vgf2p8affineqb xmm0, xmm0, xmm2, 0 63 | vpaddb xmm4, xmm0, xmm1 64 | vpandn xmm0, xmm4, xmm0 65 | vgf2p8affineqb xmm0, xmm0, xmm3, 08h 66 | ELSEIFIDNI , 67 | vgf2p8affineqb ymm0, ymm0, ymm2, 0 68 | vpaddb ymm4, ymm0, ymm1 69 | vpandn ymm0, ymm4, ymm0 70 | vgf2p8affineqb ymm0, ymm0, ymm3, 08h 71 | ELSEIFIDNI , 72 | vgf2p8affineqb zmm0, zmm0, zmm2, 0 73 | vpaddb zmm4, zmm0, zmm1 74 | vpandnq zmm0, zmm4, zmm0 75 | vgf2p8affineqb zmm0, zmm0, zmm3, 08h 76 | ENDIF 77 | ENDIF 78 | endm 79 | 80 | INIT16 macro VECREGSIZE, 
ISA
	; NOTE(review): the header of this macro precedes this chunk. In this listing every
	; IFIDNI/ELSEIFIDNI had lost its <...> operands; they are restored below from the
	; register width and the instructions used in each branch, and from the way TIMED
	; and NAKED invoke the macros -- confirm against the original file.
	;
	; Loads the loop-invariant constants used by CORE16:
	;   BITALG: register 1 = all ones, register 2 = gfni_revbit broadcast to every qword
	;   FP16  : register 1 = 001eh in every word, register 2 = 0010h in every word
	IFIDNI <ISA>, <BITALG>
		IFIDNI <VECREGSIZE>, <XMM>
			vpcmpeqw xmm1, xmm1, xmm1
			vmovddup xmm2, qword ptr [gfni_revbit]
		ELSEIFIDNI <VECREGSIZE>, <YMM>
			vpcmpeqb ymm1, ymm1, ymm1
			vpbroadcastq ymm2, qword ptr [gfni_revbit]
		ELSEIFIDNI <VECREGSIZE>, <ZMM>
			vpternlogd zmm1, zmm1, zmm1, 0ffh
			vpbroadcastq zmm2, qword ptr [gfni_revbit]
		ENDIF
	ELSEIFIDNI <ISA>, <FP16>
		; The constants are generated in registers instead of being loaded:
		; an affine transform with an all-zero matrix yields the immediate in every byte
		; (0f0f0h / 08080h per word), and the word shift by 0bh leaves 001eh / 0010h.
		IFIDNI <VECREGSIZE>, <XMM>
			;vpbroadcastd ymm1, dword ptr [fp16_expbias]
			;vpbroadcastd ymm2, dword ptr [max_lzcntw]
			vpxor xmm2, xmm2, xmm2
			vgf2p8affineqb xmm1, xmm2, xmm2, 0f0h	;001eh const gen
			vgf2p8affineqb xmm2, xmm2, xmm2, 080h	;0010h const gen
			vpsrlw xmm1, xmm1, 0bh					;001eh
			vpsrlw xmm2, xmm2, 0bh					;0010h
		ELSEIFIDNI <VECREGSIZE>, <YMM>
			;vpbroadcastd ymm1, dword ptr [fp16_expbias]
			;vpbroadcastd ymm2, dword ptr [max_lzcntw]
			vpxor xmm2, xmm2, xmm2					;VEX xmm write also clears the upper lanes
			vgf2p8affineqb ymm1, ymm2, ymm2, 0f0h	;001eh const gen
			vgf2p8affineqb ymm2, ymm2, ymm2, 080h	;0010h const gen
			vpsrlw ymm1, ymm1, 0bh					;001eh
			vpsrlw ymm2, ymm2, 0bh					;0010h
		ELSEIFIDNI <VECREGSIZE>, <ZMM>
			;vpbroadcastd zmm1, dword ptr [fp16_expbias]
			;vpbroadcastd zmm2, dword ptr [max_lzcntw]
			vpxor xmm2, xmm2, xmm2					;VEX xmm write also clears the upper lanes
			vgf2p8affineqb zmm1, zmm2, zmm2, 0f0h	;001eh const gen
			vgf2p8affineqb zmm2, zmm2, zmm2, 080h	;0010h const gen
			vpsrlw zmm1, zmm1, 0bh					;001eh
			vpsrlw zmm2, zmm2, 0bh					;0010h
		ENDIF
	ENDIF
endm

; CORE16 VECREGSIZE, ISA
; Per-word leading-zero count, computed in place on xmm0/ymm0/zmm0.
; Expects registers 1 and 2 to hold the constants set up by the matching init macro.
;   BITALG: vpshldw by 8 swaps the two bytes of every word, vgf2p8affineqb applies the
;           matrix held in register 2 (loaded from gfni_revbit; presumably a per-byte
;           bit reversal -- confirm), then x + (-1) and vpandn form ~x & (x - 1),
;           whose set bits are counted per word by vpopcntw.
;   FP16  : converts every unsigned word to FP16, drops the 10 mantissa bits, subtracts
;           the remaining field from register 1 (001eh) and clamps the result with an
;           unsigned minimum against register 2 (0010h).
CORE16 macro VECREGSIZE, ISA
	IFIDNI <ISA>, <BITALG>
		IFIDNI <VECREGSIZE>, <XMM>
			vpshldw xmm0, xmm0, xmm0, 8
			vgf2p8affineqb xmm0, xmm0, xmm2, 0
			vpaddw xmm3, xmm0, xmm1
			vpandn xmm0, xmm0, xmm3
			vpopcntw xmm0, xmm0
		ELSEIFIDNI <VECREGSIZE>, <YMM>
			vpshldw ymm0, ymm0, ymm0, 8
			vgf2p8affineqb ymm0, ymm0, ymm2, 0
			vpaddw ymm3, ymm0, ymm1
			vpandn ymm0, ymm0, ymm3
			vpopcntw ymm0, ymm0
		ELSEIFIDNI <VECREGSIZE>, <ZMM>
			vpshldw zmm0, zmm0, zmm0, 8
			vgf2p8affineqb zmm0, zmm0, zmm2, 0
			vpaddw zmm3, zmm0, zmm1
			vpandnd zmm0, zmm0, zmm3
			vpopcntw zmm0, zmm0
		ENDIF
	ELSEIFIDNI <ISA>, <FP16>
		IFIDNI <VECREGSIZE>, <XMM>
			vcvtuw2ph xmm0, xmm0
			vpsrlw xmm0, xmm0, 0ah
			vpsubw xmm0, xmm1, xmm0
			vpminuw xmm0, xmm0, xmm2
		ELSEIFIDNI <VECREGSIZE>, <YMM>
			vcvtuw2ph ymm0, ymm0
			vpsrlw ymm0, ymm0, 0ah
			vpsubw ymm0, ymm1, ymm0
			vpminuw ymm0, ymm0, ymm2
		ELSEIFIDNI <VECREGSIZE>, <ZMM>
			vcvtuw2ph zmm0, zmm0
			vpsrlw zmm0, zmm0, 0ah
			vpsubw zmm0, zmm1, zmm0
			vpminuw zmm0, zmm0, zmm2
		ENDIF
	ENDIF
endm


; TIMED PNAME, DATA, VECREGSIZE, ISA
; Emits procedure PNAME that runs the init macro once, then executes the core macro
; `repeats` times between two serialized rdtscp readings.
;   DATA       : EPI8 selects INIT8/CORE8, EPI16 selects INIT16/CORE16
;   VECREGSIZE : forwarded to the init/core macros
;   ISA        : forwarded to the init/core macros
; Returns in rax the second TSC reading minus the first one.
TIMED macro PNAME, DATA, VECREGSIZE, ISA
PNAME proc
	push rbx
	push rdi
	push rsi

	IFIDNI <DATA>, <EPI8>
		INIT8 VECREGSIZE, ISA
	ELSEIFIDNI <DATA>, <EPI16>
		INIT16 VECREGSIZE, ISA
	ENDIF

	mfence
	rdtscp
	lfence

	; keep the start timestamp (edx:eax) in edi:esi across the loop
	mov esi, eax
	mov edi, edx

	; loaded after rdtscp, which itself writes ecx
	mov ecx, repeats

	align 16
startlabel:
	IFIDNI <DATA>, <EPI8>
		CORE8 VECREGSIZE, ISA
	ELSEIFIDNI <DATA>, <EPI16>
		CORE16 VECREGSIZE, ISA
	ENDIF

	dec ecx
	jnz startlabel

	mfence
	rdtscp
	lfence

	; combine both 32-bit halves into 64-bit values: rax = end, rsi = start
	shl rdx, 20h
	shl rdi, 20h
	or rax, rdx
	or rsi, rdi

	sub rax, rsi


	pop rsi
	pop rdi
	pop rbx
	ret
PNAME endp
endm

; NAKED PNAME, DATA, VECREGSIZE, ISA
; Emits procedure PNAME that runs the init macro and a single core sequence, then
; returns. Parameters have the same meaning as for TIMED.
NAKED macro PNAME, DATA, VECREGSIZE, ISA
PNAME proc

	IFIDNI <DATA>, <EPI8>
		INIT8 VECREGSIZE, ISA
	ELSEIFIDNI <DATA>, <EPI16>
		INIT16 VECREGSIZE, ISA
	ENDIF

	IFIDNI <DATA>, <EPI8>
		CORE8 VECREGSIZE, ISA
	ELSEIFIDNI <DATA>, <EPI16>
		CORE16 VECREGSIZE, ISA
	ENDIF

	ret
PNAME endp
endm

TIMED _mm_lzcnt_epi8_asm_timed, EPI8, XMM, BITALG
TIMED _mm256_lzcnt_epi8_asm_timed, EPI8, YMM, BITALG
TIMED _mm512_lzcnt_epi8_asm_timed, EPI8, ZMM, BITALG

NAKED _mm_lzcnt_epi8_asm@@16, EPI8, XMM, BITALG
NAKED _mm256_lzcnt_epi8_asm@@32, EPI8, YMM, BITALG
NAKED _mm512_lzcnt_epi8_asm@@64, EPI8, ZMM, BITALG

TIMED _mm_lzcnt_gfni_epi8_asm_timed, EPI8, XMM, GFNI
TIMED
_mm256_lzcnt_gfni_epi8_asm_timed, EPI8, YMM, GFNI 243 | TIMED _mm512_lzcnt_gfni_epi8_asm_timed, EPI8, ZMM, GFNI 244 | 245 | NAKED _mm_lzcnt_gfni_epi8_asm@@16, EPI8, XMM, GFNI 246 | NAKED _mm256_lzcnt_gfni_epi8_asm@@32, EPI8, YMM, GFNI 247 | NAKED _mm512_lzcnt_gfni_epi8_asm@@64, EPI8, ZMM, GFNI 248 | 249 | TIMED _mm_lzcnt_epi16_asm_timed, EPI16, XMM, BITALG 250 | TIMED _mm256_lzcnt_epi16_asm_timed, EPI16, YMM, BITALG 251 | TIMED _mm512_lzcnt_epi16_asm_timed, EPI16, ZMM, BITALG 252 | 253 | NAKED _mm_lzcnt_epi16_asm@@16, EPI16, ZMM, BITALG 254 | NAKED _mm256_lzcnt_epi16_asm@@32, EPI16, ZMM, BITALG 255 | NAKED _mm512_lzcnt_epi16_asm@@64, EPI16, ZMM, BITALG 256 | 257 | TIMED _mm_lzcnt_fp16_epi16_asm_timed, EPI16, XMM, FP16 258 | TIMED _mm256_lzcnt_fp16_epi16_asm_timed, EPI16, YMM, FP16 259 | TIMED _mm512_lzcnt_fp16_epi16_asm_timed, EPI16, ZMM, FP16 260 | 261 | NAKED _mm_lzcnt_fp16_epi16_asm@@16, EPI16, ZMM, FP16 262 | NAKED _mm256_lzcnt_fp16_epi16_asm@@32, EPI16, ZMM, FP16 263 | NAKED _mm512_lzcnt_fp16_epi16_asm@@64, EPI16, ZMM, FP16 264 | 265 | 266 | end -------------------------------------------------------------------------------- /HWBITPERM_Demo.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "HWBITPERM_Demo.h" 3 | 4 | extern CPU_Props cpu_props; 5 | 6 | using namespace std; 7 | 8 | bitperm_methods hw_reference[] = { 9 | {"PEXT32_HW zmm, zmm, zmm ", "SKX ", 32, BEXT32_HW_Lat, BEXT32_HW_Tp, BEXT32_HW, FEAT_AVX512BW, BEXT, 0}, 10 | {"PDEP32_HW zmm, zmm, zmm ", "SKX ", 32, BDEP32_HW_Lat, BDEP32_HW_Tp, BDEP32_HW, FEAT_AVX512BW, BDEP, 1}, 11 | {"BGRP32_HW zmm, zmm, zmm ", "SKX ", 32, BGRP32_HW_Lat, BGRP32_HW_Tp, BGRP32_HW, FEAT_AVX512BW, BGRP, 2}, 12 | {"PEXT64_HW zmm, zmm, zmm ", "SKX ", 64, BEXT64_HW_Lat, BEXT64_HW_Tp, BEXT64_HW, FEAT_AVX512BW, BEXT, 3}, 13 | {"PDEP64_HW zmm, zmm, zmm ", "SKX ", 64, BDEP64_HW_Lat, BDEP64_HW_Tp, BDEP64_HW, FEAT_AVX512BW, BDEP, 4}, 14 | {"BGRP64_HW zmm, zmm, zmm 
", "SKX ", 64, BGRP64_HW_Lat, BGRP64_HW_Tp, BGRP64_HW, FEAT_AVX512BW, BGRP, 5}, 15 | }; 16 | 17 | void HWBITPERM_Compare(__m512i p, __m512i mask, __m512i ref, int bitness, BITPERM type) { 18 | for (int b = 0; b < (sizeof(hw_reference) / sizeof(bitperm_methods)); b++) { 19 | if (cpu_props.IsFeat(hw_reference[b].feats) && (bitness == hw_reference[b].bitness) && (type == hw_reference[b].type)) { 20 | __m512i res = (hw_reference[b].func)(p, mask); 21 | __mmask64 test = _mm512_cmpeq_epi8_mask(res, ref); 22 | if (test != ~0ULL) 23 | if (bitness == 32) { 24 | printRes32(hw_reference[b].name, p); 25 | printRes32(hw_reference[b].name, mask); 26 | printRes32(hw_reference[b].name, ref); 27 | printRes32(hw_reference[b].name, res); 28 | } else { 29 | printRes(hw_reference[b].name, p); 30 | printRes(hw_reference[b].name, mask); 31 | printRes(hw_reference[b].name, ref); 32 | printRes(hw_reference[b].name, res); 33 | } 34 | assert(test == ~0ULL); 35 | } 36 | } 37 | } 38 | 39 | unsigned int _pgrp_u32(unsigned int p, unsigned int m) { 40 | unsigned int zeros = _pext_u32(p, ~m) << (_mm_popcnt_u32(m)); 41 | unsigned int ones = _pext_u32(p, m); 42 | return zeros | ones; 43 | } 44 | 45 | unsigned __int64 _pgrp_u64(unsigned __int64 p, unsigned __int64 m) { 46 | unsigned __int64 zeros = _pext_u64(p, ~m) << (_mm_popcnt_u64(m)); 47 | unsigned __int64 ones = _pext_u64(p, m); 48 | return zeros | ones; 49 | } 50 | 51 | void HWBITPERM_Check64(void) { 52 | __m512i p, m, ref_ext, ref_dep, ref_grp; 53 | 54 | for (int j = 0; j < 8; j++) { 55 | for (int i = 0; i < 8; i++) { 56 | int b = 8 * j + i; 57 | p.m512i_u64[i] = ~0ULL; 58 | m.m512i_u64[i] = (((1ULL) << b) + ((1ULL << 63) >> b)) | (1ULL << 32) | (1ULL << 32) | (1ULL << 16) | (1ULL << 48); 59 | ref_ext.m512i_i64[i] = _pext_u64(p.m512i_u64[i], m.m512i_u64[i]); 60 | ref_dep.m512i_i64[i] = _pdep_u64(p.m512i_u64[i], m.m512i_u64[i]); 61 | ref_grp.m512i_i64[i] = _pgrp_u64(p.m512i_u64[i], m.m512i_u64[i]); 62 | } 63 | HWBITPERM_Compare(p, m, ref_ext, 
64, BEXT); 64 | HWBITPERM_Compare(p, m, ref_dep, 64, BDEP); 65 | HWBITPERM_Compare(p, m, ref_grp, 64, BGRP); 66 | } 67 | 68 | for (int j = 0; j < 8; j++) { 69 | for (int i = 0; i < 8; i++) { 70 | int b = 8 * j + i; 71 | p.m512i_u64[i] = (((1ULL) << b) + ((1ULL << 63) >> b)) | (1ULL << 32) | (1ULL << 32) | (1ULL << 16) | (1ULL << 48); 72 | m.m512i_u64[i] = ~0ULL; 73 | ref_ext.m512i_i64[i] = _pext_u64(p.m512i_u64[i], m.m512i_u64[i]); 74 | ref_dep.m512i_i64[i] = _pdep_u64(p.m512i_u64[i], m.m512i_u64[i]); 75 | ref_grp.m512i_i64[i] = _pgrp_u64(p.m512i_u64[i], m.m512i_u64[i]); 76 | } 77 | HWBITPERM_Compare(p, m, ref_ext, 64, BEXT); 78 | HWBITPERM_Compare(p, m, ref_dep, 64, BDEP); 79 | HWBITPERM_Compare(p, m, ref_grp, 64, BGRP); 80 | } 81 | 82 | for (int j = 0; j < 8; j++) { 83 | for (int i = 0; i < 8; i++) { 84 | int b = 8 * j + i; 85 | p.m512i_u64[i] = ~0ULL; 86 | m.m512i_u64[i] = _bzhi_u64(~0, b + 1); 87 | ref_ext.m512i_i64[i] = _pext_u64(p.m512i_u64[i], m.m512i_u64[i]); 88 | ref_dep.m512i_i64[i] = _pdep_u64(p.m512i_u64[i], m.m512i_u64[i]); 89 | ref_grp.m512i_i64[i] = _pgrp_u64(p.m512i_u64[i], m.m512i_u64[i]); 90 | } 91 | HWBITPERM_Compare(p, m, ref_ext, 64, BEXT); 92 | HWBITPERM_Compare(p, m, ref_dep, 64, BDEP); 93 | HWBITPERM_Compare(p, m, ref_grp, 64, BGRP); 94 | } 95 | 96 | for (int j = 0; j < 1000; j++) { 97 | for (int i = 0; i < 8; i++) { 98 | while (!_rdrand64_step(&p.m512i_u64[i])); 99 | while (!_rdrand64_step(&m.m512i_u64[i])); 100 | ref_ext.m512i_i64[i] = _pext_u64(p.m512i_u64[i], m.m512i_u64[i]); 101 | ref_dep.m512i_i64[i] = _pdep_u64(p.m512i_u64[i], m.m512i_u64[i]); 102 | ref_grp.m512i_i64[i] = _pgrp_u64(p.m512i_u64[i], m.m512i_u64[i]); 103 | } 104 | HWBITPERM_Compare(p, m, ref_ext, 64, BEXT); 105 | HWBITPERM_Compare(p, m, ref_dep, 64, BDEP); 106 | HWBITPERM_Compare(p, m, ref_grp, 64, BGRP); 107 | } 108 | } 109 | 110 | void HWBITPERM_Check32(void) { 111 | __m512i p, m, ref_ext, ref_dep, ref_grp; 112 | 113 | for (int j = 0; j < 4; j++) { 114 | for (int i = 
0; i < 16; i++) { 115 | int b = 16 * j + i; 116 | p.m512i_u32[i] = ~0UL; 117 | m.m512i_u32[i] = (((1UL) << b) + ((1UL << 31) >> b)) | (1UL << 16); 118 | ref_ext.m512i_u32[i] = _pext_u32(p.m512i_u32[i], m.m512i_u32[i]); 119 | ref_dep.m512i_u32[i] = _pdep_u32(p.m512i_u32[i], m.m512i_u32[i]); 120 | ref_grp.m512i_u32[i] = _pgrp_u32(p.m512i_u32[i], m.m512i_u32[i]); 121 | } 122 | HWBITPERM_Compare(p, m, ref_ext, 32, BEXT); 123 | HWBITPERM_Compare(p, m, ref_dep, 32, BDEP);; 124 | HWBITPERM_Compare(p, m, ref_grp, 32, BGRP);; 125 | } 126 | 127 | for (int j = 0; j < 4; j++) { 128 | for (int i = 0; i < 16; i++) { 129 | int b = 16 * j + i; 130 | p.m512i_u32[i] = (((1UL) << b) + ((1UL << 31) >> b)) | (1UL << 16); 131 | m.m512i_u32[i] = ~0UL; 132 | ref_ext.m512i_u32[i] = _pext_u32(p.m512i_u32[i], m.m512i_u32[i]); 133 | ref_dep.m512i_u32[i] = _pdep_u32(p.m512i_u32[i], m.m512i_u32[i]); 134 | ref_grp.m512i_u32[i] = _pgrp_u32(p.m512i_u32[i], m.m512i_u32[i]); 135 | } 136 | HWBITPERM_Compare(p, m, ref_ext, 32, BEXT); 137 | HWBITPERM_Compare(p, m, ref_dep, 32, BDEP);; 138 | HWBITPERM_Compare(p, m, ref_grp, 32, BGRP);; 139 | } 140 | 141 | for (int j = 0; j < 2; j++) { 142 | for (int i = 0; i < 16; i++) { 143 | int b = 16 * j + i; 144 | p.m512i_u32[i] = ~0UL; 145 | m.m512i_u32[i] = _bzhi_u32(~0, b + 1); 146 | ref_ext.m512i_u32[i] = _pext_u32(p.m512i_u32[i], m.m512i_u32[i]); 147 | ref_dep.m512i_u32[i] = _pdep_u32(p.m512i_u32[i], m.m512i_u32[i]); 148 | ref_grp.m512i_u32[i] = _pgrp_u32(p.m512i_u32[i], m.m512i_u32[i]); 149 | } 150 | HWBITPERM_Compare(p, m, ref_ext, 32, BEXT); 151 | HWBITPERM_Compare(p, m, ref_dep, 32, BDEP); 152 | HWBITPERM_Compare(p, m, ref_grp, 32, BGRP); 153 | } 154 | 155 | for (int j = 0; j < 1000; j++) { 156 | for (int i = 0; i < 16; i++) { 157 | while (!_rdrand32_step(&p.m512i_u32[i])); 158 | while (!_rdrand32_step(&m.m512i_u32[i])); 159 | ref_ext.m512i_u32[i] = _pext_u32(p.m512i_u32[i], m.m512i_u32[i]); 160 | ref_dep.m512i_u32[i] = _pdep_u32(p.m512i_u32[i], 
m.m512i_u32[i]); 161 | ref_grp.m512i_u32[i] = _pgrp_u32(p.m512i_u32[i], m.m512i_u32[i]); 162 | } 163 | HWBITPERM_Compare(p, m, ref_ext, 32, BEXT); 164 | HWBITPERM_Compare(p, m, ref_dep, 32, BDEP); 165 | HWBITPERM_Compare(p, m, ref_grp, 32, BGRP); 166 | } 167 | } 168 | 169 | void HWBITPERM_Time(int method) { 170 | unsigned __int64 minlat = ULONG_MAX; 171 | unsigned __int64 mintp = ULONG_MAX; 172 | for (int retry = 0; retry < DEPEXT219_RETRIES; retry++) { 173 | minlat = min(minlat, (hw_reference[method].lat)()); 174 | } 175 | for (int retry = 0; retry < DEPEXT219_RETRIES; retry++) { 176 | mintp = min(mintp, (hw_reference[method].tp)()); 177 | } 178 | cout << hw_reference[method].isaName << hw_reference[method].name << ": " << (double)minlat / (double)DEPEXT219_REPEATS << " | "; 179 | cout << (double)mintp / (double)DEPEXT219_REPEATS << " L|T"; 180 | cout << endl; 181 | } 182 | 183 | void HWBITPERM_Test() { 184 | SetThread(2); 185 | HWBITPERM_Check64(); 186 | HWBITPERM_Check32(); 187 | 188 | cout << setw(5) << fixed << setprecision(2); 189 | cout << endl<< "HW/Scalar TSC CLKs:----------------------" << endl; 190 | for (int b = 0; b < (sizeof(hw_reference) / sizeof(bitperm_methods)); b++) { 191 | if ((cpu_props.IsFeat(hw_reference[b].feats)) && (hw_reference[b].lat != NULL) && (hw_reference[b].tp != NULL)) 192 | HWBITPERM_Time(b); 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /Zen4_Demo_Imm8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define ZEN4_FUNCDEF_I8(INST, OPERANDS, I8) \ 4 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_lat(void); \ 5 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_tp(void); \ 6 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port01(void); \ 7 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port23(void); \ 8 | extern 
"C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port12(void); \ 9 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port123(void); \ 10 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port0123(void);\ 11 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port45(void); \ 12 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_tern(void); \ 13 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_LDs(void); \ 14 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port1(void); 15 | 16 | #define ZEN4_FUNCDECL_I8(NAME, INST, OPERANDS, I8) \ 17 | {#NAME, {\ 18 | Zen4_##INST##_##OPERANDS##_##I8##_lat, \ 19 | Zen4_##INST##_##OPERANDS##_##I8##_tp, \ 20 | Zen4_##INST##_##OPERANDS##_##I8##_port01, \ 21 | Zen4_##INST##_##OPERANDS##_##I8##_port23, \ 22 | Zen4_##INST##_##OPERANDS##_##I8##_port12, \ 23 | Zen4_##INST##_##OPERANDS##_##I8##_port123, \ 24 | Zen4_##INST##_##OPERANDS##_##I8##_port0123, \ 25 | Zen4_##INST##_##OPERANDS##_##I8##_port45, \ 26 | Zen4_##INST##_##OPERANDS##_##I8##_tern, \ 27 | Zen4_##INST##_##OPERANDS##_##I8##_LDs, \ 28 | Zen4_##INST##_##OPERANDS##_##I8##_port1 \ 29 | }}, 30 | 31 | ZEN4_FUNCDEF_I8(vextracti128, ymmI82xmm, 000h) 32 | ZEN4_FUNCDEF_I8(vextractf128, ymmI82xmm, 000h) 33 | ZEN4_FUNCDEF_I8(vextracti128, ymmI82xmm, 001h) 34 | ZEN4_FUNCDEF_I8(vextractf128, ymmI82xmm, 001h) 35 | 36 | ZEN4_FUNCDEF_I8(vextracti32x4, ymmi82xmm, 000h) 37 | ZEN4_FUNCDEF_I8(vextractf32x4, ymmi82xmm, 000h) 38 | ZEN4_FUNCDEF_I8(vextracti32x4, ymmi82xmm, 001h) 39 | ZEN4_FUNCDEF_I8(vextractf32x4, ymmi82xmm, 001h) 40 | 41 | ZEN4_FUNCDEF_I8(vextracti64x2, ymmi82xmm, 000h) 42 | ZEN4_FUNCDEF_I8(vextractf64x2, ymmi82xmm, 000h) 43 | ZEN4_FUNCDEF_I8(vextracti64x2, ymmi82xmm, 001h) 44 | ZEN4_FUNCDEF_I8(vextractf64x2, ymmi82xmm, 001h) 45 | 46 | ZEN4_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 000h) 47 | ZEN4_FUNCDEF_I8(vextractf32x4, 
zmmi82xmm, 000h) 48 | ZEN4_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 001h) 49 | ZEN4_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 001h) 50 | ZEN4_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 002h) 51 | ZEN4_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 002h) 52 | ZEN4_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 003h) 53 | ZEN4_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 003h) 54 | 55 | ZEN4_FUNCDEF_I8(vextracti32x8, zmmi82ymm, 000h) 56 | ZEN4_FUNCDEF_I8(vextractf32x8, zmmi82ymm, 000h) 57 | ZEN4_FUNCDEF_I8(vextracti32x8, zmmi82ymm, 001h) 58 | ZEN4_FUNCDEF_I8(vextractf32x8, zmmi82ymm, 001h) 59 | 60 | ZEN4_FUNCDEF_I8(vinserti128, xmmymmI82ymm, 000h) 61 | ZEN4_FUNCDEF_I8(vinsertf128, xmmymmI82ymm, 000h) 62 | ZEN4_FUNCDEF_I8(vinserti128, xmmymmI82ymm, 001h) 63 | ZEN4_FUNCDEF_I8(vinsertf128, xmmymmI82ymm, 001h) 64 | 65 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmymmI82ymm, 000h) 66 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmymmI82ymm, 000h) 67 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmymmI82ymm, 001h) 68 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmymmI82ymm, 001h) 69 | 70 | ZEN4_FUNCDEF_I8(vinserti64x2, xmmymmI82ymm, 000h) 71 | ZEN4_FUNCDEF_I8(vinsertf64x2, xmmymmI82ymm, 000h) 72 | ZEN4_FUNCDEF_I8(vinserti64x2, xmmymmI82ymm, 001h) 73 | ZEN4_FUNCDEF_I8(vinsertf64x2, xmmymmI82ymm, 001h) 74 | 75 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 000h) 76 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 000h) 77 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 001h) 78 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 001h) 79 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 002h) 80 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 002h) 81 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 003h) 82 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 003h) 83 | 84 | ZEN4_FUNCDEF_I8(vinserti32x8, ymmzmmI82zmm, 000h) 85 | ZEN4_FUNCDEF_I8(vinsertf32x8, ymmzmmI82zmm, 000h) 86 | ZEN4_FUNCDEF_I8(vinserti32x8, ymmzmmI82zmm, 001h) 87 | ZEN4_FUNCDEF_I8(vinsertf32x8, ymmzmmI82zmm, 001h) 88 | 89 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 000h) 90 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 000h) 
91 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 001h) 92 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 001h) 93 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 002h) 94 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 002h) 95 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 003h) 96 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 003h) 97 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 008h) 98 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 008h) 99 | 100 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 010h) 101 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 010h) 102 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 011h) 103 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 011h) 104 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 012h) 105 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 012h) 106 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 013h) 107 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 013h) 108 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 018h) 109 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 018h) 110 | 111 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 020h) 112 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 020h) 113 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 021h) 114 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 021h) 115 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 022h) 116 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 022h) 117 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 023h) 118 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 023h) 119 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 028h) 120 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 028h) 121 | 122 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 030h) 123 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 030h) 124 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 031h) 125 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 031h) 126 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 032h) 127 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 032h) 128 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 033h) 129 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 033h) 130 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 038h) 131 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 038h) 132 | 133 | 
ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 080h) 134 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 080h) 135 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 081h) 136 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 081h) 137 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 082h) 138 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 082h) 139 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 083h) 140 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 083h) 141 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 088h) 142 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 088h) 143 | 144 | ZEN4_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 000h) 145 | ZEN4_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 000h) 146 | ZEN4_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 001h) 147 | ZEN4_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 001h) 148 | ZEN4_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 002h) 149 | ZEN4_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 002h) 150 | ZEN4_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 003h) 151 | ZEN4_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 003h) 152 | ZEN4_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 000h) 153 | ZEN4_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 000h) 154 | ZEN4_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 001h) 155 | ZEN4_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 001h) 156 | ZEN4_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 002h) 157 | ZEN4_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 002h) 158 | ZEN4_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 003h) 159 | ZEN4_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 003h) 160 | 161 | ZEN4_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 000h) 162 | ZEN4_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 000h) 163 | ZEN4_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 044h) 164 | ZEN4_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 044h) 165 | ZEN4_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 0e4h) 166 | ZEN4_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 0e4h) 167 | ZEN4_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 0a5h) 168 | ZEN4_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 0a5h) 169 | 170 | ZEN4_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 000h) 171 | ZEN4_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 000h) 172 | ZEN4_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 044h) 173 | ZEN4_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 044h) 174 | 
ZEN4_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 0e4h) 175 | ZEN4_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 0e4h) 176 | ZEN4_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 0a5h) 177 | ZEN4_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 0a5h) -------------------------------------------------------------------------------- /AVX512_BGVSER.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "conio.h" 3 | #include "AVX512_BGVSER.h" 4 | 5 | using namespace std; 6 | 7 | __m256i _mm256_bsrli_epi256(__m256i a, int b) { 8 | return _mm256_maskz_compress_epi8(~0UL << b, a); //left shift is correct here 9 | } 10 | 11 | __m256i _mm256_bslli_epi256(__m256i a, int b) { 12 | return _mm256_maskz_expand_epi8(~0UL << b, a); 13 | } 14 | 15 | __m256i _mm256_palignr_epi256(__m256i a, __m256i b, int c) { 16 | const __m256i disp = _mm256_setr_epi8( 17 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 18 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f); 19 | __m256i idx = _mm256_add_epi8(disp, _mm256_set1_epi8(c)); 20 | return _mm256_permutex2var_epi8(a, idx, b); 21 | } 22 | 23 | __m256i _mm256_palignl_epi256(__m256i a, __m256i b, int c) { 24 | const __m256i disp = _mm256_setr_epi8( 25 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 26 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f); 27 | __m256i idx = _mm256_sub_epi8(disp, _mm256_set1_epi8(c)); 28 | return _mm256_permutex2var_epi8(a, idx, b); 29 | } 30 | 31 | __m256i _mm256_rotater_epi256(__m256i a, int c) { 32 | const __m256i disp = _mm256_setr_epi8( 33 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 34 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f); 35 | __m256i idx = _mm256_add_epi8(disp, _mm256_set1_epi8(c)); 36 
| return _mm256_permutexvar_epi8(idx, a); 37 | } 38 | 39 | __m256i _mm256_rotatel_epi256(__m256i a, int c) { 40 | const __m256i disp = _mm256_setr_epi8( 41 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 42 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f); 43 | __m256i idx = _mm256_sub_epi8(disp, _mm256_set1_epi8(c)); 44 | return _mm256_permutexvar_epi8(idx, a); 45 | } 46 | 47 | //BSRLI older 48 | //vpbroadcastb zmm1, rax ;P5 49 | //mov rcx, -1 ;P0156B 50 | //shlx rcx, rcx, rax ;P06 51 | //kmovq k1, rcx ;P5 52 | //vpaddb zmm1, zmm1, [disp] ;P05+P23A 53 | //vpermb zmm0 {k1}{z}, zmm1, zmm0 ;P5 54 | // 55 | //7 uops: P0156B+P06+3*P5+P23A+P05 56 | 57 | //shorter: 58 | //mov rcx, -1 ;P0156B 59 | //shlx rcx, rcx, rax ;P06 60 | //kmovq k1, rcx ;P5 61 | //vpcompressb zmm0 {k1}{z}, zmm0 ;2*P5 62 | // 63 | //5 uops: P0156B+P06+3*P5 64 | 65 | __m512i _mm512_bsrli_epi512(__m512i a, int b) { 66 | return _mm512_maskz_compress_epi8(~0ULL << b, a); //left shift is correct here 67 | } 68 | 69 | __m512i _mm512_bslli_epi512(__m512i a, int b) { 70 | return _mm512_maskz_expand_epi8(~0ULL << b, a); 71 | } 72 | 73 | __m512i _mm512_palignr_epi512(__m512i a, __m512i b, int c) { 74 | const __m512i disp = _mm512_setr_epi8( 75 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 76 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 77 | 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 78 | 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f); 79 | __m512i idx = _mm512_add_epi8(disp, _mm512_set1_epi8(c)); 80 | return _mm512_permutex2var_epi8(a, idx, b); 81 | } 82 | 83 | __m512i _mm512_palignl_epi512(__m512i a, __m512i b, int c) { 84 | const __m512i disp = _mm512_setr_epi8( 85 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 
0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 86 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 87 | 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 88 | 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f); 89 | __m512i idx = _mm512_sub_epi8(disp, _mm512_set1_epi8(c)); 90 | return _mm512_permutex2var_epi8(a, idx, b); 91 | } 92 | 93 | __m512i _mm512_rotater_epi512(__m512i a, int c) { 94 | const __m512i disp = _mm512_setr_epi8( 95 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 96 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 97 | 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 98 | 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f); 99 | __m512i idx = _mm512_add_epi8(disp, _mm512_set1_epi8(c)); 100 | return _mm512_permutexvar_epi8(idx, a); 101 | } 102 | 103 | __m512i _mm512_rotatel_epi512(__m512i a, int c) { 104 | const __m512i disp = _mm512_setr_epi8( 105 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 106 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 107 | 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 108 | 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f); 109 | __m512i idx = _mm512_sub_epi8(disp, _mm512_set1_epi8(c)); 110 | return _mm512_permutexvar_epi8(idx, a); 111 | } 112 | 113 | #pragma warning(disable : 4309) 114 | void YMM_Test(void) { 115 | const __m256i testdata0 = _mm256_setr_epi8( 116 | 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 117 | 0x50, 0x51, 0x52, 0x53, 
0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f); 118 | const __m256i testdata1 = _mm256_setr_epi8( 119 | 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 120 | 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f); 121 | 122 | cout << "_mm256_bslli_epi256" << endl; 123 | for (int i = 0; i < 32; i++) { 124 | printRes(i, _mm256_bslli_epi256(testdata0, i)); 125 | } 126 | 127 | cout << "_mm256_bsrli_epi256" << endl; 128 | for (int i = 0; i < 32; i++) { 129 | printRes(i, _mm256_bsrli_epi256(testdata0, i)); 130 | } 131 | 132 | cout << "_mm256_palignr_epi256" << endl; 133 | for (int i = 0; i < 32; i++) { 134 | printRes(i, _mm256_palignr_epi256(testdata0, testdata1, i)); 135 | } 136 | 137 | cout << "_mm256_palignl_epi256" << endl; 138 | for (int i = 0; i < 32; i++) { 139 | printRes(i, _mm256_palignl_epi256(testdata1, testdata0, i)); 140 | } 141 | 142 | cout << "_mm256_rotater_epi256" << endl; 143 | for (int i = 0; i < 32; i++) { 144 | printRes(i, _mm256_rotater_epi256(testdata1, i)); 145 | } 146 | 147 | cout << "_mm256_rotatel_epi256" << endl; 148 | for (int i = 0; i < 32; i++) { 149 | printRes(i, _mm256_rotatel_epi256(testdata1, i)); 150 | } 151 | } 152 | 153 | void ZMM_Test(void) { 154 | const __m512i testdata0 = _mm512_setr_epi8( 155 | 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 156 | 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 157 | 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 158 | 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f); 159 | const __m512i testdata1 = _mm512_setr_epi8( 160 | 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 161 | 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 
0x9c, 0x9d, 0x9e, 0x9f, 162 | 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 163 | 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf); 164 | 165 | cout << "_mm512_bslli_epi512" << endl; 166 | for (int i = 0; i < 64; i++) { 167 | printRes(i, _mm512_bslli_epi512(testdata0, i)); 168 | } 169 | 170 | cout << "_mm512_bsrli_epi512" << endl; 171 | for (int i = 0; i < 64; i++) { 172 | printRes(i, _mm512_bsrli_epi512(testdata0, i)); 173 | } 174 | 175 | cout << "_mm512_palignr_epi512" << endl; 176 | for (int i = 0; i < 64; i++) { 177 | printRes(i, _mm512_palignr_epi512(testdata0, testdata1, i)); 178 | } 179 | 180 | cout << "_mm512_palignl_epi512" << endl; 181 | for (int i = 0; i < 64; i++) { 182 | printRes(i, _mm512_palignl_epi512(testdata1, testdata0, i)); 183 | } 184 | 185 | cout << "_mm512_rotater_epi512" << endl; 186 | for (int i = 0; i < 64; i++) { 187 | printRes(i, _mm512_rotater_epi512(testdata1, i)); 188 | } 189 | 190 | cout << "_mm512_rotatel_epi512" << endl; 191 | for (int i = 0; i < 64; i++) { 192 | printRes(i, _mm512_rotatel_epi512(testdata1, i)); 193 | } 194 | 195 | } 196 | 197 | void AVX512_BGVSER_Test(void) { 198 | YMM_Test(); 199 | ZMM_Test(); 200 | } 201 | 202 | #pragma warning(default : 4309) 203 | -------------------------------------------------------------------------------- /P06P1.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "P06P1.h" 3 | 4 | extern CPU_Props cpu_props; 5 | extern Args args; 6 | 7 | #define P06P1_RETRIES 10 8 | #define P06P1_REPEATS 1000 9 | 10 | using namespace std; 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | P06P1_FUNCDEF(BT_RAX_RCX, CX) 17 | P06P1_FUNCDEF(BTC_RAX_RCX, CX) 18 | P06P1_FUNCDEF(BTR_RAX_RCX, CX) 19 | P06P1_FUNCDEF(BTS_RAX_RCX, CX) 20 | P06P1_FUNCDEF(BZHI_RAX_RAX_RCX, CX) 21 | P06P1_FUNCDEF(BEXTR_RAX_RAX_RCX, CX) 22 | 23 | 
P06P1_FUNCDEF(BT_RAX_RCX, AX) 24 | P06P1_FUNCDEF(BTC_RAX_RCX, AX) 25 | P06P1_FUNCDEF(BTR_RAX_RCX, AX) 26 | P06P1_FUNCDEF(BTS_RAX_RCX, AX) 27 | P06P1_FUNCDEF(BZHI_RAX_RAX_RCX, AX) 28 | P06P1_FUNCDEF(BEXTR_RAX_RAX_RCX, AX) 29 | 30 | P06P1_FUNCDEF(SHLX_RAX_RAX_RCX, CX) 31 | P06P1_FUNCDEF(SHL_RAX_CL, CX) 32 | P06P1_FUNCDEF(SHLX_RAX_RAX_RCX, AX) 33 | P06P1_FUNCDEF(SHL_RAX_CL, AX) 34 | P06P1_FUNCDEF(SHL_RAX_IMM8, AX) 35 | P06P1_FUNCDEF(SHL_RAX_IMPL1, AX) 36 | 37 | P06P1_FUNCDEF(SHRX_RAX_RAX_RCX, CX) 38 | P06P1_FUNCDEF(SHR_RAX_CL, CX) 39 | P06P1_FUNCDEF(SHRX_RAX_RAX_RCX, AX) 40 | P06P1_FUNCDEF(SHR_RAX_CL, AX) 41 | P06P1_FUNCDEF(SHR_RAX_IMM8, AX) 42 | P06P1_FUNCDEF(SHR_RAX_IMPL1, AX) 43 | 44 | P06P1_FUNCDEF(SARX_RAX_RAX_RCX, CX) 45 | P06P1_FUNCDEF(SAR_RAX_CL, CX) 46 | P06P1_FUNCDEF(SARX_RAX_RAX_RCX, AX) 47 | P06P1_FUNCDEF(SAR_RAX_CL, AX) 48 | P06P1_FUNCDEF(SAR_RAX_IMM8, AX) 49 | P06P1_FUNCDEF(SAR_RAX_IMPL1, AX) 50 | 51 | P06P1_FUNCDEF(RORX_RCX_RCX_IMM8, CX) 52 | P06P1_FUNCDEF(ROR_RAX_CL, CX) 53 | P06P1_FUNCDEF(ROR_RAX_CL, AX) 54 | P06P1_FUNCDEF(ROR_RAX_IMM8, AX) 55 | P06P1_FUNCDEF(ROR_RAX_IMPL1, AX) 56 | 57 | P06P1_FUNCDEF(ROL_RAX_CL, CX) 58 | P06P1_FUNCDEF(ROL_RAX_CL, AX) 59 | P06P1_FUNCDEF(ROL_RAX_IMM8, AX) 60 | P06P1_FUNCDEF(ROL_RAX_IMPL1, AX) 61 | 62 | P06P1_FUNCDEF(RCR_RAX_CL, CX) 63 | P06P1_FUNCDEF(RCR_RAX_CL, AX) 64 | P06P1_FUNCDEF(RCR_RAX_IMM8, AX) 65 | P06P1_FUNCDEF(RCR_RAX_IMPL1, AX) 66 | 67 | P06P1_FUNCDEF(RCL_RAX_CL, CX) 68 | P06P1_FUNCDEF(RCL_RAX_CL, AX) 69 | P06P1_FUNCDEF(RCL_RAX_IMM8, AX) 70 | P06P1_FUNCDEF(RCL_RAX_IMPL1, AX) 71 | 72 | P06P1_FUNCDEF(ADC_RAX_IMM8, AX) 73 | P06P1_FUNCDEF(SBB_RAX_IMM8, AX) 74 | 75 | P06P1_FUNCDEF(ADCX_RAX_RCX, CX) 76 | P06P1_FUNCDEF(ADOX_RAX_RCX, CX) 77 | P06P1_FUNCDEF(ADCX_RAX_RCX, AX) 78 | P06P1_FUNCDEF(ADOX_RAX_RCX, AX) 79 | 80 | P06P1_FUNCDEF(CMOVBE_RAX_RCX, CX) 81 | P06P1_FUNCDEF(CMOVNBE_RAX_RCX, CX) 82 | P06P1_FUNCDEF(CMOVZ_RAX_RCX, CX) 83 | P06P1_FUNCDEF(CMOVNZ_RAX_RCX, CX) 84 | 85 | P06P1_FUNCDEF(CMOVBE_RAX_RCX, AX) 86 
| P06P1_FUNCDEF(CMOVNBE_RAX_RCX, AX) 87 | P06P1_FUNCDEF(CMOVZ_RAX_RCX, AX) 88 | P06P1_FUNCDEF(CMOVNZ_RAX_RCX, AX) 89 | 90 | P06P1_FUNCDEF(BSWAP_RAX, AX) 91 | P06P1_FUNCDEF(POPCNT_RCX_RCX, CX) 92 | P06P1_FUNCDEF(LZCNT_RCX_RCX, CX) 93 | P06P1_FUNCDEF(TZCNT_RCX_RCX, CX) 94 | 95 | P06P1_FUNCDEF(BSR_RCX_RCX, CX) 96 | P06P1_FUNCDEF(BSF_RCX_RCX, CX) 97 | 98 | P06P1_FUNCDEF(CRC32_RCX_RCX, CX) 99 | P06P1_FUNCDEF(PDEP_RAX_RAX_RCX, CX) 100 | P06P1_FUNCDEF(PEXT_RAX_RAX_RCX, CX) 101 | 102 | 103 | #ifdef __cplusplus 104 | } 105 | #endif 106 | 107 | measure_methods P06P1_affected[] = { 108 | P06P1_FUNC("SHLX RAX, RAX, RCX", SHLX_RAX_RAX_RCX, CX, BMI2, 1) 109 | P06P1_FUNC("SHRX RAX, RAX, RCX", SHRX_RAX_RAX_RCX, CX, BMI2, 1) 110 | P06P1_FUNC("SARX RAX, RAX, RCX", SARX_RAX_RAX_RCX, CX, BMI2, 1) 111 | P06P1_FUNC("SHL RAX, CL", SHL_RAX_CL, CX, AMD64, 2) 112 | P06P1_FUNC("SHR RAX, CL", SHR_RAX_CL, CX, AMD64, 2) 113 | P06P1_FUNC("SAR RAX, CL", SAR_RAX_CL, CX, AMD64, 2) 114 | P06P1_FUNC("ROR RAX, CL", ROR_RAX_CL, CX, AMD64, 2) 115 | P06P1_FUNC("ROL RAX, CL", ROL_RAX_CL, CX, AMD64, 2) 116 | P06P1_FUNC("BT RAX RCX + ADC RAX,0", BT_RAX_RCX, CX, AMD64, 2) 117 | P06P1_FUNC("BTC RAX RCX", BTC_RAX_RCX, CX, AMD64, 1) 118 | P06P1_FUNC("BTR RAX, RCX", BTR_RAX_RCX, CX, AMD64, 1) 119 | P06P1_FUNC("BTS RAX, RCX", BTS_RAX_RCX, CX, AMD64, 1) 120 | P06P1_FUNC("BZHI RAX, RAX, RCX", BZHI_RAX_RAX_RCX, CX, BMI2, 1) 121 | P06P1_FUNC("BEXTR RAX, RAX, RCX", BEXTR_RAX_RAX_RCX, CX, BMI, 2) 122 | }; 123 | 124 | measure_methods P06P1_unaffected[] = { 125 | P06P1_FUNC("SHLX RAX, RAX, RCX", SHLX_RAX_RAX_RCX, AX, BMI2, 1) 126 | P06P1_FUNC("SHRX RAX, RAX, RCX", SHRX_RAX_RAX_RCX, AX, BMI2, 1) 127 | P06P1_FUNC("SARX RAX, RAX, RCX", SARX_RAX_RAX_RCX, AX, BMI2, 1) 128 | P06P1_FUNC("RORX RCX, RCX, IMM8", RORX_RCX_RCX_IMM8, CX, BMI2, 1) 129 | 130 | P06P1_FUNC("SHL RAX, CL", SHL_RAX_CL, AX, AMD64, 2) 131 | P06P1_FUNC("SHR RAX, CL", SHR_RAX_CL, AX, AMD64, 2) 132 | P06P1_FUNC("SAR RAX, CL", SAR_RAX_CL, AX, AMD64, 2) 133 | 
P06P1_FUNC("ROR RAX, CL", ROR_RAX_CL, AX, AMD64, 2) 134 | P06P1_FUNC("ROL RAX, CL", ROL_RAX_CL, AX, AMD64, 2) 135 | P06P1_FUNC("RCR RAX, CL", RCR_RAX_CL, AX, AMD64, 7) 136 | P06P1_FUNC("RCL RAX, CL", RCL_RAX_CL, AX, AMD64, 7) 137 | 138 | P06P1_FUNC("RCR RAX, CL", RCR_RAX_CL, CX, AMD64, 7) 139 | P06P1_FUNC("RCL RAX, CL", RCL_RAX_CL, CX, AMD64, 7) 140 | 141 | P06P1_FUNC("SHL RAX, IMM8", SHL_RAX_IMM8, AX, AMD64, 1) 142 | P06P1_FUNC("SHR RAX, IMM8", SHR_RAX_IMM8, AX, AMD64, 1) 143 | P06P1_FUNC("SAR RAX, IMM8", SAR_RAX_IMM8, AX, AMD64, 1) 144 | P06P1_FUNC("ROR RAX, IMM8", ROR_RAX_IMM8, AX, AMD64, 1) 145 | P06P1_FUNC("ROL RAX, IMM8", ROL_RAX_IMM8, AX, AMD64, 1) 146 | P06P1_FUNC("RCR RAX, IMM8", RCR_RAX_IMM8, AX, AMD64, 7) 147 | P06P1_FUNC("RCL RAX, IMM8", RCL_RAX_IMM8, AX, AMD64, 7) 148 | 149 | P06P1_FUNC("SHL RAX, IMPL1", SHL_RAX_IMPL1, AX, AMD64, 1) 150 | P06P1_FUNC("SHR RAX, IMPL1", SHR_RAX_IMPL1, AX, AMD64, 1) 151 | P06P1_FUNC("SAR RAX, IMPL1", SAR_RAX_IMPL1, AX, AMD64, 1) 152 | P06P1_FUNC("ROR RAX, IMPL1", ROR_RAX_IMPL1, AX, AMD64, 2) 153 | P06P1_FUNC("ROL RAX, IMPL1", ROL_RAX_IMPL1, AX, AMD64, 2) 154 | P06P1_FUNC("RCR RAX, IMPL1", RCR_RAX_IMPL1, AX, AMD64, 3) 155 | P06P1_FUNC("RCL RAX, IMPL1", RCL_RAX_IMPL1, AX, AMD64, 3) 156 | 157 | 158 | P06P1_FUNC("ADC RAX, IMM8", ADC_RAX_IMM8, AX, AMD64, 1) 159 | P06P1_FUNC("SBB RAX, IMM8", SBB_RAX_IMM8, AX, AMD64, 1) 160 | 161 | P06P1_FUNC("ADCX RAX, RCX", ADCX_RAX_RCX, CX, ADX, 1) 162 | P06P1_FUNC("ADOX RAX, RCX", ADOX_RAX_RCX, CX, ADX, 1) 163 | P06P1_FUNC("ADCX RAX, RCX", ADCX_RAX_RCX, AX, ADX, 1) 164 | P06P1_FUNC("ADOX RAX, RCX", ADOX_RAX_RCX, AX, ADX, 1) 165 | 166 | P06P1_FUNC("CMOVBE RAX, RCX", CMOVBE_RAX_RCX, CX, CMOV, 2) 167 | P06P1_FUNC("CMOVNBE, RAX, RCX", CMOVNBE_RAX_RCX, CX, CMOV, 2) 168 | P06P1_FUNC("CMOVZ RAX, RCX", CMOVZ_RAX_RCX, CX, CMOV, 1) 169 | P06P1_FUNC("CMOVNZ RAX, RCX", CMOVNZ_RAX_RCX, CX, CMOV, 1) 170 | 171 | P06P1_FUNC("CMOVBE RAX, RCX", CMOVBE_RAX_RCX, AX, CMOV, 2) 172 | P06P1_FUNC("CMOVNBE, RAX, RCX", 
CMOVNBE_RAX_RCX, AX, CMOV, 2) 173 | P06P1_FUNC("CMOVZ RAX, RCX", CMOVZ_RAX_RCX, AX, CMOV, 1) 174 | P06P1_FUNC("CMOVNZ RAX, RCX", CMOVNZ_RAX_RCX, AX, CMOV, 1) 175 | 176 | P06P1_FUNC("BT RAX RCX + ADC RAX,0", BT_RAX_RCX, AX, AMD64, 2) 177 | P06P1_FUNC("BTC RAX RCX", BTC_RAX_RCX, AX, AMD64, 1) 178 | P06P1_FUNC("BTR RAX, RCX", BTR_RAX_RCX, AX, AMD64, 1) 179 | P06P1_FUNC("BTS RAX, RCX", BTS_RAX_RCX, AX, AMD64, 1) 180 | P06P1_FUNC("BZHI RAX, RAX, RCX", BZHI_RAX_RAX_RCX, AX, BMI2, 1) 181 | P06P1_FUNC("BEXTR RAX, RAX, RCX", BEXTR_RAX_RAX_RCX, AX, BMI, 2) 182 | 183 | P06P1_FUNC("BSWAP RAX", BSWAP_RAX, AX, AMD64, 2) 184 | P06P1_FUNC("POPCNT RCX, RCX", POPCNT_RCX_RCX, CX, POPCNT, 1) 185 | P06P1_FUNC("LZCNT RCX, RCX", LZCNT_RCX_RCX, CX, ABM, 1) 186 | P06P1_FUNC("TZCNT RCX, RCX", TZCNT_RCX_RCX, CX, BMI, 1) 187 | 188 | P06P1_FUNC("BSR RCX, RCX", BSR_RCX_RCX, CX, AMD64, 1) 189 | P06P1_FUNC("BSF RCX, RCX", BSF_RCX_RCX, CX, AMD64, 1) 190 | 191 | P06P1_FUNC("CRC32 RCX, RCX", CRC32_RCX_RCX, CX, SSE42, 1) 192 | P06P1_FUNC("PDEP RAX, RAX, RCX", PDEP_RAX_RAX_RCX, CX, BMI2, 1) 193 | P06P1_FUNC("PEXT RAX, RAX, RCX", PEXT_RAX_RAX_RCX, CX, BMI2, 1) 194 | }; 195 | 196 | void P0601_Time(measure_methods * m, int method, int testcase) { 197 | unsigned __int64 minres = ULONG_MAX; 198 | 199 | (m[method].func[testcase])(); 200 | for (int retry = 0; retry < P06P1_RETRIES; retry++) { 201 | minres = min(minres, (m[method].func[testcase])()); 202 | } 203 | cout << '\t' << setw(6) << right << (int)((double)minres / (double)P06P1_REPEATS); 204 | } 205 | 206 | void P0601(measure_methods * m, int instcount) { 207 | cout << "TSC CLKs:--------------------------------\t -1025\t -1024\t -513\t -512\t 511\t 512\t 1023\t 1024 (#uop)" << endl; 208 | 209 | for (int b = 0; b < instcount; b++) { 210 | cout << dec << setw(2) << b << ':'; 211 | cout << left << setw(INSTNAMELEN) << m[b].inst << ' '; 212 | cout << left << setw(INITLEN) << m[b].init << ": "; 213 | for (int t = 0; t < TESTCASE; t++) { 214 | if 
((cpu_props.IsFeat(m[b].feats)) && (m[b].func[t] != NULL)) 215 | P0601_Time(m, b, t); 216 | } 217 | cout << " (" << m[b].uopscount << ')' << endl; 218 | } 219 | } 220 | 221 | void P0601_Test(void) { 222 | SetThread(args.GetThreadIndex(cpu_props)); 223 | cout << "Affected Instructions:" << endl; 224 | P0601(P06P1_affected, sizeof(P06P1_affected) / sizeof(measure_methods)); 225 | 226 | cout << "Unaffected Instructions:" << endl; 227 | P0601(P06P1_unaffected, sizeof(P06P1_unaffected) / sizeof(measure_methods)); 228 | } -------------------------------------------------------------------------------- /InstLatX64_Demo.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | Source Files 26 | 27 | 28 | Source Files 29 | 30 | 31 | Source Files 32 | 33 | 34 | Source Files 35 | 36 | 37 | Source Files 38 | 39 | 40 | Source Files 41 | 42 | 43 | Source Files 44 | 45 | 46 | Source Files 47 | 48 | 49 | Source Files 50 | 51 | 52 | Source Files 53 | 54 | 55 | Source Files 56 | 57 | 58 | Source Files 59 | 60 | 61 | Source Files 62 | 63 | 64 | Source Files 65 | 66 | 67 | Source Files 68 | 69 | 70 | Source Files 71 | 72 | 73 | Source Files 74 | 75 | 76 | Source Files 77 | 78 | 79 | Source Files 80 | 81 | 82 | Source Files 83 | 84 | 85 | Source Files 86 | 87 | 88 | Source Files 89 | 90 | 91 | Source Files 92 | 93 | 94 | Source Files 95 | 96 | 97 | Source Files 98 | 99 | 100 | Source Files 101 | 102 | 103 | 104 | 105 | Header Files 106 | 107 | 108 | Header Files 109 | 110 | 111 | Header Files 112 | 113 | 114 | Header Files 115 
| 116 | 117 | Header Files 118 | 119 | 120 | Header Files 121 | 122 | 123 | Header Files 124 | 125 | 126 | Header Files 127 | 128 | 129 | Header Files 130 | 131 | 132 | Header Files 133 | 134 | 135 | Header Files 136 | 137 | 138 | Header Files 139 | 140 | 141 | Header Files 142 | 143 | 144 | Header Files 145 | 146 | 147 | Header Files 148 | 149 | 150 | Header Files 151 | 152 | 153 | Header Files 154 | 155 | 156 | Header Files 157 | 158 | 159 | Header Files 160 | 161 | 162 | Header Files 163 | 164 | 165 | Header Files 166 | 167 | 168 | Header Files 169 | 170 | 171 | Header Files 172 | 173 | 174 | Header Files 175 | 176 | 177 | Header Files 178 | 179 | 180 | Header Files 181 | 182 | 183 | Header Files 184 | 185 | 186 | Header Files 187 | 188 | 189 | Header Files 190 | 191 | 192 | Header Files 193 | 194 | 195 | Header Files 196 | 197 | 198 | Header Files 199 | 200 | 201 | Header Files 202 | 203 | 204 | Header Files 205 | 206 | 207 | Header Files 208 | 209 | 210 | Header Files 211 | 212 | 213 | Header Files 214 | 215 | 216 | Header Files 217 | 218 | 219 | Header Files 220 | 221 | 222 | 223 | 224 | Source Files 225 | 226 | 227 | Source Files 228 | 229 | 230 | Source Files 231 | 232 | 233 | Source Files 234 | 235 | 236 | Source Files 237 | 238 | 239 | Source Files 240 | 241 | 242 | Source Files 243 | 244 | 245 | Source Files 246 | 247 | 248 | Source Files 249 | 250 | 251 | Source Files 252 | 253 | 254 | Source Files 255 | 256 | 257 | Source Files 258 | 259 | 260 | Source Files 261 | 262 | 263 | Source Files 264 | 265 | 266 | Source Files 267 | 268 | 269 | -------------------------------------------------------------------------------- /Args.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | 3 | const paramsType Args::params[] = { 4 | {false, "help", 'h', ARG_HELP, NULL, "this help"}, 5 | {false, "help", '?', ARG_HELP, NULL, "this help"}, 6 | {false, "version", 'v', ARG_VERSION, NULL, "version info"}, 7 | 
{false, "list", 'l', ARG_DEMOLIST, NULL, "list of demo types"}, 8 | {false, "cpu", 'c', ARG_CPUPROPS, NULL, "list of CPU properties"}, 9 | {false, "pcore", '\0', ARG_PCORE, NULL, "using Performance core on hybrid CPU"}, 10 | {false, "ecore", '\0', ARG_ECORE, NULL, "using Efficient core on hybrid CPU"}, 11 | {false, "lcore", '\0', ARG_LPECORE, NULL, "using LP-E core on hybrid CPU"}, 12 | {false, "dump", 'm', ARG_CPUIDDUMP, NULL, "native CPUID dump"}, 13 | {false, "procmask", 'k', ARG_PROCMASK, NULL, "list of P/E/LPE procmasks"}, 14 | #if defined (_M_X64) && defined(__AVX512F__) 15 | {false, "512bFMA", '5', ARG_512BFMADP, NULL, "print number of 512b FMA double precision ports"}, 16 | #endif 17 | {true, "demo", 'd', ARG_DEMOTYPE, ARGERR_MISS_DEMO, "demo type"}, 18 | {true, "thread", 't', ARG_THREADINDEX, ARGERR_MISS_THREAD, "thread index"}, 19 | {true, "file", 'f', ARG_CPUIDFILE, ARGERR_MISS_CPUIDFILE, "CPUID from file"}, 20 | {true, "xcr0", 'x', ARG_XCR0, ARGERR_MISS_XCR0, "forced XCR0 value in hex w/o 0x"}, 21 | {true, "tscratio", '\0', ARG_TSCRATIO, ARGERR_MISS_TSCRATIO, "TSC result correction"}, 22 | }; 23 | 24 | void Args::SetError(char* errorPlace, char * tempStr, const char * errorMsg) { 25 | strcpy_s(errorPlace, STR_MAXLEN, errorMsg); 26 | strcat_s(errorPlace, STR_MAXLEN, tempStr); 27 | return; 28 | } 29 | 30 | void Args::SetParam(argType paramType, char * tempStr, char* errorPlace, int * errorCounter) { 31 | if (tempStr[0] == '\0') { 32 | if (params[paramType].arguments) { 33 | SetError(errorPlace, tempStr, params[paramType].missingErr); 34 | (*errorCounter)++; 35 | } 36 | } else { 37 | switch (paramType) { 38 | case ARG_HELP: { 39 | helpFlag = true; 40 | } break; 41 | case ARG_VERSION: { 42 | versionFlag = true; 43 | } break; 44 | case ARG_DEMOLIST: { 45 | listFlag = true; 46 | } break; 47 | case ARG_CPUPROPS: { 48 | cpuPropsFlag = true; 49 | } break; 50 | case ARG_PROCMASK: { 51 | procMaskFlag = true; 52 | } break; 53 | #if defined (_M_X64) && 
defined(__AVX512F__) 54 | case ARG_512BFMADP: { 55 | _512bFMA_DP_Flag = true; 56 | } break; 57 | #endif 58 | case ARG_DEMOTYPE: { 59 | uint32_t demo = 0; 60 | for (demo = 0; demo < demoCount; demo++) 61 | if ((_stricmp(demoList[demo].demoName, tempStr) == 0) || 62 | (_stricmp(demoList[demo].alias, tempStr) == 0)){ 63 | const uint64_t d_bit = 1ULL << (demoList[demo].demoMask & 0x3f); 64 | const uint64_t d_qword = min(demoList[demo].demoMask >> 6, MAX_DEMOMASK); 65 | demoMask[d_qword] |= d_bit; 66 | break; 67 | } 68 | if (demo == demoCount) { 69 | SetError(errorPlace, tempStr, ARGERR_INV_DEMO); 70 | (*errorCounter)++; 71 | } 72 | } break; 73 | case ARG_THREADINDEX: { 74 | char* endPtr = 0; 75 | threadIndex = strtol(tempStr, &endPtr, 10); 76 | } 77 | break; 78 | case ARG_PCORE: { 79 | threadIndex = DEFAULT_PCORE_INDEX; 80 | } 81 | break; 82 | case ARG_ECORE: { 83 | threadIndex = DEFAULT_ECORE_INDEX; 84 | } 85 | break; 86 | case ARG_LPECORE: { 87 | threadIndex = DEFAULT_LPECORE_INDEX; 88 | } 89 | break; 90 | case ARG_CPUIDDUMP: { 91 | dumpFlag = true; 92 | } break; 93 | case ARG_CPUIDFILE: { 94 | cpuidFileFlag = true; 95 | cpuidFileName = tempStr; 96 | } break; 97 | case ARG_XCR0: { 98 | char* endPtr = 0; 99 | xcr0 = strtol(tempStr, &endPtr, 16); 100 | if ((xcr0 & (_XCR0_X87 | _XCR0_AVX | _XCR0_AVX512 | _XCR0_AMX | _XCR0_APX)) != xcr0) { 101 | SetError(errorPlace, tempStr, ARGERR_INV_XCR0); 102 | (*errorCounter)++; 103 | } 104 | } 105 | break; 106 | case ARG_TSCRATIO: { 107 | char* endPtr = 0; 108 | tscRatio = strtod(tempStr, &endPtr); 109 | if ((tscRatio <= 0.0) || (tscRatio > MAX_TSCRATIO)) { 110 | SetError(errorPlace, tempStr, ARGERR_INV_TSCRATIO); 111 | (*errorCounter)++; 112 | } 113 | } 114 | break; 115 | case ARG_NOTHING: { 116 | } break; 117 | default: { 118 | } break; 119 | } 120 | } 121 | return; 122 | } 123 | 124 | void Args::PrintUsage(void) const { 125 | printf("\r\nUsage: %s [switches]", DEMO_FILENAME); 126 | printf("\r\nExample: %s --demo=GFNI -d=VBMI2 
--help --version -c", DEMO_FILENAME); 127 | printf("\r\nSwitches:"); 128 | for (unsigned int comm = 0; comm < sizeof(params) / sizeof(paramsType); comm++) 129 | if (params[comm].shortName != '\0') 130 | printf("\r\n\t[-%c|--%-16s] %s", params[comm].shortName, params[comm].longName, params[comm].description); 131 | else 132 | printf("\r\n\t [--%-16s] %s", params[comm].longName, params[comm].description); 133 | printf("\r\n"); 134 | } 135 | 136 | void Args::PrintVersion(void) const { 137 | std::cout << "Build date:" << __DATE__ << " Time:" << __TIME__ << std::endl; 138 | }; 139 | 140 | bool Args::IsVersion(void) const{ 141 | return versionFlag; 142 | }; 143 | 144 | bool Args::IsHelp(void) const { 145 | return helpFlag; 146 | }; 147 | 148 | bool Args::IsDemoList(void) const { 149 | return listFlag; 150 | }; 151 | 152 | bool Args::IsCPUProps(void) const { 153 | return cpuPropsFlag; 154 | }; 155 | 156 | bool Args::IsProcMask(void) const { 157 | return procMaskFlag; 158 | }; 159 | 160 | #if defined (_M_X64) && defined(__AVX512F__) 161 | bool Args::Is_512bFMA_DP_Ports(void) const { 162 | return _512bFMA_DP_Flag; 163 | } 164 | #endif 165 | 166 | bool Args::IsCPUIDDump(void) const { 167 | return dumpFlag; 168 | }; 169 | 170 | bool Args::IsCPUIDFile(void) const { 171 | return cpuidFileFlag; 172 | }; 173 | 174 | size_t Args::GetMaxDemo(void) const { 175 | return DEMO_LAST; 176 | }; 177 | 178 | size_t Args::GetThreadIndex(CPU_Props c) const { 179 | switch (threadIndex) { 180 | case DEFAULT_PCORE_INDEX: return c.GetPCoreIndex(); 181 | case DEFAULT_ECORE_INDEX: return c.GetECoreIndex(); 182 | case DEFAULT_LPECORE_INDEX: return c.GetLPECoreIndex(); 183 | default: 184 | return threadIndex; 185 | } 186 | }; 187 | 188 | char* Args::GetCPUIDFileName() const { 189 | return cpuidFileName; 190 | }; 191 | 192 | bool Args::IsValid(void) const { 193 | return validFlag; 194 | }; 195 | 196 | UINT64 Args::GetXCR0() const { 197 | return xcr0; 198 | }; 199 | 200 | double Args::GetTSCRatio() 
const { 201 | return tscRatio; 202 | }; 203 | 204 | bool Args::IsSelected(size_t i) const { 205 | return ((demoMask[i >> 6] & (1ULL << (i & 0x3f))) != 0); 206 | }; 207 | 208 | Args::Args(const demoTypeList* demos, size_t size, int argc, char** argv) : 209 | demoList(demos), demoCount(size), paramCount(sizeof(params) / sizeof(paramsType)), 210 | versionFlag(0), helpFlag(0), listFlag(0), cpuPropsFlag(0), procMaskFlag(0), 211 | #if defined (_M_X64) && defined(__AVX512F__) 212 | _512bFMA_DP_Flag(0), 213 | #endif 214 | errorFlag(0), dumpFlag(0), cpuidFileFlag(0), paramType(ARG_NOTHING), threadIndex(0), cpuidFileName(0), xcr0(0), tscRatio(1.0) { 215 | validFlag = Init(argc, argv); 216 | }; 217 | 218 | bool Args::Init(int argc, char** argv) { 219 | char errorStr[MAX_ARGERROR][STR_MAXLEN]; 220 | memset(errorStr, 0, MAX_ARGERROR * STR_MAXLEN); 221 | int errorCounter = 0; 222 | for (int32_t a = 1; a < argc; a++) { 223 | bool handledFlag = false; 224 | switch (argv[a][0]) { 225 | case '/': 226 | case '-': 227 | switch (argv[a][1]) { 228 | case '/': 229 | case '-': { 230 | for (unsigned int p = 0; p < paramCount; p++) { 231 | if (params[p].arguments) { 232 | char* equPos = strchr(&(argv[a][2]), '='); 233 | if (equPos != 0) { 234 | const size_t paramSize = equPos - &(argv[a][2]); 235 | if (_strnicmp(params[p].longName, &(argv[a][2]), paramSize) == 0) { 236 | SetParam(params[p].type, equPos + 1, errorStr[errorCounter], &errorCounter); 237 | handledFlag = true; 238 | break; 239 | } 240 | } else { 241 | SetError(errorStr[errorCounter], &(argv[a][2]), params[p].missingErr); 242 | handledFlag = true; 243 | errorCounter++; 244 | } 245 | } else { 246 | if (_stricmp(params[p].longName, &(argv[a][2])) == 0) { 247 | SetParam(params[p].type, &(argv[a][3]), errorStr[errorCounter], &errorCounter); 248 | handledFlag = true; 249 | break; 250 | } 251 | } 252 | } 253 | } break; 254 | case '\0': 255 | case ' ': 256 | case '=': { 257 | SetError(errorStr[errorCounter], &(argv[a][0]), 
ARGERR_MISS_ARG); 258 | handledFlag = true; 259 | errorCounter++; 260 | } break; 261 | default: { 262 | bool findEqu = (argv[a][2] == '='); 263 | for (size_t p = 0; p < paramCount; p++) { 264 | char* find = strchr(&argv[a][1], params[p].shortName); 265 | if (!params[p].arguments && (find != 0) && (!findEqu)) { 266 | handledFlag = true; 267 | SetParam(params[p].type, find, errorStr[errorCounter], &errorCounter); 268 | } else if (params[p].arguments && (params[p].shortName == argv[a][1])) { 269 | if (findEqu) { 270 | handledFlag = true; 271 | SetParam(params[p].type, &(argv[a][3]), errorStr[errorCounter], &errorCounter); 272 | break; 273 | } else { 274 | handledFlag = true; 275 | SetError(errorStr[errorCounter], &(argv[a][2]), params[p].missingErr); 276 | errorCounter++; 277 | break; 278 | } 279 | } 280 | } 281 | } break; 282 | } 283 | break; 284 | default: 285 | break; 286 | } 287 | if (!handledFlag) { 288 | SetError(errorStr[errorCounter], &(argv[a][0]), ARGERR_INV_SWITCH); 289 | errorCounter++; 290 | } 291 | } 292 | if (errorCounter > 0) { 293 | for (int errs = 0; errs < errorCounter; errs++) 294 | printf_s("\r\n%s", errorStr[errs]); 295 | printf_s("\r\n"); 296 | return false; 297 | } 298 | return true; 299 | } --------------------------------------------------------------------------------