├── PEXT_PDEP_Emu.h
├── stdafx.cpp
├── Compiler_Intrinsic_Test.h
├── GFNI_Demo.h
├── 512bFMA_DP_Ports.cpp
├── Results
├── GFNI_Output.png
├── OfficeDayTime.png
├── GFNI_PosPopcnt.png
├── PEXT_PDEP_TR3970X.png
├── GFNI_8x8_Explanation.png
├── GFNI_vs_VBMI2.txt
├── Byte2Byte_CNL.txt
├── Byte2Byte_RKL.txt
├── Zen4_expected.txt
├── HYBRID_Lakefield_CPUID806A1.txt
├── TZCNT_RKL.txt
└── TZCNT_WLC.txt
├── 512bFMA_DP_Ports_Asm.asm
├── 512bFMA_DP_Ports.h
├── InstLatX64_Demo.vcxproj.user
├── KmovTest.h
├── FirstByte.h
├── AVX512_Reduce_Add.h
├── VPCLMULQDQ_Demo.h
├── targetver.h
├── .gitattributes
├── stdafx.h
├── Kmov_Test.cpp
├── AVX_VNNI_INT16_Saturated_AddSub.h
├── Zen4_Demo.h
├── AVX512_BGVSER.h
├── .gitignore
├── AVX512_Saturated_AddSub.h
├── Byte2Byte.h
├── Zen5_Demo.h
├── AVX512_DecimalPrint.h
├── Byte2Byte.cpp
├── AMX_Demo.h
├── HWBITPERM_Demo.h
├── InstLatX64_Demo.h
├── P06P1.h
├── VPCLMULQDQ_Demo.cpp
├── Misc.h
├── ConsoleColor.h
├── LZCNT_Demo.h
├── Zen5_Demo_Port.h
├── Zen4_Demo_Port.h
├── TZCNT_Demo.h
├── KmovTest_Asm.asm
├── InstLatX64_Demo.sln
├── VPCLMULQDQ_Demo_Test.cpp
├── Args.h
├── HWBITPERM_Demo_Asm.asm
├── AVX512_Reduce_Add.cpp
├── AMX_Demo.cpp
├── InstLatX64_Demo.cpp
├── FirstByte.cpp
├── AVX_VNNI_INT16_Saturated_AddSub.cpp
├── Byte2Byte_Asm.asm
├── Misc.cpp
├── README.md
├── TZCNT_Demo.cpp
├── Zen5_Demo_Imm8.h
├── TZCNT_Demo_Asm.asm
├── LZCNT_Demo.cpp
├── LZCNT_Demo_Asm.asm
├── HWBITPERM_Demo.cpp
├── Zen4_Demo_Imm8.h
├── AVX512_BGVSER.cpp
├── P06P1.cpp
├── InstLatX64_Demo.vcxproj.filters
└── Args.cpp
/PEXT_PDEP_Emu.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 |
--------------------------------------------------------------------------------
/stdafx.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 |
--------------------------------------------------------------------------------
/Compiler_Intrinsic_Test.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | void InstrincTest(void);
4 |
--------------------------------------------------------------------------------
/GFNI_Demo.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/GFNI_Demo.h
--------------------------------------------------------------------------------
/512bFMA_DP_Ports.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/512bFMA_DP_Ports.cpp
--------------------------------------------------------------------------------
/Results/GFNI_Output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/GFNI_Output.png
--------------------------------------------------------------------------------
/512bFMA_DP_Ports_Asm.asm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/512bFMA_DP_Ports_Asm.asm
--------------------------------------------------------------------------------
/Results/OfficeDayTime.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/OfficeDayTime.png
--------------------------------------------------------------------------------
/Results/GFNI_PosPopcnt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/GFNI_PosPopcnt.png
--------------------------------------------------------------------------------
/Results/PEXT_PDEP_TR3970X.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/PEXT_PDEP_TR3970X.png
--------------------------------------------------------------------------------
/Results/GFNI_8x8_Explanation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InstLatx64/InstLatX64_Demo/HEAD/Results/GFNI_8x8_Explanation.png
--------------------------------------------------------------------------------
/512bFMA_DP_Ports.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // Two C-linkage routines taking an int; their bodies are not in this header (presumably 512bFMA_DP_Ports_Asm.asm — confirm).
3 | extern "C" void fma_shuffle_tpt(int);
4 | extern "C" void fma_only_tpt(int);
5 | // NOTE(review): the meaning of the int argument above and of the value returned below is not documented in this header.
6 | int Get_512bFMA_DP_Ports_FromOptimGuide(void);
--------------------------------------------------------------------------------
/InstLatX64_Demo.vcxproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/KmovTest.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // Divisor that Kmov_Test() applies to each KmovTestNN() result before printing it.
3 | #define KMOV_REPEATS 0x1000000
4 | // C-linkage routines returning a 64-bit value; bodies are not in this header (presumably KmovTest_Asm.asm — confirm).
5 | extern "C" unsigned __int64 KmovTest01(void);
6 | extern "C" unsigned __int64 KmovTest02(void);
7 | extern "C" unsigned __int64 KmovTest03(void);
8 |
9 |
--------------------------------------------------------------------------------
/FirstByte.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // Prototypes only: each takes a 256- or 512-bit integer vector plus a char and returns a vector of the same width. NOTE(review): semantics are defined elsewhere (presumably FirstByte.cpp).
3 | __m256i _mm256_firstbyte_epu32(__m256i a, char c);
4 | __m256i _mm256_firstbyte_epu64(__m256i a, char c);
5 | __m512i _mm512_firstbyte_epu32(__m512i a, char c);
6 | __m512i _mm512_firstbyte_epu64(__m512i a, char c);
--------------------------------------------------------------------------------
/AVX512_Reduce_Add.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // Prototypes only: each reduces one 512-bit vector to a scalar; the epu128 variant also takes a `hi` output pointer (presumably the upper 64 bits of the sum — confirm in the .cpp).
3 | uint32_t _mm512_reduce2_add_epu8(__m512i z);
4 | uint32_t _mm512_reduce2_add_epu16(__m512i z);
5 | uint64_t _mm512_reduce2_add_epu32(__m512i z);
6 | uint64_t _mm512_reduce2_add_epu64(__m512i z);
7 | uint64_t _mm512_reduce2_add_epu128(__m512i z, uint64_t* hi);
8 |
--------------------------------------------------------------------------------
/VPCLMULQDQ_Demo.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | /* Prefix xor for entire vector register */
4 | // One entry point per register width; the 256-bit and 512-bit prototypes exist only when __AVX2__ / __AVX512F__ are defined.
5 | __m128i _mm_prefix_xor_clmul_si128(__m128i a);
6 |
7 | #if defined(__AVX2__)
8 | __m256i _mm256_prefix_xor_clmul_si256(__m256i a);
9 | #endif
10 |
11 | #if defined(__AVX512F__)
12 | __m512i _mm512_prefix_xor_clmul_si512(__m512i a);
13 | #endif
14 |
--------------------------------------------------------------------------------
/targetver.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // Including SDKDDKVer.h defines the highest available Windows platform.
4 |
5 | // If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
6 | // set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
7 |
8 | #include <SDKDDKVer.h>
9 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/stdafx.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "targetver.h"
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include
13 | #include
14 |
15 | #include
16 |
17 | #include "Misc.h"
18 | #include "CPU_Props.h"
19 | #include "InstLatX64_Demo.h"
20 | #include "512bFMA_DP_Ports.h"
21 | #include "Args.h"
22 | #include "ConsoleColor.h"
23 |
24 |
--------------------------------------------------------------------------------
/Kmov_Test.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "KmovTest.h"
3 |
4 | extern CPU_Props cpu_props;
5 |
6 | using namespace std;
7 |
8 | void Kmov_Test(void) { // prints a banner, then one line per KmovTestNN routine: its return value divided by KMOV_REPEATS
9 | cout << "--- KMOV + POPCNT ---" << dec << right << endl; // also switches the stream to decimal, right-aligned output
10 | cout << "4x KMOVW + 4x POPCNT + 3x ADD :" << (double)KmovTest01() / (double)KMOV_REPEATS << endl;
11 | cout << "3x KUNPCK + 1x KMOVQ + 1x POPCNT:" << (double)KmovTest02() / (double)KMOV_REPEATS << endl;
12 | cout << "4x KMOVW mem + POPCNT mem :" << (double)KmovTest03() / (double)KMOV_REPEATS << endl;
13 | } // NOTE(review): the unit of the printed ratio depends on what the external routines return — confirm in the assembly source
--------------------------------------------------------------------------------
/AVX_VNNI_INT16_Saturated_AddSub.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // Prototypes only, given C linkage when compiled as C++. Names follow the _mm*_adds/_subs pattern on 32-bit signed (epi32) and unsigned (epu32) lanes; presumably saturating add/sub — confirm in the .cpp.
3 | #ifdef __cplusplus
4 | extern "C" {
5 | #endif
6 | // 128-bit variants
7 | __m128i _mm_adds_epi32(__m128i a, __m128i b);
8 | __m128i _mm_subs_epi32(__m128i a, __m128i b);
9 |
10 | __m128i _mm_adds_epu32(__m128i a, __m128i b);
11 | __m128i _mm_subs_epu32(__m128i a, __m128i b);
12 | // 256-bit variants
13 | __m256i _mm256_adds_epi32(__m256i a, __m256i b);
14 | __m256i _mm256_subs_epi32(__m256i a, __m256i b);
15 |
16 | __m256i _mm256_adds_epu32(__m256i a, __m256i b);
17 | __m256i _mm256_subs_epu32(__m256i a, __m256i b);
18 |
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 |
--------------------------------------------------------------------------------
/Zen4_Demo.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define _ZEN4_DEMO_VERSION 0x0100
4 | // ZEN4_FUNCS sizes the funcs[] array of zen4_methods below.
5 | #define ZEN4_REPEATS 100
6 | #define ZEN4_FUNCS 11
7 | // Placeholder row: stringified NAME plus nine explicit nullptr slots; the remaining funcs[] elements are value-initialized (null) by aggregate initialization.
8 | #define ZEN4_FUNCDECL0(NAME) \
9 | {#NAME, { \
10 | nullptr, \
11 | nullptr, \
12 | nullptr, \
13 | nullptr, \
14 | nullptr, \
15 | nullptr, \
16 | nullptr, \
17 | nullptr, \
18 | nullptr,}},
19 |
20 | #ifdef __cplusplus
21 | extern "C" {
22 | #endif
23 | // Pointer to a no-argument __fastcall routine returning a 64-bit unsigned value.
24 | typedef unsigned __int64(__fastcall* TEST_PTR)(void);
25 |
26 | #ifdef __cplusplus
27 | }
28 | #endif
29 | // One table row: a display name (up to 63 chars plus terminator) and ZEN4_FUNCS routine pointers.
30 | typedef struct {
31 | const char name[64];
32 | TEST_PTR funcs[ZEN4_FUNCS];
33 | } zen4_methods;
34 |
--------------------------------------------------------------------------------
/AVX512_BGVSER.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // Prototypes only: 256-bit helpers taking a runtime int `i` (presumably a byte count, by analogy with the bslli/bsrli/alignr intrinsics — confirm in AVX512_BGVSER.cpp).
3 | __m256i _mm256_bslli_epi256(__m256i a, int i);
4 | __m256i _mm256_bsrli_epi256(__m256i a, int i);
5 | __m256i _mm256_palignr_epi256(__m256i a, __m256i b, int i);
6 | __m256i _mm256_palignl_epi256(__m256i a, __m256i b, int i);
7 | __m256i _mm256_rotater_epi256(__m256i a, int i);
8 | __m256i _mm256_rotatel_epi256(__m256i a, int i);
9 | // Same six operations declared for 512-bit vectors.
10 | __m512i _mm512_bslli_epi512(__m512i a, int i);
11 | __m512i _mm512_bsrli_epi512(__m512i a, int i);
12 | __m512i _mm512_palignr_epi512(__m512i a, __m512i b, int i);
13 | __m512i _mm512_palignl_epi512(__m512i a, __m512i b, int i);
14 | __m512i _mm512_rotater_epi512(__m512i a, int i);
15 | __m512i _mm512_rotatel_epi512(__m512i a, int i);
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.obj
9 |
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 |
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 |
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 |
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 |
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 |
34 | # =========================
35 | # Operating System Files
36 | # =========================
37 |
38 | # Windows
39 | # =========================
40 |
41 | # Windows thumbnail cache files
42 | Thumbs.db
43 | ehthumbs.db
44 | ehthumbs_vista.db
45 |
46 | # Folder config file
47 | Desktop.ini
48 |
49 | # Recycle Bin used on file shares
50 | $RECYCLE.BIN/
51 |
52 | # Windows Installer files
53 | *.cab
54 | *.msi
55 | *.msm
56 | *.msp
57 |
58 | # Windows shortcuts
59 | *.lnk
60 |
--------------------------------------------------------------------------------
/AVX512_Saturated_AddSub.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // Prototypes only, given C linkage when compiled as C++: 512-bit adds/subs helpers on signed (epi) and unsigned (epu) 32/64-bit lanes; presumably saturating — confirm in the .cpp.
3 | #ifdef __cplusplus
4 | extern "C" {
5 | #endif
6 | // Baseline variants
7 | __m512i _mm512_adds_epi32(__m512i a, __m512i b);
8 | __m512i _mm512_adds_epi64(__m512i a, __m512i b);
9 |
10 | __m512i _mm512_subs_epi32(__m512i a, __m512i b);
11 | __m512i _mm512_subs_epi64(__m512i a, __m512i b);
12 |
13 | __m512i _mm512_adds_epu32(__m512i a, __m512i b);
14 | __m512i _mm512_adds_epu64(__m512i a, __m512i b);
15 |
16 | __m512i _mm512_subs_epu32(__m512i a, __m512i b);
17 | __m512i _mm512_subs_epu64(__m512i a, __m512i b);
18 | // *_Zen4_* variants with the same signatures. NOTE(review): how they differ from the baseline is not visible in this header.
19 | __m512i _mm512_adds_Zen4_epi32(__m512i a, __m512i b);
20 | __m512i _mm512_adds_Zen4_epi64(__m512i a, __m512i b);
21 |
22 | __m512i _mm512_subs_Zen4_epi32(__m512i a, __m512i b);
23 | __m512i _mm512_subs_Zen4_epi64(__m512i a, __m512i b);
24 |
25 | __m512i _mm512_adds_Zen4_epu32(__m512i a, __m512i b);
26 | __m512i _mm512_adds_Zen4_epu64(__m512i a, __m512i b);
27 |
28 | __m512i _mm512_subs_Zen4_epu32(__m512i a, __m512i b);
29 | __m512i _mm512_subs_Zen4_epu64(__m512i a, __m512i b);
30 |
31 | #ifdef __cplusplus
32 | }
33 | #endif
34 |
--------------------------------------------------------------------------------
/Byte2Byte.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define B2B_FUNCS 1
4 | #define B2B_REPEATS 1000000000.0
5 |
6 | #define B2B_FUNCDEF(METHOD) \
7 | extern "C" unsigned __int64 __fastcall B2B_##METHOD##(void);
8 |
9 | #define B2B_FUNCDECL(NAME, METHOD, ISA) \
10 | {#NAME, { \
11 | B2B_##METHOD##}, \
12 | ISA \
13 | },
14 |
15 | #define B2B_FUNCDECL0(NAME) \
16 | {#NAME, {\
17 | nullptr}, FEAT_AVX512F},
18 |
19 | #ifdef __cplusplus
20 | extern "C" {
21 | #endif
22 | // Pointer to a no-argument __fastcall routine returning a 64-bit unsigned value.
23 | typedef unsigned __int64(__fastcall* B2B_PTR)(void);
24 |
25 | #ifdef __cplusplus
26 | }
27 | #endif
28 | // One table row: display name, B2B_FUNCS routine pointers, and the feature set checked via cpu_props.IsFeat() before the routine is run.
29 | typedef struct {
30 | const char name[64];
31 | B2B_PTR funcs[B2B_FUNCS];
32 | Feats feats;
33 | } b2b_methods;
34 | // Declarations of the *_LAT routines referenced by the b2b[] table in Byte2Byte.cpp.
35 | B2B_FUNCDEF(MASKEDVPERMI2B_LAT)
36 | B2B_FUNCDEF(KREGROUNDTRIP_LAT)
37 | B2B_FUNCDEF(GFNI_LAT)
38 | B2B_FUNCDEF(SRLQ_LAT)
39 | B2B_FUNCDEF(BLENDMB_LAT)
40 | B2B_FUNCDEF(MINMAX_LAT)
41 | // Declarations of the matching *_TP routines.
42 | B2B_FUNCDEF(MASKEDVPERMI2B_TP)
43 | B2B_FUNCDEF(KREGROUNDTRIP_TP)
44 | B2B_FUNCDEF(GFNI_TP)
45 | B2B_FUNCDEF(SRLQ_TP)
46 | B2B_FUNCDEF(BLENDMB_TP)
47 | B2B_FUNCDEF(MINMAX_TP)
48 |
--------------------------------------------------------------------------------
/Results/GFNI_vs_VBMI2.txt:
--------------------------------------------------------------------------------
1 | Intel AlderLake | AMD Zen4 |
2 | Core i9-12900K | Ryzen 9 7950X |
3 | VBMI2 GFNI | VBMI2 GFNI |
4 | _mm_ror_*_epi8(x128, 6) 398305 597569 | 438386 : 600854 | TSC clks
5 | _mm256_ror_*_epi8(x256, 6) 398288 597618 | 438426 : 600858 | TSC clks
6 | _mm512_ror_*_epi8(x512, 6) 497893 597594 | 500618 : 600854 | TSC clks
7 | _mm_mask_ror_*_epi8(x128, 6) 497893 995795 | 500618 : 1001294 | TSC clks
8 | _mm256_mask_ror_*_epi8(x256, 6) 748788 995832 | 710604 : 1001298 | TSC clks
9 | _mm512_mask_ror_*_epi8(x512, 6) 796543 995817 | 800950 : 1001300 | TSC clks
10 | -----------------------------------------------------|--------------------|----------
11 | _mm_rorv_*_epi8(x128, y128) 447484 995817 | 459542 : 1001300 | TSC clks
12 | _mm256_rorv_*_epi8(x256, y256) 447295 1598831 | 459560 : 1610728 | TSC clks
13 | _mm512_rorv_*_epi8(x512, y512) 498006 1593224 | 500744 : 1607388 | TSC clks
14 | _mm_mask_rorv_*_epi8(x128, y128) 754516 2090947 | 755160 : 2107980 | TSC clks
15 | _mm256_mask_rorv_*_epi8(x256, y256) 754350 2091355 | 755204 : 2124702 | TSC clks
16 | _mm512_mask_rorv_*_epi8(x512, y512) 796692 2091043 | 801072 : 2107988 | TSC clks
--------------------------------------------------------------------------------
/Zen5_Demo.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define _ZEN5_DEMO_VERSION 0x0100
4 | // ZEN5_FUNCS / ZEN5_FUNCS_X87 size the funcs[] arrays of zen5_methods / zen5_methods_x87 below.
5 | #define ZEN5_REPEATS 100
6 | #define ZEN5_FUNCS 2
7 | #define ZEN5_FUNCS_X87 2
8 | // Declares the C-linkage pair Zen5_<INST>_<OPERANDS>_lat and Zen5_<INST>_<OPERANDS>_tp.
9 | #define ZEN5_FUNCDEF(INST, OPERANDS) \
10 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_lat(void); \
11 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_tp(void);
12 | // x87 flavour: declares only the _lat routine.
13 | #define ZEN5_FUNCDEF_X87(INST, OPERANDS) \
14 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_lat(void);
15 |
16 | // Table-row initializer: stringified NAME followed by the _lat and _tp routines.
17 | #define ZEN5_FUNCDECL(NAME, INST, OPERANDS) \
18 | {#NAME, {\
19 | Zen5_##INST##_##OPERANDS##_lat, \
20 | Zen5_##INST##_##OPERANDS##_tp \
21 | }},
22 | // x87 row: only the _lat slot is given; the second funcs[] element is value-initialized (null).
23 | #define ZEN5_FUNCDECL_X87(NAME, INST, OPERANDS) \
24 | {#NAME, {\
25 | Zen5_##INST##_##OPERANDS##_lat, \
26 | }},
27 | // Placeholder row with both slots explicitly null.
28 | #define ZEN5_FUNCDECL0(NAME) \
29 | {#NAME, { \
30 | nullptr, \
31 | nullptr, \
32 | }},
33 | // Placeholder x87 row: one explicit null, the other value-initialized.
34 | #define ZEN5_X87_FUNCDECL0(NAME) \
35 | {#NAME, {\
36 | nullptr, \
37 | }},
38 | // Declares Zen5_empty_empty_lat and Zen5_empty_empty_tp.
39 | ZEN5_FUNCDEF(empty, empty)
40 |
41 | #ifdef __cplusplus
42 | extern "C" {
43 | #endif
44 | // Pointer to a no-argument __fastcall routine returning a 64-bit unsigned value (same typedef as in Zen4_Demo.h).
45 | typedef unsigned __int64(__fastcall* TEST_PTR)(void);
46 |
47 | #ifdef __cplusplus
48 | }
49 | #endif
50 | // One table row: display name plus ZEN5_FUNCS routine pointers.
51 | typedef struct {
52 | const char name[64];
53 | TEST_PTR funcs[ZEN5_FUNCS];
54 | } zen5_methods;
55 | // Same layout for the x87 table, sized by ZEN5_FUNCS_X87.
56 | typedef struct {
57 | const char name[64];
58 | TEST_PTR funcs[ZEN5_FUNCS_X87];
59 | } zen5_methods_x87;
60 |
61 |
--------------------------------------------------------------------------------
/AVX512_DecimalPrint.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "stdafx.h"
3 |
4 | #define AVX512_STR_BUFF 32
5 | // Conversion-routine signatures: (value to print, destination char buffer).
6 | typedef void (*U32_PRINT_PTR)(uint32_t, char *);
7 | typedef void (*S32_PRINT_PTR)(int32_t, char *);
8 | typedef void (*U64_PRINT_PTR)(uint64_t, char *);
9 | typedef void (*S64_PRINT_PTR)(int64_t, char *);
10 | // Table row describing one routine; T_FUNC is the routine's pointer type. The parameter list had been reduced to a bare `template`, which cannot compile.
11 | template <typename T_FUNC>
12 | struct AVX512_decimalprint_methods {
13 | const char name[32];
14 | const char isaName[16];
15 | T_FUNC func;
16 | Feats feats;
17 | bool refFlag;
18 | };
19 | // NOTE(review): template arguments reconstructed from the matching typedef names (u32/s32/u64/s64) — confirm against the defining .cpp.
20 | AVX512_decimalprint_methods<U32_PRINT_PTR> decprints_u32[];
21 | AVX512_decimalprint_methods<S32_PRINT_PTR> decprints_s32[];
22 | AVX512_decimalprint_methods<U64_PRINT_PTR> decprints_u64[];
23 | AVX512_decimalprint_methods<S64_PRINT_PTR> decprints_s64[];
24 | // C-linkage routines taking (value, char buffer); each comes in an avx512ifma and an avx512f flavour. Bodies are not in this header.
25 | extern "C" void _ultoa_avx512ifma_asm(uint32_t, char *);
26 | extern "C" void _ultoa_avx512f_asm(uint32_t, char *);
27 |
28 | extern "C" void _ltoa_avx512ifma_asm(int32_t, char *);
29 | extern "C" void _ltoa_avx512f_asm(int32_t, char *);
30 |
31 | extern "C" void _ui64toa_avx512ifma_asm(uint64_t, char*);
32 | extern "C" void _ui64toa_avx512f_asm(uint64_t, char*);
33 |
34 | extern "C" void _i64toa_avx512ifma_asm(int64_t, char*);
35 | extern "C" void _i64toa_avx512f_asm(int64_t, char*);
36 |
37 | extern "C" void to_string_avx512ifma_asm(uint64_t, char*);
38 | extern "C" void to_string_avx512f_asm(uint64_t, char*);
39 | // NOTE(review): Misc.h declares serialized_tsc() without `inline`; confirm this `extern inline` redeclaration is intended.
40 | extern inline uint64_t serialized_tsc(void);
41 |
42 |
--------------------------------------------------------------------------------
/Byte2Byte.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "Byte2Byte.h"
3 |
4 | extern CPU_Props cpu_props;
5 |
6 | using namespace std;
7 |
8 | b2b_methods b2b[] = { // each row: display name, routine (declared via B2B_FUNCDEF in Byte2Byte.h), feature set required to run it
9 | {"Masked VPERMI2B pair LAT", B2B_MASKEDVPERMI2B_LAT, FEAT_AVX512VBMI},
10 | {"VPERMI2B pair + kreg + TERNLOG LAT", B2B_KREGROUNDTRIP_LAT, FEAT_AVX512VBMI},
11 | {"VPERMI2B pair + GFNI + TERNLOG LAT", B2B_GFNI_LAT, FEAT_GFNI},
12 | {"VPERMI2B pair + VPSHRQ + TERNLOG LAT", B2B_SRLQ_LAT, FEAT_AVX512VBMI},
13 | {"VPERMI2B pair + VPBLENDMB LAT", B2B_BLENDMB_LAT, FEAT_AVX512VBMI},
14 | {"VPERMI2B pair + VPMIN/MAXSB LAT", B2B_MINMAX_LAT, FEAT_AVX512VBMI},
15 | // the six *_TP rows below mirror the six *_LAT rows above, in the same order
16 | {"Masked VPERMI2B pair TP ", B2B_MASKEDVPERMI2B_TP, FEAT_AVX512VBMI},
17 | {"VPERMI2B pair + kreg + TERNLOG TP ", B2B_KREGROUNDTRIP_TP, FEAT_AVX512VBMI},
18 | {"VPERMI2B pair + GFNI + TERNLOG TP ", B2B_GFNI_TP, FEAT_GFNI},
19 | {"VPERMI2B pair + VPSHRQ + TERNLOG TP ", B2B_SRLQ_TP, FEAT_AVX512VBMI},
20 | {"VPERMI2B pair + VPBLENDMB TP ", B2B_BLENDMB_TP, FEAT_AVX512VBMI},
21 | {"VPERMI2B pair + VPMIN/MAXSB TP ", B2B_MINMAX_TP, FEAT_AVX512VBMI},
22 | };
23 |
24 | void Byte2ByteTest(void) { // prints one line per b2b[] row whose feature set the CPU reports: routine result divided by B2B_REPEATS
25 | SetThread(3); // SetThread(size_t threadindex) is declared in Misc.h
26 | cout << "--- AVX512VBMI Byte2Byte mapping ---" << dec << right << endl;
27 | for (size_t b = 0; b < sizeof(b2b) / sizeof(b2b[0]); b++) { // size_t index: no signed/unsigned comparison against the sizeof quotient
28 | if (cpu_props.IsFeat(b2b[b].feats) && b2b[b].funcs[0] != nullptr) // also skip placeholder rows (B2B_FUNCDECL0 stores nullptr) instead of calling through null
29 | cout << b2b[b].name << ':' << (double)((b2b[b].funcs[0])()) / B2B_REPEATS << endl;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/AMX_Demo.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define _AMX_CONFIG_RESERVED_SIZE 14
4 | #define _AMX_VEX_MAX_REGISTERS 16
5 |
6 | // Intel Architecture Instruction Set Extensions and Future Features
7 | // Programming Reference May 2021 319433-044 p. 100
8 | // format of memory payload. each field is a byte.
9 | // 0: palette_id
10 | // 1: startRow (8b)
11 | // 2-15: reserved (must be zero)
12 | // 16-17: tile0.colsb -- bytes_per_row
13 | // 18-19: tile1.colsb
14 | // 20-21: tile2.colsb
15 | // ...
16 | // 46-47: tile15.colsb
17 | // 48: tile0.rows
18 | // 49: tile1.rows
19 | // 50: tile2.rows
20 | // ...
21 | // 63: tile15.rows
22 |
23 | class XTILECFG { // 64-byte tile-configuration payload; member order follows the byte layout listed in the comment above this class
24 | unsigned char palette_id;
25 | unsigned char startRow;
26 | unsigned char reserved[_AMX_CONFIG_RESERVED_SIZE];
27 | unsigned short tile_cols[_AMX_VEX_MAX_REGISTERS]; // per-tile colsb (bytes per row)
28 | unsigned char tile_rows[_AMX_VEX_MAX_REGISTERS]; // per-tile row count
29 | public:
30 | XTILECFG() { // all-zero configuration
31 | memset(this, 0, sizeof(XTILECFG));
32 | };
33 | XTILECFG(unsigned int c, unsigned int r, unsigned int tilesize, unsigned int maxreg, unsigned int p = 1) : palette_id(p), startRow(0) { // c = bytes per row, r = rows, tilesize = total byte budget, maxreg = tiles to configure, p = palette id
34 | memset(reserved, 0, _AMX_CONFIG_RESERVED_SIZE);
35 | for (unsigned int i = 0, ts = tilesize; (i < _AMX_VEX_MAX_REGISTERS); i++) { // ts tracks the budget still unassigned
36 | if ((i < maxreg) && (ts >= c * r)) { // tile i gets c x r only while it is requested and still fits in the budget
37 | tile_cols[i] = c;
38 | tile_rows[i] = r;
39 | } else {
40 | tile_cols[i] = 0;
41 | tile_rows[i] = 0;
42 | }
43 | ts = ts >= (c * r) ? ts - (c * r) : 0; // consume one tile's worth of budget, clamping at zero
44 | }
45 | };
46 | void XTILECFG_reg(unsigned int c, unsigned int r, unsigned int reg) { // overrides the shape of a single tile
47 | if (reg >= _AMX_VEX_MAX_REGISTERS) return; // out-of-range index: ignore rather than write past tile_cols/tile_rows
48 | tile_cols[reg] = c;
49 | tile_rows[reg] = r;
50 | }
51 | };
52 |
--------------------------------------------------------------------------------
/HWBITPERM_Demo.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // NOTE(review): the three counts below are not used in this header; their roles are defined by HWBITPERM_Demo.cpp.
3 | #define DEPEXT219_RANDOM 10000
4 | #define DEPEXT219_RETRIES 10000
5 | #define DEPEXT219_REPEATS 5000
6 | // Operation kind stored in bitperm_methods::type.
7 | enum BITPERM {
8 | BEXT,
9 | BDEP,
10 | BGRP
11 | };
12 | // No-argument routine returning a 64-bit unsigned value (used for the lat/tp slots).
13 | typedef unsigned __int64 (*BITPERM_PTR)(void);
14 | // __vectorcall routine mapping two 512-bit vectors to one (used for the func slot).
15 | typedef __m512i (__vectorcall *BITPERM_PTR2)(__m512i, __m512i);
16 | // One table row: names, bitness, the lat/tp/func routines, required features, operation kind and a `ref` field (meaning not visible here).
17 | typedef struct {
18 | const char name[32];
19 | const char isaName[16];
20 | int bitness;
21 | BITPERM_PTR lat;
22 | BITPERM_PTR tp;
23 | BITPERM_PTR2 func;
24 | Feats feats;
25 | BITPERM type;
26 | int ref;
27 | } bitperm_methods;
28 | // Everything below gets C linkage when compiled as C++; bodies are not in this header.
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 | // BEXT: 32/64-bit _Lat and _Tp routines plus the vector routine.
33 | unsigned __int64 BEXT32_HW_Lat(void);
34 | unsigned __int64 BEXT64_HW_Lat(void);
35 | unsigned __int64 BEXT32_HW_Tp(void);
36 | unsigned __int64 BEXT64_HW_Tp(void);
37 | __m512i __vectorcall BEXT32_HW(__m512i, __m512i);
38 | __m512i __vectorcall BEXT64_HW(__m512i, __m512i);
39 | // BDEP: same set.
40 | unsigned __int64 BDEP32_HW_Lat(void);
41 | unsigned __int64 BDEP64_HW_Lat(void);
42 | unsigned __int64 BDEP32_HW_Tp(void);
43 | unsigned __int64 BDEP64_HW_Tp(void);
44 | __m512i __vectorcall BDEP32_HW(__m512i, __m512i);
45 | __m512i __vectorcall BDEP64_HW(__m512i, __m512i);
46 | // BGRP: same set.
47 | unsigned __int64 BGRP32_HW_Lat(void);
48 | unsigned __int64 BGRP64_HW_Lat(void);
49 | unsigned __int64 BGRP32_HW_Tp(void);
50 | unsigned __int64 BGRP64_HW_Tp(void);
51 | __m512i __vectorcall BGRP32_HW(__m512i, __m512i);
52 | __m512i __vectorcall BGRP64_HW(__m512i, __m512i);
53 | // Scalar helpers taking (p, m); presumably value and mask — confirm in the implementation.
54 | unsigned int _pgrp_u32(unsigned int p, unsigned int m);
55 | unsigned __int64 _pgrp_u64(unsigned __int64 p, unsigned __int64 m);
56 |
57 | #ifdef __cplusplus
58 | }
59 | #endif
60 |
--------------------------------------------------------------------------------
/InstLatX64_Demo.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | enum demoType { // demo identifiers; which enumerators exist depends on __AVX2__, __AVX512F__ and _M_X64
4 | DEMO_GFNI,
5 | DEMO_VPCLMLQDQ,
6 | #if defined (__AVX2__) // declared on any AVX2 build, x64 or not
7 | DEMO_VNNI_SADD,
8 | #endif
9 | #if defined (_M_X64) // everything down to the matching #else is x64-only
10 | #if defined (__AVX2__)
11 | DEMO_P06P1,
12 | DEMO_PEXT_PDEP_EMU,
13 | DEMO_FIRSTBBYTE,
14 | #endif
15 | #if defined(__AVX512F__)
16 | DEMO_RADD,
17 | DEMO_AVX512_SADD,
18 | DEMO_KMEMDST,
19 | DEMO_ZEN4,
20 | DEMO_ZEN5,
21 | DEMO_INTRINSICS,
22 | DEMO_VBMI2,
23 | DEMO_BYTE2BYTE,
24 | DEMO_LZCNT,
25 | DEMO_TZCNT,
26 | DEMO_HWBITPERM,
27 | DEMO_KMOV,
28 | DEMO_AMX,
29 | DEMO_AVX512_DECPRINT,
30 | DEMO_AVX512_BGVSER,
31 | #endif
32 | #if defined(__AVX512F__) // DEMO_LAST aliases the final enumerator of the widest enabled group
33 | DEMO_LAST = DEMO_AVX512_BGVSER,
34 | #elif defined (__AVX2__)
35 | DEMO_LAST = DEMO_FIRSTBBYTE,
36 | #else
37 | DEMO_LAST = DEMO_VPCLMLQDQ,
38 | #endif
39 | #else // non-x64 build
40 | DEMO_LAST = DEMO_VPCLMLQDQ, // NOTE(review): with __AVX2__ defined here, DEMO_VNNI_SADD exists but is numbered above DEMO_LAST — confirm this is intended
41 | #endif
42 | };
43 | // One entry per demo: names, a 64-bit mask, required features, a visibility flag, the entry point and a comment string.
44 | typedef struct {
45 | const char* demoName;
46 | const char* alias;
47 | uint64_t demoMask;
48 | Feats feats;
49 | bool publicFlag;
50 | void (*func)(void);
51 | const char* comment;
52 | } demoTypeList;
53 | // Demo entry points; all match the void(void) signature of demoTypeList::func.
54 | void GFNI_Demo(void);
55 | void VPCLMULQDQ_Demo(void);
56 |
57 | void PEXT_PDEP_Emu_Test(void);
58 | void FirstByte_Demo(void);
59 | void AVX512_Reduce_Add_Demo(void);
60 | void AVX512_Saturated_AddSub_Demo(void);
61 | void AVX512_KMemDst_Demo(void);
62 | void Zen4_Demo(void);
63 | void Zen5_Demo(void);
64 | void AVX512_InstrincTest(void);
65 | void VBMI2_Demo(void);
66 | void Byte2ByteTest(void);
67 | void LZCNT_Test(void);
68 | void TZCNT_Test(void);
69 | void HWBITPERM_Test(void);
70 | void Kmov_Test(void);
71 | void AMX_Test(void);
72 | void AVX512_DecimalPrint_Test(void);
73 | void AVX512_BGVSER_Test(void);
74 | void AVX_VNNI_Saturated_AddSub_Demo(void);
75 | void P0601_Test(void);
76 |
--------------------------------------------------------------------------------
/P06P1.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define INSTNAMELEN 24
4 | #define INITLEN 16
5 | #define TESTCASE 8
6 | // Routine taking a histogram pointer, a data pointer and a size; not referenced elsewhere in this header.
7 | typedef unsigned __int64 (*MEASURR_PTR)(uint32_t * histo, uint64_t * data, uint64_t size);
8 | // No-argument routine returning a 64-bit unsigned value; the element type of measure_methods::func.
9 | typedef unsigned __int64 (*MEASURE_PTR)(void);
10 | // One table row: instruction text, init-instruction text, TESTCASE routines, required features and a uop count.
11 | typedef struct {
12 | const char inst[INSTNAMELEN];
13 | const char init[INITLEN];
14 | MEASURE_PTR func[TESTCASE];
15 | Feats feats;
16 | int uopscount;
17 | } measure_methods;
18 | // Declares 16 routines for INST: an R<INITREG> and an E<INITREG> flavour for each of the eight suffixes M1025, M1024, M513, M512, 511, 512, 1023, 1024.
19 | #define P06P1_FUNCDEF(INST, INITREG) \
20 | unsigned __int64 _##INST##_MOV_R##INITREG##_M1025_TIME(void); \
21 | unsigned __int64 _##INST##_MOV_E##INITREG##_M1025_TIME(void); \
22 | unsigned __int64 _##INST##_MOV_R##INITREG##_M1024_TIME(void); \
23 | unsigned __int64 _##INST##_MOV_E##INITREG##_M1024_TIME(void); \
24 | unsigned __int64 _##INST##_MOV_R##INITREG##_M513_TIME(void); \
25 | unsigned __int64 _##INST##_MOV_E##INITREG##_M513_TIME(void); \
26 | unsigned __int64 _##INST##_MOV_R##INITREG##_M512_TIME(void); \
27 | unsigned __int64 _##INST##_MOV_E##INITREG##_M512_TIME(void); \
28 | unsigned __int64 _##INST##_MOV_R##INITREG##_511_TIME(void); \
29 | unsigned __int64 _##INST##_MOV_E##INITREG##_511_TIME(void); \
30 | unsigned __int64 _##INST##_MOV_R##INITREG##_512_TIME(void); \
31 | unsigned __int64 _##INST##_MOV_E##INITREG##_512_TIME(void); \
32 | unsigned __int64 _##INST##_MOV_R##INITREG##_1023_TIME(void); \
33 | unsigned __int64 _##INST##_MOV_E##INITREG##_1023_TIME(void); \
34 | unsigned __int64 _##INST##_MOV_R##INITREG##_1024_TIME(void); \
35 | unsigned __int64 _##INST##_MOV_E##INITREG##_1024_TIME(void);
36 | // Expands to one measure_methods initializer built from the eight R<INITREG> routines of FUNC, FEAT_<ISA> and UOPS. No trailing ## after ISA: pasting onto ',' is not a valid token.
37 | #define P06P1_FUNC(INST, FUNC, INITREG, ISA, UOPS) \
38 | {INST, "MOV R"#INITREG", imm32", \
39 | { \
40 | _##FUNC##_MOV_R##INITREG##_M1025_TIME, \
41 | _##FUNC##_MOV_R##INITREG##_M1024_TIME, \
42 | _##FUNC##_MOV_R##INITREG##_M513_TIME, \
43 | _##FUNC##_MOV_R##INITREG##_M512_TIME, \
44 | _##FUNC##_MOV_R##INITREG##_511_TIME, \
45 | _##FUNC##_MOV_R##INITREG##_512_TIME, \
46 | _##FUNC##_MOV_R##INITREG##_1023_TIME, \
47 | _##FUNC##_MOV_R##INITREG##_1024_TIME \
48 | }, \
49 | FEAT_##ISA, UOPS},
50 |
--------------------------------------------------------------------------------
/VPCLMULQDQ_Demo.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "VPCLMULQDQ_Demo.h"
3 |
4 | __m128i _mm_prefix_xor_clmul_si128(__m128i a) { // prefix XOR over the whole 128-bit register, built from two 64-bit carry-less multiplies by all-ones
5 | const __m128i full = _mm_set1_epi32(0xffffffff); // all-ones multiplier
6 | __m128i clmul0_63 = _mm_clmulepi64_si128(a, full, 0x00); // low qword of a times low qword of full
7 | __m128i clmul64_127 = _mm_clmulepi64_si128(a, full, 0x01); // high qword of a times low qword of full
8 | #if !defined(__AVX512VL__) // without AVX512VL, use a 32-bit arithmetic shift plus a dword shuffle
9 | clmul64_127 = _mm_xor_si128(clmul64_127, _mm_shuffle_epi32(_mm_srai_epi32(clmul0_63, 31), 0x05)); // spread the sign of dword 1 (bit 63 of the low result) over the low qword and fold it into the high-half result
10 | #else
11 | clmul64_127 = _mm_xor_si128(clmul64_127, _mm_srai_epi64(clmul0_63, 63)); // same carry obtained with one 64-bit arithmetic shift
12 | #endif
13 | return _mm_unpacklo_epi64(clmul0_63, clmul64_127); // low qwords of the two products form the result
14 | }
15 |
16 | #if defined(__AVX2__)
17 | __m256i _mm256_prefix_xor_clmul_si256(__m256i a) { // 256-bit version: per-128-bit-lane prefix XOR, then the lower lane's carry is folded into the upper lane
18 | const __m256i full = _mm256_set1_epi32(0xffffffff); // all-ones multiplier
19 | __m256i clmul0_63 = _mm256_clmulepi64_epi128(a, full, 0x00); // per lane: low qword of a times all-ones
20 | __m256i clmul64_127 = _mm256_clmulepi64_epi128(a, full, 0x01); // per lane: high qword of a times all-ones
21 | #if !defined(__AVX512VL__)
22 | clmul64_127 = _mm256_xor_si256(clmul64_127, _mm256_shuffle_epi32(_mm256_srai_epi32(clmul0_63, 31), 0x05)); // per lane: sign of dword 1 of the low result folded into the high-half result
23 | #else
24 | clmul64_127 = _mm256_xor_si256(clmul64_127, _mm256_srai_epi64(clmul0_63, 63)); // same carry via a 64-bit arithmetic shift
25 | #endif
26 | __m256i clmul0_127 = _mm256_unpacklo_epi64(clmul0_63, clmul64_127); // per-lane 128-bit prefix results
27 | #if !defined(__AVX512VL__)
28 | __m256i corr128_255 = _mm256_inserti128_si256(_mm256_setzero_si256(), _mm_shuffle_epi32(_mm_srai_epi32(_mm256_castsi256_si128(clmul0_127), 31), 0xff), 1); // sign of the lower lane's top dword, broadcast into the upper lane only
29 | #else
30 | __m256i corr128_255 = _mm256_maskz_permutex_epi64(0xc, _mm256_srai_epi64(clmul64_127, 63), 0); // qword 0 of the shifted high-half result copied to qwords 2 and 3; qwords 0 and 1 zeroed by the mask
31 | #endif
32 | return _mm256_xor_si256(clmul0_127, corr128_255); // apply the cross-lane correction
33 | }
34 | #endif
35 |
36 | #if defined(__AVX512F__)
37 | __m512i _mm512_prefix_xor_clmul_si512(__m512i a) { // 512-bit version: per-lane prefix XOR, then carries folded across 128-bit and 256-bit boundaries
38 | const __m512i full = _mm512_set1_epi32(0xffffffff); // all-ones multiplier
39 | __m512i clmul0_63 = _mm512_clmulepi64_epi128(a, full, 0x00); // per 128-bit lane: low qword of a times all-ones
40 | __m512i clmul64_127 = _mm512_clmulepi64_epi128(a, full, 0x01); // per 128-bit lane: high qword of a times all-ones
41 | clmul64_127 = _mm512_xor_si512(clmul64_127, _mm512_srai_epi64(clmul0_63, 63)); // fold the sign of the low result into the high-half result
42 | __m512i clmul0_127 = _mm512_unpacklo_epi64(clmul0_63, clmul64_127); // per-lane 128-bit prefix results
43 | __m512i corr128_255 = _mm512_maskz_permutex_epi64(0xcc, _mm512_srai_epi64(clmul64_127, 63), 0); // within each 256-bit half: qword 0 of the shifted value copied to the upper two qwords (mask 0xcc)
44 | __m512i clmul0_255 = _mm512_xor_si512(clmul0_127, corr128_255); // per-256-bit-half prefix results
45 | __m512i corr256_511 = _mm512_maskz_permutexvar_epi64(0xf0, _mm512_set1_epi64(3), _mm512_srai_epi64(clmul0_255, 63)); // sign of qword 3 broadcast into the upper four qwords (mask 0xf0)
46 | return _mm512_xor_si512(clmul0_255, corr256_511); // apply the final cross-half correction
47 | }
48 | #endif
49 |
--------------------------------------------------------------------------------
/Misc.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define _INSTLATX64_DEMO_TESTVECT_00 0xff7f3f1f0f070301
4 | #define _INSTLATX64_DEMO_TESTVECT_01 0x80c0e0f0f8fcfeff
5 |
6 | #define _INSTLATX64_DEMO_TESTVECT_02 0xaa55cc33a050c031
7 | #define _INSTLATX64_DEMO_TESTVECT_03 0x030c050a33cc55aa
8 |
9 | #define _INSTLATX64_DEMO_TESTVECT_04 0x080808ff08080808
10 | #define _INSTLATX64_DEMO_TESTVECT_05 0x8142241818244281
11 |
12 | #define _INSTLATX64_DEMO_TESTVECT_06 0x8040201008040201
13 | #define _INSTLATX64_DEMO_TESTVECT_07 0x0102040810204080
14 |
15 | #define _INSTLATX64_DEMO_TESTVECT_08 0x8040201008040000
16 | #define _INSTLATX64_DEMO_TESTVECT_09 0x0002040810204080
17 |
18 | #define _INSTLATX64_DEMO_TESTVECT_0A 0x8080808080808000
19 | #define _INSTLATX64_DEMO_TESTVECT_0B 0x0101010101010101
20 |
21 | #define _INSTLATX64_DEMO_TESTVECT_FE 0xfefefefefefefefe
22 | #define _INSTLATX64_DEMO_TESTVECT_FF 0xffffffffffffffff
23 |
24 | #define _INSTLATX64_DEMO_TESTMASK_8 0x5a
25 | #define _INSTLATX64_DEMO_TESTMASK_16 0x5a7e
26 | #define _INSTLATX64_DEMO_TESTMASK_32 0x5a7e3c18
27 | #define _INSTLATX64_DEMO_TESTMASK_64 0x07701ff13ff37ff7
28 |
29 | void printRes8(const char* name, __m128i res);
30 | void printRes8(const char* name, __m256i res);
31 | void printRes8(const char* name, __m512i res);
32 |
33 | void printRes16(const char * name, __m128i res);
34 | void printRes16(const char * name, __m256i res);
35 | void printRes16(const char * name, __m512i res);
36 |
37 | void printRes32(const char * name, __m128i res);
38 | void printRes32(const char * name, __m256i res);
39 | void printRes32(const char * name, __m512i res);
40 |
41 | void printRes(const char * name, __m128i res);
42 | void printRes(const char * name, __m256i res);
43 | void printRes(const char * name, __m512i res);
44 |
45 | void printRes(int r, const char * name, __m128i res);
46 | void printRes(int r, const char * name, __m256i res);
47 | void printRes(int r, const char * name, __m512i res);
48 |
49 | void printRes(int r, __m128i res);
50 | void printRes(int r, __m256i res);
51 | void printRes(int r, __m512i res);
52 |
53 | void random_wrap(unsigned int* random);
54 | void random_wrap(signed int* random);
55 | void random_wrap(unsigned long long* random);
56 | void random_wrap(signed long long* random);
57 |
58 | uint64_t serialized_tsc(void);
59 |
60 | void SetThread(size_t threadindex);
61 |
62 | #if defined (_M_X64)
63 | #define _ild_popcnt _mm_popcnt_u64
64 | #else
65 | #define _ild_popcnt _mm_popcnt_u32
66 | #endif
67 |
68 |
--------------------------------------------------------------------------------
/ConsoleColor.h:
--------------------------------------------------------------------------------
1 | // ConsoleColor.h
2 | // Copyleft Vincent Godin
3 | // https://www.codeproject.com/articles/16431/add-color-to-your-std-cout
4 |
5 | #pragma once
6 | #include <iostream>
7 | #include <windows.h>
8 |
9 | enum CharColor : WORD {
10 | // Bit layout is IRGB, most significant first: intensity, red, green, blue.
11 | COLOR_BLACK = 0b0000,
12 | COLOR_DARKBLUE = 0b0001,
13 | COLOR_DARKGREEN = 0b0010,
14 | COLOR_LIGHTBLUE = 0b0011,
15 | COLOR_BRICK = 0b0100,
16 | COLOR_VIOLET = 0b0101,
17 | COLOR_GOLD = 0b0110,
18 | COLOR_GREY = 0b0111,
19 | COLOR_SILVER = 0b1000,
20 | COLOR_BLUE = 0b1001,
21 | COLOR_GREEN = 0b1010,
22 | COLOR_AQUA = 0b1011,
23 | COLOR_RED = 0b1100,
24 | COLOR_PURPLE = 0b1101,
25 | COLOR_YELLOW = 0b1110,
26 | COLOR_WHITE = 0b1111
27 | };
28 |
29 | inline std::ostream& blue(std::ostream& s)
30 | {
31 | // Manipulator: set the console text attribute to blue with the intensity bit; the stream is returned untouched.
32 | SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), FOREGROUND_INTENSITY | FOREGROUND_BLUE);
33 | return s;
34 | }
35 |
36 | inline std::ostream& red(std::ostream& s)
37 | {
38 | // Manipulator: set the console text attribute to red with the intensity bit; the stream is returned untouched.
39 | SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), FOREGROUND_INTENSITY | FOREGROUND_RED);
40 | return s;
41 | }
42 |
43 | inline std::ostream& green(std::ostream& s)
44 | {
45 | // Manipulator: set the console text attribute to green with the intensity bit; the stream is returned untouched.
46 | SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), FOREGROUND_INTENSITY | FOREGROUND_GREEN);
47 | return s;
48 | }
49 |
50 | inline std::ostream& yellow(std::ostream& s)
51 | {
52 | // Manipulator: red + green with the intensity bit; the stream is returned untouched.
53 | SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), FOREGROUND_INTENSITY | FOREGROUND_RED | FOREGROUND_GREEN);
54 | return s;
55 | }
56 |
57 | inline std::ostream& gold(std::ostream& s)
58 | {
59 | // Manipulator: red + green without the intensity bit; the stream is returned untouched.
60 | SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), FOREGROUND_RED | FOREGROUND_GREEN);
61 | return s;
62 | }
63 |
64 | inline std::ostream& white(std::ostream& s)
65 | {
66 | // Manipulator: red + green + blue without the intensity bit; the stream is returned untouched.
67 | SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), FOREGROUND_BLUE | FOREGROUND_GREEN | FOREGROUND_RED);
68 | return s;
69 | }
70 |
71 | inline std::ostream& magenta(std::ostream& s)
72 | {
73 | // Manipulator: red + blue with the intensity bit; the stream is returned untouched.
74 | SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), FOREGROUND_INTENSITY | FOREGROUND_BLUE | FOREGROUND_RED);
75 | return s;
76 | }
77 |
78 | struct color {
79 | WORD m_color; // raw console text attribute carried to operator<<
80 | color(WORD attribute) : m_color(attribute) {}
81 | };
82 | // Inserting a `color` into any basic_ostream sets the console text attribute to c.m_color and returns the stream unchanged.
83 | template <class _Elem, class _Traits>
84 | std::basic_ostream<_Elem, _Traits>&
85 | operator<<(std::basic_ostream<_Elem, _Traits>& i, const color& c)
86 | {
87 | HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
88 | SetConsoleTextAttribute(hStdout, c.m_color);
89 | return i;
90 | }
91 |
92 | // Copyleft Vincent Godin
93 |
94 |
--------------------------------------------------------------------------------
/LZCNT_Demo.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define LZCNT_REPEATS 0x1000000
4 |
5 | // Vector lzcnt routines declared outside the extern "C" block below; each prototype appears exactly once.
6 | __m128i __vectorcall _mm_lzcnt_ild_epi8(__m128i a);
7 | __m256i __vectorcall _mm256_lzcnt_ild_epi8(__m256i a);
8 | __m512i __vectorcall _mm512_lzcnt_ild_epi8(__m512i a);
9 |
10 | __m128i __vectorcall _mm_lzcnt_ild_epi16(__m128i a);
11 | __m256i __vectorcall _mm256_lzcnt_ild_epi16(__m256i a);
12 | __m512i __vectorcall _mm512_lzcnt_ild_epi16(__m512i a);
13 |
14 | __m128i __vectorcall _mm_lzcnt_fp16_epi16(__m128i a);
15 | __m256i __vectorcall _mm256_lzcnt_fp16_epi16(__m256i a);
16 | __m512i __vectorcall _mm512_lzcnt_fp16_epi16(__m512i a);
17 |
18 | #ifdef __cplusplus
19 | extern "C" {
20 | #endif
21 |
22 | // Parameterless *_asm_timed entry points returning a 64-bit value. NOTE(review): presumably an elapsed TSC count - confirm in the .asm source.
23 | unsigned __int64 _mm_lzcnt_epi8_asm_timed(void);
24 | unsigned __int64 _mm_lzcnt_gfni_epi8_asm_timed(void);
25 | unsigned __int64 _mm_lzcnt_epi16_asm_timed(void);
26 | unsigned __int64 _mm_lzcnt_fp16_epi16_asm_timed(void);
27 |
28 | unsigned __int64 _mm256_lzcnt_epi8_asm_timed(void);
29 | unsigned __int64 _mm256_lzcnt_gfni_epi8_asm_timed(void);
30 | unsigned __int64 _mm256_lzcnt_epi16_asm_timed(void);
31 | unsigned __int64 _mm256_lzcnt_fp16_epi16_asm_timed(void);
32 |
33 | unsigned __int64 _mm512_lzcnt_epi8_asm_timed(void);
34 | unsigned __int64 _mm512_lzcnt_gfni_epi8_asm_timed(void);
35 | unsigned __int64 _mm512_lzcnt_epi16_asm_timed(void);
36 | unsigned __int64 _mm512_lzcnt_fp16_epi16_asm_timed(void);
37 |
38 | // *_asm entry points with C linkage: one vector in, one vector of the same width out.
39 | __m128i __vectorcall _mm_lzcnt_epi8_asm(__m128i a);
40 | __m128i __vectorcall _mm_lzcnt_gfni_epi8_asm(__m128i a);
41 | __m128i __vectorcall _mm_lzcnt_epi16_asm(__m128i a);
42 | __m128i __vectorcall _mm_lzcnt_fp16_epi16_asm(__m128i a);
43 |
44 | __m256i __vectorcall _mm256_lzcnt_epi8_asm(__m256i a);
45 | __m256i __vectorcall _mm256_lzcnt_gfni_epi8_asm(__m256i a);
46 | __m256i __vectorcall _mm256_lzcnt_epi16_asm(__m256i a);
47 | __m256i __vectorcall _mm256_lzcnt_fp16_epi16_asm(__m256i a);
48 |
49 | __m512i __vectorcall _mm512_lzcnt_epi8_asm(__m512i a);
50 | __m512i __vectorcall _mm512_lzcnt_gfni_epi8_asm(__m512i a);
51 | __m512i __vectorcall _mm512_lzcnt_epi16_asm(__m512i a);
52 | __m512i __vectorcall _mm512_lzcnt_fp16_epi16_asm(__m512i a);
53 |
54 | #ifdef __cplusplus
55 | }
56 | #endif
57 |
--------------------------------------------------------------------------------
/Zen5_Demo_Port.h:
--------------------------------------------------------------------------------
1 | zen5_8clks_port01_m macro ; expands to 2 x 8 = 16 pmullw, one per MMX register per pass
2 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks on ports 0/1 - confirm against measurements
3 | pmullw mm0, mm0
4 | pmullw mm1, mm1
5 | pmullw mm2, mm2
6 | pmullw mm3, mm3
7 | pmullw mm4, mm4
8 | pmullw mm5, mm5
9 | pmullw mm6, mm6
10 | pmullw mm7, mm7
11 | endm
12 | endm
13 |
14 | zen5_8clks_port23_m macro ; expands to 2 x 8 = 16 psllw-by-1, one per MMX register per pass
15 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks on ports 2/3 - confirm against measurements
16 | psllw mm0, 1
17 | psllw mm1, 1
18 | psllw mm2, 1
19 | psllw mm3, 1
20 | psllw mm4, 1
21 | psllw mm5, 1
22 | psllw mm6, 1
23 | psllw mm7, 1
24 | endm
25 | endm
26 |
27 | zen5_8clks_port12_m macro ; expands to 2 x 8 = 16 packsswb, one per MMX register per pass
28 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks on ports 1/2 - confirm against measurements
29 | packsswb mm0, mm0
30 | packsswb mm1, mm1
31 | packsswb mm2, mm2
32 | packsswb mm3, mm3
33 | packsswb mm4, mm4
34 | packsswb mm5, mm5
35 | packsswb mm6, mm6
36 | packsswb mm7, mm7
37 | endm
38 | endm
39 |
40 | zen5_8clks_port03_m macro ; expands to 2 x 8 = 16 korb; each ORs a mask register with its successor (k7 wraps to k0)
41 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks on ports 0/3 - confirm against measurements
42 | korb k0, k0, k1
43 | korb k1, k1, k2
44 | korb k2, k2, k3
45 | korb k3, k3, k4
46 | korb k4, k4, k5
47 | korb k5, k5, k6
48 | korb k6, k6, k7
49 | korb k7, k7, k0
50 | endm
51 | endm
52 |
53 | zen5_8clks_port0123_m macro ; expands to 4 x 8 = 32 paddb, one per MMX register per pass
54 | REPEAT 4 ; NOTE(review): the name suggests ~8 clocks across ports 0-3 - confirm against measurements
55 | paddb mm0, mm0
56 | paddb mm1, mm1
57 | paddb mm2, mm2
58 | paddb mm3, mm3
59 | paddb mm4, mm4
60 | paddb mm5, mm5
61 | paddb mm6, mm6
62 | paddb mm7, mm7
63 | endm
64 | endm
65 |
66 | zen5_8clks_port1_m macro ; expands to 1 x 8 = 8 kmovb from eax, one per mask register k0..k7
67 | REPEAT 1 ; NOTE(review): the name suggests ~8 clocks on port 1 - confirm against measurements
68 | kmovb k0, eax
69 | kmovb k1, eax
70 | kmovb k2, eax
71 | kmovb k3, eax
72 | kmovb k4, eax
73 | kmovb k5, eax
74 | kmovb k6, eax
75 | kmovb k7, eax
76 | endm
77 | endm
78 |
79 | zen5_8clks_port45_m macro ; expands to 2 x 8 = 16 movq stores at 20h strides from memop1 - 80h to memop1 + 60h
80 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks on ports 4/5 - confirm; memop1 is defined by the including file
81 | movq [memop1 - 080h], mm0
82 | movq [memop1 - 060h], mm1
83 | movq [memop1 - 040h], mm2
84 | movq [memop1 - 020h], mm3
85 | movq [memop1 + 000h], mm4
86 | movq [memop1 + 020h], mm5
87 | movq [memop1 + 040h], mm6
88 | movq [memop1 + 060h], mm7
89 | endm
90 | endm
91 |
92 | zen5_8clks_tern_m macro ; expands to 8 vpternlogq (no REPEAT) over rotating zmm0..zmm7 triples with imm8 0..7
93 | vpternlogq zmm0, zmm1, zmm2, 0 ; NOTE(review): the name suggests ~8 clocks for the group - confirm against measurements
94 | vpternlogq zmm1, zmm2, zmm3, 1
95 | vpternlogq zmm2, zmm3, zmm4, 2
96 | vpternlogq zmm3, zmm4, zmm5, 3
97 | vpternlogq zmm4, zmm5, zmm6, 4
98 | vpternlogq zmm5, zmm6, zmm7, 5
99 | vpternlogq zmm6, zmm7, zmm0, 6
100 | vpternlogq zmm7, zmm0, zmm1, 7
101 | endm
102 |
103 | zen5_8clks_LDs_m macro ; expands to 2 x 8 = 16 movq loads at 20h strides from memop1 - 80h to memop1 + 60h
104 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks of load throughput - confirm; memop1 is defined by the including file
105 | movq mm0, [memop1 - 080h]
106 | movq mm1, [memop1 - 060h]
107 | movq mm2, [memop1 - 040h]
108 | movq mm3, [memop1 - 020h]
109 | movq mm4, [memop1 + 000h]
110 | movq mm5, [memop1 + 020h]
111 | movq mm6, [memop1 + 040h]
112 | movq mm7, [memop1 + 060h]
113 | endm
114 | endm
115 |
--------------------------------------------------------------------------------
/Zen4_Demo_Port.h:
--------------------------------------------------------------------------------
1 | zen4_8clks_port01_m macro ; expands to 2 x 8 = 16 paddsb, one per MMX register per pass
2 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks on ports 0/1 - confirm against measurements
3 | paddsb mm0, mm0
4 | paddsb mm1, mm1
5 | paddsb mm2, mm2
6 | paddsb mm3, mm3
7 | paddsb mm4, mm4
8 | paddsb mm5, mm5
9 | paddsb mm6, mm6
10 | paddsb mm7, mm7
11 | endm
12 | endm
13 |
14 | zen4_8clks_port23_m macro ; expands to 2 x 8 = 16 psllw-by-1, one per MMX register per pass
15 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks on ports 2/3 - confirm against measurements
16 | psllw mm0, 1
17 | psllw mm1, 1
18 | psllw mm2, 1
19 | psllw mm3, 1
20 | psllw mm4, 1
21 | psllw mm5, 1
22 | psllw mm6, 1
23 | psllw mm7, 1
24 | endm
25 | endm
26 |
27 | zen4_8clks_port12_m macro ; expands to 2 x 8 = 16 packsswb, one per MMX register per pass
28 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks on ports 1/2 - confirm against measurements
29 | packsswb mm0, mm0
30 | packsswb mm1, mm1
31 | packsswb mm2, mm2
32 | packsswb mm3, mm3
33 | packsswb mm4, mm4
34 | packsswb mm5, mm5
35 | packsswb mm6, mm6
36 | packsswb mm7, mm7
37 | endm
38 | endm
39 |
40 | zen4_8clks_port123_m macro ; expands to 3 x 8 = 24 punpcklbw, one per MMX register per pass
41 | REPEAT 3 ; NOTE(review): the name suggests ~8 clocks across ports 1-3 - confirm against measurements
42 | punpcklbw mm0, mm0
43 | punpcklbw mm1, mm1
44 | punpcklbw mm2, mm2
45 | punpcklbw mm3, mm3
46 | punpcklbw mm4, mm4
47 | punpcklbw mm5, mm5
48 | punpcklbw mm6, mm6
49 | punpcklbw mm7, mm7
50 | endm
51 | endm
52 |
53 | zen4_8clks_port0123_m macro ; expands to 4 x 8 = 32 por, one per MMX register per pass
54 | REPEAT 4 ; NOTE(review): the name suggests ~8 clocks across ports 0-3 - confirm against measurements
55 | por mm0, mm0
56 | por mm1, mm1
57 | por mm2, mm2
58 | por mm3, mm3
59 | por mm4, mm4
60 | por mm5, mm5
61 | por mm6, mm6
62 | por mm7, mm7
63 | endm
64 | endm
65 |
66 | zen4_8clks_port45_m macro ; expands to 1 x 8 = 8 movq stores at 20h strides from memop1 - 80h to memop1 + 60h
67 | REPEAT 1 ; NOTE(review): the name suggests ~8 clocks on ports 4/5 - confirm; memop1 is defined by the including file
68 | movq [memop1 - 080h], mm0
69 | movq [memop1 - 060h], mm1
70 | movq [memop1 - 040h], mm2
71 | movq [memop1 - 020h], mm3
72 | movq [memop1 + 000h], mm4
73 | movq [memop1 + 020h], mm5
74 | movq [memop1 + 040h], mm6
75 | movq [memop1 + 060h], mm7
76 | endm
77 | endm
78 |
79 | zen4_8clks_LDs_m macro ; expands to 2 x 8 = 16 movq loads at 20h strides from memop1 - 80h to memop1 + 60h
80 | REPEAT 2 ; NOTE(review): the name suggests ~8 clocks of load throughput - confirm; memop1 is defined by the including file
81 | movq mm0, [memop1 - 080h]
82 | movq mm1, [memop1 - 060h]
83 | movq mm2, [memop1 - 040h]
84 | movq mm3, [memop1 - 020h]
85 | movq mm4, [memop1 + 000h]
86 | movq mm5, [memop1 + 020h]
87 | movq mm6, [memop1 + 040h]
88 | movq mm7, [memop1 + 060h]
89 | endm
90 | endm
91 |
92 | zen4_8clks_port1_m macro ; expands to 1 x 8 = 8 pinsrw of eax into word 0, one per MMX register
93 | REPEAT 1 ; NOTE(review): the name suggests ~8 clocks on port 1 - confirm against measurements
94 | pinsrw mm0, eax, 0
95 | pinsrw mm1, eax, 0
96 | pinsrw mm2, eax, 0
97 | pinsrw mm3, eax, 0
98 | pinsrw mm4, eax, 0
99 | pinsrw mm5, eax, 0
100 | pinsrw mm6, eax, 0
101 | pinsrw mm7, eax, 0
102 | endm
103 | endm
104 |
105 | zen4_8clks_tern_m macro ; expands to 8 vpternlogq (no REPEAT) over rotating zmm0..zmm7 triples with imm8 0..7
106 | vpternlogq zmm0, zmm1, zmm2, 0 ; NOTE(review): the name suggests ~8 clocks for the group - confirm against measurements
107 | vpternlogq zmm1, zmm2, zmm3, 1
108 | vpternlogq zmm2, zmm3, zmm4, 2
109 | vpternlogq zmm3, zmm4, zmm5, 3
110 | vpternlogq zmm4, zmm5, zmm6, 4
111 | vpternlogq zmm5, zmm6, zmm7, 5
112 | vpternlogq zmm6, zmm7, zmm0, 6
113 | vpternlogq zmm7, zmm0, zmm1, 7
114 | endm
115 |
--------------------------------------------------------------------------------
/TZCNT_Demo.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define TZCNT_REPEATS 0x1000000
4 |
5 | __m128i __vectorcall _mm_tzcnt_epi8(__m128i a);
6 | __m256i __vectorcall _mm256_tzcnt_epi8(__m256i a);
7 | __m512i __vectorcall _mm512_tzcnt_epi8(__m512i a);
8 |
9 | __m128i __vectorcall _mm_tzcnt_epi16(__m128i a);
10 | __m256i __vectorcall _mm256_tzcnt_epi16(__m256i a);
11 | __m512i __vectorcall _mm512_tzcnt_epi16(__m512i a);
12 |
13 | __m128i __vectorcall _mm_tzcnt_epi32(__m128i a);
14 | __m256i __vectorcall _mm256_tzcnt_epi32(__m256i a);
15 | __m512i __vectorcall _mm512_tzcnt_epi32(__m512i a);
16 |
17 | __m128i __vectorcall _mm_tzcnt_epi64(__m128i a);
18 | __m256i __vectorcall _mm256_tzcnt_epi64(__m256i a);
19 | __m512i __vectorcall _mm512_tzcnt_epi64(__m512i a);
20 |
21 | #ifdef __cplusplus
22 | extern "C" {
23 | #endif
24 |
25 | unsigned __int64 _mm_tzcnt_epi8_asm_timed(void);
26 | unsigned __int64 _mm_tzcnt_epi16_asm_timed(void);
27 | unsigned __int64 _mm_tzcnt_epi32_asm_timed(void);
28 | unsigned __int64 _mm_tzcnt_epi64_asm_timed(void);
29 |
30 | unsigned __int64 _mm256_tzcnt_epi8_asm_timed(void);
31 | unsigned __int64 _mm256_tzcnt_epi16_asm_timed(void);
32 | unsigned __int64 _mm256_tzcnt_epi32_asm_timed(void);
33 | unsigned __int64 _mm256_tzcnt_epi64_asm_timed(void);
34 |
35 | unsigned __int64 _mm512_tzcnt_epi8_asm_timed(void);
36 | unsigned __int64 _mm512_tzcnt_epi16_asm_timed(void);
37 | unsigned __int64 _mm512_tzcnt_epi32_asm_timed(void);
38 | unsigned __int64 _mm512_tzcnt_epi64_asm_timed(void);
39 |
40 | unsigned __int64 _mm_tzcnt_epi32_cd_asm_timed(void);
41 | unsigned __int64 _mm256_tzcnt_epi32_cd_asm_timed(void);
42 | unsigned __int64 _mm512_tzcnt_epi32_cd_asm_timed(void);
43 | unsigned __int64 _mm_tzcnt_epi64_cd_asm_timed(void);
44 | unsigned __int64 _mm256_tzcnt_epi64_cd_asm_timed(void);
45 | unsigned __int64 _mm512_tzcnt_epi64_cd_asm_timed(void);
46 |
47 | __m128i __vectorcall _mm_tzcnt_epi8_asm(__m128i);
48 | __m128i __vectorcall _mm_tzcnt_epi16_asm(__m128i);
49 | __m128i __vectorcall _mm_tzcnt_epi32_asm(__m128i);
50 | __m128i __vectorcall _mm_tzcnt_epi64_asm(__m128i);
51 |
52 | __m256i __vectorcall _mm256_tzcnt_epi8_asm(__m256i);
53 | __m256i __vectorcall _mm256_tzcnt_epi16_asm(__m256i);
54 | __m256i __vectorcall _mm256_tzcnt_epi32_asm(__m256i);
55 | __m256i __vectorcall _mm256_tzcnt_epi64_asm(__m256i);
56 |
57 | __m512i __vectorcall _mm512_tzcnt_epi8_asm(__m512i);
58 | __m512i __vectorcall _mm512_tzcnt_epi16_asm(__m512i);
59 | __m512i __vectorcall _mm512_tzcnt_epi32_asm(__m512i);
60 | __m512i __vectorcall _mm512_tzcnt_epi64_asm(__m512i);
61 |
62 | __m256i __vectorcall _mm256_tzcnt_epi32_cd_asm(__m256i);
63 | __m512i __vectorcall _mm512_tzcnt_epi32_cd_asm(__m512i);
64 | __m256i __vectorcall _mm256_tzcnt_epi64_cd_asm(__m256i);
65 | __m512i __vectorcall _mm512_tzcnt_epi64_cd_asm(__m512i);
66 |
67 | #ifdef __cplusplus
68 | }
69 | #endif
70 |
--------------------------------------------------------------------------------
/KmovTest_Asm.asm:
--------------------------------------------------------------------------------
1 | .data
2 |
3 | tempmem dq 00101010101010101h
4 |
5 | repeats equ 1000000h
6 |
7 | .code
8 |
9 | KmovTest01 proc ; returns in rax the TSC difference measured around `repeats` iterations of a k0 -> GPR -> popcnt -> add -> k0 loop
10 | push rbx
11 | push rdi
12 | push rsi
13 | ; Set k0..k3 to all ones (kxnor of a register with itself).
14 | kxnorq k0, k0, k0
15 | kxnorq k1, k1, k1
16 | kxnorq k2, k2, k2
17 | kxnorq k3, k3, k3
18 | ; Start timestamp: rdtscp bracketed by mfence/lfence.
19 | mfence
20 | rdtscp
21 | lfence
22 | ; Keep the start value: low half in esi, high half in edi.
23 | mov esi, eax
24 | mov edi, edx
25 |
26 | mov ecx, repeats
27 | ; Loop: copy k0 into four GPRs, popcnt each, sum them, write the sum back to k0 (loop-carried through k0).
28 | startlabel:
29 | kmovw ebx, k0 ;P0
30 | kmovw edx, k0 ;P0
31 | kmovw eax, k0 ;P0
32 | kmovw r8d, k0 ;P0
33 |
34 | popcnt ebx, ebx ;P1
35 | popcnt edx, edx ;P1
36 | popcnt eax, eax ;P1
37 | popcnt r8d, r8d ;P1
38 |
39 | add ebx, edx ;P0156
40 | add eax, r8d ;P0156
41 | add eax, ebx ;P0156
42 |
43 | kmovw k0, eax ;P5
44 |
45 | dec ecx
46 | jnz startlabel
47 | ; End timestamp, fenced the same way as the start.
48 | mfence
49 | rdtscp
50 | lfence
51 | ; Combine edx:eax (end) and edi:esi (start) into 64-bit values.
52 | shl rdx, 20h
53 | shl rdi, 20h
54 | or rax, rdx
55 | or rsi, rdi
56 | ; rax = end - start
57 | sub rax, rsi
58 |
59 |
60 | pop rsi
61 | pop rdi
62 | pop rbx
63 | ret
64 | KmovTest01 endp
65 |
66 | KmovTest02 proc ; returns in rax the TSC difference measured around `repeats` iterations of a kunpck -> kmovq -> popcnt -> kmovq loop
67 | push rbx
68 | push rdi
69 | push rsi
70 | ; k0 = all ones (kxnor with itself); k1..k3 = zero (kxor with itself).
71 | kxnorq k0, k0, k0
72 | kxorq k1, k1, k1
73 | kxorq k2, k2, k2
74 | kxorq k3, k3, k3
75 | ; Start timestamp: rdtscp bracketed by mfence/lfence.
76 | mfence
77 | rdtscp
78 | lfence
79 | ; Keep the start value: low half in esi, high half in edi.
80 | mov esi, eax
81 | mov edi, edx
82 |
83 | mov ecx, repeats
84 | ; Loop: widen k0 inside mask registers (word->dword twice, then dword->qword), popcnt the 64-bit result, write it back to k0.
85 | startlabel:
86 | kunpckwd k1, k0, k0 ;P5
87 | kunpckwd k2, k0, k0 ;P5
88 | kunpckdq k3, k1, k2 ;P5
89 |
90 | kmovq rax, k3 ;P0
91 | popcnt rax, rax ;P1
92 | kmovq k0, rax ;P5
93 |
94 | dec ecx
95 | jnz startlabel
96 | ; End timestamp, fenced the same way as the start.
97 | mfence
98 | rdtscp
99 | lfence
100 | ; Combine edx:eax (end) and edi:esi (start) into 64-bit values.
101 | shl rdx, 20h
102 | shl rdi, 20h
103 | or rax, rdx
104 | or rsi, rdi
105 | ; rax = end - start
106 | sub rax, rsi
107 |
108 |
109 | pop rsi
110 | pop rdi
111 | pop rbx
112 | ret
113 | KmovTest02 endp
114 |
115 | KmovTest03 proc ; returns in rax the TSC difference measured around `repeats` iterations of a k0 -> memory -> popcnt -> k0 loop
116 | push rbx
117 | push rdi
118 | push rsi
119 | ; Set k0..k3 to all ones (kxnor of a register with itself).
120 | kxnorq k0, k0, k0
121 | kxnorq k1, k1, k1
122 | kxnorq k2, k2, k2
123 | kxnorq k3, k3, k3
124 | ; Start timestamp: rdtscp bracketed by mfence/lfence.
125 | mfence
126 | rdtscp
127 | lfence
128 | ; Keep the start value: low half in esi, high half in edi.
129 | mov esi, eax
130 | mov edi, edx
131 |
132 | mov ecx, repeats
133 | ; Loop: store the low word of k0 into each of the four words of tempmem, popcnt the whole qword from memory, write it back to k0.
134 | startlabel:
135 | kmovw word ptr [tempmem + 0], k0
136 | kmovw word ptr [tempmem + 2], k0
137 | kmovw word ptr [tempmem + 4], k0
138 | kmovw word ptr [tempmem + 6], k0
139 |
140 | popcnt rax, qword ptr [tempmem]
141 |
142 | kmovq k0, rax
143 |
144 | dec ecx
145 | jnz startlabel
146 | ; End timestamp, fenced the same way as the start.
147 | mfence
148 | rdtscp
149 | lfence
150 | ; Combine edx:eax (end) and edi:esi (start) into 64-bit values.
151 | shl rdx, 20h
152 | shl rdi, 20h
153 | or rax, rdx
154 | or rsi, rdi
155 | ; rax = end - start
156 | sub rax, rsi
157 |
158 |
159 | pop rsi
160 | pop rdi
161 | pop rbx
162 | ret
163 | KmovTest03 endp
164 |
165 | end
--------------------------------------------------------------------------------
/InstLatX64_Demo.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.30204.135
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "InstLatX64_Demo", "InstLatX64_Demo.vcxproj", "{AA410AE3-620A-46C2-8DC8-345AC1644E24}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug_AVX2|x32 = Debug_AVX2|x32
11 | Debug_AVX2|x64 = Debug_AVX2|x64
12 | Debug_AVX512|x32 = Debug_AVX512|x32
13 | Debug_AVX512|x64 = Debug_AVX512|x64
14 | Debug_SSE|x32 = Debug_SSE|x32
15 | Debug_SSE|x64 = Debug_SSE|x64
16 | Release_AVX2|x32 = Release_AVX2|x32
17 | Release_AVX2|x64 = Release_AVX2|x64
18 | Release_AVX512|x32 = Release_AVX512|x32
19 | Release_AVX512|x64 = Release_AVX512|x64
20 | Release_SSE|x32 = Release_SSE|x32
21 | Release_SSE|x64 = Release_SSE|x64
22 | EndGlobalSection
23 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
24 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX2|x32.ActiveCfg = Debug_AVX2|Win32
25 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX2|x32.Build.0 = Debug_AVX2|Win32
26 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX2|x64.ActiveCfg = Debug_AVX2|x64
27 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX2|x64.Build.0 = Debug_AVX2|x64
28 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX512|x32.ActiveCfg = Debug_AVX512|Win32
29 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX512|x32.Build.0 = Debug_AVX512|Win32
30 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX512|x64.ActiveCfg = Debug_AVX512|x64
31 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_AVX512|x64.Build.0 = Debug_AVX512|x64
32 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_SSE|x32.ActiveCfg = Debug_SSE|Win32
33 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_SSE|x32.Build.0 = Debug_SSE|Win32
34 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_SSE|x64.ActiveCfg = Debug_SSE|x64
35 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Debug_SSE|x64.Build.0 = Debug_SSE|x64
36 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX2|x32.ActiveCfg = Release_AVX2|Win32
37 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX2|x32.Build.0 = Release_AVX2|Win32
38 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX2|x64.ActiveCfg = Release_AVX2|x64
39 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX2|x64.Build.0 = Release_AVX2|x64
40 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX512|x32.ActiveCfg = Release_AVX512|Win32
41 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX512|x32.Build.0 = Release_AVX512|Win32
42 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX512|x64.ActiveCfg = Release_AVX512|x64
43 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_AVX512|x64.Build.0 = Release_AVX512|x64
44 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_SSE|x32.ActiveCfg = Release_SSE|Win32
45 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_SSE|x32.Build.0 = Release_SSE|Win32
46 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_SSE|x64.ActiveCfg = Release_SSE|x64
47 | {AA410AE3-620A-46C2-8DC8-345AC1644E24}.Release_SSE|x64.Build.0 = Release_SSE|x64
48 | EndGlobalSection
49 | GlobalSection(SolutionProperties) = preSolution
50 | HideSolutionNode = FALSE
51 | EndGlobalSection
52 | GlobalSection(ExtensibilityGlobals) = postSolution
53 | SolutionGuid = {AFC14262-C3E3-4F51-85F3-F1095E229AEE}
54 | EndGlobalSection
55 | EndGlobal
56 |
--------------------------------------------------------------------------------
/Results/Byte2Byte_CNL.txt:
--------------------------------------------------------------------------------
1 | Vendor: "GenuineIntel"
2 | Family:6 Model:102 Stepping:3 (60663)
3 | Brand: " Intel(R) Core(TM) i3-8121U CPU @ 2.20GHz"
4 | 512b FPU DP ports : 1
5 | ---GPR----------
6 | RDTSC : supported
7 | RDTSCP : supported
8 | CMOV : supported
9 | CMPX8 : supported
10 | CMPX16 : supported
11 | AMD64 : supported
12 | LAHF : supported
13 | MOVBE : supported
14 | ABM : supported
15 | POPCNT : supported
16 | RDRAND : supported
17 | RDSEED : supported
18 | ADX : supported
19 | BMI : supported
20 | BMI2 : supported
21 | MOVDIRI : unsupported
22 | MOVDIR64B : unsupported
23 | ---SIMD---------
24 | SSE : supported
25 | SSE2 : supported
26 | SSE3 : supported
27 | SSSE3 : supported
28 | SSE41 : supported
29 | SSE42 : supported
30 | SSE4A : unsupported
31 | CLMUL : supported
32 | AES : supported
33 | SHA : supported
34 | AVX : supported, OS enabled
35 | AVX2 : supported, OS enabled
36 | FMA : supported, OS enabled
37 | F16C : supported, OS enabled
38 | GFNI : unsupported
39 | VAES : unsupported
40 | VPCLMULQDQ : unsupported
41 | KEYLOCK : unsupported
42 | AVX_VNNI : unsupported
43 | ---AVX512-------
44 | AVX512F : supported, OS enabled
45 | AVX512CD : supported, OS enabled
46 | AVX512ER : unsupported
47 | AVX512PF : unsupported
48 | AVX512BW : supported, OS enabled
49 | AVX512DQ : supported, OS enabled
50 | AVX512VL : supported, OS enabled
51 | AVX512VBMI : supported, OS enabled
52 | AVX512IFMA : supported, OS enabled
53 | AVX512VNNI : unsupported
54 | AVX512_4VNNIW : unsupported
55 | AVX512_4FMAPS : unsupported
56 | AVX512_VPOPCNTDQ : unsupported
57 | AVX512_BITALG : unsupported
58 | AVX512_VBMI2 : unsupported
59 | AVX512_BF16 : unsupported
60 | AVX512_VP2INTERSECT : unsupported
61 | AVX512_FP16 : unsupported
62 | ---AMX----------
63 | AMX-BF16 : unsupported
64 | AMX-INT8 : unsupported
65 | AMX-TILE : unsupported
66 | ---CacheLine----
67 | PREFETCHW : supported
68 | PREFETCHWT1 : unsupported
69 | CLFLUSH : supported
70 | CLFLUSHOPT : supported
71 | CLWB : unsupported
72 | CLZERO : unsupported
73 | CLDEMOTE : unsupported
74 | ---Misc---------
75 | LNOP : supported
76 | SERIALIZE : unsupported
77 | HYBRID : unsupported
78 | ---Deprecated---
79 | X87 : supported
80 | MMX : supported
81 | MMX+ : unsupported
82 | 3DNow! : unsupported
83 | 3DNow!+ : unsupported
84 | XOP : unsupported
85 | FMA4 : unsupported
86 | TBM : unsupported
87 | --- AVX512VBMI Byte2Byte mapping ---
88 | Masked VPERMI2B pair :10.1104
89 | VPERMI2B pair + kreg + TERNLOG :8.86203
90 | VPERMI2B pair + VPSHRQ + TERNLOG:8.20596
91 | ===================================
92 |
--------------------------------------------------------------------------------
/Results/Byte2Byte_RKL.txt:
--------------------------------------------------------------------------------
1 | Vendor: "GenuineIntel"
2 | Family:6 Model:167 Stepping:1 (a0671)
3 | Brand: " 11th Gen Intel(R) Core(TM) i9-11900K @ 3.50GHz"
4 | 512b FPU DP ports : 1
5 | ---GPR----------
6 | RDTSC : supported
7 | RDTSCP : supported
8 | CMOV : supported
9 | CMPX8 : supported
10 | CMPX16 : supported
11 | AMD64 : supported
12 | LAHF : supported
13 | MOVBE : supported
14 | ABM : supported
15 | POPCNT : supported
16 | RDRAND : supported
17 | RDSEED : supported
18 | ADX : supported
19 | BMI : supported
20 | BMI2 : supported
21 | MOVDIRI : unsupported
22 | MOVDIR64B : unsupported
23 | ---SIMD---------
24 | SSE : supported
25 | SSE2 : supported
26 | SSE3 : supported
27 | SSSE3 : supported
28 | SSE41 : supported
29 | SSE42 : supported
30 | SSE4A : unsupported
31 | CLMUL : supported
32 | AES : supported
33 | SHA : supported
34 | AVX : supported, OS enabled
35 | AVX2 : supported, OS enabled
36 | FMA : supported, OS enabled
37 | F16C : supported, OS enabled
38 | GFNI : supported
39 | VAES : supported
40 | VPCLMULQDQ : supported
41 | KEYLOCK : unsupported
42 | AVX_VNNI : unsupported
43 | ---AVX512-------
44 | AVX512F : supported, OS enabled
45 | AVX512CD : supported, OS enabled
46 | AVX512ER : unsupported
47 | AVX512PF : unsupported
48 | AVX512BW : supported, OS enabled
49 | AVX512DQ : supported, OS enabled
50 | AVX512VL : supported, OS enabled
51 | AVX512VBMI : supported, OS enabled
52 | AVX512IFMA : supported, OS enabled
53 | AVX512VNNI : supported, OS enabled
54 | AVX512_4VNNIW : unsupported
55 | AVX512_4FMAPS : unsupported
56 | AVX512_VPOPCNTDQ : supported, OS enabled
57 | AVX512_BITALG : supported, OS enabled
58 | AVX512_VBMI2 : supported, OS enabled
59 | AVX512_BF16 : unsupported
60 | AVX512_VP2INTERSECT : unsupported
61 | AVX512_FP16 : unsupported
62 | ---AMX----------
63 | AMX-BF16 : unsupported
64 | AMX-INT8 : unsupported
65 | AMX-TILE : unsupported
66 | ---CacheLine----
67 | PREFETCHW : supported
68 | PREFETCHWT1 : unsupported
69 | CLFLUSH : supported
70 | CLFLUSHOPT : supported
71 | CLWB : unsupported
72 | CLZERO : unsupported
73 | CLDEMOTE : unsupported
74 | ---Misc---------
75 | LNOP : supported
76 | SERIALIZE : unsupported
77 | HYBRID : unsupported
78 | ---Deprecated---
79 | X87 : supported
80 | MMX : supported
81 | MMX+ : unsupported
82 | 3DNow! : unsupported
83 | 3DNow!+ : unsupported
84 | XOP : unsupported
85 | FMA4 : unsupported
86 | TBM : unsupported
87 | --- AVX512VBMI Byte2Byte mapping ---
88 | Masked VPERMI2B pair :10.0858
89 | VPERMI2B pair + kreg + TERNLOG :8.95898
90 | VPERMI2B pair + GFNI + TERNLOG :8.49166
91 | VPERMI2B pair + VPSHRQ + TERNLOG:8.67294
92 | ===================================
93 |
--------------------------------------------------------------------------------
/VPCLMULQDQ_Demo_Test.cpp:
--------------------------------------------------------------------------------
1 | // VPCLMULQDQ_Demo_Test.cpp
2 |
3 | #include "stdafx.h"
4 | #include "VPCLMULQDQ_Demo.h"
5 |
6 | extern CPU_Props cpu_props;
7 |
8 | using namespace std;
9 | // Self-test and demo of the 128/256/512-bit prefix-XOR helpers on RDRAND-seeded inputs; prints the input and its prefix-XOR for each supported width.
10 | void VPCLMULQDQ_Demo_prefix_xor(void) {
11 | // Checked identity (linearity over XOR): PS-XOR(x) ^ PS-XOR(y) == PS-XOR(x ^ y)
12 | unsigned long long q64_0 = 0, q64_1 = 0;
13 | #if !defined(_M_X64)
14 | while (!_rdrand32_step((unsigned int *)&q64_0));
15 | while (!_rdrand32_step((unsigned int *)&q64_0 + 1));
16 | while (!_rdrand32_step((unsigned int *)&q64_1));
17 | while (!_rdrand32_step((unsigned int *)&q64_1 + 1));
18 | #else
19 | while (!_rdrand64_step(&q64_0));
20 | while (!_rdrand64_step(&q64_1));
21 | #endif
22 | // 128-bit variant: needs CLMUL at run time.
23 | if (cpu_props.IsFeat(FEAT_CLMUL)) {
24 | __m128i x128 = _mm_set_epi64x(q64_0, _rotl64(q64_0, q64_0 & 0x3f));
25 | __m128i y128 = _mm_set_epi64x(q64_1, _rotl64(q64_1, q64_1 & 0x3f));
26 | // test128 is all-zero exactly when the identity holds.
27 | __m128i test128 = _mm_xor_si128(
28 | _mm_xor_si128(_mm_prefix_xor_clmul_si128(x128), _mm_prefix_xor_clmul_si128(y128)),
29 | _mm_prefix_xor_clmul_si128(_mm_xor_si128(x128, y128)));
30 | assert(_mm_testz_si128(test128, test128));
31 | printRes("x128 ", x128);
32 | printRes("_mm_prefix_xor_clmul_si128 ", _mm_prefix_xor_clmul_si128(x128));
33 | }
34 | #if defined(__AVX2__)
35 | if (cpu_props.IsFeat(FEAT_AVX_VPCLMULQDQ)) {
36 | unsigned long long q64_2 = 0, q64_3 = 0;
37 | #if !defined(_M_X64)
38 | while (!_rdrand32_step((unsigned int *)&q64_2));
39 | while (!_rdrand32_step((unsigned int *)&q64_2 + 1));
40 | while (!_rdrand32_step((unsigned int *)&q64_3));
41 | while (!_rdrand32_step((unsigned int *)&q64_3 + 1));
42 | #else
43 | while (!_rdrand64_step(&q64_2));
44 | while (!_rdrand64_step(&q64_3));
45 | #endif
46 | __m256i x256 = _mm256_set_epi64x(q64_0, _rotl64(q64_0, q64_0 & 0x3f), q64_2, _rotl64(q64_2, q64_2 & 0x3f));
47 | __m256i y256 = _mm256_set_epi64x(q64_1, _rotl64(q64_1, q64_1 & 0x3f), q64_3, _rotl64(q64_3, q64_3 & 0x3f));
48 | // test256 is all-zero exactly when the identity holds.
49 | __m256i test256 = _mm256_xor_si256(
50 | _mm256_xor_si256(_mm256_prefix_xor_clmul_si256(x256), _mm256_prefix_xor_clmul_si256(y256)),
51 | _mm256_prefix_xor_clmul_si256(_mm256_xor_si256(x256, y256)));
52 | assert(_mm256_testz_si256(test256, test256));
53 |
54 | printRes("x256 ", x256);
55 | printRes("_mm256_prefix_xor_clmul_si256 ", _mm256_prefix_xor_clmul_si256(x256));
56 | }
57 | #endif
58 | #if defined(__AVX512F__)
59 | if (cpu_props.IsFeat(FEAT_AVX512_VPCLMULQDQ)) {
60 | unsigned long long q64_2 = 0, q64_3 = 0;
61 | #if !defined(_M_X64)
62 | while (!_rdrand32_step((unsigned int *)&q64_2));
63 | while (!_rdrand32_step((unsigned int *)&q64_2 + 1));
64 | while (!_rdrand32_step((unsigned int *)&q64_3));
65 | while (!_rdrand32_step((unsigned int *)&q64_3 + 1));
66 | #else
67 | while (!_rdrand64_step(&q64_2));
68 | while (!_rdrand64_step(&q64_3));
69 | #endif
70 | __m512i x512 = _mm512_set_epi64(q64_0, _rotl64(q64_0, q64_0 & 0x3f), q64_2, _rotl64(q64_2, q64_2 & 0x3f), q64_1, _rotl64(q64_1, q64_0 & 0x3f), q64_3, _rotl64(q64_2, q64_2 & 0x3f));
71 | __m512i y512 = _mm512_set_epi64(q64_1, _rotl64(q64_1, q64_1 & 0x3f), q64_3, _rotl64(q64_3, q64_3 & 0x3f), q64_0, _rotl64(q64_0, q64_1 & 0x3f), q64_2, _rotl64(q64_3, q64_3 & 0x3f));
72 | // The byte compare yields one mask bit per byte; the identity holds only when all 64 bits are set, so a merely non-zero mask is not enough.
73 | __mmask64 test512 =_mm512_cmpeq_epi8_mask(
74 | _mm512_xor_si512(_mm512_prefix_xor_clmul_si512(x512), _mm512_prefix_xor_clmul_si512(y512)),
75 | _mm512_prefix_xor_clmul_si512(_mm512_xor_si512(x512, y512)));
76 | assert(test512 == 0xffffffffffffffffULL);
77 |
78 | printRes("x512 ", x512);
79 | printRes("_mm512_prefix_xor_clmul_si512 ", _mm512_prefix_xor_clmul_si512(x512));
80 | }
81 | #endif
82 | }
83 |
84 | void VPCLMULQDQ_Demo(void) {
85 | std::cout << "-----------------------------------" << std::endl; // separator line before the demo output
86 | VPCLMULQDQ_Demo_prefix_xor();
87 | }
88 |
--------------------------------------------------------------------------------
/Args.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #if defined(__AVX512F__)
4 | #define ISA_FILENAME "AVX512"
5 | #elif defined (__AVX2__)
6 | #define ISA_FILENAME "AVX2"
7 | #else
8 | #define ISA_FILENAME "SSE"
9 | #endif
10 |
11 | #if defined (_M_X64)
12 | #define SOLUTION_FILENAME "_X64"
13 | #else
14 | #define SOLUTION_FILENAME "_X32"
15 | #endif
16 |
17 | #if defined (_DEBUG)
18 | #define DEBUG_FILENAME "_Debug"
19 | #else
20 | #define DEBUG_FILENAME
21 | #endif
22 |
23 | #define DEMO_FILENAME "InstLatX64_Demo_" ISA_FILENAME SOLUTION_FILENAME DEBUG_FILENAME ".exe"
24 |
25 | #define STR_MAXLEN 256
26 | #define MAX_ARGERROR 8
27 | #define MAX_DEMOMASK 1
28 | #define MAX_TSCRATIO 2.0
29 |
30 | #define ARGERR_INV_CHAR "Invalid character: "
31 | #define ARGERR_INV_PARAM "Invalid parameter: "
32 | #define ARGERR_INV_SWITCH "Invalid switch: "
33 | #define ARGERR_INV_DEMO "Invalid demo type: "
34 | #define ARGERR_INV_CPUIDFILE "Invalid CPUID filename: "
35 | #define ARGERR_INV_XCR0 "Invalid XCR0 register value: "
36 | #define ARGERR_INV_TSCRATIO "Invalid TSC ratio value: "
37 |
38 | #define ARGERR_MISS_ARG "Missing argument: "
39 | #define ARGERR_MISS_DEMO "Missing demo type: "
40 | #define ARGERR_MISS_THREAD "Missing thread index: "
41 | #define ARGERR_MISS_CPUIDFILE "Missing CPUID filename: "
42 | #define ARGERR_MISS_XCR0 "Missing XCR0 register value: "
43 | #define ARGERR_MISS_TSCRATIO "Missing TSC ratio value: "
44 |
/* Default index sentinels: all-ones and the two values directly below it.  */
/* Every expansion is parenthesized so it stays a single operand wherever   */
/* the macro is used.                                                        */
#define DEFAULT_PCORE_INDEX		(~0)
#define DEFAULT_ECORE_INDEX		(DEFAULT_PCORE_INDEX - 1)
#define DEFAULT_LPECORE_INDEX	(DEFAULT_ECORE_INDEX - 1)
48 |
// Kinds of command-line arguments.
// ARG_512BFMADP exists only in x64 AVX-512 builds, so the numeric value of
// every enumerator after ARG_PROCMASK depends on the build configuration.
enum argType {
	ARG_HELP = 0,
	ARG_VERSION,
	ARG_DEMOLIST,
	ARG_CPUPROPS,
	ARG_PCORE,
	ARG_ECORE,
	ARG_LPECORE,
	ARG_CPUIDDUMP,
	ARG_PROCMASK,
#if defined (_M_X64) && defined(__AVX512F__)
	ARG_512BFMADP,
#endif
	ARG_DEMOTYPE,
	ARG_THREADINDEX,
	ARG_CPUIDFILE,
	ARG_XCR0,
	ARG_TSCRATIO,
	ARG_NOTHING,		// last enumerator
};
69 |
70 | typedef struct {
71 | bool arguments;
72 | const char * longName;
73 | char shortName;
74 | argType type;
75 | const char * missingErr;
76 | const char * description;
77 | } paramsType;
78 |
79 | class Args {
80 | private:
81 | static const paramsType params[];
82 | uint64_t demoMask[MAX_DEMOMASK] = {0};
83 | argType paramType;
84 | size_t demoCount;
85 | size_t paramCount;
86 | size_t threadIndex;
87 | UINT64 xcr0;
88 | double tscRatio;
89 | const demoTypeList* demoList;
90 | bool helpFlag;
91 | bool versionFlag;
92 | bool listFlag;
93 | bool cpuPropsFlag;
94 | bool procMaskFlag;
95 | #if defined (_M_X64) && defined(__AVX512F__)
96 | bool _512bFMA_DP_Flag;
97 | #endif
98 | bool errorFlag;
99 | bool dumpFlag;
100 | bool cpuidFileFlag;
101 | bool validFlag;
102 | void SetError(char* , char*, const char* );
103 | void SetParam(argType, char*, char* , int* );
104 | char * cpuidFileName;
105 | public:
106 | Args(const demoTypeList[], size_t, int argc, char** argv);
107 | bool Init(int argc, char** argv);
108 | bool IsVersion(void) const;
109 | bool IsHelp(void) const;
110 | bool IsDemoList(void) const;
111 | bool IsCPUProps(void) const;
112 | bool IsCPUIDDump(void) const;
113 | bool IsCPUIDFile(void) const;
114 | bool IsProcMask(void) const;
115 | #if defined (_M_X64) && defined(__AVX512F__)
116 | bool Is_512bFMA_DP_Ports(void) const;
117 | #endif
118 | bool IsValid(void) const;
119 | void PrintUsage(void) const;
120 | void PrintVersion(void) const;
121 | size_t GetMaxDemo(void) const;
122 | size_t GetThreadIndex(CPU_Props) const;
123 | char* GetCPUIDFileName() const;
124 | bool IsSelected(size_t) const;
125 | UINT64 GetXCR0(void) const;
126 | double GetTSCRatio(void) const;
127 | };
128 |
129 | extern CPU_Props cpu_props;
130 |
--------------------------------------------------------------------------------
/Results/Zen4_expected.txt:
--------------------------------------------------------------------------------
1 | Vendor: "AuthenticAMD"
2 | Family:25 Model:16 Stepping:0 (a10f00)
3 | ---GPR----------
4 | RDTSC : supported
5 | RDTSCP : supported
6 | CMOV : supported
7 | CMPX8 : supported
8 | CMPX16 : supported
9 | AMD64 : supported
10 | LAHF : supported
11 | MOVBE : supported
12 | ABM : supported
13 | POPCNT : supported
14 | RDRAND : supported
15 | RDSEED : supported
16 | ADX : supported
17 | BMI : supported
18 | BMI2 : supported
19 | MOVDIRI : unsupported
20 | MOVDIR64B : unsupported
21 | ---SIMD---------
22 | SSE : supported
23 | SSE2 : supported
24 | SSE3 : supported
25 | SSSE3 : supported
26 | SSE41 : supported
27 | SSE42 : supported
28 | SSE4A : supported
29 | CLMUL : supported
30 | AES : supported
31 | SHA : supported
32 | AVX : supported, OS enabled
33 | AVX2 : supported, OS enabled
34 | FMA : supported, OS enabled
35 | F16C : supported, OS enabled
36 | GFNI : supported
37 | VAES : supported
38 | VPCLMULQDQ : supported
39 | AVX_VNNI : unsupported
40 | ---AVX512-------
41 | AVX512F : supported, OS enabled
42 | AVX512CD : supported, OS enabled
43 | AVX512ER : unsupported
44 | AVX512PF : unsupported
45 | AVX512BW : supported, OS enabled
46 | AVX512DQ : supported, OS enabled
47 | AVX512VL : supported, OS enabled
48 | AVX512VBMI : supported, OS enabled
49 | AVX512IFMA : supported, OS enabled
50 | AVX512VNNI : supported, OS enabled
51 | AVX512_4VNNIW : unsupported
52 | AVX512_4FMAPS : unsupported
53 | AVX512_VPOPCNTDQ : supported, OS enabled
54 | AVX512_BITALG : supported, OS enabled
55 | AVX512_VBMI2 : supported, OS enabled
56 | AVX512_BF16 : supported, OS enabled
57 | AVX512_VP2INTERSECT : unsupported
58 | AVX512_FP16 : unsupported
59 | ---AMX----------
60 | AMX-BF16 : unsupported
61 | AMX-INT8 : unsupported
62 | AMX-TILE : unsupported
63 | ---CacheLine----
64 | PREFETCHW : supported
65 | PREFETCHWT1 : unsupported
66 | CLFLUSH : supported
67 | CLFLUSHOPT : supported
68 | CLWB : supported
69 | CLZERO : supported
70 | CLDEMOTE : unsupported
71 | ---uCode--------
72 | Enh REP MOVSB/STOSB : supported
73 | Fast short REP MOV : supported
74 | Fast zero-length MOVSB : unsupported
75 | Fast short STOSB : unsupported
76 | Fast short CMPSB, SCASB : unsupported
77 | ---Keylocker
78 | KEYLOCK : unsupported
79 | AESKLE : unsupported
80 | WIDE_KL : unsupported
81 | ---Uncategorized
82 | LNOP : supported
83 | SERIALIZE : unsupported
84 | HYBRID : unsupported
85 | RDPID : supported
86 | RDPRU : supported
87 | MCOMMIT : unsupported
88 | ---Deprecated---
89 | X87 : supported
90 | MMX : supported
91 | MMX+ : supported
92 | 3DNow! : unsupported
93 | 3DNow!+ : unsupported
94 | XOP : unsupported
95 | FMA4 : unsupported
96 | TBM : unsupported
97 | MPX : unsupported
98 | HLE : unsupported
99 | PCOMMIT : unsupported
--------------------------------------------------------------------------------
/Results/HYBRID_Lakefield_CPUID806A1.txt:
--------------------------------------------------------------------------------
1 | Vendor: "GenuineIntel"
2 | Family:6 Model:138 Stepping:1 (806a1)
3 | Brand: " Intel(R) Core(TM) i5-L16G7 CPU @ 1.40GHz"
4 | 512b FPU DP ports : 0
5 | ---GPR----------
6 | RDTSC : supported
7 | RDTSCP : supported
8 | CMOV : supported
9 | CMPX8 : supported
10 | CMPX16 : supported
11 | AMD64 : supported
12 | LAHF : supported
13 | MOVBE : supported
14 | ABM : unsupported
15 | POPCNT : supported
16 | RDRAND : supported
17 | RDSEED : supported
18 | ADX : unsupported
19 | BMI : unsupported
20 | BMI2 : unsupported
21 | MOVDIRI : unsupported
22 | MOVDIR64B : unsupported
23 | ---SIMD---------
24 | SSE : supported
25 | SSE2 : supported
26 | SSE3 : supported
27 | SSSE3 : supported
28 | SSE41 : supported
29 | SSE42 : supported
30 | SSE4A : unsupported
31 | CLMUL : supported
32 | AES : supported
33 | SHA : supported
34 | AVX : unsupported
35 | AVX2 : unsupported
36 | FMA : unsupported
37 | F16C : unsupported
38 | GFNI : supported
39 | VAES : unsupported
40 | VPCLMULQDQ : unsupported
41 | KEYLOCK : unsupported
42 | AVX_VNNI : unsupported
43 | ---AVX512-------
44 | AVX512F : unsupported
45 | AVX512CD : unsupported
46 | AVX512ER : unsupported
47 | AVX512PF : unsupported
48 | AVX512BW : unsupported
49 | AVX512DQ : unsupported
50 | AVX512VL : unsupported
51 | AVX512VBMI : unsupported
52 | AVX512IFMA : unsupported
53 | AVX512VNNI : unsupported
54 | AVX512_4VNNIW : unsupported
55 | AVX512_4FMAPS : unsupported
56 | AVX512_VPOPCNTDQ : unsupported
57 | AVX512_BITALG : unsupported
58 | AVX512_VBMI2 : unsupported
59 | AVX512_BF16 : unsupported
60 | AVX512_VP2INTERSECT : unsupported
61 | AVX512_FP16 : unsupported
62 | ---AMX----------
63 | AMX-BF16 : unsupported
64 | AMX-INT8 : unsupported
65 | AMX-TILE : unsupported
66 | ---CacheLine----
67 | PREFETCHW : supported
68 | PREFETCHWT1 : unsupported
69 | CLFLUSH : supported
70 | CLFLUSHOPT : supported
71 | CLWB : supported
72 | CLZERO : unsupported
73 | CLDEMOTE : unsupported
74 | ---uCode--------
75 | Enh REP MOVSB/STOSB : supported
76 | Fast short REP MOV : unsupported
77 | Fast zero-length MOVSB : unsupported
78 | Fast short STOSB : unsupported
79 | Fast short CMPSB, SCASB : unsupported
80 | ---Uncategorized
81 | LNOP : supported
82 | SERIALIZE : unsupported
83 | HYBRID : supported
84 | RDPID : unsupported
85 | RDPRU : unsupported
86 | MCOMMIT : unsupported
87 | ---Deprecated---
88 | X87 : supported
89 | MMX : supported
90 | MMX+ : unsupported
91 | 3DNow! : unsupported
92 | 3DNow!+ : unsupported
93 | XOP : unsupported
94 | FMA4 : unsupported
95 | TBM : unsupported
96 | MPX : unsupported
97 | HLE : unsupported
98 | PCOMMIT : unsupported
99 | --Hybrid info--
100 | systemAffinityMask: 0x000000000000001f
101 | littleCoreMask : 0x000000000000000f
102 | bigCoreMask : 0x0000000000000010
103 | ===================================
104 |
--------------------------------------------------------------------------------
/HWBITPERM_Demo_Asm.asm:
--------------------------------------------------------------------------------
1 | .data
2 |
3 | savereg dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h
4 | savereg2 dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h
5 |
6 | exch0 dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h
7 | exch1 dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h
8 | exch2 dq 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h, 00000000000000000h
9 |
10 | .code
11 |
; EMPTY_Init: initialisation hook that emits no code. It accepts the same
; three arguments that TIMED and NAKED pass to their INIT parameter.
EMPTY_Init macro EMPTYPAR, INST, CPU
endm
14 |
; B64_HW_Core disp, INST
; Processes the qword at byte offset disp:
;     exch2[disp] = INST(exch0[disp], exch1[disp])
; Any INST other than PGRP is emitted directly as a three-operand instruction.
; PGRP is not a machine instruction, so it is synthesized as
;     PEXT(src, mask) OR (PEXT(src, NOT mask) SHL POPCNT(mask))
; The IFIDNI operands must be angle-bracketed text items; with empty operands
; the test is always true and the ELSE branch is never assembled.
; Clobbers rax and rdx; the PGRP path also clobbers r8 and r9.
B64_HW_Core macro disp, INST
	mov				rax, qword ptr [exch0 + disp]
	IFIDNI <INST>, <PGRP>
		mov			r9, qword ptr [exch1 + disp]
		pext		rdx, rax, r9			; bits selected by the mask, packed low
		popcnt		r8, r9					; number of selected bits
		not			r9
		pext		r9, rax, r9				; bits not selected by the mask
		shlx		r9, r9, r8				; move them above the selected group
		or			rdx, r9
	ELSE
		INST		rdx, rax, qword ptr [exch1 + disp]
	ENDIF
	mov				qword ptr [exch2 + disp], rdx
endm
30 |
; B32_HW_Core disp, INST
; 32-bit counterpart of B64_HW_Core; processes the dword at byte offset disp:
;     exch2[disp] = INST(exch0[disp], exch1[disp])
; Any INST other than PGRP is emitted directly as a three-operand instruction.
; PGRP is not a machine instruction, so it is synthesized as
;     PEXT(src, mask) OR (PEXT(src, NOT mask) SHL POPCNT(mask))
; The IFIDNI operands must be angle-bracketed text items; with empty operands
; the test is always true and the ELSE branch is never assembled.
; Clobbers eax and edx; the PGRP path also clobbers r8d and r9d.
B32_HW_Core macro disp, INST
	mov				eax, dword ptr [exch0 + disp]
	IFIDNI <INST>, <PGRP>
		mov			r9d, dword ptr [exch1 + disp]
		pext		edx, eax, r9d			; bits selected by the mask, packed low
		popcnt		r8d, r9d				; number of selected bits
		not			r9d
		pext		r9d, eax, r9d			; bits not selected by the mask
		shlx		r9d, r9d, r8d			; move them above the selected group
		or			edx, r9d
	ELSE
		INST		edx, eax, dword ptr [exch1 + disp]
	ENDIF
	mov				dword ptr [exch2 + disp], edx
endm
46 |
; HW BITNESS, INST
; Stores zmm0 and zmm1 to the exch0 / exch1 buffers, expands the scalar core
; macro once for each of the 512 / BITNESS elements, and reloads the results
; from exch2 into zmm0.
HW macro BITNESS, INST
	; vector registers -> memory
	IF BITNESS EQ 64
		vmovdqu64	zmmword ptr [exch0], zmm0
		vmovdqu64	zmmword ptr [exch1], zmm1
	ELSE
		vmovdqu32	zmmword ptr [exch0], zmm0
		vmovdqu32	zmmword ptr [exch1], zmm1
	ENDIF

	; one core expansion per element; disp is the element's byte offset
	disp = 0
	REPEAT 512 / BITNESS
		IF BITNESS EQ 64
			B64_HW_Core	disp, INST
		ELSE
			B32_HW_Core	disp, INST
		ENDIF
		disp = disp + (BITNESS / 8)
	endm

	; memory -> result register
	IF BITNESS EQ 64
		vmovdqu64	zmm0, zmmword ptr [exch2]
	ELSE
		vmovdqu32	zmm0, zmmword ptr [exch2]
	ENDIF
endm
72 |
; TIMED PNAME, INIT, BITNESS, CORE, INST, CPU, TPLAT
; Defines procedure PNAME, which runs CORE DEPEXT219_REPEATS times between two
; RDTSCP reads and returns the elapsed TSC ticks in rax.
;   TPLAT = LAT : zmm0 (the result) is copied into zmm1 after every iteration,
;                 so each iteration consumes the previous one's output.
;   TPLAT = TP  : both inputs are zeroed after every iteration, so iterations
;                 do not depend on each other.
; The IFIDNI / ELSEIFIDNI operands must be angle-bracketed text items; with
; empty operands the first test is always true and TP is never selected.
TIMED macro PNAME, INIT, BITNESS, CORE, INST, CPU, TPLAT
PNAME proc
	push			rbx
	push			rdi
	push			rsi

	INIT			BITNESS, INST, CPU

	mfence
	rdtscp
	lfence

	mov				esi, eax				; start TSC, low half
	mov				edi, edx				; start TSC, high half

	mov				ecx, DEPEXT219_REPEATS	; loaded after rdtscp, which overwrites ecx

	align 16
startlabel:
	CORE			BITNESS, INST

	IFIDNI <TPLAT>, <LAT>
		IF BITNESS EQ 64
			vmovdqa64	zmm1, zmm0
		ELSE
			vmovdqa32	zmm1, zmm0
		ENDIF
	ELSEIFIDNI <TPLAT>, <TP>
		vpxor		xmm0, xmm0, xmm0
		vpxor		xmm1, xmm1, xmm1
	ENDIF

	dec				ecx
	jnz				startlabel

	mfence
	rdtscp
	lfence

	shl				rdx, 20h
	shl				rdi, 20h
	or				rax, rdx				; rax = end TSC
	or				rsi, rdi				; rsi = start TSC

	sub				rax, rsi				; return value: end - start


	pop				rsi
	pop				rdi
	pop				rbx
	ret
PNAME endp
endm
126 |
; NAKED PNAME, INIT, BITNESS, CORE, INST, CPU
; Defines procedure PNAME containing the INIT expansion, a single CORE
; expansion and ret. No timing code and no register saves are emitted.
NAKED macro PNAME, INIT, BITNESS, CORE, INST, CPU
PNAME proc

	INIT			BITNESS, INST, CPU

	CORE			BITNESS, INST


	ret
PNAME endp
endm
138 |
139 | ;Credit: Travis Downs
140 | ;https://twitter.com/trav_downs/status/1418616866080116742?
141 |
142 | TIMED BEXT64_HW_Lat, EMPTY_Init, 64, HW, PEXT, SKX, LAT
143 | TIMED BDEP64_HW_Lat, EMPTY_Init, 64, HW, PDEP, SKX, LAT
144 | TIMED BGRP64_HW_Lat, EMPTY_Init, 64, HW, PGRP, SKX, LAT
145 | TIMED BEXT32_HW_Lat, EMPTY_Init, 32, HW, PEXT, SKX, LAT
146 | TIMED BDEP32_HW_Lat, EMPTY_Init, 32, HW, PDEP, SKX, LAT
147 | TIMED BGRP32_HW_Lat, EMPTY_Init, 32, HW, PGRP, SKX, LAT
148 |
149 | TIMED BEXT64_HW_Tp, EMPTY_Init, 64, HW, PEXT, SKX, TP
150 | TIMED BDEP64_HW_Tp, EMPTY_Init, 64, HW, PDEP, SKX, TP
151 | TIMED BGRP64_HW_Tp, EMPTY_Init, 64, HW, PGRP, SKX, TP
152 | TIMED BEXT32_HW_Tp, EMPTY_Init, 32, HW, PEXT, SKX, TP
153 | TIMED BDEP32_HW_Tp, EMPTY_Init, 32, HW, PDEP, SKX, TP
154 | TIMED BGRP32_HW_Tp, EMPTY_Init, 32, HW, PGRP, SKX, TP
155 |
156 | NAKED BEXT64_HW@@128, EMPTY_Init, 64, HW, PEXT, SKX
157 | NAKED BDEP64_HW@@128, EMPTY_Init, 64, HW, PDEP, SKX
158 | NAKED BGRP64_HW@@128, EMPTY_Init, 64, HW, PGRP, SKX
159 | NAKED BEXT32_HW@@128, EMPTY_Init, 32, HW, PEXT, SKX
160 | NAKED BDEP32_HW@@128, EMPTY_Init, 32, HW, PDEP, SKX
161 | NAKED BGRP32_HW@@128, EMPTY_Init, 32, HW, PGRP, SKX
162 |
163 | end
--------------------------------------------------------------------------------
/AVX512_Reduce_Add.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "AVX512_Reduce_Add.h"
3 |
4 | extern CPU_Props cpu_props;
5 |
6 | using namespace std;
7 |
8 | //credit: @geofflangdale https://twitter.com/geofflangdale/status/1609575574946865154
9 |
10 | uint32_t _mm512_reduce2_add_epu8(__m512i z) {
11 | __m128i permb_collect = _mm_setr_epi64x(0x3830282018100800, 0x3931292119110901);
12 | __m512i sad = _mm512_sad_epu8(_mm512_setzero_si512(), z);
13 | __m128i permb = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(permb_collect), sad));
14 | __m128i sad2 = _mm_sad_epu8(_mm_setzero_si128(), permb);
15 | return _mm_cvtsi128_si32(_mm_add_epi32(sad2, _mm_srli_si128(sad2, 7)));
16 | }
17 |
// Horizontal sum of the 32 unsigned words of z.
uint32_t _mm512_reduce2_add_epu16(__m512i z) {
	// Per 128-bit lane: even bytes (low halves of the words) go to the low
	// qword, odd bytes (high halves) go to the high qword.
	const __m128i splitIdx	= _mm_setr_epi32(0x06040200, 0x0e0c0a08, 0x07050301, 0x0f0d0b09);
	const __m512i split		= _mm512_shuffle_epi8(z, _mm512_broadcast_i32x4(splitIdx));

	// Per lane: qword 0 = sum of the low bytes, qword 1 = sum of the high bytes.
	const __m512i laneSums	= _mm512_sad_epu8(_mm512_setzero_si512(), split);

	// Gather, dword by dword: byte 0 of the four low-byte sums, byte 1 of the
	// four low-byte sums, byte 0 of the four high-byte sums, byte 1 of those.
	const __m128i gatherIdx	= _mm_setr_epi32(0x30201000, 0x31211101, 0x38281808, 0x39291909);
	const __m128i gathered	= _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(gatherIdx), laneSums));

	// Masked VDBPSADBW against zero: each dword becomes the sum of its four bytes.
	const __m128i dwordSums	= _mm_maskz_dbsad_epu8(0x55, gathered, _mm_setzero_si128(), 0);

	// Fold the odd dword of each qword onto the even one with weight 256,
	// then fold the high qword onto the low one, again with weight 256.
	const __m128i folded	= _mm_add_epi32(_mm_srli_epi64(dwordSums, 24), dwordSums);
	return _mm_cvtsi128_si32(_mm_add_epi32(folded, _mm_srli_si128(folded, 7)));
}
28 |
29 | uint64_t _mm512_reduce2_add_epu32(__m512i z) {
30 | __m128i transpose_4x4 = _mm_setr_epi32(0x0c080400, 0x0d090501, 0x0e0a0602, 0x0f0b0703);
31 | __m512i transpose = _mm512_shuffle_epi8(z, _mm512_broadcast_i32x4(transpose_4x4));
32 | __m512i dbsad = _mm512_maskz_dbsad_epu8(0x55555555, transpose, _mm512_setzero_si512(), 0);
33 | __m512i permb_collect = _mm512_setr_epi32(0x30201000, 0x3f3f3f3f, 0x31211101, 0x34241404, 0x35251505, 0x38281808, 0x39291909, 0x3c2c1c0c, 0x3d2d1d0d, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f, 0x3f3f3f3f);
34 | __m512i permb = _mm512_permutexvar_epi8(permb_collect, dbsad);
35 | __m512i sad = _mm512_sad_epu8(permb, _mm512_setzero_si512());
36 | __m128i permb2 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(_mm_setr_epi64x(0x3f3f3f2018100800, 0x3f3f21191109013f)), sad));
37 | return _mm_cvtsi128_si64(_mm_add_epi64(permb2, _mm_unpackhi_epi64(permb2, permb2)));
38 | }
39 |
40 | uint64_t _mm512_reduce2_add_epu64(__m512i z) {
41 | __m512i transpose_8x8 = _mm512_setr_epi64(0x3830282018100800, 0x3931292119110901, 0x3a322a221a120a02, 0x3b332b231b130b03, 0x3c342c241c140c04, 0x3d352d251d150d05, 0x3e362e261e160e06, 0x3f372f271f170f07);
42 | __m512i transpose = _mm512_permutexvar_epi8(transpose_8x8, z);
43 | __m512i sad = _mm512_sad_epu8(transpose, _mm512_setzero_si512());
44 | __m128i collect = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(_mm_setr_epi64x(0x3830282018100800, 0x312921191109013f)), sad));
45 | return _mm_cvtsi128_si64(_mm_add_epi64(collect, _mm_unpackhi_epi64(collect, collect)));
46 | }
47 |
48 | uint64_t _mm512_reduce2_add_epu128(__m512i z, uint64_t* hi) {
49 | __m512i transpose_8x8 = _mm512_setr_epi64(0x3830282018100800, 0x3931292119110901, 0x3a322a221a120a02, 0x3b332b231b130b03, 0x3c342c241c140c04, 0x3d352d251d150d05, 0x3e362e261e160e06, 0x3f372f271f170f07);
50 | __m512i transpose = _mm512_permutexvar_epi8(transpose_8x8, z);
51 | __m512i sad = _mm512_sad_epu8(transpose, _mm512_setzero_si512());
52 | __m128i collect0 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(_mm_setr_epi64x(0x3830282018100800, 0x3f38302820181008)), sad));
53 | __m128i collect1 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(_mm512_zextsi128_si512(_mm_setr_epi64x(0x312921191109013f, 0x3931292119110901)), sad));
54 | __m128i add = _mm_add_epi64(collect0, collect1);
55 | *hi = _mm_extract_epi64(add, 1) >> 56;
56 | return _mm_cvtsi128_si64(add);
57 | }
58 |
59 | void AVX512_Reduce_Add_Demo(void) {
60 | __m512i u = _mm512_undefined_epi32();
61 |
62 | cout << hex;
63 | /* Microsoft built-in reduce_add intrinsics */
64 | //cout << setw(16) << right << _mm512_reduce_add_epu8(_mm512_set1_epi8(0xFE)) << endl;
65 | //cout << setw(16) << right << _mm512_reduce_add_epu16(_mm512_set1_epi16(0xFEDC)) << endl;
66 | //cout << setw(16) << right << _mm512_reduce_add_epi32(_mm512_set1_epi32(0xFEDCBA98)) << endl;
67 | //cout << setw(16) << right << _mm512_reduce_add_epi64(_mm512_set1_epi64(0xFEDCBA9876543210)) << endl;
68 |
69 | cout << setw(16) << right << _mm512_reduce2_add_epu8(_mm512_set1_epi8((char)0xFE)) << endl;
70 | cout << setw(16) << right << _mm512_reduce2_add_epu16(_mm512_set1_epi16((short)0xFEDC)) << endl;
71 | cout << setw(16) << right << _mm512_reduce2_add_epu32(_mm512_set1_epi32(0xFEDCBA98)) << endl;
72 | cout << setw(16) << right << _mm512_reduce2_add_epu64(_mm512_set1_epi64(0xFEDCBA9876543210)) << endl;
73 |
74 | uint64_t hi, lo = _mm512_reduce2_add_epu128(_mm512_set1_epi64(0xFEDCBA9876543210), &hi);
75 | cout << setw(16) << right << hi << ':' << lo << endl;
76 | }
--------------------------------------------------------------------------------
/AMX_Demo.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "AMX_Demo.h"
3 |
4 | extern CPU_Props cpu_props;
5 |
6 | using namespace std;
7 |
// AMX walkthrough: prints the palette-0 tile geometry reported by cpu_props,
// loads a tile configuration, executes one instance of each AMX-TILE,
// AMX-INT8 and AMX-BF16 intrinsic, stores the result tiles and the active
// configuration, and releases the tile state.
// The body is compiled only for x64 builds with _MSC_VER > 1927.
void AMX_Test(void) {
#if (_MSC_VER > 1927)
#if defined (_M_X64)
	//AMX-TILE
	const unsigned int ttb			= cpu_props.GetAMXPalette_TotalTileBytes(0);
	const unsigned int maxRegs		= cpu_props.GetAMXPalette_MaxName(0);
	const unsigned int maxRegSize	= ttb / max(1, maxRegs);	// max(1, ...) avoids a division by zero
	const unsigned int amxCols		= cpu_props.GetAMXCols();
	const unsigned int amxRows		= cpu_props.GetAMXRows();
	cout << "ttb       :" << dec << ttb << endl;
	cout << "maxRegs   :" << maxRegs << endl;
	cout << "maxRegSize:" << maxRegSize << endl;
	cout << "AMXCols   :" << amxCols << endl;
	cout << "AMXRows   :" << amxRows << endl;
	XTILECFG load_tilecfg(amxCols, amxRows, ttb, maxRegs);

	XTILECFG store_tilecfg;
	// Source tiles: the first 128 bytes (tile0, tile1) or 32 words (tile2)
	// are initialised explicitly, the rest of each buffer is zero.
	unsigned char * tile0		= new unsigned char[maxRegSize]{	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
																0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
																0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
																0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
																0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
																0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
																0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
																0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f};
	unsigned char * tile1		= new unsigned char[maxRegSize]{	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
																0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
																0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
																0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
																0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
																0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
																0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
																0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff};
	unsigned short *tile2		= new unsigned short[maxRegSize / 2]{	0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x4000, 0x4000, 0x4000, 0x4000,
																	0x4040, 0x4040, 0x4040, 0x4040, 0x0000, 0x0000, 0x0000, 0x0000,
																	0xbf80, 0xbf80, 0xbf80, 0xbf80, 0xc000, 0xc000, 0xc000, 0xc000,
																	0xc040, 0xc040, 0xc040, 0xc040, 0x8000, 0x8000, 0x8000, 0x8000};
	unsigned char * restile2	= new unsigned char[maxRegSize];
	unsigned char * restile3	= new unsigned char[maxRegSize];
	unsigned char * restile4	= new unsigned char[maxRegSize];
	unsigned char * restile5	= new unsigned char[maxRegSize];
	unsigned char * restile6	= new unsigned char[maxRegSize];
	unsigned char * restile7	= new unsigned char[maxRegSize];

	//AMX-TILE
	_tile_release();							//TILERELEASE
	_tile_loadconfig(&load_tilecfg);			//LDTILECFG
	_tile_loadd(0, tile0, 1);					//TILELOADD
	_tile_loadd(1, tile0, 4);					//TILELOADD
	_tile_loadd(2, tile0, 16);					//TILELOADD
	_tile_zero(3);								//TILEZERO
	_tile_zero(4);								//TILEZERO
	_tile_zero(5);								//TILEZERO
	_tile_stream_loadd(6, tile2, 2);			//TILELOADDT1
	_tile_stream_loadd(7, tile2, 4);			//TILELOADDT1
	//AMX-INT8
	_tile_dpbssd(2, 1, 0);						//TDPBSSD
	_tile_dpbsud(3, 2, 0);						//TDPBSUD
	_tile_dpbusd(4, 1, 0);						//TDPBUSD
	_tile_dpbuud(5, 1, 0);						//TDPBUUD
	//AMX-BF16
	_tile_dpbf16ps(0, 6, 7);					//TDPBF16PS
	//AMX-TILE
	_tile_stored(2, restile2, amxCols);			//TILESTORED
	_tile_stored(3, restile3, amxCols);			//TILESTORED
	_tile_stored(4, restile4, amxCols);			//TILESTORED
	_tile_stored(5, restile5, amxCols);			//TILESTORED
	_tile_stored(6, restile6, amxCols);			//TILESTORED
	_tile_storeconfig(&store_tilecfg);			//STTILECFG
	_tile_release();							//TILERELEASE
	//Intel SDE command line
	//sde -spr -debugtrace -start_extension AMX_TILE -stop_extension AVX512EVEX -- InstLatX64_Demo_AVX512_x64.exe
	_mm512_storeu_epi64(restile7, _mm512_loadu_epi64(&store_tilecfg));

	// Each buffer needs its own delete[]: "delete [] a, b, c;" is a comma
	// expression that frees only a and merely evaluates b and c.
	delete [] tile0;
	delete [] tile1;
	delete [] tile2;
	delete [] restile2;
	delete [] restile3;
	delete [] restile4;
	delete [] restile5;
	delete [] restile6;
	delete [] restile7;
#endif // defined (_M_X64)
#endif //_MSC_VER > 1927
}
--------------------------------------------------------------------------------
/InstLatX64_Demo.cpp:
--------------------------------------------------------------------------------
1 | // InstLatX64_Demo.cpp
2 | //
3 |
4 | #include "stdafx.h"
5 |
6 | using namespace std;
7 |
8 | const demoTypeList demos[] = {
9 | {"GFNI", "", DEMO_GFNI, FEAT_GFNI, true, GFNI_Demo, "SIMD byte granularity shifts/rotates, 8x8 bit, pospocnt, etc."},
10 | {"VPCLMULQDQ", "CLMUL", DEMO_VPCLMLQDQ, FEAT_CLMUL, true, VPCLMULQDQ_Demo, "SIMD prefix xor / parity"},
11 | #if defined (__AVX2__)
12 | {"AVX_VNNI_INT16_AddSubS", "VSAdd", DEMO_VNNI_SADD, FEAT_AVX_VNNI_INT16, true, AVX_VNNI_Saturated_AddSub_Demo, "_mm256_adds/subs_epi/epu/32 implementation"},
13 | #endif
14 | #if defined (_M_X64)
15 | #if defined (__AVX2__)
16 | {"P06P1", "P06P1", DEMO_P06P1, FEAT_BMI2, true, P0601_Test, "Golden Cove P06P1 anomaly"},
17 | {"PEXT_PDEP", "PEXT", DEMO_PEXT_PDEP_EMU, FEAT_BMI2, true, PEXT_PDEP_Emu_Test, "Fast GPR PEXT/PDEP instruction emulation for AMDs"},
18 | {"FirstByte", "", DEMO_FIRSTBBYTE, FEAT_AVX2, true, FirstByte_Demo, "Finding first byte in lanes"},
19 | #endif
20 | #if defined(__AVX512F__)
21 | {"Reduce_Add", "RAdd", DEMO_RADD, FEAT_AVX512VBMI, true, AVX512_Reduce_Add_Demo, "_mm512_reduce_add_epu8/16/32/64 implementation"},
22 | {"Saturated_AddSub", "SAdd", DEMO_AVX512_SADD, FEAT_AVX512F, true, AVX512_Saturated_AddSub_Demo, "_mm512_adds/subs_epi/epu/32/64 implementation"},
23 | {"KMemDst", "KMem", DEMO_KMEMDST, FEAT_AVX512F, true, AVX512_KMemDst_Demo, "AVX512 insts with masked memory destination"},
24 | {"Zen4", "Zen4", DEMO_ZEN4, FEAT_AVX512F, true, Zen4_Demo, "AMD Zen4 SIMD analysis"},
25 | {"Zen5", "Zen5", DEMO_ZEN5, FEAT_AVX512F, true, Zen5_Demo, "AMD Zen5 SIMD analysis"},
26 | {"Intrinsics", "Intrin", DEMO_INTRINSICS, FEAT_AVX512F, true, AVX512_InstrincTest, "Visual Studio Compiler Intrinsics Test"},
27 | {"VBMI2", "", DEMO_VBMI2, FEAT_AVX512_VBMI2, true, VBMI2_Demo, "SIMD variable rots and shifts for words and bytes"},
28 | {"Byte2Byte", "B2B", DEMO_BYTE2BYTE, FEAT_AVX512VBMI, true, Byte2ByteTest, "Fastest Byte2Byte SIMD replacemant"},
29 | {"LZCNT", "", DEMO_LZCNT, FEAT_AVX512_BITALG, true, LZCNT_Test, "Missing SIMD VPLZCNTB/W emulation"},
30 | {"TZCNT", "", DEMO_TZCNT, FEAT_AVX512_VPOPCNTDQ, true, TZCNT_Test, "Missing SIMD VPTZCNTB/W/D/Q emulation"},
31 | {"HWBITPERM", "HWB", DEMO_HWBITPERM, FEAT_AVX512BW, true, HWBITPERM_Test, "SVE2 vector BITPERM (BEXT/BDEP/BGRP) emulation with HW scalar BMI2 PEXT/PDEP instructions"},
32 | {"KMOV", "", DEMO_KMOV, FEAT_AVX512BW, true, Kmov_Test, "KMOV"},
33 | {"AMX", "", DEMO_AMX, FEAT_AMX_BF16, true, AMX_Test, "AMX 101"},
34 | {"AVX512_DecPrint", "Print", DEMO_AVX512_DECPRINT, FEAT_AVX512F, true, AVX512_DecimalPrint_Test, "AVX512F & AVX512_IFMA decimal print"},
35 | {"ByteShift", "BGVSER", DEMO_AVX512_BGVSER, FEAT_AVX512VBMI, true, AVX512_BGVSER_Test, "Byte-Granularity Variable Shift on Entire Register"},
36 | #endif
37 | #endif
38 | };
39 |
40 | Args args(demos, sizeof(demos) / sizeof(demoTypeList), __argc, __argv);
41 | CPU_Props cpu_props(args.GetXCR0());
42 |
43 | int main(void)
44 | {
45 | if (args.IsValid()) {
46 |
47 | if (args.IsVersion())
48 | args.PrintVersion();
49 |
50 | if (args.IsHelp())
51 | args.PrintUsage();
52 |
53 | if (args.IsDemoList()) {
54 | cout << endl << "Demo types:";
55 | for (uint32_t demo = 0; demo < sizeof(demos) / sizeof(demoTypeList); demo++) {
56 | cout << endl << setw(24) << demos[demo].demoName;
57 | if (_stricmp(demos[demo].alias, "") != 0)
58 | cout << " (alias:" << setw(6) << demos[demo].alias << ')';
59 | else
60 | cout << " ";
61 | cout << (demos[demo].publicFlag ? " [PUB] " : " ");
62 | if (_stricmp(demos[demo].comment, "") != 0)
63 | cout << '(' << demos[demo].comment << ')';
64 | }
65 | cout << endl;
66 | }
67 | bool fileRead = true;
68 | if (args.IsCPUIDFile()) {
69 | fileRead = cpu_props.GetFileCPUID(args.GetCPUIDFileName(), args.GetXCR0());
70 | }
71 | if (fileRead) {
72 | if (args.IsCPUProps()) {
73 | cpu_props.PrintFeats();
74 | //cpu_props.ForcedAVX512();
75 | cpu_props.PrintXCR0();
76 | }
77 |
78 | #if defined (_M_X64) && defined(__AVX512F__)
79 | if (args.Is_512bFMA_DP_Ports() && cpu_props.IsFeat(FEAT_AVX512F))
80 | cpu_props.Print_512bFMA_DP_Ports();
81 | #endif
82 | if (args.IsCPUIDDump())
83 | cpu_props.PrintCPUIDDump();
84 | if (args.IsProcMask())
85 | cpu_props.PrintHybridMasks();
86 |
87 | for (uint32_t demo = 0; demo <= args.GetMaxDemo(); demo++) {
88 | if (args.IsSelected(demo)) {
89 | cout << "===================================" << endl;
90 | cout << demos[demo].demoName << endl;
91 | if (cpu_props.IsFeat(demos[demo].feats)) {
92 | (demos[demo].func)();
93 | } else {
94 | cpu_props.PrintFeat(demos[demo].feats);
95 | cout << " unsupported." << endl;
96 | }
97 | }
98 | }
99 | } else {
100 | cout << "CPUID file open error: " << args.GetCPUIDFileName();
101 | }
102 | }
103 | return 0;
104 | }
105 |
106 |
--------------------------------------------------------------------------------
/FirstByte.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "FirstByte.h"
3 |
4 | extern CPU_Props cpu_props;
5 |
6 | using namespace std;
7 |
8 | //original article:
9 | //http://0x80.pl/notesen/2023-02-06-avx512-find-first-byte-in-lane.html
10 |
11 | __m256i _mm256_firstbyte_epu32(__m256i a, char c) {
12 | __m256i one = _mm256_set1_epi8(0x01);
13 | __m256i check = _mm256_set1_epi8(c);
14 | __m256i vnnibase = _mm256_set1_epi32(0x80808000);
15 | __m256i vnnimul = _mm256_set1_epi32(0x08040201);
16 | __m256i shufb_const = _mm256_broadcastsi128_si256(_mm_setr_epi64x(0x0300010002000100, 0x0400010002000100)); //first zero index
17 |
18 | __m256i xorres = _mm256_xor_si256(check, a);
19 | __m256i minub = _mm256_min_epu8(one, xorres);
20 |
21 | /* VPDPBUSD collects LSBs into the b[3:0] bitfield: */
22 | #if defined(__AVX512BW__)
23 | __m256i vnni = _mm256_dpbusd_epi32(vnnibase, vnnimul, minub);
24 | #else
25 | __m256i vnni = _mm256_dpbusd_avx_epi32(vnnibase, vnnimul, minub);
26 | #endif
27 | return _mm256_shuffle_epi8(shufb_const, vnni);
28 | }
29 |
30 | //credit: @dougallj
31 | //https://twitter.com/dougallj/status/1624663388856156160
32 |
// For every qword of a: index (0..7) of the lowest byte equal to c, or 8 when
// no byte matches.
__m256i _mm256_firstbyte_epu64(__m256i a, char c) {
	// 0xFF in every byte that equals c.
	const __m256i hits		= _mm256_cmpeq_epi8(_mm256_set1_epi8(c), a);

	// ~hits & (hits - 1): all bits below the lowest hit (all bits if there is none).
	const __m256i minusOne	= _mm256_sub_epi64(hits, _mm256_set1_epi64x(1));
	const __m256i below		= _mm256_andnot_si256(hits, minusOne);

	// One per byte lying below the first hit; SAD against zero counts them.
	const __m256i perByte	= _mm256_and_si256(below, _mm256_set1_epi8(0x01));
	return _mm256_sad_epu8(_mm256_setzero_si256(), perByte);
}
41 |
42 | #if defined(__AVX512BW__)
43 | __m512i _mm512_firstbyte_epu32(__m512i a, char c) {
44 | __m512i one = _mm512_set1_epi8(0x01);
45 | __m512i check = _mm512_set1_epi8(c);
46 | __m512i vnnibase = _mm512_set1_epi32(0x80808000);
47 | __m512i vnnimul = _mm512_set1_epi32(0x08040201);
48 | __m512i shufb_const = _mm512_broadcast_i64x2(_mm_setr_epi64x(0x0300010002000100, 0x0400010002000100)); //first zero index
49 |
50 | __m512i xorres = _mm512_xor_epi64(check, a);
51 | __m512i minub = _mm512_min_epu8(one, xorres);
52 |
53 | /* VPDPBUSD collects LSBs into the b[3:0] bitfield: */
54 |
55 | __m512i vnni = _mm512_dpbusd_epi32(vnnibase, vnnimul, minub);
56 | return _mm512_shuffle_epi8(shufb_const, vnni);
57 | }
58 |
// For every qword of a: the leading-zero count of a GF2P8AFFINE-transformed
// mismatch mask (built below).
// NOTE(review): presumably this equals the index of the lowest byte equal
// to c - confirm against the demo output.
__m512i _mm512_firstbyte_epu64(__m512i a, char c) {
	// 0x00 in every byte that equals c, 0x01 in every other byte.
	const __m512i mismatch	= _mm512_min_epu8(_mm512_set1_epi8(0x01), _mm512_xor_epi64(_mm512_set1_epi8(c), a));

	/* The GF2P8AFFINE step mirrors the bits of each qword through the	*/
	/* 07-16-25-34-43-52-61-70 diagonal axis:							*/
	/*																	*/
	/*		In[i,j] -> Out[7-j,7-i]										*/
	/*																	*/
	/*		In :	MSB	77 76 75 74 73 72 71 70							*/
	/*					67 66 65 64 63 62 61 60							*/
	/*					57 56 55 54 53 52 51 50							*/
	/*					47 46 45 44 43 42 41 40							*/
	/*					37 36 35 34 33 32 31 30							*/
	/*					27 26 25 24 23 22 21 20							*/
	/*					17 16 15 14 13 12 11 10							*/
	/*					07 06 05 04 03 02 01 00	LSB						*/
	/*																	*/
	/*		Out :	MSB	00 10 20 30 40 50 60 70							*/
	/*					01 11 21 31 41 51 61 71							*/
	/*					02 12 22 32 42 52 62 72							*/
	/*					03 13 23 33 43 53 63 73							*/
	/*					04 14 24 34 44 54 64 74							*/
	/*					05 15 25 35 45 55 65 75							*/
	/*					06 16 26 36 46 56 66 76							*/
	/*					07 17 27 37 47 57 67 77	LSB						*/

	// Mirror and invert: bits 70-60-50-40-30-20-10-00 end up in b[63:56];
	// imm8 0xff performs the inversion.
	const __m512i mirrorMatrix	= _mm512_set1_epi64(0x0102040810204080);
	const __m512i mirrored		= _mm512_gf2p8affine_epi64_epi8(mirrorMatrix, mismatch, 0xff);

	return _mm512_lzcnt_epi64(mirrored);
}
93 | #endif
94 |
95 | void FirstByte_Demo(void) { // prints sample vectors and the first-matching-byte result (search byte 0xbd) for each variant
96 | #if defined(__AVX2__)
97 | if (cpu_props.IsFeat(FEAT_AVX_VNNI)) { // runtime gate for the 256-bit dword variant
98 | __m256i dword_testcase = _mm256_set_epi32(0xfedcba98, 0x76543210, 0xbd000000, 0xbd0000, 0xbd00, 0xbd, 0, 0xbdbd);
99 | printRes32("DWORD Testcase :", dword_testcase);
100 | printRes32("_mm256_firstbyte_epu32 :", _mm256_firstbyte_epu32(dword_testcase, (char)0xbd));
101 | }
102 | __m256i qword_testcase = _mm256_set_epi64x(0, 0xbd00, 0xbdbdbd0000, 0xbdbdbdbd000000); // this variant runs without a runtime feature check
103 | printRes("QWORD Testcase :", qword_testcase);
104 | printRes("_mm256_firstbyte_epu64 :", _mm256_firstbyte_epu64(qword_testcase, (char)0xbd));
105 | #if defined(__AVX512BW__)
106 | if (cpu_props.IsFeat(FEAT_AVX512VNNI)) { // runtime gate for the 512-bit dword variant
107 | __m512i dword_testcase2 = _mm512_and_si512(_mm512_movm_epi8(0xfedcba9876543210), _mm512_set1_epi8((char)0xbd)); // 0xbd in each byte whose mask bit is set, 0 elsewhere
108 | printRes32("DWORD Testcase2 :", dword_testcase2);
109 | printRes32("_mm512_firstbyte_epu32 :", _mm512_firstbyte_epu32(dword_testcase2, (char)0xbd));
110 | }
111 | if (cpu_props.IsFeat(FEAT_GFNI)) { // runtime gate for the 512-bit qword variant
112 | __m512i qword_testcase2 = _mm512_setr_epi64(0xbd, 0xbd00, 0xbd0000, 0xbd000000, 0xbd00000000, 0xbd0000000000, 0xbd000000000000, 0xbd00000000000000); // 0xbd at byte 0, 1, ..., 7 of successive qwords
113 | printRes("QWORD Testcase2 :", qword_testcase2);
114 | printRes("_mm512_firstbyte_epu64 :", _mm512_firstbyte_epu64(qword_testcase2, (char)0xbd));
115 | }
116 | #endif
117 | #endif
118 | }
--------------------------------------------------------------------------------
/AVX_VNNI_INT16_Saturated_AddSub.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "AVX_VNNI_INT16_Saturated_AddSub.h"
3 |
4 | extern CPU_Props cpu_props;
5 |
6 | using namespace std;
7 |
8 | __m128i _mm_adds_epi32(__m128i a, __m128i b) { // 32-bit signed saturating add assembled from two saturating 16-bit dot-product steps
9 | __m128i movwhdup = _mm_set_epi32(0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302); // byte-shuffle control: upper word of each dword copied into both word slots
10 | __m128i _2x32768 = _mm_set1_epi32(0x80008000); // 0x8000 in both words
11 | __m128i one = _mm_set1_epi32(0x1); // low word 1, high word 0
12 | __m128i b_high = _mm_shuffle_epi8(b, movwhdup); // hi(b) duplicated in each dword
13 | __m128i temp = _mm_dpwusds_epi32(a, _2x32768, b_high); // step 1: a + 2 * 0x8000 * hi(b), saturating (presumably 0x8000 unsigned, hi(b) signed)
14 | __m128i zpn_one = _mm_sign_epi16(one, _mm_or_si128(b_high, one)); // low word becomes +1 or -1 after the sign of hi(b); the OR with 1 keeps it from becoming 0
15 | return _mm_dpwsuds_epi32(temp, zpn_one, b); // step 2: temp + (+/-1) * lo(b), saturating. NOTE(review): lo(b) is subtracted when hi(b) < 0 - confirm for negative b with a non-zero low word
16 | }
17 |
18 | __m128i _mm_subs_epi32(__m128i a, __m128i b) { // 32-bit signed saturating subtract assembled from two saturating 16-bit dot-product steps
19 | __m128i movwhdup = _mm_set_epi32(0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302); // byte-shuffle control: upper word of each dword copied into both word slots
20 | __m128i _m2x32768 = _mm_set1_epi32(0x80008000); // 0x8000 in both words; read as -32768 by the signed x signed step below
21 | __m128i one = _mm_set1_epi32(0x1); // low word 1, high word 0
22 | __m128i m_one = _mm_set1_epi32(0xffff); // low word -1, high word 0
23 | __m128i b_high = _mm_shuffle_epi8(b, movwhdup); // hi(b) duplicated in each dword
24 | __m128i temp = _mm_dpwssds_avx_epi32(a, _m2x32768, b_high); // step 1: a + 2 * (-32768) * hi(b), saturating
25 | __m128i zpn_one = _mm_sign_epi16(m_one, _mm_or_si128(b_high, one)); // low word becomes -1 when hi(b) >= 0 and +1 when hi(b) < 0
26 | return _mm_dpwsuds_epi32(temp, zpn_one, b); // step 2: temp + (-/+1) * lo(b), saturating. NOTE(review): lo(b) is added when hi(b) < 0 - confirm for negative b with a non-zero low word
27 | }
28 |
29 | __m128i _mm_adds_epu32(__m128i a, __m128i b) {
30 |     return _mm_add_epi32(b, _mm_min_epu32(_mm_xor_si128(_mm_cmpeq_epi32(b, b), b), a)); // clamp a to ~b (the room left above b), then add b
31 | }
32 |
33 | __m128i _mm_subs_epu32(__m128i a, __m128i b) {
34 |     return _mm_sub_epi32(_mm_max_epu32(b, a), b); // max(a, b) - b: zero whenever a <= b (unsigned)
35 | }
36 |
37 | __m256i _mm256_adds_epi32(__m256i a, __m256i b) { // 256-bit form: 32-bit signed saturating add from two saturating 16-bit dot-product steps
38 | __m256i movwhdup = _mm256_set_epi32(0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302, 0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302); // same per-128-bit shuffle control in both halves
39 | __m256i _2x32768 = _mm256_set1_epi32(0x80008000); // 0x8000 in both words
40 | __m256i one = _mm256_set1_epi32(0x1); // low word 1, high word 0
41 | __m256i b_high = _mm256_shuffle_epi8(b, movwhdup); // hi(b) duplicated in each dword
42 | __m256i temp = _mm256_dpwusds_epi32(a, _2x32768, b_high); // step 1: a + 2 * 0x8000 * hi(b), saturating (presumably 0x8000 unsigned, hi(b) signed)
43 | __m256i zpn_one = _mm256_sign_epi16(one, _mm256_or_si256(b_high, one)); // low word becomes +1 or -1 after the sign of hi(b); the OR with 1 keeps it from becoming 0
44 | return _mm256_dpwsuds_epi32(temp, zpn_one, b); // step 2: temp + (+/-1) * lo(b), saturating. NOTE(review): lo(b) is subtracted when hi(b) < 0 - confirm for negative b with a non-zero low word
45 | }
46 |
47 | __m256i _mm256_subs_epi32(__m256i a, __m256i b) { // 256-bit form: 32-bit signed saturating subtract from two saturating 16-bit dot-product steps
48 | __m256i movwhdup = _mm256_set_epi32(0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302, 0x0f0e0f0e, 0x0b0a0b0a, 0x07060706, 0x03020302); // same per-128-bit shuffle control in both halves
49 | __m256i _m2x32768 = _mm256_set1_epi32(0x80008000); // 0x8000 in both words; read as -32768 by the signed x signed step below
50 | __m256i one = _mm256_set1_epi32(0x1); // low word 1, high word 0
51 | __m256i m_one = _mm256_set1_epi32(0xffff); // low word -1, high word 0
52 | __m256i b_high = _mm256_shuffle_epi8(b, movwhdup); // hi(b) duplicated in each dword
53 | __m256i temp = _mm256_dpwssds_avx_epi32(a, _m2x32768, b_high); // step 1: a + 2 * (-32768) * hi(b), saturating
54 | __m256i zpn_one = _mm256_sign_epi16(m_one, _mm256_or_si256(b_high, one)); // low word becomes -1 when hi(b) >= 0 and +1 when hi(b) < 0
55 | return _mm256_dpwsuds_epi32(temp, zpn_one, b); // step 2: temp + (-/+1) * lo(b), saturating. NOTE(review): lo(b) is added when hi(b) < 0 - confirm for negative b with a non-zero low word
56 | }
57 |
58 | __m256i _mm256_adds_epu32(__m256i a, __m256i b) {
59 |     return _mm256_add_epi32(b, _mm256_min_epu32(_mm256_xor_si256(_mm256_cmpeq_epi32(b, b), b), a)); // clamp a to ~b (the room left above b), then add b
60 | }
61 |
62 | __m256i _mm256_subs_epu32(__m256i a, __m256i b) {
63 |     return _mm256_sub_epi32(_mm256_max_epu32(b, a), b); // max(a, b) - b: zero whenever a <= b (unsigned)
64 | }
65 |
66 | void AVX_VNNI_Saturated_AddSub_Demo(void) { // prints saturating add/sub results for boundary values plus one TSC-derived value
67 |     uint16_t x16 = __rdtsc() & 0xffff;
68 |     uint32_t x32 = (uint32_t)x16 * 0x10001u; // x16 in both halves; unsigned math, since int would overflow for x16 >= 0x8000
69 |     // 16-bit cases: a takes one value per group of four lanes, b cycles through {0, x16, SHRT_MIN, SHRT_MAX}.
70 |     __m256i testcases_a16 = _mm256_set_epi16(0, 0, 0, 0, x16, x16, x16, x16, SHRT_MIN, SHRT_MIN, SHRT_MIN, SHRT_MIN, SHRT_MAX, SHRT_MAX, SHRT_MAX, SHRT_MAX);
71 |     __m256i testcases_b16 = _mm256_set_epi16(0, x16, SHRT_MIN, SHRT_MAX, 0, x16, SHRT_MIN, SHRT_MAX, 0, x16, SHRT_MIN, SHRT_MAX, 0, x16, SHRT_MIN, SHRT_MAX);
72 |
73 |     printRes16("Testcases_a16 :", testcases_a16);
74 |     printRes16("Testcases_b16 :", testcases_b16);
75 |     printRes16("Saturated signed add epi16:", _mm256_adds_epi16(testcases_a16, testcases_b16));
76 |     printRes16("Saturated signed sub epi16:", _mm256_subs_epi16(testcases_a16, testcases_b16));
77 |     printRes16("Saturated unsigned add epu16:", _mm256_adds_epu16(testcases_a16, testcases_b16));
78 |     printRes16("Saturated unsigned sub epu16:", _mm256_subs_epu16(testcases_a16, testcases_b16));
79 |
80 |     // 32-bit cases use the int limits: the lanes are 32-bit, and long is wider than that on some targets.
81 |     __m256i testcases_a32_0 = _mm256_set_epi32(0, 0, 0, 0, x32, x32, x32, x32);
82 |     __m256i testcases_a32_1 = _mm256_set_epi32(INT_MIN, INT_MIN, INT_MIN, INT_MIN, INT_MAX, INT_MAX, INT_MAX, INT_MAX);
83 |     __m256i testcases_b32_0 = _mm256_set_epi32(0, x32, INT_MIN, INT_MAX, 0, x32, INT_MIN, INT_MAX);
84 |     __m256i testcases_b32_1 = _mm256_set_epi32(0, x32, INT_MIN, INT_MAX, 0, x32, INT_MIN, INT_MAX);
85 |
86 |     printRes32("Testcases_a32_0 :", testcases_a32_0);
87 |     printRes32("Testcases_a32_1 :", testcases_a32_1);
88 |     printRes32("Testcases_b32_0 :", testcases_b32_0);
89 |     printRes32("Testcases_b32_1 :", testcases_b32_1);
90 |     printRes32("Saturated signed add epi32:", _mm256_adds_epi32(testcases_a32_0, testcases_b32_0));
91 |     printRes32("Saturated signed add epi32:", _mm256_adds_epi32(testcases_a32_1, testcases_b32_1));
92 |     printRes32("Saturated signed sub epi32:", _mm256_subs_epi32(testcases_a32_0, testcases_b32_0));
93 |     printRes32("Saturated signed sub epi32:", _mm256_subs_epi32(testcases_a32_1, testcases_b32_1));
94 |     printRes32("Saturated unsigned add epu32:", _mm256_adds_epu32(testcases_a32_0, testcases_b32_0));
95 |     printRes32("Saturated unsigned add epu32:", _mm256_adds_epu32(testcases_a32_1, testcases_b32_1));
96 |     printRes32("Saturated unsigned sub epu32:", _mm256_subs_epu32(testcases_a32_0, testcases_b32_0));
97 |     printRes32("Saturated unsigned sub epu32:", _mm256_subs_epu32(testcases_a32_1, testcases_b32_1));
98 | }
--------------------------------------------------------------------------------
/Byte2Byte_Asm.asm:
--------------------------------------------------------------------------------
1 | .data
2 |
3 | ;ident mapping
4 | ;byteconst_00_3f dq 00706050403020100h, 00f0e0d0c0b0a0908h, 01716151413121110h, 01f1e1d1c1b1a1918h, 02726252423222120h, 02f2e2d2c2b2a2928h, 03736353433323130h, 03f3e3d3c3b3a3938h
5 | ;byteconst_40_7f dq 04746454443424140h, 04f4e4d4c4b4a4948h, 05756555453525150h, 05f5e5d5c5b5a5958h, 06766656463626160h, 06f6e6d6c6b6a6968h, 07776757473727170h, 07f7e7d7c7b7a7978h
6 | ;byteconst_80_bf dq 08786858483828180h, 08f8e8d8c8b8a8988h, 09796959493929190h, 09f9e9d9c9b9a9998h, 0a7a6a5a4a3a2a1a0h, 0afaeadacabaaa9a8h, 0b7b6b5b4b3b2b1b0h, 0bfbebdbcbbbab9b8h
7 | ;byteconst_c0_ff dq 0c7c6c5c4c3c2c1c0h, 0cfcecdcccbcac9c8h, 0d7d6d5d4d3d2d1d0h, 0dfdedddcdbdad9d8h, 0e7e6e5e4e3e2e1e0h, 0efeeedecebeae9e8h, 0f7f6f5f4f3f2f1f0h, 0fffefdfcfbfaf9f8h
8 |
9 | ;+1 mapping
10 | byteconst_00_3f dq 00807060504030201h, 0100f0e0d0c0b0a09h, 01817161514131211h, 0201f1e1d1c1b1a19h, 02827262524232221h, 0302f2e2d2c2b2a29h, 03837363534333231h, 0403f3e3d3c3b3a39h
11 | byteconst_40_7f dq 04847464544434241h, 0504f4e4d4c4b4a49h, 05857565554535251h, 0605f5e5d5c5b5a59h, 06867666564636261h, 0706f6e6d6c6b6a69h, 07877767574737271h, 0807f7e7d7c7b7a79h
12 | byteconst_80_bf dq 08887868584838281h, 0908f8e8d8c8b8a89h, 09897969594939291h, 0a09f9e9d9c9b9a99h, 0a8a7a6a5a4a3a2a1h, 0b0afaeadacabaaa9h, 0b8b7b6b5b4b3b2b1h, 0c0bfbebdbcbbbab9h
13 | byteconst_c0_ff dq 0c8c7c6c5c4c3c2c1h, 0d0cfcecdcccbcac9h, 0d8d7d6d5d4d3d2d1h, 0e0dfdedddcdbdad9h, 0e8e7e6e5e4e3e2e1h, 0f0efeeedecebeae9h, 0f8f7f6f5f4f3f2f1h, 000fffefdfcfbfaf9h
14 |
15 | inp0 dq 0f5e78b9234190de4h, 0b79b5e89124e4ca9h, 06549ba41bb976aa9h, 03566abb891220879h
16 | ;inp1 dq 04a2eff9876341568h, 03973abdeff67892ah, 0ce49735167564bdeh, 0b8790eff12537166h
17 | ; NOTE(review): B2B_WRAPPER loads a full zmmword (64 bytes) from inp0, but with inp1 commented out only 32 bytes are defined here; the load runs on into gfni_sra, lsb and whatever follows - confirm this is intended
18 | gfni_sra dq 08080808080808080h ; affine matrix with 0x80 in every row: each result bit is a copy of bit 7 of the input byte
19 | lsb dq 00101010101010101h ; bit 0 of every byte
20 | ; loop count used by every generated procedure
21 | repeats equ 1000000000
22 |
23 | .code
24 |
25 | MASKEDVPERMI2B MACRO LAT
26 | vpmovb2m k1, zmm0 ; 256-entry byte lookup via two zero-masked permutes; k1 = top bit of every input byte
27 | vmovdqa64 zmm2, zmm0 ; vpermi2b overwrites its index register, so work on copies
28 | vmovdqa64 zmm1, zmm0
29 | knotq k2, k1 ; k2 = bytes whose top bit is clear
30 | vpermi2b zmm2 {k1}{z}, zmm30, zmm31 ; bytes with top bit set: look up in zmm30:zmm31, zero the rest
31 | vpermi2b zmm1 {k2}{z}, zmm28, zmm29 ; bytes with top bit clear: look up in zmm28:zmm29, zero the rest
32 | IF LAT EQ 0 ; LAT = 0: result feeds the next iteration's input (dependency chain)
33 | vporq zmm0, zmm1, zmm2
34 | ELSE ; otherwise: result goes to zmm3, iterations stay independent
35 | vporq zmm3, zmm1, zmm2
36 | ENDIF
37 | ENDM
38 |
39 | KREGROUNDTRIP MACRO LAT
40 | vpmovb2m k0, zmm0 ; 256-entry byte lookup; the select mask takes a round trip through a k register. k0 = top bit of every byte
41 | vmovdqa64 zmm3, zmm0 ; vpermi2b overwrites its index register, so work on copies
42 | vmovdqa64 zmm1, zmm0
43 | vpermi2b zmm3, zmm28, zmm29 ; candidate from the zmm28:zmm29 tables
44 | vpermi2b zmm1, zmm30, zmm31 ; candidate from the zmm30:zmm31 tables
45 | vpmovm2b zmm2, k0 ; expand the mask back to 0x00 / 0xff bytes
46 | vpternlogq zmm3, zmm1, zmm2, 0d8h ;c?b:a - keep zmm1 where the byte mask is set, zmm3 elsewhere
47 | IF LAT EQ 0 ; LAT = 0: feed the result back as the next input (dependency chain)
48 | vmovdqa64 zmm0, zmm3
49 | ENDIF
50 | ENDM
51 |
52 | GFNI MACRO LAT
53 | vmovdqa64 zmm3, zmm0 ; 256-entry byte lookup; the select mask comes from an affine transform. vpermi2b overwrites its index, so copy
54 | vmovdqa64 zmm1, zmm0
55 | vmovdqa64 zmm2, zmm0
56 | vpermi2b zmm3, zmm28, zmm29 ; candidate from the zmm28:zmm29 tables
57 | vpermi2b zmm1, zmm30, zmm31 ; candidate from the zmm30:zmm31 tables
58 | vgf2p8affineqb zmm2, zmm2, zmm27, 0 ; zmm27 is set up by the wrapper from gfni_sra: spreads each byte's top bit over the whole byte
59 | vpternlogq zmm3, zmm1, zmm2, 0d8h ;c?b:a - keep zmm1 where the byte mask is set, zmm3 elsewhere
60 | IF LAT EQ 0 ; LAT = 0: feed the result back as the next input (dependency chain)
61 | vmovdqa64 zmm0, zmm3
62 | ENDIF
63 | ENDM
64 |
65 | SRLQ MACRO LAT
66 | vmovdqa64 zmm1, zmm0 ; 256-entry byte lookup; the select mask is built with shift / and / subtract. vpermi2b overwrites its index, so copy
67 | vpermi2b zmm1, zmm30, zmm31 ; candidate from the zmm30:zmm31 tables
68 | vpsrlq zmm2, zmm0, 7 ; moves each byte's top bit down to its bit 0 (neighbouring bits come along)
69 | vmovdqa64 zmm3, zmm0
70 | vpermi2b zmm3, zmm28, zmm29 ; candidate from the zmm28:zmm29 tables
71 | vpandq zmm2, zmm2, zmm27 ; zmm27 is set up by the wrapper from lsb: keep bit 0 only -> 0 or 1 per byte
72 | vpsubb zmm2, zmm26, zmm2 ; zmm26 is zeroed by the wrapper: 0 - {0,1} -> 0x00 or 0xff per byte
73 | vpternlogq zmm3, zmm1, zmm2, 0d8h ;c?b:a - keep zmm1 where the byte mask is set, zmm3 elsewhere
74 | IF LAT EQ 0 ; LAT = 0: feed the result back as the next input (dependency chain)
75 | vmovdqa64 zmm0, zmm3
76 | ENDIF
77 | ENDM
78 |
79 | BLENDMB MACRO LAT
80 | vpmovb2m k1, zmm0 ; 256-entry byte lookup; the two candidates are merged by a masked blend. k1 = top bit of every byte
81 | vmovdqa64 zmm2, zmm0 ; vpermi2b overwrites its index register, so work on copies
82 | vmovdqa64 zmm1, zmm0
83 | vpermi2b zmm2, zmm30, zmm31 ; candidate from the zmm30:zmm31 tables
84 | vpermi2b zmm1, zmm28, zmm29 ; candidate from the zmm28:zmm29 tables
85 | IF LAT EQ 0 ; LAT = 0: result feeds the next iteration's input (dependency chain)
86 | vpblendmb zmm0{k1}, zmm1, zmm2 ; zmm2 where k1 is set, zmm1 elsewhere
87 | ELSE ; otherwise: result goes to zmm3, iterations stay independent
88 | vpblendmb zmm3{k1}, zmm1, zmm2
89 | ENDIF
90 | ENDM
91 |
92 | MINMAX MACRO LAT
93 | vmovdqa64 zmm1, zmm0 ; 256-entry byte lookup; the select mask is built by signed clamping. vpermi2b overwrites its index, so copy
94 | vpermi2b zmm1, zmm30, zmm31 ; candidate from the zmm30:zmm31 tables
95 | vpmaxsb zmm2, zmm0, zmm27 ; zmm27 is set to all-ones (-1) by the wrapper: negative bytes become -1
96 | vmovdqa64 zmm3, zmm0
97 | vpermi2b zmm3, zmm28, zmm29 ; candidate from the zmm28:zmm29 tables
98 | vpminsb zmm2, zmm2, zmm26 ; zmm26 is zeroed by the wrapper: non-negative bytes become 0 -> 0xff / 0x00 mask
99 | vpternlogq zmm3, zmm1, zmm2, 0d8h ;c?b:a - keep zmm1 where the byte mask is set, zmm3 elsewhere
100 | IF LAT EQ 0 ; LAT = 0: feed the result back as the next input (dependency chain)
101 | vmovdqa64 zmm0, zmm3
102 | ENDIF
103 | ENDM
104 |
105 | B2B_WRAPPER MACRO FUNCNAME, M1, LAT
106 | FUNCNAME PROC ; generated procedure: runs macro M1 `repeats` times and returns the elapsed TSC count in rax
107 |     push rbx
108 |     push rdi
109 |     push rsi
110 |     ; Set up the helper constants that only some kernels read (zmm26 / zmm27).
111 |     IFIDNI <M1>, <GFNI>
112 |     vpbroadcastq zmm27, qword ptr [gfni_sra]
113 |     ELSEIFIDNI <M1>, <SRLQ>
114 |     vpxorq zmm26, zmm26, zmm26
115 |     vpbroadcastq zmm27, qword ptr [lsb]
116 |     ELSEIFIDNI <M1>, <MINMAX>
117 |     vpxorq zmm26, zmm26, zmm26
118 |     vpternlogq zmm27, zmm27, zmm27, 0ffh ; all-ones
119 |     ENDIF
120 |     vmovdqu64 zmm28, zmmword ptr [byteconst_00_3f] ; the four 64-byte pieces of the lookup table
121 |     vmovdqu64 zmm29, zmmword ptr [byteconst_40_7f]
122 |     vmovdqu64 zmm30, zmmword ptr [byteconst_80_bf]
123 |     vmovdqu64 zmm31, zmmword ptr [byteconst_c0_ff]
124 |     vmovdqu64 zmm0, zmmword ptr [inp0] ; input bytes for the kernel
125 |     IF LAT EQ 0
126 |     kxorq k0, k0, k0
127 |     vmovdqu64 zmm4, zmm0 ;equ test
128 |     ENDIF
129 |     mfence
130 |     rdtscp ; start timestamp in edx:eax (ecx is overwritten too)
131 |     lfence
132 |
133 |     mov esi, eax ; keep the start timestamp in esi / edi
134 |     mov edi, edx
135 |
136 |     mov ecx, repeats ; loaded after rdtscp because rdtscp writes ecx
137 |
138 | startlabel:
139 |     M1 LAT
140 |     dec ecx
141 |     jnz startlabel
142 |
143 |     mfence
144 |     rdtscp ; end timestamp in edx:eax
145 |     lfence
146 |
147 |     shl rdx, 20h ; rebuild both 64-bit timestamps from their 32-bit halves
148 |     shl rdi, 20h
149 |     or rax, rdx
150 |     or rsi, rdi
151 |
152 |     sub rax, rsi ; rax = end - start
153 |
154 |     IF LAT EQ 0
155 |     vpcmpeqb k0, zmm0, zmm4 ;equ test
156 |     ENDIF
157 |     pop rsi
158 |     pop rdi
159 |     pop rbx
160 |     ret
161 | FUNCNAME ENDP
162 | ENDM
163 | ; One pair of procedures per kernel: *_LAT passes LAT = 0 (result chained into the next input), *_TP passes LAT = 1
164 | B2B_WRAPPER B2B_MASKEDVPERMI2B_LAT, MASKEDVPERMI2B, 0
165 | B2B_WRAPPER B2B_MASKEDVPERMI2B_TP, MASKEDVPERMI2B, 1
166 |
167 | B2B_WRAPPER B2B_KREGROUNDTRIP_LAT, KREGROUNDTRIP, 0
168 | B2B_WRAPPER B2B_KREGROUNDTRIP_TP, KREGROUNDTRIP, 1
169 |
170 | B2B_WRAPPER B2B_GFNI_LAT, GFNI, 0
171 | B2B_WRAPPER B2B_GFNI_TP, GFNI, 1
172 |
173 | B2B_WRAPPER B2B_SRLQ_LAT, SRLQ, 0
174 | B2B_WRAPPER B2B_SRLQ_TP, SRLQ, 1
175 |
176 | B2B_WRAPPER B2B_BLENDMB_LAT, BLENDMB, 0
177 | B2B_WRAPPER B2B_BLENDMB_TP, BLENDMB, 1
178 |
179 | B2B_WRAPPER B2B_MINMAX_LAT, MINMAX, 0
180 | B2B_WRAPPER B2B_MINMAX_TP, MINMAX, 1
181 |
182 | end
--------------------------------------------------------------------------------
/Results/TZCNT_RKL.txt:
--------------------------------------------------------------------------------
1 | Vendor: "GenuineIntel"
2 | Family:6 Model:167 Stepping:1 (a0671)
3 | Brand: " 11th Gen Intel(R) Core(TM) i9-11900K @ 3.50GHz"
4 | 512b FPU DP ports : 1
5 | ---GPR----------
6 | RDTSC : supported
7 | RDTSCP : supported
8 | CMOV : supported
9 | CMPX8 : supported
10 | CMPX16 : supported
11 | AMD64 : supported
12 | LAHF : supported
13 | MOVBE : supported
14 | ABM : supported
15 | POPCNT : supported
16 | RDRAND : supported
17 | RDSEED : supported
18 | ADX : supported
19 | BMI : supported
20 | BMI2 : supported
21 | MOVDIRI : unsupported
22 | MOVDIR64B : unsupported
23 | ---SIMD---------
24 | SSE : supported
25 | SSE2 : supported
26 | SSE3 : supported
27 | SSSE3 : supported
28 | SSE41 : supported
29 | SSE42 : supported
30 | SSE4A : unsupported
31 | CLMUL : supported
32 | AES : supported
33 | SHA : supported
34 | AVX : supported, OS enabled
35 | AVX2 : supported, OS enabled
36 | FMA : supported, OS enabled
37 | F16C : supported, OS enabled
38 | GFNI : supported
39 | VAES : supported
40 | VPCLMULQDQ : supported
41 | KEYLOCK : unsupported
42 | AVX_VNNI : unsupported
43 | ---AVX512-------
44 | AVX512F : supported, OS enabled
45 | AVX512CD : supported, OS enabled
46 | AVX512ER : unsupported
47 | AVX512PF : unsupported
48 | AVX512BW : supported, OS enabled
49 | AVX512DQ : supported, OS enabled
50 | AVX512VL : supported, OS enabled
51 | AVX512VBMI : supported, OS enabled
52 | AVX512IFMA : supported, OS enabled
53 | AVX512VNNI : supported, OS enabled
54 | AVX512_4VNNIW : unsupported
55 | AVX512_4FMAPS : unsupported
56 | AVX512_VPOPCNTDQ : supported, OS enabled
57 | AVX512_BITALG : supported, OS enabled
58 | AVX512_VBMI2 : supported, OS enabled
59 | AVX512_BF16 : unsupported
60 | AVX512_VP2INTERSECT : unsupported
61 | AVX512_FP16 : unsupported
62 | ---AMX----------
63 | AMX-BF16 : unsupported
64 | AMX-INT8 : unsupported
65 | AMX-TILE : unsupported
66 | ---CacheLine----
67 | PREFETCHW : supported
68 | PREFETCHWT1 : unsupported
69 | CLFLUSH : supported
70 | CLFLUSHOPT : supported
71 | CLWB : unsupported
72 | CLZERO : unsupported
73 | CLDEMOTE : unsupported
74 | ---Misc---------
75 | LNOP : supported
76 | SERIALIZE : unsupported
77 | HYBRID : unsupported
78 | ---Deprecated---
79 | X87 : supported
80 | MMX : supported
81 | MMX+ : unsupported
82 | 3DNow! : unsupported
83 | 3DNow!+ : unsupported
84 | XOP : unsupported
85 | FMA4 : unsupported
86 | TBM : unsupported
87 | --- AVX512_BITALG & AVX512_VPOPCNTDQ SIMD TZCNT ---
88 | x128 :0000000100020004 1000200040008000
89 | _mm_tzcnt_epi8 :0808080008010802 0408050806080708
90 | _mm_tzcnt_epi16 :0010000000010002 000c000d000e000f
91 | _mm_tzcnt_epi32 :0000000000000002 0000000d0000000f
92 | _mm_tzcnt_epi64 :0000000000000002 000000000000000f
93 | x256 :0000000100020004 1000200040008000 7f007e007c007800 7000600040000000
94 | _mm256_tzcnt_epi8 :0808080008010802 0408050806080708 0008010802080308 0408050806080808
95 | _mm256_tzcnt_epi16 :0010000000010002 000c000d000e000f 00080009000a000b 000c000d000e0010
96 | _mm256_tzcnt_epi32 :0000000000000002 0000000d0000000f 000000090000000b 0000000d0000001e
97 | _mm256_tzcnt_epi64 :0000000000000002 000000000000000f 000000000000000b 000000000000001e
98 | x512 :0000000100020004 1000200040008000 7f007e007c007800 7000600000000000 fffffffefffcfff8 fff0ffe0ffc0ff80 ff00fe00fc00f800 f000e000c0008000
99 | _mm512_tzcnt_epi8 :0808080008010802 0408050806080708 0008010802080308 0408050808080808 0000000100020003 0004000500060007 0008010802080308 0408050806080708
100 | _mm512_tzcnt_epi16 :0010000000010002 000c000d000e000f 00080009000a000b 000c000d00100010 0000000100020003 0004000500060007 00080009000a000b 000c000d000e000f
101 | _mm512_tzcnt_epi32 :0000000000000002 0000000d0000000f 000000090000000b 0000000d00000020 0000000100000003 0000000500000007 000000090000000b 0000000d0000000f
102 | _mm512_tzcnt_epi64 :0000000000000002 000000000000000f 000000000000000b 000000000000002d 0000000000000003 0000000000000007 000000000000000b 000000000000000f
103 | TSC CLKs:----------------------
104 | _mm_tzcnt_epi8_asm :5.01382
105 | _mm_tzcnt_epi16_asm :5.00762
106 | _mm_tzcnt_epi32_asm :5.00959
107 | _mm_tzcnt_epi64_asm :5.01166
108 | _mm256_tzcnt_epi8_asm :5.00704
109 | _mm256_tzcnt_epi16_asm :5.00701
110 | _mm256_tzcnt_epi32_asm :5.00662
111 | _mm256_tzcnt_epi64_asm :5.009
112 | _mm512_tzcnt_epi8_asm :5.01387
113 | _mm512_tzcnt_epi16_asm :5.01192
114 | _mm512_tzcnt_epi32_asm :5.00803
115 | _mm512_tzcnt_epi64_asm :5.00779
116 | _mm256_tzcnt_epi32_lzcnt_asm :8.80157
117 | _mm512_tzcnt_epi32_lzcnt_asm :8.07228
118 | _mm256_tzcnt_epi64_lzcnt_asm :8.82087
119 | _mm512_tzcnt_epi64_lzcnt_asm :8.05901
120 | ===================================
121 |
--------------------------------------------------------------------------------
/Results/TZCNT_WLC.txt:
--------------------------------------------------------------------------------
1 | Vendor: "GenuineIntel"
2 | Family:6 Model:140 Stepping:1 (806c1)
3 | Brand: " 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz"
4 | 512b FPU DP ports : 1
5 | ---GPR----------
6 | RDTSC : supported
7 | RDTSCP : supported
8 | CMOV : supported
9 | CMPX8 : supported
10 | CMPX16 : supported
11 | AMD64 : supported
12 | LAHF : supported
13 | MOVBE : supported
14 | ABM : supported
15 | POPCNT : supported
16 | RDRAND : supported
17 | RDSEED : supported
18 | ADX : supported
19 | BMI : supported
20 | BMI2 : supported
21 | MOVDIRI : supported
22 | MOVDIR64B : supported
23 | ---SIMD---------
24 | SSE : supported
25 | SSE2 : supported
26 | SSE3 : supported
27 | SSSE3 : supported
28 | SSE41 : supported
29 | SSE42 : supported
30 | SSE4A : unsupported
31 | CLMUL : supported
32 | AES : supported
33 | SHA : supported
34 | AVX : supported, OS enabled
35 | AVX2 : supported, OS enabled
36 | FMA : supported, OS enabled
37 | F16C : supported, OS enabled
38 | GFNI : supported
39 | VAES : supported
40 | VPCLMULQDQ : supported
41 | KEYLOCK : supported, OS disabled
42 | AVX_VNNI : unsupported
43 | ---AVX512-------
44 | AVX512F : supported, OS enabled
45 | AVX512CD : supported, OS enabled
46 | AVX512ER : unsupported
47 | AVX512PF : unsupported
48 | AVX512BW : supported, OS enabled
49 | AVX512DQ : supported, OS enabled
50 | AVX512VL : supported, OS enabled
51 | AVX512VBMI : supported, OS enabled
52 | AVX512IFMA : supported, OS enabled
53 | AVX512VNNI : supported, OS enabled
54 | AVX512_4VNNIW : unsupported
55 | AVX512_4FMAPS : unsupported
56 | AVX512_VPOPCNTDQ : supported, OS enabled
57 | AVX512_BITALG : supported, OS enabled
58 | AVX512_VBMI2 : supported, OS enabled
59 | AVX512_BF16 : unsupported
60 | AVX512_VP2INTERSECT : supported, OS enabled
61 | AVX512_FP16 : unsupported
62 | ---AMX----------
63 | AMX-BF16 : unsupported
64 | AMX-INT8 : unsupported
65 | AMX-TILE : unsupported
66 | ---CacheLine----
67 | PREFETCHW : supported
68 | PREFETCHWT1 : unsupported
69 | CLFLUSH : supported
70 | CLFLUSHOPT : supported
71 | CLWB : supported
72 | CLZERO : unsupported
73 | CLDEMOTE : unsupported
74 | ---Misc---------
75 | LNOP : supported
76 | SERIALIZE : unsupported
77 | HYBRID : unsupported
78 | ---Deprecated---
79 | X87 : supported
80 | MMX : supported
81 | MMX+ : unsupported
82 | 3DNow! : unsupported
83 | 3DNow!+ : unsupported
84 | XOP : unsupported
85 | FMA4 : unsupported
86 | TBM : unsupported
87 | --- AVX512_BITALG & AVX512_VPOPCNTDQ SIMD TZCNT ---
88 | x128 :0000000100020004 1000200040008000
89 | _mm_tzcnt_epi8 :0808080008010802 0408050806080708
90 | _mm_tzcnt_epi16 :0010000000010002 000c000d000e000f
91 | _mm_tzcnt_epi32 :0000000000000002 0000000d0000000f
92 | _mm_tzcnt_epi64 :0000000000000002 000000000000000f
93 | x256 :0000000100020004 1000200040008000 7f007e007c007800 7000600040000000
94 | _mm256_tzcnt_epi8 :0808080008010802 0408050806080708 0008010802080308 0408050806080808
95 | _mm256_tzcnt_epi16 :0010000000010002 000c000d000e000f 00080009000a000b 000c000d000e0010
96 | _mm256_tzcnt_epi32 :0000000000000002 0000000d0000000f 000000090000000b 0000000d0000001e
97 | _mm256_tzcnt_epi64 :0000000000000002 000000000000000f 000000000000000b 000000000000001e
98 | x512 :0000000100020004 1000200040008000 7f007e007c007800 7000600000000000 fffffffefffcfff8 fff0ffe0ffc0ff80 ff00fe00fc00f800 f000e000c0008000
99 | _mm512_tzcnt_epi8 :0808080008010802 0408050806080708 0008010802080308 0408050808080808 0000000100020003 0004000500060007 0008010802080308 0408050806080708
100 | _mm512_tzcnt_epi16 :0010000000010002 000c000d000e000f 00080009000a000b 000c000d00100010 0000000100020003 0004000500060007 00080009000a000b 000c000d000e000f
101 | _mm512_tzcnt_epi32 :0000000000000002 0000000d0000000f 000000090000000b 0000000d00000020 0000000100000003 0000000500000007 000000090000000b 0000000d0000000f
102 | _mm512_tzcnt_epi64 :0000000000000002 000000000000000f 000000000000000b 000000000000002d 0000000000000003 0000000000000007 000000000000000b 000000000000000f
103 | TSC CLKs:----------------------
104 | _mm_tzcnt_epi8_asm :5.07324
105 | _mm_tzcnt_epi16_asm :5.06268
106 | _mm_tzcnt_epi32_asm :5.06161
107 | _mm_tzcnt_epi64_asm :5.06192
108 | _mm256_tzcnt_epi8_asm :5.06735
109 | _mm256_tzcnt_epi16_asm :5.06211
110 | _mm256_tzcnt_epi32_asm :5.06211
111 | _mm256_tzcnt_epi64_asm :5.05998
112 | _mm512_tzcnt_epi8_asm :5.06633
113 | _mm512_tzcnt_epi16_asm :5.06351
114 | _mm512_tzcnt_epi32_asm :5.0629
115 | _mm512_tzcnt_epi64_asm :5.06812
116 | _mm256_tzcnt_epi32_lzcnt_asm :8.92335
117 | _mm512_tzcnt_epi32_lzcnt_asm :8.19745
118 | _mm256_tzcnt_epi64_lzcnt_asm :8.92455
119 | _mm512_tzcnt_epi64_lzcnt_asm :8.18332
120 | ===================================
121 |
--------------------------------------------------------------------------------
/Misc.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 |
3 | using namespace std;
4 |
5 | void printRes8(const char* name, __m128i res) { // label padded to 24 columns, then every byte of res as 2-digit hex, last byte in memory first
6 |     cout << setw(24) << left << setfill(' ') << name;
7 |     for (size_t n = sizeof(res) / sizeof(uint8_t); n-- > 0; )
8 |         cout << hex << setw(2) << setfill('0') << right << +((const uint8_t*)&res)[n] << ' '; // unary + prints the byte as a number
9 |     cout << endl;
10 | }
11 |
12 | void printRes8(const char* name, __m256i res) { // 256-bit overload, same layout
13 |     cout << setw(24) << left << setfill(' ') << name;
14 |     for (size_t n = sizeof(res) / sizeof(uint8_t); n-- > 0; )
15 |         cout << hex << setw(2) << setfill('0') << right << +((const uint8_t*)&res)[n] << ' ';
16 |     cout << endl;
17 | }
18 |
19 | void printRes8(const char* name, __m512i res) { // 512-bit overload, same layout
20 |     cout << setw(24) << left << setfill(' ') << name;
21 |     for (size_t n = sizeof(res) / sizeof(uint8_t); n-- > 0; )
22 |         cout << hex << setw(2) << setfill('0') << right << +((const uint8_t*)&res)[n] << ' ';
23 |     cout << endl;
24 | }
25 |
26 | void printRes16(const char * name, __m128i res) { // label padded to 24 columns, then every 16-bit element as 4-digit hex, last element in memory first
27 |     cout << setw(24) << left << setfill(' ') << name;
28 |     for (size_t n = sizeof(res) / sizeof(short); n-- > 0; )
29 |         cout << hex << setw(4) << setfill('0') << right << ((const unsigned __int16*)&res)[n] << ' ';
30 |     cout << endl;
31 | }
32 |
33 | void printRes16(const char * name, __m256i res) { // 256-bit overload, same layout
34 |     cout << setw(24) << left << setfill(' ') << name;
35 |     for (size_t n = sizeof(res) / sizeof(short); n-- > 0; )
36 |         cout << hex << setw(4) << setfill('0') << right << ((const unsigned __int16*)&res)[n] << ' ';
37 |     cout << endl;
38 | }
39 |
40 | void printRes16(const char * name, __m512i res) { // 512-bit overload, same layout
41 |     cout << setw(24) << left << setfill(' ') << name;
42 |     for (size_t n = sizeof(res) / sizeof(short); n-- > 0; )
43 |         cout << hex << setw(4) << setfill('0') << right << ((const unsigned __int16*)&res)[n] << ' ';
44 |     cout << endl;
45 | }
46 |
47 | void printRes32(const char * name, __m128i res) { // label padded to 24 columns, then every 32-bit element as 8-digit hex, last element in memory first
48 |     cout << setw(24) << left << setfill(' ') << name;
49 |     for (int i = sizeof(__m128i) / sizeof(unsigned __int32) - 1; i >= 0; i--) // count derived from the element type actually read (long is not 32-bit everywhere)
50 |         cout << hex << setw(8) << setfill('0') << right << *((unsigned __int32*)&res + i) << ' ';
51 |     cout << endl;
52 | }
53 |
54 | void printRes32(const char * name, __m256i res) { // 256-bit overload, same layout
55 |     cout << setw(24) << left << setfill(' ') << name;
56 |     for (int i = sizeof(__m256i) / sizeof(unsigned __int32) - 1; i >= 0; i--) // count derived from the element type actually read
57 |         cout << hex << setw(8) << setfill('0') << right << *((unsigned __int32*)&res + i) << ' ';
58 |     cout << endl;
59 | }
60 |
61 | void printRes32(const char * name, __m512i res) { // 512-bit overload, same layout
62 |     cout << setw(24) << left << setfill(' ') << name;
63 |     for (int i = sizeof(__m512i) / sizeof(unsigned __int32) - 1; i >= 0; i--) // count derived from the element type actually read
64 |         cout << hex << setw(8) << setfill('0') << right << *((unsigned __int32*)&res + i) << ' ';
65 |     cout << endl;
66 | }
67 |
68 | void printRes(const char * name, __m128i res) { // label padded to 24 columns, then every 64-bit element as 16-digit hex, last element in memory first
69 |     cout << setw(24) << left << setfill(' ') << name;
70 |     for (size_t n = sizeof(res) / sizeof(long long); n-- > 0; )
71 |         cout << hex << setw(16) << setfill('0') << right << ((const unsigned __int64*)&res)[n] << ' ';
72 |     cout << endl;
73 | }
74 |
75 | void printRes(const char * name, __m256i res) { // 256-bit overload, same layout
76 |     cout << setw(24) << left << setfill(' ') << name;
77 |     for (size_t n = sizeof(res) / sizeof(long long); n-- > 0; )
78 |         cout << hex << setw(16) << setfill('0') << right << ((const unsigned __int64*)&res)[n] << ' ';
79 |     cout << endl;
80 | }
81 |
82 | void printRes(const char * name, __m512i res) { // 512-bit overload, same layout
83 |     cout << setw(24) << left << setfill(' ') << name;
84 |     for (size_t n = sizeof(res) / sizeof(long long); n-- > 0; )
85 |         cout << hex << setw(16) << setfill('0') << right << ((const unsigned __int64*)&res)[n] << ' ';
86 |     cout << endl;
87 | }
88 |
89 | void printRes(int r, const char * name, __m128i res) { // "r:" in decimal, label padded to 24 columns, then the 64-bit elements as 16-digit hex, last element first
90 |     cout << dec << setw(2) << r << ':' << setw(24) << left << setfill(' ') << name; // dec: the stream is left in hex by earlier prints
91 |     for (int i = sizeof(__m128i) / sizeof(long long) - 1; i >= 0; i--)
92 |         cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' ';
93 |     cout << endl;
94 | }
95 |
96 | void printRes(int r, const char * name, __m256i res) { // 256-bit overload, same layout
97 |     cout << dec << setw(2) << r << ':' << setw(24) << left << setfill(' ') << name; // dec: the stream is left in hex by earlier prints
98 |     for (int i = sizeof(__m256i) / sizeof(long long) - 1; i >= 0; i--)
99 |         cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' ';
100 |     cout << endl;
101 | }
102 |
103 | void printRes(int r, const char * name, __m512i res) { // 512-bit overload, same layout
104 |     cout << dec << setw(2) << r << ':' << setw(24) << left << setfill(' ') << name; // dec: the stream is left in hex by earlier prints
105 |     for (int i = sizeof(__m512i) / sizeof(long long) - 1; i >= 0; i--)
106 |         cout << hex << setw(16) << setfill('0') << right << *((unsigned __int64*)&res + i) << ' ';
107 |     cout << endl;
108 | }
109 |
110 | void printRes(int r, __m128i res) { // "r:" in decimal, then every 64-bit element as 16-digit hex, last element in memory first
111 |     cout << dec << setw(2) << r << ':';
112 |     for (size_t n = sizeof(res) / sizeof(long long); n-- > 0; )
113 |         cout << hex << setw(16) << setfill('0') << right << ((const unsigned __int64*)&res)[n] << ' ';
114 |     cout << endl;
115 | }
116 |
117 | void printRes(int r, __m256i res) { // 256-bit overload, same layout
118 |     cout << dec << setw(2) << r << ':';
119 |     for (size_t n = sizeof(res) / sizeof(long long); n-- > 0; )
120 |         cout << hex << setw(16) << setfill('0') << right << ((const unsigned __int64*)&res)[n] << ' ';
121 |     cout << endl;
122 | }
123 |
124 | void printRes(int r, __m512i res) { // 512-bit overload, same layout
125 |     cout << dec << setw(2) << r << ':';
126 |     for (size_t n = sizeof(res) / sizeof(long long); n-- > 0; )
127 |         cout << hex << setw(16) << setfill('0') << right << ((const unsigned __int64*)&res)[n] << ' ';
128 |     cout << endl;
129 | }
130 |
131 | // Intel SDM 32546276.pdf p.1739
132 | uint64_t serialized_tsc(void) {
133 |     // Reads the time-stamp counter with RDTSCP, fenced by MFENCE before and LFENCE after.
134 |     uint32_t aux_ignored; // receives RDTSCP's auxiliary value; never read
135 |     _mm_mfence();
136 |     const uint64_t stamp = __rdtscp(&aux_ignored);
137 |     _mm_lfence();
138 |     return stamp;
139 | }
140 |
141 | void random_wrap(unsigned int * random) {
142 |     while (_rdrand32_step(random) == 0) { /* retry until the step reports success */ }
143 | }
144 |
145 | void random_wrap(signed int * random) {
146 |     while (_rdrand32_step((unsigned int *)random) == 0) { /* retry until the step reports success */ }
147 | }
148 |
149 | #if defined (_M_X64)
150 | void random_wrap(unsigned long long * random) {
151 |     while (_rdrand64_step(random) == 0) { /* retry until the step reports success */ }
152 | }
153 |
154 | void random_wrap(signed long long * random) {
155 |     while (_rdrand64_step((unsigned long long *)random) == 0) { /* retry until the step reports success */ }
156 | }
157 | #endif
158 |
159 | void SetThread(size_t threadindex) {
160 |     const size_t affinity = (size_t)1 << threadindex; // single-bit mask selecting logical CPU number threadindex
161 |     cout << "Procmask:0x" << hex << setfill('0') << setw(2 * sizeof(affinity)) << right << affinity << dec << setfill(' ') << endl;
162 |     SetProcessAffinityMask(GetCurrentProcess(), affinity); // restrict the process,
163 |     SetThreadAffinityMask(GetCurrentThread(), affinity);   // then the calling thread, to that CPU
164 |     Sleep(0); // yield; presumably so the thread is rescheduled under the new mask
165 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # InstLatX64_Demo
2 |
3 | A collection of assorted ideas in source-code form
4 |
5 | ## GFNI_Demo.h -
6 | wrapper header for non-cryptographical use of (V)GF2P8AFFINEQB instruction in style of Intel intrinsics:
7 | * emulating the missing byte-granularity shift and rotate instructions;
8 | ```
9 | _(mm|mm256|mm512)(|_mask|_maskz)_(srli|srl|srai|sra|slli|sll|ror|rol)_gfni_epi8
10 | ```
11 | * variable versions also supported with GF2P8MULB instruction:
12 | ```
13 | _(mm|mm256|mm512)(|_mask|_maskz)_(srlv|sllv|rorv|rolv)_gfni_epi8
14 | ```
15 |
16 | * revbit, bit-broadcast, prefix-xor operations for bytes
17 | ```
18 | _(mm|mm256|mm512)(|_mask|_maskz)_(revbit|bcstbit|prefix_xor)_epi8
19 | ```
20 |
21 | * rotate, mirror, multiplication operations for 8x8 bit matrices
22 | ```
23 | _(mm|mm256|mm512)(|_mask|_maskz)_(mirror|rotate|multiplication)_8x8
24 | ```
25 |
26 | * auxiliary: the imm8 operand of (V)GF2P8AFFINEQB is XORed into the result bytes, so it is useful e.g. for inverting all of the above functions, or for broadcasting a compile-time-known byte without using GPRs, Port5 or memory
27 | ```
28 | _(mm|mm256|mm512)(|_mask|_maskz)_(inverse|set1_gfni)_epi8
29 | ```
30 |
31 | * entire register pospopcount (if AVX512_BITALG & AVX512_VPOPCNTDQ also supported):
32 | ```
33 | _(mm|mm256|mm512)_pospopcount_(u8|u16)_(si128|si256|si512)_epi8
34 | ```
35 |
36 | * tzcnt, lzcnt for bytes (idea of https://gist.github.com/animetosho/6cb732ccb5ecd86675ca0a442b3c0622)
37 | ```
38 | _(mm|mm256|mm512)(|_mask|_maskz)_(tzcnt|lzcnt)_gfni_epi8
39 | ```
40 |
41 | # VBMI2_Demo.h
42 | wrapper header for VPSHLDW/VPSHRDW/VPSHLDVW/VPSHRDVW instructions for substituting the missing VPROLW/VPRORW/VPROLVW/VPRORVW instructions with the good old shld r1, r1 = rol r1 trick
43 | ```
44 | _(mm|mm256|mm512)(|_mask|_maskz)_(ror|rol)_vbmi2_epi16
45 | ```
46 | wrapper header for emulating the missing byte-granularity shift and rotate instructions, in variable versions too
47 | ```
48 | _(mm|mm256|mm512)(|_mask|_maskz)_(slli|srli|srai|ror|rol)_vbmi2_epi8
49 | _(mm|mm256|mm512)(|_mask|_maskz)_(sllv|srlv|srav|rorv|rolv)_vbmi2_epi8
50 | ```
51 |
52 | # VPCLMULQDQ_Demo.h
53 | experimental implementation of entire register (128/256/512b, xmm/ymm/zmm) prefix-xor operation with the VPCLMULQDQ extension
54 | ```
55 | _mm_prefix_xor_clmul_si128(__m128i a);
56 | _mm256_prefix_xor_clmul_si256(__m256i a);
57 | _mm512_prefix_xor_clmul_si512(__m512i a);
58 | ```
59 |
60 | # Compiler_Intrinsic_Test.cpp
61 | for testing Visual Studio AVX512 capabilities
62 | # TZCNT_Demo.cpp
63 | Emulating the missing SIMD VPTZCNTB / VPTZCNTW / VPTZCNTD / VPTZCNTQ instructions
64 | # LZCNT_Demo.cpp
65 | Emulating the missing SIMD VPLZCNTB / VPLZCNTW instructions
66 | # PEXT_PDEP_Emu.cpp
67 | Faster PEXT and PDEP emulation for AMD Excavator/Zen/Zen+/Zen2 based on Zach Wegner's ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
68 | # CPU_Props.*
69 | detection of CPU properties for dispatching code paths
70 | # AVX512_DecimalPrint.*
71 | AVX512F, AVX512IFMA based implementation of _ultoa, _ltoa, _ui64toa, _i64toa functions.
72 | # AVX512_KMemDst.*
73 | code for examining the effect of the k mask register value on the EVEX-decoded instructions with memory destination
74 | # Zen4_Demo.*
75 | code for examining instructions on the AMD Zen4/Raphael CPU (CPUID A60F12). It is based on ideas from uops.info. Output example: \Results\Zen4_Demo_Imm8.txt
76 | # B2B_Demo.*
77 | VPERMI2B based code for fast any-to-any byte replacement. It can be useful e.g. for tolower/toupper type conversions or isxdigit/isalnum type classifications.
78 | [Performance results:](https://gist.github.com/InstLatx64/a5c60b714ef04ebe77f0b63639b36fd0)
79 | # AVX512_Reduce_Add.*
80 | (DB)SAD based _mm512_reduce_add_epu8/16/32/64 implementation
81 | # AVX512_Saturated_AddSub.*
82 | _mm512_adds/subs_epi/epu/32/64 implementation
83 | # FirstByte.*
84 | Finding first byte in lanes
85 | _mm256|512_firstbyte_epu32/64 implementation
86 | # HWBITPERM.*
87 | SVE2 vector BITPERM (BEXT/BDEP/BGRP) emulation with HW scalar BMI2 PEXT/PDEP instructions
88 | # AVX512_BGVSER.*
89 | Byte-Granularity Variable Shift on Entire Register
90 | ```
91 | _(mm256|mm512)_(bsll|bsrl)_epi(256|512) [placeholder]
92 | _(mm256|mm512)_palign(l|r)_epi(256|512)
93 | _(mm256|mm512)_rotate(l|r)_epi(256|512)
94 | ```
95 | # AVX_VNNI_INT16_Saturated_AddSub.*
96 | AVX_VNNI_INT16 based _(mm|mm256)_(adds|subs)_epi32 emulation proposal
97 | # P06P1.*
98 | Test code for an Intel Golden Cove / Raptor Cove / Redwood Cove / Lion Cove imm64-related anomaly
99 |
100 | ## References
101 | * Geoff Langdale [Why Ice Lake is Important (a bit-basher’s perspective)](https://branchfree.org/2019/05/29/why-ice-lake-is-important-a-bit-bashers-perspective/)
102 | * Marcus D. R. Klarqvist, Wojciech Muła, Daniel Lemire [Efficient Computation of Positional Population Counts Using SIMD Instructions](https://arxiv.org/abs/1911.02696)
103 | * Wojciech Muła [AVX512VBMI — remove spaces from text](http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html)
104 | * Zach Wegner [ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)](https://github.com/zwegner/zp7)
105 | * Abel, Andreas and Reineke, Jan [uops.info: Characterizing Latency, Throughput, and Port Usage of Instructions on Intel Microarchitectures](https://arxiv.org/pdf/1810.04610.pdf)
106 | * PerforatedBlob [TZCNT - TERNLOG->ANDN](https://twitter.com/PerforatedBlob/status/1418421045447454724)
107 | * TravisDowns [Scalar/HW GPR PDEP/PEXT reference code](https://twitter.com/trav_downs/status/1418616866080116742)
108 | * Daniel Lemire [Converting integers to decimal strings faster with AVX-512](https://lemire.me/blog/2022/03/28/converting-integers-to-decimal-strings-faster-with-avx-512/)
109 | * KMemDst results: [Intel SKX/CNL/TGL/RKL/ADL, AMD RPH](https://gist.github.com/InstLatx64/c7efbc71706561706888d7aa0548c4c5)
110 | * [Geoff Langdale's Byte2Byte question](https://twitter.com/geofflangdale/status/1406084804613861379)
111 | * [Geoff Langdale's reduce_add inspiration](https://twitter.com/geofflangdale/status/1609575574946865154)
112 | * A list of “out-of-band” uses for the GF2P8AFFINEQB instruction I haven't seen documented elsewhere: [idea of tzcnt/lzcnt_gfni_epi8, sllv/srlv_gfni_epi8](https://gist.github.com/animetosho/6cb732ccb5ecd86675ca0a442b3c0622)
113 | * [FirstByte inspiration](http://0x80.pl/notesen/2023-02-06-avx512-find-first-byte-in-lane.html)
114 | * Robert Clausecker [BGVSER inspiration](https://twitter.com/FUZxxl/status/1696448029358801311)
115 | * Tavian Barnes: [The Alder Lake anomaly, explained](https://tavianator.com/2025/shlxplained.html)
--------------------------------------------------------------------------------
/TZCNT_Demo.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "TZCNT_Demo.h"
3 | /*
4 | VPTZCNTB/W/D/Q proposal:
5 | - tzcnt(a) = popcnt(tzmsk(a)) = popcnt(~a & (a-1)) = popcnt(andn(a, a-1)) = popcnt(andn(a, a+(-1))) (thx, @PerforatedBlob! https://twitter.com/PerforatedBlob/status/1418421045447454724)
6 | - zero case handled
7 | POPCNT vs LZCNT:
8 | - Byte/Word support too, not just DWord/QWord
9 | - faster (const 5 vs 8/9 clks on TGL, RKL)
10 | - only 1 const
11 | */
12 |
13 | extern CPU_Props cpu_props;
14 |
15 | using namespace std;
16 |
17 | __m128i __vectorcall _mm_tzcnt_epi8(__m128i a) { // Trailing-zero count of every byte of a 128-bit vector; a zero byte yields 8.
18 | __m128i u = _mm_undefined_si128(); // contents do not matter: u is only compared with itself to produce all-ones (-1 per lane)
19 | return _mm_popcnt_epi8(_mm_andnot_si128(a, _mm_add_epi8(a, _mm_cmpeq_epi8(u, u)))); // popcnt(~a & (a - 1)): the mask has exactly the bits below the lowest set bit
20 | }
21 |
22 | __m256i __vectorcall _mm256_tzcnt_epi8(__m256i a) { // Trailing-zero count of every byte of a 256-bit vector; a zero byte yields 8.
23 | __m256i u = _mm256_undefined_si256(); // contents do not matter: u is only compared with itself to produce all-ones
24 | return _mm256_popcnt_epi8(_mm256_andnot_si256(a, _mm256_add_epi8(a, _mm256_cmpeq_epi8(u, u)))); // popcnt(~a & (a - 1))
25 | }
26 |
27 | __m512i __vectorcall _mm512_tzcnt_epi8(__m512i a) { // Trailing-zero count of every byte of a 512-bit vector; a zero byte yields 8.
28 | __m512i u = _mm512_undefined_epi32(); // contents do not matter: ternary-logic table 0xff returns all-ones for any inputs
29 | return _mm512_popcnt_epi8(_mm512_andnot_si512(a, _mm512_add_epi8(a, _mm512_ternarylogic_epi32(u, u, u, 0xff)))); // popcnt(~a & (a - 1))
30 | }
31 |
32 | __m128i __vectorcall _mm_tzcnt_epi16(__m128i a) { // Trailing-zero count of every 16-bit lane of a 128-bit vector; a zero lane yields 16.
33 | __m128i u = _mm_undefined_si128(); // contents do not matter: u is only compared with itself to produce all-ones
34 | return _mm_popcnt_epi16(_mm_andnot_si128(a, _mm_add_epi16(a, _mm_cmpeq_epi16(u, u)))); // popcnt(~a & (a - 1))
35 | }
36 |
37 | __m256i __vectorcall _mm256_tzcnt_epi16(__m256i a) { // Trailing-zero count of every 16-bit lane of a 256-bit vector; a zero lane yields 16.
38 | __m256i u = _mm256_undefined_si256(); // contents do not matter: u is only compared with itself to produce all-ones
39 | return _mm256_popcnt_epi16(_mm256_andnot_si256(a,_mm256_add_epi16(a, _mm256_cmpeq_epi16(u, u)))); // popcnt(~a & (a - 1))
40 | }
41 |
42 | __m512i __vectorcall _mm512_tzcnt_epi16(__m512i a) { // Trailing-zero count of every 16-bit lane of a 512-bit vector; a zero lane yields 16.
43 | __m512i u = _mm512_undefined_epi32(); // contents do not matter: ternary-logic table 0xff returns all-ones for any inputs
44 | return _mm512_popcnt_epi16(_mm512_andnot_si512(a, _mm512_add_epi16(a, _mm512_ternarylogic_epi32(u, u, u, 0xff)))); // popcnt(~a & (a - 1))
45 | }
46 |
47 | __m128i __vectorcall _mm_tzcnt_epi32(__m128i a) { // Trailing-zero count of every 32-bit lane of a 128-bit vector; a zero lane yields 32.
48 | __m128i u = _mm_undefined_si128(); // contents do not matter: u is only compared with itself to produce all-ones
49 | return _mm_popcnt_epi32(_mm_andnot_si128(a, _mm_add_epi32(a, _mm_cmpeq_epi32(u, u)))); // popcnt(~a & (a - 1))
50 | }
51 |
52 | __m256i __vectorcall _mm256_tzcnt_epi32(__m256i a) { // Trailing-zero count of every 32-bit lane of a 256-bit vector; a zero lane yields 32.
53 | __m256i u = _mm256_undefined_si256(); // contents do not matter: u is only compared with itself to produce all-ones
54 | return _mm256_popcnt_epi32(_mm256_andnot_si256(a, _mm256_add_epi32(a, _mm256_cmpeq_epi32(u, u)))); // popcnt(~a & (a - 1))
55 | }
56 |
57 | __m512i __vectorcall _mm512_tzcnt_epi32(__m512i a) { // Trailing-zero count of every 32-bit lane of a 512-bit vector; a zero lane yields 32.
58 | __m512i u = _mm512_undefined_epi32(); // contents do not matter: ternary-logic table 0xff returns all-ones for any inputs
59 | return _mm512_popcnt_epi32(_mm512_andnot_si512(a, _mm512_add_epi32(a, _mm512_ternarylogic_epi32(u, u, u, 0xff)))); // popcnt(~a & (a - 1))
60 | }
61 |
62 | __m128i __vectorcall _mm_tzcnt_epi64(__m128i a) { // Trailing-zero count of every 64-bit lane of a 128-bit vector; a zero lane yields 64.
63 | __m128i u = _mm_undefined_si128(); // contents do not matter: u is only compared with itself to produce all-ones
64 | return _mm_popcnt_epi64(_mm_andnot_si128(a, _mm_add_epi64(a, _mm_cmpeq_epi64(u, u)))); // popcnt(~a & (a - 1))
65 | }
66 |
67 | __m256i __vectorcall _mm256_tzcnt_epi64(__m256i a) { // Trailing-zero count of every 64-bit lane of a 256-bit vector; a zero lane yields 64.
68 | __m256i u = _mm256_undefined_si256(); // contents do not matter: u is only compared with itself to produce all-ones
69 | return _mm256_popcnt_epi64(_mm256_andnot_si256(a, _mm256_add_epi64(a, _mm256_cmpeq_epi64(u, u)))); // popcnt(~a & (a - 1))
70 | }
71 |
72 | __m512i __vectorcall _mm512_tzcnt_epi64(__m512i a) { // Trailing-zero count of every 64-bit lane of a 512-bit vector; a zero lane yields 64.
73 | __m512i u = _mm512_undefined_epi32(); // contents do not matter: ternary-logic table 0xff returns all-ones for any inputs
74 | return _mm512_popcnt_epi64(_mm512_andnot_si512(a, _mm512_add_epi64(a, _mm512_ternarylogic_epi64(u, u, u, 0xff)))); // popcnt(~a & (a - 1))
75 | }
76 |
77 | void TZCNT_Test(void) { // Prints three test vectors with the result of every C++ tzcnt wrapper, then the return value of each *_asm_timed routine divided by TZCNT_REPEATS.
78 | cout << "--- AVX512_BITALG & AVX512_VPOPCNTDQ SIMD TZCNT ---" << dec << right << endl;
79 | __m128i x128 = _mm_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768), y128 = _mm_set1_epi16(0x7f); // NOTE(review): y128 is never used in this function
80 | __m256i x256 = _mm256_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768, 0x07f00, 0x07e00, 0x07c00, 0x07800, 0x07000, 0x06000, 0x04000, 0x00000), y256 = _mm256_set1_epi16(0x7f); // NOTE(review): y256 is never used in this function
81 | __m512i x512 = _mm512_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768, 0x07f00, 0x07e00, 0x07c00, 0x07800, 0x07000, 0x06000, 0x00000, 0x00000, -1, -2, -4, -8, -16, -32, -64, -128, -256, -512, -1024, -2048, -4096, -8192, -16384, -32768), y512 = _mm512_set1_epi16(0x7f); // NOTE(review): y512 is never used in this function
82 | __mmask8 m8 = _INSTLATX64_DEMO_TESTMASK_8; // NOTE(review): m8 is never used in this function
83 | __mmask16 m16 = _INSTLATX64_DEMO_TESTMASK_16; // NOTE(review): m16 is never used in this function
84 |
85 | printRes("x128 :", x128); // 128-bit input, then the four lane widths applied to the same bits
86 | printRes("_mm_tzcnt_epi8 :", _mm_tzcnt_epi8(x128));
87 | printRes("_mm_tzcnt_epi16 :", _mm_tzcnt_epi16(x128));
88 | printRes("_mm_tzcnt_epi32 :", _mm_tzcnt_epi32(x128));
89 | printRes("_mm_tzcnt_epi64 :", _mm_tzcnt_epi64(x128));
90 |
91 | printRes("x256 :", x256); // 256-bit input and results
92 | printRes("_mm256_tzcnt_epi8 :", _mm256_tzcnt_epi8(x256));
93 | printRes("_mm256_tzcnt_epi16 :", _mm256_tzcnt_epi16(x256));
94 | printRes("_mm256_tzcnt_epi32 :", _mm256_tzcnt_epi32(x256));
95 | printRes("_mm256_tzcnt_epi64 :", _mm256_tzcnt_epi64(x256));
96 |
97 | printRes("x512 :", x512); // 512-bit input and results
98 | printRes("_mm512_tzcnt_epi8 :", _mm512_tzcnt_epi8(x512));
99 | printRes("_mm512_tzcnt_epi16 :", _mm512_tzcnt_epi16(x512));
100 | printRes("_mm512_tzcnt_epi32 :", _mm512_tzcnt_epi32(x512));
101 | printRes("_mm512_tzcnt_epi64 :", _mm512_tzcnt_epi64(x512));
102 |
103 | cout << "TSC CLKs:----------------------" << endl;
104 |
105 | cout << "_mm_tzcnt_epi8_asm :" << (double)_mm_tzcnt_epi8_asm_timed() / (double)TZCNT_REPEATS << endl; // each line: timed routine's return value divided by TZCNT_REPEATS (defined outside this file)
106 | cout << "_mm_tzcnt_epi16_asm :" << (double)_mm_tzcnt_epi16_asm_timed() / (double)TZCNT_REPEATS << endl;
107 | cout << "_mm_tzcnt_epi32_asm :" << (double)_mm_tzcnt_epi32_asm_timed() / (double)TZCNT_REPEATS << endl;
108 | cout << "_mm_tzcnt_epi64_asm :" << (double)_mm_tzcnt_epi64_asm_timed() / (double)TZCNT_REPEATS << endl;
109 |
110 | cout << "_mm256_tzcnt_epi8_asm :" << (double)_mm256_tzcnt_epi8_asm_timed() / (double)TZCNT_REPEATS << endl;
111 | cout << "_mm256_tzcnt_epi16_asm :" << (double)_mm256_tzcnt_epi16_asm_timed() / (double)TZCNT_REPEATS << endl;
112 | cout << "_mm256_tzcnt_epi32_asm :" << (double)_mm256_tzcnt_epi32_asm_timed() / (double)TZCNT_REPEATS << endl;
113 | cout << "_mm256_tzcnt_epi64_asm :" << (double)_mm256_tzcnt_epi64_asm_timed() / (double)TZCNT_REPEATS << endl;
114 |
115 | cout << "_mm512_tzcnt_epi8_asm :" << (double)_mm512_tzcnt_epi8_asm_timed() / (double)TZCNT_REPEATS << endl;
116 | cout << "_mm512_tzcnt_epi16_asm :" << (double)_mm512_tzcnt_epi16_asm_timed() / (double)TZCNT_REPEATS << endl;
117 | cout << "_mm512_tzcnt_epi32_asm :" << (double)_mm512_tzcnt_epi32_asm_timed() / (double)TZCNT_REPEATS << endl;
118 | cout << "_mm512_tzcnt_epi64_asm :" << (double)_mm512_tzcnt_epi64_asm_timed() / (double)TZCNT_REPEATS << endl;
119 |
120 | cout << "_mm_tzcnt_epi32_cd_asm :" << (double)_mm_tzcnt_epi32_cd_asm_timed() / (double)TZCNT_REPEATS << endl; // *_cd_* routines: the variants named after the CD (vplzcnt-based) path
121 | cout << "_mm256_tzcnt_epi32_cd_asm :" << (double)_mm256_tzcnt_epi32_cd_asm_timed() / (double)TZCNT_REPEATS << endl;
122 | cout << "_mm512_tzcnt_epi32_cd_asm :" << (double)_mm512_tzcnt_epi32_cd_asm_timed() / (double)TZCNT_REPEATS << endl;
123 | cout << "_mm_tzcnt_epi64_cd_asm :" << (double)_mm_tzcnt_epi64_cd_asm_timed() / (double)TZCNT_REPEATS << endl;
124 | cout << "_mm256_tzcnt_epi64_cd_asm :" << (double)_mm256_tzcnt_epi64_cd_asm_timed() / (double)TZCNT_REPEATS << endl;
125 | cout << "_mm512_tzcnt_epi64_cd_asm :" << (double)_mm512_tzcnt_epi64_cd_asm_timed() / (double)TZCNT_REPEATS << endl;
126 | }
127 |
--------------------------------------------------------------------------------
/Zen5_Demo_Imm8.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define ZEN5_FUNCDEF_I8(INST, OPERANDS, I8) \
4 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_##I8##_lat(void); \
5 | extern "C" unsigned __int64 __fastcall Zen5_##INST##_##OPERANDS##_##I8##_tp(void); // Declares the two external routines Zen5_<INST>_<OPERANDS>_<I8>_lat and ..._tp (no arguments, 64-bit unsigned result); presumably latency and throughput measurements - confirm against the asm side.
6 |
7 | #define ZEN5_FUNCDECL_I8(NAME, INST, OPERANDS, I8) \
8 | {#NAME, {\
9 | Zen5_##INST##_##OPERANDS##_##I8##_lat, \
10 | Zen5_##INST##_##OPERANDS##_##I8##_tp, \
11 | }}, // Expands to one brace-initializer entry {"NAME", {<..._lat>, <..._tp>}} followed by a comma; presumably an element of a name-to-routine-pair table defined elsewhere - confirm.
12 |
13 | ZEN5_FUNCDEF_I8(vextracti128, ymmI82xmm, 000h)
14 | ZEN5_FUNCDEF_I8(vextractf128, ymmI82xmm, 000h)
15 | ZEN5_FUNCDEF_I8(vextracti128, ymmI82xmm, 001h)
16 | ZEN5_FUNCDEF_I8(vextractf128, ymmI82xmm, 001h)
17 |
18 | ZEN5_FUNCDEF_I8(vextracti32x4, ymmi82xmm, 000h)
19 | ZEN5_FUNCDEF_I8(vextractf32x4, ymmi82xmm, 000h)
20 | ZEN5_FUNCDEF_I8(vextracti32x4, ymmi82xmm, 001h)
21 | ZEN5_FUNCDEF_I8(vextractf32x4, ymmi82xmm, 001h)
22 |
23 | ZEN5_FUNCDEF_I8(vextracti64x2, ymmi82xmm, 000h)
24 | ZEN5_FUNCDEF_I8(vextractf64x2, ymmi82xmm, 000h)
25 | ZEN5_FUNCDEF_I8(vextracti64x2, ymmi82xmm, 001h)
26 | ZEN5_FUNCDEF_I8(vextractf64x2, ymmi82xmm, 001h)
27 |
28 | ZEN5_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 000h)
29 | ZEN5_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 000h)
30 | ZEN5_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 001h)
31 | ZEN5_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 001h)
32 | ZEN5_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 002h)
33 | ZEN5_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 002h)
34 | ZEN5_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 003h)
35 | ZEN5_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 003h)
36 |
37 | ZEN5_FUNCDEF_I8(vextracti32x8, zmmi82ymm, 000h)
38 | ZEN5_FUNCDEF_I8(vextractf32x8, zmmi82ymm, 000h)
39 | ZEN5_FUNCDEF_I8(vextracti32x8, zmmi82ymm, 001h)
40 | ZEN5_FUNCDEF_I8(vextractf32x8, zmmi82ymm, 001h)
41 |
42 | ZEN5_FUNCDEF_I8(vinserti128, xmmymmI82ymm, 000h)
43 | ZEN5_FUNCDEF_I8(vinsertf128, xmmymmI82ymm, 000h)
44 | ZEN5_FUNCDEF_I8(vinserti128, xmmymmI82ymm, 001h)
45 | ZEN5_FUNCDEF_I8(vinsertf128, xmmymmI82ymm, 001h)
46 |
47 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmymmI82ymm, 000h)
48 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmymmI82ymm, 000h)
49 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmymmI82ymm, 001h)
50 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmymmI82ymm, 001h)
51 |
52 | ZEN5_FUNCDEF_I8(vinserti64x2, xmmymmI82ymm, 000h)
53 | ZEN5_FUNCDEF_I8(vinsertf64x2, xmmymmI82ymm, 000h)
54 | ZEN5_FUNCDEF_I8(vinserti64x2, xmmymmI82ymm, 001h)
55 | ZEN5_FUNCDEF_I8(vinsertf64x2, xmmymmI82ymm, 001h)
56 |
57 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 000h)
58 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 000h)
59 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 001h)
60 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 001h)
61 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 002h)
62 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 002h)
63 | ZEN5_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 003h)
64 | ZEN5_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 003h)
65 |
66 | ZEN5_FUNCDEF_I8(vinserti32x8, ymmzmmI82zmm, 000h)
67 | ZEN5_FUNCDEF_I8(vinsertf32x8, ymmzmmI82zmm, 000h)
68 | ZEN5_FUNCDEF_I8(vinserti32x8, ymmzmmI82zmm, 001h)
69 | ZEN5_FUNCDEF_I8(vinsertf32x8, ymmzmmI82zmm, 001h)
70 |
71 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 000h)
72 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 000h)
73 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 001h)
74 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 001h)
75 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 002h)
76 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 002h)
77 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 003h)
78 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 003h)
79 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 008h)
80 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 008h)
81 |
82 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 010h)
83 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 010h)
84 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 011h)
85 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 011h)
86 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 012h)
87 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 012h)
88 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 013h)
89 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 013h)
90 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 018h)
91 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 018h)
92 |
93 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 020h)
94 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 020h)
95 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 021h)
96 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 021h)
97 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 022h)
98 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 022h)
99 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 023h)
100 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 023h)
101 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 028h)
102 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 028h)
103 |
104 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 030h)
105 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 030h)
106 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 031h)
107 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 031h)
108 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 032h)
109 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 032h)
110 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 033h)
111 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 033h)
112 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 038h)
113 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 038h)
114 |
115 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 080h)
116 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 080h)
117 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 081h)
118 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 081h)
119 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 082h)
120 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 082h)
121 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 083h)
122 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 083h)
123 | ZEN5_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 088h)
124 | ZEN5_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 088h)
125 |
126 | ZEN5_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 000h)
127 | ZEN5_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 000h)
128 | ZEN5_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 001h)
129 | ZEN5_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 001h)
130 | ZEN5_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 002h)
131 | ZEN5_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 002h)
132 | ZEN5_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 003h)
133 | ZEN5_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 003h)
134 | ZEN5_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 000h)
135 | ZEN5_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 000h)
136 | ZEN5_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 001h)
137 | ZEN5_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 001h)
138 | ZEN5_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 002h)
139 | ZEN5_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 002h)
140 | ZEN5_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 003h)
141 | ZEN5_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 003h)
142 |
143 | ZEN5_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 000h)
144 | ZEN5_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 000h)
145 | ZEN5_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 044h)
146 | ZEN5_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 044h)
147 | ZEN5_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 0e4h)
148 | ZEN5_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 0e4h)
149 | ZEN5_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 0a5h)
150 | ZEN5_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 0a5h)
151 |
152 | ZEN5_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 000h)
153 | ZEN5_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 000h)
154 | ZEN5_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 044h)
155 | ZEN5_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 044h)
156 | ZEN5_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 0e4h)
157 | ZEN5_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 0e4h)
158 | ZEN5_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 0a5h)
159 | ZEN5_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 0a5h)
--------------------------------------------------------------------------------
/TZCNT_Demo_Asm.asm:
--------------------------------------------------------------------------------
1 | .data
2 |
3 | bytetest_00 db 001h, 002h, 004h, 008h, 010h, 020h, 040h, 080h ; byte test patterns; none of the data labels below is referenced by a macro or procedure in this file
4 | bytetest_01 db 0c0h, 0e0h, 0f0h, 0f8h, 0fch, 0feh, 0ffh, 07fh
5 | bytetest_02 db 03fh, 01fh, 00fh, 007h, 003h, 001h, 000h, 000h
6 | bytetest_03 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h
7 | bytetest_04 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h
8 | bytetest_05 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h
9 | bytetest_06 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h
10 | bytetest_07 db 000h, 000h, 000h, 000h, 000h, 000h, 000h, 000h
11 |
12 | wordtest_00 dw 00001h, 00002h, 00004h, 00008h, 00010h, 00020h, 00040h, 00080h ; word test patterns
13 | wordtest_01 dw 00100h, 00200h, 00400h, 00800h, 01000h, 02000h, 04000h, 08000h
14 | wordtest_02 dw 0ffffh, 0fffeh, 0fffch, 0fff8h, 0fff0h, 0ffe0h, 0ffc0h, 0ff80h
15 | wordtest_03 dw 0ff00h, 0fe00h, 0fc00h, 0f800h, 0f000h, 0e000h, 0c000h, 08000h
16 |
17 | dwordtest_00 dd 00000000h, 00000001h, 00000002h, 00000004h, 00000008h, 00000010h, 00000020h, 00000040h ; dword test patterns
18 | dwordtest_01 dd 00000080h, 00000100h, 00000200h, 00000400h, 10000000h, 20000000h, 40000000h, 80000000h
19 |
20 | qwordtest_00 dq 00000000h, 00000001h, 00000002h, 00000004h, 1000000000000000h, 2000000000000000h, 4000000000000000h, 8000000000000000h ; qword test patterns
21 |
22 | repeats equ 1000000h ; loop count used by TIMED (16,777,216 iterations); presumably has to equal TZCNT_REPEATS on the C++ side - confirm
23 |
24 | .code
25 |
26 | INIT macro VECREGSIZE, ISA ; Loads the constants CORE needs: register 1 = all-ones (-1 per lane); for ISA = CD also register 3 = 40h broadcast per qword.
27 | IFIDNI <VECREGSIZE>, <XMM> ; IFIDNI operands restored: they had been stripped, which left every conditional empty
28 | vpcmpeqw xmm1, xmm1, xmm1
29 | ELSEIFIDNI <VECREGSIZE>, <YMM>
30 | vpcmpeqb ymm1, ymm1, ymm1
31 | ELSEIFIDNI <VECREGSIZE>, <ZMM>
32 | vpternlogd zmm1, zmm1, zmm1, 0ffh
33 | ENDIF
34 | IFIDNI <ISA>, <CD> ; only the vplzcnt-based path subtracts from a constant
35 | mov eax, 040h ; NOTE(review): 40h is broadcast as a qword, so for EPI32 only the even dwords of register 3 hold 40h and the dword width would be 20h - confirm intent
36 | IFIDNI <VECREGSIZE>, <XMM>
37 | vpbroadcastq xmm3, rax
38 | ELSEIFIDNI <VECREGSIZE>, <YMM>
39 | vpbroadcastq ymm3, rax
40 | ELSEIFIDNI <VECREGSIZE>, <ZMM>
41 | vpbroadcastq zmm3, rax
42 | ENDIF
43 | ENDIF
44 | endm
45 |
46 | CORE macro DATA, VECREGSIZE, ISA ; Emits one tzcnt step on register 0 in place: mask = ~a & (a + register 1), then popcnt (BITALG) or register 3 - lzcnt (CD). Expects INIT to have run.
47 | IFIDNI <ISA>, <BITALG> ; IFIDNI operands restored: they had been stripped, which left every conditional empty
48 | IFIDNI <VECREGSIZE>, <XMM>
49 | IFIDNI <DATA>, <EPI8>
50 | vpaddb xmm2, xmm0, xmm1
51 | vpandnd xmm0, xmm0, xmm2
52 | vpopcntb xmm0, xmm0
53 | ELSEIFIDNI <DATA>, <EPI16>
54 | vpaddw xmm2, xmm0, xmm1
55 | vpandnd xmm0, xmm0, xmm2
56 | vpopcntw xmm0, xmm0
57 | ELSEIFIDNI <DATA>, <EPI32>
58 | vpaddd xmm2, xmm0, xmm1
59 | vpandnd xmm0, xmm0, xmm2
60 | vpopcntd xmm0, xmm0
61 | ELSEIFIDNI <DATA>, <EPI64>
62 | vpaddq xmm2, xmm0, xmm1
63 | vpandnq xmm0, xmm0, xmm2
64 | vpopcntq xmm0, xmm0
65 | ENDIF
66 | ELSEIFIDNI <VECREGSIZE>, <YMM>
67 | IFIDNI <DATA>, <EPI8>
68 | vpaddb ymm2, ymm0, ymm1
69 | vpandnd ymm0, ymm0, ymm2
70 | vpopcntb ymm0, ymm0
71 | ELSEIFIDNI <DATA>, <EPI16>
72 | vpaddw ymm2, ymm0, ymm1
73 | vpandnd ymm0, ymm0, ymm2
74 | vpopcntw ymm0, ymm0
75 | ELSEIFIDNI <DATA>, <EPI32>
76 | vpaddd ymm2, ymm0, ymm1
77 | vpandnd ymm0, ymm0, ymm2
78 | vpopcntd ymm0, ymm0
79 | ELSEIFIDNI <DATA>, <EPI64>
80 | vpaddq ymm2, ymm0, ymm1
81 | vpandnq ymm0, ymm0, ymm2
82 | vpopcntq ymm0, ymm0
83 | ENDIF
84 | ELSEIFIDNI <VECREGSIZE>, <ZMM>
85 | IFIDNI <DATA>, <EPI8>
86 | vpaddb zmm2, zmm0, zmm1
87 | vpandnd zmm0, zmm0, zmm2
88 | vpopcntb zmm0, zmm0
89 | ELSEIFIDNI <DATA>, <EPI16>
90 | vpaddw zmm2, zmm0, zmm1
91 | vpandnd zmm0, zmm0, zmm2
92 | vpopcntw zmm0, zmm0
93 | ELSEIFIDNI <DATA>, <EPI32>
94 | vpaddd zmm2, zmm0, zmm1
95 | vpandnd zmm0, zmm0, zmm2
96 | vpopcntd zmm0, zmm0
97 | ELSEIFIDNI <DATA>, <EPI64>
98 | vpaddq zmm2, zmm0, zmm1
99 | vpandnq zmm0, zmm0, zmm2
100 | vpopcntq zmm0, zmm0
101 | ENDIF
102 | ENDIF
103 | ELSEIFIDNI <ISA>, <CD> ; vplzcnt-based variant: result = register 3 - lzcnt(mask)
104 | IFIDNI <VECREGSIZE>, <XMM>
105 | IFIDNI <DATA>, <EPI32>
106 | vpaddd xmm2, xmm0, xmm1
107 | vpandnd xmm0, xmm0, xmm2
108 | vplzcntd xmm0, xmm0
109 | vpsubd xmm0, xmm3, xmm0
110 | ELSEIFIDNI <DATA>, <EPI64>
111 | vpaddq xmm2, xmm0, xmm1
112 | vpandnq xmm0, xmm0, xmm2
113 | vplzcntq xmm0, xmm0
114 | vpsubq xmm0, xmm3, xmm0
115 | ENDIF
116 | ELSEIFIDNI <VECREGSIZE>, <YMM>
117 | IFIDNI <DATA>, <EPI32>
118 | vpaddd ymm2, ymm0, ymm1
119 | vpandnd ymm0, ymm0, ymm2
120 | vplzcntd ymm0, ymm0
121 | vpsubd ymm0, ymm3, ymm0
122 | ELSEIFIDNI <DATA>, <EPI64>
123 | vpaddq ymm2, ymm0, ymm1
124 | vpandnq ymm0, ymm0, ymm2
125 | vplzcntq ymm0, ymm0
126 | vpsubq ymm0, ymm3, ymm0
127 | ENDIF
128 | ELSEIFIDNI <VECREGSIZE>, <ZMM>
129 | IFIDNI <DATA>, <EPI32>
130 | vpaddd zmm2, zmm0, zmm1
131 | vpandnd zmm0, zmm0, zmm2
132 | vplzcntd zmm0, zmm0
133 | vpsubd zmm0, zmm3, zmm0
134 | ELSEIFIDNI <DATA>, <EPI64>
135 | vpaddq zmm2, zmm0, zmm1
136 | vpandnq zmm0, zmm0, zmm2
137 | vplzcntq zmm0, zmm0
138 | vpsubq zmm0, zmm3, zmm0
139 | ENDIF
140 | ENDIF
141 | ENDIF
142 | endm
143 |
144 | TIMED macro PNAME, DATA, VECREGSIZE, ISA ; Emits procedure PNAME, which returns in rax the TSC difference measured around `repeats` iterations of CORE.
145 | PNAME proc
146 | push rbx ; saved and restored, although nothing in this macro writes rbx
147 | push rdi
148 | push rsi
149 |
150 | INIT VECREGSIZE, ISA ; constants are loaded before the timed region starts
151 |
152 | mfence
153 | rdtscp ; start timestamp in edx:eax
154 | lfence
155 |
156 | mov esi, eax ; keep start low half (upper half of rsi is zeroed by the 32-bit move)
157 | mov edi, edx ; keep start high half
158 |
159 | mov ecx, repeats ; loaded after rdtscp, which overwrites ecx
160 |
161 | align 16
162 | startlabel:
163 | CORE DATA, VECREGSIZE, ISA ; register 0 is not initialised in this procedure: each iteration consumes the previous iteration's result
164 |
165 | dec ecx
166 | jnz startlabel
167 |
168 | mfence
169 | rdtscp ; end timestamp in edx:eax
170 | lfence
171 |
172 | shl rdx, 20h ; combine both 32-bit halves into 64-bit values
173 | shl rdi, 20h
174 | or rax, rdx ; rax = end
175 | or rsi, rdi ; rsi = start
176 |
177 | sub rax, rsi ; return value = end - start
178 |
179 |
180 | pop rsi
181 | pop rdi
182 | pop rbx
183 | ret
184 | PNAME endp
185 | endm
186 |
187 | NAKED macro PNAME, DATA, VECREGSIZE, ISA ; Emits procedure PNAME that runs INIT and a single CORE step, with no prologue or timing; input and result are both in vector register 0.
188 | PNAME proc
189 |
190 | INIT VECREGSIZE, ISA ; overwrites register 1 (and register 3 for ISA = CD)
191 |
192 | CORE DATA, VECREGSIZE, ISA ; also overwrites register 2 as scratch
193 |
194 | ret
195 | PNAME endp
196 | endm
197 |
198 | TIMED _mm_tzcnt_epi8_asm_timed, EPI8, XMM, BITALG ; popcnt-based variants, 128-bit: timed procedures
199 | TIMED _mm_tzcnt_epi16_asm_timed, EPI16, XMM, BITALG
200 | TIMED _mm_tzcnt_epi32_asm_timed, EPI32, XMM, BITALG
201 | TIMED _mm_tzcnt_epi64_asm_timed, EPI64, XMM, BITALG
202 |
203 | NAKED _mm_tzcnt_epi8_asm@@16, EPI8, XMM, BITALG ; single-step procedures; the @@16 suffix is part of the exported symbol name
204 | NAKED _mm_tzcnt_epi16_asm@@16, EPI16, XMM, BITALG
205 | NAKED _mm_tzcnt_epi32_asm@@16, EPI32, XMM, BITALG
206 | NAKED _mm_tzcnt_epi64_asm@@16, EPI64, XMM, BITALG
207 |
208 | TIMED _mm256_tzcnt_epi8_asm_timed, EPI8, YMM, BITALG ; popcnt-based variants, 256-bit
209 | TIMED _mm256_tzcnt_epi16_asm_timed, EPI16, YMM, BITALG
210 | TIMED _mm256_tzcnt_epi32_asm_timed, EPI32, YMM, BITALG
211 | TIMED _mm256_tzcnt_epi64_asm_timed, EPI64, YMM, BITALG
212 |
213 | NAKED _mm256_tzcnt_epi8_asm@@16, EPI8, YMM, BITALG ; NOTE(review): the 256/512-bit symbols also carry @@16 - confirm this matches the decoration the C++ declarations produce
214 | NAKED _mm256_tzcnt_epi16_asm@@16, EPI16, YMM, BITALG
215 | NAKED _mm256_tzcnt_epi32_asm@@16, EPI32, YMM, BITALG
216 | NAKED _mm256_tzcnt_epi64_asm@@16, EPI64, YMM, BITALG
217 |
218 | TIMED _mm512_tzcnt_epi8_asm_timed, EPI8, ZMM, BITALG ; popcnt-based variants, 512-bit
219 | TIMED _mm512_tzcnt_epi16_asm_timed, EPI16, ZMM, BITALG
220 | TIMED _mm512_tzcnt_epi32_asm_timed, EPI32, ZMM, BITALG
221 | TIMED _mm512_tzcnt_epi64_asm_timed, EPI64, ZMM, BITALG
222 |
223 | NAKED _mm512_tzcnt_epi8_asm@@16, EPI8, ZMM, BITALG
224 | NAKED _mm512_tzcnt_epi16_asm@@16, EPI16, ZMM, BITALG
225 | NAKED _mm512_tzcnt_epi32_asm@@16, EPI32, ZMM, BITALG
226 | NAKED _mm512_tzcnt_epi64_asm@@16, EPI64, ZMM, BITALG
227 |
228 | TIMED _mm_tzcnt_epi32_cd_asm_timed, EPI32, XMM, CD ; vplzcnt-based variants (dword/qword only): timed procedures
229 | TIMED _mm_tzcnt_epi64_cd_asm_timed, EPI64, XMM, CD
230 | TIMED _mm256_tzcnt_epi32_cd_asm_timed, EPI32, YMM, CD
231 | TIMED _mm256_tzcnt_epi64_cd_asm_timed, EPI64, YMM, CD
232 | TIMED _mm512_tzcnt_epi32_cd_asm_timed, EPI32, ZMM, CD
233 | TIMED _mm512_tzcnt_epi64_cd_asm_timed, EPI64, ZMM, CD
234 |
235 | NAKED _mm_tzcnt_epi32_cd_asm@@16, EPI32, XMM, CD ; vplzcnt-based variants: single-step procedures
236 | NAKED _mm_tzcnt_epi64_cd_asm@@16, EPI64, XMM, CD
237 | NAKED _mm256_tzcnt_epi32_cd_asm@@16, EPI32, YMM, CD
238 | NAKED _mm256_tzcnt_epi64_cd_asm@@16, EPI64, YMM, CD
239 | NAKED _mm512_tzcnt_epi32_cd_asm@@16, EPI32, ZMM, CD
240 | NAKED _mm512_tzcnt_epi64_cd_asm@@16, EPI64, ZMM, CD
241 |
242 | end
--------------------------------------------------------------------------------
/LZCNT_Demo.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "LZCNT_Demo.h"
3 | #include "GFNI_Demo.h"
4 | #include "VBMI2_Demo.h"
5 | /*
6 | VPLZCNTB/W proposal:
7 | - Byte/Word support
8 | - BITALG lzcnt(a) = popcnt(tzmsk(bit_reverse(a)))
9 | - FP16 lzcnt(a) = min(16, 30 - (fp16)a.exp)
10 | - zero case handled
11 | */
12 |
13 | extern CPU_Props cpu_props;
14 |
15 | using namespace std;
16 |
17 | __m128i __vectorcall _mm_lzcnt_ild_epi8(__m128i a) { // Per-byte leading-zero count of a 128-bit vector, computed as a trailing-zero count of the bit-reversed input.
18 | __m128i u = _mm_undefined_si128(); // contents do not matter: u is only compared with itself to produce all-ones
19 | __m128i r = _mm_revbit_epi8(a); // helper declared outside this file; presumably reverses the bit order inside each byte - confirm
20 | return _mm_popcnt_epi8(_mm_andnot_si128(r, _mm_add_epi8(r, _mm_cmpeq_epi8(u, u)))); // popcnt(~r & (r - 1))
21 | }
22 |
23 | __m256i __vectorcall _mm256_lzcnt_ild_epi8(__m256i a) { // Per-byte leading-zero count of a 256-bit vector, computed as a trailing-zero count of the bit-reversed input.
24 | __m256i u = _mm256_undefined_si256(); // contents do not matter: u is only compared with itself to produce all-ones
25 | __m256i r = _mm256_revbit_epi8(a); // helper declared outside this file; presumably reverses the bit order inside each byte - confirm
26 | return _mm256_popcnt_epi8(_mm256_andnot_si256(r, _mm256_add_epi8(r, _mm256_cmpeq_epi8(u, u)))); // popcnt(~r & (r - 1))
27 | }
28 |
29 | __m512i __vectorcall _mm512_lzcnt_ild_epi8(__m512i a) { // Per-byte leading-zero count of a 512-bit vector, computed as a trailing-zero count of the bit-reversed input.
30 | __m512i u = _mm512_undefined_epi32(); // contents do not matter: ternary-logic table 0xff returns all-ones for any inputs
31 | __m512i r = _mm512_revbit_epi8(a); // helper declared outside this file; presumably reverses the bit order inside each byte - confirm
32 | return _mm512_popcnt_epi8(_mm512_andnot_si512(r, _mm512_add_epi8(r, _mm512_ternarylogic_epi32(u, u, u, 0xff)))); // popcnt(~r & (r - 1))
33 | }
34 |
35 | __m128i __vectorcall _mm_lzcnt_ild_epi16(__m128i a) { // Per-word leading-zero count of a 128-bit vector, computed as a 16-bit trailing-zero count of the reversed input.
36 | __m128i u = _mm_undefined_si128(); // contents do not matter: u is only compared with itself to produce all-ones
37 | __m128i r = _mm_revbit_epi8(_mm_swaplh_epi8(a)); // both helpers are declared outside this file; presumably byte swap within each word plus per-byte bit reversal = full 16-bit reversal - confirm
38 | return _mm_popcnt_epi16(_mm_andnot_si128(r, _mm_add_epi16(r, _mm_cmpeq_epi16(u, u)))); // popcnt(~r & (r - 1))
39 | }
40 |
41 | __m256i __vectorcall _mm256_lzcnt_ild_epi16(__m256i a) { // Per-word leading-zero count of a 256-bit vector, computed as a 16-bit trailing-zero count of the reversed input.
42 | __m256i u = _mm256_undefined_si256(); // contents do not matter: u is only compared with itself to produce all-ones
43 | __m256i r = _mm256_revbit_epi8(_mm256_swaplh_epi8(a)); // both helpers are declared outside this file; presumably byte swap within each word plus per-byte bit reversal = full 16-bit reversal - confirm
44 | return _mm256_popcnt_epi16(_mm256_andnot_si256(r,_mm256_add_epi16(r, _mm256_cmpeq_epi16(u, u)))); // popcnt(~r & (r - 1))
45 | }
46 |
47 | __m512i __vectorcall _mm512_lzcnt_ild_epi16(__m512i a) { // Per-word leading-zero count of a 512-bit vector, computed as a 16-bit trailing-zero count of the reversed input.
48 | __m512i u = _mm512_undefined_epi32(); // contents do not matter: ternary-logic table 0xff returns all-ones for any inputs
49 | __m512i r = _mm512_revbit_epi8(_mm512_swaplh_epi8(a)); // both helpers are declared outside this file; presumably byte swap within each word plus per-byte bit reversal = full 16-bit reversal - confirm
50 | return _mm512_popcnt_epi16(_mm512_andnot_si512(r, _mm512_add_epi16(r, _mm512_ternarylogic_epi32(u, u, u, 0xff)))); // popcnt(~r & (r - 1))
51 | }
52 |
53 | __m128i __vectorcall _mm_lzcnt_fp16_epi16(__m128i a) { // Per-word leading-zero count of a 128-bit vector via the FP16 exponent: min(16, 30 - biased exponent of (fp16)a); a zero word yields 16.
54 | return _mm_min_epi16(_mm_sub_epi16(_mm_set1_epi16(0x1e), _mm_srli_epi16(_mm_cvtepu16_ph(_mm_andnot_si128(_mm_srli_epi16(a, 1), a)), 10)), _mm_set1_epi16(0x10)); // a & ~(a >> 1) keeps the top set bit but leaves no adjacent ones, so round-to-nearest cannot carry into the next exponent (e.g. 0x7fff no longer converts to 32768)
55 | }
56 |
57 | __m256i __vectorcall _mm256_lzcnt_fp16_epi16(__m256i a) { // Per-word leading-zero count of a 256-bit vector via the FP16 exponent: min(16, 30 - biased exponent of (fp16)a); a zero word yields 16.
58 | return _mm256_min_epi16(_mm256_sub_epi16(_mm256_set1_epi16(0x1e), _mm256_srli_epi16(_mm256_cvtepu16_ph(_mm256_andnot_si256(_mm256_srli_epi16(a, 1), a)), 10)), _mm256_set1_epi16(0x10)); // a & ~(a >> 1) keeps the top set bit but leaves no adjacent ones, so round-to-nearest cannot carry into the next exponent (e.g. 0x7fff no longer converts to 32768)
59 | }
60 |
61 | __m512i __vectorcall _mm512_lzcnt_fp16_epi16(__m512i a) { // Per-word leading-zero count of a 512-bit vector via the FP16 exponent: min(16, 30 - biased exponent of (fp16)a); a zero word yields 16.
62 | return _mm512_min_epi16(_mm512_sub_epi16(_mm512_set1_epi16(0x1e), _mm512_srli_epi16(_mm512_cvtepu16_ph(_mm512_andnot_si512(_mm512_srli_epi16(a, 1), a)), 10)), _mm512_set1_epi16(0x10)); // a & ~(a >> 1) keeps the top set bit but leaves no adjacent ones, so round-to-nearest cannot carry into the next exponent (e.g. 0x7fff no longer converts to 32768)
63 | }
64 |
65 | void LZCNT_Test(void) {
66 | cout << "--- AVX512_BITALG & AVX512_FP16 SIMD LZCNTB/W ---" << dec << right << endl;
67 | __m128i x128 = _mm_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768);
68 | __m256i x256 = _mm256_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768, 0x07f00, 0x07e00, 0x07c00, 0x07800, 0x07000, 0x06000, 0x04000, 0x00000);
69 | __m512i x512 = _mm512_set_epi16(0x0000, 0x0001, 0x0002, 0x0004, 0x1000, 0x2000, 0x4000, -32768, 0x07f00, 0x07e00, 0x07c00, 0x07800, 0x07000, 0x06000, 0x00000, 0x00000, (short)(1 << 15), 1 << 14, 1 << 13, 1 << 12, 1 << 11, 1 << 10, 1 << 9, 1 << 8, 1 << 7, 1 << 6, 1 << 5, 1 << 4, 1 << 3, 1 << 2, 1 << 1, 1 << 0);
70 |
71 | printRes("x128 :", x128);
72 | #if (_MSC_VER >= 1944)
73 | printRes("_mm_lzcnt_epi8 :", _mm_lzcnt_epi8(x128));
74 | #endif
75 | printRes("_mm_lzcnt_ild_epi8 :", _mm_lzcnt_ild_epi8(x128));
76 | printRes("_mm_lzcnt_epi8_asm :", _mm_lzcnt_epi8_asm(x128));
77 | printRes("_mm_lzcnt_gfni_epi8 :", _mm_lzcnt_gfni_epi8(x128));
78 | printRes("_mm_lzcnt_gfni_epi8_asm :", _mm_lzcnt_gfni_epi8_asm(x128));
79 | #if (_MSC_VER >= 1944)
80 | printRes("_mm_lzcnt_epi16 :", _mm_lzcnt_epi16(x128));
81 | #endif
82 | printRes("_mm_lzcnt_ild_epi16 :", _mm_lzcnt_ild_epi16(x128));
83 | printRes("_mm_lzcnt_epi16_asm :", _mm_lzcnt_epi16_asm(x128));
84 | if (cpu_props.IsFeat(FEAT_AVX512_FP16)) {
85 | printRes("_mm_lzcnt_fp16_epi16 :", _mm_lzcnt_fp16_epi16(x128));
86 | printRes("_mm_lzcnt_fp16_epi16_asm :", _mm_lzcnt_fp16_epi16_asm(x128));
87 | }
88 |
89 | printRes("x256 :", x256);
90 | #if (_MSC_VER >= 1944)
91 | printRes("_mm256_lzcnt_epi8 :", _mm256_lzcnt_epi8(x256));
92 | #endif
93 | printRes("_mm256_lzcnt_ild_epi8 :", _mm256_lzcnt_ild_epi8(x256));
94 | printRes("_mm256_lzcnt_epi8_asm :", _mm256_lzcnt_epi8_asm(x256));
95 | printRes("_mm256_lzcnt_gfni_epi8 :", _mm256_lzcnt_gfni_epi8(x256));
96 | printRes("_mm256_lzcnt_gfni_epi8_asm :", _mm256_lzcnt_gfni_epi8_asm(x256));
97 | #if (_MSC_VER >= 1944)
98 | printRes("_mm256_lzcnt_epi16 :", _mm256_lzcnt_epi16(x256));
99 | #endif
100 | printRes("_mm256_lzcnt_ild_epi16 :", _mm256_lzcnt_ild_epi16(x256));
101 | printRes("_mm256_lzcnt_epi16_asm :", _mm256_lzcnt_epi16_asm(x256));
102 | if (cpu_props.IsFeat(FEAT_AVX512_FP16)) {
103 | printRes("_mm256_lzcnt_fp16_epi16 :", _mm256_lzcnt_fp16_epi16(x256));
104 | printRes("_mm256_lzcnt_fp16_epi16_asm :", _mm256_lzcnt_fp16_epi16_asm(x256));
105 | }
106 |
107 | printRes("x512 :", x512);
108 | #if (_MSC_VER >= 1944)
109 | printRes("_mm512_lzcnt_epi8 :", _mm512_lzcnt_epi8(x512));
110 | #endif
111 | printRes("_mm512_lzcnt_ild_epi8 :", _mm512_lzcnt_ild_epi8(x512));
112 | printRes("_mm512_lzcnt_epi8 :", _mm512_lzcnt_epi8_asm(x512));
113 | printRes("_mm512_lzcnt_gfni_epi8 :", _mm512_lzcnt_gfni_epi8(x512));
114 | printRes("_mm512_lzcnt_gfni_epi8 :", _mm512_lzcnt_gfni_epi8_asm(x512));
115 | #if (_MSC_VER >= 1944)
116 | printRes("_mm512_lzcnt_epi16 :", _mm512_lzcnt_epi16(x512));
117 | #endif
118 | printRes("_mm512_lzcnt_ild_epi16 :", _mm512_lzcnt_ild_epi16(x512));
119 | printRes("_mm512_lzcnt_epi16_asm :", _mm512_lzcnt_epi16_asm(x512));
120 | if (cpu_props.IsFeat(FEAT_AVX512_FP16)) {
121 | printRes("_mm512_lzcnt_fp16_epi16 :", _mm512_lzcnt_fp16_epi16(x512));
122 | printRes("_mm512_lzcnt_fp16_epi16_asm :", _mm512_lzcnt_fp16_epi16_asm(x512));
123 | }
124 |
125 | cout << "TSC CLKs:----------------------" << endl;
126 |
127 | cout << "_mm_lzcnt_epi8_asm :" << (double)_mm_lzcnt_epi8_asm_timed() / (double)LZCNT_REPEATS << endl;
128 | cout << "_mm_lzcnt_gfni_epi8_asm :" << (double)_mm_lzcnt_gfni_epi8_asm_timed() / (double)LZCNT_REPEATS << endl;
129 | cout << "_mm256_lzcnt_epi8_asm :" << (double)_mm256_lzcnt_epi8_asm_timed() / (double)LZCNT_REPEATS << endl;
130 | cout << "_mm256_lzcnt_gfni_epi8_asm :" << (double)_mm256_lzcnt_gfni_epi8_asm_timed() / (double)LZCNT_REPEATS << endl;
131 | cout << "_mm512_lzcnt_epi8_asm :" << (double)_mm512_lzcnt_epi8_asm_timed() / (double)LZCNT_REPEATS << endl;
132 | cout << "_mm512_lzcnt_gfni_epi8_asm :" << (double)_mm512_lzcnt_gfni_epi8_asm_timed() / (double)LZCNT_REPEATS << endl;
133 |
134 | cout << "_mm_lzcnt_epi16_asm :" << (double)_mm_lzcnt_epi16_asm_timed() / (double)LZCNT_REPEATS << endl;
135 | if (cpu_props.IsFeat(FEAT_AVX512_FP16))
136 | cout << "_mm_lzcnt_fp16_epi16_asm :" << (double)_mm_lzcnt_fp16_epi16_asm_timed() / (double)LZCNT_REPEATS << endl;
137 |
138 | cout << "_mm256_lzcnt_epi16_asm :" << (double)_mm256_lzcnt_epi16_asm_timed() / (double)LZCNT_REPEATS << endl;
139 | if (cpu_props.IsFeat(FEAT_AVX512_FP16))
140 | cout << "_mm256_lzcnt_fp16_epi16_asm :" << (double)_mm256_lzcnt_fp16_epi16_asm_timed() / (double)LZCNT_REPEATS << endl;
141 |
142 | cout << "_mm512_lzcnt_epi16_asm :" << (double)_mm512_lzcnt_epi16_asm_timed() / (double)LZCNT_REPEATS << endl;
143 | if (cpu_props.IsFeat(FEAT_AVX512_FP16))
144 | cout << "_mm512_lzcnt_fp16_epi16_asm :" << (double)_mm512_lzcnt_fp16_epi16_asm_timed() / (double)LZCNT_REPEATS << endl;
145 | }
146 |
--------------------------------------------------------------------------------
/LZCNT_Demo_Asm.asm:
--------------------------------------------------------------------------------
1 | .data
2 |
3 | gfni_revbit dq 08040201008040201h ; GF2P8AFFINEQB matrix that reverses the bit order within each byte
4 | gfni_tzcnt dq 0aaccf0ff00000000h ; GF2P8AFFINEQB matrix: one-hot byte -> index of its set bit; with imm8 08h a zero byte yields 8
5 |
6 | fp16_expbias dd 001e001eh ; packed words 001eh; in this file only the commented-out loads in INIT16 mention it
7 | max_lzcntw dd 00100010h ; packed words 0010h; in this file only the commented-out loads in INIT16 mention it
8 |
9 | repeats equ 1000000h ; iteration count of the timed loop in TIMED (16,777,216)
10 |
11 | .code
12 |
13 | INIT8 macro VECREGSIZE, ISA ; constants for CORE8: reg1 = all ones, reg2 = bit-reversal matrix, reg3 (GFNI only) = bit-index matrix
14 | IFIDNI <ISA>, <BITALG> ; ISA / VECREGSIZE are compared as text against the tags passed by TIMED / NAKED
15 |     IFIDNI <VECREGSIZE>, <XMM>
16 |         vpcmpeqb xmm1, xmm1, xmm1 ; every byte = 0ffh (-1)
17 |         vmovddup xmm2, qword ptr [gfni_revbit] ; duplicate the 64-bit matrix into both qwords
18 |     ELSEIFIDNI <VECREGSIZE>, <YMM>
19 |         vpcmpeqb ymm1, ymm1, ymm1
20 |         vpbroadcastq ymm2, qword ptr [gfni_revbit]
21 |     ELSEIFIDNI <VECREGSIZE>, <ZMM>
22 |         vpternlogd zmm1, zmm1, zmm1, 0ffh ; truth table 0ffh sets every bit: all-ones for zmm
23 |         vpbroadcastq zmm2, qword ptr [gfni_revbit]
24 |     ENDIF
25 | ELSEIFIDNI <ISA>, <GFNI>
26 |     IFIDNI <VECREGSIZE>, <XMM>
27 |         vpcmpeqb xmm1, xmm1, xmm1 ; every byte = 0ffh (-1)
28 |         vmovddup xmm2, qword ptr [gfni_revbit]
29 |         vmovddup xmm3, qword ptr [gfni_tzcnt] ; second matrix, consumed by the final affine step of CORE8
30 |     ELSEIFIDNI <VECREGSIZE>, <YMM>
31 |         vpcmpeqb ymm1, ymm1, ymm1
32 |         vpbroadcastq ymm2, qword ptr [gfni_revbit]
33 |         vpbroadcastq ymm3, qword ptr [gfni_tzcnt]
34 |     ELSEIFIDNI <VECREGSIZE>, <ZMM>
35 |         vpternlogd zmm1, zmm1, zmm1, 0ffh
36 |         vpbroadcastq zmm2, qword ptr [gfni_revbit]
37 |         vpbroadcastq zmm3, qword ptr [gfni_tzcnt]
38 |     ENDIF
39 | ENDIF
40 | endm
41 |
42 | CORE8 macro VECREGSIZE, ISA ; per-byte leading-zero count of reg0, in place; expects the registers set up by INIT8
43 | IFIDNI <ISA>, <BITALG> ; variant ending in a per-byte popcount
44 |     IFIDNI <VECREGSIZE>, <XMM>
45 |         vgf2p8affineqb xmm0, xmm0, xmm2, 0 ; r = x with the bits of each byte reversed
46 |         vpaddb xmm3, xmm0, xmm1 ; r - 1 (adds -1 per byte)
47 |         vpandn xmm0, xmm0, xmm3 ; ~r & (r - 1): ones exactly below the lowest set bit of r
48 |         vpopcntb xmm0, xmm0 ; their count = trailing zeros of r = leading zeros of x (8 for x = 0)
49 |     ELSEIFIDNI <VECREGSIZE>, <YMM>
50 |         vgf2p8affineqb ymm0, ymm0, ymm2, 0
51 |         vpaddb ymm3, ymm0, ymm1
52 |         vpandn ymm0, ymm0, ymm3
53 |         vpopcntb ymm0, ymm0
54 |     ELSEIFIDNI <VECREGSIZE>, <ZMM>
55 |         vgf2p8affineqb zmm0, zmm0, zmm2, 0
56 |         vpaddb zmm3, zmm0, zmm1
57 |         vpandnq zmm0, zmm0, zmm3
58 |         vpopcntb zmm0, zmm0
59 |     ENDIF
60 | ELSEIFIDNI <ISA>, <GFNI> ; variant using only affine transforms, no popcount
61 |     IFIDNI <VECREGSIZE>, <XMM>
62 |         vgf2p8affineqb xmm0, xmm0, xmm2, 0 ; r = x with the bits of each byte reversed
63 |         vpaddb xmm4, xmm0, xmm1 ; r - 1
64 |         vpandn xmm0, xmm4, xmm0 ; ~(r - 1) & r: isolates the lowest set bit of r
65 |         vgf2p8affineqb xmm0, xmm0, xmm3, 08h ; one-hot byte -> bit index; imm8 08h turns an all-zero byte into 8
66 |     ELSEIFIDNI <VECREGSIZE>, <YMM>
67 |         vgf2p8affineqb ymm0, ymm0, ymm2, 0
68 |         vpaddb ymm4, ymm0, ymm1
69 |         vpandn ymm0, ymm4, ymm0
70 |         vgf2p8affineqb ymm0, ymm0, ymm3, 08h
71 |     ELSEIFIDNI <VECREGSIZE>, <ZMM>
72 |         vgf2p8affineqb zmm0, zmm0, zmm2, 0
73 |         vpaddb zmm4, zmm0, zmm1
74 |         vpandnq zmm0, zmm4, zmm0
75 |         vgf2p8affineqb zmm0, zmm0, zmm3, 08h
76 |     ENDIF
77 | ENDIF
78 | endm
79 |
80 | INIT16 macro VECREGSIZE, ISA ; constants for CORE16; BITALG: reg1 = all ones, reg2 = bit-reversal matrix; FP16: reg1 = words 001eh, reg2 = words 0010h
81 | IFIDNI <ISA>, <BITALG> ; ISA / VECREGSIZE are compared as text against the tags passed by TIMED / NAKED
82 |     IFIDNI <VECREGSIZE>, <XMM>
83 |         vpcmpeqw xmm1, xmm1, xmm1 ; every bit set (-1 per word)
84 |         vmovddup xmm2, qword ptr [gfni_revbit]
85 |     ELSEIFIDNI <VECREGSIZE>, <YMM>
86 |         vpcmpeqb ymm1, ymm1, ymm1 ; byte-wise compare gives the same all-ones pattern
87 |         vpbroadcastq ymm2, qword ptr [gfni_revbit]
88 |     ELSEIFIDNI <VECREGSIZE>, <ZMM>
89 |         vpternlogd zmm1, zmm1, zmm1, 0ffh ; truth table 0ffh sets every bit
90 |         vpbroadcastq zmm2, qword ptr [gfni_revbit]
91 |     ENDIF
92 | ELSEIFIDNI <ISA>, <FP16> ; both constants are synthesized in registers instead of loaded from memory
93 |     IFIDNI <VECREGSIZE>, <XMM>
94 |         ;vpbroadcastd xmm1, dword ptr [fp16_expbias] ; memory-load alternative, left disabled
95 |         ;vpbroadcastd xmm2, dword ptr [max_lzcntw] ; memory-load alternative, left disabled
96 |         vpxor xmm2, xmm2, xmm2 ; zero source and zero matrix for the two affine ops below
97 |         vgf2p8affineqb xmm1, xmm2, xmm2, 0f0h ;001eh const gen: zero product xor imm8 -> every byte 0f0h
98 |         vgf2p8affineqb xmm2, xmm2, xmm2, 080h ;0010h const gen: every byte 080h
99 |         vpsrlw xmm1, xmm1, 0bh ;001eh (0f0f0h >> 11)
100 |         vpsrlw xmm2, xmm2, 0bh ;0010h (08080h >> 11)
101 |     ELSEIFIDNI <VECREGSIZE>, <YMM>
102 |         ;vpbroadcastd ymm1, dword ptr [fp16_expbias]
103 |         ;vpbroadcastd ymm2, dword ptr [max_lzcntw]
104 |         vpxor xmm2, xmm2, xmm2 ; VEX-encoded write to xmm2 also clears the upper bits of ymm2/zmm2
105 |         vgf2p8affineqb ymm1, ymm2, ymm2, 0f0h ;001eh const gen
106 |         vgf2p8affineqb ymm2, ymm2, ymm2, 080h ;0010h const gen
107 |         vpsrlw ymm1, ymm1, 0bh ;001eh
108 |         vpsrlw ymm2, ymm2, 0bh ;0010h
109 |     ELSEIFIDNI <VECREGSIZE>, <ZMM>
110 |         ;vpbroadcastd zmm1, dword ptr [fp16_expbias]
111 |         ;vpbroadcastd zmm2, dword ptr [max_lzcntw]
112 |         vpxor xmm2, xmm2, xmm2 ; VEX-encoded write to xmm2 also clears the upper bits of ymm2/zmm2
113 |         vgf2p8affineqb zmm1, zmm2, zmm2, 0f0h ;001eh const gen
114 |         vgf2p8affineqb zmm2, zmm2, zmm2, 080h ;0010h const gen
115 |         vpsrlw zmm1, zmm1, 0bh ;001eh
116 |         vpsrlw zmm2, zmm2, 0bh ;0010h
117 |     ENDIF
118 | ENDIF
119 | endm
120 |
121 | CORE16 macro VECREGSIZE, ISA ; per-word leading-zero count of reg0, in place; expects the registers set up by INIT16
122 | IFIDNI <ISA>, <BITALG> ; variant ending in a per-word popcount
123 |     IFIDNI <VECREGSIZE>, <XMM>
124 |         vpshldw xmm0, xmm0, xmm0, 8 ; rotate each word by 8 = swap its two bytes
125 |         vgf2p8affineqb xmm0, xmm0, xmm2, 0 ; reverse bits inside each byte -> full 16-bit bit reversal r
126 |         vpaddw xmm3, xmm0, xmm1 ; r - 1 (adds -1 per word)
127 |         vpandn xmm0, xmm0, xmm3 ; ~r & (r - 1): ones exactly below the lowest set bit of r
128 |         vpopcntw xmm0, xmm0 ; their count = leading zeros of the original word (16 for 0)
129 |     ELSEIFIDNI <VECREGSIZE>, <YMM>
130 |         vpshldw ymm0, ymm0, ymm0, 8
131 |         vgf2p8affineqb ymm0, ymm0, ymm2, 0
132 |         vpaddw ymm3, ymm0, ymm1
133 |         vpandn ymm0, ymm0, ymm3
134 |         vpopcntw ymm0, ymm0
135 |     ELSEIFIDNI <VECREGSIZE>, <ZMM>
136 |         vpshldw zmm0, zmm0, zmm0, 8
137 |         vgf2p8affineqb zmm0, zmm0, zmm2, 0
138 |         vpaddw zmm3, zmm0, zmm1
139 |         vpandnd zmm0, zmm0, zmm3
140 |         vpopcntw zmm0, zmm0
141 |     ENDIF
142 | ELSEIFIDNI <ISA>, <FP16> ; variant reading the count off the half-precision exponent
143 |     IFIDNI <VECREGSIZE>, <XMM>
144 |         vcvtuw2ph xmm0, xmm0 ; unsigned word -> fp16. NOTE(review): no rounding override here; values needing more than 11 significant bits may round up into the next exponent - confirm this is acceptable
145 |         vpsrlw xmm0, xmm0, 0ah ; drop the 10 mantissa bits, leaving the biased exponent
146 |         vpsubw xmm0, xmm1, xmm0 ; 001eh - exponent
147 |         vpminuw xmm0, xmm0, xmm2 ; clamp to 0010h: a zero input has exponent 0 and would otherwise give 001eh
148 |     ELSEIFIDNI <VECREGSIZE>, <YMM>
149 |         vcvtuw2ph ymm0, ymm0
150 |         vpsrlw ymm0, ymm0, 0ah
151 |         vpsubw ymm0, ymm1, ymm0
152 |         vpminuw ymm0, ymm0, ymm2
153 |     ELSEIFIDNI <VECREGSIZE>, <ZMM>
154 |         vcvtuw2ph zmm0, zmm0
155 |         vpsrlw zmm0, zmm0, 0ah
156 |         vpsubw zmm0, zmm1, zmm0
157 |         vpminuw zmm0, zmm0, zmm2
158 |     ENDIF
159 | ENDIF
160 | endm
161 |
162 |
163 | TIMED macro PNAME, DATA, VECREGSIZE, ISA ; emits proc PNAME: returns in rax the TSC ticks taken by `repeats` dependent CORE iterations
164 | PNAME proc
165 |     push rbx
166 |     push rdi
167 |     push rsi
168 |
169 |     IFIDNI <DATA>, <EPI8> ; DATA selects the 8-bit or 16-bit INIT/CORE pair
170 |         INIT8 VECREGSIZE, ISA
171 |     ELSEIFIDNI <DATA>, <EPI16>
172 |         INIT16 VECREGSIZE, ISA
173 |     ENDIF
174 |
175 |     mfence
176 |     rdtscp ; start stamp in edx:eax (also overwrites ecx)
177 |     lfence
178 |
179 |     mov esi, eax ; keep the start stamp: low half in esi, high half in edi
180 |     mov edi, edx
181 |
182 |     mov ecx, repeats ; loop counter, loaded after rdtscp because rdtscp writes ecx
183 |
184 |     align 16
185 | startlabel:
186 |     IFIDNI <DATA>, <EPI8> ; each iteration reads and rewrites reg0, so iterations form one dependency chain
187 |         CORE8 VECREGSIZE, ISA
188 |     ELSEIFIDNI <DATA>, <EPI16>
189 |         CORE16 VECREGSIZE, ISA
190 |     ENDIF
191 |
192 |     dec ecx
193 |     jnz startlabel
194 |
195 |     mfence
196 |     rdtscp ; end stamp in edx:eax
197 |     lfence
198 |
199 |     shl rdx, 20h ; assemble both 64-bit stamps from their 32-bit halves
200 |     shl rdi, 20h
201 |     or rax, rdx
202 |     or rsi, rdi
203 |
204 |     sub rax, rsi ; rax = end - start
205 |
206 |
207 |     pop rsi
208 |     pop rdi
209 |     pop rbx
210 |     ret
211 | PNAME endp
212 | endm
213 |
214 | NAKED macro PNAME, DATA, VECREGSIZE, ISA ; emits proc PNAME: one INIT + one CORE pass over register 0 (xmm0/ymm0/zmm0), result left in the same register
215 | PNAME proc
216 |
217 |     IFIDNI <DATA>, <EPI8> ; DATA selects the 8-bit or 16-bit INIT/CORE pair
218 |         INIT8 VECREGSIZE, ISA
219 |     ELSEIFIDNI <DATA>, <EPI16>
220 |         INIT16 VECREGSIZE, ISA
221 |     ENDIF
222 |
223 |     IFIDNI <DATA>, <EPI8>
224 |         CORE8 VECREGSIZE, ISA
225 |     ELSEIFIDNI <DATA>, <EPI16>
226 |         CORE16 VECREGSIZE, ISA
227 |     ENDIF
228 |
229 |     ret
230 | PNAME endp
231 | endm
232 |
233 | TIMED _mm_lzcnt_epi8_asm_timed, EPI8, XMM, BITALG ; timed loops: byte lzcnt, popcount-based sequence
234 | TIMED _mm256_lzcnt_epi8_asm_timed, EPI8, YMM, BITALG
235 | TIMED _mm512_lzcnt_epi8_asm_timed, EPI8, ZMM, BITALG
236 |
237 | NAKED _mm_lzcnt_epi8_asm@@16, EPI8, XMM, BITALG ; callable single-pass versions of the same sequence
238 | NAKED _mm256_lzcnt_epi8_asm@@32, EPI8, YMM, BITALG
239 | NAKED _mm512_lzcnt_epi8_asm@@64, EPI8, ZMM, BITALG
240 |
241 | TIMED _mm_lzcnt_gfni_epi8_asm_timed, EPI8, XMM, GFNI ; byte lzcnt, affine-only sequence
242 | TIMED _mm256_lzcnt_gfni_epi8_asm_timed, EPI8, YMM, GFNI
243 | TIMED _mm512_lzcnt_gfni_epi8_asm_timed, EPI8, ZMM, GFNI
244 |
245 | NAKED _mm_lzcnt_gfni_epi8_asm@@16, EPI8, XMM, GFNI
246 | NAKED _mm256_lzcnt_gfni_epi8_asm@@32, EPI8, YMM, GFNI
247 | NAKED _mm512_lzcnt_gfni_epi8_asm@@64, EPI8, ZMM, GFNI
248 |
249 | TIMED _mm_lzcnt_epi16_asm_timed, EPI16, XMM, BITALG ; word lzcnt, popcount-based sequence
250 | TIMED _mm256_lzcnt_epi16_asm_timed, EPI16, YMM, BITALG
251 | TIMED _mm512_lzcnt_epi16_asm_timed, EPI16, ZMM, BITALG
252 |
253 | NAKED _mm_lzcnt_epi16_asm@@16, EPI16, XMM, BITALG ; register width now matches the entry point, as in the epi8 block and the TIMED lines
254 | NAKED _mm256_lzcnt_epi16_asm@@32, EPI16, YMM, BITALG
255 | NAKED _mm512_lzcnt_epi16_asm@@64, EPI16, ZMM, BITALG
256 |
257 | TIMED _mm_lzcnt_fp16_epi16_asm_timed, EPI16, XMM, FP16 ; word lzcnt via half-precision conversion
258 | TIMED _mm256_lzcnt_fp16_epi16_asm_timed, EPI16, YMM, FP16
259 | TIMED _mm512_lzcnt_fp16_epi16_asm_timed, EPI16, ZMM, FP16
260 |
261 | NAKED _mm_lzcnt_fp16_epi16_asm@@16, EPI16, XMM, FP16 ; register width now matches the entry point (avoids converting stale upper lanes)
262 | NAKED _mm256_lzcnt_fp16_epi16_asm@@32, EPI16, YMM, FP16
263 | NAKED _mm512_lzcnt_fp16_epi16_asm@@64, EPI16, ZMM, FP16
264 |
265 |
266 | end
--------------------------------------------------------------------------------
/HWBITPERM_Demo.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "HWBITPERM_Demo.h"
3 |
4 | extern CPU_Props cpu_props;
5 |
6 | using namespace std;
7 |
8 | bitperm_methods hw_reference[] = { // routines that HWBITPERM_Compare validates and HWBITPERM_Time measures; field meaning follows the bitperm_methods declaration (not in this file), the last initializer is a running index 0..5
9 | {"PEXT32_HW zmm, zmm, zmm ", "SKX ", 32, BEXT32_HW_Lat, BEXT32_HW_Tp, BEXT32_HW, FEAT_AVX512BW, BEXT, 0},
10 | {"PDEP32_HW zmm, zmm, zmm ", "SKX ", 32, BDEP32_HW_Lat, BDEP32_HW_Tp, BDEP32_HW, FEAT_AVX512BW, BDEP, 1},
11 | {"BGRP32_HW zmm, zmm, zmm ", "SKX ", 32, BGRP32_HW_Lat, BGRP32_HW_Tp, BGRP32_HW, FEAT_AVX512BW, BGRP, 2},
12 | {"PEXT64_HW zmm, zmm, zmm ", "SKX ", 64, BEXT64_HW_Lat, BEXT64_HW_Tp, BEXT64_HW, FEAT_AVX512BW, BEXT, 3},
13 | {"PDEP64_HW zmm, zmm, zmm ", "SKX ", 64, BDEP64_HW_Lat, BDEP64_HW_Tp, BDEP64_HW, FEAT_AVX512BW, BDEP, 4},
14 | {"BGRP64_HW zmm, zmm, zmm ", "SKX ", 64, BGRP64_HW_Lat, BGRP64_HW_Tp, BGRP64_HW, FEAT_AVX512BW, BGRP, 5},
15 | };
16 |
17 | void HWBITPERM_Compare(__m512i p, __m512i mask, __m512i ref, int bitness, BITPERM type) { // run every matching hw_reference routine on (p, mask) and assert that it reproduces ref byte for byte
18 |     for (int idx = 0; idx < (sizeof(hw_reference) / sizeof(bitperm_methods)); idx++) {
19 |         bitperm_methods &entry = hw_reference[idx];
20 |         if (!cpu_props.IsFeat(entry.feats) || (bitness != entry.bitness) || (type != entry.type)) { continue; } // wrong CPU feature, lane width or operation
21 |         __m512i res = (entry.func)(p, mask);
22 |         __mmask64 test = _mm512_cmpeq_epi8_mask(res, ref); // one bit per equal byte
23 |         const bool mismatch = (test != ~0ULL);
24 |         if (mismatch && (bitness == 32)) { // dump source, mask, expected and actual as 32-bit lanes
25 |             printRes32(entry.name, p);
26 |             printRes32(entry.name, mask);
27 |             printRes32(entry.name, ref);
28 |             printRes32(entry.name, res);
29 |         } else if (mismatch) { // same dump with the default lane formatting
30 |             printRes(entry.name, p);
31 |             printRes(entry.name, mask);
32 |             printRes(entry.name, ref);
33 |             printRes(entry.name, res);
34 |         }
35 |         assert(test == ~0ULL);
36 |     }
37 | }
38 |
39 | unsigned int _pgrp_u32(unsigned int p, unsigned int m) { // bit-group: bits of p selected by m packed into the low end, the unselected bits packed directly above them
40 |     const unsigned int ones_cnt = (unsigned int)_mm_popcnt_u32(m); // width of the low (selected) group
41 |     const unsigned int zeros = (ones_cnt < 32) ? (_pext_u32(p, ~m) << ones_cnt) : 0; // m == ~0: nothing is unselected, and a 32-bit shift by 32 would be undefined
42 |     return zeros | _pext_u32(p, m);
43 | }
44 |
45 | unsigned __int64 _pgrp_u64(unsigned __int64 p, unsigned __int64 m) { // 64-bit bit-group: bits of p selected by m packed low, the unselected bits packed directly above them
46 |     const unsigned int ones_cnt = (unsigned int)_mm_popcnt_u64(m); // width of the low (selected) group
47 |     const unsigned __int64 zeros = (ones_cnt < 64) ? (_pext_u64(p, ~m) << ones_cnt) : 0; // m == ~0: nothing is unselected, and a 64-bit shift by 64 would be undefined
48 |     return zeros | _pext_u64(p, m);
49 | }
50 |
51 | // Helper for HWBITPERM_Check64: computes the scalar 64-bit references (_pext_u64, _pdep_u64,
52 | // _pgrp_u64) for all eight lanes of p/m and hands each one to HWBITPERM_Compare.
53 | static void HWBITPERM_Verify64(const __m512i &p, const __m512i &m) {
54 |     __m512i ref_ext, ref_dep, ref_grp;
55 |     for (int lane = 0; lane < 8; lane++) {
56 |         const unsigned __int64 src = p.m512i_u64[lane];
57 |         const unsigned __int64 sel = m.m512i_u64[lane];
58 |         ref_ext.m512i_u64[lane] = _pext_u64(src, sel);
59 |         ref_dep.m512i_u64[lane] = _pdep_u64(src, sel);
60 |         ref_grp.m512i_u64[lane] = _pgrp_u64(src, sel);
61 |     }
62 |     HWBITPERM_Compare(p, m, ref_ext, 64, BEXT);
63 |     HWBITPERM_Compare(p, m, ref_dep, 64, BDEP);
64 |     HWBITPERM_Compare(p, m, ref_grp, 64, BGRP);
65 | }
66 |
67 | // Checks the 64-bit lane routines against the scalar references in four passes:
68 | // all-ones source with a sparse mask, sparse source with an all-ones mask,
69 | // all-ones source with low-bit masks of every length 1..64, and 1000 rounds
70 | // of RDRAND-generated source/mask pairs.
71 | void HWBITPERM_Check64(void) {
72 |     const unsigned __int64 fixed_bits = (1ULL << 16) | (1ULL << 32) | (1ULL << 48);
73 |     __m512i p, m;
74 |
75 |     for (int row = 0; row < 8; row++) {
76 |         for (int lane = 0; lane < 8; lane++) {
77 |             const int bit = 8 * row + lane; // 0..63 across the eight rows
78 |             p.m512i_u64[lane] = ~0ULL;
79 |             m.m512i_u64[lane] = ((1ULL << bit) + ((1ULL << 63) >> bit)) | fixed_bits;
80 |         }
81 |         HWBITPERM_Verify64(p, m);
82 |     }
83 |
84 |     for (int row = 0; row < 8; row++) {
85 |         for (int lane = 0; lane < 8; lane++) {
86 |             const int bit = 8 * row + lane;
87 |             p.m512i_u64[lane] = ((1ULL << bit) + ((1ULL << 63) >> bit)) | fixed_bits;
88 |             m.m512i_u64[lane] = ~0ULL;
89 |         }
90 |         HWBITPERM_Verify64(p, m);
91 |     }
92 |
93 |     for (int row = 0; row < 8; row++) {
94 |         for (int lane = 0; lane < 8; lane++) {
95 |             p.m512i_u64[lane] = ~0ULL;
96 |             m.m512i_u64[lane] = _bzhi_u64(~0ULL, 8 * row + lane + 1); // keep the low (index + 1) bits
97 |         }
98 |         HWBITPERM_Verify64(p, m);
99 |     }
100 |
101 |     for (int round = 0; round < 1000; round++) {
102 |         for (int lane = 0; lane < 8; lane++) {
103 |             while (!_rdrand64_step(&p.m512i_u64[lane])); // retry until the generator delivers a value
104 |             while (!_rdrand64_step(&m.m512i_u64[lane]));
105 |         }
106 |         HWBITPERM_Verify64(p, m);
107 |     }
108 | }
109 |
110 | void HWBITPERM_Check32(void) { // checks the 32-bit lane routines against _pext_u32 / _pdep_u32 / _pgrp_u32 in four passes
111 |     __m512i p, m, ref_ext, ref_dep, ref_grp;
112 |
113 |     for (int j = 0; j < 2; j++) { // pass 1: all-ones source, sparse mask; 2 x 16 lanes cover bit positions 0..31 (a larger count would shift 32-bit values by 32 or more)
114 |         for (int i = 0; i < 16; i++) {
115 |             int b = 16 * j + i;
116 |             p.m512i_u32[i] = ~0UL;
117 |             m.m512i_u32[i] = (((1UL) << b) + ((1UL << 31) >> b)) | (1UL << 16);
118 |             ref_ext.m512i_u32[i] = _pext_u32(p.m512i_u32[i], m.m512i_u32[i]);
119 |             ref_dep.m512i_u32[i] = _pdep_u32(p.m512i_u32[i], m.m512i_u32[i]);
120 |             ref_grp.m512i_u32[i] = _pgrp_u32(p.m512i_u32[i], m.m512i_u32[i]);
121 |         }
122 |         HWBITPERM_Compare(p, m, ref_ext, 32, BEXT);
123 |         HWBITPERM_Compare(p, m, ref_dep, 32, BDEP);
124 |         HWBITPERM_Compare(p, m, ref_grp, 32, BGRP);
125 |     }
126 |
127 |     for (int j = 0; j < 2; j++) { // pass 2: sparse source, all-ones mask; same 0..31 bit range
128 |         for (int i = 0; i < 16; i++) {
129 |             int b = 16 * j + i;
130 |             p.m512i_u32[i] = (((1UL) << b) + ((1UL << 31) >> b)) | (1UL << 16);
131 |             m.m512i_u32[i] = ~0UL;
132 |             ref_ext.m512i_u32[i] = _pext_u32(p.m512i_u32[i], m.m512i_u32[i]);
133 |             ref_dep.m512i_u32[i] = _pdep_u32(p.m512i_u32[i], m.m512i_u32[i]);
134 |             ref_grp.m512i_u32[i] = _pgrp_u32(p.m512i_u32[i], m.m512i_u32[i]);
135 |         }
136 |         HWBITPERM_Compare(p, m, ref_ext, 32, BEXT);
137 |         HWBITPERM_Compare(p, m, ref_dep, 32, BDEP);
138 |         HWBITPERM_Compare(p, m, ref_grp, 32, BGRP);
139 |     }
140 |
141 |     for (int j = 0; j < 2; j++) { // pass 3: all-ones source, low-bit masks of every length 1..32
142 |         for (int i = 0; i < 16; i++) {
143 |             int b = 16 * j + i;
144 |             p.m512i_u32[i] = ~0UL;
145 |             m.m512i_u32[i] = _bzhi_u32(~0, b + 1);
146 |             ref_ext.m512i_u32[i] = _pext_u32(p.m512i_u32[i], m.m512i_u32[i]);
147 |             ref_dep.m512i_u32[i] = _pdep_u32(p.m512i_u32[i], m.m512i_u32[i]);
148 |             ref_grp.m512i_u32[i] = _pgrp_u32(p.m512i_u32[i], m.m512i_u32[i]);
149 |         }
150 |         HWBITPERM_Compare(p, m, ref_ext, 32, BEXT);
151 |         HWBITPERM_Compare(p, m, ref_dep, 32, BDEP);
152 |         HWBITPERM_Compare(p, m, ref_grp, 32, BGRP);
153 |     }
154 |
155 |     for (int j = 0; j < 1000; j++) { // pass 4: 1000 rounds of RDRAND-generated source/mask pairs
156 |         for (int i = 0; i < 16; i++) {
157 |             while (!_rdrand32_step(&p.m512i_u32[i])); // retry until the generator delivers a value
158 |             while (!_rdrand32_step(&m.m512i_u32[i]));
159 |             ref_ext.m512i_u32[i] = _pext_u32(p.m512i_u32[i], m.m512i_u32[i]);
160 |             ref_dep.m512i_u32[i] = _pdep_u32(p.m512i_u32[i], m.m512i_u32[i]);
161 |             ref_grp.m512i_u32[i] = _pgrp_u32(p.m512i_u32[i], m.m512i_u32[i]);
162 |         }
163 |         HWBITPERM_Compare(p, m, ref_ext, 32, BEXT);
164 |         HWBITPERM_Compare(p, m, ref_dep, 32, BDEP);
165 |         HWBITPERM_Compare(p, m, ref_grp, 32, BGRP);
166 |     }
167 | }
168 |
169 | void HWBITPERM_Time(int method) { // prints the best-of-DEPEXT219_RETRIES latency and throughput figures for hw_reference[method], each divided by DEPEXT219_REPEATS
170 |     unsigned __int64 minlat = ~0ULL; // largest 64-bit value; a 32-bit limit constant would cap any longer measurement
171 |     unsigned __int64 mintp = ~0ULL;
172 |     for (int retry = 0; retry < DEPEXT219_RETRIES; retry++) {
173 |         minlat = min(minlat, (hw_reference[method].lat)()); // keep the smallest latency reading
174 |     }
175 |     for (int retry = 0; retry < DEPEXT219_RETRIES; retry++) {
176 |         mintp = min(mintp, (hw_reference[method].tp)()); // keep the smallest throughput reading
177 |     }
178 |     cout << hw_reference[method].isaName << hw_reference[method].name << ": " << (double)minlat / (double)DEPEXT219_REPEATS << " | ";
179 |     cout << (double)mintp / (double)DEPEXT219_REPEATS << " L|T";
180 |     cout << endl;
181 | }
182 |
183 | void HWBITPERM_Test() { // entry point: run both correctness checks, then time every table entry that is supported and has both timing routines
184 |     SetThread(2);
185 |     HWBITPERM_Check64();
186 |     HWBITPERM_Check32();
187 |
188 |     const int method_count = (int)(sizeof(hw_reference) / sizeof(bitperm_methods));
189 |     cout << setw(5) << fixed << setprecision(2) << endl << "HW/Scalar TSC CLKs:----------------------" << endl;
190 |     for (int idx = 0; idx < method_count; idx++) {
191 |         const bool timeable = cpu_props.IsFeat(hw_reference[idx].feats) && (hw_reference[idx].lat != NULL) && (hw_reference[idx].tp != NULL);
192 |         if (timeable) { HWBITPERM_Time(idx); }
193 |     }
194 | }
195 |
--------------------------------------------------------------------------------
/Zen4_Demo_Imm8.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // Declares the eleven extern "C" __fastcall routines Zen4_<INST>_<OPERANDS>_<I8>_<suffix> (suffixes lat, tp, port01 ... port1) for one instruction / operand tag / imm8; each takes no arguments and returns unsigned __int64.
3 | #define ZEN4_FUNCDEF_I8(INST, OPERANDS, I8) \
4 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_lat(void); \
5 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_tp(void); \
6 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port01(void); \
7 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port23(void); \
8 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port12(void); \
9 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port123(void); \
10 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port0123(void);\
11 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port45(void); \
12 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_tern(void); \
13 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_LDs(void); \
14 | extern "C" unsigned __int64 __fastcall Zen4_##INST##_##OPERANDS##_##I8##_port1(void);
15 | // Expands to one braced initializer: the stringized NAME, then the eleven routine names in the same order as ZEN4_FUNCDEF_I8; the trailing comma lets invocations be listed back to back inside a table.
16 | #define ZEN4_FUNCDECL_I8(NAME, INST, OPERANDS, I8) \
17 | {#NAME, {\
18 | Zen4_##INST##_##OPERANDS##_##I8##_lat, \
19 | Zen4_##INST##_##OPERANDS##_##I8##_tp, \
20 | Zen4_##INST##_##OPERANDS##_##I8##_port01, \
21 | Zen4_##INST##_##OPERANDS##_##I8##_port23, \
22 | Zen4_##INST##_##OPERANDS##_##I8##_port12, \
23 | Zen4_##INST##_##OPERANDS##_##I8##_port123, \
24 | Zen4_##INST##_##OPERANDS##_##I8##_port0123, \
25 | Zen4_##INST##_##OPERANDS##_##I8##_port45, \
26 | Zen4_##INST##_##OPERANDS##_##I8##_tern, \
27 | Zen4_##INST##_##OPERANDS##_##I8##_LDs, \
28 | Zen4_##INST##_##OPERANDS##_##I8##_port1 \
29 | }},
30 | // vextract{i,f}128, operand tag ymmI82xmm, imm8 0 and 1
31 | ZEN4_FUNCDEF_I8(vextracti128, ymmI82xmm, 000h)
32 | ZEN4_FUNCDEF_I8(vextractf128, ymmI82xmm, 000h)
33 | ZEN4_FUNCDEF_I8(vextracti128, ymmI82xmm, 001h)
34 | ZEN4_FUNCDEF_I8(vextractf128, ymmI82xmm, 001h)
35 | // vextract{i,f}32x4, operand tag ymmi82xmm, imm8 0 and 1. NOTE(review): the extract tags from here on use a lowercase "i"; confirm this matches the symbol names of the implementation
36 | ZEN4_FUNCDEF_I8(vextracti32x4, ymmi82xmm, 000h)
37 | ZEN4_FUNCDEF_I8(vextractf32x4, ymmi82xmm, 000h)
38 | ZEN4_FUNCDEF_I8(vextracti32x4, ymmi82xmm, 001h)
39 | ZEN4_FUNCDEF_I8(vextractf32x4, ymmi82xmm, 001h)
40 | // vextract{i,f}64x2, operand tag ymmi82xmm, imm8 0 and 1
41 | ZEN4_FUNCDEF_I8(vextracti64x2, ymmi82xmm, 000h)
42 | ZEN4_FUNCDEF_I8(vextractf64x2, ymmi82xmm, 000h)
43 | ZEN4_FUNCDEF_I8(vextracti64x2, ymmi82xmm, 001h)
44 | ZEN4_FUNCDEF_I8(vextractf64x2, ymmi82xmm, 001h)
45 | // vextract{i,f}32x4, operand tag zmmi82xmm, imm8 0..3
46 | ZEN4_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 000h)
47 | ZEN4_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 000h)
48 | ZEN4_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 001h)
49 | ZEN4_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 001h)
50 | ZEN4_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 002h)
51 | ZEN4_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 002h)
52 | ZEN4_FUNCDEF_I8(vextracti32x4, zmmi82xmm, 003h)
53 | ZEN4_FUNCDEF_I8(vextractf32x4, zmmi82xmm, 003h)
54 | // vextract{i,f}32x8, operand tag zmmi82ymm, imm8 0 and 1
55 | ZEN4_FUNCDEF_I8(vextracti32x8, zmmi82ymm, 000h)
56 | ZEN4_FUNCDEF_I8(vextractf32x8, zmmi82ymm, 000h)
57 | ZEN4_FUNCDEF_I8(vextracti32x8, zmmi82ymm, 001h)
58 | ZEN4_FUNCDEF_I8(vextractf32x8, zmmi82ymm, 001h)
59 | // vinsert{i,f}128, operand tag xmmymmI82ymm, imm8 0 and 1
60 | ZEN4_FUNCDEF_I8(vinserti128, xmmymmI82ymm, 000h)
61 | ZEN4_FUNCDEF_I8(vinsertf128, xmmymmI82ymm, 000h)
62 | ZEN4_FUNCDEF_I8(vinserti128, xmmymmI82ymm, 001h)
63 | ZEN4_FUNCDEF_I8(vinsertf128, xmmymmI82ymm, 001h)
64 | // vinsert{i,f}32x4, operand tag xmmymmI82ymm, imm8 0 and 1
65 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmymmI82ymm, 000h)
66 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmymmI82ymm, 000h)
67 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmymmI82ymm, 001h)
68 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmymmI82ymm, 001h)
69 | // vinsert{i,f}64x2, operand tag xmmymmI82ymm, imm8 0 and 1
70 | ZEN4_FUNCDEF_I8(vinserti64x2, xmmymmI82ymm, 000h)
71 | ZEN4_FUNCDEF_I8(vinsertf64x2, xmmymmI82ymm, 000h)
72 | ZEN4_FUNCDEF_I8(vinserti64x2, xmmymmI82ymm, 001h)
73 | ZEN4_FUNCDEF_I8(vinsertf64x2, xmmymmI82ymm, 001h)
74 | // vinsert{i,f}32x4, operand tag xmmzmmI82zmm, imm8 0..3
75 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 000h)
76 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 000h)
77 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 001h)
78 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 001h)
79 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 002h)
80 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 002h)
81 | ZEN4_FUNCDEF_I8(vinserti32x4, xmmzmmI82zmm, 003h)
82 | ZEN4_FUNCDEF_I8(vinsertf32x4, xmmzmmI82zmm, 003h)
83 | // vinsert{i,f}32x8, operand tag ymmzmmI82zmm, imm8 0 and 1
84 | ZEN4_FUNCDEF_I8(vinserti32x8, ymmzmmI82zmm, 000h)
85 | ZEN4_FUNCDEF_I8(vinsertf32x8, ymmzmmI82zmm, 000h)
86 | ZEN4_FUNCDEF_I8(vinserti32x8, ymmzmmI82zmm, 001h)
87 | ZEN4_FUNCDEF_I8(vinsertf32x8, ymmzmmI82zmm, 001h)
88 | // vperm2{i,f}128, operand tag 2ymmI82ymm, imm8 000h-003h and 008h
89 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 000h)
90 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 000h)
91 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 001h)
92 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 001h)
93 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 002h)
94 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 002h)
95 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 003h)
96 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 003h)
97 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 008h)
98 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 008h)
99 | // vperm2{i,f}128, imm8 010h-013h and 018h
100 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 010h)
101 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 010h)
102 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 011h)
103 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 011h)
104 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 012h)
105 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 012h)
106 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 013h)
107 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 013h)
108 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 018h)
109 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 018h)
110 | // vperm2{i,f}128, imm8 020h-023h and 028h
111 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 020h)
112 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 020h)
113 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 021h)
114 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 021h)
115 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 022h)
116 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 022h)
117 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 023h)
118 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 023h)
119 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 028h)
120 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 028h)
121 | // vperm2{i,f}128, imm8 030h-033h and 038h
122 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 030h)
123 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 030h)
124 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 031h)
125 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 031h)
126 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 032h)
127 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 032h)
128 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 033h)
129 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 033h)
130 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 038h)
131 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 038h)
132 | // vperm2{i,f}128, imm8 080h-083h and 088h
133 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 080h)
134 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 080h)
135 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 081h)
136 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 081h)
137 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 082h)
138 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 082h)
139 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 083h)
140 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 083h)
141 | ZEN4_FUNCDEF_I8(vperm2i128, 2ymmI82ymm, 088h)
142 | ZEN4_FUNCDEF_I8(vperm2f128, 2ymmI82ymm, 088h)
143 | // vshuf{i,f}32x4 and vshuf{i,f}64x2, operand tag 2ymmI82ymm, imm8 0..3
144 | ZEN4_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 000h)
145 | ZEN4_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 000h)
146 | ZEN4_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 001h)
147 | ZEN4_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 001h)
148 | ZEN4_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 002h)
149 | ZEN4_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 002h)
150 | ZEN4_FUNCDEF_I8(vshufi32x4, 2ymmI82ymm, 003h)
151 | ZEN4_FUNCDEF_I8(vshuff32x4, 2ymmI82ymm, 003h)
152 | ZEN4_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 000h)
153 | ZEN4_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 000h)
154 | ZEN4_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 001h)
155 | ZEN4_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 001h)
156 | ZEN4_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 002h)
157 | ZEN4_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 002h)
158 | ZEN4_FUNCDEF_I8(vshufi64x2, 2ymmI82ymm, 003h)
159 | ZEN4_FUNCDEF_I8(vshuff64x2, 2ymmI82ymm, 003h)
160 | // vshuf{i,f}32x4, operand tag 2zmmI82zmm, imm8 000h, 044h, 0e4h, 0a5h
161 | ZEN4_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 000h)
162 | ZEN4_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 000h)
163 | ZEN4_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 044h)
164 | ZEN4_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 044h)
165 | ZEN4_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 0e4h)
166 | ZEN4_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 0e4h)
167 | ZEN4_FUNCDEF_I8(vshufi32x4, 2zmmI82zmm, 0a5h)
168 | ZEN4_FUNCDEF_I8(vshuff32x4, 2zmmI82zmm, 0a5h)
169 | // vshuf{i,f}64x2, operand tag 2zmmI82zmm, imm8 000h, 044h, 0e4h, 0a5h
170 | ZEN4_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 000h)
171 | ZEN4_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 000h)
172 | ZEN4_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 044h)
173 | ZEN4_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 044h)
174 | ZEN4_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 0e4h)
175 | ZEN4_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 0e4h)
176 | ZEN4_FUNCDEF_I8(vshufi64x2, 2zmmI82zmm, 0a5h)
177 | ZEN4_FUNCDEF_I8(vshuff64x2, 2zmmI82zmm, 0a5h)
--------------------------------------------------------------------------------
/AVX512_BGVSER.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "conio.h"
3 | #include "AVX512_BGVSER.h"
4 |
5 | using namespace std;
6 |
7 | __m256i _mm256_bsrli_epi256(__m256i a, int b) { // whole-register byte shift right by b bytes with zero fill; any b outside 0..31 yields zero
8 |     return _mm256_maskz_compress_epi8(((unsigned)b < 32) ? (__mmask32)(0xFFFFFFFFu << b) : (__mmask32)0, a); //left shift is correct here: the mask keeps bytes b..31 and compress packs them to the bottom; the range test avoids an undefined shift by 32 or more
9 | }
10 |
11 | __m256i _mm256_bslli_epi256(__m256i a, int b) { // whole-register byte shift left by b bytes with zero fill; any b outside 0..31 yields zero
12 |     return _mm256_maskz_expand_epi8(((unsigned)b < 32) ? (__mmask32)(0xFFFFFFFFu << b) : (__mmask32)0, a); // expand drops the low bytes of a into positions b..31; the range test avoids an undefined shift by 32 or more
13 | }
14 |
15 | __m256i _mm256_palignr_epi256(__m256i a, __m256i b, int c) { // byte i of the result is picked by index i + c from the two-table permute over a and b
16 |     const __m256i shift = _mm256_set1_epi8(c);
17 |     const __m256i sel = _mm256_add_epi8(shift, _mm256_setr_epi8(
18 |         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
19 |         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31));
20 |     return _mm256_permutex2var_epi8(a, sel, b);
21 | }
22 |
23 | __m256i _mm256_palignl_epi256(__m256i a, __m256i b, int c) { // byte i of the result is picked by index i - c (wrapping byte arithmetic) from the two-table permute over a and b
24 |     const __m256i shift = _mm256_set1_epi8(c);
25 |     const __m256i sel = _mm256_sub_epi8(_mm256_setr_epi8(
26 |         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
27 |         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), shift);
28 |     return _mm256_permutex2var_epi8(a, sel, b);
29 | }
30 |
31 | __m256i _mm256_rotater_epi256(__m256i a, int c) { // byte i of the result is picked from a by index i + c in a single-table byte permute
32 |     const __m256i shift = _mm256_set1_epi8(c);
33 |     const __m256i sel = _mm256_add_epi8(shift, _mm256_setr_epi8(
34 |         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
35 |         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31));
36 |     return _mm256_permutexvar_epi8(sel, a);
37 | }
38 |
39 | __m256i _mm256_rotatel_epi256(__m256i a, int c) { // byte i of the result is picked from a by index i - c in a single-table byte permute
40 |     const __m256i shift = _mm256_set1_epi8(c);
41 |     const __m256i sel = _mm256_sub_epi8(_mm256_setr_epi8(
42 |         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
43 |         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), shift);
44 |     return _mm256_permutexvar_epi8(sel, a);
45 | }
46 |
47 | //BSRLI older
48 | //vpbroadcastb zmm1, rax ;P5
49 | //mov rcx, -1 ;P0156B
50 | //shlx rcx, rcx, rax ;P06
51 | //kmovq k1, rcx ;P5
52 | //vpaddb zmm1, zmm1, [disp] ;P05+P23A
53 | //vpermb zmm0 {k1}{z}, zmm1, zmm0 ;P5
54 | //
55 | //7 uops: P0156B+P06+3*P5+P23A+P05
56 |
57 | //shorter:
58 | //mov rcx, -1 ;P0156B
59 | //shlx rcx, rcx, rax ;P06
60 | //kmovq k1, rcx ;P5
61 | //vpcompressb zmm0 {k1}{z}, zmm0 ;2*P5
62 | //
63 | //5 uops: P0156B+P06+3*P5
64 |
65 | __m512i _mm512_bsrli_epi512(__m512i a, int b) { // whole-register byte shift right by b bytes with zero fill; any b outside 0..63 yields zero
66 |     return _mm512_maskz_compress_epi8(((unsigned)b < 64) ? (__mmask64)(~0ULL << b) : (__mmask64)0, a); //left shift is correct here: the mask keeps bytes b..63 and compress packs them to the bottom; the range test avoids an undefined shift by 64 or more
67 | }
68 |
69 | __m512i _mm512_bslli_epi512(__m512i a, int b) { // whole-register byte shift left by b bytes with zero fill; any b outside 0..63 yields zero
70 |     return _mm512_maskz_expand_epi8(((unsigned)b < 64) ? (__mmask64)(~0ULL << b) : (__mmask64)0, a); // expand drops the low bytes of a into positions b..63; the range test avoids an undefined shift by 64 or more
71 | }
72 |
73 | __m512i _mm512_palignr_epi512(__m512i a, __m512i b, int c) { // byte i of the result is picked by index i + c from the two-table permute over a and b
74 |     const __m512i shift = _mm512_set1_epi8(c);
75 |     const __m512i sel = _mm512_add_epi8(shift, _mm512_setr_epi8(
76 |         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
77 |         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
78 |         32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
79 |         48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63));
80 |     return _mm512_permutex2var_epi8(a, sel, b);
81 | }
82 |
83 | __m512i _mm512_palignl_epi512(__m512i a, __m512i b, int c) { // byte i of the result is picked by index i - c (wrapping byte arithmetic) from the two-table permute over a and b
84 |     const __m512i shift = _mm512_set1_epi8(c);
85 |     const __m512i sel = _mm512_sub_epi8(_mm512_setr_epi8(
86 |         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
87 |         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
88 |         32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
89 |         48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63), shift);
90 |     return _mm512_permutex2var_epi8(a, sel, b);
91 | }
92 |
93 | __m512i _mm512_rotater_epi512(__m512i a, int c) { // byte i of the result is picked from a by index i + c in a single-table byte permute
94 |     const __m512i shift = _mm512_set1_epi8(c);
95 |     const __m512i sel = _mm512_add_epi8(shift, _mm512_setr_epi8(
96 |         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
97 |         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
98 |         32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
99 |         48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63));
100 |     return _mm512_permutexvar_epi8(sel, a);
101 | }
102 |
103 | __m512i _mm512_rotatel_epi512(__m512i a, int c) {
104 | const __m512i disp = _mm512_setr_epi8(
105 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
106 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
107 | 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
108 | 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f);
109 | __m512i idx = _mm512_sub_epi8(disp, _mm512_set1_epi8(c));
110 | return _mm512_permutexvar_epi8(idx, a);
111 | }
112 |
113 | #pragma warning(disable : 4309)
114 | void YMM_Test(void) {
115 | const __m256i testdata0 = _mm256_setr_epi8(
116 | 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
117 | 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f);
118 | const __m256i testdata1 = _mm256_setr_epi8(
119 | 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
120 | 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f);
121 |
122 | cout << "_mm256_bslli_epi256" << endl;
123 | for (int i = 0; i < 32; i++) {
124 | printRes(i, _mm256_bslli_epi256(testdata0, i));
125 | }
126 |
127 | cout << "_mm256_bsrli_epi256" << endl;
128 | for (int i = 0; i < 32; i++) {
129 | printRes(i, _mm256_bsrli_epi256(testdata0, i));
130 | }
131 |
132 | cout << "_mm256_palignr_epi256" << endl;
133 | for (int i = 0; i < 32; i++) {
134 | printRes(i, _mm256_palignr_epi256(testdata0, testdata1, i));
135 | }
136 |
137 | cout << "_mm256_palignl_epi256" << endl;
138 | for (int i = 0; i < 32; i++) {
139 | printRes(i, _mm256_palignl_epi256(testdata1, testdata0, i));
140 | }
141 |
142 | cout << "_mm256_rotater_epi256" << endl;
143 | for (int i = 0; i < 32; i++) {
144 | printRes(i, _mm256_rotater_epi256(testdata1, i));
145 | }
146 |
147 | cout << "_mm256_rotatel_epi256" << endl;
148 | for (int i = 0; i < 32; i++) {
149 | printRes(i, _mm256_rotatel_epi256(testdata1, i));
150 | }
151 | }
152 |
153 | void ZMM_Test(void) {
154 | const __m512i testdata0 = _mm512_setr_epi8(
155 | 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
156 | 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
157 | 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
158 | 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f);
159 | const __m512i testdata1 = _mm512_setr_epi8(
160 | 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
161 | 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
162 | 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
163 | 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf);
164 |
165 | cout << "_mm512_bslli_epi512" << endl;
166 | for (int i = 0; i < 64; i++) {
167 | printRes(i, _mm512_bslli_epi512(testdata0, i));
168 | }
169 |
170 | cout << "_mm512_bsrli_epi512" << endl;
171 | for (int i = 0; i < 64; i++) {
172 | printRes(i, _mm512_bsrli_epi512(testdata0, i));
173 | }
174 |
175 | cout << "_mm512_palignr_epi512" << endl;
176 | for (int i = 0; i < 64; i++) {
177 | printRes(i, _mm512_palignr_epi512(testdata0, testdata1, i));
178 | }
179 |
180 | cout << "_mm512_palignl_epi512" << endl;
181 | for (int i = 0; i < 64; i++) {
182 | printRes(i, _mm512_palignl_epi512(testdata1, testdata0, i));
183 | }
184 |
185 | cout << "_mm512_rotater_epi512" << endl;
186 | for (int i = 0; i < 64; i++) {
187 | printRes(i, _mm512_rotater_epi512(testdata1, i));
188 | }
189 |
190 | cout << "_mm512_rotatel_epi512" << endl;
191 | for (int i = 0; i < 64; i++) {
192 | printRes(i, _mm512_rotatel_epi512(testdata1, i));
193 | }
194 |
195 | }
196 |
197 | void AVX512_BGVSER_Test(void) {
198 | YMM_Test();
199 | ZMM_Test();
200 | }
201 |
202 | #pragma warning(default : 4309)
203 |
--------------------------------------------------------------------------------
/P06P1.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 | #include "P06P1.h"
3 |
4 | extern CPU_Props cpu_props;
5 | extern Args args;
6 |
7 | #define P06P1_RETRIES 10
8 | #define P06P1_REPEATS 1000
9 |
10 | using namespace std;
11 |
12 | #ifdef __cplusplus
13 | extern "C" {
14 | #endif
15 |
16 | P06P1_FUNCDEF(BT_RAX_RCX, CX)
17 | P06P1_FUNCDEF(BTC_RAX_RCX, CX)
18 | P06P1_FUNCDEF(BTR_RAX_RCX, CX)
19 | P06P1_FUNCDEF(BTS_RAX_RCX, CX)
20 | P06P1_FUNCDEF(BZHI_RAX_RAX_RCX, CX)
21 | P06P1_FUNCDEF(BEXTR_RAX_RAX_RCX, CX)
22 |
23 | P06P1_FUNCDEF(BT_RAX_RCX, AX)
24 | P06P1_FUNCDEF(BTC_RAX_RCX, AX)
25 | P06P1_FUNCDEF(BTR_RAX_RCX, AX)
26 | P06P1_FUNCDEF(BTS_RAX_RCX, AX)
27 | P06P1_FUNCDEF(BZHI_RAX_RAX_RCX, AX)
28 | P06P1_FUNCDEF(BEXTR_RAX_RAX_RCX, AX)
29 |
30 | P06P1_FUNCDEF(SHLX_RAX_RAX_RCX, CX)
31 | P06P1_FUNCDEF(SHL_RAX_CL, CX)
32 | P06P1_FUNCDEF(SHLX_RAX_RAX_RCX, AX)
33 | P06P1_FUNCDEF(SHL_RAX_CL, AX)
34 | P06P1_FUNCDEF(SHL_RAX_IMM8, AX)
35 | P06P1_FUNCDEF(SHL_RAX_IMPL1, AX)
36 |
37 | P06P1_FUNCDEF(SHRX_RAX_RAX_RCX, CX)
38 | P06P1_FUNCDEF(SHR_RAX_CL, CX)
39 | P06P1_FUNCDEF(SHRX_RAX_RAX_RCX, AX)
40 | P06P1_FUNCDEF(SHR_RAX_CL, AX)
41 | P06P1_FUNCDEF(SHR_RAX_IMM8, AX)
42 | P06P1_FUNCDEF(SHR_RAX_IMPL1, AX)
43 |
44 | P06P1_FUNCDEF(SARX_RAX_RAX_RCX, CX)
45 | P06P1_FUNCDEF(SAR_RAX_CL, CX)
46 | P06P1_FUNCDEF(SARX_RAX_RAX_RCX, AX)
47 | P06P1_FUNCDEF(SAR_RAX_CL, AX)
48 | P06P1_FUNCDEF(SAR_RAX_IMM8, AX)
49 | P06P1_FUNCDEF(SAR_RAX_IMPL1, AX)
50 |
51 | P06P1_FUNCDEF(RORX_RCX_RCX_IMM8, CX)
52 | P06P1_FUNCDEF(ROR_RAX_CL, CX)
53 | P06P1_FUNCDEF(ROR_RAX_CL, AX)
54 | P06P1_FUNCDEF(ROR_RAX_IMM8, AX)
55 | P06P1_FUNCDEF(ROR_RAX_IMPL1, AX)
56 |
57 | P06P1_FUNCDEF(ROL_RAX_CL, CX)
58 | P06P1_FUNCDEF(ROL_RAX_CL, AX)
59 | P06P1_FUNCDEF(ROL_RAX_IMM8, AX)
60 | P06P1_FUNCDEF(ROL_RAX_IMPL1, AX)
61 |
62 | P06P1_FUNCDEF(RCR_RAX_CL, CX)
63 | P06P1_FUNCDEF(RCR_RAX_CL, AX)
64 | P06P1_FUNCDEF(RCR_RAX_IMM8, AX)
65 | P06P1_FUNCDEF(RCR_RAX_IMPL1, AX)
66 |
67 | P06P1_FUNCDEF(RCL_RAX_CL, CX)
68 | P06P1_FUNCDEF(RCL_RAX_CL, AX)
69 | P06P1_FUNCDEF(RCL_RAX_IMM8, AX)
70 | P06P1_FUNCDEF(RCL_RAX_IMPL1, AX)
71 |
72 | P06P1_FUNCDEF(ADC_RAX_IMM8, AX)
73 | P06P1_FUNCDEF(SBB_RAX_IMM8, AX)
74 |
75 | P06P1_FUNCDEF(ADCX_RAX_RCX, CX)
76 | P06P1_FUNCDEF(ADOX_RAX_RCX, CX)
77 | P06P1_FUNCDEF(ADCX_RAX_RCX, AX)
78 | P06P1_FUNCDEF(ADOX_RAX_RCX, AX)
79 |
80 | P06P1_FUNCDEF(CMOVBE_RAX_RCX, CX)
81 | P06P1_FUNCDEF(CMOVNBE_RAX_RCX, CX)
82 | P06P1_FUNCDEF(CMOVZ_RAX_RCX, CX)
83 | P06P1_FUNCDEF(CMOVNZ_RAX_RCX, CX)
84 |
85 | P06P1_FUNCDEF(CMOVBE_RAX_RCX, AX)
86 | P06P1_FUNCDEF(CMOVNBE_RAX_RCX, AX)
87 | P06P1_FUNCDEF(CMOVZ_RAX_RCX, AX)
88 | P06P1_FUNCDEF(CMOVNZ_RAX_RCX, AX)
89 |
90 | P06P1_FUNCDEF(BSWAP_RAX, AX)
91 | P06P1_FUNCDEF(POPCNT_RCX_RCX, CX)
92 | P06P1_FUNCDEF(LZCNT_RCX_RCX, CX)
93 | P06P1_FUNCDEF(TZCNT_RCX_RCX, CX)
94 |
95 | P06P1_FUNCDEF(BSR_RCX_RCX, CX)
96 | P06P1_FUNCDEF(BSF_RCX_RCX, CX)
97 |
98 | P06P1_FUNCDEF(CRC32_RCX_RCX, CX)
99 | P06P1_FUNCDEF(PDEP_RAX_RAX_RCX, CX)
100 | P06P1_FUNCDEF(PEXT_RAX_RAX_RCX, CX)
101 |
102 |
103 | #ifdef __cplusplus
104 | }
105 | #endif
106 |
107 | measure_methods P06P1_affected[] = {
108 | P06P1_FUNC("SHLX RAX, RAX, RCX", SHLX_RAX_RAX_RCX, CX, BMI2, 1)
109 | P06P1_FUNC("SHRX RAX, RAX, RCX", SHRX_RAX_RAX_RCX, CX, BMI2, 1)
110 | P06P1_FUNC("SARX RAX, RAX, RCX", SARX_RAX_RAX_RCX, CX, BMI2, 1)
111 | P06P1_FUNC("SHL RAX, CL", SHL_RAX_CL, CX, AMD64, 2)
112 | P06P1_FUNC("SHR RAX, CL", SHR_RAX_CL, CX, AMD64, 2)
113 | P06P1_FUNC("SAR RAX, CL", SAR_RAX_CL, CX, AMD64, 2)
114 | P06P1_FUNC("ROR RAX, CL", ROR_RAX_CL, CX, AMD64, 2)
115 | P06P1_FUNC("ROL RAX, CL", ROL_RAX_CL, CX, AMD64, 2)
116 | P06P1_FUNC("BT RAX RCX + ADC RAX,0", BT_RAX_RCX, CX, AMD64, 2)
117 | P06P1_FUNC("BTC RAX RCX", BTC_RAX_RCX, CX, AMD64, 1)
118 | P06P1_FUNC("BTR RAX, RCX", BTR_RAX_RCX, CX, AMD64, 1)
119 | P06P1_FUNC("BTS RAX, RCX", BTS_RAX_RCX, CX, AMD64, 1)
120 | P06P1_FUNC("BZHI RAX, RAX, RCX", BZHI_RAX_RAX_RCX, CX, BMI2, 1)
121 | P06P1_FUNC("BEXTR RAX, RAX, RCX", BEXTR_RAX_RAX_RCX, CX, BMI, 2)
122 | };
123 |
124 | measure_methods P06P1_unaffected[] = {
125 | P06P1_FUNC("SHLX RAX, RAX, RCX", SHLX_RAX_RAX_RCX, AX, BMI2, 1)
126 | P06P1_FUNC("SHRX RAX, RAX, RCX", SHRX_RAX_RAX_RCX, AX, BMI2, 1)
127 | P06P1_FUNC("SARX RAX, RAX, RCX", SARX_RAX_RAX_RCX, AX, BMI2, 1)
128 | P06P1_FUNC("RORX RCX, RCX, IMM8", RORX_RCX_RCX_IMM8, CX, BMI2, 1)
129 |
130 | P06P1_FUNC("SHL RAX, CL", SHL_RAX_CL, AX, AMD64, 2)
131 | P06P1_FUNC("SHR RAX, CL", SHR_RAX_CL, AX, AMD64, 2)
132 | P06P1_FUNC("SAR RAX, CL", SAR_RAX_CL, AX, AMD64, 2)
133 | P06P1_FUNC("ROR RAX, CL", ROR_RAX_CL, AX, AMD64, 2)
134 | P06P1_FUNC("ROL RAX, CL", ROL_RAX_CL, AX, AMD64, 2)
135 | P06P1_FUNC("RCR RAX, CL", RCR_RAX_CL, AX, AMD64, 7)
136 | P06P1_FUNC("RCL RAX, CL", RCL_RAX_CL, AX, AMD64, 7)
137 |
138 | P06P1_FUNC("RCR RAX, CL", RCR_RAX_CL, CX, AMD64, 7)
139 | P06P1_FUNC("RCL RAX, CL", RCL_RAX_CL, CX, AMD64, 7)
140 |
141 | P06P1_FUNC("SHL RAX, IMM8", SHL_RAX_IMM8, AX, AMD64, 1)
142 | P06P1_FUNC("SHR RAX, IMM8", SHR_RAX_IMM8, AX, AMD64, 1)
143 | P06P1_FUNC("SAR RAX, IMM8", SAR_RAX_IMM8, AX, AMD64, 1)
144 | P06P1_FUNC("ROR RAX, IMM8", ROR_RAX_IMM8, AX, AMD64, 1)
145 | P06P1_FUNC("ROL RAX, IMM8", ROL_RAX_IMM8, AX, AMD64, 1)
146 | P06P1_FUNC("RCR RAX, IMM8", RCR_RAX_IMM8, AX, AMD64, 7)
147 | P06P1_FUNC("RCL RAX, IMM8", RCL_RAX_IMM8, AX, AMD64, 7)
148 |
149 | P06P1_FUNC("SHL RAX, IMPL1", SHL_RAX_IMPL1, AX, AMD64, 1)
150 | P06P1_FUNC("SHR RAX, IMPL1", SHR_RAX_IMPL1, AX, AMD64, 1)
151 | P06P1_FUNC("SAR RAX, IMPL1", SAR_RAX_IMPL1, AX, AMD64, 1)
152 | P06P1_FUNC("ROR RAX, IMPL1", ROR_RAX_IMPL1, AX, AMD64, 2)
153 | P06P1_FUNC("ROL RAX, IMPL1", ROL_RAX_IMPL1, AX, AMD64, 2)
154 | P06P1_FUNC("RCR RAX, IMPL1", RCR_RAX_IMPL1, AX, AMD64, 3)
155 | P06P1_FUNC("RCL RAX, IMPL1", RCL_RAX_IMPL1, AX, AMD64, 3)
156 |
157 |
158 | P06P1_FUNC("ADC RAX, IMM8", ADC_RAX_IMM8, AX, AMD64, 1)
159 | P06P1_FUNC("SBB RAX, IMM8", SBB_RAX_IMM8, AX, AMD64, 1)
160 |
161 | P06P1_FUNC("ADCX RAX, RCX", ADCX_RAX_RCX, CX, ADX, 1)
162 | P06P1_FUNC("ADOX RAX, RCX", ADOX_RAX_RCX, CX, ADX, 1)
163 | P06P1_FUNC("ADCX RAX, RCX", ADCX_RAX_RCX, AX, ADX, 1)
164 | P06P1_FUNC("ADOX RAX, RCX", ADOX_RAX_RCX, AX, ADX, 1)
165 |
166 | P06P1_FUNC("CMOVBE RAX, RCX", CMOVBE_RAX_RCX, CX, CMOV, 2)
167 | P06P1_FUNC("CMOVNBE, RAX, RCX", CMOVNBE_RAX_RCX, CX, CMOV, 2)
168 | P06P1_FUNC("CMOVZ RAX, RCX", CMOVZ_RAX_RCX, CX, CMOV, 1)
169 | P06P1_FUNC("CMOVNZ RAX, RCX", CMOVNZ_RAX_RCX, CX, CMOV, 1)
170 |
171 | P06P1_FUNC("CMOVBE RAX, RCX", CMOVBE_RAX_RCX, AX, CMOV, 2)
172 | P06P1_FUNC("CMOVNBE, RAX, RCX", CMOVNBE_RAX_RCX, AX, CMOV, 2)
173 | P06P1_FUNC("CMOVZ RAX, RCX", CMOVZ_RAX_RCX, AX, CMOV, 1)
174 | P06P1_FUNC("CMOVNZ RAX, RCX", CMOVNZ_RAX_RCX, AX, CMOV, 1)
175 |
176 | P06P1_FUNC("BT RAX RCX + ADC RAX,0", BT_RAX_RCX, AX, AMD64, 2)
177 | P06P1_FUNC("BTC RAX RCX", BTC_RAX_RCX, AX, AMD64, 1)
178 | P06P1_FUNC("BTR RAX, RCX", BTR_RAX_RCX, AX, AMD64, 1)
179 | P06P1_FUNC("BTS RAX, RCX", BTS_RAX_RCX, AX, AMD64, 1)
180 | P06P1_FUNC("BZHI RAX, RAX, RCX", BZHI_RAX_RAX_RCX, AX, BMI2, 1)
181 | P06P1_FUNC("BEXTR RAX, RAX, RCX", BEXTR_RAX_RAX_RCX, AX, BMI, 2)
182 |
183 | P06P1_FUNC("BSWAP RAX", BSWAP_RAX, AX, AMD64, 2)
184 | P06P1_FUNC("POPCNT RCX, RCX", POPCNT_RCX_RCX, CX, POPCNT, 1)
185 | P06P1_FUNC("LZCNT RCX, RCX", LZCNT_RCX_RCX, CX, ABM, 1)
186 | P06P1_FUNC("TZCNT RCX, RCX", TZCNT_RCX_RCX, CX, BMI, 1)
187 |
188 | P06P1_FUNC("BSR RCX, RCX", BSR_RCX_RCX, CX, AMD64, 1)
189 | P06P1_FUNC("BSF RCX, RCX", BSF_RCX_RCX, CX, AMD64, 1)
190 |
191 | P06P1_FUNC("CRC32 RCX, RCX", CRC32_RCX_RCX, CX, SSE42, 1)
192 | P06P1_FUNC("PDEP RAX, RAX, RCX", PDEP_RAX_RAX_RCX, CX, BMI2, 1)
193 | P06P1_FUNC("PEXT RAX, RAX, RCX", PEXT_RAX_RAX_RCX, CX, BMI2, 1)
194 | };
195 |
196 | void P0601_Time(measure_methods * m, int method, int testcase) {
197 | unsigned __int64 minres = ULONG_MAX;
198 |
199 | (m[method].func[testcase])();
200 | for (int retry = 0; retry < P06P1_RETRIES; retry++) {
201 | minres = min(minres, (m[method].func[testcase])());
202 | }
203 | cout << '\t' << setw(6) << right << (int)((double)minres / (double)P06P1_REPEATS);
204 | }
205 |
206 | void P0601(measure_methods * m, int instcount) {
207 | cout << "TSC CLKs:--------------------------------\t -1025\t -1024\t -513\t -512\t 511\t 512\t 1023\t 1024 (#uop)" << endl;
208 |
209 | for (int b = 0; b < instcount; b++) {
210 | cout << dec << setw(2) << b << ':';
211 | cout << left << setw(INSTNAMELEN) << m[b].inst << ' ';
212 | cout << left << setw(INITLEN) << m[b].init << ": ";
213 | for (int t = 0; t < TESTCASE; t++) {
214 | if ((cpu_props.IsFeat(m[b].feats)) && (m[b].func[t] != NULL))
215 | P0601_Time(m, b, t);
216 | }
217 | cout << " (" << m[b].uopscount << ')' << endl;
218 | }
219 | }
220 |
221 | void P0601_Test(void) {
222 | SetThread(args.GetThreadIndex(cpu_props));
223 | cout << "Affected Instructions:" << endl;
224 | P0601(P06P1_affected, sizeof(P06P1_affected) / sizeof(measure_methods));
225 |
226 | cout << "Unaffected Instructions:" << endl;
227 | P0601(P06P1_unaffected, sizeof(P06P1_unaffected) / sizeof(measure_methods));
228 | }
--------------------------------------------------------------------------------
/InstLatX64_Demo.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 | Source Files
23 |
24 |
25 | Source Files
26 |
27 |
28 | Source Files
29 |
30 |
31 | Source Files
32 |
33 |
34 | Source Files
35 |
36 |
37 | Source Files
38 |
39 |
40 | Source Files
41 |
42 |
43 | Source Files
44 |
45 |
46 | Source Files
47 |
48 |
49 | Source Files
50 |
51 |
52 | Source Files
53 |
54 |
55 | Source Files
56 |
57 |
58 | Source Files
59 |
60 |
61 | Source Files
62 |
63 |
64 | Source Files
65 |
66 |
67 | Source Files
68 |
69 |
70 | Source Files
71 |
72 |
73 | Source Files
74 |
75 |
76 | Source Files
77 |
78 |
79 | Source Files
80 |
81 |
82 | Source Files
83 |
84 |
85 | Source Files
86 |
87 |
88 | Source Files
89 |
90 |
91 | Source Files
92 |
93 |
94 | Source Files
95 |
96 |
97 | Source Files
98 |
99 |
100 | Source Files
101 |
102 |
103 |
104 |
105 | Header Files
106 |
107 |
108 | Header Files
109 |
110 |
111 | Header Files
112 |
113 |
114 | Header Files
115 |
116 |
117 | Header Files
118 |
119 |
120 | Header Files
121 |
122 |
123 | Header Files
124 |
125 |
126 | Header Files
127 |
128 |
129 | Header Files
130 |
131 |
132 | Header Files
133 |
134 |
135 | Header Files
136 |
137 |
138 | Header Files
139 |
140 |
141 | Header Files
142 |
143 |
144 | Header Files
145 |
146 |
147 | Header Files
148 |
149 |
150 | Header Files
151 |
152 |
153 | Header Files
154 |
155 |
156 | Header Files
157 |
158 |
159 | Header Files
160 |
161 |
162 | Header Files
163 |
164 |
165 | Header Files
166 |
167 |
168 | Header Files
169 |
170 |
171 | Header Files
172 |
173 |
174 | Header Files
175 |
176 |
177 | Header Files
178 |
179 |
180 | Header Files
181 |
182 |
183 | Header Files
184 |
185 |
186 | Header Files
187 |
188 |
189 | Header Files
190 |
191 |
192 | Header Files
193 |
194 |
195 | Header Files
196 |
197 |
198 | Header Files
199 |
200 |
201 | Header Files
202 |
203 |
204 | Header Files
205 |
206 |
207 | Header Files
208 |
209 |
210 | Header Files
211 |
212 |
213 | Header Files
214 |
215 |
216 | Header Files
217 |
218 |
219 | Header Files
220 |
221 |
222 |
223 |
224 | Source Files
225 |
226 |
227 | Source Files
228 |
229 |
230 | Source Files
231 |
232 |
233 | Source Files
234 |
235 |
236 | Source Files
237 |
238 |
239 | Source Files
240 |
241 |
242 | Source Files
243 |
244 |
245 | Source Files
246 |
247 |
248 | Source Files
249 |
250 |
251 | Source Files
252 |
253 |
254 | Source Files
255 |
256 |
257 | Source Files
258 |
259 |
260 | Source Files
261 |
262 |
263 | Source Files
264 |
265 |
266 | Source Files
267 |
268 |
269 |
--------------------------------------------------------------------------------
/Args.cpp:
--------------------------------------------------------------------------------
1 | #include "stdafx.h"
2 |
3 | const paramsType Args::params[] = {
4 | {false, "help", 'h', ARG_HELP, NULL, "this help"},
5 | {false, "help", '?', ARG_HELP, NULL, "this help"},
6 | {false, "version", 'v', ARG_VERSION, NULL, "version info"},
7 | {false, "list", 'l', ARG_DEMOLIST, NULL, "list of demo types"},
8 | {false, "cpu", 'c', ARG_CPUPROPS, NULL, "list of CPU properties"},
9 | {false, "pcore", '\0', ARG_PCORE, NULL, "using Performance core on hybrid CPU"},
10 | {false, "ecore", '\0', ARG_ECORE, NULL, "using Efficient core on hybrid CPU"},
11 | {false, "lcore", '\0', ARG_LPECORE, NULL, "using LP-E core on hybrid CPU"},
12 | {false, "dump", 'm', ARG_CPUIDDUMP, NULL, "native CPUID dump"},
13 | {false, "procmask", 'k', ARG_PROCMASK, NULL, "list of P/E/LPE procmasks"},
14 | #if defined (_M_X64) && defined(__AVX512F__)
15 | {false, "512bFMA", '5', ARG_512BFMADP, NULL, "print number of 512b FMA double precision ports"},
16 | #endif
17 | {true, "demo", 'd', ARG_DEMOTYPE, ARGERR_MISS_DEMO, "demo type"},
18 | {true, "thread", 't', ARG_THREADINDEX, ARGERR_MISS_THREAD, "thread index"},
19 | {true, "file", 'f', ARG_CPUIDFILE, ARGERR_MISS_CPUIDFILE, "CPUID from file"},
20 | {true, "xcr0", 'x', ARG_XCR0, ARGERR_MISS_XCR0, "forced XCR0 value in hex w/o 0x"},
21 | {true, "tscratio", '\0', ARG_TSCRATIO, ARGERR_MISS_TSCRATIO, "TSC result correction"},
22 | };
23 |
24 | void Args::SetError(char* errorPlace, char * tempStr, const char * errorMsg) {
25 | strcpy_s(errorPlace, STR_MAXLEN, errorMsg);
26 | strcat_s(errorPlace, STR_MAXLEN, tempStr);
27 | return;
28 | }
29 |
30 | void Args::SetParam(argType paramType, char * tempStr, char* errorPlace, int * errorCounter) {
31 | if (tempStr[0] == '\0') {
32 | if (params[paramType].arguments) {
33 | SetError(errorPlace, tempStr, params[paramType].missingErr);
34 | (*errorCounter)++;
35 | }
36 | } else {
37 | switch (paramType) {
38 | case ARG_HELP: {
39 | helpFlag = true;
40 | } break;
41 | case ARG_VERSION: {
42 | versionFlag = true;
43 | } break;
44 | case ARG_DEMOLIST: {
45 | listFlag = true;
46 | } break;
47 | case ARG_CPUPROPS: {
48 | cpuPropsFlag = true;
49 | } break;
50 | case ARG_PROCMASK: {
51 | procMaskFlag = true;
52 | } break;
53 | #if defined (_M_X64) && defined(__AVX512F__)
54 | case ARG_512BFMADP: {
55 | _512bFMA_DP_Flag = true;
56 | } break;
57 | #endif
58 | case ARG_DEMOTYPE: {
59 | uint32_t demo = 0;
60 | for (demo = 0; demo < demoCount; demo++)
61 | if ((_stricmp(demoList[demo].demoName, tempStr) == 0) ||
62 | (_stricmp(demoList[demo].alias, tempStr) == 0)){
63 | const uint64_t d_bit = 1ULL << (demoList[demo].demoMask & 0x3f);
64 | const uint64_t d_qword = min(demoList[demo].demoMask >> 6, MAX_DEMOMASK);
65 | demoMask[d_qword] |= d_bit;
66 | break;
67 | }
68 | if (demo == demoCount) {
69 | SetError(errorPlace, tempStr, ARGERR_INV_DEMO);
70 | (*errorCounter)++;
71 | }
72 | } break;
73 | case ARG_THREADINDEX: {
74 | char* endPtr = 0;
75 | threadIndex = strtol(tempStr, &endPtr, 10);
76 | }
77 | break;
78 | case ARG_PCORE: {
79 | threadIndex = DEFAULT_PCORE_INDEX;
80 | }
81 | break;
82 | case ARG_ECORE: {
83 | threadIndex = DEFAULT_ECORE_INDEX;
84 | }
85 | break;
86 | case ARG_LPECORE: {
87 | threadIndex = DEFAULT_LPECORE_INDEX;
88 | }
89 | break;
90 | case ARG_CPUIDDUMP: {
91 | dumpFlag = true;
92 | } break;
93 | case ARG_CPUIDFILE: {
94 | cpuidFileFlag = true;
95 | cpuidFileName = tempStr;
96 | } break;
97 | case ARG_XCR0: {
98 | char* endPtr = 0;
99 | xcr0 = strtol(tempStr, &endPtr, 16);
100 | if ((xcr0 & (_XCR0_X87 | _XCR0_AVX | _XCR0_AVX512 | _XCR0_AMX | _XCR0_APX)) != xcr0) {
101 | SetError(errorPlace, tempStr, ARGERR_INV_XCR0);
102 | (*errorCounter)++;
103 | }
104 | }
105 | break;
106 | case ARG_TSCRATIO: {
107 | char* endPtr = 0;
108 | tscRatio = strtod(tempStr, &endPtr);
109 | if ((tscRatio <= 0.0) || (tscRatio > MAX_TSCRATIO)) {
110 | SetError(errorPlace, tempStr, ARGERR_INV_TSCRATIO);
111 | (*errorCounter)++;
112 | }
113 | }
114 | break;
115 | case ARG_NOTHING: {
116 | } break;
117 | default: {
118 | } break;
119 | }
120 | }
121 | return;
122 | }
123 |
124 | void Args::PrintUsage(void) const {
125 | printf("\r\nUsage: %s [switches]", DEMO_FILENAME);
126 | printf("\r\nExample: %s --demo=GFNI -d=VBMI2 --help --version -c", DEMO_FILENAME);
127 | printf("\r\nSwitches:");
128 | for (unsigned int comm = 0; comm < sizeof(params) / sizeof(paramsType); comm++)
129 | if (params[comm].shortName != '\0')
130 | printf("\r\n\t[-%c|--%-16s] %s", params[comm].shortName, params[comm].longName, params[comm].description);
131 | else
132 | printf("\r\n\t [--%-16s] %s", params[comm].longName, params[comm].description);
133 | printf("\r\n");
134 | }
135 |
136 | void Args::PrintVersion(void) const {
137 | std::cout << "Build date:" << __DATE__ << " Time:" << __TIME__ << std::endl;
138 | };
139 |
140 | bool Args::IsVersion(void) const{
141 | return versionFlag;
142 | };
143 |
144 | bool Args::IsHelp(void) const {
145 | return helpFlag;
146 | };
147 |
148 | bool Args::IsDemoList(void) const {
149 | return listFlag;
150 | };
151 |
152 | bool Args::IsCPUProps(void) const {
153 | return cpuPropsFlag;
154 | };
155 |
156 | bool Args::IsProcMask(void) const {
157 | return procMaskFlag;
158 | };
159 |
160 | #if defined (_M_X64) && defined(__AVX512F__)
161 | bool Args::Is_512bFMA_DP_Ports(void) const {
162 | return _512bFMA_DP_Flag;
163 | }
164 | #endif
165 |
166 | bool Args::IsCPUIDDump(void) const {
167 | return dumpFlag;
168 | };
169 |
170 | bool Args::IsCPUIDFile(void) const {
171 | return cpuidFileFlag;
172 | };
173 |
174 | size_t Args::GetMaxDemo(void) const {
175 | return DEMO_LAST;
176 | };
177 |
178 | size_t Args::GetThreadIndex(CPU_Props c) const {
179 | switch (threadIndex) {
180 | case DEFAULT_PCORE_INDEX: return c.GetPCoreIndex();
181 | case DEFAULT_ECORE_INDEX: return c.GetECoreIndex();
182 | case DEFAULT_LPECORE_INDEX: return c.GetLPECoreIndex();
183 | default:
184 | return threadIndex;
185 | }
186 | };
187 |
188 | char* Args::GetCPUIDFileName() const {
189 | return cpuidFileName;
190 | };
191 |
192 | bool Args::IsValid(void) const {
193 | return validFlag;
194 | };
195 |
196 | UINT64 Args::GetXCR0() const {
197 | return xcr0;
198 | };
199 |
200 | double Args::GetTSCRatio() const {
201 | return tscRatio;
202 | };
203 |
204 | bool Args::IsSelected(size_t i) const {
205 | return ((demoMask[i >> 6] & (1ULL << (i & 0x3f))) != 0);
206 | };
207 |
208 | Args::Args(const demoTypeList* demos, size_t size, int argc, char** argv) :
209 | demoList(demos), demoCount(size), paramCount(sizeof(params) / sizeof(paramsType)),
210 | versionFlag(0), helpFlag(0), listFlag(0), cpuPropsFlag(0), procMaskFlag(0),
211 | #if defined (_M_X64) && defined(__AVX512F__)
212 | _512bFMA_DP_Flag(0),
213 | #endif
214 | errorFlag(0), dumpFlag(0), cpuidFileFlag(0), paramType(ARG_NOTHING), threadIndex(0), cpuidFileName(0), xcr0(0), tscRatio(1.0) {
215 | validFlag = Init(argc, argv);
216 | };
217 |
218 | bool Args::Init(int argc, char** argv) {
219 | char errorStr[MAX_ARGERROR][STR_MAXLEN];
220 | memset(errorStr, 0, MAX_ARGERROR * STR_MAXLEN);
221 | int errorCounter = 0;
222 | for (int32_t a = 1; a < argc; a++) {
223 | bool handledFlag = false;
224 | switch (argv[a][0]) {
225 | case '/':
226 | case '-':
227 | switch (argv[a][1]) {
228 | case '/':
229 | case '-': {
230 | for (unsigned int p = 0; p < paramCount; p++) {
231 | if (params[p].arguments) {
232 | char* equPos = strchr(&(argv[a][2]), '=');
233 | if (equPos != 0) {
234 | const size_t paramSize = equPos - &(argv[a][2]);
235 | if (_strnicmp(params[p].longName, &(argv[a][2]), paramSize) == 0) {
236 | SetParam(params[p].type, equPos + 1, errorStr[errorCounter], &errorCounter);
237 | handledFlag = true;
238 | break;
239 | }
240 | } else {
241 | SetError(errorStr[errorCounter], &(argv[a][2]), params[p].missingErr);
242 | handledFlag = true;
243 | errorCounter++;
244 | }
245 | } else {
246 | if (_stricmp(params[p].longName, &(argv[a][2])) == 0) {
247 | SetParam(params[p].type, &(argv[a][3]), errorStr[errorCounter], &errorCounter);
248 | handledFlag = true;
249 | break;
250 | }
251 | }
252 | }
253 | } break;
254 | case '\0':
255 | case ' ':
256 | case '=': {
257 | SetError(errorStr[errorCounter], &(argv[a][0]), ARGERR_MISS_ARG);
258 | handledFlag = true;
259 | errorCounter++;
260 | } break;
261 | default: {
262 | bool findEqu = (argv[a][2] == '=');
263 | for (size_t p = 0; p < paramCount; p++) {
264 | char* find = strchr(&argv[a][1], params[p].shortName);
265 | if (!params[p].arguments && (find != 0) && (!findEqu)) {
266 | handledFlag = true;
267 | SetParam(params[p].type, find, errorStr[errorCounter], &errorCounter);
268 | } else if (params[p].arguments && (params[p].shortName == argv[a][1])) {
269 | if (findEqu) {
270 | handledFlag = true;
271 | SetParam(params[p].type, &(argv[a][3]), errorStr[errorCounter], &errorCounter);
272 | break;
273 | } else {
274 | handledFlag = true;
275 | SetError(errorStr[errorCounter], &(argv[a][2]), params[p].missingErr);
276 | errorCounter++;
277 | break;
278 | }
279 | }
280 | }
281 | } break;
282 | }
283 | break;
284 | default:
285 | break;
286 | }
287 | if (!handledFlag) {
288 | SetError(errorStr[errorCounter], &(argv[a][0]), ARGERR_INV_SWITCH);
289 | errorCounter++;
290 | }
291 | }
292 | if (errorCounter > 0) {
293 | for (int errs = 0; errs < errorCounter; errs++)
294 | printf_s("\r\n%s", errorStr[errs]);
295 | printf_s("\r\n");
296 | return false;
297 | }
298 | return true;
299 | }
--------------------------------------------------------------------------------