├── .github
└── workflows
│ └── linux.yaml
├── .gitignore
├── AsmGen
├── AsmGen.csproj
├── AsmGen.sln
├── DataFiles
│ ├── BranchhistTestBlock.c
│ ├── CommonFunctions.c
│ ├── GccBranchHistFunction.c
│ ├── GccIndirectBranchFunction.c
│ ├── IndirectBranchTestBlock.c
│ └── clammicrobench.vcxproj_template
├── IUarchTest.cs
├── Program.cs
├── Properties
│ └── launchSettings.json
├── README.md
├── UarchTest.cs
├── UarchTestHelpers.cs
└── tests
│ ├── A73RobTest.cs
│ ├── AddLoopTest.cs
│ ├── AddNsq.cs
│ ├── AddSchedTest.cs
│ ├── AddvNsq.cs
│ ├── AddvSchedTest.cs
│ ├── AeseSchedTest.cs
│ ├── AesencNsq.cs
│ ├── BranchBufferTest.cs
│ ├── BranchHistoryTest.cs
│ ├── BtbTest.cs
│ ├── CvtSchedTest.cs
│ ├── FAdd256RfTest.cs
│ ├── Fadd128RfTest.cs
│ ├── Fadd128SchedTest.cs
│ ├── Fadd256SchedTest.cs
│ ├── FaddNsq.cs
│ ├── FaddSchedTest.cs
│ ├── FcmpSchedTest.cs
│ ├── FlagRfTest.cs
│ ├── Fma256SchedTest.cs
│ ├── FmovSched.cs
│ ├── FmulSchedTest.cs
│ ├── FpRfTest.cs
│ ├── FpStoreDataNsq.cs
│ ├── IdrfTest.cs
│ ├── IndirectBranchTest.cs
│ ├── IntRfDepStoreTest.cs
│ ├── IntRfTest.cs
│ ├── JsCvtNsq.cs
│ ├── JsCvtSched.cs
│ ├── JumpNsqTest.cs
│ ├── JumpSchedTest.cs
│ ├── LdqTest.cs
│ ├── LeaSchedTest.cs
│ ├── LoadNsq.cs
│ ├── LoadSchedTest.cs
│ ├── MaddSchedTest.cs
│ ├── MaskRfTest.cs
│ ├── MixAddJumpSched.cs
│ ├── MixAddvJsCvtNsq.cs
│ ├── MixAddvJsCvtSched.cs
│ ├── MixBranchStoreTest.cs
│ ├── MixFAdd256and32RfTest.cs
│ ├── MixFpRfDepBranchTest.cs
│ ├── MixFpVecRfTest.cs
│ ├── MixIntRfDepBranchTest.cs
│ ├── MixIntVec128RfTest.cs
│ ├── MixIntrfFprfTest.cs
│ ├── MixJumpStoreDataSched.cs
│ ├── MixJumpStoreSchedTest.cs
│ ├── MixJumpThenAddSched.cs
│ ├── MixLdqStqTest.cs
│ ├── MixLoadStoreDivSchedTest.cs
│ ├── MixLoadStoreSchedTest.cs
│ ├── MixStoreDivSchedTest.cs
│ ├── MixVec512Vec256BlockRfTest.cs
│ ├── MixVec512Vec256RfTest.cs
│ ├── MmxRfTest.cs
│ ├── MulSchedTest.cs
│ ├── NopLoopTest.cs
│ ├── PdepSchedTest.cs
│ ├── ReturnStackTest.cs
│ ├── RobTest.cs
│ ├── RorSchedTest.cs
│ ├── ShlSchedTest.cs
│ ├── StoreDataDivNsqTest.cs
│ ├── StoreDataNsqTest.cs
│ ├── StoreDataSchedTest.cs
│ ├── StoreDivNsqTest.cs
│ ├── StoreDivSchedTest.cs
│ ├── StoreNsq.cs
│ ├── StoreSchedTest.cs
│ ├── Stq128Test.cs
│ ├── Stq512Test.cs
│ ├── StqTest.cs
│ ├── TakenBranchBufferTest.cs
│ ├── TakenJumpSchedTest.cs
│ ├── Vec512RfTest.cs
│ ├── VecMulNsq.cs
│ └── ZeroRobTest.cs
├── CoherencyLatency
├── CoherencyLatency.cpp
├── CoherencyLatency.sln
├── CoherencyLatency.vcxproj
├── Makefile
├── PThreadsCoherencyLatency.c
└── c2cparse
│ ├── Program.cs
│ ├── c2cparse.csproj
│ └── c2cparse.sln
├── Common
├── arch_detect.mk
├── ci_gpumemlatency.sh
├── ci_package.sh
├── perfmon.h
├── timing.c
└── timing.h
├── CoreClockChecker
├── BoostClockChecker.c
├── BoostClockChecker_arm.s
├── BoostClockChecker_x86.s
├── CoreClockChecker.c
├── CoreClockChecker_x86.s
├── Makefile
└── WinCoreClockChecker
│ ├── CoreClockCheckFunctions.asm
│ ├── WinCoreClockChecker.cpp
│ ├── WinCoreClockChecker.sln
│ ├── WinCoreClockChecker.vcxproj
│ └── WinCoreClockChecker.vcxproj.filters
├── GpuMemLatency
├── Makefile
├── OpenCL
│ ├── LICENSE
│ ├── README.md
│ ├── include
│ │ └── CL
│ │ │ ├── cl.h
│ │ │ ├── cl_d3d10.h
│ │ │ ├── cl_d3d11.h
│ │ │ ├── cl_dx9_media_sharing.h
│ │ │ ├── cl_dx9_media_sharing_intel.h
│ │ │ ├── cl_egl.h
│ │ │ ├── cl_ext.h
│ │ │ ├── cl_ext_intel.h
│ │ │ ├── cl_gl.h
│ │ │ ├── cl_gl_ext.h
│ │ │ ├── cl_half.h
│ │ │ ├── cl_icd.h
│ │ │ ├── cl_platform.h
│ │ │ ├── cl_va_api_media_sharing_intel.h
│ │ │ ├── cl_version.h
│ │ │ └── opencl.h
│ └── lib
│ │ └── OpenCL.lib
├── atomic_test.c
├── bw_test.c
├── common.c
├── instruction_rate.c
├── instruction_rate_fp16_kernel.cl
├── instruction_rate_fp64_kernel.cl
├── instruction_rate_kernel.cl
├── kernel.cl
├── kernels
│ ├── atomic_exec_latency_test.cl
│ ├── buffer_bw_test.cl
│ ├── c2c_atomic_exec_latency_test.cl
│ ├── constant_unrolled_latency_test.cl
│ ├── ldst_bw_test.cl
│ ├── local_64_bw_test.cl
│ ├── local_atomic_latency_test.cl
│ ├── local_bw_test.cl
│ ├── local_float4_bw_test.cl
│ ├── local_unrolled_latency_test.cl
│ ├── scalar_unrolled_latency_test.cl
│ ├── sum_bw_test.cl
│ ├── tex_bw_test.cl
│ ├── tex_latency_test.cl
│ └── unrolled_latency_test.cl
├── latency_test.c
├── local_mem_latency_kernel.cl
├── opencltest.c
├── opencltest.h
├── opencltest.sln
├── opencltest.vcxproj
├── opencltest.vcxproj.filters
└── texturetest.c
├── InstructionRate
├── Makefile
├── arm_instructionrate.c
├── arm_instructionrate.s
├── riscv_instructionrate.c
├── riscv_instructionrate.s
├── test.s
├── x86_fusion.c
├── x86_fusion.s
├── x86_instructionrate.c
└── x86_instructionrate.s
├── LICENSE
├── LoadedMemoryLatency
├── LoadedMemoryLatency.c
├── LoadedMemoryLatency
│ ├── LoadedMemoryLatency.asm
│ ├── LoadedMemoryLatency.cpp
│ ├── LoadedMemoryLatency.sln
│ ├── LoadedMemoryLatency.vcxproj
│ └── LoadedMemoryLatency.vcxproj.filters
├── LoadedMemoryLatency_amd64.s
├── LoadedMemoryLatency_arm.s
└── Makefile
├── Makefile
├── MemoryBandwidth
├── Makefile
├── MemoryBandwidth.c
├── MemoryBandwidth
│ ├── MemoryBandwidth.cpp
│ ├── MemoryBandwidth.sln
│ ├── MemoryBandwidth.vcxproj
│ ├── MemoryBandwidth.vcxproj.filters
│ ├── MemoryBandwidthFunctions.asm
│ └── MemoryBandwidthFunctions32.asm
├── MemoryBandwidth_arm.s
├── MemoryBandwidth_riscv.s
├── MemoryBandwidth_x86.s
├── MixedMemoryBandwidthTest
│ ├── MemoryBandwidth.h
│ ├── MemoryBandwidthFunctions.asm
│ ├── MixedMemoryBandwidthTest.cpp
│ ├── MixedMemoryBandwidthTest.vcxproj
│ └── MixedMemoryBandwidthTest.vcxproj.filters
└── README.md
├── MemoryLatency
├── Makefile
├── MemoryLatency.c
├── MemoryLatency.cpp
├── MemoryLatency.sln
├── MemoryLatency.vcxproj
├── MemoryLatencyFunctions.asm
├── MemoryLatency_arm.s
├── MemoryLatency_i686.s
├── MemoryLatency_riscv.s
├── MemoryLatency_x86.s
└── README.md
├── README.md
├── mt_instructionrate
├── InstructionRateFunctions.asm
├── Makefile
├── Project1.vcxproj
├── Project1.vcxproj.filters
├── arm_mt_instructionrate.c
├── arm_mt_instructionrate.s
├── mt_instructionrate.c
├── mt_instructionrate.sln
├── ppc64_mt_instructionrate.c
├── ppc64_mt_instructionrate.s
├── x86_mt_instructionrate
├── x86_mt_instructionrate.c
└── x86_mt_instructionrate.s
└── svm
├── OpenCL
├── include
│ └── CL
│ │ ├── Utils
│ │ ├── Context.h
│ │ ├── Context.hpp
│ │ ├── Detail.hpp
│ │ ├── Device.hpp
│ │ ├── Error.h
│ │ ├── Error.hpp
│ │ ├── ErrorCodes.h
│ │ ├── Event.h
│ │ ├── Event.hpp
│ │ ├── File.h
│ │ ├── File.hpp
│ │ ├── InteropContext.hpp
│ │ ├── OpenCLUtilsCpp_Export.h
│ │ ├── OpenCLUtils_Export.h
│ │ ├── Platform.hpp
│ │ ├── Utils.h
│ │ └── Utils.hpp
│ │ ├── cl.h
│ │ ├── cl2.hpp
│ │ ├── cl_d3d10.h
│ │ ├── cl_d3d11.h
│ │ ├── cl_dx9_media_sharing.h
│ │ ├── cl_dx9_media_sharing_intel.h
│ │ ├── cl_egl.h
│ │ ├── cl_ext.h
│ │ ├── cl_ext_intel.h
│ │ ├── cl_function_types.h
│ │ ├── cl_gl.h
│ │ ├── cl_gl_ext.h
│ │ ├── cl_half.h
│ │ ├── cl_icd.h
│ │ ├── cl_layer.h
│ │ ├── cl_platform.h
│ │ ├── cl_va_api_media_sharing_intel.h
│ │ ├── cl_version.h
│ │ ├── opencl.h
│ │ └── opencl.hpp
├── lib
│ ├── OpenCL.lib
│ ├── OpenCLExt.lib
│ ├── OpenCLUtils.lib
│ ├── OpenCLUtilsCpp.lib
│ ├── OpenCLUtilsCppd.lib
│ ├── OpenCLUtilsd.lib
│ └── pkgconfig
│ │ └── OpenCL.pc
└── share
│ ├── cmake
│ ├── OpenCL
│ │ ├── OpenCLConfig.cmake
│ │ └── OpenCLConfigVersion.cmake
│ ├── OpenCLExtensionLoader
│ │ ├── OpenCLExtensionLoaderConfig.cmake
│ │ ├── OpenCLExtensionLoaderConfigVersion.cmake
│ │ ├── OpenCLExtensionLoaderTargets-debug.cmake
│ │ ├── OpenCLExtensionLoaderTargets-release.cmake
│ │ └── OpenCLExtensionLoaderTargets.cmake
│ ├── OpenCLHeaders
│ │ ├── OpenCLHeadersConfig.cmake
│ │ ├── OpenCLHeadersConfigVersion.cmake
│ │ └── OpenCLHeadersTargets.cmake
│ ├── OpenCLHeadersCpp
│ │ ├── OpenCLHeadersCppConfig.cmake
│ │ ├── OpenCLHeadersCppConfigVersion.cmake
│ │ └── OpenCLHeadersCppTargets.cmake
│ ├── OpenCLICDLoader
│ │ ├── OpenCLICDLoaderConfig.cmake
│ │ ├── OpenCLICDLoaderConfigVersion.cmake
│ │ ├── OpenCLICDLoaderTargets-debug.cmake
│ │ ├── OpenCLICDLoaderTargets-release.cmake
│ │ └── OpenCLICDLoaderTargets.cmake
│ ├── OpenCLUtils
│ │ ├── OpenCLUtilsConfig.cmake
│ │ ├── OpenCLUtilsConfigVersion.cmake
│ │ ├── OpenCLUtilsTargets-debug.cmake
│ │ ├── OpenCLUtilsTargets-release.cmake
│ │ └── OpenCLUtilsTargets.cmake
│ └── OpenCLUtilsCpp
│ │ ├── OpenCLUtilsCppConfig.cmake
│ │ ├── OpenCLUtilsCppConfigVersion.cmake
│ │ ├── OpenCLUtilsCppTargets-debug.cmake
│ │ ├── OpenCLUtilsCppTargets-release.cmake
│ │ └── OpenCLUtilsCppTargets.cmake
│ ├── man
│ └── man1
│ │ └── clinfo.1.gz
│ └── pkgconfig
│ ├── OpenCL-CLHPP.pc
│ └── OpenCL-Headers.pc
├── atomic_latency_kernel.cl
├── svm.sln
├── svm.vcxproj
├── svm.vcxproj.filters
└── svmtest.cpp
/.github/workflows/linux.yaml:
--------------------------------------------------------------------------------
name: Build Benchmarks on Ubuntu
on: [push]
jobs:
  BuildBenchmarks:
    # Only Ubuntu for now.
    runs-on: ubuntu-latest
    steps:
      # Native toolchain, arm64 and riscv64 cross compilers, OpenCL and NUMA
      # development files, plus the tools used by the packaging steps.
      - name: Install prerequisites
        run: sudo apt update && sudo apt -qq --assume-yes full-upgrade && sudo apt install -qq -y build-essential crossbuild-essential-arm64 gcc-riscv64-linux-gnu ocl-icd-opencl-dev opencl-headers libnuma-dev b3sum unzip
      # mingw-w64 is installed through the Homebrew copy that ships on the runner image.
      - name: Wild tomfoolery attempt
        run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && brew install mingw-w64
      # checkout@v3 runs on the deprecated Node 16 runtime; v4 is the drop-in replacement.
      - name: Check out repository code
        uses: actions/checkout@v4
      # The brew environment is re-evaluated because each step starts a fresh shell.
      - name: Build all benchmarks
        run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && make ci
      - name: Package benchmarks
        run: make package
      - name: b3sum
        run: b3sum clammarks.txz
      # - name: Upload package
      #   env:
      #     UPLOAD_KEY: ${{ secrets.UPLOAD_KEY }}
      #     UPLOAD_URL: ${{ secrets.UPLOAD_URL }}
      #   run: curl -X PUT -T clammarks.txz -H "$UPLOAD_KEY" "$UPLOAD_URL"
25 |
--------------------------------------------------------------------------------
/AsmGen/AsmGen.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | net8.0
6 | false
7 | x64
8 | AnyCPU;x64
9 |
10 |
11 |
12 |
13 | Always
14 |
15 |
16 | Always
17 |
18 |
19 | Always
20 |
21 |
22 | Always
23 |
24 |
25 | Always
26 |
27 |
28 | Always
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/AsmGen/AsmGen.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.2.32516.85
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AsmGen", "AsmGen.csproj", "{B8930E86-946C-4831-B088-F571E73EEDC4}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Debug|x64 = Debug|x64
12 | Release|Any CPU = Release|Any CPU
13 | Release|x64 = Release|x64
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
17 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.Build.0 = Debug|Any CPU
18 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.ActiveCfg = Debug|x64
19 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.Build.0 = Debug|x64
20 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.ActiveCfg = Release|Any CPU
21 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.Build.0 = Release|Any CPU
22 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.ActiveCfg = Release|x64
23 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.Build.0 = Release|x64
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {4433D029-CD62-44B9-862E-A8DE52DA45CE}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/AsmGen/DataFiles/BranchhistTestBlock.c:
--------------------------------------------------------------------------------
1 | uint32_t testSizeCount = sizeof(branchHistoryLengths) / sizeof(int);
2 | initializeBranchHistFuncArr();
3 | srand(time(NULL));
4 |
5 | size_t resultSize = sizeof(float) * maxBranchCount * testSizeCount;
6 | float* randomResults = (float*)malloc(resultSize);
7 | float* predictableResults = (float*)malloc(resultSize);
8 | for (uint32_t branchCountIdx = 0; branchCountIdx < maxBranchCount; branchCountIdx++) {
9 | for (uint32_t testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) {
10 | uint32_t testSize = branchHistoryLengths[testSizeIdx];
11 | uint32_t branchCount = branchCounts[branchCountIdx];
12 | printf("Testing branch count %d history length %d\n", branchCount, testSize);
13 | randomResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 1);
14 | predictableResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 0);
15 | printf("%d, %f, %f\n", testSize,
16 | randomResults[branchCountIdx * testSizeCount + testSizeIdx],
17 | predictableResults[branchCountIdx * testSizeCount + testSizeIdx]);
18 | }
19 | }
20 |
21 | printf("Random:\n");
22 | printResultFloatArr(randomResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);
23 | printf("\nPredictable:\n");
24 | printResultFloatArr(predictableResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);
25 |
26 | free(randomResults);
27 | free(predictableResults);
28 |
--------------------------------------------------------------------------------
/AsmGen/DataFiles/CommonFunctions.c:
--------------------------------------------------------------------------------
1 | // this is a partial C file that's appended into generated code
2 | // stuff here is generic enough to work for both windows/vs and gcc
3 |
4 | #ifndef __MINGW32__
5 | // optional affinity setting for effed up qualcomm/android bs
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
// Builds a CPU mask that contains only `core` and reports the request on stdout.
// NOTE: the sched_setaffinity() call below is commented out, so the mask is
// built but never applied to the calling thread.
void setAffinity(int core) {
    cpu_set_t targetMask;

    printf("Set affinity to core %d\n", core);
    CPU_ZERO(&targetMask);
    CPU_SET(core, &targetMask);
    // sched_setaffinity(gettid(), sizeof(cpu_set_t), &targetMask);
}
19 | #endif
20 |
// Bundle of test arguments. The AsmGen tests name structIterations, A and
// fpArr as the call parameters of the generated test functions
// ("uint64_t iterations, int *arr, float *floatArr").
// NOTE(review): presumably one instance is handed to each launched test thread - confirm.
struct ThreadData {
    int* A;                     // passed as the `int *arr` argument
    int* B;                     // second integer array; its use is not visible in this file
    float* fpArr;               // passed as the `float *floatArr` argument
    uint32_t list_size;         // presumably the element count of the arrays above - TODO confirm
    uint64_t structIterations;  // passed as the `uint64_t iterations` argument
};
28 |
// Prints the CSV header row: a literal "x" followed by every entry of xCounts,
// each preceded by ", ", then a newline.
// xCounts: column labels
// xLen: number of entries in xCounts
void printCsvHeader(uint32_t* xCounts, uint32_t xLen) {
    printf("x");
    for (uint32_t col = 0; col < xLen; col++) {
        // %u rather than %d: the labels are unsigned, %d would print large values as negative
        printf(", %u", xCounts[col]);
    }

    printf("\n");
}
37 |
// print results in format that excel can take
// arr: row-major results, yLen rows of xLen values each
// xCounts / xLen: column labels, printed as the header row
// yCounts / yLen: row labels, printed as the first cell of every row
void printResultFloatArr(float* arr, uint32_t *xCounts, uint32_t xLen, uint32_t *yCounts, uint32_t yLen) {
    printCsvHeader(xCounts, xLen);
    for (uint32_t row = 0; row < yLen; row++) {
        // row header; %u because the labels are unsigned
        printf("%u", yCounts[row]);
        for (uint32_t col = 0; col < xLen; col++) {
            printf(",%f", arr[row * xLen + col]);
        }

        printf("\n");
    }
}
52 |
// Fills pattern_arr with a randomized index chain: every used slot holds the
// index of another used slot.
// pattern_arr: array of at least list_size uint32_t elements
// list_size: number of uint32_t elements available in pattern_arr
// byte_increment: distance in bytes between used slots; slots in between are
//                 left untouched. A value below sizeof(uint32_t) is rejected
//                 and leaves the array unmodified.
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {
    uint32_t increment = byte_increment / sizeof(uint32_t);
    if (increment == 0) {
        // A stride smaller than one element would make the division below trap.
        return;
    }

    uint32_t element_count = list_size / increment;
    for (uint32_t i = 0; i < element_count; i++) {
        pattern_arr[i * increment] = i * increment;
    }

    // Shuffle from the last used slot downwards. The swap partner j is always
    // a lower slot than iter, so the entries end up forming one cycle through
    // every used slot.
    uint32_t iter = element_count;
    while (iter > 1) {
        iter -= 1;
        uint32_t j = (iter - 1 == 0) ? 0 : (uint32_t)rand() % (iter - 1);
        uint32_t tmp = pattern_arr[iter * increment];
        pattern_arr[iter * increment] = pattern_arr[j * increment];
        pattern_arr[j * increment] = tmp;
    }
}
--------------------------------------------------------------------------------
/AsmGen/DataFiles/GccBranchHistFunction.c:
--------------------------------------------------------------------------------
1 | // this is a partial C file that's appended into generated code
2 |
3 | // Run a test, return the result in time (ns) per branch
4 | // historyLen: length of random array that the test loops through
5 | // branchCountIdx: index into array of branch counts, max determined by generated header/asm
6 | // random: if 1, randomize test array contents. If 0, fill with zeroes
7 | float runBranchHistTest(uint32_t historyLen, uint32_t branchCountIdx, int random) {
8 | struct timeval startTv, endTv;
9 | struct timezone startTz, endTz;
10 | uint32_t branchCount = branchCounts[branchCountIdx];
11 | uint64_t iterations = 320000000 / branchCount;
12 | uint64_t(*branchtestFunc)(uint64_t, uint32_t**, uint32_t) __attribute((sysv_abi)) = branchtestFuncArr[branchCountIdx];
13 | float onesCount = 0.0f;
14 |
15 | uint32_t** testArrToArr = (uint32_t**)malloc(sizeof(uint32_t*) * branchCount);
16 | for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {
17 | uint32_t* testArr = (uint32_t*)malloc(sizeof(uint32_t) * historyLen);
18 | for (uint32_t i = 0; i < historyLen; i++) {
19 | testArr[i] = random ? rand() % 2 : 0;
20 | if (testArr[i] > 0)
21 | {
22 | onesCount += 1.0f;
23 | }
24 | }
25 | testArrToArr[testArrIdx] = testArr;
26 | }
27 |
28 | fprintf(stderr, "Starting test, should have %0.2f percent ones\n", onesCount / ((float)historyLen * branchCount));
29 | gettimeofday(&startTv, &startTz);
30 | uint64_t takenBranchCount = branchtestFunc(iterations, testArrToArr, historyLen);
31 | gettimeofday(&endTv, &endTz);
32 | uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
33 | float latency = 1e6 * (float)time_diff_ms / (float)iterations;
34 |
35 | // give result in latency per branch
36 | latency = latency / branchCount;
37 | fprintf(stderr, "History length %u, branch count %u: %0.2f percent not-taken\n", historyLen, branchCount, 100 * (float)takenBranchCount / ((float)iterations * branchCount));
38 |
39 | for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) free(testArrToArr[testArrIdx]);
40 | free(testArrToArr);
41 | return latency;
42 | }
43 |
--------------------------------------------------------------------------------
/AsmGen/DataFiles/IndirectBranchTestBlock.c:
--------------------------------------------------------------------------------
1 | // generated code will have:
2 | // - indirectBranchTargetCounts = array containing # of targets per branch
3 | // - indirectBranchCounts = array containing # of branches to test
4 | // - maxIndirectBranchCount = length of ^^
5 | // - initializeIndirectBranchFuncArr = populates
6 |
7 | uint32_t testSizeCount = sizeof(indirectBranchTargetCounts) / sizeof(int);
8 | initializeIndirectBranchFuncArr();
9 | srand(time(NULL));
10 |
11 | size_t resultSize = sizeof(float) * maxIndirectBranchCount * testSizeCount;
12 | float* results = (float*)malloc(resultSize);
13 | float* refResults = (float*)malloc(resultSize);
14 | for (uint32_t branchCountIdx = 0; branchCountIdx < maxIndirectBranchCount; branchCountIdx++) {
15 | for (uint32_t targetCountIdx = 0; targetCountIdx < testSizeCount; targetCountIdx++) {
16 | uint32_t testSize = indirectBranchTargetCounts[targetCountIdx];
17 | uint32_t branchCount = indirectBranchCounts[branchCountIdx];
18 | printf("Testing branch count %d target count %d:", branchCount, testSize);
19 | results[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 0);
20 | refResults[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 2);
21 | printf("%f ns, reference %f ns\n",
22 | results[branchCountIdx * testSizeCount + targetCountIdx],
23 | refResults[branchCountIdx * testSizeCount + targetCountIdx]);
24 | }
25 | }
26 |
27 | printf("Indirect branch results:\n");
28 | printResultFloatArr(results, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);
29 | printf("Reference indirect branch results:\n");
30 | printResultFloatArr(refResults, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);
31 |
32 | free(results);
33 | free(refResults);
34 |
--------------------------------------------------------------------------------
/AsmGen/IUarchTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Contract for a test that AsmGen can emit: enough to generate global
    /// lines, function calls, and let the user pick from tests.
    /// </summary>
    public interface IUarchTest
    {
        /// <summary>String constant "ThreadLaunch_"; by its name, the prefix of thread launch functions.</summary>
        const string ThreadLaunchFunctionPrefix = "ThreadLaunch_";

        /// <summary>Short identifier of the test.</summary>
        string Prefix { get; }

        /// <summary>Human-readable description of the test.</summary>
        string Description { get; }

        /// <summary>NOTE(review): presumably selects per-count normalization of the measured time - confirm.</summary>
        bool DivideTimeByCount { get; }

        /// <summary>Reports whether the test can be generated for <paramref name="isa"/>.</summary>
        bool SupportsIsa(ISA isa);

        /// <summary>Appends the test's assembly for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        void GenerateAsm(StringBuilder sb, ISA isa);

        /// <summary>Appends the test block for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        void GenerateTestBlock(StringBuilder sb, ISA isa);

        /// <summary>Appends the assembly global lines to <paramref name="sb"/>.</summary>
        void GenerateAsmGlobalLines(StringBuilder sb);

        /// <summary>Appends the extern lines to <paramref name="sb"/>.</summary>
        void GenerateExternLines(StringBuilder sb);

        /// <summary>Instruction set architectures a test can target.</summary>
        enum ISA
        {
            amd64,   // 64-bit x86
            aarch64, // 64-bit arm
            mips64,  // 64-bit MIPS, for loongson
            riscv,   // 64-bit risc-v
        }
    }
}
28 |
--------------------------------------------------------------------------------
/AsmGen/Properties/launchSettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "profiles": {
3 | "AsmGen": {
4 | "commandName": "Project",
5 | "commandLineArgs": "autocopy"
6 | }
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/AsmGen/tests/A73RobTest.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Text;
3 |
namespace AsmGen
{
    /// <summary>
    /// Looking for reordering capacity limits on A73 by combining several different instruction types
    /// </summary>
    public class A73RobTest : UarchTest
    {
        /// <summary>
        /// Sets up the test for the counts produced by
        /// <c>UarchTestHelpers.GenerateCountArray(low, high, step)</c>.
        /// </summary>
        public A73RobTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "a73rob";
            this.Description = "Mixed integer/vec128 + stores";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Only aarch64 is supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>
        /// Appends the aarch64 test functions to <paramref name="sb"/>; does nothing for other ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                string postLoadInstrs = UarchTestHelpers.GetArmDependentBranch(this.Prefix);
                string initInstrs = " ldr q0, [x1]\n" +
                    " ldr q1, [x1, #0x10]\n" +
                    " ldr q2, [x1, #0x20]\n" +
                    " ldr q3, [x1, #0x30]\n" +
                    " ldr q4, [x1, #0x40]\n";

                // The filler list is sized for the largest count (last entry of Counts):
                // first 33 vector adds, then 33 scalar adds, then stores for the rest.
                // The element type must be string so that ToArray() yields string[].
                int largestCount = this.Counts[this.Counts.Length - 1];
                List<string> fillerInstrs = new List<string>(largestCount);
                for (int i = 0; i < largestCount; i++)
                {
                    if (i < 33) fillerInstrs.Add(" add v1.4s, v1.4s, v0.4s");
                    else if (i < 66) fillerInstrs.Add(" add x15, x15, x11");
                    else fillerInstrs.Add(" str x12, [x2]");
                }

                string[] fillerInstrsArr = fillerInstrs.ToArray();
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, fillerInstrsArr, fillerInstrsArr, false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
        }
    }
}
54 |
--------------------------------------------------------------------------------
/AsmGen/tests/AddNsq.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// "Integer adds, excluding possible NSQ" test; amd64 only.
    /// </summary>
    public class AddNsq : UarchTest
    {
        private int totalOps;

        /// <summary>
        /// Sets up the test for the counts produced by
        /// <c>UarchTestHelpers.GenerateCountArray(low, high, step)</c>.
        /// <paramref name="totalOps"/> is appended to the prefix and forwarded to the generator.
        /// </summary>
        public AddNsq(int low, int high, int step, int totalOps)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "addnsq" + totalOps;
            this.Description = "Integer adds, excluding possible NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.totalOps = totalOps;
        }

        /// <summary>Only amd64 is supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // if (isa == IUarchTest.ISA.aarch64) return true;
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>
        /// Appends the amd64 test functions to <paramref name="sb"/>; does nothing for other ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // Passed as the dependent instruction list: both adds read %rdi.
            string[] dependentAdds =
            {
                " add %rdi, %r15",
                " add %rdi, %r14",
            };

            // Passed as the independent instruction list: adds that do not read %rdi.
            string[] independentAdds =
            {
                " add %r13, %r11",
                " add %r12, %r11",
            };

            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, dependentAdds, independentAdds, false);
        }
    }
}
42 |
--------------------------------------------------------------------------------
/AsmGen/tests/AddvNsq.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// "ADDV, excluding possible NSQ" test; aarch64 only.
    /// </summary>
    public class AddvNsq : UarchTest
    {
        private int totalOps;

        /// <summary>
        /// Sets up the test for the counts produced by
        /// <c>UarchTestHelpers.GenerateCountArray(low, high, step)</c>.
        /// <paramref name="totalOps"/> is forwarded to the generator.
        /// </summary>
        public AddvNsq(int low, int high, int step, int totalOps)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            // NOTE(review): unlike AddNsq, the prefix does not include totalOps - confirm
            // that two instances with different totalOps are never generated together.
            this.Prefix = "addvnsq";
            this.Description = "ADDV, excluding possible NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.totalOps = totalOps;
        }

        /// <summary>Only aarch64 is supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>
        /// Appends the aarch64 test functions to <paramref name="sb"/>; does nothing for other ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // d15 is loaded once by the init instructions; d16 is reloaded by the post-load instruction.
            string setupLoad = " ldr d15, [x2]";
            string afterLoad = " ldr d16, [x2, w25, sxtw #0]";

            // Passed as the dependent instruction list: all read v16.
            string[] dependentAddvs =
            {
                " addv h1, v16.4h",
                " addv h2, v16.4h",
                " addv h3, v16.4h",
                " addv h4, v16.4h",
            };

            // Passed as the independent instruction list: all read v15.
            string[] independentAddvs =
            {
                " addv h1, v15.4h",
                " addv h2, v15.4h",
                " addv h3, v15.4h",
                " addv h4, v15.4h",
            };

            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, dependentAddvs, independentAddvs, false, setupLoad,
                postLoadInstrs: afterLoad);
        }
    }
}
48 |
--------------------------------------------------------------------------------
/AsmGen/tests/AddvSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// "ADDV Scheduler" test; aarch64 only.
    /// </summary>
    public class AddvSched : UarchTest
    {
        /// <summary>
        /// Sets up the test for the counts produced by
        /// <c>UarchTestHelpers.GenerateCountArray(low, high, step)</c>.
        /// </summary>
        public AddvSched(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "addvsched";
            this.Description = "ADDV Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Only aarch64 is supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>
        /// Appends the aarch64 test functions to <paramref name="sb"/>; does nothing for other ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // The two post-load instructions index with different registers (w25, then w26),
                // matching AeseSchedTest and Fadd128SchedTest. Previously both used w25, so the
                // second group of ADDVs was tied to the first load instead of the second one.
                string postLoadInstrs1 = " ldr q16, [x2, w25, sxtw #0]";
                string postLoadInstrs2 = " ldr q16, [x2, w26, sxtw #0]";
                string[] unrolledInstrs = new string[4];
                unrolledInstrs[0] = " addv h1, v16.4h";
                unrolledInstrs[1] = " addv h2, v16.4h";
                unrolledInstrs[2] = " addv h3, v16.4h";
                unrolledInstrs[3] = " addv h4, v16.4h";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,
                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
        }
    }
}
40 |
--------------------------------------------------------------------------------
/AsmGen/tests/AeseSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// "aese scheduler" test: emits aesenc on amd64 and aese on aarch64.
    /// </summary>
    public class AeseSchedTest : UarchTest
    {
        /// <summary>
        /// Sets up the test for the counts produced by
        /// <c>UarchTestHelpers.GenerateCountArray(low, high, step)</c>.
        /// </summary>
        public AeseSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "aesesched";
            this.Description = "aese scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>amd64 and aarch64 are supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>
        /// Appends the test functions for <paramref name="isa"/> to <paramref name="sb"/>;
        /// does nothing for unsupported ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            switch (isa)
            {
                case IUarchTest.ISA.amd64:
                {
                    // Four aesenc instructions, each with its own destination and %xmm0 as the source.
                    string[] aesencOps =
                    {
                        " aesenc %xmm0, %xmm1",
                        " aesenc %xmm0, %xmm2",
                        " aesenc %xmm0, %xmm3",
                        " aesenc %xmm0, %xmm4",
                    };

                    UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, aesencOps, aesencOps);
                    break;
                }

                case IUarchTest.ISA.aarch64:
                {
                    // q0 is reloaded after each of the two loads; every aese below reads v0.
                    string firstReload = " ldr q0, [x2, w25, uxtw#0]";
                    string secondReload = " ldr q0, [x2, w26, uxtw#0]";
                    string[] aeseOps =
                    {
                        " aese v1.16b, v0.16b",
                        " aese v2.16b, v0.16b",
                        " aese v3.16b, v0.16b",
                        " aese v4.16b, v0.16b",
                    };

                    UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, aeseOps, aeseOps, false, null, firstReload, secondReload);
                    break;
                }
            }
        }
    }
}
51 |
--------------------------------------------------------------------------------
/AsmGen/tests/Fadd128SchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// "128-bit Vector FP Add Scheduler" test: addps on amd64, fadd on aarch64.
    /// </summary>
    public class Fadd128SchedTest : UarchTest
    {
        /// <summary>
        /// Sets up the test for the counts produced by
        /// <c>UarchTestHelpers.GenerateCountArray(low, high, step)</c>.
        /// </summary>
        public Fadd128SchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "fadd128sched";
            this.Description = "128-bit Vector FP Add Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>amd64 and aarch64 are supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>
        /// Appends the test functions for <paramref name="isa"/> to <paramref name="sb"/>;
        /// does nothing for unsupported ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " addps %xmm0, %xmm1";
                unrolledAdds[1] = " addps %xmm0, %xmm2";
                unrolledAdds[2] = " addps %xmm0, %xmm3";
                unrolledAdds[3] = " addps %xmm0, %xmm4";

                UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string postLoadInstrs1 = " ldr q0, [x2, w25, uxtw#0]";
                string postLoadInstrs2 = " ldr q0, [x2, w26, uxtw#0]";
                // fadd, not add: "add v*.4s" is the integer vector add, which would not
                // exercise FP adds as the amd64 path (addps) and the description do.
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " fadd v1.4s, v1.4s, v0.4s";
                unrolledAdds[1] = " fadd v2.4s, v2.4s, v0.4s";
                unrolledAdds[2] = " fadd v3.4s, v3.4s, v0.4s";
                unrolledAdds[3] = " fadd v4.4s, v4.4s, v0.4s";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, null, postLoadInstrs1, postLoadInstrs2);
            }
        }
    }
}
50 |
--------------------------------------------------------------------------------
/AsmGen/tests/FcmpSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Scheduler test whose filler instructions are scalar single-precision FCMP
    /// comparisons against s16. aarch64 only.
    /// </summary>
    public class FcmpSchedTest : UarchTest
    {
        /// <summary>Sets the count list, symbol prefix, description and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public FcmpSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "fcmpsched";
            this.Description = "FCMP Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Reports support for aarch64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than aarch64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // Four compares against s16 using s17..s20 as the other operand.
                // The second entry previously repeated s19, skipping s18.
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " fcmp s17, s16";
                unrolledAdds[1] = " fcmp s18, s16";
                unrolledAdds[2] = " fcmp s19, s16";
                unrolledAdds[3] = " fcmp s20, s16";
                UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
            }
        }
    }
}
37 |
--------------------------------------------------------------------------------
/AsmGen/tests/FlagRfTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Flags register file test. The filler is a flag-setting compare
    /// (<c>test</c> on amd64, <c>cmp</c> on aarch64). The aarch64 variant can optionally
    /// place a dependent branch after each pointer-chasing load.
    /// </summary>
    public class FlagRfTest : UarchTest
    {
        private bool initialDependentBranch;

        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="initialDependentBranch">When true, "db" is appended to the prefix and the
        /// dependent-branch variant is generated.</param>
        public FlagRfTest(int low, int high, int step, bool initialDependentBranch)
        {
            string prefixSuffix = string.Empty;
            string descriptionSuffix = string.Empty;
            if (initialDependentBranch)
            {
                prefixSuffix = "db";
                descriptionSuffix = ", preceded by dependent branch";
            }

            this.initialDependentBranch = initialDependentBranch;
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "flagrf" + prefixSuffix;
            this.Description = "Flags Register File" + descriptionSuffix;
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        /// <summary>
        /// The dependent-branch variant is aarch64 only; the plain variant also supports amd64.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            bool isArm = isa == IUarchTest.ISA.aarch64;
            if (this.initialDependentBranch)
            {
                return isArm;
            }

            return isArm || isa == IUarchTest.ISA.amd64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; values other than amd64/aarch64 append nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] flagWriters = { " test %r15, %r14" };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, Counts, Prefix, flagWriters, flagWriters, includePtrChasingLoads: true);
                return;
            }

            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // The same post-load text (or null) is used after both loads.
            string branchAfterLoad = null;
            if (this.initialDependentBranch)
            {
                branchAfterLoad = UarchTestHelpers.GetArmDependentBranch(Prefix);
            }

            string[] armFlagWriters = { " cmp x14, x15" };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb,
                Counts,
                Prefix,
                armFlagWriters,
                armFlagWriters,
                includePtrChasingLoads: true,
                postLoadInstrs1: branchAfterLoad,
                postLoadInstrs2: branchAfterLoad);

            // The branch target is appended after the generated functions.
            if (this.initialDependentBranch)
            {
                sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(Prefix));
            }
        }
    }
}
48 |
--------------------------------------------------------------------------------
/AsmGen/tests/FmovSched.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Scheduler test whose filler instructions move a vector register (d16) into
    /// general-purpose registers with <c>fmov</c>. aarch64 only.
    /// </summary>
    public class FmovSched : UarchTest
    {
        /// <summary>Sets the count list, symbol prefix, description and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public FmovSched(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "fmovsched";
            this.Description = "FMOV vec to gpr Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Reports support for aarch64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than aarch64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // d16 is reloaded after each load so the fmov instructions depend on it.
                // The second reload is offset by w26 rather than repeating w25, matching the
                // w25/w26 pairing used by the other tests that pass two post-load strings.
                // NOTE(review): assumes w26 is the helper's second pointer-chasing register - confirm.
                string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]";
                string postLoadInstrs2 = " ldr d16, [x2, w26, sxtw #0]";
                string[] unrolledInstrs = new string[4];
                unrolledInstrs[0] = " fmov x15, d16";
                unrolledInstrs[1] = " fmov x14, d16";
                unrolledInstrs[2] = " fmov x13, d16";
                unrolledInstrs[3] = " fmov x12, d16";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,
                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
        }
    }
}
40 |
--------------------------------------------------------------------------------
/AsmGen/tests/FpStoreDataNsq.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// NSQ-style test for 32-bit FP store data on amd64: <c>movss</c> stores of xmm1
    /// are paired with independent <c>addss</c> instructions.
    /// </summary>
    public class FpStoreDataNsqTest : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>; also appended to the prefix.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public FpStoreDataNsqTest(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "fpstoredatansq" + high;
            Description = "Store FP 32-bit data scheduler capacity, excluding nsq";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            GetFunctionCallParameters = "structIterations, A, fpArr";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for amd64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than amd64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // Setup text passed as initInstrs: prepares xmm2 and xmm3..xmm6 for the addss filler.
            string setup =
                " vzeroupper\n" +
                " vpcmpeqd %xmm2, %xmm2, %xmm2\n" +
                " vpxor %xmm2, %xmm3, %xmm3\n" +
                " cvtsi2ss %r11, %xmm3\n" +
                " movss %xmm3, %xmm4\n" +
                " movss %xmm3, %xmm5\n" +
                " movss %xmm3, %xmm6";

            // xmm1 is produced from rdi, so the stores below depend on it for their data.
            string afterLoad = " cvtsi2ss %rdi, %xmm1";

            string[] storesOfLoadedValue =
            {
                " movss %xmm1, (%r8)",
                " movss %xmm1, (%r8, %r14, 4)",
                " movss %xmm1, (%r8, %r13, 4)",
                " movss %xmm1, (%r8, %r12, 4)",
            };

            string[] unrelatedFpAdds =
            {
                " addss %xmm2, %xmm3",
                " addss %xmm2, %xmm4",
                " addss %xmm2, %xmm5",
                " addss %xmm2, %xmm6",
            };

            // The largest configured count is passed as the second argument.
            int largestCount = Counts[Counts.Length - 1];
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, largestCount, Counts, Prefix, storesOfLoadedValue, unrelatedFpAdds, false, initInstrs: setup, postLoadInstrs: afterLoad);
        }
    }
}
46 |
--------------------------------------------------------------------------------
/AsmGen/tests/JsCvtNsq.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// NSQ-style scheduler test for FJCVTZS on aarch64: conversions of d16 (reloaded after
    /// the load) are paired with conversions of d15 (loaded once up front).
    /// </summary>
    public class JsCvtNsq : UarchTest
    {
        private int totalOps;

        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="totalOps">Passed as the second argument to <c>GenerateArmAsmNsqTestFuncs</c>.</param>
        public JsCvtNsq(int low, int high, int step, int totalOps)
        {
            this.totalOps = totalOps;
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "jscvtnsq";
            Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler, excluding possible NSQ";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            GetFunctionCallParameters = "structIterations, A, fpArr";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for aarch64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than aarch64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // d15 is loaded once; d16 is reloaded using w25 as the offset.
            string setup = " ldr d15, [x2]";
            string reloadAfterLoad = " ldr d16, [x2, w25, sxtw #0]";

            string[] convertsOfReloaded =
            {
                " fjcvtzs w15, d16",
                " fjcvtzs w14, d16",
                " fjcvtzs w13, d16",
                " fjcvtzs w12, d16",
            };

            string[] convertsOfFixed =
            {
                " fjcvtzs w15, d15",
                " fjcvtzs w14, d15",
                " fjcvtzs w13, d15",
                " fjcvtzs w12, d15",
            };

            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, Counts, Prefix, convertsOfReloaded, convertsOfFixed, false, setup,
                postLoadInstrs: reloadAfterLoad);
        }
    }
}
48 |
--------------------------------------------------------------------------------
/AsmGen/tests/JsCvtSched.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Scheduler test whose filler instructions are FJCVTZS conversions of d16. aarch64 only.
    /// </summary>
    public class JsCvtSched : UarchTest
    {
        /// <summary>Sets the count list, symbol prefix, description and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public JsCvtSched(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "jscvtsched";
            this.Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Reports support for aarch64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than aarch64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // d16 is reloaded after each load so the conversions depend on it.
                // The second reload is offset by w26 rather than repeating w25, matching the
                // w25/w26 pairing used by the other tests that pass two post-load strings.
                // NOTE(review): assumes w26 is the helper's second pointer-chasing register - confirm.
                string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]";
                string postLoadInstrs2 = " ldr d16, [x2, w26, sxtw #0]";
                string[] unrolledInstrs = new string[4];
                unrolledInstrs[0] = " fjcvtzs w15, d16";
                unrolledInstrs[1] = " fjcvtzs w14, d16";
                unrolledInstrs[2] = " fjcvtzs w13, d16";
                unrolledInstrs[3] = " fjcvtzs w12, d16";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,
                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
        }
    }
}
40 |
--------------------------------------------------------------------------------
/AsmGen/tests/JumpNsqTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// NSQ-style scheduler test for not-taken conditional jumps on amd64. Each entry is a
    /// compare followed by a <c>je</c> to a trap label that is appended after the functions.
    /// </summary>
    public class JumpNsqTest : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public JumpNsqTest(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "jumpnsq";
            Description = "Scheduler, Not-Taken Jumps, excluding possible nsq";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            GetFunctionCallParameters = "structIterations, A";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for amd64 only; other ISAs are currently disabled.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // aarch64, mips64 and riscv are not enabled for this test.
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than amd64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // One compares rdi/rsi, the other r13/r14; both jump to the same trap label.
            string[] jumpsOnLoadedValue = { " cmp %rdi, %rsi\n je jumpnsq_reallybadthing" };
            string[] jumpsOnOtherRegs = { " cmp %r13, %r14\n je jumpnsq_reallybadthing" };

            int largestCount = Counts[Counts.Length - 1];
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, largestCount, Counts, Prefix, jumpsOnLoadedValue, jumpsOnOtherRegs);

            // Trap label that the je instructions above target.
            sb.AppendLine("jumpnsq_reallybadthing:\n int3");
        }
    }
}
41 |
--------------------------------------------------------------------------------
/AsmGen/tests/JumpSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Scheduler test for not-taken conditional jumps on amd64, aarch64 and riscv.
    /// Each filler entry is a conditional branch to a trap label appended after the functions.
    /// </summary>
    public class JumpSchedTest : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public JumpSchedTest(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "jumpsched";
            Description = "Scheduler, Not-Taken Jumps";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            GetFunctionCallParameters = "structIterations, A";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for amd64, aarch64 and riscv; mips64 is not enabled.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.riscv;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; unsupported values append nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] x86Jumps = { " cmp %rdi, %rsi\n je jumpsched_reallybadthing" };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, Counts, Prefix, x86Jumps, x86Jumps, includePtrChasingLoads: true);

                // Trap label targeted by the je above.
                sb.AppendLine("jumpsched_reallybadthing:\n int3");
                return;
            }

            if (isa == IUarchTest.ISA.aarch64)
            {
                string[] armJumps = { " cmp x25, x26\n b.eq jumpsched_reallybadthing" };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, Counts, Prefix, armJumps, armJumps, includePtrChasingLoads: true);

                // The branch target holds a raw .word rather than a mnemonic.
                sb.AppendLine("jumpsched_reallybadthing:\n .word 0xf7f0a000");
                return;
            }

            if (isa == IUarchTest.ISA.riscv)
            {
                // todo
                string[] riscvJumps = { " beq x5, x6, jumpsched_reallybadthing" };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, Counts, Prefix, riscvJumps, riscvJumps, false);
                sb.AppendLine("jumpsched_reallybadthing:\n .word 0x00000000");
            }
        }
    }
}
54 |
--------------------------------------------------------------------------------
/AsmGen/tests/LeaSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Scheduler test whose filler instructions are <c>lea</c> with base + index + displacement.
    /// amd64 only.
    /// </summary>
    public class LeaSchedTest : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public LeaSchedTest(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "leasched";
            Description = "Scheduler, lea with base + index + offset";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            GetFunctionCallParameters = "structIterations, A";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for amd64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than amd64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // Each lea updates its own register (r15..r12) and uses rdi as the index.
            string[] leaOps =
            {
                " lea 128(%r15, %rdi), %r15",
                " lea 128(%r14, %rdi), %r14",
                " lea 128(%r13, %rdi), %r13",
                " lea 128(%r12, %rdi), %r12",
            };

            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, Counts, Prefix, leaOps, leaOps, includePtrChasingLoads: false);
        }
    }
}
37 |
--------------------------------------------------------------------------------
/AsmGen/tests/LoadNsq.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// NSQ-style test for the load address scheduler on amd64 and aarch64: loads indexed by
    /// rdi / w25 are paired with loads from a fixed address.
    /// </summary>
    public class LoadNsq : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public LoadNsq(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "loadnsq";
            Description = "Load Address Scheduler, Excluding any NSQ";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            GetFunctionCallParameters = "structIterations, A, fpArr";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for aarch64 and amd64.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.amd64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; unsupported values append nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            // The largest configured count is passed as the second argument on both ISAs.
            int largestCount = Counts[Counts.Length - 1];

            if (isa == IUarchTest.ISA.amd64)
            {
                string[] indexedLoads =
                {
                    " mov (%r8, %rdi, 4), %r15",
                    " mov (%r8, %rdi, 4), %r14",
                    " mov (%r8, %rdi, 4), %r13",
                };

                string[] fixedLoads =
                {
                    " mov (%r8), %r15",
                    " mov (%r8), %r14",
                    " mov (%r8), %r13",
                };

                UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, largestCount, Counts, Prefix, indexedLoads, fixedLoads, ptrChasingLoadsInSq: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string[] indexedLoads =
                {
                    " ldr w15, [x2, w25, uxtw #2]",
                    " ldr w14, [x2, w25, uxtw #2]",
                    " ldr w13, [x2, w25, uxtw #2]",
                };

                string[] fixedLoads =
                {
                    " ldr w12, [x2]",
                    " ldr w11, [x2]",
                    " ldr w10, [x2]",
                };

                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, largestCount, Counts, Prefix, indexedLoads, fixedLoads);
            }
        }
    }
}
56 |
--------------------------------------------------------------------------------
/AsmGen/tests/MaddSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Scheduler test whose filler instructions are integer multiply-adds (<c>madd</c>).
    /// aarch64 only.
    /// </summary>
    public class MaddSchedTest : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public MaddSchedTest(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "maddsched";
            Description = "Scheduler, Integer Multiply-Add";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            GetFunctionCallParameters = "structIterations, A";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for aarch64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than aarch64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // Every madd multiplies its own register (x15..x12) by x25 and adds x10.
            string[] multiplyAdds =
            {
                " madd x15, x15, x25, x10",
                " madd x14, x14, x25, x10",
                " madd x13, x13, x25, x10",
                " madd x12, x12, x25, x10",
            };

            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, Counts, Prefix, multiplyAdds, multiplyAdds, includePtrChasingLoads: false);
        }
    }
}
37 |
--------------------------------------------------------------------------------
/AsmGen/tests/MaskRfTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Mask register test using <c>kaddb</c> on k1..k4 with k0 as the common source.
    /// amd64 only; the description marks it as AVX-512 only.
    /// </summary>
    public class MaskRfTest : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public MaskRfTest(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "maskrf";
            Description = "Mask Registers - AVX-512 only";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            GetFunctionCallParameters = "structIterations, A";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for amd64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than amd64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            string[] maskAdds =
            {
                " kaddb %k0, %k1, %k1",
                " kaddb %k0, %k2, %k2",
                " kaddb %k0, %k3, %k3",
                " kaddb %k0, %k4, %k4",
            };

            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, Counts, Prefix, maskAdds, maskAdds, false);
        }
    }
}
37 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixAddvJsCvtNsq.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// NSQ-style scheduler test on aarch64 that alternates ADDV and FJCVTZS. One set reads
    /// registers reloaded after the load (q16/d2), the other reads registers loaded once
    /// up front (q17/d15).
    /// </summary>
    public class MixAddvJsCvtNsq : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public MixAddvJsCvtNsq(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "mixaddvjscvtnsq";
            Description = "ADDV and fjcvtzs Scheduler, Excluding any NSQ";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            GetFunctionCallParameters = "structIterations, A, fpArr";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for aarch64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than aarch64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // q17/d15 are loaded once; q16/d2 are reloaded using w25 as the offset.
            string setup = " ldr q17, [x2]\n ldr d15, [x2]";
            string reloadAfterLoad = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]";

            string[] opsOnReloaded =
            {
                " addv h1, v16.4h",
                " fjcvtzs w15, d2",
                " addv h3, v16.4h",
                " fjcvtzs w14, d2",
            };

            string[] opsOnFixed =
            {
                " addv h4, v17.4h",
                " fjcvtzs w12, d15",
                " addv h5, v17.4h",
                " fjcvtzs w13, d15",
            };

            // The largest configured count is passed as the second argument.
            int largestCount = Counts[Counts.Length - 1];
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, largestCount, Counts, Prefix, opsOnReloaded, opsOnFixed, false, initInstrs: setup,
                postLoadInstrs: reloadAfterLoad);
        }
    }
}
46 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixAddvJsCvtSched.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Scheduler test on aarch64 whose filler alternates ADDV (reading v16) and
    /// FJCVTZS (reading d2); both registers are reloaded after each load.
    /// </summary>
    public class MixAddvJsCvtSched : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public MixAddvJsCvtSched(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "mixaddvjscvtsched";
            Description = "ADDV and fjcvtzs Scheduler";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            GetFunctionCallParameters = "structIterations, A, fpArr";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for aarch64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than aarch64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // The first reload is offset by w25, the second by w26.
            string reloadViaW25 = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]";
            string reloadViaW26 = " ldr q16, [x2, w26, sxtw #0]\n ldr d2, [x2, w26, sxtw #0]";

            string[] mixedOps =
            {
                " addv h1, v16.4h",
                " fjcvtzs w15, d2",
                " addv h3, v16.4h",
                " fjcvtzs w14, d2",
            };

            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, Counts, Prefix, mixedOps, mixedOps, false, null,
                postLoadInstrs1: reloadViaW25, postLoadInstrs2: reloadViaW26);
        }
    }
}
40 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixFpRfDepBranchTest.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Text;
3 |
namespace AsmGen
{
    /// <summary>
    /// FP register file test on aarch64 whose filler is scalar <c>fadd</c> instructions with
    /// a compare-and-branch inserted at a configurable interval.
    /// </summary>
    public class MixFpRfDepBranchTest : UarchTest
    {
        private int interval;

        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="interval">A branch is emitted after every add whose index is a multiple of
        /// this value. It is also appended to the prefix and to the trap label name. A value of 0
        /// makes <see cref="GenerateAsm"/> throw <c>DivideByZeroException</c>.</param>
        public MixFpRfDepBranchTest(int low, int high, int step, int interval)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixfprfdepbranch" + interval;
            this.Description = "FP Register File, with some dependent branches";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *fpArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.interval = interval;
        }

        /// <summary>Reports support for aarch64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than aarch64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // Load s17..s21 from consecutive floats at x2. s17 is the common addend below.
                string initInstrs = " ldr s17, [x2]\n" +
                    " ldr s18, [x2, 4]\n" +
                    " ldr s19, [x2, 8]\n" +
                    " ldr s20, [x2, 12]\n" +
                    " ldr s21, [x2, 16]\n";

                // List<string> is required here: the collection is converted to string[] below.
                List<string> unrolledAddsList = new List<string>();
                for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
                {
                    // Rotate the destination through s18..s21.
                    int regnum = 18 + (i % 4);
                    unrolledAddsList.Add($" fadd s{regnum}, s{regnum}, s17");

                    // The index starts at 0, so the very first add is followed by a branch.
                    if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixfpjumpsched_badthing" + interval);
                }
                string[] unrolledAdds = unrolledAddsList.ToArray();
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, initInstrs: initInstrs);

                // Branch target for the b.eq above; the interval suffix gives each instance its own label.
                sb.AppendLine($"mixfpjumpsched_badthing{interval}:\n .word 0xf7f0a000");
            }
        }
    }
}
51 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixFpVecRfTest.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Text;
3 |
namespace AsmGen
{
    /// <summary>
    /// Mixed scalar FP / vector FP register file test for riscv, optionally with a dependent
    /// branch placed after each load.
    /// </summary>
    public class MixFpVecRfTest : UarchTest
    {
        private bool initialDependentBranch;

        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="initialDependentBranch">When true, "db" is appended to the prefix and the
        /// dependent-branch variant is generated.</param>
        public MixFpVecRfTest(int low, int high, int step, bool initialDependentBranch)
        {
            string prefixSuffix = string.Empty;
            string descriptionSuffix = string.Empty;
            if (initialDependentBranch)
            {
                prefixSuffix = "db";
                descriptionSuffix = ", preceded by dependent branch";
            }

            this.initialDependentBranch = initialDependentBranch;
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixfpvecrf" + prefixSuffix;
            this.Description = "Mixed FP/128-bit FP vec rf" + descriptionSuffix;
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Reports support for riscv only, for both variants of the test.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.riscv;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than riscv appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.riscv)
            {
                return;
            }

            string setup = " vsetvli t5, t6, e32\n vlw.v v0, (a1)\n fld f0, (a1)";

            // Optional dependent branch, always followed by the "mv t6, a2" line.
            string branchPart = string.Empty;
            if (this.initialDependentBranch)
            {
                branchPart = UarchTestHelpers.GetRiscvDependentBranch(Prefix);
            }

            string afterLoad = branchPart + "\n mv t6, a2";

            string[] mixedAdds =
            {
                " vfadd.vv v0, v0, v0",
                " fadd.s f0, f0, f0",
            };

            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, Counts, Prefix, mixedAdds, mixedAdds, false,
                initInstrs: setup, postLoadInstrs1: afterLoad, postLoadInstrs2: afterLoad);

            // The branch target is appended after the generated functions.
            if (this.initialDependentBranch)
            {
                sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(Prefix));
            }
        }
    }
}
49 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixIntRfDepBranchTest.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Text;
3 |
namespace AsmGen
{
    /// <summary>
    /// Integer register file test on aarch64 whose filler is <c>add</c> instructions with
    /// a compare-and-branch inserted at a configurable interval.
    /// </summary>
    public class MixIntRfDepBranchTest : UarchTest
    {
        private int interval;

        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="interval">A branch is emitted after every add whose 1-based index is a
        /// multiple of this value. It is also appended to the prefix and to the trap label name.
        /// A value of 0 makes <see cref="GenerateAsm"/> throw <c>DivideByZeroException</c>.</param>
        public MixIntRfDepBranchTest(int low, int high, int step, int interval)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixintrfdepbranch" + interval;
            this.Description = "Integer Register File, with some dependent branches";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
            this.interval = interval;
        }

        /// <summary>Reports support for aarch64 only.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than aarch64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // List<string> is required here: the collection is converted to string[] below.
                List<string> unrolledAddsList = new List<string>();

                // The index runs from 1 to the largest count inclusive, so the first branch
                // follows the interval-th add, not the first one.
                for (int i = 1; i < this.Counts[this.Counts.Length - 1] + 1; i++)
                {
                    // Rotate the destination through x12..x15; x11 is the common addend.
                    int regnum = 12 + (i % 4);
                    unrolledAddsList.Add($" add x{regnum}, x{regnum}, x11");
                    if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixintjumpsched_badthing" + interval);
                }
                string[] unrolledAdds = unrolledAddsList.ToArray();
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);

                // Branch target for the b.eq above; the interval suffix gives each instance its own label.
                sb.AppendLine($"mixintjumpsched_badthing{interval}:\n .word 0xf7f0a000");
            }
        }
    }
}
45 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixJumpStoreDataSched.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
namespace AsmGen
{
    /// <summary>
    /// Scheduler test on amd64 whose filler alternates compare-and-jump pairs with stores
    /// of rdi.
    /// </summary>
    public class MixJumpStoreDataSched : UarchTest
    {
        /// <summary>Sets the count list, naming and C-side parameter strings.</summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public MixJumpStoreDataSched(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "mixjumpstoredatasched";
            Description = "Scheduler, Mixed Jumps and Store Data";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr";
            GetFunctionCallParameters = "structIterations, A, fpArr";
            DivideTimeByCount = false;
        }

        /// <summary>Reports support for amd64 only; other ISAs are currently disabled.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // aarch64, mips64 and riscv are not enabled for this test.
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target ISA; anything other than amd64 appends nothing.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // Jump / store / jump / store; the two stores target (%r8) and 64(%r8).
            string[] jumpsAndStores =
            {
                " cmp %rdi, %rsi\n je mixjumpstoredatasched_reallybadthing",
                " mov %rdi, (%r8)",
                " cmp %rdi, %rsi\n je mixjumpstoredatasched_reallybadthing",
                " mov %rdi, 64(%r8)",
            };

            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, Counts, Prefix, jumpsAndStores, jumpsAndStores, includePtrChasingLoads: true);

            // Trap label targeted by the je instructions above.
            sb.AppendLine("mixjumpstoredatasched_reallybadthing:\n int3");
        }
    }
}
42 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixJumpStoreSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace AsmGen
4 | {
/// <summary>
/// "Scheduler, Mixed Jumps and Stores (Address Dependency)" test: the filler alternates a
/// cmp/je pair with a store of %r14 whose address is indexed by %rdi. amd64 only.
/// </summary>
public class MixJumpStoreSchedTest : UarchTest
{
    // Target of every filler je; emitted after the generated functions as an int3.
    // The label spelling ("storejump") differs from the test prefix ("jumpstore") and is kept as-is.
    private const string TrapLabel = "mixstorejumpsched_reallybadthing";

    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public MixJumpStoreSchedTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "mixjumpstoresched";
        Description = "Scheduler, Mixed Jumps and Stores (Address Dependency)";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
    }

    /// <summary>Only amd64 is supported; aarch64, mips64 and riscv are not enabled for this test.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    /// <summary>Appends the amd64 test functions and the trap label to <paramref name="sb"/>; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        string jumpPair = " cmp %rdi, %rsi\n je " + TrapLabel;
        string[] fillerInstrs =
        {
            jumpPair,
            " mov %r14, (%r8, %rdi, 2)",
            jumpPair,
            " mov %r14, 64(%r8, %rdi, 2)",
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, Counts, Prefix, fillerInstrs, fillerInstrs, includePtrChasingLoads: true);

        sb.AppendLine(TrapLabel + ":\n int3");
    }
}
41 | }
42 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixJumpThenAddSched.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Text;
3 |
4 | namespace AsmGen
5 | {
/// <summary>
/// "Scheduler, 40 NT jumps + adds" test: the filler stream starts with 40 cmp/b.eq pairs
/// and continues with adds up to the largest count. aarch64 only.
/// </summary>
public class MixJumpThenAddSched : UarchTest
{
    // Number of compare-and-branch pairs placed at the start of the filler stream.
    private const int LeadingJumpCount = 40;

    // Target of every filler b.eq; defined after the generated functions.
    private const string TrapLabel = "mixaddthenjumpsched_reallybadthing";

    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public MixJumpThenAddSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixjumpthenaddsched";
        this.Description = "Scheduler, 40 NT jumps + adds";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    /// <summary>Only aarch64 is supported; amd64, mips64 and riscv are not enabled for this test.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.aarch64;
    }

    /// <summary>Appends the aarch64 test functions and the trap label to <paramref name="sb"/>; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        List<string> fillerInstrs = new List<string>();
        int instrIdx;

        // The jumps are always emitted, even when the largest count is below 40.
        for (instrIdx = 0; instrIdx < LeadingJumpCount; instrIdx++)
        {
            fillerInstrs.Add(" cmp x25, x26\n b.eq " + TrapLabel);
        }

        // Pad with adds so the stream is as long as the largest count in the sweep.
        int largestCount = this.Counts[this.Counts.Length - 1];
        for (; instrIdx < largestCount; instrIdx++)
        {
            fillerInstrs.Add(" add x15, x15, x25");
        }

        string[] instrs = fillerInstrs.ToArray();
        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, instrs, instrs, includePtrChasingLoads: true, dsb: true);
        sb.AppendLine(TrapLabel + ":\n .word 0xf7f0a000");
    }
}
41 | }
42 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixLoadStoreDivSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace AsmGen
4 | {
/// <summary>
/// "Load/Store Scheduler Capacity Test, using divs to block retirement": the filler
/// alternates one load and one store. Supports amd64 and aarch64.
/// </summary>
public class MixLoadStoreDivSchedTest : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public MixLoadStoreDivSchedTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "mixloadstoredivsched";
        Description = "Load/Store Scheduler Capacity Test, using divs to block retirement";
        FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2, int *arr3";
        GetFunctionCallParameters = "structIterations, list_size, B, A";
        DivideTimeByCount = false;
    }

    /// <summary>True for amd64 and aarch64, false otherwise.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    /// <summary>Dispatches to the generator for <paramref name="isa"/>; emits nothing for unsupported ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
                GenerateX86Asm(sb);
                break;
            case IUarchTest.ISA.aarch64:
                GenerateArmAsm(sb);
                break;
        }
    }

    /// <summary>Emits the amd64 variant: a load and a store, both indexed by %rdx.</summary>
    public void GenerateX86Asm(StringBuilder sb)
    {
        string[] loadStorePair =
        {
            " mov (%r9, %rdx, 4), %r15",
            " mov %r14, (%r8, %rdx, 4)",
        };

        UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, Counts, Prefix, loadStorePair, loadStorePair, false);
    }

    /// <summary>Emits the aarch64 variant: one load/store pair indexed by w25 and one indexed by w26.</summary>
    public void GenerateArmAsm(StringBuilder sb)
    {
        string[] pairIndexedByW25 =
        {
            " ldr w15, [x3, w25, uxtw #2]",
            " str w14, [x2, w25, uxtw #2]",
        };

        string[] pairIndexedByW26 =
        {
            " ldr w15, [x3, w26, uxtw #2]",
            " str w14, [x2, w26, uxtw #2]",
        };

        UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, Counts, Prefix, pairIndexedByW25, pairIndexedByW26, false);
    }
}
58 | }
59 |
--------------------------------------------------------------------------------
/AsmGen/tests/MixVec512Vec256RfTest.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Text;
3 |
4 | namespace AsmGen
5 | {
/// <summary>
/// "Mixed zmm/ymm regs - AVX-512 only, alternating": the filler is a run of vaddps
/// instructions that uses ymm for even-numbered registers and zmm for odd-numbered ones.
/// amd64 only.
/// </summary>
public class MixVec512Vec256RfTest : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public MixVec512Vec256RfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixvec512vec256rf";
        this.Description = "Mixed zmm/ymm regs - AVX-512 only, alternating";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>True only for amd64.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    /// <summary>Appends the amd64 test functions to <paramref name="sb"/>; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // use even numbered regs for ymm testing
        StringBuilder init = new StringBuilder();
        init.Append(" vmovups (%r8), %zmm1\n");
        init.Append(" vmovups 64(%r8), %ymm2\n");
        init.Append(" vmovups 128(%r8), %zmm3\n");
        init.Append(" vmovups 192(%r8), %ymm4\n");
        init.Append(" vmovups 256(%r8), %zmm5\n");

        // use all zmm regs: initialize registers 6..31 by copying from ymm2 (even) or zmm5 (odd)
        for (int i = 6; i < 32; i++)
        {
            bool useYmm = (i & 1) == 0;
            init.Append(useYmm ? "vmovups %ymm2, %ymm" : "vmovups %zmm5, %zmm");
            init.Append(i);
            init.Append("\n");
        }
        string initInstrs = init.ToString();

        // One add per register 1..31, with the same even/odd width split as the init code.
        List<string> instrsList = new List<string>();
        for (int i = 1; i < 32; i++)
        {
            if ((i & 1) == 0) instrsList.Add($" vaddps %ymm2, %ymm{i}, %ymm{i}");
            else instrsList.Add($" vaddps %zmm1, %zmm{i}, %zmm{i}");
        }

        string[] unrolledAdds = instrsList.ToArray();
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
    }
}
54 | }
--------------------------------------------------------------------------------
/AsmGen/tests/MmxRfTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace AsmGen
4 | {
/// <summary>
/// "64-bit MMX RF Capacity Test. x86 only": the filler is a run of paddw instructions
/// on mm1..mm4, each adding mm0.
/// </summary>
public class MmxRfTest : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public MmxRfTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "mmxrf";
        Description = "64-bit MMX RF Capacity Test. x86 only";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, int *arr2";
        GetFunctionCallParameters = "structIterations, A, B";
        DivideTimeByCount = false;
    }

    /// <summary>True only for amd64.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    /// <summary>Emits the amd64 test functions; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;
        GenerateX86GccAsm(sb);
    }

    /// <summary>
    /// Emits the amd64 functions. The init code does an fsave to (%r8) and then loads
    /// mm0..mm4 from (%rdx); the cleanup code does an frstor from (%r8).
    /// </summary>
    public void GenerateX86GccAsm(StringBuilder sb)
    {
        string setup = string.Concat(
            " fsave (%r8)\n",
            " movq (%rdx), %mm0\n",
            " movq 8(%rdx), %mm1\n",
            " movq 16(%rdx), %mm2\n",
            " movq 24(%rdx), %mm3\n",
            " movq 32(%rdx), %mm4\n");

        string teardown = " frstor (%r8)";

        string[] mmxAdds =
        {
            " paddw %mm0, %mm1",
            " paddw %mm0, %mm2",
            " paddw %mm0, %mm3",
            " paddw %mm0, %mm4",
        };

        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
            sb, Counts, Prefix, mmxAdds, mmxAdds, initInstrs: setup, cleanupInstrs: teardown);
    }
}
49 | }
--------------------------------------------------------------------------------
/AsmGen/tests/PdepSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace AsmGen
4 | {
/// <summary>
/// "Scheduler, PDEP" test: the filler is a run of pdep instructions rotating through
/// r15, r14, r13 and r12, each taking %rdi as its first operand. amd64 only.
/// </summary>
public class PdepSchedTest : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public PdepSchedTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "pdepsched";
        Description = "Scheduler, PDEP";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        GetFunctionCallParameters = "structIterations, A";
        DivideTimeByCount = false;
    }

    /// <summary>True only for amd64.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    /// <summary>Appends the amd64 test functions to <paramref name="sb"/>; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        string[] pdepInstrs =
        {
            " pdep %rdi, %r15, %r15",
            " pdep %rdi, %r14, %r14",
            " pdep %rdi, %r13, %r13",
            " pdep %rdi, %r12, %r12",
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, Counts, Prefix, pdepInstrs, pdepInstrs, includePtrChasingLoads: false);
    }
}
36 | }
37 |
--------------------------------------------------------------------------------
/AsmGen/tests/RorSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace AsmGen
4 | {
/// <summary>
/// "Scheduler, Integer Rotate by Immediate (1)" test: the filler is a repeated
/// "ror $1, %r15". amd64 only.
/// </summary>
public class RorSchedTest : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public RorSchedTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "rorsched";
        Description = "Scheduler, Integer Rotate by Immediate (1)";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        GetFunctionCallParameters = "structIterations, A";
        DivideTimeByCount = false;
    }

    /// <summary>True only for amd64.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    /// <summary>Appends the amd64 test functions to <paramref name="sb"/>; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // r15 is seeded from %rdi in the first post-load slot and from %rsi in the second.
        string seedFromRdi = " mov %rdi, %r15";
        string seedFromRsi = " mov %rsi, %r15";
        string[] rotateInstrs = { " ror $1, %r15" };

        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
            sb,
            Counts,
            Prefix,
            rotateInstrs,
            rotateInstrs,
            postLoadInstrs1: seedFromRdi,
            postLoadInstrs2: seedFromRsi,
            includePtrChasingLoads: false);
    }
}
43 | }
44 |
--------------------------------------------------------------------------------
/AsmGen/tests/ShlSchedTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace AsmGen
4 | {
/// <summary>
/// "Scheduler, Integer Shift by Immediate (1)" test: the filler is a repeated
/// "shl $1, %r15". amd64 only.
/// </summary>
public class ShlSchedTest : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public ShlSchedTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "shlsched";
        Description = "Scheduler, Integer Shift by Immediate (1)";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        GetFunctionCallParameters = "structIterations, A";
        DivideTimeByCount = false;
    }

    /// <summary>True only for amd64.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    /// <summary>Appends the amd64 test functions to <paramref name="sb"/>; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // r15 is seeded from %rdi in the first post-load slot and from %rsi in the second.
        string seedFromRdi = " mov %rdi, %r15";
        string seedFromRsi = " mov %rsi, %r15";
        string[] shiftInstrs = { " shl $1, %r15" };

        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
            sb,
            Counts,
            Prefix,
            shiftInstrs,
            shiftInstrs,
            postLoadInstrs1: seedFromRdi,
            postLoadInstrs2: seedFromRsi,
            includePtrChasingLoads: false);
    }
}
43 | }
44 |
--------------------------------------------------------------------------------
/AsmGen/tests/StoreDataNsqTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace AsmGen
4 | {
/// <summary>
/// "Store Data Scheduler, excluding NSQ" test: both instruction streams store to the same
/// four offsets from %r8; one stream stores %rdi and the other stores %r14. amd64 only.
/// </summary>
public class StoreDataNsq : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public StoreDataNsq(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "storedatansq";
        Description = "Store Data Scheduler, excluding NSQ";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
    }

    /// <summary>Only amd64 is supported; aarch64, mips64 and riscv are not enabled for this test.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    /// <summary>Appends the amd64 test functions to <paramref name="sb"/>; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Passed as the "dependent" stream: the stored data is %rdi.
        string[] storesOfRdi =
        {
            " mov %rdi, (%r8)",
            " mov %rdi, 8(%r8)",
            " mov %rdi, 16(%r8)",
            " mov %rdi, 24(%r8)",
        };

        // Passed as the "independent" stream: the stored data is %r14.
        string[] storesOfR14 =
        {
            " mov %r14, (%r8)",
            " mov %r14, 8(%r8)",
            " mov %r14, 16(%r8)",
            " mov %r14, 24(%r8)",
        };

        int largestCount = Counts[Counts.Length - 1];
        UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, largestCount, Counts, Prefix, storesOfRdi, storesOfR14);
    }
}
45 | }
46 |
--------------------------------------------------------------------------------
/AsmGen/tests/StoreDivNsqTest.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace AsmGen
4 | {
/// <summary>
/// "Store Scheduler, using DIVs to block retirement, excluding NSQ" test.
/// Supports amd64 and aarch64.
/// </summary>
public class StoreDivNsqTest : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public StoreDivNsqTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "storedivnsq";
        Description = "Store Scheduler, using DIVs to block retirement, excluding NSQ";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
    }

    /// <summary>True for amd64 and aarch64, false otherwise.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    /// <summary>Appends the test functions for <paramref name="isa"/> to <paramref name="sb"/>; no-op for unsupported ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        int largestCount;
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
            {
                // idiv puts remainder in RDX, so stores indexed by %rdx wait on the divide
                string[] storesIndexedByRdx =
                {
                    " mov %r15w, (%r8, %rdx, 2)",
                    " mov %r15w, 2(%r8, %rdx, 2)",
                    " mov %r15w, 4(%r8, %rdx, 2)",
                    " mov %r15w, 6(%r8, %rdx, 2)",
                };

                string[] storesWithoutIndex =
                {
                    " mov %r11w, (%r8)",
                    " mov %r11w, 2(%r8)",
                    " mov %r11w, 4(%r8)",
                    " mov %r11w, 6(%r8)",
                };
                largestCount = Counts[Counts.Length - 1];
                UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(sb, largestCount, Counts, Prefix, storesIndexedByRdx, storesWithoutIndex);
                break;
            }
            case IUarchTest.ISA.aarch64:
            {
                string[] storesIndexedByW25 = { " str w15, [x2, w25, uxtw #2]" };
                string[] storesIndexedByW15 = { " str w15, [x2, w15, uxtw #2]" };

                largestCount = Counts[Counts.Length - 1];
                UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(sb, largestCount, Counts, Prefix, storesIndexedByW25, storesIndexedByW15);
                break;
            }
        }
    }
}
54 | }
55 |
--------------------------------------------------------------------------------
/AsmGen/tests/StoreNsq.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace AsmGen
4 | {
/// <summary>
/// "Store Address Scheduler, Excluding any NSQ" test: both streams store w15, w14, w13 and
/// w12 relative to x2; one stream indexes by w25 and the other by w26. aarch64 only.
/// </summary>
public class StoreNsq : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public StoreNsq(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "storensq";
        Description = "Store Address Scheduler, Excluding any NSQ";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
    }

    /// <summary>True only for aarch64.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.aarch64;
    }

    /// <summary>Appends the aarch64 test functions to <paramref name="sb"/>; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // Passed as the "dependent" stream: the address index is w25.
        string[] storesIndexedByW25 =
        {
            " str w15, [x2, w25, uxtw #2]",
            " str w14, [x2, w25, uxtw #2]",
            " str w13, [x2, w25, uxtw #2]",
            " str w12, [x2, w25, uxtw #2]",
        };

        // Passed as the "independent" stream: the address index is w26.
        string[] storesIndexedByW26 =
        {
            " str w15, [x2, w26, uxtw #2]",
            " str w14, [x2, w26, uxtw #2]",
            " str w13, [x2, w26, uxtw #2]",
            " str w12, [x2, w26, uxtw #2]",
        };

        int largestCount = Counts[Counts.Length - 1];
        UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, largestCount, Counts, Prefix, storesIndexedByW25, storesIndexedByW26);
    }
}
42 | }
43 |
--------------------------------------------------------------------------------
/AsmGen/tests/Vec512RfTest.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Text;
3 |
4 | namespace AsmGen
5 | {
/// <summary>
/// "Vector (512-bit packed fp) RF Test - AVX-512 only": the filler is a run of
/// vaddps instructions over zmm1..zmm31, each adding zmm1. amd64 only.
/// </summary>
public class Vec512RfTest : UarchTest
{
    /// <summary>Sets up the sweep of filler counts from <paramref name="low"/>, <paramref name="high"/> and <paramref name="step"/>.</summary>
    public Vec512RfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "vec512rf";
        this.Description = "Vector (512-bit packed fp) RF Test - AVX-512 only";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>True only for amd64.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    /// <summary>Appends the amd64 test functions to <paramref name="sb"/>; no-op for other ISAs.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // it's ok, the ptr chasing arr should be way bigger than this
        StringBuilder init = new StringBuilder();
        init.Append(" vmovups (%r8), %zmm1\n");
        init.Append(" vmovups 64(%r8), %zmm2\n");
        init.Append(" vmovups 128(%r8), %zmm3\n");
        init.Append(" vmovups 192(%r8), %zmm4\n");
        init.Append(" vmovups 256(%r8), %zmm5\n");

        // use all zmm regs: registers 6..31 are initialized by copying zmm5
        for (int i = 6; i < 32; i++)
        {
            init.Append("vmovups %zmm5, %zmm");
            init.Append(i);
            init.Append("\n");
        }
        string initInstrs = init.ToString();

        List<string> instrsList = new List<string>();
        for (int i = 1; i < 32; i++)
        {
            instrsList.Add($" vaddps %zmm1, %zmm{i}, %zmm{i}");
        }

        string[] unrolledAdds = instrsList.ToArray();
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
    }
}
52 | }
--------------------------------------------------------------------------------
/CoherencyLatency/CoherencyLatency.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.31025.194
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CoherencyLatency", "CoherencyLatency.vcxproj", "{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.ActiveCfg = Debug|x64
17 | {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.Build.0 = Debug|x64
18 | {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.ActiveCfg = Debug|Win32
19 | {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.Build.0 = Debug|Win32
20 | {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.ActiveCfg = Release|x64
21 | {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.Build.0 = Release|x64
22 | {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.ActiveCfg = Release|Win32
23 | {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {A6E60C3D-60ED-4DBF-B4AA-7C1C3A140325}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/CoherencyLatency/Makefile:
--------------------------------------------------------------------------------
# Build rules for the coherency latency benchmark.
# arch_detect.mk picks the default TARGET and sets CC per architecture target.
include ../Common/arch_detect.mk

CFLAGS = -pthread -O3

# Default goal: build for the detected TARGET.
all: $(TARGET)

amd64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_amd64 $(LDFLAGS)

aarch64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_aarch64 $(LDFLAGS)

riscv64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_riscv64 $(LDFLAGS)

w64:
	$(CC) $(CFLAGS) CoherencyLatency.cpp -o CoherencyLatency_w64.exe $(LDFLAGS)

# w64 can build with mingw 11, which isn't available on jammy

ci: amd64 aarch64 riscv64

# The globs are left unquoted so the shell expands them; quoted, rm would only
# look for files literally named "ocl-icd-libopencl1*" and "OpenCL-SDK*".
# The find step deletes every executable regular file under this directory.
clean:
	rm -rf *.o *.zip ocl-icd-libopencl1* OpenCL-SDK* && find . -type f -executable -delete

# None of these targets produce a file with the target's own name, so mark them
# all phony; otherwise a stray file or directory called e.g. "amd64" would make
# make consider the target up to date and skip the build.
.PHONY: all amd64 aarch64 riscv64 w64 ci clean
27 |
--------------------------------------------------------------------------------
/CoherencyLatency/c2cparse/c2cparse.csproj:
--------------------------------------------------------------------------------
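<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

</Project>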
11 |
--------------------------------------------------------------------------------
/CoherencyLatency/c2cparse/c2cparse.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.4.33110.190
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "c2cparse", "c2cparse.csproj", "{F9E172EC-1A9A-4908-9512-4547CD1CFD80}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | {F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | {F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Release|Any CPU.Build.0 = Release|Any CPU
18 | EndGlobalSection
19 | GlobalSection(SolutionProperties) = preSolution
20 | HideSolutionNode = FALSE
21 | EndGlobalSection
22 | GlobalSection(ExtensibilityGlobals) = postSolution
23 | SolutionGuid = {4C3856A5-1183-4D5F-80BE-3D694765A594}
24 | EndGlobalSection
25 | EndGlobal
26 |
--------------------------------------------------------------------------------
/Common/arch_detect.mk:
--------------------------------------------------------------------------------
# Shared architecture detection for the benchmark Makefiles.
# Sets TARGET (the default build target) from the host, and a target-specific CC
# for each architecture target.

# Fallback when none of the checks below match the host.
TARGET ?= amd64

ifeq ($(OS),Windows_NT)
    TARGET = w64
else
    UNAME_M := $(shell uname -m)
    ifeq ($(UNAME_M),x86_64)
        TARGET = amd64
    endif
    ifeq ($(UNAME_M),aarch64)
        TARGET = aarch64
    endif
    ifeq ($(UNAME_M),riscv64)
        TARGET = riscv64
    endif
    # The Darwin check runs last, so it overrides the machine-type result.
    UNAME_S := $(shell uname -s)
    ifeq ($(UNAME_S),Darwin)
        TARGET = darwin
    endif
endif

# Target-specific compilers. Every Linux target names its toolchain by triplet so
# the same rules work for native and cross builds (for example a "ci" goal that
# builds amd64, aarch64 and riscv64 on one host). aarch64 previously used the
# plain host "gcc", which emits a host-architecture binary when cross building.
amd64: CC = x86_64-linux-gnu-gcc
amd64_numa: CC = x86_64-linux-gnu-gcc
aarch64: CC = aarch64-linux-gnu-gcc
aarch64_numa: CC = aarch64-linux-gnu-gcc
riscv64: CC = riscv64-linux-gnu-gcc
w64: CC = x86_64-w64-mingw32-gcc
darwin: CC = clang
29 |
--------------------------------------------------------------------------------
/Common/ci_gpumemlatency.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
# Builds the amd64, aarch64, riscv64 and w64 targets in turn, running
# "make clean-obj" after each build.
# The aarch64 and riscv64 builds link against the OpenCL libraries unpacked by
# linux_deps; the w64 build uses the SDK unpacked by w64_deps and needs OCL_VER
# to be set in the environment.
make_all () {
    make amd64
    make clean-obj
    LDFLAGS="-lm -L ocl-icd-arm64/usr/lib/aarch64-linux-gnu -lOpenCL" make aarch64
    make clean-obj
    LDFLAGS="-lm -L ocl-icd-riscv64/usr/lib/riscv64-linux-gnu -lOpenCL" make riscv64
    make clean-obj
    CPPFLAGS="-I OpenCL-SDK-${OCL_VER}-Win-x64/include" LDFLAGS="-lm -L OpenCL-SDK-${OCL_VER}-Win-x64/lib -lOpenCL" make w64
    make clean-obj
}
13 |
# Fetches the ocl-icd-libopencl1 package for arm64 and riscv64 and unpacks each
# into ./ocl-icd-<arch>.
# Side effects: may append ports.ubuntu.com entries to /etc/apt/sources.list and
# run "apt update" (both via sudo).
linux_deps () {
    for ARCH in arm64 riscv64; do
        # Add the ports repository once: skip if sources.list already mentions this arch.
        if ! grep -q $ARCH /etc/apt/sources.list; then
            echo "deb [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list
            echo "deb-src [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list
            sudo apt update
        fi
        apt-get download "ocl-icd-libopencl1:${ARCH}"
        # Extract every downloaded .deb whose name contains the arch into ocl-icd-<arch>.
        find . -type f -name "*${ARCH}*.deb" -exec dpkg-deb -x {} "ocl-icd-${ARCH}" \;
    done
    # Provide an unversioned libOpenCL.so next to libOpenCL.so.1 so the
    # "-lOpenCL" link flag used in make_all can resolve it.
    cp ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so.1 ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so
    cp ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so.1 ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so
}
27 |
# Downloads and unpacks the Khronos OpenCL SDK for Windows x64 into the current
# directory. OCL_VER must be set in the environment to the SDK release tag.
w64_deps () {
    # -f: fail on HTTP errors; -s: no progress meter; -S: still print the error
    # message on failure (the original "-fssLO" repeated -s and failed silently);
    # -L: follow redirects; -O: save under the remote file name.
    curl -fsSLO "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/${OCL_VER}/OpenCL-SDK-${OCL_VER}-Win-x64.zip"
    unzip "OpenCL-SDK-${OCL_VER}-Win-x64.zip"
}
32 |
33 | linux_deps
34 | w64_deps
35 | make_all
36 |
--------------------------------------------------------------------------------
/Common/ci_package.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Collects built benchmark binaries into a per-commit directory, one
# subdirectory per target, and archives the result as clammarks.txz.

# The package directory is named after the short hash of the current git HEAD.
PKG="clammarks-$(git rev-parse --short HEAD)"
# Remove any previous package directory and archive before starting.
rm -rf "$PKG" "clammarks.txz"
mkdir -p "$PKG"

for TARGET in "amd64" "aarch64" "riscv64" "w64"; do
    mkdir "$PKG/$TARGET"
    for COMPONENT in CoherencyLatency MemoryLatency MemoryBandwidth InstructionRate Meshsim CoreClockChecker GpuMemLatency; do
        # Copy every executable file under the component whose name contains the target name.
        find "$COMPONENT" -type f -name "*$TARGET*" -executable -exec cp {} "$PKG/$TARGET" \;
    done
    # Each target directory also receives a copy of the GpuMemLatency *.cl files.
    find "GpuMemLatency" -type f -name "*.cl" -exec cp {} "$PKG/$TARGET" \;
done

cp "LICENSE" "$PKG"

# "a" lets tar pick the compression program from the archive suffix (.txz).
tar caf "clammarks.txz" "$PKG"
18 |
--------------------------------------------------------------------------------
/Common/timing.c:
--------------------------------------------------------------------------------
1 | #ifdef _MSC_VER
2 | #include
3 | __declspec(selectany) struct timeb start, end;
4 | void start_timing() {
5 | ftime(&start);
6 | }
7 |
8 | unsigned int end_timing() {
9 | ftime(&end);
10 | return 1000 * (end.time - start.time) + (end.millitm - start.millitm);
11 | }
12 |
13 | void start_timing_ts(struct timeb *startTimeb) {
14 | ftime(startTimeb);
15 | }
16 |
17 | unsigned int end_timing_ts(struct timeb* startTimeb) {
18 | struct timeb end;
19 | ftime(&end);
20 | return 1000 * (end.time - startTimeb->time) + (end.millitm - startTimeb->millitm);
21 | }
22 | #else
23 | #include
24 | #include
/* Timestamps shared by start_timing() and end_timing(). */
struct timeval startTv, endTv;

/* Milliseconds from *from to *to: whole seconds scaled to ms plus the
 * microsecond difference divided by 1000, truncated to unsigned int. */
static unsigned int timing_timeval_diff_ms(const struct timeval *from, const struct timeval *to) {
    return (unsigned int)((to->tv_sec - from->tv_sec) * 1000 + (to->tv_usec - from->tv_usec) / 1000);
}

/* Records the current time in the shared start timestamp. */
void start_timing() {
    gettimeofday(&startTv, NULL);
}

/* Records the current time in the shared end timestamp and returns the
 * milliseconds elapsed since the last start_timing() call. */
unsigned int end_timing() {
    gettimeofday(&endTv, NULL);
    return timing_timeval_diff_ms(&startTv, &endTv);
}

/* Records the current time in the caller-owned timestamp start. */
void start_timing_ts(struct timeval* start) {
    gettimeofday(start, NULL);
}

/* Returns the milliseconds elapsed since start was filled in by
 * start_timing_ts(). Does not touch the shared timestamps. */
unsigned int end_timing_ts(struct timeval* start) {
    struct timeval now;
    gettimeofday(&now, NULL);
    return timing_timeval_diff_ms(start, &now);
}
45 | #endif
46 |
/*
 * Scales an iteration count so the next run takes roughly target_time.
 *
 * last_iteration_count: iterations used in the previous run.
 * last_time:            time the previous run took (same unit as target_time).
 * target_time:          desired run time.
 *
 * Returns last_iteration_count * (target_time / last_time), or simply double the
 * count when last_time is below 50.
 */
unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time) {
    // safety measure to deal with nasty timer precision issues if the system is fast
    if (last_time < 50) return last_iteration_count * 2;

    /* Multiply in double: a float has a 24-bit mantissa, so converting the count
     * to float rounds any iteration count above 2^24 before it is scaled. */
    double ratio = (double)target_time / (double)last_time;
    return (unsigned long long)((double)last_iteration_count * ratio);
}
52 |
--------------------------------------------------------------------------------
/Common/timing.h:
--------------------------------------------------------------------------------
1 | #ifndef timingincluded
2 | #define timingincluded
3 | #ifdef _MSC_VER
4 | #include
5 | #else
6 | #include
7 | #endif
/* Shared timestamps used by start_timing()/end_timing().
 * NOTE(review): this is declared for every compiler, but the non-MSVC branch of
 * timing.c defines "struct timeval startTv, endTv" instead - confirm whether
 * this declaration should be MSVC-only. */
extern struct timeb start, end;
/* Start/stop a measurement using the shared timestamps; end_timing() returns
 * elapsed milliseconds.
 * NOTE(review): declared inline here while timing.c defines them without
 * inline - presumably relied on when timing.c is #included directly; verify
 * before changing. */
inline void start_timing();
inline unsigned int end_timing();

/* Same as above, but the start timestamp is owned by the caller, so several
 * measurements can be in flight at once. The timestamp type is platform-specific. */
#ifdef _MSC_VER
void start_timing_ts(struct timeb* startTimeb);
unsigned int end_timing_ts(struct timeb* startTimeb);
#else
void start_timing_ts(struct timeval* start);
unsigned int end_timing_ts(struct timeval* start);
#endif
/* Returns an iteration count scaled by target_time / last_time (see timing.c). */
unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time);
20 | #endif
21 |
--------------------------------------------------------------------------------
/CoreClockChecker/BoostClockChecker.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | extern uint64_t clktsctest(uint64_t iterations) __attribute((ms_abi));
10 |
/*
 * Boost clock checker.
 *
 * After an initial sleep, takes `samples` back-to-back clktsctest() readings of
 * `iterations` dependent adds each, then calibrates the counter against
 * gettimeofday() with one long run and prints one CSV row per sample.
 *
 * Options (each takes its value from the next argument):
 *   -samples N      number of samples           (default 100)
 *   -iterations N   adds per sample             (default 500000)
 *   -sleep N        seconds to sleep beforehand (default 5)
 *
 * CSV goes to stdout, diagnostics to stderr.
 * Returns 0 on success, 1 on invalid arguments, allocation failure or a
 * calibration run too short to measure.
 */
int main(int argc, char *argv[]) {
    /* clktsctest() subtracts 20 from its counter per loop pass and exits only
     * when it reaches exactly zero, so counts must be non-zero multiples of 20. */
    const uint64_t addsPerPass = 20;
    struct timeval startTv, endTv;
    uint64_t iterations = 500000, samples = 100;
    unsigned int sleepSeconds = 5;
    time_t time_diff_ms;

    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) != '-') continue;

        char *arg = argv[argIdx] + 1;
        int wantsSamples = strncmp(arg, "samples", 7) == 0;
        int wantsIterations = !wantsSamples && strncmp(arg, "iterations", 10) == 0;
        int wantsSleep = !wantsSamples && !wantsIterations && strncmp(arg, "sleep", 5) == 0;
        if (!wantsSamples && !wantsIterations && !wantsSleep) continue;

        /* Every recognised option consumes the next argument; make sure it exists. */
        if (argIdx + 1 >= argc) {
            fprintf(stderr, "Option -%s requires a value\n", arg);
            return 1;
        }
        argIdx++;
        if (wantsSamples) samples = atol(argv[argIdx]);
        else if (wantsIterations) iterations = atol(argv[argIdx]);
        else sleepSeconds = atoi(argv[argIdx]);
    }

    if (samples == 0 || samples > SIZE_MAX / sizeof(uint64_t)) {
        fprintf(stderr, "samples must be a positive number that fits in memory\n");
        return 1;
    }
    if (iterations == 0 || iterations > UINT64_MAX - addsPerPass) {
        fprintf(stderr, "iterations must be a positive number\n");
        return 1;
    }
    /* Round up so the assembly loop's countdown lands exactly on zero. */
    if (iterations % addsPerPass != 0) {
        iterations += addsPerPass - (iterations % addsPerPass);
    }

    sleep(sleepSeconds);

    uint64_t *measuredTscs = malloc(samples * sizeof(uint64_t));
    if (measuredTscs == NULL) {
        fprintf(stderr, "Could not allocate memory for %llu samples\n", (unsigned long long)samples);
        return 1;
    }
    for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) {
        uint64_t elapsedTsc = clktsctest(iterations);
        measuredTscs[sampleIdx] = elapsedTsc;
    }

    /* %llu with an explicit cast: uint64_t is not `unsigned long` on every target. */
    fprintf(stderr, "Used %llu samples\n", (unsigned long long)samples);
    fprintf(stderr, "Used %llu iterations\n", (unsigned long long)iterations);
    // figure out TSC to real time ratio
    fprintf(stderr, "Checking TSC ratio...\n");
    uint64_t iterationsHi = 8e9; // should be a couple seconds at least?
    gettimeofday(&startTv, NULL);
    uint64_t referenceElapsedTsc = clktsctest(iterationsHi);
    gettimeofday(&endTv, NULL);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    if (time_diff_ms <= 0) {
        /* Would divide by zero below and turn every result into inf/nan. */
        fprintf(stderr, "Calibration run took less than 1 ms; cannot compute TSC ratio\n");
        free(measuredTscs);
        return 1;
    }
    float tsc_per_ms = (float)referenceElapsedTsc / (float)time_diff_ms;
    float tsc_per_ns = tsc_per_ms / 1e6;
    fprintf(stderr, "TSC = %llu, elapsed ms = %lld\n", (unsigned long long)referenceElapsedTsc, (long long)time_diff_ms);
    fprintf(stderr, "TSC per ms: %f, TSC per ns: %f\n", tsc_per_ms, tsc_per_ns);

    printf("Time (ms), Clk (GHz), TSC\n");
    float elapsedTime = 0;
    for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) {
        // tsc / (tsc / ms) = ms
        float elapsedTimeMs = measuredTscs[sampleIdx] / tsc_per_ms;
        elapsedTime += elapsedTimeMs;
        // ms -> ns, divided by the number of adds = ns per add
        float latency = 1e6 * elapsedTimeMs / (float)iterations;
        float addsPerNs = 1 / latency;
        printf("%f,%f,%llu\n", elapsedTime, addsPerNs, (unsigned long long)measuredTscs[sampleIdx]);
    }

    free(measuredTscs);
    return 0;
}
68 |
--------------------------------------------------------------------------------
/CoreClockChecker/BoostClockChecker_arm.s:
--------------------------------------------------------------------------------
1 | .text
2 | .global clktsctest
3 |
4 | .global _clktsctest
5 |
6 | .balign 4
7 |
8 | /* x0 = iterations, return elapsed TSC in x0 */
9 | _clktsctest:
10 | clktsctest:
11 | sub sp, sp, #0x40
12 | stp x10, x11, [sp, #0x10]
13 | stp x12, x13, [sp, #0x20]
14 | stp x14, x15, [sp, #0x30]
15 | mov x10, 1
16 | mov x11, 20
17 | mov x12, 0
18 | /* stackoverflow says this is a good idea */
19 | mrs x14, cntvct_el0
20 | clktsctest_loop:
21 | add x12, x12, x10
22 | add x12, x12, x10
23 | add x12, x12, x10
24 | add x12, x12, x10
25 | add x12, x12, x10
26 | add x12, x12, x10
27 | add x12, x12, x10
28 | add x12, x12, x10
29 | add x12, x12, x10
30 | add x12, x12, x10
31 | add x12, x12, x10
32 | add x12, x12, x10
33 | add x12, x12, x10
34 | add x12, x12, x10
35 | add x12, x12, x10
36 | add x12, x12, x10
37 | add x12, x12, x10
38 | add x12, x12, x10
39 | add x12, x12, x10
40 | add x12, x12, x10
41 | sub x0, x0, x11
42 | cbnz x0, clktsctest_loop
43 | mrs x15, cntvct_el0
44 | sub x0, x15, x14
45 | ldp x14, x15, [sp, #0x30]
46 | ldp x12, x13, [sp, #0x20]
47 | ldp x10, x11, [sp, #0x10]
48 | add sp, sp, #0x40
49 | ret
50 |
--------------------------------------------------------------------------------
/CoreClockChecker/BoostClockChecker_x86.s:
--------------------------------------------------------------------------------
1 | .global clktsctest
2 |
3 | /* rcx = iterations, return elapsed TSC in rax */
4 | clktsctest:
5 | push %rdx
6 | push %rbx
7 | push %r8
8 | push %r9
9 | push %r10
10 | mov %rcx, %rdi
11 | mov $1, %r8
12 | mov $20, %r9
13 | xor %rbx, %rbx
14 | rdtsc /* high 32 bits in EDX, low 32 bits in EAX */
15 | shl $32, %rdx /* shift high 32 bits into upper half of EDX */
16 | add %rax, %rdx /* place full 64-bit value in rdx */
17 | mov %rdx, %r10
18 | clktsctest_loop:
19 | add %r8, %rbx
20 | add %r8, %rbx
21 | add %r8, %rbx
22 | add %r8, %rbx
23 | add %r8, %rbx
24 | add %r8, %rbx
25 | add %r8, %rbx
26 | add %r8, %rbx
27 | add %r8, %rbx
28 | add %r8, %rbx
29 | add %r8, %rbx
30 | add %r8, %rbx
31 | add %r8, %rbx
32 | add %r8, %rbx
33 | add %r8, %rbx
34 | add %r8, %rbx
35 | add %r8, %rbx
36 | add %r8, %rbx
37 | add %r8, %rbx
38 | add %r8, %rbx
39 | sub %r9, %rdi
40 | jnz clktsctest_loop
41 | rdtsc
42 | shl $32, %rdx
43 | add %rdx, %rax /* now rax has the new value */
44 | sub %r10, %rax /* subtract old TSC value from the new one, which should be larger */
45 | pop %r10
46 | pop %r9
47 | pop %r8
48 | pop %rbx
49 | pop %rdx
50 | ret
51 |
--------------------------------------------------------------------------------
/CoreClockChecker/CoreClockChecker_x86.s:
--------------------------------------------------------------------------------
1 | .global clktest
2 |
3 | /*
4 | %rdi = arg0 = iteration count
5 | */
6 | clktest:
7 | push %rbx
8 | push %r8
9 | push %r9
10 | mov $1, %r8
11 | mov $20, %r9
12 | xor %rbx, %rbx
13 | clktest_loop:
14 | add %r8, %rbx
15 | add %r8, %rbx
16 | add %r8, %rbx
17 | add %r8, %rbx
18 | add %r8, %rbx
19 | add %r8, %rbx
20 | add %r8, %rbx
21 | add %r8, %rbx
22 | add %r8, %rbx
23 | add %r8, %rbx
24 | add %r8, %rbx
25 | add %r8, %rbx
26 | add %r8, %rbx
27 | add %r8, %rbx
28 | add %r8, %rbx
29 | add %r8, %rbx
30 | add %r8, %rbx
31 | add %r8, %rbx
32 | add %r8, %rbx
33 | add %r8, %rbx
34 | sub %r9, %rdi
35 | jnz clktest_loop
36 | pop %r9
37 | pop %r8
38 | pop %rbx
39 | ret
40 |
--------------------------------------------------------------------------------
/CoreClockChecker/Makefile:
--------------------------------------------------------------------------------
include ../Common/arch_detect.mk

CFLAGS = -O3
LDFLAGS = -lm

# TARGET comes from arch_detect.mk and selects one of the per-architecture
# targets below.
all: $(TARGET)

amd64:
	$(CC) $(CFLAGS) -pthread CoreClockChecker.c CoreClockChecker_x86.s -o CoreClockChecker_amd64 $(LDFLAGS)
	$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_amd64 $(LDFLAGS)

aarch64:
	$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_arm.s -o BoostClockChecker_aarch64 $(LDFLAGS)

w64:
	$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_w64.exe $(LDFLAGS)

ci: amd64 aarch64 w64

clean:
	rm -f *.o && find . -type f -executable -delete

# The architecture targets never create a file of their own name, so mark them
# phony too; otherwise a stray file called e.g. "amd64" would suppress the build.
.PHONY: all amd64 aarch64 w64 ci clean
24 |
--------------------------------------------------------------------------------
/CoreClockChecker/WinCoreClockChecker/CoreClockCheckFunctions.asm:
--------------------------------------------------------------------------------
section .text
bits 64

global clktest

; rcx = iteration count (reduced by 20 per loop pass)
; rdx = address of a 32-bit flag shared with the other callers
; returns in rax the number of adds completed (the value accumulated in rbx)
;
; The flag is polled once per pass; a non-zero value means some other caller
; already finished, so this one exits early. A caller that runs its full
; iteration count sets the flag to 1 on the way out.
clktest:
  push rdx
  push rbx
  push r8
  push r9
  push r10
  push r11
  xor rbx, rbx
  mov r8, 1         ; GLC will eliminate adds with immediates or increments
clktest_loop:
  times 20 add rbx, r8
  mov r11d, [rdx]
  test r11d, r11d
  jnz clktest_loop_end ; early exit condition (someone else exited)
  sub rcx, 20
  jg clktest_loop
  ; store with the same 32-bit width the flag is read with, so the four bytes
  ; after it are left untouched
  mov dword [rdx], r8d
clktest_loop_end:
  mov rax, rbx
  pop r11
  pop r10
  pop r9
  pop r8
  pop rbx
  pop rdx
  ret
--------------------------------------------------------------------------------
/CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.9.34723.18
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WinCoreClockChecker", "WinCoreClockChecker.vcxproj", "{D70EC1DD-794C-4156-8483-227E566CC76B}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.ActiveCfg = Debug|x64
17 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.Build.0 = Debug|x64
18 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.ActiveCfg = Debug|Win32
19 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.Build.0 = Debug|Win32
20 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.ActiveCfg = Release|x64
21 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.Build.0 = Release|x64
22 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.ActiveCfg = Release|Win32
23 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {6AA7051E-EAEF-48CA-9C08-8641D57B3EB1}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
23 |
24 | Source Files
25 |
26 |
27 |
--------------------------------------------------------------------------------
/GpuMemLatency/Makefile:
--------------------------------------------------------------------------------
include ../Common/arch_detect.mk

OCL_VER = v2023.04.17
CI_SCRIPT = ../Common/ci_gpumemlatency.sh

CFLAGS = -O3 -I ../Common
# Header shared by the objects. The name must match the real file
# (../Common/timing.h): with a non-existent prerequisite the pattern rule
# below is never selected and header changes do not trigger a rebuild.
DEPS = ../Common/timing.h
OBJ = opencltest.o latency_test.o bw_test.o common.o atomic_test.o instruction_rate.o timing.o
LDFLAGS ?= -lm -lOpenCL
ifeq ($(TARGET), Darwin)
LDFLAGS = -lm -framework OpenCL
endif

all: $(TARGET)

GpuMemLatency: $(OBJ)
	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -o $@ $(LDFLAGS)

%.o: %.c $(DEPS)
	$(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $<

# timing.c lives in ../Common, so it needs an explicit rule; list its inputs so
# the object is rebuilt when they change.
timing.o: ../Common/timing.c $(DEPS)
	$(CC) $(CFLAGS) -c ../Common/timing.c -o timing.o

amd64: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_amd64 $(LDFLAGS)

aarch64: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_aarch64 $(LDFLAGS)

riscv64: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_riscv64 $(LDFLAGS)

w64: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_w64.exe $(LDFLAGS)

darwin: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_darwin $(LDFLAGS)

ci: clean
	@OCL_VER=$(OCL_VER) sh $(CI_SCRIPT)

# Patterns are left unquoted on purpose: quoted, the shell passes the literal
# text "*.deb" to rm and nothing is ever removed.
clean-ci:
	rm -rf *.deb *.zip ocl-icd-* OpenCL-SDK-*

clean-obj:
	rm -f *.o

clean: clean-ci clean-obj
	find . -type f -executable -delete

.PHONY: all amd64 aarch64 riscv64 w64 darwin ci clean-ci clean-obj clean
53 |
--------------------------------------------------------------------------------
/GpuMemLatency/OpenCL/README.md:
--------------------------------------------------------------------------------
1 | # OpenCL™ API Headers
2 |
3 | This repository contains C language headers for the OpenCL API.
4 |
5 | The authoritative public repository for these headers is located at:
6 |
7 | https://github.com/KhronosGroup/OpenCL-Headers
8 |
9 | Issues, proposed fixes for issues, and other suggested changes should be
10 | created using Github.
11 |
12 | ## Branch Structure
13 |
14 | The OpenCL API headers in this repository are Unified headers and are designed
15 | to work with all released OpenCL versions. This differs from previous OpenCL
16 | API headers, where version-specific API headers either existed in separate
17 | branches, or in separate folders in a branch.
18 |
19 | ## Compiling for a Specific OpenCL Version
20 |
21 | By default, the OpenCL API headers in this repository are for the latest
22 | OpenCL version (currently OpenCL 2.2). To use these API headers to target
23 | a different OpenCL version, an application may `#define` the preprocessor
24 | value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers.
25 | The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing
26 | the OpenCL API version.
27 |
28 | For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may
29 | include the OpenCL API headers as follows:
30 |
31 | ```
32 | #define CL_TARGET_OPENCL_VERSION 120
33 | #include <CL/opencl.h>
34 | ```
35 |
36 | ## Directory Structure
37 |
38 | ```
39 | README.md This file
40 | LICENSE Source license for the OpenCL API headers
41 | CL/ Unified OpenCL API headers tree
42 | ```
43 |
44 | ## License
45 |
46 | See [LICENSE](LICENSE).
47 |
48 | ---
49 |
50 | OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos.
51 |
--------------------------------------------------------------------------------
/GpuMemLatency/OpenCL/include/CL/cl_gl_ext.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright (c) 2008-2020 The Khronos Group Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | #ifndef __OPENCL_CL_GL_EXT_H
18 | #define __OPENCL_CL_GL_EXT_H
19 |
20 | #ifdef __cplusplus
21 | extern "C" {
22 | #endif
23 |
24 | #include
25 |
26 | /*
27 | * cl_khr_gl_event extension
28 | */
29 | #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
30 |
31 | extern CL_API_ENTRY cl_event CL_API_CALL
32 | clCreateEventFromGLsyncKHR(cl_context context,
33 | cl_GLsync cl_GLsync,
34 | cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
35 |
36 | #ifdef __cplusplus
37 | }
38 | #endif
39 |
40 | #endif /* __OPENCL_CL_GL_EXT_H */
41 |
--------------------------------------------------------------------------------
/GpuMemLatency/OpenCL/include/CL/opencl.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright (c) 2008-2020 The Khronos Group Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | #ifndef __OPENCL_H
18 | #define __OPENCL_H
19 |
20 | #ifdef __cplusplus
21 | extern "C" {
22 | #endif
23 |
24 | #include
25 | #include
26 | #include
27 | #include
28 |
29 | #ifdef __cplusplus
30 | }
31 | #endif
32 |
33 | #endif /* __OPENCL_H */
34 |
--------------------------------------------------------------------------------
/GpuMemLatency/OpenCL/lib/OpenCL.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/GpuMemLatency/OpenCL/lib/OpenCL.lib
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/atomic_exec_latency_test.cl:
--------------------------------------------------------------------------------
// Work-items take turns advancing the counter at *A with compare-and-swap:
// each one waits until *A equals (its next value - 1), writes its next value,
// then moves on by 2. Finishes once its next value exceeds 2 * count.
// ret is not written by this kernel.
__kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) {
    int next = get_global_id(0) + 1;
    while (next <= 2 * count) {
        int expected = next - 1;
        if (atomic_cmpxchg(A, expected, next) == expected) {
            next += 2;
        }
    }
}
9 |
// Each work-item issues 8 atomic adds per iteration to its own element of A
// (A + global id), using eight different addends derived from its global id.
__kernel void atomic_add_test(__global int *A, int count) {
    int base = get_global_id(0);
    int plus5 = base + 5;
    int plus6 = base + 6;
    int plus7 = base + 7;
    int plus8 = base + 8;
    int plus9 = base + 9;
    int plus10 = base + 10;
    int plus11 = base + 11;
    __global int *slot = A + get_global_id(0);
    int i = 0;
    while (i < count) {
        atomic_add(slot, base);
        atomic_add(slot, plus5);
        atomic_add(slot, plus6);
        atomic_add(slot, plus7);
        atomic_add(slot, plus8);
        atomic_add(slot, plus9);
        atomic_add(slot, plus10);
        atomic_add(slot, plus11);
        i++;
    }
}
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/buffer_bw_test.cl:
--------------------------------------------------------------------------------
#define fixed_tex_test_size 1024
// Reads a 1D image buffer through read_imageui. Indices start at the local id
// and advance by the work-group size, wrapping with a 0x3FF mask. The loop
// counter advances by 16 per pass. The four accumulators are folded into one
// float per work-item so the reads cannot be discarded.
__kernel void buffer_bw_test(__read_only image1d_buffer_t A, uint count, __global float* ret) {
    int globalIdx = get_global_id(0);
    int lid = get_local_id(0);
    int stride = get_local_size(0);
    uint4 sumA = read_imageui(A, 0);
    uint4 sumB = read_imageui(A, 1);
    uint4 sumC = read_imageui(A, 2);
    uint4 sumD = read_imageui(A, 3);

    int posA = lid;
    int posB = lid + stride;
    int posC = lid + stride * 2;

    // Each read_imageui reads out a 4-wide vector
    for (int i = 0; i < count; i += 16) {
        read_imageui(A, posA);              // result intentionally unused, as in the original sequence
        sumA += read_imageui(A, posA);
        sumB += read_imageui(A, posB);
        sumC += read_imageui(A, posC);
        sumD += read_imageui(A, posA + 1);
        posA = (posA + stride) & 0x3FF;
        posB = (posB + stride) & 0x3FF;
        posC = (posC + stride) & 0x3FF;
    }

    float4 fA = convert_float4(sumA);
    float4 fB = convert_float4(sumB);
    float4 fC = convert_float4(sumC);
    float4 fD = convert_float4(sumD);
    ret[globalIdx] = dot(fA, fB) + dot(fC, fD);
}
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/c2c_atomic_exec_latency_test.cl:
--------------------------------------------------------------------------------
// Bounces one location between two chosen work-items (hoping each lands on a
// different CU).
//   A     = pointer to the location being bounced around
//   count = iterations
//   ret   = sink; ret[0] receives the final counter value of a participant
//   t1    = global id of the first participant (starts at 1)
//   t2    = global id of the second participant (starts at 2)
// Every other work-item returns immediately.
__kernel void c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) {
    int gid = get_global_id(0);
    if (gid != t1 && gid != t2) return;

    int next = (gid == t1) ? 1 : 2;
    while (next <= 2 * count) {
        int expected = next - 1;
        if (atomic_cmpxchg(A, expected, next) == expected) {
            next += 2;
        }
    }
    ret[0] = next;
}
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/constant_unrolled_latency_test.cl:
--------------------------------------------------------------------------------
// Pointer-chasing latency test with the chased array in constant memory.
//   A     = array in which each element holds the index of the next one to load
//   count = number of dependent loads (10 per loop pass)
//   ret   = with more than one work-item, ret[global id] supplies the starting
//           index; ret[0] receives the accumulated sink value
__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {
    //int current = A[0];
    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];
    // Must start from a defined value: it is accumulated into before ever being assigned.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
    }

    ret[0] = result;
}
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/local_64_bw_test.cl:
--------------------------------------------------------------------------------
#define local64_test_size 2048 // size was given in 4B elements. This test uses 8B
// Local-memory bandwidth test with 64-bit elements.
//   A     = source data copied into local memory (at least local64_test_size ulongs)
//   count = loop bound; the loop counter advances by 8 per pass
//   ret   = one sink value per work-item, indexed by global id
__kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) {
    __local ulong local_a[local64_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local64_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;

    // indices wrap within the first 512 elements (mask 0x1FF)
    int idx0 = localId;
    int idx1 = localId + localSize;
    for (int i = 0; i < count; i += 8) {
        acc0 ^= local_a[idx0];
        acc1 ^= local_a[idx1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;

        // second pair of accumulators: acc2/acc3 (acc4 was never declared)
        acc2 ^= local_a[idx0];
        acc3 ^= local_a[idx1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
    }

    ret[threadId] = acc0 + acc1 + acc2 + acc3;
}
33 |
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/local_atomic_latency_test.cl:
--------------------------------------------------------------------------------
// Same turn-taking compare-and-swap loop as the global-memory version, but on
// a single int held in local memory. The work-item whose global id is 0 seeds
// that int from A[0] before the barrier. ret is not written by this kernel.
__kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) {
    __local int a[1];
    int next = get_global_id(0) + 1;
    if (next == 1) a[0] = A[0];
    barrier(CLK_LOCAL_MEM_FENCE);

    while (next <= 2 * count) {
        int expected = next - 1;
        if (atomic_cmpxchg(a, expected, next) == expected) {
            next += 2;
        }
    }
}
13 |
#define local_atomic_add_wg_size 256
// Each work-item copies its element of A into a local-memory slot (indexed by
// local id), issues 8 atomic adds per iteration to that slot, then writes the
// slot back to A. The local array holds local_atomic_add_wg_size ints.
__kernel void local_atomic_add_test(__global int *A, int count) {
    __local int local_a[local_atomic_add_wg_size];
    local_a[get_local_id(0)] = A[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    int base = get_global_id(0);
    int plus5 = base + 5;
    int plus6 = base + 6;
    int plus7 = base + 7;
    int plus8 = base + 8;
    int plus9 = base + 9;
    int plus10 = base + 10;
    int plus11 = base + 11;
    __local int *slot = local_a + get_local_id(0);
    int i = 0;
    while (i < count) {
        atomic_add(slot, base);
        atomic_add(slot, plus5);
        atomic_add(slot, plus6);
        atomic_add(slot, plus7);
        atomic_add(slot, plus8);
        atomic_add(slot, plus9);
        atomic_add(slot, plus10);
        atomic_add(slot, plus11);
        i++;
    }

    A[get_global_id(0)] = local_a[get_local_id(0)];
}
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/local_bw_test.cl:
--------------------------------------------------------------------------------
#define local_mem_bw_test_size 1024
// One step of the access pattern: three local-memory loads feeding a
// multiply-add into `acc`, then all three indices advance by the work-group
// size and wrap with a 0x3FF mask.
#define LOCAL_BW_STEP(acc) \
    acc += local_a[idx0] * local_a[idx1] + local_a[idx2]; \
    idx0 = (idx0 + localSize) & 0x3FF; \
    idx1 = (idx1 + localSize) & 0x3FF; \
    idx2 = (idx2 + localSize) & 0x3FF;

// Local-memory bandwidth test on floats. A must hold at least
// local_mem_bw_test_size floats; ret receives one sink value per work-item.
__kernel void local_bw_test(__global float* A, uint count, __global float* ret) {
    __local float local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // 4 steps x 3 loads per pass, matching the counter increment of 12
    for (int i = 0; i < count; i += 12) {
        LOCAL_BW_STEP(acc1)
        LOCAL_BW_STEP(acc2)
        LOCAL_BW_STEP(acc3)
        LOCAL_BW_STEP(acc4)
    }

    ret[threadId] = acc1 + acc2 + acc3 + acc4;
}
#undef LOCAL_BW_STEP
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/local_float4_bw_test.cl:
--------------------------------------------------------------------------------
#define local_mem_bw_test_size 1024
// One step of the access pattern: three float4 local-memory loads feeding a
// multiply-add into `acc`, then all three indices advance by the work-group
// size and wrap with a 0x3FF mask.
#define LOCAL_F4_BW_STEP(acc) \
    acc += local_a[idx0] * local_a[idx1] + local_a[idx2]; \
    idx0 = (idx0 + localSize) & 0x3FF; \
    idx1 = (idx1 + localSize) & 0x3FF; \
    idx2 = (idx2 + localSize) & 0x3FF;

// Local-memory bandwidth test on float4 elements. A must hold at least
// local_mem_bw_test_size float4s; ret receives one sink value per work-item.
__kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) {
    __local float4 local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    // accumulators are seeded from global memory
    float4 acc1 = A[get_global_id(0) & 0x3FF];
    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];
    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];
    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // 4 steps x 3 loads x 4 floats per pass, matching the counter increment of 12*4
    for (int i = 0; i < count; i += (12*4)) {
        LOCAL_F4_BW_STEP(acc1)
        LOCAL_F4_BW_STEP(acc2)
        LOCAL_F4_BW_STEP(acc3)
        LOCAL_F4_BW_STEP(acc4)
    }

    ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4);
}
#undef LOCAL_F4_BW_STEP
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/local_unrolled_latency_test.cl:
--------------------------------------------------------------------------------
#define local_mem_test_size 1024
// Pointer-chasing latency test in local memory (LDS/shmem).
//   A     = array of at least local_mem_test_size ints; each element holds the
//           index of the next one to load
//   count = number of dependent loads (10 per loop pass)
//   ret   = ret[0] receives the accumulated sink value
__kernel void local_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    __local int local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?
    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local_mem_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // only the first work-item of each group does the chase
    if (get_local_id(0) == 0) {
        int current = local_a[0];
        // Must start from a defined value: it is accumulated into before ever being assigned.
        int result = 0;
        for (int i = 0; i < count; i += 10) {
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
        }

        ret[0] = result;
    }
}
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/scalar_unrolled_latency_test.cl:
--------------------------------------------------------------------------------
// Pointer-chasing latency test where the loaded value is the same for every
// work-item of a work-group.
//   A     = array in which each element holds the index of the next one to load
//   count = number of dependent loads (10 per loop pass)
//   ret   = with more than one work-group, ret[group id * local size] supplies
//           the group's starting index; ret[0] receives the accumulated sink value
__kernel void scalar_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    int current = get_num_groups(0) > 1 ? ret[get_group_id(0) * get_local_size(0)]: A[0];
    // Must start from a defined value: it is accumulated into before ever being assigned.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
    }

    ret[0] = result;
}
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/sum_bw_test.cl:
--------------------------------------------------------------------------------
// Global-memory bandwidth test: each work-item sums float4 elements of A.
//   A              = data, read as float4
//   count          = loop bound; the loop counter advances by 20 per pass
//                    (5 float4 loads)
//   float4size     = number of float4 elements; an index reaching it is reset
//                    to the work-item's start position
//   ret            = one sink value per work-item, indexed by global id
//   skip           = not used by the active code (only by the commented-out
//                    start-position formula)
//   startPositions = starting float4 index for each work-item
__kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) {
    int threadId = get_global_id(0);
    int localSize = get_local_size(0);
    // Vector literals need the (float4) prefix; a bare parenthesised list is a
    // comma expression that keeps only its last value.
    float4 result1 = (float4)(0.1f,0.2f,0.3f,0.4f);
    float4 result2 = (float4)(1.1f,1.2f,1.3f,1.4f);
    float4 result3 = (float4)(2.1f,2.2f,2.3f,2.4f);
    float4 result4 = (float4)(3.0f,3.1f,3.2f,3.3f);
    float4 result5 = (float4)(4.0f,4.2f,4.1f,4.3f);

    int initialIdx = startPositions[threadId];
    //int initialIdx = (groupId * skip * localSize + localId) % (float4size - 1);
    //startPositions[threadId] = initialIdx; // for debugging

    int idx = initialIdx;
    __global float4 *B = (__global float4 *)A;
    for (int i = 0; i < count; i += 20) {
        result1 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;

        result2 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;

        result3 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;

        result4 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;

        result5 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
    }

    ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5);
}
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/tex_bw_test.cl:
--------------------------------------------------------------------------------
1 | __constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float)
2 | CLK_ADDRESS_REPEAT | // going out of bounds = replicate
3 | CLK_FILTER_NEAREST;
4 | __kernel void tex_bw_test(__read_only image2d_t A, int count, __global float* ret) { // issues four independent read_imagef streams per work-item
5 | int localId = get_local_id(0);
6 | float pos = get_global_id(0) * native_recip((float)get_global_size(0)); // global id scaled by 1/global_size
7 | float2 increment;
8 | increment.x = 0.01f; // guessing
9 | increment.y = 0.01f;
10 |
11 | float2 current0, current1, current2, current3;
12 | current0.x = pos;
13 | current0.y = pos;
14 | current1.x = 0.1f + (localId / 10000.0f); // float division: the integer form was always 0 for local ids below 10000
15 | current1.y = 0.1f + (localId / 10000.0f);
16 | current2.x = 0.01f + (localId / 10000.0f);
17 | current2.y = 0.01f + (localId / 10000.0f);
18 | current3.x = 0.002f + (localId / 5000.0f);
19 | current3.y = 0.001f + (localId / 5000.0f);
20 |
21 | float4 tmp0 = read_imagef(A, funny_sampler, current0);
22 | float4 tmp1 = read_imagef(A, funny_sampler, current1);
23 | float4 tmp2 = read_imagef(A, funny_sampler, current2);
24 | float4 tmp3 = read_imagef(A, funny_sampler, current3);
25 | for (int i = 0; i < count; i += 4) // 4 texture reads per iteration
26 | {
27 | tmp0 += read_imagef(A, funny_sampler, current0);
28 | tmp1 += read_imagef(A, funny_sampler, current1);
29 | tmp2 += read_imagef(A, funny_sampler, current2);
30 | tmp3 += read_imagef(A, funny_sampler, current3);
31 | current0 += increment;
32 | current1 += increment;
33 | current2 += increment;
34 | current3 += increment;
35 | }
36 |
37 | *ret = dot(tmp0, tmp1) + dot(tmp2, tmp3); // every work-item stores to the same location
38 | }
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/tex_latency_test.cl:
--------------------------------------------------------------------------------
1 | __kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) { // dependent-read chain through image buffer A; list_size is not referenced in the body
2 | int localId = get_local_id(0); // only referenced by the commented-out local_a store below
3 | // uint4 current = read_imageui(A, direct_sampler, 0); // using sampler screws things up
4 | int startPos = get_global_size(0) > 1 ? ret[get_global_id(0)] : 0; // with more than one work-item, each takes its start index from ret
5 | uint4 current = read_imageui(A, startPos);
6 | // printf("start x: %u -> %u\n", startPos, current.x);
7 | for (int i = 0; i < count; i += 10) { // 10 reads per iteration
8 | // printf("current: %u %u %u %u, address: %d\n", current.x, current.y, current.z, current.w, (int)current.x / 4);
9 | //current = read_imageui(A, direct_sampler, i);
10 | current = read_imageui(A, current.x); // each read's coordinate is the x component returned by the previous read
11 | current = read_imageui(A, current.x);
12 | current = read_imageui(A, current.x);
13 | current = read_imageui(A, current.x);
14 | current = read_imageui(A, current.x);
15 | current = read_imageui(A, current.x);
16 | current = read_imageui(A, current.x);
17 | current = read_imageui(A, current.x);
18 | current = read_imageui(A, current.x);
19 | current = read_imageui(A, current.x);
20 | //printf("%d: current read: %u %u %u %u\n", i, current.x, current.y, current.z, current.w);
21 | // local_a[localId] = current;
22 | }
23 |
24 | ret[get_global_id(0)] = current.x; // store the last index read; presumably keeps the read chain from being optimized away
25 | }
26 |
--------------------------------------------------------------------------------
/GpuMemLatency/kernels/unrolled_latency_test.cl:
--------------------------------------------------------------------------------
1 | // unrolled until terascale no longer saw further improvement (10x unroll)
2 | // assumes count will be a multiple of 10. but it won't be too inaccurate with a big count
3 | // not divisible by 10
4 | __kernel void unrolled_latency_test(__global const int* A, int count, __global int* ret) {
5 | int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0]; // this will test vector latency on AMD. Set to A[0] for scalar latency
6 | int result = 0; // must start defined: it is accumulated into below and then stored to ret[0]
7 | for (int i = 0; i < count; i += 10) { // 10 dependent loads per iteration
8 | result += current;
9 | current = A[current];
10 | result += current;
11 | current = A[current];
12 | result += current;
13 | current = A[current];
14 | result += current;
15 | current = A[current];
16 | result += current;
17 | current = A[current];
18 | result += current;
19 | current = A[current];
20 | result += current;
21 | current = A[current];
22 | result += current;
23 | current = A[current];
24 | result += current;
25 | current = A[current];
26 | result += current;
27 | current = A[current];
28 | }
29 |
30 | ret[0] = result; // every work-item stores its sum to the same slot
31 | }
--------------------------------------------------------------------------------
/GpuMemLatency/local_mem_latency_kernel.cl:
--------------------------------------------------------------------------------
1 | // for testing total local memory capacity by seeing when threads can no longer overlap in time
2 | // due to local mem capacity limits across the GPU
3 | // calling code expected to define LATENCY_LOCAL_MEM_SIZE
4 | __kernel void unrolled_latency_test_localmem(__global const int* A, int count, __global int* ret) {
5 | __local int local_a[LATENCY_LOCAL_MEM_SIZE];
6 | int start = A[0]; // this will test scalar latency, always
7 | int current = A[start];
8 | int result = 0; // must start defined before the += chain below reads it
9 | for (int i = 0; i < count; i += 10) { // 10 dependent loads per iteration
10 | result += current;
11 | current = A[current];
12 | result += current;
13 | current = A[current];
14 | result += current;
15 | current = A[current];
16 | result += current;
17 | current = A[current];
18 | result += current;
19 | current = A[current];
20 | result += current;
21 | current = A[current];
22 | result += current;
23 | current = A[current];
24 | result += current;
25 | current = A[current];
26 | result += current;
27 | current = A[current];
28 | result += current;
29 | current = A[current];
30 | local_a[i & (LATENCY_LOCAL_MEM_SIZE - 1)] = current; // masking as an index wrap assumes LATENCY_LOCAL_MEM_SIZE is a power of two - TODO confirm with the host code
31 | }
32 |
33 | ret[0] = local_a[current & (LATENCY_LOCAL_MEM_SIZE - 1)]; // the stored value comes from local memory, so local_a is actually used
34 | }
35 |
--------------------------------------------------------------------------------
/GpuMemLatency/opencltest.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.30503.244
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "opencltest", "opencltest.vcxproj", "{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.ActiveCfg = Debug|x64
17 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.Build.0 = Debug|x64
18 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.ActiveCfg = Debug|Win32
19 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.Build.0 = Debug|Win32
20 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.ActiveCfg = Release|x64
21 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.Build.0 = Release|x64
22 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.ActiveCfg = Release|Win32
23 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {4447E91D-E7A1-4249-87A7-E75A78167E71}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/GpuMemLatency/opencltest.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/GpuMemLatency/texturetest.c:
--------------------------------------------------------------------------------
1 | #include "opencltest.h"
2 |
3 |
--------------------------------------------------------------------------------
/InstructionRate/Makefile:
--------------------------------------------------------------------------------
1 | include ../Common/arch_detect.mk
2 |
3 | CFLAGS = -O3
4 |
5 | all: $(TARGET)
6 |
7 | amd64:
8 | $(CC) $(CFLAGS) x86_instructionrate.s x86_instructionrate.c -o InstructionRate_amd64 $(LDFLAGS)
9 |
10 | aarch64:
11 | $(CC) $(CFLAGS) -march=native -pthread arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)
12 |
13 | riscv64:
14 | $(CC) $(CFLAGS) -march=rv64gc -pthread riscv_instructionrate.s riscv_instructionrate.c -o InstructionRate_riscv64 $(LDFLAGS)
15 |
16 | termux:
17 | clang -march=armv8+aes arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)
18 |
19 | amd64_fusion:
20 | $(CC) $(CFLAGS) x86_fusion.s x86_fusion.c -o InstructionRateFusion_amd64 $(LDFLAGS)
21 |
22 | w64:
23 | $(CC) $(CFLAGS) x86_instructionrate.c x86_instructionrate.s -o InstructionRate_w64.exe $(LDFLAGS)
24 |
25 | ci: amd64 amd64_fusion aarch64 riscv64 w64
26 |
27 | clean:
28 | rm -f *.o && find . -type f -executable -delete
29 |
30 | .PHONY: all ci clean
31 |
--------------------------------------------------------------------------------
/LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.asm:
--------------------------------------------------------------------------------
1 | section .text
2 | bits 64
3 |
4 | global asm_read
5 |
6 | ; rcx = ptr to array
7 | ; rdx = array length in bytes
8 | ; r8 = ptr to stop flag (32-bit value, nonzero ends the loop)
9 | ; r9 = throttle factor (spin iterations after each 128B; 0 = no throttling)
10 | ; return bytes read in rax
11 | asm_read:
12 | push rdi
13 | push rsi
14 | push r10
15 | push r11 ; r11 is saved/restored but not otherwise used here
16 | mov rdi, rcx ; rdi = moving read pointer, rcx keeps the array base address
17 | xor rsi, rsi ; index
18 | xor rax, rax ; return value
19 | asm_read_pass_loop:
20 | movups xmm0, [rdi] ; 8 x 16B loads = 128B per pass, results discarded
21 | movups xmm0, [rdi + 16]
22 | movups xmm0, [rdi + 32]
23 | movups xmm0, [rdi + 48]
24 | movups xmm0, [rdi + 64]
25 | movups xmm0, [rdi + 80]
26 | movups xmm0, [rdi + 96]
27 | movups xmm0, [rdi + 112]
28 |
29 | add rdi, 128 ; advance read pointer
30 | add rsi, 128 ; update index
31 | add rax, 128 ; update return value
32 |
33 | test r9, r9 ; need to throttle?
34 | jz asm_read_throttle_end
35 | mov r10, r9 ; r10 = countdown copy of the throttle factor
36 | asm_read_throttle:
37 | dec r10
38 | jnz asm_read_throttle; spin until the countdown reaches zero
39 | asm_read_throttle_end:
40 | mov r10d, [r8] ; check stop flag
41 | test r10d, r10d
42 | jnz asm_read_end
43 |
44 | cmp rdx, rsi ; array len - index > 0?
45 | jg asm_read_pass_loop ; signed compare: keep going while len > index
46 | mov rdi, rcx ; reset to start
47 | xor rsi, rsi ; and reset index
48 | jmp asm_read_pass_loop
49 | asm_read_end:
50 | pop r11 ; restore in reverse push order
51 | pop r10
52 | pop rsi
53 | pop rdi
54 | ret
--------------------------------------------------------------------------------
/LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.11.35327.3
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LoadedMemoryLatency", "LoadedMemoryLatency.vcxproj", "{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x64.ActiveCfg = Debug|x64
17 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x64.Build.0 = Debug|x64
18 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x86.ActiveCfg = Debug|Win32
19 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x86.Build.0 = Debug|Win32
20 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x64.ActiveCfg = Release|x64
21 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x64.Build.0 = Release|x64
22 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x86.ActiveCfg = Release|Win32
23 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {5656BCBF-7F82-471C-8AFE-1FE48AD34114}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
23 |
24 | Source Files
25 |
26 |
27 |
--------------------------------------------------------------------------------
/LoadedMemoryLatency/LoadedMemoryLatency_amd64.s:
--------------------------------------------------------------------------------
1 | .global asm_read
2 |
3 | /* rcx = ptr to array
4 | rdx = arr length in bytes
5 | r8 = ptr to stop flag (32-bit value, nonzero ends the loop)
6 | r9 = throttle factor (spin iterations after each 128B; 0 = no throttling)
7 | return bytes read in rax
8 | */
9 | asm_read:
10 | push %rdi
11 | push %rsi
12 | push %r10
13 | push %r11 /* r11 is saved/restored but not otherwise used here */
14 | mov %rcx, %rdi /* rdi = moving read pointer, rcx keeps the array base */
15 | xor %rsi, %rsi /* rsi = byte offset into the array */
16 | xor %rax, %rax /* rax = total bytes read */
17 | asm_read_pass_loop:
18 | /* load 128B */
19 | movups (%rdi), %xmm0
20 | movups 16(%rdi), %xmm0
21 | movups 32(%rdi), %xmm0
22 | movups 48(%rdi), %xmm0
23 | movups 64(%rdi), %xmm0
24 | movups 80(%rdi), %xmm0
25 | movups 96(%rdi), %xmm0
26 | movups 112(%rdi), %xmm0
27 |
28 | add $128, %rdi /* advance read pointer */
29 | add $128, %rsi /* advance offset */
30 | add $128, %rax /* account for the 128B just loaded */
31 |
32 | test %r9, %r9 /* throttle factor of 0 skips the spin loop */
33 | jz asm_read_throttle_end
34 | mov %r9, %r10 /* r10 = countdown copy of the throttle factor */
35 | asm_read_throttle:
36 | dec %r10
37 | jnz asm_read_throttle /* spin until the countdown reaches zero */
38 | asm_read_throttle_end:
39 | /* check stop flag */
40 | mov (%r8), %r10d
41 | test %r10d, %r10d
42 | jnz asm_read_end
43 |
44 | cmp %rsi, %rdx /* AT&T operand order: flags reflect rdx - rsi (length - offset) */
45 | jg asm_read_pass_loop /* signed compare: keep going while length > offset */
46 | mov %rcx, %rdi /* otherwise wrap to the start of the array */
47 | xor %rsi, %rsi
48 | jmp asm_read_pass_loop
49 | asm_read_end:
50 | pop %r11 /* restore in reverse push order */
51 | pop %r10
52 | pop %rsi
53 | pop %rdi
54 | ret
55 |
--------------------------------------------------------------------------------
/LoadedMemoryLatency/LoadedMemoryLatency_arm.s:
--------------------------------------------------------------------------------
1 | .global asm_read
2 | .global _asm_read
3 |
4 | /* x0 = ptr to array
5 | x1 = arr length in bytes
6 | x2 = ptr to stop flag (32-bit word, nonzero ends the loop)
7 | x3 = throttle factor (spin iterations after each 128B; 0 = no throttling)
8 | return bytes read in x0
9 | */
10 | _asm_read:
11 | asm_read:
12 | sub sp, sp, #0x40 /* 64B frame; only offsets 0x10..0x3f are used below */
13 | stp x14, x15, [sp, #0x10]
14 | stp x12, x13, [sp, #0x20]
15 | stp x11, x10, [sp, #0x30] /* x11 is saved/restored but not otherwise used here */
16 | sub x1, x1, 128 /* x1 = length - 128, used as the wrap threshold below */
17 | mov x15, x0 /* ptr into array */
18 | mov x12, 0 /* current offset into array */
19 | mov x13, 0 /* data transferred in bytes */
20 | asm_read_pass_loop:
21 | /* load 128B */
22 | ldr q16, [x15]
23 | ldr q16, [x15, 16]
24 | ldr q16, [x15, 32]
25 | ldr q16, [x15, 48]
26 | ldr q16, [x15, 64]
27 | ldr q16, [x15, 80]
28 | ldr q16, [x15, 96]
29 | ldr q16, [x15, 112]
30 | add x12, x12, 128 /* advance offset */
31 | add x15, x15, 128 /* advance read pointer */
32 | add x13, x13, 128 /* account for the 128B just loaded */
33 |
34 | cbz x3, asm_read_throttle_end /* throttle factor of 0 skips the spin loop */
35 | mov x10, x3 /* save throttle factor */
36 | asm_read_throttle:
37 | sub x10, x10, 1
38 | cbnz x10, asm_read_throttle /* spin until the countdown reaches zero */
39 | asm_read_throttle_end:
40 |
41 | /* end condition */
42 | ldr w14, [x2] /* 32-bit load zero-extends into x14 */
43 | cbnz x14, asm_read_end
44 |
45 | /* loop back condition */
46 | cmp x1, x12
47 | b.gt asm_read_pass_loop /* continue only while (length - 128) > offset. NOTE(review): when offset + 128 == length this wraps, so the final 128B of the array is never read - confirm intended */
48 | mov x15, x0 /* wrap to the start of the array */
49 | mov x12, 0
50 | b asm_read_pass_loop
51 | asm_read_end:
52 | mov x0, x13 /* return value: total bytes read */
53 | ldp x11, x10, [sp, #0x30]
54 | ldp x12, x13, [sp, #0x20]
55 | ldp x14, x15, [sp, #0x10]
56 | add sp, sp, #0x40
57 | ret
58 |
--------------------------------------------------------------------------------
/LoadedMemoryLatency/Makefile:
--------------------------------------------------------------------------------
1 | amd64:
2 | gcc -O3 LoadedMemoryLatency.c LoadedMemoryLatency_amd64.s -o loadedlat_amd64 -lm
3 | aarch64:
4 | gcc -O3 LoadedMemoryLatency.c LoadedMemoryLatency_arm.s -o loadedlat_aarch64 -lm
5 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | include Common/arch_detect.mk
2 |
3 | COMPONENTS = CoherencyLatency MemoryLatency MemoryBandwidth InstructionRate Meshsim CoreClockChecker GpuMemLatency
4 |
5 | all: $(COMPONENTS)
6 |
7 | ci:
8 | for COMPONENT in $(COMPONENTS); do $(MAKE) -C $$COMPONENT ci; done
9 |
10 | package:
11 | @sh Common/ci_package.sh
12 |
13 | clean-package:
14 | find . -maxdepth 1 -type d -name "clammarks-*" -exec rm -rf {} \; && rm -f "clammarks.txz"
15 |
16 | clean:
17 | for COMPONENT in $(COMPONENTS); do $(MAKE) -C $$COMPONENT clean; done
18 |
19 | $(COMPONENTS): .FORCE
20 | $(MAKE) -C $@
21 |
22 | .FORCE:
23 |
24 | .PHONY: all ci package clean-package clean
25 |
--------------------------------------------------------------------------------
/MemoryBandwidth/Makefile:
--------------------------------------------------------------------------------
1 | include ../Common/arch_detect.mk
2 |
3 | CFLAGS = -pthread -O3
4 | LDFLAGS= -lm
5 |
6 | all: $(TARGET)
7 |
8 | amd64:
9 | $(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_amd64 $(LDFLAGS)
10 |
11 | amd64-numa:
12 | $(CC) $(CFLAGS) -DNUMA MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_numa_amd64 $(LDFLAGS) -lnuma
13 |
14 | aarch64:
15 | $(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_aarch64 $(LDFLAGS)
16 |
17 | termux:
18 | gcc -O3 -pthread MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_aarch64 -lm
19 |
20 | aarch64-numa:
21 | $(CC) $(CFLAGS) -DNUMA MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_numa_aarch64 $(LDFLAGS) -lnuma
22 |
23 | riscv64:
24 | $(CC) $(CFLAGS) -march=rv64gcv0p7 MemoryBandwidth.c MemoryBandwidth_riscv.s -o MemoryBandwidth_riscv64 $(LDFLAGS)
25 |
26 | w64:
27 | $(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_w64.exe $(LDFLAGS)
28 |
29 | ci: amd64 amd64-numa aarch64 w64
30 |
31 | clean:
32 | rm -f *.o && find . -type f -executable -delete
33 |
34 | .PHONY: all ci clean
35 |
--------------------------------------------------------------------------------
/MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.6.33815.320
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MemoryBandwidth", "MemoryBandwidth.vcxproj", "{E968D202-64A2-43A5-8BBD-D7D010D06564}"
7 | EndProject
8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MixedMemoryBandwidthTest", "..\MixedMemoryBandwidthTest\MixedMemoryBandwidthTest.vcxproj", "{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}"
9 | EndProject
10 | Global
11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
12 | Debug|x64 = Debug|x64
13 | Debug|x86 = Debug|x86
14 | Release|x64 = Release|x64
15 | Release|x86 = Release|x86
16 | EndGlobalSection
17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
18 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x64.ActiveCfg = Debug|x64
19 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x64.Build.0 = Debug|x64
20 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x86.ActiveCfg = Debug|Win32
21 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x86.Build.0 = Debug|Win32
22 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x64.ActiveCfg = Release|x64
23 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x64.Build.0 = Release|x64
24 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x86.ActiveCfg = Release|Win32
25 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x86.Build.0 = Release|Win32
26 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x64.ActiveCfg = Debug|x64
27 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x64.Build.0 = Debug|x64
28 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x86.ActiveCfg = Debug|Win32
29 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x86.Build.0 = Debug|Win32
30 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x64.ActiveCfg = Release|x64
31 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x64.Build.0 = Release|x64
32 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x86.ActiveCfg = Release|Win32
33 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x86.Build.0 = Release|Win32
34 | EndGlobalSection
35 | GlobalSection(SolutionProperties) = preSolution
36 | HideSolutionNode = FALSE
37 | EndGlobalSection
38 | GlobalSection(ExtensibilityGlobals) = postSolution
39 | SolutionGuid = {2EA86D6F-CEE0-40A9-B4DD-AC59CCAD358F}
40 | EndGlobalSection
41 | EndGlobal
42 |
--------------------------------------------------------------------------------
/MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
23 |
24 | Source Files
25 |
26 |
27 | Source Files
28 |
29 |
30 |
--------------------------------------------------------------------------------
/MemoryBandwidth/MixedMemoryBandwidthTest/MixedMemoryBandwidthTest.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
23 |
24 | Source Files
25 |
26 |
27 |
28 |
29 | Header Files
30 |
31 |
32 |
--------------------------------------------------------------------------------
/MemoryLatency/Makefile:
--------------------------------------------------------------------------------
1 | include ../Common/arch_detect.mk
2 |
3 | CFLAGS = -O3
4 | LDFLAGS = -lm
5 |
6 | all: $(TARGET)
7 |
8 | amd64:
9 | $(CC) $(CFLAGS) MemoryLatency.c MemoryLatency_x86.s -o MemoryLatency_amd64 $(LDFLAGS)
10 |
11 | amd64-numa:
12 | $(CC) $(CFLAGS) -DNUMA MemoryLatency.c MemoryLatency_x86.s -o MemoryLatency_numa_amd64 $(LDFLAGS) -lnuma
13 |
14 | aarch64:
15 | $(CC) $(CFLAGS) MemoryLatency.c MemoryLatency_arm.s -o MemoryLatency_aarch64 $(LDFLAGS)
16 |
17 | aarch64-numa:
18 | $(CC) $(CFLAGS) -DNUMA MemoryLatency.c MemoryLatency_arm.s -o MemoryLatency_numa_aarch64 $(LDFLAGS) -lnuma
19 |
20 | riscv64:
21 | $(CC) $(CFLAGS) MemoryLatency.c MemoryLatency_riscv.s -o MemoryLatency_riscv64 $(LDFLAGS)
22 |
23 | riscv64-numa:
24 | $(CC) $(CFLAGS) -DNUMA MemoryLatency.c MemoryLatency_riscv.s -o MemoryLatency_numa_riscv64 $(LDFLAGS) -lnuma
25 |
26 | w64:
27 | $(CC) $(CFLAGS) MemoryLatency.cpp MemoryLatency_x86.s -o MemoryLatency_w64.exe $(LDFLAGS)
28 |
29 | # w64 can build with mingw 11, which isn't available on jammy
30 |
31 | ci: amd64 amd64-numa aarch64 riscv64 w64
32 |
33 | clean:
34 | rm -f *.o && find . -type f -executable -delete
35 |
36 | .PHONY: all ci clean
37 |
--------------------------------------------------------------------------------
/MemoryLatency/MemoryLatency.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.31229.75
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MemoryLatency", "MemoryLatency.vcxproj", "{3A98A230-A87B-432D-931D-369872DE24AF}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x64.ActiveCfg = Debug|x64
17 | {3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x64.Build.0 = Debug|x64
18 | {3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x86.ActiveCfg = Debug|Win32
19 | {3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x86.Build.0 = Debug|Win32
20 | {3A98A230-A87B-432D-931D-369872DE24AF}.Release|x64.ActiveCfg = Release|x64
21 | {3A98A230-A87B-432D-931D-369872DE24AF}.Release|x64.Build.0 = Release|x64
22 | {3A98A230-A87B-432D-931D-369872DE24AF}.Release|x86.ActiveCfg = Release|Win32
23 | {3A98A230-A87B-432D-931D-369872DE24AF}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {F2D00DD2-A22B-4A3C-A2FF-9CE8CF9070D1}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/MemoryLatency/MemoryLatencyFunctions.asm:
--------------------------------------------------------------------------------
1 | section .text
2 | bits 64
3 |
4 | global preplatencyarr
5 | global latencytest
6 |
7 | preplatencyarr: ; rcx = array base, rdx = element count; rewrites each 64-bit index in the array as the address of that element
8 | push r15
9 | push r14
10 | xor r15, r15 ; array index
11 | preplatencyarr_loop:
12 | mov r14, [rcx + r15 * 8] ; r14 = index stored in element r15
13 | lea r14, [rcx + r14 * 8] ; r14 = address of the element that index refers to
14 | mov [rcx + r15 * 8], r14 ; store the address back in place of the index
15 | inc r15
16 | cmp rdx, r15 ; count is checked only after an element has been converted, so rdx must be nonzero
17 | jne preplatencyarr_loop
18 | pop r14
19 | pop r15
20 | ret
21 |
22 | latencytest: ; rcx = iteration count, rdx = ptr to the first element; returns the sum of the loaded values in rax
23 | push r15
24 | mov r15, [rdx] ; r15 = value stored in the first element, used as the first address to follow
25 | xor rax, rax ; running sum / return value
26 | latencytest_loop:
27 | mov r15, [r15] ; dependent load: the next address comes from the current one
28 | add rax, r15 ; fold each loaded value into the return value
29 | dec rcx ; count is decremented before it is tested, so rcx must be nonzero on entry
30 | jnz latencytest_loop
31 | pop r15
32 | ret
33 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Microbenchmarks
2 | Trying to figure various CPU (or GPU) things out.
3 |
4 | Basically my playground to microbenchmark various CPU-related things like ROB/register file sizes, lock/cache coherency latency, and cache/memory performance. This repo is a loose collection of various experiments and is more of a playground than a well-maintained piece of software. As such, various benchmarks may not work, or may not even compile. They're also not well documented and details of what's being tested may not be intuitive. Due to time constraints and real life priorities I won't be able to maintain this repo to an acceptable standard for public use.
5 |
6 | Feel free to try running the stuff here, but I highly suggest writing your own code because that'll provide a better understanding of the theory behind the benchmarks. Consider checking out https://github.com/travisdowns/robsize or https://github.com/Veedrac/microarchitecturometer.
7 |
8 | # Building Clammicrobench with Generated Code
9 | Get NASM (https://www.nasm.us/) and make sure it's in your path. Then things should build under Visual Studio 2022.
10 |
11 | Some microbenchmarks have the source code and assembly generated by C# code, to avoid crazy stuff like self modifying code. For clammicrobench, build/run the AsmGen project. Pass "autocopy" on the command line to have it automatically place generated ASM files for Visual Studio. Then, the clammicrobench project should build.
12 |
--------------------------------------------------------------------------------
/mt_instructionrate/Makefile:
--------------------------------------------------------------------------------
1 | x86:
2 | gcc -pthread -masm=intel x86_mt_instructionrate.s mt_instructionrate.c ../Common/timing.c -o x86_mt_instructionrate -static
3 | aarch64:
4 | gcc -pthread mt_instructionrate.c arm_mt_instructionrate.s ../Common/timing.c -o arm_mt_instructionrate
5 | ppc64:
6 | gcc -pthread -mregnames mt_instructionrate.c ppc64_mt_instructionrate.s ../Common/timing.c -o ppc64_mt_instructionrate
7 |
--------------------------------------------------------------------------------
/mt_instructionrate/Project1.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 | Header Files
23 |
24 |
25 |
26 |
27 | Header Files
28 |
29 |
30 |
31 |
32 | Source Files
33 |
34 |
35 |
--------------------------------------------------------------------------------
/mt_instructionrate/mt_instructionrate.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.8.34511.84
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Project1", "Project1.vcxproj", "{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x64.ActiveCfg = Debug|x64
17 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x64.Build.0 = Debug|x64
18 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x86.ActiveCfg = Debug|Win32
19 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x86.Build.0 = Debug|Win32
20 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x64.ActiveCfg = Release|x64
21 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x64.Build.0 = Release|x64
22 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x86.ActiveCfg = Release|Win32
23 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {B31B466E-F833-4B33-9E21-74616F970AA2}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/mt_instructionrate/ppc64_mt_instructionrate.c:
--------------------------------------------------------------------------------
1 | extern uint64_t vec_int32_add_test(uint64_t iterations, void *data);
2 | extern uint64_t vec_int32_mul_test(uint64_t iterations, void *data);
3 | extern uint64_t vec_fp32_add_test(uint64_t iterations, void *data);
4 | extern uint64_t vec_fp32_fma_test(uint64_t iterations, void *data);
5 | extern uint64_t vec_fp32_isqrt_test(uint64_t iterations, void *data);
6 | extern uint64_t fp64_add_test(uint64_t iterations, void *data);
7 | extern uint64_t fp64_fma_test(uint64_t iterations, void *data);
8 |
9 | void RunTests() {
10 | uint64_t iterations = 3500000000;
11 | int testDataLength = 256;
12 | uint32_t *intTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);
13 | uint32_t *fpTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);
14 | for (int i = 0; i < testDataLength; i++) {
15 | intTestArr[i] = i;
16 | fpTestArr[i] = i * 1.2f;
17 | }
18 |
19 | fprintf(stderr, "Measuring INT32 adds\n");
20 | float int32adds = measureFunction(iterations, vec_int32_add_test, intTestArr);
21 | float int32muls = measureFunction(iterations, vec_int32_mul_test, intTestArr);
22 | float fp32adds = measureFunction(iterations, vec_fp32_add_test, fpTestArr);
23 | float fp32fmas = measureFunction(iterations, vec_fp32_fma_test, fpTestArr);
24 | float fp32isqrt = measureFunction(iterations, vec_fp32_isqrt_test, fpTestArr);
25 | float fp64adds = measureFunction(iterations, fp64_add_test, fpTestArr);
26 | float fp64fmas = measureFunction(iterations, fp64_fma_test, fpTestArr);
27 |
28 | printf("-----GOPS/s-----\n");
29 | printf("Altivec INT32 Add: %f\n", int32adds);
30 | printf("Altivec INT32 Multiply: %f\n", int32muls);
31 | printf("Altivec FP32 Add: %f\n", fp32adds);
32 | printf("Altivec FP32 FMA: %f (%f GFLOPS)\n", fp32fmas, 2 * fp32fmas);
33 | printf("Altivec FP32 Inverse Square Root: %f\n", fp32isqrt);
34 | printf("FP64 Add: %f\n", fp64adds);
35 | printf("FP64 FMA: %f (%f GFLOPS)\n", fp64fmas, 2 * fp64fmas);
36 |
37 | free(intTestArr);
38 | free(fpTestArr);
39 | return;
40 | }
41 |
--------------------------------------------------------------------------------
/mt_instructionrate/x86_mt_instructionrate:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/mt_instructionrate/x86_mt_instructionrate
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/Context.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // OpenCL Utils includes
4 | #include "OpenCLUtils_Export.h"
5 |
6 | // OpenCL includes
7 | #include
8 |
9 | // STL includes
10 | #include
11 |
12 | UTILS_EXPORT
13 | cl_context cl_util_get_context(const cl_uint plat_id, const cl_uint dev_id,
14 | const cl_device_type type, cl_int* const error);
15 | UTILS_EXPORT
16 | cl_device_id cl_util_get_device(const cl_uint plat_id, const cl_uint dev_id,
17 | const cl_device_type type, cl_int* const error);
18 |
19 | UTILS_EXPORT
20 | cl_int cl_util_print_device_info(const cl_device_id device);
21 |
22 | UTILS_EXPORT
23 | char* cl_util_get_device_info(const cl_device_id device,
24 | const cl_device_info info, cl_int* const error);
25 | UTILS_EXPORT
26 | char* cl_util_get_platform_info(const cl_platform_id platform,
27 | const cl_platform_info info,
28 | cl_int* const error);
29 |
30 | // build program and show log if build is not successful
31 | UTILS_EXPORT
32 | cl_int cl_util_build_program(const cl_program pr, const cl_device_id dev,
33 | const char* const opt);
34 |
35 | #define GET_CURRENT_TIMER(time) \
36 | struct timespec time; \
37 | timespec_get(&time, TIME_UTC); \
38 | { \
39 | }
40 |
41 | #define TIMER_DIFFERENCE(dt, time1, time2) \
42 | { \
43 | dt = (time2.tv_sec - time1.tv_sec) * 1000000000 \
44 | + (time2.tv_nsec - time1.tv_nsec); \
45 | }
46 |
47 | #define START_TIMER GET_CURRENT_TIMER(start_timer1)
48 | #define STOP_TIMER(dt) \
49 | GET_CURRENT_TIMER(stop_timer2) \
50 | TIMER_DIFFERENCE(dt, start_timer1, stop_timer2)
51 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/Context.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // OpenCL SDK includes
4 | #include "OpenCLUtilsCpp_Export.h"
5 |
6 | #include
7 |
8 | // OpenCL includes
9 | #include
10 |
11 | namespace cl {
12 | namespace util {
13 | Context UTILSCPP_EXPORT get_context(cl_uint plat_id, cl_uint dev_id,
14 | cl_device_type type,
15 | cl_int* error = nullptr);
16 |
17 | void UTILSCPP_EXPORT print_device_info(const cl::Device& device);
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/Device.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "OpenCLUtilsCpp_Export.h"
4 | #include
5 |
6 | #include
7 |
8 | namespace cl {
9 | namespace util {
10 | bool UTILSCPP_EXPORT opencl_c_version_contains(
11 | const cl::Device& device, const cl::string& version_fragment);
12 |
13 | bool UTILSCPP_EXPORT supports_extension(const cl::Device& device,
14 | const cl::string& extension);
15 |
16 | #ifdef CL_VERSION_3_0
17 | bool UTILSCPP_EXPORT supports_feature(const cl::Device& device,
18 | const cl::string& feature_name);
19 | #endif
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/Error.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // OpenCL Utils includes
4 | #include "OpenCLUtilsCpp_Export.h"
5 |
6 | // OpenCL Utils includes
7 | #include
8 |
9 | // OpenCL includes
10 | #include
11 |
12 | namespace cl {
13 | namespace util {
14 | #if defined(CL_HPP_ENABLE_EXCEPTIONS)
15 | /*! \brief Exception class
16 | *
17 | * This may be thrown by SDK utility functions when
18 | * CL_HPP_ENABLE_EXCEPTIONS is defined.
19 | */
20 | class Error : public std::exception {
21 | private:
22 | int err_;
23 | const char* errStr_;
24 |
25 | public:
26 | /*! \brief Create a new SDK error exception for a given error code
27 | * and corresponding message.
28 | *
29 | * \param err error code value.
30 | *
31 | * \param errStr a descriptive string that must remain in scope until
32 | * handling of the exception has concluded. If set, it
33 | * will be returned by what().
34 | */
35 | Error(cl_int err, const char* errStr = NULL): err_(err), errStr_(errStr)
36 | {}
37 |
38 | ~Error() noexcept {}
39 |
40 | /*! \brief Get error string associated with exception
41 | *
42 | * \return A memory pointer to the error message string.
43 | */
44 | virtual const char* what() const noexcept
45 | {
46 | if (errStr_ == NULL)
47 | {
48 | return "empty";
49 | }
50 | else
51 | {
52 | return errStr_;
53 | }
54 | }
55 |
56 | /*! \brief Get error code associated with exception
57 | *
58 | * \return The error code.
59 | */
60 | cl_int err(void) const { return err_; }
61 | };
62 | #endif
63 |
64 | namespace detail {
65 | UTILSCPP_EXPORT cl_int errHandler(cl_int err, cl_int* errPtr,
66 | const char* errStr = nullptr);
67 | }
68 |
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/ErrorCodes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #define CL_UTIL_INDEX_OUT_OF_RANGE -2000
4 | #define CL_UTIL_DEVICE_NOT_INTEROPERABLE -2001
5 | #define CL_UTIL_FILE_OPERATION_ERROR -2002
6 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/Event.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // OpenCL Utils includes
4 | #include "OpenCLUtils_Export.h"
5 |
6 | // OpenCL includes
7 | #include
8 |
9 | UTILS_EXPORT
10 | cl_ulong cl_util_get_event_duration(const cl_event event,
11 | const cl_profiling_info start,
12 | const cl_profiling_info end,
13 | cl_int* const error);
14 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/Event.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // OpenCL SDK includes
4 | #include "OpenCLUtilsCpp_Export.h"
5 |
6 | // STL includes
7 | #include
8 |
9 | // OpenCL includes
10 | #include
11 |
12 | namespace cl {
13 | namespace util {
14 | template
15 | auto get_duration(cl::Event& ev)
16 | {
17 | return std::chrono::duration_cast(std::chrono::nanoseconds{
18 | ev.getProfilingInfo() - ev.getProfilingInfo() });
19 | }
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/File.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // OpenCL Utils includes
4 | #include "OpenCLUtils_Export.h"
5 |
6 | // OpenCL includes
7 | #include
8 |
9 | // read all the text file contents securely in ANSI C89
10 | // return pointer to C-string with file contents
11 | // can handle streams with no known size and no support for fseek
12 | // based on https://stackoverflow.com/questions/14002954/ by Nominal Animal
13 | UTILS_EXPORT
14 | char* cl_util_read_text_file(const char* const filename, size_t* const length,
15 | cl_int* const error);
16 |
17 | // read all the binary file contents securely in ANSI C89
18 | // return pointer to file contents
19 | // can handle streams with no known size and no support for fseek
20 | // based on https://stackoverflow.com/questions/14002954/ by Nominal Animal
21 | UTILS_EXPORT
22 | unsigned char* cl_util_read_binary_file(const char* const filename,
23 | size_t* const length,
24 | cl_int* const error);
25 |
26 | // write binaries of OpenCL compiled program
27 | // binaries are written as separate files for each device
28 | // with file name "(program_file_name)_(name of device).bin"
29 | // based on variant of Logan
30 | // http://logan.tw/posts/2014/11/22/pre-compile-the-opencl-kernel-program-part-2/
31 | UTILS_EXPORT
32 | cl_int cl_util_write_binaries(const cl_program program,
33 | const char* const program_file_name);
34 |
35 | // read binaries of OpenCL compiled program
36 | // from files of file names "(program_file_name)_(name of device).bin"
37 | UTILS_EXPORT
38 | cl_program cl_util_read_binaries(const cl_context context,
39 | const cl_device_id* const devices,
40 | const cl_uint num_devices,
41 | const char* const program_file_name,
42 | cl_int* const error);
43 |
44 | // returns the folder containing the running executable
45 | UTILS_EXPORT
46 | cl_int cl_util_executable_folder(char* filename, size_t* const length);
47 |
48 | // read all the text file contents securely in ANSI C89
49 | // return pointer to C-string with file contents
50 | // interprets filename relative to the folder containing
51 | // the running executable
52 | UTILS_EXPORT
53 | char* cl_util_read_exe_relative_text_file(const char* const rel_path,
54 | size_t* const length,
55 | cl_int* const error);
56 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/File.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // OpenCL SDK includes
4 | #include "OpenCLUtilsCpp_Export.h"
5 |
6 | #include
7 |
8 | // OpenCL includes
9 | #include
10 |
11 |
12 | namespace cl {
13 | namespace util {
14 |
15 | std::string UTILSCPP_EXPORT read_text_file(const char* const filename,
16 | cl_int* const error = nullptr);
17 |
18 | std::vector UTILSCPP_EXPORT
19 | read_binary_file(const char* const filename, cl_int* const error = nullptr);
20 |
21 | Program::Binaries UTILSCPP_EXPORT read_binary_files(
22 | const std::vector& devices,
23 | const char* const program_file_name, cl_int* const error = nullptr);
24 |
25 | cl_int UTILSCPP_EXPORT
26 | write_binaries(const cl::Program::Binaries& binaries,
27 | const std::vector& devices,
28 | const char* const program_file_name);
29 |
30 | std::string UTILSCPP_EXPORT
31 | executable_folder(cl_int* const error = nullptr);
32 |
33 | std::string UTILSCPP_EXPORT read_exe_relative_text_file(
34 | const char* const filename, cl_int* const error = nullptr);
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/InteropContext.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "OpenCLUtilsCpp_Export.h"
4 | #include
5 |
6 | #include
7 |
8 | namespace cl {
9 | namespace util {
10 | vector
11 | UTILSCPP_EXPORT get_interop_context_properties(const cl::Device& plat,
12 | cl_int* error = nullptr);
13 |
14 | Context UTILSCPP_EXPORT get_interop_context(int plat_id, int dev_id,
15 | cl_device_type type,
16 | cl_int* error = nullptr);
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/OpenCLUtilsCpp_Export.h:
--------------------------------------------------------------------------------
1 |
2 | #ifndef UTILSCPP_EXPORT_H
3 | #define UTILSCPP_EXPORT_H
4 |
5 | #ifdef OPENCLUTILSCPP_STATIC_DEFINE
6 | # define UTILSCPP_EXPORT
7 | # define OPENCLUTILSCPP_NO_EXPORT
8 | #else
9 | # ifndef UTILSCPP_EXPORT
10 | # ifdef OpenCLUtilsCpp_EXPORTS
11 | /* We are building this library */
12 | # define UTILSCPP_EXPORT
13 | # else
14 | /* We are using this library */
15 | # define UTILSCPP_EXPORT
16 | # endif
17 | # endif
18 |
19 | # ifndef OPENCLUTILSCPP_NO_EXPORT
20 | # define OPENCLUTILSCPP_NO_EXPORT
21 | # endif
22 | #endif
23 |
24 | #ifndef OPENCLUTILSCPP_DEPRECATED
25 | # define OPENCLUTILSCPP_DEPRECATED __declspec(deprecated)
26 | #endif
27 |
28 | #ifndef OPENCLUTILSCPP_DEPRECATED_EXPORT
29 | # define OPENCLUTILSCPP_DEPRECATED_EXPORT UTILSCPP_EXPORT OPENCLUTILSCPP_DEPRECATED
30 | #endif
31 |
32 | #ifndef OPENCLUTILSCPP_DEPRECATED_NO_EXPORT
33 | # define OPENCLUTILSCPP_DEPRECATED_NO_EXPORT OPENCLUTILSCPP_NO_EXPORT OPENCLUTILSCPP_DEPRECATED
34 | #endif
35 |
36 | /* NOLINTNEXTLINE(readability-avoid-unconditional-preprocessor-if) */
37 | #if 0 /* DEFINE_NO_DEPRECATED */
38 | # ifndef OPENCLUTILSCPP_NO_DEPRECATED
39 | # define OPENCLUTILSCPP_NO_DEPRECATED
40 | # endif
41 | #endif
42 |
43 | #endif /* UTILSCPP_EXPORT_H */
44 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/OpenCLUtils_Export.h:
--------------------------------------------------------------------------------
1 |
2 | #ifndef UTILS_EXPORT_H
3 | #define UTILS_EXPORT_H
4 |
5 | #ifdef OPENCLUTILS_STATIC_DEFINE
6 | # define UTILS_EXPORT
7 | # define OPENCLUTILS_NO_EXPORT
8 | #else
9 | # ifndef UTILS_EXPORT
10 | # ifdef OpenCLUtils_EXPORTS
11 | /* We are building this library */
12 | # define UTILS_EXPORT
13 | # else
14 | /* We are using this library */
15 | # define UTILS_EXPORT
16 | # endif
17 | # endif
18 |
19 | # ifndef OPENCLUTILS_NO_EXPORT
20 | # define OPENCLUTILS_NO_EXPORT
21 | # endif
22 | #endif
23 |
24 | #ifndef OPENCLUTILS_DEPRECATED
25 | # define OPENCLUTILS_DEPRECATED __declspec(deprecated)
26 | #endif
27 |
28 | #ifndef OPENCLUTILS_DEPRECATED_EXPORT
29 | # define OPENCLUTILS_DEPRECATED_EXPORT UTILS_EXPORT OPENCLUTILS_DEPRECATED
30 | #endif
31 |
32 | #ifndef OPENCLUTILS_DEPRECATED_NO_EXPORT
33 | # define OPENCLUTILS_DEPRECATED_NO_EXPORT OPENCLUTILS_NO_EXPORT OPENCLUTILS_DEPRECATED
34 | #endif
35 |
36 | /* NOLINTNEXTLINE(readability-avoid-unconditional-preprocessor-if) */
37 | #if 0 /* DEFINE_NO_DEPRECATED */
38 | # ifndef OPENCLUTILS_NO_DEPRECATED
39 | # define OPENCLUTILS_NO_DEPRECATED
40 | # endif
41 | #endif
42 |
43 | #endif /* UTILS_EXPORT_H */
44 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/Platform.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "OpenCLUtilsCpp_Export.h"
4 | #include
5 |
6 | #include
7 |
8 | namespace cl {
9 | namespace util {
10 | bool UTILSCPP_EXPORT supports_extension(const cl::Platform& platform,
11 | const cl::string& extension);
12 |
13 | bool UTILSCPP_EXPORT platform_version_contains(
14 | const cl::Platform& platform, const cl::string& version_fragment);
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/Utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // OpenCL Utils includes
4 | #include "OpenCLUtils_Export.h"
5 |
6 | #include
7 | #include
8 | #include
9 |
10 | // OpenCL includes
11 | #include
12 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/Utils/Utils.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | // OpenCL Utils includes
4 | #include "OpenCLUtils_Export.h"
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 |
14 | // OpenCL includes
15 | #include
16 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/cl2.hpp:
--------------------------------------------------------------------------------
1 | //
2 | // Copyright (c) 2020 The Khronos Group Inc.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | //
16 |
17 | #include
18 | #pragma message("cl2.hpp has been renamed to opencl.hpp to make it clear that it supports all versions of OpenCL. Please include opencl.hpp directly.")
19 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/cl_dx9_media_sharing_intel.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright (c) 2008-2020 The Khronos Group Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | #include
18 | #pragma message("The Intel DX9 media sharing extensions have been moved into cl_dx9_media_sharing.h. Please include cl_dx9_media_sharing.h directly.")
19 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/cl_ext_intel.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright (c) 2008-2020 The Khronos Group Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *
16 | ******************************************************************************/
17 |
18 | #include
19 | #pragma message("The Intel extensions have been moved into cl_ext.h. Please include cl_ext.h directly.")
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/cl_gl_ext.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright (c) 2008-2021 The Khronos Group Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | #include
18 | #pragma message("The extensions in cl_gl_ext.h have been moved into cl_gl.h. Please include cl_gl.h directly.")
19 |
--------------------------------------------------------------------------------
/svm/OpenCL/include/CL/opencl.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright (c) 2008-2021 The Khronos Group Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | ******************************************************************************/
16 |
17 | #ifndef __OPENCL_H
18 | #define __OPENCL_H
19 |
20 | #ifdef __cplusplus
21 | extern "C" {
22 | #endif
23 |
24 | #include
25 | #include
26 | #include
27 |
28 | #ifdef __cplusplus
29 | }
30 | #endif
31 |
32 | #endif /* __OPENCL_H */
33 |
--------------------------------------------------------------------------------
/svm/OpenCL/lib/OpenCL.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCL.lib
--------------------------------------------------------------------------------
/svm/OpenCL/lib/OpenCLExt.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLExt.lib
--------------------------------------------------------------------------------
/svm/OpenCL/lib/OpenCLUtils.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLUtils.lib
--------------------------------------------------------------------------------
/svm/OpenCL/lib/OpenCLUtilsCpp.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLUtilsCpp.lib
--------------------------------------------------------------------------------
/svm/OpenCL/lib/OpenCLUtilsCppd.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLUtilsCppd.lib
--------------------------------------------------------------------------------
/svm/OpenCL/lib/OpenCLUtilsd.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLUtilsd.lib
--------------------------------------------------------------------------------
/svm/OpenCL/lib/pkgconfig/OpenCL.pc:
--------------------------------------------------------------------------------
1 | prefix=D:/a/OpenCL-SDK/OpenCL-SDK/install
2 | exec_prefix=${prefix}
3 | libdir=${exec_prefix}/lib
4 |
5 | Name: OpenCL
6 | Description: Khronos OpenCL ICD Loader
7 | Requires: OpenCL-Headers
8 | Version: 3.0
9 | Libs: -L${libdir} -lOpenCL
10 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCL/OpenCLConfig.cmake:
--------------------------------------------------------------------------------
1 | get_filename_component(PARENT_DIR ${CMAKE_CURRENT_LIST_DIR} PATH)
2 | include("${PARENT_DIR}/OpenCLHeaders/OpenCLHeadersConfig.cmake")
3 | include("${PARENT_DIR}/OpenCLICDLoader/OpenCLICDLoaderConfig.cmake")
4 | include("${PARENT_DIR}/OpenCLHeadersCpp/OpenCLHeadersCppConfig.cmake")
5 | include("${PARENT_DIR}/OpenCLUtils/OpenCLUtilsConfig.cmake")
6 | include("${PARENT_DIR}/OpenCLUtilsCpp/OpenCLUtilsCppConfig.cmake")
7 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCL/OpenCLConfigVersion.cmake:
--------------------------------------------------------------------------------
1 | # This is a basic version file for the Config-mode of find_package().
2 | # It is used by write_basic_package_version_file() as input file for configure_file()
3 | # to create a version-file which can be installed along a config.cmake file.
4 | #
5 | # The created file sets PACKAGE_VERSION_EXACT if the current version string and
6 | # the requested version string are exactly the same and it sets
7 | # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version.
8 | # The variable CVF_VERSION must be set before calling configure_file().
9 |
10 | set(PACKAGE_VERSION "2024.10.24")
11 |
12 | if (PACKAGE_FIND_VERSION_RANGE)
13 | # Package version must be in the requested version range
14 | if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)
15 | OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)
16 | OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))
17 | set(PACKAGE_VERSION_COMPATIBLE FALSE)
18 | else()
19 | set(PACKAGE_VERSION_COMPATIBLE TRUE)
20 | endif()
21 | else()
22 | if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
23 | set(PACKAGE_VERSION_COMPATIBLE FALSE)
24 | else()
25 | set(PACKAGE_VERSION_COMPATIBLE TRUE)
26 | if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
27 | set(PACKAGE_VERSION_EXACT TRUE)
28 | endif()
29 | endif()
30 | endif()
31 |
32 |
33 | # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:
34 | if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "")
35 | return()
36 | endif()
37 |
38 | # check that the installed version has the same 32/64bit-ness as the one which is currently searching:
39 | if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "")
40 | math(EXPR installedBits " * 8")
41 | set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)")
42 | set(PACKAGE_VERSION_UNSUITABLE TRUE)
43 | endif()
44 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderConfig.cmake:
--------------------------------------------------------------------------------
1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLExtensionLoaderTargets.cmake")
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderConfigVersion.cmake:
--------------------------------------------------------------------------------
1 | # This is a basic version file for the Config-mode of find_package().
2 | # It is used by write_basic_package_version_file() as input file for configure_file()
3 | # to create a version-file which can be installed along a config.cmake file.
4 | #
5 | # The created file sets PACKAGE_VERSION_EXACT if the current version string and
6 | # the requested version string are exactly the same and it sets
7 | # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version.
8 | # The variable CVF_VERSION must be set before calling configure_file().
9 |
10 | set(PACKAGE_VERSION "1.0.220515")
11 |
12 | if (PACKAGE_FIND_VERSION_RANGE)
13 | # Package version must be in the requested version range
14 | if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)
15 | OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)
16 | OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))
17 | set(PACKAGE_VERSION_COMPATIBLE FALSE)
18 | else()
19 | set(PACKAGE_VERSION_COMPATIBLE TRUE)
20 | endif()
21 | else()
22 | if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
23 | set(PACKAGE_VERSION_COMPATIBLE FALSE)
24 | else()
25 | set(PACKAGE_VERSION_COMPATIBLE TRUE)
26 | if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
27 | set(PACKAGE_VERSION_EXACT TRUE)
28 | endif()
29 | endif()
30 | endif()
31 |
32 |
33 | # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:
34 | if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "")
35 | return()
36 | endif()
37 |
38 | # check that the installed version has the same 32/64bit-ness as the one which is currently searching:
39 | if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "")
40 | math(EXPR installedBits " * 8")
41 | set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)")
42 | set(PACKAGE_VERSION_UNSUITABLE TRUE)
43 | endif()
44 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderTargets-debug.cmake:
--------------------------------------------------------------------------------
1 | #----------------------------------------------------------------
2 | # Imported-target details for configuration "Debug" (generated file).
3 | #----------------------------------------------------------------
4 |
5 | # Publish the import-file format version for commands that need it.
6 | set(CMAKE_IMPORT_FILE_VERSION 1)
7 |
8 | # OpenCL::OpenCLExt, configuration "Debug": register the configuration,
9 | # then set each per-configuration property with its own call.
10 | set_property(TARGET OpenCL::OpenCLExt APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)
11 | set_property(TARGET OpenCL::OpenCLExt PROPERTY IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG "CXX")
12 | set_property(TARGET OpenCL::OpenCLExt PROPERTY IMPORTED_LOCATION_DEBUG "${_IMPORT_PREFIX}/lib/OpenCLExt.lib")
13 |
14 | # Append the target and its library file to the _cmake_import_check_* lists.
15 | list(APPEND _cmake_import_check_targets OpenCL::OpenCLExt)
16 | list(APPEND _cmake_import_check_files_for_OpenCL::OpenCLExt "${_IMPORT_PREFIX}/lib/OpenCLExt.lib")
17 |
18 | # The format version is not needed past this point; remove the variable.
19 | unset(CMAKE_IMPORT_FILE_VERSION)
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderTargets-release.cmake:
--------------------------------------------------------------------------------
1 | #----------------------------------------------------------------
2 | # Imported-target details for configuration "Release" (generated file).
3 | #----------------------------------------------------------------
4 |
5 | # Publish the import-file format version for commands that need it.
6 | set(CMAKE_IMPORT_FILE_VERSION 1)
7 |
8 | # OpenCL::OpenCLExt, configuration "Release": register the configuration,
9 | # then set each per-configuration property with its own call.
10 | set_property(TARGET OpenCL::OpenCLExt APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
11 | set_property(TARGET OpenCL::OpenCLExt PROPERTY IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX")
12 | set_property(TARGET OpenCL::OpenCLExt PROPERTY IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/OpenCLExt.lib")
13 |
14 | # Append the target and its library file to the _cmake_import_check_* lists.
15 | list(APPEND _cmake_import_check_targets OpenCL::OpenCLExt)
16 | list(APPEND _cmake_import_check_files_for_OpenCL::OpenCLExt "${_IMPORT_PREFIX}/lib/OpenCLExt.lib")
17 |
18 | # The format version is not needed past this point; remove the variable.
19 | unset(CMAKE_IMPORT_FILE_VERSION)
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLHeaders/OpenCLHeadersConfig.cmake:
--------------------------------------------------------------------------------
1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLHeadersTargets.cmake")  # Load OpenCLHeadersTargets.cmake from the directory that contains this file.
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLHeaders/OpenCLHeadersConfigVersion.cmake:
--------------------------------------------------------------------------------
1 | # Version file consulted by find_package() in Config mode.
2 | # Derived from the basic version-file template that
3 | # write_basic_package_version_file() feeds to configure_file().
4 | #
5 | # Results:
6 | #   PACKAGE_VERSION            - version provided by this package ("3.0").
7 | #   PACKAGE_VERSION_COMPATIBLE - TRUE when this version lies inside the
8 | #                                requested range, or is not less than the
9 | #                                single requested version; FALSE otherwise.
10 | #   PACKAGE_VERSION_EXACT      - set to TRUE only when no range was requested
11 | #                                and PACKAGE_FIND_VERSION equals PACKAGE_VERSION.
12 |
13 | set(PACKAGE_VERSION "3.0")
14 |
15 | if (PACKAGE_FIND_VERSION_RANGE)
16 |   # Package version must be in the requested version range
17 |   if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)
18 |       OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)
19 |         OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))
20 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
21 |   else()
22 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
23 |   endif()
24 | else()
25 |   if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
26 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
27 |   else()
28 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
29 |     if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
30 |       set(PACKAGE_VERSION_EXACT TRUE)
31 |     endif()
32 |   endif()
33 | endif()
34 |
35 | # No 32/64-bit (pointer-size) compatibility check is performed.
36 | # As generated, this file guarded the check with
37 | #   if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "")
38 | # The second operand compares two empty literals and is always true, so the
39 | # file always returned here and the bitness check after it -- including a
40 | # malformed `math(EXPR installedBits " * 8")` -- was unreachable. The
41 | # unconditional return is kept, now stated explicitly, and the dead code is
42 | # dropped.
43 | return()
44 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLHeadersCpp/OpenCLHeadersCppConfig.cmake:
--------------------------------------------------------------------------------
1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLHeadersCppTargets.cmake")  # Load OpenCLHeadersCppTargets.cmake from the directory that contains this file.
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLHeadersCpp/OpenCLHeadersCppConfigVersion.cmake:
--------------------------------------------------------------------------------
1 | # Version file consulted by find_package() in Config mode.
2 | # Derived from the basic version-file template that
3 | # write_basic_package_version_file() feeds to configure_file().
4 | #
5 | # Results:
6 | #   PACKAGE_VERSION            - version provided by this package ("3.0").
7 | #   PACKAGE_VERSION_COMPATIBLE - TRUE when this version lies inside the
8 | #                                requested range, or is not less than the
9 | #                                single requested version; FALSE otherwise.
10 | #   PACKAGE_VERSION_EXACT      - set to TRUE only when no range was requested
11 | #                                and PACKAGE_FIND_VERSION equals PACKAGE_VERSION.
12 |
13 | set(PACKAGE_VERSION "3.0")
14 |
15 | if (PACKAGE_FIND_VERSION_RANGE)
16 |   # Package version must be in the requested version range
17 |   if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)
18 |       OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)
19 |         OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))
20 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
21 |   else()
22 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
23 |   endif()
24 | else()
25 |   if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
26 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
27 |   else()
28 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
29 |     if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
30 |       set(PACKAGE_VERSION_EXACT TRUE)
31 |     endif()
32 |   endif()
33 | endif()
34 |
35 | # No 32/64-bit (pointer-size) compatibility check is performed.
36 | # As generated, this file guarded the check with
37 | #   if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "")
38 | # The second operand compares two empty literals and is always true, so the
39 | # file always returned here and the bitness check after it -- including a
40 | # malformed `math(EXPR installedBits " * 8")` -- was unreachable. The
41 | # unconditional return is kept, now stated explicitly, and the dead code is
42 | # dropped.
43 | return()
44 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderConfig.cmake:
--------------------------------------------------------------------------------
1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLICDLoaderTargets.cmake")  # Load OpenCLICDLoaderTargets.cmake from the directory that contains this file.
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderConfigVersion.cmake:
--------------------------------------------------------------------------------
1 | # Version file consulted by find_package() in Config mode.
2 | # Derived from the basic version-file template that
3 | # write_basic_package_version_file() feeds to configure_file().
4 | #
5 | # Results:
6 | #   PACKAGE_VERSION            - version provided by this package ("3.0").
7 | #   PACKAGE_VERSION_COMPATIBLE - TRUE when this version lies inside the
8 | #                                requested range, or is not less than the
9 | #                                single requested version; FALSE otherwise.
10 | #   PACKAGE_VERSION_EXACT      - set to TRUE only when no range was requested
11 | #                                and PACKAGE_FIND_VERSION equals PACKAGE_VERSION.
12 |
13 | set(PACKAGE_VERSION "3.0")
14 |
15 | if (PACKAGE_FIND_VERSION_RANGE)
16 |   # Package version must be in the requested version range
17 |   if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)
18 |       OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)
19 |         OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))
20 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
21 |   else()
22 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
23 |   endif()
24 | else()
25 |   if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
26 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
27 |   else()
28 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
29 |     if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
30 |       set(PACKAGE_VERSION_EXACT TRUE)
31 |     endif()
32 |   endif()
33 | endif()
34 |
35 | # No 32/64-bit (pointer-size) compatibility check is performed.
36 | # As generated, this file guarded the check with
37 | #   if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "")
38 | # The second operand compares two empty literals and is always true, so the
39 | # file always returned here and the bitness check after it -- including a
40 | # malformed `math(EXPR installedBits " * 8")` -- was unreachable. The
41 | # unconditional return is kept, now stated explicitly, and the dead code is
42 | # dropped.
43 | return()
44 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderTargets-debug.cmake:
--------------------------------------------------------------------------------
1 | #----------------------------------------------------------------
2 | # Imported-target details for configuration "Debug" (generated file).
3 | #----------------------------------------------------------------
4 |
5 | # Publish the import-file format version for commands that need it.
6 | set(CMAKE_IMPORT_FILE_VERSION 1)
7 |
8 | # OpenCL::OpenCL, configuration "Debug": register the configuration,
9 | # then set the import library and the DLL location with separate calls.
10 | set_property(TARGET OpenCL::OpenCL APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)
11 | set_property(TARGET OpenCL::OpenCL PROPERTY IMPORTED_IMPLIB_DEBUG "${_IMPORT_PREFIX}/lib/OpenCL.lib")
12 | set_property(TARGET OpenCL::OpenCL PROPERTY IMPORTED_LOCATION_DEBUG "${_IMPORT_PREFIX}/bin/OpenCL.dll")
13 |
14 | # Append the target and both of its files to the _cmake_import_check_* lists.
15 | list(APPEND _cmake_import_check_targets OpenCL::OpenCL)
16 | list(APPEND _cmake_import_check_files_for_OpenCL::OpenCL "${_IMPORT_PREFIX}/lib/OpenCL.lib" "${_IMPORT_PREFIX}/bin/OpenCL.dll")
17 |
18 | # The format version is not needed past this point; remove the variable.
19 | unset(CMAKE_IMPORT_FILE_VERSION)
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderTargets-release.cmake:
--------------------------------------------------------------------------------
1 | #----------------------------------------------------------------
2 | # Imported-target details for configuration "Release" (generated file).
3 | #----------------------------------------------------------------
4 |
5 | # Publish the import-file format version for commands that need it.
6 | set(CMAKE_IMPORT_FILE_VERSION 1)
7 |
8 | # OpenCL::OpenCL, configuration "Release": register the configuration,
9 | # then set the import library and the DLL location with separate calls.
10 | set_property(TARGET OpenCL::OpenCL APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
11 | set_property(TARGET OpenCL::OpenCL PROPERTY IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/OpenCL.lib")
12 | set_property(TARGET OpenCL::OpenCL PROPERTY IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/OpenCL.dll")
13 |
14 | # Append the target and both of its files to the _cmake_import_check_* lists.
15 | list(APPEND _cmake_import_check_targets OpenCL::OpenCL)
16 | list(APPEND _cmake_import_check_files_for_OpenCL::OpenCL "${_IMPORT_PREFIX}/lib/OpenCL.lib" "${_IMPORT_PREFIX}/bin/OpenCL.dll")
17 |
18 | # The format version is not needed past this point; remove the variable.
19 | unset(CMAKE_IMPORT_FILE_VERSION)
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsConfig.cmake:
--------------------------------------------------------------------------------
1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLUtilsTargets.cmake")  # Load OpenCLUtilsTargets.cmake from the directory that contains this file.
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsConfigVersion.cmake:
--------------------------------------------------------------------------------
1 | # Version file consulted by find_package() in Config mode.
2 | # Derived from the basic version-file template that
3 | # write_basic_package_version_file() feeds to configure_file().
4 | #
5 | # Results:
6 | #   PACKAGE_VERSION            - version provided by this package ("2024.10.24").
7 | #   PACKAGE_VERSION_COMPATIBLE - TRUE when this version lies inside the
8 | #                                requested range, or is not less than the
9 | #                                single requested version; FALSE otherwise.
10 | #   PACKAGE_VERSION_EXACT      - set to TRUE only when no range was requested
11 | #                                and PACKAGE_FIND_VERSION equals PACKAGE_VERSION.
12 |
13 | set(PACKAGE_VERSION "2024.10.24")
14 |
15 | if (PACKAGE_FIND_VERSION_RANGE)
16 |   # Package version must be in the requested version range
17 |   if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)
18 |       OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)
19 |         OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))
20 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
21 |   else()
22 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
23 |   endif()
24 | else()
25 |   if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
26 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
27 |   else()
28 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
29 |     if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
30 |       set(PACKAGE_VERSION_EXACT TRUE)
31 |     endif()
32 |   endif()
33 | endif()
34 |
35 | # No 32/64-bit (pointer-size) compatibility check is performed.
36 | # As generated, this file guarded the check with
37 | #   if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "")
38 | # The second operand compares two empty literals and is always true, so the
39 | # file always returned here and the bitness check after it -- including a
40 | # malformed `math(EXPR installedBits " * 8")` -- was unreachable. The
41 | # unconditional return is kept, now stated explicitly, and the dead code is
42 | # dropped.
43 | return()
44 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsTargets-debug.cmake:
--------------------------------------------------------------------------------
1 | #----------------------------------------------------------------
2 | # Imported-target details for configuration "Debug" (generated file).
3 | #----------------------------------------------------------------
4 |
5 | # Publish the import-file format version for commands that need it.
6 | set(CMAKE_IMPORT_FILE_VERSION 1)
7 |
8 | # OpenCL::Utils, configuration "Debug": register the configuration,
9 | # then set each per-configuration property with its own call.
10 | set_property(TARGET OpenCL::Utils APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)
11 | set_property(TARGET OpenCL::Utils PROPERTY IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG "C")
12 | set_property(TARGET OpenCL::Utils PROPERTY IMPORTED_LOCATION_DEBUG "${_IMPORT_PREFIX}/lib/OpenCLUtilsd.lib")
13 |
14 | # Append the target and its library file to the _cmake_import_check_* lists.
15 | list(APPEND _cmake_import_check_targets OpenCL::Utils)
16 | list(APPEND _cmake_import_check_files_for_OpenCL::Utils "${_IMPORT_PREFIX}/lib/OpenCLUtilsd.lib")
17 |
18 | # The format version is not needed past this point; remove the variable.
19 | unset(CMAKE_IMPORT_FILE_VERSION)
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsTargets-release.cmake:
--------------------------------------------------------------------------------
1 | #----------------------------------------------------------------
2 | # Imported-target details for configuration "Release" (generated file).
3 | #----------------------------------------------------------------
4 |
5 | # Publish the import-file format version for commands that need it.
6 | set(CMAKE_IMPORT_FILE_VERSION 1)
7 |
8 | # OpenCL::Utils, configuration "Release": register the configuration,
9 | # then set each per-configuration property with its own call.
10 | set_property(TARGET OpenCL::Utils APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
11 | set_property(TARGET OpenCL::Utils PROPERTY IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "C")
12 | set_property(TARGET OpenCL::Utils PROPERTY IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/OpenCLUtils.lib")
13 |
14 | # Append the target and its library file to the _cmake_import_check_* lists.
15 | list(APPEND _cmake_import_check_targets OpenCL::Utils)
16 | list(APPEND _cmake_import_check_files_for_OpenCL::Utils "${_IMPORT_PREFIX}/lib/OpenCLUtils.lib")
17 |
18 | # The format version is not needed past this point; remove the variable.
19 | unset(CMAKE_IMPORT_FILE_VERSION)
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppConfig.cmake:
--------------------------------------------------------------------------------
1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLUtilsCppTargets.cmake")  # Load OpenCLUtilsCppTargets.cmake from the directory that contains this file.
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppConfigVersion.cmake:
--------------------------------------------------------------------------------
1 | # Version file consulted by find_package() in Config mode.
2 | # Derived from the basic version-file template that
3 | # write_basic_package_version_file() feeds to configure_file().
4 | #
5 | # Results:
6 | #   PACKAGE_VERSION            - version provided by this package ("2024.10.24").
7 | #   PACKAGE_VERSION_COMPATIBLE - TRUE when this version lies inside the
8 | #                                requested range, or is not less than the
9 | #                                single requested version; FALSE otherwise.
10 | #   PACKAGE_VERSION_EXACT      - set to TRUE only when no range was requested
11 | #                                and PACKAGE_FIND_VERSION equals PACKAGE_VERSION.
12 |
13 | set(PACKAGE_VERSION "2024.10.24")
14 |
15 | if (PACKAGE_FIND_VERSION_RANGE)
16 |   # Package version must be in the requested version range
17 |   if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)
18 |       OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)
19 |         OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))
20 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
21 |   else()
22 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
23 |   endif()
24 | else()
25 |   if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
26 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
27 |   else()
28 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
29 |     if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
30 |       set(PACKAGE_VERSION_EXACT TRUE)
31 |     endif()
32 |   endif()
33 | endif()
34 |
35 | # No 32/64-bit (pointer-size) compatibility check is performed.
36 | # As generated, this file guarded the check with
37 | #   if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "")
38 | # The second operand compares two empty literals and is always true, so the
39 | # file always returned here and the bitness check after it -- including a
40 | # malformed `math(EXPR installedBits " * 8")` -- was unreachable. The
41 | # unconditional return is kept, now stated explicitly, and the dead code is
42 | # dropped.
43 | return()
44 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppTargets-debug.cmake:
--------------------------------------------------------------------------------
1 | #----------------------------------------------------------------
2 | # Imported-target details for configuration "Debug" (generated file).
3 | #----------------------------------------------------------------
4 |
5 | # Publish the import-file format version for commands that need it.
6 | set(CMAKE_IMPORT_FILE_VERSION 1)
7 |
8 | # OpenCL::UtilsCpp, configuration "Debug": register the configuration,
9 | # then set each per-configuration property with its own call.
10 | set_property(TARGET OpenCL::UtilsCpp APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)
11 | set_property(TARGET OpenCL::UtilsCpp PROPERTY IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG "C;CXX")
12 | set_property(TARGET OpenCL::UtilsCpp PROPERTY IMPORTED_LOCATION_DEBUG "${_IMPORT_PREFIX}/lib/OpenCLUtilsCppd.lib")
13 |
14 | # Append the target and its library file to the _cmake_import_check_* lists.
15 | list(APPEND _cmake_import_check_targets OpenCL::UtilsCpp)
16 | list(APPEND _cmake_import_check_files_for_OpenCL::UtilsCpp "${_IMPORT_PREFIX}/lib/OpenCLUtilsCppd.lib")
17 |
18 | # The format version is not needed past this point; remove the variable.
19 | unset(CMAKE_IMPORT_FILE_VERSION)
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppTargets-release.cmake:
--------------------------------------------------------------------------------
1 | #----------------------------------------------------------------
2 | # Imported-target details for configuration "Release" (generated file).
3 | #----------------------------------------------------------------
4 |
5 | # Publish the import-file format version for commands that need it.
6 | set(CMAKE_IMPORT_FILE_VERSION 1)
7 |
8 | # OpenCL::UtilsCpp, configuration "Release": register the configuration,
9 | # then set each per-configuration property with its own call.
10 | set_property(TARGET OpenCL::UtilsCpp APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
11 | set_property(TARGET OpenCL::UtilsCpp PROPERTY IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "C;CXX")
12 | set_property(TARGET OpenCL::UtilsCpp PROPERTY IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/OpenCLUtilsCpp.lib")
13 |
14 | # Append the target and its library file to the _cmake_import_check_* lists.
15 | list(APPEND _cmake_import_check_targets OpenCL::UtilsCpp)
16 | list(APPEND _cmake_import_check_files_for_OpenCL::UtilsCpp "${_IMPORT_PREFIX}/lib/OpenCLUtilsCpp.lib")
17 |
18 | # The format version is not needed past this point; remove the variable.
19 | unset(CMAKE_IMPORT_FILE_VERSION)
20 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/man/man1/clinfo.1.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/share/man/man1/clinfo.1.gz
--------------------------------------------------------------------------------
/svm/OpenCL/share/pkgconfig/OpenCL-CLHPP.pc:
--------------------------------------------------------------------------------
1 | # Derive the prefix from this file's own location (<prefix>/share/pkgconfig)
2 | # instead of hard-coding the install path of the machine that built the SDK.
3 | prefix=${pcfiledir}/../..
4 | includedir=${prefix}/include
5 |
6 | Name: OpenCL-CLHPP
7 | Description: OpenCL API C++ bindings
8 | Requires: OpenCL-Headers
9 | Version: 3.0
10 | Cflags: -I${includedir}
11 |
--------------------------------------------------------------------------------
/svm/OpenCL/share/pkgconfig/OpenCL-Headers.pc:
--------------------------------------------------------------------------------
1 | # Derive the prefix from this file's own location (<prefix>/share/pkgconfig)
2 | # instead of hard-coding the install path of the machine that built the SDK.
3 | prefix=${pcfiledir}/../..
4 | includedir=${prefix}/include
5 |
6 | Name: OpenCL-Headers
7 | Description: Khronos OpenCL Headers
8 | Version: 3.0
9 | Cflags: -I${includedir}
10 |
--------------------------------------------------------------------------------
/svm/atomic_latency_kernel.cl:
--------------------------------------------------------------------------------
1 | // For each even value 0, 2, ..., 2*count - 2: spin until *A holds that value, then
2 | // atomically replace it with the next odd value. Something else must move *A from
3 | // each odd value to the next even one (presumably the host side -- confirm).
4 | __kernel void atomic_exec_latency_test(__global int* A, int count) {
5 |     const int limit = 2 * count;
6 |     for (int expected = 0; expected < limit; ) {
7 |         if (atomic_cmpxchg(A, expected, expected + 1) == expected) expected += 2;  // swap succeeded
8 |     }
9 | }
10 |
11 | // Adds one to the integer at A using a plain (non-atomic) read-modify-write.
12 | __kernel void increment_on_gpu(__global int *A) {
13 |     A[0] += 1;
14 | }
--------------------------------------------------------------------------------
/svm/svm.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.12.35527.113 d17.12
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "svm", "svm.vcxproj", "{411AB5E4-FD55-4478-83F2-80C51F205FA7}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x64.ActiveCfg = Debug|x64
17 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x64.Build.0 = Debug|x64
18 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x86.ActiveCfg = Debug|Win32
19 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x86.Build.0 = Debug|Win32
20 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x64.ActiveCfg = Release|x64
21 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x64.Build.0 = Release|x64
22 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x86.ActiveCfg = Release|Win32
23 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | EndGlobal
29 |
--------------------------------------------------------------------------------
/svm/svm.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
--------------------------------------------------------------------------------