├── .github └── workflows │ └── linux.yaml ├── .gitignore ├── AsmGen ├── AsmGen.csproj ├── AsmGen.sln ├── DataFiles │ ├── BranchhistTestBlock.c │ ├── CommonFunctions.c │ ├── GccBranchHistFunction.c │ ├── GccIndirectBranchFunction.c │ ├── IndirectBranchTestBlock.c │ └── clammicrobench.vcxproj_template ├── IUarchTest.cs ├── Program.cs ├── Properties │ └── launchSettings.json ├── README.md ├── UarchTest.cs ├── UarchTestHelpers.cs └── tests │ ├── A73RobTest.cs │ ├── AddLoopTest.cs │ ├── AddNsq.cs │ ├── AddSchedTest.cs │ ├── AddvNsq.cs │ ├── AddvSchedTest.cs │ ├── AeseSchedTest.cs │ ├── AesencNsq.cs │ ├── BranchBufferTest.cs │ ├── BranchHistoryTest.cs │ ├── BtbTest.cs │ ├── CvtSchedTest.cs │ ├── FAdd256RfTest.cs │ ├── Fadd128RfTest.cs │ ├── Fadd128SchedTest.cs │ ├── Fadd256SchedTest.cs │ ├── FaddNsq.cs │ ├── FaddSchedTest.cs │ ├── FcmpSchedTest.cs │ ├── FlagRfTest.cs │ ├── Fma256SchedTest.cs │ ├── FmovSched.cs │ ├── FmulSchedTest.cs │ ├── FpRfTest.cs │ ├── FpStoreDataNsq.cs │ ├── IdrfTest.cs │ ├── IndirectBranchTest.cs │ ├── IntRfDepStoreTest.cs │ ├── IntRfTest.cs │ ├── JsCvtNsq.cs │ ├── JsCvtSched.cs │ ├── JumpNsqTest.cs │ ├── JumpSchedTest.cs │ ├── LdqTest.cs │ ├── LeaSchedTest.cs │ ├── LoadNsq.cs │ ├── LoadSchedTest.cs │ ├── MaddSchedTest.cs │ ├── MaskRfTest.cs │ ├── MixAddJumpSched.cs │ ├── MixAddvJsCvtNsq.cs │ ├── MixAddvJsCvtSched.cs │ ├── MixBranchStoreTest.cs │ ├── MixFAdd256and32RfTest.cs │ ├── MixFpRfDepBranchTest.cs │ ├── MixFpVecRfTest.cs │ ├── MixIntRfDepBranchTest.cs │ ├── MixIntVec128RfTest.cs │ ├── MixIntrfFprfTest.cs │ ├── MixJumpStoreDataSched.cs │ ├── MixJumpStoreSchedTest.cs │ ├── MixJumpThenAddSched.cs │ ├── MixLdqStqTest.cs │ ├── MixLoadStoreDivSchedTest.cs │ ├── MixLoadStoreSchedTest.cs │ ├── MixStoreDivSchedTest.cs │ ├── MixVec512Vec256BlockRfTest.cs │ ├── MixVec512Vec256RfTest.cs │ ├── MmxRfTest.cs │ ├── MulSchedTest.cs │ ├── NopLoopTest.cs │ ├── PdepSchedTest.cs │ ├── ReturnStackTest.cs │ ├── RobTest.cs │ ├── RorSchedTest.cs │ ├── 
ShlSchedTest.cs │ ├── StoreDataDivNsqTest.cs │ ├── StoreDataNsqTest.cs │ ├── StoreDataSchedTest.cs │ ├── StoreDivNsqTest.cs │ ├── StoreDivSchedTest.cs │ ├── StoreNsq.cs │ ├── StoreSchedTest.cs │ ├── Stq128Test.cs │ ├── Stq512Test.cs │ ├── StqTest.cs │ ├── TakenBranchBufferTest.cs │ ├── TakenJumpSchedTest.cs │ ├── Vec512RfTest.cs │ ├── VecMulNsq.cs │ └── ZeroRobTest.cs ├── CoherencyLatency ├── CoherencyLatency.cpp ├── CoherencyLatency.sln ├── CoherencyLatency.vcxproj ├── Makefile ├── PThreadsCoherencyLatency.c └── c2cparse │ ├── Program.cs │ ├── c2cparse.csproj │ └── c2cparse.sln ├── Common ├── arch_detect.mk ├── ci_gpumemlatency.sh ├── ci_package.sh ├── perfmon.h ├── timing.c └── timing.h ├── CoreClockChecker ├── BoostClockChecker.c ├── BoostClockChecker_arm.s ├── BoostClockChecker_x86.s ├── CoreClockChecker.c ├── CoreClockChecker_x86.s ├── Makefile └── WinCoreClockChecker │ ├── CoreClockCheckFunctions.asm │ ├── WinCoreClockChecker.cpp │ ├── WinCoreClockChecker.sln │ ├── WinCoreClockChecker.vcxproj │ └── WinCoreClockChecker.vcxproj.filters ├── GpuMemLatency ├── Makefile ├── OpenCL │ ├── LICENSE │ ├── README.md │ ├── include │ │ └── CL │ │ │ ├── cl.h │ │ │ ├── cl_d3d10.h │ │ │ ├── cl_d3d11.h │ │ │ ├── cl_dx9_media_sharing.h │ │ │ ├── cl_dx9_media_sharing_intel.h │ │ │ ├── cl_egl.h │ │ │ ├── cl_ext.h │ │ │ ├── cl_ext_intel.h │ │ │ ├── cl_gl.h │ │ │ ├── cl_gl_ext.h │ │ │ ├── cl_half.h │ │ │ ├── cl_icd.h │ │ │ ├── cl_platform.h │ │ │ ├── cl_va_api_media_sharing_intel.h │ │ │ ├── cl_version.h │ │ │ └── opencl.h │ └── lib │ │ └── OpenCL.lib ├── atomic_test.c ├── bw_test.c ├── common.c ├── instruction_rate.c ├── instruction_rate_fp16_kernel.cl ├── instruction_rate_fp64_kernel.cl ├── instruction_rate_kernel.cl ├── kernel.cl ├── kernels │ ├── atomic_exec_latency_test.cl │ ├── buffer_bw_test.cl │ ├── c2c_atomic_exec_latency_test.cl │ ├── constant_unrolled_latency_test.cl │ ├── ldst_bw_test.cl │ ├── local_64_bw_test.cl │ ├── local_atomic_latency_test.cl │ ├── local_bw_test.cl 
│ ├── local_float4_bw_test.cl │ ├── local_unrolled_latency_test.cl │ ├── scalar_unrolled_latency_test.cl │ ├── sum_bw_test.cl │ ├── tex_bw_test.cl │ ├── tex_latency_test.cl │ └── unrolled_latency_test.cl ├── latency_test.c ├── local_mem_latency_kernel.cl ├── opencltest.c ├── opencltest.h ├── opencltest.sln ├── opencltest.vcxproj ├── opencltest.vcxproj.filters └── texturetest.c ├── InstructionRate ├── Makefile ├── arm_instructionrate.c ├── arm_instructionrate.s ├── riscv_instructionrate.c ├── riscv_instructionrate.s ├── test.s ├── x86_fusion.c ├── x86_fusion.s ├── x86_instructionrate.c └── x86_instructionrate.s ├── LICENSE ├── LoadedMemoryLatency ├── LoadedMemoryLatency.c ├── LoadedMemoryLatency │ ├── LoadedMemoryLatency.asm │ ├── LoadedMemoryLatency.cpp │ ├── LoadedMemoryLatency.sln │ ├── LoadedMemoryLatency.vcxproj │ └── LoadedMemoryLatency.vcxproj.filters ├── LoadedMemoryLatency_amd64.s ├── LoadedMemoryLatency_arm.s └── Makefile ├── Makefile ├── MemoryBandwidth ├── Makefile ├── MemoryBandwidth.c ├── MemoryBandwidth │ ├── MemoryBandwidth.cpp │ ├── MemoryBandwidth.sln │ ├── MemoryBandwidth.vcxproj │ ├── MemoryBandwidth.vcxproj.filters │ ├── MemoryBandwidthFunctions.asm │ └── MemoryBandwidthFunctions32.asm ├── MemoryBandwidth_arm.s ├── MemoryBandwidth_riscv.s ├── MemoryBandwidth_x86.s ├── MixedMemoryBandwidthTest │ ├── MemoryBandwidth.h │ ├── MemoryBandwidthFunctions.asm │ ├── MixedMemoryBandwidthTest.cpp │ ├── MixedMemoryBandwidthTest.vcxproj │ └── MixedMemoryBandwidthTest.vcxproj.filters └── README.md ├── MemoryLatency ├── Makefile ├── MemoryLatency.c ├── MemoryLatency.cpp ├── MemoryLatency.sln ├── MemoryLatency.vcxproj ├── MemoryLatencyFunctions.asm ├── MemoryLatency_arm.s ├── MemoryLatency_i686.s ├── MemoryLatency_riscv.s ├── MemoryLatency_x86.s └── README.md ├── README.md ├── mt_instructionrate ├── InstructionRateFunctions.asm ├── Makefile ├── Project1.vcxproj ├── Project1.vcxproj.filters ├── arm_mt_instructionrate.c ├── arm_mt_instructionrate.s ├── 
mt_instructionrate.c ├── mt_instructionrate.sln ├── ppc64_mt_instructionrate.c ├── ppc64_mt_instructionrate.s ├── x86_mt_instructionrate ├── x86_mt_instructionrate.c └── x86_mt_instructionrate.s └── svm ├── OpenCL ├── include │ └── CL │ │ ├── Utils │ │ ├── Context.h │ │ ├── Context.hpp │ │ ├── Detail.hpp │ │ ├── Device.hpp │ │ ├── Error.h │ │ ├── Error.hpp │ │ ├── ErrorCodes.h │ │ ├── Event.h │ │ ├── Event.hpp │ │ ├── File.h │ │ ├── File.hpp │ │ ├── InteropContext.hpp │ │ ├── OpenCLUtilsCpp_Export.h │ │ ├── OpenCLUtils_Export.h │ │ ├── Platform.hpp │ │ ├── Utils.h │ │ └── Utils.hpp │ │ ├── cl.h │ │ ├── cl2.hpp │ │ ├── cl_d3d10.h │ │ ├── cl_d3d11.h │ │ ├── cl_dx9_media_sharing.h │ │ ├── cl_dx9_media_sharing_intel.h │ │ ├── cl_egl.h │ │ ├── cl_ext.h │ │ ├── cl_ext_intel.h │ │ ├── cl_function_types.h │ │ ├── cl_gl.h │ │ ├── cl_gl_ext.h │ │ ├── cl_half.h │ │ ├── cl_icd.h │ │ ├── cl_layer.h │ │ ├── cl_platform.h │ │ ├── cl_va_api_media_sharing_intel.h │ │ ├── cl_version.h │ │ ├── opencl.h │ │ └── opencl.hpp ├── lib │ ├── OpenCL.lib │ ├── OpenCLExt.lib │ ├── OpenCLUtils.lib │ ├── OpenCLUtilsCpp.lib │ ├── OpenCLUtilsCppd.lib │ ├── OpenCLUtilsd.lib │ └── pkgconfig │ │ └── OpenCL.pc └── share │ ├── cmake │ ├── OpenCL │ │ ├── OpenCLConfig.cmake │ │ └── OpenCLConfigVersion.cmake │ ├── OpenCLExtensionLoader │ │ ├── OpenCLExtensionLoaderConfig.cmake │ │ ├── OpenCLExtensionLoaderConfigVersion.cmake │ │ ├── OpenCLExtensionLoaderTargets-debug.cmake │ │ ├── OpenCLExtensionLoaderTargets-release.cmake │ │ └── OpenCLExtensionLoaderTargets.cmake │ ├── OpenCLHeaders │ │ ├── OpenCLHeadersConfig.cmake │ │ ├── OpenCLHeadersConfigVersion.cmake │ │ └── OpenCLHeadersTargets.cmake │ ├── OpenCLHeadersCpp │ │ ├── OpenCLHeadersCppConfig.cmake │ │ ├── OpenCLHeadersCppConfigVersion.cmake │ │ └── OpenCLHeadersCppTargets.cmake │ ├── OpenCLICDLoader │ │ ├── OpenCLICDLoaderConfig.cmake │ │ ├── OpenCLICDLoaderConfigVersion.cmake │ │ ├── OpenCLICDLoaderTargets-debug.cmake │ │ ├── 
# CI workflow: builds and packages the benchmarks on an Ubuntu runner for every push.
name: Build Benchmarks on Ubuntu
on: [push]
jobs:
  BuildBenchmarks:
    # Only Ubuntu for now.
    runs-on: ubuntu-latest
    steps:
      # Upgrades the runner's packages, then installs the native build tools, the arm64 and
      # riscv64 cross toolchains, the OpenCL and NUMA development packages, b3sum and unzip.
      - name: Install prerequisites
        run: sudo apt update && sudo apt -qq --assume-yes full-upgrade && sudo apt install -qq -y build-essential crossbuild-essential-arm64 gcc-riscv64-linux-gnu ocl-icd-opencl-dev opencl-headers libnuma-dev b3sum unzip
      # Installs mingw-w64 through the Homebrew installation under /home/linuxbrew/.linuxbrew.
      # NOTE(review): presumably this provides the Windows cross compiler used by `make ci` - confirm against the Makefile.
      - name: Wild tomfoolery attempt
        run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && brew install mingw-w64
      - name: Check out repository code
        uses: actions/checkout@v3
      # `brew shellenv` is evaluated again in this step before `make ci` runs;
      # presumably so the Homebrew-installed tools are on PATH for the build - confirm.
      - name: Build all benchmarks
        run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && make ci
      - name: Package benchmarks
        run: make package
      # Prints the checksum of clammarks.txz (presumably the archive produced by `make package`).
      - name: b3sum
        run: b3sum clammarks.txz
      # Disabled step: would upload clammarks.txz with curl, taking the header and URL from repository secrets.
      # - name: Upload package
      #   env:
      #     UPLOAD_KEY: ${{ secrets.UPLOAD_KEY }}
      #     UPLOAD_URL: ${{ secrets.UPLOAD_URL }}
      #   run: curl -X PUT -T clammarks.txz -H "$UPLOAD_KEY" "$UPLOAD_URL"
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net8.0 6 | false 7 | x64 8 | AnyCPU;x64 9 | 10 | 11 | 12 | 13 | Always 14 | 15 | 16 | Always 17 | 18 | 19 | Always 20 | 21 | 22 | Always 23 | 24 | 25 | Always 26 | 27 | 28 | Always 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /AsmGen/AsmGen.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.2.32516.85 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AsmGen", "AsmGen.csproj", "{B8930E86-946C-4831-B088-F571E73EEDC4}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Debug|x64 = Debug|x64 12 | Release|Any CPU = Release|Any CPU 13 | Release|x64 = Release|x64 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.ActiveCfg = Debug|x64 19 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.Build.0 = Debug|x64 20 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.ActiveCfg = Release|Any CPU 21 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.Build.0 = Release|Any CPU 22 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.ActiveCfg = Release|x64 23 | {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.Build.0 = Release|x64 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {4433D029-CD62-44B9-862E-A8DE52DA45CE} 30 
| EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /AsmGen/DataFiles/BranchhistTestBlock.c: -------------------------------------------------------------------------------- 1 | uint32_t testSizeCount = sizeof(branchHistoryLengths) / sizeof(int); 2 | initializeBranchHistFuncArr(); 3 | srand(time(NULL)); 4 | 5 | size_t resultSize = sizeof(float) * maxBranchCount * testSizeCount; 6 | float* randomResults = (float*)malloc(resultSize); 7 | float* predictableResults = (float*)malloc(resultSize); 8 | for (uint32_t branchCountIdx = 0; branchCountIdx < maxBranchCount; branchCountIdx++) { 9 | for (uint32_t testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) { 10 | uint32_t testSize = branchHistoryLengths[testSizeIdx]; 11 | uint32_t branchCount = branchCounts[branchCountIdx]; 12 | printf("Testing branch count %d history length %d\n", branchCount, testSize); 13 | randomResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 1); 14 | predictableResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 0); 15 | printf("%d, %f, %f\n", testSize, 16 | randomResults[branchCountIdx * testSizeCount + testSizeIdx], 17 | predictableResults[branchCountIdx * testSizeCount + testSizeIdx]); 18 | } 19 | } 20 | 21 | printf("Random:\n"); 22 | printResultFloatArr(randomResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount); 23 | printf("\nPredictable:\n"); 24 | printResultFloatArr(predictableResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount); 25 | 26 | free(randomResults); 27 | free(predictableResults); 28 | -------------------------------------------------------------------------------- /AsmGen/DataFiles/CommonFunctions.c: -------------------------------------------------------------------------------- 1 | // this is a partial C file that's appended into generated code 2 | // stuff here 
// Fills pattern_arr with a randomized pointer-chasing pattern.
//
// Only slots whose index is a multiple of the stride are written. Each of them ends up
// holding the index of another such slot, arranged as one single cycle: following the
// stored indices from slot 0 visits every used slot exactly once before returning to 0.
//
// pattern_arr:    destination array, must hold at least list_size uint32_t elements
// list_size:      number of uint32_t elements in pattern_arr
// byte_increment: distance between used slots in bytes, rounded down to whole uint32_t
//                 elements. Values smaller than sizeof(uint32_t) are treated as a stride
//                 of one element (this used to divide by zero).
//
// Randomness comes from rand(); call srand() beforehand if a different pattern per run is wanted.
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {
    uint32_t increment = byte_increment / sizeof(uint32_t);
    if (increment == 0) {
        // A stride below one element cannot be represented; fall back to the densest layout.
        increment = 1;
    }

    // Start from the identity mapping: every used slot points at itself.
    uint32_t element_count = list_size / increment;
    for (uint32_t i = 0; i < element_count; i++) {
        pattern_arr[i * increment] = i * increment;
    }

    // Sattolo's algorithm: swapping element i with a partner strictly below i, for i going
    // from the last element down to 1, always produces a single cycle. Drawing the partner
    // from the whole range [0, i) makes every cycle equally likely.
    // NOTE: rand() never exceeds RAND_MAX, so for more than RAND_MAX elements the partner
    // only comes from the low indices; the result is still a single cycle, just less random.
    for (uint32_t i = element_count; i > 1; ) {
        i--;
        uint32_t j = (uint32_t)rand() % i;
        uint32_t tmp = pattern_arr[i * increment];
        pattern_arr[i * increment] = pattern_arr[j * increment];
        pattern_arr[j * increment] = tmp;
    }
}
rand() % 2 : 0; 20 | if (testArr[i] > 0) 21 | { 22 | onesCount += 1.0f; 23 | } 24 | } 25 | testArrToArr[testArrIdx] = testArr; 26 | } 27 | 28 | fprintf(stderr, "Starting test, should have %0.2f percent ones\n", onesCount / ((float)historyLen * branchCount)); 29 | gettimeofday(&startTv, &startTz); 30 | uint64_t takenBranchCount = branchtestFunc(iterations, testArrToArr, historyLen); 31 | gettimeofday(&endTv, &endTz); 32 | uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); 33 | float latency = 1e6 * (float)time_diff_ms / (float)iterations; 34 | 35 | // give result in latency per branch 36 | latency = latency / branchCount; 37 | fprintf(stderr, "History length %u, branch count %u: %0.2f percent not-taken\n", historyLen, branchCount, 100 * (float)takenBranchCount / ((float)iterations * branchCount)); 38 | 39 | for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) free(testArrToArr[testArrIdx]); 40 | free(testArrToArr); 41 | return latency; 42 | } 43 | -------------------------------------------------------------------------------- /AsmGen/DataFiles/IndirectBranchTestBlock.c: -------------------------------------------------------------------------------- 1 | // generated code will have: 2 | // - indirectBranchTargetCounts = array containing # of targets per branch 3 | // - indirectBranchCounts = array containing # of branches to test 4 | // - maxIndirectBranchCount = length of ^^ 5 | // - initializeIndirectBranchFuncArr = populates 6 | 7 | uint32_t testSizeCount = sizeof(indirectBranchTargetCounts) / sizeof(int); 8 | initializeIndirectBranchFuncArr(); 9 | srand(time(NULL)); 10 | 11 | size_t resultSize = sizeof(float) * maxIndirectBranchCount * testSizeCount; 12 | float* results = (float*)malloc(resultSize); 13 | float* refResults = (float*)malloc(resultSize); 14 | for (uint32_t branchCountIdx = 0; branchCountIdx < maxIndirectBranchCount; branchCountIdx++) { 15 | for (uint32_t targetCountIdx = 0; 
targetCountIdx < testSizeCount; targetCountIdx++) { 16 | uint32_t testSize = indirectBranchTargetCounts[targetCountIdx]; 17 | uint32_t branchCount = indirectBranchCounts[branchCountIdx]; 18 | printf("Testing branch count %d target count %d:", branchCount, testSize); 19 | results[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 0); 20 | refResults[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 2); 21 | printf("%f ns, reference %f ns\n", 22 | results[branchCountIdx * testSizeCount + targetCountIdx], 23 | refResults[branchCountIdx * testSizeCount + targetCountIdx]); 24 | } 25 | } 26 | 27 | printf("Indirect branch results:\n"); 28 | printResultFloatArr(results, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount); 29 | printf("Reference indirect branch results:\n"); 30 | printResultFloatArr(refResults, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount); 31 | 32 | free(results); 33 | free(refResults); 34 | -------------------------------------------------------------------------------- /AsmGen/IUarchTest.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | public interface IUarchTest 6 | { 7 | public const string ThreadLaunchFunctionPrefix = "ThreadLaunch_"; 8 | // enough to generate global lines, function calls, and let user pick from tests 9 | public string Prefix { get; } 10 | public string Description { get; } 11 | public bool DivideTimeByCount { get; } 12 | public bool SupportsIsa(ISA isa); 13 | 14 | public void GenerateAsm(StringBuilder sb, ISA isa); 15 | public void GenerateTestBlock(StringBuilder sb, ISA isa); 16 | public void GenerateAsmGlobalLines(StringBuilder sb); 17 | public void GenerateExternLines(StringBuilder sb); 18 | 19 | public enum ISA 20 | { 21 | amd64, // 64-bit x86 22 | 
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Looking for reordering capacity limits on A73 by combining several different instruction types.
    /// The filler sequence starts with 128-bit vector adds, continues with scalar integer adds,
    /// and uses stores for everything after that.
    /// </summary>
    public class A73RobTest : UarchTest
    {
        /// <summary>
        /// Sets up the test for filler counts generated from (low, high, step) by
        /// UarchTestHelpers.GenerateCountArray.
        /// </summary>
        public A73RobTest(int low, int high, int step)
        {
            this.Prefix = "a73rob";
            this.Description = "Mixed integer/vec128 + stores";
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Only aarch64 is supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

        /// <summary>
        /// Appends the aarch64 test functions to <paramref name="sb"/>. Does nothing for other ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // The same dependent-branch snippet is used after both pointer chasing loads.
            string dependentBranch = UarchTestHelpers.GetArmDependentBranch(this.Prefix);

            // Preload q0-q4 from the array passed in x1.
            string vectorInit = " ldr q0, [x1]\n" +
                " ldr q1, [x1, #0x10]\n" +
                " ldr q2, [x1, #0x20]\n" +
                " ldr q3, [x1, #0x30]\n" +
                " ldr q4, [x1, #0x40]\n";

            // Build as many filler instructions as the largest tested count (the last entry of Counts).
            var largestCount = this.Counts[this.Counts.Length - 1];
            List<string> filler = new List<string>();
            for (int slot = 0; slot < largestCount; slot++)
            {
                filler.Add(FillerForSlot(slot));
            }

            string[] fillerArr = filler.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, fillerArr, fillerArr, false, vectorInit,
                postLoadInstrs1: dependentBranch, postLoadInstrs2: dependentBranch);
            sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }

        // Picks the filler instruction for a position: vector adds for slots 0-32,
        // scalar adds for slots 33-65, stores from slot 66 onwards.
        private static string FillerForSlot(int slot)
        {
            if (slot < 33)
            {
                return " add v1.4s, v1.4s, v0.4s";
            }

            if (slot < 66)
            {
                return " add x15, x15, x11";
            }

            return " str x12, [x2]";
        }
    }
}
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// ADDV test built with the NSQ test generator: one instruction group reads v16, which is
    /// loaded after the pointer chasing load, the other reads v15, which is loaded once up front.
    /// </summary>
    public class AddvNsq : UarchTest
    {
        // Passed through unchanged to UarchTestHelpers.GenerateArmAsmNsqTestFuncs.
        private int totalOps;

        /// <summary>
        /// Sets up the test for counts generated from (low, high, step) by
        /// UarchTestHelpers.GenerateCountArray.
        /// </summary>
        public AddvNsq(int low, int high, int step, int totalOps)
        {
            this.totalOps = totalOps;
            // NOTE(review): unlike AddNsq, the prefix does not include totalOps - confirm that
            // only one AddvNsq instance is ever registered, otherwise generated names would clash.
            this.Prefix = "addvnsq";
            this.Description = "ADDV, excluding possible NSQ";
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Only aarch64 is supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

        /// <summary>
        /// Appends the aarch64 test functions to <paramref name="sb"/>. Does nothing for other ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // v16 is reloaded using w25 as the offset, so instructions reading it wait for that load.
            string loadDependentSource = " ldr d16, [x2, w25, sxtw #0]";
            // v15 is loaded once during initialization.
            string loadIndependentSource = " ldr d15, [x2]";

            string[] dependent =
            {
                " addv h1, v16.4h",
                " addv h2, v16.4h",
                " addv h3, v16.4h",
                " addv h4, v16.4h",
            };

            string[] independent =
            {
                " addv h1, v15.4h",
                " addv h2, v15.4h",
                " addv h3, v15.4h",
                " addv h4, v15.4h",
            };

            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(
                sb, this.totalOps, this.Counts, this.Prefix, dependent, independent, false, loadIndependentSource,
                postLoadInstrs: loadDependentSource);
        }
    }
}
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Scheduler capacity test using ADDV instructions whose source vector (v16) is loaded
    /// right after each pointer chasing load.
    /// </summary>
    public class AddvSched : UarchTest
    {
        /// <summary>
        /// Sets up the test for counts generated from (low, high, step) by
        /// UarchTestHelpers.GenerateCountArray.
        /// </summary>
        public AddvSched(int low, int high, int step)
        {
            this.Prefix = "addvsched";
            this.Description = "ADDV Scheduler";
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Only aarch64 is supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

        /// <summary>
        /// Appends the aarch64 test functions to <paramref name="sb"/>. Does nothing for other ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // NOTE(review): both post-load snippets index with w25, while AeseSchedTest and
            // Fadd128SchedTest use w25 for the first and w26 for the second - confirm this is intended.
            string afterFirstLoad = " ldr q16, [x2, w25, sxtw #0]";
            string afterSecondLoad = " ldr q16, [x2, w25, sxtw #0]";

            // Four ADDV instructions with distinct destinations, all reading v16.
            string[] addvInstrs =
            {
                " addv h1, v16.4h",
                " addv h2, v16.4h",
                " addv h3, v16.4h",
                " addv h4, v16.4h",
            };

            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, addvInstrs, addvInstrs, false, null,
                postLoadInstrs1: afterFirstLoad, postLoadInstrs2: afterSecondLoad);
        }
    }
}
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Scheduler capacity test using 128-bit vector floating point adds.
    /// amd64 uses ADDPS on xmm registers; aarch64 uses FADD on four single precision lanes.
    /// </summary>
    public class Fadd128SchedTest : UarchTest
    {
        /// <summary>
        /// Sets up the test for counts generated from (low, high, step) by
        /// UarchTestHelpers.GenerateCountArray.
        /// </summary>
        public Fadd128SchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "fadd128sched";
            this.Description = "128-bit Vector FP Add Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Supported on amd64 and aarch64.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>
        /// Appends the test functions for the requested ISA to <paramref name="sb"/>.
        /// Does nothing for unsupported ISAs.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // Packed single precision adds, all reading xmm0, into four different destinations.
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " addps %xmm0, %xmm1";
                unrolledAdds[1] = " addps %xmm0, %xmm2";
                unrolledAdds[2] = " addps %xmm0, %xmm3";
                unrolledAdds[3] = " addps %xmm0, %xmm4";

                UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // q0 is reloaded after each pointer chasing load (w25, then w26 as offset),
                // so the adds below, which all read v0, have to wait for that load.
                string postLoadInstrs1 = " ldr q0, [x2, w25, uxtw#0]";
                string postLoadInstrs2 = " ldr q0, [x2, w26, uxtw#0]";

                // Use FADD rather than ADD: on a .4s arrangement ADD is the integer vector add,
                // which does not match this test's FP add purpose or the amd64 ADDPS variant.
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " fadd v1.4s, v1.4s, v0.4s";
                unrolledAdds[1] = " fadd v2.4s, v2.4s, v0.4s";
                unrolledAdds[2] = " fadd v3.4s, v3.4s, v0.4s";
                unrolledAdds[3] = " fadd v4.4s, v4.4s, v0.4s";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, null, postLoadInstrs1, postLoadInstrs2);
            }
        }
    }
}
12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; 13 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for aarch64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.aarch64) return true; 20 | return false; 21 | } 22 | 23 | /* For aarch64, hands four unrolled fcmp instructions (first operands s17..s20, all compared against s16) to the FP scheduler helper; emits nothing for other ISAs. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 24 | { 25 | if (isa == IUarchTest.ISA.aarch64) 26 | { 27 | string[] unrolledAdds = new string[4]; 28 | unrolledAdds[0] = " fcmp s17, s16"; 29 | /* s18 keeps the four first operands distinct (s17..s20) instead of repeating s19 */ unrolledAdds[1] = " fcmp s18, s16"; 30 | unrolledAdds[2] = " fcmp s19, s16"; 31 | unrolledAdds[3] = " fcmp s20, s16"; 32 | UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds); 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /AsmGen/tests/FlagRfTest.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Test described as "Flags Register File"; the optional dependent-branch variant appends "db" to the prefix and a suffix to the description. */ public class FlagRfTest : UarchTest 6 | { 7 | private bool initialDependentBranch; 8 | public FlagRfTest(int low, int high, int step, bool initialDependentBranch) 9 | { 10 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 11 | this.Prefix = "flagrf" + (initialDependentBranch ? "db" : string.Empty); 12 | this.Description = "Flags Register File" + (initialDependentBranch ?
", preceded by dependent branch" : string.Empty); 13 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; 14 | this.GetFunctionCallParameters = "structIterations, A"; 15 | this.DivideTimeByCount = false; 16 | this.initialDependentBranch = initialDependentBranch; 17 | } 18 | 19 | /* The dependent-branch variant is rejected for every ISA except aarch64; otherwise amd64 and aarch64 return true and everything else false. */ public override bool SupportsIsa(IUarchTest.ISA isa) 20 | { 21 | if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false; 22 | if (isa == IUarchTest.ISA.amd64) return true; 23 | if (isa == IUarchTest.ISA.aarch64) return true; 24 | if (isa == IUarchTest.ISA.mips64) return false; 25 | return false; 26 | } 27 | 28 | /* amd64 uses a single "test" instruction as filler; aarch64 uses a single "cmp" and, for the dependent-branch variant, passes the same branch text as both post-load strings. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 29 | { 30 | if (isa == IUarchTest.ISA.amd64) 31 | { 32 | string[] unrolledAdds = new string[1]; 33 | unrolledAdds[0] = " test %r15, %r14"; 34 | UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true); 35 | } 36 | else if (isa == IUarchTest.ISA.aarch64) 37 | { 38 | string postLoadInstrs = this.initialDependentBranch ?
UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null; 39 | string[] unrolledAdds = new string[1]; 40 | unrolledAdds[0] = " cmp x14, x15"; 41 | UarchTestHelpers.GenerateArmAsmStructureTestFuncs( 42 | sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs); 43 | if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix)); 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /AsmGen/tests/FmovSched.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Test described as "FMOV vec to gpr Scheduler"; generated functions carry the "fmovsched" prefix. */ public class FmovSched : UarchTest 6 | { 7 | public FmovSched(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "fmovsched"; 11 | this.Description = "FMOV vec to gpr Scheduler"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; 13 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for aarch64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.aarch64) return true; 20 | return false; 21 | } 22 | 23 | /* For aarch64, emits four unrolled "fmov xN, d16" fillers; d16 is reloaded by each post-load string so the fillers depend on the preceding load. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 24 | { 25 | if (isa == IUarchTest.ISA.aarch64) 26 | { 27 | string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]"; 28 | /* NOTE(review): indexes with w26 like the other two-load tests in this file; assumes the helper's second pointer-chasing load writes w26 - confirm against UarchTestHelpers */ string postLoadInstrs2 = " ldr d16, [x2, w26, sxtw #0]"; 29 | string[] unrolledInstrs = new string[4]; 30 | unrolledInstrs[0] = " fmov x15, d16"; 31 | unrolledInstrs[1] = " fmov x14, d16"; 32 | unrolledInstrs[2] = " fmov x13, d16"; 33 | unrolledInstrs[3] = " fmov x12, d16"; 34 | UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null, 35 | postLoadInstrs1:
postLoadInstrs1, postLoadInstrs2: postLoadInstrs2); 36 | } 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /AsmGen/tests/FpStoreDataNsq.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Test described as "Store FP 32-bit data scheduler capacity, excluding nsq"; the prefix is "fpstoredatansq" followed by the value of high. */ public class FpStoreDataNsqTest : UarchTest 6 | { 7 | public FpStoreDataNsqTest(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "fpstoredatansq" + high; 11 | this.Description = "Store FP 32-bit data scheduler capacity, excluding nsq"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; 13 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for amd64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.amd64) return true; 20 | return false; 21 | } 22 | 23 | /* For amd64: the dependent ops are four movss stores of xmm1, which the post-load cvtsi2ss writes from rdi; the independent ops are four addss into xmm3..xmm6. The last Counts entry is passed as the helper's second argument. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 24 | { 25 | if (isa == IUarchTest.ISA.amd64) 26 | { 27 | string initInstrs = " vzeroupper\n vpcmpeqd %xmm2, %xmm2, %xmm2\n vpxor %xmm2, %xmm3, %xmm3\n cvtsi2ss %r11, %xmm3\n movss %xmm3, %xmm4\n movss %xmm3, %xmm5\n movss %xmm3, %xmm6"; 28 | string postLoadInstr = " cvtsi2ss %rdi, %xmm1"; 29 | string[] dependentStores = new string[4]; 30 | dependentStores[0] = " movss %xmm1, (%r8)"; 31 | dependentStores[1] = " movss %xmm1, (%r8, %r14, 4)"; 32 | dependentStores[2] = " movss %xmm1, (%r8, %r13, 4)"; 33 | dependentStores[3] = " movss %xmm1, (%r8, %r12, 4)"; 34 | 35 | string[] indepFpInstrs = new string[4]; 36 | indepFpInstrs[0] = " addss %xmm2, %xmm3"; 37 | indepFpInstrs[1] = " addss %xmm2, %xmm4"; 38 | indepFpInstrs[2] = " addss %xmm2, %xmm5"; 39 | indepFpInstrs[3] = " addss %xmm2, %xmm6"; 40 | 41 | UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts,
this.Prefix, dependentStores, indepFpInstrs, false, initInstrs: initInstrs, postLoadInstrs: postLoadInstr); 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /AsmGen/tests/JsCvtNsq.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* FJCVTZS scheduler test "excluding possible NSQ" (per its description); unlike the tests that pass their largest count, it forwards a caller-supplied totalOps to the NSQ helper. */ public class JsCvtNsq : UarchTest 6 | { 7 | private int totalOps; 8 | public JsCvtNsq(int low, int high, int step, int totalOps) 9 | { 10 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 11 | this.Prefix = "jscvtnsq"; 12 | this.Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler, excluding possible NSQ"; 13 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; 14 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 15 | this.DivideTimeByCount = false; 16 | this.totalOps = totalOps; 17 | } 18 | 19 | /* Returns true for aarch64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 20 | { 21 | if (isa == IUarchTest.ISA.aarch64) return true; 22 | return false; 23 | } 24 | 25 | /* Dependent fjcvtzs instructions read d16, which the post-load string loads using w25 as index; independent ones read d15, loaded once by initInstrs. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 26 | { 27 | if (isa == IUarchTest.ISA.aarch64) 28 | { 29 | string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]"; 30 | string initInstrs = " ldr d15, [x2]"; 31 | string[] depInstrs = new string[4]; 32 | depInstrs[0] = " fjcvtzs w15, d16"; 33 | depInstrs[1] = " fjcvtzs w14, d16"; 34 | depInstrs[2] = " fjcvtzs w13, d16"; 35 | depInstrs[3] = " fjcvtzs w12, d16"; 36 | 37 | string[] indepInstrs = new string[4]; 38 | indepInstrs[0] = " fjcvtzs w15, d15"; 39 | indepInstrs[1] = " fjcvtzs w14, d15"; 40 | indepInstrs[2] = " fjcvtzs w13, d15"; 41 | indepInstrs[3] = " fjcvtzs w12, d15"; 42 | UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, 43 | postLoadInstrs: postLoadInstrs1); 44 | } 45 |
} 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /AsmGen/tests/JsCvtSched.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* FJCVTZS scheduler test; generated functions carry the "jscvtsched" prefix. */ public class JsCvtSched : UarchTest 6 | { 7 | public JsCvtSched(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "jscvtsched"; 11 | this.Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; 13 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for aarch64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.aarch64) return true; 20 | return false; 21 | } 22 | 23 | /* For aarch64, emits four unrolled "fjcvtzs wN, d16" fillers; d16 is reloaded by each post-load string so the fillers depend on the preceding load. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 24 | { 25 | if (isa == IUarchTest.ISA.aarch64) 26 | { 27 | string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]"; 28 | /* NOTE(review): indexes with w26 like the other two-load tests in this file; assumes the helper's second pointer-chasing load writes w26 - confirm against UarchTestHelpers */ string postLoadInstrs2 = " ldr d16, [x2, w26, sxtw #0]"; 29 | string[] unrolledInstrs = new string[4]; 30 | unrolledInstrs[0] = " fjcvtzs w15, d16"; 31 | unrolledInstrs[1] = " fjcvtzs w14, d16"; 32 | unrolledInstrs[2] = " fjcvtzs w13, d16"; 33 | unrolledInstrs[3] = " fjcvtzs w12, d16"; 34 | UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null, 35 | postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2); 36 | } 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /AsmGen/tests/JumpNsqTest.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | public class JumpNsqTest : UarchTest 6 | { 7 | public JumpNsqTest(int low, int high,
int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "jumpnsq"; 11 | this.Description = "Scheduler, Not-Taken Jumps, excluding possible nsq"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; 13 | this.GetFunctionCallParameters = "structIterations, A"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Only amd64 is enabled; the other ISA checks are commented out. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.amd64) return true; 20 | // if (isa == IUarchTest.ISA.aarch64) return true; 21 | // if (isa == IUarchTest.ISA.mips64) return true; 22 | // if (isa == IUarchTest.ISA.riscv) return true; 23 | return false; 24 | } 25 | 26 | /* Dependent pairs compare rdi with rsi, independent pairs compare r13 with r14; both jump to jumpnsq_reallybadthing, which is appended after the generated functions as an int3. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 27 | { 28 | if (isa == IUarchTest.ISA.amd64) 29 | { 30 | string[] dependentJumps = new string[1]; 31 | dependentJumps[0] = " cmp %rdi, %rsi\n je jumpnsq_reallybadthing"; 32 | string[] independentJumps = new string[1]; 33 | independentJumps[0] = " cmp %r13, %r14\n je jumpnsq_reallybadthing"; 34 | UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentJumps, independentJumps); 35 | 36 | sb.AppendLine("jumpnsq_reallybadthing:\n int3"); 37 | } 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /AsmGen/tests/JumpSchedTest.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Test described as "Scheduler, Not-Taken Jumps"; generated functions carry the "jumpsched" prefix. */ public class JumpSchedTest : UarchTest 6 | { 7 | public JumpSchedTest(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "jumpsched"; 11 | this.Description = "Scheduler, Not-Taken Jumps"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; 13 | this.GetFunctionCallParameters = "structIterations, A"; 14 | this.DivideTimeByCount = false; 15 | } 16
| 17 | /* amd64, aarch64 and riscv return true; the mips64 check is commented out. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.amd64) return true; 20 | if (isa == IUarchTest.ISA.aarch64) return true; 21 | // if (isa == IUarchTest.ISA.mips64) return true; 22 | if (isa == IUarchTest.ISA.riscv) return true; 23 | return false; 24 | } 25 | 26 | /* Each ISA branch emits one compare-and-branch filler targeting jumpsched_reallybadthing, then appends that label followed by int3 (amd64) or a raw .word (aarch64, riscv). The riscv branch passes false where the other two pass includePtrChasingLoads: true. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 27 | { 28 | if (isa == IUarchTest.ISA.amd64) 29 | { 30 | string[] unrolledJumps = new string[1]; 31 | unrolledJumps[0] = " cmp %rdi, %rsi\n je jumpsched_reallybadthing"; 32 | UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true); 33 | 34 | sb.AppendLine("jumpsched_reallybadthing:\n int3"); 35 | } 36 | else if (isa == IUarchTest.ISA.aarch64) 37 | { 38 | string[] unrolledJumps = new string[1]; 39 | unrolledJumps[0] = " cmp x25, x26\n b.eq jumpsched_reallybadthing"; 40 | UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true); 41 | sb.AppendLine("jumpsched_reallybadthing:\n .word 0xf7f0a000"); 42 | } 43 | else if (isa == IUarchTest.ISA.riscv) 44 | { 45 | // todo 46 | string[] unrolledJumps = new string[1]; 47 | unrolledJumps[0] = " beq x5, x6, jumpsched_reallybadthing"; 48 | UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, false); 49 | sb.AppendLine("jumpsched_reallybadthing:\n .word 0x00000000"); 50 | } 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /AsmGen/tests/LeaSchedTest.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Scheduler test built from lea instructions; its description is set on the following lines. */ public class LeaSchedTest : UarchTest 6 | { 7 | public LeaSchedTest(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 |
this.Prefix = "leasched"; 11 | this.Description = "Scheduler, lea with base + index + offset"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; 13 | this.GetFunctionCallParameters = "structIterations, A"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for amd64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.amd64) return true; 20 | return false; 21 | } 22 | 23 | /* Four unrolled lea instructions, each computing 128 + its own destination register + rdi back into r15..r12; the helper is called with includePtrChasingLoads: false. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 24 | { 25 | if (isa == IUarchTest.ISA.amd64) 26 | { 27 | string[] unrolledAdds = new string[4]; 28 | unrolledAdds[0] = " lea 128(%r15, %rdi), %r15"; 29 | unrolledAdds[1] = " lea 128(%r14, %rdi), %r14"; 30 | unrolledAdds[2] = " lea 128(%r13, %rdi), %r13"; 31 | unrolledAdds[3] = " lea 128(%r12, %rdi), %r12"; 32 | UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false); 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /AsmGen/tests/LoadNsq.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Test described as "Load Address Scheduler, Excluding any NSQ"; generated functions carry the "loadnsq" prefix. */ public class LoadNsq : UarchTest 6 | { 7 | public LoadNsq(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "loadnsq"; 11 | this.Description = "Load Address Scheduler, Excluding any NSQ"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; 13 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for aarch64 and amd64. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.aarch64) return true; 20 | if (isa == IUarchTest.ISA.amd64) return true; 21 | return false; 22 | } 23 | 24 | public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 25 | { 26 | if
(isa == IUarchTest.ISA.amd64) 27 | { 28 | /* dependent loads: address uses rdi as index */ string[] dep = new string[3]; 29 | dep[0] = " mov (%r8, %rdi, 4), %r15"; 30 | dep[1] = " mov (%r8, %rdi, 4), %r14"; 31 | dep[2] = " mov (%r8, %rdi, 4), %r13"; 32 | 33 | /* independent loads: address is r8 alone */ string[] indep = new string[3]; 34 | indep[0] = " mov (%r8), %r15"; 35 | indep[1] = " mov (%r8), %r14"; 36 | indep[2] = " mov (%r8), %r13"; 37 | 38 | UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dep, indep, ptrChasingLoadsInSq: true); 39 | } 40 | if (isa == IUarchTest.ISA.aarch64) 41 | { 42 | /* dependent loads: address uses w25 as index */ string[] dep = new string[3]; 43 | dep[0] = " ldr w15, [x2, w25, uxtw #2]"; 44 | dep[1] = " ldr w14, [x2, w25, uxtw #2]"; 45 | dep[2] = " ldr w13, [x2, w25, uxtw #2]"; 46 | 47 | /* independent loads: address is x2 alone */ string[] indep = new string[3]; 48 | indep[0] = " ldr w12, [x2]"; 49 | indep[1] = " ldr w11, [x2]"; 50 | indep[2] = " ldr w10, [x2]"; 51 | UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dep, indep); 52 | } 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /AsmGen/tests/MaddSchedTest.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Test described as "Scheduler, Integer Multiply-Add"; generated functions carry the "maddsched" prefix. */ public class MaddSchedTest : UarchTest 6 | { 7 | public MaddSchedTest(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "maddsched"; 11 | this.Description = "Scheduler, Integer Multiply-Add"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; 13 | this.GetFunctionCallParameters = "structIterations, A"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for aarch64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.aarch64) return true; 20 | return false; 21 | } 22 | 23 | public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 24 | { 25 | if (isa ==
IUarchTest.ISA.aarch64) 26 | { 27 | /* four madd fillers, each multiplying its own destination register by x25 and adding x10 */ string[] unrolledMuls = new string[4]; 28 | unrolledMuls[0] = " madd x15, x15, x25, x10"; 29 | unrolledMuls[1] = " madd x14, x14, x25, x10"; 30 | unrolledMuls[2] = " madd x13, x13, x25, x10"; 31 | unrolledMuls[3] = " madd x12, x12, x25, x10"; 32 | UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls, includePtrChasingLoads: false); 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /AsmGen/tests/MaskRfTest.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Test described as "Mask Registers - AVX-512 only"; generated functions carry the "maskrf" prefix. */ public class MaskRfTest : UarchTest 6 | { 7 | public MaskRfTest(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "maskrf"; 11 | this.Description = "Mask Registers - AVX-512 only"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; 13 | this.GetFunctionCallParameters = "structIterations, A"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for amd64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.amd64) return true; 20 | return false; 21 | } 22 | 23 | /* Four kaddb fillers adding k0 into k1..k4; the helper's last argument is false. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 24 | { 25 | if (isa == IUarchTest.ISA.amd64) 26 | { 27 | string[] unrolledAdds = new string[4]; 28 | unrolledAdds[0] = " kaddb %k0, %k1, %k1"; 29 | unrolledAdds[1] = " kaddb %k0, %k2, %k2"; 30 | unrolledAdds[2] = " kaddb %k0, %k3, %k3"; 31 | unrolledAdds[3] = " kaddb %k0, %k4, %k4"; 32 | UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false); 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /AsmGen/tests/MixAddvJsCvtNsq.cs:
-------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Test described as "ADDV and fjcvtzs Scheduler, Excluding any NSQ"; generated functions carry the "mixaddvjscvtnsq" prefix. */ public class MixAddvJsCvtNsq : UarchTest 6 | { 7 | public MixAddvJsCvtNsq(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "mixaddvjscvtnsq"; 11 | this.Description = "ADDV and fjcvtzs Scheduler, Excluding any NSQ"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; 13 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for aarch64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.aarch64) return true; 20 | return false; 21 | } 22 | 23 | /* Dependent instructions alternate addv (reading v16) and fjcvtzs (reading d2), both loaded by the post-load string using w25; independent ones read v17 and d15, loaded once by initInstrs. The last Counts entry is passed as the helper's second argument. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 24 | { 25 | if (isa == IUarchTest.ISA.aarch64) 26 | { 27 | string postLoadInstrs1 = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]"; 28 | string initInstrs = " ldr q17, [x2]\n ldr d15, [x2]"; 29 | string[] depInstrs = new string[4]; 30 | depInstrs[0] = " addv h1, v16.4h"; 31 | depInstrs[1] = " fjcvtzs w15, d2"; 32 | depInstrs[2] = " addv h3, v16.4h"; 33 | depInstrs[3] = " fjcvtzs w14, d2"; 34 | 35 | string[] indepInstrs = new string[4]; 36 | indepInstrs[0] = " addv h4, v17.4h"; 37 | indepInstrs[1] = " fjcvtzs w12, d15"; 38 | indepInstrs[2] = " addv h5, v17.4h"; 39 | indepInstrs[3] = " fjcvtzs w13, d15"; 40 | UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs: initInstrs, 41 | postLoadInstrs: postLoadInstrs1); 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /AsmGen/tests/MixAddvJsCvtSched.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | public class
MixAddvJsCvtSched : UarchTest 6 | { 7 | public MixAddvJsCvtSched(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "mixaddvjscvtsched"; 11 | this.Description = "ADDV and fjcvtzs Scheduler"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; 13 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Returns true for aarch64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.aarch64) return true; 20 | return false; 21 | } 22 | 23 | /* Fillers alternate addv (reading v16) and fjcvtzs (reading d2); the first post-load string reloads both using w25 as index, the second using w26. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 24 | { 25 | if (isa == IUarchTest.ISA.aarch64) 26 | { 27 | string postLoadInstrs1 = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]"; 28 | string postLoadInstrs2 = " ldr q16, [x2, w26, sxtw #0]\n ldr d2, [x2, w26, sxtw #0]"; 29 | string[] unrolledInstrs = new string[4]; 30 | unrolledInstrs[0] = " addv h1, v16.4h"; 31 | unrolledInstrs[1] = " fjcvtzs w15, d2"; 32 | unrolledInstrs[2] = " addv h3, v16.4h"; 33 | unrolledInstrs[3] = " fjcvtzs w14, d2"; 34 | UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null, 35 | postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2); 36 | } 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /AsmGen/tests/MixFpRfDepBranchTest.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Text; 3 | 4 | namespace AsmGen 5 | { 6 | /* Test described as "FP Register File, with some dependent branches"; interval is appended to both the prefix and the branch-target label. */ public class MixFpRfDepBranchTest : UarchTest 7 | { 8 | private int interval; 9 | public MixFpRfDepBranchTest(int low, int high, int step, int interval) 10 | { 11 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 12 | this.Prefix = "mixfprfdepbranch" + interval; 13 | this.Description
= "FP Register File, with some dependent branches"; 14 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *fpArr"; 15 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 16 | this.DivideTimeByCount = false; 17 | this.interval = interval; 18 | } 19 | 20 | /* Returns true for aarch64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 21 | { 22 | if (isa == IUarchTest.ISA.aarch64) return true; 23 | return false; 24 | } 25 | 26 | /* Builds one fadd per index up to the largest count, cycling s18..s21 with s17 as the addend, and appends a cmp/b.eq pair whenever the index is a multiple of interval (including index 0). */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 27 | { 28 | if (isa == IUarchTest.ISA.aarch64) 29 | { 30 | string initInstrs = " ldr s17, [x2]\n" + 31 | " ldr s18, [x2, 4]\n" + 32 | " ldr s19, [x2, 8]\n" + 33 | " ldr s20, [x2, 12]\n" + 34 | " ldr s21, [x2, 16]\n"; 35 | 36 | /* element type must be string so ToArray() yields the string[] passed to the helper */ List<string> unrolledAddsList = new List<string>(); 37 | for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++) 38 | { 39 | int regnum = 18 + (i % 4); 40 | unrolledAddsList.Add($" fadd s{regnum}, s{regnum}, s17"); 41 | if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixfpjumpsched_badthing" + interval); 42 | } 43 | string[] unrolledAdds = unrolledAddsList.ToArray(); 44 | UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, initInstrs: initInstrs); 45 | 46 | sb.AppendLine($"mixfpjumpsched_badthing{interval}:\n .word 0xf7f0a000"); 47 | } 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /AsmGen/tests/MixFpVecRfTest.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Text; 3 | 4 | namespace AsmGen 5 | { 6 | public class MixFpVecRfTest : UarchTest 7 | { 8 | private bool initialDependentBranch; 9 | public MixFpVecRfTest(int low, int high, int step, bool initialDependentBranch) 10 | { 11 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 12 | this.Prefix = "mixfpvecrf" +
(initialDependentBranch ? "db" : string.Empty); 13 | this.Description = "Mixed FP/128-bit FP vec rf" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty); 14 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; 15 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 16 | this.DivideTimeByCount = false; 17 | this.initialDependentBranch = initialDependentBranch; 18 | } 19 | 20 | /* Both the dependent-branch path and the plain path return true for riscv only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 21 | { 22 | if (this.initialDependentBranch) 23 | { 24 | if (isa == IUarchTest.ISA.riscv) return true; 25 | return false; 26 | } 27 | 28 | if (isa == IUarchTest.ISA.riscv) return true; 29 | return false; 30 | } 31 | 32 | /* For riscv, alternates a vector vfadd.vv and a scalar fadd.s as fillers. The post-load text is the optional dependent branch followed by "mv t6, a2", and is passed for both post-load slots. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 33 | { 34 | if (isa == IUarchTest.ISA.riscv) 35 | { 36 | string initInstrs = " vsetvli t5, t6, e32\n vlw.v v0, (a1)\n fld f0, (a1)"; 37 | string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty; 38 | postLoadInstrs += "\n mv t6, a2"; 39 | string[] unrolledInstrs = new string[2]; 40 | unrolledInstrs[0] = " vfadd.vv v0, v0, v0"; 41 | unrolledInstrs[1] = " fadd.s f0, f0, f0"; 42 | UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, 43 | initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs); 44 | if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix)); 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /AsmGen/tests/MixIntRfDepBranchTest.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Text; 3 | 4 | namespace AsmGen 5 | { 6 | public class MixIntRfDepBranchTest : UarchTest 7 | { 8 | private int interval; 9 | public
MixIntRfDepBranchTest(int low, int high, int step, int interval) 10 | { 11 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 12 | this.Prefix = "mixintrfdepbranch" + interval; 13 | this.Description = "Integer Register File, with some dependent branches"; 14 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; 15 | this.GetFunctionCallParameters = "structIterations, A"; 16 | this.DivideTimeByCount = false; 17 | this.interval = interval; 18 | } 19 | 20 | /* Returns true for aarch64 only. */ public override bool SupportsIsa(IUarchTest.ISA isa) 21 | { 22 | if (isa == IUarchTest.ISA.aarch64) return true; 23 | return false; 24 | } 25 | 26 | /* Builds one add per index from 1 to the largest count, cycling x12..x15 with x11 as the addend, and appends a cmp/b.eq pair whenever the index is a multiple of interval. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 27 | { 28 | if (isa == IUarchTest.ISA.aarch64) 29 | { 30 | /* element type must be string so ToArray() yields the string[] passed to the helper */ List<string> unrolledAddsList = new List<string>(); 31 | for (int i = 1; i < this.Counts[this.Counts.Length - 1] + 1; i++) 32 | { 33 | int regnum = 12 + (i % 4); 34 | unrolledAddsList.Add($" add x{regnum}, x{regnum}, x11"); 35 | if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixintjumpsched_badthing" + interval); 36 | } 37 | string[] unrolledAdds = unrolledAddsList.ToArray(); 38 | UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true); 39 | 40 | sb.AppendLine($"mixintjumpsched_badthing{interval}:\n .word 0xf7f0a000"); 41 | } 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /AsmGen/tests/MixJumpStoreDataSched.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | public class MixJumpStoreDataSched : UarchTest 6 | { 7 | public MixJumpStoreDataSched(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "mixjumpstoredatasched"; 11 | this.Description = "Scheduler, Mixed Jumps and Store Data"; 12 |
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr"; 13 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | /* Only amd64 is enabled; the other ISA checks are commented out. */ public override bool SupportsIsa(IUarchTest.ISA isa) 18 | { 19 | if (isa == IUarchTest.ISA.amd64) return true; 20 | //if (isa == IUarchTest.ISA.aarch64) return true; 21 | // if (isa == IUarchTest.ISA.mips64) return true; 22 | // if (isa == IUarchTest.ISA.riscv) return true; 23 | return false; 24 | } 25 | 26 | /* Fillers alternate a cmp/je pair with a store whose data register is rdi and whose address uses r8 alone; the je target label is appended afterwards as an int3. */ public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) 27 | { 28 | if (isa == IUarchTest.ISA.amd64) 29 | { 30 | string[] unrolledJumps = new string[4]; 31 | unrolledJumps[0] = " cmp %rdi, %rsi\n je mixjumpstoredatasched_reallybadthing"; 32 | unrolledJumps[1] = " mov %rdi, (%r8)"; 33 | unrolledJumps[2] = " cmp %rdi, %rsi\n je mixjumpstoredatasched_reallybadthing"; 34 | unrolledJumps[3] = " mov %rdi, 64(%r8)"; 35 | UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true); 36 | 37 | sb.AppendLine("mixjumpstoredatasched_reallybadthing:\n int3"); 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /AsmGen/tests/MixJumpStoreSchedTest.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace AsmGen 4 | { 5 | /* Test described as "Scheduler, Mixed Jumps and Stores (Address Dependency)"; generated functions carry the "mixjumpstoresched" prefix. */ public class MixJumpStoreSchedTest : UarchTest 6 | { 7 | public MixJumpStoreSchedTest(int low, int high, int step) 8 | { 9 | this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); 10 | this.Prefix = "mixjumpstoresched"; 11 | this.Description = "Scheduler, Mixed Jumps and Stores (Address Dependency)"; 12 | this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr"; 13 | this.GetFunctionCallParameters = "structIterations, A, fpArr"; 14 | this.DivideTimeByCount = false; 15 | } 16 | 17 | public
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    // Scheduler test whose filler is 40 compare + conditional-branch pairs
    // followed by adds up to the largest tested count. aarch64 only.
    public class MixJumpThenAddSched : UarchTest
    {
        // Number of branch pairs placed ahead of the adds.
        private const int JumpCount = 40;

        // low/high/step are forwarded to GenerateCountArray to build the list of
        // structure sizes to test.
        public MixJumpThenAddSched(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixjumpthenaddsched";
            this.Description = "Scheduler, 40 NT jumps + adds";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        // Only aarch64 is supported by this test.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

        // Emits the test functions and the label the branches point at. Does
        // nothing for any ISA other than aarch64.
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            int largestCount = this.Counts[this.Counts.Length - 1];
            List<string> filler = new List<string>();

            // The branch pairs are always emitted, even when largestCount is
            // smaller than JumpCount.
            int emitted = 0;
            while (emitted < JumpCount)
            {
                filler.Add(" cmp x25, x26\n b.eq mixaddthenjumpsched_reallybadthing");
                emitted++;
            }

            // Pad with adds until the list is as long as the largest count.
            while (emitted < largestCount)
            {
                filler.Add(" add x15, x15, x25");
                emitted++;
            }

            string[] fillerArr = filler.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerArr, fillerArr, includePtrChasingLoads: true, dsb: true);
            sb.AppendLine("mixaddthenjumpsched_reallybadthing:\n .word 0xf7f0a000");
        }
    }
}
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    // Register-file test whose filler alternates 512-bit (zmm) and 256-bit (ymm)
    // vaddps instructions. Even-numbered registers are used as ymm, odd-numbered
    // registers as zmm. amd64 only.
    public class MixVec512Vec256RfTest : UarchTest
    {
        // low/high/step are forwarded to GenerateCountArray to build the list of
        // structure sizes to test.
        public MixVec512Vec256RfTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixvec512vec256rf";
            this.Description = "Mixed zmm/ymm regs - AVX-512 only, alternating";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        // Only amd64 is supported by this test.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        // Emits the test functions. Does nothing for any ISA other than amd64.
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // Registers 1-5 are loaded from memory at (%r8); even ones as ymm,
            // odd ones as zmm.
            StringBuilder init = new StringBuilder();
            init.Append(" vmovups (%r8), %zmm1\n");
            init.Append(" vmovups 64(%r8), %ymm2\n");
            init.Append(" vmovups 128(%r8), %zmm3\n");
            init.Append(" vmovups 192(%r8), %ymm4\n");
            init.Append(" vmovups 256(%r8), %zmm5\n");

            // Registers 6-31 are seeded by copying from ymm2 (even) or zmm5 (odd).
            for (int reg = 6; reg < 32; reg++)
            {
                bool isEven = reg % 2 == 0;
                init.Append(isEven ? "vmovups %ymm2, %ymm" : "vmovups %zmm5, %zmm");
                init.Append(reg);
                init.Append("\n");
            }

            // One add per register 1..31, width chosen by register parity.
            string[] adds = new string[31];
            for (int reg = 1; reg < 32; reg++)
            {
                adds[reg - 1] = (reg & 1) == 0
                    ? $" vaddps %ymm2, %ymm{reg}, %ymm{reg}"
                    : $" vaddps %zmm1, %zmm{reg}, %zmm{reg}";
            }

            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, adds, adds, false, init.ToString());
        }
    }
}
using System.Text;

namespace AsmGen
{
    // Scheduler test whose filler instructions are PDEPs that all read %rdi and
    // update one of r15, r14, r13, r12 in place. amd64 only.
    public class PdepSchedTest : UarchTest
    {
        // low/high/step are forwarded to GenerateCountArray to build the list of
        // structure sizes to test.
        public PdepSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "pdepsched";
            this.Description = "Scheduler, PDEP";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        // Only amd64 is supported by this test.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        // Emits the test functions. Does nothing for any ISA other than amd64.
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // Four PDEPs, targeting r15 down to r12 in that order.
            string[] pdeps = new string[4];
            for (int slot = 0; slot < pdeps.Length; slot++)
            {
                int reg = 15 - slot;
                pdeps[slot] = $" pdep %rdi, %r{reg}, %r{reg}";
            }

            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, pdeps, pdeps, includePtrChasingLoads: false);
        }
    }
}
using System.Text;

namespace AsmGen
{
    // Scheduler test whose filler is a single repeated "shl $1, %r15".
    // Before the filler, %r15 is set from %rdi (first slot) or %rsi (second
    // slot) via the helper's post-load instruction hooks. amd64 only.
    public class ShlSchedTest : UarchTest
    {
        // low/high/step are forwarded to GenerateCountArray to build the list of
        // structure sizes to test.
        public ShlSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "shlsched";
            this.Description = "Scheduler, Integer Shift by Immediate (1)";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        // Only amd64 is supported by this test.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        // Emits the test functions. Does nothing for any ISA other than amd64.
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // NOTE(review): %rdi / %rsi presumably hold the results of the two
            // preceding loads, which would make the shifts depend on them -
            // confirm against GenerateX86AsmStructureTestFuncs.
            string[] shifts = { " shl $1, %r15" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                sb,
                this.Counts,
                this.Prefix,
                shifts,
                shifts,
                postLoadInstrs1: " mov %rdi, %r15",
                postLoadInstrs2: " mov %rsi, %r15",
                includePtrChasingLoads: false);
        }
    }
}
using System.Text;

namespace AsmGen
{
    // Store scheduler test built on the "div" NSQ helpers: a set of stores that
    // depend on the divide result plus a set of independent stores.
    // Supports amd64 and aarch64.
    public class StoreDivNsqTest : UarchTest
    {
        // low/high/step are forwarded to GenerateCountArray to build the list of
        // structure sizes to test.
        public StoreDivNsqTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedivnsq";
            this.Description = "Store Scheduler, using DIVs to block retirement, excluding NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        // amd64 and aarch64 are supported; everything else is rejected.
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        // Emits the test functions for the requested ISA; unsupported ISAs
        // produce no output.
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            int largestCount = this.Counts[this.Counts.Length - 1];

            switch (isa)
            {
                case IUarchTest.ISA.amd64:
                {
                    // idiv leaves the remainder in RDX, so addressing through
                    // %rdx makes these stores wait on the divide.
                    string[] viaDivResult =
                    {
                        " mov %r15w, (%r8, %rdx, 2)",
                        " mov %r15w, 2(%r8, %rdx, 2)",
                        " mov %r15w, 4(%r8, %rdx, 2)",
                        " mov %r15w, 6(%r8, %rdx, 2)",
                    };

                    // Same 16-bit stores without the %rdx index.
                    string[] standalone =
                    {
                        " mov %r11w, (%r8)",
                        " mov %r11w, 2(%r8)",
                        " mov %r11w, 4(%r8)",
                        " mov %r11w, 6(%r8)",
                    };

                    UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(sb, largestCount, this.Counts, this.Prefix, viaDivResult, standalone);
                    break;
                }

                case IUarchTest.ISA.aarch64:
                {
                    string[] viaDivResult = { " str w15, [x2, w25, uxtw #2]" };

                    // NOTE(review): the independent store uses w15 as both data
                    // and index; verify this is intended.
                    string[] standalone = { " str w15, [x2, w15, uxtw #2]" };

                    UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(sb, largestCount, this.Counts, this.Prefix, viaDivResult, standalone);
                    break;
                }
            }
        }
    }
}
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    // Register-file test whose filler is 512-bit vaddps instructions cycling
    // through zmm1..zmm31. amd64 only.
    public class Vec512RfTest : UarchTest
    {
        // low/high/step are forwarded to GenerateCountArray to build the list of
        // structure sizes to test.
        public Vec512RfTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "vec512rf";
            this.Description = "Vector (512-bit packed fp) RF Test - AVX-512 only";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        // Only amd64 is supported by this test.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        // Emits the test functions. Does nothing for any ISA other than amd64.
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // zmm1..zmm5 are loaded from the first 320 bytes at (%r8); the
            // original author notes the pointer-chasing array should be far
            // larger than that.
            StringBuilder init = new StringBuilder();
            init.Append(" vmovups (%r8), %zmm1\n");
            init.Append(" vmovups 64(%r8), %zmm2\n");
            init.Append(" vmovups 128(%r8), %zmm3\n");
            init.Append(" vmovups 192(%r8), %zmm4\n");
            init.Append(" vmovups 256(%r8), %zmm5\n");

            // Seed the remaining registers (zmm6..zmm31) from zmm5.
            for (int reg = 6; reg < 32; reg++)
            {
                init.Append("vmovups %zmm5, %zmm").Append(reg).Append("\n");
            }

            // One add per register zmm1..zmm31, each adding zmm1 into itself.
            string[] adds = new string[31];
            for (int reg = 1; reg < 32; reg++)
            {
                adds[reg - 1] = $" vaddps %zmm1, %zmm{reg}, %zmm{reg}";
            }

            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, adds, adds, false, init.ToString());
        }
    }
}
# Builds the coherency latency test for the architecture chosen by
# ../Common/arch_detect.mk (which sets TARGET and the per-target CC).
include ../Common/arch_detect.mk

CFLAGS = -pthread -O3

all: $(TARGET)

# The three Linux targets share one recipe; $@ expands to the target name, so
# the output is CoherencyLatency_amd64, CoherencyLatency_aarch64, ...
amd64 aarch64 riscv64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_$@ $(LDFLAGS)

w64:
	$(CC) $(CFLAGS) CoherencyLatency.cpp -o CoherencyLatency_w64.exe $(LDFLAGS)

# w64 can build with mingw 11, which isn't available on jammy

ci: amd64 aarch64 riscv64

# The glob patterns must stay unquoted: inside double quotes the shell passes
# them to rm literally and nothing matching them is ever removed.
clean:
	rm -rf *.o *.zip ocl-icd-libopencl1* OpenCL-SDK* && find . -type f -executable -delete

# None of these targets produce a file with the target's own name, so mark
# them phony; otherwise a stray file called e.g. "amd64" would suppress the build.
.PHONY: all amd64 aarch64 riscv64 w64 ci clean
#!/bin/sh
# CI helper for GpuMemLatency: fetches the OpenCL loader libraries for the
# cross-compiled targets, then builds every target.
# Reads OCL_VER from the environment (OpenCL-SDK release tag).

# Build each target in turn, removing object files in between so objects from
# one architecture are not reused for the next.
make_all () {
    make amd64
    make clean-obj
    LDFLAGS="-lm -L ocl-icd-arm64/usr/lib/aarch64-linux-gnu -lOpenCL" make aarch64
    make clean-obj
    LDFLAGS="-lm -L ocl-icd-riscv64/usr/lib/riscv64-linux-gnu -lOpenCL" make riscv64
    make clean-obj
    CPPFLAGS="-I OpenCL-SDK-${OCL_VER}-Win-x64/include" LDFLAGS="-lm -L OpenCL-SDK-${OCL_VER}-Win-x64/lib -lOpenCL" make w64
    make clean-obj
}

# Download and unpack the ocl-icd loader package for arm64 and riscv64, adding
# the Ubuntu ports repository for an architecture if it is not configured yet.
linux_deps () {
    for ARCH in arm64 riscv64; do
        if ! grep -q "$ARCH" /etc/apt/sources.list; then
            echo "deb [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list
            echo "deb-src [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list
            sudo apt update
        fi
        apt-get download "ocl-icd-libopencl1:${ARCH}"
        find . -type f -name "*${ARCH}*.deb" -exec dpkg-deb -x {} "ocl-icd-${ARCH}" \;
    done
    # The package only ships the versioned .so.1; the linker's -lOpenCL needs
    # the unversioned name.
    cp ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so.1 ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so
    cp ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so.1 ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so
}

# Download and unpack the Khronos OpenCL SDK used for the Windows build.
w64_deps () {
    # -f: fail on HTTP errors, -s: no progress meter, -S: still print errors,
    # -L: follow redirects, -O: keep the remote file name.
    curl -fsSLO "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/${OCL_VER}/OpenCL-SDK-${OCL_VER}-Win-x64.zip"
    unzip "OpenCL-SDK-${OCL_VER}-Win-x64.zip"
}

linux_deps
w64_deps
make_all
ftime(&end); 20 | return 1000 * (end.time - startTimeb->time) + (end.millitm - startTimeb->millitm); 21 | } 22 | #else 23 | #include 24 | #include 25 | struct timeval startTv, endTv; 26 | void start_timing() { 27 | gettimeofday(&startTv, NULL); 28 | } 29 | 30 | unsigned int end_timing() { 31 | gettimeofday(&endTv, NULL); 32 | return (unsigned int)((endTv.tv_sec - startTv.tv_sec) * 1000 + (endTv.tv_usec - startTv.tv_usec) / 1000); 33 | } 34 | 35 | void start_timing_ts(struct timeval* start) { 36 | gettimeofday(start, NULL); 37 | } 38 | 39 | unsigned int end_timing_ts(struct timeval* start) { 40 | struct timeval end; 41 | gettimeofday(&end, NULL); 42 | return (unsigned int)((end.tv_sec - start->tv_sec) * 1000 + (end.tv_usec - start->tv_usec) / 1000); 43 | 44 | } 45 | #endif 46 | 47 | unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time) { 48 | // safety measure to deal with nasty timer precision issues if the system is fast 49 | if (last_time < 50) return last_iteration_count * 2; 50 | return last_iteration_count * (target_time / last_time); 51 | } 52 | -------------------------------------------------------------------------------- /Common/timing.h: -------------------------------------------------------------------------------- 1 | #ifndef timingincluded 2 | #define timingincluded 3 | #ifdef _MSC_VER 4 | #include 5 | #else 6 | #include 7 | #endif 8 | extern struct timeb start, end; 9 | inline void start_timing(); 10 | inline unsigned int end_timing(); 11 | 12 | #ifdef _MSC_VER 13 | void start_timing_ts(struct timeb* startTimeb); 14 | unsigned int end_timing_ts(struct timeb* startTimeb); 15 | #else 16 | void start_timing_ts(struct timeval* start); 17 | unsigned int end_timing_ts(struct timeval* start); 18 | #endif 19 | unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time); 20 | #endif 21 | 
-------------------------------------------------------------------------------- /CoreClockChecker/BoostClockChecker.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | extern uint64_t clktsctest(uint64_t iterations) __attribute((ms_abi)); 10 | /* Calls clktsctest() `samples` times, calibrates its counter against gettimeofday(), then prints one CSV row per sample. Options: -samples N, -iterations N, -sleep SECONDS. */ 11 | int main(int argc, char *argv[]) { 12 | struct timeval startTv, endTv; 13 | uint64_t iterations = 500000, samples = 100; 14 | unsigned int sleepSeconds = 5; 15 | time_t time_diff_ms; 16 | 17 | for (int argIdx = 1; argIdx < argc; argIdx++) { 18 | if (*(argv[argIdx]) == '-' && argIdx + 1 < argc) { /* every option takes a value; a trailing option without one is ignored instead of reading argv[argc] */ 19 | char *arg = argv[argIdx] + 1; 20 | if (strncmp(arg, "samples", 7) == 0) { 21 | argIdx++; 22 | samples = strtoull(argv[argIdx], NULL, 10); 23 | } else if (strncmp(arg, "iterations", 10) == 0) { 24 | argIdx++; 25 | iterations = strtoull(argv[argIdx], NULL, 10); 26 | } else if (strncmp(arg, "sleep", 5) == 0) { 27 | argIdx++; 28 | sleepSeconds = atoi(argv[argIdx]); 29 | } 30 | } 31 | } 32 | 33 | sleep(sleepSeconds); 34 | uint64_t *measuredTscs = malloc(samples * sizeof(uint64_t)); 35 | if (measuredTscs == NULL && samples > 0) { fprintf(stderr, "Failed to allocate %llu samples\n", (unsigned long long)samples); return 1; } 36 | for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) { 37 | uint64_t elapsedTsc = clktsctest(iterations); 38 | measuredTscs[sampleIdx] = elapsedTsc; 39 | } 40 | 41 | fprintf(stderr, "Used %llu samples\n", (unsigned long long)samples); 42 | fprintf(stderr, "Used %llu iterations\n", (unsigned long long)iterations); 43 | // figure out TSC to real time ratio 44 | fprintf(stderr, "Checking TSC ratio...\n"); 45 | uint64_t iterationsHi = 8e9; // should be a couple seconds at least?
46 | gettimeofday(&startTv, NULL); 47 | uint64_t referenceElapsedTsc = clktsctest(iterationsHi); 48 | gettimeofday(&endTv, NULL); 49 | time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); 50 | float tsc_per_ms = (float)referenceElapsedTsc / (float)time_diff_ms; 51 | float tsc_per_ns = tsc_per_ms / 1e6; 52 | fprintf(stderr, "TSC = %llu, elapsed ms = %lld\n", (unsigned long long)referenceElapsedTsc, (long long)time_diff_ms); 53 | fprintf(stderr, "TSC per ms: %f, TSC per ns: %f\n", tsc_per_ms, tsc_per_ns); 54 | 55 | printf("Time (ms), Clk (GHz), TSC\n"); 56 | float elapsedTime = 0; 57 | for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) { 58 | // tsc / (tsc / ms) = ms 59 | float elapsedTimeMs = measuredTscs[sampleIdx] / tsc_per_ms; 60 | elapsedTime += elapsedTimeMs; 61 | float latency = 1e6 * elapsedTimeMs / (float)iterations; 62 | float addsPerNs = 1 / latency; 63 | printf("%f,%f,%llu\n", elapsedTime, addsPerNs, (unsigned long long)measuredTscs[sampleIdx]); 64 | } 65 | free(measuredTscs); 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /CoreClockChecker/BoostClockChecker_arm.s: -------------------------------------------------------------------------------- 1 | .text 2 | .global clktsctest 3 | 4 | .global _clktsctest 5 | 6 | .balign 4 7 | 8 | /* x0 = iterations, return elapsed TSC in x0 */ 9 | _clktsctest: 10 | clktsctest: 11 | sub sp, sp, #0x40 12 | stp x10, x11, [sp, #0x10] 13 | stp x12, x13, [sp, #0x20] 14 | stp x14, x15, [sp, #0x30] 15 | mov x10, 1 16 | mov x11, 20 17 | mov x12, 0 18 | /* stackoverflow says this is a good idea */ 19 | mrs x14, cntvct_el0 20 | clktsctest_loop: 21 | add x12, x12, x10 22 | add x12, x12, x10 23 | add x12, x12, x10 24 | add x12, x12, x10 25 | add x12, x12, x10 26 | add x12, x12, x10 27 | add x12, x12, x10 28 | add x12, x12, x10 29 | add x12, x12, x10 30 | add x12, x12, x10 31 | add x12, x12, x10 32 | add x12, x12, x10 33 | add x12, x12, x10 34 | add x12, x12, x10 35 | add x12, x12, x10
36 | add x12, x12, x10 37 | add x12, x12, x10 38 | add x12, x12, x10 39 | add x12, x12, x10 40 | add x12, x12, x10 41 | subs x0, x0, x11 /* flag-setting subtract so the branch below can test the sign */ 42 | b.gt clktsctest_loop /* stop at <= 0 so a count that is not a multiple of 20 still terminates */ 43 | mrs x15, cntvct_el0 44 | sub x0, x15, x14 45 | ldp x14, x15, [sp, #0x30] 46 | ldp x12, x13, [sp, #0x20] 47 | ldp x10, x11, [sp, #0x10] 48 | add sp, sp, #0x40 49 | ret 50 | -------------------------------------------------------------------------------- /CoreClockChecker/BoostClockChecker_x86.s: -------------------------------------------------------------------------------- 1 | .global clktsctest 2 | 3 | /* rcx = iterations, return elapsed TSC in rax */ 4 | clktsctest: 5 | push %rdx 6 | push %rbx 7 | push %r8 8 | push %r9 9 | push %r10 10 | mov %rcx, %r11 /* loop counter in r11: caller-saved in the Microsoft x64 ABI, unlike rdi which the callee must preserve */ 11 | mov $1, %r8 12 | mov $20, %r9 13 | xor %rbx, %rbx 14 | rdtsc /* high 32 bits in EDX, low 32 bits in EAX */ 15 | shl $32, %rdx /* shift high 32 bits into upper half of RDX */ 16 | add %rax, %rdx /* place full 64-bit value in rdx */ 17 | mov %rdx, %r10 18 | clktsctest_loop: 19 | add %r8, %rbx 20 | add %r8, %rbx 21 | add %r8, %rbx 22 | add %r8, %rbx 23 | add %r8, %rbx 24 | add %r8, %rbx 25 | add %r8, %rbx 26 | add %r8, %rbx 27 | add %r8, %rbx 28 | add %r8, %rbx 29 | add %r8, %rbx 30 | add %r8, %rbx 31 | add %r8, %rbx 32 | add %r8, %rbx 33 | add %r8, %rbx 34 | add %r8, %rbx 35 | add %r8, %rbx 36 | add %r8, %rbx 37 | add %r8, %rbx 38 | add %r8, %rbx 39 | sub %r9, %r11 40 | jg clktsctest_loop /* stop at <= 0 so a count that is not a multiple of 20 still terminates */ 41 | rdtsc 42 | shl $32, %rdx 43 | add %rdx, %rax /* now rax has the new value */ 44 | sub %r10, %rax /* subtract old TSC value from the new one, which should be larger */ 45 | pop %r10 46 | pop %r9 47 | pop %r8 48 | pop %rbx 49 | pop %rdx 50 | ret 51 | -------------------------------------------------------------------------------- /CoreClockChecker/CoreClockChecker_x86.s: -------------------------------------------------------------------------------- 1 | .global clktest 2 | 3 | /* 4 | %rdi = arg0 = iteration count 5 | */ 6 | clktest: 7 | push %rbx 8 |
push %r8 9 | push %r9 10 | mov $1, %r8 11 | mov $20, %r9 12 | xor %rbx, %rbx 13 | clktest_loop: 14 | add %r8, %rbx 15 | add %r8, %rbx 16 | add %r8, %rbx 17 | add %r8, %rbx 18 | add %r8, %rbx 19 | add %r8, %rbx 20 | add %r8, %rbx 21 | add %r8, %rbx 22 | add %r8, %rbx 23 | add %r8, %rbx 24 | add %r8, %rbx 25 | add %r8, %rbx 26 | add %r8, %rbx 27 | add %r8, %rbx 28 | add %r8, %rbx 29 | add %r8, %rbx 30 | add %r8, %rbx 31 | add %r8, %rbx 32 | add %r8, %rbx 33 | add %r8, %rbx 34 | sub %r9, %rdi 35 | jnz clktest_loop 36 | pop %r9 37 | pop %r8 38 | pop %rbx 39 | ret 40 | -------------------------------------------------------------------------------- /CoreClockChecker/Makefile: -------------------------------------------------------------------------------- 1 | include ../Common/arch_detect.mk 2 | 3 | CFLAGS = -O3 4 | LDFLAGS = -lm 5 | 6 | all: $(TARGET) 7 | 8 | amd64: 9 | $(CC) $(CFLAGS) -pthread CoreClockChecker.c CoreClockChecker_x86.s -o CoreClockChecker_amd64 $(LDFLAGS) 10 | $(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_amd64 $(LDFLAGS) 11 | 12 | aarch64: 13 | $(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_arm.s -o BoostClockChecker_aarch64 $(LDFLAGS) 14 | 15 | w64: 16 | $(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_w64.exe $(LDFLAGS) 17 | 18 | ci: amd64 aarch64 w64 19 | 20 | clean: 21 | rm -f *.o && find . 
-type f -executable -delete 22 | 23 | .PHONY: all ci clean 24 | -------------------------------------------------------------------------------- /CoreClockChecker/WinCoreClockChecker/CoreClockCheckFunctions.asm: -------------------------------------------------------------------------------- 1 | section .text 2 | bits 64 3 | 4 | global clktest 5 | 6 | ; rcx = iteration count 7 | ; rdx = address of memory location to monitor 8 | ; return elapsed tsc 9 | clktest: 10 | push rdx 11 | push rbx 12 | push r8 13 | push r9 14 | push r10 15 | push r11 16 | xor rbx, rbx 17 | mov r8, 1 ; GLC will eliminate adds with immediates or increments 18 | clktest_loop: 19 | add rbx, r8 20 | add rbx, r8 21 | add rbx, r8 22 | add rbx, r8 23 | add rbx, r8 24 | add rbx, r8 25 | add rbx, r8 26 | add rbx, r8 27 | add rbx, r8 28 | add rbx, r8 29 | add rbx, r8 30 | add rbx, r8 31 | add rbx, r8 32 | add rbx, r8 33 | add rbx, r8 34 | add rbx, r8 35 | add rbx, r8 36 | add rbx, r8 37 | add rbx, r8 38 | add rbx, r8 39 | mov r11d, [rdx] 40 | test r11d, r11d 41 | jnz clktest_loop_end ; early exit condition (someone else exited) 42 | sub rcx, 20 43 | jg clktest_loop 44 | mov [rdx], r8 45 | clktest_loop_end: 46 | mov rax, rbx 47 | pop r11 48 | pop r10 49 | pop r9 50 | pop r8 51 | pop rbx 52 | pop rdx 53 | ret -------------------------------------------------------------------------------- /CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.9.34723.18 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WinCoreClockChecker", "WinCoreClockChecker.vcxproj", "{D70EC1DD-794C-4156-8483-227E566CC76B}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 
12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.ActiveCfg = Debug|x64 17 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.Build.0 = Debug|x64 18 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.ActiveCfg = Debug|Win32 19 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.Build.0 = Debug|Win32 20 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.ActiveCfg = Release|x64 21 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.Build.0 = Release|x64 22 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.ActiveCfg = Release|Win32 23 | {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {6AA7051E-EAEF-48CA-9C08-8641D57B3EB1} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | 23 | 24 | Source Files 25 | 26 | 27 | -------------------------------------------------------------------------------- /GpuMemLatency/Makefile: -------------------------------------------------------------------------------- 1 | include ../Common/arch_detect.mk 2 | 3 | OCL_VER = v2023.04.17 4 | 
CI_SCRIPT = ../Common/ci_gpumemlatency.sh 5 | # DEPS: headers whose modification should rebuild every object file 6 | CFLAGS = -O3 -I ../Common 7 | DEPS = ../Common/timing.h 8 | OBJ = opencltest.o latency_test.o bw_test.o common.o atomic_test.o instruction_rate.o timing.o 9 | LDFLAGS ?= -lm -lOpenCL 10 | ifeq ($(TARGET), Darwin) 11 | LDFLAGS = -lm -framework OpenCL 12 | endif 13 | 14 | all: $(TARGET) 15 | 16 | GpuMemLatency: $(OBJ) 17 | $(CC) $(CPPFLAGS) $(CFLAGS) $^ -o $@ $(LDFLAGS) 18 | 19 | %.o: %.c $(DEPS) 20 | $(CC) $(CFLAGS) -c -o $@ $< 21 | # timing.c lives in ../Common, so the pattern rule above cannot build it 22 | timing.o: ../Common/timing.c $(DEPS) 23 | $(CC) $(CFLAGS) -c ../Common/timing.c -o timing.o 24 | 25 | amd64: $(OBJ) 26 | $(CC) $(CFLAGS) $^ -o GpuMemLatency_amd64 $(LDFLAGS) 27 | 28 | aarch64: $(OBJ) 29 | $(CC) $(CFLAGS) $^ -o GpuMemLatency_aarch64 $(LDFLAGS) 30 | 31 | riscv64: $(OBJ) 32 | $(CC) $(CFLAGS) $^ -o GpuMemLatency_riscv64 $(LDFLAGS) 33 | 34 | w64: $(OBJ) 35 | $(CC) $(CFLAGS) $^ -o GpuMemLatency_w64.exe $(LDFLAGS) 36 | 37 | darwin: $(OBJ) 38 | $(CC) $(CFLAGS) $^ -o GpuMemLatency_darwin $(LDFLAGS) 39 | 40 | ci: clean 41 | @OCL_VER=$(OCL_VER) sh $(CI_SCRIPT) 42 | # globs are left unquoted so the shell expands them 43 | clean-ci: 44 | rm -rf *.deb *.zip ocl-icd-* OpenCL-SDK-* 45 | 46 | clean-obj: 47 | rm -f *.o 48 | 49 | clean: clean-ci clean-obj 50 | find . -type f -executable -delete 51 | 52 | .PHONY: all amd64 aarch64 riscv64 w64 darwin ci clean-ci clean-obj clean 53 | -------------------------------------------------------------------------------- /GpuMemLatency/OpenCL/README.md: -------------------------------------------------------------------------------- 1 | # OpenCLTM API Headers 2 | 3 | This repository contains C language headers for the OpenCL API. 4 | 5 | The authoritative public repository for these headers is located at: 6 | 7 | https://github.com/KhronosGroup/OpenCL-Headers 8 | 9 | Issues, proposed fixes for issues, and other suggested changes should be 10 | created using Github. 11 | 12 | ## Branch Structure 13 | 14 | The OpenCL API headers in this repository are Unified headers and are designed 15 | to work with all released OpenCL versions.
This differs from previous OpenCL 16 | API headers, where version-specific API headers either existed in separate 17 | branches, or in separate folders in a branch. 18 | 19 | ## Compiling for a Specific OpenCL Version 20 | 21 | By default, the OpenCL API headers in this repository are for the latest 22 | OpenCL version (currently OpenCL 2.2). To use these API headers to target 23 | a different OpenCL version, an application may `#define` the preprocessor 24 | value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers. 25 | The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing 26 | the OpenCL API version. 27 | 28 | For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may 29 | include the OpenCL API headers as follows: 30 | 31 | ``` 32 | #define CL_TARGET_OPENCL_VERSION 120 33 | #include <CL/opencl.h> 34 | ``` 35 | 36 | ## Directory Structure 37 | 38 | ``` 39 | README.md This file 40 | LICENSE Source license for the OpenCL API headers 41 | CL/ Unified OpenCL API headers tree 42 | ``` 43 | 44 | ## License 45 | 46 | See [LICENSE](LICENSE). 47 | 48 | --- 49 | 50 | OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos. 51 | -------------------------------------------------------------------------------- /GpuMemLatency/OpenCL/include/CL/cl_gl_ext.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_CL_GL_EXT_H 18 | #define __OPENCL_CL_GL_EXT_H 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #include 25 | 26 | /* 27 | * cl_khr_gl_event extension 28 | */ 29 | #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D 30 | 31 | extern CL_API_ENTRY cl_event CL_API_CALL 32 | clCreateEventFromGLsyncKHR(cl_context context, 33 | cl_GLsync cl_GLsync, 34 | cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; 35 | 36 | #ifdef __cplusplus 37 | } 38 | #endif 39 | 40 | #endif /* __OPENCL_CL_GL_EXT_H */ 41 | -------------------------------------------------------------------------------- /GpuMemLatency/OpenCL/include/CL/opencl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_H 18 | #define __OPENCL_H 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif /* __OPENCL_H */ 34 | -------------------------------------------------------------------------------- /GpuMemLatency/OpenCL/lib/OpenCL.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/GpuMemLatency/OpenCL/lib/OpenCL.lib -------------------------------------------------------------------------------- /GpuMemLatency/kernels/atomic_exec_latency_test.cl: -------------------------------------------------------------------------------- 1 | __kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) { 2 | int current = get_global_id(0) + 1; 3 | while (current <= 2 * count) { 4 | if (atomic_cmpxchg(A, current - 1, current) == current - 1) { 5 | current += 2; 6 | } 7 | } 8 | } 9 | 10 | __kernel void atomic_add_test(__global int *A, int count) { 11 | int addend = get_global_id(0); 12 | int addend1 = addend + 5; 13 | int addend2 = addend + 6; 14 | int addend3 = addend + 7; 15 | int addend4 = addend + 8; 16 | int addend5 = addend + 9; 17 | int addend6 = addend + 10; 18 | int addend7 = addend + 11; 19 | __global int *target = A + get_global_id(0); 20 | for (int i = 0; i < count; i++) 21 | { 22 | atomic_add(target, addend); 23 | atomic_add(target, addend1); 24 | atomic_add(target, addend2); 25 | atomic_add(target, addend3); 26 | atomic_add(target, addend4); 27 | atomic_add(target, addend5); 28 | atomic_add(target, addend6); 29 | atomic_add(target, addend7); 30 | } 31 | } 
-------------------------------------------------------------------------------- /GpuMemLatency/kernels/buffer_bw_test.cl: -------------------------------------------------------------------------------- 1 | #define fixed_tex_test_size 1024 2 | __kernel void buffer_bw_test(__read_only image1d_buffer_t A, uint count, __global float* ret) { 3 | int threadId = get_global_id(0); 4 | int localId = get_local_id(0); 5 | int localSize = get_local_size(0); 6 | int groupId = get_group_id(0); 7 | uint4 acc1 = read_imageui(A, 0); 8 | uint4 acc2 = read_imageui(A, 1); 9 | uint4 acc3 = read_imageui(A, 2); 10 | uint4 acc4 = read_imageui(A, 3); 11 | 12 | int idx0 = localId; 13 | int idx1 = localId + localSize; 14 | int idx2 = localId + localSize * 2; 15 | 16 | // Each read_imageui reads out a 4-wide vector 17 | for (int i = 0; i < count; i += 16) { 18 | read_imageui(A, idx0); 19 | acc1 += read_imageui(A, idx0); 20 | acc2 += read_imageui(A, idx1); 21 | acc3 += read_imageui(A, idx2); 22 | acc4 += read_imageui(A, idx0 + 1); 23 | idx0 = (idx0 + localSize) & 0x3FF; 24 | idx1 = (idx1 + localSize) & 0x3FF; 25 | idx2 = (idx2 + localSize) & 0x3FF; 26 | } 27 | 28 | float4 out1 = convert_float4(acc1); 29 | float4 out2 = convert_float4(acc2); 30 | float4 out3 = convert_float4(acc3); 31 | float4 out4 = convert_float4(acc4); 32 | ret[threadId] = dot(out1, out2) + dot(out3, out4); 33 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/c2c_atomic_exec_latency_test.cl: -------------------------------------------------------------------------------- 1 | // hoping each thread/workgroup lands on a different CU 2 | // A = pointer to location being bounced around 3 | // count = iterations 4 | // ret = sink 5 | // t1 = id of thread 1 6 | // t2 = id of thread 2 7 | __kernel void c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) { 8 | int global_id = get_global_id(0); 9 | int current = 0; 10 | if (global_id == 
t1) current = 1; 11 | else if (global_id == t2) current = 2; 12 | 13 | if (global_id == t1 || global_id == t2) { 14 | //printf("gid: %d, t1: %d, t2: %d, A: %d, current = %d\n", global_id, t1, t2, *A, current); 15 | while (current <= 2 * count) { 16 | if (atomic_cmpxchg(A, current - 1, current) == current - 1) { 17 | current += 2; 18 | } 19 | } 20 | ret[0] = current; 21 | } 22 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/constant_unrolled_latency_test.cl: -------------------------------------------------------------------------------- 1 | // latency test like the unrolled one above, but with input as constant memory 2 | __kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) { 3 | //int current = A[0]; 4 | int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0]; 5 | int result = 0; // accumulator written to ret[0]; zero-initialized so the sum is defined 6 | for (int i = 0; i < count; i += 10) { 7 | result += current; 8 | current = A[current]; 9 | result += current; 10 | current = A[current]; 11 | result += current; 12 | current = A[current]; 13 | result += current; 14 | current = A[current]; 15 | result += current; 16 | current = A[current]; 17 | result += current; 18 | current = A[current]; 19 | result += current; 20 | current = A[current]; 21 | result += current; 22 | current = A[current]; 23 | result += current; 24 | current = A[current]; 25 | result += current; 26 | current = A[current]; 27 | } 28 | 29 | ret[0] = result; 30 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/local_64_bw_test.cl: -------------------------------------------------------------------------------- 1 | #define local64_test_size 2048 // size was given in 4B elements.
This test uses 8B 2 | __kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) { 3 | __local ulong local_a[local64_test_size]; 4 | int threadId = get_global_id(0); 5 | int localId = get_local_id(0); 6 | int localSize = get_local_size(0); 7 | int groupId = get_group_id(0); 8 | 9 | // workgroup-wide copy from global mem into local mem 10 | for (int i = get_local_id(0);i < local64_test_size; i += get_local_size(0)) 11 | local_a[i] = A[i]; 12 | barrier(CLK_LOCAL_MEM_FENCE); 13 | 14 | ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0; 15 | 16 | // assumes local memory size is at least 512x 64-bit uints 17 | int idx0 = localId; 18 | int idx1 = localId + localSize; 19 | for (int i = 0; i < count; i += 8) { 20 | acc0 ^= local_a[idx0]; 21 | acc1 ^= local_a[idx1]; 22 | idx0 = (idx0 + localSize) & 0x1FF; 23 | idx1 = (idx1 + localSize) & 0x1FF; 24 | // second pair of loads goes to acc2/acc3 so all four declared accumulators feed the result 25 | acc2 ^= local_a[idx0]; 26 | acc3 ^= local_a[idx1]; 27 | idx0 = (idx0 + localSize) & 0x1FF; 28 | idx1 = (idx1 + localSize) & 0x1FF; 29 | } 30 | 31 | ret[threadId] = acc0 + acc1 + acc2 + acc3; 32 | } 33 | -------------------------------------------------------------------------------- /GpuMemLatency/kernels/local_atomic_latency_test.cl: -------------------------------------------------------------------------------- 1 | __kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) { 2 | __local int a[1]; 3 | int current = get_global_id(0) + 1; 4 | if (current == 1) a[0] = A[0]; 5 | barrier(CLK_LOCAL_MEM_FENCE); 6 | 7 | while (current <= 2 * count) { 8 | if (atomic_cmpxchg(a, current - 1, current) == current - 1) { 9 | current += 2; 10 | } 11 | } 12 | } 13 | 14 | #define local_atomic_add_wg_size 256 15 | __kernel void local_atomic_add_test(__global int *A, int count) { 16 | __local int local_a[local_atomic_add_wg_size]; 17 | local_a[get_local_id(0)] = A[get_global_id(0)]; 18 | barrier(CLK_LOCAL_MEM_FENCE); 19 | 20 | int addend = get_global_id(0); 21 | int addend1 = addend + 5; 22 |
int addend2 = addend + 6; 23 | int addend3 = addend + 7; 24 | int addend4 = addend + 8; 25 | int addend5 = addend + 9; 26 | int addend6 = addend + 10; 27 | int addend7 = addend + 11; 28 | __local int *target = local_a + get_local_id(0); 29 | for (int i = 0; i < count; i++) 30 | { 31 | atomic_add(target, addend); 32 | atomic_add(target, addend1); 33 | atomic_add(target, addend2); 34 | atomic_add(target, addend3); 35 | atomic_add(target, addend4); 36 | atomic_add(target, addend5); 37 | atomic_add(target, addend6); 38 | atomic_add(target, addend7); 39 | } 40 | 41 | A[get_global_id(0)] = local_a[get_local_id(0)]; 42 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/local_bw_test.cl: -------------------------------------------------------------------------------- 1 | #define local_mem_bw_test_size 1024 2 | // test bandwidth with local memory. A must be at least local_mem_bw_test_size in floats 3 | __kernel void local_bw_test(__global float* A, uint count, __global float* ret) { 4 | __local float local_a[local_mem_bw_test_size]; 5 | int threadId = get_global_id(0); 6 | int localId = get_local_id(0); 7 | int localSize = get_local_size(0); 8 | int groupId = get_group_id(0); 9 | float acc1 = 1.1; 10 | float acc2 = 2.2; 11 | float acc3 = 3.3; 12 | float acc4 = 4.4; 13 | 14 | //printf("subgroup size %d\n", get_sub_group_size()); 15 | 16 | // workgroup-wide copy from global mem into local mem 17 | for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0)) 18 | local_a[i] = A[i]; 19 | barrier(CLK_LOCAL_MEM_FENCE); 20 | 21 | // assumes local memory size is at least 1024 float4s 22 | int idx0 = localId; 23 | int idx1 = localId + localSize; 24 | int idx2 = localId + localSize * 2; 25 | for (int i = 0; i < count; i += 12) { 26 | acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2]; 27 | idx0 = (idx0 + localSize) & 0x3FF; 28 | idx1 = (idx1 + localSize) & 0x3FF; 29 | idx2 = (idx2 + localSize) & 0x3FF; 30 
| 31 | acc2 += local_a[idx0] * local_a[idx1] + local_a[idx2]; 32 | idx0 = (idx0 + localSize) & 0x3FF; 33 | idx1 = (idx1 + localSize) & 0x3FF; 34 | idx2 = (idx2 + localSize) & 0x3FF; 35 | 36 | acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2]; 37 | idx0 = (idx0 + localSize) & 0x3FF; 38 | idx1 = (idx1 + localSize) & 0x3FF; 39 | idx2 = (idx2 + localSize) & 0x3FF; 40 | 41 | acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2]; 42 | idx0 = (idx0 + localSize) & 0x3FF; 43 | idx1 = (idx1 + localSize) & 0x3FF; 44 | idx2 = (idx2 + localSize) & 0x3FF; 45 | } 46 | 47 | ret[threadId] = acc1 + acc2 + acc3 + acc4; 48 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/local_float4_bw_test.cl: -------------------------------------------------------------------------------- 1 | #define local_mem_bw_test_size 1024 2 | __kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) { 3 | __local float4 local_a[local_mem_bw_test_size]; 4 | int threadId = get_global_id(0); 5 | int localId = get_local_id(0); 6 | int localSize = get_local_size(0); 7 | int groupId = get_group_id(0); 8 | float4 acc1 = A[get_global_id(0) & 0x3FF]; 9 | float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF]; 10 | float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF]; 11 | float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF]; 12 | 13 | // workgroup-wide copy from global mem into local mem 14 | for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0)) 15 | local_a[i] = A[i]; 16 | barrier(CLK_LOCAL_MEM_FENCE); 17 | 18 | // assumes local memory size is at least 1024 float4s 19 | int idx0 = localId; 20 | int idx1 = localId + localSize; 21 | int idx2 = localId + localSize * 2; 22 | for (int i = 0; i < count; i += (12*4)) { 23 | acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2]; 24 | idx0 = (idx0 + localSize) & 0x3FF; 25 | idx1 = (idx1 + localSize) & 0x3FF; 26 | idx2 = (idx2 + localSize) & 0x3FF; 27 | 28 | acc2 += 
local_a[idx0] * local_a[idx1] + local_a[idx2]; 29 | idx0 = (idx0 + localSize) & 0x3FF; 30 | idx1 = (idx1 + localSize) & 0x3FF; 31 | idx2 = (idx2 + localSize) & 0x3FF; 32 | 33 | acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2]; 34 | idx0 = (idx0 + localSize) & 0x3FF; 35 | idx1 = (idx1 + localSize) & 0x3FF; 36 | idx2 = (idx2 + localSize) & 0x3FF; 37 | 38 | acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2]; 39 | idx0 = (idx0 + localSize) & 0x3FF; 40 | idx1 = (idx1 + localSize) & 0x3FF; 41 | idx2 = (idx2 + localSize) & 0x3FF; 42 | } 43 | 44 | ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4); 45 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/local_unrolled_latency_test.cl: -------------------------------------------------------------------------------- 1 | #define local_mem_test_size 1024 2 | // uses local memory (LDS/shmem) 3 | __kernel void local_unrolled_latency_test(__global const int* A, int count, __global int* ret) { 4 | __local int local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite? 
5 | // better be fast 6 | for (int i = get_local_id(0);i < local_mem_test_size; i += get_local_size(0)) 7 | local_a[i] = A[i]; 8 | barrier(CLK_LOCAL_MEM_FENCE); 9 | 10 | // everyone else can chill/get masked off 11 | if (get_local_id(0) == 0) { 12 | int current = local_a[0]; 13 | int result = 0; // accumulator written to ret[0]; zero-initialized so the sum is defined 14 | for (int i = 0; i < count; i += 10) { 15 | result += current; 16 | current = local_a[current]; 17 | result += current; 18 | current = local_a[current]; 19 | result += current; 20 | current = local_a[current]; 21 | result += current; 22 | current = local_a[current]; 23 | result += current; 24 | current = local_a[current]; 25 | result += current; 26 | current = local_a[current]; 27 | result += current; 28 | current = local_a[current]; 29 | result += current; 30 | current = local_a[current]; 31 | result += current; 32 | current = local_a[current]; 33 | result += current; 34 | current = local_a[current]; 35 | } 36 | 37 | ret[0] = result; 38 | } 39 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/scalar_unrolled_latency_test.cl: -------------------------------------------------------------------------------- 1 | // Ensures the loaded value will be constant across a workgroup 2 | __kernel void scalar_unrolled_latency_test(__global const int* A, int count, __global int* ret) { 3 | int current = get_num_groups(0) > 1 ?
ret[get_group_id(0) * get_local_size(0)]: A[0]; 4 | int result = 0; // accumulator written to ret[0]; zero-initialized so the sum is defined 5 | for (int i = 0; i < count; i += 10) { 6 | result += current; 7 | current = A[current]; 8 | result += current; 9 | current = A[current]; 10 | result += current; 11 | current = A[current]; 12 | result += current; 13 | current = A[current]; 14 | result += current; 15 | current = A[current]; 16 | result += current; 17 | current = A[current]; 18 | result += current; 19 | current = A[current]; 20 | result += current; 21 | current = A[current]; 22 | result += current; 23 | current = A[current]; 24 | result += current; 25 | current = A[current]; 26 | } 27 | 28 | ret[0] = result; 29 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/sum_bw_test.cl: -------------------------------------------------------------------------------- 1 | __kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) { 2 | int threadId = get_global_id(0); 3 | int localId = get_local_id(0); 4 | int localSize = get_local_size(0); 5 | int groupId = get_group_id(0); 6 | float4 result1 = (float4)(0.1f,0.2f,0.3f,0.4f); // vector literals need the (float4) cast; a bare (a,b,c,d) is a comma expression 7 | float4 result2 = (float4)(1.1f,1.2f,1.3f,1.4f); 8 | float4 result3 = (float4)(2.1f,2.2f,2.3f,2.4f); 9 | float4 result4 = (float4)(3.0f,3.1f,3.2f,3.3f); 10 | float4 result5 = (float4)(4.0f,4.2f,4.1f,4.3f); 11 | 12 | int initialIdx = startPositions[threadId]; 13 | //int initialIdx = (groupId * skip * localSize + localId) % (float4size - 1); 14 | //startPositions[threadId] = initialIdx; // for debugging 15 | 16 | int idx = initialIdx; 17 | __global float4 *B = (__global float4 *)A; 18 | for (int i = 0; i < count; i += 20) { 19 | result1 += B[idx]; 20 | idx += localSize; 21 | if (idx >= float4size) idx = initialIdx; 22 | 23 | result2 += B[idx]; 24 | idx += localSize; 25 | if (idx >= float4size) idx = initialIdx; 26 | 27 | result3 += B[idx]; 28 | idx += localSize; 29 | if (idx >= float4size) idx = initialIdx; 30 | 31 | result4 +=
B[idx]; 32 | idx += localSize; 33 | if (idx >= float4size) idx = initialIdx; 34 | 35 | result5 += B[idx]; 36 | idx += localSize; 37 | if (idx >= float4size) idx = initialIdx; 38 | } 39 | 40 | ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5); 41 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/tex_bw_test.cl: -------------------------------------------------------------------------------- 1 | __constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float) 2 | CLK_ADDRESS_REPEAT | // going out of bounds = replicate 3 | CLK_FILTER_NEAREST; 4 | __kernel void tex_bw_test(__read_only image2d_t A, int count, __global float* ret) { 5 | int localId = get_local_id(0); 6 | float pos = get_global_id(0) * native_recip((float)get_global_size(0)); 7 | float2 increment; 8 | increment.x = 0.01; // guessing 9 | increment.y = 0.01; 10 | 11 | float2 current0, current1, current2, current3; 12 | current0.x = pos; 13 | current0.y = pos; 14 | current1.x = 0.1 + (localId / 10000); 15 | current1.y = 0.1 + (localId / 10000); 16 | current2.x = 0.01 + (localId / 10000); 17 | current2.y = 0.01 + (localId / 10000); 18 | current3.x = 0.002 + (localId / 5000); 19 | current3.y = 0.001 + (localId / 5000); 20 | 21 | float4 tmp0 = read_imagef(A, funny_sampler, current0); 22 | float4 tmp1 = read_imagef(A, funny_sampler, current1); 23 | float4 tmp2 = read_imagef(A, funny_sampler, current2); 24 | float4 tmp3 = read_imagef(A, funny_sampler, current3); 25 | for (int i = 0; i < count; i += 4) 26 | { 27 | tmp0 += read_imagef(A, funny_sampler, current0); 28 | tmp1 += read_imagef(A, funny_sampler, current1); 29 | tmp2 += read_imagef(A, funny_sampler, current2); 30 | tmp3 += read_imagef(A, funny_sampler, current3); 31 | current0 += increment; 32 | current1 += increment; 33 | current2 += increment; 34 | current3 += increment; 35 | } 36 | 37 | *ret = dot(tmp0, tmp1) + dot(tmp2, 
tmp3); 38 | } -------------------------------------------------------------------------------- /GpuMemLatency/kernels/tex_latency_test.cl: -------------------------------------------------------------------------------- 1 | __kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) { 2 | int localId = get_local_id(0); 3 | // uint4 current = read_imageui(A, direct_sampler, 0); // using sampler screws things up 4 | int startPos = get_global_size(0) > 1 ? ret[get_global_id(0)] : 0; 5 | uint4 current = read_imageui(A, startPos); 6 | // printf("start x: %u -> %u\n", startPos, current.x); 7 | for (int i = 0; i < count; i += 10) { 8 | // printf("current: %u %u %u %u, address: %d\n", current.x, current.y, current.z, current.w, (int)current.x / 4); 9 | //current = read_imageui(A, direct_sampler, i); 10 | current = read_imageui(A, current.x); 11 | current = read_imageui(A, current.x); 12 | current = read_imageui(A, current.x); 13 | current = read_imageui(A, current.x); 14 | current = read_imageui(A, current.x); 15 | current = read_imageui(A, current.x); 16 | current = read_imageui(A, current.x); 17 | current = read_imageui(A, current.x); 18 | current = read_imageui(A, current.x); 19 | current = read_imageui(A, current.x); 20 | //printf("%d: current read: %u %u %u %u\n", i, current.x, current.y, current.z, current.w); 21 | // local_a[localId] = current; 22 | } 23 | 24 | ret[get_global_id(0)] = current.x; 25 | } 26 | -------------------------------------------------------------------------------- /GpuMemLatency/kernels/unrolled_latency_test.cl: -------------------------------------------------------------------------------- 1 | // unrolled until terascale no longer saw further improvement (10x unroll) 2 | // assumes count will be a multiple of 10. 
but it won't be too inaccurate with a big count 3 | // not divisible by 10 4 | __kernel void unrolled_latency_test(__global const int* A, int count, __global int* ret) { 5 | int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0]; // this will test vector latency on AMD. Set to A[0] for scalar latency 6 | int result = 0; // must start defined: accumulated with += below and stored to ret[0] 7 | for (int i = 0; i < count; i += 10) { 8 | result += current; 9 | current = A[current]; 10 | result += current; 11 | current = A[current]; 12 | result += current; 13 | current = A[current]; 14 | result += current; 15 | current = A[current]; 16 | result += current; 17 | current = A[current]; 18 | result += current; 19 | current = A[current]; 20 | result += current; 21 | current = A[current]; 22 | result += current; 23 | current = A[current]; 24 | result += current; 25 | current = A[current]; 26 | result += current; 27 | current = A[current]; 28 | } 29 | 30 | ret[0] = result; 31 | } -------------------------------------------------------------------------------- /GpuMemLatency/local_mem_latency_kernel.cl: -------------------------------------------------------------------------------- 1 | // for testing total local memory capacity by seeing when threads can no longer overlap in time 2 | // due to local mem capacity limits across the GPU 3 | // calling code expected to define LATENCY_LOCAL_MEM_SIZE 4 | __kernel void unrolled_latency_test_localmem(__global const int* A, int count, __global int* ret) { 5 | __local int local_a[LATENCY_LOCAL_MEM_SIZE]; 6 | int start = A[0]; // this will test scalar latency, always 7 | int current = A[start]; 8 | int result = 0; // initialized so the += accumulation below never reads an indeterminate value 9 | for (int i = 0; i < count; i += 10) { 10 | result += current; 11 | current = A[current]; 12 | result += current; 13 | current = A[current]; 14 | result += current; 15 | current = A[current]; 16 | result += current; 17 | current = A[current]; 18 | result += current; 19 | current = A[current]; 20 | result += current; 21 | current = A[current]; 22 | result += current; 23 | current = 
A[current]; 24 | result += current; 25 | current = A[current]; 26 | result += current; 27 | current = A[current]; 28 | result += current; 29 | current = A[current]; 30 | local_a[i & (LATENCY_LOCAL_MEM_SIZE - 1)] = current; 31 | } 32 | 33 | ret[0] = local_a[current & (LATENCY_LOCAL_MEM_SIZE - 1)]; 34 | } 35 | -------------------------------------------------------------------------------- /GpuMemLatency/opencltest.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.30503.244 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "opencltest", "opencltest.vcxproj", "{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.ActiveCfg = Debug|x64 17 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.Build.0 = Debug|x64 18 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.ActiveCfg = Debug|Win32 19 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.Build.0 = Debug|Win32 20 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.ActiveCfg = Release|x64 21 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.Build.0 = Release|x64 22 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.ActiveCfg = Release|Win32 23 | {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {4447E91D-E7A1-4249-87A7-E75A78167E71} 30 | 
EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /GpuMemLatency/opencltest.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /GpuMemLatency/texturetest.c: -------------------------------------------------------------------------------- 1 | #include "opencltest.h" 2 | 3 | -------------------------------------------------------------------------------- /InstructionRate/Makefile: -------------------------------------------------------------------------------- 1 | include ../Common/arch_detect.mk 2 | 3 | CFLAGS = -O3 4 | 5 | all: $(TARGET) 6 | 7 | amd64: 8 | $(CC) $(CFLAGS) x86_instructionrate.s x86_instructionrate.c -o InstructionRate_amd64 $(LDFLAGS) 9 | 10 | aarch64: 11 | $(CC) $(CFLAGS) -march=native -pthread arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS) 12 | 13 | riscv64: 14 | $(CC) $(CFLAGS) -march=rv64gc -pthread riscv_instructionrate.s riscv_instructionrate.c -o InstructionRate_riscv64 $(LDFLAGS) 15 | 16 | termux: 17 | clang -march=armv8+aes arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS) 18 | 19 | amd64_fusion: 20 | $(CC) $(CFLAGS) x86_fusion.s x86_fusion.c -o InstructionRateFusion_amd64 $(LDFLAGS) 21 | 22 | w64: 23 | $(CC) $(CFLAGS) x86_instructionrate.c x86_instructionrate.s -o InstructionRate_w64.exe $(LDFLAGS) 24 | 25 | ci: amd64 amd64_fusion aarch64 riscv64 w64 26 | 27 | clean: 28 | rm -f *.o && find . 
-type f -executable -delete 29 | 30 | .PHONY: all ci clean 31 | -------------------------------------------------------------------------------- /LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.asm: -------------------------------------------------------------------------------- 1 | section .text 2 | bits 64 3 | 4 | global asm_read 5 | 6 | ; rcx = ptr to array 7 | ; rdx = array length in bytes 8 | ; r8 = stop flag 9 | ; r9 = throttle factor 10 | ; return bytes read in rax 11 | asm_read: 12 | push rdi 13 | push rsi 14 | push r10 15 | push r11 16 | mov rdi, rcx ; save array base address 17 | xor rsi, rsi ; index 18 | xor rax, rax ; return value 19 | asm_read_pass_loop: 20 | movups xmm0, [rdi] 21 | movups xmm0, [rdi + 16] 22 | movups xmm0, [rdi + 32] 23 | movups xmm0, [rdi + 48] 24 | movups xmm0, [rdi + 64] 25 | movups xmm0, [rdi + 80] 26 | movups xmm0, [rdi + 96] 27 | movups xmm0, [rdi + 112] 28 | 29 | add rdi, 128 30 | add rsi, 128 ; update index 31 | add rax, 128 ; update return value 32 | 33 | test r9, r9 ; need to throttle? 34 | jz asm_read_throttle_end 35 | mov r10, r9 36 | asm_read_throttle: 37 | dec r10 38 | jnz asm_read_throttle; 39 | asm_read_throttle_end: 40 | mov r10d, [r8] ; check stop flag 41 | test r10d, r10d 42 | jnz asm_read_end 43 | 44 | cmp rdx, rsi ; array len - index > 0? 
45 | jg asm_read_pass_loop 46 | mov rdi, rcx ; reset to start 47 | xor rsi, rsi ; and reset index 48 | jmp asm_read_pass_loop 49 | asm_read_end: 50 | pop r11 51 | pop r10 52 | pop rsi 53 | pop rdi 54 | ret -------------------------------------------------------------------------------- /LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.11.35327.3 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LoadedMemoryLatency", "LoadedMemoryLatency.vcxproj", "{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x64.ActiveCfg = Debug|x64 17 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x64.Build.0 = Debug|x64 18 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x86.ActiveCfg = Debug|Win32 19 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x86.Build.0 = Debug|Win32 20 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x64.ActiveCfg = Release|x64 21 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x64.Build.0 = Release|x64 22 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x86.ActiveCfg = Release|Win32 23 | {E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {5656BCBF-7F82-471C-8AFE-1FE48AD34114} 30 | EndGlobalSection 31 | EndGlobal 32 | 
-------------------------------------------------------------------------------- /LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | 23 | 24 | Source Files 25 | 26 | 27 | -------------------------------------------------------------------------------- /LoadedMemoryLatency/LoadedMemoryLatency_amd64.s: -------------------------------------------------------------------------------- 1 | .global asm_read 2 | 3 | /* rcx = ptr to array 4 | rdx = arr length in bytes 5 | r8 = stop flag 6 | r9 = throttle factor 7 | return bytes read in rax 8 | */ 9 | asm_read: 10 | push %rdi 11 | push %rsi 12 | push %r10 13 | push %r11 14 | mov %rcx, %rdi 15 | xor %rsi, %rsi 16 | xor %rax, %rax 17 | asm_read_pass_loop: 18 | /* load 128B */ 19 | movups (%rdi), %xmm0 20 | movups 16(%rdi), %xmm0 21 | movups 32(%rdi), %xmm0 22 | movups 48(%rdi), %xmm0 23 | movups 64(%rdi), %xmm0 24 | movups 80(%rdi), %xmm0 25 | movups 96(%rdi), %xmm0 26 | movups 112(%rdi), %xmm0 27 | 28 | add $128, %rdi 29 | add $128, %rsi 30 | add $128, %rax 31 | 32 | test %r9, %r9 33 | jz asm_read_throttle_end 34 | mov %r9, %r10 35 | asm_read_throttle: 36 | dec %r10 37 | jnz asm_read_throttle 38 | asm_read_throttle_end: 39 | /* check stop flag */ 40 | mov (%r8), %r10d 41 | test %r10d, %r10d 42 | jnz asm_read_end 43 | 44 | cmp %rsi, %rdx 45 | jg asm_read_pass_loop 46 | mov %rcx, %rdi 47 | xor %rsi, %rsi 48 | jmp asm_read_pass_loop 49 | asm_read_end: 50 | pop %r11 51 | pop %r10 52 | pop %rsi 53 | pop %rdi 54 | ret 55 | 
-------------------------------------------------------------------------------- /LoadedMemoryLatency/LoadedMemoryLatency_arm.s: -------------------------------------------------------------------------------- 1 | .global asm_read 2 | .global _asm_read 3 | 4 | /* x0 = ptr to array 5 | x1 = arr length in bytes 6 | x2 = stop flag 7 | x3 = throttle factor 8 | return bytes read in x0 9 | */ 10 | _asm_read: 11 | asm_read: 12 | sub sp, sp, #0x40 13 | stp x14, x15, [sp, #0x10] 14 | stp x12, x13, [sp, #0x20] 15 | stp x11, x10, [sp, #0x30] 16 | sub x1, x1, 128 17 | mov x15, x0 /* ptr into array */ 18 | mov x12, 0 /* current offset into array */ 19 | mov x13, 0 /* data transferred in bytes */ 20 | asm_read_pass_loop: 21 | /* load 128B */ 22 | ldr q16, [x15] 23 | ldr q16, [x15, 16] 24 | ldr q16, [x15, 32] 25 | ldr q16, [x15, 48] 26 | ldr q16, [x15, 64] 27 | ldr q16, [x15, 80] 28 | ldr q16, [x15, 96] 29 | ldr q16, [x15, 112] 30 | add x12, x12, 128 31 | add x15, x15, 128 32 | add x13, x13, 128 33 | 34 | cbz x3, asm_read_throttle_end 35 | mov x10, x3 /* save throttle factor */ 36 | asm_read_throttle: 37 | sub x10, x10, 1 38 | cbnz x10, asm_read_throttle 39 | asm_read_throttle_end: 40 | 41 | /* end condition */ 42 | ldr w14, [x2] 43 | cbnz x14, asm_read_end 44 | 45 | /* loop back condition */ 46 | cmp x1, x12 47 | b.gt asm_read_pass_loop 48 | mov x15, x0 49 | mov x12, 0 50 | b asm_read_pass_loop 51 | asm_read_end: 52 | mov x0, x13 53 | ldp x11, x10, [sp, #0x30] 54 | ldp x12, x13, [sp, #0x20] 55 | ldp x14, x15, [sp, #0x10] 56 | add sp, sp, #0x40 57 | ret 58 | -------------------------------------------------------------------------------- /LoadedMemoryLatency/Makefile: -------------------------------------------------------------------------------- 1 | amd64: 2 | gcc -O3 LoadedMemoryLatency.c LoadedMemoryLatency_amd64.s -o loadedlat_amd64 -lm 3 | aarch64: 4 | gcc -O3 LoadedMemoryLatency.c LoadedMemoryLatency_arm.s -o loadedlat_aarch64 -lm 5 | 
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | include Common/arch_detect.mk 2 | 3 | COMPONENTS = CoherencyLatency MemoryLatency MemoryBandwidth InstructionRate Meshsim CoreClockChecker GpuMemLatency 4 | 5 | all: $(COMPONENTS) 6 | 7 | ci: 8 | for COMPONENT in $(COMPONENTS); do $(MAKE) -C $$COMPONENT ci; done 9 | 10 | package: 11 | @sh Common/ci_package.sh 12 | 13 | clean-package: 14 | find . -maxdepth 1 -type d -name "clammarks-*" -exec rm -rf {} \; && rm -f "clammarks.txz" 15 | 16 | clean: 17 | for COMPONENT in $(COMPONENTS); do $(MAKE) -C $$COMPONENT clean; done 18 | 19 | $(COMPONENTS): .FORCE 20 | $(MAKE) -C $@ 21 | 22 | .FORCE: 23 | 24 | .PHONY: all ci package clean-package clean 25 | -------------------------------------------------------------------------------- /MemoryBandwidth/Makefile: -------------------------------------------------------------------------------- 1 | include ../Common/arch_detect.mk 2 | 3 | CFLAGS = -pthread -O3 4 | LDFLAGS= -lm 5 | 6 | all: $(TARGET) 7 | 8 | amd64: 9 | $(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_amd64 $(LDFLAGS) 10 | 11 | amd64-numa: 12 | $(CC) $(CFLAGS) -DNUMA MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_numa_amd64 $(LDFLAGS) -lnuma 13 | 14 | aarch64: 15 | $(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_aarch64 $(LDFLAGS) 16 | 17 | termux: 18 | gcc -O3 -pthread MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_aarch64 -lm 19 | 20 | aarch64-numa: 21 | $(CC) $(CFLAGS) -DNUMA MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_numa_aarch64 $(LDFLAGS) -lnuma 22 | 23 | riscv64: 24 | $(CC) $(CFLAGS) -march=rv64gcv0p7 MemoryBandwidth.c MemoryBandwidth_riscv.s -o MemoryBandwidth_riscv64 $(LDFLAGS) 25 | 26 | w64: 27 | $(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_w64.exe $(LDFLAGS) 28 
| 29 | ci: amd64 amd64-numa aarch64 w64 30 | 31 | clean: 32 | rm -f *.o && find . -type f -executable -delete 33 | 34 | .PHONY: all ci clean 35 | -------------------------------------------------------------------------------- /MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.6.33815.320 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MemoryBandwidth", "MemoryBandwidth.vcxproj", "{E968D202-64A2-43A5-8BBD-D7D010D06564}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MixedMemoryBandwidthTest", "..\MixedMemoryBandwidthTest\MixedMemoryBandwidthTest.vcxproj", "{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|x64 = Debug|x64 13 | Debug|x86 = Debug|x86 14 | Release|x64 = Release|x64 15 | Release|x86 = Release|x86 16 | EndGlobalSection 17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 18 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x64.ActiveCfg = Debug|x64 19 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x64.Build.0 = Debug|x64 20 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x86.ActiveCfg = Debug|Win32 21 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x86.Build.0 = Debug|Win32 22 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x64.ActiveCfg = Release|x64 23 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x64.Build.0 = Release|x64 24 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x86.ActiveCfg = Release|Win32 25 | {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x86.Build.0 = Release|Win32 26 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x64.ActiveCfg = Debug|x64 27 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x64.Build.0 = Debug|x64 28 | 
{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x86.ActiveCfg = Debug|Win32 29 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x86.Build.0 = Debug|Win32 30 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x64.ActiveCfg = Release|x64 31 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x64.Build.0 = Release|x64 32 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x86.ActiveCfg = Release|Win32 33 | {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x86.Build.0 = Release|Win32 34 | EndGlobalSection 35 | GlobalSection(SolutionProperties) = preSolution 36 | HideSolutionNode = FALSE 37 | EndGlobalSection 38 | GlobalSection(ExtensibilityGlobals) = postSolution 39 | SolutionGuid = {2EA86D6F-CEE0-40A9-B4DD-AC59CCAD358F} 40 | EndGlobalSection 41 | EndGlobal 42 | -------------------------------------------------------------------------------- /MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | 23 | 24 | Source Files 25 | 26 | 27 | Source Files 28 | 29 | 30 | -------------------------------------------------------------------------------- /MemoryBandwidth/MixedMemoryBandwidthTest/MixedMemoryBandwidthTest.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | 
rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | 23 | 24 | Source Files 25 | 26 | 27 | 28 | 29 | Header Files 30 | 31 | 32 | -------------------------------------------------------------------------------- /MemoryLatency/Makefile: -------------------------------------------------------------------------------- 1 | include ../Common/arch_detect.mk 2 | 3 | CFLAGS = -O3 4 | LDFLAGS = -lm 5 | 6 | all: $(TARGET) 7 | 8 | amd64: 9 | $(CC) $(CFLAGS) MemoryLatency.c MemoryLatency_x86.s -o MemoryLatency_amd64 $(LDFLAGS) 10 | 11 | amd64-numa: 12 | $(CC) $(CFLAGS) -DNUMA MemoryLatency.c MemoryLatency_x86.s -o MemoryLatency_numa_amd64 $(LDFLAGS) -lnuma 13 | 14 | aarch64: 15 | $(CC) $(CFLAGS) MemoryLatency.c MemoryLatency_arm.s -o MemoryLatency_aarch64 $(LDFLAGS) 16 | 17 | aarch64-numa: 18 | $(CC) $(CFLAGS) -DNUMA MemoryLatency.c MemoryLatency_arm.s -o MemoryLatency_numa_aarch64 $(LDFLAGS) -lnuma 19 | 20 | riscv64: 21 | $(CC) $(CFLAGS) MemoryLatency.c MemoryLatency_riscv.s -o MemoryLatency_riscv64 $(LDFLAGS) 22 | 23 | riscv64-numa: 24 | $(CC) $(CFLAGS) -DNUMA MemoryLatency.c MemoryLatency_riscv.s -o MemoryLatency_numa_riscv64 $(LDFLAGS) -lnuma 25 | 26 | w64: 27 | $(CC) $(CFLAGS) MemoryLatency.cpp MemoryLatency_x86.s -o MemoryLatency_w64.exe $(LDFLAGS) 28 | 29 | # w64 can build with mingw 11, which isn't available on jammy 30 | 31 | ci: amd64 amd64-numa aarch64 riscv64 w64 32 | 33 | clean: 34 | rm -f *.o && find . 
-type f -executable -delete 35 | 36 | .PHONY: all ci clean 37 | -------------------------------------------------------------------------------- /MemoryLatency/MemoryLatency.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.31229.75 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MemoryLatency", "MemoryLatency.vcxproj", "{3A98A230-A87B-432D-931D-369872DE24AF}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x64.ActiveCfg = Debug|x64 17 | {3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x64.Build.0 = Debug|x64 18 | {3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x86.ActiveCfg = Debug|Win32 19 | {3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x86.Build.0 = Debug|Win32 20 | {3A98A230-A87B-432D-931D-369872DE24AF}.Release|x64.ActiveCfg = Release|x64 21 | {3A98A230-A87B-432D-931D-369872DE24AF}.Release|x64.Build.0 = Release|x64 22 | {3A98A230-A87B-432D-931D-369872DE24AF}.Release|x86.ActiveCfg = Release|Win32 23 | {3A98A230-A87B-432D-931D-369872DE24AF}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {F2D00DD2-A22B-4A3C-A2FF-9CE8CF9070D1} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /MemoryLatency/MemoryLatencyFunctions.asm: -------------------------------------------------------------------------------- 1 | 
section .text 2 | bits 64 3 | 4 | global preplatencyarr 5 | global latencytest 6 | 7 | preplatencyarr: 8 | push r15 9 | push r14 10 | xor r15, r15 ; array index 11 | preplatencyarr_loop: 12 | mov r14, [rcx + r15 * 8] 13 | lea r14, [rcx + r14 * 8] 14 | mov [rcx + r15 * 8], r14 15 | inc r15 16 | cmp rdx, r15 17 | jne preplatencyarr_loop 18 | pop r14 19 | pop r15 20 | ret 21 | 22 | latencytest: 23 | push r15 24 | mov r15, [rdx] 25 | xor rax, rax 26 | latencytest_loop: 27 | mov r15, [r15] 28 | add rax, r15 29 | dec rcx 30 | jnz latencytest_loop 31 | pop r15 32 | ret 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Microbenchmarks 2 | Trying to figure various CPU (or GPU) things out. 3 | 4 | Basically my playground to microbenchmark various CPU-related things like ROB/register file sizes, lock/cache coherency latency, and cache/memory performance. This repo is a loose collection of various experiments and is more of a playground than a well-maintained piece of software. As such, various benchmarks may not work, or may not even compile. They're also not well documented and details of what's being tested may not be intuitive. Due to time constraints and real life priorities I won't be able to maintain this repo to an acceptable standard for public use. 5 | 6 | Feel free to try running the stuff here, but I highly suggest writing your own code because that'll provide a better understanding of the theory behind the benchmarks. Consider checking out https://github.com/travisdowns/robsize or https://github.com/Veedrac/microarchitecturometer. 7 | 8 | # Building Clammicrobench with Generated Code 9 | Get NASM (https://www.nasm.us/) and make sure it's in your path. Then things should build under Visual Studio 2022. 10 | 11 | Some microbenchmarks have the source code and assembly generated by C# code, to avoid crazy stuff like self-modifying code. 
For clammicrobench, build/run the AsmGen project. Pass "autocopy" on the command line to have it automatically place generated ASM files for Visual Studio. Then, the clammicrobench project should build. 12 | -------------------------------------------------------------------------------- /mt_instructionrate/Makefile: -------------------------------------------------------------------------------- 1 | x86: 2 | gcc -pthread -masm=intel x86_mt_instructionrate.s mt_instructionrate.c ../Common/timing.c -o x86_mt_instructionrate -static 3 | aarch64: 4 | gcc -pthread mt_instructionrate.c arm_mt_instructionrate.s ../Common/timing.c -o arm_mt_instructionrate 5 | ppc64: 6 | gcc -pthread -mregnames mt_instructionrate.c ppc64_mt_instructionrate.s ../Common/timing.c -o ppc64_mt_instructionrate 7 | -------------------------------------------------------------------------------- /mt_instructionrate/Project1.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Header Files 23 | 24 | 25 | 26 | 27 | Header Files 28 | 29 | 30 | 31 | 32 | Source Files 33 | 34 | 35 | -------------------------------------------------------------------------------- /mt_instructionrate/mt_instructionrate.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.8.34511.84 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Project1", "Project1.vcxproj", 
"{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x64.ActiveCfg = Debug|x64 17 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x64.Build.0 = Debug|x64 18 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x86.ActiveCfg = Debug|Win32 19 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x86.Build.0 = Debug|Win32 20 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x64.ActiveCfg = Release|x64 21 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x64.Build.0 = Release|x64 22 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x86.ActiveCfg = Release|Win32 23 | {0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {B31B466E-F833-4B33-9E21-74616F970AA2} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /mt_instructionrate/ppc64_mt_instructionrate.c: -------------------------------------------------------------------------------- 1 | extern uint64_t vec_int32_add_test(uint64_t iterations, void *data); 2 | extern uint64_t vec_int32_mul_test(uint64_t iterations, void *data); 3 | extern uint64_t vec_fp32_add_test(uint64_t iterations, void *data); 4 | extern uint64_t vec_fp32_fma_test(uint64_t iterations, void *data); 5 | extern uint64_t vec_fp32_isqrt_test(uint64_t iterations, void *data); 6 | extern uint64_t fp64_add_test(uint64_t iterations, void *data); 7 | extern uint64_t fp64_fma_test(uint64_t iterations, void *data); 8 | 9 | void RunTests() { 10 | uint64_t 
iterations = 3500000000; 11 | int testDataLength = 256; 12 | uint32_t *intTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength); 13 | float *fpTestArr = (float *)malloc(sizeof(float) * testDataLength); // float storage so the FP tests see real FP32 values, not truncated integers 14 | for (int i = 0; i < testDataLength; i++) { 15 | intTestArr[i] = i; 16 | fpTestArr[i] = i * 1.2f; 17 | } 18 | 19 | fprintf(stderr, "Measuring INT32 adds\n"); 20 | float int32adds = measureFunction(iterations, vec_int32_add_test, intTestArr); 21 | float int32muls = measureFunction(iterations, vec_int32_mul_test, intTestArr); 22 | float fp32adds = measureFunction(iterations, vec_fp32_add_test, fpTestArr); 23 | float fp32fmas = measureFunction(iterations, vec_fp32_fma_test, fpTestArr); 24 | float fp32isqrt = measureFunction(iterations, vec_fp32_isqrt_test, fpTestArr); 25 | float fp64adds = measureFunction(iterations, fp64_add_test, fpTestArr); 26 | float fp64fmas = measureFunction(iterations, fp64_fma_test, fpTestArr); 27 | 28 | printf("-----GOPS/s-----\n"); 29 | printf("Altivec INT32 Add: %f\n", int32adds); 30 | printf("Altivec INT32 Multiply: %f\n", int32muls); 31 | printf("Altivec FP32 Add: %f\n", fp32adds); 32 | printf("Altivec FP32 FMA: %f (%f GFLOPS)\n", fp32fmas, 2 * fp32fmas); 33 | printf("Altivec FP32 Inverse Square Root: %f\n", fp32isqrt); 34 | printf("FP64 Add: %f\n", fp64adds); 35 | printf("FP64 FMA: %f (%f GFLOPS)\n", fp64fmas, 2 * fp64fmas); 36 | 37 | free(intTestArr); 38 | free(fpTestArr); 39 | return; 40 | } 41 | -------------------------------------------------------------------------------- /mt_instructionrate/x86_mt_instructionrate: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/mt_instructionrate/x86_mt_instructionrate -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/Context.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // OpenCL Utils includes 4 | #include "OpenCLUtils_Export.h" 5 | 6 | // OpenCL includes 7 | #include 8 | 9 | // STL includes 10 | #include 11 | 12 | UTILS_EXPORT 13 | cl_context cl_util_get_context(const cl_uint plat_id, const cl_uint dev_id, 14 | const cl_device_type type, cl_int* const error); 15 | UTILS_EXPORT 16 | cl_device_id cl_util_get_device(const cl_uint plat_id, const cl_uint dev_id, 17 | const cl_device_type type, cl_int* const error); 18 | 19 | UTILS_EXPORT 20 | cl_int cl_util_print_device_info(const cl_device_id device); 21 | 22 | UTILS_EXPORT 23 | char* cl_util_get_device_info(const cl_device_id device, 24 | const cl_device_info info, cl_int* const error); 25 | UTILS_EXPORT 26 | char* cl_util_get_platform_info(const cl_platform_id platform, 27 | const cl_platform_info info, 28 | cl_int* const error); 29 | 30 | // build program and show log if build is not successful 31 | UTILS_EXPORT 32 | cl_int cl_util_build_program(const cl_program pr, const cl_device_id dev, 33 | const char* const opt); 34 | 35 | #define GET_CURRENT_TIMER(time) \ 36 | struct timespec time; \ 37 | timespec_get(&time, TIME_UTC); \ 38 | { \ 39 | } 40 | 41 | #define TIMER_DIFFERENCE(dt, time1, time2) \ 42 | { \ 43 | dt = (time2.tv_sec - time1.tv_sec) * 1000000000 \ 44 | + (time2.tv_nsec - time1.tv_nsec); \ 45 | } 46 | 47 | #define START_TIMER GET_CURRENT_TIMER(start_timer1) 48 | #define STOP_TIMER(dt) \ 49 | GET_CURRENT_TIMER(stop_timer2) \ 50 | TIMER_DIFFERENCE(dt, start_timer1, stop_timer2) 51 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/Context.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // OpenCL SDK includes 4 | #include "OpenCLUtilsCpp_Export.h" 5 | 6 | #include 7 | 8 | // OpenCL includes 9 | #include 10 | 11 | namespace cl { 12 | namespace 
util { 13 | Context UTILSCPP_EXPORT get_context(cl_uint plat_id, cl_uint dev_id, 14 | cl_device_type type, 15 | cl_int* error = nullptr); 16 | 17 | void UTILSCPP_EXPORT print_device_info(const cl::Device& device); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/Device.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "OpenCLUtilsCpp_Export.h" 4 | #include 5 | 6 | #include 7 | 8 | namespace cl { 9 | namespace util { 10 | bool UTILSCPP_EXPORT opencl_c_version_contains( 11 | const cl::Device& device, const cl::string& version_fragment); 12 | 13 | bool UTILSCPP_EXPORT supports_extension(const cl::Device& device, 14 | const cl::string& extension); 15 | 16 | #ifdef CL_VERSION_3_0 17 | bool UTILSCPP_EXPORT supports_feature(const cl::Device& device, 18 | const cl::string& feature_name); 19 | #endif 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/Error.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // OpenCL Utils includes 4 | #include "OpenCLUtilsCpp_Export.h" 5 | 6 | // OpenCL Utils includes 7 | #include 8 | 9 | // OpenCL includes 10 | #include 11 | 12 | namespace cl { 13 | namespace util { 14 | #if defined(CL_HPP_ENABLE_EXCEPTIONS) 15 | /*! \brief Exception class 16 | * 17 | * This may be thrown by SDK utility functions when 18 | * CL_HPP_ENABLE_EXCEPTIONS is defined. 19 | */ 20 | class Error : public std::exception { 21 | private: 22 | int err_; 23 | const char* errStr_; 24 | 25 | public: 26 | /*! \brief Create a new SDK error exception for a given error code 27 | * and corresponding message. 28 | * 29 | * \param err error code value. 30 | * 31 | * \param errStr a descriptive string that must remain in scope until 32 | * handling of the exception has concluded. 
If set, it 33 | * will be returned by what(). 34 | */ 35 | Error(cl_int err, const char* errStr = NULL): err_(err), errStr_(errStr) 36 | {} 37 | 38 | ~Error() noexcept {} 39 | 40 | /*! \brief Get error string associated with exception 41 | * 42 | * \return A memory pointer to the error message string. 43 | */ 44 | virtual const char* what() const noexcept 45 | { 46 | if (errStr_ == NULL) 47 | { 48 | return "empty"; 49 | } 50 | else 51 | { 52 | return errStr_; 53 | } 54 | } 55 | 56 | /*! \brief Get error code associated with exception 57 | * 58 | * \return The error code. 59 | */ 60 | cl_int err(void) const { return err_; } 61 | }; 62 | #endif 63 | 64 | namespace detail { 65 | UTILSCPP_EXPORT cl_int errHandler(cl_int err, cl_int* errPtr, 66 | const char* errStr = nullptr); 67 | } 68 | 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/ErrorCodes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define CL_UTIL_INDEX_OUT_OF_RANGE -2000 4 | #define CL_UTIL_DEVICE_NOT_INTEROPERABLE -2001 5 | #define CL_UTIL_FILE_OPERATION_ERROR -2002 6 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/Event.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // OpenCL Utils includes 4 | #include "OpenCLUtils_Export.h" 5 | 6 | // OpenCL includes 7 | #include 8 | 9 | UTILS_EXPORT 10 | cl_ulong cl_util_get_event_duration(const cl_event event, 11 | const cl_profiling_info start, 12 | const cl_profiling_info end, 13 | cl_int* const error); 14 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/Event.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // OpenCL SDK includes 4 | #include "OpenCLUtilsCpp_Export.h" 5 | 6 | // 
STL includes 7 | #include 8 | 9 | // OpenCL includes 10 | #include 11 | 12 | namespace cl { 13 | namespace util { 14 | template 15 | auto get_duration(cl::Event& ev) 16 | { 17 | return std::chrono::duration_cast(std::chrono::nanoseconds{ 18 | ev.getProfilingInfo() - ev.getProfilingInfo() }); 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/File.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // OpenCL Utils includes 4 | #include "OpenCLUtils_Export.h" 5 | 6 | // OpenCL includes 7 | #include 8 | 9 | // read all the text file contents securely in ANSI C89 10 | // return pointer to C-string with file contents 11 | // can handle streams with no known size and no support for fseek 12 | // based on https://stackoverflow.com/questions/14002954/ by Nominal Animal 13 | UTILS_EXPORT 14 | char* cl_util_read_text_file(const char* const filename, size_t* const length, 15 | cl_int* const error); 16 | 17 | // read all the binary file contents securely in ANSI C89 18 | // return pointer to file contents 19 | // can handle streams with no known size and no support for fseek 20 | // based on https://stackoverflow.com/questions/14002954/ by Nominal Animal 21 | UTILS_EXPORT 22 | unsigned char* cl_util_read_binary_file(const char* const filename, 23 | size_t* const length, 24 | cl_int* const error); 25 | 26 | // write binaries of OpenCL compiled program 27 | // binaries are written as separate files for each device 28 | // with file name "(program_file_name)_(name of device).bin" 29 | // based on variant of Logan 30 | // http://logan.tw/posts/2014/11/22/pre-compile-the-opencl-kernel-program-part-2/ 31 | UTILS_EXPORT 32 | cl_int cl_util_write_binaries(const cl_program program, 33 | const char* const program_file_name); 34 | 35 | // read binaries of OpenCL compiled program 36 | // from files of file names "(program_file_name)_(name of 
device).bin" 37 | UTILS_EXPORT 38 | cl_program cl_util_read_binaries(const cl_context context, 39 | const cl_device_id* const devices, 40 | const cl_uint num_devices, 41 | const char* const program_file_name, 42 | cl_int* const error); 43 | 44 | // returns the folder containing the running executable 45 | UTILS_EXPORT 46 | cl_int cl_util_executable_folder(char* filename, size_t* const length); 47 | 48 | // read all the text file contents securely in ANSI C89 49 | // return pointer to C-string with file contents 50 | // interprets filename relative to the folder containing 51 | // the running executable 52 | UTILS_EXPORT 53 | char* cl_util_read_exe_relative_text_file(const char* const rel_path, 54 | size_t* const length, 55 | cl_int* const error); 56 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/File.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // OpenCL SDK includes 4 | #include "OpenCLUtilsCpp_Export.h" 5 | 6 | #include 7 | 8 | // OpenCL includes 9 | #include 10 | 11 | 12 | namespace cl { 13 | namespace util { 14 | 15 | std::string UTILSCPP_EXPORT read_text_file(const char* const filename, 16 | cl_int* const error = nullptr); 17 | 18 | std::vector UTILSCPP_EXPORT 19 | read_binary_file(const char* const filename, cl_int* const error = nullptr); 20 | 21 | Program::Binaries UTILSCPP_EXPORT read_binary_files( 22 | const std::vector& devices, 23 | const char* const program_file_name, cl_int* const error = nullptr); 24 | 25 | cl_int UTILSCPP_EXPORT 26 | write_binaries(const cl::Program::Binaries& binaries, 27 | const std::vector& devices, 28 | const char* const program_file_name); 29 | 30 | std::string UTILSCPP_EXPORT 31 | executable_folder(cl_int* const error = nullptr); 32 | 33 | std::string UTILSCPP_EXPORT read_exe_relative_text_file( 34 | const char* const filename, cl_int* const error = nullptr); 35 | } 36 | } 37 | 
-------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/InteropContext.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "OpenCLUtilsCpp_Export.h" 4 | #include 5 | 6 | #include 7 | 8 | namespace cl { 9 | namespace util { 10 | vector 11 | UTILSCPP_EXPORT get_interop_context_properties(const cl::Device& plat, 12 | cl_int* error = nullptr); 13 | 14 | Context UTILSCPP_EXPORT get_interop_context(int plat_id, int dev_id, 15 | cl_device_type type, 16 | cl_int* error = nullptr); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/OpenCLUtilsCpp_Export.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef UTILSCPP_EXPORT_H 3 | #define UTILSCPP_EXPORT_H 4 | 5 | #ifdef OPENCLUTILSCPP_STATIC_DEFINE 6 | # define UTILSCPP_EXPORT 7 | # define OPENCLUTILSCPP_NO_EXPORT 8 | #else 9 | # ifndef UTILSCPP_EXPORT 10 | # ifdef OpenCLUtilsCpp_EXPORTS 11 | /* We are building this library */ 12 | # define UTILSCPP_EXPORT 13 | # else 14 | /* We are using this library */ 15 | # define UTILSCPP_EXPORT 16 | # endif 17 | # endif 18 | 19 | # ifndef OPENCLUTILSCPP_NO_EXPORT 20 | # define OPENCLUTILSCPP_NO_EXPORT 21 | # endif 22 | #endif 23 | 24 | #ifndef OPENCLUTILSCPP_DEPRECATED 25 | # define OPENCLUTILSCPP_DEPRECATED __declspec(deprecated) 26 | #endif 27 | 28 | #ifndef OPENCLUTILSCPP_DEPRECATED_EXPORT 29 | # define OPENCLUTILSCPP_DEPRECATED_EXPORT UTILSCPP_EXPORT OPENCLUTILSCPP_DEPRECATED 30 | #endif 31 | 32 | #ifndef OPENCLUTILSCPP_DEPRECATED_NO_EXPORT 33 | # define OPENCLUTILSCPP_DEPRECATED_NO_EXPORT OPENCLUTILSCPP_NO_EXPORT OPENCLUTILSCPP_DEPRECATED 34 | #endif 35 | 36 | /* NOLINTNEXTLINE(readability-avoid-unconditional-preprocessor-if) */ 37 | #if 0 /* DEFINE_NO_DEPRECATED */ 38 | # ifndef OPENCLUTILSCPP_NO_DEPRECATED 39 | # define 
OPENCLUTILSCPP_NO_DEPRECATED 40 | # endif 41 | #endif 42 | 43 | #endif /* UTILSCPP_EXPORT_H */ 44 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/OpenCLUtils_Export.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef UTILS_EXPORT_H 3 | #define UTILS_EXPORT_H 4 | 5 | #ifdef OPENCLUTILS_STATIC_DEFINE 6 | # define UTILS_EXPORT 7 | # define OPENCLUTILS_NO_EXPORT 8 | #else 9 | # ifndef UTILS_EXPORT 10 | # ifdef OpenCLUtils_EXPORTS 11 | /* We are building this library */ 12 | # define UTILS_EXPORT 13 | # else 14 | /* We are using this library */ 15 | # define UTILS_EXPORT 16 | # endif 17 | # endif 18 | 19 | # ifndef OPENCLUTILS_NO_EXPORT 20 | # define OPENCLUTILS_NO_EXPORT 21 | # endif 22 | #endif 23 | 24 | #ifndef OPENCLUTILS_DEPRECATED 25 | # define OPENCLUTILS_DEPRECATED __declspec(deprecated) 26 | #endif 27 | 28 | #ifndef OPENCLUTILS_DEPRECATED_EXPORT 29 | # define OPENCLUTILS_DEPRECATED_EXPORT UTILS_EXPORT OPENCLUTILS_DEPRECATED 30 | #endif 31 | 32 | #ifndef OPENCLUTILS_DEPRECATED_NO_EXPORT 33 | # define OPENCLUTILS_DEPRECATED_NO_EXPORT OPENCLUTILS_NO_EXPORT OPENCLUTILS_DEPRECATED 34 | #endif 35 | 36 | /* NOLINTNEXTLINE(readability-avoid-unconditional-preprocessor-if) */ 37 | #if 0 /* DEFINE_NO_DEPRECATED */ 38 | # ifndef OPENCLUTILS_NO_DEPRECATED 39 | # define OPENCLUTILS_NO_DEPRECATED 40 | # endif 41 | #endif 42 | 43 | #endif /* UTILS_EXPORT_H */ 44 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/Platform.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "OpenCLUtilsCpp_Export.h" 4 | #include 5 | 6 | #include 7 | 8 | namespace cl { 9 | namespace util { 10 | bool UTILSCPP_EXPORT supports_extension(const cl::Platform& platform, 11 | const cl::string& extension); 12 | 13 | bool UTILSCPP_EXPORT platform_version_contains( 14 | 
const cl::Platform& platform, const cl::string& version_fragment); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/Utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // OpenCL Utils includes 4 | #include "OpenCLUtils_Export.h" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | // OpenCL includes 11 | #include 12 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/Utils/Utils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // OpenCL Utils includes 4 | #include "OpenCLUtils_Export.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | // OpenCL includes 15 | #include 16 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/cl2.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright (c) 2020 The Khronos Group Inc. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | 17 | #include 18 | #pragma message("cl2.hpp has been renamed to opencl.hpp to make it clear that it supports all versions of OpenCL. 
Please include opencl.hpp directly.") 19 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/cl_dx9_media_sharing_intel.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #include 18 | #pragma message("The Intel DX9 media sharing extensions have been moved into cl_dx9_media_sharing.h. Please include cl_dx9_media_sharing.h directly.") 19 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/cl_ext_intel.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | ******************************************************************************/ 17 | 18 | #include 19 | #pragma message("The Intel extensions have been moved into cl_ext.h. Please include cl_ext.h directly.") 20 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/cl_gl_ext.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2021 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #include 18 | #pragma message("The extensions in cl_gl_ext.h have been moved into cl_gl.h. 
Please include cl_gl.h directly.") 19 | -------------------------------------------------------------------------------- /svm/OpenCL/include/CL/opencl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2021 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_H 18 | #define __OPENCL_H 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif /* __OPENCL_H */ 33 | -------------------------------------------------------------------------------- /svm/OpenCL/lib/OpenCL.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCL.lib -------------------------------------------------------------------------------- /svm/OpenCL/lib/OpenCLExt.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLExt.lib 
-------------------------------------------------------------------------------- /svm/OpenCL/lib/OpenCLUtils.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLUtils.lib -------------------------------------------------------------------------------- /svm/OpenCL/lib/OpenCLUtilsCpp.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLUtilsCpp.lib -------------------------------------------------------------------------------- /svm/OpenCL/lib/OpenCLUtilsCppd.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLUtilsCppd.lib -------------------------------------------------------------------------------- /svm/OpenCL/lib/OpenCLUtilsd.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/lib/OpenCLUtilsd.lib -------------------------------------------------------------------------------- /svm/OpenCL/lib/pkgconfig/OpenCL.pc: -------------------------------------------------------------------------------- 1 | prefix=D:/a/OpenCL-SDK/OpenCL-SDK/install 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | 5 | Name: OpenCL 6 | Description: Khronos OpenCL ICD Loader 7 | Requires: OpenCL-Headers 8 | Version: 3.0 9 | Libs: -L${libdir} -lOpenCL 10 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCL/OpenCLConfig.cmake: -------------------------------------------------------------------------------- 1 | 
get_filename_component(PARENT_DIR ${CMAKE_CURRENT_LIST_DIR} PATH) 2 | include("${PARENT_DIR}/OpenCLHeaders/OpenCLHeadersConfig.cmake") 3 | include("${PARENT_DIR}/OpenCLICDLoader/OpenCLICDLoaderConfig.cmake") 4 | include("${PARENT_DIR}/OpenCLHeadersCpp/OpenCLHeadersCppConfig.cmake") 5 | include("${PARENT_DIR}/OpenCLUtils/OpenCLUtilsConfig.cmake") 6 | include("${PARENT_DIR}/OpenCLUtilsCpp/OpenCLUtilsCppConfig.cmake") 7 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCL/OpenCLConfigVersion.cmake: -------------------------------------------------------------------------------- 1 | # This is a basic version file for the Config-mode of find_package(). 2 | # It is used by write_basic_package_version_file() as input file for configure_file() 3 | # to create a version-file which can be installed along a config.cmake file. 4 | # 5 | # The created file sets PACKAGE_VERSION_EXACT if the current version string and 6 | # the requested version string are exactly the same and it sets 7 | # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version. 8 | # The variable CVF_VERSION must be set before calling configure_file(). 
9 | 10 | set(PACKAGE_VERSION "2024.10.24") 11 | 12 | if (PACKAGE_FIND_VERSION_RANGE) 13 | # Package version must be in the requested version range 14 | if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN) 15 | OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX) 16 | OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX))) 17 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 18 | else() 19 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 20 | endif() 21 | else() 22 | if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION) 23 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 24 | else() 25 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 26 | if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) 27 | set(PACKAGE_VERSION_EXACT TRUE) 28 | endif() 29 | endif() 30 | endif() 31 | 32 | 33 | # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it: 34 | if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "") 35 | return() 36 | endif() 37 | 38 | # check that the installed version has the same 32/64bit-ness as the one which is currently searching: 39 | if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "") 40 | math(EXPR installedBits " * 8") 41 | set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)") 42 | set(PACKAGE_VERSION_UNSUITABLE TRUE) 43 | endif() 44 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderConfig.cmake: -------------------------------------------------------------------------------- 1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLExtensionLoaderTargets.cmake") -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderConfigVersion.cmake: 
-------------------------------------------------------------------------------- 1 | # This is a basic version file for the Config-mode of find_package(). 2 | # It is used by write_basic_package_version_file() as input file for configure_file() 3 | # to create a version-file which can be installed along a config.cmake file. 4 | # 5 | # The created file sets PACKAGE_VERSION_EXACT if the current version string and 6 | # the requested version string are exactly the same and it sets 7 | # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version. 8 | # The variable CVF_VERSION must be set before calling configure_file(). 9 | 10 | set(PACKAGE_VERSION "1.0.220515") 11 | 12 | if (PACKAGE_FIND_VERSION_RANGE) 13 | # Package version must be in the requested version range 14 | if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN) 15 | OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX) 16 | OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX))) 17 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 18 | else() 19 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 20 | endif() 21 | else() 22 | if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION) 23 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 24 | else() 25 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 26 | if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) 27 | set(PACKAGE_VERSION_EXACT TRUE) 28 | endif() 29 | endif() 30 | endif() 31 | 32 | 33 | # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it: 34 | if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "") 35 | return() 36 | endif() 37 | 38 | # check that the installed version has the same 32/64bit-ness as the one which is currently searching: 39 | if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "") 40 | math(EXPR installedBits " * 8") 41 | set(PACKAGE_VERSION "${PACKAGE_VERSION} 
(${installedBits}bit)") 42 | set(PACKAGE_VERSION_UNSUITABLE TRUE) 43 | endif() 44 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderTargets-debug.cmake: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------- 2 | # Generated CMake target import file for configuration "Debug". 3 | #---------------------------------------------------------------- 4 | 5 | # Commands may need to know the format version. 6 | set(CMAKE_IMPORT_FILE_VERSION 1) 7 | 8 | # Import target "OpenCL::OpenCLExt" for configuration "Debug" 9 | set_property(TARGET OpenCL::OpenCLExt APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) 10 | set_target_properties(OpenCL::OpenCLExt PROPERTIES 11 | IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG "CXX" 12 | IMPORTED_LOCATION_DEBUG "${_IMPORT_PREFIX}/lib/OpenCLExt.lib" 13 | ) 14 | 15 | list(APPEND _cmake_import_check_targets OpenCL::OpenCLExt ) 16 | list(APPEND _cmake_import_check_files_for_OpenCL::OpenCLExt "${_IMPORT_PREFIX}/lib/OpenCLExt.lib" ) 17 | 18 | # Commands beyond this point should not need to know the version. 19 | set(CMAKE_IMPORT_FILE_VERSION) 20 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderTargets-release.cmake: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------- 2 | # Generated CMake target import file for configuration "Release". 3 | #---------------------------------------------------------------- 4 | 5 | # Commands may need to know the format version. 
6 | set(CMAKE_IMPORT_FILE_VERSION 1) 7 | 8 | # Import target "OpenCL::OpenCLExt" for configuration "Release" 9 | set_property(TARGET OpenCL::OpenCLExt APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) 10 | set_target_properties(OpenCL::OpenCLExt PROPERTIES 11 | IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX" 12 | IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/OpenCLExt.lib" 13 | ) 14 | 15 | list(APPEND _cmake_import_check_targets OpenCL::OpenCLExt ) 16 | list(APPEND _cmake_import_check_files_for_OpenCL::OpenCLExt "${_IMPORT_PREFIX}/lib/OpenCLExt.lib" ) 17 | 18 | # Commands beyond this point should not need to know the version. 19 | set(CMAKE_IMPORT_FILE_VERSION) 20 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLHeaders/OpenCLHeadersConfig.cmake: -------------------------------------------------------------------------------- 1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLHeadersTargets.cmake") -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLHeaders/OpenCLHeadersConfigVersion.cmake: -------------------------------------------------------------------------------- 1 | # This is a basic version file for the Config-mode of find_package(). 2 | # It is used by write_basic_package_version_file() as input file for configure_file() 3 | # to create a version-file which can be installed along a config.cmake file. 4 | # 5 | # The created file sets PACKAGE_VERSION_EXACT if the current version string and 6 | # the requested version string are exactly the same and it sets 7 | # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version. 8 | # The variable CVF_VERSION must be set before calling configure_file(). 
9 | 10 | set(PACKAGE_VERSION "3.0") 11 | 12 | if (PACKAGE_FIND_VERSION_RANGE) 13 | # Package version must be in the requested version range 14 | if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN) 15 | OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX) 16 | OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX))) 17 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 18 | else() 19 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 20 | endif() 21 | else() 22 | if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION) 23 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 24 | else() 25 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 26 | if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) 27 | set(PACKAGE_VERSION_EXACT TRUE) 28 | endif() 29 | endif() 30 | endif() 31 | 32 | 33 | # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it: 34 | if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "") 35 | return() 36 | endif() 37 | 38 | # check that the installed version has the same 32/64bit-ness as the one which is currently searching: 39 | if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "") 40 | math(EXPR installedBits " * 8") 41 | set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)") 42 | set(PACKAGE_VERSION_UNSUITABLE TRUE) 43 | endif() 44 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLHeadersCpp/OpenCLHeadersCppConfig.cmake: -------------------------------------------------------------------------------- 1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLHeadersCppTargets.cmake") -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLHeadersCpp/OpenCLHeadersCppConfigVersion.cmake: -------------------------------------------------------------------------------- 1 | # This is a 
basic version file for the Config-mode of find_package(). 2 | # It is used by write_basic_package_version_file() as input file for configure_file() 3 | # to create a version-file which can be installed along a config.cmake file. 4 | # 5 | # The created file sets PACKAGE_VERSION_EXACT if the current version string and 6 | # the requested version string are exactly the same and it sets 7 | # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version. 8 | # The variable CVF_VERSION must be set before calling configure_file(). 9 | 10 | set(PACKAGE_VERSION "3.0") 11 | 12 | if (PACKAGE_FIND_VERSION_RANGE) 13 | # Package version must be in the requested version range 14 | if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN) 15 | OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX) 16 | OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX))) 17 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 18 | else() 19 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 20 | endif() 21 | else() 22 | if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION) 23 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 24 | else() 25 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 26 | if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) 27 | set(PACKAGE_VERSION_EXACT TRUE) 28 | endif() 29 | endif() 30 | endif() 31 | 32 | 33 | # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it: 34 | if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "") 35 | return() 36 | endif() 37 | 38 | # check that the installed version has the same 32/64bit-ness as the one which is currently searching: 39 | if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "") 40 | math(EXPR installedBits " * 8") 41 | set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)") 42 | set(PACKAGE_VERSION_UNSUITABLE TRUE) 43 | endif() 44 | 
-------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderConfig.cmake: -------------------------------------------------------------------------------- 1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLICDLoaderTargets.cmake") -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderConfigVersion.cmake: -------------------------------------------------------------------------------- 1 | # This is a basic version file for the Config-mode of find_package(). 2 | # It is used by write_basic_package_version_file() as input file for configure_file() 3 | # to create a version-file which can be installed along a config.cmake file. 4 | # 5 | # The created file sets PACKAGE_VERSION_EXACT if the current version string and 6 | # the requested version string are exactly the same and it sets 7 | # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version. 8 | # The variable CVF_VERSION must be set before calling configure_file(). 
9 | 10 | set(PACKAGE_VERSION "3.0") 11 | 12 | if (PACKAGE_FIND_VERSION_RANGE) 13 | # Package version must be in the requested version range 14 | if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN) 15 | OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX) 16 | OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX))) 17 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 18 | else() 19 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 20 | endif() 21 | else() 22 | if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION) 23 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 24 | else() 25 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 26 | if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) 27 | set(PACKAGE_VERSION_EXACT TRUE) 28 | endif() 29 | endif() 30 | endif() 31 | 32 | 33 | # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it: 34 | if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "") 35 | return() 36 | endif() 37 | 38 | # check that the installed version has the same 32/64bit-ness as the one which is currently searching: 39 | if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "") 40 | math(EXPR installedBits " * 8") 41 | set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)") 42 | set(PACKAGE_VERSION_UNSUITABLE TRUE) 43 | endif() 44 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderTargets-debug.cmake: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------- 2 | # Generated CMake target import file for configuration "Debug". 3 | #---------------------------------------------------------------- 4 | 5 | # Commands may need to know the format version. 
6 | set(CMAKE_IMPORT_FILE_VERSION 1) 7 | 8 | # Import target "OpenCL::OpenCL" for configuration "Debug" 9 | set_property(TARGET OpenCL::OpenCL APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) 10 | set_target_properties(OpenCL::OpenCL PROPERTIES 11 | IMPORTED_IMPLIB_DEBUG "${_IMPORT_PREFIX}/lib/OpenCL.lib" 12 | IMPORTED_LOCATION_DEBUG "${_IMPORT_PREFIX}/bin/OpenCL.dll" 13 | ) 14 | 15 | list(APPEND _cmake_import_check_targets OpenCL::OpenCL ) 16 | list(APPEND _cmake_import_check_files_for_OpenCL::OpenCL "${_IMPORT_PREFIX}/lib/OpenCL.lib" "${_IMPORT_PREFIX}/bin/OpenCL.dll" ) 17 | 18 | # Commands beyond this point should not need to know the version. 19 | set(CMAKE_IMPORT_FILE_VERSION) 20 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderTargets-release.cmake: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------- 2 | # Generated CMake target import file for configuration "Release". 3 | #---------------------------------------------------------------- 4 | 5 | # Commands may need to know the format version. 6 | set(CMAKE_IMPORT_FILE_VERSION 1) 7 | 8 | # Import target "OpenCL::OpenCL" for configuration "Release" 9 | set_property(TARGET OpenCL::OpenCL APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) 10 | set_target_properties(OpenCL::OpenCL PROPERTIES 11 | IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/OpenCL.lib" 12 | IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/OpenCL.dll" 13 | ) 14 | 15 | list(APPEND _cmake_import_check_targets OpenCL::OpenCL ) 16 | list(APPEND _cmake_import_check_files_for_OpenCL::OpenCL "${_IMPORT_PREFIX}/lib/OpenCL.lib" "${_IMPORT_PREFIX}/bin/OpenCL.dll" ) 17 | 18 | # Commands beyond this point should not need to know the version. 
19 | set(CMAKE_IMPORT_FILE_VERSION) 20 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsConfig.cmake: -------------------------------------------------------------------------------- 1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLUtilsTargets.cmake") -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsConfigVersion.cmake: -------------------------------------------------------------------------------- 1 | # This is a basic version file for the Config-mode of find_package(). 2 | # It is used by write_basic_package_version_file() as input file for configure_file() 3 | # to create a version-file which can be installed along a config.cmake file. 4 | # 5 | # The created file sets PACKAGE_VERSION_EXACT if the current version string and 6 | # the requested version string are exactly the same and it sets 7 | # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version. 8 | # The variable CVF_VERSION must be set before calling configure_file(). 
9 | 10 | set(PACKAGE_VERSION "2024.10.24") 11 | 12 | if (PACKAGE_FIND_VERSION_RANGE) 13 | # Package version must be in the requested version range 14 | if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN) 15 | OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX) 16 | OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX))) 17 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 18 | else() 19 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 20 | endif() 21 | else() 22 | if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION) 23 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 24 | else() 25 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 26 | if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) 27 | set(PACKAGE_VERSION_EXACT TRUE) 28 | endif() 29 | endif() 30 | endif() 31 | 32 | 33 | # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it: 34 | if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "") 35 | return() 36 | endif() 37 | 38 | # check that the installed version has the same 32/64bit-ness as the one which is currently searching: 39 | if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "") 40 | math(EXPR installedBits " * 8") 41 | set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)") 42 | set(PACKAGE_VERSION_UNSUITABLE TRUE) 43 | endif() 44 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsTargets-debug.cmake: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------- 2 | # Generated CMake target import file for configuration "Debug". 3 | #---------------------------------------------------------------- 4 | 5 | # Commands may need to know the format version. 
6 | set(CMAKE_IMPORT_FILE_VERSION 1) 7 | 8 | # Import target "OpenCL::Utils" for configuration "Debug" 9 | set_property(TARGET OpenCL::Utils APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) 10 | set_target_properties(OpenCL::Utils PROPERTIES 11 | IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG "C" 12 | IMPORTED_LOCATION_DEBUG "${_IMPORT_PREFIX}/lib/OpenCLUtilsd.lib" 13 | ) 14 | 15 | list(APPEND _cmake_import_check_targets OpenCL::Utils ) 16 | list(APPEND _cmake_import_check_files_for_OpenCL::Utils "${_IMPORT_PREFIX}/lib/OpenCLUtilsd.lib" ) 17 | 18 | # Commands beyond this point should not need to know the version. 19 | set(CMAKE_IMPORT_FILE_VERSION) 20 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsTargets-release.cmake: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------- 2 | # Generated CMake target import file for configuration "Release". 3 | #---------------------------------------------------------------- 4 | 5 | # Commands may need to know the format version. 6 | set(CMAKE_IMPORT_FILE_VERSION 1) 7 | 8 | # Import target "OpenCL::Utils" for configuration "Release" 9 | set_property(TARGET OpenCL::Utils APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) 10 | set_target_properties(OpenCL::Utils PROPERTIES 11 | IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "C" 12 | IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/OpenCLUtils.lib" 13 | ) 14 | 15 | list(APPEND _cmake_import_check_targets OpenCL::Utils ) 16 | list(APPEND _cmake_import_check_files_for_OpenCL::Utils "${_IMPORT_PREFIX}/lib/OpenCLUtils.lib" ) 17 | 18 | # Commands beyond this point should not need to know the version. 
19 | set(CMAKE_IMPORT_FILE_VERSION) 20 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppConfig.cmake: -------------------------------------------------------------------------------- 1 | include("${CMAKE_CURRENT_LIST_DIR}/OpenCLUtilsCppTargets.cmake") -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppConfigVersion.cmake: -------------------------------------------------------------------------------- 1 | # This is a basic version file for the Config-mode of find_package(). 2 | # It is used by write_basic_package_version_file() as input file for configure_file() 3 | # to create a version-file which can be installed along a config.cmake file. 4 | # 5 | # The created file sets PACKAGE_VERSION_EXACT if the current version string and 6 | # the requested version string are exactly the same and it sets 7 | # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version. 8 | # The variable CVF_VERSION must be set before calling configure_file(). 
9 | 10 | set(PACKAGE_VERSION "2024.10.24") 11 | 12 | if (PACKAGE_FIND_VERSION_RANGE) 13 | # Package version must be in the requested version range 14 | if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN) 15 | OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX) 16 | OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX))) 17 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 18 | else() 19 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 20 | endif() 21 | else() 22 | if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION) 23 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 24 | else() 25 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 26 | if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) 27 | set(PACKAGE_VERSION_EXACT TRUE) 28 | endif() 29 | endif() 30 | endif() 31 | 32 | 33 | # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it: 34 | if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "") 35 | return() 36 | endif() 37 | 38 | # check that the installed version has the same 32/64bit-ness as the one which is currently searching: 39 | if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "") 40 | math(EXPR installedBits " * 8") 41 | set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)") 42 | set(PACKAGE_VERSION_UNSUITABLE TRUE) 43 | endif() 44 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppTargets-debug.cmake: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------- 2 | # Generated CMake target import file for configuration "Debug". 3 | #---------------------------------------------------------------- 4 | 5 | # Commands may need to know the format version. 
6 | set(CMAKE_IMPORT_FILE_VERSION 1) 7 | 8 | # Import target "OpenCL::UtilsCpp" for configuration "Debug" 9 | set_property(TARGET OpenCL::UtilsCpp APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) 10 | set_target_properties(OpenCL::UtilsCpp PROPERTIES 11 | IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG "C;CXX" 12 | IMPORTED_LOCATION_DEBUG "${_IMPORT_PREFIX}/lib/OpenCLUtilsCppd.lib" 13 | ) 14 | 15 | list(APPEND _cmake_import_check_targets OpenCL::UtilsCpp ) 16 | list(APPEND _cmake_import_check_files_for_OpenCL::UtilsCpp "${_IMPORT_PREFIX}/lib/OpenCLUtilsCppd.lib" ) 17 | 18 | # Commands beyond this point should not need to know the version. 19 | set(CMAKE_IMPORT_FILE_VERSION) 20 | -------------------------------------------------------------------------------- /svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppTargets-release.cmake: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------- 2 | # Generated CMake target import file for configuration "Release". 3 | #---------------------------------------------------------------- 4 | 5 | # Commands may need to know the format version. 6 | set(CMAKE_IMPORT_FILE_VERSION 1) 7 | 8 | # Import target "OpenCL::UtilsCpp" for configuration "Release" 9 | set_property(TARGET OpenCL::UtilsCpp APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) 10 | set_target_properties(OpenCL::UtilsCpp PROPERTIES 11 | IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "C;CXX" 12 | IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/OpenCLUtilsCpp.lib" 13 | ) 14 | 15 | list(APPEND _cmake_import_check_targets OpenCL::UtilsCpp ) 16 | list(APPEND _cmake_import_check_files_for_OpenCL::UtilsCpp "${_IMPORT_PREFIX}/lib/OpenCLUtilsCpp.lib" ) 17 | 18 | # Commands beyond this point should not need to know the version. 
19 | set(CMAKE_IMPORT_FILE_VERSION) 20 | -------------------------------------------------------------------------------- /svm/OpenCL/share/man/man1/clinfo.1.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/Microbenchmarks/e8aaed570e6d7fea725f3d67313f5a51973d6fdf/svm/OpenCL/share/man/man1/clinfo.1.gz -------------------------------------------------------------------------------- /svm/OpenCL/share/pkgconfig/OpenCL-CLHPP.pc: -------------------------------------------------------------------------------- 1 | prefix=D:/a/OpenCL-SDK/OpenCL-SDK/install 2 | includedir=${prefix}/include 3 | 4 | Name: OpenCL-CLHPP 5 | Description: OpenCL API C++ bindings 6 | Requires: OpenCL-Headers 7 | Version: 3.0 8 | Cflags: -I${includedir} 9 | -------------------------------------------------------------------------------- /svm/OpenCL/share/pkgconfig/OpenCL-Headers.pc: -------------------------------------------------------------------------------- 1 | prefix=D:/a/OpenCL-SDK/OpenCL-SDK/install 2 | includedir=${prefix}/include 3 | 4 | Name: OpenCL-Headers 5 | Description: Khronos OpenCL Headers 6 | Version: 3.0 7 | Cflags: -I${includedir} 8 | -------------------------------------------------------------------------------- /svm/atomic_latency_kernel.cl: -------------------------------------------------------------------------------- 1 | /* Spin-waits on the shared word *A: for each odd value of current from 1 up to 2 * count - 1, retries atomic_cmpxchg until it swaps current - 1 for current, then advances current by 2. NOTE(review): presumably another agent (e.g. the host) writes the even values in between, and the kernel is launched with a single work-item - confirm against the host code. */ __kernel void atomic_exec_latency_test(__global int* A, int count) { 2 | int current = 1; 3 | while (current <= 2 * count) { 4 | /* atomic_cmpxchg returns the old value of *A, so equality with current - 1 means the swap took effect */ if (atomic_cmpxchg(A, current - 1, current) == current - 1) { 5 | /* swap succeeded: move on to the next odd value */ current += 2; 6 | // printf("gpu current = %d\n", current); 7 | } // else printf("A = %d wait for %d\n", *A, current - 1); 8 | } 9 | } 10 | 11 | /* Adds 1 to *A using a plain (non-atomic) load followed by a store. NOTE(review): presumably intended to run as a single work-item - confirm against the host code. */ __kernel void increment_on_gpu(__global int *A) 12 | { 13 | *A = *A + 1; 14 | } -------------------------------------------------------------------------------- /svm/svm.sln:
-------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.12.35527.113 d17.12 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "svm", "svm.vcxproj", "{411AB5E4-FD55-4478-83F2-80C51F205FA7}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x64.ActiveCfg = Debug|x64 17 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x64.Build.0 = Debug|x64 18 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x86.ActiveCfg = Debug|Win32 19 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x86.Build.0 = Debug|Win32 20 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x64.ActiveCfg = Release|x64 21 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x64.Build.0 = Release|x64 22 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x86.ActiveCfg = Release|Win32 23 | {411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /svm/svm.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | 
rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | --------------------------------------------------------------------------------