├── .github └── workflows │ └── release.yml ├── .gitignore ├── App.config ├── BandwidthRunner.cs ├── BenchmarkDll.c ├── BenchmarkDll.vcxproj ├── BenchmarkDll.vcxproj.filters ├── BenchmarkDllCommon.h ├── BenchmarkInteropFunctions.cs ├── BenchmarkSubmission.cs ├── BenchmarkSubmissionDialog.Designer.cs ├── BenchmarkSubmissionDialog.cs ├── BenchmarkSubmissionDialog.resx ├── GlobalTestSettings.cs ├── LatencyRunner.cs ├── MemoryBandwidthFunctions.asm ├── MemoryLatency.c ├── MemoryLatencyFunctions.asm ├── MicrobenchmarkForm.Designer.cs ├── MicrobenchmarkForm.cs ├── MicrobenchmarkForm.resx ├── MicrobenchmarkGui.csproj ├── MicrobenchmarkGui.sln ├── OpCode.cs ├── OpenCL ├── LICENSE ├── README.md ├── include │ └── CL │ │ ├── cl.h │ │ ├── cl_d3d10.h │ │ ├── cl_d3d11.h │ │ ├── cl_dx9_media_sharing.h │ │ ├── cl_dx9_media_sharing_intel.h │ │ ├── cl_egl.h │ │ ├── cl_ext.h │ │ ├── cl_ext_intel.h │ │ ├── cl_gl.h │ │ ├── cl_gl_ext.h │ │ ├── cl_half.h │ │ ├── cl_icd.h │ │ ├── cl_platform.h │ │ ├── cl_va_api_media_sharing_intel.h │ │ ├── cl_version.h │ │ └── opencl.h └── lib │ └── OpenCL.lib ├── OpenCLTest.cs ├── OpenClFunctions.c ├── Program.cs ├── Properties ├── AssemblyInfo.cs ├── Resources.Designer.cs ├── Resources.resx ├── Settings.Designer.cs └── Settings.settings ├── README.md ├── TestUtilities.cs ├── app.manifest ├── framework.h ├── img ├── guiscreenshot.png ├── guiscreenshot_latency.png └── lockpages.png ├── latencykernel.cl ├── latencykernel_tex.cl ├── packages.config ├── pch.c └── pch.h /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 7 | jobs: 8 | build: 9 | runs-on: windows-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Setup MSBuild 15 | uses: microsoft/setup-msbuild@v2 16 | 17 | - name: Get Latest NASM Version 18 | id: nasm-version 19 | run: | 20 | $nasmPage = Invoke-WebRequest -Uri 
"https://www.nasm.us" 21 | if ($nasmPage.Content -match 'releasebuilds/(\d+\.\d+\.\d+)/') { 22 | $version = $matches[1] 23 | echo "NASM_VERSION=$version" >> $env:GITHUB_ENV 24 | Write-Host "Latest NASM version: $version" 25 | } else { 26 | Write-Error "Could not determine latest NASM version" 27 | exit 1 28 | } 29 | 30 | - name: Setup NASM 31 | run: | 32 | $version = $env:NASM_VERSION 33 | $url = "https://www.nasm.us/pub/nasm/releasebuilds/$version/win64/nasm-$version-win64.zip" 34 | Write-Host "Downloading NASM from: $url" 35 | Invoke-WebRequest -Uri $url -OutFile nasm.zip 36 | Expand-Archive nasm.zip -DestinationPath . 37 | echo "$pwd\nasm-$version" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append 38 | 39 | - name: Build Solution 40 | run: | 41 | msbuild /p:Configuration=Release /p:Platform=x64 MicrobenchmarkGui.sln 42 | 43 | - name: Create Release ZIP 44 | run: | 45 | # Create temporary directory for ZIP contents 46 | New-Item -ItemType Directory -Path tmp 47 | 48 | # Copy only the required files (flat structure) 49 | Copy-Item "x64\Release\MicrobenchmarkGui.exe.config" tmp/ 50 | Copy-Item "x64\Release\MicrobenchmarkGui.exe" tmp/ 51 | Copy-Item "x64\Release\BenchmarkDll.dll" tmp/ 52 | Copy-Item "x64\Release\Newtonsoft.Json.dll" tmp/ 53 | 54 | # Create release name with short commit hash 55 | $commitHash = $env:GITHUB_SHA.Substring(0, 7) 56 | $releaseName = "MicrobenchmarkGui-$commitHash" 57 | 58 | # Create ZIP file 59 | Compress-Archive -Path "tmp\*" -DestinationPath "$releaseName.zip" 60 | 61 | # Store names for next step 62 | echo "RELEASE_NAME=$releaseName" >> $env:GITHUB_ENV 63 | echo "COMMIT_HASH=$commitHash" >> $env:GITHUB_ENV 64 | 65 | - name: Create Release 66 | id: create_release 67 | uses: softprops/action-gh-release@v2 68 | env: 69 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 70 | with: 71 | tag_name: ${{ env.COMMIT_HASH }} 72 | name: ${{ env.RELEASE_NAME }} 73 | body: | 74 | - ${{ env.COMMIT_HASH }} ${{ github.event.head_commit.message }} 
75 | - Built with NASM ${{ env.NASM_VERSION }} 76 | draft: false 77 | prerelease: false 78 | files: ${{ env.RELEASE_NAME }}.zip 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | *.swp 13 | *generatednasm* 14 | 15 | # User-specific files (MonoDevelop/Xamarin Studio) 16 | *.userprefs 17 | 18 | # Mono auto generated files 19 | mono_crash.* 20 | 21 | # Build results 22 | [Dd]ebug/ 23 | [Dd]ebugPublic/ 24 | [Rr]elease/ 25 | [Rr]eleases/ 26 | x64/ 27 | x86/ 28 | [Ww][Ii][Nn]32/ 29 | [Aa][Rr][Mm]/ 30 | [Aa][Rr][Mm]64/ 31 | bld/ 32 | [Bb]in/ 33 | [Oo]bj/ 34 | [Ll]og/ 35 | [Ll]ogs/ 36 | clammicrobench/*.asm 37 | 38 | # Visual Studio 2015/2017 cache/options directory 39 | .vs/ 40 | # Uncomment if you have tasks that create the project's static files in wwwroot 41 | #wwwroot/ 42 | 43 | # Visual Studio 2017 auto generated files 44 | Generated\ Files/ 45 | 46 | # MSTest test Results 47 | [Tt]est[Rr]esult*/ 48 | [Bb]uild[Ll]og.* 49 | 50 | # NUnit 51 | *.VisualState.xml 52 | TestResult.xml 53 | nunit-*.xml 54 | 55 | # Build Results of an ATL Project 56 | [Dd]ebugPS/ 57 | [Rr]eleasePS/ 58 | dlldata.c 59 | 60 | # Benchmark Results 61 | BenchmarkDotNet.Artifacts/ 62 | 63 | # .NET Core 64 | project.lock.json 65 | project.fragment.lock.json 66 | artifacts/ 67 | 68 | # ASP.NET Scaffolding 69 | ScaffoldingReadMe.txt 70 | 71 | # StyleCop 72 | StyleCopReport.xml 73 | 74 | # Files built by Visual Studio 75 | *_i.c 76 | *_p.c 77 | *_h.h 78 | *.ilk 79 | *.meta 80 | *.obj 81 | *.iobj 82 | *.pch 83 | *.pdb 84 | *.ipdb 85 | *.pgc 86 | *.pgd 
87 | *.rsp 88 | *.sbr 89 | *.tlb 90 | *.tli 91 | *.tlh 92 | *.tmp 93 | *.tmp_proj 94 | *_wpftmp.csproj 95 | *.log 96 | *.tlog 97 | *.vspscc 98 | *.vssscc 99 | .builds 100 | *.pidb 101 | *.svclog 102 | *.scc 103 | 104 | # Chutzpah Test files 105 | _Chutzpah* 106 | 107 | # Visual C++ cache files 108 | ipch/ 109 | *.aps 110 | *.ncb 111 | *.opendb 112 | *.opensdf 113 | *.sdf 114 | *.cachefile 115 | *.VC.db 116 | *.VC.VC.opendb 117 | 118 | # Visual Studio profiler 119 | *.psess 120 | *.vsp 121 | *.vspx 122 | *.sap 123 | 124 | # Visual Studio Trace Files 125 | *.e2e 126 | 127 | # TFS 2012 Local Workspace 128 | $tf/ 129 | 130 | # Guidance Automation Toolkit 131 | *.gpState 132 | 133 | # ReSharper is a .NET coding add-in 134 | _ReSharper*/ 135 | *.[Rr]e[Ss]harper 136 | *.DotSettings.user 137 | 138 | # TeamCity is a build add-in 139 | _TeamCity* 140 | 141 | # DotCover is a Code Coverage Tool 142 | *.dotCover 143 | 144 | # AxoCover is a Code Coverage Tool 145 | .axoCover/* 146 | !.axoCover/settings.json 147 | 148 | # Coverlet is a free, cross platform Code Coverage Tool 149 | coverage*.json 150 | coverage*.xml 151 | coverage*.info 152 | 153 | # Visual Studio code coverage results 154 | *.coverage 155 | *.coveragexml 156 | 157 | # NCrunch 158 | _NCrunch_* 159 | .*crunch*.local.xml 160 | nCrunchTemp_* 161 | 162 | # MightyMoose 163 | *.mm.* 164 | AutoTest.Net/ 165 | 166 | # Web workbench (sass) 167 | .sass-cache/ 168 | 169 | # Installshield output folder 170 | [Ee]xpress/ 171 | 172 | # DocProject is a documentation generator add-in 173 | DocProject/buildhelp/ 174 | DocProject/Help/*.HxT 175 | DocProject/Help/*.HxC 176 | DocProject/Help/*.hhc 177 | DocProject/Help/*.hhk 178 | DocProject/Help/*.hhp 179 | DocProject/Help/Html2 180 | DocProject/Help/html 181 | 182 | # Click-Once directory 183 | publish/ 184 | 185 | # Publish Web Output 186 | *.[Pp]ublish.xml 187 | *.azurePubxml 188 | # Note: Comment the next line if you want to checkin your web deploy settings, 189 | # but database 
connection strings (with potential passwords) will be unencrypted 190 | *.pubxml 191 | *.publishproj 192 | 193 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 194 | # checkin your Azure Web App publish settings, but sensitive information contained 195 | # in these scripts will be unencrypted 196 | PublishScripts/ 197 | 198 | # NuGet Packages 199 | *.nupkg 200 | # NuGet Symbol Packages 201 | *.snupkg 202 | # The packages folder can be ignored because of Package Restore 203 | **/[Pp]ackages/* 204 | # except build/, which is used as an MSBuild target. 205 | !**/[Pp]ackages/build/ 206 | # Uncomment if necessary however generally it will be regenerated when needed 207 | #!**/[Pp]ackages/repositories.config 208 | # NuGet v3's project.json files produces more ignorable files 209 | *.nuget.props 210 | *.nuget.targets 211 | 212 | # Nuget personal access tokens and Credentials 213 | nuget.config 214 | 215 | # Microsoft Azure Build Output 216 | csx/ 217 | *.build.csdef 218 | 219 | # Microsoft Azure Emulator 220 | ecf/ 221 | rcf/ 222 | 223 | # Windows Store app package directories and files 224 | AppPackages/ 225 | BundleArtifacts/ 226 | Package.StoreAssociation.xml 227 | _pkginfo.txt 228 | *.appx 229 | *.appxbundle 230 | *.appxupload 231 | 232 | # Visual Studio cache files 233 | # files ending in .cache can be ignored 234 | *.[Cc]ache 235 | # but keep track of directories ending in .cache 236 | !?*.[Cc]ache/ 237 | 238 | # Others 239 | ClientBin/ 240 | ~$* 241 | *~ 242 | *.dbmdl 243 | *.dbproj.schemaview 244 | *.jfm 245 | *.pfx 246 | *.publishsettings 247 | orleans.codegen.cs 248 | 249 | # Including strong name files can present a security risk 250 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 251 | #*.snk 252 | 253 | # Since there are multiple workflows, uncomment next line to ignore bower_components 254 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 255 | #bower_components/ 256 | 257 | # 
RIA/Silverlight projects 258 | Generated_Code/ 259 | 260 | # Backup & report files from converting an old project file 261 | # to a newer Visual Studio version. Backup files are not needed, 262 | # because we have git ;-) 263 | _UpgradeReport_Files/ 264 | Backup*/ 265 | UpgradeLog*.XML 266 | UpgradeLog*.htm 267 | ServiceFabricBackup/ 268 | *.rptproj.bak 269 | 270 | # SQL Server files 271 | *.mdf 272 | *.ldf 273 | *.ndf 274 | 275 | # Business Intelligence projects 276 | *.rdl.data 277 | *.bim.layout 278 | *.bim_*.settings 279 | *.rptproj.rsuser 280 | *- [Bb]ackup.rdl 281 | *- [Bb]ackup ([0-9]).rdl 282 | *- [Bb]ackup ([0-9][0-9]).rdl 283 | 284 | # Microsoft Fakes 285 | FakesAssemblies/ 286 | 287 | # GhostDoc plugin setting file 288 | *.GhostDoc.xml 289 | 290 | # Node.js Tools for Visual Studio 291 | .ntvs_analysis.dat 292 | node_modules/ 293 | 294 | # Visual Studio 6 build log 295 | *.plg 296 | 297 | # Visual Studio 6 workspace options file 298 | *.opt 299 | 300 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
301 | *.vbw 302 | 303 | # Visual Studio LightSwitch build output 304 | **/*.HTMLClient/GeneratedArtifacts 305 | **/*.DesktopClient/GeneratedArtifacts 306 | **/*.DesktopClient/ModelManifest.xml 307 | **/*.Server/GeneratedArtifacts 308 | **/*.Server/ModelManifest.xml 309 | _Pvt_Extensions 310 | 311 | # Paket dependency manager 312 | .paket/paket.exe 313 | paket-files/ 314 | 315 | # FAKE - F# Make 316 | .fake/ 317 | 318 | # CodeRush personal settings 319 | .cr/personal 320 | 321 | # Python Tools for Visual Studio (PTVS) 322 | __pycache__/ 323 | *.pyc 324 | 325 | # Cake - Uncomment if you are using it 326 | # tools/** 327 | # !tools/packages.config 328 | 329 | # Tabs Studio 330 | *.tss 331 | 332 | # Telerik's JustMock configuration file 333 | *.jmconfig 334 | 335 | # BizTalk build output 336 | *.btp.cs 337 | *.btm.cs 338 | *.odx.cs 339 | *.xsd.cs 340 | 341 | # OpenCover UI analysis results 342 | OpenCover/ 343 | 344 | # Azure Stream Analytics local run output 345 | ASALocalRun/ 346 | 347 | # MSBuild Binary and Structured Log 348 | *.binlog 349 | 350 | # NVidia Nsight GPU debugger configuration file 351 | *.nvuser 352 | 353 | # MFractors (Xamarin productivity tool) working folder 354 | .mfractor/ 355 | 356 | # Local History for Visual Studio 357 | .localhistory/ 358 | 359 | # BeatPulse healthcheck temp database 360 | healthchecksdb 361 | 362 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 363 | MigrationBackup/ 364 | 365 | # Ionide (cross platform F# VS Code tools) working folder 366 | .ionide/ 367 | 368 | # Fody - auto-generated XML schema 369 | FodyWeavers.xsd 370 | 371 | # VS Code files for those working on multiple tools 372 | .vscode/* 373 | !.vscode/settings.json 374 | !.vscode/tasks.json 375 | !.vscode/launch.json 376 | !.vscode/extensions.json 377 | *.code-workspace 378 | 379 | # Local History for Visual Studio Code 380 | .history/ 381 | 382 | # Windows Installer files from build outputs 383 | *.cab 384 | *.msi 385 | *.msix 386 | *.msm 
387 | *.msp 388 | 389 | # JetBrains Rider 390 | .idea/ 391 | *.sln.iml 392 | -------------------------------------------------------------------------------- /App.config: -------------------------------------------------------------------------------- 1 | ﻿ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /BandwidthRunner.cs: --------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Windows.Forms.DataVisualization.Charting;

namespace MicrobenchmarkGui
{
    /// <summary>
    /// Runs the native bandwidth benchmark (BenchmarkInteropFunctions.MeasureBw) over a list of
    /// test sizes and pushes progress and results to the UI through the delegates supplied by
    /// the form. The Start/Run methods block and are meant to be called from a background thread.
    /// </summary>
    // NOTE(review): generic type arguments in this file were lost in extraction and have been
    // reconstructed from usage (Tuple<float, float>, List<float>). Confirm against the callers.
    public class BandwidthRunner
    {
        // Test sizes in KB.
        // NOTE(review): 24567 looks like a typo for 24576 (24 MB); left unchanged so existing
        // result sets stay comparable. Confirm before changing.
        public uint[] testSizes = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048,
                                    3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304,
                                    131072, 262144, 393216, 524288, 1048576, 1572864, 2097152, 3145728 };

        /// <summary>
        /// True while StartFullTest or RunSingleTest is executing.
        /// </summary>
        public bool running = false;

        /// <summary>
        /// Test type to run, for automated test
        /// </summary>
        public BenchmarkInteropFunctions.TestType testType;

        /// <summary>
        /// Results of every full run so far, keyed by run label ("{threads}T {testType}").
        /// Each entry is a list of (test size in KB, bandwidth in GB/s) points.
        /// </summary>
        public Dictionary<string, List<Tuple<float, float>>> RunResults;

        /// <summary>
        /// Last run results as list view rows: [row][0] = size, [row][1] = bandwidth text
        /// </summary>
        public string[][] formattedResults;

        /// <summary>
        /// List of test results from last run
        /// </summary>
        public List<float> testResultsList;

        /// <summary>
        /// List of tested points from last run
        /// </summary>
        public List<float> floatTestPoints;

        private ListView resultListView;
        private Chart resultChart;
        private MicrobenchmarkForm.SafeSetResultListView setListViewDelegate;
        private MicrobenchmarkForm.SafeSetResultListViewColumns setListViewColumnsDelegate;
        private MicrobenchmarkForm.SafeSetResultsChart setChartDelegate;
        private MicrobenchmarkForm.SafeSetProgressLabel setProgressLabelDelegate;
        private Label progressLabel;
        private string[] bwCols = { "Data Size", "Bandwidth" };

        /// <summary>
        /// Stores the UI controls and the delegates used to update them from the worker thread.
        /// </summary>
        public BandwidthRunner(MicrobenchmarkForm.SafeSetResultListView setListViewDelegate,
            MicrobenchmarkForm.SafeSetResultListViewColumns setListViewColsDelegate,
            MicrobenchmarkForm.SafeSetResultsChart setChartDelegate,
            MicrobenchmarkForm.SafeSetProgressLabel setLabelDelegate,
            ListView resultListView,
            Chart resultChart,
            Label progressLabel)
        {
            this.setListViewColumnsDelegate = setListViewColsDelegate;
            this.setListViewDelegate = setListViewDelegate;
            this.setChartDelegate = setChartDelegate;
            this.setProgressLabelDelegate = setLabelDelegate;
            this.resultListView = resultListView;
            this.resultChart = resultChart;
            this.progressLabel = progressLabel;

            this.RunResults = new Dictionary<string, List<Tuple<float, float>>>();
        }

        /// <summary>
        /// Picks a starting iteration count so that roughly dataGb of data is moved,
        /// scaled down for larger test sizes.
        /// </summary>
        /// <param name="testSize">Test size in KB</param>
        /// <param name="dataGb">Data to move, in GB, before size-based scaling</param>
        /// <returns>An even iteration count, never below 4</returns>
        private uint GetIterationCount(uint testSize, uint dataGb)
        {
            const uint minIterations = 4; // Set a minimum to reduce noise

            // A zero size would otherwise divide by zero below
            if (testSize == 0) return minIterations;

            uint gbToTransfer = dataGb;
            if (testSize > 64) gbToTransfer = dataGb / 2;
            if (testSize > 512) gbToTransfer = dataGb / 4;
            if (testSize > 8192) gbToTransfer = dataGb / 8;
            uint iterations = gbToTransfer * 1024 * 1024 / testSize;
            if (iterations % 2 != 0) iterations += 1;

            return iterations < minIterations ? minIterations : iterations;
        }

        /// <summary>
        /// Run through test sizes, meant to be run in a background thread.
        /// Each size is repeated with a rescaled iteration count until a run takes at least
        /// one second. Results are published to the list view and chart as they arrive and
        /// stored in RunResults under the run label when the sweep ends.
        /// </summary>
        /// <param name="threads">Number of threads passed to the native benchmark</param>
        /// <param name="shared">True to have all threads work on one shared array</param>
        /// <param name="testType">Access pattern / instruction set to test</param>
        /// <param name="runCancel">Checked before each test size; stops the sweep when set</param>
        public void StartFullTest(uint threads, bool shared, BenchmarkInteropFunctions.TestType testType, CancellationToken runCancel)
        {
            running = true;
            try
            {
                string testLabel = threads + "T " + testType.ToString();
                List<Tuple<float, float>> currentRunResults = new List<Tuple<float, float>>();
                testResultsList = new List<float>();
                floatTestPoints = new List<float>();
                resultListView.Invoke(setListViewColumnsDelegate, new object[] { bwCols });
                float[] testResults = new float[testSizes.Length];
                formattedResults = new string[testSizes.Length][];

                for (uint i = 0; i < testSizes.Length; i++)
                {
                    testResults[i] = 0;
                    formattedResults[i] = new string[2];
                    formattedResults[i][0] = string.Format("{0} KB", testSizes[i]);
                    formattedResults[i][1] = "Not Run";
                }

                resultListView.Invoke(setListViewDelegate, new object[] { formattedResults });

                float lastTimeMs = 0;
                for (uint testIdx = 0; testIdx < testSizes.Length; testIdx++)
                {
                    if (runCancel.IsCancellationRequested)
                    {
                        break;
                    }

                    uint testSize = testSizes[testIdx];
                    ulong currentIterations = GetIterationCount(testSize, 32);
                    float targetTimeMs = 3000, minTimeMs = 1000, result;

                    if (GlobalTestSettings.MinTestSizeKb != 0 && GlobalTestSettings.MinTestSizeKb > testSize) continue;

                    Stopwatch debugStopwatch = new Stopwatch();

                    do
                    {
                        float dataTransferredGb = (float)((currentIterations * testSize * 1024.0 * (shared ? threads : 1)) / 1e9);
                        string progressMessage = string.Format("Testing bandwidth over {0} KB, {1}K iterations = {2:F2} GB, last run = {3:F2} ms", testSize, currentIterations / 1000, dataTransferredGb, lastTimeMs);
                        progressLabel.Invoke(setProgressLabelDelegate, new object[] { progressMessage });

                        debugStopwatch.Restart();
                        result = BenchmarkInteropFunctions.MeasureBw(testSize, currentIterations, threads, shared ? 1 : 0, testType);
                        debugStopwatch.Stop();

                        // Zero or negative values are "not run"/error codes from the native side,
                        // not bandwidths. Deriving a run time from them yields infinity or a
                        // negative time, and a negative time would keep this loop re-running.
                        if (result <= 0) break;

                        lastTimeMs = 1000 * dataTransferredGb / result;
                        currentIterations = TestUtilities.ScaleIterations(currentIterations, targetTimeMs, lastTimeMs);
                        Console.WriteLine("Reported {0:F2} ms, sw {1} ms. Next Iteration Count: {2}", lastTimeMs, debugStopwatch.ElapsedMilliseconds, currentIterations);
                    } while (lastTimeMs < minTimeMs);

                    bool validResult = result > 0;
                    testResults[testIdx] = result;
                    formattedResults[testIdx][1] = validResult ? string.Format("{0:F2} GB/s", result) : "N/A";
                    resultListView.Invoke(setListViewDelegate, new object[] { formattedResults });

                    if (validResult)
                    {
                        floatTestPoints.Add(testSize);
                        testResultsList.Add(result);
                        currentRunResults.Add(new Tuple<float, float>(testSize, result));
                        resultChart.Invoke(setChartDelegate, new object[] { testLabel, floatTestPoints.ToArray(), testResultsList.ToArray(), MicrobenchmarkForm.ResultChartType.CpuMemoryBandwidth });
                    }
                }

                progressLabel.Invoke(setProgressLabelDelegate, new object[] { "Run finished" });

                // Indexer instead of Add: repeating a run with the same thread count and test
                // type reuses the label, and Add would throw on the duplicate key.
                RunResults[testLabel] = currentRunResults;
            }
            finally
            {
                // Always clear the flag, even if a UI invoke or the native call throws
                running = false;
            }
        }

        /// <summary>
        /// Run a single test size, meant to be run in a background thread.
        /// Shows the one result in the list view; RunResults and the chart are not touched.
        /// </summary>
        /// <param name="sizeKb">Test size in KB</param>
        /// <param name="threads">Number of threads passed to the native benchmark</param>
        /// <param name="shared">True to have all threads work on one shared array</param>
        /// <param name="testType">Access pattern / instruction set to test</param>
        public void RunSingleTest(uint sizeKb, uint threads, bool shared, BenchmarkInteropFunctions.TestType testType)
        {
            running = true;
            try
            {
                float result = BenchmarkInteropFunctions.MeasureBw(sizeKb, GetIterationCount(sizeKb, 512), threads, shared ? 1 : 0, testType);
                resultListView.Invoke(setListViewColumnsDelegate, new object[] { bwCols });
                string[][] singleResult = new string[1][];
                singleResult[0] = new string[2];
                singleResult[0][0] = sizeKb + " KB";
                singleResult[0][1] = result + " GB/s";
                resultListView.Invoke(setListViewDelegate, new object[] { singleResult });
            }
            finally
            {
                // Previously never reset, which left the runner looking busy after a single test
                running = false;
            }
        }

        /// <summary>
        /// Returns the current test sizes (KB) as a comma separated string.
        /// </summary>
        public string GetTestSizesAsString()
        {
            return string.Join(",", testSizes);
        }

        /// <summary>
        /// Replaces the test sizes with the comma separated KB values in input.
        /// Shouldn't be called when test is running, but UI will take care of that.
        /// uint.Parse exceptions propagate to the caller; testSizes is only replaced
        /// once every entry has parsed.
        /// </summary>
        public void SetTestSizes(string input)
        {
            string[] inputArr = input.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            uint[] newTestSizes = new uint[inputArr.Length];
            for (uint i = 0; i < inputArr.Length; i++)
            {
                newTestSizes[i] = uint.Parse(inputArr[i]);
            }

            testSizes = newTestSizes;
        }
    }
}
-------------------------------------------------------------------------------- /BenchmarkDll.c: -------------------------------------------------------------------------------- 1 | #include "pch.h" 2 | #include "BenchmarkDllCommon.h" 3 | 4 | extern float mmx_asm_read(void* arr, uint64_t arr_length, uint64_t iterations); 5 | extern float mmx_asm_write(void* arr, uint64_t arr_length, uint64_t iterations); 6 | extern float mmx_asm_ntwrite(void* arr, uint64_t arr_length, uint64_t iterations); 7 | extern float sse_asm_read(void* arr, uint64_t arr_length, uint64_t iterations); 8 | extern float sse_asm_write(void* arr, uint64_t arr_length, uint64_t iterations); 9 | extern float sse_asm_ntwrite(void* arr, uint64_t arr_length, uint64_t iterations); 10 | extern float sse_asm_ntread(void* arr, uint64_t arr_length, uint64_t iterations); 11 | extern float sse_asm_copy(void* arr, uint64_t arr_length, uint64_t iterations); 12 | extern float sse_asm_add(void* arr, uint64_t arr_length, uint64_t iterations); 13 | extern float 
avx_asm_read(void* arr, uint64_t arr_length, uint64_t iterations); 14 | extern float avx_asm_write(void* arr, uint64_t arr_length, uint64_t iterations); 15 | extern float avx_asm_ntwrite(void* arr, uint64_t arr_length, uint64_t iterations); 16 | extern float avx_asm_copy(void* arr, uint64_t arr_length, uint64_t iterations); 17 | extern float avx_asm_cflip(void* arr, uint64_t arr_length, uint64_t iterations); 18 | extern float avx_asm_add(void* arr, uint64_t arr_length, uint64_t iterations); 19 | extern float avx512_asm_read(void* arr, uint64_t arr_length, uint64_t iterations); 20 | extern float avx512_asm_write(void* arr, uint64_t arr_length, uint64_t iterations); 21 | extern float avx512_asm_ntwrite(void* arr, uint64_t arr_length, uint64_t iterations); 22 | extern float avx512_asm_add(void* arr, uint64_t arr_length, uint64_t iterations); 23 | extern float repmovsb_copy(void* arr, uint64_t arr_length, uint64_t iterations); 24 | extern float repstosb_write(void* arr, uint64_t arr_length, uint64_t iterations); 25 | extern float repmovsd_copy(void* arr, uint64_t arr_length, uint64_t iterations); 26 | extern float repstosd_write(void* arr, uint64_t arr_length, uint64_t iterations); 27 | float (*bw_func)(void*, uint64_t, uint64_t) = sse_asm_read; 28 | 29 | float __fastcall instr_read(void* arr, uint64_t arr_length, uint64_t iterations); 30 | 31 | BOOL APIENTRY DllMain( HMODULE hModule, 32 | DWORD ul_reason_for_call, 33 | LPVOID lpReserved 34 | ) 35 | { 36 | switch (ul_reason_for_call) 37 | { 38 | case DLL_PROCESS_ATTACH: 39 | case DLL_THREAD_ATTACH: 40 | case DLL_THREAD_DETACH: 41 | case DLL_PROCESS_DETACH: 42 | break; 43 | } 44 | return TRUE; 45 | } 46 | 47 | // Does thing work? 
48 | __declspec(dllexport) float __stdcall test(int size); 49 | float __stdcall test(int size) 50 | { 51 | return (float)size + 0.1f; 52 | } 53 | 54 | __declspec(dllexport) int __stdcall CheckAvxSupport(); 55 | int __stdcall CheckAvxSupport() 56 | { 57 | int cpuid_data[4]; 58 | __cpuidex(cpuid_data, 1, 0); 59 | if (cpuid_data[2] & (1UL << 28)) return 1; 60 | return 0; 61 | } 62 | 63 | __declspec(dllexport) int __stdcall CheckAvx512Support(); 64 | int __stdcall CheckAvx512Support() 65 | { 66 | int cpuid_data[4]; 67 | __cpuidex(cpuid_data, 7, 0); 68 | if (cpuid_data[1] & (1UL << 16)) return 1; 69 | return 0; 70 | } 71 | 72 | enum TestType { 73 | None = 0, 74 | SseRead = 1, 75 | SseWrite = 2, 76 | SseCopy = 3, 77 | SseAdd = 4, 78 | AvxRead = 5, 79 | AvxWrite = 6, 80 | AvxCopy = 7, 81 | AvxCflip = 8, 82 | AvxAdd = 9, 83 | Avx512Read = 10, 84 | Avx512Write = 11, 85 | Avx512Add = 12, 86 | Instr4 = 13, 87 | Instr8 = 14, 88 | K8Instr4 = 15, 89 | Branch16 = 16, 90 | MmxRead = 17, 91 | MmxWrite = 18, 92 | MmxNtWrite = 19, 93 | SseNtWrite = 20, 94 | AvxNtWrite = 21, 95 | Avx512NtWrite = 22, 96 | SseNtRead = 23, 97 | RepMovsb = 24, 98 | RepStosb = 25, 99 | RepMovsd = 26, 100 | RepStosd = 27 101 | }; 102 | 103 | typedef struct BandwidthTestThreadData { 104 | uint32_t iterations; 105 | uint32_t arr_length; 106 | float* arr; 107 | float bw; // written to by the thread 108 | } BandwidthTestThreadData; 109 | 110 | DWORD WINAPI ReadBandwidthTestThread(LPVOID param) { 111 | BandwidthTestThreadData* bwTestData = (BandwidthTestThreadData*)param; 112 | float sum = bw_func(bwTestData->arr, bwTestData->arr_length, bwTestData->iterations); 113 | if (sum == 0) return 1; 114 | return 0; 115 | } 116 | 117 | void FillInstructionArray(uint64_t* arr, uint64_t sizeKb, enum TestType nopSize) 118 | { 119 | char nop8b[8] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; 120 | 121 | // zen/piledriver optimization manual uses this pattern 122 | char nop4b[8] = { 0x0F, 0x1F, 0x40, 0x00, 0x0F, 0x1F, 
0x40, 0x00 }; 123 | 124 | // athlon64 (K8) optimization manual pattern 125 | char k8_nop4b[8] = { 0x66, 0x66, 0x66, 0x90, 0x66, 0x66, 0x66, 0x90 }; 126 | 127 | uint64_t elements = (sizeKb * 1024 / 8) - 1; // leave room for ret 128 | unsigned char* functionEnd = (unsigned char*)(arr + elements); 129 | 130 | if (nopSize != Branch16) { 131 | uint64_t* nopPtr; 132 | if (nopSize == Instr8) nopPtr = (uint64_t*)(nop8b); 133 | else if (nopSize == Instr4) nopPtr = (uint64_t*)(nop4b); 134 | else if (nopSize == K8Instr4) nopPtr = (uint64_t*)(k8_nop4b); 135 | else { 136 | return; 137 | } 138 | 139 | for (uint64_t nopIdx = 0; nopIdx < elements; nopIdx++) { 140 | arr[nopIdx] = *nopPtr; 141 | } 142 | 143 | functionEnd[0] = 0xC3; 144 | } 145 | else { 146 | // jump forward 14 bytes 147 | char branch16b[8] = { 0xEB, 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 148 | char ret8b[8] = { 0xC3, 0, 0, 0, 0, 0, 0, 0 }; 149 | uint64_t* branchPtr = (uint64_t*)(branch16b); 150 | uint64_t* nopPtr = (uint64_t*)(nop8b); // doesn't really matter, we should never hit this 151 | 152 | // last iteration must have nopIdx % 2 == 1, so the jump will go to the return statement 153 | // i.e. branchElements for loop must be even, so the last iteration is odd 154 | uint64_t branchElements = elements % 2 == 0 ? elements : elements - 1; 155 | uint64_t nopIdx; 156 | for (nopIdx = 0; nopIdx < branchElements; nopIdx++) { 157 | arr[nopIdx] = nopIdx % 2 == 0 ? 
*branchPtr : *nopPtr; 158 | } 159 | 160 | arr[nopIdx] = *(uint64_t*)ret8b; 161 | } 162 | } 163 | __declspec(dllexport) float __stdcall MeasureBw(uint32_t sizeKb, uint64_t iterations, uint32_t threads, int shared, enum TestType mode); 164 | 165 | float __stdcall MeasureBw(uint32_t sizeKb, uint64_t iterations, uint32_t threads, int shared, enum TestType mode) { 166 | struct timeb start, end; 167 | float bw = 0; 168 | uint32_t elements = sizeKb * 1024 / sizeof(float); 169 | uint32_t private_elements = (uint32_t)ceil((double)sizeKb / (double)threads) * 256; 170 | DWORD protection_flags = PAGE_EXECUTE_READWRITE; 171 | if (!shared) elements = private_elements; 172 | if (!shared && sizeKb < threads) { 173 | //fprintf(stderr, "Too many threads for this size, continuing\n"); 174 | return 0; 175 | } 176 | 177 | if (mode == None) 178 | { 179 | // need to auto detect later 180 | bw_func = sse_asm_read; // guaranteed to work 181 | } 182 | else if (mode == AvxRead) { bw_func = avx_asm_read; } 183 | else if (mode == AvxWrite) { bw_func = avx_asm_write; } 184 | else if (mode == AvxAdd) { bw_func = avx_asm_add; } 185 | else if (mode == AvxCflip) { bw_func = avx_asm_cflip; } 186 | else if (mode == AvxCopy) { bw_func = avx_asm_copy; } 187 | else if (mode == SseRead) { bw_func = sse_asm_read; } 188 | else if (mode == SseWrite) { bw_func = sse_asm_write; } 189 | else if (mode == SseAdd) { bw_func = sse_asm_add; } 190 | else if (mode == SseCopy) { bw_func = sse_asm_copy; } 191 | else if (mode == Avx512Read) { bw_func = avx512_asm_read; } 192 | else if (mode == Avx512Write) { bw_func = avx512_asm_write; } 193 | else if (mode == Avx512Add) { bw_func = avx512_asm_add; } 194 | else if (mode == MmxRead) { bw_func = mmx_asm_read; } 195 | else if (mode == MmxWrite) { bw_func = mmx_asm_write; } 196 | else if (mode == MmxNtWrite) { bw_func = mmx_asm_ntwrite; } 197 | else if (mode == SseNtWrite) { bw_func = sse_asm_ntwrite; } 198 | else if (mode == AvxNtWrite) { bw_func = avx_asm_ntwrite; } 199 | 
else if (mode == Avx512NtWrite) { bw_func = avx512_asm_ntwrite; } 200 | else if (mode == SseNtRead) { bw_func = sse_asm_ntread; } 201 | else if (mode == Instr4 || mode == Instr8 || mode == K8Instr4 || mode == Branch16) 202 | { 203 | bw_func = instr_read; 204 | } 205 | else if (mode == RepMovsb) { bw_func = repmovsb_copy; } 206 | else if (mode == RepStosb) { bw_func = repstosb_write; } 207 | else if (mode == RepMovsd) { bw_func = repmovsd_copy; } 208 | else if (mode == RepStosd) { bw_func = repstosd_write; } 209 | else 210 | { 211 | return -3; 212 | } 213 | 214 | // make array and fill it with something 215 | float* testArr = NULL; 216 | if (shared) { 217 | testArr = (float*)VirtualAlloc(NULL, elements * sizeof(float), MEM_COMMIT | MEM_RESERVE, protection_flags); 218 | if (testArr == NULL) { 219 | return 15; 220 | } 221 | 222 | if (mode != None) 223 | { 224 | FillInstructionArray((uint64_t*)testArr, sizeKb, mode); 225 | } 226 | else { 227 | for (uint32_t i = 0; i < elements; i++) { 228 | testArr[i] = i + 0.5f; 229 | } 230 | } 231 | } 232 | 233 | HANDLE* testThreads = (HANDLE*)malloc(threads * sizeof(HANDLE)); 234 | DWORD* tids = (DWORD*)malloc(threads * sizeof(DWORD)); 235 | struct BandwidthTestThreadData* threadData = (struct BandwidthTestThreadData*)malloc(threads * sizeof(struct BandwidthTestThreadData)); 236 | 237 | for (uint64_t i = 0; i < threads; i++) { 238 | if (shared) { 239 | threadData[i].arr = testArr; 240 | } 241 | else { 242 | threadData[i].arr = (float*)VirtualAlloc(NULL, elements * sizeof(float), MEM_COMMIT | MEM_RESERVE, protection_flags); 243 | if (threadData[i].arr == NULL) { 244 | return 0; 245 | } 246 | 247 | if (mode != None) 248 | { 249 | FillInstructionArray((uint64_t*)threadData[i].arr, (elements * 4) / 1024, mode); 250 | } 251 | else 252 | { 253 | for (uint64_t arr_idx = 0; arr_idx < elements; arr_idx++) { 254 | threadData[i].arr[arr_idx] = arr_idx + i + 0.5f; 255 | } 256 | } 257 | } 258 | 259 | threadData[i].arr_length = elements; 260 | 
threadData[i].bw = 0; 261 | threadData[i].iterations = iterations; 262 | testThreads[i] = CreateThread(NULL, 0, ReadBandwidthTestThread, threadData + i, CREATE_SUSPENDED, tids + i); 263 | 264 | // turns out setting affinity makes no difference, and it's easier to set affinity via start /affinity anyway 265 | //SetThreadAffinityMask(testThreads[i], 1UL << i); 266 | } 267 | 268 | ftime(&start); 269 | for (uint32_t i = 0; i < threads; i++) ResumeThread(testThreads[i]); 270 | WaitForMultipleObjects((DWORD)threads, testThreads, TRUE, INFINITE); 271 | ftime(&end); 272 | 273 | int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm); 274 | double gbTransferred = (uint64_t)iterations * sizeof(float) * elements * threads / (double)1e9; 275 | bw = (float)(1000 * gbTransferred / (double)time_diff_ms); 276 | 277 | free(testThreads); 278 | if (shared) VirtualFree(testArr, elements * sizeof(float), MEM_RELEASE); 279 | free(tids); 280 | 281 | if (!shared) { 282 | for (uint32_t i = 0; i < threads; i++) { 283 | VirtualFreeEx(GetCurrentProcess(), threadData[i].arr, 0, MEM_RELEASE); 284 | } 285 | } 286 | 287 | free(threadData); 288 | return bw; 289 | } 290 | 291 | /// 292 | /// Bandwidth measuring function for instruction-side BW. Simply jumps into the 293 | /// array its given. So that array better be filled with valid instructions, with a 294 | /// return at the end. 
/// <summary>
/// Instruction-side bandwidth kernel: treats arr as machine code and calls it
/// repeatedly. The buffer must contain valid instructions ending in a return.
/// </summary>
/// <param name="arr">Array containing instructions, terminated with a return</param>
/// <param name="arr_length">Length of arr in bytes; unused, since arr must be ret-terminated</param>
/// <param name="iterations">How many times to run the nop function (arr)</param>
/// <returns>The number of calls made, as a float. Not a bandwidth figure.</returns>
float __fastcall instr_read(void* arr, uint64_t arr_length, uint64_t iterations)
{
    void (*nopfunc)(uint64_t) = (void(*)(uint64_t))arr;

    // The counter must be as wide as 'iterations'. With an int counter the
    // comparison promotes it to uint64_t and the increment overflows (undefined
    // behaviour) once more than INT_MAX iterations are requested.
    uint64_t iterIdx;
    for (iterIdx = 0; iterIdx < iterations; iterIdx++) nopfunc(iterations);
    return (float)iterIdx;
}
_CRT_SECURE_NO_WARNINGS;_DEBUG;BENCHMARKDLL_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 124 | true 125 | Use 126 | pch.h 127 | $(SolutionDir)\OpenCL\include;%(AdditionalIncludeDirectories) 128 | 129 | 130 | Windows 131 | true 132 | false 133 | $(SolutionDir)\OpenCL\lib;%(AdditionalLibraryDirectories) 134 | OpenCL.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 135 | 136 | 137 | 138 | 139 | Level3 140 | true 141 | true 142 | true 143 | _CRT_SECURE_NO_WARNINGS;NDEBUG;BENCHMARKDLL_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 144 | true 145 | Use 146 | pch.h 147 | $(SolutionDir)\OpenCL\include;%(AdditionalIncludeDirectories) 148 | 149 | 150 | Windows 151 | true 152 | true 153 | true 154 | false 155 | $(SolutionDir)\OpenCL\lib;%(AdditionalLibraryDirectories) 156 | kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;OpenCL.lib;%(AdditionalDependencies) 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | Create 170 | Create 171 | Create 172 | Create 173 | 174 | 175 | 176 | 177 | Document 178 | nasm -f win64 MemoryBandwidthFunctions.asm 179 | Running NASM 180 | MemoryBandwidthFunctions.obj 181 | nasm -f win64 MemoryBandwidthFunctions.asm 182 | Running NASM 183 | MemoryBandwidthFunctions.obj 184 | 185 | 186 | 187 | 188 | false 189 | Document 190 | nasm -f win64 MemoryLatencyFunctions.asm 191 | Building Memory Latency functions 192 | MemoryLatencyFunctions.obj 193 | nasm -f win64 MemoryLatencyFunctions.asm 194 | MemoryLatencyFunctions.obj 195 | 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /BenchmarkDll.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | 
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;

namespace MicrobenchmarkGui
{
    /// <summary>
    /// P/Invoke declarations for the functions imported from BenchmarkDll.dll.
    /// </summary>
    public static class BenchmarkInteropFunctions
    {
        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern float test(int size);

        // must be kept in sync with the one in bandwidth.c
        public enum TestType
        {
            None = 0,
            SseRead = 1,
            SseWrite = 2,
            SseCopy = 3,
            SseAdd = 4,
            AvxRead = 5,
            AvxWrite = 6,
            AvxCopy = 7,
            AvxCflip = 8,
            AvxAdd = 9,
            Avx512Read = 10,
            Avx512Write = 11,
            Avx512Add = 12,
            Instr4 = 13,
            Instr8 = 14,
            K8Instr4 = 15,
            Branch16 = 16,
            MmxRead = 17,
            MmxWrite = 18,
            MmxNtWrite = 19,
            SseNtWrite = 20,
            AvxNtWrite = 21,
            Avx512NtWrite = 22,
            SseNtRead = 23,
            RepMovsb = 24,
            RepStosb = 25,
            RepMovsd = 26,
            RepStosd = 27
        };

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern float MeasureBw(uint sizeKb, ulong iterations, uint threads, int shared, TestType testType);

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern int CheckAvxSupport();

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern int CheckAvx512Support();

        /// <summary>
        /// Enables or disables large pages for the latency tests.
        /// </summary>
        /// <param name="enable">Size in bytes of the large-page test array to allocate; 0 disables large pages</param>
        /// <returns>0 on success, non-zero on failure</returns>
        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern int SetLargePages(uint enable);

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern float RunLatencyTest(uint sizeKb, ulong iterations);

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern float RunAsmLatencyTest(uint sizeKb, ulong iterations);

        // OpenCL related functions
        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern int SetOpenCLContext(int platformIndex, int deviceIndex);

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern int GetPlatformCount();

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern int GetDeviceCount(int platformIndex);

        /// <summary>
        /// Gets an OpenCL device's name
        /// </summary>
        /// <param name="platformIndex">Platform index</param>
        /// <param name="deviceIndex">Device index</param>
        /// <param name="deviceNamePtr">Pointer to block of memory to put the device name into</param>
        /// <param name="maxDeviceNameLen">Max length of device name (size of memory block above). Includes terminating null</param>
        /// <returns>0 on success, opencl error code on failure</returns>
        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.StdCall)]
        public static extern int GetDeviceName(int platformIndex, int deviceIndex, IntPtr deviceNamePtr, int maxDeviceNameLen);

        /// <summary>
        /// Gets an OpenCL platform's name
        /// </summary>
        /// <param name="platformIndex">Platform index</param>
        /// <param name="platformNamePtr">Pointer to block of memory to put the name into</param>
        /// <param name="maxPlatformNameLen">Max name length, includes terminating null</param>
        /// <returns>0 on success, error code on fail</returns>
        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.StdCall)]
        public static extern int GetPlatformName(int platformIndex, IntPtr platformNamePtr, int maxPlatformNameLen);

        // keep in sync with the one in OpenCLFunctions.c
        public enum CLTestType
        {
            None = 0,
            GlobalScalar = 1,
            GlobalVector = 2,
            ConstantScalar = 3,
            Texture = 4,
            Local = 5,
            LinkBw = 6
        };

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern float RunCLLatencyTest(uint sizeKb, uint iterations, CLTestType testType, int tlb);

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern float RunCLLinkBwTest(uint sizeKb, uint iterations, int cpuToGpu);

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern int InitializeLatencyTest(CLTestType testType);

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern int DeinitializeLatencyTest();

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern ulong GetDeviceMaxConstantBufferSize();

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern ulong GetDeviceMaxBufferSize();

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern ulong GetDeviceMaxTextureSize();

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern void SetGpuPtrChasingStride(uint stride);

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern uint GetGpuPtrChasingStride();

        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern void SetGpuEstimatedPageSize(uint pageSize);

        // Declared as returning uint to mirror GetGpuPtrChasingStride and the uint
        // taken by SetGpuEstimatedPageSize; a void getter could never hand its value
        // back. NOTE(review): the native signature is not visible from this file;
        // confirm against OpenClFunctions.c that it returns a 32-bit unsigned value.
        [DllImport(@"BenchmarkDll.dll", CharSet = CharSet.Auto, CallingConvention = CallingConvention.StdCall)]
        public static extern uint GetGpuEstimatedPageSize();
    }

    /// <summary>
    /// Data object describing one benchmark run for submission: test name, system
    /// description fields, free-form notes and the measured results.
    /// </summary>
    public class BenchmarkSubmission
    {
        public string TestName { get; set; }
        public string CpuName { get; set; }
        public string MotherboardName { get; set; }
        public string MemoryConfig { get; set; }
        public string Notes { get; set; }

        /// <summary>
        /// Array of float arrays, one per data point. Starts out empty.
        /// </summary>
        public float[][] Results { get; set; }

        public BenchmarkSubmission()
        {
            Results = new float[0][];
        }
    }
}
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Text;
using System.Linq;
using System.Management;
using Newtonsoft.Json;
using System.Net.Http;

namespace MicrobenchmarkGui
{
    /// <summary>
    /// Dialog that summarizes a finished test run, pre-fills CPU / motherboard /
    /// memory information detected through WMI, and posts the run as JSON to the
    /// submission server.
    /// </summary>
    public partial class BenchmarkSubmissionDialog : Form
    {
        private const string SERVER_URL = "https://memrank.reali.es";
        private const string UnknownMotherboard = "Unknown Motherboard";
        private const string UnknownMemoryConfig = "Unknown Memory Configuration";

        // One shared client for the whole process: creating and dropping an
        // HttpClient per request leaves sockets lingering and is discouraged.
        private static readonly HttpClient httpClient = new HttpClient();

        private readonly string testName;
        private readonly List<(float size, float result)> results;
        private readonly BenchmarkSubmission submission;

        /// <summary>
        /// Creates the dialog for one test run.
        /// </summary>
        /// <param name="testName">Name of the test that produced the results</param>
        /// <param name="results">Measured (size, result) pairs</param>
        public BenchmarkSubmissionDialog(string testName, List<(float size, float result)> results)
        {
            InitializeComponent();
            this.testName = testName;
            this.results = results;
            this.submission = new BenchmarkSubmission();

            // Set window properties
            this.MinimumSize = new Size(500, 400);
            this.StartPosition = FormStartPosition.CenterParent;

            // Initialize summary text
            UpdateSummary();

            // Pre-populate system information
            PopulateSystemInfo();
        }

        /// <summary>
        /// Fills the CPU, motherboard and memory text boxes and makes them read-only.
        /// Falls back to "Unknown ..." placeholders (and warns the user) if detection throws.
        /// </summary>
        private void PopulateSystemInfo()
        {
            try
            {
                cpuNameTextBox.Text = OpCode.GetProcessorName() ?? "Unknown CPU";
                motherboardTextBox.Text = DetectMotherboard();
                memoryConfigTextBox.Text = DetectMemoryConfig();

                // Make the fields read-only without edit option
                cpuNameTextBox.ReadOnly = true;
                motherboardTextBox.ReadOnly = true;
                memoryConfigTextBox.ReadOnly = true;
            }
            catch (Exception ex)
            {
                // Set default values if overall detection fails
                cpuNameTextBox.Text = "Unknown CPU";
                motherboardTextBox.Text = UnknownMotherboard;
                memoryConfigTextBox.Text = UnknownMemoryConfig;

                MessageBox.Show($"Error retrieving system information: {ex.Message}\nDefault values have been set.",
                    "System Info Error", MessageBoxButtons.OK, MessageBoxIcon.Warning);
            }
        }

        /// <summary>
        /// Reads manufacturer and product of the first Win32_BaseBoard instance.
        /// </summary>
        /// <returns>"Manufacturer Product", or the unknown placeholder if WMI fails or returns blanks</returns>
        private static string DetectMotherboard()
        {
            try
            {
                using (var searcher = new ManagementObjectSearcher("SELECT * FROM Win32_BaseBoard"))
                {
                    foreach (ManagementObject board in searcher.Get())
                    {
                        string manufacturer = board["Manufacturer"]?.ToString() ?? "";
                        string product = board["Product"]?.ToString() ?? "";
                        string info = $"{manufacturer} {product}".Trim();

                        // Only the first board is considered.
                        return string.IsNullOrWhiteSpace(info) ? UnknownMotherboard : info;
                    }
                }
            }
            catch
            {
                // Best effort: fall through to the placeholder.
            }

            return UnknownMotherboard;
        }

        /// <summary>
        /// Queries MSMemory_Performance in root\WMI for a configured memory clock.
        /// </summary>
        /// <returns>The first non-null ConfiguredMemoryClockSpeed, or 0 if unavailable</returns>
        private static int ReadConfiguredMemorySpeed()
        {
            try
            {
                using (var searcher = new ManagementObjectSearcher(@"root\WMI", "SELECT * FROM MSMemory_Performance"))
                {
                    foreach (ManagementObject obj in searcher.Get())
                    {
                        if (obj["ConfiguredMemoryClockSpeed"] != null)
                        {
                            return Convert.ToInt32(obj["ConfiguredMemoryClockSpeed"]);
                        }
                    }
                }
            }
            catch
            {
                // If we can't get the actual speed, the caller falls back to the per-module speed
            }

            return 0;
        }

        /// <summary>
        /// Builds a one-line description of installed memory from Win32_PhysicalMemory.
        /// </summary>
        /// <returns>Total capacity, module count and per-module details, or the unknown placeholder</returns>
        private string DetectMemoryConfig()
        {
            const ulong bytesPerGb = 1024UL * 1024 * 1024;

            try
            {
                int currentSpeed = ReadConfiguredMemorySpeed();

                using (var searcher = new ManagementObjectSearcher("SELECT * FROM Win32_PhysicalMemory"))
                {
                    var memoryModules = new List<string>();
                    ulong totalCapacity = 0;
                    int moduleCount = 0;

                    foreach (ManagementObject memory in searcher.Get())
                    {
                        moduleCount++;
                        ulong capacity = Convert.ToUInt64(memory["Capacity"]);
                        totalCapacity += capacity;

                        // Use actual speed if we got it, otherwise fall back to the module's own values
                        int speed = currentSpeed > 0 ? currentSpeed :
                            Convert.ToInt32(memory["ConfiguredClockSpeed"] ?? memory["Speed"] ?? 0);

                        string memoryType = GetMemoryType(memory);

                        memoryModules.Add($"{capacity / bytesPerGb}GB {memoryType}" +
                            (speed > 0 ? $" @ {speed}MHz" : ""));
                    }

                    if (totalCapacity > 0)
                    {
                        string memoryConfig = $"{totalCapacity / bytesPerGb}GB Total ({moduleCount} modules)";
                        if (memoryModules.Count > 0)
                        {
                            memoryConfig += $" - {string.Join(", ", memoryModules)}";
                        }
                        return memoryConfig;
                    }
                }
            }
            catch
            {
                // Best effort: fall through to the placeholder.
            }

            return UnknownMemoryConfig;
        }

        /// <summary>
        /// Maps a Win32_PhysicalMemory instance to a DDR generation label.
        /// </summary>
        /// <param name="memory">Win32_PhysicalMemory instance</param>
        /// <returns>Label such as "DDR4"; plain "DDR" when the type cannot be determined</returns>
        private string GetMemoryType(ManagementObject memory)
        {
            try
            {
                // SMBIOSMemoryType carries the raw SMBIOS "Memory Device - Type" code.
                int smbiosType = Convert.ToInt32(memory["SMBIOSMemoryType"]);
                switch (smbiosType)
                {
                    case 18: return "DDR";
                    case 19: return "DDR2";
                    case 24: return "DDR3";
                    case 26: return "DDR4";
                    case 27: return "LPDDR";
                    case 28: return "LPDDR2";
                    case 29: return "LPDDR3";
                    case 30: return "LPDDR4";
                    case 34: return "DDR5";
                    case 35: return "LPDDR5";
                }

                // MemoryType is an enum code, not a DDR generation number, so it has
                // to be mapped. Printing it verbatim labelled DDR3 systems "DDR24".
                if (memory["MemoryType"] != null)
                {
                    switch (Convert.ToInt32(memory["MemoryType"]))
                    {
                        case 20: return "DDR";
                        case 21: return "DDR2";
                        case 24: return "DDR3";
                        case 26: return "DDR4";
                    }
                }

                return "DDR";
            }
            catch
            {
                return "DDR";
            }
        }

        /// <summary>
        /// Writes test name, data point count and size / result ranges into the summary box.
        /// </summary>
        private void UpdateSummary()
        {
            var summary = new StringBuilder();
            summary.AppendLine($"Test: {testName}");
            summary.AppendLine($"Number of data points: {results.Count}");

            if (results.Any())
            {
                summary.AppendLine($"Size range: {results.Min(r => r.size):F2} KB to {results.Max(r => r.size):F2} KB");
                summary.AppendLine($"Result range: {results.Min(r => r.result):F2} to {results.Max(r => r.result):F2}");
            }

            summaryTextBox.Text = summary.ToString();
        }

        private async void submitButton_Click(object sender, EventArgs e)
        {
            // Disable the clicked control while the request is in flight so a
            // second click cannot post the same run twice.
            var trigger = sender as Control;
            if (trigger != null) trigger.Enabled = false;

            try
            {
                if (await SubmitAsync())
                {
                    DialogResult = DialogResult.OK;
                }
            }
            finally
            {
                if (trigger != null && !trigger.IsDisposed) trigger.Enabled = true;
            }
        }

        /// <summary>
        /// Validates the form, posts the submission as JSON to SERVER_URL/submit and,
        /// on success, opens the result page returned by the server.
        /// </summary>
        /// <returns>true if the server accepted the submission; false on validation or submission failure (a message box is shown)</returns>
        public async Task<bool> SubmitAsync()
        {
            // Get values from form controls
            submission.TestName = testName;
            submission.CpuName = cpuNameTextBox.Text;
            submission.MotherboardName = motherboardTextBox.Text;
            submission.MemoryConfig = memoryConfigTextBox.Text;
            submission.Notes = notesTextBox.Text;
            submission.Results = results.Select(r => new float[] { r.size, r.result }).ToArray();

            // Validate required fields
            if (string.IsNullOrWhiteSpace(submission.CpuName) ||
                string.IsNullOrWhiteSpace(submission.MotherboardName) ||
                string.IsNullOrWhiteSpace(submission.MemoryConfig))
            {
                MessageBox.Show("Please fill in all required fields.", "Validation Error",
                    MessageBoxButtons.OK, MessageBoxIcon.Warning);
                return false;
            }

            try
            {
                string jsonSubmission = JsonConvert.SerializeObject(submission, Formatting.Indented);

                using (var content = new StringContent(jsonSubmission, Encoding.UTF8, "application/json"))
                using (var response = await httpClient.PostAsync($"{SERVER_URL}/submit", content))
                {
                    if (!response.IsSuccessStatusCode)
                    {
                        throw new Exception($"Server returned status code: {response.StatusCode}");
                    }

                    var responseJson = await response.Content.ReadAsStringAsync();
                    var result = JsonConvert.DeserializeAnonymousType(responseJson, new { success = false, url = "", id = 0 });

                    if (result?.success == true && !string.IsNullOrEmpty(result.url))
                    {
                        System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo
                        {
                            FileName = $"{SERVER_URL}{result.url}",
                            UseShellExecute = true
                        });
                        return true;
                    }

                    throw new Exception("Invalid response from server");
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show($"Error submitting results: {ex.Message}",
                    "Submission Error",
                    MessageBoxButtons.OK, MessageBoxIcon.Error);
                return false;
            }
        }

        protected override void OnFormClosing(FormClosingEventArgs e)
        {
            // Closing without an explicit result counts as a cancel.
            if (DialogResult == DialogResult.None)
            {
                DialogResult = DialogResult.Cancel;
            }
            base.OnFormClosing(e);
        }

        private void cancelButton_Click(object sender, EventArgs e)
        {
            DialogResult = DialogResult.Cancel;
        }
    }
}
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Threading;
using System.Windows.Forms;
using System.Windows.Forms.DataVisualization.Charting;

namespace MicrobenchmarkGui
{
    /// <summary>
    /// Container class for settings that should apply across tests
    /// </summary>
    public static class GlobalTestSettings
    {
        /// <summary>
        /// Minimum test size in KB, for tests that go through multiple sizes
        /// </summary>
        public static uint MinTestSizeKb = 0;
    }

    /// <summary>
    /// Runs the CPU memory latency test across a list of sizes and pushes results
    /// to the result list view, chart and progress label through the supplied delegates.
    /// </summary>
    public class LatencyRunner
    {
        // Test sizes in KB. 24576 is 24 MB; it was previously mistyped as 24567.
        public uint[] testSizes = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048,
                                    3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24576, 32768, 65536, 98304,
                                    131072, 262144, 393216, 524288, 1048576 };

        public bool running = false;

        // run results, keyed by test label
        public Dictionary<string, List<Tuple<float, float>>> RunResults;

        // last run results
        public string[][] formattedResults;

        /// <summary>
        /// List of test results from last run
        /// </summary>
        public List<float> testResultsList;

        /// <summary>
        /// List of tested points from last run
        /// </summary>
        public List<float> floatTestPoints;

        private ListView resultListView;
        private Chart resultChart;
        private MicrobenchmarkForm.SafeSetResultListView setListViewDelegate;
        private MicrobenchmarkForm.SafeSetResultListViewColumns setListViewColumnsDelegate;
        private MicrobenchmarkForm.SafeSetResultsChart setChartDelegate;
        private MicrobenchmarkForm.SafeSetProgressLabel setProgressLabelDelegate;
        private Label progressLabel;
        private string[] bwCols = { "Data Size", "Latency" };

        public LatencyRunner(MicrobenchmarkForm.SafeSetResultListView setListViewDelegate,
            MicrobenchmarkForm.SafeSetResultListViewColumns setListViewColsDelegate,
            MicrobenchmarkForm.SafeSetResultsChart setChartDelegate,
            MicrobenchmarkForm.SafeSetProgressLabel setLabelDelegate,
            ListView resultListView,
            Chart resultChart,
            Label progressLabel)
        {
            this.setListViewColumnsDelegate = setListViewColsDelegate;
            this.setListViewDelegate = setListViewDelegate;
            this.setChartDelegate = setChartDelegate;
            this.setProgressLabelDelegate = setLabelDelegate;
            this.resultListView = resultListView;
            this.resultChart = resultChart;
            this.progressLabel = progressLabel;

            this.RunResults = new Dictionary<string, List<Tuple<float, float>>>();
        }

        /// <summary>
        /// Run through test sizes, meant to be run in a background thread
        /// </summary>
        /// <param name="asm">true to use the assembly latency test, false for the C one</param>
        /// <param name="largePages">true to run on a large-page allocation</param>
        /// <param name="runCancel">Checked between sizes and between retries of one size</param>
        public void StartFullTest(bool asm, bool largePages, CancellationToken runCancel)
        {
            string testLabel = (asm ? "ASM" : "C") + ", " + (largePages ? "Large Pages" : "Default Pages");
            var currentRunResults = new List<Tuple<float, float>>();

            // Work on a snapshot so the size list and formattedResults stay in step
            // even if SetTestSizes is called while this run is in progress.
            uint[] sizes = testSizes;

            testResultsList = new List<float>();
            floatTestPoints = new List<float>();
            resultListView.Invoke(setListViewColumnsDelegate, new object[] { bwCols });

            formattedResults = new string[sizes.Length][];
            for (int i = 0; i < sizes.Length; i++)
            {
                formattedResults[i] = new string[] { string.Format("{0} KB", sizes[i]), "Not Run" };
            }

            resultListView.Invoke(setListViewDelegate, new object[] { formattedResults });

            if (!ConfigureLargePages(largePages, sizes))
            {
                return;
            }

            float targetTimeMs = 3500, minTimeMs = 1500, lastTimeMs = 0;
            Stopwatch testStopwatch = new Stopwatch();
            for (int testIdx = 0; testIdx < sizes.Length; testIdx++)
            {
                if (runCancel.IsCancellationRequested)
                {
                    break;
                }

                uint testSize = sizes[testIdx];
                float result;
                ulong currentIterations = 2500000;

                if (GlobalTestSettings.MinTestSizeKb != 0 && GlobalTestSettings.MinTestSizeKb > testSize) continue;

                // Repeat with a scaled iteration count until the run is long enough
                // to be meaningful, or the user cancels.
                do
                {
                    progressLabel.Invoke(setProgressLabelDelegate, new object[] { $"Testing {testSize} KB with {currentIterations / 1000}K iterations. Last run = {lastTimeMs} ms" });

                    Console.WriteLine("Starting run with {0}K iterations", currentIterations / 1000);
                    testStopwatch.Restart();
                    if (asm) result = BenchmarkInteropFunctions.RunAsmLatencyTest(testSize, currentIterations);
                    else result = BenchmarkInteropFunctions.RunLatencyTest(testSize, currentIterations);
                    testStopwatch.Stop();

                    lastTimeMs = (float)(result * currentIterations / 1e6);
                    Console.WriteLine("Calculated time: {0:F2}, stopwatch time: {1}", lastTimeMs, testStopwatch.ElapsedMilliseconds);
                    currentIterations = TestUtilities.ScaleIterations(currentIterations, targetTimeMs, lastTimeMs);
                } while (lastTimeMs < minTimeMs && !runCancel.IsCancellationRequested);

                // Cancelled before this size reached the minimum run time: the
                // number is not trustworthy, so do not record it.
                if (lastTimeMs < minTimeMs)
                {
                    break;
                }

                if (result != 0) formattedResults[testIdx][1] = string.Format("{0:F2} ns", result);
                else formattedResults[testIdx][1] = "N/A";
                resultListView.Invoke(setListViewDelegate, new object[] { formattedResults });

                if (result != 0)
                {
                    floatTestPoints.Add(testSize);
                    testResultsList.Add(result);
                    currentRunResults.Add(new Tuple<float, float>(testSize, result));
                    resultChart.Invoke(setChartDelegate, new object[] { testLabel, floatTestPoints.ToArray(), testResultsList.ToArray(), MicrobenchmarkForm.ResultChartType.CpuMemoryLatency });
                }
            }

            progressLabel.Invoke(setProgressLabelDelegate, new object[] { "Run finished" });

            // Indexer instead of Add: repeating a run with the same label replaces
            // the earlier results instead of throwing ArgumentException.
            RunResults[testLabel] = currentRunResults;
        }

        /// <summary>
        /// Switches the native side between default pages and a large-page buffer
        /// big enough for the largest size in the list.
        /// </summary>
        /// <returns>false if large pages were requested but could not be set up (the progress label explains why)</returns>
        private bool ConfigureLargePages(bool largePages, uint[] sizes)
        {
            if (!largePages)
            {
                BenchmarkInteropFunctions.SetLargePages(0);
                return true;
            }

            // Size the buffer for the largest entry, not the last one: a custom
            // list need not be sorted, and an undersized buffer would be overrun.
            uint maxTestSize = 0;
            foreach (uint size in sizes)
            {
                if (size > maxTestSize) maxTestSize = size;
            }

            if (maxTestSize == 0)
            {
                // Nothing to test, so there is nothing to allocate.
                BenchmarkInteropFunctions.SetLargePages(0);
                return true;
            }

            // SetLargePages takes a 32-bit byte count; refuse sizes whose byte
            // count would wrap around rather than allocate a too-small buffer.
            if (maxTestSize > uint.MaxValue / 1024)
            {
                progressLabel.Invoke(setProgressLabelDelegate,
                    new object[] { "Test sizes above " + (uint.MaxValue / 1024) + " KB are not supported with large pages" });
                return false;
            }

            int rc = BenchmarkInteropFunctions.SetLargePages(maxTestSize * 1024);
            if (rc == -1)
            {
                progressLabel.Invoke(setProgressLabelDelegate,
                    new object[] { "Failed to get SeLockMemoryPrivilege for large pages. See README.md" });
                return false;
            }
            else if (rc == -2)
            {
                progressLabel.Invoke(setProgressLabelDelegate,
                    new object[] { "Could not allocate " + maxTestSize + " KB with large pages. If you have enough free memory, try rebooting" });
                return false;
            }

            return true;
        }

        /// <summary>
        /// Returns the current test sizes as a comma separated list.
        /// </summary>
        public string GetTestSizesAsString()
        {
            return string.Join(",", testSizes);
        }

        /// <summary>
        /// Replaces the test sizes with a comma separated list of sizes in KB.
        /// Shouldn't be called when test is running, but UI will take care of that.
        /// uint.Parse throws on entries that are not valid unsigned integers.
        /// </summary>
        public void SetTestSizes(string input)
        {
            string[] inputArr = input.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            uint[] newTestSizes = new uint[inputArr.Length];
            for (int i = 0; i < inputArr.Length; i++)
            {
                newTestSizes[i] = uint.Parse(inputArr[i]);
            }

            testSizes = newTestSizes;
        }
    }
}
If 0, disable large pages and free any allocated arr 19 | /// 0 on success, something else otherwise 20 | int SetLargePages(uint32_t enable) 21 | { 22 | if (enable == 0) 23 | { 24 | if (mem != NULL) 25 | { 26 | VirtualFree(mem, 0, MEM_RELEASE); 27 | mem = NULL; 28 | } 29 | 30 | return 0; 31 | } 32 | else 33 | { 34 | if (mem != NULL) 35 | { 36 | VirtualFree(mem, 0, MEM_RELEASE); 37 | mem = NULL; 38 | } 39 | 40 | if (GetPrivilege() != 0) 41 | { 42 | return -1; 43 | } 44 | 45 | mem = VirtualAlloc(NULL, enable, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE); 46 | if (mem == NULL) 47 | { 48 | return -2; 49 | } 50 | 51 | return 0; 52 | } 53 | } 54 | 55 | /// 56 | /// Fills pattern array with 32-bit integers 57 | /// 58 | /// array to fill 59 | /// number of 32-bit elements 60 | /// how far apart elements should be spaced 61 | void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) { 62 | uint32_t increment = byte_increment / sizeof(uint32_t); 63 | uint32_t element_count = list_size / increment; 64 | for (int i = 0; i < element_count; i++) { 65 | pattern_arr[i * increment] = i * increment; 66 | } 67 | 68 | int iter = element_count; 69 | while (iter > 1) { 70 | iter -= 1; 71 | int j = iter - 1 == 0 ? 
0 : rand() % (iter - 1); 72 | uint32_t tmp = pattern_arr[iter * increment]; 73 | pattern_arr[iter * increment] = pattern_arr[j * increment]; 74 | pattern_arr[j * increment] = tmp; 75 | } 76 | } 77 | 78 | uint32_t GetTlbShiftedOffset(uint32_t index, uint32_t cacheline_size, uint32_t page_size) 79 | { 80 | uint32_t page_increment = page_size / sizeof(uint32_t); 81 | uint32_t cacheline_increment = cacheline_size / sizeof(uint32_t); 82 | uint32_t byte_offset = (index * cacheline_increment) & (page_increment - 1); 83 | return index * page_increment + byte_offset; 84 | } 85 | 86 | /// 87 | /// Fills pattern array with page_size as the pointer chasing stride, but 88 | /// 89 | /// 90 | /// 91 | /// 92 | /// 93 | /// 94 | void FillTlbTestPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t cacheline_size, uint32_t page_size) { 95 | // fill a temporary array with the element count 96 | uint32_t element_count = list_size * sizeof(uint32_t) / page_size; 97 | uint32_t* temp_arr = (uint32_t*)malloc(sizeof(uint32_t) * element_count); 98 | uint32_t page_increment = page_size / sizeof(uint32_t); 99 | FillPatternArr(temp_arr, element_count, sizeof(uint32_t)); 100 | memset(pattern_arr, INT_MAX, list_size * sizeof(uint32_t)); 101 | for (uint32_t i = 0; i < element_count; i++) 102 | { 103 | uint32_t dst_index = GetTlbShiftedOffset(i, cacheline_size, page_size); 104 | uint32_t dst_value = GetTlbShiftedOffset(temp_arr[i], cacheline_size, page_size); 105 | pattern_arr[dst_index] = dst_value; 106 | } 107 | free(temp_arr); 108 | } 109 | 110 | /// 111 | /// Fills pattern array with 64-bit integers 112 | /// 113 | /// array to fill 114 | /// number of 64-bit elements in array 115 | /// how far apart elements should be spaced 116 | void FillPatternArr64(uint64_t* pattern_arr, uint64_t list_size, uint64_t byte_increment) { 117 | uint32_t increment = byte_increment / sizeof(uint64_t); 118 | uint32_t element_count = list_size / increment; 119 | for (int i = 0; i < element_count; i++) { 
120 | pattern_arr[i * increment] = i * increment; 121 | } 122 | 123 | int iter = element_count; 124 | while (iter > 1) { 125 | iter -= 1; 126 | int j = iter - 1 == 0 ? 0 : rand() % (iter - 1); 127 | uint64_t tmp = pattern_arr[iter * increment]; 128 | pattern_arr[iter * increment] = pattern_arr[j * increment]; 129 | pattern_arr[j * increment] = tmp; 130 | } 131 | } 132 | 133 | float RunAsmLatencyTest(uint32_t size_kb, uint64_t iterations) { 134 | struct timeb start, end; 135 | uint32_t list_size = size_kb * 1024 / sizeof(void*); 136 | 137 | uint64_t* A; 138 | if (mem == NULL) { 139 | A = (uint64_t*)malloc(size_kb * 1024); 140 | } 141 | else { 142 | A = (uint64_t*)mem; 143 | } 144 | 145 | memset(A, 0, 1024 * size_kb); 146 | FillPatternArr64(A, size_kb * 1024 / sizeof(uint64_t), 64); 147 | preplatencyarr(A, size_kb * 1024 / sizeof(uint64_t)); 148 | 149 | ftime(&start); 150 | uint64_t sum = latencytest(iterations, A); 151 | ftime(&end); 152 | int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm); 153 | float latency = 1e6 * (float)time_diff_ms / (float)iterations; 154 | if (mem == NULL) free(A); 155 | return latency; 156 | } 157 | 158 | float RunLatencyTest(uint32_t size_kb, uint64_t iterations) { 159 | struct timeb start, end; 160 | uint32_t list_size = size_kb * 1024 / 4; 161 | uint32_t current; 162 | 163 | // Fill list to create random access pattern 164 | int* A; 165 | if (mem == NULL) { 166 | A = (int*)malloc(sizeof(int) * list_size); 167 | } 168 | else { 169 | A = (int*)mem; 170 | } 171 | 172 | for (int i = 0; i < list_size; i++) { 173 | A[i] = i; 174 | } 175 | 176 | FillPatternArr(A, list_size, 64); 177 | 178 | // Run test 179 | ftime(&start); 180 | current = A[0]; 181 | for (int i = 0; i < iterations; i++) { 182 | current = A[current]; 183 | } 184 | ftime(&end); 185 | int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm); 186 | float latency = 1e6 * (float)time_diff_ms / (float)iterations; 187 | 
188 | int tmp = A[current]; 189 | if (mem == NULL) free(A); 190 | if (current == tmp) return 0; 191 | return latency; 192 | } 193 | 194 | int GetPrivilege() 195 | { 196 | HANDLE hToken; 197 | TOKEN_PRIVILEGES tp; 198 | BOOL status; 199 | DWORD error; 200 | 201 | // open process token 202 | if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) 203 | { 204 | return -1; 205 | } 206 | 207 | // get the luid 208 | if (!LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid)) 209 | { 210 | return -1; 211 | } 212 | 213 | // enable privilege 214 | tp.PrivilegeCount = 1; 215 | tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; 216 | status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); 217 | 218 | // It is possible for AdjustTokenPrivileges to return TRUE and still not succeed. 219 | // So always check for the last error value. 220 | error = GetLastError(); 221 | if (!status || (error != ERROR_SUCCESS)) 222 | { 223 | return -1; 224 | } 225 | 226 | // close the handle 227 | if (!CloseHandle(hToken)) 228 | { 229 | return -1; 230 | } 231 | 232 | return 0; 233 | } 234 | -------------------------------------------------------------------------------- /MemoryLatencyFunctions.asm: -------------------------------------------------------------------------------- 1 | section .text 2 | bits 64 3 | 4 | global preplatencyarr 5 | global latencytest 6 | 7 | preplatencyarr: 8 | push r15 9 | push r14 10 | xor r15, r15 ; array index 11 | preplatencyarr_loop: 12 | mov r14, [rcx + r15 * 8] 13 | lea r14, [rcx + r14 * 8] 14 | mov [rcx + r15 * 8], r14 15 | inc r15 16 | cmp rdx, r15 17 | jne preplatencyarr_loop 18 | pop r14 19 | pop r15 20 | ret 21 | 22 | latencytest: 23 | push r15 24 | mov r15, [rdx] 25 | xor rax, rax 26 | latencytest_loop: 27 | mov r15, [r15] 28 | add rax, r15 29 | dec rcx 30 | jnz latencytest_loop 31 | pop r15 32 | ret 33 | 
-------------------------------------------------------------------------------- /MicrobenchmarkForm.resx: -------------------------------------------------------------------------------- 1 |  2 | 3 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | text/microsoft-resx 110 | 111 | 112 | 2.0 113 | 114 | 115 | System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 116 | 117 | 118 | System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 119 | 120 | -------------------------------------------------------------------------------- /MicrobenchmarkGui.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {EA6B854D-FAD1-4212-8953-4F32286E1B57} 8 | WinExe 9 | MicrobenchmarkGui 10 | MicrobenchmarkGui 11 | v4.7.2 12 | 512 13 | true 14 | true 15 | 16 | 17 | x64 18 | true 19 | full 20 | false 21 | x64\Debug\ 22 | DEBUG;TRACE 23 | prompt 24 | 4 25 | 26 | 27 | x64 28 | pdbonly 29 | true 30 | x64\Release\ 31 | TRACE 32 | prompt 33 | 4 34 | false 35 | 36 | 37 | 38 | packages\Newtonsoft.Json.13.0.3\lib\net45\Newtonsoft.Json.dll 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | Form 58 | 59 | 60 | BenchmarkSubmissionDialog.cs 61 | 62 | 63 | 64 | 65 | 66 | 67 | Form 68 | 69 | 70 | MicrobenchmarkForm.cs 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | BenchmarkSubmissionDialog.cs 79 | 80 | 81 | MicrobenchmarkForm.cs 82 | 83 | 84 | ResXFileCodeGenerator 85 | Resources.Designer.cs 86 | Designer 87 | 88 | 89 | True 90 | Resources.resx 91 | 92 | 93 | 94 | 95 | SettingsSingleFileGenerator 96 | Settings.Designer.cs 97 | 98 | 99 | True 100 | 
Settings.settings 101 | True 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /MicrobenchmarkGui.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.32630.194 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MicrobenchmarkGui", "MicrobenchmarkGui.csproj", "{EA6B854D-FAD1-4212-8953-4F32286E1B57}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BenchmarkDll", "BenchmarkDll.vcxproj", "{2FC1B46D-1C99-4F82-96F4-E99320552268}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|x64 = Debug|x64 13 | Release|x64 = Release|x64 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {EA6B854D-FAD1-4212-8953-4F32286E1B57}.Debug|x64.ActiveCfg = Debug|Any CPU 17 | {EA6B854D-FAD1-4212-8953-4F32286E1B57}.Debug|x64.Build.0 = Debug|Any CPU 18 | {EA6B854D-FAD1-4212-8953-4F32286E1B57}.Release|x64.ActiveCfg = Release|Any CPU 19 | {EA6B854D-FAD1-4212-8953-4F32286E1B57}.Release|x64.Build.0 = Release|Any CPU 20 | {2FC1B46D-1C99-4F82-96F4-E99320552268}.Debug|x64.ActiveCfg = Debug|x64 21 | {2FC1B46D-1C99-4F82-96F4-E99320552268}.Debug|x64.Build.0 = Debug|x64 22 | {2FC1B46D-1C99-4F82-96F4-E99320552268}.Release|x64.ActiveCfg = Release|x64 23 | {2FC1B46D-1C99-4F82-96F4-E99320552268}.Release|x64.Build.0 = Release|x64 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {8171071B-2507-4C70-864A-5EBA5237C090} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /OpCode.cs: 
--------------------------------------------------------------------------------
// From LibreHardwareMonitor
// Mozilla Public License 2.0
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Copyright (C) LibreHardwareMonitor and Contributors
// All Rights Reserved

using System;
using System.Runtime.InteropServices;
using static System.Windows.Forms.VisualStyles.VisualStyleElement;

namespace MicrobenchmarkGui
{
    /// <summary>
    /// Exposes the CPUID and RDTSC instructions to managed code by copying small
    /// machine-code stubs into executable memory and wrapping them in delegates.
    /// Call <see cref="Open"/> before using <see cref="Cpuid"/> or <see cref="Rdtsc"/>,
    /// and <see cref="Close"/> when done.
    /// </summary>
    internal static class OpCode
    {
        /// <summary>Delegate bound to the CPUID stub; null until <see cref="Open"/> has run.</summary>
        public static CpuidDelegate Cpuid;

        /// <summary>Delegate bound to the RDTSC stub; null until <see cref="Open"/> has run.</summary>
        public static RdtscDelegate Rdtsc;

        // Executable buffer holding both stubs (RDTSC first, CPUID right after it).
        private static IntPtr _codeBuffer;
        private static ulong _size;

        // void __stdcall cpuidex(unsigned int index, unsigned int ecxValue,
        //   unsigned int* eax, unsigned int* ebx, unsigned int* ecx,
        //   unsigned int* edx)
        // {
        //   int info[4];
        //   __cpuidex(info, index, ecxValue);
        //   *eax = info[0];
        //   *ebx = info[1];
        //   *ecx = info[2];
        //   *edx = info[3];
        // }

        private static readonly byte[] CpuId32 =
        {
            0x55,                   // push ebp
            0x8B, 0xEC,             // mov ebp, esp
            0x83, 0xEC, 0x10,       // sub esp, 10h
            0x8B, 0x45, 0x08,       // mov eax, dword ptr [ebp+8]      (index)
            0x8B, 0x4D, 0x0C,       // mov ecx, dword ptr [ebp+0Ch]    (ecxValue)
            0x53,                   // push ebx
            0x0F, 0xA2,             // cpuid
            0x56,                   // push esi
            0x8D, 0x75, 0xF0,       // lea esi, [ebp-10h]              (info)
            0x89, 0x06,             // mov dword ptr [esi], eax
            0x8B, 0x45, 0x10,       // mov eax, dword ptr [ebp+10h]    (eax out pointer)
            0x89, 0x5E, 0x04,       // mov dword ptr [esi+4], ebx
            0x89, 0x4E, 0x08,       // mov dword ptr [esi+8], ecx
            0x89, 0x56, 0x0C,       // mov dword ptr [esi+0Ch], edx
            0x8B, 0x4D, 0xF0,       // mov ecx, dword ptr [ebp-10h]    (info[0])
            0x89, 0x08,             // mov dword ptr [eax], ecx
            0x8B, 0x45, 0x14,       // mov eax, dword ptr [ebp+14h]    (ebx out pointer)
            0x8B, 0x4D, 0xF4,       // mov ecx, dword ptr [ebp-0Ch]
            0x89, 0x08,             // mov dword ptr [eax], ecx
            0x8B, 0x45, 0x18,       // mov eax, dword ptr [ebp+18h]    (ecx out pointer)
            0x8B, 0x4D, 0xF8,       // mov ecx, dword ptr [ebp-8]
            0x89, 0x08,             // mov dword ptr [eax], ecx
            0x8B, 0x45, 0x1C,       // mov eax, dword ptr [ebp+1Ch]    (edx out pointer)
            0x8B, 0x4D, 0xFC,       // mov ecx, dword ptr [ebp-4]
            0x5E,                   // pop esi
            0x89, 0x08,             // mov dword ptr [eax], ecx
            0x5B,                   // pop ebx
            0xC9,                   // leave
            0xC2, 0x18, 0x00        // ret 18h
        };

        // Not selected by Open(), which always uses CpuId64Windows for 64-bit processes.
        private static readonly byte[] CpuId64Linux =
        {
            0x49, 0x89, 0xD2,       // mov r10, rdx
            0x49, 0x89, 0xCB,       // mov r11, rcx
            0x53,                   // push rbx
            0x89, 0xF8,             // mov eax, edi
            0x89, 0xF1,             // mov ecx, esi
            0x0F, 0xA2,             // cpuid
            0x41, 0x89, 0x02,       // mov dword ptr [r10], eax
            0x41, 0x89, 0x1B,       // mov dword ptr [r11], ebx
            0x41, 0x89, 0x08,       // mov dword ptr [r8], ecx
            0x41, 0x89, 0x11,       // mov dword ptr [r9], edx
            0x5B,                   // pop rbx
            0xC3                    // ret
        };

        private static readonly byte[] CpuId64Windows =
        {
            0x48, 0x89, 0x5C, 0x24, 0x08,   // mov qword ptr [rsp+8], rbx
            0x8B, 0xC1,                     // mov eax, ecx
            0x8B, 0xCA,                     // mov ecx, edx
            0x0F, 0xA2,                     // cpuid
            0x41, 0x89, 0x00,               // mov dword ptr [r8], eax
            0x48, 0x8B, 0x44, 0x24, 0x28,   // mov rax, qword ptr [rsp+28h]
            0x41, 0x89, 0x19,               // mov dword ptr [r9], ebx
            0x48, 0x8B, 0x5C, 0x24, 0x08,   // mov rbx, qword ptr [rsp+8]
            0x89, 0x08,                     // mov dword ptr [rax], ecx
            0x48, 0x8B, 0x44, 0x24, 0x30,   // mov rax, qword ptr [rsp+30h]
            0x89, 0x10,                     // mov dword ptr [rax], edx
            0xC3                            // ret
        };

        // unsigned __int64 __stdcall rdtsc() {
        //   return __rdtsc();
        // }

        private static readonly byte[] Rdtsc32 =
        {
            0x0F, 0x31,             // rdtsc
            0xC3                    // ret
        };

        private static readonly byte[] Rdtsc64 =
        {
            0x0F, 0x31,             // rdtsc
            0x48, 0xC1, 0xE2, 0x20, // shl rdx, 20h
            0x48, 0x0B, 0xC2,       // or rax, rdx
            0xC3                    // ret
        };

        /// <summary>
        /// Signature of the CPUID stub. The stubs mirror a void native function, so the
        /// bool result is whatever happens to be left in the return register and should
        /// not be relied on.
        /// </summary>
        [UnmanagedFunctionPointer(CallingConvention.StdCall)]
        public delegate bool CpuidDelegate(uint index, uint ecxValue, out uint eax, out uint ebx, out uint ecx, out uint edx);

        /// <summary>Signature of the RDTSC stub; returns the 64-bit time stamp counter.</summary>
        [UnmanagedFunctionPointer(CallingConvention.StdCall)]
        public delegate ulong RdtscDelegate();

        [Flags]
        internal enum MEM : uint
        {
            MEM_COMMIT = 0x1000,
            MEM_RESERVE = 0x2000,
            MEM_DECOMMIT = 0x4000,
            MEM_RELEASE = 0x8000,
            MEM_RESET = 0x80000,
            MEM_LARGE_PAGES = 0x20000000,
            MEM_PHYSICAL = 0x400000,
            MEM_TOP_DOWN = 0x100000,
            MEM_WRITE_WATCH = 0x200000
        }

        [Flags]
        internal enum PAGE : uint
        {
            PAGE_EXECUTE = 0x10,
            PAGE_EXECUTE_READ = 0x20,
            PAGE_EXECUTE_READWRITE = 0x40,
            PAGE_EXECUTE_WRITECOPY = 0x80,
            PAGE_NOACCESS = 0x01,
            PAGE_READONLY = 0x02,
            PAGE_READWRITE = 0x04,
            PAGE_WRITECOPY = 0x08,
            PAGE_GUARD = 0x100,
            PAGE_NOCACHE = 0x200,
            PAGE_WRITECOMBINE = 0x400
        }

        [DllImport("kernel32.dll", CallingConvention = CallingConvention.Winapi, SetLastError = true)]
        internal static extern IntPtr VirtualAlloc(IntPtr lpAddress, UIntPtr dwSize, MEM flAllocationType, PAGE flProtect);

        [DllImport("kernel32.dll", CallingConvention = CallingConvention.Winapi)]
        internal static extern bool VirtualFree(IntPtr lpAddress, UIntPtr dwSize, MEM dwFreeType);

        /// <summary>
        /// Allocates executable memory, copies the RDTSC and CPUID stubs matching the
        /// process bitness into it and binds <see cref="Rdtsc"/> and <see cref="Cpuid"/>.
        /// Calling it again releases the previous buffer first.
        /// </summary>
        /// <exception cref="InvalidOperationException">The executable buffer could not be allocated.</exception>
        public static void Open()
        {
            // Drop any buffer from an earlier Open() so repeated calls do not leak it.
            Close();

            byte[] rdTscCode;
            byte[] cpuidCode;
            if (IntPtr.Size == 4)
            {
                rdTscCode = Rdtsc32;
                cpuidCode = CpuId32;
            }
            else
            {
                rdTscCode = Rdtsc64;
                cpuidCode = CpuId64Windows;
            }

            _size = (ulong)(rdTscCode.Length + cpuidCode.Length);

            _codeBuffer = VirtualAlloc(IntPtr.Zero,
                                       (UIntPtr)_size,
                                       MEM.MEM_COMMIT | MEM.MEM_RESERVE,
                                       PAGE.PAGE_EXECUTE_READWRITE);
            if (_codeBuffer == IntPtr.Zero)
            {
                throw new InvalidOperationException(
                    "VirtualAlloc could not allocate executable memory for the CPUID/RDTSC stubs (Win32 error " +
                    Marshal.GetLastWin32Error() + ").");
            }

            // RDTSC stub sits at the start of the buffer, CPUID stub directly behind it.
            Marshal.Copy(rdTscCode, 0, _codeBuffer, rdTscCode.Length);
            Rdtsc = Marshal.GetDelegateForFunctionPointer(_codeBuffer, typeof(RdtscDelegate)) as RdtscDelegate;

            IntPtr cpuidAddress = IntPtr.Add(_codeBuffer, rdTscCode.Length);
            Marshal.Copy(cpuidCode, 0, cpuidAddress, cpuidCode.Length);
            Cpuid = Marshal.GetDelegateForFunctionPointer(cpuidAddress, typeof(CpuidDelegate)) as CpuidDelegate;
        }

        /// <summary>
        /// Clears both delegates and releases the executable buffer, if one is allocated.
        /// Safe to call more than once or without a prior <see cref="Open"/>.
        /// </summary>
        public static void Close()
        {
            Rdtsc = null;
            Cpuid = null;

            if (_codeBuffer != IntPtr.Zero)
            {
                VirtualFree(_codeBuffer, UIntPtr.Zero, MEM.MEM_RELEASE);
                // Forget the pointer so a second Close() cannot free it again.
                _codeBuffer = IntPtr.Zero;
                _size = 0;
            }
        }

        /// <summary>
        /// Gets the CPU manufacturer ID string, from cpuid with eax = 0
        /// </summary>
        /// <returns>Manufacturer ID string</returns>
        public static string GetManufacturerId()
        {
            uint eax, ecx, edx, ebx;
            Cpuid(0, 0, out eax, out ebx, out ecx, out edx);

            // The 12 characters are the bytes of ebx, edx, ecx in that order,
            // lowest byte of each register first.
            uint[] registers = { ebx, edx, ecx };
            byte[] cpuManufacturerBytes = new byte[registers.Length * 4];
            Buffer.BlockCopy(registers, 0, cpuManufacturerBytes, 0, cpuManufacturerBytes.Length);
            return System.Text.Encoding.ASCII.GetString(cpuManufacturerBytes);
        }

        /// <summary>
        /// Gets the processor name from cpuid leaves 0x80000002 through 0x80000004.
        /// </summary>
        /// <returns>
        /// The 48 returned bytes decoded as ASCII without any trimming, so padding
        /// reported by the CPU is preserved.
        /// </returns>
        public static string GetProcessorName()
        {
            uint[] buffer = new uint[12];
            Cpuid(0x80000002, 0, out buffer[0], out buffer[1], out buffer[2], out buffer[3]);
            Cpuid(0x80000003, 0, out buffer[4], out buffer[5], out buffer[6], out buffer[7]);
            Cpuid(0x80000004, 0, out buffer[8], out buffer[9], out buffer[10], out buffer[11]);

            byte[] dst = new byte[buffer.Length * 4];
            Buffer.BlockCopy(buffer, 0, dst, 0, buffer.Length * 4);
            return System.Text.Encoding.ASCII.GetString(dst);
        }

        /// <summary>
        /// Decodes family, model and stepping from eax of cpuid leaf 1.
        /// </summary>
        /// <param name="family">bits 8-11, plus bits 20-27 when the base family is 15</param>
        /// <param name="model">bits 4-7, with bits 16-19 as the high nibble when the base family is 6 or 15</param>
        /// <param name="stepping">bits 0-3</param>
        public static void GetProcessorVersion(out byte family, out byte model, out byte stepping)
        {
            uint eax, ecx, edx, ebx;
            Cpuid(1, 0, out eax, out ebx, out ecx, out edx);

            stepping = (byte)(eax & 0xF);
            family = (byte)((eax >> 8) & 0xF);
            model = (byte)((eax >> 4) & 0xF);

            // wikipedia says if family id is 6 or 15, model = model + extended model id shifted left by 4 bits
            // extended model id starts on bit 16
            if (family == 6 || family == 15)
            {
                model += (byte)((eax >> 12) & 0xF0);
            }

            // if family is 15, family = family + extended family
            if (family == 15)
            {
                family += (byte)(eax >> 20);
            }
        }
    }
}

--------------------------------------------------------------------------------
/OpenCL/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 |                                  Apache License
 3 |                            Version 2.0, January 2004
 4 |                         http://www.apache.org/licenses/
 5 | 
 6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 7 | 
 8 |    1. Definitions.
 9 | 
10 |       "License" shall mean the terms and conditions for use, reproduction,
11 |       and distribution as defined by Sections 1 through 9 of this document.
12 | 
13 |       "Licensor" shall mean the copyright owner or entity authorized by
14 |       the copyright owner that is granting the License.
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /OpenCL/README.md: -------------------------------------------------------------------------------- 1 | # OpenCLTM API Headers 2 | 3 | This repository contains C language headers for the OpenCL API. 4 | 5 | The authoritative public repository for these headers is located at: 6 | 7 | https://github.com/KhronosGroup/OpenCL-Headers 8 | 9 | Issues, proposed fixes for issues, and other suggested changes should be 10 | created using Github. 11 | 12 | ## Branch Structure 13 | 14 | The OpenCL API headers in this repository are Unified headers and are designed 15 | to work with all released OpenCL versions. This differs from previous OpenCL 16 | API headers, where version-specific API headers either existed in separate 17 | branches, or in separate folders in a branch. 18 | 19 | ## Compiling for a Specific OpenCL Version 20 | 21 | By default, the OpenCL API headers in this repository are for the latest 22 | OpenCL version (currently OpenCL 2.2). 
To use these API headers to target 23 | a different OpenCL version, an application may `#define` the preprocessor 24 | value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers. 25 | The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing 26 | the OpenCL API version. 27 | 28 | For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may 29 | include the OpenCL API headers as follows: 30 | 31 | ``` 32 | #define CL_TARGET_OPENCL_VERSION 120 33 | #include 34 | ``` 35 | 36 | ## Directory Structure 37 | 38 | ``` 39 | README.md This file 40 | LICENSE Source license for the OpenCL API headers 41 | CL/ Unified OpenCL API headers tree 42 | ``` 43 | 44 | ## License 45 | 46 | See [LICENSE](LICENSE). 47 | 48 | --- 49 | 50 | OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos. 51 | -------------------------------------------------------------------------------- /OpenCL/include/CL/cl_d3d10.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_CL_D3D10_H 18 | #define __OPENCL_CL_D3D10_H 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #ifdef __cplusplus 25 | extern "C" { 26 | #endif 27 | 28 | /****************************************************************************** 29 | * cl_khr_d3d10_sharing */ 30 | #define cl_khr_d3d10_sharing 1 31 | 32 | typedef cl_uint cl_d3d10_device_source_khr; 33 | typedef cl_uint cl_d3d10_device_set_khr; 34 | 35 | /******************************************************************************/ 36 | 37 | /* Error Codes */ 38 | #define CL_INVALID_D3D10_DEVICE_KHR -1002 39 | #define CL_INVALID_D3D10_RESOURCE_KHR -1003 40 | #define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 41 | #define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 42 | 43 | /* cl_d3d10_device_source_nv */ 44 | #define CL_D3D10_DEVICE_KHR 0x4010 45 | #define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 46 | 47 | /* cl_d3d10_device_set_nv */ 48 | #define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 49 | #define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 50 | 51 | /* cl_context_info */ 52 | #define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 53 | #define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C 54 | 55 | /* cl_mem_info */ 56 | #define CL_MEM_D3D10_RESOURCE_KHR 0x4015 57 | 58 | /* cl_image_info */ 59 | #define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 60 | 61 | /* cl_command_type */ 62 | #define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 63 | #define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 64 | 65 | /******************************************************************************/ 66 | 67 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( 68 | cl_platform_id platform, 69 | cl_d3d10_device_source_khr d3d_device_source, 70 | void * d3d_object, 71 | cl_d3d10_device_set_khr d3d_device_set, 72 | cl_uint num_entries, 73 | cl_device_id * devices, 74 | cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; 
75 | 76 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( 77 | cl_context context, 78 | cl_mem_flags flags, 79 | ID3D10Buffer * resource, 80 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 81 | 82 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( 83 | cl_context context, 84 | cl_mem_flags flags, 85 | ID3D10Texture2D * resource, 86 | UINT subresource, 87 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 88 | 89 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( 90 | cl_context context, 91 | cl_mem_flags flags, 92 | ID3D10Texture3D * resource, 93 | UINT subresource, 94 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 95 | 96 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( 97 | cl_command_queue command_queue, 98 | cl_uint num_objects, 99 | const cl_mem * mem_objects, 100 | cl_uint num_events_in_wait_list, 101 | const cl_event * event_wait_list, 102 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 103 | 104 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( 105 | cl_command_queue command_queue, 106 | cl_uint num_objects, 107 | const cl_mem * mem_objects, 108 | cl_uint num_events_in_wait_list, 109 | const cl_event * event_wait_list, 110 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 111 | 112 | #ifdef __cplusplus 113 | } 114 | #endif 115 | 116 | #endif /* __OPENCL_CL_D3D10_H */ 117 | -------------------------------------------------------------------------------- /OpenCL/include/CL/cl_d3d11.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_CL_D3D11_H 18 | #define __OPENCL_CL_D3D11_H 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #ifdef __cplusplus 25 | extern "C" { 26 | #endif 27 | 28 | /****************************************************************************** 29 | * cl_khr_d3d11_sharing */ 30 | #define cl_khr_d3d11_sharing 1 31 | 32 | typedef cl_uint cl_d3d11_device_source_khr; 33 | typedef cl_uint cl_d3d11_device_set_khr; 34 | 35 | /******************************************************************************/ 36 | 37 | /* Error Codes */ 38 | #define CL_INVALID_D3D11_DEVICE_KHR -1006 39 | #define CL_INVALID_D3D11_RESOURCE_KHR -1007 40 | #define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 41 | #define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 42 | 43 | /* cl_d3d11_device_source */ 44 | #define CL_D3D11_DEVICE_KHR 0x4019 45 | #define CL_D3D11_DXGI_ADAPTER_KHR 0x401A 46 | 47 | /* cl_d3d11_device_set */ 48 | #define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B 49 | #define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C 50 | 51 | /* cl_context_info */ 52 | #define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D 53 | #define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D 54 | 55 | /* cl_mem_info */ 56 | #define CL_MEM_D3D11_RESOURCE_KHR 0x401E 57 | 58 | /* cl_image_info */ 59 | #define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F 60 | 61 | /* cl_command_type */ 62 | #define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 63 | #define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 
0x4021 64 | 65 | /******************************************************************************/ 66 | 67 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( 68 | cl_platform_id platform, 69 | cl_d3d11_device_source_khr d3d_device_source, 70 | void * d3d_object, 71 | cl_d3d11_device_set_khr d3d_device_set, 72 | cl_uint num_entries, 73 | cl_device_id * devices, 74 | cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; 75 | 76 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( 77 | cl_context context, 78 | cl_mem_flags flags, 79 | ID3D11Buffer * resource, 80 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; 81 | 82 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( 83 | cl_context context, 84 | cl_mem_flags flags, 85 | ID3D11Texture2D * resource, 86 | UINT subresource, 87 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; 88 | 89 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( 90 | cl_context context, 91 | cl_mem_flags flags, 92 | ID3D11Texture3D * resource, 93 | UINT subresource, 94 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; 95 | 96 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( 97 | cl_command_queue command_queue, 98 | cl_uint num_objects, 99 | const cl_mem * mem_objects, 100 | cl_uint num_events_in_wait_list, 101 | const cl_event * event_wait_list, 102 | cl_event * event) CL_API_SUFFIX__VERSION_1_2; 103 | 104 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( 105 | cl_command_queue command_queue, 106 | cl_uint num_objects, 107 | const cl_mem * mem_objects, 108 | cl_uint num_events_in_wait_list, 109 | const cl_event * event_wait_list, 110 | cl_event * event) CL_API_SUFFIX__VERSION_1_2; 111 | 112 | #ifdef __cplusplus 113 | } 114 | #endif 115 | 116 | #endif /* __OPENCL_CL_D3D11_H */ 117 | -------------------------------------------------------------------------------- 
/OpenCL/include/CL/cl_dx9_media_sharing.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H 18 | #define __OPENCL_CL_DX9_MEDIA_SHARING_H 19 | 20 | #include 21 | #include 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | /******************************************************************************/ 28 | /* cl_khr_dx9_media_sharing */ 29 | #define cl_khr_dx9_media_sharing 1 30 | 31 | typedef cl_uint cl_dx9_media_adapter_type_khr; 32 | typedef cl_uint cl_dx9_media_adapter_set_khr; 33 | 34 | #if defined(_WIN32) 35 | #include 36 | typedef struct _cl_dx9_surface_info_khr 37 | { 38 | IDirect3DSurface9 *resource; 39 | HANDLE shared_handle; 40 | } cl_dx9_surface_info_khr; 41 | #endif 42 | 43 | 44 | /******************************************************************************/ 45 | 46 | /* Error Codes */ 47 | #define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 48 | #define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 49 | #define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 50 | #define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 51 | 52 | /* cl_media_adapter_type_khr */ 53 | 
#define CL_ADAPTER_D3D9_KHR 0x2020 54 | #define CL_ADAPTER_D3D9EX_KHR 0x2021 55 | #define CL_ADAPTER_DXVA_KHR 0x2022 56 | 57 | /* cl_media_adapter_set_khr */ 58 | #define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 59 | #define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 60 | 61 | /* cl_context_info */ 62 | #define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 63 | #define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 64 | #define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 65 | 66 | /* cl_mem_info */ 67 | #define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 68 | #define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 69 | 70 | /* cl_image_info */ 71 | #define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A 72 | 73 | /* cl_command_type */ 74 | #define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B 75 | #define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C 76 | 77 | /******************************************************************************/ 78 | 79 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( 80 | cl_platform_id platform, 81 | cl_uint num_media_adapters, 82 | cl_dx9_media_adapter_type_khr * media_adapter_type, 83 | void * media_adapters, 84 | cl_dx9_media_adapter_set_khr media_adapter_set, 85 | cl_uint num_entries, 86 | cl_device_id * devices, 87 | cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; 88 | 89 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( 90 | cl_context context, 91 | cl_mem_flags flags, 92 | cl_dx9_media_adapter_type_khr adapter_type, 93 | void * surface_info, 94 | cl_uint plane, 95 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; 96 | 97 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( 98 | cl_command_queue command_queue, 99 | cl_uint num_objects, 100 | const cl_mem * mem_objects, 101 | cl_uint num_events_in_wait_list, 102 | const cl_event * event_wait_list, 103 | cl_event * event) CL_API_SUFFIX__VERSION_1_2; 104 | 105 | typedef CL_API_ENTRY cl_int (CL_API_CALL 
*clEnqueueReleaseDX9MediaSurfacesKHR_fn)( 106 | cl_command_queue command_queue, 107 | cl_uint num_objects, 108 | const cl_mem * mem_objects, 109 | cl_uint num_events_in_wait_list, 110 | const cl_event * event_wait_list, 111 | cl_event * event) CL_API_SUFFIX__VERSION_1_2; 112 | 113 | #ifdef __cplusplus 114 | } 115 | #endif 116 | 117 | #endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ 118 | -------------------------------------------------------------------------------- /OpenCL/include/CL/cl_dx9_media_sharing_intel.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | /*****************************************************************************\ 17 | 18 | Copyright (c) 2013-2019 Intel Corporation All Rights Reserved. 19 | 20 | THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS 24 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 25 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 26 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 28 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE 30 | MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | File Name: cl_dx9_media_sharing_intel.h 33 | 34 | Abstract: 35 | 36 | Notes: 37 | 38 | \*****************************************************************************/ 39 | 40 | #ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H 41 | #define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H 42 | 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | 50 | #ifdef __cplusplus 51 | extern "C" { 52 | #endif 53 | 54 | /*************************************** 55 | * cl_intel_dx9_media_sharing extension * 56 | ****************************************/ 57 | 58 | #define cl_intel_dx9_media_sharing 1 59 | 60 | typedef cl_uint cl_dx9_device_source_intel; 61 | typedef cl_uint cl_dx9_device_set_intel; 62 | 63 | /* error codes */ 64 | #define CL_INVALID_DX9_DEVICE_INTEL -1010 65 | #define CL_INVALID_DX9_RESOURCE_INTEL -1011 66 | #define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 67 | #define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 68 | 69 | /* cl_dx9_device_source_intel */ 70 | #define CL_D3D9_DEVICE_INTEL 0x4022 71 | #define CL_D3D9EX_DEVICE_INTEL 0x4070 72 | #define CL_DXVA_DEVICE_INTEL 0x4071 73 | 74 | /* cl_dx9_device_set_intel */ 75 | #define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 76 | #define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 77 | 78 | /* cl_context_info */ 79 | #define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 80 | #define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 81 | #define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 82 | 83 
| /* cl_mem_info */ 84 | #define CL_MEM_DX9_RESOURCE_INTEL 0x4027 85 | #define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 86 | 87 | /* cl_image_info */ 88 | #define CL_IMAGE_DX9_PLANE_INTEL 0x4075 89 | 90 | /* cl_command_type */ 91 | #define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A 92 | #define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B 93 | /******************************************************************************/ 94 | 95 | extern CL_API_ENTRY cl_int CL_API_CALL 96 | clGetDeviceIDsFromDX9INTEL( 97 | cl_platform_id platform, 98 | cl_dx9_device_source_intel dx9_device_source, 99 | void* dx9_object, 100 | cl_dx9_device_set_intel dx9_device_set, 101 | cl_uint num_entries, 102 | cl_device_id* devices, 103 | cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1; 104 | 105 | typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)( 106 | cl_platform_id platform, 107 | cl_dx9_device_source_intel dx9_device_source, 108 | void* dx9_object, 109 | cl_dx9_device_set_intel dx9_device_set, 110 | cl_uint num_entries, 111 | cl_device_id* devices, 112 | cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1; 113 | 114 | extern CL_API_ENTRY cl_mem CL_API_CALL 115 | clCreateFromDX9MediaSurfaceINTEL( 116 | cl_context context, 117 | cl_mem_flags flags, 118 | IDirect3DSurface9* resource, 119 | HANDLE sharedHandle, 120 | UINT plane, 121 | cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; 122 | 123 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)( 124 | cl_context context, 125 | cl_mem_flags flags, 126 | IDirect3DSurface9* resource, 127 | HANDLE sharedHandle, 128 | UINT plane, 129 | cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; 130 | 131 | extern CL_API_ENTRY cl_int CL_API_CALL 132 | clEnqueueAcquireDX9ObjectsINTEL( 133 | cl_command_queue command_queue, 134 | cl_uint num_objects, 135 | const cl_mem* mem_objects, 136 | cl_uint num_events_in_wait_list, 137 | const cl_event* event_wait_list, 138 | cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; 139 | 
140 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)( 141 | cl_command_queue command_queue, 142 | cl_uint num_objects, 143 | const cl_mem* mem_objects, 144 | cl_uint num_events_in_wait_list, 145 | const cl_event* event_wait_list, 146 | cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; 147 | 148 | extern CL_API_ENTRY cl_int CL_API_CALL 149 | clEnqueueReleaseDX9ObjectsINTEL( 150 | cl_command_queue command_queue, 151 | cl_uint num_objects, 152 | cl_mem* mem_objects, 153 | cl_uint num_events_in_wait_list, 154 | const cl_event* event_wait_list, 155 | cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; 156 | 157 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)( 158 | cl_command_queue command_queue, 159 | cl_uint num_objects, 160 | cl_mem* mem_objects, 161 | cl_uint num_events_in_wait_list, 162 | const cl_event* event_wait_list, 163 | cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; 164 | 165 | #ifdef __cplusplus 166 | } 167 | #endif 168 | 169 | #endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */ 170 | -------------------------------------------------------------------------------- /OpenCL/include/CL/cl_egl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_CL_EGL_H 18 | #define __OPENCL_CL_EGL_H 19 | 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | 27 | /* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ 28 | #define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F 29 | #define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D 30 | #define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E 31 | 32 | /* Error type for clCreateFromEGLImageKHR */ 33 | #define CL_INVALID_EGL_OBJECT_KHR -1093 34 | #define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 35 | 36 | /* CLeglImageKHR is an opaque handle to an EGLImage */ 37 | typedef void* CLeglImageKHR; 38 | 39 | /* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ 40 | typedef void* CLeglDisplayKHR; 41 | 42 | /* CLeglSyncKHR is an opaque handle to an EGLSync object */ 43 | typedef void* CLeglSyncKHR; 44 | 45 | /* properties passed to clCreateFromEGLImageKHR */ 46 | typedef intptr_t cl_egl_image_properties_khr; 47 | 48 | 49 | #define cl_khr_egl_image 1 50 | 51 | extern CL_API_ENTRY cl_mem CL_API_CALL 52 | clCreateFromEGLImageKHR(cl_context context, 53 | CLeglDisplayKHR egldisplay, 54 | CLeglImageKHR eglimage, 55 | cl_mem_flags flags, 56 | const cl_egl_image_properties_khr * properties, 57 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 58 | 59 | typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)( 60 | cl_context context, 61 | CLeglDisplayKHR egldisplay, 62 | CLeglImageKHR eglimage, 63 | cl_mem_flags flags, 64 | const cl_egl_image_properties_khr * properties, 65 | cl_int * errcode_ret); 66 | 67 | 68 | extern CL_API_ENTRY cl_int CL_API_CALL 69 | clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue, 70 | cl_uint num_objects, 71 | const cl_mem * mem_objects, 72 | cl_uint num_events_in_wait_list, 73 | const cl_event * event_wait_list, 74 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 75 | 76 | typedef 
CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( 77 | cl_command_queue command_queue, 78 | cl_uint num_objects, 79 | const cl_mem * mem_objects, 80 | cl_uint num_events_in_wait_list, 81 | const cl_event * event_wait_list, 82 | cl_event * event); 83 | 84 | 85 | extern CL_API_ENTRY cl_int CL_API_CALL 86 | clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue, 87 | cl_uint num_objects, 88 | const cl_mem * mem_objects, 89 | cl_uint num_events_in_wait_list, 90 | const cl_event * event_wait_list, 91 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 92 | 93 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( 94 | cl_command_queue command_queue, 95 | cl_uint num_objects, 96 | const cl_mem * mem_objects, 97 | cl_uint num_events_in_wait_list, 98 | const cl_event * event_wait_list, 99 | cl_event * event); 100 | 101 | 102 | #define cl_khr_egl_event 1 103 | 104 | extern CL_API_ENTRY cl_event CL_API_CALL 105 | clCreateEventFromEGLSyncKHR(cl_context context, 106 | CLeglSyncKHR sync, 107 | CLeglDisplayKHR display, 108 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 109 | 110 | typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( 111 | cl_context context, 112 | CLeglSyncKHR sync, 113 | CLeglDisplayKHR display, 114 | cl_int * errcode_ret); 115 | 116 | #ifdef __cplusplus 117 | } 118 | #endif 119 | 120 | #endif /* __OPENCL_CL_EGL_H */ 121 | -------------------------------------------------------------------------------- /OpenCL/include/CL/cl_gl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_CL_GL_H 18 | #define __OPENCL_CL_GL_H 19 | 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | typedef cl_uint cl_gl_object_type; 27 | typedef cl_uint cl_gl_texture_info; 28 | typedef cl_uint cl_gl_platform_info; 29 | typedef struct __GLsync *cl_GLsync; 30 | 31 | /* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ 32 | #define CL_GL_OBJECT_BUFFER 0x2000 33 | #define CL_GL_OBJECT_TEXTURE2D 0x2001 34 | #define CL_GL_OBJECT_TEXTURE3D 0x2002 35 | #define CL_GL_OBJECT_RENDERBUFFER 0x2003 36 | #ifdef CL_VERSION_1_2 37 | #define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E 38 | #define CL_GL_OBJECT_TEXTURE1D 0x200F 39 | #define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 40 | #define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 41 | #endif 42 | 43 | /* cl_gl_texture_info */ 44 | #define CL_GL_TEXTURE_TARGET 0x2004 45 | #define CL_GL_MIPMAP_LEVEL 0x2005 46 | #ifdef CL_VERSION_1_2 47 | #define CL_GL_NUM_SAMPLES 0x2012 48 | #endif 49 | 50 | 51 | extern CL_API_ENTRY cl_mem CL_API_CALL 52 | clCreateFromGLBuffer(cl_context context, 53 | cl_mem_flags flags, 54 | cl_GLuint bufobj, 55 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 56 | 57 | #ifdef CL_VERSION_1_2 58 | 59 | extern CL_API_ENTRY cl_mem CL_API_CALL 60 | clCreateFromGLTexture(cl_context context, 61 | cl_mem_flags flags, 62 | cl_GLenum target, 63 | cl_GLint miplevel, 64 | cl_GLuint texture, 65 | cl_int * errcode_ret) 
CL_API_SUFFIX__VERSION_1_2; 66 | 67 | #endif 68 | 69 | extern CL_API_ENTRY cl_mem CL_API_CALL 70 | clCreateFromGLRenderbuffer(cl_context context, 71 | cl_mem_flags flags, 72 | cl_GLuint renderbuffer, 73 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; 74 | 75 | extern CL_API_ENTRY cl_int CL_API_CALL 76 | clGetGLObjectInfo(cl_mem memobj, 77 | cl_gl_object_type * gl_object_type, 78 | cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0; 79 | 80 | extern CL_API_ENTRY cl_int CL_API_CALL 81 | clGetGLTextureInfo(cl_mem memobj, 82 | cl_gl_texture_info param_name, 83 | size_t param_value_size, 84 | void * param_value, 85 | size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; 86 | 87 | extern CL_API_ENTRY cl_int CL_API_CALL 88 | clEnqueueAcquireGLObjects(cl_command_queue command_queue, 89 | cl_uint num_objects, 90 | const cl_mem * mem_objects, 91 | cl_uint num_events_in_wait_list, 92 | const cl_event * event_wait_list, 93 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 94 | 95 | extern CL_API_ENTRY cl_int CL_API_CALL 96 | clEnqueueReleaseGLObjects(cl_command_queue command_queue, 97 | cl_uint num_objects, 98 | const cl_mem * mem_objects, 99 | cl_uint num_events_in_wait_list, 100 | const cl_event * event_wait_list, 101 | cl_event * event) CL_API_SUFFIX__VERSION_1_0; 102 | 103 | 104 | /* Deprecated OpenCL 1.1 APIs */ 105 | extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL 106 | clCreateFromGLTexture2D(cl_context context, 107 | cl_mem_flags flags, 108 | cl_GLenum target, 109 | cl_GLint miplevel, 110 | cl_GLuint texture, 111 | cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; 112 | 113 | extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL 114 | clCreateFromGLTexture3D(cl_context context, 115 | cl_mem_flags flags, 116 | cl_GLenum target, 117 | cl_GLint miplevel, 118 | cl_GLuint texture, 119 | cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; 120 | 121 | /* cl_khr_gl_sharing extension */ 122 | 123 
| #define cl_khr_gl_sharing 1 124 | 125 | typedef cl_uint cl_gl_context_info; 126 | 127 | /* Additional Error Codes */ 128 | #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 129 | 130 | /* cl_gl_context_info */ 131 | #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 132 | #define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 133 | 134 | /* Additional cl_context_properties */ 135 | #define CL_GL_CONTEXT_KHR 0x2008 136 | #define CL_EGL_DISPLAY_KHR 0x2009 137 | #define CL_GLX_DISPLAY_KHR 0x200A 138 | #define CL_WGL_HDC_KHR 0x200B 139 | #define CL_CGL_SHAREGROUP_KHR 0x200C 140 | 141 | extern CL_API_ENTRY cl_int CL_API_CALL 142 | clGetGLContextInfoKHR(const cl_context_properties * properties, 143 | cl_gl_context_info param_name, 144 | size_t param_value_size, 145 | void * param_value, 146 | size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; 147 | 148 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( 149 | const cl_context_properties * properties, 150 | cl_gl_context_info param_name, 151 | size_t param_value_size, 152 | void * param_value, 153 | size_t * param_value_size_ret); 154 | 155 | #ifdef __cplusplus 156 | } 157 | #endif 158 | 159 | #endif /* __OPENCL_CL_GL_H */ 160 | -------------------------------------------------------------------------------- /OpenCL/include/CL/cl_gl_ext.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_CL_GL_EXT_H 18 | #define __OPENCL_CL_GL_EXT_H 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #include 25 | 26 | /* 27 | * cl_khr_gl_event extension 28 | */ 29 | #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D 30 | 31 | extern CL_API_ENTRY cl_event CL_API_CALL 32 | clCreateEventFromGLsyncKHR(cl_context context, 33 | cl_GLsync cl_GLsync, 34 | cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; 35 | 36 | #ifdef __cplusplus 37 | } 38 | #endif 39 | 40 | #endif /* __OPENCL_CL_GL_EXT_H */ 41 | -------------------------------------------------------------------------------- /OpenCL/include/CL/cl_half.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2019-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | ******************************************************************************/ 16 | 17 | /** 18 | * This is a header-only utility library that provides OpenCL host code with 19 | * routines for converting to/from cl_half values. 20 | * 21 | * Example usage: 22 | * 23 | * #include <CL/cl_half.h> 24 | * ... 25 | * cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE); 26 | * cl_float f = cl_half_to_float(h); 27 | */ 28 | 29 | #ifndef OPENCL_CL_HALF_H 30 | #define OPENCL_CL_HALF_H 31 | /* NOTE(review): the two include targets below are missing in this copy; presumably <CL/cl_platform.h> (cl_half, cl_float) and <stdint.h> (uint16_t, uint32_t) -- confirm. */ 32 | #include 33 | 34 | #include 35 | 36 | #ifdef __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | 41 | /** 42 | * Rounding mode used when converting to cl_half. 43 | */ 44 | typedef enum 45 | { 46 | CL_HALF_RTE, // round to nearest even 47 | CL_HALF_RTZ, // round towards zero 48 | CL_HALF_RTP, // round towards positive infinity 49 | CL_HALF_RTN, // round towards negative infinity 50 | } cl_half_rounding_mode; 51 | 52 | 53 | /* Private utility macros. CL_HALF_EXP_MASK is the bit pattern returned (combined with the sign bit) for infinity; CL_HALF_MAX_FINITE_MAG is the magnitude returned when an overflow is clamped to the largest finite value. */ 54 | #define CL_HALF_EXP_MASK 0x7C00 55 | #define CL_HALF_MAX_FINITE_MAG 0x7BFF 56 | 57 | 58 | /* 59 | * Utility to deal with values that overflow when converting to half precision. 60 | * 'sign' is the sign bit of the source value (expected to be 0 or 1, non-zero meaning negative); it is shifted into bit 15 of the result. Returns the largest finite magnitude when rounding_mode rounds away from the overflow direction (RTZ always, RTP for negative values, RTN for positive values); every other case returns signed infinity. */ 61 | static inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode, 62 | uint16_t sign) 63 | { 64 | if (rounding_mode == CL_HALF_RTZ) 65 | { 66 | // Round overflow towards zero -> largest finite number (preserving sign) 67 | return (sign << 15) | CL_HALF_MAX_FINITE_MAG; 68 | } 69 | else if (rounding_mode == CL_HALF_RTP && sign) 70 | { 71 | // Round negative overflow towards positive infinity -> most negative finite number 72 | return (1 << 15) | CL_HALF_MAX_FINITE_MAG; 73 | } 74 | else if (rounding_mode == CL_HALF_RTN && !sign) 75 | { 76 | // Round positive overflow towards negative infinity -> largest finite number 77 | return CL_HALF_MAX_FINITE_MAG; 78 | } 79 | 80 | // Overflow to infinity 81 | return (sign << 15) | CL_HALF_EXP_MASK; 82 | } 83 | 84 | /* 85 | * Utility to deal with values that underflow when converting to half precision. 
86 | */ 87 | /* 'sign' is the sign bit of the source value (0 or 1), shifted into bit 15 of the result. Returns the smallest non-zero magnitude (only the mantissa LSB set) when the rounding mode rounds away from zero for that sign (RTP for positive, RTN for negative); otherwise returns a signed zero. */ static inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode, 88 | uint16_t sign) 89 | { 90 | if (rounding_mode == CL_HALF_RTP && !sign) 91 | { 92 | // Round underflow towards positive infinity -> smallest positive value 93 | return (sign << 15) | 1; 94 | } 95 | else if (rounding_mode == CL_HALF_RTN && sign) 96 | { 97 | // Round underflow towards negative infinity -> largest negative value 98 | return (sign << 15) | 1; 99 | } 100 | 101 | // Flush to zero 102 | return (sign << 15); 103 | } 104 | 105 | 106 | /** 107 | * Convert a cl_float to a cl_half. 108 | * 'f' is the value to convert; 'rounding_mode' selects how inexact results, overflow and underflow are rounded. Returns the half-precision bit pattern assembled as sign in bit 15, exponent starting at bit 10 and mantissa in the low 10 bits. NaN and infinity inputs stay NaN and infinity; zero keeps its sign. */ 109 | static inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode) 110 | { 111 | // Type-punning to get direct access to underlying bits 112 | union 113 | { 114 | cl_float f; 115 | uint32_t i; 116 | } f32; 117 | f32.f = f; 118 | 119 | // Extract sign bit 120 | uint16_t sign = f32.i >> 31; 121 | 122 | // Extract FP32 exponent and mantissa 123 | uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF; 124 | uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1); 125 | 126 | // Remove FP32 exponent bias 127 | int32_t exp = f_exp - CL_FLT_MAX_EXP + 1; 128 | 129 | // Add FP16 exponent bias 130 | uint16_t h_exp = exp + CL_HALF_MAX_EXP - 1; 131 | 132 | // Position of the bit that will become the FP16 mantissa LSB 133 | uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG; 134 | 135 | // Check for NaN / infinity 136 | if (f_exp == 0xFF) 137 | { 138 | if (f_mant) 139 | { 140 | // NaN -> propagate mantissa and silence it 141 | uint16_t h_mant = f_mant >> lsb_pos; 142 | h_mant |= 0x200; /* top bit of the 10-bit mantissa; also keeps the mantissa non-zero if the shift dropped every payload bit, so the result cannot turn into infinity */ 143 | return (sign << 15) | CL_HALF_EXP_MASK | h_mant; 144 | } 145 | else 146 | { 147 | // Infinity -> zero mantissa 148 | return (sign << 15) | CL_HALF_EXP_MASK; 149 | } 150 | } 151 | 152 | // Check for zero 153 | if (!f_exp && !f_mant) 154 | { 155 | return (sign << 15); 156 | } 157 | 158 | // Check for overflow 159 | if (exp >= CL_HALF_MAX_EXP) 160 | { 161 | return 
cl_half_handle_overflow(rounding_mode, sign); 162 | } 163 | 164 | // Check for underflow 165 | if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1)) 166 | { 167 | return cl_half_handle_underflow(rounding_mode, sign); 168 | } 169 | 170 | // Check for value that will become denormal 171 | if (exp < -14) /* presumably -14 is the smallest normal FP16 exponent (CL_HALF_MIN_EXP - 1) -- confirm against cl_platform.h */ 172 | { 173 | // Denormal -> include the implicit 1 from the FP32 mantissa 174 | h_exp = 0; 175 | f_mant |= 1 << (CL_FLT_MANT_DIG - 1); 176 | 177 | // Mantissa shift amount depends on exponent 178 | lsb_pos = -exp + (CL_FLT_MANT_DIG - 25); /* the shift grows by one bit for every step exp decreases */ 179 | } 180 | 181 | // Generate FP16 mantissa by shifting FP32 mantissa 182 | uint16_t h_mant = f_mant >> lsb_pos; 183 | 184 | // Check whether we need to round 185 | uint32_t halfway = 1 << (lsb_pos - 1); /* value of the highest discarded bit */ 186 | uint32_t mask = (halfway << 1) - 1; /* selects all discarded bits */ 187 | switch (rounding_mode) 188 | { 189 | case CL_HALF_RTE: 190 | if ((f_mant & mask) > halfway) 191 | { 192 | // More than halfway -> round up 193 | h_mant += 1; 194 | } 195 | else if ((f_mant & mask) == halfway) 196 | { 197 | // Exactly halfway -> round to nearest even 198 | if (h_mant & 0x1) 199 | h_mant += 1; 200 | } 201 | break; 202 | case CL_HALF_RTZ: 203 | // Mantissa has already been truncated -> do nothing 204 | break; 205 | case CL_HALF_RTP: 206 | if ((f_mant & mask) && !sign) 207 | { 208 | // Round positive numbers up 209 | h_mant += 1; 210 | } 211 | break; 212 | case CL_HALF_RTN: 213 | if ((f_mant & mask) && sign) 214 | { 215 | // Round negative numbers down 216 | h_mant += 1; 217 | } 218 | break; 219 | } 220 | 221 | // Check for mantissa overflow 222 | if (h_mant & 0x400) /* bit 10 set: rounding carried out of the 10-bit mantissa field */ 223 | { 224 | h_exp += 1; 225 | h_mant = 0; 226 | } 227 | 228 | return (sign << 15) | (h_exp << 10) | h_mant; 229 | } 230 | 231 | 232 | /** 233 | * Convert a cl_double to a cl_half. 
234 | */ 235 | static inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode) 236 | { 237 | // Type-punning to get direct access to underlying bits 238 | union 239 | { 240 | cl_double d; 241 | uint64_t i; 242 | } f64; 243 | f64.d = d; 244 | 245 | // Extract sign bit 246 | uint16_t sign = f64.i >> 63; 247 | 248 | // Extract FP64 exponent and mantissa 249 | uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF; 250 | uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1); 251 | 252 | // Remove FP64 exponent bias 253 | int64_t exp = d_exp - CL_DBL_MAX_EXP + 1; 254 | 255 | // Add FP16 exponent bias 256 | uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1); 257 | 258 | // Position of the bit that will become the FP16 mantissa LSB 259 | uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG; 260 | 261 | // Check for NaN / infinity 262 | if (d_exp == 0x7FF) 263 | { 264 | if (d_mant) 265 | { 266 | // NaN -> propagate mantissa and silence it 267 | uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos); 268 | h_mant |= 0x200; 269 | return (sign << 15) | CL_HALF_EXP_MASK | h_mant; 270 | } 271 | else 272 | { 273 | // Infinity -> zero mantissa 274 | return (sign << 15) | CL_HALF_EXP_MASK; 275 | } 276 | } 277 | 278 | // Check for zero 279 | if (!d_exp && !d_mant) 280 | { 281 | return (sign << 15); 282 | } 283 | 284 | // Check for overflow 285 | if (exp >= CL_HALF_MAX_EXP) 286 | { 287 | return cl_half_handle_overflow(rounding_mode, sign); 288 | } 289 | 290 | // Check for underflow 291 | if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1)) 292 | { 293 | return cl_half_handle_underflow(rounding_mode, sign); 294 | } 295 | 296 | // Check for value that will become denormal 297 | if (exp < -14) 298 | { 299 | // Include the implicit 1 from the FP64 mantissa 300 | h_exp = 0; 301 | d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1); 302 | 303 | // Mantissa shift amount depends on exponent 304 | lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25)); 
305 | } 306 | 307 | // Generate FP16 mantissa by shifting FP64 mantissa 308 | uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos); 309 | 310 | // Check whether we need to round 311 | uint64_t halfway = (uint64_t)1 << (lsb_pos - 1); 312 | uint64_t mask = (halfway << 1) - 1; 313 | switch (rounding_mode) 314 | { 315 | case CL_HALF_RTE: 316 | if ((d_mant & mask) > halfway) 317 | { 318 | // More than halfway -> round up 319 | h_mant += 1; 320 | } 321 | else if ((d_mant & mask) == halfway) 322 | { 323 | // Exactly halfway -> round to nearest even 324 | if (h_mant & 0x1) 325 | h_mant += 1; 326 | } 327 | break; 328 | case CL_HALF_RTZ: 329 | // Mantissa has already been truncated -> do nothing 330 | break; 331 | case CL_HALF_RTP: 332 | if ((d_mant & mask) && !sign) 333 | { 334 | // Round positive numbers up 335 | h_mant += 1; 336 | } 337 | break; 338 | case CL_HALF_RTN: 339 | if ((d_mant & mask) && sign) 340 | { 341 | // Round negative numbers down 342 | h_mant += 1; 343 | } 344 | break; 345 | } 346 | 347 | // Check for mantissa overflow 348 | if (h_mant & 0x400) 349 | { 350 | h_exp += 1; 351 | h_mant = 0; 352 | } 353 | 354 | return (sign << 15) | (h_exp << 10) | h_mant; 355 | } 356 | 357 | 358 | /** 359 | * Convert a cl_half to a cl_float. 
360 | */ 361 | static inline cl_float cl_half_to_float(cl_half h) 362 | { 363 | // Type-punning to get direct access to underlying bits 364 | union 365 | { 366 | cl_float f; 367 | uint32_t i; 368 | } f32; 369 | 370 | // Extract sign bit 371 | uint16_t sign = h >> 15; 372 | 373 | // Extract FP16 exponent and mantissa 374 | uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F; 375 | uint16_t h_mant = h & 0x3FF; 376 | 377 | // Remove FP16 exponent bias 378 | int32_t exp = h_exp - CL_HALF_MAX_EXP + 1; 379 | 380 | // Add FP32 exponent bias 381 | uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1; 382 | 383 | // Check for NaN / infinity 384 | if (h_exp == 0x1F) 385 | { 386 | if (h_mant) 387 | { 388 | // NaN -> propagate mantissa and silence it 389 | uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG); 390 | f_mant |= 0x400000; 391 | f32.i = (sign << 31) | 0x7F800000 | f_mant; 392 | return f32.f; 393 | } 394 | else 395 | { 396 | // Infinity -> zero mantissa 397 | f32.i = (sign << 31) | 0x7F800000; 398 | return f32.f; 399 | } 400 | } 401 | 402 | // Check for zero / denormal 403 | if (h_exp == 0) 404 | { 405 | if (h_mant == 0) 406 | { 407 | // Zero -> zero exponent 408 | f_exp = 0; 409 | } 410 | else 411 | { 412 | // Denormal -> normalize it 413 | // - Shift mantissa to make most-significant 1 implicit 414 | // - Adjust exponent accordingly 415 | uint32_t shift = 0; 416 | while ((h_mant & 0x400) == 0) 417 | { 418 | h_mant <<= 1; 419 | shift++; 420 | } 421 | h_mant &= 0x3FF; 422 | f_exp -= shift - 1; 423 | } 424 | } 425 | 426 | f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13); 427 | return f32.f; 428 | } 429 | 430 | 431 | #undef CL_HALF_EXP_MASK 432 | #undef CL_HALF_MAX_FINITE_MAG 433 | 434 | 435 | #ifdef __cplusplus 436 | } 437 | #endif 438 | 439 | 440 | #endif /* OPENCL_CL_HALF_H */ 441 | -------------------------------------------------------------------------------- /OpenCL/include/CL/cl_va_api_media_sharing_intel.h: 
-------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | /*****************************************************************************\ 17 | 18 | Copyright (c) 2013-2019 Intel Corporation All Rights Reserved. 19 | 20 | THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 24 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 25 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 26 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 28 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE 30 | MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | 32 | File Name: cl_va_api_media_sharing_intel.h 33 | 34 | Abstract: 35 | 36 | Notes: 37 | 38 | \*****************************************************************************/ 39 | 40 | 41 | #ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H 42 | #define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H 43 | 44 | #include 45 | #include 46 | #include 47 | 48 | #ifdef __cplusplus 49 | extern "C" { 50 | #endif 51 | 52 | /****************************************** 53 | * cl_intel_va_api_media_sharing extension * 54 | *******************************************/ 55 | 56 | #define cl_intel_va_api_media_sharing 1 57 | 58 | /* error codes */ 59 | #define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098 60 | #define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099 61 | #define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100 62 | #define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101 63 | 64 | /* cl_va_api_device_source_intel */ 65 | #define CL_VA_API_DISPLAY_INTEL 0x4094 66 | 67 | /* cl_va_api_device_set_intel */ 68 | #define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095 69 | #define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096 70 | 71 | /* cl_context_info */ 72 | #define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097 73 | 74 | /* cl_mem_info */ 75 | #define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098 76 | 77 | /* cl_image_info */ 78 | #define CL_IMAGE_VA_API_PLANE_INTEL 0x4099 79 | 80 | /* cl_command_type */ 81 | #define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A 82 | #define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B 83 | 84 | typedef cl_uint cl_va_api_device_source_intel; 85 | typedef cl_uint cl_va_api_device_set_intel; 86 | 87 | extern CL_API_ENTRY cl_int CL_API_CALL 88 | clGetDeviceIDsFromVA_APIMediaAdapterINTEL( 89 | cl_platform_id platform, 90 | cl_va_api_device_source_intel media_adapter_type, 91 | void* media_adapter, 92 | cl_va_api_device_set_intel media_adapter_set, 93 | cl_uint num_entries, 94 | cl_device_id* devices, 95 | cl_uint* num_devices) 
CL_EXT_SUFFIX__VERSION_1_2; 96 | 97 | typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( 98 | cl_platform_id platform, 99 | cl_va_api_device_source_intel media_adapter_type, 100 | void* media_adapter, 101 | cl_va_api_device_set_intel media_adapter_set, 102 | cl_uint num_entries, 103 | cl_device_id* devices, 104 | cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2; 105 | 106 | extern CL_API_ENTRY cl_mem CL_API_CALL 107 | clCreateFromVA_APIMediaSurfaceINTEL( 108 | cl_context context, 109 | cl_mem_flags flags, 110 | VASurfaceID* surface, 111 | cl_uint plane, 112 | cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; 113 | 114 | typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)( 115 | cl_context context, 116 | cl_mem_flags flags, 117 | VASurfaceID* surface, 118 | cl_uint plane, 119 | cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; 120 | 121 | extern CL_API_ENTRY cl_int CL_API_CALL 122 | clEnqueueAcquireVA_APIMediaSurfacesINTEL( 123 | cl_command_queue command_queue, 124 | cl_uint num_objects, 125 | const cl_mem* mem_objects, 126 | cl_uint num_events_in_wait_list, 127 | const cl_event* event_wait_list, 128 | cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; 129 | 130 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( 131 | cl_command_queue command_queue, 132 | cl_uint num_objects, 133 | const cl_mem* mem_objects, 134 | cl_uint num_events_in_wait_list, 135 | const cl_event* event_wait_list, 136 | cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; 137 | 138 | extern CL_API_ENTRY cl_int CL_API_CALL 139 | clEnqueueReleaseVA_APIMediaSurfacesINTEL( 140 | cl_command_queue command_queue, 141 | cl_uint num_objects, 142 | const cl_mem* mem_objects, 143 | cl_uint num_events_in_wait_list, 144 | const cl_event* event_wait_list, 145 | cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; 146 | 147 | typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( 148 | 
cl_command_queue command_queue, 149 | cl_uint num_objects, 150 | const cl_mem* mem_objects, 151 | cl_uint num_events_in_wait_list, 152 | const cl_event* event_wait_list, 153 | cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; 154 | 155 | #ifdef __cplusplus 156 | } 157 | #endif 158 | 159 | #endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */ 160 | -------------------------------------------------------------------------------- /OpenCL/include/CL/cl_version.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2018-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | ******************************************************************************/ 16 | 17 | #ifndef __CL_VERSION_H 18 | #define __CL_VERSION_H 19 | 20 | /* Detect which version to target */ 21 | #if !defined(CL_TARGET_OPENCL_VERSION) 22 | #pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. 
Defaulting to 220 (OpenCL 2.2)") 23 | #define CL_TARGET_OPENCL_VERSION 220 24 | #endif 25 | #if CL_TARGET_OPENCL_VERSION != 100 && \ 26 | CL_TARGET_OPENCL_VERSION != 110 && \ 27 | CL_TARGET_OPENCL_VERSION != 120 && \ 28 | CL_TARGET_OPENCL_VERSION != 200 && \ 29 | CL_TARGET_OPENCL_VERSION != 210 && \ 30 | CL_TARGET_OPENCL_VERSION != 220 && \ 31 | CL_TARGET_OPENCL_VERSION != 300 32 | #pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). Defaulting to 220 (OpenCL 2.2)") 33 | #undef CL_TARGET_OPENCL_VERSION 34 | #define CL_TARGET_OPENCL_VERSION 220 35 | #endif 36 | 37 | 38 | /* OpenCL Version */ 39 | #if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0) 40 | #define CL_VERSION_3_0 1 41 | #endif 42 | #if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2) 43 | #define CL_VERSION_2_2 1 44 | #endif 45 | #if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1) 46 | #define CL_VERSION_2_1 1 47 | #endif 48 | #if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0) 49 | #define CL_VERSION_2_0 1 50 | #endif 51 | #if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2) 52 | #define CL_VERSION_1_2 1 53 | #endif 54 | #if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1) 55 | #define CL_VERSION_1_1 1 56 | #endif 57 | #if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0) 58 | #define CL_VERSION_1_0 1 59 | #endif 60 | 61 | /* Allow deprecated APIs for older OpenCL versions. 
*/ 62 | #if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) 63 | #define CL_USE_DEPRECATED_OPENCL_2_2_APIS 64 | #endif 65 | #if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) 66 | #define CL_USE_DEPRECATED_OPENCL_2_1_APIS 67 | #endif 68 | #if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) 69 | #define CL_USE_DEPRECATED_OPENCL_2_0_APIS 70 | #endif 71 | #if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) 72 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 73 | #endif 74 | #if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) 75 | #define CL_USE_DEPRECATED_OPENCL_1_1_APIS 76 | #endif 77 | #if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) 78 | #define CL_USE_DEPRECATED_OPENCL_1_0_APIS 79 | #endif 80 | 81 | #endif /* __CL_VERSION_H */ 82 | -------------------------------------------------------------------------------- /OpenCL/include/CL/opencl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (c) 2008-2020 The Khronos Group Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | ******************************************************************************/ 16 | 17 | #ifndef __OPENCL_H 18 | #define __OPENCL_H 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif /* __OPENCL_H */ 34 | -------------------------------------------------------------------------------- /OpenCL/lib/OpenCL.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/MicrobenchmarksGui/08bbe546622b843d62f548590d90b5a871d4a588/OpenCL/lib/OpenCL.lib -------------------------------------------------------------------------------- /Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | using System.Windows.Forms; 6 | 7 | namespace MicrobenchmarkGui 8 | { 9 | static class Program 10 | { 11 | /// 12 | /// The main entry point for the application. 13 | /// 14 | [STAThread] 15 | static void Main() 16 | { 17 | Application.EnableVisualStyles(); 18 | Application.SetCompatibleTextRenderingDefault(false); 19 | Application.Run(new MicrobenchmarkForm()); 20 | } 21 | } 22 | } -------------------------------------------------------------------------------- /Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 
8 | [assembly: AssemblyTitle("MicrobenchmarkGui")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("MicrobenchmarkGui")] 13 | [assembly: AssemblyCopyright("Copyright © 2022")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("ea6b854d-fad1-4212-8953-4f32286e1b57")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /Properties/Resources.Designer.cs: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // This code was generated by a tool. 4 | // Runtime Version:4.0.30319.42000 5 | // 6 | // Changes to this file may cause incorrect behavior and will be lost if 7 | // the code is regenerated. 8 | // 9 | //------------------------------------------------------------------------------ 10 | 11 | 12 | namespace MicrobenchmarkGui.Properties 13 | { 14 | /// 15 | /// A strongly-typed resource class, for looking up localized strings, etc. 
16 | /// 17 | // This class was auto-generated by the StronglyTypedResourceBuilder 18 | // class via a tool like ResGen or Visual Studio. 19 | // To add or remove a member, edit your .ResX file then rerun ResGen 20 | // with the /str option, or rebuild your VS project. 21 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")] 22 | [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] 23 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] 24 | internal class Resources 25 | { 26 | 27 | private static global::System.Resources.ResourceManager resourceMan; 28 | 29 | private static global::System.Globalization.CultureInfo resourceCulture; 30 | 31 | [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] 32 | internal Resources() 33 | { 34 | } 35 | 36 | /// 37 | /// Returns the cached ResourceManager instance used by this class. 38 | /// 39 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] 40 | internal static global::System.Resources.ResourceManager ResourceManager 41 | { 42 | get 43 | { 44 | if ((resourceMan == null)) 45 | { 46 | global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("MicrobenchmarkGui.Properties.Resources", typeof(Resources).Assembly); 47 | resourceMan = temp; 48 | } 49 | return resourceMan; 50 | } 51 | } 52 | 53 | /// 54 | /// Overrides the current thread's CurrentUICulture property for all 55 | /// resource lookups using this strongly typed resource class. 
56 | /// 57 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] 58 | internal static global::System.Globalization.CultureInfo Culture 59 | { 60 | get 61 | { 62 | return resourceCulture; 63 | } 64 | set 65 | { 66 | resourceCulture = value; 67 | } 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /Properties/Resources.resx: -------------------------------------------------------------------------------- 1 |  2 | 3 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | text/microsoft-resx 107 | 108 | 109 | 2.0 110 | 111 | 112 | System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 113 | 114 | 115 | System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 116 | 117 | -------------------------------------------------------------------------------- /Properties/Settings.Designer.cs: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // This code was generated by a tool. 4 | // Runtime Version:4.0.30319.42000 5 | // 6 | // Changes to this file may cause incorrect behavior and will be lost if 7 | // the code is regenerated. 
8 | // 9 | //------------------------------------------------------------------------------ 10 | 11 | 12 | namespace MicrobenchmarkGui.Properties 13 | { 14 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] 15 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "11.0.0.0")] 16 | internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase 17 | { 18 | 19 | private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings()))); 20 | 21 | public static Settings Default 22 | { 23 | get 24 | { 25 | return defaultInstance; 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /Properties/Settings.settings: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Microbenchmark GUI, currently cache and memory benchmark 2 | Started as a GUI project for https://github.com/clamchowder/Microbenchmarks, ended as a simple cache and memory benchmark because covering any more would require a hideous amount of GUI code. I'm not trying to imply the GUI code is anything but hideous, by the way :P 3 | 4 | Screenshot of the GUI 5 | 6 | Unlike another well known cache and memory benchmark that's spelled AIDA, this aims to be a free and more advanced tool. It runs through a lot of tests sizes designed to cover most cache capacities. Then, you can look through the results to determine bandwidth and latency for each level in the memory hierarchy. 7 | 8 | This program also provides more control over what's being tested. You can select SSE, AVX, AVX-512, or even MMX. 
You can test instruction fetch bandwidth, like in the screenshot above. And you can control how many threads are used. 9 | 10 | Non-goals: 11 | - Make results comparable to AIDA's. I don't know exactly what they're doing anyway. Don't try to directly compare results from this program to AIDA's. 12 | - Automatically determine bandwidth and latency at different cache levels. I think this is a shortcoming with AIDA since it seems prone to generating erroneous results with new CPU releases. Zen 3 with 96 MB of vertically stacked L3 is a good example. 13 | - Provide any kind of support. Not enough hours in a day, not enough free time :( 14 | 15 | # Building 16 | Get NASM (https://www.nasm.us/) and make sure it's in your path. Then things should build under Visual Studio 2019 or 2022. 17 | 18 | # Memory Bandwidth Options 19 | Most of these should be self-explanatory. Anything that might require more explanation: 20 | 21 | ### Threading Mode 22 | - Private array per thread: Each thread gets its own array. You'll see bandwidth drop off after the sum of private cache sizes is exceeded. 23 | - One array shared by all threads: All threads read from the same array. You'll see bandwidth drop off after cache accessible to one core is exceeded, because data will be duplicated across private caches. 24 | 25 | ### Access Mode 26 | - Data Non-Temporal Read: Non-temporal means accesses don't have good temporal locality. In other words, once a location has been accessed, it's going to be a long time before it'll be accessed again, so don't bother caching it. If checked, `movntdqa` is used to load data. It doesn't seem to behave differently from regular reads, as far as this test is concerned. 27 | - Data Non-Temporal Writes: Non-temporal writes tend to behave very differently and bypass caching, using a write-combining memory model even if the region isn't marked write combining by the OS.
This typically results in higher write bandwidth to memory, at the expense of not benefitting from cache. 28 | - Instruction fetch: Fills an array with the specified instructions, terminates it with a `ret`, and jumps to it. You can specify four different ways to fill the array: 29 | - 4B NOPs (0F 1F 40 00): 4 byte NOP recommended for padding in AMD optimization guides for Bulldozer and later. 4 bytes is representative of instruction length in typical integer code. Good for testing instruction fetch bandwidth in terms of IPC. 30 | - 8B NOPs (0F 1F 84 00 00 00 00 00): 8 byte NOP recommended for padding, as with above. 8 bytes is representative of very long instructions, which can come up when dealing with large immediates or AVX/AVX2/AVX-512 ISA extensions. Good for testing instruction fetch bandwidth in bytes/cycle. 31 | - Taken Branch per 16B: Each 16B block has an unconditional jump that jumps to the next 16B block. Good for testing BTB capacity. 32 | - 4B NOPs (66 66 66 90): 4 byte NOP recommended by an old AMD Athlon optimization guide. Strangely, Athlons seem to do fine with the 0F 1F 40 00 NOPs, but some sorta old Intel CPUs benefit from this. 33 | 34 | ### Test Method 35 | Specifies what instruction set extension to use. Memory accesses are aligned. 36 | - SSE: 128-bit accesses, using `movaps` 37 | - AVX: 256-bit accesses, using `vmovaps` with 256-bit YMM registers. May not be available on your CPU. Using AVX might not be beneficial even if your CPU supports it. For example, 256-bit AVX stores are microcoded on Piledriver and suffer from extremely slow throughput, so you should test with SSE in that case. 38 | - AVX-512: 512-bit accesses, using `vmovaps` with 512-bit ZMM registers. May not be available on your CPU. 39 | - MMX: 64-bit accesses, using `movq`. MMX is a rather old instruction set extension introduced in later versions of the original Pentium. It still works on modern CPUs, even if it doesn't see a ton of use.
40 | 41 | # Memory Latency Options 42 | 43 | Memory latency has fewer test options, because it visits 64B cachelines in random order within a specified test size. But there are still a few controls: 44 | 45 | ## Access Mode 46 | - Simple addressing: instruction uses a register's value as a pointer to read from memory 47 | - Indexed addressing (C): C compilers like to compile `current = arr[current]` into an instruction that uses indexed addressing. In other words, the instruction specifies a base register and an index register. The CPU must add them together to get the final address used to access memory. On some CPUs, this indexed addressing mode creates an extra cycle of latency. 48 | 49 | ## Paging Mode 50 | Most applications have memory mapped for them in 4 KB pages, which reduces wasted memory and fragmentation. Memory can also be mapped in 2 MB pages. Windows calls this "Large Pages", while Linux calls it "Huge Pages". CPUs cache virtual to physical address translations in structures called TLBs, or translation lookaside buffers. 2 MB pages let TLB size go further, since each cached translation works with a 2 MB aligned block of memory instead of a 4 KB one. 51 | 52 | You can use 2 MB pages to minimize address translation penalties, letting you see L2 and L3 cache latency more clearly. However, this is a bit tricky to do on Windows. You need to give your account the "Lock pages in memory" privilege: 53 | 54 | Go to local security policy, local policies, user rights assignment, lock pages in memory and add yourself 55 | 56 | If you've logged in with a non-local account, you can also give "Everyone" the privilege. You need to reboot for the change to take effect. 57 | 58 | Then if you select Large Pages under Paging Mode, the test will allocate 1 GB (the largest test size for mem latency) upfront and run all test sizes within that. That means you need to have 1 GB of contiguous memory free.
If you have a system without much memory that has been running for a while, you might have to reboot before allocation will succeed. 59 | 60 | Have fun! -------------------------------------------------------------------------------- /TestUtilities.cs: --------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace MicrobenchmarkGui
{
    /// <summary>
    /// Static helpers for sizing benchmark runs.
    /// </summary>
    public static class TestUtilities
    {
        // A previous run shorter than this (in ms) is not extrapolated from;
        // the iteration count is simply multiplied by ShortRunGrowthFactor.
        private const float MinExtrapolationTimeMs = 100;
        private const ulong ShortRunGrowthFactor = 5;

        /// <summary>
        /// Scale iterations to reach target time.
        /// </summary>
        /// <param name="lastRunIterations">Last iteration count</param>
        /// <param name="targetTimeMs">Desired run time</param>
        /// <param name="lastTimeMs">Last run time</param>
        /// <returns>
        /// Iteration count for the next run. Five times the last count when the last run
        /// took less than 100 ms, otherwise the last count scaled by
        /// targetTimeMs / lastTimeMs. The result is always at least 1 and saturates at
        /// <see cref="ulong.MaxValue"/> instead of wrapping.
        /// </returns>
        public static ulong ScaleIterations(ulong lastRunIterations, float targetTimeMs, float lastTimeMs)
        {
            // Written as !(x >= limit) so a NaN time also takes the short-run path
            // instead of reaching the float-to-integer conversion below.
            if (!(lastTimeMs >= MinExtrapolationTimeMs))
            {
                if (lastRunIterations > ulong.MaxValue / ShortRunGrowthFactor)
                {
                    return ulong.MaxValue;
                }

                return Math.Max(1UL, lastRunIterations * ShortRunGrowthFactor);
            }

            // double keeps large iteration counts exact where float (24-bit mantissa) would round them.
            double scaled = lastRunIterations * ((double)targetTimeMs / lastTimeMs);

            // Truncating a value below 1 would yield a zero-iteration run; this also
            // catches NaN and negative results from a bad targetTimeMs.
            if (!(scaled >= 1.0))
            {
                return 1;
            }

            // Converting an out-of-range double to ulong is unspecified in an unchecked context.
            if (scaled >= ulong.MaxValue)
            {
                return ulong.MaxValue;
            }

            return (ulong)scaled;
        }
    }
}
-------------------------------------------------------------------------------- /app.manifest: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 53 | 54 | 55 | true 56 | 57 | 58 | 59 | 60 | 61 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /framework.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers 4 | // Windows Header Files 5 | #include 6 | -------------------------------------------------------------------------------- /img/guiscreenshot.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/clamchowder/MicrobenchmarksGui/08bbe546622b843d62f548590d90b5a871d4a588/img/guiscreenshot.png -------------------------------------------------------------------------------- /img/guiscreenshot_latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/MicrobenchmarksGui/08bbe546622b843d62f548590d90b5a871d4a588/img/guiscreenshot_latency.png -------------------------------------------------------------------------------- /img/lockpages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clamchowder/MicrobenchmarksGui/08bbe546622b843d62f548590d90b5a871d4a588/img/lockpages.png -------------------------------------------------------------------------------- /latencykernel.cl: -------------------------------------------------------------------------------- 1 | // unrolled until terascale no longer saw further improvement (10x unroll) 2 | // assumes count will be a multiple of 10. 
// but it won't be too inaccurate with a big count
// not divisible by 10

// Pointer-chasing latency test over __global memory.
//   A     - array in which every element is used as the index of the next element to load
//   count - number of dependent loads requested; the loop advances 10 at a time, so the
//           number of loads actually issued is count rounded up to a multiple of 10
//   ret   - ret[0] receives the sum of the indices visited
// NOTE(review): the sum presumably exists only so the loads have an observable result
// and cannot be removed by the compiler - confirm.
__kernel void unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    int current = A[0];
    // Starts at 0: accumulating into an uninitialized variable leaves ret[0] indeterminate.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
    }

    ret[0] = result;
}

// Same chase as unrolled_latency_test, but the starting index is loaded from
// A[1 + get_local_id(0)] instead of A[0], so it depends on the work-item id.
// NOTE(review): going by the kernel name this works around AMD's compiler
// vectorizing/scalarizing the uniform version - confirm against the host code.
__kernel void unrolled_latency_test_amdvectorworkaround(__global const int* A, int count, __global int* ret) {
    int start = A[1 + get_local_id(0)];
    int current = A[start];
    // Starts at 0: accumulating into an uninitialized variable leaves ret[0] indeterminate.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
    }

    ret[0] = result;
}

// latency test like the unrolled one above, but with input as constant memory
__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {
    int current = A[0];
    // Starts at 0: accumulating into an uninitialized variable leaves ret[0] indeterminate.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
    }

    ret[0] = result;
}

#define local_mem_test_size 1024
// uses local memory (LDS/shmem)
// The first local_mem_test_size elements of A are copied into a __local array, then
// work-item 0 alone runs the pointer chase on that copy and writes ret[0].
// Assumes those elements only contain indices below local_mem_test_size; anything
// larger would index past local_a - TODO confirm the host builds the pattern that way.
__kernel void local_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    __local int local_a[local_mem_test_size]; // 1024 ints = 4 KB of local memory

    // Cooperative copy: each work-item loads the elements at its local id plus
    // multiples of the work-group size.
    for (int i = get_local_id(0); i < local_mem_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    // Every work-item must finish its share before the chase reads local_a.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Only the first work-item of the group runs the chase; the rest skip it.
    if (get_local_id(0) == 0) {
        int current = local_a[0];
        // Starts at 0: accumulating into an uninitialized variable leaves ret[0] indeterminate.
        int result = 0;
        for (int i = 0; i < count; i += 10) {
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
        }

        ret[0] = result;
    }
}

// Increments the element of A selected by this work-item's global id.
__kernel void dummy_add(__global int* A) {
    A[get_global_id(0)]++;
}
-------------------------------------------------------------------------------- /latencykernel_tex.cl: -------------------------------------------------------------------------------- 1 | // does not
// work on Fermi

// Number of __local slots used to park the chased value; one per work-item.
#define TEX_SINK_SLOTS 128

// Pointer-chasing latency test that reads through an image1d_buffer_t with read_imageui.
//   A     - 1D image buffer; the .x component of each texel is used as the coordinate
//           of the next texel to read
//   count - number of dependent reads requested; the loop advances 10 at a time, so the
//           number of reads actually issued is count rounded up to a multiple of 10
//   ret   - ret[0] receives the .x component of the last value this work-item parked
// NOTE(review): the store to local_a in every iteration presumably only keeps the reads
// from being optimized away - confirm.
__kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret) {
    __local uint4 local_a[TEX_SINK_SLOTS];
    // Wrap the slot index so a work-group larger than TEX_SINK_SLOTS cannot write past
    // the array; for local ids below 128 this is the same slot as before.
    int localId = (int)(get_local_id(0) % TEX_SINK_SLOTS);
    uint4 current = read_imageui(A, 0);
    // Seed the slot so ret[0] is defined even when count <= 0 and the loop never runs.
    local_a[localId] = current;
    for (int i = 0; i < count; i += 10) {
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        local_a[localId] = current;
    }

    ret[0] = local_a[localId].x;
}
-------------------------------------------------------------------------------- /packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /pch.c: -------------------------------------------------------------------------------- 1 | // pch.cpp: source file corresponding to the pre-compiled header 2 | 3 | #include "pch.h" 4 | 5 | // When you are using pre-compiled headers, this source file is necessary for compilation to succeed. 6 | -------------------------------------------------------------------------------- /pch.h: -------------------------------------------------------------------------------- 1 | // pch.h: This is a precompiled header file. 2 | // Files listed below are compiled only once, improving build performance for future builds. 3 | // This also affects IntelliSense performance, including code completion and many code browsing features. 4 | // However, files listed here are ALL re-compiled if any one of them is updated between builds. 5 | // Do not add files here that you will be updating frequently as this negates the performance advantage.
6 | 7 | #ifndef PCH_H 8 | #define PCH_H 9 | 10 | // add headers that you want to pre-compile here 11 | #include "framework.h" 12 | 13 | #endif //PCH_H 14 | --------------------------------------------------------------------------------