├── .gitattributes ├── .gitignore ├── LICENSE ├── Readme.md ├── SIMDPerformance.sln ├── SIMDPerformance ├── Program.cs ├── SIMDFloatPerformance.cs └── SIMDPerformanceBench.csproj └── SIMDPerformanceDebug ├── FloatOps.cs ├── Program.cs └── SIMDPerformanceDebug.csproj /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. 
To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Build results 17 | [Dd]ebug/ 18 | [Dd]ebugPublic/ 19 | [Rr]elease/ 20 | [Rr]eleases/ 21 | x64/ 22 | x86/ 23 | [Aa][Rr][Mm]/ 24 | [Aa][Rr][Mm]64/ 25 | bld/ 26 | [Bb]in/ 27 | [Oo]bj/ 28 | [Ll]og/ 29 | 30 | # Visual Studio 2015/2017 cache/options directory 31 | .vs/ 32 | # Uncomment if you have tasks that create the project's static files in wwwroot 33 | #wwwroot/ 34 | 35 | # Visual Studio 2017 auto generated files 36 | Generated\ Files/ 37 | 38 | # MSTest test Results 39 | [Tt]est[Rr]esult*/ 40 | [Bb]uild[Ll]og.* 41 | 42 | # NUNIT 43 | *.VisualState.xml 44 | TestResult.xml 45 | 46 | # Build Results of an ATL Project 47 | [Dd]ebugPS/ 48 | [Rr]eleasePS/ 49 | dlldata.c 50 | 51 | # Benchmark Results 52 | BenchmarkDotNet.Artifacts/ 53 | 54 | # .NET Core 55 | project.lock.json 56 | project.fragment.lock.json 57 | artifacts/ 58 | 59 | # StyleCop 60 | StyleCopReport.xml 61 | 62 | # Files built by Visual Studio 63 | *_i.c 64 | *_p.c 65 | *_h.h 66 | *.ilk 67 | *.meta 68 | *.obj 69 | *.iobj 70 | *.pch 71 | *.pdb 72 | *.ipdb 73 | *.pgc 74 | *.pgd 75 | *.rsp 76 | *.sbr 77 | *.tlb 78 | *.tli 79 | *.tlh 80 | *.tmp 81 | *.tmp_proj 82 | *_wpftmp.csproj 83 | *.log 84 | *.vspscc 85 | *.vssscc 86 | .builds 87 | *.pidb 88 | *.svclog 89 | *.scc 90 | 91 | # Chutzpah Test files 92 | _Chutzpah* 93 | 94 | # Visual C++ cache files 95 | ipch/ 96 | *.aps 97 | *.ncb 98 | *.opendb 99 | *.opensdf 100 | *.sdf 101 | *.cachefile 102 | *.VC.db 103 | *.VC.VC.opendb 104 | 105 | # Visual Studio profiler 106 | *.psess 107 | *.vsp 108 | *.vspx 109 | *.sap 110 | 111 | # Visual Studio Trace Files 112 | *.e2e 113 | 114 | # TFS 2012 Local Workspace 115 | $tf/ 116 | 117 | # Guidance Automation Toolkit 118 | *.gpState 119 
| 120 | # ReSharper is a .NET coding add-in 121 | _ReSharper*/ 122 | *.[Rr]e[Ss]harper 123 | *.DotSettings.user 124 | 125 | # JustCode is a .NET coding add-in 126 | .JustCode 127 | 128 | # TeamCity is a build add-in 129 | _TeamCity* 130 | 131 | # DotCover is a Code Coverage Tool 132 | *.dotCover 133 | 134 | # AxoCover is a Code Coverage Tool 135 | .axoCover/* 136 | !.axoCover/settings.json 137 | 138 | # Visual Studio code coverage results 139 | *.coverage 140 | *.coveragexml 141 | 142 | # NCrunch 143 | _NCrunch_* 144 | .*crunch*.local.xml 145 | nCrunchTemp_* 146 | 147 | # MightyMoose 148 | *.mm.* 149 | AutoTest.Net/ 150 | 151 | # Web workbench (sass) 152 | .sass-cache/ 153 | 154 | # Installshield output folder 155 | [Ee]xpress/ 156 | 157 | # DocProject is a documentation generator add-in 158 | DocProject/buildhelp/ 159 | DocProject/Help/*.HxT 160 | DocProject/Help/*.HxC 161 | DocProject/Help/*.hhc 162 | DocProject/Help/*.hhk 163 | DocProject/Help/*.hhp 164 | DocProject/Help/Html2 165 | DocProject/Help/html 166 | 167 | # Click-Once directory 168 | publish/ 169 | 170 | # Publish Web Output 171 | *.[Pp]ublish.xml 172 | *.azurePubxml 173 | # Note: Comment the next line if you want to checkin your web deploy settings, 174 | # but database connection strings (with potential passwords) will be unencrypted 175 | *.pubxml 176 | *.publishproj 177 | 178 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 179 | # checkin your Azure Web App publish settings, but sensitive information contained 180 | # in these scripts will be unencrypted 181 | PublishScripts/ 182 | 183 | # NuGet Packages 184 | *.nupkg 185 | # The packages folder can be ignored because of Package Restore 186 | **/[Pp]ackages/* 187 | # except build/, which is used as an MSBuild target. 
188 | !**/[Pp]ackages/build/ 189 | # Uncomment if necessary however generally it will be regenerated when needed 190 | #!**/[Pp]ackages/repositories.config 191 | # NuGet v3's project.json files produces more ignorable files 192 | *.nuget.props 193 | *.nuget.targets 194 | 195 | # Microsoft Azure Build Output 196 | csx/ 197 | *.build.csdef 198 | 199 | # Microsoft Azure Emulator 200 | ecf/ 201 | rcf/ 202 | 203 | # Windows Store app package directories and files 204 | AppPackages/ 205 | BundleArtifacts/ 206 | Package.StoreAssociation.xml 207 | _pkginfo.txt 208 | *.appx 209 | 210 | # Visual Studio cache files 211 | # files ending in .cache can be ignored 212 | *.[Cc]ache 213 | # but keep track of directories ending in .cache 214 | !?*.[Cc]ache/ 215 | 216 | # Others 217 | ClientBin/ 218 | ~$* 219 | *~ 220 | *.dbmdl 221 | *.dbproj.schemaview 222 | *.jfm 223 | *.pfx 224 | *.publishsettings 225 | orleans.codegen.cs 226 | 227 | # Including strong name files can present a security risk 228 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 229 | #*.snk 230 | 231 | # Since there are multiple workflows, uncomment next line to ignore bower_components 232 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 233 | #bower_components/ 234 | 235 | # RIA/Silverlight projects 236 | Generated_Code/ 237 | 238 | # Backup & report files from converting an old project file 239 | # to a newer Visual Studio version. 
Backup files are not needed, 240 | # because we have git ;-) 241 | _UpgradeReport_Files/ 242 | Backup*/ 243 | UpgradeLog*.XML 244 | UpgradeLog*.htm 245 | ServiceFabricBackup/ 246 | *.rptproj.bak 247 | 248 | # SQL Server files 249 | *.mdf 250 | *.ldf 251 | *.ndf 252 | 253 | # Business Intelligence projects 254 | *.rdl.data 255 | *.bim.layout 256 | *.bim_*.settings 257 | *.rptproj.rsuser 258 | *- Backup*.rdl 259 | 260 | # Microsoft Fakes 261 | FakesAssemblies/ 262 | 263 | # GhostDoc plugin setting file 264 | *.GhostDoc.xml 265 | 266 | # Node.js Tools for Visual Studio 267 | .ntvs_analysis.dat 268 | node_modules/ 269 | 270 | # Visual Studio 6 build log 271 | *.plg 272 | 273 | # Visual Studio 6 workspace options file 274 | *.opt 275 | 276 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 277 | *.vbw 278 | 279 | # Visual Studio LightSwitch build output 280 | **/*.HTMLClient/GeneratedArtifacts 281 | **/*.DesktopClient/GeneratedArtifacts 282 | **/*.DesktopClient/ModelManifest.xml 283 | **/*.Server/GeneratedArtifacts 284 | **/*.Server/ModelManifest.xml 285 | _Pvt_Extensions 286 | 287 | # Paket dependency manager 288 | .paket/paket.exe 289 | paket-files/ 290 | 291 | # FAKE - F# Make 292 | .fake/ 293 | 294 | # JetBrains Rider 295 | .idea/ 296 | *.sln.iml 297 | 298 | # CodeRush personal settings 299 | .cr/personal 300 | 301 | # Python Tools for Visual Studio (PTVS) 302 | __pycache__/ 303 | *.pyc 304 | 305 | # Cake - Uncomment if you are using it 306 | # tools/** 307 | # !tools/packages.config 308 | 309 | # Tabs Studio 310 | *.tss 311 | 312 | # Telerik's JustMock configuration file 313 | *.jmconfig 314 | 315 | # BizTalk build output 316 | *.btp.cs 317 | *.btm.cs 318 | *.odx.cs 319 | *.xsd.cs 320 | 321 | # OpenCover UI analysis results 322 | OpenCover/ 323 | 324 | # Azure Stream Analytics local run output 325 | ASALocalRun/ 326 | 327 | # MSBuild Binary and Structured Log 328 | *.binlog 329 | 330 | # NVidia Nsight GPU debugger configuration 
file 331 | *.nvuser 332 | 333 | # MFractors (Xamarin productivity tool) working folder 334 | .mfractor/ 335 | 336 | # Local History for Visual Studio 337 | .localhistory/ 338 | 339 | # BeatPulse healthcheck temp database 340 | healthchecksdb -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 C. B. Gonzalez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | ## C# High performance SIMD operations with MemoryMarshal.Cast 2 | 3 | Some example benchmarks to demonstrate how to use vectorization with SIMD in .Net. 
4 | 5 | #### What you need #### 6 | These projects are developed against **.NET Core 2.2** but should work also in 2.1. 7 | 8 | In order to run the benchmarks you´ll need the excellent [BenchmarkDotNet](https://www.nuget.org/packages/BenchmarkDotNet/). 9 | 10 | For **.Net framework** you need to add the [System.Numerics.Vectors](https://www.nuget.org/packages/System.Numerics.Vectors/) package and [System.Memory](https://www.nuget.org/packages/System.Memory/) in order to be able to use `Span` or `Memory`. 11 | 12 | For a basic **introduction to SIMD**, have a look at [this](https://github.com/CBGonzalez/SIMDIntro) project. 13 | 14 | #### Introduction #### 15 | 16 | SIMD (Single Instruction, Multiple Data) will be used typically to process large amounts of numeric data, where every data element needs to receive the same treatment. 17 | 18 | Notice that SIMD happens at the processor core level, so additional speedup can be obtained using more than one thread. 19 | 20 | On a relatively modern CPU with AVX2 capabilities, the cores are able to process vectors containing 256 bits: you can operate with **8** `float` values (8 * 32 bits = 256 bits) in one go, or **4** `double` numbers. 21 | 22 | #### The data #### 23 | 24 | In order to be able to create Vectors, your data needs to be available in memory, as arrays or spans. 
25 | 26 | ##### Arrays ##### 27 | 28 | Imagining that your data is available in two `float[]` arrays `left` and `right` and the result will be stored in `results`, a naïve approach to summing pairs of values would be: 29 | 30 | ``` 31 | public void SimpleSumArray() 32 | { 33 | for (int i = 0; i < left.Length; i++) 34 | { 35 | results[i] = left[i] + right[i]; 36 | } 37 | 38 | } 39 | ``` 40 | 41 | It's not really fair to measure improvement based on a worst-case scenario, so an improvement on scalar performance could be to use `Span<T>` in place of naked arrays: 42 | 43 | ``` 44 | public void SimpleSumSpan() 45 | { 46 | ReadOnlySpan<float> leftSpan = leftMemory.Span; 47 | ReadOnlySpan<float> rightSpan = rightMemory.Span; 48 | Span<float> resultsSpan = resultsMemory.Span; 49 | for (int i = 0; i < leftSpan.Length; i++) 50 | { 51 | resultsSpan[i] = leftSpan[i] + rightSpan[i]; 52 | } 53 | } 54 | ``` 55 | If unsafe code is used, we can do: 56 | ``` 57 | public unsafe void SimpleSumSpanUnsafe() 58 | { 59 | ReadOnlySpan<float> leftSpan = leftMemory.Span; 60 | ReadOnlySpan<float> rightSpan = rightMemory.Span; 61 | Span<float> resultsSpan = resultsMemory.Span; 62 | fixed (float* leftBasePtr = &leftSpan[0]) 63 | fixed (float* rightBasePtr = &rightSpan[0]) 64 | fixed (float* resultBasePtr = &resultsSpan[0]) 65 | { 66 | float* leftCurrPtr = leftBasePtr; 67 | float* rightCurrPtr = rightBasePtr; 68 | float* resultCurrPtr = resultBasePtr; 69 | for (int i = 0; i < leftSpan.Length; i++) 70 | { 71 | *resultCurrPtr = *leftCurrPtr + *rightCurrPtr; 72 | rightCurrPtr++; 73 | leftCurrPtr++; 74 | resultCurrPtr++; 75 | } 76 | } 77 | } 78 | ``` 79 | 80 | The results on my system: 81 | 82 | ``` 83 | | Method | Mean | Error | StdDev | Ratio | 84 | |-------------------- |----------:|----------:|----------:|------:| 85 | | SimpleSumArray | 225.02 us | 2.2411 us | 1.8714 us | 1.00 | 86 | | SimpleSumSpan | 110.40 us | 0.7707 us | 0.6017 us | 0.49 | 87 | | SimpleSumSpanUnsafe | 74.93 us | 1.4354 us | 1.2724 us | 0.33 | 88 | ``` 89 | 90 | 91 | 
Using `Span` instead of an array gives a nice 50% improvement all by itself; unsafe code runs in a third of the time. The unsafe speedup is likely to disappear for more complex operations, where the access time gain through pointers fades away in the face of longer operation time. 92 | 93 | ##### Vectors ##### 94 | 95 | A naïve vectorization using arrays could be: 96 | ``` 97 | public void SimpleSumVectors() 98 | { 99 | int ceiling = left.Length / floatSlots * floatSlots; 100 | for (int i = 0; i < ceiling; i += floatSlots) 101 | { 102 | Vector<float> v1 = new Vector<float>(left, i); 103 | Vector<float> v2 = new Vector<float>(right, i); 104 | (v1 + v2).CopyTo(results, i); 105 | } 106 | for (int i = ceiling; i < left.Length; i++) 107 | { 108 | results[i] = left[i] + right[i]; 109 | } 110 | } 111 | ``` 112 | And the result: 113 | ``` 114 | | Method | Mean | Error | StdDev | Median | Ratio | RatioSD | 115 | |-------------------- |----------:|---------:|---------:|----------:|------:|--------:| 116 | | SimpleSumArray | 229.84 us | 6.690 us | 5.930 us | 227.15 us | 2.02 | 0.09 | 117 | | SimpleSumSpan | 113.51 us | 2.557 us | 3.828 us | 112.82 us | 1.00 | 0.00 | 118 | | SimpleSumSpanUnsafe | 75.58 us | 1.256 us | 1.114 us | 75.61 us | 0.67 | 0.02 | 119 | | SimpleSumVectors | 51.89 us | 1.021 us | 1.868 us | 50.99 us | 0.46 | 0.02 | 120 | ``` 121 | Already a 50% improvement over `Span` and faster than `unsafe`! 122 | 123 | Simply using `Span` in order to improve the creation of vectors doesn't help; it actually increases the run time (not shown). 124 | 125 | Looking at the `SimpleSumVectors()` code above, two things jump out: we need to repeatedly create vectors and we need to copy the resulting values back to the result array, and all that inside the inner loop. 126 | 127 | Let's try to avoid that using `System.Runtime.InteropServices.MemoryMarshal.Cast`. This function will allow us to map data from one type to another, without actually copying bytes around. 
128 | 129 | ``` 130 | public void SimpleSumVectorsNoCopy() 131 | { 132 | int numVectors = left.Length / floatSlots; 133 | int ceiling = numVectors * floatSlots; 134 | 135 | ReadOnlySpan<Vector<float>> leftVecArray = MemoryMarshal.Cast<float, Vector<float>>(leftMemory.Span); 136 | ReadOnlySpan<Vector<float>> rightVecArray = MemoryMarshal.Cast<float, Vector<float>>(rightMemory.Span); 137 | Span<Vector<float>> resultsVecArray = MemoryMarshal.Cast<float, Vector<float>>(resultsMemory.Span); 138 | 139 | for (int i = 0; i < numVectors; i++) 140 | { 141 | resultsVecArray[i] = leftVecArray[i] + rightVecArray[i]; 142 | } 143 | // Finish operation with any numbers leftover 144 | for (int i = ceiling; i < left.Length; i++) 145 | { 146 | results[i] = left[i] + right[i]; 147 | } 148 | } 149 | ``` 150 | The magic happens by using `MemoryMarshal.Cast<float, Vector<float>>(leftMemory.Span)`: the `float` array in `leftMemory.Span` is reinterpreted as an array of `Vector<float>`. 151 | 152 | Remember that `leftMemory.Span` actually points to the array we used to create it. In the same way, `leftVecArray` points to the same data. So if data changes in `resultsVecArray`, the array `results` actually gets changed. 153 | 154 | The result of that magic: 155 | ``` 156 | | Method | Mean | Error | StdDev | Median | Ratio | RatioSD | 157 | |----------------------- |----------:|----------:|----------:|----------:|------:|--------:| 158 | | SimpleSumArray | 239.42 us | 4.7123 us | 9.3016 us | 235.43 us | 2.00 | 0.12 | 159 | | SimpleSumSpan | 119.90 us | 7.9034 us | 8.1162 us | 115.94 us | 1.00 | 0.00 | 160 | | SimpleSumSpanUnsafe | 79.21 us | 1.5772 us | 1.9370 us | 78.95 us | 0.66 | 0.04 | 161 | | SimpleSumVectors | 53.93 us | 1.0742 us | 2.3123 us | 53.03 us | 0.46 | 0.04 | 162 | | SimpleSumVectorsNoCopy | 44.07 us | 0.4126 us | 0.3445 us | 43.98 us | 0.37 | 0.03 | 163 | ``` 164 | We achieve a bit less than a 50% improvement over unsafe operations and a respectable improvement over `Span` operations. 
165 | 166 | ##### More complex calculations ##### 167 | 168 | If we replace the simple sum 169 | 170 | ``` 171 | results[i] = left[i] + right[i]; 172 | ``` 173 | with 174 | ``` 175 | results[i] = (float)Math.Sqrt((left[i] * right[i] + floatPi) / floatPi); 176 | ``` 177 | we see a more substantial gain for vectorization (see the project for code): 178 | ``` 179 | | Method | Mean | Error | StdDev | Ratio | RatioSD | 180 | |--------------------- |---------:|-----------:|-----------:|------:|--------:| 181 | | ComplexOpsSpan | 754.9 us | 11.2312 us | 10.5057 us | 1.00 | 0.00 | 182 | | ComplexOpsSpanUnsafe | 753.8 us | 9.4597 us | 8.8486 us | 1.00 | 0.02 | 183 | | ComplexVectorsNoCopy | 133.2 us | 0.4963 us | 0.4400 us | 0.18 | 0.00 | 184 | ``` 185 | We have a very respectable > 5x improvement in performance (and, as expected, the advantage of doing unsafe operations disappears). 186 | 187 | #### Conclusion #### 188 | 189 | If you are going to vectorize your calculations, **benchmarking is a must** to make sure you're actually improving performance. 190 | 191 | A case in point is integer types: only addition, subtraction and bitwise operations are supported. 192 | 193 | An example using division and `Vector.SquareRoot` gives the following results: 194 | ``` 195 | | Method | Mean | Error | StdDev | 196 | |--------------------------- |-----------:|-----------:|-----------:| 197 | | ComplexOpsIntSpan | 364.7 us | 5.4188 us | 4.8036 us | 198 | | ComplexOpsVectorsNoCopyInt | 1,020.7 us | 35.6401 us | 33.3378 us | 199 | ``` 200 | The vectorized routine **increases** execution time 2.8x since the vectorized code is compiled to use software implementations instead of hardware operations. 
201 | -------------------------------------------------------------------------------- /SIMDPerformance.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.28803.352 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SIMDPerformanceBench", "SIMDPerformance\SIMDPerformanceBench.csproj", "{3BAF9899-3BB5-433A-987C-F45519BCE4B9}" 7 | EndProject 8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SIMDPerformanceDebug", "SIMDPerformanceDebug\SIMDPerformanceDebug.csproj", "{32B90246-60B7-4A5F-A528-EA86CD7B5302}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Any CPU = Debug|Any CPU 13 | Release|Any CPU = Release|Any CPU 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {3BAF9899-3BB5-433A-987C-F45519BCE4B9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {3BAF9899-3BB5-433A-987C-F45519BCE4B9}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {3BAF9899-3BB5-433A-987C-F45519BCE4B9}.Release|Any CPU.ActiveCfg = Release|Any CPU 19 | {3BAF9899-3BB5-433A-987C-F45519BCE4B9}.Release|Any CPU.Build.0 = Release|Any CPU 20 | {32B90246-60B7-4A5F-A528-EA86CD7B5302}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 21 | {32B90246-60B7-4A5F-A528-EA86CD7B5302}.Debug|Any CPU.Build.0 = Debug|Any CPU 22 | {32B90246-60B7-4A5F-A528-EA86CD7B5302}.Release|Any CPU.ActiveCfg = Release|Any CPU 23 | {32B90246-60B7-4A5F-A528-EA86CD7B5302}.Release|Any CPU.Build.0 = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {63636A77-C6D1-4ADB-82E0-A41C3A321AE3} 30 | EndGlobalSection 31 | EndGlobal 32 | 
--------------------------------------------------------------------------------
/SIMDPerformance/Program.cs:
--------------------------------------------------------------------------------
using BenchmarkDotNet.Running;

namespace SIMDPerformanceBench
{
    /// <summary>
    /// Console entry point that hands control to BenchmarkDotNet.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Runs every method of <see cref="SIMDFloatPerformance"/> that is marked with
        /// the <c>[Benchmark]</c> attribute.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main(string[] args)
        {
            BenchmarkRunner.Run<SIMDFloatPerformance>();
        }
    }
}
--------------------------------------------------------------------------------
/SIMDPerformance/SIMDFloatPerformance.cs:
--------------------------------------------------------------------------------
using BenchmarkDotNet.Attributes;
using System;
using System.Runtime.InteropServices;
using System.Numerics;

namespace SIMDPerformanceBench
{
    /// <summary>
    /// Benchmarks comparing scalar loops (array, span, raw pointers) with
    /// <see cref="Vector{T}"/> based loops over the same input data.
    /// </summary>
    /// <remarks>
    /// Methods whose <c>[Benchmark]</c> attribute is commented out are kept so they can be
    /// re-enabled; they are simply excluded from the current run.
    /// </remarks>
    [DisassemblyDiagnoser(printAsm: true, printSource: true)]
    public class SIMDFloatPerformance
    {
        public static float[] left, right, results;
        public static int[] leftInt, rightInt, resultsInt;
        public static ReadOnlyMemory<float> leftMemory, rightMemory;
        public static ReadOnlyMemory<int> leftMemoryInt, rightMemoryInt;
        UnsafeMemoryFloat leftUnsafe, rightUnsafe, resultsUnsafe;
        public static Memory<float> resultsMemory;
        public static Memory<int> resultsMemoryInt;

        // Deliberately not a multiple of the vector width so the scalar tail loops run.
        public const int ITEMS = 100003;
        public static float floatPi;
        public static int floatSlots, intSlots;

        // Constant used as addend and divisor by the integer benchmarks.
        private const int IntFactor = -43;

        /// <summary>
        /// Allocates and fills all managed and pinned input buffers.
        /// </summary>
        [GlobalSetup]
        public void GlobalSetup()
        {
            // Release pinned buffers from a previous setup, if any, before replacing them.
            GlobalCleanup();

            floatSlots = Vector<float>.Count;
            intSlots = Vector<int>.Count;
            floatPi = (float)Math.PI;

            left = new float[ITEMS];
            leftMemory = new ReadOnlyMemory<float>(left);
            leftInt = new int[ITEMS];
            leftMemoryInt = new ReadOnlyMemory<int>(leftInt);
            right = new float[ITEMS];
            rightMemory = new ReadOnlyMemory<float>(right);
            rightInt = new int[ITEMS];
            rightMemoryInt = new ReadOnlyMemory<int>(rightInt);
            results = new float[ITEMS];
            resultsInt = new int[ITEMS];
            resultsMemory = new Memory<float>(results);
            resultsMemoryInt = new Memory<int>(resultsInt);

            // The second argument is an alignment in BYTES, so convert the element count.
            int vectorBytes = Vector<float>.Count * sizeof(float);
            leftUnsafe = new UnsafeMemoryFloat(ITEMS, vectorBytes, 0);
            rightUnsafe = new UnsafeMemoryFloat(ITEMS, vectorBytes, 0);
            resultsUnsafe = new UnsafeMemoryFloat(ITEMS, vectorBytes, 0);

            for (int i = 0; i < ITEMS; i++)
            {
                left[i] = i;
                right[i] = i + floatPi;
                leftInt[i] = i;
                rightInt[i] = i / 2;
                leftUnsafe[i] = i;
                rightUnsafe[i] = i + floatPi;
            }
        }

        /// <summary>
        /// Releases the pinned buffers created by <see cref="GlobalSetup"/>.
        /// </summary>
        [GlobalCleanup]
        public void GlobalCleanup()
        {
            leftUnsafe?.Dispose();
            leftUnsafe = null;
            rightUnsafe?.Dispose();
            rightUnsafe = null;
            resultsUnsafe?.Dispose();
            resultsUnsafe = null;
        }

        /// <summary>Element-wise sum using plain array indexing.</summary>
        //[Benchmark]
        public void SimpleSumArray()
        {
            for (int i = 0; i < left.Length; i++)
            {
                results[i] = left[i] + right[i];
            }
        }

        /// <summary>Element-wise sum using spans obtained from the Memory fields.</summary>
        //[Benchmark(Baseline = true)]
        public void SimpleSumSpan()
        {
            ReadOnlySpan<float> leftSpan = leftMemory.Span;
            ReadOnlySpan<float> rightSpan = rightMemory.Span;
            Span<float> resultsSpan = resultsMemory.Span;
            for (int i = 0; i < leftSpan.Length; i++)
            {
                resultsSpan[i] = leftSpan[i] + rightSpan[i];
            }
        }

        /// <summary>Element-wise sum using pinned raw pointers.</summary>
        //[Benchmark]
        public unsafe void SimpleSumSpanUnsafe()
        {
            ReadOnlySpan<float> leftSpan = leftMemory.Span;
            ReadOnlySpan<float> rightSpan = rightMemory.Span;
            Span<float> resultsSpan = resultsMemory.Span;
            fixed (float* leftBasePtr = &leftSpan[0])
            fixed (float* rightBasePtr = &rightSpan[0])
            fixed (float* resultBasePtr = &resultsSpan[0])
            {
                float* leftCurrPtr = leftBasePtr;
                float* rightCurrPtr = rightBasePtr;
                float* resultCurrPtr = resultBasePtr;
                for (int i = 0; i < leftSpan.Length; i++)
                {
                    *resultCurrPtr = *leftCurrPtr + *rightCurrPtr;
                    rightCurrPtr++;
                    leftCurrPtr++;
                    resultCurrPtr++;
                }
            }
        }

        /// <summary>
        /// Vectorized sum that builds a new vector from the arrays for every chunk and
        /// copies each result back with <c>CopyTo</c>.
        /// </summary>
        //[Benchmark]
        public void SimpleSumVectors()
        {
            int ceiling = left.Length / floatSlots * floatSlots;

            for (int i = 0; i < ceiling; i += floatSlots)
            {
                Vector<float> v1 = new Vector<float>(left, i);
                Vector<float> v2 = new Vector<float>(right, i);
                (v1 + v2).CopyTo(results, i);
            }
            // Finish operation with any numbers leftover
            for (int i = ceiling; i < left.Length; i++)
            {
                results[i] = left[i] + right[i];
            }
        }

        /// <summary>
        /// Vectorized sum that builds each vector from a span slice.
        /// </summary>
        //[Benchmark]
        public void SimpleSumVectorsSpan()
        {
            int ceiling = left.Length / floatSlots * floatSlots;
            Span<float> leftSpan = new Span<float>(left);
            Span<float> rightSpan = new Span<float>(right);
            for (int i = 0; i < ceiling; i += floatSlots)
            {
                // Slice at the CURRENT offset before building the vectors; the earlier
                // version advanced the slices afterwards, so every chunk used the
                // operands of the previous chunk.
                Vector<float> v1 = new Vector<float>(leftSpan.Slice(i, floatSlots));
                Vector<float> v2 = new Vector<float>(rightSpan.Slice(i, floatSlots));
                (v1 + v2).CopyTo(results, i);
            }
            // Finish operation with any numbers leftover
            for (int i = ceiling; i < left.Length; i++)
            {
                results[i] = left[i] + right[i];
            }
        }

        /// <summary>
        /// Vectorized sum that reinterprets the float buffers as spans of vectors via
        /// <see cref="MemoryMarshal.Cast{TFrom, TTo}(Span{TFrom})"/>.
        /// </summary>
        [Benchmark]
        public void SimpleSumVectorsNoCopy()
        {
            int numVectors = left.Length / floatSlots;
            int ceiling = numVectors * floatSlots;
            ReadOnlySpan<Vector<float>> leftVecArray = MemoryMarshal.Cast<float, Vector<float>>(leftMemory.Span);
            ReadOnlySpan<Vector<float>> rightVecArray = MemoryMarshal.Cast<float, Vector<float>>(rightMemory.Span);
            Span<Vector<float>> resultsVecArray = MemoryMarshal.Cast<float, Vector<float>>(resultsMemory.Span);
            for (int i = 0; i < numVectors; i++)
            {
                resultsVecArray[i] = leftVecArray[i] + rightVecArray[i];
            }
            // Finish operation with any numbers leftover
            for (int i = ceiling; i < left.Length; i++)
            {
                results[i] = left[i] + right[i];
            }
        }

        /// <summary>
        /// Same as <see cref="SimpleSumVectorsNoCopy"/> but operating on the pinned
        /// <see cref="UnsafeMemoryFloat"/> buffers.
        /// </summary>
        [Benchmark]
        public unsafe void SimpleSumVectorsUnsafe()
        {
            int numVectors = leftUnsafe.Length / floatSlots;
            int ceiling = numVectors * floatSlots;
            ReadOnlySpan<float> leftUnsafeSpan = new ReadOnlySpan<float>(leftUnsafe.BufferIntPtr.ToPointer(), leftUnsafe.Length);
            ReadOnlySpan<float> rightUnsafeSpan = new ReadOnlySpan<float>(rightUnsafe.BufferIntPtr.ToPointer(), rightUnsafe.Length);
            Span<float> resultsUnsafeSpan = new Span<float>(resultsUnsafe.BufferIntPtr.ToPointer(), resultsUnsafe.Length);
            ReadOnlySpan<Vector<float>> leftVecArray = MemoryMarshal.Cast<float, Vector<float>>(leftUnsafeSpan);
            ReadOnlySpan<Vector<float>> rightVecArray = MemoryMarshal.Cast<float, Vector<float>>(rightUnsafeSpan);
            Span<Vector<float>> resultsVecArray = MemoryMarshal.Cast<float, Vector<float>>(resultsUnsafeSpan);

            for (int i = 0; i < numVectors; i++)
            {
                resultsVecArray[i] = leftVecArray[i] + rightVecArray[i];
            }
            // Tail stays in the unsafe buffers so the result buffer is complete.
            for (int i = ceiling; i < resultsUnsafeSpan.Length; i++)
            {
                resultsUnsafeSpan[i] = leftUnsafeSpan[i] + rightUnsafeSpan[i];
            }
        }

        /// <summary>
        /// Scalar baseline for <c>sqrt((left * right + pi) / pi)</c> using spans.
        /// </summary>
        [Benchmark(Baseline = true)]
        public void ComplexOpsSpan()
        {
            ReadOnlySpan<float> leftSpan = leftMemory.Span;
            ReadOnlySpan<float> rightSpan = rightMemory.Span;
            Span<float> resultsSpan = resultsMemory.Span;
            for (int i = 0; i < leftSpan.Length; i++)
            {
                resultsSpan[i] = (float)Math.Sqrt((leftSpan[i] * rightSpan[i] + floatPi) / floatPi);
            }
        }

        /// <summary>
        /// Scalar <c>sqrt((left * right + pi) / pi)</c> using pinned raw pointers.
        /// </summary>
        [Benchmark]
        public unsafe void ComplexOpsSpanUnsafe()
        {
            ReadOnlySpan<float> leftSpan = leftMemory.Span;
            ReadOnlySpan<float> rightSpan = rightMemory.Span;
            Span<float> resultsSpan = resultsMemory.Span;
            fixed (float* leftBasePtr = &leftSpan[0])
            fixed (float* rightBasePtr = &rightSpan[0])
            fixed (float* piPtr = &floatPi)
            fixed (float* resultBasePtr = &resultsSpan[0])
            {
                float* leftCurrPtr = leftBasePtr;
                float* rightCurrPtr = rightBasePtr;
                float* resultCurrPtr = resultBasePtr;
                for (int i = 0; i < leftSpan.Length; i++)
                {
                    *resultCurrPtr = (float)Math.Sqrt((*leftCurrPtr * *rightCurrPtr + *piPtr) / *piPtr);
                    rightCurrPtr++;
                    leftCurrPtr++;
                    resultCurrPtr++;
                }
            }
        }

        /// <summary>
        /// Vectorized <c>sqrt((left * right + pi) / pi)</c> over spans of vectors.
        /// </summary>
        [Benchmark]
        public void ComplexOpsVectorsNoCopy()
        {
            int numVectors = left.Length / floatSlots;
            int ceiling = numVectors * floatSlots;
            Vector<float> piVector = new Vector<float>(floatPi);
            ReadOnlySpan<Vector<float>> leftVecArray = MemoryMarshal.Cast<float, Vector<float>>(leftMemory.Span);
            ReadOnlySpan<Vector<float>> rightVecArray = MemoryMarshal.Cast<float, Vector<float>>(rightMemory.Span);
            Span<Vector<float>> resultsVecArray = MemoryMarshal.Cast<float, Vector<float>>(resultsMemory.Span);
            for (int i = 0; i < numVectors; i++)
            {
                resultsVecArray[i] = Vector.SquareRoot((leftVecArray[i] * rightVecArray[i] + piVector) / piVector);
            }
            // Finish operation with any numbers leftover, using the same formula as the
            // vector loop (the tail previously stored a plain sum).
            for (int i = ceiling; i < left.Length; i++)
            {
                results[i] = (float)Math.Sqrt((left[i] * right[i] + floatPi) / floatPi);
            }
        }

        /// <summary>
        /// Same as <see cref="ComplexOpsVectorsNoCopy"/> but operating on the pinned
        /// <see cref="UnsafeMemoryFloat"/> buffers.
        /// </summary>
        [Benchmark]
        public unsafe void ComplexOpsVectorsUnsafe()
        {
            int numVectors = leftUnsafe.Length / floatSlots;
            int ceiling = numVectors * floatSlots;
            Vector<float> piVector = new Vector<float>(floatPi);
            ReadOnlySpan<float> leftUnsafeSpan = new ReadOnlySpan<float>(leftUnsafe.BufferIntPtr.ToPointer(), leftUnsafe.Length);
            ReadOnlySpan<float> rightUnsafeSpan = new ReadOnlySpan<float>(rightUnsafe.BufferIntPtr.ToPointer(), rightUnsafe.Length);
            Span<float> resultsUnsafeSpan = new Span<float>(resultsUnsafe.BufferIntPtr.ToPointer(), resultsUnsafe.Length);
            ReadOnlySpan<Vector<float>> leftVecArray = MemoryMarshal.Cast<float, Vector<float>>(leftUnsafeSpan);
            ReadOnlySpan<Vector<float>> rightVecArray = MemoryMarshal.Cast<float, Vector<float>>(rightUnsafeSpan);
            Span<Vector<float>> resultsVecArray = MemoryMarshal.Cast<float, Vector<float>>(resultsUnsafeSpan);

            for (int i = 0; i < numVectors; i++)
            {
                resultsVecArray[i] = Vector.SquareRoot((leftVecArray[i] * rightVecArray[i] + piVector) / piVector);
            }
            // Tail uses the same formula and the same (unsafe) buffers as the vector loop.
            for (int i = ceiling; i < resultsUnsafeSpan.Length; i++)
            {
                resultsUnsafeSpan[i] = (float)Math.Sqrt((leftUnsafeSpan[i] * rightUnsafeSpan[i] + floatPi) / floatPi);
            }
        }

        /// <summary>
        /// Scalar integer variant: <c>sqrt((left * right + factor) / factor)</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): with a factor of -43 the operand of the square root is negative for
        /// most inputs, and the product overflows <c>int</c> for large indices. The stored
        /// values therefore look meaningless and only the timing seems relevant - confirm
        /// this is intended.
        /// </remarks>
        [Benchmark]
        public void ComplexOpsIntSpan()
        {
            ReadOnlySpan<int> leftSpan = leftMemoryInt.Span;
            ReadOnlySpan<int> rightSpan = rightMemoryInt.Span;
            Span<int> resultsSpan = resultsMemoryInt.Span;
            for (int i = 0; i < leftSpan.Length; i++)
            {
                resultsSpan[i] = (int)Math.Sqrt((leftSpan[i] * rightSpan[i] + IntFactor) / IntFactor);
            }
        }

        /// <summary>
        /// Vectorized integer variant of <see cref="ComplexOpsIntSpan"/>.
        /// </summary>
        [Benchmark]
        public void ComplexOpsVectorsNoCopyInt()
        {
            // Sizes come from the int arrays; the earlier version used the float array.
            int numVectors = leftInt.Length / intSlots;
            int ceiling = numVectors * intSlots;
            Vector<int> constVector = new Vector<int>(IntFactor);
            ReadOnlySpan<Vector<int>> leftVecArray = MemoryMarshal.Cast<int, Vector<int>>(leftMemoryInt.Span);
            ReadOnlySpan<Vector<int>> rightVecArray = MemoryMarshal.Cast<int, Vector<int>>(rightMemoryInt.Span);
            Span<Vector<int>> resultsVecArray = MemoryMarshal.Cast<int, Vector<int>>(resultsMemoryInt.Span);
            for (int i = 0; i < numVectors; i++)
            {
                resultsVecArray[i] = Vector.SquareRoot((leftVecArray[i] * rightVecArray[i] + constVector) / constVector);
            }
            // Finish operation with any numbers leftover: same formula, written to the int
            // result array (the tail previously stored float sums into `results`).
            for (int i = ceiling; i < leftInt.Length; i++)
            {
                resultsInt[i] = (int)Math.Sqrt((leftInt[i] * rightInt[i] + IntFactor) / IntFactor);
            }
        }
    }

    /// <summary>
    /// A pinned managed byte buffer exposed as a block of <c>float</c> values whose start
    /// address is rounded up to a requested byte alignment.
    /// </summary>
    public unsafe class UnsafeMemoryFloat : IDisposable
    {
        private byte[] byteBuffer;
        private GCHandle bufferGCHandle;
        private readonly IntPtr bufferIntPtr;
        private readonly int length;
        private bool disposedValue = false;

        /// <summary>Number of float elements available at <see cref="BufferIntPtr"/>.</summary>
        public int Length => length;

        /// <summary>Address of the first element. Invalid after <see cref="Dispose()"/>.</summary>
        public IntPtr BufferIntPtr => bufferIntPtr;

        /// <summary>Reads or writes the element at <paramref name="index"/>.</summary>
        /// <exception cref="ObjectDisposedException">The buffer has been disposed.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="index"/> is negative or not less than <see cref="Length"/>.
        /// </exception>
        public float this[int index]
        {
            get { return *ElementPointer(index); }
            set { *ElementPointer(index) = value; }
        }

        /// <summary>
        /// Allocates and pins a buffer holding <paramref name="len"/> floats.
        /// </summary>
        /// <param name="len">Number of float elements.</param>
        /// <param name="byteAlignment">Required alignment of the start address, in bytes.</param>
        /// <param name="offset">Bytes added after aligning, e.g. to force misalignment.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="len"/> or <paramref name="offset"/> is negative, or
        /// <paramref name="byteAlignment"/> is not positive.
        /// </exception>
        public UnsafeMemoryFloat(int len, int byteAlignment, int offset)
        {
            if (len < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(len));
            }
            if (byteAlignment <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteAlignment));
            }
            if (offset < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(offset));
            }

            length = len;
            // Reserve room for the worst-case alignment padding AND the extra offset so
            // the last element can never fall outside the pinned array.
            byteBuffer = new byte[checked(length * sizeof(float) + byteAlignment + offset)];
            bufferGCHandle = GCHandle.Alloc(byteBuffer, GCHandleType.Pinned);
            long baseAddress = bufferGCHandle.AddrOfPinnedObject().ToInt64();
            // The outer modulo keeps an already aligned address where it is.
            long padding = (byteAlignment - baseAddress % byteAlignment) % byteAlignment;
            bufferIntPtr = new IntPtr(baseAddress + padding + offset);
        }

        /// <summary>Validates state and index, then returns the element address.</summary>
        private float* ElementPointer(int index)
        {
            if (disposedValue)
            {
                throw new ObjectDisposedException(nameof(UnsafeMemoryFloat));
            }
            // The unsigned comparison also rejects negative indices.
            if ((uint)index >= (uint)length)
            {
                throw new ArgumentOutOfRangeException(nameof(index));
            }
            return (float*)bufferIntPtr.ToPointer() + index;
        }

        #region IDisposable Support
        /// <summary>Unpins the buffer. Safe to call more than once.</summary>
        /// <param name="disposing">
        /// True when called from <see cref="Dispose()"/>, false when called from the finalizer.
        /// </param>
        protected virtual void Dispose(bool disposing)
        {
            if (disposedValue)
            {
                return;
            }
            // The handle must be released on the finalizer path too; otherwise the
            // array stays pinned for the lifetime of the process.
            if (bufferGCHandle.IsAllocated)
            {
                bufferGCHandle.Free();
            }
            byteBuffer = null;
            disposedValue = true;
        }

        /// <summary>Finalizer fallback for instances that were never disposed.</summary>
        ~UnsafeMemoryFloat()
        {
            Dispose(false);
        }

        /// <inheritdoc/>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }
        #endregion
    }
}
--------------------------------------------------------------------------------
/SIMDPerformance/SIMDPerformanceBench.csproj:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | Exe 5 | netcoreapp2.2 6 | 7 | 8 | 9 | pdbonly 10 | true 11 | true 12 | 13 | 14 | 15 | true 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
--------------------------------------------------------------------------------
/SIMDPerformanceDebug/FloatOps.cs:
--------------------------------------------------------------------------------
using System;
using System.Numerics;
using System.Runtime.InteropServices;

namespace SIMDPerformanceDebug
{
    /// <summary>
    /// Debug versions of the float sum operations. Each method writes its output to
    /// <see cref="results"/> (or <see cref="resultsUnsafe"/>) so the caller can compare it
    /// against a reference result.
    /// </summary>
    public static class FloatOps
    {
        internal static float[] left, right, results, resultsReference;
        internal static ReadOnlyMemory<float> leftMemory, rightMemory;
        internal static UnsafeMemoryFloat leftUnsafe, rightUnsafe, resultsUnsafe;
        internal static Memory<float> resultsMemory;
        // Not a multiple of common vector widths, so the leftover loops get exercised.
        internal const int ITEMS = 100003;
        internal static float floatPi;
        internal static int floatSlots;

        static FloatOps()
        {
            floatSlots = Vector<float>.Count;
            floatPi = (float)Math.PI;
            left = new float[ITEMS];
            leftMemory = new ReadOnlyMemory<float>(left);
            right = new float[ITEMS];
            rightMemory = new ReadOnlyMemory<float>(right);
            results = new float[ITEMS];
            resultsMemory = new Memory<float>(results);

            // The alignment parameter is in BYTES: align to the full vector width, not to
            // the number of float lanes.
            int vectorBytes = Vector<float>.Count * sizeof(float);
            leftUnsafe = new UnsafeMemoryFloat(ITEMS, vectorBytes, 0);
            rightUnsafe = new UnsafeMemoryFloat(ITEMS, vectorBytes, 0);
            resultsUnsafe = new UnsafeMemoryFloat(ITEMS, vectorBytes, 0);

            for (int i = 0; i < ITEMS; i++)
            {
                left[i] = i;
                right[i] = i + floatPi;
                leftUnsafe[i] = i;
                rightUnsafe[i] = i + floatPi;
            }
            resultsReference = new float[ITEMS];
        }

        /// <summary>Scalar sum over the managed arrays.</summary>
        /// <returns>A reference to the <see cref="results"/> field.</returns>
        public static ref float[] SimpleSumArray()
        {
            for (int i = 0; i < left.Length; i++)
            {
                results[i] = left[i] + right[i];
            }

            return ref results;
        }

        /// <summary>Scalar sum through spans, into a freshly allocated results array.</summary>
        public static void SimpleSumSpan()
        {
            results = new float[ITEMS];
            resultsMemory = new Memory<float>(results);
            ReadOnlySpan<float> leftSpan = leftMemory.Span;
            ReadOnlySpan<float> rightSpan = rightMemory.Span;
            Span<float> resultsSpan = resultsMemory.Span;
            for (int i = 0; i < leftSpan.Length; i++)
            {
                resultsSpan[i] = leftSpan[i] + rightSpan[i];
            }
        }

        /// <summary>Scalar sum using raw pointers over pinned spans.</summary>
        public static unsafe void SimpleSumSpanUnsafe()
        {
            results = new float[ITEMS];
            resultsMemory = new Memory<float>(results);
            ReadOnlySpan<float> leftSpan = leftMemory.Span;
            ReadOnlySpan<float> rightSpan = rightMemory.Span;
            Span<float> resultsSpan = resultsMemory.Span;
            fixed (float* leftBasePtr = &leftSpan[0])
            fixed (float* rightBasePtr = &rightSpan[0])
            fixed (float* resultBasePtr = &resultsSpan[0])
            {
                float* leftCurrPtr = leftBasePtr;
                float* rightCurrPtr = rightBasePtr;
                float* resultCurrPtr = resultBasePtr;
                for (int i = 0; i < leftSpan.Length; i++)
                {
                    *resultCurrPtr = *leftCurrPtr + *rightCurrPtr;
                    rightCurrPtr++;
                    leftCurrPtr++;
                    resultCurrPtr++;
                }
            }
        }

        /// <summary>Vectorized sum that builds each vector from the arrays and copies it back.</summary>
        public static void SimpleSumVectors()
        {
            int ceiling = left.Length / floatSlots * floatSlots;
            results = new float[left.Length];
            for (int i = 0; i < ceiling; i += floatSlots)
            {
                Vector<float> resultVector = new Vector<float>(left, i) + new Vector<float>(right, i);
                resultVector.CopyTo(results, i);
            }
            // Finish operation with any numbers leftover
            for (int i = ceiling; i < left.Length; i++)
            {
                results[i] = left[i] + right[i];
            }
        }

        /// <summary>Vectorized sum that reinterprets the buffers as vectors without copying.</summary>
        public static void SimpleSumVectorsNoCopy()
        {
            int numVectors = left.Length / floatSlots;
            int ceiling = numVectors * floatSlots;
            results = new float[left.Length];
            resultsMemory = new Memory<float>(results);
            ReadOnlySpan<Vector<float>> leftVecArray = MemoryMarshal.Cast<float, Vector<float>>(leftMemory.Span);
            ReadOnlySpan<Vector<float>> rightVecArray = MemoryMarshal.Cast<float, Vector<float>>(rightMemory.Span);
            Span<Vector<float>> resultsVecArray = MemoryMarshal.Cast<float, Vector<float>>(resultsMemory.Span);
            for (int i = 0; i < numVectors; i++)
            {
                resultsVecArray[i] = leftVecArray[i] + rightVecArray[i];
            }
            // Finish operation with any numbers leftover
            for (int i = ceiling; i < left.Length; i++)
            {
                results[i] = left[i] + right[i];
            }
        }

        /// <summary>
        /// Vectorized sum over the pinned, aligned unsafe buffers; the result is written to
        /// <see cref="resultsUnsafe"/>.
        /// </summary>
        public static unsafe void SimpleSumVectorsUnsafe()
        {
            int numVectors = left.Length / floatSlots;
            int ceiling = numVectors * floatSlots;
            ReadOnlySpan<float> leftUnsafeSpan = new ReadOnlySpan<float>(leftUnsafe.BufferIntPtr.ToPointer(), ceiling);
            ReadOnlySpan<float> rightUnsafeSpan = new ReadOnlySpan<float>(rightUnsafe.BufferIntPtr.ToPointer(), ceiling);
            Span<float> resultsUnsafeSpan = new Span<float>(resultsUnsafe.BufferIntPtr.ToPointer(), ceiling);
            ReadOnlySpan<Vector<float>> leftVecArray = MemoryMarshal.Cast<float, Vector<float>>(leftUnsafeSpan);
            ReadOnlySpan<Vector<float>> rightVecArray = MemoryMarshal.Cast<float, Vector<float>>(rightUnsafeSpan);
            Span<Vector<float>> resultsVecArray = MemoryMarshal.Cast<float, Vector<float>>(resultsUnsafeSpan);
            for (int i = 0; i < numVectors; i++)
            {
                resultsVecArray[i] = leftVecArray[i] + rightVecArray[i];
            }
            // Finish operation with any numbers leftover
            for (int i = ceiling; i < left.Length; i++)
            {
                resultsUnsafe[i] = leftUnsafe[i] + rightUnsafe[i];
            }
        }
    }

    /// <summary>
    /// A pinned managed byte buffer exposed as a float buffer whose start address is aligned
    /// to a caller-chosen byte boundary (optionally shifted by a byte offset).
    /// </summary>
    public unsafe class UnsafeMemoryFloat : IDisposable
    {
        private byte[] byteBuffer;
        private GCHandle bufferGCHandle;
        private readonly IntPtr bufferIntPtr;
        private readonly int length;
        private bool disposedValue = false;

        /// <summary>
        /// Reads or writes the float at <paramref name="index"/>. No bounds checking is done.
        /// </summary>
        public float this[int index]
        {
            get => *((float*)bufferIntPtr.ToPointer() + index);
            set => *((float*)bufferIntPtr.ToPointer() + index) = value;
        }

        /// <summary>Number of float elements the buffer was created for.</summary>
        public int Length => length;

        /// <summary>Address of the first float element (aligned address plus offset).</summary>
        public IntPtr BufferIntPtr => bufferIntPtr;

        /// <summary>
        /// Allocates and pins a buffer large enough for <paramref name="len"/> floats.
        /// </summary>
        /// <param name="len">Number of float elements.</param>
        /// <param name="byteAlignment">Alignment of the buffer start, in bytes; must be positive.</param>
        /// <param name="offset">Bytes added after alignment (to deliberately misalign); must not be negative.</param>
        /// <exception cref="ArgumentOutOfRangeException">An argument is outside its valid range.</exception>
        public UnsafeMemoryFloat(int len, int byteAlignment, int offset)
        {
            if (len < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(len));
            }
            if (byteAlignment <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteAlignment));
            }
            if (offset < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(offset));
            }

            length = len;
            // Reserve room for the worst-case alignment padding AND the offset, so the
            // exposed window never runs past the end of the allocation.
            byteBuffer = new byte[checked(len * sizeof(float) + byteAlignment + offset)];
            bufferGCHandle = GCHandle.Alloc(byteBuffer, GCHandleType.Pinned);
            long address = bufferGCHandle.AddrOfPinnedObject().ToInt64();
            long misalignment = address % byteAlignment;
            if (misalignment != 0)
            {
                address += byteAlignment - misalignment;
            }
            address += offset;
            bufferIntPtr = new IntPtr(address);
        }

        /// <summary>Releases the pin if the instance was never disposed.</summary>
        ~UnsafeMemoryFloat()
        {
            Dispose(false);
        }

        #region IDisposable Support
        /// <summary>
        /// Frees the pinned handle. The handle is released on both the dispose and the
        /// finalizer path, because a leaked pinned handle keeps the array alive and immovable.
        /// </summary>
        protected virtual void Dispose(bool disposing)
        {
            if (disposedValue)
            {
                return;
            }
            if (bufferGCHandle.IsAllocated)
            {
                bufferGCHandle.Free();
            }
            byteBuffer = null;
            disposedValue = true;
        }

        /// <inheritdoc />
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }
        #endregion
    }
}
--------------------------------------------------------------------------------
/SIMDPerformanceDebug/Program.cs:
--------------------------------------------------------------------------------
using System;

namespace SIMDPerformanceDebug
{
    class Program
    {
        /// <summary>
        /// Runs every sum implementation and compares its output, element by element,
        /// against the result of the plain array implementation.
        /// </summary>
        static void Main(string[] args)
        {
            bool overallSuccess = true;

            FloatOps.SimpleSumArray();
            // Create a reference to compare future runs
            Array.Copy(FloatOps.results, FloatOps.resultsReference, FloatOps.results.Length);

            // Each operation is verified immediately after it runs, and a failure is
            // reported under the name of the operation that produced it.
            FloatOps.SimpleSumSpan();
            overallSuccess &= Verify("SimpleSumSpan");

            FloatOps.SimpleSumSpanUnsafe();
            overallSuccess &= Verify("SimpleSumSpanUnsafe");

            FloatOps.SimpleSumVectors();
            overallSuccess &= Verify("SimpleSumVectors");

            FloatOps.SimpleSumVectorsNoCopy();
            overallSuccess &= Verify("SimpleSumVectorsNoCopy");

            // This variant writes to the unsafe buffer, so compare that buffer instead.
            FloatOps.SimpleSumVectorsUnsafe();
            bool unsafeMatches = true;
            for (int i = 0; i < FloatOps.resultsUnsafe.Length; i++)
            {
                if (FloatOps.resultsUnsafe[i] != FloatOps.resultsReference[i])
                {
                    Console.WriteLine($"Result does not match starting at {i}: {FloatOps.resultsReference[i]} vs {FloatOps.resultsUnsafe[i]}");
                    unsafeMatches = false;
                    break;
                }
            }
            if (!unsafeMatches)
            {
                Console.WriteLine("Mismatch in SimpleSumVectorsUnsafe");
            }
            overallSuccess &= unsafeMatches;

            Console.WriteLine($"Finished. Success: {overallSuccess}");
            return;

            // Compares FloatOps.results with the reference and reports the first mismatch.
            bool Verify(string operationName)
            {
                for (int i = 0; i < FloatOps.results.Length; i++)
                {
                    if (FloatOps.resultsReference[i] != FloatOps.results[i])
                    {
                        Console.WriteLine($"Result does not match starting at {i}: {FloatOps.resultsReference[i]} vs {FloatOps.results[i]}");
                        Console.WriteLine($"Mismatch in {operationName}");
                        return false;
                    }
                }
                return true;
            }
        }
    }
}
--------------------------------------------------------------------------------
/SIMDPerformanceDebug/SIMDPerformanceDebug.csproj:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | Exe 5 | netcoreapp2.2 6 | 7 | 8 | 9 | true 10 | 11 | 12 | 13 | true 14 | 15 | 16 | 17 |
--------------------------------------------------------------------------------