├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md └── src ├── benchmark_cm256.cpp ├── benchmark_fastecc.cpp ├── benchmark_leopard.cpp ├── benchmark_wirehair.cpp ├── common.h ├── compile.cmd └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *.exe 2 | 3 | ## Ignore Visual Studio temporary files, build results, and 4 | ## files generated by popular Visual Studio add-ons. 5 | ## 6 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 7 | 8 | # User-specific files 9 | *.rsuser 10 | *.suo 11 | *.user 12 | *.userosscache 13 | *.sln.docstates 14 | 15 | # User-specific files (MonoDevelop/Xamarin Studio) 16 | *.userprefs 17 | 18 | # Mono auto generated files 19 | mono_crash.* 20 | 21 | # Build results 22 | [Dd]ebug/ 23 | [Dd]ebugPublic/ 24 | [Rr]elease/ 25 | [Rr]eleases/ 26 | x64/ 27 | x86/ 28 | [Aa][Rr][Mm]/ 29 | [Aa][Rr][Mm]64/ 30 | bld/ 31 | [Bb]in/ 32 | [Oo]bj/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # StyleCop 67 | StyleCopReport.xml 68 | 69 | # Files built by Visual Studio 70 | *_i.c 71 | *_p.c 72 | *_h.h 73 | *.ilk 74 | *.meta 75 | *.obj 76 | *.iobj 77 | *.pch 78 | *.pdb 79 | *.ipdb 80 | *.pgc 81 | *.pgd 82 | *.rsp 83 | *.sbr 84 | *.tlb 85 | *.tli 86 | *.tlh 87 | *.tmp 88 | *.tmp_proj 89 | *_wpftmp.csproj 90 | *.log 91 | 
*.vspscc 92 | *.vssscc 93 | .builds 94 | *.pidb 95 | *.svclog 96 | *.scc 97 | 98 | # Chutzpah Test files 99 | _Chutzpah* 100 | 101 | # Visual C++ cache files 102 | ipch/ 103 | *.aps 104 | *.ncb 105 | *.opendb 106 | *.opensdf 107 | *.sdf 108 | *.cachefile 109 | *.VC.db 110 | *.VC.VC.opendb 111 | 112 | # Visual Studio profiler 113 | *.psess 114 | *.vsp 115 | *.vspx 116 | *.sap 117 | 118 | # Visual Studio Trace Files 119 | *.e2e 120 | 121 | # TFS 2012 Local Workspace 122 | $tf/ 123 | 124 | # Guidance Automation Toolkit 125 | *.gpState 126 | 127 | # ReSharper is a .NET coding add-in 128 | _ReSharper*/ 129 | *.[Rr]e[Ss]harper 130 | *.DotSettings.user 131 | 132 | # TeamCity is a build add-in 133 | _TeamCity* 134 | 135 | # DotCover is a Code Coverage Tool 136 | *.dotCover 137 | 138 | # AxoCover is a Code Coverage Tool 139 | .axoCover/* 140 | !.axoCover/settings.json 141 | 142 | # Visual Studio code coverage results 143 | *.coverage 144 | *.coveragexml 145 | 146 | # NCrunch 147 | _NCrunch_* 148 | .*crunch*.local.xml 149 | nCrunchTemp_* 150 | 151 | # MightyMoose 152 | *.mm.* 153 | AutoTest.Net/ 154 | 155 | # Web workbench (sass) 156 | .sass-cache/ 157 | 158 | # Installshield output folder 159 | [Ee]xpress/ 160 | 161 | # DocProject is a documentation generator add-in 162 | DocProject/buildhelp/ 163 | DocProject/Help/*.HxT 164 | DocProject/Help/*.HxC 165 | DocProject/Help/*.hhc 166 | DocProject/Help/*.hhk 167 | DocProject/Help/*.hhp 168 | DocProject/Help/Html2 169 | DocProject/Help/html 170 | 171 | # Click-Once directory 172 | publish/ 173 | 174 | # Publish Web Output 175 | *.[Pp]ublish.xml 176 | *.azurePubxml 177 | # Note: Comment the next line if you want to checkin your web deploy settings, 178 | # but database connection strings (with potential passwords) will be unencrypted 179 | *.pubxml 180 | *.publishproj 181 | 182 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 183 | # checkin your Azure Web App publish settings, but sensitive information contained 184 | # in these scripts will be unencrypted 185 | PublishScripts/ 186 | 187 | # NuGet Packages 188 | *.nupkg 189 | # NuGet Symbol Packages 190 | *.snupkg 191 | # The packages folder can be ignored because of Package Restore 192 | **/[Pp]ackages/* 193 | # except build/, which is used as an MSBuild target. 194 | !**/[Pp]ackages/build/ 195 | # Uncomment if necessary however generally it will be regenerated when needed 196 | #!**/[Pp]ackages/repositories.config 197 | # NuGet v3's project.json files produces more ignorable files 198 | *.nuget.props 199 | *.nuget.targets 200 | 201 | # Microsoft Azure Build Output 202 | csx/ 203 | *.build.csdef 204 | 205 | # Microsoft Azure Emulator 206 | ecf/ 207 | rcf/ 208 | 209 | # Windows Store app package directories and files 210 | AppPackages/ 211 | BundleArtifacts/ 212 | Package.StoreAssociation.xml 213 | _pkginfo.txt 214 | *.appx 215 | *.appxbundle 216 | *.appxupload 217 | 218 | # Visual Studio cache files 219 | # files ending in .cache can be ignored 220 | *.[Cc]ache 221 | # but keep track of directories ending in .cache 222 | !?*.[Cc]ache/ 223 | 224 | # Others 225 | ClientBin/ 226 | ~$* 227 | *~ 228 | *.dbmdl 229 | *.dbproj.schemaview 230 | *.jfm 231 | *.pfx 232 | *.publishsettings 233 | orleans.codegen.cs 234 | 235 | # Including strong name files can present a security risk 236 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 237 | #*.snk 238 | 239 | # Since there are multiple workflows, uncomment next line to ignore bower_components 240 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 241 | #bower_components/ 242 | 243 | # RIA/Silverlight projects 244 | Generated_Code/ 245 | 246 | # Backup & report files from converting an old project file 247 | # to a newer Visual Studio version. 
Backup files are not needed, 248 | # because we have git ;-) 249 | _UpgradeReport_Files/ 250 | Backup*/ 251 | UpgradeLog*.XML 252 | UpgradeLog*.htm 253 | ServiceFabricBackup/ 254 | *.rptproj.bak 255 | 256 | # SQL Server files 257 | *.mdf 258 | *.ldf 259 | *.ndf 260 | 261 | # Business Intelligence projects 262 | *.rdl.data 263 | *.bim.layout 264 | *.bim_*.settings 265 | *.rptproj.rsuser 266 | *- [Bb]ackup.rdl 267 | *- [Bb]ackup ([0-9]).rdl 268 | *- [Bb]ackup ([0-9][0-9]).rdl 269 | 270 | # Microsoft Fakes 271 | FakesAssemblies/ 272 | 273 | # GhostDoc plugin setting file 274 | *.GhostDoc.xml 275 | 276 | # Node.js Tools for Visual Studio 277 | .ntvs_analysis.dat 278 | node_modules/ 279 | 280 | # Visual Studio 6 build log 281 | *.plg 282 | 283 | # Visual Studio 6 workspace options file 284 | *.opt 285 | 286 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 287 | *.vbw 288 | 289 | # Visual Studio LightSwitch build output 290 | **/*.HTMLClient/GeneratedArtifacts 291 | **/*.DesktopClient/GeneratedArtifacts 292 | **/*.DesktopClient/ModelManifest.xml 293 | **/*.Server/GeneratedArtifacts 294 | **/*.Server/ModelManifest.xml 295 | _Pvt_Extensions 296 | 297 | # Paket dependency manager 298 | .paket/paket.exe 299 | paket-files/ 300 | 301 | # FAKE - F# Make 302 | .fake/ 303 | 304 | # CodeRush personal settings 305 | .cr/personal 306 | 307 | # Python Tools for Visual Studio (PTVS) 308 | __pycache__/ 309 | *.pyc 310 | 311 | # Cake - Uncomment if you are using it 312 | # tools/** 313 | # !tools/packages.config 314 | 315 | # Tabs Studio 316 | *.tss 317 | 318 | # Telerik's JustMock configuration file 319 | *.jmconfig 320 | 321 | # BizTalk build output 322 | *.btp.cs 323 | *.btm.cs 324 | *.odx.cs 325 | *.xsd.cs 326 | 327 | # OpenCover UI analysis results 328 | OpenCover/ 329 | 330 | # Azure Stream Analytics local run output 331 | ASALocalRun/ 332 | 333 | # MSBuild Binary and Structured Log 334 | *.binlog 335 | 336 | # NVidia Nsight GPU debugger 
configuration file 337 | *.nvuser 338 | 339 | # MFractors (Xamarin productivity tool) working folder 340 | .mfractor/ 341 | 342 | # Local History for Visual Studio 343 | .localhistory/ 344 | 345 | # BeatPulse healthcheck temp database 346 | healthchecksdb 347 | 348 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 349 | MigrationBackup/ 350 | 351 | # Ionide (cross platform F# VS Code tools) working folder 352 | .ionide/ 353 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/cm256"] 2 | path = external/cm256 3 | url = https://github.com/catid/cm256 4 | [submodule "external/leopard"] 5 | path = external/leopard 6 | url = https://github.com/catid/leopard 7 | [submodule "external/FastECC"] 8 | path = external/FastECC 9 | url = https://github.com/Bulat-Ziganshin/FastECC 10 | [submodule "external/wirehair"] 11 | path = external/wirehair 12 | url = https://github.com/catid/wirehair 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Bulat-Ziganshin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Comparison of leading error-correcting code implementations 2 | 3 | We plan to compare: 4 | - O(N^2) Reed-Solomon codecs: 5 | - [x] [CM256](https://github.com/catid/cm256) - GF(2^8) 6 | - [ ] [Intel ISA-L](https://github.com/intel/isa-l) - GF(2^8) 7 | - O(N*log(N)) Reed-Solomon codecs: 8 | - [x] [Leopard](https://github.com/catid/leopard) - uses [FWHT](https://en.wikipedia.org/wiki/Fast_Walsh%E2%80%93Hadamard_transform) in GF(2^8) or GF(2^16), up to 2^16 blocks, data blocks >= parity blocks 9 | - [x] [FastECC](https://github.com/Bulat-Ziganshin/FastECC) - uses FFT in GF(p), up to 2^20 blocks 10 | - O(N) non-MDS codec: 11 | - [x] [Wirehair](https://github.com/catid/wirehair) - fountain code, up to 64000 data blocks 12 | 13 | SIMD usage: 14 | - CM256, Leopard and Wirehair provides AVX2/SSSE3/Neon64/Neon-optimized code paths 15 | - Intel ISA-L provides AVX512/AVX2/AVX/SSSE3/Neon/SVE/VSX-optimized code paths 16 | - FastECC provides AVX2/SSE2-optimized code paths 17 | 18 | So far, the benchmark is single-threaded. Leopard and FastECC have built-in OpenMP support, which may be enabled by adding `-fopenmp` to the compilation commands. 
19 | 20 | 21 | ## Results 22 | 23 | Notes: 24 | - 80+20 means 80 data blocks and 20 parity blocks 25 | - Encoding speeds are measured in terms of original data processed 26 | - Decoding speeds are measured in terms of recovered data produced: 27 | - first test recovers single block, so `speed = one block size / time` 28 | - second test recovers as much blocks as code can do, so `speed = size of all parity blocks / time` 29 | - Each program run involves multiple "trials", 1000 by default, and we compute average time of trial 30 | - Formatted results are represented by the best runs among multiple experiments 31 | - Raw results are the single runs, just for quick comparison 32 | - Block sizes for each run were optimized to fit all data into L3 cache, but fixed to 4 KB for large codewords 33 | - Benchmark CPU is i7-8665U (4C/8T Skylake running at 3.3-4.5 GHz) 34 | 35 | 36 | ### Formatted results of CM256 37 | 38 | CM256: 39 | 40 | | AVX2 | Encoding all | Decoding one | Decoding all | 41 | | -------: | -----------: | ------------: | ------------: | 42 | | 200+50 | 643 MB/s | 156 MB/s | 159 MB/s | 43 | | 50+50 | 635 MB/s | 636 MB/s | 716 MB/s | 44 | | 80+20 | 1650 MB/s | 403 MB/s | 413 MB/s | 45 | | 20+20 | 1606 MB/s | 1562 MB/s | 1880 MB/s | 46 | 47 | | SSSE3 | Encoding all | Decoding one | Decoding all | 48 | | -------: | -----------: | ------------: | ------------: | 49 | | 200+50 | 336 MB/s | 84 MB/s | 86 MB/s | 50 | | 50+50 | 346 MB/s | 339 MB/s | 352 MB/s | 51 | | 80+20 | 882 MB/s | 212 MB/s | 219 MB/s | 52 | | 20+20 | 892 MB/s | 866 MB/s | 892 MB/s | 53 | 54 | 55 | ### Raw results with AVX2 56 | 57 | ``` 58 | D:\>bench_avx2 200 50 16384 100 59 | Params: data_blocks=200 parity_blocks=50 chunk_size=16384 trials=100 60 | CM256 (avx2, 64-bit): 61 | encode: 5219 usec, 628 MB/s 62 | decode one: 109 usec, 151 MB/s 63 | decode all: 4677 usec, 175 MB/s 64 | Leopard (avx2, 64-bit): 65 | encode: 1377 usec, 2379 MB/s 66 | decode one: 4574 usec, 4 MB/s 67 | decode all: 4401 
usec, 186 MB/s 68 | FastECC 0xfff00001 32-bit 69 | encode: 7129 usec, 460 MB/s 70 | Wirehair (64-bit): 71 | encode: 2272 usec, 1443 MB/s 72 | decode one: 2506 usec, 7 MB/s 73 | decode all: 3061 usec, 268 MB/s 74 | 75 | D:\>bench_avx2 50 50 16384 1000 76 | Params: data_blocks=50 parity_blocks=50 chunk_size=16384 trials=1000 77 | CM256 (avx2, 64-bit): 78 | encode: 1306 usec, 627 MB/s 79 | decode one: 27 usec, 606 MB/s 80 | decode all: 1182 usec, 693 MB/s 81 | Leopard (avx2, 64-bit): 82 | encode: 228 usec, 3593 MB/s 83 | decode one: 599 usec, 27 MB/s 84 | decode all: 673 usec, 1218 MB/s 85 | FastECC 0xfff00001 32-bit 86 | encode: 1324 usec, 619 MB/s 87 | Wirehair (64-bit): 88 | encode: 603 usec, 1360 MB/s 89 | decode one: 527 usec, 31 MB/s 90 | decode all: 678 usec, 1209 MB/s 91 | 92 | D:\>bench_avx2 80 20 16384 1000 93 | Params: data_blocks=80 parity_blocks=20 chunk_size=16384 trials=1000 94 | CM256 (avx2, 64-bit): 95 | encode: 851 usec, 1540 MB/s 96 | decode one: 44 usec, 370 MB/s 97 | decode all: 755 usec, 434 MB/s 98 | Leopard (avx2, 64-bit): 99 | encode: 239 usec, 5485 MB/s 100 | decode one: 594 usec, 28 MB/s 101 | decode all: 620 usec, 529 MB/s 102 | FastECC 0xfff00001 32-bit 103 | encode: 3227 usec, 406 MB/s 104 | Wirehair (64-bit): 105 | encode: 977 usec, 1342 MB/s 106 | decode one: 1069 usec, 15 MB/s 107 | decode all: 1225 usec, 268 MB/s 108 | 109 | D:\>bench_avx2 20 20 65536 500 110 | Params: data_blocks=20 parity_blocks=20 chunk_size=65536 trials=500 111 | CM256 (avx2, 64-bit): 112 | encode: 1230 usec, 1066 MB/s 113 | decode one: 62 usec, 1053 MB/s 114 | decode all: 1238 usec, 1059 MB/s 115 | Leopard (avx2, 64-bit): 116 | encode: 586 usec, 2235 MB/s 117 | decode one: 1536 usec, 43 MB/s 118 | decode all: 1571 usec, 834 MB/s 119 | FastECC 0xfff00001 32-bit 120 | encode: 2378 usec, 551 MB/s 121 | Wirehair (64-bit): 122 | encode: 1643 usec, 798 MB/s 123 | decode one: 1579 usec, 41 MB/s 124 | decode all: 1808 usec, 725 MB/s 125 | ``` 126 | 127 | 128 | ### Raw 
results with SSSE3 129 | 130 | ``` 131 | D:\>bench_sse4 200 50 16384 100 132 | Params: data_blocks=200 parity_blocks=50 chunk_size=16384 trials=100 133 | CM256 (ssse3, 64-bit): 134 | encode: 11353 usec, 289 MB/s 135 | decode one: 233 usec, 70 MB/s 136 | decode all: 11088 usec, 74 MB/s 137 | Leopard (ssse3, 64-bit): 138 | encode: 2558 usec, 1281 MB/s 139 | decode one: 7345 usec, 2 MB/s 140 | decode all: 7490 usec, 109 MB/s 141 | FastECC 0xfff00001 32-bit 142 | encode: 11768 usec, 278 MB/s 143 | Wirehair (64-bit): 144 | encode: 2955 usec, 1109 MB/s 145 | decode one: 3164 usec, 5 MB/s 146 | decode all: 3615 usec, 227 MB/s 147 | 148 | D:\>bench_sse4 50 50 16384 1000 149 | Params: data_blocks=50 parity_blocks=50 chunk_size=16384 trials=1000 150 | CM256 (ssse3, 64-bit): 151 | encode: 2731 usec, 300 MB/s 152 | decode one: 56 usec, 292 MB/s 153 | decode all: 2719 usec, 301 MB/s 154 | Leopard (ssse3, 64-bit): 155 | encode: 460 usec, 1781 MB/s 156 | decode one: 1131 usec, 14 MB/s 157 | decode all: 1268 usec, 646 MB/s 158 | FastECC 0xfff00001 32-bit 159 | encode: 2123 usec, 386 MB/s 160 | Wirehair (64-bit): 161 | encode: 970 usec, 844 MB/s 162 | decode one: 828 usec, 20 MB/s 163 | decode all: 1051 usec, 780 MB/s 164 | 165 | D:\>bench_sse4 80 20 16384 1000 166 | Params: data_blocks=80 parity_blocks=20 chunk_size=16384 trials=1000 167 | CM256 (ssse3, 64-bit): 168 | encode: 1689 usec, 776 MB/s 169 | decode one: 88 usec, 187 MB/s 170 | decode all: 1699 usec, 193 MB/s 171 | Leopard (ssse3, 64-bit): 172 | encode: 436 usec, 3006 MB/s 173 | decode one: 1115 usec, 15 MB/s 174 | decode all: 1152 usec, 284 MB/s 175 | FastECC 0xfff00001 32-bit 176 | encode: 4840 usec, 271 MB/s 177 | Wirehair (64-bit): 178 | encode: 1192 usec, 1100 MB/s 179 | decode one: 1275 usec, 13 MB/s 180 | decode all: 1408 usec, 233 MB/s 181 | 182 | D:\>bench_sse4 20 20 65536 500 183 | Params: data_blocks=20 parity_blocks=20 chunk_size=65536 trials=500 184 | CM256 (ssse3, 64-bit): 185 | encode: 1872 usec, 700 MB/s 
186 | decode one: 97 usec, 674 MB/s 187 | decode all: 1864 usec, 703 MB/s 188 | Leopard (ssse3, 64-bit): 189 | encode: 866 usec, 1514 MB/s 190 | decode one: 2250 usec, 29 MB/s 191 | decode all: 2377 usec, 551 MB/s 192 | FastECC 0xfff00001 32-bit 193 | encode: 3749 usec, 350 MB/s 194 | Wirehair (64-bit): 195 | encode: 2267 usec, 578 MB/s 196 | decode one: 2087 usec, 31 MB/s 197 | decode all: 2341 usec, 560 MB/s 198 | ``` 199 | 200 | 201 | ### Raw results for larger codewords 202 | 203 | ``` 204 | D:\>bench_avx2 2048 2048 4096 100 205 | Params: data_blocks=2048 parity_blocks=2048 chunk_size=4096 trials=100 206 | Leopard (avx2, 64-bit): 207 | encode: 8612 usec, 974 MB/s 208 | decode one: 18663 usec, 0 MB/s 209 | decode all: 21211 usec, 395 MB/s 210 | FastECC 0xfff00001 32-bit 211 | encode: 23819 usec, 352 MB/s 212 | Wirehair (64-bit): 213 | encode: 8301 usec, 1011 MB/s 214 | decode one: 6920 usec, 1 MB/s 215 | decode all: 9668 usec, 868 MB/s 216 | 217 | D:\>bench_avx2 32000 32000 4096 20 218 | Params: data_blocks=32000 parity_blocks=32000 chunk_size=4096 trials=20 219 | Leopard (avx2, 64-bit): 220 | encode: 216624 usec, 605 MB/s 221 | decode one: 427401 usec, 0 MB/s 222 | decode all: 515774 usec, 254 MB/s 223 | FastECC 0xfff00001 32-bit 224 | encode: 584607 usec, 224 MB/s 225 | Wirehair (64-bit): 226 | encode: 245237 usec, 534 MB/s 227 | decode one: 197916 usec, 0 MB/s 228 | decode all: 272011 usec, 482 MB/s 229 | ``` 230 | 231 | Now the same with OpenMP: 232 | ``` 233 | D:\>bench_avx2_openmp 2048 2048 4096 100 234 | Params: data_blocks=2048 parity_blocks=2048 chunk_size=4096 trials=100 235 | Leopard (avx2, 64-bit): 236 | encode: 6204 usec, 1352 MB/s 237 | decode one: 36027 usec, 0 MB/s 238 | decode all: 37741 usec, 222 MB/s 239 | FastECC 0xfff00001 32-bit 240 | encode: 7182 usec, 1168 MB/s 241 | Wirehair (64-bit): 242 | encode: 8446 usec, 993 MB/s 243 | decode one: 6943 usec, 1 MB/s 244 | decode all: 9709 usec, 864 MB/s 245 | 246 | D:\>bench_avx2_openmp 32000 32000 
4096 20 247 | Params: data_blocks=32000 parity_blocks=32000 chunk_size=4096 trials=20 248 | Leopard (avx2, 64-bit): 249 | encode: 206935 usec, 633 MB/s 250 | decode one: 880574 usec, 0 MB/s 251 | decode all: 963278 usec, 136 MB/s 252 | FastECC 0xfff00001 32-bit 253 | encode: 209445 usec, 626 MB/s 254 | Wirehair (64-bit): 255 | encode: 257553 usec, 509 MB/s 256 | decode one: 202161 usec, 0 MB/s 257 | decode all: 284040 usec, 461 MB/s 258 | ``` 259 | 260 | 261 | ## Conclusions 262 | 263 | ### Encoding speed 264 | 265 | O(N^2) algorithms encoding speed reported in THIS benchmark 266 | is O(1/number_of_parity_words). It's why: 267 | 268 | So-called O(N^2) algorithms really are `O(M*K)`. 269 | It's because the RS matrix algo multiples vector of M words (input data) 270 | by `K*M` matrix and gets vector of K words (parity), 271 | which requires `K*M` multiplications and additions. 272 | 273 | When you have any `O(K*M)` algo with M input words and K output words, 274 | you can say that its speed is O(1/K) relative to input data processed 275 | or O(1/M) relative to output data produced :slight_smile: 276 | This benchmark reports encoding speed relative to input data size, 277 | so matrix RS algos speed is O(1/number_of_parity_words) 278 | 279 | As of cache effects, I optimized the chunk size for each M+K setting 280 | to reach best results. For larger codewords it means smaller chunks 281 | and thus a bit higher overheads, but effect was within 1% 282 | (i.e. for 20+20 I used 64KB blocks, but even with 4KB blocks 283 | it will be only 10% slower) 284 | 285 | 286 | ### Recovery speed 287 | 288 | In O(N^2) RS algos, recovery of multiple blocks is just 289 | recovery of a single block performed multiple times. 290 | Thus, speed per block is the same (modulo setup time). 291 | More concrete, for K data blocks, M parity blocks, 292 | and blocksize B, encoding time is `O(K*M*B)`. 
293 | Decoding L lost blocks will take `O(K*L*B)` 294 | (it combines K survived blocks to recompute each lost block). 295 | 296 | But in fast RS algos, single block recovery requires almost 297 | the same amount of work as recovery of all lost blocks 298 | in the worst case, since FFT+IFFT steps don't depend on 299 | the amount of blocks we are going to recover. 300 | 301 | Thus, matrix algorithms will always be faster for recovery 302 | of only one or few missing blocks 303 | 304 | We can counterfight that by using matrix computations 305 | for small recoveries in fast algos too. At least it's 306 | possible for FastECC. This requires computation of 307 | Newton polynomial thus O(N^2) divisions - but it probably 308 | is still faster than O(B\*N\*log(N)) multiplications 309 | required for full decoding. 310 | 311 | 312 | ### Precomputed tables 313 | 314 | ISA-L API is more low-level - you can compute encoding tables 315 | just once and use them in multiple calls. It's especially 316 | important when we want to process a stream with many gigabytes 317 | using just a few megabytes of memory. 318 | 319 | Moreover, it may be possible to use CM256-computed tables with ISA-L. 320 | They have [two advantages](https://github.com/catid/cm256#comparisons-with-other-libraries) over ISA-L tables: 321 | - first parity block is just XOR of all data blocks 322 | - recovery tables are computed faster 323 | 324 | When encoding or decoding operations with the same parameters 325 | are repeated multiple times, it can make sense to keep cache 326 | of such tables in order to avoid costly initialization. 327 | The most obvious example is recovery of data of missed node 328 | in ECC-protected distributed storage like [Codex](https://github.com/status-im/nim-codex). 329 | 330 | 331 | ### Art of benchmarking 332 | 333 | Overall, proper benchmarking is an art of its own. 
334 | AVX usually runs at slower frequencies and have weird implementation, 335 | this means that we better skip a first millisecond of its execution 336 | and don't mix AVX and non-AVX code. 337 | 338 | Mobile CPUs are tend to lower freqs on load, especially on m/t load, 339 | and after prolonged load they may further lower freq due to overheating. 340 | So, ideally we should skip a first few trials and then measure fastest one 341 | (when CPU had highest freq). 342 | But afair, cpu time measure sometimes may be incorrect when thread is switched 343 | to another core, so we have either to pin task to a single core or drop a few outliers. 344 | 345 | 346 | -------------------------------------------------------------------------------- /src/benchmark_cm256.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Benchmarking CM256 library: https://github.com/catid/cm256 3 | // 4 | 5 | #include 6 | #include 7 | 8 | #include "../src/gf256.cpp" 9 | 10 | #include "common.h" 11 | 12 | 13 | // Perform single encoding operation, return false if it fails 14 | bool cm256_benchmark_encode( 15 | ECC_bench_params params, 16 | uint8_t* originalFileData, 17 | uint8_t* recoveryBlocks, 18 | OperationTimer& encode_time) 19 | { 20 | // Pointers to data 21 | cm256_block blocks[256]; 22 | for (int i = 0; i < params.OriginalCount; ++i) 23 | { 24 | blocks[i].Block = originalFileData + i * params.BlockBytes; 25 | } 26 | 27 | encode_time.BeginCall(); 28 | // Generate recovery data 29 | if (cm256_encode(params, blocks, recoveryBlocks)) 30 | { 31 | printf(" cm256_encode failed\n"); 32 | return false; 33 | } 34 | encode_time.EndCall(); 35 | 36 | return true; 37 | } 38 | 39 | 40 | // Perform single operation decoding single lost block, return false if it fails 41 | bool cm256_benchmark_decode_one_block( 42 | ECC_bench_params params, 43 | uint8_t* originalFileData, 44 | uint8_t* recoveryBlocks, 45 | OperationTimer& decode_time) 46 | { 47 | // Pointers 
to data 48 | cm256_block blocks[256]; 49 | 50 | // Initialize the indices 51 | for (int i = 0; i < params.OriginalCount; ++i) 52 | { 53 | blocks[i].Block = originalFileData + i * params.BlockBytes; 54 | blocks[i].Index = cm256_get_original_block_index(params, i); 55 | } 56 | 57 | //// Simulate loss of data, subsituting a recovery block in its place //// 58 | int lostBlock = params.RecoveryCount==1? 0 : 1; // Since recovery block #0 recovers much faster using just XORs 59 | blocks[0].Block = recoveryBlocks + lostBlock * params.BlockBytes; // A recovery block 60 | blocks[0].Index = cm256_get_recovery_block_index(params, lostBlock); // A recovery block index 61 | //// Simulate loss of data, subsituting a recovery block in its place //// 62 | 63 | decode_time.BeginCall(); 64 | if (cm256_decode(params, blocks)) 65 | { 66 | printf(" cm256_decode failed\n"); 67 | return false; 68 | } 69 | decode_time.EndCall(); 70 | 71 | // blocks[0].Index will now be = lostBlock 72 | // and blocks[0].Block overwritten with recovered data 73 | 74 | return true; 75 | } 76 | 77 | 78 | // Perform single operation decoding as much blocks as possible, return false if it fails 79 | bool cm256_benchmark_decode_all_blocks( 80 | ECC_bench_params params, 81 | uint8_t* originalFileData, 82 | uint8_t* recoveryBlocks, 83 | OperationTimer& decode_time) 84 | { 85 | // Pointers to data 86 | cm256_block blocks[256]; 87 | 88 | // Initialize the indices for recovery operation 89 | for (int i = 0; i < params.OriginalCount; ++i) 90 | { 91 | if (i < params.RecoveryCount) { 92 | // Simulate loss of data, subsituting a recovery block in its place 93 | blocks[i].Block = recoveryBlocks + i * params.BlockBytes; // recovery block 94 | blocks[i].Index = cm256_get_recovery_block_index(params, i); // recovery block index 95 | } else { 96 | blocks[i].Block = originalFileData + i * params.BlockBytes; // data block 97 | blocks[i].Index = cm256_get_original_block_index(params, i); // data block index 98 | } 99 | } 100 | 
101 | decode_time.BeginCall(); 102 | if (cm256_decode(params, blocks)) 103 | { 104 | printf(" cm256_decode failed\n"); 105 | return false; 106 | } 107 | decode_time.EndCall(); 108 | 109 | // For each i, 110 | // blocks[i].Index will now be = cm256_get_original_block_index(params, i) 111 | // and blocks[i].Block overwritten with recovered data of this block 112 | 113 | return true; 114 | } 115 | 116 | 117 | // Benchmark library and print results, return false if anything failed 118 | bool cm256_benchmark_main(ECC_bench_params params, uint8_t* buffer) 119 | { 120 | if (params.OriginalCount + params.RecoveryCount > 256) 121 | return false; 122 | 123 | // Initialize library and choose CPU SIMD extension to use 124 | if (cm256_init()) { 125 | printf("cm256_init failed\n"); 126 | return false; 127 | } 128 | 129 | // Print CPU SIMD extensions used to accelerate library in this run 130 | // (depends on compilation options such as -mavx2 and actual CPU) 131 | printf("CM256 (%s, %d-bit):\n", 132 | #ifndef GF256_TARGET_MOBILE 133 | # ifdef GF256_TRY_AVX2 134 | CpuHasAVX2? "avx2": 135 | # endif 136 | CpuHasSSSE3? "ssse3": 137 | #endif 138 | #if defined(GF256_TRY_NEON) 139 | CpuHasNeon64? "neon64": 140 | CpuHasNeon? "neon": 141 | #endif 142 | "", sizeof(size_t)*8); 143 | 144 | 145 | // Places for original and parity data 146 | auto originalFileData = buffer; 147 | auto recoveryBlocks = buffer + params.OriginalFileBytes(); 148 | 149 | // Total encode/decode times 150 | OperationTimer encode_time, decode_one_time, decode_all_time; 151 | 152 | // Repeat benchmark multiple times to improve its accuracy 153 | for (int trial = 0; trial < params.Trials; ++trial) 154 | { 155 | if (! cm256_benchmark_encode(params, originalFileData, recoveryBlocks, encode_time)) { 156 | return false; 157 | } 158 | if (! cm256_benchmark_decode_one_block(params, originalFileData, recoveryBlocks, decode_one_time)) { 159 | return false; 160 | } 161 | if (! 
cm256_benchmark_encode(params, originalFileData, recoveryBlocks, encode_time)) { 162 | return false; 163 | } 164 | if (! cm256_benchmark_decode_all_blocks(params, originalFileData, recoveryBlocks, decode_all_time)) { 165 | return false; 166 | } 167 | } 168 | 169 | // Benchmark reports for each operation 170 | encode_time.Print("encode", params.OriginalFileBytes()); 171 | decode_one_time.Print("decode one", params.BlockBytes); 172 | decode_all_time.Print("decode all", params.RecoveryDataBytes()); 173 | 174 | return true; 175 | } 176 | -------------------------------------------------------------------------------- /src/benchmark_fastecc.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Benchmarking FastECC library: https://github.com/Bulat-Ziganshin/FastECC 3 | // 4 | // Unfortunately, this early version of the library doesn't export ready-to-use encoder 5 | // so we literally copied this code from RS.cpp: 6 | // 7 | // Implementation of the Reed-Solomon algo in O(N*log(N)) using Number-Theoretical Transform in GF(p) 8 | // 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "common.h" 15 | 16 | #include "GF(p).cpp" 17 | #include "ntt.cpp" 18 | 19 | 20 | // Extra workspace used by the library on top of place required for original data 21 | size_t fastecc_extra_space(ECC_bench_params params) 22 | { 23 | return params.BlockBytes * params.OriginalCount; 24 | } 25 | 26 | 27 | // Benchmark encoding using the Reed-Solomon algo 28 | template 29 | void EncodeReedSolomon (size_t N, size_t SIZE, T **data) 30 | { 31 | // 1. iNTT: polynomial interpolation. We find coefficients of order-N polynomial describing the source data 32 | MFA_NTT (data, N, SIZE, true); 33 | // Now we should divide results by N in order to get coefficients, but we combined this operation with the multiplication below 34 | 35 | // Now we can evaluate the polynomial at 2*N points. 
36 | // Points with even index will contain the source data, 37 | // while points with odd indexes may be used as ECC data. 38 | // But more efficient approach is to compute only odd-indexed points. 39 | // This is accomplished by the following steps: 40 | 41 | // 2. Multiply the polynomial coefficients by root(2*N)**i 42 | T root_2N = GF_Root(2*N), inv_N = GF_Inv(N); 43 | #pragma omp parallel for 44 | for (ptrdiff_t i=0; i (inv_N, GF_Pow(root_2N,i)); // root_2N**i / N (combine division by N with multiplication by powers of the root) 46 | T* __restrict__ block = data[i]; 47 | for (size_t k=0; k (block[k], root_i); 49 | } 50 | } 51 | 52 | // 3. NTT: polynomial evaluation. This evaluates the modified polynomial at root(N)**i points, 53 | // that is equivalent to evaluation of the original polynomial at root(2*N)**(2*i+1) points. 54 | MFA_NTT (data, N, SIZE, false); 55 | 56 | // Further optimization: in order to compute only even-indexed points, 57 | // it's enough to compute order-N/2 NTT of data[i]+data[i+N/2]. And so on... 
58 | } 59 | 60 | 61 | template 62 | bool fastecc_benchmark_specialize(ECC_bench_params params, uint8_t* buffer) 63 | { 64 | // Total encode/decode times 65 | OperationTimer encode_time, decode_one_time, decode_all_time; 66 | 67 | size_t N = NextPow2( std::max( params.OriginalCount, params.RecoveryCount)); // NTT order 68 | size_t SIZE = params.BlockBytes / sizeof(T); 69 | 70 | // Use extra space because algorithm overwrites data in-place 71 | T *data0 = (T*) (buffer + params.OriginalFileBytes()); 72 | 73 | // Fill space with values < P (larger values are incompatible with FastECC algorithm) 74 | for (size_t i=0; i (N, SIZE, data); 90 | encode_time.EndCall(); 91 | } 92 | 93 | // Benchmark reports for each operation 94 | encode_time.Print("encode", params.OriginalFileBytes()); 95 | 96 | return true; 97 | } 98 | 99 | 100 | // Benchmark library and print results, return false if anything failed 101 | bool fastecc_benchmark_main(ECC_bench_params params, uint8_t* buffer) 102 | { 103 | return fastecc_benchmark_specialize (params, buffer); 104 | } 105 | -------------------------------------------------------------------------------- /src/benchmark_leopard.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Benchmarking Leopard library: https://github.com/catid/leopard 3 | // 4 | 5 | #include 6 | #include 7 | 8 | #include "common.h" 9 | 10 | #include "LeopardFF8.cpp" 11 | #undef LEO_MUL_128 12 | #undef LEO_MULADD_128 13 | #undef LEO_MUL_256 14 | #undef LEO_MULADD_256 15 | #undef LEO_IFFTB_256 16 | #undef LEO_IFFTB_128 17 | #undef LEO_FFTB_256 18 | #undef LEO_FFTB_128 19 | #include "LeopardFF16.cpp" 20 | #include "LeopardCommon.cpp" 21 | #include "leopard.cpp" 22 | 23 | 24 | // Extra workspace used by the library on top of place required for original data 25 | size_t leopard_extra_space(ECC_bench_params params) 26 | { 27 | size_t encode_work_count = leo_encode_work_count(params.OriginalCount, params.RecoveryCount); 28 | size_t 
decode_work_count = leo_decode_work_count(params.OriginalCount, params.RecoveryCount); 29 | return params.BlockBytes * (encode_work_count + decode_work_count); 30 | } 31 | 32 | 33 | // Perform single encoding operation, return false if it fails 34 | bool leopard_benchmark_encode( 35 | ECC_bench_params params, 36 | size_t encode_work_count, 37 | void** original_data, 38 | void** parity_data, 39 | OperationTimer& encode_time) 40 | { 41 | // Generate recovery data 42 | encode_time.BeginCall(); 43 | LeopardResult encodeResult = leo_encode( 44 | params.BlockBytes, 45 | params.OriginalCount, 46 | params.RecoveryCount, 47 | encode_work_count, 48 | original_data, 49 | parity_data 50 | ); 51 | encode_time.EndCall(); 52 | 53 | if (encodeResult != Leopard_Success) 54 | { 55 | printf(" leo_encode failed: %s\n", leo_result_string(encodeResult)); 56 | return false; 57 | } 58 | 59 | return true; 60 | } 61 | 62 | 63 | // Perform single decoding operation, return false if it fails 64 | bool leopard_benchmark_decode( 65 | ECC_bench_params params, 66 | size_t decode_work_count, 67 | void** originalFileData_losing_one, 68 | void** recoveryBlocks, 69 | void** decoderWorkArea, 70 | OperationTimer& decode_time) 71 | { 72 | decode_time.BeginCall(); 73 | LeopardResult decodeResult = leo_decode( 74 | params.BlockBytes, 75 | params.OriginalCount, 76 | params.RecoveryCount, 77 | decode_work_count, 78 | originalFileData_losing_one, 79 | recoveryBlocks, 80 | decoderWorkArea); 81 | decode_time.EndCall(); 82 | 83 | if (decodeResult != Leopard_Success) 84 | { 85 | printf(" leo_decode-one failed: %s\n", leo_result_string(decodeResult)); 86 | return false; 87 | } 88 | 89 | return true; 90 | } 91 | 92 | 93 | // Benchmark library and print results, return false if anything failed 94 | bool leopard_benchmark_main(ECC_bench_params params, uint8_t* buffer) 95 | { 96 | // Total encode/decode times 97 | OperationTimer encode_time, decode_one_time, decode_all_time; 98 | 99 | if (leo_init()) { 100 | 
printf("leo_init failed\n"); 101 | return false; 102 | } 103 | 104 | size_t encode_work_count = leo_encode_work_count(params.OriginalCount, params.RecoveryCount); 105 | size_t decode_work_count = leo_decode_work_count(params.OriginalCount, params.RecoveryCount); 106 | 107 | if (encode_work_count == 0) // 0 means unsupported data+parity combination 108 | return false; 109 | 110 | // Print CPU SIMD extensions used to accelerate library in this run 111 | // (depends on compilation options such as -mavx2 and actual CPU) 112 | printf("Leopard (%s, %d-bit):\n", 113 | #ifndef GF256_TARGET_MOBILE 114 | # ifdef GF256_TRY_AVX2 115 | leopard::CpuHasAVX2? "avx2": 116 | # endif 117 | leopard::CpuHasSSSE3? "ssse3": 118 | #endif 119 | #if defined(GF256_TRY_NEON) 120 | leopard::CpuHasNeon64? "neon64": 121 | leopard::CpuHasNeon? "neon": 122 | #endif 123 | "", sizeof(size_t)*8); 124 | 125 | // Pointers to data 126 | std::vector original_data(params.OriginalCount); 127 | std::vector original_data_losing_one(params.OriginalCount); 128 | std::vector original_data_losing_most_possible(params.OriginalCount); 129 | std::vector encode_work_data(encode_work_count); 130 | std::vector decode_work_data(decode_work_count); 131 | 132 | for (unsigned i = 0; i < params.OriginalCount; ++i) { 133 | original_data[i] = buffer; 134 | // Lose only the first block 135 | original_data_losing_one[i] = (i==0? nullptr : buffer); 136 | // Lose up to RecoveryCount blocks 137 | original_data_losing_most_possible[i] = (i < params.RecoveryCount? 
nullptr : buffer); 138 | buffer += params.BlockBytes; 139 | } 140 | for (unsigned i = 0; i < encode_work_count; ++i) { 141 | encode_work_data[i] = buffer; 142 | buffer += params.BlockBytes; 143 | } 144 | for (unsigned i = 0; i < decode_work_count; ++i) { 145 | decode_work_data[i] = buffer; 146 | buffer += params.BlockBytes; 147 | } 148 | 149 | // It's exactly like original_data[] bit with the first block lost 150 | // so we have to repair it 151 | original_data_losing_one[0] = nullptr; 152 | 153 | void** originalFileData = (void**)&original_data[0]; 154 | void** recoveryBlocks = (void**)&encode_work_data[0]; // recovery data written here 155 | void** decoderWorkArea = (void**)&decode_work_data[0]; 156 | void** originalFileData_losing_one = (void**)&original_data_losing_one[0]; 157 | void** originalFileData_losing_most_possible = (void**)&original_data_losing_most_possible[0]; 158 | 159 | // Repeat benchmark multiple times to improve its accuracy 160 | for (int trial = 0; trial < params.Trials; ++trial) 161 | { 162 | if (! leopard_benchmark_encode(params, encode_work_count, 163 | originalFileData, recoveryBlocks, encode_time)) { 164 | return false; 165 | } 166 | if (! leopard_benchmark_decode(params, decode_work_count, 167 | originalFileData_losing_one, recoveryBlocks, decoderWorkArea, decode_one_time)) { 168 | return false; 169 | } 170 | if (! 
leopard_benchmark_decode(params, decode_work_count, 171 | originalFileData_losing_most_possible, recoveryBlocks, decoderWorkArea, decode_all_time)) { 172 | return false; 173 | } 174 | } 175 | 176 | // Benchmark reports for each operation 177 | encode_time.Print("encode", params.OriginalFileBytes()); 178 | decode_one_time.Print("decode one", params.BlockBytes); 179 | decode_all_time.Print("decode all", params.RecoveryDataBytes()); 180 | 181 | return true; 182 | } 183 | -------------------------------------------------------------------------------- /src/benchmark_wirehair.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Benchmarking Wirehair library: https://github.com/catid/wirehair 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "common.h" 10 | 11 | #include "gf256.h" 12 | #include "WirehairTools.cpp" 13 | #include "WirehairCodec.cpp" 14 | #include "wirehair.cpp" 15 | 16 | 17 | // Perform single encoding operation, return false if it fails 18 | bool wirehair_benchmark_encode( 19 | ECC_bench_params params, 20 | uint8_t* originalFileData, 21 | uint8_t* recoveryBlocks, 22 | WirehairCodec& encoder) 23 | { 24 | // Create encoder 25 | encoder = wirehair_encoder_create( 26 | encoder, // [Optional] Pointer to prior codec object 27 | originalFileData, // Pointer to message 28 | params.OriginalFileBytes(), // Bytes in the message 29 | params.BlockBytes); // Bytes in an output block 30 | 31 | if (!encoder) { 32 | printf("wirehair_encoder_create failed\n"); 33 | return false; 34 | } 35 | 36 | // Generate recovery data 37 | for (int i = 0; i < params.RecoveryCount; ++i) 38 | { 39 | auto blockId = i + params.OriginalCount; 40 | auto blockSize = params.BlockBytes; 41 | auto blockPtr = recoveryBlocks + i * blockSize; 42 | 43 | // Encode a packet 44 | uint32_t writeLen = 0; 45 | WirehairResult encodeResult = wirehair_encode( 46 | encoder, // Pointer to codec from wirehair_encoder_create() 47 | blockId, // 
Identifier of block to generate 48 | blockPtr, // Pointer to output block data 49 | blockSize, // Bytes in the output buffer 50 | &writeLen); // Number of bytes written <= blockBytes 51 | 52 | if (encodeResult != Wirehair_Success || writeLen != blockSize) 53 | { 54 | printf("wirehair_encode failed: %s\n", wirehair_result_string(encodeResult)); 55 | return false; 56 | } 57 | } 58 | 59 | return true; 60 | } 61 | 62 | 63 | // Perform single operation decoding single lost block, return false if it fails 64 | bool wirehair_benchmark_decode_one_block( 65 | ECC_bench_params params, 66 | uint8_t* originalFileData, 67 | uint8_t* recoveryBlocks, 68 | WirehairCodec& decoder) 69 | { 70 | // Create decoder 71 | decoder = wirehair_decoder_create( 72 | decoder, // Codec object to reuse 73 | params.OriginalFileBytes(), // Bytes in the message to decode 74 | params.BlockBytes); // Bytes in each encoded block 75 | 76 | if (!decoder) { 77 | printf("wirehair_decoder_create failed\n"); 78 | return false; 79 | } 80 | 81 | auto blockSize = params.BlockBytes; 82 | 83 | 84 | // Simulate loss of the first data block, 85 | // using instead as much recovery blocks as required by the codec 86 | for (int blockId = 1; blockId < params.OriginalCount + params.RecoveryCount; ++blockId) 87 | { 88 | auto blockPtr = originalFileData + blockId * blockSize; 89 | 90 | // Attempt decode 91 | WirehairResult decodeResult = wirehair_decode( 92 | decoder, // Pointer to codec from wirehair_decoder_create() 93 | blockId, // ID number of received block 94 | blockPtr, // Pointer to block data 95 | blockSize); // Number of bytes in the data block 96 | 97 | // If decoder returns success: 98 | if (decodeResult == Wirehair_Success) { 99 | // Decoder has enough data to recover now 100 | goto recover; 101 | } 102 | 103 | if (decodeResult != Wirehair_NeedMore) { 104 | printf("wirehair_decode failed: %s\n", wirehair_result_string(decodeResult)); 105 | return false; 106 | } 107 | } 108 | 109 | 
printf("wirehair_benchmark_decode_one_block failed: not enough data for recovery\n"); 110 | return false; 111 | 112 | 113 | recover: 114 | // Now let's recover the first data block 115 | auto blockId = 0; 116 | auto blockPtr = originalFileData; 117 | 118 | uint32_t writeLen = 0; 119 | WirehairResult recoverResult = wirehair_recover_block( 120 | decoder, // Pointer to codec from wirehair_decoder_create() 121 | blockId, // ID of the block to reconstruct 122 | blockPtr, // Pointer to block data 123 | &writeLen // Set to the number of data bytes in the block 124 | ); 125 | 126 | if (recoverResult != Wirehair_Success || writeLen != blockSize) { 127 | printf("wirehair_recover_block failed: %s\n", wirehair_result_string(recoverResult)); 128 | return false; 129 | } 130 | 131 | /* Altenatively, we can recover the entire original data that works only slightly slower 132 | (probably because it memcpy's more data): 133 | 134 | WirehairResult recoverResult = wirehair_recover( 135 | decoder, // Pointer to codec from wirehair_decoder_create() 136 | originalFileData, // Buffer where reconstructed message will be written 137 | params.OriginalFileBytes() // Bytes in the message 138 | ); 139 | */ 140 | return true; 141 | } 142 | 143 | 144 | // Perform single operation decoding as much blocks as possible, return false if it fails 145 | bool wirehair_benchmark_decode_all_blocks( 146 | ECC_bench_params params, 147 | uint8_t* originalFileData, 148 | uint8_t* recoveryBlocks, 149 | WirehairCodec& decoder) 150 | { 151 | // Create decoder 152 | decoder = wirehair_decoder_create( 153 | decoder, // Codec object to reuse 154 | params.OriginalFileBytes(), // Bytes in the message to decode 155 | params.BlockBytes); // Bytes in each encoded block 156 | 157 | if (!decoder) { 158 | printf("wirehair_decoder_create failed\n"); 159 | return false; 160 | } 161 | 162 | auto blockSize = params.BlockBytes; 163 | 164 | 165 | // Simulate loss of as much data blocks as possible, 166 | // using instead all the 
recovery blocks available 167 | for (int blockId = params.OriginalCount + params.RecoveryCount; --blockId > 0 ; ) 168 | { 169 | auto blockPtr = originalFileData + blockId * blockSize; 170 | 171 | // Attempt decode 172 | WirehairResult decodeResult = wirehair_decode( 173 | decoder, // Pointer to codec from wirehair_decoder_create() 174 | blockId, // ID number of received block 175 | blockPtr, // Pointer to block data 176 | blockSize); // Number of bytes in the data block 177 | 178 | // If decoder returns success: 179 | if (decodeResult == Wirehair_Success) { 180 | // Decoder has enough data to recover now 181 | goto recover; 182 | } 183 | 184 | if (decodeResult != Wirehair_NeedMore) { 185 | printf("wirehair_decode failed: %s\n", wirehair_result_string(decodeResult)); 186 | return false; 187 | } 188 | } 189 | 190 | printf("wirehair_benchmark_decode_all_blocks failed: not enough data for recovery\n"); 191 | return false; 192 | 193 | 194 | recover: 195 | // Now let's recover the entire buffer 196 | WirehairResult recoverResult = wirehair_recover( 197 | decoder, // Pointer to codec from wirehair_decoder_create() 198 | originalFileData, // Buffer where reconstructed message will be written 199 | params.OriginalFileBytes() // Bytes in the message 200 | ); 201 | 202 | if (recoverResult != Wirehair_Success) { 203 | printf("wirehair_recover failed: %s\n", wirehair_result_string(recoverResult)); 204 | return false; 205 | } 206 | 207 | return true; 208 | } 209 | 210 | 211 | // Benchmark library and print results, return false if anything failed 212 | bool wirehair_benchmark_main(ECC_bench_params params, uint8_t* buffer) 213 | { 214 | // Initialize the library 215 | const WirehairResult initResult = wirehair_init(); 216 | if (initResult != Wirehair_Success) { 217 | printf("wirehair_init failed: %s\n", wirehair_result_string(initResult)); 218 | return false; 219 | } 220 | 221 | // Introduce himself 222 | printf("Wirehair (%d-bit):\n", sizeof(size_t)*8); 223 | 224 | // 
Automatically free codecs memory 225 | struct FreeCodecs{ 226 | WirehairCodec encoder = nullptr, decoder_one = nullptr, decoder_all = nullptr; 227 | ~FreeCodecs() { 228 | wirehair_free(encoder); 229 | wirehair_free(decoder_one); 230 | wirehair_free(decoder_all); 231 | } 232 | } codecs; 233 | 234 | 235 | // Places for original and parity data 236 | auto originalFileData = buffer; 237 | auto recoveryBlocks = buffer + params.OriginalFileBytes(); 238 | 239 | // Total encode/decode times 240 | OperationTimer encode_time, decode_one_time, decode_all_time; 241 | 242 | // Repeat benchmark multiple times to improve its accuracy 243 | for (int trial = 0; trial < params.Trials; ++trial) 244 | { 245 | encode_time.BeginCall(); 246 | if (! wirehair_benchmark_encode(params, originalFileData, recoveryBlocks, codecs.encoder)) { 247 | return false; 248 | } 249 | encode_time.EndCall(); 250 | decode_one_time.BeginCall(); 251 | if (! wirehair_benchmark_decode_one_block(params, originalFileData, recoveryBlocks, codecs.decoder_one)) { 252 | return false; 253 | } 254 | decode_one_time.EndCall(); 255 | decode_all_time.BeginCall(); 256 | if (! 
wirehair_benchmark_decode_all_blocks(params, originalFileData, recoveryBlocks, codecs.decoder_all)) { 257 | return false; 258 | } 259 | decode_all_time.EndCall(); 260 | } 261 | 262 | // Benchmark reports for each operation 263 | encode_time.Print("encode", params.OriginalFileBytes()); 264 | decode_one_time.Print("decode one", params.BlockBytes); 265 | decode_all_time.Print("decode all", params.RecoveryDataBytes()); 266 | 267 | return true; 268 | } 269 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #include "cm256.h" 2 | #include "../unit_test/SiameseTools.h" 3 | 4 | 5 | struct ECC_bench_params : cm256_encoder_params 6 | { 7 | // Repeat benchmark multiple times to improve its accuracy 8 | int Trials; 9 | 10 | // Size of the original file 11 | size_t OriginalFileBytes() { return OriginalCount * BlockBytes;} 12 | 13 | // Size of the original file 14 | size_t RecoveryDataBytes() { return RecoveryCount * BlockBytes;} 15 | }; 16 | 17 | 18 | // Benchmark each library and print results, return false if anything failed 19 | bool cm256_benchmark_main(ECC_bench_params params, uint8_t* buffer); 20 | bool leopard_benchmark_main(ECC_bench_params params, uint8_t* buffer); 21 | bool fastecc_benchmark_main(ECC_bench_params params, uint8_t* buffer); 22 | bool wirehair_benchmark_main(ECC_bench_params params, uint8_t* buffer); 23 | 24 | // Extra workspace used by each library on top of place required for original data 25 | size_t leopard_extra_space(ECC_bench_params params); 26 | size_t fastecc_extra_space(ECC_bench_params params); 27 | 28 | // Write benchmark results to logfile 29 | void write_to_logfile(const char* operation, int invocations, double microseconds_per_call, double megabytes_per_second); 30 | 31 | 32 | //----------------------------------------------------------------------------- 33 | class OperationTimer 34 | { 35 | public: 36 | void 
// Round x up to the nearest power of two (0 -> 0, 1 -> 1, 3 -> 4, ...)
inline uint64_t NextPow2(uint64_t x)
{
    if (x <= 1)
        return x;                 // 0 and 1 map to themselves
    // Find the bit position just above the highest set bit of x-1;
    // shifting 1 there yields the smallest power of two >= x.
    uint64_t rest = x - 1;
    int bits = 1;
    while ((rest >>= 1) != 0)
        ++bits;
    return uint64_t(1) << bits;
}
benchmark_wirehair.cpp -I../external/cm256/include -I../external/leopard -I../external/FastECC -I../external/wirehair -I../external/wirehair/include 3 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "common.h" 4 | 5 | #include "../unit_test/SiameseTools.cpp" 6 | 7 | #define BUFSIZE_ALIGNMENT 64 /* at least 16 for SSE intrinsics, and at least 64 for Leopard */ 8 | #define align_up(value, ALIGNMENT) ((((value) + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT) 9 | 10 | 11 | // Benchmark parameters set at cmdline 12 | ECC_bench_params params; 13 | 14 | // Currently benchmarked library 15 | const char *library = ""; 16 | 17 | // File to save benchmark results 18 | FILE* logfile = NULL; 19 | 20 | // Write benchmark results to logfile 21 | void write_to_logfile(const char* operation, int invocations, double microseconds_per_call, double megabytes_per_second) 22 | { 23 | if (logfile) 24 | { 25 | fprintf(logfile, "%d,%d,%d,%s,%s,%d,%lf,%lf\n", 26 | params.OriginalCount, params.RecoveryCount, params.BlockBytes, 27 | library, operation, 28 | invocations, microseconds_per_call, megabytes_per_second); 29 | fflush(logfile); 30 | } 31 | } 32 | 33 | 34 | // Parse ECC parameters from cmdline 35 | void parse_cmdline(int argc, char** argv) 36 | { 37 | // Number of blocks 38 | params.OriginalCount = 50; 39 | 40 | // Number of additional recovery blocks generated by encoder 41 | params.RecoveryCount = 50; 42 | 43 | // Number of bytes per file block 44 | params.BlockBytes = 4096; 45 | 46 | // Repeat benchmark multiple times to improve its accuracy 47 | params.Trials = 1000; 48 | 49 | if (argc==1) printf("Usage: bench data_blocks parity_blocks chunk_size trials logfile\n"); 50 | if (argc>1) params.OriginalCount = atoi(argv[1]); 51 | if (argc>2) params.RecoveryCount = atoi(argv[2]); 52 | if (argc>3) params.BlockBytes = 
atoi(argv[3]); 53 | if (argc>4) params.Trials = atoi(argv[4]); 54 | if (argc>5) logfile = fopen(argv[5],"a"); 55 | 56 | // Round up for compatibility with all benchmarked libraries 57 | params.BlockBytes = align_up(params.BlockBytes, BUFSIZE_ALIGNMENT); 58 | 59 | printf("Params: data_blocks=%d parity_blocks=%d chunk_size=%d trials=%d\n", 60 | params.OriginalCount, params.RecoveryCount, params.BlockBytes, params.Trials); 61 | } 62 | 63 | 64 | // Try to seize a CPU core into exclusive use by this thread 65 | void occupy_cpu_core() 66 | { 67 | // Increase process/thread priorities to ensure repeatable results 68 | #ifdef _WIN32 69 | ::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS); 70 | ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_HIGHEST); 71 | #endif 72 | std::this_thread::sleep_for(std::chrono::milliseconds(100)); 73 | } 74 | 75 | 76 | // Benchmark all libraries using parameters provided on cmdline 77 | int main(int argc, char** argv) 78 | { 79 | // Setup benchmark configuration based on cmdline options 80 | parse_cmdline(argc, argv); 81 | 82 | // Alloc single buffer large enough for any operation in any tested library 83 | size_t bufsize = params.OriginalFileBytes() + 84 | std::max(params.RecoveryDataBytes(), // CM256/Wirehair extra space 85 | std::max(leopard_extra_space(params), 86 | fastecc_extra_space(params))); 87 | auto buffer = new uint8_t[bufsize + BUFSIZE_ALIGNMENT]; 88 | 89 | // Align buffer start for compatibility with all benchmarked libraries 90 | buffer = (uint8_t*) align_up(uintptr_t(buffer), BUFSIZE_ALIGNMENT); 91 | 92 | // Fill place allocated for the file contents with random numbers. 93 | // It's critical to fill it with non-repeating data 94 | // since some libraries rely on table lookups 95 | // and can get unfair speedup on repeated data. 
96 | for (size_t i = 0; i < params.OriginalFileBytes(); ++i) { 97 | buffer[i] = (uint8_t)((i*123456791) >> 13); 98 | } 99 | 100 | // Benchmark each library 101 | occupy_cpu_core(); 102 | library = "CM256"; cm256_benchmark_main(params, buffer); 103 | library = "Leopard"; leopard_benchmark_main(params, buffer); 104 | library = "FastECC"; fastecc_benchmark_main(params, buffer); 105 | library = "Wirehair"; wirehair_benchmark_main(params, buffer); 106 | 107 | if (logfile) fclose(logfile); 108 | return 0; 109 | } 110 | --------------------------------------------------------------------------------