├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md └── src ├── benchmark_cm256.cpp ├── benchmark_fastecc.cpp ├── benchmark_leopard.cpp ├── benchmark_wirehair.cpp ├── common.h ├── compile.cmd └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *.exe 2 | 3 | ## Ignore Visual Studio temporary files, build results, and 4 | ## files generated by popular Visual Studio add-ons. 5 | ## 6 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 7 | 8 | # User-specific files 9 | *.rsuser 10 | *.suo 11 | *.user 12 | *.userosscache 13 | *.sln.docstates 14 | 15 | # User-specific files (MonoDevelop/Xamarin Studio) 16 | *.userprefs 17 | 18 | # Mono auto generated files 19 | mono_crash.* 20 | 21 | # Build results 22 | [Dd]ebug/ 23 | [Dd]ebugPublic/ 24 | [Rr]elease/ 25 | [Rr]eleases/ 26 | x64/ 27 | x86/ 28 | [Aa][Rr][Mm]/ 29 | [Aa][Rr][Mm]64/ 30 | bld/ 31 | [Bb]in/ 32 | [Oo]bj/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # StyleCop 67 | StyleCopReport.xml 68 | 69 | # Files built by Visual Studio 70 | *_i.c 71 | *_p.c 72 | *_h.h 73 | *.ilk 74 | *.meta 75 | *.obj 76 | *.iobj 77 | *.pch 78 | *.pdb 79 | *.ipdb 80 | *.pgc 81 | *.pgd 82 | *.rsp 83 | *.sbr 84 | *.tlb 85 | *.tli 86 | *.tlh 87 | *.tmp 88 | *.tmp_proj 89 | *_wpftmp.csproj 90 | *.log 91 | 
*.vspscc 92 | *.vssscc 93 | .builds 94 | *.pidb 95 | *.svclog 96 | *.scc 97 | 98 | # Chutzpah Test files 99 | _Chutzpah* 100 | 101 | # Visual C++ cache files 102 | ipch/ 103 | *.aps 104 | *.ncb 105 | *.opendb 106 | *.opensdf 107 | *.sdf 108 | *.cachefile 109 | *.VC.db 110 | *.VC.VC.opendb 111 | 112 | # Visual Studio profiler 113 | *.psess 114 | *.vsp 115 | *.vspx 116 | *.sap 117 | 118 | # Visual Studio Trace Files 119 | *.e2e 120 | 121 | # TFS 2012 Local Workspace 122 | $tf/ 123 | 124 | # Guidance Automation Toolkit 125 | *.gpState 126 | 127 | # ReSharper is a .NET coding add-in 128 | _ReSharper*/ 129 | *.[Rr]e[Ss]harper 130 | *.DotSettings.user 131 | 132 | # TeamCity is a build add-in 133 | _TeamCity* 134 | 135 | # DotCover is a Code Coverage Tool 136 | *.dotCover 137 | 138 | # AxoCover is a Code Coverage Tool 139 | .axoCover/* 140 | !.axoCover/settings.json 141 | 142 | # Visual Studio code coverage results 143 | *.coverage 144 | *.coveragexml 145 | 146 | # NCrunch 147 | _NCrunch_* 148 | .*crunch*.local.xml 149 | nCrunchTemp_* 150 | 151 | # MightyMoose 152 | *.mm.* 153 | AutoTest.Net/ 154 | 155 | # Web workbench (sass) 156 | .sass-cache/ 157 | 158 | # Installshield output folder 159 | [Ee]xpress/ 160 | 161 | # DocProject is a documentation generator add-in 162 | DocProject/buildhelp/ 163 | DocProject/Help/*.HxT 164 | DocProject/Help/*.HxC 165 | DocProject/Help/*.hhc 166 | DocProject/Help/*.hhk 167 | DocProject/Help/*.hhp 168 | DocProject/Help/Html2 169 | DocProject/Help/html 170 | 171 | # Click-Once directory 172 | publish/ 173 | 174 | # Publish Web Output 175 | *.[Pp]ublish.xml 176 | *.azurePubxml 177 | # Note: Comment the next line if you want to checkin your web deploy settings, 178 | # but database connection strings (with potential passwords) will be unencrypted 179 | *.pubxml 180 | *.publishproj 181 | 182 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 183 | # checkin your Azure Web App publish settings, but sensitive information contained 184 | # in these scripts will be unencrypted 185 | PublishScripts/ 186 | 187 | # NuGet Packages 188 | *.nupkg 189 | # NuGet Symbol Packages 190 | *.snupkg 191 | # The packages folder can be ignored because of Package Restore 192 | **/[Pp]ackages/* 193 | # except build/, which is used as an MSBuild target. 194 | !**/[Pp]ackages/build/ 195 | # Uncomment if necessary however generally it will be regenerated when needed 196 | #!**/[Pp]ackages/repositories.config 197 | # NuGet v3's project.json files produces more ignorable files 198 | *.nuget.props 199 | *.nuget.targets 200 | 201 | # Microsoft Azure Build Output 202 | csx/ 203 | *.build.csdef 204 | 205 | # Microsoft Azure Emulator 206 | ecf/ 207 | rcf/ 208 | 209 | # Windows Store app package directories and files 210 | AppPackages/ 211 | BundleArtifacts/ 212 | Package.StoreAssociation.xml 213 | _pkginfo.txt 214 | *.appx 215 | *.appxbundle 216 | *.appxupload 217 | 218 | # Visual Studio cache files 219 | # files ending in .cache can be ignored 220 | *.[Cc]ache 221 | # but keep track of directories ending in .cache 222 | !?*.[Cc]ache/ 223 | 224 | # Others 225 | ClientBin/ 226 | ~$* 227 | *~ 228 | *.dbmdl 229 | *.dbproj.schemaview 230 | *.jfm 231 | *.pfx 232 | *.publishsettings 233 | orleans.codegen.cs 234 | 235 | # Including strong name files can present a security risk 236 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 237 | #*.snk 238 | 239 | # Since there are multiple workflows, uncomment next line to ignore bower_components 240 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 241 | #bower_components/ 242 | 243 | # RIA/Silverlight projects 244 | Generated_Code/ 245 | 246 | # Backup & report files from converting an old project file 247 | # to a newer Visual Studio version. 
Backup files are not needed, 248 | # because we have git ;-) 249 | _UpgradeReport_Files/ 250 | Backup*/ 251 | UpgradeLog*.XML 252 | UpgradeLog*.htm 253 | ServiceFabricBackup/ 254 | *.rptproj.bak 255 | 256 | # SQL Server files 257 | *.mdf 258 | *.ldf 259 | *.ndf 260 | 261 | # Business Intelligence projects 262 | *.rdl.data 263 | *.bim.layout 264 | *.bim_*.settings 265 | *.rptproj.rsuser 266 | *- [Bb]ackup.rdl 267 | *- [Bb]ackup ([0-9]).rdl 268 | *- [Bb]ackup ([0-9][0-9]).rdl 269 | 270 | # Microsoft Fakes 271 | FakesAssemblies/ 272 | 273 | # GhostDoc plugin setting file 274 | *.GhostDoc.xml 275 | 276 | # Node.js Tools for Visual Studio 277 | .ntvs_analysis.dat 278 | node_modules/ 279 | 280 | # Visual Studio 6 build log 281 | *.plg 282 | 283 | # Visual Studio 6 workspace options file 284 | *.opt 285 | 286 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 287 | *.vbw 288 | 289 | # Visual Studio LightSwitch build output 290 | **/*.HTMLClient/GeneratedArtifacts 291 | **/*.DesktopClient/GeneratedArtifacts 292 | **/*.DesktopClient/ModelManifest.xml 293 | **/*.Server/GeneratedArtifacts 294 | **/*.Server/ModelManifest.xml 295 | _Pvt_Extensions 296 | 297 | # Paket dependency manager 298 | .paket/paket.exe 299 | paket-files/ 300 | 301 | # FAKE - F# Make 302 | .fake/ 303 | 304 | # CodeRush personal settings 305 | .cr/personal 306 | 307 | # Python Tools for Visual Studio (PTVS) 308 | __pycache__/ 309 | *.pyc 310 | 311 | # Cake - Uncomment if you are using it 312 | # tools/** 313 | # !tools/packages.config 314 | 315 | # Tabs Studio 316 | *.tss 317 | 318 | # Telerik's JustMock configuration file 319 | *.jmconfig 320 | 321 | # BizTalk build output 322 | *.btp.cs 323 | *.btm.cs 324 | *.odx.cs 325 | *.xsd.cs 326 | 327 | # OpenCover UI analysis results 328 | OpenCover/ 329 | 330 | # Azure Stream Analytics local run output 331 | ASALocalRun/ 332 | 333 | # MSBuild Binary and Structured Log 334 | *.binlog 335 | 336 | # NVidia Nsight GPU debugger 
configuration file 337 | *.nvuser 338 | 339 | # MFractors (Xamarin productivity tool) working folder 340 | .mfractor/ 341 | 342 | # Local History for Visual Studio 343 | .localhistory/ 344 | 345 | # BeatPulse healthcheck temp database 346 | healthchecksdb 347 | 348 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 349 | MigrationBackup/ 350 | 351 | # Ionide (cross platform F# VS Code tools) working folder 352 | .ionide/ 353 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/cm256"] 2 | path = external/cm256 3 | url = https://github.com/catid/cm256 4 | [submodule "external/leopard"] 5 | path = external/leopard 6 | url = https://github.com/catid/leopard 7 | [submodule "external/FastECC"] 8 | path = external/FastECC 9 | url = https://github.com/Bulat-Ziganshin/FastECC 10 | [submodule "external/wirehair"] 11 | path = external/wirehair 12 | url = https://github.com/catid/wirehair 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Bulat-Ziganshin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Comparison of leading error-correcting code implementations 2 | 3 | We plan to compare: 4 | - O(N^2) Reed-Solomon codecs: 5 | - [x] [CM256](https://github.com/catid/cm256) - GF(2^8) 6 | - [ ] [Intel ISA-L](https://github.com/intel/isa-l) - GF(2^8) 7 | - O(N*log(N)) Reed-Solomon codecs: 8 | - [x] [Leopard](https://github.com/catid/leopard) - uses [FWHT](https://en.wikipedia.org/wiki/Fast_Walsh%E2%80%93Hadamard_transform) in GF(2^8) or GF(2^16), up to 2^16 blocks, data blocks >= parity blocks 9 | - [x] [FastECC](https://github.com/Bulat-Ziganshin/FastECC) - uses FFT in GF(p), up to 2^20 blocks 10 | - O(N) non-MDS codec: 11 | - [x] [Wirehair](https://github.com/catid/wirehair) - fountain code, up to 64000 data blocks 12 | 13 | SIMD usage: 14 | - CM256, Leopard and Wirehair provides AVX2/SSSE3/Neon64/Neon-optimized code paths 15 | - Intel ISA-L provides AVX512/AVX2/AVX/SSSE3/Neon/SVE/VSX-optimized code paths 16 | - FastECC provides AVX2/SSE2-optimized code paths 17 | 18 | So far, the benchmark is single-threaded. Leopard and FastECC have built-in OpenMP support, which may be enabled by adding `-fopenmp` to the compilation commands. 
19 | 20 | 21 | ## Results 22 | 23 | Notes: 24 | - 80+20 means 80 data blocks and 20 parity blocks 25 | - Encoding speeds are measured in terms of original data processed 26 | - Decoding speeds are measured in terms of recovered data produced: 27 | - first test recovers single block, so `speed = one block size / time` 28 | - second test recovers as much blocks as code can do, so `speed = size of all parity blocks / time` 29 | - Each program run involves multiple "trials", 1000 by default, and we compute average time of trial 30 | - Formatted results are represented by the best runs among multiple experiments 31 | - Raw results are the single runs, just for quick comparison 32 | - Block sizes for each run were optimized to fit all data into L3 cache, but fixed to 4 KB for large codewords 33 | - Benchmark CPU is i7-8665U (4C/8T Skylake running at 3.3-4.5 GHz) 34 | 35 | 36 | ### Formatted results of CM256 37 | 38 | CM256: 39 | 40 | | AVX2 | Encoding all | Decoding one | Decoding all | 41 | | -------: | -----------: | ------------: | ------------: | 42 | | 200+50 | 643 MB/s | 156 MB/s | 159 MB/s | 43 | | 50+50 | 635 MB/s | 636 MB/s | 716 MB/s | 44 | | 80+20 | 1650 MB/s | 403 MB/s | 413 MB/s | 45 | | 20+20 | 1606 MB/s | 1562 MB/s | 1880 MB/s | 46 | 47 | | SSSE3 | Encoding all | Decoding one | Decoding all | 48 | | -------: | -----------: | ------------: | ------------: | 49 | | 200+50 | 336 MB/s | 84 MB/s | 86 MB/s | 50 | | 50+50 | 346 MB/s | 339 MB/s | 352 MB/s | 51 | | 80+20 | 882 MB/s | 212 MB/s | 219 MB/s | 52 | | 20+20 | 892 MB/s | 866 MB/s | 892 MB/s | 53 | 54 | 55 | ### Raw results with AVX2 56 | 57 | ``` 58 | D:\>bench_avx2 200 50 16384 100 59 | Params: data_blocks=200 parity_blocks=50 chunk_size=16384 trials=100 60 | CM256 (avx2, 64-bit): 61 | encode: 5219 usec, 628 MB/s 62 | decode one: 109 usec, 151 MB/s 63 | decode all: 4677 usec, 175 MB/s 64 | Leopard (avx2, 64-bit): 65 | encode: 1377 usec, 2379 MB/s 66 | decode one: 4574 usec, 4 MB/s 67 | decode all: 4401 
usec, 186 MB/s 68 | FastECC 0xfff00001 32-bit 69 | encode: 7129 usec, 460 MB/s 70 | Wirehair (64-bit): 71 | encode: 2272 usec, 1443 MB/s 72 | decode one: 2506 usec, 7 MB/s 73 | decode all: 3061 usec, 268 MB/s 74 | 75 | D:\>bench_avx2 50 50 16384 1000 76 | Params: data_blocks=50 parity_blocks=50 chunk_size=16384 trials=1000 77 | CM256 (avx2, 64-bit): 78 | encode: 1306 usec, 627 MB/s 79 | decode one: 27 usec, 606 MB/s 80 | decode all: 1182 usec, 693 MB/s 81 | Leopard (avx2, 64-bit): 82 | encode: 228 usec, 3593 MB/s 83 | decode one: 599 usec, 27 MB/s 84 | decode all: 673 usec, 1218 MB/s 85 | FastECC 0xfff00001 32-bit 86 | encode: 1324 usec, 619 MB/s 87 | Wirehair (64-bit): 88 | encode: 603 usec, 1360 MB/s 89 | decode one: 527 usec, 31 MB/s 90 | decode all: 678 usec, 1209 MB/s 91 | 92 | D:\>bench_avx2 80 20 16384 1000 93 | Params: data_blocks=80 parity_blocks=20 chunk_size=16384 trials=1000 94 | CM256 (avx2, 64-bit): 95 | encode: 851 usec, 1540 MB/s 96 | decode one: 44 usec, 370 MB/s 97 | decode all: 755 usec, 434 MB/s 98 | Leopard (avx2, 64-bit): 99 | encode: 239 usec, 5485 MB/s 100 | decode one: 594 usec, 28 MB/s 101 | decode all: 620 usec, 529 MB/s 102 | FastECC 0xfff00001 32-bit 103 | encode: 3227 usec, 406 MB/s 104 | Wirehair (64-bit): 105 | encode: 977 usec, 1342 MB/s 106 | decode one: 1069 usec, 15 MB/s 107 | decode all: 1225 usec, 268 MB/s 108 | 109 | D:\>bench_avx2 20 20 65536 500 110 | Params: data_blocks=20 parity_blocks=20 chunk_size=65536 trials=500 111 | CM256 (avx2, 64-bit): 112 | encode: 1230 usec, 1066 MB/s 113 | decode one: 62 usec, 1053 MB/s 114 | decode all: 1238 usec, 1059 MB/s 115 | Leopard (avx2, 64-bit): 116 | encode: 586 usec, 2235 MB/s 117 | decode one: 1536 usec, 43 MB/s 118 | decode all: 1571 usec, 834 MB/s 119 | FastECC 0xfff00001 32-bit 120 | encode: 2378 usec, 551 MB/s 121 | Wirehair (64-bit): 122 | encode: 1643 usec, 798 MB/s 123 | decode one: 1579 usec, 41 MB/s 124 | decode all: 1808 usec, 725 MB/s 125 | ``` 126 | 127 | 128 | ### Raw 
results with SSSE3 129 | 130 | ``` 131 | D:\>bench_sse4 200 50 16384 100 132 | Params: data_blocks=200 parity_blocks=50 chunk_size=16384 trials=100 133 | CM256 (ssse3, 64-bit): 134 | encode: 11353 usec, 289 MB/s 135 | decode one: 233 usec, 70 MB/s 136 | decode all: 11088 usec, 74 MB/s 137 | Leopard (ssse3, 64-bit): 138 | encode: 2558 usec, 1281 MB/s 139 | decode one: 7345 usec, 2 MB/s 140 | decode all: 7490 usec, 109 MB/s 141 | FastECC 0xfff00001 32-bit 142 | encode: 11768 usec, 278 MB/s 143 | Wirehair (64-bit): 144 | encode: 2955 usec, 1109 MB/s 145 | decode one: 3164 usec, 5 MB/s 146 | decode all: 3615 usec, 227 MB/s 147 | 148 | D:\>bench_sse4 50 50 16384 1000 149 | Params: data_blocks=50 parity_blocks=50 chunk_size=16384 trials=1000 150 | CM256 (ssse3, 64-bit): 151 | encode: 2731 usec, 300 MB/s 152 | decode one: 56 usec, 292 MB/s 153 | decode all: 2719 usec, 301 MB/s 154 | Leopard (ssse3, 64-bit): 155 | encode: 460 usec, 1781 MB/s 156 | decode one: 1131 usec, 14 MB/s 157 | decode all: 1268 usec, 646 MB/s 158 | FastECC 0xfff00001 32-bit 159 | encode: 2123 usec, 386 MB/s 160 | Wirehair (64-bit): 161 | encode: 970 usec, 844 MB/s 162 | decode one: 828 usec, 20 MB/s 163 | decode all: 1051 usec, 780 MB/s 164 | 165 | D:\>bench_sse4 80 20 16384 1000 166 | Params: data_blocks=80 parity_blocks=20 chunk_size=16384 trials=1000 167 | CM256 (ssse3, 64-bit): 168 | encode: 1689 usec, 776 MB/s 169 | decode one: 88 usec, 187 MB/s 170 | decode all: 1699 usec, 193 MB/s 171 | Leopard (ssse3, 64-bit): 172 | encode: 436 usec, 3006 MB/s 173 | decode one: 1115 usec, 15 MB/s 174 | decode all: 1152 usec, 284 MB/s 175 | FastECC 0xfff00001 32-bit 176 | encode: 4840 usec, 271 MB/s 177 | Wirehair (64-bit): 178 | encode: 1192 usec, 1100 MB/s 179 | decode one: 1275 usec, 13 MB/s 180 | decode all: 1408 usec, 233 MB/s 181 | 182 | D:\>bench_sse4 20 20 65536 500 183 | Params: data_blocks=20 parity_blocks=20 chunk_size=65536 trials=500 184 | CM256 (ssse3, 64-bit): 185 | encode: 1872 usec, 700 MB/s 
186 | decode one: 97 usec, 674 MB/s 187 | decode all: 1864 usec, 703 MB/s 188 | Leopard (ssse3, 64-bit): 189 | encode: 866 usec, 1514 MB/s 190 | decode one: 2250 usec, 29 MB/s 191 | decode all: 2377 usec, 551 MB/s 192 | FastECC 0xfff00001 32-bit 193 | encode: 3749 usec, 350 MB/s 194 | Wirehair (64-bit): 195 | encode: 2267 usec, 578 MB/s 196 | decode one: 2087 usec, 31 MB/s 197 | decode all: 2341 usec, 560 MB/s 198 | ``` 199 | 200 | 201 | ### Raw results for larger codewords 202 | 203 | ``` 204 | D:\>bench_avx2 2048 2048 4096 100 205 | Params: data_blocks=2048 parity_blocks=2048 chunk_size=4096 trials=100 206 | Leopard (avx2, 64-bit): 207 | encode: 8612 usec, 974 MB/s 208 | decode one: 18663 usec, 0 MB/s 209 | decode all: 21211 usec, 395 MB/s 210 | FastECC 0xfff00001 32-bit 211 | encode: 23819 usec, 352 MB/s 212 | Wirehair (64-bit): 213 | encode: 8301 usec, 1011 MB/s 214 | decode one: 6920 usec, 1 MB/s 215 | decode all: 9668 usec, 868 MB/s 216 | 217 | D:\>bench_avx2 32000 32000 4096 20 218 | Params: data_blocks=32000 parity_blocks=32000 chunk_size=4096 trials=20 219 | Leopard (avx2, 64-bit): 220 | encode: 216624 usec, 605 MB/s 221 | decode one: 427401 usec, 0 MB/s 222 | decode all: 515774 usec, 254 MB/s 223 | FastECC 0xfff00001 32-bit 224 | encode: 584607 usec, 224 MB/s 225 | Wirehair (64-bit): 226 | encode: 245237 usec, 534 MB/s 227 | decode one: 197916 usec, 0 MB/s 228 | decode all: 272011 usec, 482 MB/s 229 | ``` 230 | 231 | Now the same with OpenMP: 232 | ``` 233 | D:\>bench_avx2_openmp 2048 2048 4096 100 234 | Params: data_blocks=2048 parity_blocks=2048 chunk_size=4096 trials=100 235 | Leopard (avx2, 64-bit): 236 | encode: 6204 usec, 1352 MB/s 237 | decode one: 36027 usec, 0 MB/s 238 | decode all: 37741 usec, 222 MB/s 239 | FastECC 0xfff00001 32-bit 240 | encode: 7182 usec, 1168 MB/s 241 | Wirehair (64-bit): 242 | encode: 8446 usec, 993 MB/s 243 | decode one: 6943 usec, 1 MB/s 244 | decode all: 9709 usec, 864 MB/s 245 | 246 | D:\>bench_avx2_openmp 32000 32000 
4096 20 247 | Params: data_blocks=32000 parity_blocks=32000 chunk_size=4096 trials=20 248 | Leopard (avx2, 64-bit): 249 | encode: 206935 usec, 633 MB/s 250 | decode one: 880574 usec, 0 MB/s 251 | decode all: 963278 usec, 136 MB/s 252 | FastECC 0xfff00001 32-bit 253 | encode: 209445 usec, 626 MB/s 254 | Wirehair (64-bit): 255 | encode: 257553 usec, 509 MB/s 256 | decode one: 202161 usec, 0 MB/s 257 | decode all: 284040 usec, 461 MB/s 258 | ``` 259 | 260 | 261 | ## Conclusions 262 | 263 | ### Encoding speed 264 | 265 | O(N^2) algorithms encoding speed reported in THIS benchmark 266 | is O(1/number_of_parity_words). It's why: 267 | 268 | So-called O(N^2) algorithms really are `O(M*K)`. 269 | It's because the RS matrix algo multiples vector of M words (input data) 270 | by `K*M` matrix and gets vector of K words (parity), 271 | which requires `K*M` multiplications and additions. 272 | 273 | When you have any `O(K*M)` algo with M input words and K output words, 274 | you can say that its speed is O(1/K) relative to input data processed 275 | or O(1/M) relative to output data produced :slight_smile: 276 | This benchmark reports encoding speed relative to input data size, 277 | so matrix RS algos speed is O(1/number_of_parity_words) 278 | 279 | As of cache effects, I optimized the chunk size for each M+K setting 280 | to reach best results. For larger codewords it means smaller chunks 281 | and thus a bit higher overheads, but effect was within 1% 282 | (i.e. for 20+20 I used 64KB blocks, but even with 4KB blocks 283 | it will be only 10% slower) 284 | 285 | 286 | ### Recovery speed 287 | 288 | In O(N^2) RS algos, recovery of multiple blocks is just 289 | recovery of a single block performed multiple times. 290 | Thus, speed per block is the same (modulo setup time). 291 | More concrete, for K data blocks, M parity blocks, 292 | and blocksize B, encoding time is `O(K*M*B)`. 
293 | Decoding L lost blocks will take `O(K*L*B)` 294 | (it combines K survived blocks to recompute each lost block). 295 | 296 | But in fast RS algos, single block recovery requires almost 297 | the same amount of work as recovery of all lost blocks 298 | in the worst case, since FFT+IFFT steps don't depend on 299 | the amount of blocks we are going to recover. 300 | 301 | Thus, matrix algorithms will always be faster for recovery 302 | of only one or few missing blocks 303 | 304 | We can counterfight that by using matrix computations 305 | for small recoveries in fast algos too. At least it's 306 | possible for FastECC. This requires computation of 307 | Newton polynomial thus O(N^2) divisions - but it probably 308 | is still faster than O(B\*N\*log(N)) multiplications 309 | required for full decoding. 310 | 311 | 312 | ### Precomputed tables 313 | 314 | ISA-L API is more low-level - you can compute encoding tables 315 | just once and use them in multiple calls. It's especially 316 | important when we want to process a stream with many gigabytes 317 | using just a few megabytes of memory. 318 | 319 | Moreover, it may be possible to use CM256-computed tables with ISA-L. 320 | They have [two advantages](https://github.com/catid/cm256#comparisons-with-other-libraries) over ISA-L tables: 321 | - first parity block is just XOR of all data blocks 322 | - recovery tables are computed faster 323 | 324 | When encoding or decoding operations with the same parameters 325 | are repeated multiple times, it can make sense to keep cache 326 | of such tables in order to avoid costly initialization. 327 | The most obvious example is recovery of data of missed node 328 | in ECC-protected distributed storage like [Codex](https://github.com/status-im/nim-codex). 329 | 330 | 331 | ### Art of benchmarking 332 | 333 | Overall, proper benchmarking is an art of its own. 
334 | AVX usually runs at slower frequencies and have weird implementation, 335 | this means that we better skip a first millisecond of its execution 336 | and don't mix AVX and non-AVX code. 337 | 338 | Mobile CPUs are tend to lower freqs on load, especially on m/t load, 339 | and after prolonged load they may further lower freq due to overheating. 340 | So, ideally we should skip a first few trials and then measure fastest one 341 | (when CPU had highest freq). 342 | But afair, cpu time measure sometimes may be incorrect when thread is switched 343 | to another core, so we have either to pin task to a single core or drop a few outliers. 344 | 345 | 346 | -------------------------------------------------------------------------------- /src/benchmark_cm256.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Benchmarking CM256 library: https://github.com/catid/cm256 3 | // 4 | 5 | #include 6 | #include 7 | 8 | #include "../src/gf256.cpp" 9 | 10 | #include "common.h" 11 | 12 | 13 | // Perform single encoding operation, return false if it fails 14 | bool cm256_benchmark_encode( 15 | ECC_bench_params params, 16 | uint8_t* originalFileData, 17 | uint8_t* recoveryBlocks, 18 | OperationTimer& encode_time) 19 | { 20 | // Pointers to data 21 | cm256_block blocks[256]; 22 | for (int i = 0; i < params.OriginalCount; ++i) 23 | { 24 | blocks[i].Block = originalFileData + i * params.BlockBytes; 25 | } 26 | 27 | encode_time.BeginCall(); 28 | // Generate recovery data 29 | if (cm256_encode(params, blocks, recoveryBlocks)) 30 | { 31 | printf(" cm256_encode failed\n"); 32 | return false; 33 | } 34 | encode_time.EndCall(); 35 | 36 | return true; 37 | } 38 | 39 | 40 | // Perform single operation decoding single lost block, return false if it fails 41 | bool cm256_benchmark_decode_one_block( 42 | ECC_bench_params params, 43 | uint8_t* originalFileData, 44 | uint8_t* recoveryBlocks, 45 | OperationTimer& decode_time) 46 | { 47 | // Pointers 
to data 48 | cm256_block blocks[256]; 49 | 50 | // Initialize the indices 51 | for (int i = 0; i < params.OriginalCount; ++i) 52 | { 53 | blocks[i].Block = originalFileData + i * params.BlockBytes; 54 | blocks[i].Index = cm256_get_original_block_index(params, i); 55 | } 56 | 57 | //// Simulate loss of data, subsituting a recovery block in its place //// 58 | int lostBlock = params.RecoveryCount==1? 0 : 1; // Since recovery block #0 recovers much faster using just XORs 59 | blocks[0].Block = recoveryBlocks + lostBlock * params.BlockBytes; // A recovery block 60 | blocks[0].Index = cm256_get_recovery_block_index(params, lostBlock); // A recovery block index 61 | //// Simulate loss of data, subsituting a recovery block in its place //// 62 | 63 | decode_time.BeginCall(); 64 | if (cm256_decode(params, blocks)) 65 | { 66 | printf(" cm256_decode failed\n"); 67 | return false; 68 | } 69 | decode_time.EndCall(); 70 | 71 | // blocks[0].Index will now be = lostBlock 72 | // and blocks[0].Block overwritten with recovered data 73 | 74 | return true; 75 | } 76 | 77 | 78 | // Perform single operation decoding as much blocks as possible, return false if it fails 79 | bool cm256_benchmark_decode_all_blocks( 80 | ECC_bench_params params, 81 | uint8_t* originalFileData, 82 | uint8_t* recoveryBlocks, 83 | OperationTimer& decode_time) 84 | { 85 | // Pointers to data 86 | cm256_block blocks[256]; 87 | 88 | // Initialize the indices for recovery operation 89 | for (int i = 0; i < params.OriginalCount; ++i) 90 | { 91 | if (i < params.RecoveryCount) { 92 | // Simulate loss of data, subsituting a recovery block in its place 93 | blocks[i].Block = recoveryBlocks + i * params.BlockBytes; // recovery block 94 | blocks[i].Index = cm256_get_recovery_block_index(params, i); // recovery block index 95 | } else { 96 | blocks[i].Block = originalFileData + i * params.BlockBytes; // data block 97 | blocks[i].Index = cm256_get_original_block_index(params, i); // data block index 98 | } 99 | } 100 | 
101 | decode_time.BeginCall(); 102 | if (cm256_decode(params, blocks)) 103 | { 104 | printf(" cm256_decode failed\n"); 105 | return false; 106 | } 107 | decode_time.EndCall(); 108 | 109 | // For each i, 110 | // blocks[i].Index will now be = cm256_get_original_block_index(params, i) 111 | // and blocks[i].Block overwritten with recovered data of this block 112 | 113 | return true; 114 | } 115 | 116 | 117 | // Benchmark library and print results, return false if anything failed 118 | bool cm256_benchmark_main(ECC_bench_params params, uint8_t* buffer) 119 | { 120 | if (params.OriginalCount + params.RecoveryCount > 256) 121 | return false; 122 | 123 | // Initialize library and choose CPU SIMD extension to use 124 | if (cm256_init()) { 125 | printf("cm256_init failed\n"); 126 | return false; 127 | } 128 | 129 | // Print CPU SIMD extensions used to accelerate library in this run 130 | // (depends on compilation options such as -mavx2 and actual CPU) 131 | printf("CM256 (%s, %d-bit):\n", 132 | #ifndef GF256_TARGET_MOBILE 133 | # ifdef GF256_TRY_AVX2 134 | CpuHasAVX2? "avx2": 135 | # endif 136 | CpuHasSSSE3? "ssse3": 137 | #endif 138 | #if defined(GF256_TRY_NEON) 139 | CpuHasNeon64? "neon64": 140 | CpuHasNeon? "neon": 141 | #endif 142 | "", sizeof(size_t)*8); 143 | 144 | 145 | // Places for original and parity data 146 | auto originalFileData = buffer; 147 | auto recoveryBlocks = buffer + params.OriginalFileBytes(); 148 | 149 | // Total encode/decode times 150 | OperationTimer encode_time, decode_one_time, decode_all_time; 151 | 152 | // Repeat benchmark multiple times to improve its accuracy 153 | for (int trial = 0; trial < params.Trials; ++trial) 154 | { 155 | if (! cm256_benchmark_encode(params, originalFileData, recoveryBlocks, encode_time)) { 156 | return false; 157 | } 158 | if (! cm256_benchmark_decode_one_block(params, originalFileData, recoveryBlocks, decode_one_time)) { 159 | return false; 160 | } 161 | if (! 
cm256_benchmark_encode(params, originalFileData, recoveryBlocks, encode_time)) { 162 | return false; 163 | } 164 | if (! cm256_benchmark_decode_all_blocks(params, originalFileData, recoveryBlocks, decode_all_time)) { 165 | return false; 166 | } 167 | } 168 | 169 | // Benchmark reports for each operation 170 | encode_time.Print("encode", params.OriginalFileBytes()); 171 | decode_one_time.Print("decode one", params.BlockBytes); 172 | decode_all_time.Print("decode all", params.RecoveryDataBytes()); 173 | 174 | return true; 175 | } 176 | -------------------------------------------------------------------------------- /src/benchmark_fastecc.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Benchmarking FastECC library: https://github.com/Bulat-Ziganshin/FastECC 3 | // 4 | // Unfortunately, this early version of the library doesn't export ready-to-use encoder 5 | // so we literally copied this code from RS.cpp: 6 | // 7 | // Implementation of the Reed-Solomon algo in O(N*log(N)) using Number-Theoretical Transform in GF(p) 8 | // 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "common.h" 15 | 16 | #include "GF(p).cpp" 17 | #include "ntt.cpp" 18 | 19 | 20 | // Extra workspace used by the library on top of place required for original data 21 | size_t fastecc_extra_space(ECC_bench_params params) 22 | { 23 | return params.BlockBytes * params.OriginalCount; 24 | } 25 | 26 | 27 | // Benchmark encoding using the Reed-Solomon algo 28 | template 29 | void EncodeReedSolomon (size_t N, size_t SIZE, T **data) 30 | { 31 | // 1. iNTT: polynomial interpolation. We find coefficients of order-N polynomial describing the source data 32 | MFA_NTT (data, N, SIZE, true); 33 | // Now we should divide results by N in order to get coefficients, but we combined this operation with the multiplication below 34 | 35 | // Now we can evaluate the polynomial at 2*N points. 
36 | // Points with even index will contain the source data, 37 | // while points with odd indexes may be used as ECC data. 38 | // But more efficient approach is to compute only odd-indexed points. 39 | // This is accomplished by the following steps: 40 | 41 | // 2. Multiply the polynomial coefficients by root(2*N)**i 42 | T root_2N = GF_Root(2*N), inv_N = GF_Inv(N); 43 | #pragma omp parallel for 44 | for (ptrdiff_t i=0; i (inv_N, GF_Pow(root_2N,i)); // root_2N**i / N (combine division by N with multiplication by powers of the root) 46 | T* __restrict__ block = data[i]; 47 | for (size_t k=0; k (block[k], root_i); 49 | } 50 | } 51 | 52 | // 3. NTT: polynomial evaluation. This evaluates the modified polynomial at root(N)**i points, 53 | // that is equivalent to evaluation of the original polynomial at root(2*N)**(2*i+1) points. 54 | MFA_NTT (data, N, SIZE, false); 55 | 56 | // Further optimization: in order to compute only even-indexed points, 57 | // it's enough to compute order-N/2 NTT of data[i]+data[i+N/2]. And so on... 
58 | } 59 | 60 | 61 | template 62 | bool fastecc_benchmark_specialize(ECC_bench_params params, uint8_t* buffer) 63 | { 64 | // Total encode/decode times 65 | OperationTimer encode_time, decode_one_time, decode_all_time; 66 | 67 | size_t N = NextPow2( std::max( params.OriginalCount, params.RecoveryCount)); // NTT order 68 | size_t SIZE = params.BlockBytes / sizeof(T); 69 | 70 | // Use extra space because algorithm overwrites data in-place 71 | T *data0 = (T*) (buffer + params.OriginalFileBytes()); 72 | 73 | // Fill space with values < P (larger values are incompatible with FastECC algorithm) 74 | for (size_t i=0; i (N, SIZE, data); 90 | encode_time.EndCall(); 91 | } 92 | 93 | // Benchmark reports for each operation 94 | encode_time.Print("encode", params.OriginalFileBytes()); 95 | 96 | return true; 97 | } 98 | 99 | 100 | // Benchmark library and print results, return false if anything failed 101 | bool fastecc_benchmark_main(ECC_bench_params params, uint8_t* buffer) 102 | { 103 | return fastecc_benchmark_specialize (params, buffer); 104 | } 105 | -------------------------------------------------------------------------------- /src/benchmark_leopard.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Benchmarking Leopard library: https://github.com/catid/leopard 3 | // 4 | 5 | #include 6 | #include 7 | 8 | #include "common.h" 9 | 10 | #include "LeopardFF8.cpp" 11 | #undef LEO_MUL_128 12 | #undef LEO_MULADD_128 13 | #undef LEO_MUL_256 14 | #undef LEO_MULADD_256 15 | #undef LEO_IFFTB_256 16 | #undef LEO_IFFTB_128 17 | #undef LEO_FFTB_256 18 | #undef LEO_FFTB_128 19 | #include "LeopardFF16.cpp" 20 | #include "LeopardCommon.cpp" 21 | #include "leopard.cpp" 22 | 23 | 24 | // Extra workspace used by the library on top of place required for original data 25 | size_t leopard_extra_space(ECC_bench_params params) 26 | { 27 | size_t encode_work_count = leo_encode_work_count(params.OriginalCount, params.RecoveryCount); 28 | size_t 
decode_work_count = leo_decode_work_count(params.OriginalCount, params.RecoveryCount); 29 | return params.BlockBytes * (encode_work_count + decode_work_count); 30 | } 31 | 32 | 33 | // Perform single encoding operation, return false if it fails 34 | bool leopard_benchmark_encode( 35 | ECC_bench_params params, 36 | size_t encode_work_count, 37 | void** original_data, 38 | void** parity_data, 39 | OperationTimer& encode_time) 40 | { 41 | // Generate recovery data 42 | encode_time.BeginCall(); 43 | LeopardResult encodeResult = leo_encode( 44 | params.BlockBytes, 45 | params.OriginalCount, 46 | params.RecoveryCount, 47 | encode_work_count, 48 | original_data, 49 | parity_data 50 | ); 51 | encode_time.EndCall(); 52 | 53 | if (encodeResult != Leopard_Success) 54 | { 55 | printf(" leo_encode failed: %s\n", leo_result_string(encodeResult)); 56 | return false; 57 | } 58 | 59 | return true; 60 | } 61 | 62 | 63 | // Perform single decoding operation, return false if it fails 64 | bool leopard_benchmark_decode( 65 | ECC_bench_params params, 66 | size_t decode_work_count, 67 | void** originalFileData_losing_one, 68 | void** recoveryBlocks, 69 | void** decoderWorkArea, 70 | OperationTimer& decode_time) 71 | { 72 | decode_time.BeginCall(); 73 | LeopardResult decodeResult = leo_decode( 74 | params.BlockBytes, 75 | params.OriginalCount, 76 | params.RecoveryCount, 77 | decode_work_count, 78 | originalFileData_losing_one, 79 | recoveryBlocks, 80 | decoderWorkArea); 81 | decode_time.EndCall(); 82 | 83 | if (decodeResult != Leopard_Success) 84 | { 85 | printf(" leo_decode-one failed: %s\n", leo_result_string(decodeResult)); 86 | return false; 87 | } 88 | 89 | return true; 90 | } 91 | 92 | 93 | // Benchmark library and print results, return false if anything failed 94 | bool leopard_benchmark_main(ECC_bench_params params, uint8_t* buffer) 95 | { 96 | // Total encode/decode times 97 | OperationTimer encode_time, decode_one_time, decode_all_time; 98 | 99 | if (leo_init()) { 100 | 
printf("leo_init failed\n"); 101 | return false; 102 | } 103 | 104 | size_t encode_work_count = leo_encode_work_count(params.OriginalCount, params.RecoveryCount); 105 | size_t decode_work_count = leo_decode_work_count(params.OriginalCount, params.RecoveryCount); 106 | 107 | if (encode_work_count == 0) // 0 means unsupported data+parity combination 108 | return false; 109 | 110 | // Print CPU SIMD extensions used to accelerate library in this run 111 | // (depends on compilation options such as -mavx2 and actual CPU) 112 | printf("Leopard (%s, %d-bit):\n", 113 | #ifndef GF256_TARGET_MOBILE 114 | # ifdef GF256_TRY_AVX2 115 | leopard::CpuHasAVX2? "avx2": 116 | # endif 117 | leopard::CpuHasSSSE3? "ssse3": 118 | #endif 119 | #if defined(GF256_TRY_NEON) 120 | leopard::CpuHasNeon64? "neon64": 121 | leopard::CpuHasNeon? "neon": 122 | #endif 123 | "", sizeof(size_t)*8); 124 | 125 | // Pointers to data 126 | std::vector original_data(params.OriginalCount); 127 | std::vector original_data_losing_one(params.OriginalCount); 128 | std::vector original_data_losing_most_possible(params.OriginalCount); 129 | std::vector encode_work_data(encode_work_count); 130 | std::vector decode_work_data(decode_work_count); 131 | 132 | for (unsigned i = 0; i < params.OriginalCount; ++i) { 133 | original_data[i] = buffer; 134 | // Lose only the first block 135 | original_data_losing_one[i] = (i==0? nullptr : buffer); 136 | // Lose up to RecoveryCount blocks 137 | original_data_losing_most_possible[i] = (i < params.RecoveryCount? 
nullptr : buffer); 138 | buffer += params.BlockBytes; 139 | } 140 | for (unsigned i = 0; i < encode_work_count; ++i) { 141 | encode_work_data[i] = buffer; 142 | buffer += params.BlockBytes; 143 | } 144 | for (unsigned i = 0; i < decode_work_count; ++i) { 145 | decode_work_data[i] = buffer; 146 | buffer += params.BlockBytes; 147 | } 148 | 149 | // It's exactly like original_data[] bit with the first block lost 150 | // so we have to repair it 151 | original_data_losing_one[0] = nullptr; 152 | 153 | void** originalFileData = (void**)&original_data[0]; 154 | void** recoveryBlocks = (void**)&encode_work_data[0]; // recovery data written here 155 | void** decoderWorkArea = (void**)&decode_work_data[0]; 156 | void** originalFileData_losing_one = (void**)&original_data_losing_one[0]; 157 | void** originalFileData_losing_most_possible = (void**)&original_data_losing_most_possible[0]; 158 | 159 | // Repeat benchmark multiple times to improve its accuracy 160 | for (int trial = 0; trial < params.Trials; ++trial) 161 | { 162 | if (! leopard_benchmark_encode(params, encode_work_count, 163 | originalFileData, recoveryBlocks, encode_time)) { 164 | return false; 165 | } 166 | if (! leopard_benchmark_decode(params, decode_work_count, 167 | originalFileData_losing_one, recoveryBlocks, decoderWorkArea, decode_one_time)) { 168 | return false; 169 | } 170 | if (! 
leopard_benchmark_decode(params, decode_work_count, 171 | originalFileData_losing_most_possible, recoveryBlocks, decoderWorkArea, decode_all_time)) { 172 | return false; 173 | } 174 | } 175 | 176 | // Benchmark reports for each operation 177 | encode_time.Print("encode", params.OriginalFileBytes()); 178 | decode_one_time.Print("decode one", params.BlockBytes); 179 | decode_all_time.Print("decode all", params.RecoveryDataBytes()); 180 | 181 | return true; 182 | } 183 | -------------------------------------------------------------------------------- /src/benchmark_wirehair.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Benchmarking Wirehair library: https://github.com/catid/wirehair 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "common.h" 10 | 11 | #include "gf256.h" 12 | #include "WirehairTools.cpp" 13 | #include "WirehairCodec.cpp" 14 | #include "wirehair.cpp" 15 | 16 | 17 | // Perform single encoding operation, return false if it fails 18 | bool wirehair_benchmark_encode( 19 | ECC_bench_params params, 20 | uint8_t* originalFileData, 21 | uint8_t* recoveryBlocks, 22 | WirehairCodec& encoder) 23 | { 24 | // Create encoder 25 | encoder = wirehair_encoder_create( 26 | encoder, // [Optional] Pointer to prior codec object 27 | originalFileData, // Pointer to message 28 | params.OriginalFileBytes(), // Bytes in the message 29 | params.BlockBytes); // Bytes in an output block 30 | 31 | if (!encoder) { 32 | printf("wirehair_encoder_create failed\n"); 33 | return false; 34 | } 35 | 36 | // Generate recovery data 37 | for (int i = 0; i < params.RecoveryCount; ++i) 38 | { 39 | auto blockId = i + params.OriginalCount; 40 | auto blockSize = params.BlockBytes; 41 | auto blockPtr = recoveryBlocks + i * blockSize; 42 | 43 | // Encode a packet 44 | uint32_t writeLen = 0; 45 | WirehairResult encodeResult = wirehair_encode( 46 | encoder, // Pointer to codec from wirehair_encoder_create() 47 | blockId, // 
Identifier of block to generate 48 | blockPtr, // Pointer to output block data 49 | blockSize, // Bytes in the output buffer 50 | &writeLen); // Number of bytes written <= blockBytes 51 | 52 | if (encodeResult != Wirehair_Success || writeLen != blockSize) 53 | { 54 | printf("wirehair_encode failed: %s\n", wirehair_result_string(encodeResult)); 55 | return false; 56 | } 57 | } 58 | 59 | return true; 60 | } 61 | 62 | 63 | // Perform single operation decoding single lost block, return false if it fails 64 | bool wirehair_benchmark_decode_one_block( 65 | ECC_bench_params params, 66 | uint8_t* originalFileData, 67 | uint8_t* recoveryBlocks, 68 | WirehairCodec& decoder) 69 | { 70 | // Create decoder 71 | decoder = wirehair_decoder_create( 72 | decoder, // Codec object to reuse 73 | params.OriginalFileBytes(), // Bytes in the message to decode 74 | params.BlockBytes); // Bytes in each encoded block 75 | 76 | if (!decoder) { 77 | printf("wirehair_decoder_create failed\n"); 78 | return false; 79 | } 80 | 81 | auto blockSize = params.BlockBytes; 82 | 83 | 84 | // Simulate loss of the first data block, 85 | // using instead as much recovery blocks as required by the codec 86 | for (int blockId = 1; blockId < params.OriginalCount + params.RecoveryCount; ++blockId) 87 | { 88 | auto blockPtr = originalFileData + blockId * blockSize; 89 | 90 | // Attempt decode 91 | WirehairResult decodeResult = wirehair_decode( 92 | decoder, // Pointer to codec from wirehair_decoder_create() 93 | blockId, // ID number of received block 94 | blockPtr, // Pointer to block data 95 | blockSize); // Number of bytes in the data block 96 | 97 | // If decoder returns success: 98 | if (decodeResult == Wirehair_Success) { 99 | // Decoder has enough data to recover now 100 | goto recover; 101 | } 102 | 103 | if (decodeResult != Wirehair_NeedMore) { 104 | printf("wirehair_decode failed: %s\n", wirehair_result_string(decodeResult)); 105 | return false; 106 | } 107 | } 108 | 109 | 
printf("wirehair_benchmark_decode_one_block failed: not enough data for recovery\n"); 110 | return false; 111 | 112 | 113 | recover: 114 | // Now let's recover the first data block 115 | auto blockId = 0; 116 | auto blockPtr = originalFileData; 117 | 118 | uint32_t writeLen = 0; 119 | WirehairResult recoverResult = wirehair_recover_block( 120 | decoder, // Pointer to codec from wirehair_decoder_create() 121 | blockId, // ID of the block to reconstruct 122 | blockPtr, // Pointer to block data 123 | &writeLen // Set to the number of data bytes in the block 124 | ); 125 | 126 | if (recoverResult != Wirehair_Success || writeLen != blockSize) { 127 | printf("wirehair_recover_block failed: %s\n", wirehair_result_string(recoverResult)); 128 | return false; 129 | } 130 | 131 | /* Altenatively, we can recover the entire original data that works only slightly slower 132 | (probably because it memcpy's more data): 133 | 134 | WirehairResult recoverResult = wirehair_recover( 135 | decoder, // Pointer to codec from wirehair_decoder_create() 136 | originalFileData, // Buffer where reconstructed message will be written 137 | params.OriginalFileBytes() // Bytes in the message 138 | ); 139 | */ 140 | return true; 141 | } 142 | 143 | 144 | // Perform single operation decoding as much blocks as possible, return false if it fails 145 | bool wirehair_benchmark_decode_all_blocks( 146 | ECC_bench_params params, 147 | uint8_t* originalFileData, 148 | uint8_t* recoveryBlocks, 149 | WirehairCodec& decoder) 150 | { 151 | // Create decoder 152 | decoder = wirehair_decoder_create( 153 | decoder, // Codec object to reuse 154 | params.OriginalFileBytes(), // Bytes in the message to decode 155 | params.BlockBytes); // Bytes in each encoded block 156 | 157 | if (!decoder) { 158 | printf("wirehair_decoder_create failed\n"); 159 | return false; 160 | } 161 | 162 | auto blockSize = params.BlockBytes; 163 | 164 | 165 | // Simulate loss of as much data blocks as possible, 166 | // using instead all the 
recovery blocks available 167 | for (int blockId = params.OriginalCount + params.RecoveryCount; --blockId > 0 ; ) 168 | { 169 | auto blockPtr = originalFileData + blockId * blockSize; 170 | 171 | // Attempt decode 172 | WirehairResult decodeResult = wirehair_decode( 173 | decoder, // Pointer to codec from wirehair_decoder_create() 174 | blockId, // ID number of received block 175 | blockPtr, // Pointer to block data 176 | blockSize); // Number of bytes in the data block 177 | 178 | // If decoder returns success: 179 | if (decodeResult == Wirehair_Success) { 180 | // Decoder has enough data to recover now 181 | goto recover; 182 | } 183 | 184 | if (decodeResult != Wirehair_NeedMore) { 185 | printf("wirehair_decode failed: %s\n", wirehair_result_string(decodeResult)); 186 | return false; 187 | } 188 | } 189 | 190 | printf("wirehair_benchmark_decode_all_blocks failed: not enough data for recovery\n"); 191 | return false; 192 | 193 | 194 | recover: 195 | // Now let's recover the entire buffer 196 | WirehairResult recoverResult = wirehair_recover( 197 | decoder, // Pointer to codec from wirehair_decoder_create() 198 | originalFileData, // Buffer where reconstructed message will be written 199 | params.OriginalFileBytes() // Bytes in the message 200 | ); 201 | 202 | if (recoverResult != Wirehair_Success) { 203 | printf("wirehair_recover failed: %s\n", wirehair_result_string(recoverResult)); 204 | return false; 205 | } 206 | 207 | return true; 208 | } 209 | 210 | 211 | // Benchmark library and print results, return false if anything failed 212 | bool wirehair_benchmark_main(ECC_bench_params params, uint8_t* buffer) 213 | { 214 | // Initialize the library 215 | const WirehairResult initResult = wirehair_init(); 216 | if (initResult != Wirehair_Success) { 217 | printf("wirehair_init failed: %s\n", wirehair_result_string(initResult)); 218 | return false; 219 | } 220 | 221 | // Introduce himself 222 | printf("Wirehair (%d-bit):\n", sizeof(size_t)*8); 223 | 224 | // 
Automatically free codecs memory 225 | struct FreeCodecs{ 226 | WirehairCodec encoder = nullptr, decoder_one = nullptr, decoder_all = nullptr; 227 | ~FreeCodecs() { 228 | wirehair_free(encoder); 229 | wirehair_free(decoder_one); 230 | wirehair_free(decoder_all); 231 | } 232 | } codecs; 233 | 234 | 235 | // Places for original and parity data 236 | auto originalFileData = buffer; 237 | auto recoveryBlocks = buffer + params.OriginalFileBytes(); 238 | 239 | // Total encode/decode times 240 | OperationTimer encode_time, decode_one_time, decode_all_time; 241 | 242 | // Repeat benchmark multiple times to improve its accuracy 243 | for (int trial = 0; trial < params.Trials; ++trial) 244 | { 245 | encode_time.BeginCall(); 246 | if (! wirehair_benchmark_encode(params, originalFileData, recoveryBlocks, codecs.encoder)) { 247 | return false; 248 | } 249 | encode_time.EndCall(); 250 | decode_one_time.BeginCall(); 251 | if (! wirehair_benchmark_decode_one_block(params, originalFileData, recoveryBlocks, codecs.decoder_one)) { 252 | return false; 253 | } 254 | decode_one_time.EndCall(); 255 | decode_all_time.BeginCall(); 256 | if (! 
wirehair_benchmark_decode_all_blocks(params, originalFileData, recoveryBlocks, codecs.decoder_all)) { 257 | return false; 258 | } 259 | decode_all_time.EndCall(); 260 | } 261 | 262 | // Benchmark reports for each operation 263 | encode_time.Print("encode", params.OriginalFileBytes()); 264 | decode_one_time.Print("decode one", params.BlockBytes); 265 | decode_all_time.Print("decode all", params.RecoveryDataBytes()); 266 | 267 | return true; 268 | } 269 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #include "cm256.h" 2 | #include "../unit_test/SiameseTools.h" 3 | 4 | 5 | struct ECC_bench_params : cm256_encoder_params 6 | { 7 | // Repeat benchmark multiple times to improve its accuracy 8 | int Trials; 9 | 10 | // Size of the original file 11 | size_t OriginalFileBytes() { return OriginalCount * BlockBytes;} 12 | 13 | // Size of the original file 14 | size_t RecoveryDataBytes() { return RecoveryCount * BlockBytes;} 15 | }; 16 | 17 | 18 | // Benchmark each library and print results, return false if anything failed 19 | bool cm256_benchmark_main(ECC_bench_params params, uint8_t* buffer); 20 | bool leopard_benchmark_main(ECC_bench_params params, uint8_t* buffer); 21 | bool fastecc_benchmark_main(ECC_bench_params params, uint8_t* buffer); 22 | bool wirehair_benchmark_main(ECC_bench_params params, uint8_t* buffer); 23 | 24 | // Extra workspace used by each library on top of place required for original data 25 | size_t leopard_extra_space(ECC_bench_params params); 26 | size_t fastecc_extra_space(ECC_bench_params params); 27 | 28 | // Write benchmark results to logfile 29 | void write_to_logfile(const char* operation, int invocations, double microseconds_per_call, double megabytes_per_second); 30 | 31 | 32 | //----------------------------------------------------------------------------- 33 | class OperationTimer 34 | { 35 | public: 36 | void 
// Round x up to the nearest power of two (0 -> 0, 1 -> 1, 3 -> 4, ...)
inline uint64_t NextPow2(uint64_t x)
{
    if (x <= 1)
        return x;                 // 0 and 1 map to themselves
    // Find the bit position just above the highest set bit of x-1;
    // shifting 1 there yields the smallest power of two >= x.
    uint64_t rest = x - 1;
    int bits = 1;
    while ((rest >>= 1) != 0)
        ++bits;
    return uint64_t(1) << bits;
}
benchmark_wirehair.cpp -I../external/cm256/include -I../external/leopard -I../external/FastECC -I../external/wirehair -I../external/wirehair/include 3 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "common.h" 4 | 5 | #include "../unit_test/SiameseTools.cpp" 6 | 7 | #define BUFSIZE_ALIGNMENT 64 /* at least 16 for SSE intrinsics, and at least 64 for Leopard */ 8 | #define align_up(value, ALIGNMENT) ((((value) + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT) 9 | 10 | 11 | // Benchmark parameters set at cmdline 12 | ECC_bench_params params; 13 | 14 | // Currently benchmarked library 15 | const char *library = ""; 16 | 17 | // File to save benchmark results 18 | FILE* logfile = NULL; 19 | 20 | // Write benchmark results to logfile 21 | void write_to_logfile(const char* operation, int invocations, double microseconds_per_call, double megabytes_per_second) 22 | { 23 | if (logfile) 24 | { 25 | fprintf(logfile, "%d,%d,%d,%s,%s,%d,%lf,%lf\n", 26 | params.OriginalCount, params.RecoveryCount, params.BlockBytes, 27 | library, operation, 28 | invocations, microseconds_per_call, megabytes_per_second); 29 | fflush(logfile); 30 | } 31 | } 32 | 33 | 34 | // Parse ECC parameters from cmdline 35 | void parse_cmdline(int argc, char** argv) 36 | { 37 | // Number of blocks 38 | params.OriginalCount = 50; 39 | 40 | // Number of additional recovery blocks generated by encoder 41 | params.RecoveryCount = 50; 42 | 43 | // Number of bytes per file block 44 | params.BlockBytes = 4096; 45 | 46 | // Repeat benchmark multiple times to improve its accuracy 47 | params.Trials = 1000; 48 | 49 | if (argc==1) printf("Usage: bench data_blocks parity_blocks chunk_size trials logfile\n"); 50 | if (argc>1) params.OriginalCount = atoi(argv[1]); 51 | if (argc>2) params.RecoveryCount = atoi(argv[2]); 52 | if (argc>3) params.BlockBytes = 
atoi(argv[3]); 53 | if (argc>4) params.Trials = atoi(argv[4]); 54 | if (argc>5) logfile = fopen(argv[5],"a"); 55 | 56 | // Round up for compatibility with all benchmarked libraries 57 | params.BlockBytes = align_up(params.BlockBytes, BUFSIZE_ALIGNMENT); 58 | 59 | printf("Params: data_blocks=%d parity_blocks=%d chunk_size=%d trials=%d\n", 60 | params.OriginalCount, params.RecoveryCount, params.BlockBytes, params.Trials); 61 | } 62 | 63 | 64 | // Try to seize a CPU core into exclusive use by this thread 65 | void occupy_cpu_core() 66 | { 67 | // Increase process/thread priorities to ensure repeatable results 68 | #ifdef _WIN32 69 | ::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS); 70 | ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_HIGHEST); 71 | #endif 72 | std::this_thread::sleep_for(std::chrono::milliseconds(100)); 73 | } 74 | 75 | 76 | // Benchmark all libraries using parameters provided on cmdline 77 | int main(int argc, char** argv) 78 | { 79 | // Setup benchmark configuration based on cmdline options 80 | parse_cmdline(argc, argv); 81 | 82 | // Alloc single buffer large enough for any operation in any tested library 83 | size_t bufsize = params.OriginalFileBytes() + 84 | std::max(params.RecoveryDataBytes(), // CM256/Wirehair extra space 85 | std::max(leopard_extra_space(params), 86 | fastecc_extra_space(params))); 87 | auto buffer = new uint8_t[bufsize + BUFSIZE_ALIGNMENT]; 88 | 89 | // Align buffer start for compatibility with all benchmarked libraries 90 | buffer = (uint8_t*) align_up(uintptr_t(buffer), BUFSIZE_ALIGNMENT); 91 | 92 | // Fill place allocated for the file contents with random numbers. 93 | // It's critical to fill it with non-repeating data 94 | // since some libraries rely on table lookups 95 | // and can get unfair speedup on repeated data. 
96 | for (size_t i = 0; i < params.OriginalFileBytes(); ++i) { 97 | buffer[i] = (uint8_t)((i*123456791) >> 13); 98 | } 99 | 100 | // Benchmark each library 101 | occupy_cpu_core(); 102 | library = "CM256"; cm256_benchmark_main(params, buffer); 103 | library = "Leopard"; leopard_benchmark_main(params, buffer); 104 | library = "FastECC"; fastecc_benchmark_main(params, buffer); 105 | library = "Wirehair"; wirehair_benchmark_main(params, buffer); 106 | 107 | if (logfile) fclose(logfile); 108 | return 0; 109 | } 110 | --------------------------------------------------------------------------------