├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── fp61.cpp ├── fp61.h └── tests ├── benchmarks.cpp ├── gf256.cpp ├── gf256.h └── tests.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | x64/ 19 | x86/ 20 | bld/ 21 | [Bb]in/ 22 | [Oo]bj/ 23 | [Ll]og/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | # Uncomment if you have tasks that create the project's static files in wwwroot 28 | #wwwroot/ 29 | 30 | # MSTest test Results 31 | [Tt]est[Rr]esult*/ 32 | [Bb]uild[Ll]og.* 33 | 34 | # NUNIT 35 | *.VisualState.xml 36 | TestResult.xml 37 | 38 | # Build Results of an ATL Project 39 | [Dd]ebugPS/ 40 | [Rr]eleasePS/ 41 | dlldata.c 42 | 43 | # DNX 44 | project.lock.json 45 | artifacts/ 46 | 47 | *_i.c 48 | *_p.c 49 | *_i.h 50 | *.ilk 51 | *.meta 52 | *.obj 53 | *.pch 54 | *.pdb 55 | *.pgc 56 | *.pgd 57 | *.rsp 58 | *.sbr 59 | *.tlb 60 | *.tli 61 | *.tlh 62 | *.tmp 63 | *.tmp_proj 64 | *.log 65 | *.vspscc 66 | *.vssscc 67 | .builds 68 | *.pidb 69 | *.svclog 70 | *.scc 71 | 72 | # Chutzpah Test files 73 | _Chutzpah* 74 | 75 | # Visual C++ cache files 76 | ipch/ 77 | *.aps 78 | *.ncb 79 | *.opendb 80 | *.opensdf 81 | *.sdf 82 | *.cachefile 83 | *.VC.db 84 | *.VC.VC.opendb 85 | 86 | # Visual Studio profiler 87 | *.psess 88 | *.vsp 89 | *.vspx 90 | *.sap 91 | 92 | # TFS 2012 Local Workspace 93 | $tf/ 94 | 95 | # Guidance Automation Toolkit 96 | *.gpState 97 | 98 | # ReSharper is a .NET coding add-in 99 | _ReSharper*/ 100 | *.[Rr]e[Ss]harper 101 | *.DotSettings.user 102 | 103 | # JustCode is a .NET coding add-in 104 | 
.JustCode 105 | 106 | # TeamCity is a build add-in 107 | _TeamCity* 108 | 109 | # DotCover is a Code Coverage Tool 110 | *.dotCover 111 | 112 | # NCrunch 113 | _NCrunch_* 114 | .*crunch*.local.xml 115 | nCrunchTemp_* 116 | 117 | # MightyMoose 118 | *.mm.* 119 | AutoTest.Net/ 120 | 121 | # Web workbench (sass) 122 | .sass-cache/ 123 | 124 | # Installshield output folder 125 | [Ee]xpress/ 126 | 127 | # DocProject is a documentation generator add-in 128 | DocProject/buildhelp/ 129 | DocProject/Help/*.HxT 130 | DocProject/Help/*.HxC 131 | DocProject/Help/*.hhc 132 | DocProject/Help/*.hhk 133 | DocProject/Help/*.hhp 134 | DocProject/Help/Html2 135 | DocProject/Help/html 136 | 137 | # Click-Once directory 138 | publish/ 139 | 140 | # Publish Web Output 141 | *.[Pp]ublish.xml 142 | *.azurePubxml 143 | # TODO: Comment the next line if you want to checkin your web deploy settings 144 | # but database connection strings (with potential passwords) will be unencrypted 145 | *.pubxml 146 | *.publishproj 147 | 148 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 149 | # checkin your Azure Web App publish settings, but sensitive information contained 150 | # in these scripts will be unencrypted 151 | PublishScripts/ 152 | 153 | # NuGet Packages 154 | *.nupkg 155 | # The packages folder can be ignored because of Package Restore 156 | **/packages/* 157 | # except build/, which is used as an MSBuild target. 
158 | !**/packages/build/ 159 | # Uncomment if necessary however generally it will be regenerated when needed 160 | #!**/packages/repositories.config 161 | # NuGet v3's project.json files produces more ignoreable files 162 | *.nuget.props 163 | *.nuget.targets 164 | 165 | # Microsoft Azure Build Output 166 | csx/ 167 | *.build.csdef 168 | 169 | # Microsoft Azure Emulator 170 | ecf/ 171 | rcf/ 172 | 173 | # Windows Store app package directories and files 174 | AppPackages/ 175 | BundleArtifacts/ 176 | Package.StoreAssociation.xml 177 | _pkginfo.txt 178 | 179 | # Visual Studio cache files 180 | # files ending in .cache can be ignored 181 | *.[Cc]ache 182 | # but keep track of directories ending in .cache 183 | !*.[Cc]ache/ 184 | 185 | # Others 186 | ClientBin/ 187 | ~$* 188 | *~ 189 | *.dbmdl 190 | *.dbproj.schemaview 191 | *.pfx 192 | *.publishsettings 193 | node_modules/ 194 | orleans.codegen.cs 195 | 196 | # Since there are multiple workflows, uncomment next line to ignore bower_components 197 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 198 | #bower_components/ 199 | 200 | # RIA/Silverlight projects 201 | Generated_Code/ 202 | 203 | # Backup & report files from converting an old project file 204 | # to a newer Visual Studio version. 
Backup files are not needed, 205 | # because we have git ;-) 206 | _UpgradeReport_Files/ 207 | Backup*/ 208 | UpgradeLog*.XML 209 | UpgradeLog*.htm 210 | 211 | # SQL Server files 212 | *.mdf 213 | *.ldf 214 | 215 | # Business Intelligence projects 216 | *.rdl.data 217 | *.bim.layout 218 | *.bim_*.settings 219 | 220 | # Microsoft Fakes 221 | FakesAssemblies/ 222 | 223 | # GhostDoc plugin setting file 224 | *.GhostDoc.xml 225 | 226 | # Node.js Tools for Visual Studio 227 | .ntvs_analysis.dat 228 | 229 | # Visual Studio 6 build log 230 | *.plg 231 | 232 | # Visual Studio 6 workspace options file 233 | *.opt 234 | 235 | # Visual Studio LightSwitch build output 236 | **/*.HTMLClient/GeneratedArtifacts 237 | **/*.DesktopClient/GeneratedArtifacts 238 | **/*.DesktopClient/ModelManifest.xml 239 | **/*.Server/GeneratedArtifacts 240 | **/*.Server/ModelManifest.xml 241 | _Pvt_Extensions 242 | 243 | # Paket dependency manager 244 | .paket/paket.exe 245 | paket-files/ 246 | 247 | # FAKE - F# Make 248 | .fake/ 249 | 250 | # JetBrains Rider 251 | .idea/ 252 | *.sln.iml 253 | *.txt 254 | *.lib 255 | *.exe 256 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(fp61) 3 | 4 | set(CMAKE_CXX_STANDARD 11) 5 | 6 | # Fp61 library source files 7 | set(FP61_LIB_SRCFILES 8 | fp61.cpp 9 | fp61.h) 10 | 11 | add_library(fp61 ${FP61_LIB_SRCFILES}) 12 | 13 | add_executable(tests tests/tests.cpp) 14 | target_link_libraries(tests fp61) 15 | 16 | add_executable(benchmarks 17 | tests/benchmarks.cpp 18 | tests/gf256.h 19 | tests/gf256.cpp) 20 | target_link_libraries(benchmarks fp61) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Chris Taylor 4 
| All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fp61 2 | ## Finite field arithmetic modulo Mersenne prime p = 2^61-1 in C++ 3 | 4 | ### Disclaimer: I don't recommend using Fp61 for erasure codes. This was an experiment to see how it would perform, and unfortunately the results were not good. 
See the benchmarks below. 5 | 6 | This software takes advantage of the commonly available fast 64x64->128 multiplier 7 | to accelerate finite (base) field arithmetic. So it runs a lot faster 8 | when built into a 64-bit executable. 9 | 10 | This math code offers use of lazy reduction techniques for speed, 11 | via fp61::PartialReduce(). 12 | 13 | + Addition of 8 values can be evaluated before reduction. 14 | + Sums of 4 products can be evaluated with partial reductions. 15 | 16 | ## Benchmarks 17 | 18 | The goal of the benchmarks is to determine how fast Fp61 arithmetic is 19 | for the purpose of implementing erasure codes in software. 20 | 21 | *Drumroll...* Results: 22 | 23 | The results are not good at all. The Fp61 encoder is roughly 20x slower 24 | than my GF(2^8) code (gf256). So, I do not recommend using Fp61. 25 | 26 | The majority of the slowdown comes from the ByteReader class that needs 27 | to convert byte data into 61-bit Fp words. So it seems that having an 28 | odd field size to achieve lazy reductions does not help performance. 29 | 30 | *Sad trombone...* 31 | 32 | Benchmarks for Fp61 erasure codes. Before running the benchmarks please run the tests to make sure everything's working on your PC. It's going to run quite a bit faster with 64-bit builds because it takes advantage of the speed of 64-bit multiplications. 
33 | 34 | Testing file size = 10 bytes 35 | N = 2 : gf256_MBPS=250 Fp61_MBPS=65 Fp61_OutputBytes=16 36 | N = 4 : gf256_MBPS=305 Fp61_MBPS=116 Fp61_OutputBytes=16 37 | N = 8 : gf256_MBPS=138 Fp61_MBPS=80 Fp61_OutputBytes=16 38 | N = 16 : gf256_MBPS=337 Fp61_MBPS=110 Fp61_OutputBytes=16 39 | N = 32 : gf256_MBPS=711 Fp61_MBPS=242 Fp61_OutputBytes=16 40 | N = 64 : gf256_MBPS=665 Fp61_MBPS=226 Fp61_OutputBytes=16 41 | N = 128 : gf256_MBPS=868 Fp61_MBPS=297 Fp61_OutputBytes=16 42 | N = 256 : gf256_MBPS=713 Fp61_MBPS=240 Fp61_OutputBytes=16 43 | N = 512 : gf256_MBPS=881 Fp61_MBPS=300 Fp61_OutputBytes=16 44 | Testing file size = 100 bytes 45 | N = 2 : gf256_MBPS=1234 Fp61_MBPS=214 Fp61_OutputBytes=107 46 | N = 4 : gf256_MBPS=4000 Fp61_MBPS=486 Fp61_OutputBytes=107 47 | N = 8 : gf256_MBPS=2631 Fp61_MBPS=328 Fp61_OutputBytes=107 48 | N = 16 : gf256_MBPS=2051 Fp61_MBPS=300 Fp61_OutputBytes=107 49 | N = 32 : gf256_MBPS=3850 Fp61_MBPS=433 Fp61_OutputBytes=107 50 | N = 64 : gf256_MBPS=3972 Fp61_MBPS=428 Fp61_OutputBytes=107 51 | N = 128 : gf256_MBPS=4397 Fp61_MBPS=444 Fp61_OutputBytes=107 52 | N = 256 : gf256_MBPS=5137 Fp61_MBPS=500 Fp61_OutputBytes=107 53 | N = 512 : gf256_MBPS=5129 Fp61_MBPS=492 Fp61_OutputBytes=107 54 | Testing file size = 1000 bytes 55 | N = 2 : gf256_MBPS=10309 Fp61_MBPS=889 Fp61_OutputBytes=1007 56 | N = 4 : gf256_MBPS=15325 Fp61_MBPS=848 Fp61_OutputBytes=1007 57 | N = 8 : gf256_MBPS=9184 Fp61_MBPS=486 Fp61_OutputBytes=1007 58 | N = 16 : gf256_MBPS=12728 Fp61_MBPS=722 Fp61_OutputBytes=1007 59 | N = 32 : gf256_MBPS=11838 Fp61_MBPS=610 Fp61_OutputBytes=1007 60 | N = 64 : gf256_MBPS=10555 Fp61_MBPS=604 Fp61_OutputBytes=1007 61 | N = 128 : gf256_MBPS=11354 Fp61_MBPS=614 Fp61_OutputBytes=1007 62 | N = 256 : gf256_MBPS=14782 Fp61_MBPS=816 Fp61_OutputBytes=1007 63 | N = 512 : gf256_MBPS=18430 Fp61_MBPS=940 Fp61_OutputBytes=1007 64 | Testing file size = 10000 bytes 65 | N = 2 : gf256_MBPS=19138 Fp61_MBPS=893 Fp61_OutputBytes=10004 66 | N = 4 : gf256_MBPS=20283 
Fp61_MBPS=959 Fp61_OutputBytes=10004 67 | N = 8 : gf256_MBPS=20953 Fp61_MBPS=1010 Fp61_OutputBytes=10004 68 | N = 16 : gf256_MBPS=22893 Fp61_MBPS=1056 Fp61_OutputBytes=10004 69 | N = 32 : gf256_MBPS=24461 Fp61_MBPS=1087 Fp61_OutputBytes=10004 70 | N = 64 : gf256_MBPS=22945 Fp61_MBPS=1057 Fp61_OutputBytes=10004 71 | N = 128 : gf256_MBPS=16939 Fp61_MBPS=982 Fp61_OutputBytes=10004 72 | N = 256 : gf256_MBPS=18608 Fp61_MBPS=927 Fp61_OutputBytes=10004 73 | N = 512 : gf256_MBPS=16662 Fp61_MBPS=734 Fp61_OutputBytes=10004 74 | Testing file size = 100000 bytes 75 | N = 2 : gf256_MBPS=22941 Fp61_MBPS=962 Fp61_OutputBytes=100002 76 | N = 4 : gf256_MBPS=22827 Fp61_MBPS=976 Fp61_OutputBytes=100002 77 | N = 8 : gf256_MBPS=16210 Fp61_MBPS=1052 Fp61_OutputBytes=100002 78 | N = 16 : gf256_MBPS=17354 Fp61_MBPS=1044 Fp61_OutputBytes=100002 79 | N = 32 : gf256_MBPS=16976 Fp61_MBPS=1030 Fp61_OutputBytes=100002 80 | N = 64 : gf256_MBPS=13570 Fp61_MBPS=910 Fp61_OutputBytes=100002 81 | N = 128 : gf256_MBPS=10592 Fp61_MBPS=533 Fp61_OutputBytes=100002 82 | N = 256 : gf256_MBPS=10637 Fp61_MBPS=500 Fp61_OutputBytes=100002 83 | N = 512 : gf256_MBPS=11528 Fp61_MBPS=483 Fp61_OutputBytes=100002 84 | 85 | Note that near the end it looks like the file sizes are exceeding the processor cache and it starts slowing down by 2x. 86 | 87 | 88 | ## API 89 | 90 | Supported arithmetic operations: Add, Negation, Multiply, Mul Inverse. 91 | Subtraction is implemented via Negation. 92 | 93 | Partial Reduction from full 64 bits to 62 bits: 94 | 95 | r = fp61::PartialReduce(x) 96 | 97 | Partially reduce x (mod p). This clears bits #63 and #62. 98 | 99 | The result can be passed directly to fp61::Add4(), fp61::Multiply(), 100 | and fp61::Finalize(). 101 | 102 | Final Reduction from 64 bits to

8-bit lookup table (shuffle). 306 | 307 | Fp61 math runs fastest when a 64x64->128 multiply instruction is available, which is unavailable on ARM64. It has to use a schoolbook multiplication approach to emulate the wider multiplier, requiring 4 multiplies instead of 1. 308 | 309 | Regarding fitting data into the fields, GF(2^8) and GF(2^16) have an advantage because input data is in bytes. Data needs to be packed into Fp61 values in order to work on it, but the encoding is fairly straight-forward. 310 | 311 | Regarding erasure code applications, a random linear code based on GF(2^8) will fail to recover roughly 0.2% of the time, requiring one extra recovery packet. GF(2^16) and Fp61 have almost no overhead. 312 | 313 | 314 | #### Comparing Fp61 to Fp=2^32-5: 315 | 316 | Fp61 has a number of advantages over Fp=2^32-5 and some disadvantages. 317 | 318 | Clear advantages for Fp61: 319 | 320 | (1) Since the prime modulus leaves some extra bits in the 64-bit words, lazy reduction can be used to cut down the cost of additions by about 1/2. For erasure codes muladd/mulsub are the common operations, so cheap additions are great. 321 | 322 | (2) The reductions are cheaper for Fp61 in general due to the Mersenne prime modulus. Furthermore the reductions have no conditionals, so the performance is pretty much consistent regardless of the input. 323 | 324 | (3) The smaller field consumes data at 1/2 the rate, which is problematic because of the data packing required. Generally computers are more efficient for larger reads, so reading/writing twice as much data is more efficient. If a prefix code is used, then 2x the amount of state needs to be kept for the same amount of data. Fp61 must emit one bit for its prefix code, whereas the smaller field must emit 2-3 bits. 325 | 326 | Possible advantages for Fp=2^32-5: 327 | 328 | (1) Fp61 may be overall less efficient on mobile (ARM64) processors. 
Despite the speed disadvantages discussed above, when the 64-bit multiply instruction is unavailable, the smaller prime may pull ahead for performance. It would require benchmarking to really answer this. 329 | 330 | 331 | #### Comparing Fp61 to Fp=2^127-1: 332 | 333 | Perhaps Fp61 might be useful for cryptography for an extension field like Fp^4 for a 244-bit field. 334 | All of the operations are constant-time so it is pretty close to being good for crypto. 335 | The inverse operation would be exponentially faster since the Euler totient-based inverse function only needs to evaluate a 64-bit exponentiation instead of a 128-bit exponentiation. 336 | 337 | Because the Fp=2^127-1 field is so close to the word size, every add operation needs a reduction costing 3 cycles. The partial reduction for 2^61-1 runs in 2 cycles and only needs to be performed every 4 additions, plus it does not 338 | require any assembly code. 339 | Overall addition for 4 values is 6x faster in this field. If two 61-bit fields are used in an OEF, then addition of 61x2 = 121-bit values is 3x faster using this as a base field. 340 | 341 | Multiplication for 2^127-1 is complicated because there is no instruction that performs 128x128->256 bit products. So it requires 4 MUL instructions. Reduction requires about 10 instructions. Multiplication for 2^61-1 is done with 1 MUL instruction. 342 | Reduction requires about 7 instructions for repeated multiplies. In an OEF, the 121-bit multiply is overall less complicated, maybe by 30%? 343 | 344 | 345 | #### Ideas for future work: 346 | 347 | It may be interesting to use Fp=2^31-1 for mobile targets because the 32x32->64 multiplier that is available is a good fit for this field. Reduction is simple and may allow for some laziness to cut the reduction costs in half, but it's not clear how it would work out practically without implementing it.
348 | 349 | Solinas prime p=2^64-2^32+1 would allow a much less awkward algorithm for packing data into the field, and its reduction is even simpler than the Fp61 prime. 350 | 351 | 352 | #### Credits 353 | 354 | Software by Christopher A. Taylor . 355 | 356 | Please reach out if you need support or would like to collaborate on a project. 357 | -------------------------------------------------------------------------------- /fp61.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of Fp61 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #include "fp61.h" 30 | 31 | namespace fp61 { 32 | 33 | 34 | // This is an unrolled implementation of Knuth's unsigned version of the eGCD, 35 | // specialized for the prime. It handles any input. 36 | uint64_t Inverse(uint64_t u) 37 | { 38 | uint64_t u1, u3, v1, v3, qt; 39 | 40 | qt = u / kPrime; 41 | u3 = u % kPrime; 42 | u1 = 1; 43 | 44 | if (u3 == 0) { 45 | return 0; // No inverse 46 | } 47 | 48 | qt = kPrime / u3; 49 | v3 = kPrime % u3; 50 | v1 = qt; 51 | 52 | for (;;) 53 | { 54 | if (v3 == 0) { 55 | return u3 == 1 ? u1 : 0; 56 | } 57 | 58 | qt = u3 / v3; 59 | u3 %= v3; 60 | u1 += qt * v1; 61 | 62 | if (u3 == 0) { 63 | return v3 == 1 ? 
kPrime - v1 : 0; 64 | } 65 | 66 | qt = v3 / u3; 67 | v3 %= u3; 68 | v1 += qt * u1; 69 | } 70 | } 71 | 72 | 73 | //------------------------------------------------------------------------------ 74 | // Memory Reading 75 | 76 | uint64_t ReadBytes_LE(const uint8_t* data, unsigned bytes) 77 | { 78 | switch (bytes) 79 | { 80 | case 8: return ReadU64_LE(data); 81 | case 7: return ((uint64_t)data[6] << 48) | ((uint64_t)data[5] << 40) | ((uint64_t)data[4] << 32) | ReadU32_LE(data); 82 | case 6: return ((uint64_t)data[5] << 40) | ((uint64_t)data[4] << 32) | ReadU32_LE(data); 83 | case 5: return ((uint64_t)data[4] << 32) | ReadU32_LE(data); 84 | case 4: return ReadU32_LE(data); 85 | case 3: return ((uint32_t)data[2] << 16) | ((uint32_t)data[1] << 8) | data[0]; 86 | case 2: return ((uint32_t)data[1] << 8) | data[0]; 87 | case 1: return data[0]; 88 | default: break; 89 | } 90 | return 0; 91 | } 92 | 93 | ReadResult ByteReader::Read(uint64_t& fpOut) 94 | { 95 | uint64_t word, r, workspace = Workspace; 96 | int nextAvailable, available = Available; 97 | 98 | // If enough bits are already available: 99 | if (available >= 61) 100 | { 101 | r = workspace & kPrime; 102 | workspace >>= 61; 103 | nextAvailable = available - 61; 104 | } 105 | else 106 | { 107 | unsigned bytes = Bytes; 108 | 109 | // Read a word to fill in the difference 110 | if (bytes >= 8) 111 | { 112 | word = ReadU64_LE(Data); 113 | Data += 8; 114 | Bytes = bytes - 8; 115 | nextAvailable = available + 3; 116 | } 117 | else 118 | { 119 | if (bytes == 0 && available <= 0) { 120 | return ReadResult::Empty; 121 | } 122 | 123 | word = ReadBytes_LE(Data, bytes); 124 | Bytes = 0; 125 | 126 | // Note this may go negative but we check for that above 127 | nextAvailable = available + bytes * 8 - 61; 128 | } 129 | 130 | // This assumes workspace high bits (beyond `available`) are 0 131 | r = (workspace | (word << available)) & kPrime; 132 | 133 | // Remaining workspace bits are taken from read word 134 | workspace = word >> 
(61 - available); 135 | } 136 | 137 | // If there is ambiguity in the representation: 138 | if (IsU64Ambiguous(r)) 139 | { 140 | // This will not overflow because available <= 60. 141 | // We add up to 3 more bits, so adding one more keeps us within 64 bits. 142 | ++nextAvailable; 143 | 144 | // Insert bit 0 for 0ff..ff and 1 for 1ff..ff to resolve the ambiguity 145 | workspace = (workspace << 1) | (r >> 60); 146 | 147 | // Use kAmbiguity value for a placeholder 148 | r = kAmbiguityMask; 149 | } 150 | 151 | Workspace = workspace; 152 | Available = nextAvailable; 153 | 154 | fpOut = r; 155 | return ReadResult::Success; 156 | } 157 | 158 | uint64_t WordReader::Read() 159 | { 160 | int nextAvailable, available = Available; 161 | uint64_t r, workspace = Workspace; 162 | 163 | if (available >= 61) 164 | { 165 | r = workspace & kPrime; 166 | nextAvailable = available - 61; 167 | workspace >>= 61; 168 | } 169 | else 170 | { 171 | uint64_t word; 172 | unsigned bytes = Bytes; 173 | 174 | // If we can read a full word: 175 | if (bytes >= 8) 176 | { 177 | word = ReadU64_LE(Data); 178 | Data += 8; 179 | Bytes = bytes - 8; 180 | nextAvailable = available + 3; // +64 - 61 181 | } 182 | else 183 | { 184 | if (bytes == 0 && available <= 0) { 185 | return 0; // No data left to read 186 | } 187 | 188 | word = ReadBytes_LE(Data, bytes); 189 | 190 | // Note this may go negative but we check for negative above 191 | nextAvailable = available + bytes * 8 - 61; 192 | 193 | Bytes = 0; 194 | } 195 | 196 | r = (workspace | (word << available)) & kPrime; 197 | workspace = word >> (61 - available); 198 | } 199 | 200 | Workspace = workspace; 201 | Available = nextAvailable; 202 | 203 | return r; 204 | } 205 | 206 | 207 | //------------------------------------------------------------------------------ 208 | // Memory Writing 209 | 210 | void WriteBytes_LE(uint8_t* data, unsigned bytes, uint64_t value) 211 | { 212 | switch (bytes) 213 | { 214 | case 8: WriteU64_LE(data, value); 215 | return; 216 
| case 7: data[6] = (uint8_t)(value >> 48); 217 | case 6: data[5] = (uint8_t)(value >> 40); 218 | case 5: data[4] = (uint8_t)(value >> 32); 219 | case 4: WriteU32_LE(data, static_cast(value)); 220 | return; 221 | case 3: data[2] = (uint8_t)(value >> 16); 222 | case 2: data[1] = (uint8_t)(value >> 8); 223 | case 1: data[0] = (uint8_t)value; 224 | default: break; 225 | } 226 | } 227 | 228 | 229 | //------------------------------------------------------------------------------ 230 | // Random 231 | 232 | // From http://xoshiro.di.unimi.it/splitmix64.c 233 | // Written in 2015 by Sebastiano Vigna (vigna@acm.org) 234 | uint64_t HashU64(uint64_t x) 235 | { 236 | x += 0x9e3779b97f4a7c15; 237 | uint64_t z = x; 238 | z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; 239 | z = (z ^ (z >> 27)) * 0x94d049bb133111eb; 240 | return z ^ (z >> 31); 241 | } 242 | 243 | void Random::Seed(uint64_t x) 244 | { 245 | // Fill initial state as recommended by authors 246 | uint64_t h = HashU64(x); 247 | State[0] = h; 248 | h = HashU64(h); 249 | State[1] = h; 250 | h = HashU64(h); 251 | State[2] = h; 252 | h = HashU64(h); 253 | State[3] = h; 254 | } 255 | 256 | 257 | } // namespace fp61 258 | -------------------------------------------------------------------------------- /fp61.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 
12 | * Neither the name of Fp61 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #ifndef CAT_FP61_H 30 | #define CAT_FP61_H 31 | 32 | #include 33 | 34 | /** \mainpage 35 | Fp61 : Finite field arithmetic modulo Mersenne prime p = 2^61-1 in C++ 36 | 37 | The Fp61 software takes advantage of the commonly available fast 38 | 64x64->128 multiplier to accelerate finite (base) field arithmetic. 39 | So it runs a lot faster when built into a 64-bit executable. 40 | 41 | This math code offers use of lazy reduction techniques for speed, 42 | via fp61::PartialReduce(). 43 | 44 | + Addition of 8 values can be evaluated before reduction. 45 | + Sums of 4 products can be evaluated with partial reductions. 46 | */ 47 | 48 | // Define this to avoid any unaligned memory accesses while reading data. 49 | // This is useful as a quick-fix for mobile applications. 50 | // A preferred solution is to ensure that the data provided is aligned. 51 | // Another reason to do this is if the platform is big-endian. 
52 | //#define FP61_SAFE_MEMORY_ACCESSES 53 | 54 | 55 | //------------------------------------------------------------------------------ 56 | // Portability Macros 57 | 58 | // Compiler-specific force inline keyword 59 | #ifdef _MSC_VER 60 | # define FP61_FORCE_INLINE inline __forceinline 61 | #else 62 | # define FP61_FORCE_INLINE inline __attribute__((always_inline)) 63 | #endif 64 | 65 | 66 | //------------------------------------------------------------------------------ 67 | // Portable 64x64->128 Multiply 68 | // CAT_MUL128: r{hi,lo} = x * y 69 | 70 | // Returns low part of product, and high part is set in r_hi 71 | FP61_FORCE_INLINE uint64_t Emulate64x64to128( 72 | uint64_t& r_hi, 73 | const uint64_t x, 74 | const uint64_t y) 75 | { 76 | // Form temporary 32-bit words 77 | const uint32_t x0 = static_cast(x); 78 | const uint32_t x1 = static_cast(x >> 32); 79 | const uint32_t y0 = static_cast(y); 80 | const uint32_t y1 = static_cast(y >> 32); 81 | 82 | // Calculate 32x32->64 bit products 83 | const uint64_t p11 = static_cast(x1) * y1; 84 | const uint64_t p01 = static_cast(x0) * y1; 85 | const uint64_t p10 = static_cast(x1) * y0; 86 | const uint64_t p00 = static_cast(x0) * y0; 87 | 88 | /* 89 | This is implementing schoolbook multiplication: 90 | 91 | x1 x0 92 | X y1 y0 93 | ------------- 94 | 00 LOW PART 95 | ------------- 96 | 00 97 | 10 10 MIDDLE PART 98 | + 01 99 | ------------- 100 | 01 101 | + 11 11 HIGH PART 102 | ------------- 103 | */ 104 | 105 | // 64-bit product + two 32-bit values 106 | const uint64_t middle = p10 107 | + static_cast(p00 >> 32) 108 | + static_cast(p01); 109 | 110 | /* 111 | Proof that 64-bit products can accumulate two more 32-bit values 112 | without overflowing: 113 | 114 | Max 32-bit value is 2^32 - 1. 115 | PSum = (2^32-1) * (2^32-1) + (2^32-1) + (2^32-1) 116 | = 2^64 - 2^32 - 2^32 + 1 + 2^32 - 1 + 2^32 - 1 117 | = 2^64 - 1 118 | Therefore it cannot overflow regardless of input. 
119 | */ 120 | 121 | // 64-bit product + two 32-bit values 122 | r_hi = p11 123 | + static_cast(middle >> 32) 124 | + static_cast(p01 >> 32); 125 | 126 | // Add LOW PART and lower half of MIDDLE PART 127 | return (middle << 32) | static_cast(p00); 128 | } 129 | 130 | #if defined(_MSC_VER) && defined(_WIN64) 131 | // Visual Studio 64-bit 132 | 133 | # include 134 | # pragma intrinsic(_umul128) 135 | # define CAT_MUL128(r_hi, r_lo, x, y) \ 136 | r_lo = _umul128(x, y, &(r_hi)); 137 | 138 | #elif defined(__SIZEOF_INT128__) 139 | // Compiler supporting 128-bit values (GCC/Clang) 140 | 141 | # define CAT_MUL128(r_hi, r_lo, x, y) \ 142 | { \ 143 | unsigned __int128 w = (unsigned __int128)x * y; \ 144 | r_lo = (uint64_t)w; \ 145 | r_hi = (uint64_t)(w >> 64); \ 146 | } 147 | 148 | #else 149 | // Emulate 64x64->128-bit multiply with 64x64->64 operations 150 | 151 | # define CAT_MUL128(r_hi, r_lo, x, y) \ 152 | r_lo = Emulate64x64to128(r_hi, x, y); 153 | 154 | #endif // End CAT_MUL128 155 | 156 | 157 | namespace fp61 { 158 | 159 | 160 | //------------------------------------------------------------------------------ 161 | // Constants 162 | 163 | // p = 2^61 - 1 164 | static const uint64_t kPrime = ((uint64_t)1 << 61) - 1; 165 | 166 | // Mask where bit #63 is clear and all other bits are set. 167 | static const uint64_t kMask63 = ((uint64_t)1 << 63) - 1; 168 | 169 | 170 | //------------------------------------------------------------------------------ 171 | // API 172 | 173 | /** 174 | r = fp61::PartialReduce(x) 175 | 176 | Partially reduce x (mod p). This clears bits #63 and #62. 177 | 178 | The result can be passed directly to fp61::Add4(), fp61::Multiply(), 179 | and fp61::Finalize(). 180 | */ 181 | FP61_FORCE_INLINE uint64_t PartialReduce(uint64_t x) 182 | { 183 | // Eliminate bits #63 to #61, which may carry back up into bit #61, 184 | // So we will only definitely reduce #63 and #62. 
185 | return (x & kPrime) + (x >> 61); // 0 <= result <= 2^62 - 1 186 | } 187 | 188 | /** 189 | r = fp61::Finalize(x) 190 | 191 | Finalize reduction of x (mod p) from PartialReduce() 192 | Preconditions: Bits #63 and #62 are clear and x != 0x3ffffffffffffffeULL 193 | 194 | This function fails for x = 0x3ffffffffffffffeULL. 195 | The partial reduction function does not produce this bit pattern for any 196 | input, so this exception is allowed because I'm assuming the input comes 197 | from fp61::PartialReduce(). So, do not mask down to 62 random bits and 198 | pass to this function because it can fail in this one case. 199 | 200 | Returns a value in Fp (less than p). 201 | */ 202 | FP61_FORCE_INLINE uint64_t Finalize(uint64_t x) 203 | { 204 | // Eliminate #61. 205 | // The +1 also handles the case where x = p and x = 0x3fffffffffffffffULL. 206 | // I don't see a way to tweak this to handle 0x3ffffffffffffffeULL... 207 | return (x + ((x+1) >> 61)) & kPrime; // 0 <= result < p 208 | } 209 | 210 | /** 211 | r = fp61::Add4(x, y, z, w) 212 | 213 | Sum x + y + z + w (without full reduction modulo p). 214 | Preconditions: x,y,z,w <2^62 215 | 216 | Probably you will want to just inline this code and follow the pattern, 217 | since being restricted to adding 4 things at a time is kind of weird. 218 | 219 | The result can be passed directly to fp61::Add4(), fp61::Multiply(), and 220 | fp61::Finalize(). 221 | */ 222 | FP61_FORCE_INLINE uint64_t Add4(uint64_t x, uint64_t y, uint64_t z, uint64_t w) 223 | { 224 | return PartialReduce(x + y + z + w); 225 | } 226 | 227 | /** 228 | r = fp61::Negate(x) 229 | 230 | r = -x (without reduction modulo p) 231 | Preconditions: x <= p 232 | 233 | The input needs to be have bits #63 #62 #61 cleared. 234 | This can be ensured by calling fp61::PartialReduce() and 235 | fp61::Finalize() first. Since this is more expensive than addition 236 | it is best to reorganize operations to avoid needing this reduction. 237 | 238 | Return a value <= p. 
239 | */ 240 | FP61_FORCE_INLINE uint64_t Negate(uint64_t x) 241 | { 242 | return kPrime - x; 243 | } 244 | 245 | // For subtraction, use fp61::Negate() and add: x + (-y). 246 | 247 | /** 248 | r = fp61::Multiply(x, y) 249 | 250 | r = x * y (with partial reduction modulo p) 251 | 252 | Important Input Restriction: 253 | 254 | The number of bits between x and y must be less than 124 bits. 255 | 256 | Call fp61::PartialReduce() to reduce inputs if needed, 257 | which makes sure that both inputs are 62 bits or fewer. 258 | 259 | Example: If x <= 2^62-1 (62 bits), then y <= 2^62-1 (62 bits). 260 | This means that up to 2 values can be accumulated in x and 2 in y. 261 | 262 | But it is also possible to balance the input in other ways. 263 | 264 | Example: If x <= 2^61-1 (61 bits), then y <= 2^63-1 (63 bits). 265 | This means that up to 4 values can be accumulated in y. 266 | 267 | Result: 268 | 269 | The result is stored in bits #61 to #0 (62 bits of the word). 270 | Call fp61::Finalize() to reduce the result to 61 bits. 271 | */ 272 | FP61_FORCE_INLINE uint64_t Multiply(uint64_t x, uint64_t y) 273 | { 274 | uint64_t p_lo, p_hi; 275 | CAT_MUL128(p_hi, p_lo, x, y); 276 | 277 | /* 278 | Largest x,y = p - 1 = 2^61 - 2 = L. 279 | 280 | L*L = (2^61-2) * (2^61-2) 281 | = 2^(61+61) - 4*2^61 + 4 282 | = 2^122 - 2^63 + 4 283 | That is the high 6 bits are zero. 284 | 285 | We represent the product as two 64-bit words, or 128 bits. 286 | 287 | Say the low bit bit #64 is set in the high word. 288 | To eliminate this bit we need to subtract (2^61 - 1) * 2^3. 289 | This means we need to add a bit at #3. 290 | Similarly for bit #65 we need to add a bit at #4. 291 | 292 | High bits #127 to #125 affect high bits #66 to #64. 293 | High bits #124 to #64 affect low bits #63 to #3. 294 | Low bits #63 to #61 affect low bits #2 to #0. 295 | 296 | If we eliminate from high bits to low bits, then we could carry back 297 | up into the high bits again. 
So we should instead eliminate bits #61 298 | through #63 first to prevent carries into the high word. 299 | */ 300 | 301 | // Eliminate bits #63 to #61, which may carry back up into bit #61, 302 | // So we will only definitely reduce #63 and #62. 303 | uint64_t r = (p_lo & kPrime) + (p_lo >> 61); 304 | 305 | // Eliminate bits #123 to #64 (60 bits). 306 | // This stops short of #124 that would affect bit #63 because it 307 | // prevents the addition from overflowing the 64-bit word. 308 | r += ((p_hi << 3) & kMask63); 309 | 310 | // This last reduction step is not strictly necessary, but is almost always 311 | // a good idea when used to implement some algorithm, so I include it. 312 | // Partially reduce the result to clear the high 2 bits. 313 | return PartialReduce(r); 314 | } 315 | 316 | /** 317 | r = fp61::Inverse(x) 318 | 319 | r = x^-1 (mod p) 320 | The input value x can be any 64-bit value. 321 | 322 | This operation is kind of heavy so it should be avoided where possible. 323 | 324 | This operation is not constant-time. 325 | A constant-time version can be implemented using Euler's totient method and 326 | a straight line similar to https://github.com/catid/snowshoe/blob/master/src/fp.inc#L545 327 | 328 | Returns the multiplicative inverse of x modulo p. 329 | 0 < result < p 330 | 331 | If the inverse does not exist, it returns 0. 
332 | */ 333 | uint64_t Inverse(uint64_t x); 334 | 335 | 336 | //------------------------------------------------------------------------------ 337 | // Memory Reading 338 | 339 | /// Read 8 bytes in little-endian byte order 340 | FP61_FORCE_INLINE uint64_t ReadU64_LE(const uint8_t* data) 341 | { 342 | #ifdef FP61_SAFE_MEMORY_ACCESSES 343 | return ((uint64_t)data[7] << 56) | ((uint64_t)data[6] << 48) | ((uint64_t)data[5] << 40) | 344 | ((uint64_t)data[4] << 32) | ((uint64_t)data[3] << 24) | ((uint64_t)data[2] << 16) | 345 | ((uint64_t)data[1] << 8) | data[0]; 346 | #else 347 | const uint64_t* wordPtr = reinterpret_cast(data); 348 | return *wordPtr; 349 | #endif 350 | } 351 | 352 | /// Read 4 bytes in little-endian byte order 353 | FP61_FORCE_INLINE uint32_t ReadU32_LE(const uint8_t* data) 354 | { 355 | #ifdef FP61_SAFE_MEMORY_ACCESSES 356 | return ((uint32_t)data[3] << 24) | ((uint32_t)data[2] << 16) | ((uint32_t)data[1] << 8) | data[0]; 357 | #else 358 | const uint32_t* wordPtr = reinterpret_cast(data); 359 | return *wordPtr; 360 | #endif 361 | } 362 | 363 | /// Read between 0..8 bytes in little-endian byte order 364 | /// Returns 0 for any other value for `bytes` 365 | uint64_t ReadBytes_LE(const uint8_t* data, unsigned bytes); 366 | 367 | enum class ReadResult 368 | { 369 | Success, ///< Read returned with a word of data 370 | Empty ///< No data remaining to read 371 | }; 372 | 373 | /** 374 | Fitting Bytes Into Words 375 | 376 | When converting byte data to words, a value of 2^61-1 is problematic 377 | because it does not fit in the field Fp that ranges from 0..(2^61-2). 378 | 379 | One way to fit these values into the field would be to emit 1ff..ffe 380 | for both 1ff..ffe and 1ff..fff, and then inject a new bit after it to 381 | allow the ByteWriter code to reverse the transformation. The problem 382 | with this is that the lower bit is modified, which is the same one 383 | that signals how the prior word is modified. 
384 | 385 | So a better way to fix 1ff..fff is to make it ambiguous with 0ff..fff, 386 | where the high bit of the word is flipped. Now when 0ff..fff is seen 387 | by the ByteWriter, it knows to check the next word's low bit and 388 | optionally reverse it back to 1ff..fff. 389 | 390 | As an aside, we want to design the ByteReader to be as fast as possible 391 | because it is used by the erasure code encoder - The decoder must only 392 | reverse this transformation for any lost data, so it can be slower. 393 | 394 | It may be a good idea to XOR input data by a random sequence to randomize 395 | the odds of using extra bits, depending on the application. 396 | */ 397 | static const uint64_t kAmbiguityMask = ((uint64_t)1 << 60) - 1; // 0x0ff...fff 398 | 399 | /// Returns true if the U64 word provided needs an extra bit to represent it 400 | FP61_FORCE_INLINE bool IsU64Ambiguous(uint64_t u64_word) 401 | { 402 | return (u64_word & kAmbiguityMask) == kAmbiguityMask; 403 | } 404 | 405 | /// Returns true if this Fp word could have originally been 0ff..ff or 1ff..ff 406 | FP61_FORCE_INLINE bool IsFpAmbiguous(uint64_t fp_word) 407 | { 408 | return fp_word == kAmbiguityMask; 409 | } 410 | 411 | /** 412 | ByteReader 413 | 414 | Reads 8 bytes at a time from the input data and outputs 61-bit Fp words. 415 | Pads the final < 8 bytes with zeros. 416 | 417 | See the comments on Fitting Bytes Into Words for how this works. 418 | 419 | Call ByteReader::MaxWords() to calculate the maximum number of words that 420 | can be generated for worst-case input of all FFF...FFs. 421 | 422 | Define FP61_SAFE_MEMORY_ACCESSES if the platform does not support unaligned 423 | reads and the input data is unaligned, or the platform is big-endian. 424 | 425 | Call BeginRead() to begin reading. 426 | 427 | Call ReadNext() repeatedly to read all words from the data. 428 | It will return ReadResult::Empty when all bits are empty. 
429 | */ 430 | struct ByteReader 431 | { 432 | const uint8_t* Data; 433 | unsigned Bytes; 434 | uint64_t Workspace; 435 | int Available; 436 | 437 | 438 | /// Calculates and returns the maximum number of Fp field words that may be 439 | /// produced by the ByteReader. 440 | static FP61_FORCE_INLINE unsigned MaxWords(unsigned bytes) 441 | { 442 | unsigned bits = bytes * 8; 443 | 444 | // Round up to the nearest word. 445 | // All words may be expanded by one bit, hence the (bits/61) factor. 446 | return (bits + (bits / 61) + 60) / 61; 447 | } 448 | 449 | /// Begin reading data 450 | FP61_FORCE_INLINE void BeginRead(const uint8_t* data, unsigned bytes) 451 | { 452 | Data = data; 453 | Bytes = bytes; 454 | Workspace = 0; 455 | Available = 0; 456 | } 457 | 458 | /// Returns ReadResult::Empty when no more data is available. 459 | /// Otherwise fpOut will be a value between 0 and p-1. 460 | ReadResult Read(uint64_t& fpOut); 461 | }; 462 | 463 | /** 464 | WordReader 465 | 466 | Reads a series of 61-bit finalized Fp field elements from a byte array. 467 | 468 | This differs from ByteReader in two ways: 469 | (1) It does not have to handle the special case of all ffffs. 470 | (2) It terminates deterministically at WordCount() words rather than 471 | based on the contents of the data. 472 | 473 | Call WordCount() to calculate the number of words to expect to read from 474 | a given number of bytes. 475 | 476 | Call BeginRead() to start reading. 477 | Call Read() to retrieve each consecutive word. 
478 | */ 479 | struct WordReader 480 | { 481 | const uint8_t* Data; 482 | unsigned Bytes; 483 | uint64_t Workspace; 484 | unsigned Available; 485 | 486 | 487 | /// Calculate the number of words that can be read from a number of bytes 488 | static FP61_FORCE_INLINE unsigned WordCount(unsigned bytes) 489 | { 490 | // Note that only whole (not partial) words can be read, so this rounds down 491 | return (bytes * 8) / 61; 492 | } 493 | 494 | /// Begin writing to the given memory location 495 | FP61_FORCE_INLINE void BeginRead(const uint8_t* data, unsigned bytes) 496 | { 497 | Data = data; 498 | Bytes = bytes; 499 | Workspace = 0; 500 | Available = 0; 501 | } 502 | 503 | /// Read the next word. 504 | /// It is up to the application to know when to stop reading, 505 | /// based on the WordCount() count of words to read. 506 | uint64_t Read(); 507 | }; 508 | 509 | 510 | //------------------------------------------------------------------------------ 511 | // Memory Writing 512 | 513 | /// Write 4 bytes in little-endian byte order 514 | FP61_FORCE_INLINE void WriteU32_LE(uint8_t* data, uint32_t value) 515 | { 516 | #ifdef FP61_SAFE_MEMORY_ACCESSES 517 | data[3] = (uint8_t)(value >> 24); 518 | data[2] = (uint8_t)(value >> 16); 519 | data[1] = (uint8_t)(value >> 8); 520 | data[0] = (uint8_t)value; 521 | #else 522 | uint32_t* wordPtr = reinterpret_cast(data); 523 | *wordPtr = value; 524 | #endif 525 | } 526 | 527 | /// Write 8 bytes in little-endian byte order 528 | FP61_FORCE_INLINE void WriteU64_LE(uint8_t* data, uint64_t value) 529 | { 530 | #ifdef FP61_SAFE_MEMORY_ACCESSES 531 | data[7] = (uint8_t)(value >> 56); 532 | data[6] = (uint8_t)(value >> 48); 533 | data[5] = (uint8_t)(value >> 40); 534 | data[4] = (uint8_t)(value >> 32); 535 | data[3] = (uint8_t)(value >> 24); 536 | data[2] = (uint8_t)(value >> 16); 537 | data[1] = (uint8_t)(value >> 8); 538 | data[0] = (uint8_t)value; 539 | #else 540 | uint64_t* wordPtr = reinterpret_cast(data); 541 | *wordPtr = value; 542 | 
#endif 543 | } 544 | 545 | /// Write between 0..8 bytes in little-endian byte order 546 | void WriteBytes_LE(uint8_t* data, unsigned bytes, uint64_t value); 547 | 548 | /** 549 | WordWriter 550 | 551 | Writes a series of 61-bit finalized Fp field elements to a byte array. 552 | The resulting data can be read by WordReader. 553 | 554 | Call BytesNeeded() to calculate the number of bytes needed to store the 555 | given number of Fp words. 556 | 557 | Call BeginWrite() to start writing. 558 | Call Write() to write the next word. 559 | 560 | Call Flush() to write the last few bytes. 561 | Flush() returns the number of overall written bytes. 562 | */ 563 | struct WordWriter 564 | { 565 | uint8_t* Data; 566 | uint8_t* DataWritePtr; 567 | uint64_t Workspace; 568 | unsigned Available; 569 | 570 | 571 | /// Calculate the number of bytes that will be written 572 | /// for the given number of Fp words. 573 | static FP61_FORCE_INLINE unsigned BytesNeeded(unsigned words) 574 | { 575 | // 61 bits per word 576 | const unsigned bits = words * 61; 577 | 578 | // Round up to the next byte 579 | return (bits + 7) / 8; 580 | } 581 | 582 | /// Begin writing to the given memory location. 583 | /// It is up to the application to provide enough space in the buffer by 584 | /// using BytesNeeded() to calculate the buffer size. 
585 | FP61_FORCE_INLINE void BeginWrite(uint8_t* data) 586 | { 587 | Data = data; 588 | DataWritePtr = data; 589 | Workspace = 0; 590 | Available = 0; 591 | } 592 | 593 | /// Write the next word 594 | FP61_FORCE_INLINE void Write(uint64_t word) 595 | { 596 | unsigned available = Available; 597 | uint64_t workspace = Workspace; 598 | 599 | // Include any bits that fit 600 | workspace |= word << available; 601 | available += 61; 602 | 603 | // If there is a full word now: 604 | if (available >= 64) 605 | { 606 | // Write the word 607 | WriteU64_LE(DataWritePtr, workspace); 608 | DataWritePtr += 8; 609 | available -= 64; 610 | 611 | // Keep remaining bits 612 | workspace = word >> (61 - available); 613 | } 614 | 615 | Workspace = workspace; 616 | Available = available; 617 | } 618 | 619 | /// Flush the output, writing fractions of a word if needed. 620 | /// This must be called or the output may be truncated. 621 | /// Returns the number of bytes written overall. 622 | FP61_FORCE_INLINE unsigned Flush() 623 | { 624 | const unsigned finalBytes = (Available + 7) / 8; 625 | 626 | // Write the number of available bytes 627 | WriteBytes_LE(DataWritePtr, finalBytes, Workspace); 628 | 629 | // Calculate number of bytes written overall 630 | const uintptr_t writtenBytes = static_cast(DataWritePtr - Data) + finalBytes; 631 | 632 | return static_cast(writtenBytes); 633 | } 634 | }; 635 | 636 | /** 637 | ByteWriter 638 | 639 | Writes a series of 61-bit finalized Fp field elements to a byte array, 640 | reversing the encoding of ByteReader. This is different from WordWriter 641 | because it can also write 61-bit values that are all ones (outside of Fp). 642 | 643 | See the comments on Fitting Bytes Into Words for how this works. 644 | 645 | Call MaxBytesNeeded() to calculate the maximum number of bytes needed 646 | to store the given number of Fp words. 647 | 648 | Call BeginWrite() to start writing. 649 | Call Write() to write the next word. 
650 | 651 | Call Flush() to write the last few bytes. 652 | Flush() returns the number of overall written bytes. 653 | */ 654 | struct ByteWriter 655 | { 656 | WordWriter Writer; 657 | bool Packed; 658 | 659 | /// Calculate the maximum number of bytes that will be written for the 660 | /// given number of Fp words. May be up to 1.6% larger than necessary. 661 | static FP61_FORCE_INLINE unsigned MaxBytesNeeded(unsigned words) 662 | { 663 | return WordWriter::BytesNeeded(words); 664 | } 665 | 666 | /// Begin writing to the given memory location. 667 | /// It is up to the application to provide enough space in the buffer by 668 | /// using MaxBytesNeeded() to calculate the buffer size. 669 | FP61_FORCE_INLINE void BeginWrite(uint8_t* data) 670 | { 671 | Writer.BeginWrite(data); 672 | Packed = false; 673 | } 674 | 675 | /// Write the next word 676 | FP61_FORCE_INLINE void Write(uint64_t word) 677 | { 678 | const unsigned word_bits = (word == kAmbiguityMask) ? 60 : 61; 679 | 680 | unsigned available = Writer.Available; 681 | uint64_t workspace = Writer.Workspace; 682 | 683 | // Include any bits that fit 684 | workspace |= word << available; 685 | available += word_bits; 686 | 687 | // If there is a full word now: 688 | if (available >= 64) 689 | { 690 | // Write the word 691 | WriteU64_LE(Writer.DataWritePtr, workspace); 692 | Writer.DataWritePtr += 8; 693 | available -= 64; 694 | 695 | // Keep remaining bits 696 | workspace = word >> (word_bits - available); 697 | } 698 | 699 | Writer.Workspace = workspace; 700 | Writer.Available = available; 701 | } 702 | 703 | /// Flush the output, writing fractions of a word if needed. 704 | /// This must be called or the output may be truncated. 705 | /// Returns the number of bytes written overall. 
706 | FP61_FORCE_INLINE unsigned Flush() 707 | { 708 | return Writer.Flush(); 709 | } 710 | }; 711 | 712 | 713 | //------------------------------------------------------------------------------ 714 | // Random Numbers 715 | 716 | #define CAT_ROL64(x, bits) ( ((uint64_t)(x) << (bits)) | ((uint64_t)(x) >> (64 - (bits))) ) 717 | 718 | /** 719 | Random 720 | 721 | Xoshiro256+ based pseudo-random number generator (PRNG) that can generate 722 | random numbers between 1..p. NextNonzeroFp() is mainly intended to be used 723 | for producing convolutional code coefficients to multiply by the data. 724 | 725 | Call Seed() to provide a 64-bit generator seed. 726 | Call NextNonzeroFp() to produce a random 61-bit number from 1..p 727 | Call NextFp() to produce a random 61-bit number from 0..p 728 | Call Next() to produce a random 64-bit number. 729 | */ 730 | struct Random 731 | { 732 | uint64_t State[4]; 733 | 734 | 735 | /// Seed the generator 736 | void Seed(uint64_t x); 737 | 738 | /// Get the next 64-bit random number. 739 | /// The low 3 bits are slightly weak according to the authors. 740 | // From http://xoshiro.di.unimi.it/xoshiro256plus.c 741 | // Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org) 742 | FP61_FORCE_INLINE uint64_t Next() 743 | { 744 | uint64_t s0 = State[0], s1 = State[1], s2 = State[2], s3 = State[3]; 745 | 746 | const uint64_t result = s0 + s3; 747 | 748 | const uint64_t t = s1 << 17; 749 | s2 ^= s0; 750 | s3 ^= s1; 751 | s1 ^= s2; 752 | s0 ^= s3; 753 | s2 ^= t; 754 | s3 = CAT_ROL64(s3, 45); 755 | 756 | State[0] = s0, State[1] = s1, State[2] = s2, State[3] = s3; 757 | 758 | return result; 759 | } 760 | 761 | static FP61_FORCE_INLINE uint64_t ConvertRandToFp(uint64_t word) 762 | { 763 | // Pick high bits as recommended by Xoshiro authors 764 | word >>= 3; 765 | 766 | // If word + 1 overflows, then subtract 1. 767 | // This converts fffff to ffffe and slightly biases the PRNG. 
768 | word -= (word + 1) >> 61; 769 | 770 | return word; 771 | } 772 | 773 | static FP61_FORCE_INLINE uint64_t ConvertRandToNonzeroFp(uint64_t word) 774 | { 775 | word = ConvertRandToFp(word); 776 | 777 | // If word - 1 borrows out, then add 1. 778 | // This converts 0 to 1 and slightly biases the PRNG. 779 | word += (word - 1) >> 63; 780 | 781 | return word; 782 | } 783 | 784 | /// Get the next random value between 0..p 785 | FP61_FORCE_INLINE uint64_t NextFp() 786 | { 787 | return ConvertRandToFp(Next()); 788 | } 789 | 790 | /// Get the next random value between 1..p 791 | FP61_FORCE_INLINE uint64_t NextNonzeroFp() 792 | { 793 | return ConvertRandToNonzeroFp(Next()); 794 | } 795 | }; 796 | 797 | /// Hash a 64-bit value to another 64-bit value 798 | uint64_t HashU64(uint64_t x); 799 | 800 | /// Hash a seed into a value from 1..p-1 801 | FP61_FORCE_INLINE uint64_t HashToNonzeroFp(uint64_t word) 802 | { 803 | // Run a simple mixer based on HashU64() 804 | word += 0x9e3779b97f4a7c15; 805 | word = (word ^ (word >> 30)) * 0xbf58476d1ce4e5b9; 806 | 807 | // Take the top 61 bits 808 | word >>= 3; 809 | 810 | // Eliminate values = p 811 | word -= (word + 1) >> 61; 812 | 813 | // Eliminate values = 0 814 | word += (word - 1) >> 63; 815 | 816 | return word; 817 | } 818 | 819 | 820 | } // namespace fp61 821 | 822 | 823 | #endif // CAT_FP61_H 824 | -------------------------------------------------------------------------------- /tests/benchmarks.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 
9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of Fp61 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #include "../fp61.h" 30 | #include "gf256.h" 31 | 32 | #define FP61_ENABLE_GF256_COMPARE 33 | 34 | /** 35 | Fp61 Benchmarks 36 | 37 | The goal of the benchmarks is to determine how fast Fp61 arithmetic is 38 | for the purpose of implementing erasure codes in software. 39 | 40 | 41 | *Drumroll...* Results: 42 | 43 | The results are not good at all. The Fp61 encoder is roughly 20x slower 44 | than my Galois field code (gf256). So, I do not recommend using Fp61. 45 | 46 | The majority of the slowdown comes from the ByteReader class that needs 47 | to convert byte data into 61-bit Fp words. So it seems that having an 48 | odd field size to achieve lazy reductions does not help performance. 
49 | 50 | *Sad trombone...* 51 | 52 | Benchmarks for Fp61 erasure codes. Before running the benchmarks please run the tests to make sure everything's working on your PC. It's going to run quite a bit faster with 64-bit builds because it takes advantage of the speed of 64-bit multiplications. 53 | 54 | Testing file size = 10 bytes 55 | N = 2 : gf256_MBPS=250 Fp61_MBPS=65 Fp61_OutputBytes=16 56 | N = 4 : gf256_MBPS=305 Fp61_MBPS=116 Fp61_OutputBytes=16 57 | N = 8 : gf256_MBPS=138 Fp61_MBPS=80 Fp61_OutputBytes=16 58 | N = 16 : gf256_MBPS=337 Fp61_MBPS=110 Fp61_OutputBytes=16 59 | N = 32 : gf256_MBPS=711 Fp61_MBPS=242 Fp61_OutputBytes=16 60 | N = 64 : gf256_MBPS=665 Fp61_MBPS=226 Fp61_OutputBytes=16 61 | N = 128 : gf256_MBPS=868 Fp61_MBPS=297 Fp61_OutputBytes=16 62 | N = 256 : gf256_MBPS=713 Fp61_MBPS=240 Fp61_OutputBytes=16 63 | N = 512 : gf256_MBPS=881 Fp61_MBPS=300 Fp61_OutputBytes=16 64 | Testing file size = 100 bytes 65 | N = 2 : gf256_MBPS=1234 Fp61_MBPS=214 Fp61_OutputBytes=107 66 | N = 4 : gf256_MBPS=4000 Fp61_MBPS=486 Fp61_OutputBytes=107 67 | N = 8 : gf256_MBPS=2631 Fp61_MBPS=328 Fp61_OutputBytes=107 68 | N = 16 : gf256_MBPS=2051 Fp61_MBPS=300 Fp61_OutputBytes=107 69 | N = 32 : gf256_MBPS=3850 Fp61_MBPS=433 Fp61_OutputBytes=107 70 | N = 64 : gf256_MBPS=3972 Fp61_MBPS=428 Fp61_OutputBytes=107 71 | N = 128 : gf256_MBPS=4397 Fp61_MBPS=444 Fp61_OutputBytes=107 72 | N = 256 : gf256_MBPS=5137 Fp61_MBPS=500 Fp61_OutputBytes=107 73 | N = 512 : gf256_MBPS=5129 Fp61_MBPS=492 Fp61_OutputBytes=107 74 | Testing file size = 1000 bytes 75 | N = 2 : gf256_MBPS=10309 Fp61_MBPS=889 Fp61_OutputBytes=1007 76 | N = 4 : gf256_MBPS=15325 Fp61_MBPS=848 Fp61_OutputBytes=1007 77 | N = 8 : gf256_MBPS=9184 Fp61_MBPS=486 Fp61_OutputBytes=1007 78 | N = 16 : gf256_MBPS=12728 Fp61_MBPS=722 Fp61_OutputBytes=1007 79 | N = 32 : gf256_MBPS=11838 Fp61_MBPS=610 Fp61_OutputBytes=1007 80 | N = 64 : gf256_MBPS=10555 Fp61_MBPS=604 Fp61_OutputBytes=1007 81 | N = 128 : gf256_MBPS=11354 Fp61_MBPS=614 
Fp61_OutputBytes=1007 82 | N = 256 : gf256_MBPS=14782 Fp61_MBPS=816 Fp61_OutputBytes=1007 83 | N = 512 : gf256_MBPS=18430 Fp61_MBPS=940 Fp61_OutputBytes=1007 84 | Testing file size = 10000 bytes 85 | N = 2 : gf256_MBPS=19138 Fp61_MBPS=893 Fp61_OutputBytes=10004 86 | N = 4 : gf256_MBPS=20283 Fp61_MBPS=959 Fp61_OutputBytes=10004 87 | N = 8 : gf256_MBPS=20953 Fp61_MBPS=1010 Fp61_OutputBytes=10004 88 | N = 16 : gf256_MBPS=22893 Fp61_MBPS=1056 Fp61_OutputBytes=10004 89 | N = 32 : gf256_MBPS=24461 Fp61_MBPS=1087 Fp61_OutputBytes=10004 90 | N = 64 : gf256_MBPS=22945 Fp61_MBPS=1057 Fp61_OutputBytes=10004 91 | N = 128 : gf256_MBPS=16939 Fp61_MBPS=982 Fp61_OutputBytes=10004 92 | N = 256 : gf256_MBPS=18608 Fp61_MBPS=927 Fp61_OutputBytes=10004 93 | N = 512 : gf256_MBPS=16662 Fp61_MBPS=734 Fp61_OutputBytes=10004 94 | Testing file size = 100000 bytes 95 | N = 2 : gf256_MBPS=22941 Fp61_MBPS=962 Fp61_OutputBytes=100002 96 | N = 4 : gf256_MBPS=22827 Fp61_MBPS=976 Fp61_OutputBytes=100002 97 | N = 8 : gf256_MBPS=16210 Fp61_MBPS=1052 Fp61_OutputBytes=100002 98 | N = 16 : gf256_MBPS=17354 Fp61_MBPS=1044 Fp61_OutputBytes=100002 99 | N = 32 : gf256_MBPS=16976 Fp61_MBPS=1030 Fp61_OutputBytes=100002 100 | N = 64 : gf256_MBPS=13570 Fp61_MBPS=910 Fp61_OutputBytes=100002 101 | N = 128 : gf256_MBPS=10592 Fp61_MBPS=533 Fp61_OutputBytes=100002 102 | N = 256 : gf256_MBPS=10637 Fp61_MBPS=500 Fp61_OutputBytes=100002 103 | N = 512 : gf256_MBPS=11528 Fp61_MBPS=483 Fp61_OutputBytes=100002 104 | 105 | 106 | Erasure codes are usually based on 8-bit Galois fields, but I was 107 | intrigued by the speed of the 64-bit multiplier on modern Intel processors. 108 | To take advantage of the fast multiplier I first looked at a number of field 109 | options before settling on Fp=2^61-1. Note that I haven't benchmarked 110 | these other options so my comments might be misleading or incorrect. 
111 | 112 | Some other options I investigated: 113 | 114 | Fp=2^64+c 115 | - The values of `c` that I found had a high Hamming weight so would be 116 | expensive to reduce using the pseudo-Mersenne reduction approach. 117 | - These seem to be patented. Didn't really look into that issue. 118 | - Fp values do not fit into 64-bit words so they're slower to work with. 119 | - The reduction seems to require 128-bit adds/subs to implement properly, 120 | which are awkward to implement on some compilers. 121 | - There's no room for lazy reductions, so adds/subs are more expensive. 122 | 123 | Fp=2^64-c, specifically Solinas prime Fp=2^64-2^8-1 124 | - The smallest values of `c` that I found had a high Hamming weight so would 125 | be expensive to reduce using the pseudo-Mersenne reduction approach. 126 | - The reduction seems to require 128-bit adds/subs to implement properly, 127 | which are awkward to implement on some compilers. 128 | - There's no room for lazy reductions, so adds/subs are more expensive. 129 | ? Packing might be a littler simpler since all data is word-sized ? 130 | 131 | Reduction approaches considered: 132 | 133 | Montgomery: 134 | This requires that the Montgomery u factor has a low Hamming weight to 135 | implement efficiently. p=2^64-2^32+1 happens to have this by chance, 136 | but it's a rare property. It then requires two 128-bit products and adds. 137 | 138 | Pseudo-Mersenne: 139 | This does not require an efficient u factor, but still requires similarly 140 | two 128-bit products and adds. 141 | 142 | Mersenne: 143 | This is what Fp61 uses. The reduction has to be applied multiple times to 144 | fully flush data back into the field < p, and it restricts the sizes of the 145 | inputs to 62 bits. But in trade, no 128-bit operations are needed. 
146 | */ 147 | 148 | #include 149 | #include 150 | #include 151 | #include 152 | using namespace std; 153 | 154 | 155 | #ifdef _WIN32 156 | #ifndef NOMINMAX 157 | #define NOMINMAX 158 | #endif 159 | #include 160 | #elif __MACH__ 161 | #include 162 | #include 163 | #include 164 | 165 | extern mach_port_t clock_port; 166 | #else 167 | #include 168 | #include 169 | #endif 170 | 171 | 172 | //------------------------------------------------------------------------------ 173 | // Timing 174 | 175 | #ifdef _WIN32 176 | // Precomputed frequency inverse 177 | static double PerfFrequencyInverseUsec = 0.; 178 | static double PerfFrequencyInverseMsec = 0.; 179 | 180 | static void InitPerfFrequencyInverse() 181 | { 182 | LARGE_INTEGER freq = {}; 183 | if (!::QueryPerformanceFrequency(&freq) || freq.QuadPart == 0) 184 | return; 185 | const double invFreq = 1. / (double)freq.QuadPart; 186 | PerfFrequencyInverseUsec = 1000000. * invFreq; 187 | PerfFrequencyInverseMsec = 1000. * invFreq; 188 | } 189 | #elif __MACH__ 190 | static bool m_clock_serv_init = false; 191 | static clock_serv_t m_clock_serv = 0; 192 | 193 | static void InitClockServ() 194 | { 195 | m_clock_serv_init = true; 196 | host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &m_clock_serv); 197 | } 198 | #endif // _WIN32 199 | 200 | uint64_t GetTimeUsec() 201 | { 202 | #ifdef _WIN32 203 | LARGE_INTEGER timeStamp = {}; 204 | if (!::QueryPerformanceCounter(&timeStamp)) 205 | return 0; 206 | if (PerfFrequencyInverseUsec == 0.) 
207 | InitPerfFrequencyInverse(); 208 | return (uint64_t)(PerfFrequencyInverseUsec * timeStamp.QuadPart); 209 | #elif __MACH__ 210 | if (!m_clock_serv_init) 211 | InitClockServ(); 212 | 213 | mach_timespec_t tv; 214 | clock_get_time(m_clock_serv, &tv); 215 | 216 | return 1000000 * tv.tv_sec + tv.tv_nsec / 1000; 217 | #else 218 | struct timeval tv; 219 | gettimeofday(&tv, nullptr); 220 | return 1000000 * tv.tv_sec + tv.tv_usec; 221 | #endif 222 | } 223 | 224 | 225 | //------------------------------------------------------------------------------ 226 | // Fp61 Erasure Code Encoder 227 | 228 | // Get maximum number of bytes needed for a recovery packet 229 | static unsigned GetRecoveryBytes(unsigned originalBytes) 230 | { 231 | const unsigned maxWords = fp61::ByteReader::MaxWords(originalBytes); 232 | const unsigned maxBytes = fp61::WordWriter::BytesNeeded(maxWords); 233 | return maxBytes; 234 | } 235 | 236 | /** 237 | Encode() 238 | 239 | This function implements the encoder for an erasure code. 240 | It accepts a set of equal-sized data packets and outputs one recovery packet 241 | that can repair one lost original packet. 242 | 243 | The recovery packet must be GetRecoveryBytes() in size. 244 | 245 | Returns the number of bytes written. 
246 | */ 247 | unsigned Encode( 248 | const std::vector>& originals, 249 | unsigned N, 250 | unsigned bytes, 251 | uint64_t seed, 252 | uint8_t* recovery) 253 | { 254 | uint64_t seedMix = fp61::HashU64(seed); 255 | 256 | std::vector readers; 257 | readers.resize(N); 258 | for (unsigned i = 0; i < N; ++i) { 259 | readers[i].BeginRead(&originals[i][0], bytes); 260 | } 261 | 262 | fp61::WordWriter writer; 263 | writer.BeginWrite(recovery); 264 | 265 | const unsigned minWords = fp61::WordReader::WordCount(bytes); 266 | for (unsigned i = 0; i < minWords; ++i) 267 | { 268 | uint64_t fpword; 269 | readers[0].Read(fpword); 270 | uint64_t coeff = fp61::HashToNonzeroFp(seedMix + 0); 271 | uint64_t sum = fp61::Multiply(coeff, fpword); 272 | 273 | unsigned column = 1; 274 | unsigned columnsRemaining = N - 1; 275 | while (columnsRemaining >= 3) 276 | { 277 | uint64_t coeff0 = fp61::HashToNonzeroFp(seedMix + column); 278 | uint64_t coeff1 = fp61::HashToNonzeroFp(seedMix + column + 1); 279 | uint64_t coeff2 = fp61::HashToNonzeroFp(seedMix + column + 2); 280 | 281 | uint64_t fpword0, fpword1, fpword2; 282 | readers[column].Read(fpword0); 283 | readers[column + 1].Read(fpword1); 284 | readers[column + 2].Read(fpword2); 285 | 286 | sum += fp61::Multiply(coeff0, fpword0); 287 | sum += fp61::Multiply(coeff1, fpword1); 288 | sum += fp61::Multiply(coeff2, fpword2); 289 | sum = fp61::PartialReduce(sum); 290 | 291 | column += 3; 292 | columnsRemaining -= 3; 293 | } 294 | 295 | while (columnsRemaining > 0) 296 | { 297 | uint64_t temp; 298 | readers[column].Read(temp); 299 | sum += fp61::Multiply(coeff, temp); 300 | 301 | column++; 302 | columnsRemaining--; 303 | } 304 | sum = fp61::PartialReduce(sum); 305 | sum = fp61::Finalize(sum); 306 | writer.Write(sum); 307 | } 308 | 309 | for (;;) 310 | { 311 | bool more_data = false; 312 | 313 | uint64_t sum = 0; 314 | 315 | for (unsigned i = 0; i < N; ++i) 316 | { 317 | uint64_t coeff = fp61::HashToNonzeroFp(seedMix + i); 318 | 319 | uint64_t 
fpword; 320 | if (readers[i].Read(fpword) == fp61::ReadResult::Success) 321 | { 322 | more_data = true; 323 | 324 | sum += fp61::Multiply(coeff, fpword); 325 | sum = fp61::PartialReduce(sum); 326 | } 327 | } 328 | 329 | if (!more_data) { 330 | break; 331 | } 332 | 333 | sum = fp61::Finalize(sum); 334 | writer.Write(sum); 335 | } 336 | 337 | return writer.Flush(); 338 | } 339 | 340 | void EncodeGF256( 341 | const std::vector>& originals, 342 | unsigned N, 343 | unsigned bytes, 344 | uint64_t seed, 345 | uint8_t* recovery) 346 | { 347 | uint64_t seedMix = fp61::HashU64(seed); 348 | 349 | uint8_t coeff = (uint8_t)fp61::HashToNonzeroFp(seedMix + 0); 350 | if (coeff == 0) { 351 | coeff = 1; 352 | } 353 | 354 | gf256_mul_mem(recovery, &originals[0][0], coeff, bytes); 355 | 356 | for (unsigned i = 1; i < N; ++i) 357 | { 358 | coeff = (uint8_t)fp61::HashToNonzeroFp(seedMix + 0); 359 | if (coeff == 0) { 360 | coeff = 1; 361 | } 362 | 363 | gf256_muladd_mem(recovery, coeff, &originals[i][0], bytes); 364 | } 365 | } 366 | 367 | 368 | //------------------------------------------------------------------------------ 369 | // Benchmarks 370 | 371 | static const unsigned kFileSizes[] = { 372 | 10, 100, 1000, 10000, 100000 373 | }; 374 | static const unsigned kFileSizesCount = static_cast(sizeof(kFileSizes) / sizeof(kFileSizes[0])); 375 | 376 | static const unsigned kFileN[] = { 377 | 2, 4, 8, 16, 32, 64, 128, 256, 512 378 | }; 379 | static const unsigned kFileNCount = static_cast(sizeof(kFileN) / sizeof(kFileN[0])); 380 | 381 | static const unsigned kTrials = 1000; 382 | 383 | void RunBenchmarks() 384 | { 385 | fp61::Random prng; 386 | prng.Seed(0); 387 | 388 | std::vector> original_data; 389 | std::vector recovery_data; 390 | 391 | for (unsigned i = 0; i < kFileSizesCount; ++i) 392 | { 393 | unsigned fileSizeBytes = kFileSizes[i]; 394 | 395 | cout << "Testing file size = " << fileSizeBytes << " bytes" << endl; 396 | 397 | for (unsigned j = 0; j < kFileNCount; ++j) 398 | { 399 | 
unsigned N = kFileN[j]; 400 | 401 | cout << "N = " << N << " : "; 402 | 403 | uint64_t sizeSum = 0, timeSum = 0; 404 | uint64_t timeSum_gf256 = 0; 405 | 406 | for (unsigned k = 0; k < kTrials; ++k) 407 | { 408 | /* 409 | File pieces: f0, f1, f3, f4, ... 410 | Coefficients: m0, m1, m2, m3, ... 411 | 412 | R = m0 * f0 + m1 * f1 + m2 * f2 + ... 413 | 414 | R = sum(m_i * f_i) (mod 2^61-1) 415 | 416 | To compute the recovery packet R we process the calculations 417 | for the first word from all of the file pieces to produce a 418 | single word of output. This is a matrix-vector product 419 | between file data f_i (treated as Fp words) and randomly 420 | chosen generator matrix coefficients m_i. 421 | 422 | Lazy reduction can be used to simplify the add steps. 423 | 424 | Then we continue to the next word for all the file pieces, 425 | producing the next word of output. 426 | 427 | It is possible to interleave the calculations for output 428 | words, and for input words to achieve higher throughput. 429 | 430 | The number of words for each file piece can vary slightly 431 | based on the data (if the data bytes do not fit evenly into 432 | the Fp words, we have to add extra bits to resolve 433 | ambiguities). 434 | 435 | The result is a set of 61-bit Fp words serialized to bytes, 436 | that is about 8 bytes more than the original file sizes. 437 | 438 | The erasure code decoder (not implemented) would be able 439 | to take these recovery packets and fix lost data. 440 | The decoder performance would be fairly similar to the 441 | encoder performance for this type of erasure code, since 442 | the runtime is dominated by this matrix-vector product. 
443 | */ 444 | 445 | original_data.resize(N); 446 | for (unsigned s = 0; s < N; ++s) 447 | { 448 | // Add 8 bytes padding to simplify tester 449 | original_data[s].resize(fileSizeBytes + 8); 450 | 451 | // Fill the data with random bytes 452 | for (unsigned r = 0; r < i; r += 8) 453 | { 454 | uint64_t w; 455 | if (prng.Next() % 100 <= 3) { 456 | w = ~(uint64_t)0; 457 | } 458 | else { 459 | w = prng.Next(); 460 | } 461 | fp61::WriteU64_LE(&original_data[s][r], w); 462 | } 463 | } 464 | 465 | const unsigned maxRecoveryBytes = GetRecoveryBytes(fileSizeBytes); 466 | recovery_data.resize(maxRecoveryBytes); 467 | 468 | { 469 | uint64_t t0 = GetTimeUsec(); 470 | 471 | unsigned recoveryBytes = Encode(original_data, N, fileSizeBytes, k, &recovery_data[0]); 472 | 473 | uint64_t t1 = GetTimeUsec(); 474 | 475 | sizeSum += recoveryBytes; 476 | timeSum += t1 - t0; 477 | } 478 | 479 | #ifdef FP61_ENABLE_GF256_COMPARE 480 | { 481 | uint64_t t0 = GetTimeUsec(); 482 | 483 | EncodeGF256(original_data, N, fileSizeBytes, k, &recovery_data[0]); 484 | 485 | uint64_t t1 = GetTimeUsec(); 486 | 487 | timeSum_gf256 += t1 - t0; 488 | } 489 | #endif // FP61_ENABLE_GF256_COMPARE 490 | } 491 | 492 | #ifdef FP61_ENABLE_GF256_COMPARE 493 | cout << " gf256_MBPS=" << (uint64_t)fileSizeBytes * N * kTrials / timeSum_gf256; 494 | #endif // FP61_ENABLE_GF256_COMPARE 495 | cout << " Fp61_MBPS=" << (uint64_t)fileSizeBytes * N * kTrials / timeSum; 496 | cout << " Fp61_OutputBytes=" << sizeSum / (float)kTrials; 497 | cout << endl; 498 | } 499 | } 500 | } 501 | 502 | 503 | //------------------------------------------------------------------------------ 504 | // Entrypoint 505 | 506 | int main() 507 | { 508 | cout << "Benchmarks for Fp61 erasure codes. Before running the benchmarks please run the tests to make sure everything's working on your PC. It's going to run quite a bit faster with 64-bit builds because it takes advantage of the speed of 64-bit multiplications." 
<< endl; 509 | cout << endl; 510 | 511 | gf256_init(); 512 | 513 | RunBenchmarks(); 514 | 515 | cout << endl; 516 | return 0; 517 | } 518 | -------------------------------------------------------------------------------- /tests/gf256.cpp: -------------------------------------------------------------------------------- 1 | /** \file 2 | \brief GF(256) Main C API Source 3 | \copyright Copyright (c) 2017 Christopher A. Taylor. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of GF256 nor the names of its contributors may be 14 | used to endorse or promote products derived from this software without 15 | specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | #include "gf256.h" 31 | 32 | #ifdef LINUX_ARM 33 | #include 34 | #include 35 | #include 36 | #include 37 | #endif 38 | 39 | //------------------------------------------------------------------------------ 40 | // Workaround for ARMv7 that doesn't provide vqtbl1_* 41 | // This comes from linux-raid (https://www.spinics.net/lists/raid/msg58403.html) 42 | // 43 | #ifdef GF256_TRY_NEON 44 | #if __ARM_ARCH <= 7 && !defined(__aarch64__) 45 | static GF256_FORCE_INLINE uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) 46 | { 47 | union { 48 | uint8x16_t val; 49 | uint8x8x2_t pair; 50 | } __a = { a }; 51 | 52 | return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)), 53 | vtbl2_u8(__a.pair, vget_high_u8(b))); 54 | } 55 | #endif 56 | #endif 57 | 58 | //------------------------------------------------------------------------------ 59 | // Self-Test 60 | // 61 | // This is executed during initialization to make sure the library is working 62 | 63 | static const unsigned kTestBufferBytes = 32 + 16 + 8 + 4 + 2 + 1; 64 | static const unsigned kTestBufferAllocated = 64; 65 | struct SelfTestBuffersT 66 | { 67 | GF256_ALIGNED uint8_t A[kTestBufferAllocated]; 68 | GF256_ALIGNED uint8_t B[kTestBufferAllocated]; 69 | GF256_ALIGNED uint8_t C[kTestBufferAllocated]; 70 | }; 71 | static GF256_ALIGNED SelfTestBuffersT m_SelfTestBuffers; 72 | 73 | static bool gf256_self_test() 74 | { 75 | if ((uintptr_t)m_SelfTestBuffers.A % GF256_ALIGN_BYTES != 0) 76 | return false; 77 | if ((uintptr_t)m_SelfTestBuffers.A % GF256_ALIGN_BYTES != 0) 78 | return false; 79 | if ((uintptr_t)m_SelfTestBuffers.B % GF256_ALIGN_BYTES != 0) 80 | return false; 81 | if ((uintptr_t)m_SelfTestBuffers.C % GF256_ALIGN_BYTES != 0) 82 | return false; 83 | 84 | // Check multiplication/division 85 | for (unsigned i = 0; i < 256; ++i) 86 | { 87 | for (unsigned j = 0; j < 256; ++j) 88 | { 89 | uint8_t prod = gf256_mul((uint8_t)i, (uint8_t)j); 90 | if (i != 0 && j != 0) 91 | { 92 | uint8_t div1 = gf256_div(prod, 
(uint8_t)i); 93 | if (div1 != j) 94 | return false; 95 | uint8_t div2 = gf256_div(prod, (uint8_t)j); 96 | if (div2 != i) 97 | return false; 98 | } 99 | else if (prod != 0) 100 | return false; 101 | if (j == 1 && prod != i) 102 | return false; 103 | } 104 | } 105 | 106 | // Check for overruns 107 | m_SelfTestBuffers.A[kTestBufferBytes] = 0x5a; 108 | m_SelfTestBuffers.B[kTestBufferBytes] = 0x5a; 109 | m_SelfTestBuffers.C[kTestBufferBytes] = 0x5a; 110 | 111 | // Test gf256_add_mem() 112 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 113 | { 114 | m_SelfTestBuffers.A[i] = 0x1f; 115 | m_SelfTestBuffers.B[i] = 0xf7; 116 | } 117 | gf256_add_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, kTestBufferBytes); 118 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 119 | if (m_SelfTestBuffers.A[i] != (0x1f ^ 0xf7)) 120 | return false; 121 | 122 | // Test gf256_add2_mem() 123 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 124 | { 125 | m_SelfTestBuffers.A[i] = 0x1f; 126 | m_SelfTestBuffers.B[i] = 0xf7; 127 | m_SelfTestBuffers.C[i] = 0x71; 128 | } 129 | gf256_add2_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, m_SelfTestBuffers.C, kTestBufferBytes); 130 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 131 | if (m_SelfTestBuffers.A[i] != (0x1f ^ 0xf7 ^ 0x71)) 132 | return false; 133 | 134 | // Test gf256_addset_mem() 135 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 136 | { 137 | m_SelfTestBuffers.A[i] = 0x55; 138 | m_SelfTestBuffers.B[i] = 0xaa; 139 | m_SelfTestBuffers.C[i] = 0x6c; 140 | } 141 | gf256_addset_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, m_SelfTestBuffers.C, kTestBufferBytes); 142 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 143 | if (m_SelfTestBuffers.A[i] != (0xaa ^ 0x6c)) 144 | return false; 145 | 146 | // Test gf256_muladd_mem() 147 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 148 | { 149 | m_SelfTestBuffers.A[i] = 0xff; 150 | m_SelfTestBuffers.B[i] = 0xaa; 151 | } 152 | const uint8_t expectedMulAdd = gf256_mul(0xaa, 0x6c); 153 | 
gf256_muladd_mem(m_SelfTestBuffers.A, 0x6c, m_SelfTestBuffers.B, kTestBufferBytes); 154 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 155 | if (m_SelfTestBuffers.A[i] != (expectedMulAdd ^ 0xff)) 156 | return false; 157 | 158 | // Test gf256_mul_mem() 159 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 160 | { 161 | m_SelfTestBuffers.A[i] = 0xff; 162 | m_SelfTestBuffers.B[i] = 0x55; 163 | } 164 | const uint8_t expectedMul = gf256_mul(0xa2, 0x55); 165 | gf256_mul_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, 0xa2, kTestBufferBytes); 166 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 167 | if (m_SelfTestBuffers.A[i] != expectedMul) 168 | return false; 169 | 170 | if (m_SelfTestBuffers.A[kTestBufferBytes] != 0x5a) 171 | return false; 172 | if (m_SelfTestBuffers.B[kTestBufferBytes] != 0x5a) 173 | return false; 174 | if (m_SelfTestBuffers.C[kTestBufferBytes] != 0x5a) 175 | return false; 176 | 177 | return true; 178 | } 179 | 180 | 181 | //------------------------------------------------------------------------------ 182 | // Runtime CPU Architecture Check 183 | // 184 | // Feature checks stolen shamelessly from 185 | // https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c 186 | 187 | #if defined(HAVE_ANDROID_GETCPUFEATURES) 188 | #include 189 | #endif 190 | 191 | #if defined(GF256_TRY_NEON) 192 | # if defined(IOS) && defined(__ARM_NEON__) 193 | // Requires iPhone 5S or newer 194 | static const bool CpuHasNeon = true; 195 | static const bool CpuHasNeon64 = true; 196 | # else // ANDROID or LINUX_ARM 197 | # if defined(__aarch64__) 198 | static bool CpuHasNeon = true; // if AARCH64, then we have NEON for sure... 199 | static bool CpuHasNeon64 = true; // And we have ASIMD 200 | # else 201 | static bool CpuHasNeon = false; // if not, then we have to check at runtime. 
202 | static bool CpuHasNeon64 = false; // And we don't have ASIMD 203 | # endif 204 | # endif 205 | #endif 206 | 207 | #if !defined(GF256_TARGET_MOBILE) 208 | 209 | #ifdef _MSC_VER 210 | #include // __cpuid 211 | #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX 212 | #endif 213 | 214 | #ifdef GF256_TRY_AVX2 215 | static bool CpuHasAVX2 = false; 216 | #endif 217 | static bool CpuHasSSSE3 = false; 218 | 219 | #define CPUID_EBX_AVX2 0x00000020 220 | #define CPUID_ECX_SSSE3 0x00000200 221 | 222 | static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) 223 | { 224 | #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) 225 | __cpuid((int *) cpu_info, cpu_info_type); 226 | #else //if defined(HAVE_CPUID) 227 | cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; 228 | # ifdef __i386__ 229 | __asm__ __volatile__ ("pushfl; pushfl; " 230 | "popl %0; " 231 | "movl %0, %1; xorl %2, %0; " 232 | "pushl %0; " 233 | "popfl; pushfl; popl %0; popfl" : 234 | "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : 235 | "i" (0x200000)); 236 | if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { 237 | return; /* LCOV_EXCL_LINE */ 238 | } 239 | # endif 240 | # ifdef __i386__ 241 | __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : 242 | "=a" (cpu_info[0]), "=&r" (cpu_info[1]), 243 | "=c" (cpu_info[2]), "=d" (cpu_info[3]) : 244 | "0" (cpu_info_type), "2" (0U)); 245 | # elif defined(__x86_64__) 246 | __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : 247 | "=a" (cpu_info[0]), "=&r" (cpu_info[1]), 248 | "=c" (cpu_info[2]), "=d" (cpu_info[3]) : 249 | "0" (cpu_info_type), "2" (0U)); 250 | # else 251 | __asm__ __volatile__ ("cpuid" : 252 | "=a" (cpu_info[0]), "=b" (cpu_info[1]), 253 | "=c" (cpu_info[2]), "=d" (cpu_info[3]) : 254 | "0" (cpu_info_type), "2" (0U)); 255 | # endif 256 | #endif 257 | } 258 | 259 | #else 260 | #if defined(LINUX_ARM) 261 | static void 
checkLinuxARMNeonCapabilities( bool& cpuHasNeon ) 262 | { 263 | auto cpufile = open("/proc/self/auxv", O_RDONLY); 264 | Elf32_auxv_t auxv; 265 | if (cpufile >= 0) 266 | { 267 | const auto size_auxv_t = sizeof(Elf32_auxv_t); 268 | while (read(cpufile, &auxv, size_auxv_t) == size_auxv_t) 269 | { 270 | if (auxv.a_type == AT_HWCAP) 271 | { 272 | cpuHasNeon = (auxv.a_un.a_val & 4096) != 0; 273 | break; 274 | } 275 | } 276 | close(cpufile); 277 | } 278 | else 279 | { 280 | cpuHasNeon = false; 281 | } 282 | } 283 | #endif 284 | #endif // defined(GF256_TARGET_MOBILE) 285 | 286 | static void gf256_architecture_init() 287 | { 288 | #if defined(GF256_TRY_NEON) 289 | 290 | // Check for NEON support on Android platform 291 | #if defined(HAVE_ANDROID_GETCPUFEATURES) 292 | AndroidCpuFamily family = android_getCpuFamily(); 293 | if (family == ANDROID_CPU_FAMILY_ARM) 294 | { 295 | if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) 296 | CpuHasNeon = true; 297 | } 298 | else if (family == ANDROID_CPU_FAMILY_ARM64) 299 | { 300 | CpuHasNeon = true; 301 | if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) 302 | CpuHasNeon64 = true; 303 | } 304 | #endif 305 | 306 | #if defined(LINUX_ARM) 307 | // Check for NEON support on other ARM/Linux platforms 308 | checkLinuxARMNeonCapabilities(CpuHasNeon); 309 | #endif 310 | 311 | #endif //GF256_TRY_NEON 312 | 313 | #if !defined(GF256_TARGET_MOBILE) 314 | unsigned int cpu_info[4]; 315 | 316 | _cpuid(cpu_info, 1); 317 | CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); 318 | 319 | #if defined(GF256_TRY_AVX2) 320 | _cpuid(cpu_info, 7); 321 | CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); 322 | #endif // GF256_TRY_AVX2 323 | 324 | // When AVX2 and SSSE3 are unavailable, Siamese takes 4x longer to decode 325 | // and 2.6x longer to encode. Encoding requires a lot more simple XOR ops 326 | // so it is still pretty fast. 
Decoding is usually really quick because 327 | // average loss rates are low, but when needed it requires a lot more 328 | // GF multiplies requiring table lookups which is slower. 329 | 330 | #endif // GF256_TARGET_MOBILE 331 | } 332 | 333 | 334 | //------------------------------------------------------------------------------ 335 | // Context Object 336 | 337 | // Context object for GF(2^^8) math 338 | GF256_ALIGNED gf256_ctx GF256Ctx; 339 | static bool Initialized = false; 340 | 341 | 342 | //------------------------------------------------------------------------------ 343 | // Generator Polynomial 344 | 345 | // There are only 16 irreducible polynomials for GF(2^^8) 346 | static const int GF256_GEN_POLY_COUNT = 16; 347 | static const uint8_t GF256_GEN_POLY[GF256_GEN_POLY_COUNT] = { 348 | 0x8e, 0x95, 0x96, 0xa6, 0xaf, 0xb1, 0xb2, 0xb4, 349 | 0xb8, 0xc3, 0xc6, 0xd4, 0xe1, 0xe7, 0xf3, 0xfa 350 | }; 351 | 352 | static const int kDefaultPolynomialIndex = 3; 353 | 354 | // Select which polynomial to use 355 | static void gf256_poly_init(int polynomialIndex) 356 | { 357 | if (polynomialIndex < 0 || polynomialIndex >= GF256_GEN_POLY_COUNT) 358 | polynomialIndex = kDefaultPolynomialIndex; 359 | 360 | GF256Ctx.Polynomial = (GF256_GEN_POLY[polynomialIndex] << 1) | 1; 361 | } 362 | 363 | 364 | //------------------------------------------------------------------------------ 365 | // Exponential and Log Tables 366 | 367 | // Construct EXP and LOG tables from polynomial 368 | static void gf256_explog_init() 369 | { 370 | unsigned poly = GF256Ctx.Polynomial; 371 | uint8_t* exptab = GF256Ctx.GF256_EXP_TABLE; 372 | uint16_t* logtab = GF256Ctx.GF256_LOG_TABLE; 373 | 374 | logtab[0] = 512; 375 | exptab[0] = 1; 376 | for (unsigned jj = 1; jj < 255; ++jj) 377 | { 378 | unsigned next = (unsigned)exptab[jj - 1] * 2; 379 | if (next >= 256) 380 | next ^= poly; 381 | 382 | exptab[jj] = static_cast( next ); 383 | logtab[exptab[jj]] = static_cast( jj ); 384 | } 385 | exptab[255] = 
exptab[0]; 386 | logtab[exptab[255]] = 255; 387 | for (unsigned jj = 256; jj < 2 * 255; ++jj) 388 | exptab[jj] = exptab[jj % 255]; 389 | exptab[2 * 255] = 1; 390 | for (unsigned jj = 2 * 255 + 1; jj < 4 * 255; ++jj) 391 | exptab[jj] = 0; 392 | } 393 | 394 | 395 | //------------------------------------------------------------------------------ 396 | // Multiply and Divide Tables 397 | 398 | // Initialize MUL and DIV tables using LOG and EXP tables 399 | static void gf256_muldiv_init() 400 | { 401 | // Allocate table memory 65KB x 2 402 | uint8_t* m = GF256Ctx.GF256_MUL_TABLE; 403 | uint8_t* d = GF256Ctx.GF256_DIV_TABLE; 404 | 405 | // Unroll y = 0 subtable 406 | for (int x = 0; x < 256; ++x) 407 | m[x] = d[x] = 0; 408 | 409 | // For each other y value: 410 | for (int y = 1; y < 256; ++y) 411 | { 412 | // Calculate log(y) for mult and 255 - log(y) for div 413 | const uint8_t log_y = static_cast(GF256Ctx.GF256_LOG_TABLE[y]); 414 | const uint8_t log_yn = 255 - log_y; 415 | 416 | // Next subtable 417 | m += 256, d += 256; 418 | 419 | // Unroll x = 0 420 | m[0] = 0, d[0] = 0; 421 | 422 | // Calculate x * y, x / y 423 | for (int x = 1; x < 256; ++x) 424 | { 425 | uint16_t log_x = GF256Ctx.GF256_LOG_TABLE[x]; 426 | 427 | m[x] = GF256Ctx.GF256_EXP_TABLE[log_x + log_y]; 428 | d[x] = GF256Ctx.GF256_EXP_TABLE[log_x + log_yn]; 429 | } 430 | } 431 | } 432 | 433 | 434 | //------------------------------------------------------------------------------ 435 | // Inverse Table 436 | 437 | // Initialize INV table using DIV table 438 | static void gf256_inv_init() 439 | { 440 | for (int x = 0; x < 256; ++x) 441 | GF256Ctx.GF256_INV_TABLE[x] = gf256_div(1, static_cast(x)); 442 | } 443 | 444 | 445 | //------------------------------------------------------------------------------ 446 | // Square Table 447 | 448 | // Initialize SQR table using MUL table 449 | static void gf256_sqr_init() 450 | { 451 | for (int x = 0; x < 256; ++x) 452 | GF256Ctx.GF256_SQR_TABLE[x] = 
gf256_mul(static_cast(x), static_cast(x)); 453 | } 454 | 455 | 456 | //------------------------------------------------------------------------------ 457 | // Multiply and Add Memory Tables 458 | 459 | /* 460 | Fast algorithm to compute m[1..8] = a[1..8] * b in GF(256) 461 | using SSE3 SIMD instruction set: 462 | 463 | Consider z = x * y in GF(256). 464 | This operation can be performed bit-by-bit. Usefully, the partial product 465 | of each bit is combined linearly with the rest. This means that the 8-bit 466 | number x can be split into its high and low 4 bits, and partial products 467 | can be formed from each half. Then the halves can be linearly combined: 468 | 469 | z = x[0..3] * y + x[4..7] * y 470 | 471 | The multiplication of each half can be done efficiently via table lookups, 472 | and the addition in GF(256) is XOR. There must be two tables that map 16 473 | input elements for the low or high 4 bits of x to the two partial products. 474 | Each value for y has a different set of two tables: 475 | 476 | z = TABLE_LO_y(x[0..3]) xor TABLE_HI_y(x[4..7]) 477 | 478 | This means that we need 16 * 2 * 256 = 8192 bytes for precomputed tables. 479 | 480 | Computing z[] = x[] * y can be performed 16 bytes at a time by using the 481 | 128-bit register operations supported by modern processors. 482 | 483 | This is efficiently realized in SSE3 using the _mm_shuffle_epi8() function 484 | provided by Visual Studio 2010 or newer in . This function 485 | uses the low bits to do a table lookup on each byte. 
Unfortunately the 486 | high bit of each mask byte has the special feature that it clears the 487 | output byte when it is set, so we need to make sure it's cleared by masking 488 | off the high bit of each byte before using it: 489 | 490 | clr_mask = _mm_set1_epi8(0x0f) = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 491 | 492 | For the low half of the partial product, clear the high bit of each byte 493 | and perform the table lookup: 494 | 495 | p_lo = _mm_and_si128(x, clr_mask) 496 | p_lo = _mm_shuffle_epi8(p_lo, TABLE_LO_y) 497 | 498 | For the high half of the partial product, shift the high 4 bits of each 499 | byte into the low 4 bits and clear the high bit of each byte, and then 500 | perform the table lookup: 501 | 502 | p_hi = _mm_srli_epi64(x, 4) 503 | p_hi = _mm_and_si128(p_hi, clr_mask) 504 | p_hi = _mm_shuffle_epi8(p_hi, TABLE_HI_y) 505 | 506 | Finally add the two partial products to form the product, recalling that 507 | addition is XOR in a Galois field: 508 | 509 | result = _mm_xor_si128(p_lo, p_hi) 510 | 511 | This crunches 16 bytes of x at a time, and the result can be stored in z. 512 | */ 513 | 514 | /* 515 | Intrinsic reference: 516 | 517 | SSE3, VS2010+, tmmintrin.h: 518 | 519 | GF256_M128 _mm_shuffle_epi8(GF256_M128 a, GF256_M128 mask); 520 | Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3) instruction pshufb. This instruction shuffles 16-byte parameters from a 128-bit parameter. 521 | 522 | Pseudo-code for PSHUFB (with 128 bit operands): 523 | 524 | for i = 0 to 15 { 525 | if (SRC[(i * 8)+7] = 1 ) then 526 | DEST[(i*8)+7..(i*8)+0] <- 0; 527 | else 528 | index[3..0] <- SRC[(i*8)+3 .. (i*8)+0]; 529 | DEST[(i*8)+7..(i*8)+0] <- DEST[(index*8+7)..(index*8+0)]; 530 | endif 531 | } 532 | 533 | SSE2, VS2008+, emmintrin.h: 534 | 535 | GF256_M128 _mm_slli_epi64 (GF256_M128 a, int count); 536 | Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while shifting in zeros. 
537 | GF256_M128 _mm_srli_epi64 (GF256_M128 a, int count); 538 | Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. 539 | GF256_M128 _mm_set1_epi8 (char b); 540 | Sets the 16 signed 8-bit integer values to b. 541 | GF256_M128 _mm_and_si128 (GF256_M128 a, GF256_M128 b); 542 | Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. 543 | GF256_M128 _mm_xor_si128 ( GF256_M128 a, GF256_M128 b); 544 | Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. 545 | */ 546 | 547 | // Initialize the multiplication tables using gf256_mul() 548 | static void gf256_mul_mem_init() 549 | { 550 | // Reuse aligned self test buffers to load table data 551 | uint8_t* lo = m_SelfTestBuffers.A; 552 | uint8_t* hi = m_SelfTestBuffers.B; 553 | 554 | for (int y = 0; y < 256; ++y) 555 | { 556 | // TABLE_LO_Y maps 0..15 to 8-bit partial product based on y. 557 | for (unsigned char x = 0; x < 16; ++x) 558 | { 559 | lo[x] = gf256_mul(x, static_cast( y )); 560 | hi[x] = gf256_mul(x << 4, static_cast( y )); 561 | } 562 | 563 | #if defined(GF256_TRY_NEON) 564 | if (CpuHasNeon) 565 | { 566 | GF256Ctx.MM128.TABLE_LO_Y[y] = vld1q_u8(lo); 567 | GF256Ctx.MM128.TABLE_HI_Y[y] = vld1q_u8(hi); 568 | } 569 | #elif !defined(GF256_TARGET_MOBILE) 570 | const GF256_M128 table_lo = _mm_loadu_si128((GF256_M128*)lo); 571 | const GF256_M128 table_hi = _mm_loadu_si128((GF256_M128*)hi); 572 | _mm_storeu_si128(GF256Ctx.MM128.TABLE_LO_Y + y, table_lo); 573 | _mm_storeu_si128(GF256Ctx.MM128.TABLE_HI_Y + y, table_hi); 574 | # ifdef GF256_TRY_AVX2 575 | if (CpuHasAVX2) 576 | { 577 | const GF256_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo); 578 | const GF256_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi); 579 | _mm256_storeu_si256(GF256Ctx.MM256.TABLE_LO_Y + y, table_lo2); 580 | _mm256_storeu_si256(GF256Ctx.MM256.TABLE_HI_Y + y, table_hi2); 581 | } 582 | # endif // GF256_TRY_AVX2 583 | #endif // 
GF256_TARGET_MOBILE 584 | } 585 | } 586 | 587 | 588 | //------------------------------------------------------------------------------ 589 | // Initialization 590 | 591 | static unsigned char kLittleEndianTestData[4] = { 4, 3, 2, 1 }; 592 | 593 | union UnionType 594 | { 595 | uint32_t IntValue; 596 | char CharArray[4]; 597 | }; 598 | 599 | static bool IsLittleEndian() 600 | { 601 | UnionType type; 602 | for (unsigned i = 0; i < 4; ++i) 603 | type.CharArray[i] = kLittleEndianTestData[i]; 604 | return 0x01020304 == type.IntValue; 605 | } 606 | 607 | extern "C" int gf256_init_(int version) 608 | { 609 | if (version != GF256_VERSION) 610 | return -1; // User's header does not match library version. 611 | 612 | // Avoid multiple initialization 613 | if (Initialized) 614 | return 0; 615 | Initialized = true; 616 | 617 | if (!IsLittleEndian()) 618 | return -2; // Architecture is not supported (code won't work without mods). 619 | 620 | gf256_architecture_init(); 621 | gf256_poly_init(kDefaultPolynomialIndex); 622 | gf256_explog_init(); 623 | gf256_muldiv_init(); 624 | gf256_inv_init(); 625 | gf256_sqr_init(); 626 | gf256_mul_mem_init(); 627 | 628 | if (!gf256_self_test()) 629 | return -3; // Self-test failed (perhaps untested configuration) 630 | 631 | return 0; 632 | } 633 | 634 | 635 | //------------------------------------------------------------------------------ 636 | // Operations 637 | 638 | extern "C" void gf256_add_mem(void * GF256_RESTRICT vx, 639 | const void * GF256_RESTRICT vy, int bytes) 640 | { 641 | GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 642 | const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 643 | 644 | #if defined(GF256_TARGET_MOBILE) 645 | # if defined(GF256_TRY_NEON) 646 | // Handle multiples of 64 bytes 647 | if (CpuHasNeon) 648 | { 649 | while (bytes >= 64) 650 | { 651 | GF256_M128 x0 = vld1q_u8((uint8_t*) x16); 652 | GF256_M128 x1 = vld1q_u8((uint8_t*)(x16 + 1) ); 653 | GF256_M128 x2 = vld1q_u8((uint8_t*)(x16 + 2) ); 654 
| GF256_M128 x3 = vld1q_u8((uint8_t*)(x16 + 3) ); 655 | GF256_M128 y0 = vld1q_u8((uint8_t*)y16); 656 | GF256_M128 y1 = vld1q_u8((uint8_t*)(y16 + 1)); 657 | GF256_M128 y2 = vld1q_u8((uint8_t*)(y16 + 2)); 658 | GF256_M128 y3 = vld1q_u8((uint8_t*)(y16 + 3)); 659 | 660 | vst1q_u8((uint8_t*)x16, veorq_u8(x0, y0)); 661 | vst1q_u8((uint8_t*)(x16 + 1), veorq_u8(x1, y1)); 662 | vst1q_u8((uint8_t*)(x16 + 2), veorq_u8(x2, y2)); 663 | vst1q_u8((uint8_t*)(x16 + 3), veorq_u8(x3, y3)); 664 | 665 | bytes -= 64, x16 += 4, y16 += 4; 666 | } 667 | 668 | // Handle multiples of 16 bytes 669 | while (bytes >= 16) 670 | { 671 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 672 | GF256_M128 y0 = vld1q_u8((uint8_t*)y16); 673 | 674 | vst1q_u8((uint8_t*)x16, veorq_u8(x0, y0)); 675 | 676 | bytes -= 16, ++x16, ++y16; 677 | } 678 | } 679 | else 680 | # endif // GF256_TRY_NEON 681 | { 682 | uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x16); 683 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y16); 684 | 685 | const unsigned count = (unsigned)bytes / 8; 686 | for (unsigned ii = 0; ii < count; ++ii) 687 | x8[ii] ^= y8[ii]; 688 | 689 | x16 = reinterpret_cast(x8 + count); 690 | y16 = reinterpret_cast(y8 + count); 691 | 692 | bytes -= (count * 8); 693 | } 694 | #else // GF256_TARGET_MOBILE 695 | # if defined(GF256_TRY_AVX2) 696 | if (CpuHasAVX2) 697 | { 698 | GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 699 | const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast(y16); 700 | 701 | while (bytes >= 128) 702 | { 703 | GF256_M256 x0 = _mm256_loadu_si256(x32); 704 | GF256_M256 y0 = _mm256_loadu_si256(y32); 705 | x0 = _mm256_xor_si256(x0, y0); 706 | GF256_M256 x1 = _mm256_loadu_si256(x32 + 1); 707 | GF256_M256 y1 = _mm256_loadu_si256(y32 + 1); 708 | x1 = _mm256_xor_si256(x1, y1); 709 | GF256_M256 x2 = _mm256_loadu_si256(x32 + 2); 710 | GF256_M256 y2 = _mm256_loadu_si256(y32 + 2); 711 | x2 = _mm256_xor_si256(x2, y2); 712 | GF256_M256 x3 = _mm256_loadu_si256(x32 + 3); 713 | GF256_M256 y3 
= _mm256_loadu_si256(y32 + 3); 714 | x3 = _mm256_xor_si256(x3, y3); 715 | 716 | _mm256_storeu_si256(x32, x0); 717 | _mm256_storeu_si256(x32 + 1, x1); 718 | _mm256_storeu_si256(x32 + 2, x2); 719 | _mm256_storeu_si256(x32 + 3, x3); 720 | 721 | bytes -= 128, x32 += 4, y32 += 4; 722 | } 723 | 724 | // Handle multiples of 32 bytes 725 | while (bytes >= 32) 726 | { 727 | // x[i] = x[i] xor y[i] 728 | _mm256_storeu_si256(x32, 729 | _mm256_xor_si256( 730 | _mm256_loadu_si256(x32), 731 | _mm256_loadu_si256(y32))); 732 | 733 | bytes -= 32, ++x32, ++y32; 734 | } 735 | 736 | x16 = reinterpret_cast(x32); 737 | y16 = reinterpret_cast(y32); 738 | } 739 | else 740 | # endif // GF256_TRY_AVX2 741 | { 742 | while (bytes >= 64) 743 | { 744 | GF256_M128 x0 = _mm_loadu_si128(x16); 745 | GF256_M128 y0 = _mm_loadu_si128(y16); 746 | x0 = _mm_xor_si128(x0, y0); 747 | GF256_M128 x1 = _mm_loadu_si128(x16 + 1); 748 | GF256_M128 y1 = _mm_loadu_si128(y16 + 1); 749 | x1 = _mm_xor_si128(x1, y1); 750 | GF256_M128 x2 = _mm_loadu_si128(x16 + 2); 751 | GF256_M128 y2 = _mm_loadu_si128(y16 + 2); 752 | x2 = _mm_xor_si128(x2, y2); 753 | GF256_M128 x3 = _mm_loadu_si128(x16 + 3); 754 | GF256_M128 y3 = _mm_loadu_si128(y16 + 3); 755 | x3 = _mm_xor_si128(x3, y3); 756 | 757 | _mm_storeu_si128(x16, x0); 758 | _mm_storeu_si128(x16 + 1, x1); 759 | _mm_storeu_si128(x16 + 2, x2); 760 | _mm_storeu_si128(x16 + 3, x3); 761 | 762 | bytes -= 64, x16 += 4, y16 += 4; 763 | } 764 | } 765 | #endif // GF256_TARGET_MOBILE 766 | 767 | #if !defined(GF256_TARGET_MOBILE) 768 | // Handle multiples of 16 bytes 769 | while (bytes >= 16) 770 | { 771 | // x[i] = x[i] xor y[i] 772 | _mm_storeu_si128(x16, 773 | _mm_xor_si128( 774 | _mm_loadu_si128(x16), 775 | _mm_loadu_si128(y16))); 776 | 777 | bytes -= 16, ++x16, ++y16; 778 | } 779 | #endif 780 | 781 | uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 782 | const uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 783 | 784 | // Handle a block of 8 bytes 785 | const int eight = 
bytes & 8; 786 | if (eight) 787 | { 788 | uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 789 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 790 | *x8 ^= *y8; 791 | } 792 | 793 | // Handle a block of 4 bytes 794 | const int four = bytes & 4; 795 | if (four) 796 | { 797 | uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 798 | const uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 799 | *x4 ^= *y4; 800 | } 801 | 802 | // Handle final bytes 803 | const int offset = eight + four; 804 | switch (bytes & 3) 805 | { 806 | case 3: x1[offset + 2] ^= y1[offset + 2]; 807 | case 2: x1[offset + 1] ^= y1[offset + 1]; 808 | case 1: x1[offset] ^= y1[offset]; 809 | default: 810 | break; 811 | } 812 | } 813 | 814 | extern "C" void gf256_add2_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 815 | const void * GF256_RESTRICT vy, int bytes) 816 | { 817 | GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast(vz); 818 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 819 | const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 820 | 821 | #if defined(GF256_TARGET_MOBILE) 822 | # if defined(GF256_TRY_NEON) 823 | // Handle multiples of 64 bytes 824 | if (CpuHasNeon) 825 | { 826 | // Handle multiples of 16 bytes 827 | while (bytes >= 16) 828 | { 829 | // z[i] = z[i] xor x[i] xor y[i] 830 | vst1q_u8((uint8_t*)z16, 831 | veorq_u8( 832 | vld1q_u8((uint8_t*)z16), 833 | veorq_u8( 834 | vld1q_u8((uint8_t*)x16), 835 | vld1q_u8((uint8_t*)y16)))); 836 | 837 | bytes -= 16, ++x16, ++y16, ++z16; 838 | } 839 | } 840 | else 841 | # endif // GF256_TRY_NEON 842 | { 843 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z16); 844 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x16); 845 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y16); 846 | 847 | const unsigned count = (unsigned)bytes / 8; 848 | for (unsigned ii = 0; ii < count; ++ii) 849 | z8[ii] ^= x8[ii] ^ y8[ii]; 850 | 851 | z16 = reinterpret_cast(z8 + count); 
852 | x16 = reinterpret_cast(x8 + count); 853 | y16 = reinterpret_cast(y8 + count); 854 | } 855 | #else // GF256_TARGET_MOBILE 856 | # if defined(GF256_TRY_AVX2) 857 | if (CpuHasAVX2) 858 | { 859 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(z16); 860 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 861 | const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast(y16); 862 | 863 | const unsigned count = bytes / 32; 864 | for (unsigned i = 0; i < count; ++i) 865 | { 866 | _mm256_storeu_si256(z32 + i, 867 | _mm256_xor_si256( 868 | _mm256_loadu_si256(z32 + i), 869 | _mm256_xor_si256( 870 | _mm256_loadu_si256(x32 + i), 871 | _mm256_loadu_si256(y32 + i)))); 872 | } 873 | 874 | bytes -= count * 32; 875 | z16 = reinterpret_cast(z32 + count); 876 | x16 = reinterpret_cast(x32 + count); 877 | y16 = reinterpret_cast(y32 + count); 878 | } 879 | # endif // GF256_TRY_AVX2 880 | 881 | // Handle multiples of 16 bytes 882 | while (bytes >= 16) 883 | { 884 | // z[i] = z[i] xor x[i] xor y[i] 885 | _mm_storeu_si128(z16, 886 | _mm_xor_si128( 887 | _mm_loadu_si128(z16), 888 | _mm_xor_si128( 889 | _mm_loadu_si128(x16), 890 | _mm_loadu_si128(y16)))); 891 | 892 | bytes -= 16, ++x16, ++y16, ++z16; 893 | } 894 | #endif // GF256_TARGET_MOBILE 895 | 896 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 897 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 898 | const uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 899 | 900 | // Handle a block of 8 bytes 901 | const int eight = bytes & 8; 902 | if (eight) 903 | { 904 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 905 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 906 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 907 | *z8 ^= *x8 ^ *y8; 908 | } 909 | 910 | // Handle a block of 4 bytes 911 | const int four = bytes & 4; 912 | if (four) 913 | { 914 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1 + eight); 915 | const uint32_t * GF256_RESTRICT x4 = 
reinterpret_cast(x1 + eight); 916 | const uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 917 | *z4 ^= *x4 ^ *y4; 918 | } 919 | 920 | // Handle final bytes 921 | const int offset = eight + four; 922 | switch (bytes & 3) 923 | { 924 | case 3: z1[offset + 2] ^= x1[offset + 2] ^ y1[offset + 2]; 925 | case 2: z1[offset + 1] ^= x1[offset + 1] ^ y1[offset + 1]; 926 | case 1: z1[offset] ^= x1[offset] ^ y1[offset]; 927 | default: 928 | break; 929 | } 930 | } 931 | 932 | extern "C" void gf256_addset_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 933 | const void * GF256_RESTRICT vy, int bytes) 934 | { 935 | GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast(vz); 936 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 937 | const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 938 | 939 | #if defined(GF256_TARGET_MOBILE) 940 | # if defined(GF256_TRY_NEON) 941 | // Handle multiples of 64 bytes 942 | if (CpuHasNeon) 943 | { 944 | while (bytes >= 64) 945 | { 946 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 947 | GF256_M128 x1 = vld1q_u8((uint8_t*)(x16 + 1)); 948 | GF256_M128 x2 = vld1q_u8((uint8_t*)(x16 + 2)); 949 | GF256_M128 x3 = vld1q_u8((uint8_t*)(x16 + 3)); 950 | GF256_M128 y0 = vld1q_u8((uint8_t*)(y16)); 951 | GF256_M128 y1 = vld1q_u8((uint8_t*)(y16 + 1)); 952 | GF256_M128 y2 = vld1q_u8((uint8_t*)(y16 + 2)); 953 | GF256_M128 y3 = vld1q_u8((uint8_t*)(y16 + 3)); 954 | 955 | vst1q_u8((uint8_t*)z16, veorq_u8(x0, y0)); 956 | vst1q_u8((uint8_t*)(z16 + 1), veorq_u8(x1, y1)); 957 | vst1q_u8((uint8_t*)(z16 + 2), veorq_u8(x2, y2)); 958 | vst1q_u8((uint8_t*)(z16 + 3), veorq_u8(x3, y3)); 959 | 960 | bytes -= 64, x16 += 4, y16 += 4, z16 += 4; 961 | } 962 | 963 | // Handle multiples of 16 bytes 964 | while (bytes >= 16) 965 | { 966 | // z[i] = x[i] xor y[i] 967 | vst1q_u8((uint8_t*)z16, 968 | veorq_u8( 969 | vld1q_u8((uint8_t*)x16), 970 | vld1q_u8((uint8_t*)y16))); 971 | 972 | bytes -= 16, ++x16, ++y16, ++z16; 973 | } 974 | } 975 | else 976 
| # endif // GF256_TRY_NEON 977 | { 978 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z16); 979 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x16); 980 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y16); 981 | 982 | const unsigned count = (unsigned)bytes / 8; 983 | for (unsigned ii = 0; ii < count; ++ii) 984 | z8[ii] = x8[ii] ^ y8[ii]; 985 | 986 | x16 = reinterpret_cast(x8 + count); 987 | y16 = reinterpret_cast(y8 + count); 988 | z16 = reinterpret_cast(z8 + count); 989 | 990 | bytes -= (count * 8); 991 | } 992 | #else // GF256_TARGET_MOBILE 993 | # if defined(GF256_TRY_AVX2) 994 | if (CpuHasAVX2) 995 | { 996 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(z16); 997 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 998 | const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast(y16); 999 | 1000 | const unsigned count = bytes / 32; 1001 | for (unsigned i = 0; i < count; ++i) 1002 | { 1003 | _mm256_storeu_si256(z32 + i, 1004 | _mm256_xor_si256( 1005 | _mm256_loadu_si256(x32 + i), 1006 | _mm256_loadu_si256(y32 + i))); 1007 | } 1008 | 1009 | bytes -= count * 32; 1010 | z16 = reinterpret_cast(z32 + count); 1011 | x16 = reinterpret_cast(x32 + count); 1012 | y16 = reinterpret_cast(y32 + count); 1013 | } 1014 | else 1015 | # endif // GF256_TRY_AVX2 1016 | { 1017 | // Handle multiples of 64 bytes 1018 | while (bytes >= 64) 1019 | { 1020 | GF256_M128 x0 = _mm_loadu_si128(x16); 1021 | GF256_M128 x1 = _mm_loadu_si128(x16 + 1); 1022 | GF256_M128 x2 = _mm_loadu_si128(x16 + 2); 1023 | GF256_M128 x3 = _mm_loadu_si128(x16 + 3); 1024 | GF256_M128 y0 = _mm_loadu_si128(y16); 1025 | GF256_M128 y1 = _mm_loadu_si128(y16 + 1); 1026 | GF256_M128 y2 = _mm_loadu_si128(y16 + 2); 1027 | GF256_M128 y3 = _mm_loadu_si128(y16 + 3); 1028 | 1029 | _mm_storeu_si128(z16, _mm_xor_si128(x0, y0)); 1030 | _mm_storeu_si128(z16 + 1, _mm_xor_si128(x1, y1)); 1031 | _mm_storeu_si128(z16 + 2, _mm_xor_si128(x2, y2)); 1032 | _mm_storeu_si128(z16 + 3, _mm_xor_si128(x3, 
y3)); 1033 | 1034 | bytes -= 64, x16 += 4, y16 += 4, z16 += 4; 1035 | } 1036 | } 1037 | 1038 | // Handle multiples of 16 bytes 1039 | while (bytes >= 16) 1040 | { 1041 | // z[i] = x[i] xor y[i] 1042 | _mm_storeu_si128(z16, 1043 | _mm_xor_si128( 1044 | _mm_loadu_si128(x16), 1045 | _mm_loadu_si128(y16))); 1046 | 1047 | bytes -= 16, ++x16, ++y16, ++z16; 1048 | } 1049 | #endif // GF256_TARGET_MOBILE 1050 | 1051 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 1052 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1053 | const uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 1054 | 1055 | // Handle a block of 8 bytes 1056 | const int eight = bytes & 8; 1057 | if (eight) 1058 | { 1059 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 1060 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 1061 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 1062 | *z8 = *x8 ^ *y8; 1063 | } 1064 | 1065 | // Handle a block of 4 bytes 1066 | const int four = bytes & 4; 1067 | if (four) 1068 | { 1069 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1 + eight); 1070 | const uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 1071 | const uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 1072 | *z4 = *x4 ^ *y4; 1073 | } 1074 | 1075 | // Handle final bytes 1076 | const int offset = eight + four; 1077 | switch (bytes & 3) 1078 | { 1079 | case 3: z1[offset + 2] = x1[offset + 2] ^ y1[offset + 2]; 1080 | case 2: z1[offset + 1] = x1[offset + 1] ^ y1[offset + 1]; 1081 | case 1: z1[offset] = x1[offset] ^ y1[offset]; 1082 | default: 1083 | break; 1084 | } 1085 | } 1086 | 1087 | extern "C" void gf256_mul_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, uint8_t y, int bytes) 1088 | { 1089 | // Use a single if-statement to handle special cases 1090 | if (y <= 1) 1091 | { 1092 | if (y == 0) 1093 | memset(vz, 0, bytes); 1094 | else if (vz != vx) 1095 | memcpy(vz, vx, bytes); 1096 | return; 1097 | } 1098 | 1099 | GF256_M128 * 
GF256_RESTRICT z16 = reinterpret_cast(vz); 1100 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 1101 | 1102 | #if defined(GF256_TARGET_MOBILE) 1103 | #if defined(GF256_TRY_NEON) 1104 | if (bytes >= 16 && CpuHasNeon) 1105 | { 1106 | // Partial product tables; see above 1107 | const GF256_M128 table_lo_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_LO_Y + y)); 1108 | const GF256_M128 table_hi_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_HI_Y + y)); 1109 | 1110 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1111 | const GF256_M128 clr_mask = vdupq_n_u8(0x0f); 1112 | 1113 | // Handle multiples of 16 bytes 1114 | do 1115 | { 1116 | // See above comments for details 1117 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 1118 | GF256_M128 l0 = vandq_u8(x0, clr_mask); 1119 | x0 = vshrq_n_u8(x0, 4); 1120 | GF256_M128 h0 = vandq_u8(x0, clr_mask); 1121 | l0 = vqtbl1q_u8(table_lo_y, l0); 1122 | h0 = vqtbl1q_u8(table_hi_y, h0); 1123 | vst1q_u8((uint8_t*)z16, veorq_u8(l0, h0)); 1124 | 1125 | bytes -= 16, ++x16, ++z16; 1126 | } while (bytes >= 16); 1127 | } 1128 | #endif 1129 | #else 1130 | # if defined(GF256_TRY_AVX2) 1131 | if (bytes >= 32 && CpuHasAVX2) 1132 | { 1133 | // Partial product tables; see above 1134 | const GF256_M256 table_lo_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_LO_Y + y); 1135 | const GF256_M256 table_hi_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_HI_Y + y); 1136 | 1137 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1138 | const GF256_M256 clr_mask = _mm256_set1_epi8(0x0f); 1139 | 1140 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(vz); 1141 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(vx); 1142 | 1143 | // Handle multiples of 32 bytes 1144 | do 1145 | { 1146 | // See above comments for details 1147 | GF256_M256 x0 = _mm256_loadu_si256(x32); 1148 | GF256_M256 l0 = _mm256_and_si256(x0, clr_mask); 1149 | x0 = _mm256_srli_epi64(x0, 4); 1150 | GF256_M256 h0 = _mm256_and_si256(x0, clr_mask); 1151 | l0 = 
_mm256_shuffle_epi8(table_lo_y, l0); 1152 | h0 = _mm256_shuffle_epi8(table_hi_y, h0); 1153 | _mm256_storeu_si256(z32, _mm256_xor_si256(l0, h0)); 1154 | 1155 | bytes -= 32, ++x32, ++z32; 1156 | } while (bytes >= 32); 1157 | 1158 | z16 = reinterpret_cast(z32); 1159 | x16 = reinterpret_cast(x32); 1160 | } 1161 | # endif // GF256_TRY_AVX2 1162 | if (bytes >= 16 && CpuHasSSSE3) 1163 | { 1164 | // Partial product tables; see above 1165 | const GF256_M128 table_lo_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_LO_Y + y); 1166 | const GF256_M128 table_hi_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_HI_Y + y); 1167 | 1168 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1169 | const GF256_M128 clr_mask = _mm_set1_epi8(0x0f); 1170 | 1171 | // Handle multiples of 16 bytes 1172 | do 1173 | { 1174 | // See above comments for details 1175 | GF256_M128 x0 = _mm_loadu_si128(x16); 1176 | GF256_M128 l0 = _mm_and_si128(x0, clr_mask); 1177 | x0 = _mm_srli_epi64(x0, 4); 1178 | GF256_M128 h0 = _mm_and_si128(x0, clr_mask); 1179 | l0 = _mm_shuffle_epi8(table_lo_y, l0); 1180 | h0 = _mm_shuffle_epi8(table_hi_y, h0); 1181 | _mm_storeu_si128(z16, _mm_xor_si128(l0, h0)); 1182 | 1183 | bytes -= 16, ++x16, ++z16; 1184 | } while (bytes >= 16); 1185 | } 1186 | #endif 1187 | 1188 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 1189 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1190 | const uint8_t * GF256_RESTRICT table = GF256Ctx.GF256_MUL_TABLE + ((unsigned)y << 8); 1191 | 1192 | // Handle blocks of 8 bytes 1193 | while (bytes >= 8) 1194 | { 1195 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 1196 | uint64_t word = table[x1[0]]; 1197 | word |= (uint64_t)table[x1[1]] << 8; 1198 | word |= (uint64_t)table[x1[2]] << 16; 1199 | word |= (uint64_t)table[x1[3]] << 24; 1200 | word |= (uint64_t)table[x1[4]] << 32; 1201 | word |= (uint64_t)table[x1[5]] << 40; 1202 | word |= (uint64_t)table[x1[6]] << 48; 1203 | word |= (uint64_t)table[x1[7]] << 56; 1204 | *z8 = word; 1205 | 1206 | bytes 
-= 8, x1 += 8, z1 += 8; 1207 | } 1208 | 1209 | // Handle a block of 4 bytes 1210 | const int four = bytes & 4; 1211 | if (four) 1212 | { 1213 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1); 1214 | uint32_t word = table[x1[0]]; 1215 | word |= (uint32_t)table[x1[1]] << 8; 1216 | word |= (uint32_t)table[x1[2]] << 16; 1217 | word |= (uint32_t)table[x1[3]] << 24; 1218 | *z4 = word; 1219 | } 1220 | 1221 | // Handle single bytes 1222 | const int offset = four; 1223 | switch (bytes & 3) 1224 | { 1225 | case 3: z1[offset + 2] = table[x1[offset + 2]]; 1226 | case 2: z1[offset + 1] = table[x1[offset + 1]]; 1227 | case 1: z1[offset] = table[x1[offset]]; 1228 | default: 1229 | break; 1230 | } 1231 | } 1232 | 1233 | extern "C" void gf256_muladd_mem(void * GF256_RESTRICT vz, uint8_t y, 1234 | const void * GF256_RESTRICT vx, int bytes) 1235 | { 1236 | // Use a single if-statement to handle special cases 1237 | if (y <= 1) 1238 | { 1239 | if (y == 1) 1240 | gf256_add_mem(vz, vx, bytes); 1241 | return; 1242 | } 1243 | 1244 | GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast(vz); 1245 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 1246 | 1247 | #if defined(GF256_TARGET_MOBILE) 1248 | #if defined(GF256_TRY_NEON) 1249 | if (bytes >= 16 && CpuHasNeon) 1250 | { 1251 | // Partial product tables; see above 1252 | const GF256_M128 table_lo_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_LO_Y + y)); 1253 | const GF256_M128 table_hi_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_HI_Y + y)); 1254 | 1255 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1256 | const GF256_M128 clr_mask = vdupq_n_u8(0x0f); 1257 | 1258 | // Handle multiples of 16 bytes 1259 | do 1260 | { 1261 | // See above comments for details 1262 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 1263 | GF256_M128 l0 = vandq_u8(x0, clr_mask); 1264 | 1265 | // x0 = vshrq_n_u8(x0, 4); 1266 | x0 = (GF256_M128)vshrq_n_u64( (uint64x2_t)x0, 4); 1267 | GF256_M128 h0 = vandq_u8(x0, clr_mask); 1268 | l0 = 
vqtbl1q_u8(table_lo_y, l0); 1269 | h0 = vqtbl1q_u8(table_hi_y, h0); 1270 | const GF256_M128 p0 = veorq_u8(l0, h0); 1271 | const GF256_M128 z0 = vld1q_u8((uint8_t*)z16); 1272 | vst1q_u8((uint8_t*)z16, veorq_u8(p0, z0)); 1273 | bytes -= 16, ++x16, ++z16; 1274 | } while (bytes >= 16); 1275 | } 1276 | #endif 1277 | #else // GF256_TARGET_MOBILE 1278 | # if defined(GF256_TRY_AVX2) 1279 | if (bytes >= 32 && CpuHasAVX2) 1280 | { 1281 | // Partial product tables; see above 1282 | const GF256_M256 table_lo_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_LO_Y + y); 1283 | const GF256_M256 table_hi_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_HI_Y + y); 1284 | 1285 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1286 | const GF256_M256 clr_mask = _mm256_set1_epi8(0x0f); 1287 | 1288 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(z16); 1289 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 1290 | 1291 | // On my Reed Solomon codec, the encoder unit test runs in 640 usec without and 550 usec with the optimization (86% of the original time) 1292 | const unsigned count = bytes / 64; 1293 | for (unsigned i = 0; i < count; ++i) 1294 | { 1295 | // See above comments for details 1296 | GF256_M256 x0 = _mm256_loadu_si256(x32 + i * 2); 1297 | GF256_M256 l0 = _mm256_and_si256(x0, clr_mask); 1298 | x0 = _mm256_srli_epi64(x0, 4); 1299 | const GF256_M256 z0 = _mm256_loadu_si256(z32 + i * 2); 1300 | GF256_M256 h0 = _mm256_and_si256(x0, clr_mask); 1301 | l0 = _mm256_shuffle_epi8(table_lo_y, l0); 1302 | h0 = _mm256_shuffle_epi8(table_hi_y, h0); 1303 | const GF256_M256 p0 = _mm256_xor_si256(l0, h0); 1304 | _mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(p0, z0)); 1305 | 1306 | GF256_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1); 1307 | GF256_M256 l1 = _mm256_and_si256(x1, clr_mask); 1308 | x1 = _mm256_srli_epi64(x1, 4); 1309 | const GF256_M256 z1 = _mm256_loadu_si256(z32 + i * 2 + 1); 1310 | GF256_M256 h1 = _mm256_and_si256(x1, clr_mask); 1311 | l1 = 
_mm256_shuffle_epi8(table_lo_y, l1); 1312 | h1 = _mm256_shuffle_epi8(table_hi_y, h1); 1313 | const GF256_M256 p1 = _mm256_xor_si256(l1, h1); 1314 | _mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(p1, z1)); 1315 | } 1316 | bytes -= count * 64; 1317 | z32 += count * 2; 1318 | x32 += count * 2; 1319 | 1320 | if (bytes >= 32) 1321 | { 1322 | GF256_M256 x0 = _mm256_loadu_si256(x32); 1323 | GF256_M256 l0 = _mm256_and_si256(x0, clr_mask); 1324 | x0 = _mm256_srli_epi64(x0, 4); 1325 | GF256_M256 h0 = _mm256_and_si256(x0, clr_mask); 1326 | l0 = _mm256_shuffle_epi8(table_lo_y, l0); 1327 | h0 = _mm256_shuffle_epi8(table_hi_y, h0); 1328 | const GF256_M256 p0 = _mm256_xor_si256(l0, h0); 1329 | const GF256_M256 z0 = _mm256_loadu_si256(z32); 1330 | _mm256_storeu_si256(z32, _mm256_xor_si256(p0, z0)); 1331 | 1332 | bytes -= 32; 1333 | z32++; 1334 | x32++; 1335 | } 1336 | 1337 | z16 = reinterpret_cast(z32); 1338 | x16 = reinterpret_cast(x32); 1339 | } 1340 | # endif // GF256_TRY_AVX2 1341 | if (bytes >= 16 && CpuHasSSSE3) 1342 | { 1343 | // Partial product tables; see above 1344 | const GF256_M128 table_lo_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_LO_Y + y); 1345 | const GF256_M128 table_hi_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_HI_Y + y); 1346 | 1347 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1348 | const GF256_M128 clr_mask = _mm_set1_epi8(0x0f); 1349 | 1350 | // This unroll seems to provide about 7% speed boost when AVX2 is disabled 1351 | while (bytes >= 32) 1352 | { 1353 | bytes -= 32; 1354 | 1355 | GF256_M128 x1 = _mm_loadu_si128(x16 + 1); 1356 | GF256_M128 l1 = _mm_and_si128(x1, clr_mask); 1357 | x1 = _mm_srli_epi64(x1, 4); 1358 | GF256_M128 h1 = _mm_and_si128(x1, clr_mask); 1359 | l1 = _mm_shuffle_epi8(table_lo_y, l1); 1360 | h1 = _mm_shuffle_epi8(table_hi_y, h1); 1361 | const GF256_M128 z1 = _mm_loadu_si128(z16 + 1); 1362 | 1363 | GF256_M128 x0 = _mm_loadu_si128(x16); 1364 | GF256_M128 l0 = _mm_and_si128(x0, clr_mask); 1365 | x0 = _mm_srli_epi64(x0, 4); 1366 | 
GF256_M128 h0 = _mm_and_si128(x0, clr_mask); 1367 | l0 = _mm_shuffle_epi8(table_lo_y, l0); 1368 | h0 = _mm_shuffle_epi8(table_hi_y, h0); 1369 | const GF256_M128 z0 = _mm_loadu_si128(z16); 1370 | 1371 | const GF256_M128 p1 = _mm_xor_si128(l1, h1); 1372 | _mm_storeu_si128(z16 + 1, _mm_xor_si128(p1, z1)); 1373 | 1374 | const GF256_M128 p0 = _mm_xor_si128(l0, h0); 1375 | _mm_storeu_si128(z16, _mm_xor_si128(p0, z0)); 1376 | 1377 | x16 += 2, z16 += 2; 1378 | } 1379 | 1380 | // Handle multiples of 16 bytes 1381 | while (bytes >= 16) 1382 | { 1383 | // See above comments for details 1384 | GF256_M128 x0 = _mm_loadu_si128(x16); 1385 | GF256_M128 l0 = _mm_and_si128(x0, clr_mask); 1386 | x0 = _mm_srli_epi64(x0, 4); 1387 | GF256_M128 h0 = _mm_and_si128(x0, clr_mask); 1388 | l0 = _mm_shuffle_epi8(table_lo_y, l0); 1389 | h0 = _mm_shuffle_epi8(table_hi_y, h0); 1390 | const GF256_M128 p0 = _mm_xor_si128(l0, h0); 1391 | const GF256_M128 z0 = _mm_loadu_si128(z16); 1392 | _mm_storeu_si128(z16, _mm_xor_si128(p0, z0)); 1393 | 1394 | bytes -= 16, ++x16, ++z16; 1395 | } 1396 | } 1397 | #endif // GF256_TARGET_MOBILE 1398 | 1399 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 1400 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1401 | const uint8_t * GF256_RESTRICT table = GF256Ctx.GF256_MUL_TABLE + ((unsigned)y << 8); 1402 | 1403 | // Handle blocks of 8 bytes 1404 | while (bytes >= 8) 1405 | { 1406 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 1407 | uint64_t word = table[x1[0]]; 1408 | word |= (uint64_t)table[x1[1]] << 8; 1409 | word |= (uint64_t)table[x1[2]] << 16; 1410 | word |= (uint64_t)table[x1[3]] << 24; 1411 | word |= (uint64_t)table[x1[4]] << 32; 1412 | word |= (uint64_t)table[x1[5]] << 40; 1413 | word |= (uint64_t)table[x1[6]] << 48; 1414 | word |= (uint64_t)table[x1[7]] << 56; 1415 | *z8 ^= word; 1416 | 1417 | bytes -= 8, x1 += 8, z1 += 8; 1418 | } 1419 | 1420 | // Handle a block of 4 bytes 1421 | const int four = bytes & 4; 1422 | if (four) 1423 | 
{ 1424 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1); 1425 | uint32_t word = table[x1[0]]; 1426 | word |= (uint32_t)table[x1[1]] << 8; 1427 | word |= (uint32_t)table[x1[2]] << 16; 1428 | word |= (uint32_t)table[x1[3]] << 24; 1429 | *z4 ^= word; 1430 | } 1431 | 1432 | // Handle single bytes 1433 | const int offset = four; 1434 | switch (bytes & 3) 1435 | { 1436 | case 3: z1[offset + 2] ^= table[x1[offset + 2]]; 1437 | case 2: z1[offset + 1] ^= table[x1[offset + 1]]; 1438 | case 1: z1[offset] ^= table[x1[offset]]; 1439 | default: 1440 | break; 1441 | } 1442 | } 1443 | 1444 | extern "C" void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes) 1445 | { 1446 | #if defined(GF256_TARGET_MOBILE) 1447 | uint64_t * GF256_RESTRICT x16 = reinterpret_cast(vx); 1448 | uint64_t * GF256_RESTRICT y16 = reinterpret_cast(vy); 1449 | 1450 | const unsigned count = (unsigned)bytes / 8; 1451 | for (unsigned ii = 0; ii < count; ++ii) 1452 | { 1453 | const uint64_t temp = x16[ii]; 1454 | x16[ii] = y16[ii]; 1455 | y16[ii] = temp; 1456 | } 1457 | 1458 | x16 += count; 1459 | y16 += count; 1460 | #else 1461 | GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 1462 | GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 1463 | 1464 | // Handle blocks of 16 bytes 1465 | while (bytes >= 16) 1466 | { 1467 | GF256_M128 x0 = _mm_loadu_si128(x16); 1468 | GF256_M128 y0 = _mm_loadu_si128(y16); 1469 | _mm_storeu_si128(x16, y0); 1470 | _mm_storeu_si128(y16, x0); 1471 | 1472 | bytes -= 16, ++x16, ++y16; 1473 | } 1474 | #endif 1475 | 1476 | uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1477 | uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 1478 | 1479 | // Handle a block of 8 bytes 1480 | const int eight = bytes & 8; 1481 | if (eight) 1482 | { 1483 | uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 1484 | uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 1485 | 1486 | uint64_t temp = *x8; 1487 | *x8 = *y8; 1488 | *y8 = temp; 1489 | } 1490 | 1491 | 
// Handle a block of 4 bytes 1492 | const int four = bytes & 4; 1493 | if (four) 1494 | { 1495 | uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 1496 | uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 1497 | 1498 | uint32_t temp = *x4; 1499 | *x4 = *y4; 1500 | *y4 = temp; 1501 | } 1502 | 1503 | // Handle final bytes 1504 | const int offset = eight + four; 1505 | uint8_t temp; 1506 | switch (bytes & 3) 1507 | { 1508 | case 3: temp = x1[offset + 2]; x1[offset + 2] = y1[offset + 2]; y1[offset + 2] = temp; 1509 | case 2: temp = x1[offset + 1]; x1[offset + 1] = y1[offset + 1]; y1[offset + 1] = temp; 1510 | case 1: temp = x1[offset]; x1[offset] = y1[offset]; y1[offset] = temp; 1511 | default: 1512 | break; 1513 | } 1514 | } 1515 | -------------------------------------------------------------------------------- /tests/gf256.h: -------------------------------------------------------------------------------- 1 | /** \file 2 | \brief GF(256) Main C API Header 3 | \copyright Copyright (c) 2017 Christopher A. Taylor. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of GF256 nor the names of its contributors may be 14 | used to endorse or promote products derived from this software without 15 | specific prior written permission. 
16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | #ifndef CAT_GF256_H 31 | #define CAT_GF256_H 32 | 33 | /** \page GF256 GF(256) Math Module 34 | 35 | This module provides efficient implementations of bulk 36 | GF(2^^8) math operations over memory buffers. 37 | 38 | Addition is done over the base field in GF(2) meaning 39 | that addition is XOR between memory buffers. 40 | 41 | Multiplication is performed using table lookups via 42 | SIMD instructions. This is somewhat slower than XOR, 43 | but fast enough to not become a major bottleneck when 44 | used sparingly. 
45 | */ 46 | 47 | #include // uint32_t etc 48 | #include // memcpy, memset 49 | 50 | /// Library header version 51 | #define GF256_VERSION 2 52 | 53 | //------------------------------------------------------------------------------ 54 | // Platform/Architecture 55 | 56 | #if defined(ANDROID) || defined(IOS) || defined(LINUX_ARM) 57 | #define GF256_TARGET_MOBILE 58 | #endif // ANDROID 59 | 60 | #if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) 61 | #define GF256_TRY_AVX2 /* 256-bit */ 62 | #include 63 | #define GF256_ALIGN_BYTES 32 64 | #else // __AVX2__ 65 | #define GF256_ALIGN_BYTES 16 66 | #endif // __AVX2__ 67 | 68 | #if !defined(GF256_TARGET_MOBILE) 69 | // Note: MSVC currently only supports SSSE3 but not AVX2 70 | #include // SSSE3: _mm_shuffle_epi8 71 | #include // SSE2 72 | #endif // GF256_TARGET_MOBILE 73 | 74 | #if defined(HAVE_ARM_NEON_H) 75 | #include 76 | #endif // HAVE_ARM_NEON_H 77 | 78 | #if defined(GF256_TARGET_MOBILE) 79 | 80 | #define GF256_ALIGNED_ACCESSES /* Inputs must be aligned to GF256_ALIGN_BYTES */ 81 | 82 | # if defined(HAVE_ARM_NEON_H) 83 | // Compiler-specific 128-bit SIMD register keyword 84 | #define GF256_M128 uint8x16_t 85 | #define GF256_TRY_NEON 86 | #else 87 | #define GF256_M128 uint64_t 88 | # endif 89 | 90 | #else // GF256_TARGET_MOBILE 91 | 92 | // Compiler-specific 128-bit SIMD register keyword 93 | #define GF256_M128 __m128i 94 | 95 | #endif // GF256_TARGET_MOBILE 96 | 97 | #ifdef GF256_TRY_AVX2 98 | // Compiler-specific 256-bit SIMD register keyword 99 | #define GF256_M256 __m256i 100 | #endif 101 | 102 | // Compiler-specific C++11 restrict keyword 103 | #define GF256_RESTRICT __restrict 104 | 105 | // Compiler-specific force inline keyword 106 | #ifdef _MSC_VER 107 | #define GF256_FORCE_INLINE inline __forceinline 108 | #else 109 | #define GF256_FORCE_INLINE inline __attribute__((always_inline)) 110 | #endif 111 | 112 | // Compiler-specific alignment keyword 113 | // Note: Alignment only matters for ARM 
NEON where it should be 16 114 | #ifdef _MSC_VER 115 | #define GF256_ALIGNED __declspec(align(GF256_ALIGN_BYTES)) 116 | #else // _MSC_VER 117 | #define GF256_ALIGNED __attribute__((aligned(GF256_ALIGN_BYTES))) 118 | #endif // _MSC_VER 119 | 120 | #ifdef __cplusplus 121 | extern "C" { 122 | #endif // __cplusplus 123 | 124 | 125 | //------------------------------------------------------------------------------ 126 | // Portability 127 | 128 | /// Swap two memory buffers in-place 129 | extern void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes); 130 | 131 | 132 | //------------------------------------------------------------------------------ 133 | // GF(256) Context 134 | 135 | #ifdef _MSC_VER 136 | #pragma warning(push) 137 | #pragma warning(disable: 4324) // warning C4324: 'gf256_ctx' : structure was padded due to __declspec(align()) 138 | #endif // _MSC_VER 139 | 140 | /// The context object stores tables required to perform library calculations 141 | struct gf256_ctx 142 | { 143 | /// We require memory to be aligned since the SIMD instructions benefit from 144 | /// or require aligned accesses to the table data. 
145 | struct 146 | { 147 | GF256_ALIGNED GF256_M128 TABLE_LO_Y[256]; 148 | GF256_ALIGNED GF256_M128 TABLE_HI_Y[256]; 149 | } MM128; 150 | #ifdef GF256_TRY_AVX2 151 | struct 152 | { 153 | GF256_ALIGNED GF256_M256 TABLE_LO_Y[256]; 154 | GF256_ALIGNED GF256_M256 TABLE_HI_Y[256]; 155 | } MM256; 156 | #endif // GF256_TRY_AVX2 157 | 158 | /// Mul/Div/Inv/Sqr tables 159 | uint8_t GF256_MUL_TABLE[256 * 256]; 160 | uint8_t GF256_DIV_TABLE[256 * 256]; 161 | uint8_t GF256_INV_TABLE[256]; 162 | uint8_t GF256_SQR_TABLE[256]; 163 | 164 | /// Log/Exp tables 165 | uint16_t GF256_LOG_TABLE[256]; 166 | uint8_t GF256_EXP_TABLE[512 * 2 + 1]; 167 | 168 | /// Polynomial used 169 | unsigned Polynomial; 170 | }; 171 | 172 | #ifdef _MSC_VER 173 | #pragma warning(pop) 174 | #endif // _MSC_VER 175 | 176 | extern gf256_ctx GF256Ctx; 177 | 178 | 179 | //------------------------------------------------------------------------------ 180 | // Initialization 181 | 182 | /** 183 | Initialize a context, filling in the tables. 184 | 185 | Thread-safety / Usage Notes: 186 | 187 | It is perfectly safe and encouraged to use a gf256_ctx object from multiple 188 | threads. The gf256_init() is relatively expensive and should only be done 189 | once, though it will take less than a millisecond. 190 | 191 | The gf256_ctx object must be aligned to 16 byte boundary. 192 | Simply tag the object with GF256_ALIGNED to achieve this. 193 | 194 | Example: 195 | static GF256_ALIGNED gf256_ctx TheGF256Context; 196 | gf256_init(&TheGF256Context, 0); 197 | 198 | Returns 0 on success and other values on failure. 
199 | */ 200 | extern int gf256_init_(int version); 201 | #define gf256_init() gf256_init_(GF256_VERSION) 202 | 203 | 204 | //------------------------------------------------------------------------------ 205 | // Math Operations 206 | 207 | /// return x + y 208 | static GF256_FORCE_INLINE uint8_t gf256_add(uint8_t x, uint8_t y) 209 | { 210 | return (uint8_t)(x ^ y); 211 | } 212 | 213 | /// return x * y 214 | /// For repeated multiplication by a constant, it is faster to put the constant in y. 215 | static GF256_FORCE_INLINE uint8_t gf256_mul(uint8_t x, uint8_t y) 216 | { 217 | return GF256Ctx.GF256_MUL_TABLE[((unsigned)y << 8) + x]; 218 | } 219 | 220 | /// return x / y 221 | /// Memory-access optimized for constant divisors in y. 222 | static GF256_FORCE_INLINE uint8_t gf256_div(uint8_t x, uint8_t y) 223 | { 224 | return GF256Ctx.GF256_DIV_TABLE[((unsigned)y << 8) + x]; 225 | } 226 | 227 | /// return 1 / x 228 | static GF256_FORCE_INLINE uint8_t gf256_inv(uint8_t x) 229 | { 230 | return GF256Ctx.GF256_INV_TABLE[x]; 231 | } 232 | 233 | /// return x * x 234 | static GF256_FORCE_INLINE uint8_t gf256_sqr(uint8_t x) 235 | { 236 | return GF256Ctx.GF256_SQR_TABLE[x]; 237 | } 238 | 239 | 240 | //------------------------------------------------------------------------------ 241 | // Bulk Memory Math Operations 242 | 243 | /// Performs "x[] += y[]" bulk memory XOR operation 244 | extern void gf256_add_mem(void * GF256_RESTRICT vx, 245 | const void * GF256_RESTRICT vy, int bytes); 246 | 247 | /// Performs "z[] += x[] + y[]" bulk memory operation 248 | extern void gf256_add2_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 249 | const void * GF256_RESTRICT vy, int bytes); 250 | 251 | /// Performs "z[] = x[] + y[]" bulk memory operation 252 | extern void gf256_addset_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 253 | const void * GF256_RESTRICT vy, int bytes); 254 | 255 | /// Performs "z[] = x[] * y" bulk memory operation 256 | extern void 
gf256_mul_mem(void * GF256_RESTRICT vz, 257 | const void * GF256_RESTRICT vx, uint8_t y, int bytes); 258 | 259 | /// Performs "z[] += x[] * y" bulk memory operation 260 | extern void gf256_muladd_mem(void * GF256_RESTRICT vz, uint8_t y, 261 | const void * GF256_RESTRICT vx, int bytes); 262 | 263 | /// Performs "x[] /= y" bulk memory operation 264 | static GF256_FORCE_INLINE void gf256_div_mem(void * GF256_RESTRICT vz, 265 | const void * GF256_RESTRICT vx, uint8_t y, int bytes) 266 | { 267 | // Multiply by inverse 268 | gf256_mul_mem(vz, vx, y == 1 ? (uint8_t)1 : GF256Ctx.GF256_INV_TABLE[y], bytes); 269 | } 270 | 271 | 272 | //------------------------------------------------------------------------------ 273 | // Misc Operations 274 | 275 | /// Swap two memory buffers in-place 276 | extern void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes); 277 | 278 | 279 | #ifdef __cplusplus 280 | } 281 | #endif // __cplusplus 282 | 283 | #endif // CAT_GF256_H 284 | -------------------------------------------------------------------------------- /tests/tests.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of Fp61 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 
15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #include "../fp61.h" 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | using namespace std; 36 | 37 | 38 | //------------------------------------------------------------------------------ 39 | // Portability macros 40 | 41 | // Compiler-specific debug break 42 | #if defined(_DEBUG) || defined(DEBUG) 43 | #define FP61_DEBUG 44 | #ifdef _WIN32 45 | #define FP61_DEBUG_BREAK() __debugbreak() 46 | #else 47 | #define FP61_DEBUG_BREAK() __builtin_trap() 48 | #endif 49 | #define FP61_DEBUG_ASSERT(cond) { if (!(cond)) { FP61_DEBUG_BREAK(); } } 50 | #else 51 | #define FP61_DEBUG_BREAK() do {} while (false); 52 | #define FP61_DEBUG_ASSERT(cond) do {} while (false); 53 | #endif 54 | 55 | 56 | //------------------------------------------------------------------------------ 57 | // Constants 58 | 59 | #define FP61_RET_FAIL -1 60 | #define FP61_RET_SUCCESS 0 61 | 62 | static const uint64_t MASK61 = ((uint64_t)1 << 61) - 1; 63 | static const uint64_t MASK62 = ((uint64_t)1 << 62) - 1; 64 | static const uint64_t MASK63 = ((uint64_t)1 << 63) - 1; 65 | static const uint64_t MASK64 = ~(uint64_t)0; 66 | static const uint64_t MASK64_NO62 = MASK64 ^ 
((uint64_t)1 << 62); 67 | static const uint64_t MASK64_NO61 = MASK64 ^ ((uint64_t)1 << 61); 68 | static const uint64_t MASK64_NO60 = MASK64 ^ ((uint64_t)1 << 60); 69 | static const uint64_t MASK63_NO61 = MASK63 ^ ((uint64_t)1 << 61); 70 | static const uint64_t MASK63_NO60 = MASK63 ^ ((uint64_t)1 << 60); 71 | static const uint64_t MASK62_NO60 = MASK62 ^ ((uint64_t)1 << 60); 72 | 73 | #if defined(FP61_DEBUG) 74 | static const unsigned kRandomTestLoops = 100000; 75 | static const unsigned kMaxDataLength = 4000; 76 | #else 77 | static const unsigned kRandomTestLoops = 10000000; 78 | static const unsigned kMaxDataLength = 10000; 79 | #endif 80 | 81 | 82 | //------------------------------------------------------------------------------ 83 | // Tools 84 | 85 | static std::string HexString(uint64_t x) 86 | { 87 | std::stringstream ss; 88 | ss << hex << setfill('0') << setw(16) << x; 89 | return ss.str(); 90 | } 91 | 92 | 93 | //------------------------------------------------------------------------------ 94 | // Tests: Negate 95 | 96 | static bool test_negate(uint64_t x) 97 | { 98 | uint64_t n = fp61::Negate(x); 99 | uint64_t s = (x + n) % fp61::kPrime; 100 | if (s != 0) { 101 | cout << "Failed for x = " << hex << HexString(x) << endl; 102 | FP61_DEBUG_BREAK(); 103 | return false; 104 | } 105 | return true; 106 | } 107 | 108 | static bool TestNegate() 109 | { 110 | cout << "TestNegate..."; 111 | 112 | // Input is allowed to be 0 <= x <= p 113 | for (uint64_t x = 0; x < 1000; ++x) { 114 | if (!test_negate(x)) { 115 | return false; 116 | } 117 | } 118 | for (uint64_t x = fp61::kPrime; x >= fp61::kPrime - 1000; --x) { 119 | if (!test_negate(x)) { 120 | return false; 121 | } 122 | } 123 | 124 | fp61::Random prng; 125 | prng.Seed(1); 126 | 127 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 128 | { 129 | uint64_t x = prng.Next() & fp61::kPrime; 130 | if (!test_negate(x)) { 131 | return false; 132 | } 133 | } 134 | 135 | cout << "Passed" << endl; 136 | 137 | return true; 138 
| } 139 | 140 | 141 | //------------------------------------------------------------------------------ 142 | // Tests: Add 143 | 144 | static bool TestAdd() 145 | { 146 | cout << "TestAdd..."; 147 | 148 | // Preconditions: x,y,z,w <2^62 149 | const uint64_t largest = ((uint64_t)1 << 62) - 1; 150 | const uint64_t reduced = largest % fp61::kPrime; 151 | 152 | for (uint64_t x = largest; x >= largest - 1000; --x) 153 | { 154 | uint64_t r = fp61::Add4(largest, largest, largest, x); 155 | 156 | uint64_t expected = 0; 157 | expected = (expected + reduced) % fp61::kPrime; 158 | expected = (expected + reduced) % fp61::kPrime; 159 | expected = (expected + reduced) % fp61::kPrime; 160 | expected = (expected + (x % fp61::kPrime)) % fp61::kPrime; 161 | 162 | if (r % fp61::kPrime != expected) { 163 | cout << "Failed for x = " << HexString(x) << endl; 164 | FP61_DEBUG_BREAK(); 165 | return false; 166 | } 167 | } 168 | 169 | for (uint64_t x = largest; x >= largest - 1000; --x) 170 | { 171 | for (uint64_t y = largest; y >= largest - 1000; --y) 172 | { 173 | uint64_t r = fp61::Add4(largest, largest, x, y); 174 | 175 | uint64_t expected = 0; 176 | expected = (expected + reduced) % fp61::kPrime; 177 | expected = (expected + reduced) % fp61::kPrime; 178 | expected = (expected + (y % fp61::kPrime)) % fp61::kPrime; 179 | expected = (expected + (x % fp61::kPrime)) % fp61::kPrime; 180 | 181 | if (r % fp61::kPrime != expected) { 182 | cout << "Failed for x=" << HexString(x) << " y=" << HexString(y) << endl; 183 | FP61_DEBUG_BREAK(); 184 | return false; 185 | } 186 | } 187 | } 188 | 189 | fp61::Random prng; 190 | prng.Seed(0); 191 | 192 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 193 | { 194 | // Select 4 values from 0..2^62-1 195 | uint64_t x = prng.Next() & MASK62; 196 | uint64_t y = prng.Next() & MASK62; 197 | uint64_t w = prng.Next() & MASK62; 198 | uint64_t z = prng.Next() & MASK62; 199 | 200 | uint64_t r = fp61::Add4(x, y, z, w); 201 | 202 | uint64_t expected = 0; 203 | expected 
= (expected + (x % fp61::kPrime)) % fp61::kPrime; 204 | expected = (expected + (y % fp61::kPrime)) % fp61::kPrime; 205 | expected = (expected + (z % fp61::kPrime)) % fp61::kPrime; 206 | expected = (expected + (w % fp61::kPrime)) % fp61::kPrime; 207 | 208 | if (r % fp61::kPrime != expected) { 209 | cout << "Failed (random) for i = " << i << endl; 210 | FP61_DEBUG_BREAK(); 211 | return false; 212 | } 213 | } 214 | 215 | cout << "Passed" << endl; 216 | 217 | return true; 218 | } 219 | 220 | 221 | //------------------------------------------------------------------------------ 222 | // Tests: Partial Reduction 223 | 224 | static bool test_pred(uint64_t x) 225 | { 226 | uint64_t expected = x % fp61::kPrime; 227 | 228 | uint64_t r = fp61::PartialReduce(x); 229 | 230 | if ((r >> 62) != 0) 231 | { 232 | cout << "High bit overflow failed for x=" << HexString(x) << endl; 233 | FP61_DEBUG_BREAK(); 234 | return false; 235 | } 236 | 237 | uint64_t actual = fp61::PartialReduce(x) % fp61::kPrime; 238 | 239 | if (actual != expected) 240 | { 241 | cout << "Failed for x=" << HexString(x) << endl; 242 | FP61_DEBUG_BREAK(); 243 | return false; 244 | } 245 | return true; 246 | } 247 | 248 | static bool TestPartialReduction() 249 | { 250 | cout << "TestPartialReduction..."; 251 | 252 | // Input can have any bit set 253 | 254 | for (uint64_t x = 0; x < 1000; ++x) { 255 | if (!test_pred(x)) { 256 | return false; 257 | } 258 | } 259 | for (uint64_t x = MASK64; x > MASK64 - 1000; --x) { 260 | if (!test_pred(x)) { 261 | return false; 262 | } 263 | } 264 | for (uint64_t x = MASK64_NO62 + 1000; x > MASK64_NO62 - 1000; --x) { 265 | if (!test_pred(x)) { 266 | return false; 267 | } 268 | } 269 | for (uint64_t x = MASK64_NO61 + 1000; x > MASK64_NO61 - 1000; --x) { 270 | if (!test_pred(x)) { 271 | return false; 272 | } 273 | } 274 | for (uint64_t x = MASK64_NO60 + 1000; x > MASK64_NO60 - 1000; --x) { 275 | if (!test_pred(x)) { 276 | return false; 277 | } 278 | } 279 | for (uint64_t x = MASK63; x > 
MASK63 - 1000; --x) { 280 | if (!test_pred(x)) { 281 | return false; 282 | } 283 | } 284 | for (uint64_t x = MASK63_NO61 + 1000; x > MASK63_NO61 - 1000; --x) { 285 | if (!test_pred(x)) { 286 | return false; 287 | } 288 | } 289 | for (uint64_t x = MASK63_NO60 + 1000; x > MASK63_NO60 - 1000; --x) { 290 | if (!test_pred(x)) { 291 | return false; 292 | } 293 | } 294 | for (uint64_t x = MASK62 + 1000; x > MASK62 - 1000; --x) { 295 | if (!test_pred(x)) { 296 | return false; 297 | } 298 | } 299 | for (uint64_t x = MASK62_NO60 + 1000; x > MASK62_NO60 - 1000; --x) { 300 | if (!test_pred(x)) { 301 | return false; 302 | } 303 | } 304 | for (uint64_t x = MASK61 + 1000; x > MASK61 - 1000; --x) { 305 | if (!test_pred(x)) { 306 | return false; 307 | } 308 | } 309 | 310 | fp61::Random prng; 311 | prng.Seed(2); 312 | 313 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 314 | { 315 | uint64_t x = prng.Next(); 316 | 317 | if (!test_pred(x)) { 318 | return false; 319 | } 320 | } 321 | 322 | cout << "Passed" << endl; 323 | 324 | return true; 325 | } 326 | 327 | 328 | //------------------------------------------------------------------------------ 329 | // Tests: Finalize Reduction 330 | 331 | static bool test_fred(uint64_t x) 332 | { 333 | // EXCEPTION: This input is known to not work 334 | if (x == 0x3ffffffffffffffeULL) { 335 | return true; 336 | } 337 | 338 | uint64_t actual = fp61::Finalize(x); 339 | uint64_t expected = x % fp61::kPrime; 340 | 341 | if (actual != expected) 342 | { 343 | cout << "Failed for x=" << HexString(x) << endl; 344 | FP61_DEBUG_BREAK(); 345 | return false; 346 | } 347 | return true; 348 | } 349 | 350 | static bool TestFinalizeReduction() 351 | { 352 | cout << "TestFinalizeReduction..."; 353 | 354 | // Input has #63 and #62 clear, other bits can take on any value 355 | 356 | for (uint64_t x = 0; x < 1000; ++x) { 357 | if (!test_fred(x)) { 358 | return false; 359 | } 360 | } 361 | for (uint64_t x = MASK62; x > MASK62 - 1000; --x) { 362 | if (!test_fred(x)) { 
363 | return false; 364 | } 365 | } 366 | for (uint64_t x = MASK62_NO60 + 1000; x > MASK62_NO60 - 1000; --x) { 367 | if (!test_fred(x)) { 368 | return false; 369 | } 370 | } 371 | for (uint64_t x = MASK61 + 1000; x > MASK61 - 1000; --x) { 372 | if (!test_fred(x)) { 373 | return false; 374 | } 375 | } 376 | 377 | fp61::Random prng; 378 | prng.Seed(3); 379 | 380 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 381 | { 382 | uint64_t x = prng.Next() & MASK62; 383 | 384 | if (!test_fred(x)) { 385 | return false; 386 | } 387 | } 388 | 389 | cout << "Passed" << endl; 390 | 391 | return true; 392 | } 393 | 394 | 395 | //------------------------------------------------------------------------------ 396 | // Tests: Multiply 397 | 398 | static bool test_mul(uint64_t x, uint64_t y) 399 | { 400 | uint64_t p = fp61::Multiply(x, y); 401 | 402 | if ((p >> 62) != 0) { 403 | cout << "Failed (high bit overflow) for x=" << HexString(x) << ", y=" << HexString(y) << endl; 404 | FP61_DEBUG_BREAK(); 405 | return false; 406 | } 407 | 408 | uint64_t r0, r1; 409 | CAT_MUL128(r1, r0, x, y); 410 | 411 | //A % B == (((AH % B) * (2^64 % B)) + (AL % B)) % B 412 | // == (((AH % B) * ((2^64 - B) % B)) + (AL % B)) % B 413 | r1 %= fp61::kPrime; 414 | uint64_t NB = (uint64_t)(-(int64_t)fp61::kPrime); 415 | uint64_t mod = r1 * (NB % fp61::kPrime); 416 | mod += r0 % fp61::kPrime; 417 | mod %= fp61::kPrime; 418 | 419 | if (p % fp61::kPrime != mod) { 420 | cout << "Failed (reduced result mismatch) for x=" << HexString(x) << ", y=" << HexString(y) << endl; 421 | FP61_DEBUG_BREAK(); 422 | return false; 423 | } 424 | 425 | return true; 426 | } 427 | 428 | static bool TestMultiply() 429 | { 430 | cout << "TestMultiply..."; 431 | 432 | // Number of bits between x, y must be 124 or fewer. 
433 | 434 | for (uint64_t x = 0; x < 1000; ++x) { 435 | for (uint64_t y = x; y < 1000; ++y) { 436 | if (!test_mul(x, y)) { 437 | return false; 438 | } 439 | } 440 | } 441 | for (uint64_t x = MASK62; x > MASK62 - 1000; --x) { 442 | for (uint64_t y = x; y > MASK62 - 1000; --y) { 443 | if (!test_mul(x, y)) { 444 | return false; 445 | } 446 | } 447 | } 448 | for (uint64_t x = MASK62_NO60 + 1000; x > MASK62_NO60 - 1000; --x) { 449 | for (uint64_t y = x; y > MASK62_NO60 - 1000; --y) { 450 | if (!test_mul(x, y)) { 451 | return false; 452 | } 453 | } 454 | } 455 | for (uint64_t x = MASK61 + 1000; x > MASK61 - 1000; --x) { 456 | for (uint64_t y = x; y > MASK61 - 1000; --y) { 457 | if (!test_mul(x, y)) { 458 | return false; 459 | } 460 | } 461 | } 462 | 463 | fp61::Random prng; 464 | prng.Seed(4); 465 | 466 | // 62 + 62 = 124 bits 467 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 468 | { 469 | uint64_t x = prng.Next() & MASK62; 470 | uint64_t y = prng.Next() & MASK62; 471 | 472 | if (!test_mul(x, y)) { 473 | return false; 474 | } 475 | } 476 | 477 | // 61 + 63 = 124 bits 478 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 479 | { 480 | uint64_t x = prng.Next() & MASK61; 481 | uint64_t y = prng.Next() & MASK63; 482 | 483 | if (!test_mul(x, y)) { 484 | return false; 485 | } 486 | } 487 | 488 | // Commutivity test 489 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 490 | { 491 | uint64_t x = prng.Next() & MASK62; 492 | uint64_t y = prng.Next() & MASK62; 493 | uint64_t z = prng.Next() & MASK62; 494 | 495 | uint64_t r = fp61::Finalize(fp61::Multiply(fp61::Multiply(z, y), x)); 496 | uint64_t s = fp61::Finalize(fp61::Multiply(fp61::Multiply(x, z), y)); 497 | uint64_t t = fp61::Finalize(fp61::Multiply(fp61::Multiply(x, y), z)); 498 | 499 | if (r != s || s != t) { 500 | cout << "Failed (does not commute) for i=" << i << endl; 501 | FP61_DEBUG_BREAK(); 502 | return false; 503 | } 504 | } 505 | 506 | // Direct function test 507 | uint64_t r1, r0; 508 | r0 = 
Emulate64x64to128(r1, MASK64, MASK64); 509 | 510 | if (r1 != 0xfffffffffffffffe || r0 != 1) { 511 | cout << "Failed (Emulate64x64to128 failed)" << endl; 512 | FP61_DEBUG_BREAK(); 513 | return false; 514 | } 515 | 516 | cout << "Passed" << endl; 517 | 518 | return true; 519 | } 520 | 521 | 522 | //------------------------------------------------------------------------------ 523 | // Tests: Inverse 524 | 525 | static bool test_inv(uint64_t x) 526 | { 527 | uint64_t i = fp61::Inverse(x); 528 | 529 | // If no inverse existed: 530 | if (i == 0) 531 | { 532 | // Then it must have evenly divided 533 | if (x % fp61::kPrime == 0) { 534 | return true; 535 | } 536 | 537 | // Otherwise this should have had a result 538 | cout << "Failed (no result) for x=" << HexString(x) << endl; 539 | FP61_DEBUG_BREAK(); 540 | return false; 541 | } 542 | 543 | // Result must be in Fp 544 | if (i >= fp61::kPrime) 545 | { 546 | cout << "Failed (result too large) for x=" << HexString(x) << endl; 547 | FP61_DEBUG_BREAK(); 548 | return false; 549 | } 550 | 551 | // mul requires partially reduced input 552 | x = fp61::PartialReduce(x); 553 | 554 | uint64_t p = fp61::Multiply(x, i); 555 | 556 | // If result is not 1 then it is not a multiplicative inverse 557 | if (fp61::Finalize(p) != 1) 558 | { 559 | cout << "Failed (finalized result not 1) for x=" << HexString(x) << endl; 560 | FP61_DEBUG_BREAK(); 561 | return false; 562 | } 563 | 564 | // Double check the reduce function... 
565 | if (p % fp61::kPrime != 1) 566 | { 567 | cout << "Failed (remainder not 1) for x=" << HexString(x) << endl; 568 | FP61_DEBUG_BREAK(); 569 | return false; 570 | } 571 | 572 | return true; 573 | } 574 | 575 | static bool TestMulInverse() 576 | { 577 | cout << "TestMulInverse..."; 578 | 579 | // x < p 580 | 581 | // Small values 582 | for (uint64_t x = 1; x < 1000; ++x) { 583 | if (!test_inv(x)) { 584 | return false; 585 | } 586 | } 587 | 588 | fp61::Random prng; 589 | prng.Seed(5); 590 | 591 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 592 | { 593 | uint64_t x = prng.Next(); 594 | 595 | if (!test_inv(x)) { 596 | return false; 597 | } 598 | } 599 | 600 | cout << "Passed" << endl; 601 | 602 | return true; 603 | } 604 | 605 | 606 | //------------------------------------------------------------------------------ 607 | // Tests: ByteReader 608 | 609 | bool test_byte_reader(const uint8_t* data, unsigned bytes) 610 | { 611 | fp61::ByteReader reader; 612 | 613 | reader.BeginRead(data, bytes); 614 | 615 | // Round up to the next 61 bits 616 | uint64_t expandedBits = bytes * 8; 617 | unsigned actualReads = 0; 618 | unsigned bits = 0; 619 | bool packed = false; 620 | unsigned packedBit = 0; 621 | 622 | uint64_t fp; 623 | while (fp61::ReadResult::Success == reader.Read(fp)) 624 | { 625 | unsigned readStart = bits / 8; 626 | if (readStart >= bytes) 627 | { 628 | // We can read one extra bit if the packing is the last thing 629 | if (!packed || readStart != bytes) 630 | { 631 | FP61_DEBUG_BREAK(); 632 | cout << "Failed (too many reads) for bytes=" << bytes << " actualReads=" << actualReads << endl; 633 | return false; 634 | } 635 | } 636 | 637 | int readBytes = (int)bytes - (int)readStart; 638 | if (readBytes < 0) { 639 | readBytes = 0; 640 | } 641 | else if (readBytes > 8) { 642 | readBytes = 8; 643 | } 644 | 645 | uint64_t x = fp61::ReadBytes_LE(data + readStart, readBytes) >> (bits % 8); 646 | 647 | int readBits = (readBytes * 8) - (bits % 8); 648 | if (readBytes >= 
8 && readBits > 0 && readBits < 61 && readStart + readBytes < bytes) 649 | { 650 | // Need to read one more byte sometimes 651 | uint64_t high = data[readStart + readBytes]; 652 | high <<= readBits; 653 | x |= high; 654 | } 655 | 656 | // Test packing 657 | if (packed) 658 | { 659 | x <<= 1; 660 | x |= packedBit; 661 | bits += 60; 662 | ++expandedBits; 663 | } 664 | else 665 | { 666 | bits += 61; 667 | } 668 | 669 | x &= fp61::kPrime; 670 | 671 | packed = fp61::IsU64Ambiguous(x); 672 | if (packed) 673 | { 674 | packedBit = (x == fp61::kPrime); 675 | x = fp61::kAmbiguityMask; 676 | } 677 | 678 | if (fp != x) 679 | { 680 | FP61_DEBUG_BREAK(); 681 | cout << "Failed (wrong value) for bytes=" << bytes << " actualReads=" << actualReads << endl; 682 | return false; 683 | } 684 | ++actualReads; 685 | } 686 | 687 | const unsigned expectedReads = (unsigned)((expandedBits + 60) / 61); 688 | if (actualReads != expectedReads) 689 | { 690 | FP61_DEBUG_BREAK(); 691 | cout << "Failed (read count wrong) for bytes=" << bytes << endl; 692 | return false; 693 | } 694 | 695 | const unsigned maxWords = fp61::ByteReader::MaxWords(bytes); 696 | if (maxWords < actualReads) 697 | { 698 | FP61_DEBUG_BREAK(); 699 | cout << "Failed (MaxWords wrong) for bytes=" << bytes << endl; 700 | return false; 701 | } 702 | 703 | return true; 704 | } 705 | 706 | bool TestByteReader() 707 | { 708 | cout << "TestByteReader..."; 709 | 710 | uint8_t data[10 + 8] = { 711 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 712 | 0, 0, 0, 0, 0, 0, 0, 0 // Padding to simplify test 713 | }; 714 | 715 | uint64_t w = fp61::ReadU64_LE(data); 716 | if (w != 0x0807060504030201ULL) { 717 | cout << "Failed (ReadU64_LE)" << endl; 718 | FP61_DEBUG_BREAK(); 719 | return false; 720 | } 721 | 722 | uint32_t u = fp61::ReadU32_LE(data); 723 | if (u != 0x04030201UL) { 724 | cout << "Failed (ReadU32_LE)" << endl; 725 | FP61_DEBUG_BREAK(); 726 | return false; 727 | } 728 | 729 | uint64_t z = fp61::ReadBytes_LE(data, 0); 730 | if (z != 0) { 731 | cout 
<< "Failed (ReadBytes_LE 0)" << endl; 732 | FP61_DEBUG_BREAK(); 733 | return false; 734 | } 735 | 736 | for (unsigned i = 1; i <= 8; ++i) 737 | { 738 | uint64_t v = fp61::ReadBytes_LE(data, i); 739 | uint64_t d = v ^ w; 740 | d <<= 8 * (8 - i); 741 | if (d != 0) { 742 | cout << "Failed (ReadBytes_LE) for i = " << i << endl; 743 | FP61_DEBUG_BREAK(); 744 | return false; 745 | } 746 | } 747 | 748 | uint8_t simpledata[16 + 8] = { 749 | 0, 1, 2, 3, 4, 5, 6, 7, 750 | 8, 9, 10, 11, 12, 13, 14, 15, 751 | 0 752 | }; 753 | 754 | for (unsigned i = 0; i <= 16; ++i) 755 | { 756 | if (!test_byte_reader(simpledata, i)) { 757 | return false; 758 | } 759 | } 760 | 761 | uint8_t allones[16 + 8] = { 762 | 254,255,255,255,255,255,255,255, 763 | 255,255,255,255,255,255,255,255, 764 | 0 765 | }; 766 | 767 | for (unsigned i = 0; i <= 16; ++i) 768 | { 769 | if (!test_byte_reader(allones, i)) { 770 | return false; 771 | } 772 | } 773 | 774 | uint8_t mixed[20 + 8] = { 775 | 254,255,255,255,255,255,255,255,0, // Inject a non-overflowing bit in the middle 776 | 255,255,255,255,255,255,255, 777 | 255,255,255,255, 778 | 0 779 | }; 780 | 781 | for (unsigned i = 0; i <= 16; ++i) 782 | { 783 | if (!test_byte_reader(allones, i)) { 784 | return false; 785 | } 786 | } 787 | 788 | vector randBytes(kMaxDataLength + 8, 0); // +8 to avoid bounds checking 789 | 790 | fp61::Random prng; 791 | prng.Seed(10); 792 | 793 | for (unsigned i = 0; i < kMaxDataLength; ++i) 794 | { 795 | for (unsigned j = 0; j < 1; ++j) 796 | { 797 | // Fill the data with random bytes 798 | for (unsigned k = 0; k < i; k += 8) 799 | { 800 | uint64_t w; 801 | if (prng.Next() % 100 <= 3) { 802 | w = ~(uint64_t)0; 803 | } 804 | else { 805 | w = prng.Next(); 806 | } 807 | fp61::WriteU64_LE(&randBytes[k], w); 808 | } 809 | 810 | if (!test_byte_reader(&randBytes[0], i)) { 811 | return false; 812 | } 813 | } 814 | } 815 | 816 | cout << "Passed" << endl; 817 | 818 | return true; 819 | } 820 | 821 | 822 | 
//------------------------------------------------------------------------------ 823 | // Tests: Random 824 | 825 | static bool TestRandom() 826 | { 827 | cout << "TestRandom..."; 828 | 829 | for (int i = -1000; i < 1000; ++i) 830 | { 831 | uint64_t loWord = static_cast(i); 832 | loWord <<= 3; // Put it in the high bits 833 | uint64_t loResult = fp61::Random::ConvertRandToFp(loWord); 834 | 835 | if (loResult >= fp61::kPrime) 836 | { 837 | cout << "Failed (RandToFp low) at i = " << i << endl; 838 | FP61_DEBUG_BREAK(); 839 | return false; 840 | } 841 | 842 | uint64_t hiWord = fp61::kPrime + static_cast(i); 843 | hiWord <<= 3; // Put it in the high bits 844 | uint64_t hiResult = fp61::Random::ConvertRandToFp(hiWord); 845 | 846 | if (hiResult >= fp61::kPrime) 847 | { 848 | cout << "Failed (RandToFp high) at i = " << i << endl; 849 | FP61_DEBUG_BREAK(); 850 | return false; 851 | } 852 | } 853 | 854 | for (int i = -1000; i < 1000; ++i) 855 | { 856 | uint64_t loWord = static_cast(i); 857 | loWord <<= 3; // Put it in the high bits 858 | uint64_t loResult = fp61::Random::ConvertRandToNonzeroFp(loWord); 859 | 860 | if (loResult <= 0 || loResult >= fp61::kPrime) 861 | { 862 | cout << "Failed (RandToNonzeroFp low) at i = " << i << endl; 863 | FP61_DEBUG_BREAK(); 864 | return false; 865 | } 866 | 867 | uint64_t hiWord = fp61::kPrime + static_cast(i); 868 | hiWord <<= 3; // Put it in the high bits 869 | uint64_t hiResult = fp61::Random::ConvertRandToNonzeroFp(hiWord); 870 | 871 | if (hiResult <= 0 || hiResult >= fp61::kPrime) 872 | { 873 | cout << "Failed (RandToNonzeroFp high) at i = " << i << endl; 874 | FP61_DEBUG_BREAK(); 875 | return false; 876 | } 877 | } 878 | 879 | cout << "Passed" << endl; 880 | 881 | return true; 882 | } 883 | 884 | 885 | //------------------------------------------------------------------------------ 886 | // Tests: WordReader/WordWriter 887 | 888 | static bool TestWordSerialization() 889 | { 890 | cout << "TestWordSerialization..."; 891 | 892 | 
fp61::WordWriter writer; 893 | fp61::WordReader reader; 894 | 895 | fp61::Random prng; 896 | prng.Seed(11); 897 | 898 | std::vector data; 899 | std::vector wordData; 900 | 901 | for (unsigned i = 1; i < kMaxDataLength; ++i) 902 | { 903 | unsigned words = i; 904 | unsigned bytesNeeded = fp61::WordWriter::BytesNeeded(words); 905 | 906 | data.resize(bytesNeeded); 907 | wordData.resize(words); 908 | 909 | writer.BeginWrite(&data[0]); 910 | reader.BeginRead(&data[0], bytesNeeded); 911 | 912 | for (unsigned j = 0; j < words; ++j) 913 | { 914 | // Generate a value from 0..p because the writer technically does not care about staying within the field 915 | uint64_t w = prng.Next() & MASK61; 916 | wordData[j] = w; 917 | writer.Write(w); 918 | } 919 | writer.Flush(); 920 | 921 | for (unsigned j = 0; j < words; ++j) 922 | { 923 | uint64_t u = reader.Read(); 924 | if (u != wordData[j]) 925 | { 926 | cout << "Failed (readback failed) at i = " << i << " j = " << j << endl; 927 | FP61_DEBUG_BREAK(); 928 | return false; 929 | } 930 | } 931 | } 932 | 933 | cout << "Passed" << endl; 934 | 935 | return true; 936 | } 937 | 938 | 939 | //------------------------------------------------------------------------------ 940 | // Tests: ByteWriter 941 | 942 | bool TestByteWriter() 943 | { 944 | cout << "TestByteWriter..."; 945 | 946 | fp61::ByteReader reader; 947 | fp61::ByteWriter writer; 948 | 949 | fp61::Random prng; 950 | prng.Seed(14); 951 | 952 | std::vector original, recovered; 953 | 954 | for (unsigned i = 1; i < kMaxDataLength; ++i) 955 | { 956 | unsigned bytes = i; 957 | 958 | for (unsigned j = 0; j < 10; ++j) 959 | { 960 | // Padding to simplify tester 961 | original.resize(bytes + 8); 962 | 963 | // Fill the data with random bytes 964 | for (unsigned k = 0; k < i; k += 8) 965 | { 966 | uint64_t w; 967 | if (prng.Next() % 100 <= 3) { 968 | w = ~(uint64_t)0; 969 | } 970 | else { 971 | w = prng.Next(); 972 | } 973 | fp61::WriteU64_LE(&original[k], w); 974 | } 975 | 976 | 
reader.BeginRead(&original[0], bytes); 977 | 978 | unsigned maxWords = fp61::ByteReader::MaxWords(bytes); 979 | unsigned maxBytes = fp61::ByteWriter::MaxBytesNeeded(maxWords); 980 | 981 | recovered.resize(maxBytes); 982 | writer.BeginWrite(&recovered[0]); 983 | 984 | // Write words we get directly back out 985 | uint64_t word; 986 | while (reader.Read(word) != fp61::ReadResult::Empty) { 987 | writer.Write(word); 988 | } 989 | unsigned writtenBytes = writer.Flush(); 990 | 991 | // TBD: Check if high bits are 0? 992 | 993 | if (writtenBytes > maxBytes || 994 | writtenBytes > bytes + 8) 995 | { 996 | cout << "Failed (byte count mismatch) at i = " << i << " j = " << j << endl; 997 | FP61_DEBUG_BREAK(); 998 | return false; 999 | } 1000 | 1001 | if (0 != memcmp(&recovered[0], &original[0], bytes)) 1002 | { 1003 | cout << "Failed (data corruption) at i = " << i << " j = " << j << endl; 1004 | FP61_DEBUG_BREAK(); 1005 | return false; 1006 | } 1007 | } 1008 | } 1009 | 1010 | cout << "Passed" << endl; 1011 | 1012 | return true; 1013 | } 1014 | 1015 | 1016 | //------------------------------------------------------------------------------ 1017 | // Tests: Integration 1018 | 1019 | // Tests all of the serialization/deserialization and some math code 1020 | bool TestIntegration() 1021 | { 1022 | cout << "TestIntegration..."; 1023 | 1024 | std::vector data, recovery, recovered; 1025 | 1026 | fp61::Random prng; 1027 | prng.Seed(13); 1028 | 1029 | // Test a range of data sizes 1030 | for (unsigned i = 1; i < kMaxDataLength; ++i) 1031 | { 1032 | unsigned bytes = i; 1033 | 1034 | // Run a few tests for each size 1035 | for (unsigned j = 0; j < 10; ++j) 1036 | { 1037 | // Generate some test data: 1038 | 1039 | // Allocate padded data to simplify tester 1040 | data.resize(bytes + 8); 1041 | 1042 | // Fill the data with random bytes 1043 | for (unsigned k = 0; k < i; k += 8) 1044 | { 1045 | uint64_t w; 1046 | if (prng.Next() % 100 <= 3) { 1047 | w = ~(uint64_t)0; 1048 | } 1049 | else { 
1050 | w = prng.Next(); 1051 | } 1052 | fp61::WriteU64_LE(&data[k], w); 1053 | } 1054 | 1055 | // Read data from the simulated packet, 1056 | // perform some example Fp operation on it, 1057 | // and then store it to a simulated recovery packet. 1058 | 1059 | // Preallocate enough space in recovery packets for the worst case 1060 | const unsigned maxWords = fp61::ByteReader::MaxWords(bytes); 1061 | recovery.resize(fp61::WordWriter::BytesNeeded(maxWords)); 1062 | 1063 | fp61::WordWriter recovery_writer; 1064 | recovery_writer.BeginWrite(&recovery[0]); 1065 | 1066 | fp61::ByteReader original_reader; 1067 | original_reader.BeginRead(&data[0], bytes); 1068 | 1069 | fp61::Random coeff_prng; 1070 | coeff_prng.Seed(bytes + j * 500000); 1071 | 1072 | // Start reading words from the original file/packet, 1073 | // multiplying them by a random coefficient, 1074 | // and writing them to the recovery file/packet. 1075 | uint64_t r; 1076 | while (original_reader.Read(r) == fp61::ReadResult::Success) 1077 | { 1078 | // Pick random coefficient to multiply between 1..p-1 1079 | uint64_t coeff = coeff_prng.NextNonzeroFp(); 1080 | 1081 | // x = r * coeff (62 bits) 1082 | uint64_t x = fp61::Multiply(r, coeff); 1083 | 1084 | // Finalize x (61 bits < p) 1085 | uint64_t f = fp61::Finalize(x); 1086 | 1087 | // Write to recovery file/packet 1088 | recovery_writer.Write(f); 1089 | } 1090 | 1091 | // Flush the remaining bits to the recovery file/packet 1092 | unsigned writtenRecoveryBytes = recovery_writer.Flush(); 1093 | 1094 | // Simulate reading data from the recovery file/packet 1095 | // and recovering the original data: 1096 | 1097 | fp61::WordReader recovery_reader; 1098 | recovery_reader.BeginRead(&recovery[0], writtenRecoveryBytes); 1099 | 1100 | // Allocate space for recovered data (may be up to 1.6% larger than needed) 1101 | const unsigned recoveryWords = fp61::WordReader::WordCount(writtenRecoveryBytes); 1102 | const unsigned maxBytes = 
fp61::ByteWriter::MaxBytesNeeded(recoveryWords); 1103 | recovered.resize(maxBytes); 1104 | 1105 | fp61::ByteWriter original_writer; 1106 | original_writer.BeginWrite(&recovered[0]); 1107 | 1108 | // Reproduce the same random sequence 1109 | coeff_prng.Seed(bytes + j * 500000); 1110 | 1111 | // For each word to read: 1112 | const unsigned readWords = fp61::WordReader::WordCount(writtenRecoveryBytes); 1113 | for (unsigned i = 0; i < readWords; ++i) 1114 | { 1115 | // Pick random coefficient to multiply between 1..p-1 1116 | uint64_t coeff = coeff_prng.NextNonzeroFp(); 1117 | uint64_t inv_coeff = fp61::Inverse(coeff); 1118 | 1119 | // Read the next word (61 bits) 1120 | uint64_t f = recovery_reader.Read(); 1121 | 1122 | // Invert the multiplication (62 bits) 1123 | uint64_t x = fp61::Multiply(f, inv_coeff); 1124 | 1125 | // Finalize x (61 bits < p) 1126 | x = fp61::Finalize(x); 1127 | 1128 | // Write to recovered original data buffer 1129 | original_writer.Write(x); 1130 | } 1131 | 1132 | // Flush the remaining bits to the recovered original file/packet 1133 | unsigned recoveredBytes = original_writer.Flush(); 1134 | 1135 | if (recoveredBytes > maxBytes || 1136 | recoveredBytes > bytes + 8) 1137 | { 1138 | cout << "Failed (byte count mismatch) at i = " << i << " j = " << j << endl; 1139 | FP61_DEBUG_BREAK(); 1140 | return false; 1141 | } 1142 | 1143 | if (0 != memcmp(&recovered[0], &data[0], bytes)) 1144 | { 1145 | cout << "Failed (data corruption) at i = " << i << " j = " << j << endl; 1146 | FP61_DEBUG_BREAK(); 1147 | return false; 1148 | } 1149 | } 1150 | } 1151 | 1152 | cout << "Passed" << endl; 1153 | 1154 | return true; 1155 | } 1156 | 1157 | 1158 | //------------------------------------------------------------------------------ 1159 | // Entrypoint 1160 | 1161 | int main() 1162 | { 1163 | cout << "Unit tester for Fp61. 
Exits with -1 on failure, 0 on success" << endl; 1164 | cout << endl; 1165 | 1166 | int result = FP61_RET_SUCCESS; 1167 | 1168 | if (!TestByteWriter()) { 1169 | result = FP61_RET_FAIL; 1170 | } 1171 | if (!TestIntegration()) { 1172 | result = FP61_RET_FAIL; 1173 | } 1174 | if (!TestRandom()) { 1175 | result = FP61_RET_FAIL; 1176 | } 1177 | if (!TestWordSerialization()) { 1178 | result = FP61_RET_FAIL; 1179 | } 1180 | if (!TestNegate()) { 1181 | result = FP61_RET_FAIL; 1182 | } 1183 | if (!TestAdd()) { 1184 | result = FP61_RET_FAIL; 1185 | } 1186 | if (!TestPartialReduction()) { 1187 | result = FP61_RET_FAIL; 1188 | } 1189 | if (!TestFinalizeReduction()) { 1190 | result = FP61_RET_FAIL; 1191 | } 1192 | if (!TestMultiply()) { 1193 | result = FP61_RET_FAIL; 1194 | } 1195 | if (!TestMulInverse()) { 1196 | result = FP61_RET_FAIL; 1197 | } 1198 | if (!TestByteReader()) { 1199 | result = FP61_RET_FAIL; 1200 | } 1201 | 1202 | cout << endl; 1203 | if (result == FP61_RET_FAIL) { 1204 | cout << "*** Tests failed (see above)! Returning -1" << endl; 1205 | } 1206 | else { 1207 | cout << "*** Tests succeeded! Returning 0" << endl; 1208 | } 1209 | 1210 | return result; 1211 | } 1212 | --------------------------------------------------------------------------------