├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── fp61.cpp ├── fp61.h └── tests ├── benchmarks.cpp ├── gf256.cpp ├── gf256.h └── tests.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | x64/ 19 | x86/ 20 | bld/ 21 | [Bb]in/ 22 | [Oo]bj/ 23 | [Ll]og/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | # Uncomment if you have tasks that create the project's static files in wwwroot 28 | #wwwroot/ 29 | 30 | # MSTest test Results 31 | [Tt]est[Rr]esult*/ 32 | [Bb]uild[Ll]og.* 33 | 34 | # NUNIT 35 | *.VisualState.xml 36 | TestResult.xml 37 | 38 | # Build Results of an ATL Project 39 | [Dd]ebugPS/ 40 | [Rr]eleasePS/ 41 | dlldata.c 42 | 43 | # DNX 44 | project.lock.json 45 | artifacts/ 46 | 47 | *_i.c 48 | *_p.c 49 | *_i.h 50 | *.ilk 51 | *.meta 52 | *.obj 53 | *.pch 54 | *.pdb 55 | *.pgc 56 | *.pgd 57 | *.rsp 58 | *.sbr 59 | *.tlb 60 | *.tli 61 | *.tlh 62 | *.tmp 63 | *.tmp_proj 64 | *.log 65 | *.vspscc 66 | *.vssscc 67 | .builds 68 | *.pidb 69 | *.svclog 70 | *.scc 71 | 72 | # Chutzpah Test files 73 | _Chutzpah* 74 | 75 | # Visual C++ cache files 76 | ipch/ 77 | *.aps 78 | *.ncb 79 | *.opendb 80 | *.opensdf 81 | *.sdf 82 | *.cachefile 83 | *.VC.db 84 | *.VC.VC.opendb 85 | 86 | # Visual Studio profiler 87 | *.psess 88 | *.vsp 89 | *.vspx 90 | *.sap 91 | 92 | # TFS 2012 Local Workspace 93 | $tf/ 94 | 95 | # Guidance Automation Toolkit 96 | *.gpState 97 | 98 | # ReSharper is a .NET coding add-in 99 | _ReSharper*/ 100 | *.[Rr]e[Ss]harper 101 | *.DotSettings.user 102 | 103 | # JustCode is a .NET coding add-in 104 | 
.JustCode 105 | 106 | # TeamCity is a build add-in 107 | _TeamCity* 108 | 109 | # DotCover is a Code Coverage Tool 110 | *.dotCover 111 | 112 | # NCrunch 113 | _NCrunch_* 114 | .*crunch*.local.xml 115 | nCrunchTemp_* 116 | 117 | # MightyMoose 118 | *.mm.* 119 | AutoTest.Net/ 120 | 121 | # Web workbench (sass) 122 | .sass-cache/ 123 | 124 | # Installshield output folder 125 | [Ee]xpress/ 126 | 127 | # DocProject is a documentation generator add-in 128 | DocProject/buildhelp/ 129 | DocProject/Help/*.HxT 130 | DocProject/Help/*.HxC 131 | DocProject/Help/*.hhc 132 | DocProject/Help/*.hhk 133 | DocProject/Help/*.hhp 134 | DocProject/Help/Html2 135 | DocProject/Help/html 136 | 137 | # Click-Once directory 138 | publish/ 139 | 140 | # Publish Web Output 141 | *.[Pp]ublish.xml 142 | *.azurePubxml 143 | # TODO: Comment the next line if you want to checkin your web deploy settings 144 | # but database connection strings (with potential passwords) will be unencrypted 145 | *.pubxml 146 | *.publishproj 147 | 148 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 149 | # checkin your Azure Web App publish settings, but sensitive information contained 150 | # in these scripts will be unencrypted 151 | PublishScripts/ 152 | 153 | # NuGet Packages 154 | *.nupkg 155 | # The packages folder can be ignored because of Package Restore 156 | **/packages/* 157 | # except build/, which is used as an MSBuild target. 
158 | !**/packages/build/ 159 | # Uncomment if necessary however generally it will be regenerated when needed 160 | #!**/packages/repositories.config 161 | # NuGet v3's project.json files produces more ignoreable files 162 | *.nuget.props 163 | *.nuget.targets 164 | 165 | # Microsoft Azure Build Output 166 | csx/ 167 | *.build.csdef 168 | 169 | # Microsoft Azure Emulator 170 | ecf/ 171 | rcf/ 172 | 173 | # Windows Store app package directories and files 174 | AppPackages/ 175 | BundleArtifacts/ 176 | Package.StoreAssociation.xml 177 | _pkginfo.txt 178 | 179 | # Visual Studio cache files 180 | # files ending in .cache can be ignored 181 | *.[Cc]ache 182 | # but keep track of directories ending in .cache 183 | !*.[Cc]ache/ 184 | 185 | # Others 186 | ClientBin/ 187 | ~$* 188 | *~ 189 | *.dbmdl 190 | *.dbproj.schemaview 191 | *.pfx 192 | *.publishsettings 193 | node_modules/ 194 | orleans.codegen.cs 195 | 196 | # Since there are multiple workflows, uncomment next line to ignore bower_components 197 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 198 | #bower_components/ 199 | 200 | # RIA/Silverlight projects 201 | Generated_Code/ 202 | 203 | # Backup & report files from converting an old project file 204 | # to a newer Visual Studio version. 
Backup files are not needed, 205 | # because we have git ;-) 206 | _UpgradeReport_Files/ 207 | Backup*/ 208 | UpgradeLog*.XML 209 | UpgradeLog*.htm 210 | 211 | # SQL Server files 212 | *.mdf 213 | *.ldf 214 | 215 | # Business Intelligence projects 216 | *.rdl.data 217 | *.bim.layout 218 | *.bim_*.settings 219 | 220 | # Microsoft Fakes 221 | FakesAssemblies/ 222 | 223 | # GhostDoc plugin setting file 224 | *.GhostDoc.xml 225 | 226 | # Node.js Tools for Visual Studio 227 | .ntvs_analysis.dat 228 | 229 | # Visual Studio 6 build log 230 | *.plg 231 | 232 | # Visual Studio 6 workspace options file 233 | *.opt 234 | 235 | # Visual Studio LightSwitch build output 236 | **/*.HTMLClient/GeneratedArtifacts 237 | **/*.DesktopClient/GeneratedArtifacts 238 | **/*.DesktopClient/ModelManifest.xml 239 | **/*.Server/GeneratedArtifacts 240 | **/*.Server/ModelManifest.xml 241 | _Pvt_Extensions 242 | 243 | # Paket dependency manager 244 | .paket/paket.exe 245 | paket-files/ 246 | 247 | # FAKE - F# Make 248 | .fake/ 249 | 250 | # JetBrains Rider 251 | .idea/ 252 | *.sln.iml 253 | *.txt 254 | *.lib 255 | *.exe 256 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(fp61) 3 | 4 | set(CMAKE_CXX_STANDARD 11) 5 | 6 | # Fp61 library source files 7 | set(FP61_LIB_SRCFILES 8 | fp61.cpp 9 | fp61.h) 10 | 11 | add_library(fp61 ${FP61_LIB_SRCFILES}) 12 | 13 | add_executable(tests tests/tests.cpp) 14 | target_link_libraries(tests fp61) 15 | 16 | add_executable(benchmarks 17 | tests/benchmarks.cpp 18 | tests/gf256.h 19 | tests/gf256.cpp) 20 | target_link_libraries(benchmarks fp61) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Chris Taylor 4 
| All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fp61 2 | ## Finite field arithmetic modulo Mersenne prime p = 2^61-1 in C++ 3 | 4 | ### Disclaimer: I don't recommend using Fp61 for erasure codes. This was an experiment to see how it would perform, and unfortunately the results were not good. 
See the benchmarks below. 5 | 6 | This software takes advantage of the commonly available fast 64x64->128 multiplier 7 | to accelerate finite (base) field arithmetic. So it runs a lot faster 8 | when built into a 64-bit executable. 9 | 10 | This math code offers use of lazy reduction techniques for speed, 11 | via fp61::PartialReduce(). 12 | 13 | + Addition of 8 values can be evaluated before reduction. 14 | + Sums of 4 products can be evaluated with partial reductions. 15 | 16 | ## Benchmarks 17 | 18 | The goal of the benchmarks is to determine how fast Fp61 arithmetic is 19 | for the purpose of implementing erasure codes in software. 20 | 21 | *Drumroll...* Results: 22 | 23 | The results are not good at all. The Fp61 encoder is roughly 20x slower 24 | than my GF(2^8) code (gf256). So, I do not recommend using Fp61. 25 | 26 | The majority of the slowdown comes from the ByteReader class that needs 27 | to convert byte data into 61-bit Fp words. So it seems that having an 28 | odd field size to achieve lazy reductions does not help performance. 29 | 30 | *Sad trombone...* 31 | 32 | Benchmarks for Fp61 erasure codes. Before running the benchmarks please run the tests to make sure everything's working on your PC. It's going to run quite a bit faster with 64-bit builds because it takes advantage of the speed of 64-bit multiplications. 
33 | 34 | Testing file size = 10 bytes 35 | N = 2 : gf256_MBPS=250 Fp61_MBPS=65 Fp61_OutputBytes=16 36 | N = 4 : gf256_MBPS=305 Fp61_MBPS=116 Fp61_OutputBytes=16 37 | N = 8 : gf256_MBPS=138 Fp61_MBPS=80 Fp61_OutputBytes=16 38 | N = 16 : gf256_MBPS=337 Fp61_MBPS=110 Fp61_OutputBytes=16 39 | N = 32 : gf256_MBPS=711 Fp61_MBPS=242 Fp61_OutputBytes=16 40 | N = 64 : gf256_MBPS=665 Fp61_MBPS=226 Fp61_OutputBytes=16 41 | N = 128 : gf256_MBPS=868 Fp61_MBPS=297 Fp61_OutputBytes=16 42 | N = 256 : gf256_MBPS=713 Fp61_MBPS=240 Fp61_OutputBytes=16 43 | N = 512 : gf256_MBPS=881 Fp61_MBPS=300 Fp61_OutputBytes=16 44 | Testing file size = 100 bytes 45 | N = 2 : gf256_MBPS=1234 Fp61_MBPS=214 Fp61_OutputBytes=107 46 | N = 4 : gf256_MBPS=4000 Fp61_MBPS=486 Fp61_OutputBytes=107 47 | N = 8 : gf256_MBPS=2631 Fp61_MBPS=328 Fp61_OutputBytes=107 48 | N = 16 : gf256_MBPS=2051 Fp61_MBPS=300 Fp61_OutputBytes=107 49 | N = 32 : gf256_MBPS=3850 Fp61_MBPS=433 Fp61_OutputBytes=107 50 | N = 64 : gf256_MBPS=3972 Fp61_MBPS=428 Fp61_OutputBytes=107 51 | N = 128 : gf256_MBPS=4397 Fp61_MBPS=444 Fp61_OutputBytes=107 52 | N = 256 : gf256_MBPS=5137 Fp61_MBPS=500 Fp61_OutputBytes=107 53 | N = 512 : gf256_MBPS=5129 Fp61_MBPS=492 Fp61_OutputBytes=107 54 | Testing file size = 1000 bytes 55 | N = 2 : gf256_MBPS=10309 Fp61_MBPS=889 Fp61_OutputBytes=1007 56 | N = 4 : gf256_MBPS=15325 Fp61_MBPS=848 Fp61_OutputBytes=1007 57 | N = 8 : gf256_MBPS=9184 Fp61_MBPS=486 Fp61_OutputBytes=1007 58 | N = 16 : gf256_MBPS=12728 Fp61_MBPS=722 Fp61_OutputBytes=1007 59 | N = 32 : gf256_MBPS=11838 Fp61_MBPS=610 Fp61_OutputBytes=1007 60 | N = 64 : gf256_MBPS=10555 Fp61_MBPS=604 Fp61_OutputBytes=1007 61 | N = 128 : gf256_MBPS=11354 Fp61_MBPS=614 Fp61_OutputBytes=1007 62 | N = 256 : gf256_MBPS=14782 Fp61_MBPS=816 Fp61_OutputBytes=1007 63 | N = 512 : gf256_MBPS=18430 Fp61_MBPS=940 Fp61_OutputBytes=1007 64 | Testing file size = 10000 bytes 65 | N = 2 : gf256_MBPS=19138 Fp61_MBPS=893 Fp61_OutputBytes=10004 66 | N = 4 : gf256_MBPS=20283 
Fp61_MBPS=959 Fp61_OutputBytes=10004 67 | N = 8 : gf256_MBPS=20953 Fp61_MBPS=1010 Fp61_OutputBytes=10004 68 | N = 16 : gf256_MBPS=22893 Fp61_MBPS=1056 Fp61_OutputBytes=10004 69 | N = 32 : gf256_MBPS=24461 Fp61_MBPS=1087 Fp61_OutputBytes=10004 70 | N = 64 : gf256_MBPS=22945 Fp61_MBPS=1057 Fp61_OutputBytes=10004 71 | N = 128 : gf256_MBPS=16939 Fp61_MBPS=982 Fp61_OutputBytes=10004 72 | N = 256 : gf256_MBPS=18608 Fp61_MBPS=927 Fp61_OutputBytes=10004 73 | N = 512 : gf256_MBPS=16662 Fp61_MBPS=734 Fp61_OutputBytes=10004 74 | Testing file size = 100000 bytes 75 | N = 2 : gf256_MBPS=22941 Fp61_MBPS=962 Fp61_OutputBytes=100002 76 | N = 4 : gf256_MBPS=22827 Fp61_MBPS=976 Fp61_OutputBytes=100002 77 | N = 8 : gf256_MBPS=16210 Fp61_MBPS=1052 Fp61_OutputBytes=100002 78 | N = 16 : gf256_MBPS=17354 Fp61_MBPS=1044 Fp61_OutputBytes=100002 79 | N = 32 : gf256_MBPS=16976 Fp61_MBPS=1030 Fp61_OutputBytes=100002 80 | N = 64 : gf256_MBPS=13570 Fp61_MBPS=910 Fp61_OutputBytes=100002 81 | N = 128 : gf256_MBPS=10592 Fp61_MBPS=533 Fp61_OutputBytes=100002 82 | N = 256 : gf256_MBPS=10637 Fp61_MBPS=500 Fp61_OutputBytes=100002 83 | N = 512 : gf256_MBPS=11528 Fp61_MBPS=483 Fp61_OutputBytes=100002 84 | 85 | Note that near the end it looks like the file sizes are exceeding the processor cache and it starts slowing down by 2x. 86 | 87 | 88 | ## API 89 | 90 | Supported arithmetic operations: Add, Negation, Multiply, Mul Inverse. 91 | Subtraction is implemented via Negation. 92 | 93 | Partial Reduction from full 64 bits to 62 bits: 94 | 95 | r = fp61::PartialReduce(x) 96 | 97 | Partially reduce x (mod p). This clears bits #63 and #62. 98 | 99 | The result can be passed directly to fp61::Add4(), fp61::Multiply(), 100 | and fp61::Finalize(). 101 | 102 | Final Reduction from 64 bits to

8-bit lookup table (shuffle). 306 | 307 | Fp61 math runs fastest when a 64x64->128 multiply instruction is available, which is unavailable on ARM64. It has to use a schoolbook multiplication approach to emulate the wider multiplier, requiring 4 multiplies instead of 1. 308 | 309 | Regarding fitting data into the fields, GF(2^8) and GF(2^16) have an advantage because input data is in bytes. Data needs to be packed into Fp61 values in order to work on it, but the encoding is fairly straight-forward. 310 | 311 | Regarding erasure code applications, a random linear code based on GF(2^8) will fail to recover roughly 0.2% of the time, requiring one extra recovery packet. GF(2^16) and Fp61 have almost no overhead. 312 | 313 | 314 | #### Comparing Fp61 to Fp=2^32-5: 315 | 316 | Fp61 has a number of advantages over Fp=2^32-5 and some disadvantages. 317 | 318 | Clear advantages for Fp61: 319 | 320 | (1) Since the prime modulus leaves some extra bits in the 64-bit words, lazy reduction can be used to cut down the cost of additions by about 1/2. For erasure codes muladd/mulsub are the common operations, so cheap additions are great. 321 | 322 | (2) The reductions are cheaper for Fp61 in general due to the Mersenne prime modulus. Furthermore the reductions have no conditionals, so the performance is pretty much consistent regardless of the input. 323 | 324 | (3) The smaller field consumes data at 1/2 the rate, which is problematic because of the data packing required. Generally computers are more efficient for larger reads, so reading/writing twice as much data is more efficient. If a prefix code is used, then 2x the amount of state needs to be kept for the same amount of data. Fp61 must emit one bit for its prefix code, whereas the smaller field must emit 2-3 bits. 325 | 326 | Possible advantages for Fp=2^32-5: 327 | 328 | (1) Fp61 may be overall less efficient on mobile (ARM64) processors. 
Despite the speed disadvantages discussed above, when the 64-bit multiply instruction is unavailable, the smaller prime may pull ahead for performance. It would require benchmarking to really answer this. 329 | 330 | 331 | #### Comparing Fp61 to Fp=2^127-1: 332 | 333 | Perhaps Fp61 might be useful for cryptography for an extension field like Fp^4 for a 244-bit field. 334 | All of the operations are constant-time so it is pretty close to being good for crypto. 335 | The inverse operation would be exponentially faster since the Euler totient-based inverse function only needs to evaluate a 64-bit exponentiation instead of a 128-bit exponentiation. 336 | 337 | Because the Fp=2^127-1 field is so close to the word size, every add operation needs a reduction costing 3 cycles. The partial reduction for 2^61-1 runs in 2 cycles and only needs to be performed every 4 additions, plus it does not 338 | require any assembly code. 339 | Overall addition for 4 values is 6x faster in this field. If two 61-bit fields are used in an OEF, then addition of 61x2 = 121-bit values is 3x faster using this as a base field. 340 | 341 | Multiplication for 2^127-1 is complicated because there is no instruction that performs 128x128->256 bit products. So it requires 4 MUL instructions. Reduction requires about 10 instructions. Multiplication for 2^61-1 is done with 1 MUL instruction. 342 | Reduction requires about 7 instructions for repeated multiplies. In an OEF, the 121-bit multiply is overall less complicated, maybe by 30%? 343 | 344 | 345 | #### Ideas for future work: 346 | 347 | It may be interesting to use Fp=2^31-1 for mobile targets because the 32x32->64 multiplier that is available is a good fit for this field. Reduction is simple and may allow for some laziness to cut the reduction costs in half, but it's not clear how it would work out practically without implementing it.
348 | 349 | Solinas prime p=2^64-2^32+1 would allow a much less awkward algorithm for packing data into the field, and its reduction is even simpler than the Fp61 prime. 350 | 351 | 352 | #### Credits 353 | 354 | Software by Christopher A. Taylor . 355 | 356 | Please reach out if you need support or would like to collaborate on a project. 357 | -------------------------------------------------------------------------------- /fp61.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of Fp61 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #include "fp61.h" 30 | 31 | namespace fp61 { 32 | 33 | 34 | // This is an unrolled implementation of Knuth's unsigned version of the eGCD, 35 | // specialized for the prime. It handles any input. 36 | uint64_t Inverse(uint64_t u) 37 | { 38 | uint64_t u1, u3, v1, v3, qt; 39 | 40 | qt = u / kPrime; 41 | u3 = u % kPrime; 42 | u1 = 1; 43 | 44 | if (u3 == 0) { 45 | return 0; // No inverse 46 | } 47 | 48 | qt = kPrime / u3; 49 | v3 = kPrime % u3; 50 | v1 = qt; 51 | 52 | for (;;) 53 | { 54 | if (v3 == 0) { 55 | return u3 == 1 ? u1 : 0; 56 | } 57 | 58 | qt = u3 / v3; 59 | u3 %= v3; 60 | u1 += qt * v1; 61 | 62 | if (u3 == 0) { 63 | return v3 == 1 ? 
kPrime - v1 : 0; 64 | } 65 | 66 | qt = v3 / u3; 67 | v3 %= u3; 68 | v1 += qt * u1; 69 | } 70 | } 71 | 72 | 73 | //------------------------------------------------------------------------------ 74 | // Memory Reading 75 | 76 | uint64_t ReadBytes_LE(const uint8_t* data, unsigned bytes) 77 | { 78 | switch (bytes) 79 | { 80 | case 8: return ReadU64_LE(data); 81 | case 7: return ((uint64_t)data[6] << 48) | ((uint64_t)data[5] << 40) | ((uint64_t)data[4] << 32) | ReadU32_LE(data); 82 | case 6: return ((uint64_t)data[5] << 40) | ((uint64_t)data[4] << 32) | ReadU32_LE(data); 83 | case 5: return ((uint64_t)data[4] << 32) | ReadU32_LE(data); 84 | case 4: return ReadU32_LE(data); 85 | case 3: return ((uint32_t)data[2] << 16) | ((uint32_t)data[1] << 8) | data[0]; 86 | case 2: return ((uint32_t)data[1] << 8) | data[0]; 87 | case 1: return data[0]; 88 | default: break; 89 | } 90 | return 0; 91 | } 92 | 93 | ReadResult ByteReader::Read(uint64_t& fpOut) 94 | { 95 | uint64_t word, r, workspace = Workspace; 96 | int nextAvailable, available = Available; 97 | 98 | // If enough bits are already available: 99 | if (available >= 61) 100 | { 101 | r = workspace & kPrime; 102 | workspace >>= 61; 103 | nextAvailable = available - 61; 104 | } 105 | else 106 | { 107 | unsigned bytes = Bytes; 108 | 109 | // Read a word to fill in the difference 110 | if (bytes >= 8) 111 | { 112 | word = ReadU64_LE(Data); 113 | Data += 8; 114 | Bytes = bytes - 8; 115 | nextAvailable = available + 3; 116 | } 117 | else 118 | { 119 | if (bytes == 0 && available <= 0) { 120 | return ReadResult::Empty; 121 | } 122 | 123 | word = ReadBytes_LE(Data, bytes); 124 | Bytes = 0; 125 | 126 | // Note this may go negative but we check for that above 127 | nextAvailable = available + bytes * 8 - 61; 128 | } 129 | 130 | // This assumes workspace high bits (beyond `available`) are 0 131 | r = (workspace | (word << available)) & kPrime; 132 | 133 | // Remaining workspace bits are taken from read word 134 | workspace = word >> 
(61 - available); 135 | } 136 | 137 | // If there is ambiguity in the representation: 138 | if (IsU64Ambiguous(r)) 139 | { 140 | // This will not overflow because available <= 60. 141 | // We add up to 3 more bits, so adding one more keeps us within 64 bits. 142 | ++nextAvailable; 143 | 144 | // Insert bit 0 for 0ff..ff and 1 for 1ff..ff to resolve the ambiguity 145 | workspace = (workspace << 1) | (r >> 60); 146 | 147 | // Use kAmbiguity value for a placeholder 148 | r = kAmbiguityMask; 149 | } 150 | 151 | Workspace = workspace; 152 | Available = nextAvailable; 153 | 154 | fpOut = r; 155 | return ReadResult::Success; 156 | } 157 | 158 | uint64_t WordReader::Read() 159 | { 160 | int nextAvailable, available = Available; 161 | uint64_t r, workspace = Workspace; 162 | 163 | if (available >= 61) 164 | { 165 | r = workspace & kPrime; 166 | nextAvailable = available - 61; 167 | workspace >>= 61; 168 | } 169 | else 170 | { 171 | uint64_t word; 172 | unsigned bytes = Bytes; 173 | 174 | // If we can read a full word: 175 | if (bytes >= 8) 176 | { 177 | word = ReadU64_LE(Data); 178 | Data += 8; 179 | Bytes = bytes - 8; 180 | nextAvailable = available + 3; // +64 - 61 181 | } 182 | else 183 | { 184 | if (bytes == 0 && available <= 0) { 185 | return 0; // No data left to read 186 | } 187 | 188 | word = ReadBytes_LE(Data, bytes); 189 | 190 | // Note this may go negative but we check for negative above 191 | nextAvailable = available + bytes * 8 - 61; 192 | 193 | Bytes = 0; 194 | } 195 | 196 | r = (workspace | (word << available)) & kPrime; 197 | workspace = word >> (61 - available); 198 | } 199 | 200 | Workspace = workspace; 201 | Available = nextAvailable; 202 | 203 | return r; 204 | } 205 | 206 | 207 | //------------------------------------------------------------------------------ 208 | // Memory Writing 209 | 210 | void WriteBytes_LE(uint8_t* data, unsigned bytes, uint64_t value) 211 | { 212 | switch (bytes) 213 | { 214 | case 8: WriteU64_LE(data, value); 215 | return; 216 
| case 7: data[6] = (uint8_t)(value >> 48); 217 | case 6: data[5] = (uint8_t)(value >> 40); 218 | case 5: data[4] = (uint8_t)(value >> 32); 219 | case 4: WriteU32_LE(data, static_cast(value)); 220 | return; 221 | case 3: data[2] = (uint8_t)(value >> 16); 222 | case 2: data[1] = (uint8_t)(value >> 8); 223 | case 1: data[0] = (uint8_t)value; 224 | default: break; 225 | } 226 | } 227 | 228 | 229 | //------------------------------------------------------------------------------ 230 | // Random 231 | 232 | // From http://xoshiro.di.unimi.it/splitmix64.c 233 | // Written in 2015 by Sebastiano Vigna (vigna@acm.org) 234 | uint64_t HashU64(uint64_t x) 235 | { 236 | x += 0x9e3779b97f4a7c15; 237 | uint64_t z = x; 238 | z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; 239 | z = (z ^ (z >> 27)) * 0x94d049bb133111eb; 240 | return z ^ (z >> 31); 241 | } 242 | 243 | void Random::Seed(uint64_t x) 244 | { 245 | // Fill initial state as recommended by authors 246 | uint64_t h = HashU64(x); 247 | State[0] = h; 248 | h = HashU64(h); 249 | State[1] = h; 250 | h = HashU64(h); 251 | State[2] = h; 252 | h = HashU64(h); 253 | State[3] = h; 254 | } 255 | 256 | 257 | } // namespace fp61 258 | -------------------------------------------------------------------------------- /fp61.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 
12 | * Neither the name of Fp61 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #ifndef CAT_FP61_H 30 | #define CAT_FP61_H 31 | 32 | #include 33 | 34 | /** \mainpage 35 | Fp61 : Finite field arithmetic modulo Mersenne prime p = 2^61-1 in C++ 36 | 37 | The Fp61 software takes advantage of the commonly available fast 38 | 64x64->128 multiplier to accelerate finite (base) field arithmetic. 39 | So it runs a lot faster when built into a 64-bit executable. 40 | 41 | This math code offers use of lazy reduction techniques for speed, 42 | via fp61::PartialReduce(). 43 | 44 | + Addition of 8 values can be evaluated before reduction. 45 | + Sums of 4 products can be evaluated with partial reductions. 46 | */ 47 | 48 | // Define this to avoid any unaligned memory accesses while reading data. 49 | // This is useful as a quick-fix for mobile applications. 50 | // A preferred solution is to ensure that the data provided is aligned. 51 | // Another reason to do this is if the platform is big-endian. 
52 | //#define FP61_SAFE_MEMORY_ACCESSES 53 | 54 | 55 | //------------------------------------------------------------------------------ 56 | // Portability Macros 57 | 58 | // Compiler-specific force inline keyword 59 | #ifdef _MSC_VER 60 | # define FP61_FORCE_INLINE inline __forceinline 61 | #else 62 | # define FP61_FORCE_INLINE inline __attribute__((always_inline)) 63 | #endif 64 | 65 | 66 | //------------------------------------------------------------------------------ 67 | // Portable 64x64->128 Multiply 68 | // CAT_MUL128: r{hi,lo} = x * y 69 | 70 | // Returns low part of product, and high part is set in r_hi 71 | FP61_FORCE_INLINE uint64_t Emulate64x64to128( 72 | uint64_t& r_hi, 73 | const uint64_t x, 74 | const uint64_t y) 75 | { 76 | // Form temporary 32-bit words 77 | const uint32_t x0 = static_cast(x); 78 | const uint32_t x1 = static_cast(x >> 32); 79 | const uint32_t y0 = static_cast(y); 80 | const uint32_t y1 = static_cast(y >> 32); 81 | 82 | // Calculate 32x32->64 bit products 83 | const uint64_t p11 = static_cast(x1) * y1; 84 | const uint64_t p01 = static_cast(x0) * y1; 85 | const uint64_t p10 = static_cast(x1) * y0; 86 | const uint64_t p00 = static_cast(x0) * y0; 87 | 88 | /* 89 | This is implementing schoolbook multiplication: 90 | 91 | x1 x0 92 | X y1 y0 93 | ------------- 94 | 00 LOW PART 95 | ------------- 96 | 00 97 | 10 10 MIDDLE PART 98 | + 01 99 | ------------- 100 | 01 101 | + 11 11 HIGH PART 102 | ------------- 103 | */ 104 | 105 | // 64-bit product + two 32-bit values 106 | const uint64_t middle = p10 107 | + static_cast(p00 >> 32) 108 | + static_cast(p01); 109 | 110 | /* 111 | Proof that 64-bit products can accumulate two more 32-bit values 112 | without overflowing: 113 | 114 | Max 32-bit value is 2^32 - 1. 115 | PSum = (2^32-1) * (2^32-1) + (2^32-1) + (2^32-1) 116 | = 2^64 - 2^32 - 2^32 + 1 + 2^32 - 1 + 2^32 - 1 117 | = 2^64 - 1 118 | Therefore it cannot overflow regardless of input. 
119 | */ 120 | 121 | // 64-bit product + two 32-bit values 122 | r_hi = p11 123 | + static_cast(middle >> 32) 124 | + static_cast(p01 >> 32); 125 | 126 | // Add LOW PART and lower half of MIDDLE PART 127 | return (middle << 32) | static_cast(p00); 128 | } 129 | 130 | #if defined(_MSC_VER) && defined(_WIN64) 131 | // Visual Studio 64-bit 132 | 133 | # include 134 | # pragma intrinsic(_umul128) 135 | # define CAT_MUL128(r_hi, r_lo, x, y) \ 136 | r_lo = _umul128(x, y, &(r_hi)); 137 | 138 | #elif defined(__SIZEOF_INT128__) 139 | // Compiler supporting 128-bit values (GCC/Clang) 140 | 141 | # define CAT_MUL128(r_hi, r_lo, x, y) \ 142 | { \ 143 | unsigned __int128 w = (unsigned __int128)x * y; \ 144 | r_lo = (uint64_t)w; \ 145 | r_hi = (uint64_t)(w >> 64); \ 146 | } 147 | 148 | #else 149 | // Emulate 64x64->128-bit multiply with 64x64->64 operations 150 | 151 | # define CAT_MUL128(r_hi, r_lo, x, y) \ 152 | r_lo = Emulate64x64to128(r_hi, x, y); 153 | 154 | #endif // End CAT_MUL128 155 | 156 | 157 | namespace fp61 { 158 | 159 | 160 | //------------------------------------------------------------------------------ 161 | // Constants 162 | 163 | // p = 2^61 - 1 164 | static const uint64_t kPrime = ((uint64_t)1 << 61) - 1; 165 | 166 | // Mask where bit #63 is clear and all other bits are set. 167 | static const uint64_t kMask63 = ((uint64_t)1 << 63) - 1; 168 | 169 | 170 | //------------------------------------------------------------------------------ 171 | // API 172 | 173 | /** 174 | r = fp61::PartialReduce(x) 175 | 176 | Partially reduce x (mod p). This clears bits #63 and #62. 177 | 178 | The result can be passed directly to fp61::Add4(), fp61::Multiply(), 179 | and fp61::Finalize(). 180 | */ 181 | FP61_FORCE_INLINE uint64_t PartialReduce(uint64_t x) 182 | { 183 | // Eliminate bits #63 to #61, which may carry back up into bit #61, 184 | // So we will only definitely reduce #63 and #62. 
185 | return (x & kPrime) + (x >> 61); // 0 <= result <= 2^62 - 1 186 | } 187 | 188 | /** 189 | r = fp61::Finalize(x) 190 | 191 | Finalize reduction of x (mod p) from PartialReduce() 192 | Preconditions: Bits #63 and #62 are clear and x != 0x3ffffffffffffffeULL 193 | 194 | This function fails for x = 0x3ffffffffffffffeULL. 195 | The partial reduction function does not produce this bit pattern for any 196 | input, so this exception is allowed because I'm assuming the input comes 197 | from fp61::PartialReduce(). So, do not mask down to 62 random bits and 198 | pass to this function because it can fail in this one case. 199 | 200 | Returns a value in Fp (less than p). 201 | */ 202 | FP61_FORCE_INLINE uint64_t Finalize(uint64_t x) 203 | { 204 | // Eliminate #61. 205 | // The +1 also handles the case where x = p and x = 0x3fffffffffffffffULL. 206 | // I don't see a way to tweak this to handle 0x3ffffffffffffffeULL... 207 | return (x + ((x+1) >> 61)) & kPrime; // 0 <= result < p 208 | } 209 | 210 | /** 211 | r = fp61::Add4(x, y, z, w) 212 | 213 | Sum x + y + z + w (without full reduction modulo p). 214 | Preconditions: x,y,z,w <2^62 215 | 216 | Probably you will want to just inline this code and follow the pattern, 217 | since being restricted to adding 4 things at a time is kind of weird. 218 | 219 | The result can be passed directly to fp61::Add4(), fp61::Multiply(), and 220 | fp61::Finalize(). 221 | */ 222 | FP61_FORCE_INLINE uint64_t Add4(uint64_t x, uint64_t y, uint64_t z, uint64_t w) 223 | { 224 | return PartialReduce(x + y + z + w); 225 | } 226 | 227 | /** 228 | r = fp61::Negate(x) 229 | 230 | r = -x (without reduction modulo p) 231 | Preconditions: x <= p 232 | 233 | The input needs to be have bits #63 #62 #61 cleared. 234 | This can be ensured by calling fp61::PartialReduce() and 235 | fp61::Finalize() first. Since this is more expensive than addition 236 | it is best to reorganize operations to avoid needing this reduction. 237 | 238 | Return a value <= p. 
239 | */ 240 | FP61_FORCE_INLINE uint64_t Negate(uint64_t x) 241 | { 242 | return kPrime - x; 243 | } 244 | 245 | // For subtraction, use fp61::Negate() and add: x + (-y). 246 | 247 | /** 248 | r = fp61::Multiply(x, y) 249 | 250 | r = x * y (with partial reduction modulo p) 251 | 252 | Important Input Restriction: 253 | 254 | The number of bits between x and y must be less than 124 bits. 255 | 256 | Call fp61::PartialReduce() to reduce inputs if needed, 257 | which makes sure that both inputs are 62 bits or fewer. 258 | 259 | Example: If x <= 2^62-1 (62 bits), then y <= 2^62-1 (62 bits). 260 | This means that up to 2 values can be accumulated in x and 2 in y. 261 | 262 | But it is also possible to balance the input in other ways. 263 | 264 | Example: If x <= 2^61-1 (61 bits), then y <= 2^63-1 (63 bits). 265 | This means that up to 4 values can be accumulated in y. 266 | 267 | Result: 268 | 269 | The result is stored in bits #61 to #0 (62 bits of the word). 270 | Call fp61::Finalize() to reduce the result to 61 bits. 271 | */ 272 | FP61_FORCE_INLINE uint64_t Multiply(uint64_t x, uint64_t y) 273 | { 274 | uint64_t p_lo, p_hi; 275 | CAT_MUL128(p_hi, p_lo, x, y); 276 | 277 | /* 278 | Largest x,y = p - 1 = 2^61 - 2 = L. 279 | 280 | L*L = (2^61-2) * (2^61-2) 281 | = 2^(61+61) - 4*2^61 + 4 282 | = 2^122 - 2^63 + 4 283 | That is the high 6 bits are zero. 284 | 285 | We represent the product as two 64-bit words, or 128 bits. 286 | 287 | Say the low bit bit #64 is set in the high word. 288 | To eliminate this bit we need to subtract (2^61 - 1) * 2^3. 289 | This means we need to add a bit at #3. 290 | Similarly for bit #65 we need to add a bit at #4. 291 | 292 | High bits #127 to #125 affect high bits #66 to #64. 293 | High bits #124 to #64 affect low bits #63 to #3. 294 | Low bits #63 to #61 affect low bits #2 to #0. 295 | 296 | If we eliminate from high bits to low bits, then we could carry back 297 | up into the high bits again. 
So we should instead eliminate bits #61 298 | through #63 first to prevent carries into the high word. 299 | */ 300 | 301 | // Eliminate bits #63 to #61, which may carry back up into bit #61, 302 | // So we will only definitely reduce #63 and #62. 303 | uint64_t r = (p_lo & kPrime) + (p_lo >> 61); 304 | 305 | // Eliminate bits #123 to #64 (60 bits). 306 | // This stops short of #124 that would affect bit #63 because it 307 | // prevents the addition from overflowing the 64-bit word. 308 | r += ((p_hi << 3) & kMask63); 309 | 310 | // This last reduction step is not strictly necessary, but is almost always 311 | // a good idea when used to implement some algorithm, so I include it. 312 | // Partially reduce the result to clear the high 2 bits. 313 | return PartialReduce(r); 314 | } 315 | 316 | /** 317 | r = fp61::Inverse(x) 318 | 319 | r = x^-1 (mod p) 320 | The input value x can be any 64-bit value. 321 | 322 | This operation is kind of heavy so it should be avoided where possible. 323 | 324 | This operation is not constant-time. 325 | A constant-time version can be implemented using Euler's totient method and 326 | a straight line similar to https://github.com/catid/snowshoe/blob/master/src/fp.inc#L545 327 | 328 | Returns the multiplicative inverse of x modulo p. 329 | 0 < result < p 330 | 331 | If the inverse does not exist, it returns 0. 
332 | */ 333 | uint64_t Inverse(uint64_t x); 334 | 335 | 336 | //------------------------------------------------------------------------------ 337 | // Memory Reading 338 | 339 | /// Read 8 bytes in little-endian byte order 340 | FP61_FORCE_INLINE uint64_t ReadU64_LE(const uint8_t* data) 341 | { 342 | #ifdef FP61_SAFE_MEMORY_ACCESSES 343 | return ((uint64_t)data[7] << 56) | ((uint64_t)data[6] << 48) | ((uint64_t)data[5] << 40) | 344 | ((uint64_t)data[4] << 32) | ((uint64_t)data[3] << 24) | ((uint64_t)data[2] << 16) | 345 | ((uint64_t)data[1] << 8) | data[0]; 346 | #else 347 | const uint64_t* wordPtr = reinterpret_cast(data); 348 | return *wordPtr; 349 | #endif 350 | } 351 | 352 | /// Read 4 bytes in little-endian byte order 353 | FP61_FORCE_INLINE uint32_t ReadU32_LE(const uint8_t* data) 354 | { 355 | #ifdef FP61_SAFE_MEMORY_ACCESSES 356 | return ((uint32_t)data[3] << 24) | ((uint32_t)data[2] << 16) | ((uint32_t)data[1] << 8) | data[0]; 357 | #else 358 | const uint32_t* wordPtr = reinterpret_cast(data); 359 | return *wordPtr; 360 | #endif 361 | } 362 | 363 | /// Read between 0..8 bytes in little-endian byte order 364 | /// Returns 0 for any other value for `bytes` 365 | uint64_t ReadBytes_LE(const uint8_t* data, unsigned bytes); 366 | 367 | enum class ReadResult 368 | { 369 | Success, ///< Read returned with a word of data 370 | Empty ///< No data remaining to read 371 | }; 372 | 373 | /** 374 | Fitting Bytes Into Words 375 | 376 | When converting byte data to words, a value of 2^61-1 is problematic 377 | because it does not fit in the field Fp that ranges from 0..(2^61-2). 378 | 379 | One way to fit these values into the field would be to emit 1ff..ffe 380 | for both 1ff..ffe and 1ff..fff, and then inject a new bit after it to 381 | allow the ByteWriter code to reverse the transformation. The problem 382 | with this is that the lower bit is modified, which is the same one 383 | that signals how the prior word is modified. 
384 | 385 | So a better way to fix 1ff..fff is to make it ambiguous with 0ff..fff, 386 | where the high bit of the word is flipped. Now when 0ff..fff is seen 387 | by the ByteWriter, it knows to check the next word's low bit and 388 | optionally reverse it back to 1ff..fff. 389 | 390 | As an aside, we want to design the ByteReader to be as fast as possible 391 | because it is used by the erasure code encoder - The decoder must only 392 | reverse this transformation for any lost data, so it can be slower. 393 | 394 | It may be a good idea to XOR input data by a random sequence to randomize 395 | the odds of using extra bits, depending on the application. 396 | */ 397 | static const uint64_t kAmbiguityMask = ((uint64_t)1 << 60) - 1; // 0x0ff...fff 398 | 399 | /// Returns true if the U64 word provided needs an extra bit to represent it 400 | FP61_FORCE_INLINE bool IsU64Ambiguous(uint64_t u64_word) 401 | { 402 | return (u64_word & kAmbiguityMask) == kAmbiguityMask; 403 | } 404 | 405 | /// Returns true if this Fp word could have originally been 0ff..ff or 1ff..ff 406 | FP61_FORCE_INLINE bool IsFpAmbiguous(uint64_t fp_word) 407 | { 408 | return fp_word == kAmbiguityMask; 409 | } 410 | 411 | /** 412 | ByteReader 413 | 414 | Reads 8 bytes at a time from the input data and outputs 61-bit Fp words. 415 | Pads the final < 8 bytes with zeros. 416 | 417 | See the comments on Fitting Bytes Into Words for how this works. 418 | 419 | Call ByteReader::MaxWords() to calculate the maximum number of words that 420 | can be generated for worst-case input of all FFF...FFs. 421 | 422 | Define FP61_SAFE_MEMORY_ACCESSES if the platform does not support unaligned 423 | reads and the input data is unaligned, or the platform is big-endian. 424 | 425 | Call BeginRead() to begin reading. 426 | 427 | Call ReadNext() repeatedly to read all words from the data. 428 | It will return ReadResult::Empty when all bits are empty. 
429 | */ 430 | struct ByteReader 431 | { 432 | const uint8_t* Data; 433 | unsigned Bytes; 434 | uint64_t Workspace; 435 | int Available; 436 | 437 | 438 | /// Calculates and returns the maximum number of Fp field words that may be 439 | /// produced by the ByteReader. 440 | static FP61_FORCE_INLINE unsigned MaxWords(unsigned bytes) 441 | { 442 | unsigned bits = bytes * 8; 443 | 444 | // Round up to the nearest word. 445 | // All words may be expanded by one bit, hence the (bits/61) factor. 446 | return (bits + (bits / 61) + 60) / 61; 447 | } 448 | 449 | /// Begin reading data 450 | FP61_FORCE_INLINE void BeginRead(const uint8_t* data, unsigned bytes) 451 | { 452 | Data = data; 453 | Bytes = bytes; 454 | Workspace = 0; 455 | Available = 0; 456 | } 457 | 458 | /// Returns ReadResult::Empty when no more data is available. 459 | /// Otherwise fpOut will be a value between 0 and p-1. 460 | ReadResult Read(uint64_t& fpOut); 461 | }; 462 | 463 | /** 464 | WordReader 465 | 466 | Reads a series of 61-bit finalized Fp field elements from a byte array. 467 | 468 | This differs from ByteReader in two ways: 469 | (1) It does not have to handle the special case of all ffffs. 470 | (2) It terminates deterministically at WordCount() words rather than 471 | based on the contents of the data. 472 | 473 | Call WordCount() to calculate the number of words to expect to read from 474 | a given number of bytes. 475 | 476 | Call BeginRead() to start reading. 477 | Call Read() to retrieve each consecutive word. 
478 | */ 479 | struct WordReader 480 | { 481 | const uint8_t* Data; 482 | unsigned Bytes; 483 | uint64_t Workspace; 484 | unsigned Available; 485 | 486 | 487 | /// Calculate the number of words that can be read from a number of bytes 488 | static FP61_FORCE_INLINE unsigned WordCount(unsigned bytes) 489 | { 490 | // Note that only whole (not partial) words can be read, so this rounds down 491 | return (bytes * 8) / 61; 492 | } 493 | 494 | /// Begin writing to the given memory location 495 | FP61_FORCE_INLINE void BeginRead(const uint8_t* data, unsigned bytes) 496 | { 497 | Data = data; 498 | Bytes = bytes; 499 | Workspace = 0; 500 | Available = 0; 501 | } 502 | 503 | /// Read the next word. 504 | /// It is up to the application to know when to stop reading, 505 | /// based on the WordCount() count of words to read. 506 | uint64_t Read(); 507 | }; 508 | 509 | 510 | //------------------------------------------------------------------------------ 511 | // Memory Writing 512 | 513 | /// Write 4 bytes in little-endian byte order 514 | FP61_FORCE_INLINE void WriteU32_LE(uint8_t* data, uint32_t value) 515 | { 516 | #ifdef FP61_SAFE_MEMORY_ACCESSES 517 | data[3] = (uint8_t)(value >> 24); 518 | data[2] = (uint8_t)(value >> 16); 519 | data[1] = (uint8_t)(value >> 8); 520 | data[0] = (uint8_t)value; 521 | #else 522 | uint32_t* wordPtr = reinterpret_cast(data); 523 | *wordPtr = value; 524 | #endif 525 | } 526 | 527 | /// Write 8 bytes in little-endian byte order 528 | FP61_FORCE_INLINE void WriteU64_LE(uint8_t* data, uint64_t value) 529 | { 530 | #ifdef FP61_SAFE_MEMORY_ACCESSES 531 | data[7] = (uint8_t)(value >> 56); 532 | data[6] = (uint8_t)(value >> 48); 533 | data[5] = (uint8_t)(value >> 40); 534 | data[4] = (uint8_t)(value >> 32); 535 | data[3] = (uint8_t)(value >> 24); 536 | data[2] = (uint8_t)(value >> 16); 537 | data[1] = (uint8_t)(value >> 8); 538 | data[0] = (uint8_t)value; 539 | #else 540 | uint64_t* wordPtr = reinterpret_cast(data); 541 | *wordPtr = value; 542 | 
#endif 543 | } 544 | 545 | /// Write between 0..8 bytes in little-endian byte order 546 | void WriteBytes_LE(uint8_t* data, unsigned bytes, uint64_t value); 547 | 548 | /** 549 | WordWriter 550 | 551 | Writes a series of 61-bit finalized Fp field elements to a byte array. 552 | The resulting data can be read by WordReader. 553 | 554 | Call BytesNeeded() to calculate the number of bytes needed to store the 555 | given number of Fp words. 556 | 557 | Call BeginWrite() to start writing. 558 | Call Write() to write the next word. 559 | 560 | Call Flush() to write the last few bytes. 561 | Flush() returns the number of overall written bytes. 562 | */ 563 | struct WordWriter 564 | { 565 | uint8_t* Data; 566 | uint8_t* DataWritePtr; 567 | uint64_t Workspace; 568 | unsigned Available; 569 | 570 | 571 | /// Calculate the number of bytes that will be written 572 | /// for the given number of Fp words. 573 | static FP61_FORCE_INLINE unsigned BytesNeeded(unsigned words) 574 | { 575 | // 61 bits per word 576 | const unsigned bits = words * 61; 577 | 578 | // Round up to the next byte 579 | return (bits + 7) / 8; 580 | } 581 | 582 | /// Begin writing to the given memory location. 583 | /// It is up to the application to provide enough space in the buffer by 584 | /// using BytesNeeded() to calculate the buffer size. 
585 | FP61_FORCE_INLINE void BeginWrite(uint8_t* data) 586 | { 587 | Data = data; 588 | DataWritePtr = data; 589 | Workspace = 0; 590 | Available = 0; 591 | } 592 | 593 | /// Write the next word 594 | FP61_FORCE_INLINE void Write(uint64_t word) 595 | { 596 | unsigned available = Available; 597 | uint64_t workspace = Workspace; 598 | 599 | // Include any bits that fit 600 | workspace |= word << available; 601 | available += 61; 602 | 603 | // If there is a full word now: 604 | if (available >= 64) 605 | { 606 | // Write the word 607 | WriteU64_LE(DataWritePtr, workspace); 608 | DataWritePtr += 8; 609 | available -= 64; 610 | 611 | // Keep remaining bits 612 | workspace = word >> (61 - available); 613 | } 614 | 615 | Workspace = workspace; 616 | Available = available; 617 | } 618 | 619 | /// Flush the output, writing fractions of a word if needed. 620 | /// This must be called or the output may be truncated. 621 | /// Returns the number of bytes written overall. 622 | FP61_FORCE_INLINE unsigned Flush() 623 | { 624 | const unsigned finalBytes = (Available + 7) / 8; 625 | 626 | // Write the number of available bytes 627 | WriteBytes_LE(DataWritePtr, finalBytes, Workspace); 628 | 629 | // Calculate number of bytes written overall 630 | const uintptr_t writtenBytes = static_cast(DataWritePtr - Data) + finalBytes; 631 | 632 | return static_cast(writtenBytes); 633 | } 634 | }; 635 | 636 | /** 637 | ByteWriter 638 | 639 | Writes a series of 61-bit finalized Fp field elements to a byte array, 640 | reversing the encoding of ByteReader. This is different from WordWriter 641 | because it can also write 61-bit values that are all ones (outside of Fp). 642 | 643 | See the comments on Fitting Bytes Into Words for how this works. 644 | 645 | Call MaxBytesNeeded() to calculate the maximum number of bytes needed 646 | to store the given number of Fp words. 647 | 648 | Call BeginWrite() to start writing. 649 | Call Write() to write the next word. 
650 | 651 | Call Flush() to write the last few bytes. 652 | Flush() returns the number of overall written bytes. 653 | */ 654 | struct ByteWriter 655 | { 656 | WordWriter Writer; 657 | bool Packed; 658 | 659 | /// Calculate the maximum number of bytes that will be written for the 660 | /// given number of Fp words. May be up to 1.6% larger than necessary. 661 | static FP61_FORCE_INLINE unsigned MaxBytesNeeded(unsigned words) 662 | { 663 | return WordWriter::BytesNeeded(words); 664 | } 665 | 666 | /// Begin writing to the given memory location. 667 | /// It is up to the application to provide enough space in the buffer by 668 | /// using MaxBytesNeeded() to calculate the buffer size. 669 | FP61_FORCE_INLINE void BeginWrite(uint8_t* data) 670 | { 671 | Writer.BeginWrite(data); 672 | Packed = false; 673 | } 674 | 675 | /// Write the next word 676 | FP61_FORCE_INLINE void Write(uint64_t word) 677 | { 678 | const unsigned word_bits = (word == kAmbiguityMask) ? 60 : 61; 679 | 680 | unsigned available = Writer.Available; 681 | uint64_t workspace = Writer.Workspace; 682 | 683 | // Include any bits that fit 684 | workspace |= word << available; 685 | available += word_bits; 686 | 687 | // If there is a full word now: 688 | if (available >= 64) 689 | { 690 | // Write the word 691 | WriteU64_LE(Writer.DataWritePtr, workspace); 692 | Writer.DataWritePtr += 8; 693 | available -= 64; 694 | 695 | // Keep remaining bits 696 | workspace = word >> (word_bits - available); 697 | } 698 | 699 | Writer.Workspace = workspace; 700 | Writer.Available = available; 701 | } 702 | 703 | /// Flush the output, writing fractions of a word if needed. 704 | /// This must be called or the output may be truncated. 705 | /// Returns the number of bytes written overall. 
706 | FP61_FORCE_INLINE unsigned Flush() 707 | { 708 | return Writer.Flush(); 709 | } 710 | }; 711 | 712 | 713 | //------------------------------------------------------------------------------ 714 | // Random Numbers 715 | 716 | #define CAT_ROL64(x, bits) ( ((uint64_t)(x) << (bits)) | ((uint64_t)(x) >> (64 - (bits))) ) 717 | 718 | /** 719 | Random 720 | 721 | Xoshiro256+ based pseudo-random number generator (PRNG) that can generate 722 | random numbers between 1..p. NextNonzeroFp() is mainly intended to be used 723 | for producing convolutional code coefficients to multiply by the data. 724 | 725 | Call Seed() to provide a 64-bit generator seed. 726 | Call NextNonzeroFp() to produce a random 61-bit number from 1..p 727 | Call NextFp() to produce a random 61-bit number from 0..p 728 | Call Next() to produce a random 64-bit number. 729 | */ 730 | struct Random 731 | { 732 | uint64_t State[4]; 733 | 734 | 735 | /// Seed the generator 736 | void Seed(uint64_t x); 737 | 738 | /// Get the next 64-bit random number. 739 | /// The low 3 bits are slightly weak according to the authors. 740 | // From http://xoshiro.di.unimi.it/xoshiro256plus.c 741 | // Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org) 742 | FP61_FORCE_INLINE uint64_t Next() 743 | { 744 | uint64_t s0 = State[0], s1 = State[1], s2 = State[2], s3 = State[3]; 745 | 746 | const uint64_t result = s0 + s3; 747 | 748 | const uint64_t t = s1 << 17; 749 | s2 ^= s0; 750 | s3 ^= s1; 751 | s1 ^= s2; 752 | s0 ^= s3; 753 | s2 ^= t; 754 | s3 = CAT_ROL64(s3, 45); 755 | 756 | State[0] = s0, State[1] = s1, State[2] = s2, State[3] = s3; 757 | 758 | return result; 759 | } 760 | 761 | static FP61_FORCE_INLINE uint64_t ConvertRandToFp(uint64_t word) 762 | { 763 | // Pick high bits as recommended by Xoshiro authors 764 | word >>= 3; 765 | 766 | // If word + 1 overflows, then subtract 1. 767 | // This converts fffff to ffffe and slightly biases the PRNG. 
768 | word -= (word + 1) >> 61; 769 | 770 | return word; 771 | } 772 | 773 | static FP61_FORCE_INLINE uint64_t ConvertRandToNonzeroFp(uint64_t word) 774 | { 775 | word = ConvertRandToFp(word); 776 | 777 | // If word - 1 borrows out, then add 1. 778 | // This converts 0 to 1 and slightly biases the PRNG. 779 | word += (word - 1) >> 63; 780 | 781 | return word; 782 | } 783 | 784 | /// Get the next random value between 0..p 785 | FP61_FORCE_INLINE uint64_t NextFp() 786 | { 787 | return ConvertRandToFp(Next()); 788 | } 789 | 790 | /// Get the next random value between 1..p 791 | FP61_FORCE_INLINE uint64_t NextNonzeroFp() 792 | { 793 | return ConvertRandToNonzeroFp(Next()); 794 | } 795 | }; 796 | 797 | /// Hash a 64-bit value to another 64-bit value 798 | uint64_t HashU64(uint64_t x); 799 | 800 | /// Hash a seed into a value from 1..p-1 801 | FP61_FORCE_INLINE uint64_t HashToNonzeroFp(uint64_t word) 802 | { 803 | // Run a simple mixer based on HashU64() 804 | word += 0x9e3779b97f4a7c15; 805 | word = (word ^ (word >> 30)) * 0xbf58476d1ce4e5b9; 806 | 807 | // Take the top 61 bits 808 | word >>= 3; 809 | 810 | // Eliminate values = p 811 | word -= (word + 1) >> 61; 812 | 813 | // Eliminate values = 0 814 | word += (word - 1) >> 63; 815 | 816 | return word; 817 | } 818 | 819 | 820 | } // namespace fp61 821 | 822 | 823 | #endif // CAT_FP61_H 824 | -------------------------------------------------------------------------------- /tests/benchmarks.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 
9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of Fp61 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #include "../fp61.h" 30 | #include "gf256.h" 31 | 32 | #define FP61_ENABLE_GF256_COMPARE 33 | 34 | /** 35 | Fp61 Benchmarks 36 | 37 | The goal of the benchmarks is to determine how fast Fp61 arithmetic is 38 | for the purpose of implementing erasure codes in software. 39 | 40 | 41 | *Drumroll...* Results: 42 | 43 | The results are not good at all. The Fp61 encoder is roughly 20x slower 44 | than my Galois field code (gf256). So, I do not recommend using Fp61. 45 | 46 | The majority of the slowdown comes from the ByteReader class that needs 47 | to convert byte data into 61-bit Fp words. So it seems that having an 48 | odd field size to achieve lazy reductions does not help performance. 
49 | 50 | *Sad trombone...* 51 | 52 | Benchmarks for Fp61 erasure codes. Before running the benchmarks please run the tests to make sure everything's working on your PC. It's going to run quite a bit faster with 64-bit builds because it takes advantage of the speed of 64-bit multiplications. 53 | 54 | Testing file size = 10 bytes 55 | N = 2 : gf256_MBPS=250 Fp61_MBPS=65 Fp61_OutputBytes=16 56 | N = 4 : gf256_MBPS=305 Fp61_MBPS=116 Fp61_OutputBytes=16 57 | N = 8 : gf256_MBPS=138 Fp61_MBPS=80 Fp61_OutputBytes=16 58 | N = 16 : gf256_MBPS=337 Fp61_MBPS=110 Fp61_OutputBytes=16 59 | N = 32 : gf256_MBPS=711 Fp61_MBPS=242 Fp61_OutputBytes=16 60 | N = 64 : gf256_MBPS=665 Fp61_MBPS=226 Fp61_OutputBytes=16 61 | N = 128 : gf256_MBPS=868 Fp61_MBPS=297 Fp61_OutputBytes=16 62 | N = 256 : gf256_MBPS=713 Fp61_MBPS=240 Fp61_OutputBytes=16 63 | N = 512 : gf256_MBPS=881 Fp61_MBPS=300 Fp61_OutputBytes=16 64 | Testing file size = 100 bytes 65 | N = 2 : gf256_MBPS=1234 Fp61_MBPS=214 Fp61_OutputBytes=107 66 | N = 4 : gf256_MBPS=4000 Fp61_MBPS=486 Fp61_OutputBytes=107 67 | N = 8 : gf256_MBPS=2631 Fp61_MBPS=328 Fp61_OutputBytes=107 68 | N = 16 : gf256_MBPS=2051 Fp61_MBPS=300 Fp61_OutputBytes=107 69 | N = 32 : gf256_MBPS=3850 Fp61_MBPS=433 Fp61_OutputBytes=107 70 | N = 64 : gf256_MBPS=3972 Fp61_MBPS=428 Fp61_OutputBytes=107 71 | N = 128 : gf256_MBPS=4397 Fp61_MBPS=444 Fp61_OutputBytes=107 72 | N = 256 : gf256_MBPS=5137 Fp61_MBPS=500 Fp61_OutputBytes=107 73 | N = 512 : gf256_MBPS=5129 Fp61_MBPS=492 Fp61_OutputBytes=107 74 | Testing file size = 1000 bytes 75 | N = 2 : gf256_MBPS=10309 Fp61_MBPS=889 Fp61_OutputBytes=1007 76 | N = 4 : gf256_MBPS=15325 Fp61_MBPS=848 Fp61_OutputBytes=1007 77 | N = 8 : gf256_MBPS=9184 Fp61_MBPS=486 Fp61_OutputBytes=1007 78 | N = 16 : gf256_MBPS=12728 Fp61_MBPS=722 Fp61_OutputBytes=1007 79 | N = 32 : gf256_MBPS=11838 Fp61_MBPS=610 Fp61_OutputBytes=1007 80 | N = 64 : gf256_MBPS=10555 Fp61_MBPS=604 Fp61_OutputBytes=1007 81 | N = 128 : gf256_MBPS=11354 Fp61_MBPS=614 
Fp61_OutputBytes=1007 82 | N = 256 : gf256_MBPS=14782 Fp61_MBPS=816 Fp61_OutputBytes=1007 83 | N = 512 : gf256_MBPS=18430 Fp61_MBPS=940 Fp61_OutputBytes=1007 84 | Testing file size = 10000 bytes 85 | N = 2 : gf256_MBPS=19138 Fp61_MBPS=893 Fp61_OutputBytes=10004 86 | N = 4 : gf256_MBPS=20283 Fp61_MBPS=959 Fp61_OutputBytes=10004 87 | N = 8 : gf256_MBPS=20953 Fp61_MBPS=1010 Fp61_OutputBytes=10004 88 | N = 16 : gf256_MBPS=22893 Fp61_MBPS=1056 Fp61_OutputBytes=10004 89 | N = 32 : gf256_MBPS=24461 Fp61_MBPS=1087 Fp61_OutputBytes=10004 90 | N = 64 : gf256_MBPS=22945 Fp61_MBPS=1057 Fp61_OutputBytes=10004 91 | N = 128 : gf256_MBPS=16939 Fp61_MBPS=982 Fp61_OutputBytes=10004 92 | N = 256 : gf256_MBPS=18608 Fp61_MBPS=927 Fp61_OutputBytes=10004 93 | N = 512 : gf256_MBPS=16662 Fp61_MBPS=734 Fp61_OutputBytes=10004 94 | Testing file size = 100000 bytes 95 | N = 2 : gf256_MBPS=22941 Fp61_MBPS=962 Fp61_OutputBytes=100002 96 | N = 4 : gf256_MBPS=22827 Fp61_MBPS=976 Fp61_OutputBytes=100002 97 | N = 8 : gf256_MBPS=16210 Fp61_MBPS=1052 Fp61_OutputBytes=100002 98 | N = 16 : gf256_MBPS=17354 Fp61_MBPS=1044 Fp61_OutputBytes=100002 99 | N = 32 : gf256_MBPS=16976 Fp61_MBPS=1030 Fp61_OutputBytes=100002 100 | N = 64 : gf256_MBPS=13570 Fp61_MBPS=910 Fp61_OutputBytes=100002 101 | N = 128 : gf256_MBPS=10592 Fp61_MBPS=533 Fp61_OutputBytes=100002 102 | N = 256 : gf256_MBPS=10637 Fp61_MBPS=500 Fp61_OutputBytes=100002 103 | N = 512 : gf256_MBPS=11528 Fp61_MBPS=483 Fp61_OutputBytes=100002 104 | 105 | 106 | Erasure codes are usually based on 8-bit Galois fields, but I was 107 | intrigued by the speed of the 64-bit multiplier on modern Intel processors. 108 | To take advantage of the fast multiplier I first looked at a number of field 109 | options before settling on Fp=2^61-1. Note that I haven't benchmarked 110 | these other options so my comments might be misleading or incorrect. 
111 | 112 | Some other options I investigated: 113 | 114 | Fp=2^64+c 115 | - The values of `c` that I found had a high Hamming weight so would be 116 | expensive to reduce using the pseudo-Mersenne reduction approach. 117 | - These seem to be patented. Didn't really look into that issue. 118 | - Fp values do not fit into 64-bit words so they're slower to work with. 119 | - The reduction seems to require 128-bit adds/subs to implement properly, 120 | which are awkward to implement on some compilers. 121 | - There's no room for lazy reductions, so adds/subs are more expensive. 122 | 123 | Fp=2^64-c, specifically Solinas prime Fp=2^64-2^8-1 124 | - The smallest values of `c` that I found had a high Hamming weight so would 125 | be expensive to reduce using the pseudo-Mersenne reduction approach. 126 | - The reduction seems to require 128-bit adds/subs to implement properly, 127 | which are awkward to implement on some compilers. 128 | - There's no room for lazy reductions, so adds/subs are more expensive. 129 | ? Packing might be a littler simpler since all data is word-sized ? 130 | 131 | Reduction approaches considered: 132 | 133 | Montgomery: 134 | This requires that the Montgomery u factor has a low Hamming weight to 135 | implement efficiently. p=2^64-2^32+1 happens to have this by chance, 136 | but it's a rare property. It then requires two 128-bit products and adds. 137 | 138 | Pseudo-Mersenne: 139 | This does not require an efficient u factor, but still requires similarly 140 | two 128-bit products and adds. 141 | 142 | Mersenne: 143 | This is what Fp61 uses. The reduction has to be applied multiple times to 144 | fully flush data back into the field < p, and it restricts the sizes of the 145 | inputs to 62 bits. But in trade, no 128-bit operations are needed. 
146 | */ 147 | 148 | #include 149 | #include 150 | #include 151 | #include 152 | using namespace std; 153 | 154 | 155 | #ifdef _WIN32 156 | #ifndef NOMINMAX 157 | #define NOMINMAX 158 | #endif 159 | #include 160 | #elif __MACH__ 161 | #include 162 | #include 163 | #include 164 | 165 | extern mach_port_t clock_port; 166 | #else 167 | #include 168 | #include 169 | #endif 170 | 171 | 172 | //------------------------------------------------------------------------------ 173 | // Timing 174 | 175 | #ifdef _WIN32 176 | // Precomputed frequency inverse 177 | static double PerfFrequencyInverseUsec = 0.; 178 | static double PerfFrequencyInverseMsec = 0.; 179 | 180 | static void InitPerfFrequencyInverse() 181 | { 182 | LARGE_INTEGER freq = {}; 183 | if (!::QueryPerformanceFrequency(&freq) || freq.QuadPart == 0) 184 | return; 185 | const double invFreq = 1. / (double)freq.QuadPart; 186 | PerfFrequencyInverseUsec = 1000000. * invFreq; 187 | PerfFrequencyInverseMsec = 1000. * invFreq; 188 | } 189 | #elif __MACH__ 190 | static bool m_clock_serv_init = false; 191 | static clock_serv_t m_clock_serv = 0; 192 | 193 | static void InitClockServ() 194 | { 195 | m_clock_serv_init = true; 196 | host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &m_clock_serv); 197 | } 198 | #endif // _WIN32 199 | 200 | uint64_t GetTimeUsec() 201 | { 202 | #ifdef _WIN32 203 | LARGE_INTEGER timeStamp = {}; 204 | if (!::QueryPerformanceCounter(&timeStamp)) 205 | return 0; 206 | if (PerfFrequencyInverseUsec == 0.) 
207 | InitPerfFrequencyInverse(); 208 | return (uint64_t)(PerfFrequencyInverseUsec * timeStamp.QuadPart); 209 | #elif __MACH__ 210 | if (!m_clock_serv_init) 211 | InitClockServ(); 212 | 213 | mach_timespec_t tv; 214 | clock_get_time(m_clock_serv, &tv); 215 | 216 | return 1000000 * tv.tv_sec + tv.tv_nsec / 1000; 217 | #else 218 | struct timeval tv; 219 | gettimeofday(&tv, nullptr); 220 | return 1000000 * tv.tv_sec + tv.tv_usec; 221 | #endif 222 | } 223 | 224 | 225 | //------------------------------------------------------------------------------ 226 | // Fp61 Erasure Code Encoder 227 | 228 | // Get maximum number of bytes needed for a recovery packet 229 | static unsigned GetRecoveryBytes(unsigned originalBytes) 230 | { 231 | const unsigned maxWords = fp61::ByteReader::MaxWords(originalBytes); 232 | const unsigned maxBytes = fp61::WordWriter::BytesNeeded(maxWords); 233 | return maxBytes; 234 | } 235 | 236 | /** 237 | Encode() 238 | 239 | This function implements the encoder for an erasure code. 240 | It accepts a set of equal-sized data packets and outputs one recovery packet 241 | that can repair one lost original packet. 242 | 243 | The recovery packet must be GetRecoveryBytes() in size. 244 | 245 | Returns the number of bytes written. 
246 | */ 247 | unsigned Encode( 248 | const std::vector>& originals, 249 | unsigned N, 250 | unsigned bytes, 251 | uint64_t seed, 252 | uint8_t* recovery) 253 | { 254 | uint64_t seedMix = fp61::HashU64(seed); 255 | 256 | std::vector readers; 257 | readers.resize(N); 258 | for (unsigned i = 0; i < N; ++i) { 259 | readers[i].BeginRead(&originals[i][0], bytes); 260 | } 261 | 262 | fp61::WordWriter writer; 263 | writer.BeginWrite(recovery); 264 | 265 | const unsigned minWords = fp61::WordReader::WordCount(bytes); 266 | for (unsigned i = 0; i < minWords; ++i) 267 | { 268 | uint64_t fpword; 269 | readers[0].Read(fpword); 270 | uint64_t coeff = fp61::HashToNonzeroFp(seedMix + 0); 271 | uint64_t sum = fp61::Multiply(coeff, fpword); 272 | 273 | unsigned column = 1; 274 | unsigned columnsRemaining = N - 1; 275 | while (columnsRemaining >= 3) 276 | { 277 | uint64_t coeff0 = fp61::HashToNonzeroFp(seedMix + column); 278 | uint64_t coeff1 = fp61::HashToNonzeroFp(seedMix + column + 1); 279 | uint64_t coeff2 = fp61::HashToNonzeroFp(seedMix + column + 2); 280 | 281 | uint64_t fpword0, fpword1, fpword2; 282 | readers[column].Read(fpword0); 283 | readers[column + 1].Read(fpword1); 284 | readers[column + 2].Read(fpword2); 285 | 286 | sum += fp61::Multiply(coeff0, fpword0); 287 | sum += fp61::Multiply(coeff1, fpword1); 288 | sum += fp61::Multiply(coeff2, fpword2); 289 | sum = fp61::PartialReduce(sum); 290 | 291 | column += 3; 292 | columnsRemaining -= 3; 293 | } 294 | 295 | while (columnsRemaining > 0) 296 | { 297 | uint64_t temp; 298 | readers[column].Read(temp); 299 | sum += fp61::Multiply(coeff, temp); 300 | 301 | column++; 302 | columnsRemaining--; 303 | } 304 | sum = fp61::PartialReduce(sum); 305 | sum = fp61::Finalize(sum); 306 | writer.Write(sum); 307 | } 308 | 309 | for (;;) 310 | { 311 | bool more_data = false; 312 | 313 | uint64_t sum = 0; 314 | 315 | for (unsigned i = 0; i < N; ++i) 316 | { 317 | uint64_t coeff = fp61::HashToNonzeroFp(seedMix + i); 318 | 319 | uint64_t 
fpword; 320 | if (readers[i].Read(fpword) == fp61::ReadResult::Success) 321 | { 322 | more_data = true; 323 | 324 | sum += fp61::Multiply(coeff, fpword); 325 | sum = fp61::PartialReduce(sum); 326 | } 327 | } 328 | 329 | if (!more_data) { 330 | break; 331 | } 332 | 333 | sum = fp61::Finalize(sum); 334 | writer.Write(sum); 335 | } 336 | 337 | return writer.Flush(); 338 | } 339 | 340 | void EncodeGF256( 341 | const std::vector>& originals, 342 | unsigned N, 343 | unsigned bytes, 344 | uint64_t seed, 345 | uint8_t* recovery) 346 | { 347 | uint64_t seedMix = fp61::HashU64(seed); 348 | 349 | uint8_t coeff = (uint8_t)fp61::HashToNonzeroFp(seedMix + 0); 350 | if (coeff == 0) { 351 | coeff = 1; 352 | } 353 | 354 | gf256_mul_mem(recovery, &originals[0][0], coeff, bytes); 355 | 356 | for (unsigned i = 1; i < N; ++i) 357 | { 358 | coeff = (uint8_t)fp61::HashToNonzeroFp(seedMix + 0); 359 | if (coeff == 0) { 360 | coeff = 1; 361 | } 362 | 363 | gf256_muladd_mem(recovery, coeff, &originals[i][0], bytes); 364 | } 365 | } 366 | 367 | 368 | //------------------------------------------------------------------------------ 369 | // Benchmarks 370 | 371 | static const unsigned kFileSizes[] = { 372 | 10, 100, 1000, 10000, 100000 373 | }; 374 | static const unsigned kFileSizesCount = static_cast(sizeof(kFileSizes) / sizeof(kFileSizes[0])); 375 | 376 | static const unsigned kFileN[] = { 377 | 2, 4, 8, 16, 32, 64, 128, 256, 512 378 | }; 379 | static const unsigned kFileNCount = static_cast(sizeof(kFileN) / sizeof(kFileN[0])); 380 | 381 | static const unsigned kTrials = 1000; 382 | 383 | void RunBenchmarks() 384 | { 385 | fp61::Random prng; 386 | prng.Seed(0); 387 | 388 | std::vector> original_data; 389 | std::vector recovery_data; 390 | 391 | for (unsigned i = 0; i < kFileSizesCount; ++i) 392 | { 393 | unsigned fileSizeBytes = kFileSizes[i]; 394 | 395 | cout << "Testing file size = " << fileSizeBytes << " bytes" << endl; 396 | 397 | for (unsigned j = 0; j < kFileNCount; ++j) 398 | { 399 | 
unsigned N = kFileN[j]; 400 | 401 | cout << "N = " << N << " : "; 402 | 403 | uint64_t sizeSum = 0, timeSum = 0; 404 | uint64_t timeSum_gf256 = 0; 405 | 406 | for (unsigned k = 0; k < kTrials; ++k) 407 | { 408 | /* 409 | File pieces: f0, f1, f3, f4, ... 410 | Coefficients: m0, m1, m2, m3, ... 411 | 412 | R = m0 * f0 + m1 * f1 + m2 * f2 + ... 413 | 414 | R = sum(m_i * f_i) (mod 2^61-1) 415 | 416 | To compute the recovery packet R we process the calculations 417 | for the first word from all of the file pieces to produce a 418 | single word of output. This is a matrix-vector product 419 | between file data f_i (treated as Fp words) and randomly 420 | chosen generator matrix coefficients m_i. 421 | 422 | Lazy reduction can be used to simplify the add steps. 423 | 424 | Then we continue to the next word for all the file pieces, 425 | producing the next word of output. 426 | 427 | It is possible to interleave the calculations for output 428 | words, and for input words to achieve higher throughput. 429 | 430 | The number of words for each file piece can vary slightly 431 | based on the data (if the data bytes do not fit evenly into 432 | the Fp words, we have to add extra bits to resolve 433 | ambiguities). 434 | 435 | The result is a set of 61-bit Fp words serialized to bytes, 436 | that is about 8 bytes more than the original file sizes. 437 | 438 | The erasure code decoder (not implemented) would be able 439 | to take these recovery packets and fix lost data. 440 | The decoder performance would be fairly similar to the 441 | encoder performance for this type of erasure code, since 442 | the runtime is dominated by this matrix-vector product. 
443 | */ 444 | 445 | original_data.resize(N); 446 | for (unsigned s = 0; s < N; ++s) 447 | { 448 | // Add 8 bytes padding to simplify tester 449 | original_data[s].resize(fileSizeBytes + 8); 450 | 451 | // Fill the data with random bytes 452 | for (unsigned r = 0; r < i; r += 8) 453 | { 454 | uint64_t w; 455 | if (prng.Next() % 100 <= 3) { 456 | w = ~(uint64_t)0; 457 | } 458 | else { 459 | w = prng.Next(); 460 | } 461 | fp61::WriteU64_LE(&original_data[s][r], w); 462 | } 463 | } 464 | 465 | const unsigned maxRecoveryBytes = GetRecoveryBytes(fileSizeBytes); 466 | recovery_data.resize(maxRecoveryBytes); 467 | 468 | { 469 | uint64_t t0 = GetTimeUsec(); 470 | 471 | unsigned recoveryBytes = Encode(original_data, N, fileSizeBytes, k, &recovery_data[0]); 472 | 473 | uint64_t t1 = GetTimeUsec(); 474 | 475 | sizeSum += recoveryBytes; 476 | timeSum += t1 - t0; 477 | } 478 | 479 | #ifdef FP61_ENABLE_GF256_COMPARE 480 | { 481 | uint64_t t0 = GetTimeUsec(); 482 | 483 | EncodeGF256(original_data, N, fileSizeBytes, k, &recovery_data[0]); 484 | 485 | uint64_t t1 = GetTimeUsec(); 486 | 487 | timeSum_gf256 += t1 - t0; 488 | } 489 | #endif // FP61_ENABLE_GF256_COMPARE 490 | } 491 | 492 | #ifdef FP61_ENABLE_GF256_COMPARE 493 | cout << " gf256_MBPS=" << (uint64_t)fileSizeBytes * N * kTrials / timeSum_gf256; 494 | #endif // FP61_ENABLE_GF256_COMPARE 495 | cout << " Fp61_MBPS=" << (uint64_t)fileSizeBytes * N * kTrials / timeSum; 496 | cout << " Fp61_OutputBytes=" << sizeSum / (float)kTrials; 497 | cout << endl; 498 | } 499 | } 500 | } 501 | 502 | 503 | //------------------------------------------------------------------------------ 504 | // Entrypoint 505 | 506 | int main() 507 | { 508 | cout << "Benchmarks for Fp61 erasure codes. Before running the benchmarks please run the tests to make sure everything's working on your PC. It's going to run quite a bit faster with 64-bit builds because it takes advantage of the speed of 64-bit multiplications." 
<< endl; 509 | cout << endl; 510 | 511 | gf256_init(); 512 | 513 | RunBenchmarks(); 514 | 515 | cout << endl; 516 | return 0; 517 | } 518 | -------------------------------------------------------------------------------- /tests/gf256.cpp: -------------------------------------------------------------------------------- 1 | /** \file 2 | \brief GF(256) Main C API Source 3 | \copyright Copyright (c) 2017 Christopher A. Taylor. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of GF256 nor the names of its contributors may be 14 | used to endorse or promote products derived from this software without 15 | specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | #include "gf256.h" 31 | 32 | #ifdef LINUX_ARM 33 | #include 34 | #include 35 | #include 36 | #include 37 | #endif 38 | 39 | //------------------------------------------------------------------------------ 40 | // Workaround for ARMv7 that doesn't provide vqtbl1_* 41 | // This comes from linux-raid (https://www.spinics.net/lists/raid/msg58403.html) 42 | // 43 | #ifdef GF256_TRY_NEON 44 | #if __ARM_ARCH <= 7 && !defined(__aarch64__) 45 | static GF256_FORCE_INLINE uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) 46 | { 47 | union { 48 | uint8x16_t val; 49 | uint8x8x2_t pair; 50 | } __a = { a }; 51 | 52 | return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)), 53 | vtbl2_u8(__a.pair, vget_high_u8(b))); 54 | } 55 | #endif 56 | #endif 57 | 58 | //------------------------------------------------------------------------------ 59 | // Self-Test 60 | // 61 | // This is executed during initialization to make sure the library is working 62 | 63 | static const unsigned kTestBufferBytes = 32 + 16 + 8 + 4 + 2 + 1; 64 | static const unsigned kTestBufferAllocated = 64; 65 | struct SelfTestBuffersT 66 | { 67 | GF256_ALIGNED uint8_t A[kTestBufferAllocated]; 68 | GF256_ALIGNED uint8_t B[kTestBufferAllocated]; 69 | GF256_ALIGNED uint8_t C[kTestBufferAllocated]; 70 | }; 71 | static GF256_ALIGNED SelfTestBuffersT m_SelfTestBuffers; 72 | 73 | static bool gf256_self_test() 74 | { 75 | if ((uintptr_t)m_SelfTestBuffers.A % GF256_ALIGN_BYTES != 0) 76 | return false; 77 | if ((uintptr_t)m_SelfTestBuffers.A % GF256_ALIGN_BYTES != 0) 78 | return false; 79 | if ((uintptr_t)m_SelfTestBuffers.B % GF256_ALIGN_BYTES != 0) 80 | return false; 81 | if ((uintptr_t)m_SelfTestBuffers.C % GF256_ALIGN_BYTES != 0) 82 | return false; 83 | 84 | // Check multiplication/division 85 | for (unsigned i = 0; i < 256; ++i) 86 | { 87 | for (unsigned j = 0; j < 256; ++j) 88 | { 89 | uint8_t prod = gf256_mul((uint8_t)i, (uint8_t)j); 90 | if (i != 0 && j != 0) 91 | { 92 | uint8_t div1 = gf256_div(prod, 
(uint8_t)i); 93 | if (div1 != j) 94 | return false; 95 | uint8_t div2 = gf256_div(prod, (uint8_t)j); 96 | if (div2 != i) 97 | return false; 98 | } 99 | else if (prod != 0) 100 | return false; 101 | if (j == 1 && prod != i) 102 | return false; 103 | } 104 | } 105 | 106 | // Check for overruns 107 | m_SelfTestBuffers.A[kTestBufferBytes] = 0x5a; 108 | m_SelfTestBuffers.B[kTestBufferBytes] = 0x5a; 109 | m_SelfTestBuffers.C[kTestBufferBytes] = 0x5a; 110 | 111 | // Test gf256_add_mem() 112 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 113 | { 114 | m_SelfTestBuffers.A[i] = 0x1f; 115 | m_SelfTestBuffers.B[i] = 0xf7; 116 | } 117 | gf256_add_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, kTestBufferBytes); 118 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 119 | if (m_SelfTestBuffers.A[i] != (0x1f ^ 0xf7)) 120 | return false; 121 | 122 | // Test gf256_add2_mem() 123 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 124 | { 125 | m_SelfTestBuffers.A[i] = 0x1f; 126 | m_SelfTestBuffers.B[i] = 0xf7; 127 | m_SelfTestBuffers.C[i] = 0x71; 128 | } 129 | gf256_add2_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, m_SelfTestBuffers.C, kTestBufferBytes); 130 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 131 | if (m_SelfTestBuffers.A[i] != (0x1f ^ 0xf7 ^ 0x71)) 132 | return false; 133 | 134 | // Test gf256_addset_mem() 135 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 136 | { 137 | m_SelfTestBuffers.A[i] = 0x55; 138 | m_SelfTestBuffers.B[i] = 0xaa; 139 | m_SelfTestBuffers.C[i] = 0x6c; 140 | } 141 | gf256_addset_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, m_SelfTestBuffers.C, kTestBufferBytes); 142 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 143 | if (m_SelfTestBuffers.A[i] != (0xaa ^ 0x6c)) 144 | return false; 145 | 146 | // Test gf256_muladd_mem() 147 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 148 | { 149 | m_SelfTestBuffers.A[i] = 0xff; 150 | m_SelfTestBuffers.B[i] = 0xaa; 151 | } 152 | const uint8_t expectedMulAdd = gf256_mul(0xaa, 0x6c); 153 | 
gf256_muladd_mem(m_SelfTestBuffers.A, 0x6c, m_SelfTestBuffers.B, kTestBufferBytes); 154 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 155 | if (m_SelfTestBuffers.A[i] != (expectedMulAdd ^ 0xff)) 156 | return false; 157 | 158 | // Test gf256_mul_mem() 159 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 160 | { 161 | m_SelfTestBuffers.A[i] = 0xff; 162 | m_SelfTestBuffers.B[i] = 0x55; 163 | } 164 | const uint8_t expectedMul = gf256_mul(0xa2, 0x55); 165 | gf256_mul_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, 0xa2, kTestBufferBytes); 166 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 167 | if (m_SelfTestBuffers.A[i] != expectedMul) 168 | return false; 169 | 170 | if (m_SelfTestBuffers.A[kTestBufferBytes] != 0x5a) 171 | return false; 172 | if (m_SelfTestBuffers.B[kTestBufferBytes] != 0x5a) 173 | return false; 174 | if (m_SelfTestBuffers.C[kTestBufferBytes] != 0x5a) 175 | return false; 176 | 177 | return true; 178 | } 179 | 180 | 181 | //------------------------------------------------------------------------------ 182 | // Runtime CPU Architecture Check 183 | // 184 | // Feature checks stolen shamelessly from 185 | // https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c 186 | 187 | #if defined(HAVE_ANDROID_GETCPUFEATURES) 188 | #include 189 | #endif 190 | 191 | #if defined(GF256_TRY_NEON) 192 | # if defined(IOS) && defined(__ARM_NEON__) 193 | // Requires iPhone 5S or newer 194 | static const bool CpuHasNeon = true; 195 | static const bool CpuHasNeon64 = true; 196 | # else // ANDROID or LINUX_ARM 197 | # if defined(__aarch64__) 198 | static bool CpuHasNeon = true; // if AARCH64, then we have NEON for sure... 199 | static bool CpuHasNeon64 = true; // And we have ASIMD 200 | # else 201 | static bool CpuHasNeon = false; // if not, then we have to check at runtime. 
202 | static bool CpuHasNeon64 = false; // And we don't have ASIMD 203 | # endif 204 | # endif 205 | #endif 206 | 207 | #if !defined(GF256_TARGET_MOBILE) 208 | 209 | #ifdef _MSC_VER 210 | #include // __cpuid 211 | #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX 212 | #endif 213 | 214 | #ifdef GF256_TRY_AVX2 215 | static bool CpuHasAVX2 = false; 216 | #endif 217 | static bool CpuHasSSSE3 = false; 218 | 219 | #define CPUID_EBX_AVX2 0x00000020 220 | #define CPUID_ECX_SSSE3 0x00000200 221 | 222 | static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) 223 | { 224 | #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) 225 | __cpuid((int *) cpu_info, cpu_info_type); 226 | #else //if defined(HAVE_CPUID) 227 | cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; 228 | # ifdef __i386__ 229 | __asm__ __volatile__ ("pushfl; pushfl; " 230 | "popl %0; " 231 | "movl %0, %1; xorl %2, %0; " 232 | "pushl %0; " 233 | "popfl; pushfl; popl %0; popfl" : 234 | "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : 235 | "i" (0x200000)); 236 | if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { 237 | return; /* LCOV_EXCL_LINE */ 238 | } 239 | # endif 240 | # ifdef __i386__ 241 | __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : 242 | "=a" (cpu_info[0]), "=&r" (cpu_info[1]), 243 | "=c" (cpu_info[2]), "=d" (cpu_info[3]) : 244 | "0" (cpu_info_type), "2" (0U)); 245 | # elif defined(__x86_64__) 246 | __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : 247 | "=a" (cpu_info[0]), "=&r" (cpu_info[1]), 248 | "=c" (cpu_info[2]), "=d" (cpu_info[3]) : 249 | "0" (cpu_info_type), "2" (0U)); 250 | # else 251 | __asm__ __volatile__ ("cpuid" : 252 | "=a" (cpu_info[0]), "=b" (cpu_info[1]), 253 | "=c" (cpu_info[2]), "=d" (cpu_info[3]) : 254 | "0" (cpu_info_type), "2" (0U)); 255 | # endif 256 | #endif 257 | } 258 | 259 | #else 260 | #if defined(LINUX_ARM) 261 | static void 
checkLinuxARMNeonCapabilities( bool& cpuHasNeon ) 262 | { 263 | auto cpufile = open("/proc/self/auxv", O_RDONLY); 264 | Elf32_auxv_t auxv; 265 | if (cpufile >= 0) 266 | { 267 | const auto size_auxv_t = sizeof(Elf32_auxv_t); 268 | while (read(cpufile, &auxv, size_auxv_t) == size_auxv_t) 269 | { 270 | if (auxv.a_type == AT_HWCAP) 271 | { 272 | cpuHasNeon = (auxv.a_un.a_val & 4096) != 0; 273 | break; 274 | } 275 | } 276 | close(cpufile); 277 | } 278 | else 279 | { 280 | cpuHasNeon = false; 281 | } 282 | } 283 | #endif 284 | #endif // defined(GF256_TARGET_MOBILE) 285 | 286 | static void gf256_architecture_init() 287 | { 288 | #if defined(GF256_TRY_NEON) 289 | 290 | // Check for NEON support on Android platform 291 | #if defined(HAVE_ANDROID_GETCPUFEATURES) 292 | AndroidCpuFamily family = android_getCpuFamily(); 293 | if (family == ANDROID_CPU_FAMILY_ARM) 294 | { 295 | if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) 296 | CpuHasNeon = true; 297 | } 298 | else if (family == ANDROID_CPU_FAMILY_ARM64) 299 | { 300 | CpuHasNeon = true; 301 | if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) 302 | CpuHasNeon64 = true; 303 | } 304 | #endif 305 | 306 | #if defined(LINUX_ARM) 307 | // Check for NEON support on other ARM/Linux platforms 308 | checkLinuxARMNeonCapabilities(CpuHasNeon); 309 | #endif 310 | 311 | #endif //GF256_TRY_NEON 312 | 313 | #if !defined(GF256_TARGET_MOBILE) 314 | unsigned int cpu_info[4]; 315 | 316 | _cpuid(cpu_info, 1); 317 | CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); 318 | 319 | #if defined(GF256_TRY_AVX2) 320 | _cpuid(cpu_info, 7); 321 | CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); 322 | #endif // GF256_TRY_AVX2 323 | 324 | // When AVX2 and SSSE3 are unavailable, Siamese takes 4x longer to decode 325 | // and 2.6x longer to encode. Encoding requires a lot more simple XOR ops 326 | // so it is still pretty fast. 
Decoding is usually really quick because 327 | // average loss rates are low, but when needed it requires a lot more 328 | // GF multiplies requiring table lookups which is slower. 329 | 330 | #endif // GF256_TARGET_MOBILE 331 | } 332 | 333 | 334 | //------------------------------------------------------------------------------ 335 | // Context Object 336 | 337 | // Context object for GF(2^^8) math 338 | GF256_ALIGNED gf256_ctx GF256Ctx; 339 | static bool Initialized = false; 340 | 341 | 342 | //------------------------------------------------------------------------------ 343 | // Generator Polynomial 344 | 345 | // There are only 16 irreducible polynomials for GF(2^^8) 346 | static const int GF256_GEN_POLY_COUNT = 16; 347 | static const uint8_t GF256_GEN_POLY[GF256_GEN_POLY_COUNT] = { 348 | 0x8e, 0x95, 0x96, 0xa6, 0xaf, 0xb1, 0xb2, 0xb4, 349 | 0xb8, 0xc3, 0xc6, 0xd4, 0xe1, 0xe7, 0xf3, 0xfa 350 | }; 351 | 352 | static const int kDefaultPolynomialIndex = 3; 353 | 354 | // Select which polynomial to use 355 | static void gf256_poly_init(int polynomialIndex) 356 | { 357 | if (polynomialIndex < 0 || polynomialIndex >= GF256_GEN_POLY_COUNT) 358 | polynomialIndex = kDefaultPolynomialIndex; 359 | 360 | GF256Ctx.Polynomial = (GF256_GEN_POLY[polynomialIndex] << 1) | 1; 361 | } 362 | 363 | 364 | //------------------------------------------------------------------------------ 365 | // Exponential and Log Tables 366 | 367 | // Construct EXP and LOG tables from polynomial 368 | static void gf256_explog_init() 369 | { 370 | unsigned poly = GF256Ctx.Polynomial; 371 | uint8_t* exptab = GF256Ctx.GF256_EXP_TABLE; 372 | uint16_t* logtab = GF256Ctx.GF256_LOG_TABLE; 373 | 374 | logtab[0] = 512; 375 | exptab[0] = 1; 376 | for (unsigned jj = 1; jj < 255; ++jj) 377 | { 378 | unsigned next = (unsigned)exptab[jj - 1] * 2; 379 | if (next >= 256) 380 | next ^= poly; 381 | 382 | exptab[jj] = static_cast( next ); 383 | logtab[exptab[jj]] = static_cast( jj ); 384 | } 385 | exptab[255] = 
exptab[0]; 386 | logtab[exptab[255]] = 255; 387 | for (unsigned jj = 256; jj < 2 * 255; ++jj) 388 | exptab[jj] = exptab[jj % 255]; 389 | exptab[2 * 255] = 1; 390 | for (unsigned jj = 2 * 255 + 1; jj < 4 * 255; ++jj) 391 | exptab[jj] = 0; 392 | } 393 | 394 | 395 | //------------------------------------------------------------------------------ 396 | // Multiply and Divide Tables 397 | 398 | // Initialize MUL and DIV tables using LOG and EXP tables 399 | static void gf256_muldiv_init() 400 | { 401 | // Allocate table memory 65KB x 2 402 | uint8_t* m = GF256Ctx.GF256_MUL_TABLE; 403 | uint8_t* d = GF256Ctx.GF256_DIV_TABLE; 404 | 405 | // Unroll y = 0 subtable 406 | for (int x = 0; x < 256; ++x) 407 | m[x] = d[x] = 0; 408 | 409 | // For each other y value: 410 | for (int y = 1; y < 256; ++y) 411 | { 412 | // Calculate log(y) for mult and 255 - log(y) for div 413 | const uint8_t log_y = static_cast(GF256Ctx.GF256_LOG_TABLE[y]); 414 | const uint8_t log_yn = 255 - log_y; 415 | 416 | // Next subtable 417 | m += 256, d += 256; 418 | 419 | // Unroll x = 0 420 | m[0] = 0, d[0] = 0; 421 | 422 | // Calculate x * y, x / y 423 | for (int x = 1; x < 256; ++x) 424 | { 425 | uint16_t log_x = GF256Ctx.GF256_LOG_TABLE[x]; 426 | 427 | m[x] = GF256Ctx.GF256_EXP_TABLE[log_x + log_y]; 428 | d[x] = GF256Ctx.GF256_EXP_TABLE[log_x + log_yn]; 429 | } 430 | } 431 | } 432 | 433 | 434 | //------------------------------------------------------------------------------ 435 | // Inverse Table 436 | 437 | // Initialize INV table using DIV table 438 | static void gf256_inv_init() 439 | { 440 | for (int x = 0; x < 256; ++x) 441 | GF256Ctx.GF256_INV_TABLE[x] = gf256_div(1, static_cast(x)); 442 | } 443 | 444 | 445 | //------------------------------------------------------------------------------ 446 | // Square Table 447 | 448 | // Initialize SQR table using MUL table 449 | static void gf256_sqr_init() 450 | { 451 | for (int x = 0; x < 256; ++x) 452 | GF256Ctx.GF256_SQR_TABLE[x] = 
gf256_mul(static_cast(x), static_cast(x)); 453 | } 454 | 455 | 456 | //------------------------------------------------------------------------------ 457 | // Multiply and Add Memory Tables 458 | 459 | /* 460 | Fast algorithm to compute m[1..8] = a[1..8] * b in GF(256) 461 | using SSE3 SIMD instruction set: 462 | 463 | Consider z = x * y in GF(256). 464 | This operation can be performed bit-by-bit. Usefully, the partial product 465 | of each bit is combined linearly with the rest. This means that the 8-bit 466 | number x can be split into its high and low 4 bits, and partial products 467 | can be formed from each half. Then the halves can be linearly combined: 468 | 469 | z = x[0..3] * y + x[4..7] * y 470 | 471 | The multiplication of each half can be done efficiently via table lookups, 472 | and the addition in GF(256) is XOR. There must be two tables that map 16 473 | input elements for the low or high 4 bits of x to the two partial products. 474 | Each value for y has a different set of two tables: 475 | 476 | z = TABLE_LO_y(x[0..3]) xor TABLE_HI_y(x[4..7]) 477 | 478 | This means that we need 16 * 2 * 256 = 8192 bytes for precomputed tables. 479 | 480 | Computing z[] = x[] * y can be performed 16 bytes at a time by using the 481 | 128-bit register operations supported by modern processors. 482 | 483 | This is efficiently realized in SSE3 using the _mm_shuffle_epi8() function 484 | provided by Visual Studio 2010 or newer in . This function 485 | uses the low bits to do a table lookup on each byte. 
Unfortunately the 486 | high bit of each mask byte has the special feature that it clears the 487 | output byte when it is set, so we need to make sure it's cleared by masking 488 | off the high bit of each byte before using it: 489 | 490 | clr_mask = _mm_set1_epi8(0x0f) = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 491 | 492 | For the low half of the partial product, clear the high bit of each byte 493 | and perform the table lookup: 494 | 495 | p_lo = _mm_and_si128(x, clr_mask) 496 | p_lo = _mm_shuffle_epi8(p_lo, TABLE_LO_y) 497 | 498 | For the high half of the partial product, shift the high 4 bits of each 499 | byte into the low 4 bits and clear the high bit of each byte, and then 500 | perform the table lookup: 501 | 502 | p_hi = _mm_srli_epi64(x, 4) 503 | p_hi = _mm_and_si128(p_hi, clr_mask) 504 | p_hi = _mm_shuffle_epi8(p_hi, TABLE_HI_y) 505 | 506 | Finally add the two partial products to form the product, recalling that 507 | addition is XOR in a Galois field: 508 | 509 | result = _mm_xor_si128(p_lo, p_hi) 510 | 511 | This crunches 16 bytes of x at a time, and the result can be stored in z. 512 | */ 513 | 514 | /* 515 | Intrinsic reference: 516 | 517 | SSE3, VS2010+, tmmintrin.h: 518 | 519 | GF256_M128 _mm_shuffle_epi8(GF256_M128 a, GF256_M128 mask); 520 | Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3) instruction pshufb. This instruction shuffles 16-byte parameters from a 128-bit parameter. 521 | 522 | Pseudo-code for PSHUFB (with 128 bit operands): 523 | 524 | for i = 0 to 15 { 525 | if (SRC[(i * 8)+7] = 1 ) then 526 | DEST[(i*8)+7..(i*8)+0] <- 0; 527 | else 528 | index[3..0] <- SRC[(i*8)+3 .. (i*8)+0]; 529 | DEST[(i*8)+7..(i*8)+0] <- DEST[(index*8+7)..(index*8+0)]; 530 | endif 531 | } 532 | 533 | SSE2, VS2008+, emmintrin.h: 534 | 535 | GF256_M128 _mm_slli_epi64 (GF256_M128 a, int count); 536 | Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while shifting in zeros. 
537 | GF256_M128 _mm_srli_epi64 (GF256_M128 a, int count); 538 | Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. 539 | GF256_M128 _mm_set1_epi8 (char b); 540 | Sets the 16 signed 8-bit integer values to b. 541 | GF256_M128 _mm_and_si128 (GF256_M128 a, GF256_M128 b); 542 | Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. 543 | GF256_M128 _mm_xor_si128 ( GF256_M128 a, GF256_M128 b); 544 | Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. 545 | */ 546 | 547 | // Initialize the multiplication tables using gf256_mul() 548 | static void gf256_mul_mem_init() 549 | { 550 | // Reuse aligned self test buffers to load table data 551 | uint8_t* lo = m_SelfTestBuffers.A; 552 | uint8_t* hi = m_SelfTestBuffers.B; 553 | 554 | for (int y = 0; y < 256; ++y) 555 | { 556 | // TABLE_LO_Y maps 0..15 to 8-bit partial product based on y. 557 | for (unsigned char x = 0; x < 16; ++x) 558 | { 559 | lo[x] = gf256_mul(x, static_cast( y )); 560 | hi[x] = gf256_mul(x << 4, static_cast( y )); 561 | } 562 | 563 | #if defined(GF256_TRY_NEON) 564 | if (CpuHasNeon) 565 | { 566 | GF256Ctx.MM128.TABLE_LO_Y[y] = vld1q_u8(lo); 567 | GF256Ctx.MM128.TABLE_HI_Y[y] = vld1q_u8(hi); 568 | } 569 | #elif !defined(GF256_TARGET_MOBILE) 570 | const GF256_M128 table_lo = _mm_loadu_si128((GF256_M128*)lo); 571 | const GF256_M128 table_hi = _mm_loadu_si128((GF256_M128*)hi); 572 | _mm_storeu_si128(GF256Ctx.MM128.TABLE_LO_Y + y, table_lo); 573 | _mm_storeu_si128(GF256Ctx.MM128.TABLE_HI_Y + y, table_hi); 574 | # ifdef GF256_TRY_AVX2 575 | if (CpuHasAVX2) 576 | { 577 | const GF256_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo); 578 | const GF256_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi); 579 | _mm256_storeu_si256(GF256Ctx.MM256.TABLE_LO_Y + y, table_lo2); 580 | _mm256_storeu_si256(GF256Ctx.MM256.TABLE_HI_Y + y, table_hi2); 581 | } 582 | # endif // GF256_TRY_AVX2 583 | #endif // 
GF256_TARGET_MOBILE 584 | } 585 | } 586 | 587 | 588 | //------------------------------------------------------------------------------ 589 | // Initialization 590 | 591 | static unsigned char kLittleEndianTestData[4] = { 4, 3, 2, 1 }; 592 | 593 | union UnionType 594 | { 595 | uint32_t IntValue; 596 | char CharArray[4]; 597 | }; 598 | 599 | static bool IsLittleEndian() 600 | { 601 | UnionType type; 602 | for (unsigned i = 0; i < 4; ++i) 603 | type.CharArray[i] = kLittleEndianTestData[i]; 604 | return 0x01020304 == type.IntValue; 605 | } 606 | 607 | extern "C" int gf256_init_(int version) 608 | { 609 | if (version != GF256_VERSION) 610 | return -1; // User's header does not match library version. 611 | 612 | // Avoid multiple initialization 613 | if (Initialized) 614 | return 0; 615 | Initialized = true; 616 | 617 | if (!IsLittleEndian()) 618 | return -2; // Architecture is not supported (code won't work without mods). 619 | 620 | gf256_architecture_init(); 621 | gf256_poly_init(kDefaultPolynomialIndex); 622 | gf256_explog_init(); 623 | gf256_muldiv_init(); 624 | gf256_inv_init(); 625 | gf256_sqr_init(); 626 | gf256_mul_mem_init(); 627 | 628 | if (!gf256_self_test()) 629 | return -3; // Self-test failed (perhaps untested configuration) 630 | 631 | return 0; 632 | } 633 | 634 | 635 | //------------------------------------------------------------------------------ 636 | // Operations 637 | 638 | extern "C" void gf256_add_mem(void * GF256_RESTRICT vx, 639 | const void * GF256_RESTRICT vy, int bytes) 640 | { 641 | GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 642 | const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 643 | 644 | #if defined(GF256_TARGET_MOBILE) 645 | # if defined(GF256_TRY_NEON) 646 | // Handle multiples of 64 bytes 647 | if (CpuHasNeon) 648 | { 649 | while (bytes >= 64) 650 | { 651 | GF256_M128 x0 = vld1q_u8((uint8_t*) x16); 652 | GF256_M128 x1 = vld1q_u8((uint8_t*)(x16 + 1) ); 653 | GF256_M128 x2 = vld1q_u8((uint8_t*)(x16 + 2) ); 654 
| GF256_M128 x3 = vld1q_u8((uint8_t*)(x16 + 3) ); 655 | GF256_M128 y0 = vld1q_u8((uint8_t*)y16); 656 | GF256_M128 y1 = vld1q_u8((uint8_t*)(y16 + 1)); 657 | GF256_M128 y2 = vld1q_u8((uint8_t*)(y16 + 2)); 658 | GF256_M128 y3 = vld1q_u8((uint8_t*)(y16 + 3)); 659 | 660 | vst1q_u8((uint8_t*)x16, veorq_u8(x0, y0)); 661 | vst1q_u8((uint8_t*)(x16 + 1), veorq_u8(x1, y1)); 662 | vst1q_u8((uint8_t*)(x16 + 2), veorq_u8(x2, y2)); 663 | vst1q_u8((uint8_t*)(x16 + 3), veorq_u8(x3, y3)); 664 | 665 | bytes -= 64, x16 += 4, y16 += 4; 666 | } 667 | 668 | // Handle multiples of 16 bytes 669 | while (bytes >= 16) 670 | { 671 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 672 | GF256_M128 y0 = vld1q_u8((uint8_t*)y16); 673 | 674 | vst1q_u8((uint8_t*)x16, veorq_u8(x0, y0)); 675 | 676 | bytes -= 16, ++x16, ++y16; 677 | } 678 | } 679 | else 680 | # endif // GF256_TRY_NEON 681 | { 682 | uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x16); 683 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y16); 684 | 685 | const unsigned count = (unsigned)bytes / 8; 686 | for (unsigned ii = 0; ii < count; ++ii) 687 | x8[ii] ^= y8[ii]; 688 | 689 | x16 = reinterpret_cast(x8 + count); 690 | y16 = reinterpret_cast(y8 + count); 691 | 692 | bytes -= (count * 8); 693 | } 694 | #else // GF256_TARGET_MOBILE 695 | # if defined(GF256_TRY_AVX2) 696 | if (CpuHasAVX2) 697 | { 698 | GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 699 | const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast(y16); 700 | 701 | while (bytes >= 128) 702 | { 703 | GF256_M256 x0 = _mm256_loadu_si256(x32); 704 | GF256_M256 y0 = _mm256_loadu_si256(y32); 705 | x0 = _mm256_xor_si256(x0, y0); 706 | GF256_M256 x1 = _mm256_loadu_si256(x32 + 1); 707 | GF256_M256 y1 = _mm256_loadu_si256(y32 + 1); 708 | x1 = _mm256_xor_si256(x1, y1); 709 | GF256_M256 x2 = _mm256_loadu_si256(x32 + 2); 710 | GF256_M256 y2 = _mm256_loadu_si256(y32 + 2); 711 | x2 = _mm256_xor_si256(x2, y2); 712 | GF256_M256 x3 = _mm256_loadu_si256(x32 + 3); 713 | GF256_M256 y3 
= _mm256_loadu_si256(y32 + 3); 714 | x3 = _mm256_xor_si256(x3, y3); 715 | 716 | _mm256_storeu_si256(x32, x0); 717 | _mm256_storeu_si256(x32 + 1, x1); 718 | _mm256_storeu_si256(x32 + 2, x2); 719 | _mm256_storeu_si256(x32 + 3, x3); 720 | 721 | bytes -= 128, x32 += 4, y32 += 4; 722 | } 723 | 724 | // Handle multiples of 32 bytes 725 | while (bytes >= 32) 726 | { 727 | // x[i] = x[i] xor y[i] 728 | _mm256_storeu_si256(x32, 729 | _mm256_xor_si256( 730 | _mm256_loadu_si256(x32), 731 | _mm256_loadu_si256(y32))); 732 | 733 | bytes -= 32, ++x32, ++y32; 734 | } 735 | 736 | x16 = reinterpret_cast(x32); 737 | y16 = reinterpret_cast(y32); 738 | } 739 | else 740 | # endif // GF256_TRY_AVX2 741 | { 742 | while (bytes >= 64) 743 | { 744 | GF256_M128 x0 = _mm_loadu_si128(x16); 745 | GF256_M128 y0 = _mm_loadu_si128(y16); 746 | x0 = _mm_xor_si128(x0, y0); 747 | GF256_M128 x1 = _mm_loadu_si128(x16 + 1); 748 | GF256_M128 y1 = _mm_loadu_si128(y16 + 1); 749 | x1 = _mm_xor_si128(x1, y1); 750 | GF256_M128 x2 = _mm_loadu_si128(x16 + 2); 751 | GF256_M128 y2 = _mm_loadu_si128(y16 + 2); 752 | x2 = _mm_xor_si128(x2, y2); 753 | GF256_M128 x3 = _mm_loadu_si128(x16 + 3); 754 | GF256_M128 y3 = _mm_loadu_si128(y16 + 3); 755 | x3 = _mm_xor_si128(x3, y3); 756 | 757 | _mm_storeu_si128(x16, x0); 758 | _mm_storeu_si128(x16 + 1, x1); 759 | _mm_storeu_si128(x16 + 2, x2); 760 | _mm_storeu_si128(x16 + 3, x3); 761 | 762 | bytes -= 64, x16 += 4, y16 += 4; 763 | } 764 | } 765 | #endif // GF256_TARGET_MOBILE 766 | 767 | #if !defined(GF256_TARGET_MOBILE) 768 | // Handle multiples of 16 bytes 769 | while (bytes >= 16) 770 | { 771 | // x[i] = x[i] xor y[i] 772 | _mm_storeu_si128(x16, 773 | _mm_xor_si128( 774 | _mm_loadu_si128(x16), 775 | _mm_loadu_si128(y16))); 776 | 777 | bytes -= 16, ++x16, ++y16; 778 | } 779 | #endif 780 | 781 | uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 782 | const uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 783 | 784 | // Handle a block of 8 bytes 785 | const int eight = 
bytes & 8; 786 | if (eight) 787 | { 788 | uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 789 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 790 | *x8 ^= *y8; 791 | } 792 | 793 | // Handle a block of 4 bytes 794 | const int four = bytes & 4; 795 | if (four) 796 | { 797 | uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 798 | const uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 799 | *x4 ^= *y4; 800 | } 801 | 802 | // Handle final bytes 803 | const int offset = eight + four; 804 | switch (bytes & 3) 805 | { 806 | case 3: x1[offset + 2] ^= y1[offset + 2]; 807 | case 2: x1[offset + 1] ^= y1[offset + 1]; 808 | case 1: x1[offset] ^= y1[offset]; 809 | default: 810 | break; 811 | } 812 | } 813 | 814 | extern "C" void gf256_add2_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 815 | const void * GF256_RESTRICT vy, int bytes) 816 | { 817 | GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast(vz); 818 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 819 | const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 820 | 821 | #if defined(GF256_TARGET_MOBILE) 822 | # if defined(GF256_TRY_NEON) 823 | // Handle multiples of 64 bytes 824 | if (CpuHasNeon) 825 | { 826 | // Handle multiples of 16 bytes 827 | while (bytes >= 16) 828 | { 829 | // z[i] = z[i] xor x[i] xor y[i] 830 | vst1q_u8((uint8_t*)z16, 831 | veorq_u8( 832 | vld1q_u8((uint8_t*)z16), 833 | veorq_u8( 834 | vld1q_u8((uint8_t*)x16), 835 | vld1q_u8((uint8_t*)y16)))); 836 | 837 | bytes -= 16, ++x16, ++y16, ++z16; 838 | } 839 | } 840 | else 841 | # endif // GF256_TRY_NEON 842 | { 843 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z16); 844 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x16); 845 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y16); 846 | 847 | const unsigned count = (unsigned)bytes / 8; 848 | for (unsigned ii = 0; ii < count; ++ii) 849 | z8[ii] ^= x8[ii] ^ y8[ii]; 850 | 851 | z16 = reinterpret_cast(z8 + count); 
852 | x16 = reinterpret_cast(x8 + count); 853 | y16 = reinterpret_cast(y8 + count); 854 | } 855 | #else // GF256_TARGET_MOBILE 856 | # if defined(GF256_TRY_AVX2) 857 | if (CpuHasAVX2) 858 | { 859 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(z16); 860 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 861 | const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast(y16); 862 | 863 | const unsigned count = bytes / 32; 864 | for (unsigned i = 0; i < count; ++i) 865 | { 866 | _mm256_storeu_si256(z32 + i, 867 | _mm256_xor_si256( 868 | _mm256_loadu_si256(z32 + i), 869 | _mm256_xor_si256( 870 | _mm256_loadu_si256(x32 + i), 871 | _mm256_loadu_si256(y32 + i)))); 872 | } 873 | 874 | bytes -= count * 32; 875 | z16 = reinterpret_cast(z32 + count); 876 | x16 = reinterpret_cast(x32 + count); 877 | y16 = reinterpret_cast(y32 + count); 878 | } 879 | # endif // GF256_TRY_AVX2 880 | 881 | // Handle multiples of 16 bytes 882 | while (bytes >= 16) 883 | { 884 | // z[i] = z[i] xor x[i] xor y[i] 885 | _mm_storeu_si128(z16, 886 | _mm_xor_si128( 887 | _mm_loadu_si128(z16), 888 | _mm_xor_si128( 889 | _mm_loadu_si128(x16), 890 | _mm_loadu_si128(y16)))); 891 | 892 | bytes -= 16, ++x16, ++y16, ++z16; 893 | } 894 | #endif // GF256_TARGET_MOBILE 895 | 896 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 897 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 898 | const uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 899 | 900 | // Handle a block of 8 bytes 901 | const int eight = bytes & 8; 902 | if (eight) 903 | { 904 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 905 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 906 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 907 | *z8 ^= *x8 ^ *y8; 908 | } 909 | 910 | // Handle a block of 4 bytes 911 | const int four = bytes & 4; 912 | if (four) 913 | { 914 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1 + eight); 915 | const uint32_t * GF256_RESTRICT x4 = 
reinterpret_cast(x1 + eight); 916 | const uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 917 | *z4 ^= *x4 ^ *y4; 918 | } 919 | 920 | // Handle final bytes 921 | const int offset = eight + four; 922 | switch (bytes & 3) 923 | { 924 | case 3: z1[offset + 2] ^= x1[offset + 2] ^ y1[offset + 2]; 925 | case 2: z1[offset + 1] ^= x1[offset + 1] ^ y1[offset + 1]; 926 | case 1: z1[offset] ^= x1[offset] ^ y1[offset]; 927 | default: 928 | break; 929 | } 930 | } 931 | 932 | extern "C" void gf256_addset_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 933 | const void * GF256_RESTRICT vy, int bytes) 934 | { 935 | GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast(vz); 936 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 937 | const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 938 | 939 | #if defined(GF256_TARGET_MOBILE) 940 | # if defined(GF256_TRY_NEON) 941 | // Handle multiples of 64 bytes 942 | if (CpuHasNeon) 943 | { 944 | while (bytes >= 64) 945 | { 946 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 947 | GF256_M128 x1 = vld1q_u8((uint8_t*)(x16 + 1)); 948 | GF256_M128 x2 = vld1q_u8((uint8_t*)(x16 + 2)); 949 | GF256_M128 x3 = vld1q_u8((uint8_t*)(x16 + 3)); 950 | GF256_M128 y0 = vld1q_u8((uint8_t*)(y16)); 951 | GF256_M128 y1 = vld1q_u8((uint8_t*)(y16 + 1)); 952 | GF256_M128 y2 = vld1q_u8((uint8_t*)(y16 + 2)); 953 | GF256_M128 y3 = vld1q_u8((uint8_t*)(y16 + 3)); 954 | 955 | vst1q_u8((uint8_t*)z16, veorq_u8(x0, y0)); 956 | vst1q_u8((uint8_t*)(z16 + 1), veorq_u8(x1, y1)); 957 | vst1q_u8((uint8_t*)(z16 + 2), veorq_u8(x2, y2)); 958 | vst1q_u8((uint8_t*)(z16 + 3), veorq_u8(x3, y3)); 959 | 960 | bytes -= 64, x16 += 4, y16 += 4, z16 += 4; 961 | } 962 | 963 | // Handle multiples of 16 bytes 964 | while (bytes >= 16) 965 | { 966 | // z[i] = x[i] xor y[i] 967 | vst1q_u8((uint8_t*)z16, 968 | veorq_u8( 969 | vld1q_u8((uint8_t*)x16), 970 | vld1q_u8((uint8_t*)y16))); 971 | 972 | bytes -= 16, ++x16, ++y16, ++z16; 973 | } 974 | } 975 | else 976 
| # endif // GF256_TRY_NEON 977 | { 978 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z16); 979 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x16); 980 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y16); 981 | 982 | const unsigned count = (unsigned)bytes / 8; 983 | for (unsigned ii = 0; ii < count; ++ii) 984 | z8[ii] = x8[ii] ^ y8[ii]; 985 | 986 | x16 = reinterpret_cast(x8 + count); 987 | y16 = reinterpret_cast(y8 + count); 988 | z16 = reinterpret_cast(z8 + count); 989 | 990 | bytes -= (count * 8); 991 | } 992 | #else // GF256_TARGET_MOBILE 993 | # if defined(GF256_TRY_AVX2) 994 | if (CpuHasAVX2) 995 | { 996 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(z16); 997 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 998 | const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast(y16); 999 | 1000 | const unsigned count = bytes / 32; 1001 | for (unsigned i = 0; i < count; ++i) 1002 | { 1003 | _mm256_storeu_si256(z32 + i, 1004 | _mm256_xor_si256( 1005 | _mm256_loadu_si256(x32 + i), 1006 | _mm256_loadu_si256(y32 + i))); 1007 | } 1008 | 1009 | bytes -= count * 32; 1010 | z16 = reinterpret_cast(z32 + count); 1011 | x16 = reinterpret_cast(x32 + count); 1012 | y16 = reinterpret_cast(y32 + count); 1013 | } 1014 | else 1015 | # endif // GF256_TRY_AVX2 1016 | { 1017 | // Handle multiples of 64 bytes 1018 | while (bytes >= 64) 1019 | { 1020 | GF256_M128 x0 = _mm_loadu_si128(x16); 1021 | GF256_M128 x1 = _mm_loadu_si128(x16 + 1); 1022 | GF256_M128 x2 = _mm_loadu_si128(x16 + 2); 1023 | GF256_M128 x3 = _mm_loadu_si128(x16 + 3); 1024 | GF256_M128 y0 = _mm_loadu_si128(y16); 1025 | GF256_M128 y1 = _mm_loadu_si128(y16 + 1); 1026 | GF256_M128 y2 = _mm_loadu_si128(y16 + 2); 1027 | GF256_M128 y3 = _mm_loadu_si128(y16 + 3); 1028 | 1029 | _mm_storeu_si128(z16, _mm_xor_si128(x0, y0)); 1030 | _mm_storeu_si128(z16 + 1, _mm_xor_si128(x1, y1)); 1031 | _mm_storeu_si128(z16 + 2, _mm_xor_si128(x2, y2)); 1032 | _mm_storeu_si128(z16 + 3, _mm_xor_si128(x3, 
y3)); 1033 | 1034 | bytes -= 64, x16 += 4, y16 += 4, z16 += 4; 1035 | } 1036 | } 1037 | 1038 | // Handle multiples of 16 bytes 1039 | while (bytes >= 16) 1040 | { 1041 | // z[i] = x[i] xor y[i] 1042 | _mm_storeu_si128(z16, 1043 | _mm_xor_si128( 1044 | _mm_loadu_si128(x16), 1045 | _mm_loadu_si128(y16))); 1046 | 1047 | bytes -= 16, ++x16, ++y16, ++z16; 1048 | } 1049 | #endif // GF256_TARGET_MOBILE 1050 | 1051 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 1052 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1053 | const uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 1054 | 1055 | // Handle a block of 8 bytes 1056 | const int eight = bytes & 8; 1057 | if (eight) 1058 | { 1059 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 1060 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 1061 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 1062 | *z8 = *x8 ^ *y8; 1063 | } 1064 | 1065 | // Handle a block of 4 bytes 1066 | const int four = bytes & 4; 1067 | if (four) 1068 | { 1069 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1 + eight); 1070 | const uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 1071 | const uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 1072 | *z4 = *x4 ^ *y4; 1073 | } 1074 | 1075 | // Handle final bytes 1076 | const int offset = eight + four; 1077 | switch (bytes & 3) 1078 | { 1079 | case 3: z1[offset + 2] = x1[offset + 2] ^ y1[offset + 2]; 1080 | case 2: z1[offset + 1] = x1[offset + 1] ^ y1[offset + 1]; 1081 | case 1: z1[offset] = x1[offset] ^ y1[offset]; 1082 | default: 1083 | break; 1084 | } 1085 | } 1086 | 1087 | extern "C" void gf256_mul_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, uint8_t y, int bytes) 1088 | { 1089 | // Use a single if-statement to handle special cases 1090 | if (y <= 1) 1091 | { 1092 | if (y == 0) 1093 | memset(vz, 0, bytes); 1094 | else if (vz != vx) 1095 | memcpy(vz, vx, bytes); 1096 | return; 1097 | } 1098 | 1099 | GF256_M128 * 
GF256_RESTRICT z16 = reinterpret_cast(vz); 1100 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 1101 | 1102 | #if defined(GF256_TARGET_MOBILE) 1103 | #if defined(GF256_TRY_NEON) 1104 | if (bytes >= 16 && CpuHasNeon) 1105 | { 1106 | // Partial product tables; see above 1107 | const GF256_M128 table_lo_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_LO_Y + y)); 1108 | const GF256_M128 table_hi_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_HI_Y + y)); 1109 | 1110 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1111 | const GF256_M128 clr_mask = vdupq_n_u8(0x0f); 1112 | 1113 | // Handle multiples of 16 bytes 1114 | do 1115 | { 1116 | // See above comments for details 1117 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 1118 | GF256_M128 l0 = vandq_u8(x0, clr_mask); 1119 | x0 = vshrq_n_u8(x0, 4); 1120 | GF256_M128 h0 = vandq_u8(x0, clr_mask); 1121 | l0 = vqtbl1q_u8(table_lo_y, l0); 1122 | h0 = vqtbl1q_u8(table_hi_y, h0); 1123 | vst1q_u8((uint8_t*)z16, veorq_u8(l0, h0)); 1124 | 1125 | bytes -= 16, ++x16, ++z16; 1126 | } while (bytes >= 16); 1127 | } 1128 | #endif 1129 | #else 1130 | # if defined(GF256_TRY_AVX2) 1131 | if (bytes >= 32 && CpuHasAVX2) 1132 | { 1133 | // Partial product tables; see above 1134 | const GF256_M256 table_lo_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_LO_Y + y); 1135 | const GF256_M256 table_hi_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_HI_Y + y); 1136 | 1137 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1138 | const GF256_M256 clr_mask = _mm256_set1_epi8(0x0f); 1139 | 1140 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(vz); 1141 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(vx); 1142 | 1143 | // Handle multiples of 32 bytes 1144 | do 1145 | { 1146 | // See above comments for details 1147 | GF256_M256 x0 = _mm256_loadu_si256(x32); 1148 | GF256_M256 l0 = _mm256_and_si256(x0, clr_mask); 1149 | x0 = _mm256_srli_epi64(x0, 4); 1150 | GF256_M256 h0 = _mm256_and_si256(x0, clr_mask); 1151 | l0 = 
_mm256_shuffle_epi8(table_lo_y, l0); 1152 | h0 = _mm256_shuffle_epi8(table_hi_y, h0); 1153 | _mm256_storeu_si256(z32, _mm256_xor_si256(l0, h0)); 1154 | 1155 | bytes -= 32, ++x32, ++z32; 1156 | } while (bytes >= 32); 1157 | 1158 | z16 = reinterpret_cast(z32); 1159 | x16 = reinterpret_cast(x32); 1160 | } 1161 | # endif // GF256_TRY_AVX2 1162 | if (bytes >= 16 && CpuHasSSSE3) 1163 | { 1164 | // Partial product tables; see above 1165 | const GF256_M128 table_lo_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_LO_Y + y); 1166 | const GF256_M128 table_hi_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_HI_Y + y); 1167 | 1168 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1169 | const GF256_M128 clr_mask = _mm_set1_epi8(0x0f); 1170 | 1171 | // Handle multiples of 16 bytes 1172 | do 1173 | { 1174 | // See above comments for details 1175 | GF256_M128 x0 = _mm_loadu_si128(x16); 1176 | GF256_M128 l0 = _mm_and_si128(x0, clr_mask); 1177 | x0 = _mm_srli_epi64(x0, 4); 1178 | GF256_M128 h0 = _mm_and_si128(x0, clr_mask); 1179 | l0 = _mm_shuffle_epi8(table_lo_y, l0); 1180 | h0 = _mm_shuffle_epi8(table_hi_y, h0); 1181 | _mm_storeu_si128(z16, _mm_xor_si128(l0, h0)); 1182 | 1183 | bytes -= 16, ++x16, ++z16; 1184 | } while (bytes >= 16); 1185 | } 1186 | #endif 1187 | 1188 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 1189 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1190 | const uint8_t * GF256_RESTRICT table = GF256Ctx.GF256_MUL_TABLE + ((unsigned)y << 8); 1191 | 1192 | // Handle blocks of 8 bytes 1193 | while (bytes >= 8) 1194 | { 1195 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 1196 | uint64_t word = table[x1[0]]; 1197 | word |= (uint64_t)table[x1[1]] << 8; 1198 | word |= (uint64_t)table[x1[2]] << 16; 1199 | word |= (uint64_t)table[x1[3]] << 24; 1200 | word |= (uint64_t)table[x1[4]] << 32; 1201 | word |= (uint64_t)table[x1[5]] << 40; 1202 | word |= (uint64_t)table[x1[6]] << 48; 1203 | word |= (uint64_t)table[x1[7]] << 56; 1204 | *z8 = word; 1205 | 1206 | bytes 
-= 8, x1 += 8, z1 += 8; 1207 | } 1208 | 1209 | // Handle a block of 4 bytes 1210 | const int four = bytes & 4; 1211 | if (four) 1212 | { 1213 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1); 1214 | uint32_t word = table[x1[0]]; 1215 | word |= (uint32_t)table[x1[1]] << 8; 1216 | word |= (uint32_t)table[x1[2]] << 16; 1217 | word |= (uint32_t)table[x1[3]] << 24; 1218 | *z4 = word; 1219 | } 1220 | 1221 | // Handle single bytes 1222 | const int offset = four; 1223 | switch (bytes & 3) 1224 | { 1225 | case 3: z1[offset + 2] = table[x1[offset + 2]]; 1226 | case 2: z1[offset + 1] = table[x1[offset + 1]]; 1227 | case 1: z1[offset] = table[x1[offset]]; 1228 | default: 1229 | break; 1230 | } 1231 | } 1232 | 1233 | extern "C" void gf256_muladd_mem(void * GF256_RESTRICT vz, uint8_t y, 1234 | const void * GF256_RESTRICT vx, int bytes) 1235 | { 1236 | // Use a single if-statement to handle special cases 1237 | if (y <= 1) 1238 | { 1239 | if (y == 1) 1240 | gf256_add_mem(vz, vx, bytes); 1241 | return; 1242 | } 1243 | 1244 | GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast(vz); 1245 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 1246 | 1247 | #if defined(GF256_TARGET_MOBILE) 1248 | #if defined(GF256_TRY_NEON) 1249 | if (bytes >= 16 && CpuHasNeon) 1250 | { 1251 | // Partial product tables; see above 1252 | const GF256_M128 table_lo_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_LO_Y + y)); 1253 | const GF256_M128 table_hi_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_HI_Y + y)); 1254 | 1255 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1256 | const GF256_M128 clr_mask = vdupq_n_u8(0x0f); 1257 | 1258 | // Handle multiples of 16 bytes 1259 | do 1260 | { 1261 | // See above comments for details 1262 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 1263 | GF256_M128 l0 = vandq_u8(x0, clr_mask); 1264 | 1265 | // x0 = vshrq_n_u8(x0, 4); 1266 | x0 = (GF256_M128)vshrq_n_u64( (uint64x2_t)x0, 4); 1267 | GF256_M128 h0 = vandq_u8(x0, clr_mask); 1268 | l0 = 
vqtbl1q_u8(table_lo_y, l0); 1269 | h0 = vqtbl1q_u8(table_hi_y, h0); 1270 | const GF256_M128 p0 = veorq_u8(l0, h0); 1271 | const GF256_M128 z0 = vld1q_u8((uint8_t*)z16); 1272 | vst1q_u8((uint8_t*)z16, veorq_u8(p0, z0)); 1273 | bytes -= 16, ++x16, ++z16; 1274 | } while (bytes >= 16); 1275 | } 1276 | #endif 1277 | #else // GF256_TARGET_MOBILE 1278 | # if defined(GF256_TRY_AVX2) 1279 | if (bytes >= 32 && CpuHasAVX2) 1280 | { 1281 | // Partial product tables; see above 1282 | const GF256_M256 table_lo_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_LO_Y + y); 1283 | const GF256_M256 table_hi_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_HI_Y + y); 1284 | 1285 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1286 | const GF256_M256 clr_mask = _mm256_set1_epi8(0x0f); 1287 | 1288 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(z16); 1289 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 1290 | 1291 | // On my Reed Solomon codec, the encoder unit test runs in 640 usec without and 550 usec with the optimization (86% of the original time) 1292 | const unsigned count = bytes / 64; 1293 | for (unsigned i = 0; i < count; ++i) 1294 | { 1295 | // See above comments for details 1296 | GF256_M256 x0 = _mm256_loadu_si256(x32 + i * 2); 1297 | GF256_M256 l0 = _mm256_and_si256(x0, clr_mask); 1298 | x0 = _mm256_srli_epi64(x0, 4); 1299 | const GF256_M256 z0 = _mm256_loadu_si256(z32 + i * 2); 1300 | GF256_M256 h0 = _mm256_and_si256(x0, clr_mask); 1301 | l0 = _mm256_shuffle_epi8(table_lo_y, l0); 1302 | h0 = _mm256_shuffle_epi8(table_hi_y, h0); 1303 | const GF256_M256 p0 = _mm256_xor_si256(l0, h0); 1304 | _mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(p0, z0)); 1305 | 1306 | GF256_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1); 1307 | GF256_M256 l1 = _mm256_and_si256(x1, clr_mask); 1308 | x1 = _mm256_srli_epi64(x1, 4); 1309 | const GF256_M256 z1 = _mm256_loadu_si256(z32 + i * 2 + 1); 1310 | GF256_M256 h1 = _mm256_and_si256(x1, clr_mask); 1311 | l1 = 
_mm256_shuffle_epi8(table_lo_y, l1); 1312 | h1 = _mm256_shuffle_epi8(table_hi_y, h1); 1313 | const GF256_M256 p1 = _mm256_xor_si256(l1, h1); 1314 | _mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(p1, z1)); 1315 | } 1316 | bytes -= count * 64; 1317 | z32 += count * 2; 1318 | x32 += count * 2; 1319 | 1320 | if (bytes >= 32) 1321 | { 1322 | GF256_M256 x0 = _mm256_loadu_si256(x32); 1323 | GF256_M256 l0 = _mm256_and_si256(x0, clr_mask); 1324 | x0 = _mm256_srli_epi64(x0, 4); 1325 | GF256_M256 h0 = _mm256_and_si256(x0, clr_mask); 1326 | l0 = _mm256_shuffle_epi8(table_lo_y, l0); 1327 | h0 = _mm256_shuffle_epi8(table_hi_y, h0); 1328 | const GF256_M256 p0 = _mm256_xor_si256(l0, h0); 1329 | const GF256_M256 z0 = _mm256_loadu_si256(z32); 1330 | _mm256_storeu_si256(z32, _mm256_xor_si256(p0, z0)); 1331 | 1332 | bytes -= 32; 1333 | z32++; 1334 | x32++; 1335 | } 1336 | 1337 | z16 = reinterpret_cast(z32); 1338 | x16 = reinterpret_cast(x32); 1339 | } 1340 | # endif // GF256_TRY_AVX2 1341 | if (bytes >= 16 && CpuHasSSSE3) 1342 | { 1343 | // Partial product tables; see above 1344 | const GF256_M128 table_lo_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_LO_Y + y); 1345 | const GF256_M128 table_hi_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_HI_Y + y); 1346 | 1347 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1348 | const GF256_M128 clr_mask = _mm_set1_epi8(0x0f); 1349 | 1350 | // This unroll seems to provide about 7% speed boost when AVX2 is disabled 1351 | while (bytes >= 32) 1352 | { 1353 | bytes -= 32; 1354 | 1355 | GF256_M128 x1 = _mm_loadu_si128(x16 + 1); 1356 | GF256_M128 l1 = _mm_and_si128(x1, clr_mask); 1357 | x1 = _mm_srli_epi64(x1, 4); 1358 | GF256_M128 h1 = _mm_and_si128(x1, clr_mask); 1359 | l1 = _mm_shuffle_epi8(table_lo_y, l1); 1360 | h1 = _mm_shuffle_epi8(table_hi_y, h1); 1361 | const GF256_M128 z1 = _mm_loadu_si128(z16 + 1); 1362 | 1363 | GF256_M128 x0 = _mm_loadu_si128(x16); 1364 | GF256_M128 l0 = _mm_and_si128(x0, clr_mask); 1365 | x0 = _mm_srli_epi64(x0, 4); 1366 | 
GF256_M128 h0 = _mm_and_si128(x0, clr_mask); 1367 | l0 = _mm_shuffle_epi8(table_lo_y, l0); 1368 | h0 = _mm_shuffle_epi8(table_hi_y, h0); 1369 | const GF256_M128 z0 = _mm_loadu_si128(z16); 1370 | 1371 | const GF256_M128 p1 = _mm_xor_si128(l1, h1); 1372 | _mm_storeu_si128(z16 + 1, _mm_xor_si128(p1, z1)); 1373 | 1374 | const GF256_M128 p0 = _mm_xor_si128(l0, h0); 1375 | _mm_storeu_si128(z16, _mm_xor_si128(p0, z0)); 1376 | 1377 | x16 += 2, z16 += 2; 1378 | } 1379 | 1380 | // Handle multiples of 16 bytes 1381 | while (bytes >= 16) 1382 | { 1383 | // See above comments for details 1384 | GF256_M128 x0 = _mm_loadu_si128(x16); 1385 | GF256_M128 l0 = _mm_and_si128(x0, clr_mask); 1386 | x0 = _mm_srli_epi64(x0, 4); 1387 | GF256_M128 h0 = _mm_and_si128(x0, clr_mask); 1388 | l0 = _mm_shuffle_epi8(table_lo_y, l0); 1389 | h0 = _mm_shuffle_epi8(table_hi_y, h0); 1390 | const GF256_M128 p0 = _mm_xor_si128(l0, h0); 1391 | const GF256_M128 z0 = _mm_loadu_si128(z16); 1392 | _mm_storeu_si128(z16, _mm_xor_si128(p0, z0)); 1393 | 1394 | bytes -= 16, ++x16, ++z16; 1395 | } 1396 | } 1397 | #endif // GF256_TARGET_MOBILE 1398 | 1399 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 1400 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1401 | const uint8_t * GF256_RESTRICT table = GF256Ctx.GF256_MUL_TABLE + ((unsigned)y << 8); 1402 | 1403 | // Handle blocks of 8 bytes 1404 | while (bytes >= 8) 1405 | { 1406 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 1407 | uint64_t word = table[x1[0]]; 1408 | word |= (uint64_t)table[x1[1]] << 8; 1409 | word |= (uint64_t)table[x1[2]] << 16; 1410 | word |= (uint64_t)table[x1[3]] << 24; 1411 | word |= (uint64_t)table[x1[4]] << 32; 1412 | word |= (uint64_t)table[x1[5]] << 40; 1413 | word |= (uint64_t)table[x1[6]] << 48; 1414 | word |= (uint64_t)table[x1[7]] << 56; 1415 | *z8 ^= word; 1416 | 1417 | bytes -= 8, x1 += 8, z1 += 8; 1418 | } 1419 | 1420 | // Handle a block of 4 bytes 1421 | const int four = bytes & 4; 1422 | if (four) 1423 | 
{ 1424 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1); 1425 | uint32_t word = table[x1[0]]; 1426 | word |= (uint32_t)table[x1[1]] << 8; 1427 | word |= (uint32_t)table[x1[2]] << 16; 1428 | word |= (uint32_t)table[x1[3]] << 24; 1429 | *z4 ^= word; 1430 | } 1431 | 1432 | // Handle single bytes 1433 | const int offset = four; 1434 | switch (bytes & 3) 1435 | { 1436 | case 3: z1[offset + 2] ^= table[x1[offset + 2]]; 1437 | case 2: z1[offset + 1] ^= table[x1[offset + 1]]; 1438 | case 1: z1[offset] ^= table[x1[offset]]; 1439 | default: 1440 | break; 1441 | } 1442 | } 1443 | 1444 | extern "C" void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes) 1445 | { 1446 | #if defined(GF256_TARGET_MOBILE) 1447 | uint64_t * GF256_RESTRICT x16 = reinterpret_cast(vx); 1448 | uint64_t * GF256_RESTRICT y16 = reinterpret_cast(vy); 1449 | 1450 | const unsigned count = (unsigned)bytes / 8; 1451 | for (unsigned ii = 0; ii < count; ++ii) 1452 | { 1453 | const uint64_t temp = x16[ii]; 1454 | x16[ii] = y16[ii]; 1455 | y16[ii] = temp; 1456 | } 1457 | 1458 | x16 += count; 1459 | y16 += count; 1460 | #else 1461 | GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 1462 | GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 1463 | 1464 | // Handle blocks of 16 bytes 1465 | while (bytes >= 16) 1466 | { 1467 | GF256_M128 x0 = _mm_loadu_si128(x16); 1468 | GF256_M128 y0 = _mm_loadu_si128(y16); 1469 | _mm_storeu_si128(x16, y0); 1470 | _mm_storeu_si128(y16, x0); 1471 | 1472 | bytes -= 16, ++x16, ++y16; 1473 | } 1474 | #endif 1475 | 1476 | uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1477 | uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 1478 | 1479 | // Handle a block of 8 bytes 1480 | const int eight = bytes & 8; 1481 | if (eight) 1482 | { 1483 | uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 1484 | uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 1485 | 1486 | uint64_t temp = *x8; 1487 | *x8 = *y8; 1488 | *y8 = temp; 1489 | } 1490 | 1491 | 
// Handle a block of 4 bytes 1492 | const int four = bytes & 4; 1493 | if (four) 1494 | { 1495 | uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 1496 | uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 1497 | 1498 | uint32_t temp = *x4; 1499 | *x4 = *y4; 1500 | *y4 = temp; 1501 | } 1502 | 1503 | // Handle final bytes 1504 | const int offset = eight + four; 1505 | uint8_t temp; 1506 | switch (bytes & 3) 1507 | { 1508 | case 3: temp = x1[offset + 2]; x1[offset + 2] = y1[offset + 2]; y1[offset + 2] = temp; 1509 | case 2: temp = x1[offset + 1]; x1[offset + 1] = y1[offset + 1]; y1[offset + 1] = temp; 1510 | case 1: temp = x1[offset]; x1[offset] = y1[offset]; y1[offset] = temp; 1511 | default: 1512 | break; 1513 | } 1514 | } 1515 | -------------------------------------------------------------------------------- /tests/gf256.h: -------------------------------------------------------------------------------- 1 | /** \file 2 | \brief GF(256) Main C API Header 3 | \copyright Copyright (c) 2017 Christopher A. Taylor. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of GF256 nor the names of its contributors may be 14 | used to endorse or promote products derived from this software without 15 | specific prior written permission. 
16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | #ifndef CAT_GF256_H 31 | #define CAT_GF256_H 32 | 33 | /** \page GF256 GF(256) Math Module 34 | 35 | This module provides efficient implementations of bulk 36 | GF(2^^8) math operations over memory buffers. 37 | 38 | Addition is done over the base field in GF(2) meaning 39 | that addition is XOR between memory buffers. 40 | 41 | Multiplication is performed using table lookups via 42 | SIMD instructions. This is somewhat slower than XOR, 43 | but fast enough to not become a major bottleneck when 44 | used sparingly. 
45 | */ 46 | 47 | #include // uint32_t etc 48 | #include // memcpy, memset 49 | 50 | /// Library header version 51 | #define GF256_VERSION 2 52 | 53 | //------------------------------------------------------------------------------ 54 | // Platform/Architecture 55 | 56 | #if defined(ANDROID) || defined(IOS) || defined(LINUX_ARM) 57 | #define GF256_TARGET_MOBILE 58 | #endif // ANDROID 59 | 60 | #if defined(__AVX2__) || (defined (_MSC_VER) && _MSC_VER >= 1900) 61 | #define GF256_TRY_AVX2 /* 256-bit */ 62 | #include 63 | #define GF256_ALIGN_BYTES 32 64 | #else // __AVX2__ 65 | #define GF256_ALIGN_BYTES 16 66 | #endif // __AVX2__ 67 | 68 | #if !defined(GF256_TARGET_MOBILE) 69 | // Note: MSVC currently only supports SSSE3 but not AVX2 70 | #include // SSSE3: _mm_shuffle_epi8 71 | #include // SSE2 72 | #endif // GF256_TARGET_MOBILE 73 | 74 | #if defined(HAVE_ARM_NEON_H) 75 | #include 76 | #endif // HAVE_ARM_NEON_H 77 | 78 | #if defined(GF256_TARGET_MOBILE) 79 | 80 | #define GF256_ALIGNED_ACCESSES /* Inputs must be aligned to GF256_ALIGN_BYTES */ 81 | 82 | # if defined(HAVE_ARM_NEON_H) 83 | // Compiler-specific 128-bit SIMD register keyword 84 | #define GF256_M128 uint8x16_t 85 | #define GF256_TRY_NEON 86 | #else 87 | #define GF256_M128 uint64_t 88 | # endif 89 | 90 | #else // GF256_TARGET_MOBILE 91 | 92 | // Compiler-specific 128-bit SIMD register keyword 93 | #define GF256_M128 __m128i 94 | 95 | #endif // GF256_TARGET_MOBILE 96 | 97 | #ifdef GF256_TRY_AVX2 98 | // Compiler-specific 256-bit SIMD register keyword 99 | #define GF256_M256 __m256i 100 | #endif 101 | 102 | // Compiler-specific C++11 restrict keyword 103 | #define GF256_RESTRICT __restrict 104 | 105 | // Compiler-specific force inline keyword 106 | #ifdef _MSC_VER 107 | #define GF256_FORCE_INLINE inline __forceinline 108 | #else 109 | #define GF256_FORCE_INLINE inline __attribute__((always_inline)) 110 | #endif 111 | 112 | // Compiler-specific alignment keyword 113 | // Note: Alignment only matters for ARM 
NEON where it should be 16 114 | #ifdef _MSC_VER 115 | #define GF256_ALIGNED __declspec(align(GF256_ALIGN_BYTES)) 116 | #else // _MSC_VER 117 | #define GF256_ALIGNED __attribute__((aligned(GF256_ALIGN_BYTES))) 118 | #endif // _MSC_VER 119 | 120 | #ifdef __cplusplus 121 | extern "C" { 122 | #endif // __cplusplus 123 | 124 | 125 | //------------------------------------------------------------------------------ 126 | // Portability 127 | 128 | /// Swap two memory buffers in-place 129 | extern void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes); 130 | 131 | 132 | //------------------------------------------------------------------------------ 133 | // GF(256) Context 134 | 135 | #ifdef _MSC_VER 136 | #pragma warning(push) 137 | #pragma warning(disable: 4324) // warning C4324: 'gf256_ctx' : structure was padded due to __declspec(align()) 138 | #endif // _MSC_VER 139 | 140 | /// The context object stores tables required to perform library calculations 141 | struct gf256_ctx 142 | { 143 | /// We require memory to be aligned since the SIMD instructions benefit from 144 | /// or require aligned accesses to the table data. 
145 | struct 146 | { 147 | GF256_ALIGNED GF256_M128 TABLE_LO_Y[256]; 148 | GF256_ALIGNED GF256_M128 TABLE_HI_Y[256]; 149 | } MM128; 150 | #ifdef GF256_TRY_AVX2 151 | struct 152 | { 153 | GF256_ALIGNED GF256_M256 TABLE_LO_Y[256]; 154 | GF256_ALIGNED GF256_M256 TABLE_HI_Y[256]; 155 | } MM256; 156 | #endif // GF256_TRY_AVX2 157 | 158 | /// Mul/Div/Inv/Sqr tables 159 | uint8_t GF256_MUL_TABLE[256 * 256]; 160 | uint8_t GF256_DIV_TABLE[256 * 256]; 161 | uint8_t GF256_INV_TABLE[256]; 162 | uint8_t GF256_SQR_TABLE[256]; 163 | 164 | /// Log/Exp tables 165 | uint16_t GF256_LOG_TABLE[256]; 166 | uint8_t GF256_EXP_TABLE[512 * 2 + 1]; 167 | 168 | /// Polynomial used 169 | unsigned Polynomial; 170 | }; 171 | 172 | #ifdef _MSC_VER 173 | #pragma warning(pop) 174 | #endif // _MSC_VER 175 | 176 | extern gf256_ctx GF256Ctx; 177 | 178 | 179 | //------------------------------------------------------------------------------ 180 | // Initialization 181 | 182 | /** 183 | Initialize a context, filling in the tables. 184 | 185 | Thread-safety / Usage Notes: 186 | 187 | It is perfectly safe and encouraged to use a gf256_ctx object from multiple 188 | threads. The gf256_init() is relatively expensive and should only be done 189 | once, though it will take less than a millisecond. 190 | 191 | The gf256_ctx object must be aligned to 16 byte boundary. 192 | Simply tag the object with GF256_ALIGNED to achieve this. 193 | 194 | Example: 195 | static GF256_ALIGNED gf256_ctx TheGF256Context; 196 | gf256_init(&TheGF256Context, 0); 197 | 198 | Returns 0 on success and other values on failure. 
199 | */ 200 | extern int gf256_init_(int version); 201 | #define gf256_init() gf256_init_(GF256_VERSION) 202 | 203 | 204 | //------------------------------------------------------------------------------ 205 | // Math Operations 206 | 207 | /// return x + y 208 | static GF256_FORCE_INLINE uint8_t gf256_add(uint8_t x, uint8_t y) 209 | { 210 | return (uint8_t)(x ^ y); 211 | } 212 | 213 | /// return x * y 214 | /// For repeated multiplication by a constant, it is faster to put the constant in y. 215 | static GF256_FORCE_INLINE uint8_t gf256_mul(uint8_t x, uint8_t y) 216 | { 217 | return GF256Ctx.GF256_MUL_TABLE[((unsigned)y << 8) + x]; 218 | } 219 | 220 | /// return x / y 221 | /// Memory-access optimized for constant divisors in y. 222 | static GF256_FORCE_INLINE uint8_t gf256_div(uint8_t x, uint8_t y) 223 | { 224 | return GF256Ctx.GF256_DIV_TABLE[((unsigned)y << 8) + x]; 225 | } 226 | 227 | /// return 1 / x 228 | static GF256_FORCE_INLINE uint8_t gf256_inv(uint8_t x) 229 | { 230 | return GF256Ctx.GF256_INV_TABLE[x]; 231 | } 232 | 233 | /// return x * x 234 | static GF256_FORCE_INLINE uint8_t gf256_sqr(uint8_t x) 235 | { 236 | return GF256Ctx.GF256_SQR_TABLE[x]; 237 | } 238 | 239 | 240 | //------------------------------------------------------------------------------ 241 | // Bulk Memory Math Operations 242 | 243 | /// Performs "x[] += y[]" bulk memory XOR operation 244 | extern void gf256_add_mem(void * GF256_RESTRICT vx, 245 | const void * GF256_RESTRICT vy, int bytes); 246 | 247 | /// Performs "z[] += x[] + y[]" bulk memory operation 248 | extern void gf256_add2_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 249 | const void * GF256_RESTRICT vy, int bytes); 250 | 251 | /// Performs "z[] = x[] + y[]" bulk memory operation 252 | extern void gf256_addset_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 253 | const void * GF256_RESTRICT vy, int bytes); 254 | 255 | /// Performs "z[] = x[] * y" bulk memory operation 256 | extern void 
gf256_mul_mem(void * GF256_RESTRICT vz, 257 | const void * GF256_RESTRICT vx, uint8_t y, int bytes); 258 | 259 | /// Performs "z[] += x[] * y" bulk memory operation 260 | extern void gf256_muladd_mem(void * GF256_RESTRICT vz, uint8_t y, 261 | const void * GF256_RESTRICT vx, int bytes); 262 | 263 | /// Performs "x[] /= y" bulk memory operation 264 | static GF256_FORCE_INLINE void gf256_div_mem(void * GF256_RESTRICT vz, 265 | const void * GF256_RESTRICT vx, uint8_t y, int bytes) 266 | { 267 | // Multiply by inverse 268 | gf256_mul_mem(vz, vx, y == 1 ? (uint8_t)1 : GF256Ctx.GF256_INV_TABLE[y], bytes); 269 | } 270 | 271 | 272 | //------------------------------------------------------------------------------ 273 | // Misc Operations 274 | 275 | /// Swap two memory buffers in-place 276 | extern void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes); 277 | 278 | 279 | #ifdef __cplusplus 280 | } 281 | #endif // __cplusplus 282 | 283 | #endif // CAT_GF256_H 284 | -------------------------------------------------------------------------------- /tests/tests.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2018 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of Fp61 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 
15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #include "../fp61.h" 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | using namespace std; 36 | 37 | 38 | //------------------------------------------------------------------------------ 39 | // Portability macros 40 | 41 | // Compiler-specific debug break 42 | #if defined(_DEBUG) || defined(DEBUG) 43 | #define FP61_DEBUG 44 | #ifdef _WIN32 45 | #define FP61_DEBUG_BREAK() __debugbreak() 46 | #else 47 | #define FP61_DEBUG_BREAK() __builtin_trap() 48 | #endif 49 | #define FP61_DEBUG_ASSERT(cond) { if (!(cond)) { FP61_DEBUG_BREAK(); } } 50 | #else 51 | #define FP61_DEBUG_BREAK() do {} while (false); 52 | #define FP61_DEBUG_ASSERT(cond) do {} while (false); 53 | #endif 54 | 55 | 56 | //------------------------------------------------------------------------------ 57 | // Constants 58 | 59 | #define FP61_RET_FAIL -1 60 | #define FP61_RET_SUCCESS 0 61 | 62 | static const uint64_t MASK61 = ((uint64_t)1 << 61) - 1; 63 | static const uint64_t MASK62 = ((uint64_t)1 << 62) - 1; 64 | static const uint64_t MASK63 = ((uint64_t)1 << 63) - 1; 65 | static const uint64_t MASK64 = ~(uint64_t)0; 66 | static const uint64_t MASK64_NO62 = MASK64 ^ 
((uint64_t)1 << 62); 67 | static const uint64_t MASK64_NO61 = MASK64 ^ ((uint64_t)1 << 61); 68 | static const uint64_t MASK64_NO60 = MASK64 ^ ((uint64_t)1 << 60); 69 | static const uint64_t MASK63_NO61 = MASK63 ^ ((uint64_t)1 << 61); 70 | static const uint64_t MASK63_NO60 = MASK63 ^ ((uint64_t)1 << 60); 71 | static const uint64_t MASK62_NO60 = MASK62 ^ ((uint64_t)1 << 60); 72 | 73 | #if defined(FP61_DEBUG) 74 | static const unsigned kRandomTestLoops = 100000; 75 | static const unsigned kMaxDataLength = 4000; 76 | #else 77 | static const unsigned kRandomTestLoops = 10000000; 78 | static const unsigned kMaxDataLength = 10000; 79 | #endif 80 | 81 | 82 | //------------------------------------------------------------------------------ 83 | // Tools 84 | 85 | static std::string HexString(uint64_t x) 86 | { 87 | std::stringstream ss; 88 | ss << hex << setfill('0') << setw(16) << x; 89 | return ss.str(); 90 | } 91 | 92 | 93 | //------------------------------------------------------------------------------ 94 | // Tests: Negate 95 | 96 | static bool test_negate(uint64_t x) 97 | { 98 | uint64_t n = fp61::Negate(x); 99 | uint64_t s = (x + n) % fp61::kPrime; 100 | if (s != 0) { 101 | cout << "Failed for x = " << hex << HexString(x) << endl; 102 | FP61_DEBUG_BREAK(); 103 | return false; 104 | } 105 | return true; 106 | } 107 | 108 | static bool TestNegate() 109 | { 110 | cout << "TestNegate..."; 111 | 112 | // Input is allowed to be 0 <= x <= p 113 | for (uint64_t x = 0; x < 1000; ++x) { 114 | if (!test_negate(x)) { 115 | return false; 116 | } 117 | } 118 | for (uint64_t x = fp61::kPrime; x >= fp61::kPrime - 1000; --x) { 119 | if (!test_negate(x)) { 120 | return false; 121 | } 122 | } 123 | 124 | fp61::Random prng; 125 | prng.Seed(1); 126 | 127 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 128 | { 129 | uint64_t x = prng.Next() & fp61::kPrime; 130 | if (!test_negate(x)) { 131 | return false; 132 | } 133 | } 134 | 135 | cout << "Passed" << endl; 136 | 137 | return true; 138 
| } 139 | 140 | 141 | //------------------------------------------------------------------------------ 142 | // Tests: Add 143 | 144 | static bool TestAdd() 145 | { 146 | cout << "TestAdd..."; 147 | 148 | // Preconditions: x,y,z,w <2^62 149 | const uint64_t largest = ((uint64_t)1 << 62) - 1; 150 | const uint64_t reduced = largest % fp61::kPrime; 151 | 152 | for (uint64_t x = largest; x >= largest - 1000; --x) 153 | { 154 | uint64_t r = fp61::Add4(largest, largest, largest, x); 155 | 156 | uint64_t expected = 0; 157 | expected = (expected + reduced) % fp61::kPrime; 158 | expected = (expected + reduced) % fp61::kPrime; 159 | expected = (expected + reduced) % fp61::kPrime; 160 | expected = (expected + (x % fp61::kPrime)) % fp61::kPrime; 161 | 162 | if (r % fp61::kPrime != expected) { 163 | cout << "Failed for x = " << HexString(x) << endl; 164 | FP61_DEBUG_BREAK(); 165 | return false; 166 | } 167 | } 168 | 169 | for (uint64_t x = largest; x >= largest - 1000; --x) 170 | { 171 | for (uint64_t y = largest; y >= largest - 1000; --y) 172 | { 173 | uint64_t r = fp61::Add4(largest, largest, x, y); 174 | 175 | uint64_t expected = 0; 176 | expected = (expected + reduced) % fp61::kPrime; 177 | expected = (expected + reduced) % fp61::kPrime; 178 | expected = (expected + (y % fp61::kPrime)) % fp61::kPrime; 179 | expected = (expected + (x % fp61::kPrime)) % fp61::kPrime; 180 | 181 | if (r % fp61::kPrime != expected) { 182 | cout << "Failed for x=" << HexString(x) << " y=" << HexString(y) << endl; 183 | FP61_DEBUG_BREAK(); 184 | return false; 185 | } 186 | } 187 | } 188 | 189 | fp61::Random prng; 190 | prng.Seed(0); 191 | 192 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 193 | { 194 | // Select 4 values from 0..2^62-1 195 | uint64_t x = prng.Next() & MASK62; 196 | uint64_t y = prng.Next() & MASK62; 197 | uint64_t w = prng.Next() & MASK62; 198 | uint64_t z = prng.Next() & MASK62; 199 | 200 | uint64_t r = fp61::Add4(x, y, z, w); 201 | 202 | uint64_t expected = 0; 203 | expected 
= (expected + (x % fp61::kPrime)) % fp61::kPrime; 204 | expected = (expected + (y % fp61::kPrime)) % fp61::kPrime; 205 | expected = (expected + (z % fp61::kPrime)) % fp61::kPrime; 206 | expected = (expected + (w % fp61::kPrime)) % fp61::kPrime; 207 | 208 | if (r % fp61::kPrime != expected) { 209 | cout << "Failed (random) for i = " << i << endl; 210 | FP61_DEBUG_BREAK(); 211 | return false; 212 | } 213 | } 214 | 215 | cout << "Passed" << endl; 216 | 217 | return true; 218 | } 219 | 220 | 221 | //------------------------------------------------------------------------------ 222 | // Tests: Partial Reduction 223 | 224 | static bool test_pred(uint64_t x) 225 | { 226 | uint64_t expected = x % fp61::kPrime; 227 | 228 | uint64_t r = fp61::PartialReduce(x); 229 | 230 | if ((r >> 62) != 0) 231 | { 232 | cout << "High bit overflow failed for x=" << HexString(x) << endl; 233 | FP61_DEBUG_BREAK(); 234 | return false; 235 | } 236 | 237 | uint64_t actual = fp61::PartialReduce(x) % fp61::kPrime; 238 | 239 | if (actual != expected) 240 | { 241 | cout << "Failed for x=" << HexString(x) << endl; 242 | FP61_DEBUG_BREAK(); 243 | return false; 244 | } 245 | return true; 246 | } 247 | 248 | static bool TestPartialReduction() 249 | { 250 | cout << "TestPartialReduction..."; 251 | 252 | // Input can have any bit set 253 | 254 | for (uint64_t x = 0; x < 1000; ++x) { 255 | if (!test_pred(x)) { 256 | return false; 257 | } 258 | } 259 | for (uint64_t x = MASK64; x > MASK64 - 1000; --x) { 260 | if (!test_pred(x)) { 261 | return false; 262 | } 263 | } 264 | for (uint64_t x = MASK64_NO62 + 1000; x > MASK64_NO62 - 1000; --x) { 265 | if (!test_pred(x)) { 266 | return false; 267 | } 268 | } 269 | for (uint64_t x = MASK64_NO61 + 1000; x > MASK64_NO61 - 1000; --x) { 270 | if (!test_pred(x)) { 271 | return false; 272 | } 273 | } 274 | for (uint64_t x = MASK64_NO60 + 1000; x > MASK64_NO60 - 1000; --x) { 275 | if (!test_pred(x)) { 276 | return false; 277 | } 278 | } 279 | for (uint64_t x = MASK63; x > 
MASK63 - 1000; --x) { 280 | if (!test_pred(x)) { 281 | return false; 282 | } 283 | } 284 | for (uint64_t x = MASK63_NO61 + 1000; x > MASK63_NO61 - 1000; --x) { 285 | if (!test_pred(x)) { 286 | return false; 287 | } 288 | } 289 | for (uint64_t x = MASK63_NO60 + 1000; x > MASK63_NO60 - 1000; --x) { 290 | if (!test_pred(x)) { 291 | return false; 292 | } 293 | } 294 | for (uint64_t x = MASK62 + 1000; x > MASK62 - 1000; --x) { 295 | if (!test_pred(x)) { 296 | return false; 297 | } 298 | } 299 | for (uint64_t x = MASK62_NO60 + 1000; x > MASK62_NO60 - 1000; --x) { 300 | if (!test_pred(x)) { 301 | return false; 302 | } 303 | } 304 | for (uint64_t x = MASK61 + 1000; x > MASK61 - 1000; --x) { 305 | if (!test_pred(x)) { 306 | return false; 307 | } 308 | } 309 | 310 | fp61::Random prng; 311 | prng.Seed(2); 312 | 313 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 314 | { 315 | uint64_t x = prng.Next(); 316 | 317 | if (!test_pred(x)) { 318 | return false; 319 | } 320 | } 321 | 322 | cout << "Passed" << endl; 323 | 324 | return true; 325 | } 326 | 327 | 328 | //------------------------------------------------------------------------------ 329 | // Tests: Finalize Reduction 330 | 331 | static bool test_fred(uint64_t x) 332 | { 333 | // EXCEPTION: This input is known to not work 334 | if (x == 0x3ffffffffffffffeULL) { 335 | return true; 336 | } 337 | 338 | uint64_t actual = fp61::Finalize(x); 339 | uint64_t expected = x % fp61::kPrime; 340 | 341 | if (actual != expected) 342 | { 343 | cout << "Failed for x=" << HexString(x) << endl; 344 | FP61_DEBUG_BREAK(); 345 | return false; 346 | } 347 | return true; 348 | } 349 | 350 | static bool TestFinalizeReduction() 351 | { 352 | cout << "TestFinalizeReduction..."; 353 | 354 | // Input has #63 and #62 clear, other bits can take on any value 355 | 356 | for (uint64_t x = 0; x < 1000; ++x) { 357 | if (!test_fred(x)) { 358 | return false; 359 | } 360 | } 361 | for (uint64_t x = MASK62; x > MASK62 - 1000; --x) { 362 | if (!test_fred(x)) { 
363 | return false; 364 | } 365 | } 366 | for (uint64_t x = MASK62_NO60 + 1000; x > MASK62_NO60 - 1000; --x) { 367 | if (!test_fred(x)) { 368 | return false; 369 | } 370 | } 371 | for (uint64_t x = MASK61 + 1000; x > MASK61 - 1000; --x) { 372 | if (!test_fred(x)) { 373 | return false; 374 | } 375 | } 376 | 377 | fp61::Random prng; 378 | prng.Seed(3); 379 | 380 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 381 | { 382 | uint64_t x = prng.Next() & MASK62; 383 | 384 | if (!test_fred(x)) { 385 | return false; 386 | } 387 | } 388 | 389 | cout << "Passed" << endl; 390 | 391 | return true; 392 | } 393 | 394 | 395 | //------------------------------------------------------------------------------ 396 | // Tests: Multiply 397 | 398 | static bool test_mul(uint64_t x, uint64_t y) 399 | { 400 | uint64_t p = fp61::Multiply(x, y); 401 | 402 | if ((p >> 62) != 0) { 403 | cout << "Failed (high bit overflow) for x=" << HexString(x) << ", y=" << HexString(y) << endl; 404 | FP61_DEBUG_BREAK(); 405 | return false; 406 | } 407 | 408 | uint64_t r0, r1; 409 | CAT_MUL128(r1, r0, x, y); 410 | 411 | //A % B == (((AH % B) * (2^64 % B)) + (AL % B)) % B 412 | // == (((AH % B) * ((2^64 - B) % B)) + (AL % B)) % B 413 | r1 %= fp61::kPrime; 414 | uint64_t NB = (uint64_t)(-(int64_t)fp61::kPrime); 415 | uint64_t mod = r1 * (NB % fp61::kPrime); 416 | mod += r0 % fp61::kPrime; 417 | mod %= fp61::kPrime; 418 | 419 | if (p % fp61::kPrime != mod) { 420 | cout << "Failed (reduced result mismatch) for x=" << HexString(x) << ", y=" << HexString(y) << endl; 421 | FP61_DEBUG_BREAK(); 422 | return false; 423 | } 424 | 425 | return true; 426 | } 427 | 428 | static bool TestMultiply() 429 | { 430 | cout << "TestMultiply..."; 431 | 432 | // Number of bits between x, y must be 124 or fewer. 
433 | 434 | for (uint64_t x = 0; x < 1000; ++x) { 435 | for (uint64_t y = x; y < 1000; ++y) { 436 | if (!test_mul(x, y)) { 437 | return false; 438 | } 439 | } 440 | } 441 | for (uint64_t x = MASK62; x > MASK62 - 1000; --x) { 442 | for (uint64_t y = x; y > MASK62 - 1000; --y) { 443 | if (!test_mul(x, y)) { 444 | return false; 445 | } 446 | } 447 | } 448 | for (uint64_t x = MASK62_NO60 + 1000; x > MASK62_NO60 - 1000; --x) { 449 | for (uint64_t y = x; y > MASK62_NO60 - 1000; --y) { 450 | if (!test_mul(x, y)) { 451 | return false; 452 | } 453 | } 454 | } 455 | for (uint64_t x = MASK61 + 1000; x > MASK61 - 1000; --x) { 456 | for (uint64_t y = x; y > MASK61 - 1000; --y) { 457 | if (!test_mul(x, y)) { 458 | return false; 459 | } 460 | } 461 | } 462 | 463 | fp61::Random prng; 464 | prng.Seed(4); 465 | 466 | // 62 + 62 = 124 bits 467 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 468 | { 469 | uint64_t x = prng.Next() & MASK62; 470 | uint64_t y = prng.Next() & MASK62; 471 | 472 | if (!test_mul(x, y)) { 473 | return false; 474 | } 475 | } 476 | 477 | // 61 + 63 = 124 bits 478 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 479 | { 480 | uint64_t x = prng.Next() & MASK61; 481 | uint64_t y = prng.Next() & MASK63; 482 | 483 | if (!test_mul(x, y)) { 484 | return false; 485 | } 486 | } 487 | 488 | // Commutivity test 489 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 490 | { 491 | uint64_t x = prng.Next() & MASK62; 492 | uint64_t y = prng.Next() & MASK62; 493 | uint64_t z = prng.Next() & MASK62; 494 | 495 | uint64_t r = fp61::Finalize(fp61::Multiply(fp61::Multiply(z, y), x)); 496 | uint64_t s = fp61::Finalize(fp61::Multiply(fp61::Multiply(x, z), y)); 497 | uint64_t t = fp61::Finalize(fp61::Multiply(fp61::Multiply(x, y), z)); 498 | 499 | if (r != s || s != t) { 500 | cout << "Failed (does not commute) for i=" << i << endl; 501 | FP61_DEBUG_BREAK(); 502 | return false; 503 | } 504 | } 505 | 506 | // Direct function test 507 | uint64_t r1, r0; 508 | r0 = 
Emulate64x64to128(r1, MASK64, MASK64); 509 | 510 | if (r1 != 0xfffffffffffffffe || r0 != 1) { 511 | cout << "Failed (Emulate64x64to128 failed)" << endl; 512 | FP61_DEBUG_BREAK(); 513 | return false; 514 | } 515 | 516 | cout << "Passed" << endl; 517 | 518 | return true; 519 | } 520 | 521 | 522 | //------------------------------------------------------------------------------ 523 | // Tests: Inverse 524 | 525 | static bool test_inv(uint64_t x) 526 | { 527 | uint64_t i = fp61::Inverse(x); 528 | 529 | // If no inverse existed: 530 | if (i == 0) 531 | { 532 | // Then it must have evenly divided 533 | if (x % fp61::kPrime == 0) { 534 | return true; 535 | } 536 | 537 | // Otherwise this should have had a result 538 | cout << "Failed (no result) for x=" << HexString(x) << endl; 539 | FP61_DEBUG_BREAK(); 540 | return false; 541 | } 542 | 543 | // Result must be in Fp 544 | if (i >= fp61::kPrime) 545 | { 546 | cout << "Failed (result too large) for x=" << HexString(x) << endl; 547 | FP61_DEBUG_BREAK(); 548 | return false; 549 | } 550 | 551 | // mul requires partially reduced input 552 | x = fp61::PartialReduce(x); 553 | 554 | uint64_t p = fp61::Multiply(x, i); 555 | 556 | // If result is not 1 then it is not a multiplicative inverse 557 | if (fp61::Finalize(p) != 1) 558 | { 559 | cout << "Failed (finalized result not 1) for x=" << HexString(x) << endl; 560 | FP61_DEBUG_BREAK(); 561 | return false; 562 | } 563 | 564 | // Double check the reduce function... 
565 | if (p % fp61::kPrime != 1) 566 | { 567 | cout << "Failed (remainder not 1) for x=" << HexString(x) << endl; 568 | FP61_DEBUG_BREAK(); 569 | return false; 570 | } 571 | 572 | return true; 573 | } 574 | 575 | static bool TestMulInverse() 576 | { 577 | cout << "TestMulInverse..."; 578 | 579 | // x < p 580 | 581 | // Small values 582 | for (uint64_t x = 1; x < 1000; ++x) { 583 | if (!test_inv(x)) { 584 | return false; 585 | } 586 | } 587 | 588 | fp61::Random prng; 589 | prng.Seed(5); 590 | 591 | for (unsigned i = 0; i < kRandomTestLoops; ++i) 592 | { 593 | uint64_t x = prng.Next(); 594 | 595 | if (!test_inv(x)) { 596 | return false; 597 | } 598 | } 599 | 600 | cout << "Passed" << endl; 601 | 602 | return true; 603 | } 604 | 605 | 606 | //------------------------------------------------------------------------------ 607 | // Tests: ByteReader 608 | 609 | bool test_byte_reader(const uint8_t* data, unsigned bytes) 610 | { 611 | fp61::ByteReader reader; 612 | 613 | reader.BeginRead(data, bytes); 614 | 615 | // Round up to the next 61 bits 616 | uint64_t expandedBits = bytes * 8; 617 | unsigned actualReads = 0; 618 | unsigned bits = 0; 619 | bool packed = false; 620 | unsigned packedBit = 0; 621 | 622 | uint64_t fp; 623 | while (fp61::ReadResult::Success == reader.Read(fp)) 624 | { 625 | unsigned readStart = bits / 8; 626 | if (readStart >= bytes) 627 | { 628 | // We can read one extra bit if the packing is the last thing 629 | if (!packed || readStart != bytes) 630 | { 631 | FP61_DEBUG_BREAK(); 632 | cout << "Failed (too many reads) for bytes=" << bytes << " actualReads=" << actualReads << endl; 633 | return false; 634 | } 635 | } 636 | 637 | int readBytes = (int)bytes - (int)readStart; 638 | if (readBytes < 0) { 639 | readBytes = 0; 640 | } 641 | else if (readBytes > 8) { 642 | readBytes = 8; 643 | } 644 | 645 | uint64_t x = fp61::ReadBytes_LE(data + readStart, readBytes) >> (bits % 8); 646 | 647 | int readBits = (readBytes * 8) - (bits % 8); 648 | if (readBytes >= 
8 && readBits > 0 && readBits < 61 && readStart + readBytes < bytes) 649 | { 650 | // Need to read one more byte sometimes 651 | uint64_t high = data[readStart + readBytes]; 652 | high <<= readBits; 653 | x |= high; 654 | } 655 | 656 | // Test packing 657 | if (packed) 658 | { 659 | x <<= 1; 660 | x |= packedBit; 661 | bits += 60; 662 | ++expandedBits; 663 | } 664 | else 665 | { 666 | bits += 61; 667 | } 668 | 669 | x &= fp61::kPrime; 670 | 671 | packed = fp61::IsU64Ambiguous(x); 672 | if (packed) 673 | { 674 | packedBit = (x == fp61::kPrime); 675 | x = fp61::kAmbiguityMask; 676 | } 677 | 678 | if (fp != x) 679 | { 680 | FP61_DEBUG_BREAK(); 681 | cout << "Failed (wrong value) for bytes=" << bytes << " actualReads=" << actualReads << endl; 682 | return false; 683 | } 684 | ++actualReads; 685 | } 686 | 687 | const unsigned expectedReads = (unsigned)((expandedBits + 60) / 61); 688 | if (actualReads != expectedReads) 689 | { 690 | FP61_DEBUG_BREAK(); 691 | cout << "Failed (read count wrong) for bytes=" << bytes << endl; 692 | return false; 693 | } 694 | 695 | const unsigned maxWords = fp61::ByteReader::MaxWords(bytes); 696 | if (maxWords < actualReads) 697 | { 698 | FP61_DEBUG_BREAK(); 699 | cout << "Failed (MaxWords wrong) for bytes=" << bytes << endl; 700 | return false; 701 | } 702 | 703 | return true; 704 | } 705 | 706 | bool TestByteReader() 707 | { 708 | cout << "TestByteReader..."; 709 | 710 | uint8_t data[10 + 8] = { 711 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 712 | 0, 0, 0, 0, 0, 0, 0, 0 // Padding to simplify test 713 | }; 714 | 715 | uint64_t w = fp61::ReadU64_LE(data); 716 | if (w != 0x0807060504030201ULL) { 717 | cout << "Failed (ReadU64_LE)" << endl; 718 | FP61_DEBUG_BREAK(); 719 | return false; 720 | } 721 | 722 | uint32_t u = fp61::ReadU32_LE(data); 723 | if (u != 0x04030201UL) { 724 | cout << "Failed (ReadU32_LE)" << endl; 725 | FP61_DEBUG_BREAK(); 726 | return false; 727 | } 728 | 729 | uint64_t z = fp61::ReadBytes_LE(data, 0); 730 | if (z != 0) { 731 | cout 
<< "Failed (ReadBytes_LE 0)" << endl; 732 | FP61_DEBUG_BREAK(); 733 | return false; 734 | } 735 | 736 | for (unsigned i = 1; i <= 8; ++i) 737 | { 738 | uint64_t v = fp61::ReadBytes_LE(data, i); 739 | uint64_t d = v ^ w; 740 | d <<= 8 * (8 - i); 741 | if (d != 0) { 742 | cout << "Failed (ReadBytes_LE) for i = " << i << endl; 743 | FP61_DEBUG_BREAK(); 744 | return false; 745 | } 746 | } 747 | 748 | uint8_t simpledata[16 + 8] = { 749 | 0, 1, 2, 3, 4, 5, 6, 7, 750 | 8, 9, 10, 11, 12, 13, 14, 15, 751 | 0 752 | }; 753 | 754 | for (unsigned i = 0; i <= 16; ++i) 755 | { 756 | if (!test_byte_reader(simpledata, i)) { 757 | return false; 758 | } 759 | } 760 | 761 | uint8_t allones[16 + 8] = { 762 | 254,255,255,255,255,255,255,255, 763 | 255,255,255,255,255,255,255,255, 764 | 0 765 | }; 766 | 767 | for (unsigned i = 0; i <= 16; ++i) 768 | { 769 | if (!test_byte_reader(allones, i)) { 770 | return false; 771 | } 772 | } 773 | 774 | uint8_t mixed[20 + 8] = { 775 | 254,255,255,255,255,255,255,255,0, // Inject a non-overflowing bit in the middle 776 | 255,255,255,255,255,255,255, 777 | 255,255,255,255, 778 | 0 779 | }; 780 | 781 | for (unsigned i = 0; i <= 16; ++i) 782 | { 783 | if (!test_byte_reader(allones, i)) { 784 | return false; 785 | } 786 | } 787 | 788 | vector randBytes(kMaxDataLength + 8, 0); // +8 to avoid bounds checking 789 | 790 | fp61::Random prng; 791 | prng.Seed(10); 792 | 793 | for (unsigned i = 0; i < kMaxDataLength; ++i) 794 | { 795 | for (unsigned j = 0; j < 1; ++j) 796 | { 797 | // Fill the data with random bytes 798 | for (unsigned k = 0; k < i; k += 8) 799 | { 800 | uint64_t w; 801 | if (prng.Next() % 100 <= 3) { 802 | w = ~(uint64_t)0; 803 | } 804 | else { 805 | w = prng.Next(); 806 | } 807 | fp61::WriteU64_LE(&randBytes[k], w); 808 | } 809 | 810 | if (!test_byte_reader(&randBytes[0], i)) { 811 | return false; 812 | } 813 | } 814 | } 815 | 816 | cout << "Passed" << endl; 817 | 818 | return true; 819 | } 820 | 821 | 822 | 
//------------------------------------------------------------------------------ 823 | // Tests: Random 824 | 825 | static bool TestRandom() 826 | { 827 | cout << "TestRandom..."; 828 | 829 | for (int i = -1000; i < 1000; ++i) 830 | { 831 | uint64_t loWord = static_cast(i); 832 | loWord <<= 3; // Put it in the high bits 833 | uint64_t loResult = fp61::Random::ConvertRandToFp(loWord); 834 | 835 | if (loResult >= fp61::kPrime) 836 | { 837 | cout << "Failed (RandToFp low) at i = " << i << endl; 838 | FP61_DEBUG_BREAK(); 839 | return false; 840 | } 841 | 842 | uint64_t hiWord = fp61::kPrime + static_cast(i); 843 | hiWord <<= 3; // Put it in the high bits 844 | uint64_t hiResult = fp61::Random::ConvertRandToFp(hiWord); 845 | 846 | if (hiResult >= fp61::kPrime) 847 | { 848 | cout << "Failed (RandToFp high) at i = " << i << endl; 849 | FP61_DEBUG_BREAK(); 850 | return false; 851 | } 852 | } 853 | 854 | for (int i = -1000; i < 1000; ++i) 855 | { 856 | uint64_t loWord = static_cast(i); 857 | loWord <<= 3; // Put it in the high bits 858 | uint64_t loResult = fp61::Random::ConvertRandToNonzeroFp(loWord); 859 | 860 | if (loResult <= 0 || loResult >= fp61::kPrime) 861 | { 862 | cout << "Failed (RandToNonzeroFp low) at i = " << i << endl; 863 | FP61_DEBUG_BREAK(); 864 | return false; 865 | } 866 | 867 | uint64_t hiWord = fp61::kPrime + static_cast(i); 868 | hiWord <<= 3; // Put it in the high bits 869 | uint64_t hiResult = fp61::Random::ConvertRandToNonzeroFp(hiWord); 870 | 871 | if (hiResult <= 0 || hiResult >= fp61::kPrime) 872 | { 873 | cout << "Failed (RandToNonzeroFp high) at i = " << i << endl; 874 | FP61_DEBUG_BREAK(); 875 | return false; 876 | } 877 | } 878 | 879 | cout << "Passed" << endl; 880 | 881 | return true; 882 | } 883 | 884 | 885 | //------------------------------------------------------------------------------ 886 | // Tests: WordReader/WordWriter 887 | 888 | static bool TestWordSerialization() 889 | { 890 | cout << "TestWordSerialization..."; 891 | 892 | 
fp61::WordWriter writer; 893 | fp61::WordReader reader; 894 | 895 | fp61::Random prng; 896 | prng.Seed(11); 897 | 898 | std::vector data; 899 | std::vector wordData; 900 | 901 | for (unsigned i = 1; i < kMaxDataLength; ++i) 902 | { 903 | unsigned words = i; 904 | unsigned bytesNeeded = fp61::WordWriter::BytesNeeded(words); 905 | 906 | data.resize(bytesNeeded); 907 | wordData.resize(words); 908 | 909 | writer.BeginWrite(&data[0]); 910 | reader.BeginRead(&data[0], bytesNeeded); 911 | 912 | for (unsigned j = 0; j < words; ++j) 913 | { 914 | // Generate a value from 0..p because the writer technically does not care about staying within the field 915 | uint64_t w = prng.Next() & MASK61; 916 | wordData[j] = w; 917 | writer.Write(w); 918 | } 919 | writer.Flush(); 920 | 921 | for (unsigned j = 0; j < words; ++j) 922 | { 923 | uint64_t u = reader.Read(); 924 | if (u != wordData[j]) 925 | { 926 | cout << "Failed (readback failed) at i = " << i << " j = " << j << endl; 927 | FP61_DEBUG_BREAK(); 928 | return false; 929 | } 930 | } 931 | } 932 | 933 | cout << "Passed" << endl; 934 | 935 | return true; 936 | } 937 | 938 | 939 | //------------------------------------------------------------------------------ 940 | // Tests: ByteWriter 941 | 942 | bool TestByteWriter() 943 | { 944 | cout << "TestByteWriter..."; 945 | 946 | fp61::ByteReader reader; 947 | fp61::ByteWriter writer; 948 | 949 | fp61::Random prng; 950 | prng.Seed(14); 951 | 952 | std::vector original, recovered; 953 | 954 | for (unsigned i = 1; i < kMaxDataLength; ++i) 955 | { 956 | unsigned bytes = i; 957 | 958 | for (unsigned j = 0; j < 10; ++j) 959 | { 960 | // Padding to simplify tester 961 | original.resize(bytes + 8); 962 | 963 | // Fill the data with random bytes 964 | for (unsigned k = 0; k < i; k += 8) 965 | { 966 | uint64_t w; 967 | if (prng.Next() % 100 <= 3) { 968 | w = ~(uint64_t)0; 969 | } 970 | else { 971 | w = prng.Next(); 972 | } 973 | fp61::WriteU64_LE(&original[k], w); 974 | } 975 | 976 | 
reader.BeginRead(&original[0], bytes); 977 | 978 | unsigned maxWords = fp61::ByteReader::MaxWords(bytes); 979 | unsigned maxBytes = fp61::ByteWriter::MaxBytesNeeded(maxWords); 980 | 981 | recovered.resize(maxBytes); 982 | writer.BeginWrite(&recovered[0]); 983 | 984 | // Write words we get directly back out 985 | uint64_t word; 986 | while (reader.Read(word) != fp61::ReadResult::Empty) { 987 | writer.Write(word); 988 | } 989 | unsigned writtenBytes = writer.Flush(); 990 | 991 | // TBD: Check if high bits are 0? 992 | 993 | if (writtenBytes > maxBytes || 994 | writtenBytes > bytes + 8) 995 | { 996 | cout << "Failed (byte count mismatch) at i = " << i << " j = " << j << endl; 997 | FP61_DEBUG_BREAK(); 998 | return false; 999 | } 1000 | 1001 | if (0 != memcmp(&recovered[0], &original[0], bytes)) 1002 | { 1003 | cout << "Failed (data corruption) at i = " << i << " j = " << j << endl; 1004 | FP61_DEBUG_BREAK(); 1005 | return false; 1006 | } 1007 | } 1008 | } 1009 | 1010 | cout << "Passed" << endl; 1011 | 1012 | return true; 1013 | } 1014 | 1015 | 1016 | //------------------------------------------------------------------------------ 1017 | // Tests: Integration 1018 | 1019 | // Tests all of the serialization/deserialization and some math code 1020 | bool TestIntegration() 1021 | { 1022 | cout << "TestIntegration..."; 1023 | 1024 | std::vector data, recovery, recovered; 1025 | 1026 | fp61::Random prng; 1027 | prng.Seed(13); 1028 | 1029 | // Test a range of data sizes 1030 | for (unsigned i = 1; i < kMaxDataLength; ++i) 1031 | { 1032 | unsigned bytes = i; 1033 | 1034 | // Run a few tests for each size 1035 | for (unsigned j = 0; j < 10; ++j) 1036 | { 1037 | // Generate some test data: 1038 | 1039 | // Allocate padded data to simplify tester 1040 | data.resize(bytes + 8); 1041 | 1042 | // Fill the data with random bytes 1043 | for (unsigned k = 0; k < i; k += 8) 1044 | { 1045 | uint64_t w; 1046 | if (prng.Next() % 100 <= 3) { 1047 | w = ~(uint64_t)0; 1048 | } 1049 | else { 
1050 | w = prng.Next(); 1051 | } 1052 | fp61::WriteU64_LE(&data[k], w); 1053 | } 1054 | 1055 | // Read data from the simulated packet, 1056 | // perform some example Fp operation on it, 1057 | // and then store it to a simulated recovery packet. 1058 | 1059 | // Preallocate enough space in recovery packets for the worst case 1060 | const unsigned maxWords = fp61::ByteReader::MaxWords(bytes); 1061 | recovery.resize(fp61::WordWriter::BytesNeeded(maxWords)); 1062 | 1063 | fp61::WordWriter recovery_writer; 1064 | recovery_writer.BeginWrite(&recovery[0]); 1065 | 1066 | fp61::ByteReader original_reader; 1067 | original_reader.BeginRead(&data[0], bytes); 1068 | 1069 | fp61::Random coeff_prng; 1070 | coeff_prng.Seed(bytes + j * 500000); 1071 | 1072 | // Start reading words from the original file/packet, 1073 | // multiplying them by a random coefficient, 1074 | // and writing them to the recovery file/packet. 1075 | uint64_t r; 1076 | while (original_reader.Read(r) == fp61::ReadResult::Success) 1077 | { 1078 | // Pick random coefficient to multiply between 1..p-1 1079 | uint64_t coeff = coeff_prng.NextNonzeroFp(); 1080 | 1081 | // x = r * coeff (62 bits) 1082 | uint64_t x = fp61::Multiply(r, coeff); 1083 | 1084 | // Finalize x (61 bits < p) 1085 | uint64_t f = fp61::Finalize(x); 1086 | 1087 | // Write to recovery file/packet 1088 | recovery_writer.Write(f); 1089 | } 1090 | 1091 | // Flush the remaining bits to the recovery file/packet 1092 | unsigned writtenRecoveryBytes = recovery_writer.Flush(); 1093 | 1094 | // Simulate reading data from the recovery file/packet 1095 | // and recovering the original data: 1096 | 1097 | fp61::WordReader recovery_reader; 1098 | recovery_reader.BeginRead(&recovery[0], writtenRecoveryBytes); 1099 | 1100 | // Allocate space for recovered data (may be up to 1.6% larger than needed) 1101 | const unsigned recoveryWords = fp61::WordReader::WordCount(writtenRecoveryBytes); 1102 | const unsigned maxBytes = 
fp61::ByteWriter::MaxBytesNeeded(recoveryWords); 1103 | recovered.resize(maxBytes); 1104 | 1105 | fp61::ByteWriter original_writer; 1106 | original_writer.BeginWrite(&recovered[0]); 1107 | 1108 | // Reproduce the same random sequence 1109 | coeff_prng.Seed(bytes + j * 500000); 1110 | 1111 | // For each word to read: 1112 | const unsigned readWords = fp61::WordReader::WordCount(writtenRecoveryBytes); 1113 | for (unsigned i = 0; i < readWords; ++i) 1114 | { 1115 | // Pick random coefficient to multiply between 1..p-1 1116 | uint64_t coeff = coeff_prng.NextNonzeroFp(); 1117 | uint64_t inv_coeff = fp61::Inverse(coeff); 1118 | 1119 | // Read the next word (61 bits) 1120 | uint64_t f = recovery_reader.Read(); 1121 | 1122 | // Invert the multiplication (62 bits) 1123 | uint64_t x = fp61::Multiply(f, inv_coeff); 1124 | 1125 | // Finalize x (61 bits < p) 1126 | x = fp61::Finalize(x); 1127 | 1128 | // Write to recovered original data buffer 1129 | original_writer.Write(x); 1130 | } 1131 | 1132 | // Flush the remaining bits to the recovered original file/packet 1133 | unsigned recoveredBytes = original_writer.Flush(); 1134 | 1135 | if (recoveredBytes > maxBytes || 1136 | recoveredBytes > bytes + 8) 1137 | { 1138 | cout << "Failed (byte count mismatch) at i = " << i << " j = " << j << endl; 1139 | FP61_DEBUG_BREAK(); 1140 | return false; 1141 | } 1142 | 1143 | if (0 != memcmp(&recovered[0], &data[0], bytes)) 1144 | { 1145 | cout << "Failed (data corruption) at i = " << i << " j = " << j << endl; 1146 | FP61_DEBUG_BREAK(); 1147 | return false; 1148 | } 1149 | } 1150 | } 1151 | 1152 | cout << "Passed" << endl; 1153 | 1154 | return true; 1155 | } 1156 | 1157 | 1158 | //------------------------------------------------------------------------------ 1159 | // Entrypoint 1160 | 1161 | int main() 1162 | { 1163 | cout << "Unit tester for Fp61. 
Exits with -1 on failure, 0 on success" << endl; 1164 | cout << endl; 1165 | 1166 | int result = FP61_RET_SUCCESS; 1167 | 1168 | if (!TestByteWriter()) { 1169 | result = FP61_RET_FAIL; 1170 | } 1171 | if (!TestIntegration()) { 1172 | result = FP61_RET_FAIL; 1173 | } 1174 | if (!TestRandom()) { 1175 | result = FP61_RET_FAIL; 1176 | } 1177 | if (!TestWordSerialization()) { 1178 | result = FP61_RET_FAIL; 1179 | } 1180 | if (!TestNegate()) { 1181 | result = FP61_RET_FAIL; 1182 | } 1183 | if (!TestAdd()) { 1184 | result = FP61_RET_FAIL; 1185 | } 1186 | if (!TestPartialReduction()) { 1187 | result = FP61_RET_FAIL; 1188 | } 1189 | if (!TestFinalizeReduction()) { 1190 | result = FP61_RET_FAIL; 1191 | } 1192 | if (!TestMultiply()) { 1193 | result = FP61_RET_FAIL; 1194 | } 1195 | if (!TestMulInverse()) { 1196 | result = FP61_RET_FAIL; 1197 | } 1198 | if (!TestByteReader()) { 1199 | result = FP61_RET_FAIL; 1200 | } 1201 | 1202 | cout << endl; 1203 | if (result == FP61_RET_FAIL) { 1204 | cout << "*** Tests failed (see above)! Returning -1" << endl; 1205 | } 1206 | else { 1207 | cout << "*** Tests succeeded! Returning 0" << endl; 1208 | } 1209 | 1210 | return result; 1211 | } 1212 | --------------------------------------------------------------------------------