├── .gitattributes
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── include
    ├── cm256.h
    └── gf256.h
├── src
    ├── cm256.cpp
    └── gf256.cpp
└── unit_test
    ├── Debug
        └── matrix_test.exe
    ├── Release
        └── matrix_test.exe
    ├── SiameseTools.cpp
    ├── SiameseTools.h
    ├── main.cpp
    ├── matrix_test.sln
    ├── matrix_test.vcxproj
    ├── matrix_test.vcxproj.filters
    └── x64
        ├── Debug
            └── matrix_test.exe
        └── Release
            └── matrix_test.exe


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # MSVC temp files
21 | *.obj
22 | *.log
23 | *.ilk
24 | *.pdb
25 | *.tlog
26 | *.idb
27 | *.opensdf
28 | *.sdf
29 | *.user
30 | *.suo
31 | 
32 | # =========================
33 | # Operating System Files
34 | # =========================
35 | 
36 | # OSX
37 | # =========================
38 | 
39 | .DS_Store
40 | .AppleDouble
41 | .LSOverride
42 | 
43 | # Thumbnails
44 | ._*
45 | 
46 | # Files that might appear on external disk
47 | .Spotlight-V100
48 | .Trashes
49 | 
50 | # Directories potentially created on remote AFP share
51 | .AppleDB
52 | .AppleDesktop
53 | Network Trash Folder
54 | Temporary Items
55 | .apdisk
56 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Build script for the cm256 erasure-code library and its unit test.
 2 | #
 3 | # Targets:
 4 | #   cm256     - codec library built from src/ with its public headers in include/
 5 | #   unit_test - test executable linked against cm256
 6 | #
 7 | # Options:
 8 | #   MARCH_NATIVE - compile with -march=native when the compiler accepts it (default ON)
 9 | cmake_minimum_required(VERSION 3.5)
10 | project(cm256 LANGUAGES CXX)
11 | 
12 | # GNUInstallDirs defines the CMAKE_INSTALL_*DIR variables used by the install()
13 | # rules at the bottom of this file; without it they expand to nothing.
14 | include(GNUInstallDirs)
15 | include(CheckCXXCompilerFlag)
16 | 
17 | set(CMAKE_CXX_STANDARD 11)
18 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
19 | 
20 | # Only multi-config generators (Visual Studio, Xcode) populate this variable;
21 | # for those, offer just Debug and Release.  Single-config generators are left
22 | # alone so the cache is not polluted with a variable they do not use.
23 | if(CMAKE_CONFIGURATION_TYPES)
24 |     set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE)
25 | endif()
26 | 
27 | # Default to an optimized build when the user did not request a build type.
28 | if(NOT CMAKE_BUILD_TYPE)
29 |     set(CMAKE_BUILD_TYPE Release)
30 | endif()
31 | 
32 | option(MARCH_NATIVE "Use -march=native option" ON)
33 | 
34 | set(LIB_SOURCE_FILES
35 |     src/cm256.cpp
36 |     include/cm256.h
37 |     src/gf256.cpp
38 |     include/gf256.h
39 | )
40 | 
41 | set(UNIT_TEST_SOURCE_FILES
42 |     unit_test/main.cpp
43 |     unit_test/SiameseTools.cpp
44 |     unit_test/SiameseTools.h
45 | )
46 | 
47 | add_library(cm256 ${LIB_SOURCE_FILES})
48 | set_target_properties(cm256 PROPERTIES
49 |     VERSION 2
50 |     SOVERSION 2
51 | )
52 | target_include_directories(cm256 PUBLIC
53 |     $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
54 |     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
55 | )
56 | 
57 | add_executable(unit_test ${UNIT_TEST_SOURCE_FILES})
58 | target_link_libraries(unit_test PRIVATE cm256)
59 | 
60 | # Warnings are attached to the targets instead of being written into
61 | # CMAKE_CXX_FLAGS, so flags supplied by the user or a toolchain file survive.
62 | # The per-config optimization flags are left at CMake's defaults
63 | # (note: the default Release flags include -DNDEBUG, which disables assert()).
64 | if(NOT MSVC)
65 |     target_compile_options(cm256 PRIVATE -Wall -Wextra)
66 |     target_compile_options(unit_test PRIVATE -Wall -Wextra)
67 | endif()
68 | 
69 | # -march=native is a GCC/Clang spelling; skip it for MSVC and for compilers or
70 | # architectures that reject it.
71 | if(MARCH_NATIVE AND NOT MSVC)
72 |     check_cxx_compiler_flag("-march=native" CM256_HAVE_MARCH_NATIVE)
73 |     if(CM256_HAVE_MARCH_NATIVE)
74 |         # PUBLIC on purpose: gf256.h changes the layout of gf256_ctx when
75 |         # __AVX2__ is defined, so every target that includes the header must
76 |         # be compiled with the same architecture flags as the library.
77 |         target_compile_options(cm256 PUBLIC -march=native)
78 |     endif()
79 | endif()
80 | 
81 | install(TARGETS cm256
82 |     ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
83 |     LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
84 |     RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
85 | )
86 | # The trailing slash copies the headers themselves into the include directory.
87 | install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/"
88 |     DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
89 | )
90 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2024, Chris Taylor
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice, this
 9 |    list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its
16 |    contributors may be used to endorse or promote products derived from
17 |    this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # cm256
  2 | Fast GF(256) Cauchy MDS Block Erasure Codec in C
  3 | 
  4 | cm256 is a simple library for erasure codes.  From given data it generates
  5 | redundant data that can be used to recover the originals.
  6 | 
  7 | It is roughly 2x faster than Longhair, and CM256 supports input data that is not a multiple of 8 bytes.
  8 | 
  9 | Currently only Visual Studio 2013 is supported, though other versions of MSVC may work.
 10 | 
 11 | The original data should be split up into equally-sized chunks.  If one of these chunks
 12 | is erased, the redundant data can fill in the gap through decoding.
 13 | 
 14 | The erasure code is parameterized by three values (`OriginalCount`, `RecoveryCount`, `BlockBytes`).  `BlockBytes` is the number of bytes per block (all blocks are the same size); the other two are:
 15 | 
 16 | + The number of blocks of original data (`OriginalCount`), which must be less than 256.
 17 | + The number of blocks of redundant data (`RecoveryCount`), which must be no more than `256 - OriginalCount`.
 18 | 
 19 | For example, if a file is split into 3 equal pieces and sent over a network, `OriginalCount` is 3.
 20 | And if 2 additional redundant packets are generated, `RecoveryCount` is 2.
 21 | In this case up to 256 - 3 = 253 additional redundant packets can be generated.
 22 | 
 23 | cm256 is released under the BSD3 license for free use in commercial and open source projects.
 24 | 
 25 | ##### Building: Quick Setup
 26 | 
 27 | Include the cm256.* and gf256.* files in your project and consult the cm256.h header for usage.
 28 | 
 29 | 
 30 | ## Usage
 31 | 
 32 | Documentation is provided in the header file [cm256.h](https://github.com/catid/cm256/raw/master/include/cm256.h).
 33 | 
 34 | When your application starts up it should call `cm256_init()` to verify that the library is linked properly:
 35 | 
 36 | ~~~
 37 | 	#include "cm256.h"
 38 | 
 39 | 	if (cm256_init()) {
 40 | 		// Wrong static library
 41 | 		exit(1);
 42 | 	}
 43 | ~~~
 44 | 
 45 | To generate redundancy, use the `cm256_encode` function.  To solve for the original data use the `cm256_decode` function.
 46 | 
 47 | Example usage:
 48 | 
 49 | ~~~
 50 | bool ExampleFileUsage()
 51 | {
 52 |     if (cm256_init())
 53 |     {
 54 |         exit(1);
 55 |     }
 56 | 
 57 |     cm256_encoder_params params;
 58 | 
 59 |     // Number of bytes per file block
 60 |     params.BlockBytes = 4321;
 61 | 
 62 |     // Number of blocks
 63 |     params.OriginalCount = 33;
 64 | 
 65 |     // Number of additional recovery blocks generated by encoder
 66 |     params.RecoveryCount = 12;
 67 | 
 68 |     // Size of the original file
 69 |     static const int OriginalFileBytes = params.OriginalCount * params.BlockBytes;
 70 | 
 71 |     // Allocate and fill the original file data
 72 |     uint8_t* originalFileData = new uint8_t[OriginalFileBytes];
 73 |     memset(originalFileData, 1, OriginalFileBytes);
 74 | 
 75 |     // Pointers to data
 76 |     cm256_block blocks[256];
 77 |     for (int i = 0; i < params.OriginalCount; ++i)
 78 |     {
 79 |         blocks[i].Block = originalFileData + i * params.BlockBytes;
 80 |     }
 81 | 
 82 |     // Recovery data
 83 |     uint8_t* recoveryBlocks = new uint8_t[params.RecoveryCount * params.BlockBytes];
 84 | 
 85 |     // Generate recovery data
 86 |     if (cm256_encode(params, blocks, recoveryBlocks))
 87 |     {
 88 |         exit(1);
 89 |     }
 90 | 
 91 |     // Initialize the indices
 92 |     for (int i = 0; i < params.OriginalCount; ++i)
 93 |     {
 94 |         blocks[i].Index = cm256_get_original_block_index(params, i);
 95 |     }
 96 | 
 97 |     //// Simulate loss of data, substituting a recovery block in its place ////
 98 |     blocks[0].Block = recoveryBlocks; // First recovery block
 99 |     blocks[0].Index = cm256_get_recovery_block_index(params, 0); // First recovery block index
100 |     //// Simulate loss of data, substituting a recovery block in its place ////
101 | 
102 |     if (cm256_decode(params, blocks))
103 |     {
104 |         exit(1);
105 |     }
106 | 
107 |     // blocks[0].Index will now be 0.
108 | 
109 |     delete[] originalFileData;
110 |     delete[] recoveryBlocks;
111 | 
112 |     return true;
113 | }
114 | ~~~
115 | 
116 | The example above is just one way to use the `cm256_decode` function.
117 | 
118 | This API was designed to be flexible enough for UDP/IP-based file transfer where
119 | the blocks arrive out of order.
120 | 
121 | 
122 | #### Benchmark
123 | 
124 | CM256 demonstrates similar encoding and (worst case) decoding performance:
125 | 
126 | ~~~
127 | Encoder: 1296 bytes k = 100 m = 1 : 5.55886 usec, 23314.1 MBps
128 | Decoder: 1296 bytes k = 100 m = 1 : 6.72915 usec, 19259.5 MBps
129 | Encoder: 1296 bytes k = 100 m = 2 : 17.2617 usec, 7507.93 MBps
130 | Decoder: 1296 bytes k = 100 m = 2 : 19.6023 usec, 6611.46 MBps
131 | Encoder: 1296 bytes k = 100 m = 3 : 30.4275 usec, 4259.31 MBps
132 | Decoder: 1296 bytes k = 100 m = 3 : 32.4755 usec, 3990.7 MBps
133 | Encoder: 1296 bytes k = 100 m = 4 : 40.6675 usec, 3186.82 MBps
134 | Decoder: 1296 bytes k = 100 m = 4 : 43.5932 usec, 2972.94 MBps
135 | Encoder: 1296 bytes k = 100 m = 5 : 51.7852 usec, 2502.64 MBps
136 | Decoder: 1296 bytes k = 100 m = 5 : 51.4926 usec, 2516.86 MBps
137 | Encoder: 1296 bytes k = 100 m = 6 : 62.6104 usec, 2069.94 MBps
138 | Decoder: 1296 bytes k = 100 m = 6 : 64.9509 usec, 1995.35 MBps
139 | Encoder: 1296 bytes k = 100 m = 7 : 76.3612 usec, 1697.2 MBps
140 | Decoder: 1296 bytes k = 100 m = 7 : 75.191 usec, 1723.61 MBps
141 | Encoder: 1296 bytes k = 100 m = 8 : 85.1384 usec, 1522.23 MBps
142 | Decoder: 1296 bytes k = 100 m = 8 : 83.0904 usec, 1559.75 MBps
143 | Encoder: 1296 bytes k = 100 m = 9 : 96.2561 usec, 1346.41 MBps
144 | Decoder: 1296 bytes k = 100 m = 9 : 95.3784 usec, 1358.8 MBps
145 | Encoder: 1296 bytes k = 100 m = 10 : 110.592 usec, 1171.87 MBps
146 | Decoder: 1296 bytes k = 100 m = 10 : 109.714 usec, 1181.25 MBps
147 | 
148 | Encoder: 1296 bytes k = 100 m = 20 : 223.525 usec, 579.801 MBps
149 | Decoder: 1296 bytes k = 100 m = 20 : 209.481 usec, 618.671 MBps
150 | 
151 | Encoder: 1296 bytes k = 100 m = 30 : 372.737 usec, 347.699 MBps
152 | Decoder: 1296 bytes k = 100 m = 30 : 322.707 usec, 401.603 MBps
153 | 
154 | Encoder: 1296 bytes k = 100 m = 40 : 471.626 usec, 274.794 MBps
155 | Decoder: 1296 bytes k = 100 m = 40 : 434.762 usec, 298.094 MBps
156 | 
157 | Encoder: 1296 bytes k = 100 m = 50 : 592.751 usec, 218.642 MBps
158 | Decoder: 1296 bytes k = 100 m = 50 : 545.939 usec, 237.389 MBps
159 | ~~~
160 | (These performance numbers are out of date and not well calibrated - Decoding now takes the same time as encoding within a few microseconds thanks to the new matrix solver.)
161 | 
162 | Longhair Library Results:
163 | 
164 | Note that I hand-optimized the MemXOR.cpp implementation on this PC to run faster than what is available on github, so this is a fair comparison.
165 | 
166 | ~~~
167 | Encoded k=100 data blocks with m=1 recovery blocks in 4.09607 usec : 31640.1 MB/s
168 | + Decoded 1 erasures in 5.85144 usec : 22148.4 MB/s
169 | Encoded k=100 data blocks with m=2 recovery blocks in 41.5452 usec : 3119.5 MB/s
170 | + Decoded 2 erasures in 43.5931 usec : 2972.94 MB/s
171 | Encoded k=100 data blocks with m=3 recovery blocks in 80.7498 usec : 1604.96 MB/s
172 | + Decoded 3 erasures in 86.6013 usec : 1496.51 MB/s
173 | Encoded k=100 data blocks with m=4 recovery blocks in 123.465 usec : 1049.69 MB/s
174 | + Decoded 4 erasures in 127.854 usec : 1013.66 MB/s
175 | Encoded k=100 data blocks with m=5 recovery blocks in 76.9464 usec : 1684.29 MB/s
176 | + Decoded 5 erasures in 88.6493 usec : 1461.94 MB/s
177 | Encoded k=100 data blocks with m=6 recovery blocks in 87.7717 usec : 1476.56 MB/s
178 | + Decoded 6 erasures in 100.352 usec : 1291.45 MB/s
179 | Encoded k=100 data blocks with m=7 recovery blocks in 103.863 usec : 1247.8 MB/s
180 | + Decoded 7 erasures in 127.269 usec : 1018.32 MB/s
181 | Encoded k=100 data blocks with m=8 recovery blocks in 118.784 usec : 1091.05 MB/s
182 | + Decoded 8 erasures in 145.701 usec : 889.494 MB/s
183 | Encoded k=100 data blocks with m=9 recovery blocks in 146.871 usec : 882.406 MB/s
184 | + Decoded 9 erasures in 158.574 usec : 817.284 MB/s
185 | Encoded k=100 data blocks with m=10 recovery blocks in 156.819 usec : 826.433 MB/s
186 | + Decoded 10 erasures in 181.102 usec : 715.619 MB/s
187 | 
188 | Encoded k=100 data blocks with m=20 recovery blocks in 282.039 usec : 459.511 MB/s
189 | + Decoded 20 erasures in 370.103 usec : 350.172 MB/s
190 | 
191 | Encoded k=100 data blocks with m=30 recovery blocks in 428.618 usec : 302.367 MB/s
192 | + Decoded 30 erasures in 614.693 usec : 210.837 MB/s
193 | 
194 | Encoded k=100 data blocks with m=40 recovery blocks in 562.323 usec : 230.472 MB/s
195 | + Decoded 40 erasures in 855.188 usec : 151.546 MB/s
196 | 
197 | Encoded k=100 data blocks with m=50 recovery blocks in 727.041 usec : 178.257 MB/s
198 | + Decoded 50 erasures in 1181.11 usec : 109.727 MB/s
199 | ~~~
200 | 
201 | Results Discussion:
202 | 
203 | For m=1 they are both running the same kind of code, so they're basically the same.
204 | 
205 | For m=2 and m=3, CM256 is 2.5x faster.
206 | 
207 | For m=4, CM256 is 3x faster in this case.  Longhair could use more tuning.  Back when I wrote it, the right time to switch to the Windowed decoder was at m=5, but on my new PC it seems like m=4 is a better time to do it.  CM256 only has one mode so it doesn't require any tuning for best performance.
208 | 
209 | For m=5...30, CM256 performance is not quite 2x faster, maybe 1.7x or so.
210 | 
211 | For m>30, CM256 is at least 2x faster.
212 | 
213 | 
214 | #### Comparisons with Other Libraries
215 | 
216 | The approach taken in CM256 is similar to the Intel Storage Acceleration Library (ISA-L) available here:
217 | 
218 | https://01.org/intel%C2%AE-storage-acceleration-library-open-source-version/downloads
219 | 
220 | ISA-L more aggressively optimizes the matrix multiplication operation, which is the most expensive step of encoding.
221 | 
222 | CM256 takes better advantage of the m=1 case and the first recovery symbol, which is also possible with the Vandermonde matrices supported by ISA-L.
223 | 
224 | ISA-L uses a O(N^3) Gaussian elimination solver for decoding.  The CM256 decoder solves the linear system using a fast O(N^2) LDU-decomposition algorithm from "Pivoting and Backward Stability of Fast Algorithms for Solving Cauchy Linear Equations" (T. Boros, T. Kailath, V. Olshevsky), which was hand-optimized for memory accesses.
225 | 
226 | 
227 | #### Credits
228 | 
229 | This software was written entirely by myself ( Christopher A. Taylor <mrcatid@gmail.com> ).  If you
230 | find it useful and would like to buy me a coffee, consider [tipping](https://www.gittip.com/catid/).
231 | 


--------------------------------------------------------------------------------
/include/cm256.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | 	Copyright (c) 2015 Christopher A. Taylor.  All rights reserved.
  3 | 
  4 | 	Redistribution and use in source and binary forms, with or without
  5 | 	modification, are permitted provided that the following conditions are met:
  6 | 
  7 | 	* Redistributions of source code must retain the above copyright notice,
  8 | 	  this list of conditions and the following disclaimer.
  9 | 	* Redistributions in binary form must reproduce the above copyright notice,
 10 | 	  this list of conditions and the following disclaimer in the documentation
 11 | 	  and/or other materials provided with the distribution.
 12 | 	* Neither the name of CM256 nor the names of its contributors may be
 13 | 	  used to endorse or promote products derived from this software without
 14 | 	  specific prior written permission.
 15 | 
 16 | 	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 17 | 	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | 	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 19 | 	ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 20 | 	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 21 | 	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 22 | 	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 23 | 	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 24 | 	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 25 | 	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 26 | 	POSSIBILITY OF SUCH DAMAGE.
 27 | */
 28 | 
 29 | #ifndef CM256_H
 30 | #define CM256_H
 31 | 
 32 | #include "gf256.h"
 33 | 
 34 | #include <assert.h>
 35 | 
 36 | // Library version
 37 | #define CM256_VERSION 2
 38 | 
 39 | 
 40 | #ifdef __cplusplus
 41 | extern "C" {
 42 | #endif
 43 | 
 44 | /*
 45 |  * Verify binary compatibility with the API on startup.
 46 |  *
 47 |  * Example:
 48 |  * 	if (cm256_init()) exit(1);
 49 |  *
 50 |  * Returns 0 on success, and any other code indicates failure.
 51 |  */
 52 | extern int cm256_init_(int version);
 53 | #define cm256_init() cm256_init_(CM256_VERSION)
 54 | 
 55 | 
 56 | // Encoder parameters, passed by value to cm256_encode(), cm256_encode_block() and cm256_decode()
 57 | typedef struct cm256_encoder_params_t {
 58 |     // Original block count < 256
 59 |     int OriginalCount;
 60 | 
 61 |     // Recovery block count; OriginalCount + RecoveryCount must be <= 256
 62 |     int RecoveryCount;
 63 | 
 64 |     // Number of bytes per block (all blocks are the same size in bytes)
 65 |     int BlockBytes;
 66 | } cm256_encoder_params;
 67 | 
 68 | // Descriptor for data block
 69 | typedef struct cm256_block_t {
 70 |     // Pointer to data received.
 71 |     void* Block;
 72 | 
 73 |     // Block index.
 74 |     // For original data, it will be in the range
 75 |     //    [0..(originalCount-1)] inclusive.
 76 |     // For recovery data, the first one's Index must be originalCount,
 77 |     //    and it will be in the range
 78 |     //    [originalCount..(originalCount+recoveryCount-1)] inclusive.
 79 |     unsigned char Index;
 80 |     // Ignored during encoding, required during decoding.
 81 | } cm256_block;
 82 | 
 83 | 
 84 | // Compute the value to put in the Index member of cm256_block (asserts that the given index is in range)
 85 | static inline unsigned char cm256_get_recovery_block_index(cm256_encoder_params params, int recoveryBlockIndex)
 86 | {
 87 |     assert(recoveryBlockIndex >= 0 && recoveryBlockIndex < params.RecoveryCount);
 88 |     return (unsigned char)(params.OriginalCount + recoveryBlockIndex); // Recovery indices follow the originals
 89 | }
 90 | static inline unsigned char cm256_get_original_block_index(cm256_encoder_params params, int originalBlockIndex)
 91 | {
 92 |     (void)params; // Only the assert reads params; avoids unused-parameter warnings when NDEBUG is defined
 93 |     assert(originalBlockIndex >= 0 && originalBlockIndex < params.OriginalCount);
 94 |     return (unsigned char)(originalBlockIndex); // Original blocks use their own position as Index
 95 | }
 96 | 
 97 | /*
 98 |  * Cauchy MDS GF(256) encode
 99 |  *
100 |  * This produces a set of recovery blocks that should be transmitted after the
101 |  * original data blocks.
102 |  *
103 |  * It takes in 'originalCount' equal-sized blocks and produces 'recoveryCount'
104 |  * equally-sized recovery blocks.
105 |  *
106 |  * The input 'originals' array allows more natural usage of the library.
107 |  * The output recovery blocks are stored end-to-end in 'recoveryBlocks'.
108 |  * 'recoveryBlocks' should have recoveryCount * blockBytes bytes available.
109 |  *
110 |  * Precondition: originalCount + recoveryCount <= 256
111 |  *
112 |  * When transmitting the data, the block index of the data should be sent,
113 |  * and the recovery block index is also needed.  The decoder should also
114 |  * be provided with the values of originalCount, recoveryCount and blockBytes.
115 |  *
116 |  * Example wire format:
117 |  * [originalCount(1 byte)] [recoveryCount(1 byte)]
118 |  * [blockIndex(1 byte)] [blockData(blockBytes bytes)]
119 |  *
120 |  * Be careful not to mix blocks from different encoders.
121 |  *
122 |  * It is possible to support variable-length data by including the original
123 |  * data length at the front of each message in 2 bytes, such that when it is
124 |  * recovered after a loss the data length is available in the block data and
125 |  * the remaining bytes of padding can be neglected.
126 |  *
127 |  * Returns 0 on success, and any other code indicates failure.
128 |  */
129 | extern int cm256_encode(
130 |     cm256_encoder_params params, // Encoder parameters
131 |     cm256_block* originals,      // Array of pointers to original blocks
132 |     void* recoveryBlocks);       // Output recovery blocks end-to-end
133 | 
134 | // Encode one block.
135 | // Note: This function does not validate input, use with care.
136 | extern void cm256_encode_block(
137 |     cm256_encoder_params params, // Encoder parameters
138 |     cm256_block* originals,      // Array of pointers to original blocks
139 |     int recoveryBlockIndex,      // Return value from cm256_get_recovery_block_index()
140 |     void* recoveryBlock);        // Output recovery block
141 | 
142 | /*
143 |  * Cauchy MDS GF(256) decode
144 |  *
145 |  * This recovers the original data from the recovery data in the provided
146 |  * blocks.  There should be 'originalCount' blocks in the provided array.
147 |  * Recovery will always be possible if that many blocks are received.
148 |  *
149 |  * Provide the same values for 'originalCount', 'recoveryCount', and
150 |  * 'blockBytes' used by the encoder.
151 |  *
152 |  * The block Index should be set to the block index of the original data,
153 |  * as described in the cm256_block struct comments above.
154 |  *
155 |  * Recovery blocks will be replaced with original data and the Index
156 |  * will be updated to indicate the original block that was recovered.
157 |  *
158 |  * Returns 0 on success, and any other code indicates failure.
159 |  */
160 | extern int cm256_decode(
161 |     cm256_encoder_params params, // Encoder parameters
162 |     cm256_block* blocks);        // Array of 'originalCount' blocks as described above
163 | 
164 | 
165 | #ifdef __cplusplus
166 | }
167 | #endif
168 | 
169 | 
170 | #endif // CM256_H
171 | 


--------------------------------------------------------------------------------
/include/gf256.h:
--------------------------------------------------------------------------------
  1 | /** \file
  2 |     \brief GF(256) Main C API Header
  3 |     \copyright Copyright (c) 2017 Christopher A. Taylor.  All rights reserved.
  4 | 
  5 |     Redistribution and use in source and binary forms, with or without
  6 |     modification, are permitted provided that the following conditions are met:
  7 | 
  8 |     * Redistributions of source code must retain the above copyright notice,
  9 |       this list of conditions and the following disclaimer.
 10 |     * Redistributions in binary form must reproduce the above copyright notice,
 11 |       this list of conditions and the following disclaimer in the documentation
 12 |       and/or other materials provided with the distribution.
 13 |     * Neither the name of GF256 nor the names of its contributors may be
 14 |       used to endorse or promote products derived from this software without
 15 |       specific prior written permission.
 16 | 
 17 |     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 18 |     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 19 |     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 20 |     ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 21 |     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 22 |     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 23 |     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 24 |     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 25 |     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 26 |     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 27 |     POSSIBILITY OF SUCH DAMAGE.
 28 | */
 29 | 
 30 | #ifndef CAT_GF256_H
 31 | #define CAT_GF256_H
 32 | 
 33 | /** \page GF256 GF(256) Math Module
 34 | 
 35 |     This module provides efficient implementations of bulk
 36 |     GF(2^8) math operations over memory buffers.
 37 | 
 38 |     Addition is done over the base field in GF(2) meaning
 39 |     that addition is XOR between memory buffers.
 40 | 
 41 |     Multiplication is performed using table lookups via
 42 |     SIMD instructions.  This is somewhat slower than XOR,
 43 |     but fast enough to not become a major bottleneck when
 44 |     used sparingly.
 45 | */
 46 | 
 47 | #include <stdint.h> // uint32_t etc
 48 | #include <cstring> // memcpy, memset
 49 | 
 50 | /// Library header version
 51 | #define GF256_VERSION 2
 52 | 
 53 | //------------------------------------------------------------------------------
 54 | // Platform/Architecture
 55 | 
 56 | #if defined(__ARM_ARCH) || defined(__ARM_NEON) || defined(__ARM_NEON__)
 57 |     #if !defined IOS
 58 |         #define LINUX_ARM
 59 |     #endif
 60 | #endif
 61 | 
 62 | #if defined(ANDROID) || defined(IOS) || defined(LINUX_ARM) || defined(__powerpc__) || defined(__s390__)
 63 |     #define GF256_TARGET_MOBILE
 64 | #endif // ANDROID
 65 | 
 66 | #if defined(__AVX2__) && (!defined (_MSC_VER) || _MSC_VER >= 1900)
 67 |     #define GF256_TRY_AVX2 /* 256-bit */
 68 |     #include <immintrin.h>
 69 |     #define GF256_ALIGN_BYTES 32
 70 | #else // __AVX2__
 71 |     #define GF256_ALIGN_BYTES 16
 72 | #endif // __AVX2__
 73 | 
 74 | #if !defined(GF256_TARGET_MOBILE)
 75 |     #include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8
 76 |     #include <emmintrin.h> // SSE2
 77 | #endif // GF256_TARGET_MOBILE
 78 | 
 79 | #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 80 |     #include <arm_neon.h>
 81 |     #define GF256_TRY_NEON
 82 | #endif
 83 | 
 84 | // Compiler-specific 128-bit SIMD register keyword
 85 | #if defined(GF256_TARGET_MOBILE)
 86 | #if defined(GF256_TRY_NEON)
 87 |     #define GF256_M128 uint8x16_t
 88 | #else
 89 |     #define GF256_M128 uint64_t
 90 | #endif // GF256_TRY_NEON
 91 | #else // GF256_TARGET_MOBILE
 92 |     #define GF256_M128 __m128i
 93 | #endif // GF256_TARGET_MOBILE
 94 | 
 95 | // Compiler-specific 256-bit SIMD register keyword
 96 | #ifdef GF256_TRY_AVX2
 97 |     #define GF256_M256 __m256i
 98 | #endif
 99 | 
100 | // Compiler-specific C++11 restrict keyword
101 | #define GF256_RESTRICT __restrict
102 | 
103 | // Compiler-specific force inline keyword
104 | #ifdef _MSC_VER
105 |     #define GF256_FORCE_INLINE inline __forceinline
106 | #else
107 |     #define GF256_FORCE_INLINE inline __attribute__((always_inline))
108 | #endif
109 | 
110 | // Compiler-specific alignment keyword
111 | // Note: Alignment only matters for ARM NEON where it should be 16
112 | #ifdef _MSC_VER
113 |     #define GF256_ALIGNED __declspec(align(GF256_ALIGN_BYTES))
114 | #else // _MSC_VER
115 |     #define GF256_ALIGNED __attribute__((aligned(GF256_ALIGN_BYTES)))
116 | #endif // _MSC_VER
117 | 
118 | #ifdef __cplusplus
119 | extern "C" {
120 | #endif // __cplusplus
121 | 
122 | 
123 | //------------------------------------------------------------------------------
124 | // Portability
125 | 
126 | /// Swap two memory buffers in-place
127 | extern void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes);
128 | 
129 | 
130 | //------------------------------------------------------------------------------
131 | // GF(256) Context
132 | 
133 | #ifdef _MSC_VER
134 |     #pragma warning(push)
135 |     #pragma warning(disable: 4324) // warning C4324: 'gf256_ctx' : structure was padded due to __declspec(align())
136 | #endif // _MSC_VER
137 | 
138 | /// The context object stores tables required to perform library calculations
139 | struct gf256_ctx
140 | {
141 |     /// We require memory to be aligned since the SIMD instructions benefit from
142 |     /// or require aligned accesses to the table data.
143 |     struct
144 |     {
145 |         GF256_ALIGNED GF256_M128 TABLE_LO_Y[256];
146 |         GF256_ALIGNED GF256_M128 TABLE_HI_Y[256];
147 |     } MM128;
148 | #ifdef GF256_TRY_AVX2
149 |     struct
150 |     {
151 |         GF256_ALIGNED GF256_M256 TABLE_LO_Y[256];
152 |         GF256_ALIGNED GF256_M256 TABLE_HI_Y[256];
153 |     } MM256;
154 | #endif // GF256_TRY_AVX2 (MM256 is only present in builds that define it)
155 | 
156 |     /// Mul/Div/Inv/Sqr tables; gf256_mul()/gf256_div() index Mul/Div as [(y << 8) + x]
157 |     uint8_t GF256_MUL_TABLE[256 * 256];
158 |     uint8_t GF256_DIV_TABLE[256 * 256];
159 |     uint8_t GF256_INV_TABLE[256];
160 |     uint8_t GF256_SQR_TABLE[256];
161 | 
162 |     /// Log/Exp tables
163 |     uint16_t GF256_LOG_TABLE[256];
164 |     uint8_t GF256_EXP_TABLE[512 * 2 + 1];
165 | 
166 |     /// Polynomial used
167 |     unsigned Polynomial;
168 | };
169 | 
170 | #ifdef _MSC_VER
171 |     #pragma warning(pop)
172 | #endif // _MSC_VER
173 | 
174 | extern gf256_ctx GF256Ctx;
175 | 
176 | 
177 | //------------------------------------------------------------------------------
178 | // Initialization
179 | 
180 | /**
181 |     Initialize a context, filling in the tables.
182 |     
183 |     Thread-safety / Usage Notes:
184 |     
185 |     It is perfectly safe and encouraged to use a gf256_ctx object from multiple
186 |     threads.  The gf256_init() is relatively expensive and should only be done
187 |     once, though it will take less than a millisecond.
188 |     
189 |     The gf256_ctx object must be aligned to a GF256_ALIGN_BYTES boundary (16 bytes,
190 |     or 32 when AVX2 is enabled).  Simply tag the object with GF256_ALIGNED to achieve this.
191 |     
192 |     Example:
193 |        if (gf256_init() != 0)
194 |            return -1; // initialization failed
195 |     
196 |     Returns 0 on success and other values on failure.
197 | */
198 | extern int gf256_init_(int version);
199 | #define gf256_init() gf256_init_(GF256_VERSION)
200 | 
201 | 
202 | //------------------------------------------------------------------------------
203 | // Math Operations
204 | 
205 | /// Returns x + y in GF(256), which is the bitwise XOR of the two bytes
206 | static GF256_FORCE_INLINE uint8_t gf256_add(uint8_t x, uint8_t y)
207 | {
208 |     return (uint8_t)((unsigned)x ^ (unsigned)y);
209 | }
210 | 
211 | /// Returns x * y in GF(256) using a single lookup in the 256x256 product table.
212 | /// The table row is selected by y, so put a repeatedly used constant in y.
213 | static GF256_FORCE_INLINE uint8_t gf256_mul(uint8_t x, uint8_t y)
214 | {
215 |     return (GF256Ctx.GF256_MUL_TABLE + ((unsigned)y << 8))[x];
216 | }
217 | 
218 | /// Returns x / y in GF(256) using a single lookup in the 256x256 quotient table.
219 | /// The table row is selected by y, which favors memory access for constant divisors.
220 | static GF256_FORCE_INLINE uint8_t gf256_div(uint8_t x, uint8_t y)
221 | {
222 |     return (GF256Ctx.GF256_DIV_TABLE + ((unsigned)y << 8))[x];
223 | }
224 | 
225 | /// Returns the multiplicative inverse 1 / x, read from the inverse table
226 | static GF256_FORCE_INLINE uint8_t gf256_inv(uint8_t x)
227 | {
228 |     return *(GF256Ctx.GF256_INV_TABLE + x);
229 | }
230 | 
231 | /// Returns the square x * x, read from the squaring table
232 | static GF256_FORCE_INLINE uint8_t gf256_sqr(uint8_t x)
233 | {
234 |     return *(GF256Ctx.GF256_SQR_TABLE + x);
235 | }
236 | 
237 | 
238 | //------------------------------------------------------------------------------
239 | // Bulk Memory Math Operations
240 | 
241 | /// Performs "x[] += y[]" bulk memory XOR operation
242 | extern void gf256_add_mem(void * GF256_RESTRICT vx,
243 |                           const void * GF256_RESTRICT vy, int bytes);
244 | 
245 | /// Performs "z[] += x[] + y[]" bulk memory operation
246 | extern void gf256_add2_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx,
247 |                            const void * GF256_RESTRICT vy, int bytes);
248 | 
249 | /// Performs "z[] = x[] + y[]" bulk memory operation
250 | extern void gf256_addset_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx,
251 |                              const void * GF256_RESTRICT vy, int bytes);
252 | 
253 | /// Performs "z[] = x[] * y" bulk memory operation
254 | extern void gf256_mul_mem(void * GF256_RESTRICT vz,
255 |                           const void * GF256_RESTRICT vx, uint8_t y, int bytes);
256 | 
257 | /// Performs "z[] += x[] * y" bulk memory operation
258 | extern void gf256_muladd_mem(void * GF256_RESTRICT vz, uint8_t y,
259 |                              const void * GF256_RESTRICT vx, int bytes);
260 | 
261 | /// Performs "z[] = x[] / y" bulk memory operation
262 | static GF256_FORCE_INLINE void gf256_div_mem(void * GF256_RESTRICT vz,
263 |                                              const void * GF256_RESTRICT vx, uint8_t y, int bytes)
264 | {
265 |     // Dividing by y is multiplying by 1 / y; for y == 1 the factor is 1 and no lookup is made
266 |     gf256_mul_mem(vz, vx, (y != 1) ? GF256Ctx.GF256_INV_TABLE[y] : (uint8_t)1, bytes);
267 | }
268 | 
269 | 
270 | //------------------------------------------------------------------------------
271 | // Misc Operations
272 | 
273 | #ifdef __cplusplus
274 | }
275 | #endif // __cplusplus
276 | 
277 | #endif // CAT_GF256_H
278 | 


--------------------------------------------------------------------------------
/src/cm256.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | 	Copyright (c) 2015 Christopher A. Taylor.  All rights reserved.
  3 | 
  4 | 	Redistribution and use in source and binary forms, with or without
  5 | 	modification, are permitted provided that the following conditions are met:
  6 | 
  7 | 	* Redistributions of source code must retain the above copyright notice,
  8 | 	  this list of conditions and the following disclaimer.
  9 | 	* Redistributions in binary form must reproduce the above copyright notice,
 10 | 	  this list of conditions and the following disclaimer in the documentation
 11 | 	  and/or other materials provided with the distribution.
 12 | 	* Neither the name of CM256 nor the names of its contributors may be
 13 | 	  used to endorse or promote products derived from this software without
 14 | 	  specific prior written permission.
 15 | 
 16 | 	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 17 | 	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | 	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 19 | 	ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 20 | 	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 21 | 	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 22 | 	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 23 | 	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 24 | 	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 25 | 	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 26 | 	POSSIBILITY OF SUCH DAMAGE.
 27 | */
 28 | 
 29 | #include "cm256.h"
 30 | 
 31 | 
 32 | /*
 33 |     GF(256) Cauchy Matrix Overview
 34 | 
 35 |     As described on Wikipedia, each element of a normal Cauchy matrix is defined as:
 36 | 
 37 |         a_ij = 1 / (x_i - y_j)
 38 |         The arrays x_i and y_j are vector parameters of the matrix.
 39 |         The values in x_i cannot be reused in y_j.
 40 | 
 41 |     Moving beyond the Wikipedia...
 42 | 
 43 |     (1) Number of rows (R) is the range of i, and number of columns (C) is the range of j.
 44 | 
 45 |     (2) Being able to select x_i and y_j makes Cauchy matrices more flexible in practice
 46 |         than Vandermonde matrices, which only have one parameter per row.
 47 | 
 48 |     (3) Cauchy matrices are always invertible, AKA always full rank, AKA when treated as
 49 |         a linear system y = M*x, the linear system has a single solution.
 50 | 
 51 |     (4) A Cauchy matrix concatenated below a square CxC identity matrix always has rank C,
 52 |         Meaning that any R rows can be eliminated from the concatenated matrix and the
 53 |         matrix will still be invertible.  This is how Reed-Solomon erasure codes work.
 54 | 
 55 |     (5) Any row or column can be multiplied by non-zero values, and the resulting matrix
 56 |         is still full rank.  This is true for any matrix, since it is effectively the same
 57 |         as pre and post multiplying by diagonal matrices, which are always invertible.
 58 | 
 59 |     (6) Matrix elements with a value of 1 are much faster to operate on than other values.
 60 |         For instance a matrix of [1, 1, 1, 1, 1] is invertible and much faster for various
 61 |         purposes than [2, 2, 2, 2, 2].
 62 | 
 63 |     (7) For GF(256) matrices, the symbols in x_i and y_j are selected from the numbers
 64 |         0...255, and so the number of rows + number of columns may not exceed 256.
 65 |         Note that values in x_i and y_j may not be reused as stated above.
 66 | 
 67 |     In summary, Cauchy matrices
 68 |         are preferred over Vandermonde matrices.  (2)
 69 |         are great for MDS erasure codes.  (3) and (4)
 70 |         should be optimized to include more 1 elements.  (5) and (6)
 71 |         have a limited size in GF(256), rows+cols <= 256.  (7)
 72 | */
 73 | 
 74 | 
 75 | //-----------------------------------------------------------------------------
 76 | // Initialization
 77 | 
 78 | extern "C" int cm256_init_(int version)
 79 | {
 80 |     // The caller passes the CM256_VERSION its header was compiled with;
 81 |     // -10 reports that it differs from the version of this library.
 82 |     const bool headerMatchesLibrary = (version == CM256_VERSION);
 83 | 
 84 |     // On a match the result is whatever the GF(256) init reports
 85 |     const int result = headerMatchesLibrary ? gf256_init() : -10;
 86 | 
 87 |     return result;
 88 | }
 89 | 
 90 | 
 91 | /*
 92 |     Selected Cauchy Matrix Form
 93 | 
 94 |     The matrix consists of elements a_ij, where i = row, j = column.
 95 |     a_ij = 1 / (x_i - y_j), where x_i and y_j are sets of GF(256) values
 96 |     that do not intersect.
 97 | 
 98 |     We select x_i and y_j to just be incrementing numbers for the
 99 |     purposes of this library.  Further optimizations may yield matrices
100 |     with more 1 elements, but the benefit seems relatively small.
101 | 
102 |     The y_j values range from 0...(originalCount - 1).
103 |     The x_i values range from originalCount...(originalCount + recoveryCount - 1).
104 | 
105 |     We then improve the Cauchy matrix by dividing each column by the
106 |     first row element of that column.  The result is an invertible
107 |     matrix that has all 1 elements in the first row.  This is equivalent
108 |     to a rotated Vandermonde matrix, so we could have used one of those.
109 | 
110 |     The advantage of doing this is that operations involving the first
111 |     row will be extremely fast (just memory XOR), so the decoder can
112 |     be optimized to take advantage of the shortcut when the first
113 |     recovery row can be used.
114 | 
115 |     First row element of Cauchy matrix for each column:
116 |     a_0j = 1 / (x_0 - y_j)
117 | 
118 |     Our Cauchy matrix sets first row to ones, so:
119 |     a_ij = (1 / (x_i - y_j)) / a_0j
120 |     a_ij = (y_j - x_0) / (x_i - y_j)
121 |     a_ij = (y_j + x_0) div (x_i + y_j) in GF(256)
122 | */
123 | 
124 | // Computes the matrix element (y_j + x_0) / (x_i + y_j) in GF(256) from x_i, x_0, y_j.
125 | // For x_i == x_0 the result is 1, so callers do better to unroll the first row.
126 | static GF256_FORCE_INLINE unsigned char GetMatrixElement(unsigned char x_i, unsigned char x_0, unsigned char y_j)
127 | {
128 |     const uint8_t numerator = gf256_add(y_j, x_0), denominator = gf256_add(x_i, y_j); return gf256_div(numerator, denominator);
129 | }
130 | 
131 | 
132 | //-----------------------------------------------------------------------------
133 | // Encoding
134 | 
135 | extern "C" void cm256_encode_block(
136 |     cm256_encoder_params params, // Encoder parameters
137 |     cm256_block* originals,      // Array of pointers to original blocks
138 |     int recoveryBlockIndex,      // Return value from cm256_get_recovery_block_index()
139 |     void* recoveryBlock)         // Output recovery block
140 | {
141 |     // Degenerate case: with a single original block there is nothing to
142 |     // combine, so every recovery block is just a copy of that block.
143 |     if (params.OriginalCount == 1)
144 |     {
145 |         memcpy(recoveryBlock, originals[0].Block, params.BlockBytes);
146 |         return;
147 |     }
148 |     // Otherwise at least two original blocks are expected.
149 | 
150 |     // The first recovery row of the matrix is all ones, which makes its
151 |     // block the plain XOR parity of the original data.
152 |     const bool isParityRow = (recoveryBlockIndex == params.OriginalCount);
153 |     if (isParityRow)
154 |     {
155 |         // Seed the output with blocks 0 and 1, then fold in the rest
156 |         gf256_addset_mem(recoveryBlock, originals[0].Block, originals[1].Block, params.BlockBytes);
157 | 
158 |         int column = 2;
159 |         while (column < params.OriginalCount)
160 |         {
161 |             gf256_add_mem(recoveryBlock, originals[column].Block, params.BlockBytes);
162 |             ++column;
163 |         }
164 |         return;
165 |     }
166 | 
167 |     // TBD: Faster algorithms seem to exist for computing this matrix-vector product.
168 | 
169 |     // Matrix parameters: x_0 starts arbitrarily at the original count and
170 |     // x_i is the requested recovery row, both truncated to 8 bits.
171 |     const uint8_t x_0 = static_cast<uint8_t>(params.OriginalCount);
172 |     const uint8_t x_i = static_cast<uint8_t>(recoveryBlockIndex);
173 | 
174 |     // Column 0 is handled with a multiply-set, which initializes the output
175 |     // buffer with the first product.
176 |     const uint8_t firstElement = GetMatrixElement(x_i, x_0, 0);
177 |     gf256_mul_mem(recoveryBlock, originals[0].Block, firstElement, params.BlockBytes);
178 | 
179 |     // Every further original column y_j = j is accumulated with a
180 |     // multiply-add into the same buffer.
181 |     int j = 1;
182 |     while (j < params.OriginalCount)
183 |     {
184 |         const uint8_t y_j = static_cast<uint8_t>(j);
185 |         const uint8_t matrixElement = GetMatrixElement(x_i, x_0, y_j);
186 | 
187 |         gf256_muladd_mem(recoveryBlock, matrixElement, originals[j].Block, params.BlockBytes);
188 |         ++j;
189 |     }
190 | }
191 | 
192 | extern "C" int cm256_encode(
193 |     cm256_encoder_params params, // Encoder params
194 |     cm256_block* originals,      // Array of pointers to original blocks
195 |     void* recoveryBlocks)        // Output recovery blocks end-to-end
196 | {
197 |     // -1: both counts and the block size must be strictly positive
198 |     const bool allPositive =
199 |         params.OriginalCount > 0 && params.RecoveryCount > 0 && params.BlockBytes > 0;
200 |     if (!allPositive)
201 |     {
202 |         return -1;
203 |     }
204 |     // -2: originals plus recovery blocks may not exceed 256 in total
205 |     if (256 < params.OriginalCount + params.RecoveryCount)
206 |     {
207 |         return -2;
208 |     }
209 |     // -3: both the input array and the output buffer are mandatory
210 |     if (originals == nullptr || recoveryBlocks == nullptr)
211 |     {
212 |         return -3;
213 |     }
214 | 
215 |     // Recovery blocks are written back to back, BlockBytes apart, for the
216 |     // rows OriginalCount, OriginalCount + 1, ... in that order.
217 |     uint8_t* output = static_cast<uint8_t*>(recoveryBlocks);
218 |     for (int produced = 0; produced != params.RecoveryCount; ++produced, output += params.BlockBytes)
219 |         cm256_encode_block(params, originals, params.OriginalCount + produced, output);
220 |     return 0;
221 | }
222 | 
223 | 
224 | //-----------------------------------------------------------------------------
225 | // Decoding
226 | 
227 | struct CM256Decoder
228 | {
229 |     // Copy of the encoder parameters handed to Initialize()
230 |     cm256_encoder_params Params;
231 | 
232 |     // Input blocks whose row index is not below Params.OriginalCount (recovery rows)
233 |     cm256_block* Recovery[256];
234 |     int RecoveryCount;
235 | 
236 |     // Input blocks whose row index is below Params.OriginalCount (original data)
237 |     cm256_block* Original[256];
238 |     int OriginalCount;
239 | 
240 |     // After Initialize(): the first RecoveryCount entries list the erased original rows
241 |     uint8_t ErasuresIndices[256];
242 | 
243 |     // Classify the input blocks and find the erasures; returns false if the input is rejected
244 |     bool Initialize(cm256_encoder_params& params, cm256_block* blocks);
245 | 
246 |     // Decode m=1 case: XORs the received originals into Recovery[0]
247 |     void DecodeM1();
248 | 
249 |     // Decode for m>1 case: solves the system through the LDU decomposition
250 |     void Decode();
251 | 
252 |     // Generate the LDU decomposition of the matrix into the three output arrays
253 |     void GenerateLDUDecomposition(uint8_t* matrix_L, uint8_t* diag_D, uint8_t* matrix_U);
254 | };
255 | 
256 | bool CM256Decoder::Initialize(cm256_encoder_params& params, cm256_block* blocks)
257 | {
258 |     // Splits the params.OriginalCount input blocks into Original[] (row index
259 |     // below params.OriginalCount) and Recovery[] (everything else), then lists
260 |     // the original rows that are missing in ErasuresIndices[0..RecoveryCount).
261 |     // Returns false if any row index occurs twice; a repeated recovery row
262 |     // would make the decoding matrix singular.
263 |     Params = params;
264 |     OriginalCount = 0;
265 |     RecoveryCount = 0;
266 | 
267 |     // seen[row] is set once a block carrying that row index was accepted.
268 |     // A local, fully zeroed table is used so that no unwritten entry of
269 |     // ErasuresIndices is ever read.
270 |     bool seen[256] = {};
271 | 
272 |     // For each input block,
273 |     for (int ii = 0; ii < params.OriginalCount; ++ii)
274 |     {
275 |         cm256_block* block = blocks + ii;
276 |         const int row = block->Index;
277 | 
278 |         // Row values outside 0...255 cannot index the table (or GF(256))
279 |         if (row < 0 || row > 255)
280 |         {
281 |             return false;
282 |         }
283 | 
284 |         // Error out if two row indices repeat
285 |         if (seen[row])
286 |         {
287 |             return false;
288 |         }
289 |         seen[row] = true;
290 | 
291 |         if (row < params.OriginalCount)
292 |         {
293 |             Original[OriginalCount++] = block;
294 |         }
295 |         else
296 |         {
297 |             Recovery[RecoveryCount++] = block;
298 |         }
299 |     }
300 | 
301 |     // Identify erasures: every original row without a block.  Because all
302 |     // accepted rows are distinct there are exactly RecoveryCount of them.
303 |     for (int row = 0, found = 0; row < params.OriginalCount && found < RecoveryCount; ++row)
304 |     {
305 |         if (!seen[row])
306 |             ErasuresIndices[found++] = static_cast<uint8_t>(row);
307 |     }
308 |     return true;
309 | }
310 | 
311 | void CM256Decoder::DecodeM1()
312 | {
313 |     // Every received original block is XORed into the single recovery
314 |     // block, which is decoded in place.
315 |     uint8_t* const target = static_cast<uint8_t*>(Recovery[0]->Block);
316 | 
317 |     // Blocks are folded in two at a time: 'pending' remembers the first
318 |     // block of a pair until its partner arrives.
319 |     const uint8_t* pending = nullptr;
320 |     for (int ii = 0; ii < OriginalCount; ++ii)
321 |     {
322 |         const uint8_t* current = static_cast<const uint8_t*>(Original[ii]->Block);
323 | 
324 |         if (pending == nullptr)
325 |         {
326 |             pending = current;
327 |             continue;
328 |         }
329 | 
330 |         // target ^= pending ^ current
331 |         gf256_add2_mem(target, pending, current, Params.BlockBytes);
332 |         pending = nullptr;
333 |     }
334 | 
335 |     // An odd number of blocks leaves one without a partner
336 |     if (pending != nullptr)
337 |     {
338 |         gf256_add_mem(target, pending, Params.BlockBytes);
339 |     }
340 | 
341 |     Recovery[0]->Index = ErasuresIndices[0]; // relabel with the erased row index
342 | }
343 | 
344 | // Generate the LDU decomposition of the NxN decoding matrix (N = RecoveryCount) into matrix_L, diag_D, matrix_U
345 | void CM256Decoder::GenerateLDUDecomposition(uint8_t* matrix_L, uint8_t* diag_D, uint8_t* matrix_U)
346 | {
347 |     // Schur-type-direct-Cauchy algorithm 2.5 from
348 |     // "Pivoting and Backward Stability of Fast Algorithms for Solving Cauchy Linear Equations"
349 |     // T. Boros, T. Kailath, V. Olshevsky
350 |     // Modified for practical use.  I folded the diagonal parts of U/L matrices into the
351 |     // diagonal one to reduce the number of multiplications to perform against the input data,
352 |     // and organized the triangle matrices in memory to allow for faster SSE3 GF multiplications.
353 | 
354 |     // Matrix size NxN, one row per recovery block in use
355 |     const int N = RecoveryCount;
356 | 
357 |     // Generators, all starting at 1; entries j > k are updated after each step k
358 |     uint8_t g[256], b[256];
359 |     for (int i = 0; i < N; ++i)
360 |     {
361 |         g[i] = 1;
362 |         b[i] = 1;
363 |     }
364 | 
365 |     // Temporary buffer for rotated row of U matrix
366 |     // This allows for faster GF bulk multiplication
367 |     uint8_t rotated_row_U[256];
368 |     uint8_t* last_U = matrix_U + ((N - 1) * N) / 2 - 1; // final slot of the (N-1)*N/2 packed U entries
369 |     int firstOffset_U = 0; // offset from last_U of the slot for the first entry of row k
370 | 
371 |     // Start the x_0 values arbitrarily from the original count.
372 |     const uint8_t x_0 = static_cast<uint8_t>(Params.OriginalCount);
373 | 
374 |     // Unrolling k = 0 just makes it slower for some reason.
375 |     for (int k = 0; k < N - 1; ++k)
376 |     {
377 |         const uint8_t x_k = Recovery[k]->Index;
378 |         const uint8_t y_k = ErasuresIndices[k];
379 | 
380 |         // D_kk = (x_k + y_k)
381 |         // L_kk = g[k] / (x_k + y_k)
382 |         // U_kk = b[k] * (x_0 + y_k) / (x_k + y_k)
383 |         const uint8_t D_kk = gf256_add(x_k, y_k);
384 |         const uint8_t L_kk = gf256_div(g[k], D_kk);
385 |         const uint8_t U_kk = gf256_mul(gf256_div(b[k], D_kk), gf256_add(x_0, y_k));
386 | 
387 |         // diag_D[k] = D_kk * L_kk * U_kk
388 |         diag_D[k] = gf256_mul(D_kk, gf256_mul(L_kk, U_kk));
389 | 
390 |         // Computing the k-th column of L (appended to matrix_L) and the k-th row of U (staged in rotated_row_U)
391 |         uint8_t* row_L = matrix_L;
392 |         uint8_t* row_U = rotated_row_U;
393 |         for (int j = k + 1; j < N; ++j)
394 |         {
395 |             const uint8_t x_j = Recovery[j]->Index;
396 |             const uint8_t y_j = ErasuresIndices[j];
397 | 
398 |             // L_jk = g[j] / (x_j + y_k)
399 |             // U_kj = b[j] / (x_k + y_j)
400 |             const uint8_t L_jk = gf256_div(g[j], gf256_add(x_j, y_k));
401 |             const uint8_t U_kj = gf256_div(b[j], gf256_add(x_k, y_j));
402 | 
403 |             *matrix_L++ = L_jk;
404 |             *row_U++ = U_kj;
405 | 
406 |             // g[j] = g[j] * (x_j + x_k) / (x_j + y_k)
407 |             // b[j] = b[j] * (y_j + y_k) / (y_j + x_k)
408 |             g[j] = gf256_mul(g[j], gf256_div(gf256_add(x_j, x_k), gf256_add(x_j, y_k)));
409 |             b[j] = gf256_mul(b[j], gf256_div(gf256_add(y_j, y_k), gf256_add(y_j, x_k)));
410 |         }
411 | 
412 |         // Do these row/column divisions in bulk for speed.
413 |         // L_jk /= L_kk
414 |         // U_kj /= U_kk
415 |         const int count = N - (k + 1);
416 |         gf256_div_mem(row_L, row_L, L_kk, count); // in place over the entries just appended
417 |         gf256_div_mem(rotated_row_U, rotated_row_U, U_kk, count);
418 | 
419 |         // Copy U matrix row into place in memory.
420 |         uint8_t* output_U = last_U + firstOffset_U;
421 |         row_U = rotated_row_U;
422 |         for (int j = k + 1; j < N; ++j)
423 |         {
424 |             *output_U = *row_U++;
425 |             output_U -= j; // column j holds j entries, so this reaches row k of column j + 1
426 |         }
427 |         firstOffset_U -= k + 2; // move to the slot of the first entry of row k + 1
428 |     }
429 | 
430 |     // Multiply diagonal matrix into U: the j entries of column j are scaled by (x_0 + y_j)
431 |     uint8_t* row_U = matrix_U;
432 |     for (int j = N - 1; j > 0; --j)
433 |     {
434 |         const uint8_t y_j = ErasuresIndices[j];
435 |         const int count = j;
436 | 
437 |         gf256_mul_mem(row_U, row_U, gf256_add(x_0, y_j), count);
438 |         row_U += count;
439 |     }
440 | 
441 |     const uint8_t x_n = Recovery[N - 1]->Index;
442 |     const uint8_t y_n = ErasuresIndices[N - 1];
443 | 
444 |     // D_nn = 1 / (x_n + y_n)
445 |     // L_nn = g[N-1]
446 |     // U_nn = b[N-1] * (x_0 + y_n)
447 |     const uint8_t L_nn = g[N - 1];
448 |     const uint8_t U_nn = gf256_mul(b[N - 1], gf256_add(x_0, y_n));
449 | 
450 |     // diag_D[N-1] = L_nn * D_nn * U_nn
451 |     diag_D[N - 1] = gf256_div(gf256_mul(L_nn, U_nn), gf256_add(x_n, y_n));
452 | }
453 | 
454 | void CM256Decoder::Decode()
455 | {
456 |     // The system to solve is NxN, N being the number of recovery blocks in use.
457 |     const int N = RecoveryCount;
458 | 
459 |     // x_0 starts arbitrarily at the original count, as in the encoder.
460 |     const uint8_t x_0 = static_cast<uint8_t>(Params.OriginalCount);
461 | 
462 |     // Step 1: eliminate the received original data from the recovery rows by
463 |     // adding each original, scaled by its matrix element, into every row.
464 |     for (int received = 0; received < OriginalCount; ++received)
465 |     {
466 |         const cm256_block* source = Original[received];
467 |         const uint8_t* sourceData = static_cast<const uint8_t*>(source->Block);
468 |         const uint8_t y_j = source->Index;
469 | 
470 |         for (int row = 0; row < N; ++row)
471 |         {
472 |             cm256_block* target = Recovery[row];
473 |             uint8_t* targetData = static_cast<uint8_t*>(target->Block);
474 |             const uint8_t x_i = target->Index;
475 |             const uint8_t matrixElement = GetMatrixElement(x_i, x_0, y_j);
476 | 
477 |             gf256_muladd_mem(targetData, matrixElement, sourceData, Params.BlockBytes);
478 |         }
479 |     }
480 | 
481 |     // Step 2: obtain N*N bytes of scratch space for the decomposition.
482 |     // Small systems use the stack buffer, larger ones a heap allocation.
483 |     static const int StackAllocSize = 2048;
484 |     uint8_t stackMatrix[StackAllocSize];
485 |     const int requiredSpace = N * N;
486 |     uint8_t* const dynamicMatrix =
487 |         (requiredSpace > StackAllocSize) ? new uint8_t[requiredSpace] : nullptr;
488 |     uint8_t* const matrix = (dynamicMatrix != nullptr) ? dynamicMatrix : stackMatrix;
489 | 
490 |     /*
491 |         Step 3: compute the matrix decomposition
492 | 
493 |             G = L * D * U
494 | 
495 |         L is lower-triangular, diagonal is all ones.
496 |         D is a diagonal matrix.
497 |         U is upper-triangular, diagonal is all ones.
498 | 
499 |         The scratch space holds U first ((N-1)*N/2 bytes), then the N
500 |         diagonal entries of D, then L.
501 |     */
502 |     uint8_t* const matrix_U = matrix;
503 |     uint8_t* const diag_D = matrix_U + (N - 1) * N / 2;
504 |     uint8_t* const matrix_L = diag_D + N;
505 |     GenerateLDUDecomposition(matrix_L, diag_D, matrix_U);
506 | 
507 |     // Step 4: eliminate the lower-left triangle.
508 |     // L is read front to back: stored column-first, top-down.
509 |     const uint8_t* next_L = matrix_L;
510 |     for (int col = 0; col + 1 < N; ++col)
511 |     {
512 |         const void* pivotBlock = Recovery[col]->Block;
513 | 
514 |         for (int row = col + 1; row < N; ++row)
515 |         {
516 |             void* block_i = Recovery[row]->Block;
517 |             const uint8_t c_ij = *next_L++;
518 | 
519 |             gf256_muladd_mem(block_i, c_ij, pivotBlock, Params.BlockBytes);
520 |         }
521 |     }
522 | 
523 |     // Step 5: eliminate the diagonal.  Each block is relabeled with the
524 |     // erased row index it stands for and divided by its entry of D.
525 |     for (int row = 0; row < N; ++row)
526 |     {
527 |         cm256_block* target = Recovery[row];
528 | 
529 |         target->Index = ErasuresIndices[row];
530 |         gf256_div_mem(target->Block, target->Block, diag_D[row], Params.BlockBytes);
531 |     }
532 | 
533 |     // Step 6: eliminate the upper-right triangle.
534 |     // U is read front to back: stored column-first, bottom-up.
535 |     const uint8_t* next_U = matrix_U;
536 |     for (int col = N - 1; col > 0; --col)
537 |     {
538 |         const void* pivotBlock = Recovery[col]->Block;
539 | 
540 |         for (int row = col - 1; row >= 0; --row)
541 |         {
542 |             void* block_i = Recovery[row]->Block;
543 |             const uint8_t c_ij = *next_U++;
544 | 
545 |             gf256_muladd_mem(block_i, c_ij, pivotBlock, Params.BlockBytes);
546 |         }
547 |     }
548 | 
549 |     // Release the heap buffer if one was used; when the stack buffer
550 |     // sufficed the pointer is null and the delete does nothing.
551 |     delete[] dynamicMatrix;
552 | }
553 | 
554 | extern "C" int cm256_decode(
555 |     cm256_encoder_params params, // Encoder params
556 |     cm256_block* blocks)         // Array of 'originalCount' blocks as described above
557 | {
558 |     // -1: both counts and the block size must be strictly positive
559 |     const bool allPositive =
560 |         params.OriginalCount > 0 && params.RecoveryCount > 0 && params.BlockBytes > 0;
561 |     if (!allPositive)
562 |     {
563 |         return -1;
564 |     }
565 |     // -2: originals plus recovery blocks may not exceed 256 in total
566 |     if (256 < params.OriginalCount + params.RecoveryCount)
567 |     {
568 |         return -2;
569 |     }
570 |     // -3: the block array is mandatory
571 |     if (blocks == nullptr)
572 |     {
573 |         return -3;
574 |     }
575 | 
576 |     // With a single original block every block carries the same data,
577 |     // so the one supplied block only needs to be labeled as row 0.
578 |     if (params.OriginalCount == 1)
579 |     {
580 |         blocks[0].Index = 0;
581 |         return 0;
582 |     }
583 | 
584 |     // -5: the decoder state rejected the set of block indices
585 |     CM256Decoder state;
586 |     const bool accepted = state.Initialize(params, blocks);
587 |     if (!accepted)
588 |     {
589 |         return -5;
590 |     }
591 | 
592 |     // Decoding is needed only if at least one recovery block was supplied.
593 |     // m=1 has a dedicated XOR-only path; m>1 uses the matrix decoder.
594 |     if (state.RecoveryCount > 0)
595 |     {
596 |         if (params.RecoveryCount == 1)
597 |             state.DecodeM1();
598 |         else
599 |             state.Decode();
600 |     }
601 | 
602 |     return 0;
603 | }
604 | 


--------------------------------------------------------------------------------
/src/gf256.cpp:
--------------------------------------------------------------------------------
   1 | /** \file
   2 |     \brief GF(256) Main C API Source
   3 |     \copyright Copyright (c) 2017 Christopher A. Taylor.  All rights reserved.
   4 | 
   5 |     Redistribution and use in source and binary forms, with or without
   6 |     modification, are permitted provided that the following conditions are met:
   7 | 
   8 |     * Redistributions of source code must retain the above copyright notice,
   9 |       this list of conditions and the following disclaimer.
  10 |     * Redistributions in binary form must reproduce the above copyright notice,
  11 |       this list of conditions and the following disclaimer in the documentation
  12 |       and/or other materials provided with the distribution.
  13 |     * Neither the name of GF256 nor the names of its contributors may be
  14 |       used to endorse or promote products derived from this software without
  15 |       specific prior written permission.
  16 | 
  17 |     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18 |     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19 |     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20 |     ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  21 |     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  22 |     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  23 |     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  24 |     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  25 |     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  26 |     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  27 |     POSSIBILITY OF SUCH DAMAGE.
  28 | */
  29 | 
  30 | #include "gf256.h"
  31 | 
  32 | #ifdef LINUX_ARM
  33 | #include <unistd.h>
  34 | #include <fcntl.h>
  35 | #include <elf.h>
  36 | #include <linux/auxvec.h>
  37 | #endif
  38 | 
  39 | //------------------------------------------------------------------------------
  40 | // Detect host byte order.
  41 | // This check works with GCC and LLVM; assume little-endian byte order when
  42 | // using any other compiler.
  43 | // The result is verified during initialization.
  44 | //
  45 | #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) \
  46 |     && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  47 | #define GF256_IS_BIG_ENDIAN
  48 | #endif
  49 | 
  50 | //------------------------------------------------------------------------------
  51 | // Workaround for ARMv7 that doesn't provide vqtbl1_*
  52 | // This comes from linux-raid (https://www.spinics.net/lists/raid/msg58403.html)
  53 | //
  54 | #ifdef GF256_TRY_NEON
  55 | #if __ARM_ARCH <= 7 && !defined(__aarch64__)
  56 | static GF256_FORCE_INLINE uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
  57 | {
  58 |     union { // the 16-byte table 'a' viewed as the pair of 8-byte halves passed to vtbl2_u8
  59 |         uint8x16_t    whole;
  60 |         uint8x8x2_t    halves;
  61 |     } table = { a };
  62 |     const uint8x8_t lowResult = vtbl2_u8(table.halves, vget_low_u8(b));
  63 |     const uint8x8_t highResult = vtbl2_u8(table.halves, vget_high_u8(b));
  64 |     return vcombine_u8(lowResult, highResult);
  65 | }
  66 | #endif
  67 | #endif
  68 | 
  69 | //------------------------------------------------------------------------------
  70 | // Self-Test
  71 | //
  72 | // This is executed during initialization to make sure the library is working
  73 | 
  74 | static const unsigned kTestBufferBytes = 32 + 16 + 8 + 4 + 2 + 1; // 63 bytes handed to each bulk-operation test
  75 | static const unsigned kTestBufferAllocated = 64; // leaves index kTestBufferBytes free for the 0x5a overrun marker
  76 | struct SelfTestBuffersT // scratch operands A, B, C used by gf256_self_test()
  77 | {
  78 |     GF256_ALIGNED uint8_t A[kTestBufferAllocated];
  79 |     GF256_ALIGNED uint8_t B[kTestBufferAllocated];
  80 |     GF256_ALIGNED uint8_t C[kTestBufferAllocated];
  81 | };
  82 | static GF256_ALIGNED SelfTestBuffersT m_SelfTestBuffers; // alignment of A, B, C is verified by the self-test
  83 | 
  84 | static bool gf256_self_test()
  85 | {
  86 |     if ((uintptr_t)m_SelfTestBuffers.A % GF256_ALIGN_BYTES != 0)
  87 |         return false;
  88 |     if ((uintptr_t)m_SelfTestBuffers.A % GF256_ALIGN_BYTES != 0)
  89 |         return false;
  90 |     if ((uintptr_t)m_SelfTestBuffers.B % GF256_ALIGN_BYTES != 0)
  91 |         return false;
  92 |     if ((uintptr_t)m_SelfTestBuffers.C % GF256_ALIGN_BYTES != 0)
  93 |         return false;
  94 | 
  95 |     // Check multiplication/division
  96 |     for (unsigned i = 0; i < 256; ++i)
  97 |     {
  98 |         for (unsigned j = 0; j < 256; ++j)
  99 |         {
 100 |             uint8_t prod = gf256_mul((uint8_t)i, (uint8_t)j);
 101 |             if (i != 0 && j != 0)
 102 |             {
 103 |                 uint8_t div1 = gf256_div(prod, (uint8_t)i);
 104 |                 if (div1 != j)
 105 |                     return false;
 106 |                 uint8_t div2 = gf256_div(prod, (uint8_t)j);
 107 |                 if (div2 != i)
 108 |                     return false;
 109 |             }
 110 |             else if (prod != 0)
 111 |                 return false;
 112 |             if (j == 1 && prod != i)
 113 |                 return false;
 114 |         }
 115 |     }
 116 | 
 117 |     // Check for overruns
 118 |     m_SelfTestBuffers.A[kTestBufferBytes] = 0x5a;
 119 |     m_SelfTestBuffers.B[kTestBufferBytes] = 0x5a;
 120 |     m_SelfTestBuffers.C[kTestBufferBytes] = 0x5a;
 121 | 
 122 |     // Test gf256_add_mem()
 123 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 124 |     {
 125 |         m_SelfTestBuffers.A[i] = 0x1f;
 126 |         m_SelfTestBuffers.B[i] = 0xf7;
 127 |     }
 128 |     gf256_add_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, kTestBufferBytes);
 129 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 130 |         if (m_SelfTestBuffers.A[i] != (0x1f ^ 0xf7))
 131 |             return false;
 132 | 
 133 |     // Test gf256_add2_mem()
 134 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 135 |     {
 136 |         m_SelfTestBuffers.A[i] = 0x1f;
 137 |         m_SelfTestBuffers.B[i] = 0xf7;
 138 |         m_SelfTestBuffers.C[i] = 0x71;
 139 |     }
 140 |     gf256_add2_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, m_SelfTestBuffers.C, kTestBufferBytes);
 141 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 142 |         if (m_SelfTestBuffers.A[i] != (0x1f ^ 0xf7 ^ 0x71))
 143 |             return false;
 144 | 
 145 |     // Test gf256_addset_mem()
 146 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 147 |     {
 148 |         m_SelfTestBuffers.A[i] = 0x55;
 149 |         m_SelfTestBuffers.B[i] = 0xaa;
 150 |         m_SelfTestBuffers.C[i] = 0x6c;
 151 |     }
 152 |     gf256_addset_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, m_SelfTestBuffers.C, kTestBufferBytes);
 153 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 154 |         if (m_SelfTestBuffers.A[i] != (0xaa ^ 0x6c))
 155 |             return false;
 156 | 
 157 |     // Test gf256_muladd_mem()
 158 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 159 |     {
 160 |         m_SelfTestBuffers.A[i] = 0xff;
 161 |         m_SelfTestBuffers.B[i] = 0xaa;
 162 |     }
 163 |     const uint8_t expectedMulAdd = gf256_mul(0xaa, 0x6c);
 164 |     gf256_muladd_mem(m_SelfTestBuffers.A, 0x6c, m_SelfTestBuffers.B, kTestBufferBytes);
 165 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 166 |         if (m_SelfTestBuffers.A[i] != (expectedMulAdd ^ 0xff))
 167 |             return false;
 168 | 
 169 |     // Test gf256_mul_mem()
 170 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 171 |     {
 172 |         m_SelfTestBuffers.A[i] = 0xff;
 173 |         m_SelfTestBuffers.B[i] = 0x55;
 174 |     }
 175 |     const uint8_t expectedMul = gf256_mul(0xa2, 0x55);
 176 |     gf256_mul_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, 0xa2, kTestBufferBytes);
 177 |     for (unsigned i = 0; i < kTestBufferBytes; ++i)
 178 |         if (m_SelfTestBuffers.A[i] != expectedMul)
 179 |             return false;
 180 | 
 181 |     if (m_SelfTestBuffers.A[kTestBufferBytes] != 0x5a)
 182 |         return false;
 183 |     if (m_SelfTestBuffers.B[kTestBufferBytes] != 0x5a)
 184 |         return false;
 185 |     if (m_SelfTestBuffers.C[kTestBufferBytes] != 0x5a)
 186 |         return false;
 187 | 
 188 |     return true;
 189 | }
 190 | 
 191 | 
 192 | //------------------------------------------------------------------------------
 193 | // Runtime CPU Architecture Check
 194 | //
 195 | // Feature checks stolen shamelessly from
 196 | // https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c
 197 | 
 198 | #if defined(HAVE_ANDROID_GETCPUFEATURES)
 199 | #include <cpu-features.h>
 200 | #endif
 201 | 
// NEON capability flags, only present when GF256_TRY_NEON is defined.
// CpuHasNeon gates the NEON code paths of the memory routines below;
// CpuHasNeon64 tracks ASIMD availability.
#if defined(GF256_TRY_NEON)
# if defined(IOS) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
// Requires iPhone 5S or newer
// Compile-time constants on iOS: no runtime probing is done for this platform.
static const bool CpuHasNeon = true;
static const bool CpuHasNeon64 = true;
# else // ANDROID or LINUX_ARM
// Mutable here because gf256_architecture_init() may update them at runtime.
#  if defined(__aarch64__)
static bool CpuHasNeon = true;      // if AARCH64, then we have NEON for sure...
static bool CpuHasNeon64 = true;    // And we have ASIMD
#  else
static bool CpuHasNeon = false;     // if not, then we have to check at runtime.
static bool CpuHasNeon64 = false;   // And we don't have ASIMD
#  endif
# endif
#endif
 217 | 
 218 | #if !defined(GF256_TARGET_MOBILE)
 219 | 
 220 | #ifdef _MSC_VER
 221 |     #include <intrin.h> // __cpuid
 222 |     #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
 223 | #endif
 224 | 
// x86 feature flags, filled in by gf256_architecture_init().
// They start out false, so the SIMD paths stay off until detection has run.
#ifdef GF256_TRY_AVX2
static bool CpuHasAVX2 = false;
#endif
static bool CpuHasSSSE3 = false;

// Feature bits tested against the registers returned by _cpuid():
// AVX2 is bit 5 of EBX for CPUID leaf 7, SSSE3 is bit 9 of ECX for CPUID leaf 1.
#define CPUID_EBX_AVX2    0x00000020
#define CPUID_ECX_SSSE3   0x00000200
 232 | 
// Execute the CPUID instruction for leaf `cpu_info_type`.
//
// On return cpu_info[0..3] hold EAX, EBX, ECX and EDX respectively.
// The GCC-style paths always request sub-leaf 0 (ECX is loaded with 0).
static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type)
{
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
    // MSVC on x86/x64: use the compiler intrinsic
    __cpuid((int *) cpu_info, cpu_info_type);
#else //if defined(HAVE_CPUID)
    cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
# ifdef __i386__
    // 32-bit x86 only: probe whether CPUID exists by trying to flip bit
    // 0x200000 of EFLAGS.  cpu_info[0] receives EFLAGS after the attempted
    // flip, cpu_info[1] the original value; the original EFLAGS is restored.
    __asm__ __volatile__ ("pushfl; pushfl; "
                          "popl %0; "
                          "movl %0, %1; xorl %2, %0; "
                          "pushl %0; "
                          "popfl; pushfl; popl %0; popfl" :
                          "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) :
                          "i" (0x200000));
    // Bit could not be toggled: CPUID is not available.
    // NOTE(review): on this early return cpu_info[0] and cpu_info[1] still
    // hold the EFLAGS snapshots written above, not zeros.
    if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) {
        return; /* LCOV_EXCL_LINE */
    }
# endif
# ifdef __i386__
    // EBX is swapped with a scratch register around CPUID so that EBX itself
    // is unchanged afterwards (presumably because it is reserved for PIC).
    __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" :
                          "=a" (cpu_info[0]), "=&r" (cpu_info[1]),
                          "=c" (cpu_info[2]), "=d" (cpu_info[3]) :
                          "0" (cpu_info_type), "2" (0U));
# elif defined(__x86_64__)
    // Same register-preserving sequence using the 64-bit RBX
    __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" :
                          "=a" (cpu_info[0]), "=&r" (cpu_info[1]),
                          "=c" (cpu_info[2]), "=d" (cpu_info[3]) :
                          "0" (cpu_info_type), "2" (0U));
# else
    // Any other target: plain CPUID with EBX as a direct output
    __asm__ __volatile__ ("cpuid" :
                          "=a" (cpu_info[0]), "=b" (cpu_info[1]),
                          "=c" (cpu_info[2]), "=d" (cpu_info[3]) :
                          "0" (cpu_info_type), "2" (0U));
# endif
#endif
}
 269 | 
 270 | #else
 271 | #if defined(LINUX_ARM)
 272 | static void checkLinuxARMNeonCapabilities( bool& cpuHasNeon )
 273 | {
 274 |     auto cpufile = open("/proc/self/auxv", O_RDONLY);
 275 |     Elf32_auxv_t auxv;
 276 |     if (cpufile >= 0)
 277 |     {
 278 |         const auto size_auxv_t = sizeof(Elf32_auxv_t);
 279 |         while (read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
 280 |         {
 281 |             if (auxv.a_type == AT_HWCAP)
 282 |             {
 283 |                 cpuHasNeon = (auxv.a_un.a_val & 4096) != 0;
 284 |                 break;
 285 |             }
 286 |         }
 287 |         close(cpufile);
 288 |     }
 289 |     else
 290 |     {
 291 |         cpuHasNeon = false;
 292 |     }
 293 | }
 294 | #endif
 295 | #endif // defined(GF256_TARGET_MOBILE)
 296 | 
 297 | static void gf256_architecture_init()
 298 | {
 299 | #if defined(GF256_TRY_NEON)
 300 | 
 301 |     // Check for NEON support on Android platform
 302 | #if defined(HAVE_ANDROID_GETCPUFEATURES)
 303 |     AndroidCpuFamily family = android_getCpuFamily();
 304 |     if (family == ANDROID_CPU_FAMILY_ARM)
 305 |     {
 306 |         if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)
 307 |             CpuHasNeon = true;
 308 |     }
 309 |     else if (family == ANDROID_CPU_FAMILY_ARM64)
 310 |     {
 311 |         CpuHasNeon = true;
 312 |         if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD)
 313 |             CpuHasNeon64 = true;
 314 |     }
 315 | #endif
 316 | 
 317 | #if defined(LINUX_ARM)
 318 |     // Check for NEON support on other ARM/Linux platforms
 319 |     checkLinuxARMNeonCapabilities(CpuHasNeon);
 320 | #endif
 321 | 
 322 | #endif //GF256_TRY_NEON
 323 | 
 324 | #if !defined(GF256_TARGET_MOBILE)
 325 |     unsigned int cpu_info[4];
 326 | 
 327 |     _cpuid(cpu_info, 1);
 328 |     CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0);
 329 | 
 330 | #if defined(GF256_TRY_AVX2)
 331 |     _cpuid(cpu_info, 7);
 332 |     CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0);
 333 | #endif // GF256_TRY_AVX2
 334 | 
 335 |     // When AVX2 and SSSE3 are unavailable, Siamese takes 4x longer to decode
 336 |     // and 2.6x longer to encode.  Encoding requires a lot more simple XOR ops
 337 |     // so it is still pretty fast.  Decoding is usually really quick because
 338 |     // average loss rates are low, but when needed it requires a lot more
 339 |     // GF multiplies requiring table lookups which is slower.
 340 | 
 341 | #endif // GF256_TARGET_MOBILE
 342 | }
 343 | 
 344 | 
 345 | //------------------------------------------------------------------------------
 346 | // Context Object
 347 | 
// Context object for GF(2^8) math: holds the polynomial and lookup tables
// that the init routines in this file fill in.
GF256_ALIGNED gf256_ctx GF256Ctx;
// Checked by gf256_init_() so that repeated calls do not rebuild the tables
static bool Initialized = false;
 351 | 
 352 | 
 353 | //------------------------------------------------------------------------------
 354 | // Generator Polynomial
 355 | 
// There are only 16 primitive polynomials of degree 8 over GF(2) (out of 30
// irreducible ones).  Each entry stores one of them without its lowest bit;
// gf256_poly_init() restores the full value as (entry << 1) | 1.
static const int GF256_GEN_POLY_COUNT = 16;
static const uint8_t GF256_GEN_POLY[GF256_GEN_POLY_COUNT] = {
    0x8e, 0x95, 0x96, 0xa6, 0xaf, 0xb1, 0xb2, 0xb4,
    0xb8, 0xc3, 0xc6, 0xd4, 0xe1, 0xe7, 0xf3, 0xfa
};

// Index into GF256_GEN_POLY used by gf256_init_(), and the fallback that
// gf256_poly_init() applies when given an out-of-range index
static const int kDefaultPolynomialIndex = 3;
 364 | 
 365 | // Select which polynomial to use
 366 | static void gf256_poly_init(int polynomialIndex)
 367 | {
 368 |     if (polynomialIndex < 0 || polynomialIndex >= GF256_GEN_POLY_COUNT)
 369 |         polynomialIndex = kDefaultPolynomialIndex;
 370 | 
 371 |     GF256Ctx.Polynomial = (GF256_GEN_POLY[polynomialIndex] << 1) | 1;
 372 | }
 373 | 
 374 | 
 375 | //------------------------------------------------------------------------------
 376 | // Exponential and Log Tables
 377 | 
 378 | // Construct EXP and LOG tables from polynomial
 379 | static void gf256_explog_init()
 380 | {
 381 |     unsigned poly = GF256Ctx.Polynomial;
 382 |     uint8_t* exptab = GF256Ctx.GF256_EXP_TABLE;
 383 |     uint16_t* logtab = GF256Ctx.GF256_LOG_TABLE;
 384 | 
 385 |     logtab[0] = 512;
 386 |     exptab[0] = 1;
 387 |     for (unsigned jj = 1; jj < 255; ++jj)
 388 |     {
 389 |         unsigned next = (unsigned)exptab[jj - 1] * 2;
 390 |         if (next >= 256)
 391 |             next ^= poly;
 392 | 
 393 |         exptab[jj] = static_cast<uint8_t>( next );
 394 |         logtab[exptab[jj]] = static_cast<uint16_t>( jj );
 395 |     }
 396 |     exptab[255] = exptab[0];
 397 |     logtab[exptab[255]] = 255;
 398 |     for (unsigned jj = 256; jj < 2 * 255; ++jj)
 399 |         exptab[jj] = exptab[jj % 255];
 400 |     exptab[2 * 255] = 1;
 401 |     for (unsigned jj = 2 * 255 + 1; jj < 4 * 255; ++jj)
 402 |         exptab[jj] = 0;
 403 | }
 404 | 
 405 | 
 406 | //------------------------------------------------------------------------------
 407 | // Multiply and Divide Tables
 408 | 
 409 | // Initialize MUL and DIV tables using LOG and EXP tables
 410 | static void gf256_muldiv_init()
 411 | {
 412 |     // Allocate table memory 65KB x 2
 413 |     uint8_t* m = GF256Ctx.GF256_MUL_TABLE;
 414 |     uint8_t* d = GF256Ctx.GF256_DIV_TABLE;
 415 | 
 416 |     // Unroll y = 0 subtable
 417 |     for (int x = 0; x < 256; ++x)
 418 |         m[x] = d[x] = 0;
 419 | 
 420 |     // For each other y value:
 421 |     for (int y = 1; y < 256; ++y)
 422 |     {
 423 |         // Calculate log(y) for mult and 255 - log(y) for div
 424 |         const uint8_t log_y = static_cast<uint8_t>(GF256Ctx.GF256_LOG_TABLE[y]);
 425 |         const uint8_t log_yn = 255 - log_y;
 426 | 
 427 |         // Next subtable
 428 |         m += 256, d += 256;
 429 | 
 430 |         // Unroll x = 0
 431 |         m[0] = 0, d[0] = 0;
 432 | 
 433 |         // Calculate x * y, x / y
 434 |         for (int x = 1; x < 256; ++x)
 435 |         {
 436 |             uint16_t log_x = GF256Ctx.GF256_LOG_TABLE[x];
 437 | 
 438 |             m[x] = GF256Ctx.GF256_EXP_TABLE[log_x + log_y];
 439 |             d[x] = GF256Ctx.GF256_EXP_TABLE[log_x + log_yn];
 440 |         }
 441 |     }
 442 | }
 443 | 
 444 | 
 445 | //------------------------------------------------------------------------------
 446 | // Inverse Table
 447 | 
 448 | // Initialize INV table using DIV table
 449 | static void gf256_inv_init()
 450 | {
 451 |     for (int x = 0; x < 256; ++x)
 452 |         GF256Ctx.GF256_INV_TABLE[x] = gf256_div(1, static_cast<uint8_t>(x));
 453 | }
 454 | 
 455 | 
 456 | //------------------------------------------------------------------------------
 457 | // Square Table
 458 | 
 459 | // Initialize SQR table using MUL table
 460 | static void gf256_sqr_init()
 461 | {
 462 |     for (int x = 0; x < 256; ++x)
 463 |         GF256Ctx.GF256_SQR_TABLE[x] = gf256_mul(static_cast<uint8_t>(x), static_cast<uint8_t>(x));
 464 | }
 465 | 
 466 | 
 467 | //------------------------------------------------------------------------------
 468 | // Multiply and Add Memory Tables
 469 | 
 470 | /*
    Fast algorithm to compute m[1..8] = a[1..8] * b in GF(256)
    using the SSSE3 SIMD instruction set:
 473 | 
 474 |     Consider z = x * y in GF(256).
 475 |     This operation can be performed bit-by-bit.  Usefully, the partial product
 476 |     of each bit is combined linearly with the rest.  This means that the 8-bit
 477 |     number x can be split into its high and low 4 bits, and partial products
 478 |     can be formed from each half.  Then the halves can be linearly combined:
 479 | 
 480 |         z = x[0..3] * y + x[4..7] * y
 481 | 
 482 |     The multiplication of each half can be done efficiently via table lookups,
 483 |     and the addition in GF(256) is XOR.  There must be two tables that map 16
 484 |     input elements for the low or high 4 bits of x to the two partial products.
 485 |     Each value for y has a different set of two tables:
 486 | 
 487 |         z = TABLE_LO_y(x[0..3]) xor TABLE_HI_y(x[4..7])
 488 | 
 489 |     This means that we need 16 * 2 * 256 = 8192 bytes for precomputed tables.
 490 | 
 491 |     Computing z[] = x[] * y can be performed 16 bytes at a time by using the
 492 |     128-bit register operations supported by modern processors.
 493 | 
    This is efficiently realized in SSSE3 using the _mm_shuffle_epi8() function
    provided by Visual Studio 2010 or newer in <tmmintrin.h>.  This function
    uses the low 4 bits of each mask byte to do a table lookup.  Unfortunately
    the high bit of each mask byte has the special feature that it clears the
    output byte when it is set, so we need to make sure it's cleared by masking
    off the high 4 bits of each byte before using it:
 500 | 
 501 |         clr_mask = _mm_set1_epi8(0x0f) = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
 502 | 
    For the low half of the partial product, clear the high 4 bits of each byte
    and perform the table lookup (the table is the first argument, the lookup
    indices are the second):

        p_lo = _mm_and_si128(x, clr_mask)
        p_lo = _mm_shuffle_epi8(TABLE_LO_y, p_lo)
 508 | 
    For the high half of the partial product, shift the high 4 bits of each
    byte into the low 4 bits and clear the high 4 bits of each byte, and then
    perform the table lookup:

        p_hi = _mm_srli_epi64(x, 4)
        p_hi = _mm_and_si128(p_hi, clr_mask)
        p_hi = _mm_shuffle_epi8(TABLE_HI_y, p_hi)
 516 | 
 517 |     Finally add the two partial products to form the product, recalling that
 518 |     addition is XOR in a Galois field:
 519 | 
 520 |         result = _mm_xor_si128(p_lo, p_hi)
 521 | 
 522 |     This crunches 16 bytes of x at a time, and the result can be stored in z.
 523 | */
 524 | 
 525 | /*
 526 |     Intrinsic reference:
 527 | 
    SSSE3, VS2010+, tmmintrin.h:
 529 | 
 530 |     GF256_M128 _mm_shuffle_epi8(GF256_M128 a, GF256_M128 mask);
 531 |         Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3) instruction pshufb. This instruction shuffles 16-byte parameters from a 128-bit parameter.
 532 | 
 533 |         Pseudo-code for PSHUFB (with 128 bit operands):
 534 | 
 535 |             for i = 0 to 15 {
 536 |                  if (SRC[(i * 8)+7] = 1 ) then
 537 |                       DEST[(i*8)+7..(i*8)+0] <- 0;
 538 |                   else
 539 |                       index[3..0] <- SRC[(i*8)+3 .. (i*8)+0];
 540 |                       DEST[(i*8)+7..(i*8)+0] <- DEST[(index*8+7)..(index*8+0)];
 541 |                  endif
 542 |             }
 543 | 
 544 |     SSE2, VS2008+, emmintrin.h:
 545 | 
 546 |     GF256_M128 _mm_slli_epi64 (GF256_M128 a, int count);
 547 |         Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while shifting in zeros.
 548 |     GF256_M128 _mm_srli_epi64 (GF256_M128 a, int count);
 549 |         Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros.
 550 |     GF256_M128 _mm_set1_epi8 (char b);
 551 |         Sets the 16 signed 8-bit integer values to b.
 552 |     GF256_M128 _mm_and_si128 (GF256_M128 a, GF256_M128 b);
 553 |         Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b.
 554 |     GF256_M128 _mm_xor_si128 ( GF256_M128 a, GF256_M128 b);
 555 |         Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b.
 556 | */
 557 | 
 558 | // Initialize the multiplication tables using gf256_mul()
 559 | static void gf256_mul_mem_init()
 560 | {
 561 |     // Reuse aligned self test buffers to load table data
 562 |     uint8_t* lo = m_SelfTestBuffers.A;
 563 |     uint8_t* hi = m_SelfTestBuffers.B;
 564 | 
 565 |     for (int y = 0; y < 256; ++y)
 566 |     {
 567 |         // TABLE_LO_Y maps 0..15 to 8-bit partial product based on y.
 568 |         for (unsigned char x = 0; x < 16; ++x)
 569 |         {
 570 |             lo[x] = gf256_mul(x, static_cast<uint8_t>( y ));
 571 |             hi[x] = gf256_mul(x << 4, static_cast<uint8_t>( y ));
 572 |         }
 573 | 
 574 | #if defined(GF256_TRY_NEON)
 575 |         if (CpuHasNeon)
 576 |         {
 577 |             GF256Ctx.MM128.TABLE_LO_Y[y] = vld1q_u8(lo);
 578 |             GF256Ctx.MM128.TABLE_HI_Y[y] = vld1q_u8(hi);
 579 |         }
 580 | #elif !defined(GF256_TARGET_MOBILE)
 581 |         const GF256_M128 table_lo = _mm_loadu_si128((GF256_M128*)lo);
 582 |         const GF256_M128 table_hi = _mm_loadu_si128((GF256_M128*)hi);
 583 |         _mm_storeu_si128(GF256Ctx.MM128.TABLE_LO_Y + y, table_lo);
 584 |         _mm_storeu_si128(GF256Ctx.MM128.TABLE_HI_Y + y, table_hi);
 585 | # ifdef GF256_TRY_AVX2
 586 |         if (CpuHasAVX2)
 587 |         {
 588 |             const GF256_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo);
 589 |             const GF256_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi);
 590 |             _mm256_storeu_si256(GF256Ctx.MM256.TABLE_LO_Y + y, table_lo2);
 591 |             _mm256_storeu_si256(GF256Ctx.MM256.TABLE_HI_Y + y, table_hi2);
 592 |         }
 593 | # endif // GF256_TRY_AVX2
 594 | #endif // GF256_TARGET_MOBILE
 595 |     }
 596 | }
 597 | 
 598 | 
 599 | //------------------------------------------------------------------------------
 600 | // Initialization
 601 | 
 602 | #ifdef GF256_IS_BIG_ENDIAN
 603 | static unsigned char kEndianTestData[4] = { 1, 2, 3, 4 };
 604 | #else
 605 | static unsigned char kEndianTestData[4] = { 4, 3, 2, 1 };
 606 | #endif
 607 | 
 608 | union UnionType
 609 | {
 610 |     uint32_t IntValue;
 611 |     char CharArray[4];
 612 | };
 613 | 
 614 | static bool IsExpectedEndian()
 615 | {
 616 |     UnionType type;
 617 |     for (unsigned i = 0; i < 4; ++i)
 618 |         type.CharArray[i] = kEndianTestData[i];
 619 |     return 0x01020304 == type.IntValue;
 620 | }
 621 | 
 622 | extern "C" int gf256_init_(int version)
 623 | {
 624 |     if (version != GF256_VERSION)
 625 |         return -1; // User's header does not match library version.
 626 | 
 627 |     // Avoid multiple initialization
 628 |     if (Initialized)
 629 |         return 0;
 630 |     Initialized = true;
 631 | 
 632 |     if (!IsExpectedEndian())
 633 |         return -2; // Unexpected byte order.
 634 | 
 635 |     gf256_architecture_init();
 636 |     gf256_poly_init(kDefaultPolynomialIndex);
 637 |     gf256_explog_init();
 638 |     gf256_muldiv_init();
 639 |     gf256_inv_init();
 640 |     gf256_sqr_init();
 641 |     gf256_mul_mem_init();
 642 | 
 643 |     if (!gf256_self_test())
 644 |         return -3; // Self-test failed (perhaps untested configuration)
 645 | 
 646 |     return 0;
 647 | }
 648 | 
 649 | 
 650 | //------------------------------------------------------------------------------
 651 | // Operations
 652 | 
// Performs x[i] ^= y[i] for `bytes` bytes (addition in GF(256) is XOR).
//
// vx:    destination buffer, updated in place
// vy:    source buffer
// bytes: number of bytes to process
//
// The bulk of the data is handled with the widest path enabled at compile time
// and detected at runtime (NEON, AVX2, or 128-bit SSE); the remaining 0..15
// bytes are handled with 8-, 4- and 1-byte operations at the end.
extern "C" void gf256_add_mem(void * GF256_RESTRICT vx,
                              const void * GF256_RESTRICT vy, int bytes)
{
    GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<GF256_M128 *>(vx);
    const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128 *>(vy);

#if defined(GF256_TARGET_MOBILE)
# if defined(GF256_TRY_NEON)
    // Handle multiples of 64 bytes
    if (CpuHasNeon)
    {
        while (bytes >= 64)
        {
            GF256_M128 x0 = vld1q_u8((uint8_t*) x16);
            GF256_M128 x1 = vld1q_u8((uint8_t*)(x16 + 1) );
            GF256_M128 x2 = vld1q_u8((uint8_t*)(x16 + 2) );
            GF256_M128 x3 = vld1q_u8((uint8_t*)(x16 + 3) );
            GF256_M128 y0 = vld1q_u8((uint8_t*)y16);
            GF256_M128 y1 = vld1q_u8((uint8_t*)(y16 + 1));
            GF256_M128 y2 = vld1q_u8((uint8_t*)(y16 + 2));
            GF256_M128 y3 = vld1q_u8((uint8_t*)(y16 + 3));

            vst1q_u8((uint8_t*)x16,     veorq_u8(x0, y0));
            vst1q_u8((uint8_t*)(x16 + 1), veorq_u8(x1, y1));
            vst1q_u8((uint8_t*)(x16 + 2), veorq_u8(x2, y2));
            vst1q_u8((uint8_t*)(x16 + 3), veorq_u8(x3, y3));

            bytes -= 64, x16 += 4, y16 += 4;
        }

        // Handle multiples of 16 bytes
        while (bytes >= 16)
        {
            GF256_M128 x0 = vld1q_u8((uint8_t*)x16);
            GF256_M128 y0 = vld1q_u8((uint8_t*)y16);

            vst1q_u8((uint8_t*)x16, veorq_u8(x0, y0));

            bytes -= 16, ++x16, ++y16;
        }
    }
    else
# endif // GF256_TRY_NEON
    {
        // Mobile fallback without NEON: XOR 8 bytes at a time
        uint64_t * GF256_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
        const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);

        const unsigned count = (unsigned)bytes / 8;
        for (unsigned ii = 0; ii < count; ++ii)
            x8[ii] ^= y8[ii];

        // Advance the cursors past the processed words; fewer than 8 bytes remain
        x16 = reinterpret_cast<GF256_M128 *>(x8 + count);
        y16 = reinterpret_cast<const GF256_M128 *>(y8 + count);

        bytes -= (count * 8);
    }
#else // GF256_TARGET_MOBILE
# if defined(GF256_TRY_AVX2)
    if (CpuHasAVX2)
    {
        GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast<GF256_M256 *>(x16);
        const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast<const GF256_M256 *>(y16);

        // Handle multiples of 128 bytes (four 256-bit registers per iteration)
        while (bytes >= 128)
        {
            GF256_M256 x0 = _mm256_loadu_si256(x32);
            GF256_M256 y0 = _mm256_loadu_si256(y32);
            x0 = _mm256_xor_si256(x0, y0);
            GF256_M256 x1 = _mm256_loadu_si256(x32 + 1);
            GF256_M256 y1 = _mm256_loadu_si256(y32 + 1);
            x1 = _mm256_xor_si256(x1, y1);
            GF256_M256 x2 = _mm256_loadu_si256(x32 + 2);
            GF256_M256 y2 = _mm256_loadu_si256(y32 + 2);
            x2 = _mm256_xor_si256(x2, y2);
            GF256_M256 x3 = _mm256_loadu_si256(x32 + 3);
            GF256_M256 y3 = _mm256_loadu_si256(y32 + 3);
            x3 = _mm256_xor_si256(x3, y3);

            _mm256_storeu_si256(x32, x0);
            _mm256_storeu_si256(x32 + 1, x1);
            _mm256_storeu_si256(x32 + 2, x2);
            _mm256_storeu_si256(x32 + 3, x3);

            bytes -= 128, x32 += 4, y32 += 4;
        }

        // Handle multiples of 32 bytes
        while (bytes >= 32)
        {
            // x[i] = x[i] xor y[i]
            _mm256_storeu_si256(x32,
                _mm256_xor_si256(
                    _mm256_loadu_si256(x32),
                    _mm256_loadu_si256(y32)));

            bytes -= 32, ++x32, ++y32;
        }

        // Hand the cursors back to the 128-bit code below
        x16 = reinterpret_cast<GF256_M128 *>(x32);
        y16 = reinterpret_cast<const GF256_M128 *>(y32);
    }
    else
# endif // GF256_TRY_AVX2
    {
        // Handle multiples of 64 bytes (four 128-bit registers per iteration)
        while (bytes >= 64)
        {
            GF256_M128 x0 = _mm_loadu_si128(x16);
            GF256_M128 y0 = _mm_loadu_si128(y16);
            x0 = _mm_xor_si128(x0, y0);
            GF256_M128 x1 = _mm_loadu_si128(x16 + 1);
            GF256_M128 y1 = _mm_loadu_si128(y16 + 1);
            x1 = _mm_xor_si128(x1, y1);
            GF256_M128 x2 = _mm_loadu_si128(x16 + 2);
            GF256_M128 y2 = _mm_loadu_si128(y16 + 2);
            x2 = _mm_xor_si128(x2, y2);
            GF256_M128 x3 = _mm_loadu_si128(x16 + 3);
            GF256_M128 y3 = _mm_loadu_si128(y16 + 3);
            x3 = _mm_xor_si128(x3, y3);

            _mm_storeu_si128(x16, x0);
            _mm_storeu_si128(x16 + 1, x1);
            _mm_storeu_si128(x16 + 2, x2);
            _mm_storeu_si128(x16 + 3, x3);

            bytes -= 64, x16 += 4, y16 += 4;
        }
    }
#endif // GF256_TARGET_MOBILE

#if !defined(GF256_TARGET_MOBILE)
    // Handle multiples of 16 bytes
    while (bytes >= 16)
    {
        // x[i] = x[i] xor y[i]
        _mm_storeu_si128(x16,
            _mm_xor_si128(
                _mm_loadu_si128(x16),
                _mm_loadu_si128(y16)));

        bytes -= 16, ++x16, ++y16;
    }
#endif

    // Tail: the bits of `bytes` select an 8-byte, a 4-byte and up to three
    // single-byte operations
    uint8_t * GF256_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
    const uint8_t * GF256_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);

    // Handle a block of 8 bytes
    const int eight = bytes & 8;
    if (eight)
    {
        uint64_t * GF256_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
        const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
        *x8 ^= *y8;
    }

    // Handle a block of 4 bytes
    // (`eight` is either 0 or 8, so it doubles as the byte offset)
    const int four = bytes & 4;
    if (four)
    {
        uint32_t * GF256_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
        const uint32_t * GF256_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
        *x4 ^= *y4;
    }

    // Handle final bytes
    const int offset = eight + four;
    switch (bytes & 3)
    {
    // Each case intentionally falls through to the next
    case 3: x1[offset + 2] ^= y1[offset + 2];
    case 2: x1[offset + 1] ^= y1[offset + 1];
    case 1: x1[offset] ^= y1[offset];
    default:
        break;
    }
}
 828 | 
// Performs z[i] ^= x[i] ^ y[i] for `bytes` bytes.
//
// vz:    destination buffer, updated in place
// vx:    first source buffer
// vy:    second source buffer
// bytes: number of bytes to process
//
// The bulk of the data is handled with the widest path enabled at compile time
// and detected at runtime; the remaining 0..15 bytes are handled with 8-, 4-
// and 1-byte operations at the end.
extern "C" void gf256_add2_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx,
                               const void * GF256_RESTRICT vy, int bytes)
{
    GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
    const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);
    const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128*>(vy);

#if defined(GF256_TARGET_MOBILE)
# if defined(GF256_TRY_NEON)
    // NEON path (this function has no 64-byte unrolled loop)
    if (CpuHasNeon)
    {
        // Handle multiples of 16 bytes
        while (bytes >= 16)
        {
            // z[i] = z[i] xor x[i] xor y[i]
            vst1q_u8((uint8_t*)z16,
                veorq_u8(
                    vld1q_u8((uint8_t*)z16),
                    veorq_u8(
                        vld1q_u8((uint8_t*)x16),
                        vld1q_u8((uint8_t*)y16))));

            bytes -= 16, ++x16, ++y16, ++z16;
        }
    }
    else
# endif // GF256_TRY_NEON
    {
        // Mobile fallback without NEON: XOR 8 bytes at a time
        uint64_t * GF256_RESTRICT z8 = reinterpret_cast<uint64_t *>(z16);
        const uint64_t * GF256_RESTRICT x8 = reinterpret_cast<const uint64_t *>(x16);
        const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);

        const unsigned count = (unsigned)bytes / 8;
        for (unsigned ii = 0; ii < count; ++ii)
            z8[ii] ^= x8[ii] ^ y8[ii];

        // Advance the cursors past the processed words; fewer than 8 bytes remain
        z16 = reinterpret_cast<GF256_M128 *>(z8 + count);
        x16 = reinterpret_cast<const GF256_M128 *>(x8 + count);
        y16 = reinterpret_cast<const GF256_M128 *>(y8 + count);

        bytes -= (count * 8);
    }
#else // GF256_TARGET_MOBILE
# if defined(GF256_TRY_AVX2)
    if (CpuHasAVX2)
    {
        GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast<GF256_M256 *>(z16);
        const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast<const GF256_M256 *>(x16);
        const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast<const GF256_M256 *>(y16);

        // Handle multiples of 32 bytes
        const unsigned count = bytes / 32;
        for (unsigned i = 0; i < count; ++i)
        {
            _mm256_storeu_si256(z32 + i,
                _mm256_xor_si256(
                    _mm256_loadu_si256(z32 + i),
                    _mm256_xor_si256(
                        _mm256_loadu_si256(x32 + i),
                        _mm256_loadu_si256(y32 + i))));
        }

        // Hand the cursors back to the 128-bit code below
        bytes -= count * 32;
        z16 = reinterpret_cast<GF256_M128 *>(z32 + count);
        x16 = reinterpret_cast<const GF256_M128 *>(x32 + count);
        y16 = reinterpret_cast<const GF256_M128 *>(y32 + count);
    }
# endif // GF256_TRY_AVX2

    // Handle multiples of 16 bytes
    while (bytes >= 16)
    {
        // z[i] = z[i] xor x[i] xor y[i]
        _mm_storeu_si128(z16,
            _mm_xor_si128(
                _mm_loadu_si128(z16),
                _mm_xor_si128(
                    _mm_loadu_si128(x16),
                    _mm_loadu_si128(y16))));

        bytes -= 16, ++x16, ++y16, ++z16;
    }
#endif // GF256_TARGET_MOBILE

    // Tail: the bits of `bytes` select an 8-byte, a 4-byte and up to three
    // single-byte operations
    uint8_t * GF256_RESTRICT z1 = reinterpret_cast<uint8_t *>(z16);
    const uint8_t * GF256_RESTRICT x1 = reinterpret_cast<const uint8_t *>(x16);
    const uint8_t * GF256_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);

    // Handle a block of 8 bytes
    const int eight = bytes & 8;
    if (eight)
    {
        uint64_t * GF256_RESTRICT z8 = reinterpret_cast<uint64_t *>(z1);
        const uint64_t * GF256_RESTRICT x8 = reinterpret_cast<const uint64_t *>(x1);
        const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
        *z8 ^= *x8 ^ *y8;
    }

    // Handle a block of 4 bytes
    // (`eight` is either 0 or 8, so it doubles as the byte offset)
    const int four = bytes & 4;
    if (four)
    {
        uint32_t * GF256_RESTRICT z4 = reinterpret_cast<uint32_t *>(z1 + eight);
        const uint32_t * GF256_RESTRICT x4 = reinterpret_cast<const uint32_t *>(x1 + eight);
        const uint32_t * GF256_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
        *z4 ^= *x4 ^ *y4;
    }

    // Handle final bytes
    const int offset = eight + four;
    switch (bytes & 3)
    {
    // Each case intentionally falls through to the next
    case 3: z1[offset + 2] ^= x1[offset + 2] ^ y1[offset + 2];
    case 2: z1[offset + 1] ^= x1[offset + 1] ^ y1[offset + 1];
    case 1: z1[offset] ^= x1[offset] ^ y1[offset];
    default:
        break;
    }
}
 948 | 
 949 | extern "C" void gf256_addset_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx,
 950 |                                  const void * GF256_RESTRICT vy, int bytes)
 951 | {
 952 |     GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
 953 |     const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);
 954 |     const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128*>(vy);
 955 | 
 956 | #if defined(GF256_TARGET_MOBILE)
 957 | # if defined(GF256_TRY_NEON)
 958 |     // Handle multiples of 64 bytes
 959 |     if (CpuHasNeon)
 960 |     {
 961 |         while (bytes >= 64)
 962 |         {
 963 |             GF256_M128 x0 = vld1q_u8((uint8_t*)x16);
 964 |             GF256_M128 x1 = vld1q_u8((uint8_t*)(x16 + 1));
 965 |             GF256_M128 x2 = vld1q_u8((uint8_t*)(x16 + 2));
 966 |             GF256_M128 x3 = vld1q_u8((uint8_t*)(x16 + 3));
 967 |             GF256_M128 y0 = vld1q_u8((uint8_t*)(y16));
 968 |             GF256_M128 y1 = vld1q_u8((uint8_t*)(y16 + 1));
 969 |             GF256_M128 y2 = vld1q_u8((uint8_t*)(y16 + 2));
 970 |             GF256_M128 y3 = vld1q_u8((uint8_t*)(y16 + 3));
 971 | 
 972 |             vst1q_u8((uint8_t*)z16,     veorq_u8(x0, y0));
 973 |             vst1q_u8((uint8_t*)(z16 + 1), veorq_u8(x1, y1));
 974 |             vst1q_u8((uint8_t*)(z16 + 2), veorq_u8(x2, y2));
 975 |             vst1q_u8((uint8_t*)(z16 + 3), veorq_u8(x3, y3));
 976 | 
 977 |             bytes -= 64, x16 += 4, y16 += 4, z16 += 4;
 978 |         }
 979 | 
 980 |         // Handle multiples of 16 bytes
 981 |         while (bytes >= 16)
 982 |         {
 983 |             // z[i] = x[i] xor y[i]
 984 |             vst1q_u8((uint8_t*)z16,
 985 |                      veorq_u8(
 986 |                          vld1q_u8((uint8_t*)x16),
 987 |                          vld1q_u8((uint8_t*)y16)));
 988 | 
 989 |             bytes -= 16, ++x16, ++y16, ++z16;
 990 |         }
 991 |     }
 992 |     else
 993 | # endif // GF256_TRY_NEON
 994 |     {
 995 |         uint64_t * GF256_RESTRICT z8 = reinterpret_cast<uint64_t *>(z16);
 996 |         const uint64_t * GF256_RESTRICT x8 = reinterpret_cast<const uint64_t *>(x16);
 997 |         const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);
 998 | 
 999 |         const unsigned count = (unsigned)bytes / 8;
1000 |         for (unsigned ii = 0; ii < count; ++ii)
1001 |             z8[ii] = x8[ii] ^ y8[ii];
1002 | 
1003 |         x16 = reinterpret_cast<const GF256_M128 *>(x8 + count);
1004 |         y16 = reinterpret_cast<const GF256_M128 *>(y8 + count);
1005 |         z16 = reinterpret_cast<GF256_M128 *>(z8 + count);
1006 | 
1007 |         bytes -= (count * 8);
1008 |     }
1009 | #else // GF256_TARGET_MOBILE
1010 | # if defined(GF256_TRY_AVX2)
1011 |     if (CpuHasAVX2)
1012 |     {
1013 |         GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast<GF256_M256 *>(z16);
1014 |         const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast<const GF256_M256 *>(x16);
1015 |         const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast<const GF256_M256 *>(y16);
1016 | 
1017 |         const unsigned count = bytes / 32;
1018 |         for (unsigned i = 0; i < count; ++i)
1019 |         {
1020 |             _mm256_storeu_si256(z32 + i,
1021 |                 _mm256_xor_si256(
1022 |                     _mm256_loadu_si256(x32 + i),
1023 |                     _mm256_loadu_si256(y32 + i)));
1024 |         }
1025 | 
1026 |         bytes -= count * 32;
1027 |         z16 = reinterpret_cast<GF256_M128 *>(z32 + count);
1028 |         x16 = reinterpret_cast<const GF256_M128 *>(x32 + count);
1029 |         y16 = reinterpret_cast<const GF256_M128 *>(y32 + count);
1030 |     }
1031 |     else
1032 | # endif // GF256_TRY_AVX2
1033 |     {
1034 |         // Handle multiples of 64 bytes
1035 |         while (bytes >= 64)
1036 |         {
1037 |             GF256_M128 x0 = _mm_loadu_si128(x16);
1038 |             GF256_M128 x1 = _mm_loadu_si128(x16 + 1);
1039 |             GF256_M128 x2 = _mm_loadu_si128(x16 + 2);
1040 |             GF256_M128 x3 = _mm_loadu_si128(x16 + 3);
1041 |             GF256_M128 y0 = _mm_loadu_si128(y16);
1042 |             GF256_M128 y1 = _mm_loadu_si128(y16 + 1);
1043 |             GF256_M128 y2 = _mm_loadu_si128(y16 + 2);
1044 |             GF256_M128 y3 = _mm_loadu_si128(y16 + 3);
1045 | 
1046 |             _mm_storeu_si128(z16,     _mm_xor_si128(x0, y0));
1047 |             _mm_storeu_si128(z16 + 1, _mm_xor_si128(x1, y1));
1048 |             _mm_storeu_si128(z16 + 2, _mm_xor_si128(x2, y2));
1049 |             _mm_storeu_si128(z16 + 3, _mm_xor_si128(x3, y3));
1050 | 
1051 |             bytes -= 64, x16 += 4, y16 += 4, z16 += 4;
1052 |         }
1053 |     }
1054 | 
1055 |     // Handle multiples of 16 bytes
1056 |     while (bytes >= 16)
1057 |     {
1058 |         // z[i] = x[i] xor y[i]
1059 |         _mm_storeu_si128(z16,
1060 |             _mm_xor_si128(
1061 |                 _mm_loadu_si128(x16),
1062 |                 _mm_loadu_si128(y16)));
1063 | 
1064 |         bytes -= 16, ++x16, ++y16, ++z16;
1065 |     }
1066 | #endif // GF256_TARGET_MOBILE
1067 | 
1068 |     uint8_t * GF256_RESTRICT z1 = reinterpret_cast<uint8_t *>(z16);
1069 |     const uint8_t * GF256_RESTRICT x1 = reinterpret_cast<const uint8_t *>(x16);
1070 |     const uint8_t * GF256_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);
1071 | 
1072 |     // Handle a block of 8 bytes
1073 |     const int eight = bytes & 8;
1074 |     if (eight)
1075 |     {
1076 |         uint64_t * GF256_RESTRICT z8 = reinterpret_cast<uint64_t *>(z1);
1077 |         const uint64_t * GF256_RESTRICT x8 = reinterpret_cast<const uint64_t *>(x1);
1078 |         const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
1079 |         *z8 = *x8 ^ *y8;
1080 |     }
1081 | 
1082 |     // Handle a block of 4 bytes
1083 |     const int four = bytes & 4;
1084 |     if (four)
1085 |     {
1086 |         uint32_t * GF256_RESTRICT z4 = reinterpret_cast<uint32_t *>(z1 + eight);
1087 |         const uint32_t * GF256_RESTRICT x4 = reinterpret_cast<const uint32_t *>(x1 + eight);
1088 |         const uint32_t * GF256_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
1089 |         *z4 = *x4 ^ *y4;
1090 |     }
1091 | 
1092 |     // Handle final bytes
1093 |     const int offset = eight + four;
1094 |     switch (bytes & 3)
1095 |     {
1096 |     case 3: z1[offset + 2] = x1[offset + 2] ^ y1[offset + 2];
1097 |     case 2: z1[offset + 1] = x1[offset + 1] ^ y1[offset + 1];
1098 |     case 1: z1[offset] = x1[offset] ^ y1[offset];
1099 |     default:
1100 |         break;
1101 |     }
1102 | }
1103 | 
1104 | extern "C" void gf256_mul_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, uint8_t y, int bytes)
1105 | {
1106 |     // Use a single if-statement to handle special cases
1107 |     if (y <= 1)
1108 |     {
1109 |         if (y == 0)
1110 |             memset(vz, 0, bytes);
1111 |         else if (vz != vx)
1112 |             memcpy(vz, vx, bytes);
1113 |         return;
1114 |     }
1115 | 
1116 |     GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast<GF256_M128 *>(vz);
1117 |     const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128 *>(vx);
1118 | 
1119 | #if defined(GF256_TARGET_MOBILE)
1120 | #if defined(GF256_TRY_NEON)
1121 |     if (bytes >= 16 && CpuHasNeon)
1122 |     {
1123 |         // Partial product tables; see above
1124 |         const GF256_M128 table_lo_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_LO_Y + y));
1125 |         const GF256_M128 table_hi_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_HI_Y + y));
1126 | 
1127 |         // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
1128 |         const GF256_M128 clr_mask = vdupq_n_u8(0x0f);
1129 | 
1130 |         // Handle multiples of 16 bytes
1131 |         do
1132 |         {
1133 |             // See above comments for details
1134 |             GF256_M128 x0 = vld1q_u8((uint8_t*)x16);
1135 |             GF256_M128 l0 = vandq_u8(x0, clr_mask);
1136 |             x0 = vshrq_n_u8(x0, 4);
1137 |             GF256_M128 h0 = vandq_u8(x0, clr_mask);
1138 |             l0 = vqtbl1q_u8(table_lo_y, l0);
1139 |             h0 = vqtbl1q_u8(table_hi_y, h0);
1140 |             vst1q_u8((uint8_t*)z16, veorq_u8(l0, h0));
1141 | 
1142 |             bytes -= 16, ++x16, ++z16;
1143 |         } while (bytes >= 16);
1144 |     }
1145 | #endif
1146 | #else
1147 | # if defined(GF256_TRY_AVX2)
1148 |     if (bytes >= 32 && CpuHasAVX2)
1149 |     {
1150 |         // Partial product tables; see above
1151 |         const GF256_M256 table_lo_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_LO_Y + y);
1152 |         const GF256_M256 table_hi_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_HI_Y + y);
1153 | 
1154 |         // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
1155 |         const GF256_M256 clr_mask = _mm256_set1_epi8(0x0f);
1156 | 
1157 |         GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast<GF256_M256 *>(vz);
1158 |         const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast<const GF256_M256 *>(vx);
1159 | 
1160 |         // Handle multiples of 32 bytes
1161 |         do
1162 |         {
1163 |             // See above comments for details
1164 |             GF256_M256 x0 = _mm256_loadu_si256(x32);
1165 |             GF256_M256 l0 = _mm256_and_si256(x0, clr_mask);
1166 |             x0 = _mm256_srli_epi64(x0, 4);
1167 |             GF256_M256 h0 = _mm256_and_si256(x0, clr_mask);
1168 |             l0 = _mm256_shuffle_epi8(table_lo_y, l0);
1169 |             h0 = _mm256_shuffle_epi8(table_hi_y, h0);
1170 |             _mm256_storeu_si256(z32, _mm256_xor_si256(l0, h0));
1171 | 
1172 |             bytes -= 32, ++x32, ++z32;
1173 |         } while (bytes >= 32);
1174 | 
1175 |         z16 = reinterpret_cast<GF256_M128 *>(z32);
1176 |         x16 = reinterpret_cast<const GF256_M128 *>(x32);
1177 |     }
1178 | # endif // GF256_TRY_AVX2
1179 |     if (bytes >= 16 && CpuHasSSSE3)
1180 |     {
1181 |         // Partial product tables; see above
1182 |         const GF256_M128 table_lo_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_LO_Y + y);
1183 |         const GF256_M128 table_hi_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_HI_Y + y);
1184 | 
1185 |         // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
1186 |         const GF256_M128 clr_mask = _mm_set1_epi8(0x0f);
1187 | 
1188 |         // Handle multiples of 16 bytes
1189 |         do
1190 |         {
1191 |             // See above comments for details
1192 |             GF256_M128 x0 = _mm_loadu_si128(x16);
1193 |             GF256_M128 l0 = _mm_and_si128(x0, clr_mask);
1194 |             x0 = _mm_srli_epi64(x0, 4);
1195 |             GF256_M128 h0 = _mm_and_si128(x0, clr_mask);
1196 |             l0 = _mm_shuffle_epi8(table_lo_y, l0);
1197 |             h0 = _mm_shuffle_epi8(table_hi_y, h0);
1198 |             _mm_storeu_si128(z16, _mm_xor_si128(l0, h0));
1199 | 
1200 |             bytes -= 16, ++x16, ++z16;
1201 |         } while (bytes >= 16);
1202 |     }
1203 | #endif
1204 | 
1205 |     uint8_t * GF256_RESTRICT z1 = reinterpret_cast<uint8_t*>(z16);
1206 |     const uint8_t * GF256_RESTRICT x1 = reinterpret_cast<const uint8_t*>(x16);
1207 |     const uint8_t * GF256_RESTRICT table = GF256Ctx.GF256_MUL_TABLE + ((unsigned)y << 8);
1208 | 
1209 |     // Handle blocks of 8 bytes
1210 |     while (bytes >= 8)
1211 |     {
1212 |         uint64_t * GF256_RESTRICT z8 = reinterpret_cast<uint64_t *>(z1);
1213 | #ifdef GF256_IS_BIG_ENDIAN
1214 |         uint64_t word = (uint64_t)table[x1[0]] << 56;
1215 |         word |= (uint64_t)table[x1[1]] << 48;
1216 |         word |= (uint64_t)table[x1[2]] << 40;
1217 |         word |= (uint64_t)table[x1[3]] << 32;
1218 |         word |= (uint64_t)table[x1[4]] << 24;
1219 |         word |= (uint64_t)table[x1[5]] << 16;
1220 |         word |= (uint64_t)table[x1[6]] << 8;
1221 |         word |= (uint64_t)table[x1[7]];
1222 | #else
1223 |         uint64_t word = table[x1[0]];
1224 |         word |= (uint64_t)table[x1[1]] << 8;
1225 |         word |= (uint64_t)table[x1[2]] << 16;
1226 |         word |= (uint64_t)table[x1[3]] << 24;
1227 |         word |= (uint64_t)table[x1[4]] << 32;
1228 |         word |= (uint64_t)table[x1[5]] << 40;
1229 |         word |= (uint64_t)table[x1[6]] << 48;
1230 |         word |= (uint64_t)table[x1[7]] << 56;
1231 | #endif
1232 |         *z8 = word;
1233 | 
1234 |         bytes -= 8, x1 += 8, z1 += 8;
1235 |     }
1236 | 
1237 |     // Handle a block of 4 bytes
1238 |     const int four = bytes & 4;
1239 |     if (four)
1240 |     {
1241 |         uint32_t * GF256_RESTRICT z4 = reinterpret_cast<uint32_t *>(z1);
1242 | #ifdef GF256_IS_BIG_ENDIAN
1243 |         uint32_t word = (uint32_t)table[x1[0]] << 24;
1244 |         word |= (uint32_t)table[x1[1]] << 16;
1245 |         word |= (uint32_t)table[x1[2]] << 8;
1246 |         word |= (uint32_t)table[x1[3]];
1247 | #else
1248 |         uint32_t word = table[x1[0]];
1249 |         word |= (uint32_t)table[x1[1]] << 8;
1250 |         word |= (uint32_t)table[x1[2]] << 16;
1251 |         word |= (uint32_t)table[x1[3]] << 24;
1252 | #endif
1253 |         *z4 = word;
1254 |     }
1255 | 
1256 |     // Handle single bytes
1257 |     const int offset = four;
1258 |     switch (bytes & 3)
1259 |     {
1260 |     case 3: z1[offset + 2] = table[x1[offset + 2]];
1261 |     case 2: z1[offset + 1] = table[x1[offset + 1]];
1262 |     case 1: z1[offset] = table[x1[offset]];
1263 |     default:
1264 |         break;
1265 |     }
1266 | }
1267 | 
1268 | extern "C" void gf256_muladd_mem(void * GF256_RESTRICT vz, uint8_t y,
1269 |                                  const void * GF256_RESTRICT vx, int bytes)
1270 | {
1271 |     // Use a single if-statement to handle special cases
1272 |     if (y <= 1)
1273 |     {
1274 |         if (y == 1)
1275 |             gf256_add_mem(vz, vx, bytes);
1276 |         return;
1277 |     }
1278 | 
1279 |     GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast<GF256_M128 *>(vz);
1280 |     const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128 *>(vx);
1281 | 
1282 | #if defined(GF256_TARGET_MOBILE)
1283 | #if defined(GF256_TRY_NEON)
1284 |     if (bytes >= 16 && CpuHasNeon)
1285 |     {
1286 |         // Partial product tables; see above
1287 |         const GF256_M128 table_lo_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_LO_Y + y));
1288 |         const GF256_M128 table_hi_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_HI_Y + y));
1289 | 
1290 |         // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
1291 |         const GF256_M128 clr_mask = vdupq_n_u8(0x0f);
1292 | 
1293 |         // Handle multiples of 16 bytes
1294 |         do
1295 |         {
1296 |             // See above comments for details
1297 |             GF256_M128 x0 = vld1q_u8((uint8_t*)x16);
1298 |             GF256_M128 l0 = vandq_u8(x0, clr_mask);
1299 | 
1300 |             // x0 = vshrq_n_u8(x0, 4);
1301 |             x0 = (GF256_M128)vshrq_n_u64( (uint64x2_t)x0, 4);
1302 |             GF256_M128 h0 = vandq_u8(x0, clr_mask);
1303 |             l0 = vqtbl1q_u8(table_lo_y, l0);
1304 |             h0 = vqtbl1q_u8(table_hi_y, h0);
1305 |             const GF256_M128 p0 = veorq_u8(l0, h0);
1306 |             const GF256_M128 z0 = vld1q_u8((uint8_t*)z16);
1307 |             vst1q_u8((uint8_t*)z16, veorq_u8(p0, z0));
1308 |             bytes -= 16, ++x16, ++z16;
1309 |         } while (bytes >= 16);
1310 |     }
1311 | #endif
1312 | #else // GF256_TARGET_MOBILE
1313 | # if defined(GF256_TRY_AVX2)
1314 |     if (bytes >= 32 && CpuHasAVX2)
1315 |     {
1316 |         // Partial product tables; see above
1317 |         const GF256_M256 table_lo_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_LO_Y + y);
1318 |         const GF256_M256 table_hi_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_HI_Y + y);
1319 | 
1320 |         // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
1321 |         const GF256_M256 clr_mask = _mm256_set1_epi8(0x0f);
1322 | 
1323 |         GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast<GF256_M256 *>(z16);
1324 |         const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast<const GF256_M256 *>(x16);
1325 | 
1326 |         // On my Reed Solomon codec, the encoder unit test runs in 640 usec without and 550 usec with the optimization (86% of the original time)
1327 |         const unsigned count = bytes / 64;
1328 |         for (unsigned i = 0; i < count; ++i)
1329 |         {
1330 |             // See above comments for details
1331 |             GF256_M256 x0 = _mm256_loadu_si256(x32 + i * 2);
1332 |             GF256_M256 l0 = _mm256_and_si256(x0, clr_mask);
1333 |             x0 = _mm256_srli_epi64(x0, 4);
1334 |             const GF256_M256 z0 = _mm256_loadu_si256(z32 + i * 2);
1335 |             GF256_M256 h0 = _mm256_and_si256(x0, clr_mask);
1336 |             l0 = _mm256_shuffle_epi8(table_lo_y, l0);
1337 |             h0 = _mm256_shuffle_epi8(table_hi_y, h0);
1338 |             const GF256_M256 p0 = _mm256_xor_si256(l0, h0);
1339 |             _mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(p0, z0));
1340 | 
1341 |             GF256_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1);
1342 |             GF256_M256 l1 = _mm256_and_si256(x1, clr_mask);
1343 |             x1 = _mm256_srli_epi64(x1, 4);
1344 |             const GF256_M256 z1 = _mm256_loadu_si256(z32 + i * 2 + 1);
1345 |             GF256_M256 h1 = _mm256_and_si256(x1, clr_mask);
1346 |             l1 = _mm256_shuffle_epi8(table_lo_y, l1);
1347 |             h1 = _mm256_shuffle_epi8(table_hi_y, h1);
1348 |             const GF256_M256 p1 = _mm256_xor_si256(l1, h1);
1349 |             _mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(p1, z1));
1350 |         }
1351 |         bytes -= count * 64;
1352 |         z32 += count * 2;
1353 |         x32 += count * 2;
1354 | 
1355 |         if (bytes >= 32)
1356 |         {
1357 |             GF256_M256 x0 = _mm256_loadu_si256(x32);
1358 |             GF256_M256 l0 = _mm256_and_si256(x0, clr_mask);
1359 |             x0 = _mm256_srli_epi64(x0, 4);
1360 |             GF256_M256 h0 = _mm256_and_si256(x0, clr_mask);
1361 |             l0 = _mm256_shuffle_epi8(table_lo_y, l0);
1362 |             h0 = _mm256_shuffle_epi8(table_hi_y, h0);
1363 |             const GF256_M256 p0 = _mm256_xor_si256(l0, h0);
1364 |             const GF256_M256 z0 = _mm256_loadu_si256(z32);
1365 |             _mm256_storeu_si256(z32, _mm256_xor_si256(p0, z0));
1366 | 
1367 |             bytes -= 32;
1368 |             z32++;
1369 |             x32++;
1370 |         }
1371 | 
1372 |         z16 = reinterpret_cast<GF256_M128 *>(z32);
1373 |         x16 = reinterpret_cast<const GF256_M128 *>(x32);
1374 |     }
1375 | # endif // GF256_TRY_AVX2
1376 |     if (bytes >= 16 && CpuHasSSSE3)
1377 |     {
1378 |         // Partial product tables; see above
1379 |         const GF256_M128 table_lo_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_LO_Y + y);
1380 |         const GF256_M128 table_hi_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_HI_Y + y);
1381 | 
1382 |         // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
1383 |         const GF256_M128 clr_mask = _mm_set1_epi8(0x0f);
1384 | 
1385 |         // This unroll seems to provide about 7% speed boost when AVX2 is disabled
1386 |         while (bytes >= 32)
1387 |         {
1388 |             bytes -= 32;
1389 | 
1390 |             GF256_M128 x1 = _mm_loadu_si128(x16 + 1);
1391 |             GF256_M128 l1 = _mm_and_si128(x1, clr_mask);
1392 |             x1 = _mm_srli_epi64(x1, 4);
1393 |             GF256_M128 h1 = _mm_and_si128(x1, clr_mask);
1394 |             l1 = _mm_shuffle_epi8(table_lo_y, l1);
1395 |             h1 = _mm_shuffle_epi8(table_hi_y, h1);
1396 |             const GF256_M128 z1 = _mm_loadu_si128(z16 + 1);
1397 | 
1398 |             GF256_M128 x0 = _mm_loadu_si128(x16);
1399 |             GF256_M128 l0 = _mm_and_si128(x0, clr_mask);
1400 |             x0 = _mm_srli_epi64(x0, 4);
1401 |             GF256_M128 h0 = _mm_and_si128(x0, clr_mask);
1402 |             l0 = _mm_shuffle_epi8(table_lo_y, l0);
1403 |             h0 = _mm_shuffle_epi8(table_hi_y, h0);
1404 |             const GF256_M128 z0 = _mm_loadu_si128(z16);
1405 | 
1406 |             const GF256_M128 p1 = _mm_xor_si128(l1, h1);
1407 |             _mm_storeu_si128(z16 + 1, _mm_xor_si128(p1, z1));
1408 | 
1409 |             const GF256_M128 p0 = _mm_xor_si128(l0, h0);
1410 |             _mm_storeu_si128(z16, _mm_xor_si128(p0, z0));
1411 | 
1412 |             x16 += 2, z16 += 2;
1413 |         }
1414 | 
1415 |         // Handle multiples of 16 bytes
1416 |         while (bytes >= 16)
1417 |         {
1418 |             // See above comments for details
1419 |             GF256_M128 x0 = _mm_loadu_si128(x16);
1420 |             GF256_M128 l0 = _mm_and_si128(x0, clr_mask);
1421 |             x0 = _mm_srli_epi64(x0, 4);
1422 |             GF256_M128 h0 = _mm_and_si128(x0, clr_mask);
1423 |             l0 = _mm_shuffle_epi8(table_lo_y, l0);
1424 |             h0 = _mm_shuffle_epi8(table_hi_y, h0);
1425 |             const GF256_M128 p0 = _mm_xor_si128(l0, h0);
1426 |             const GF256_M128 z0 = _mm_loadu_si128(z16);
1427 |             _mm_storeu_si128(z16, _mm_xor_si128(p0, z0));
1428 | 
1429 |             bytes -= 16, ++x16, ++z16;
1430 |         }
1431 |     }
1432 | #endif // GF256_TARGET_MOBILE
1433 | 
1434 |     uint8_t * GF256_RESTRICT z1 = reinterpret_cast<uint8_t*>(z16);
1435 |     const uint8_t * GF256_RESTRICT x1 = reinterpret_cast<const uint8_t*>(x16);
1436 |     const uint8_t * GF256_RESTRICT table = GF256Ctx.GF256_MUL_TABLE + ((unsigned)y << 8);
1437 | 
1438 |     // Handle blocks of 8 bytes
1439 |     while (bytes >= 8)
1440 |     {
1441 |         uint64_t * GF256_RESTRICT z8 = reinterpret_cast<uint64_t *>(z1);
1442 | #ifdef GF256_IS_BIG_ENDIAN
1443 |         uint64_t word = (uint64_t)table[x1[0]] << 56;
1444 |         word |= (uint64_t)table[x1[1]] << 48;
1445 |         word |= (uint64_t)table[x1[2]] << 40;
1446 |         word |= (uint64_t)table[x1[3]] << 32;
1447 |         word |= (uint64_t)table[x1[4]] << 24;
1448 |         word |= (uint64_t)table[x1[5]] << 16;
1449 |         word |= (uint64_t)table[x1[6]] << 8;
1450 |         word |= (uint64_t)table[x1[7]];
1451 | #else
1452 |         uint64_t word = table[x1[0]];
1453 |         word |= (uint64_t)table[x1[1]] << 8;
1454 |         word |= (uint64_t)table[x1[2]] << 16;
1455 |         word |= (uint64_t)table[x1[3]] << 24;
1456 |         word |= (uint64_t)table[x1[4]] << 32;
1457 |         word |= (uint64_t)table[x1[5]] << 40;
1458 |         word |= (uint64_t)table[x1[6]] << 48;
1459 |         word |= (uint64_t)table[x1[7]] << 56;
1460 | #endif
1461 |         *z8 ^= word;
1462 | 
1463 |         bytes -= 8, x1 += 8, z1 += 8;
1464 |     }
1465 | 
1466 |     // Handle a block of 4 bytes
1467 |     const int four = bytes & 4;
1468 |     if (four)
1469 |     {
1470 |         uint32_t * GF256_RESTRICT z4 = reinterpret_cast<uint32_t *>(z1);
1471 | #ifdef GF256_IS_BIG_ENDIAN
1472 |         uint32_t word = (uint32_t)table[x1[0]] << 24;
1473 |         word |= (uint32_t)table[x1[1]] << 16;
1474 |         word |= (uint32_t)table[x1[2]] << 8;
1475 |         word |= (uint32_t)table[x1[3]];
1476 | #else
1477 |         uint32_t word = table[x1[0]];
1478 |         word |= (uint32_t)table[x1[1]] << 8;
1479 |         word |= (uint32_t)table[x1[2]] << 16;
1480 |         word |= (uint32_t)table[x1[3]] << 24;
1481 | #endif
1482 |         *z4 ^= word;
1483 |     }
1484 | 
1485 |     // Handle single bytes
1486 |     const int offset = four;
1487 |     switch (bytes & 3)
1488 |     {
1489 |     case 3: z1[offset + 2] ^= table[x1[offset + 2]];
1490 |     case 2: z1[offset + 1] ^= table[x1[offset + 1]];
1491 |     case 1: z1[offset] ^= table[x1[offset]];
1492 |     default:
1493 |         break;
1494 |     }
1495 | }
1496 | 
1497 | extern "C" void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes)
1498 | {
1499 | #if defined(GF256_TARGET_MOBILE)
1500 |     uint64_t * GF256_RESTRICT x16 = reinterpret_cast<uint64_t *>(vx);
1501 |     uint64_t * GF256_RESTRICT y16 = reinterpret_cast<uint64_t *>(vy);
1502 | 
1503 |     const unsigned count = (unsigned)bytes / 8;
1504 |     for (unsigned ii = 0; ii < count; ++ii)
1505 |     {
1506 |         const uint64_t temp = x16[ii];
1507 |         x16[ii] = y16[ii];
1508 |         y16[ii] = temp;
1509 |     }
1510 | 
1511 |     x16 += count;
1512 |     y16 += count;
1513 |     bytes -= count * 8;
1514 | #else
1515 |     GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<GF256_M128 *>(vx);
1516 |     GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast<GF256_M128 *>(vy);
1517 | 
1518 |     // Handle blocks of 16 bytes
1519 |     while (bytes >= 16)
1520 |     {
1521 |         GF256_M128 x0 = _mm_loadu_si128(x16);
1522 |         GF256_M128 y0 = _mm_loadu_si128(y16);
1523 |         _mm_storeu_si128(x16, y0);
1524 |         _mm_storeu_si128(y16, x0);
1525 | 
1526 |         bytes -= 16, ++x16, ++y16;
1527 |     }
1528 | #endif
1529 | 
1530 |     uint8_t * GF256_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
1531 |     uint8_t * GF256_RESTRICT y1 = reinterpret_cast<uint8_t *>(y16);
1532 | 
1533 |     // Handle a block of 8 bytes
1534 |     const int eight = bytes & 8;
1535 |     if (eight)
1536 |     {
1537 |         uint64_t * GF256_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
1538 |         uint64_t * GF256_RESTRICT y8 = reinterpret_cast<uint64_t *>(y1);
1539 | 
1540 |         uint64_t temp = *x8;
1541 |         *x8 = *y8;
1542 |         *y8 = temp;
1543 |     }
1544 | 
1545 |     // Handle a block of 4 bytes
1546 |     const int four = bytes & 4;
1547 |     if (four)
1548 |     {
1549 |         uint32_t * GF256_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
1550 |         uint32_t * GF256_RESTRICT y4 = reinterpret_cast<uint32_t *>(y1 + eight);
1551 | 
1552 |         uint32_t temp = *x4;
1553 |         *x4 = *y4;
1554 |         *y4 = temp;
1555 |     }
1556 | 
1557 |     // Handle final bytes
1558 |     const int offset = eight + four;
1559 |     uint8_t temp;
1560 |     switch (bytes & 3)
1561 |     {
1562 |     case 3: temp = x1[offset + 2]; x1[offset + 2] = y1[offset + 2]; y1[offset + 2] = temp;
1563 |     case 2: temp = x1[offset + 1]; x1[offset + 1] = y1[offset + 1]; y1[offset + 1] = temp;
1564 |     case 1: temp = x1[offset]; x1[offset] = y1[offset]; y1[offset] = temp;
1565 |     default:
1566 |         break;
1567 |     }
1568 | }
1569 | 


--------------------------------------------------------------------------------
/unit_test/Debug/matrix_test.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/catid/cm256/634d1e36b095dbba937586a7e9843d828762dae4/unit_test/Debug/matrix_test.exe


--------------------------------------------------------------------------------
/unit_test/Release/matrix_test.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/catid/cm256/634d1e36b095dbba937586a7e9843d828762dae4/unit_test/Release/matrix_test.exe


--------------------------------------------------------------------------------
/unit_test/SiameseTools.cpp:
--------------------------------------------------------------------------------
  1 | /** \file
  2 |     \brief Siamese FEC Implementation: Tools
  3 |     \copyright Copyright (c) 2017 Christopher A. Taylor.  All rights reserved.
  4 | 
  5 |     Redistribution and use in source and binary forms, with or without
  6 |     modification, are permitted provided that the following conditions are met:
  7 | 
  8 |     * Redistributions of source code must retain the above copyright notice,
  9 |       this list of conditions and the following disclaimer.
 10 |     * Redistributions in binary form must reproduce the above copyright notice,
 11 |       this list of conditions and the following disclaimer in the documentation
 12 |       and/or other materials provided with the distribution.
 13 |     * Neither the name of Siamese nor the names of its contributors may be
 14 |       used to endorse or promote products derived from this software without
 15 |       specific prior written permission.
 16 | 
 17 |     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 18 |     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 19 |     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 20 |     ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 21 |     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 22 |     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 23 |     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 24 |     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 25 |     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 26 |     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 27 |     POSSIBILITY OF SUCH DAMAGE.
 28 | */
 29 | 
 30 | #include "SiameseTools.h"
 31 | 
 32 | #ifdef _WIN32
 33 |     #ifndef NOMINMAX
 34 |         #define NOMINMAX
 35 |     #endif
 36 |     #include <windows.h>
 37 | #elif __MACH__
 38 |     #include <mach/mach_time.h>
 39 |     #include <mach/mach.h>
 40 |     #include <mach/clock.h>
 41 | 
 42 |     extern mach_port_t clock_port;
 43 | #else
 44 |     #include <time.h>
 45 |     #include <sys/time.h>
 46 | #endif
 47 | 
 48 | namespace siamese {
 49 | 
 50 | 
 51 | //------------------------------------------------------------------------------
 52 | // Timing
 53 | 
 54 | #ifdef _WIN32
 55 | // Precomputed frequency inverse
 56 | static double PerfFrequencyInverseUsec = 0.;
 57 | static double PerfFrequencyInverseMsec = 0.;
 58 | 
 59 | static void InitPerfFrequencyInverse()
 60 | {
 61 |     LARGE_INTEGER freq = {};
 62 |     if (!::QueryPerformanceFrequency(&freq) || freq.QuadPart == 0)
 63 |         return;
 64 |     const double invFreq = 1. / (double)freq.QuadPart;
 65 |     PerfFrequencyInverseUsec = 1000000. * invFreq;
 66 |     PerfFrequencyInverseMsec = 1000. * invFreq;
 67 |     SIAMESE_DEBUG_ASSERT(PerfFrequencyInverseUsec > 0.);
 68 |     SIAMESE_DEBUG_ASSERT(PerfFrequencyInverseMsec > 0.);
 69 | }
 70 | #elif __MACH__
 71 | static bool m_clock_serv_init = false;
 72 | static clock_serv_t m_clock_serv = 0;
 73 | 
 74 | static void InitClockServ()
 75 | {
 76 |     m_clock_serv_init = true;
 77 |     host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &m_clock_serv);
 78 | }
 79 | #endif // _WIN32
 80 | // Current time in microseconds; returns 0 if the Windows or Mach clock query fails
 81 | uint64_t GetTimeUsec()
 82 | {
 83 | #ifdef _WIN32
 84 |     LARGE_INTEGER timeStamp = {};
 85 |     if (!::QueryPerformanceCounter(&timeStamp))
 86 |         return 0;
 87 |     if (PerfFrequencyInverseUsec == 0.)
 88 |         InitPerfFrequencyInverse();
 89 |     return (uint64_t)(PerfFrequencyInverseUsec * timeStamp.QuadPart);
 90 | #elif __MACH__
 91 |     if (!m_clock_serv_init)
 92 |         InitClockServ();
 93 |     mach_timespec_t tv = {};
 94 |     if (clock_get_time(m_clock_serv, &tv) != KERN_SUCCESS)
 95 |         return 0;
 96 |     // Widen before scaling: tv_sec is a 32-bit field, so 1000000 * tv_sec wraps after ~71 minutes
 97 |     return (uint64_t)tv.tv_sec * 1000000 + (uint64_t)tv.tv_nsec / 1000;
 98 | #else
 99 |     struct timeval tv = {};
100 |     gettimeofday(&tv, nullptr);
101 |     return (uint64_t)tv.tv_sec * 1000000 + (uint64_t)tv.tv_usec; // 64-bit math even if time_t is 32-bit
102 | #endif
103 | }
104 | // Millisecond counterpart of GetTimeUsec(); yields 0 if the Windows counter query fails
105 | uint64_t GetTimeMsec()
106 | {
107 | #ifndef _WIN32
108 |     // Non-Windows builds simply scale down the microsecond clock
109 |     const uint64_t usec = GetTimeUsec();
110 |     return usec / 1000;
111 | #else
112 |     LARGE_INTEGER ticks = {};
113 |     const bool haveTicks = ::QueryPerformanceCounter(&ticks) != 0;
114 |     if (haveTicks && PerfFrequencyInverseMsec == 0.)
115 |         InitPerfFrequencyInverse(); // Lazily compute the tick-to-msec multiplier
116 |     return haveTicks ? (uint64_t)(PerfFrequencyInverseMsec * ticks.QuadPart) : 0;
117 | #endif
118 | }
119 | 
120 | 
121 | } // namespace siamese
122 | 


--------------------------------------------------------------------------------
/unit_test/SiameseTools.h:
--------------------------------------------------------------------------------
  1 | /** \file
  2 |     \brief Siamese FEC Implementation: Tools
  3 |     \copyright Copyright (c) 2017 Christopher A. Taylor.  All rights reserved.
  4 | 
  5 |     Redistribution and use in source and binary forms, with or without
  6 |     modification, are permitted provided that the following conditions are met:
  7 | 
  8 |     * Redistributions of source code must retain the above copyright notice,
  9 |       this list of conditions and the following disclaimer.
 10 |     * Redistributions in binary form must reproduce the above copyright notice,
 11 |       this list of conditions and the following disclaimer in the documentation
 12 |       and/or other materials provided with the distribution.
 13 |     * Neither the name of Siamese nor the names of its contributors may be
 14 |       used to endorse or promote products derived from this software without
 15 |       specific prior written permission.
 16 | 
 17 |     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 18 |     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 19 |     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 20 |     ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 21 |     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 22 |     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 23 |     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 24 |     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 25 |     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 26 |     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 27 |     POSSIBILITY OF SUCH DAMAGE.
 28 | */
 29 | 
 30 | #pragma once
 31 | 
 32 | /**
 33 |     Tools:
 34 | 
 35 |     + System headers
 36 |     + Debug breakpoints/asserts
 37 |     + Compiler-specific code wrappers
 38 |     + PCGRandom implementation
 39 |     + Microsecond timing
 40 |     + Windowed minimum/maximum
 41 | */
 42 | 
 43 | #include <stdint.h> // uint32_t
 44 | #include <string.h> // memcpy
 45 | #include <new> // std::nothrow
 46 | 
 47 | 
 48 | //------------------------------------------------------------------------------
 49 | // Portability macros
 50 | 
 51 | // Compiler-specific debug break
 52 | #if defined(_DEBUG) || defined(DEBUG)
 53 |     #define SIAMESE_DEBUG
 54 |     #ifdef _WIN32
 55 |         #define SIAMESE_DEBUG_BREAK() __debugbreak()
 56 |     #else
 57 |         #define SIAMESE_DEBUG_BREAK() __builtin_trap()
 58 |     #endif
 59 |     #define SIAMESE_DEBUG_ASSERT(cond) { if (!(cond)) { SIAMESE_DEBUG_BREAK(); } }
 60 | #else
 61 |     #define SIAMESE_DEBUG_BREAK() do {} while (false);
 62 |     #define SIAMESE_DEBUG_ASSERT(cond) do {} while (false);
 63 | #endif
 64 | 
 65 | // Compiler-specific force inline keyword
 66 | #ifdef _MSC_VER
 67 |     #define SIAMESE_FORCE_INLINE inline __forceinline
 68 | #else
 69 |     #define SIAMESE_FORCE_INLINE inline __attribute__((always_inline))
 70 | #endif
 71 | 
 72 | 
 73 | namespace siamese {
 74 | 
 75 | 
 76 | //------------------------------------------------------------------------------
 77 | // PCG PRNG
 78 | 
 79 | /// PCG32 generator from http://www.pcg-random.org/ ; Seed(y, x) picks stream y and state offset x
 80 | class PCGRandom
 81 | {
 82 | public:
 83 |     void Seed(uint64_t y, uint64_t x = 0)
 84 |     {
 85 |         Inc = (y << 1u) | 1u; // Increment is forced odd
 86 |         State = 0;
 87 |         Next();
 88 |         State += x;
 89 |         Next();
 90 |     }
 91 |     /// Steps the generator once and returns 32 bits computed from the pre-step state
 92 |     uint32_t Next()
 93 |     {
 94 |         const uint64_t prev = State;
 95 |         const uint32_t spin = (uint32_t)(prev >> 59);
 96 |         const uint32_t mixed = (uint32_t)((prev ^ (prev >> 18)) >> 27);
 97 |         State = prev * UINT64_C(6364136223846793005) + Inc;
 98 |         return (mixed << ((0u - spin) & 31)) | (mixed >> spin);
 99 |     }
100 |     /// Generator state and stream increment; both default to 0
101 |     uint64_t State = 0, Inc = 0;
102 | };
103 | 
104 | 
105 | //------------------------------------------------------------------------------
106 | // Timing
107 | 
108 | /// Microsecond-accurate platform independent high-resolution timer
109 | uint64_t GetTimeUsec();
110 | 
111 | /// Millisecond-accurate platform independent high-resolution timer
112 | uint64_t GetTimeMsec();
113 | 
114 | 
115 | //------------------------------------------------------------------------------
116 | // WindowedMinMax
117 | /// Comparator that selects a windowed minimum: reports true when x <= y
118 | template<typename T> struct WindowedMinCompare
119 | {
120 |     SIAMESE_FORCE_INLINE bool operator()(const T x, const T y) const
121 |     {
122 |         return x <= y; // Equal values also compare true
123 |     }
124 | };
125 | /// Comparator that selects a windowed maximum: reports true when x >= y
126 | template<typename T> struct WindowedMaxCompare
127 | {
128 |     SIAMESE_FORCE_INLINE bool operator()(const T x, const T y) const
129 |     {
130 |         return x >= y; // Equal values also compare true
131 |     }
132 | };
133 | 
134 | /// Templated class that calculates a running windowed minimum or maximum with
135 | /// a fixed time and resource cost.  It keeps kSampleCount candidate samples; Samples[0] is the current best.
136 | template<typename T, class CompareT> class WindowedMinMax
137 | {
138 | public:
139 |     typedef uint64_t TimeT; ///< Type used for timestamps and window lengths
140 |     CompareT Compare; ///< Predicate called as Compare(newValue, storedValue) by Update()
141 | 
142 |     struct Sample
143 |     {
144 |         /// Sample value
145 |         T Value;
146 | 
147 |         /// Timestamp of data collection
148 |         TimeT Timestamp;
149 | 
150 | 
151 |         /// Initializing constructor; value and timestamp both default to 0
152 |         explicit Sample(T value = 0, TimeT timestamp = 0)
153 |             : Value(value)
154 |             , Timestamp(timestamp)
155 |         {
156 |         }
157 | 
158 |         /// True when more than `timeout` separates Timestamp from `now` (unsigned difference)
159 |         inline bool TimeoutExpired(TimeT now, TimeT timeout)
160 |         {
161 |             return (TimeT)(now - Timestamp) > timeout;
162 |         }
163 |     };
164 | 
165 |     /// Number of candidate samples kept
166 |     static const unsigned kSampleCount = 3;
167 |     /// Candidate samples; Samples[0] is the one GetBest() reports
168 |     Sample Samples[kSampleCount];
169 | 
170 |     /// True when the best value is nonzero, so a best value of exactly 0 reads the same as "no data"
171 |     bool IsValid() const
172 |     {
173 |         return Samples[0].Value != 0; ///< ish
174 |     }
175 |     /// Value of the current best sample
176 |     T GetBest() const
177 |     {
178 |         return Samples[0].Value;
179 |     }
180 |     /// Overwrites all three slots with `sample` (default: value 0, timestamp 0)
181 |     void Reset(const Sample sample = Sample())
182 |     {
183 |         Samples[0] = Samples[1] = Samples[2] = sample;
184 |     }
185 |     /// Feeds one observation; `windowLengthTime` is compared against differences of `timestamp` values
186 |     void Update(T value, TimeT timestamp, const TimeT windowLengthTime)
187 |     {
188 |         const Sample sample(value, timestamp);
189 | 
190 |         // On the first sample, new best sample, or if window length has expired:
191 |         if (!IsValid() ||
192 |             Compare(value, Samples[0].Value) ||
193 |             Samples[2].TimeoutExpired(sample.Timestamp, windowLengthTime))
194 |         {
195 |             Reset(sample);
196 |             return;
197 |         }
198 | 
199 |         // Insert the new value into the sorted array (slot 0 is already known to beat it)
200 |         if (Compare(value, Samples[1].Value))
201 |             Samples[2] = Samples[1] = sample;
202 |         else if (Compare(value, Samples[2].Value))
203 |             Samples[2] = sample;
204 | 
205 |         // Expire best if it has been the best for longer than the window
206 |         if (Samples[0].TimeoutExpired(sample.Timestamp, windowLengthTime))
207 |         {
208 |             // Also expire the next best if needed
209 |             if (Samples[1].TimeoutExpired(sample.Timestamp, windowLengthTime))
210 |             {
211 |                 Samples[0] = Samples[2];
212 |                 Samples[1] = sample;
213 |             }
214 |             else
215 |             {
216 |                 Samples[0] = Samples[1];
217 |                 Samples[1] = Samples[2];
218 |             }
219 |             Samples[2] = sample;
220 |             return;
221 |         }
222 | 
223 |         // Slot 1 still equals the best and is over a quarter window old - restart slots 1 and 2 from this sample
224 |         if (Samples[1].Value == Samples[0].Value &&
225 |             Samples[1].TimeoutExpired(sample.Timestamp, windowLengthTime / 4))
226 |         {
227 |             Samples[2] = Samples[1] = sample;
228 |             return;
229 |         }
230 | 
231 |         // Slot 2 still equals slot 1 and is over half a window old - restart slot 2 from this sample
232 |         if (Samples[2].Value == Samples[1].Value &&
233 |             Samples[2].TimeoutExpired(sample.Timestamp, windowLengthTime / 2))
234 |         {
235 |             Samples[2] = sample;
236 |         }
237 |     }
238 | };
239 | 
240 | 
241 | } // namespace siamese
242 | 


--------------------------------------------------------------------------------
/unit_test/main.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | 	Copyright (c) 2015 Christopher A. Taylor.  All rights reserved.
  3 | 
  4 | 	Redistribution and use in source and binary forms, with or without
  5 | 	modification, are permitted provided that the following conditions are met:
  6 | 
  7 | 	* Redistributions of source code must retain the above copyright notice,
  8 | 	  this list of conditions and the following disclaimer.
  9 | 	* Redistributions in binary form must reproduce the above copyright notice,
 10 | 	  this list of conditions and the following disclaimer in the documentation
 11 | 	  and/or other materials provided with the distribution.
 12 | 	* Neither the name of CM256 nor the names of its contributors may be
 13 | 	  used to endorse or promote products derived from this software without
 14 | 	  specific prior written permission.
 15 | 
 16 | 	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 17 | 	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | 	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 19 | 	ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 20 | 	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 21 | 	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 22 | 	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 23 | 	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 24 | 	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 25 | 	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 26 | 	POSSIBILITY OF SUCH DAMAGE.
 27 | */
 28 | 
 29 | #ifdef _MSC_VER
 30 |     #pragma warning(push)
 31 |     #pragma warning(disable: 4530) // warning C4530: C++ exception handler used, but unwind semantics are not enabled. Specify /EHsc
 32 | #endif
 33 | 
 34 | #include <iostream>
 35 | using namespace std;
 36 | 
 37 | #ifdef _MSC_VER
 38 |     #pragma warning(pop)
 39 | #endif
 40 | 
 41 | #include "cm256.h"
 42 | #include "SiameseTools.h"
 43 | 
 44 | #ifdef _WIN32
 45 | #define WIN32_LEAN_AND_MEAN
 46 | #include <Windows.h>
 47 | #endif
 48 | 
 49 | #include <chrono>
 50 | #include <thread>
 51 | // Fills the first blockCount blocks with the pattern byte[j] = (uint8_t)(i + 13 * j) for block i
 52 | void initializeBlocks(cm256_block originals[256], int blockCount, int blockBytes)
 53 | {
 54 |     for (int blockIdx = 0; blockIdx < blockCount; ++blockIdx)
 55 |     {
 56 |         cm256_block& target = originals[blockIdx];
 57 |         for (int offset = 0; offset < blockBytes; ++offset)
 58 |         {
 59 |             uint8_t* const bytes = (uint8_t*)target.Block;
 60 |             bytes[offset] = (uint8_t)(blockIdx + offset * 13);
 61 |         }
 62 |     }
 63 | }
 64 | // True when blocks[] holds every index below blockCount exactly once, each with its expected byte pattern
 65 | bool validateSolution(cm256_block_t* blocks, int blockCount, int blockBytes)
 66 | {
 67 |     // One flag per possible block index; set once that index has been encountered
 68 |     bool visited[256] = { false };
 69 | 
 70 |     for (int slot = 0; slot < blockCount; ++slot)
 71 |     {
 72 |         const cm256_block_t& candidate = blocks[slot];
 73 |         const uint8_t index = candidate.Index;
 74 | 
 75 |         // Reject indices outside the original set as well as repeated indices
 76 |         if (index >= blockCount || visited[index])
 77 |         {
 78 |             return false;
 79 |         }
 80 | 
 81 |         visited[index] = true;
 82 | 
 83 |         // Every byte must equal (uint8_t)(index + 13 * offset)
 84 |         for (int offset = 0; offset < blockBytes; ++offset)
 85 |         {
 86 |             const uint8_t* const bytes = (const uint8_t*)candidate.Block;
 87 |             const uint8_t wanted = (uint8_t)(index + offset * 13);
 88 | 
 89 |             if (bytes[offset] != wanted)
 90 |             {
 91 |                 return false;
 92 |             }
 93 |         }
 94 |     }
 95 | 
 96 |     return true;
 97 | }
 99 | 
100 | // Example round trip: encode a 100-block buffer, replace 30 originals with recovery blocks, decode, verify.
101 | bool ExampleFileUsage()
102 | {
103 |     if (cm256_init())
104 |     {
105 |         return false;
106 |     }
107 | 
108 |     cm256_encoder_params params;
109 | 
110 |     // Number of bytes per file block
111 |     params.BlockBytes = 1296;
112 | 
113 |     // Number of blocks
114 |     params.OriginalCount = 100;
115 | 
116 |     // Number of additional recovery blocks generated by encoder
117 |     params.RecoveryCount = 30;
118 | 
119 |     // Size of the original file
120 |     static const int OriginalFileBytes = params.OriginalCount * params.BlockBytes;
121 | 
122 |     // Allocate and fill the original file data
123 |     uint8_t* originalFileData = new uint8_t[OriginalFileBytes];
124 |     for (int i = 0; i < OriginalFileBytes; ++i)
125 |     {
126 |         originalFileData[i] = (uint8_t)i;
127 |     }
128 | 
129 |     // Pointers to data
130 |     cm256_block blocks[256];
131 |     for (int i = 0; i < params.OriginalCount; ++i)
132 |     {
133 |         blocks[i].Block = originalFileData + i * params.BlockBytes;
134 |     }
135 | 
136 |     // Recovery data
137 |     uint8_t* recoveryBlocks = new uint8_t[params.RecoveryCount * params.BlockBytes];
138 | 
139 |     // The round trip runs inside a lambda so every failure return still reaches the delete[] calls below
140 |     const auto roundTrip = [&]() -> bool
141 |     {
142 |         // Generate recovery data
143 |         if (cm256_encode(params, blocks, recoveryBlocks))
144 |         {
145 |             return false;
146 |         }
147 | 
148 |         // Initialize the indices
149 |         for (int i = 0; i < params.OriginalCount; ++i)
150 |         {
151 |             blocks[i].Index = cm256_get_original_block_index(params, i);
152 |         }
153 | 
154 |         // Simulate loss of data: original i is replaced by recovery block i
155 |         for (int i = 0; i < params.RecoveryCount && i < params.OriginalCount; ++i)
156 |         {
157 |             blocks[i].Block = recoveryBlocks + params.BlockBytes * i;
158 |             blocks[i].Index = cm256_get_recovery_block_index(params, i);
159 |         }
160 | 
161 |         if (cm256_decode(params, blocks))
162 |         {
163 |             return false;
164 |         }
165 | 
166 |         // Each repaired slot must now hold the bytes of the original block named by its Index
167 |         for (int i = 0; i < params.RecoveryCount && i < params.OriginalCount; ++i)
168 |         {
169 |             const uint8_t* block = (const uint8_t*)blocks[i].Block;
170 |             const int index = blocks[i].Index;
171 |             for (int j = 0; j < params.BlockBytes; ++j)
172 |             {
173 |                 if (block[j] != (uint8_t)(j + index * params.BlockBytes))
174 |                     return false;
175 |             }
176 |         }
177 |         return true;
178 |     };
179 |     const bool ok = roundTrip();
180 |     delete[] originalFileData;
181 |     delete[] recoveryBlocks;
182 |     return ok;
183 | }
184 | // Self-test for gf256_memswap: exchange two 31-byte buffers, then exchange them back
185 | bool CheckMemSwap()
186 | {
187 |     // 16 + 8 + 4 + 3 = 31 bytes per buffer
188 |     enum { kBufferBytes = 16 + 8 + 4 + 3 };
189 | 
190 |     // Distinct fill values show which buffer every byte came from
191 |     unsigned char left[kBufferBytes];
192 |     unsigned char right[kBufferBytes];
193 |     memset(left, 1, sizeof(left));
194 |     memset(right, 2, sizeof(right));
195 | 
196 |     // Pass 0 must leave the fill values exchanged; pass 1 must put them back
197 |     for (int pass = 0; pass < 2; ++pass)
198 |     {
199 |         // Exchange the two buffers in place
200 |         gf256_memswap(left, right, (int)sizeof(left));
201 | 
202 |         const unsigned char wantLeft = (unsigned char)((pass == 0) ? 2 : 1);
203 |         const unsigned char wantRight = (unsigned char)((pass == 0) ? 1 : 2);
204 | 
205 |         // Every byte of both buffers is checked, left buffer first
206 |         for (int i = 0; i < (int)sizeof(left); ++i)
207 |         {
208 |             if (left[i] != wantLeft)
209 |             {
210 |                 return false;
211 |             }
212 | 
213 |             if (right[i] != wantRight)
214 |             {
215 |                 return false;
216 |             }
217 |         }
218 |     }
219 | 
220 |     return true;
221 | }
222 | // Averages the cm256_decode time over 1000 trials (100 originals, 30 of them lost) and prints the result.
223 | bool FinerPerfTimingTest()
224 | {
225 | #ifdef _WIN32
226 |     ::SetPriorityClass(::GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
227 |     ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
228 | #endif
229 | 
230 |     std::this_thread::sleep_for(std::chrono::milliseconds(1000));
231 | 
232 |     cm256_encoder_params params;
233 |     params.BlockBytes = 1296;
234 |     params.OriginalCount = 100;
235 |     params.RecoveryCount = 30;
236 | 
237 |     unsigned char* orig_data = new unsigned char[256 * params.BlockBytes];
238 |     unsigned char* recoveryData = new unsigned char[256 * params.BlockBytes];
239 | 
240 |     uint64_t tsum = 0; // Sum of the per-trial decode times, in microseconds
241 |     const int trials = 1000;
242 | 
243 |     // Trials run inside a lambda so every failure return still reaches the cleanup (buffers, Windows priority)
244 |     const auto runTrials = [&]() -> bool
245 |     {
246 |         if (cm256_init())
247 |             return false;
248 | 
249 |         cm256_block blocks[256];
250 | 
251 |         for (int trial = 0; trial < trials; ++trial)
252 |         {
253 |             for (int i = 0; i < params.BlockBytes * params.OriginalCount; ++i)
254 |             {
255 |                 orig_data[i] = (uint8_t)i;
256 |             }
257 | 
258 |             for (int i = 0; i < params.OriginalCount; ++i)
259 |             {
260 |                 blocks[i].Block = orig_data + i * params.BlockBytes;
261 |             }
262 | 
263 |             if (cm256_encode(params, blocks, recoveryData))
264 |             {
265 |                 return false;
266 |             }
267 | 
268 |             // Initialize the indices
269 |             for (int i = 0; i < params.OriginalCount; ++i)
270 |             {
271 |                 blocks[i].Index = cm256_get_original_block_index(params, i);
272 |             }
273 | 
274 |             // Simulate loss of data: original i is replaced by recovery block i
275 |             for (int i = 0; i < params.RecoveryCount && i < params.OriginalCount; ++i)
276 |             {
277 |                 blocks[i].Block = recoveryData + params.BlockBytes * i;
278 |                 blocks[i].Index = cm256_get_recovery_block_index(params, i);
279 |             }
280 | 
281 |             // Only the decode call sits between the two timestamps
282 |             const uint64_t t0 = siamese::GetTimeUsec();
283 |             if (cm256_decode(params, blocks))
284 |             {
285 |                 return false;
286 |             }
287 |             const uint64_t t1 = siamese::GetTimeUsec();
288 |             tsum += t1 - t0;
289 | 
290 |             // Each repaired slot must hold the bytes of the original block named by its Index
291 |             for (int i = 0; i < params.RecoveryCount && i < params.OriginalCount; ++i)
292 |             {
293 |                 const uint8_t* block = (const uint8_t*)blocks[i].Block;
294 |                 const int index = blocks[i].Index;
295 |                 for (int j = 0; j < params.BlockBytes; ++j)
296 |                 {
297 |                     if (block[j] != (uint8_t)(j + index * params.BlockBytes))
298 |                         return false;
299 |                 }
300 |             }
301 |         }
302 |         return true;
303 |     };
304 |     const bool ok = runTrials();
305 |     if (ok)
306 |     {
307 |         const double opusec = tsum / static_cast<double>( trials );
308 |         const double mbps = (params.BlockBytes * params.OriginalCount / opusec);
309 |         cout << opusec << " usec, " << mbps << " MBps" << endl;
310 |     }
311 |     delete[] recoveryData;
312 |     delete[] orig_data;
313 | #ifdef _WIN32
314 |     ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_NORMAL);
315 |     ::SetPriorityClass(::GetCurrentProcess(), NORMAL_PRIORITY_CLASS);
316 | #endif
317 |     return ok;
318 | }
319 | 
320 | // Sweeps all (originalCount, recoveryCount) pairs: times one encode and one decode each and validates the output.
321 | bool BulkPerfTesting()
322 | {
323 |     if (cm256_init())
324 |     {
325 |         return false;
326 |     }
327 | 
328 |     // Byte stride between blocks in both scratch buffers (upper bound for blockBytes)
329 |     static const int MaxBlockBytes = 10000;
330 | 
331 |     unsigned char* orig_data = new unsigned char[256 * MaxBlockBytes];
332 |     unsigned char* recoveryData = new unsigned char[256 * MaxBlockBytes];
333 |     cm256_block blocks[256];
334 | 
335 |     // The sweep runs inside a lambda so that every error return still reaches the delete[] calls below
336 |     const auto runSweep = [&]() -> bool
337 |     {
338 |         // NOTE: 8 * 162 = 1296 and 12960 > MaxBlockBytes, so only one block size is exercised
339 |         for (int blockBytes = 8 * 162; blockBytes <= MaxBlockBytes; blockBytes *= 10)
340 |         {
341 |             for (int originalCount = 1; originalCount < 256; ++originalCount)
342 |             {
343 |                 for (int recoveryCount = 1; recoveryCount <= 1 + originalCount / 2 && recoveryCount <= 256 - originalCount; ++recoveryCount)
344 |                 {
345 |                     cm256_encoder_params params;
346 |                     params.BlockBytes = blockBytes;
347 |                     params.OriginalCount = originalCount;
348 |                     params.RecoveryCount = recoveryCount;
349 | 
350 |                     for (int i = 0; i < 256; ++i)
351 |                         blocks[i].Block = orig_data + i * MaxBlockBytes;
352 | 
353 |                     initializeBlocks(blocks, originalCount, blockBytes);
354 | 
355 |                     {
356 |                         const uint64_t t0 = siamese::GetTimeUsec();
357 |                         if (cm256_encode(params, blocks, recoveryData))
358 |                         {
359 |                             cout << "Encoder error" << endl;
360 |                             return false;
361 |                         }
362 |                         const uint64_t t1 = siamese::GetTimeUsec();
363 | 
364 |                         const int dt_usec = (int)static_cast<int64_t>( t1 - t0 );
365 |                         const double opusec = dt_usec;
366 |                         const double mbps = (params.BlockBytes * params.OriginalCount / opusec);
367 | 
368 |                         cout << "Encoder: " << blockBytes << " bytes k = " << originalCount << " m = " << recoveryCount << " : " << opusec << " usec, " << mbps << " MBps" << endl;
369 |                     }
370 | 
371 |                     // Fill in indices
372 |                     for (int i = 0; i < originalCount; ++i)
373 |                         blocks[i].Index = cm256_get_original_block_index(params, i);
374 | 
375 |                     // Replace the first recoveryCount originals with the recovery blocks, in reverse order
376 |                     for (int ii = 0; ii < recoveryCount; ++ii)
377 |                     {
378 |                         int erasure_index = recoveryCount - ii - 1;
379 |                         blocks[ii].Block = recoveryData + erasure_index * blockBytes;
380 |                         blocks[ii].Index = cm256_get_recovery_block_index(params, erasure_index);
381 |                     }
382 | 
383 |                     {
384 |                         const uint64_t t0 = siamese::GetTimeUsec();
385 |                         if (cm256_decode(params, blocks))
386 |                         {
387 |                             cout << "Decoder error" << endl;
388 |                             return false;
389 |                         }
390 |                         const uint64_t t1 = siamese::GetTimeUsec();
391 | 
392 |                         const int dt_usec = (int)static_cast<int64_t>( t1 - t0 );
393 |                         const double opusec = dt_usec;
394 |                         const double mbps = (params.BlockBytes * params.OriginalCount / opusec);
395 | 
396 |                         cout << "Decoder: " << blockBytes << " bytes k = " << originalCount << " m = " << recoveryCount << " : " << opusec << " usec, " << mbps << " MBps" << endl;
397 |                     }
398 | 
399 |                     if (!validateSolution(blocks, originalCount, blockBytes))
400 |                     {
401 |                         cout << "Solution invalid" << endl;
402 |                         return false;
403 |                     }
404 |                 }
405 |             }
406 |         }
407 |         return true;
408 |     };
409 |     const bool ok = runSweep();
410 |     delete[] recoveryData;
411 |     delete[] orig_data;
412 |     return ok;
413 | }
414 | 
415 | 
416 | // Runs each test stage in order; the exit code identifies the first stage that failed.
417 | int main()
418 | {
419 | #if 1 // Stage: file encode/decode example
420 |     const bool exampleOk = ExampleFileUsage();
421 |     if (!exampleOk)
422 |         exit(1);
423 | #endif
424 | 
425 | #if 1 // Stage: gf256_memswap self-check
426 |     const bool swapOk = CheckMemSwap();
427 |     if (!swapOk)
428 |         exit(4);
429 | #endif
430 | 
431 | #if 1 // Stage: averaged decode timing
432 |     const bool timingOk = FinerPerfTimingTest();
433 |     if (!timingOk)
434 |         exit(2);
435 | #endif
436 | 
437 | #if 1 // Stage: bulk encode/decode sweep
438 |     const bool bulkOk = BulkPerfTesting();
439 |     if (!bulkOk)
440 |         exit(3);
441 | #endif
442 | 
443 |     return 0;
444 | }
445 | 


--------------------------------------------------------------------------------
/unit_test/matrix_test.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio 2013
 4 | VisualStudioVersion = 12.0.40629.0
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrix_test", "matrix_test.vcxproj", "{7685F962-9880-43F5-A10E-B2A1E0E09465}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|Win32 = Debug|Win32
11 | 		Debug|x64 = Debug|x64
12 | 		Release|Win32 = Release|Win32
13 | 		Release|x64 = Release|x64
14 | 	EndGlobalSection
15 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | 		{7685F962-9880-43F5-A10E-B2A1E0E09465}.Debug|Win32.ActiveCfg = Debug|Win32
17 | 		{7685F962-9880-43F5-A10E-B2A1E0E09465}.Debug|Win32.Build.0 = Debug|Win32
18 | 		{7685F962-9880-43F5-A10E-B2A1E0E09465}.Debug|x64.ActiveCfg = Debug|x64
19 | 		{7685F962-9880-43F5-A10E-B2A1E0E09465}.Debug|x64.Build.0 = Debug|x64
20 | 		{7685F962-9880-43F5-A10E-B2A1E0E09465}.Release|Win32.ActiveCfg = Release|Win32
21 | 		{7685F962-9880-43F5-A10E-B2A1E0E09465}.Release|Win32.Build.0 = Release|Win32
22 | 		{7685F962-9880-43F5-A10E-B2A1E0E09465}.Release|x64.ActiveCfg = Release|x64
23 | 		{7685F962-9880-43F5-A10E-B2A1E0E09465}.Release|x64.Build.0 = Release|x64
24 | 	EndGlobalSection
25 | 	GlobalSection(SolutionProperties) = preSolution
26 | 		HideSolutionNode = FALSE
27 | 	EndGlobalSection
28 | EndGlobal
29 | 


--------------------------------------------------------------------------------
/unit_test/matrix_test.vcxproj:
--------------------------------------------------------------------------------
  1 | ﻿<?xml version="1.0" encoding="utf-8"?>
  2 | <Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  3 |   <ItemGroup Label="ProjectConfigurations">
  4 |     <ProjectConfiguration Include="Debug|Win32">
  5 |       <Configuration>Debug</Configuration>
  6 |       <Platform>Win32</Platform>
  7 |     </ProjectConfiguration>
  8 |     <ProjectConfiguration Include="Debug|x64">
  9 |       <Configuration>Debug</Configuration>
 10 |       <Platform>x64</Platform>
 11 |     </ProjectConfiguration>
 12 |     <ProjectConfiguration Include="Release|Win32">
 13 |       <Configuration>Release</Configuration>
 14 |       <Platform>Win32</Platform>
 15 |     </ProjectConfiguration>
 16 |     <ProjectConfiguration Include="Release|x64">
 17 |       <Configuration>Release</Configuration>
 18 |       <Platform>x64</Platform>
 19 |     </ProjectConfiguration>
 20 |   </ItemGroup>
 21 |   <PropertyGroup Label="Globals">
 22 |     <ProjectGuid>{7685F962-9880-43F5-A10E-B2A1E0E09465}</ProjectGuid>
 23 |     <RootNamespace>matrix_test</RootNamespace>
 24 |   </PropertyGroup>
 25 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
 26 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
 27 |     <ConfigurationType>Application</ConfigurationType>
 28 |     <UseDebugLibraries>true</UseDebugLibraries>
 29 |     <PlatformToolset>v120</PlatformToolset>
 30 |     <CharacterSet>MultiByte</CharacterSet>
 31 |   </PropertyGroup>
 32 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
 33 |     <ConfigurationType>Application</ConfigurationType>
 34 |     <UseDebugLibraries>true</UseDebugLibraries>
 35 |     <PlatformToolset>v120</PlatformToolset>
 36 |     <CharacterSet>MultiByte</CharacterSet>
 37 |   </PropertyGroup>
 38 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
 39 |     <ConfigurationType>Application</ConfigurationType>
 40 |     <UseDebugLibraries>false</UseDebugLibraries>
 41 |     <PlatformToolset>v120</PlatformToolset>
 42 |     <WholeProgramOptimization>true</WholeProgramOptimization>
 43 |     <CharacterSet>MultiByte</CharacterSet>
 44 |   </PropertyGroup>
 45 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
 46 |     <ConfigurationType>Application</ConfigurationType>
 47 |     <UseDebugLibraries>false</UseDebugLibraries>
 48 |     <PlatformToolset>v120</PlatformToolset>
 49 |     <WholeProgramOptimization>true</WholeProgramOptimization>
 50 |     <CharacterSet>MultiByte</CharacterSet>
 51 |   </PropertyGroup>
 52 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
 53 |   <ImportGroup Label="ExtensionSettings">
 54 |   </ImportGroup>
 55 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 56 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 57 |   </ImportGroup>
 58 |   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
 59 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 60 |   </ImportGroup>
 61 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 62 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 63 |   </ImportGroup>
 64 |   <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
 65 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 66 |   </ImportGroup>
 67 |   <PropertyGroup Label="UserMacros" />
 68 |   <PropertyGroup />
 69 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 70 |     <ClCompile>
 71 |       <WarningLevel>Level4</WarningLevel>
 72 |       <Optimization>Disabled</Optimization>
 73 |       <SDLCheck>true</SDLCheck>
 74 |       <TreatWarningAsError>true</TreatWarningAsError>
 75 |       <ExceptionHandling>false</ExceptionHandling>
 76 |       <BufferSecurityCheck>true</BufferSecurityCheck>
 77 |       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
 78 |       <RuntimeTypeInfo>false</RuntimeTypeInfo>
 79 |       <OpenMPSupport>false</OpenMPSupport>
 80 |     </ClCompile>
 81 |     <Link>
 82 |       <GenerateDebugInformation>true</GenerateDebugInformation>
 83 |     </Link>
 84 |   </ItemDefinitionGroup>
 85 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <!-- Compiler and linker defaults for Debug|x64 -->
 86 |     <ClCompile>
 87 |       <WarningLevel>Level4</WarningLevel> <!-- Diagnostics: warning level 4 -->
 88 |       <TreatWarningAsError>true</TreatWarningAsError> <!-- Diagnostics: any warning fails the build -->
 89 |       <SDLCheck>true</SDLCheck> <!-- Hardening: SDL checks on -->
 90 |       <BufferSecurityCheck>true</BufferSecurityCheck> <!-- Hardening: buffer security check on -->
 91 |       <Optimization>Disabled</Optimization> <!-- Code generation: no optimization in Debug -->
 92 |       <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> <!-- Code generation: MultiThreadedDebug runtime library -->
 93 |       <ExceptionHandling>false</ExceptionHandling> <!-- Language features: exception handling off -->
 94 |       <RuntimeTypeInfo>false</RuntimeTypeInfo> <!-- Language features: run-time type info off -->
 95 |       <OpenMPSupport>false</OpenMPSupport> <!-- Language features: OpenMP off -->
 96 |     </ClCompile>
 97 |     <Link>
 98 |       <GenerateDebugInformation>true</GenerateDebugInformation> <!-- Linker emits debug information -->
 99 |     </Link>
100 |   </ItemDefinitionGroup>
101 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <!-- Compiler and linker defaults for Release|Win32 -->
102 |     <ClCompile>
103 |       <WarningLevel>Level4</WarningLevel> <!-- Diagnostics: warning level 4 -->
104 |       <TreatWarningAsError>true</TreatWarningAsError> <!-- Diagnostics: any warning fails the build -->
105 |       <SDLCheck>true</SDLCheck> <!-- Hardening: SDL checks on -->
106 |       <BufferSecurityCheck>false</BufferSecurityCheck> <!-- NOTE(review): SDLCheck is true in this group; MSVC documents /sdl as overriding /GS-, so confirm this setting has any effect -->
107 |       <Optimization>Full</Optimization> <!-- Optimizer: full optimization -->
108 |       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion> <!-- Optimizer: inline any suitable function -->
109 |       <FavorSizeOrSpeed>Size</FavorSizeOrSpeed> <!-- Optimizer: favors size over speed -->
110 |       <OmitFramePointers>true</OmitFramePointers> <!-- Optimizer: frame pointers omitted -->
111 |       <IntrinsicFunctions>true</IntrinsicFunctions> <!-- Optimizer: intrinsic functions enabled -->
112 |       <FunctionLevelLinking>true</FunctionLevelLinking> <!-- Code generation: function-level linking on -->
113 |       <RuntimeLibrary>MultiThreaded</RuntimeLibrary> <!-- Code generation: MultiThreaded runtime library -->
114 |       <ExceptionHandling>false</ExceptionHandling> <!-- Language features: exception handling off -->
115 |       <RuntimeTypeInfo>false</RuntimeTypeInfo> <!-- Language features: run-time type info off -->
116 |       <OpenMPSupport>false</OpenMPSupport> <!-- Language features: OpenMP off -->
117 |     </ClCompile>
118 |     <Link>
119 |       <GenerateDebugInformation>true</GenerateDebugInformation> <!-- Debug information is still generated in Release -->
120 |       <OptimizeReferences>true</OptimizeReferences> <!-- Linker reference optimization on -->
121 |       <EnableCOMDATFolding>true</EnableCOMDATFolding> <!-- Linker COMDAT folding on -->
122 |     </Link>
123 |   </ItemDefinitionGroup>
124 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <!-- Compiler and linker defaults for Release|x64 -->
125 |     <ClCompile>
126 |       <WarningLevel>Level4</WarningLevel> <!-- Diagnostics: warning level 4 -->
127 |       <TreatWarningAsError>true</TreatWarningAsError> <!-- Diagnostics: any warning fails the build -->
128 |       <SDLCheck>true</SDLCheck> <!-- Hardening: SDL checks on -->
129 |       <BufferSecurityCheck>false</BufferSecurityCheck> <!-- NOTE(review): SDLCheck is true in this group; MSVC documents /sdl as overriding /GS-, so confirm this setting has any effect -->
130 |       <Optimization>Full</Optimization> <!-- Optimizer: full optimization -->
131 |       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion> <!-- Optimizer: inline any suitable function -->
132 |       <FavorSizeOrSpeed>Size</FavorSizeOrSpeed> <!-- Optimizer: favors size over speed -->
133 |       <OmitFramePointers>true</OmitFramePointers> <!-- Optimizer: frame pointers omitted -->
134 |       <IntrinsicFunctions>true</IntrinsicFunctions> <!-- Optimizer: intrinsic functions enabled -->
135 |       <FunctionLevelLinking>true</FunctionLevelLinking> <!-- Code generation: function-level linking on -->
136 |       <RuntimeLibrary>MultiThreaded</RuntimeLibrary> <!-- Code generation: MultiThreaded runtime library -->
137 |       <ExceptionHandling>false</ExceptionHandling> <!-- Language features: exception handling off -->
138 |       <RuntimeTypeInfo>false</RuntimeTypeInfo> <!-- Language features: run-time type info off -->
139 |       <OpenMPSupport>false</OpenMPSupport> <!-- Language features: OpenMP off -->
140 |     </ClCompile>
141 |     <Link>
142 |       <GenerateDebugInformation>true</GenerateDebugInformation> <!-- Debug information is still generated in Release -->
143 |       <OptimizeReferences>true</OptimizeReferences> <!-- Linker reference optimization on -->
144 |       <EnableCOMDATFolding>true</EnableCOMDATFolding> <!-- Linker COMDAT folding on -->
145 |     </Link>
146 |   </ItemDefinitionGroup>
147 |   <ItemGroup> <!-- Translation units: the library sources live in ..\src; main.cpp sits next to this project file -->
148 |     <ClCompile Include="..\src\cm256.cpp" />
149 |     <ClCompile Include="..\src\gf256.cpp" />
150 |     <ClCompile Include="main.cpp" />
151 |   </ItemGroup>
152 |   <ItemGroup> <!-- Library headers live in ..\include. NOTE(review): main.cpp is not visible here; confirm its include path for these headers resolves -->
153 |     <ClInclude Include="..\include\cm256.h" />
154 |     <ClInclude Include="..\include\gf256.h" />
155 |   </ItemGroup>
156 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <!-- Imports Microsoft.Cpp.targets from $(VCTargetsPath) -->
157 |   <ImportGroup Label="ExtensionTargets"> <!-- Empty: no extension targets are imported -->
158 |   </ImportGroup>
159 | </Project>


--------------------------------------------------------------------------------
/unit_test/matrix_test.vcxproj.filters:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <!-- Filter layout for matrix_test: declares the filters, then assigns each project item to one -->
 3 |   <ItemGroup> <!-- Filter declarations -->
 4 |     <Filter Include="Source Files">
 5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
 6 |       <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
 7 |     </Filter>
 8 |     <Filter Include="Header Files">
 9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
10 |       <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
11 |     </Filter>
12 |     <Filter Include="Resource Files">
13 |       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
14 |       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
15 |     </Filter>
16 |   </ItemGroup>
17 |   <ItemGroup> <!-- Compiled sources; library sources are referenced from ..\src -->
18 |     <ClCompile Include="main.cpp">
19 |       <Filter>Source Files</Filter>
20 |     </ClCompile>
21 |     <ClCompile Include="..\src\cm256.cpp">
22 |       <Filter>Source Files</Filter>
23 |     </ClCompile>
24 |     <ClCompile Include="..\src\gf256.cpp">
25 |       <Filter>Source Files</Filter>
26 |     </ClCompile>
27 |   </ItemGroup>
28 |   <ItemGroup> <!-- Headers; referenced from ..\include and placed under the Header Files filter declared above -->
29 |     <ClInclude Include="..\include\cm256.h">
30 |       <Filter>Header Files</Filter>
31 |     </ClInclude>
32 |     <ClInclude Include="..\include\gf256.h">
33 |       <Filter>Header Files</Filter>
34 |     </ClInclude>
35 |   </ItemGroup>
36 | </Project>


--------------------------------------------------------------------------------
/unit_test/x64/Debug/matrix_test.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/catid/cm256/634d1e36b095dbba937586a7e9843d828762dae4/unit_test/x64/Debug/matrix_test.exe


--------------------------------------------------------------------------------
/unit_test/x64/Release/matrix_test.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/catid/cm256/634d1e36b095dbba937586a7e9843d828762dae4/unit_test/x64/Release/matrix_test.exe


--------------------------------------------------------------------------------