├── .gitattributes ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── include ├── cm256.h └── gf256.h ├── src ├── cm256.cpp └── gf256.cpp └── unit_test ├── Debug └── matrix_test.exe ├── Release └── matrix_test.exe ├── SiameseTools.cpp ├── SiameseTools.h ├── main.cpp ├── matrix_test.sln ├── matrix_test.vcxproj ├── matrix_test.vcxproj.filters └── x64 ├── Debug └── matrix_test.exe └── Release └── matrix_test.exe /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # MSVC temp files 21 | *.obj 22 | *.log 23 | *.ilk 24 | *.pdb 25 | *.tlog 26 | *.idb 27 | *.opensdf 28 | *.sdf 29 | *.user 30 | *.suo 31 | 32 | # ========================= 33 | # Operating System Files 34 | # ========================= 35 | 36 | # OSX 37 | # ========================= 38 | 39 | .DS_Store 40 | .AppleDouble 41 | .LSOverride 42 | 43 | # Thumbnails 44 | ._* 45 | 46 | # Files that might appear on external disk 47 | .Spotlight-V100 48 | .Trashes 49 | 50 | # Directories potentially created on remote AFP share 51 | .AppleDB 52 | .AppleDesktop 
53 | Network Trash Folder 54 | Temporary Items 55 | .apdisk 56 | -------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
# Build script for the cm256 erasure-code library and its unit test.
#
# Targets:
#   cm256      - the library (static by default, shared with BUILD_SHARED_LIBS)
#   unit_test  - test executable linked against cm256
#
# Options:
#   MARCH_NATIVE - compile with -march=native when the compiler supports it
cmake_minimum_required(VERSION 3.5)
project(cm256 LANGUAGES CXX)

# Defines CMAKE_INSTALL_LIBDIR, CMAKE_INSTALL_BINDIR and
# CMAKE_INSTALL_INCLUDEDIR. The install() rules at the bottom of this file use
# them; without this module they expand to nothing.
include(GNUInstallDirs)
include(CheckCXXCompilerFlag)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

if(CMAKE_CONFIGURATION_TYPES)
  # Multi-config generator (Visual Studio, Xcode): keep offering only the two
  # configurations this project sets flags for. Single-config generators must
  # not get this variable defined at all, hence the guard.
  set(CMAKE_CONFIGURATION_TYPES "Debug;Release"
      CACHE STRING "Supported build configurations" FORCE)
elseif(NOT CMAKE_BUILD_TYPE)
  # Single-config generator with no explicit choice: default to Release.
  set(CMAKE_BUILD_TYPE Release)
endif()

option(MARCH_NATIVE "Use -march=native option" ON)

add_library(cm256
  src/cm256.cpp
  include/cm256.h
  src/gf256.cpp
  include/gf256.h
)
add_library(cm256::cm256 ALIAS cm256)

set_target_properties(cm256 PROPERTIES
  VERSION 2
  SOVERSION 2
)

target_include_directories(cm256 PUBLIC
  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
)

add_executable(unit_test
  unit_test/main.cpp
  unit_test/SiameseTools.cpp
  unit_test/SiameseTools.h
)
target_link_libraries(unit_test PRIVATE cm256)

if(NOT MSVC)
  # Per-configuration flags are kept exactly as before. Note that this
  # replaces CMake's default Release flags (which would add -DNDEBUG), so
  # assert() stays active in Release builds just as it did previously.
  set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
  set(CMAKE_CXX_FLAGS_RELEASE "-O3")

  # Warnings are attached to the targets instead of overwriting
  # CMAKE_CXX_FLAGS, so flags supplied by the user (CXXFLAGS or
  # -DCMAKE_CXX_FLAGS=...) are no longer discarded.
  target_compile_options(cm256 PRIVATE -Wall -Wextra)
  target_compile_options(unit_test PRIVATE -Wall -Wextra)

  if(MARCH_NATIVE)
    # -march=native is a GCC/Clang spelling: never hand it to MSVC, and probe
    # for it because some GCC targets do not accept it.
    check_cxx_compiler_flag(-march=native CM256_COMPILER_HAS_MARCH_NATIVE)
    if(CM256_COMPILER_HAS_MARCH_NATIVE)
      # PUBLIC on purpose: include/gf256.h changes the layout and alignment of
      # gf256_ctx depending on whether __AVX2__ is defined, so every
      # translation unit that includes the header must be compiled with the
      # same architecture flags as the library itself.
      target_compile_options(cm256 PUBLIC -march=native)
    else()
      message(STATUS "MARCH_NATIVE is ON but the compiler rejects -march=native; flag not added")
    endif()
  endif()
endif()

install(TARGETS cm256
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
  PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
)

# Trailing slash: copy the *contents* of include/ into the include directory.
# A relative destination keeps the install tree relocatable.
install(DIRECTORY include/ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, Chris
Taylor 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cm256 2 | Fast GF(256) Cauchy MDS Block Erasure Codec in C 3 | 4 | cm256 is a simple library for erasure codes. From given data it generates 5 | redundant data that can be used to recover the originals. 
6 | 7 | It is roughly 2x faster than Longhair, and CM256 supports input data that is not a multiple of 8 bytes. 8 | 9 | Currently only Visual Studio 2013 is supported, though other versions of MSVC may work. 10 | 11 | The original data should be split up into equally-sized chunks. If one of these chunks 12 | is erased, the redundant data can fill in the gap through decoding. 13 | 14 | The erasure code is parameterized by three values (`OriginalCount`, `RecoveryCount`, `BlockBytes`). These are: 15 | 16 | + The number of blocks of original data (`OriginalCount`), which must be less than 256. 17 | + The number of blocks of redundant data (`RecoveryCount`), which must be no more than `256 - OriginalCount`. 18 | 19 | For example, if a file is split into 3 equal pieces and sent over a network, `OriginalCount` is 3. 20 | And if 2 additional redundant packets are generated, `RecoveryCount` is 2. 21 | In this case up to 256 - 3 = 253 additional redundant packets can be generated. 22 | 23 | cm256 is released under the BSD3 license for free use in commercial and open source projects. 24 | 25 | ##### Building: Quick Setup 26 | 27 | Include the cm256.* and gf256.* files in your project and consult the cm256.h header for usage. 28 | 29 | 30 | ## Usage 31 | 32 | Documentation is provided in the header file [cm256.h](https://github.com/catid/cm256/raw/master/cm256.h). 33 | 34 | When your application starts up it should call `cm256_init()` to verify that the library is linked properly: 35 | 36 | ~~~ 37 | #include "cm256.h" 38 | 39 | if (cm256_init()) { 40 | // Wrong static library 41 | exit(1); 42 | } 43 | ~~~ 44 | 45 | To generate redundancy, use the `cm256_encode` function. To solve for the original data use the `cm256_decode` function. 
46 | 47 | Example usage: 48 | 49 | ~~~ 50 | bool ExampleFileUsage() 51 | { 52 | if (cm256_init()) 53 | { 54 | exit(1); 55 | } 56 | 57 | cm256_encoder_params params; 58 | 59 | // Number of bytes per file block 60 | params.BlockBytes = 4321; 61 | 62 | // Number of blocks 63 | params.OriginalCount = 33; 64 | 65 | // Number of additional recovery blocks generated by encoder 66 | params.RecoveryCount = 12; 67 | 68 | // Size of the original file 69 | static const int OriginalFileBytes = params.OriginalCount * params.BlockBytes; 70 | 71 | // Allocate and fill the original file data 72 | uint8_t* originalFileData = new uint8_t[OriginalFileBytes]; 73 | memset(originalFileData, 1, OriginalFileBytes); 74 | 75 | // Pointers to data 76 | cm256_block blocks[256]; 77 | for (int i = 0; i < params.OriginalCount; ++i) 78 | { 79 | blocks[i].Block = originalFileData + i * params.BlockBytes; 80 | } 81 | 82 | // Recovery data 83 | uint8_t* recoveryBlocks = new uint8_t[params.RecoveryCount * params.BlockBytes]; 84 | 85 | // Generate recovery data 86 | if (cm256_encode(params, blocks, recoveryBlocks)) 87 | { 88 | exit(1); 89 | } 90 | 91 | // Initialize the indices 92 | for (int i = 0; i < params.OriginalCount; ++i) 93 | { 94 | blocks[i].Index = cm256_get_original_block_index(params, i); 95 | } 96 | 97 | //// Simulate loss of data, substituting a recovery block in its place //// 98 | blocks[0].Block = recoveryBlocks; // First recovery block 99 | blocks[0].Index = cm256_get_recovery_block_index(params, 0); // First recovery block index 100 | //// Simulate loss of data, substituting a recovery block in its place //// 101 | 102 | if (cm256_decode(params, blocks)) 103 | { 104 | exit(1); 105 | } 106 | 107 | // blocks[0].Index will now be 0. 108 | 109 | delete[] originalFileData; 110 | delete[] recoveryBlocks; 111 | 112 | return true; 113 | } 114 | ~~~ 115 | 116 | The example above is just one way to use the `cm256_decode` function. 
117 | 118 | This API was designed to be flexible enough for UDP/IP-based file transfer where 119 | the blocks arrive out of order. 120 | 121 | 122 | #### Benchmark 123 | 124 | CM256 demonstrates similar encoding and (worst case) decoding performance: 125 | 126 | ~~~ 127 | Encoder: 1296 bytes k = 100 m = 1 : 5.55886 usec, 23314.1 MBps 128 | Decoder: 1296 bytes k = 100 m = 1 : 6.72915 usec, 19259.5 MBps 129 | Encoder: 1296 bytes k = 100 m = 2 : 17.2617 usec, 7507.93 MBps 130 | Decoder: 1296 bytes k = 100 m = 2 : 19.6023 usec, 6611.46 MBps 131 | Encoder: 1296 bytes k = 100 m = 3 : 30.4275 usec, 4259.31 MBps 132 | Decoder: 1296 bytes k = 100 m = 3 : 32.4755 usec, 3990.7 MBps 133 | Encoder: 1296 bytes k = 100 m = 4 : 40.6675 usec, 3186.82 MBps 134 | Decoder: 1296 bytes k = 100 m = 4 : 43.5932 usec, 2972.94 MBps 135 | Encoder: 1296 bytes k = 100 m = 5 : 51.7852 usec, 2502.64 MBps 136 | Decoder: 1296 bytes k = 100 m = 5 : 51.4926 usec, 2516.86 MBps 137 | Encoder: 1296 bytes k = 100 m = 6 : 62.6104 usec, 2069.94 MBps 138 | Decoder: 1296 bytes k = 100 m = 6 : 64.9509 usec, 1995.35 MBps 139 | Encoder: 1296 bytes k = 100 m = 7 : 76.3612 usec, 1697.2 MBps 140 | Decoder: 1296 bytes k = 100 m = 7 : 75.191 usec, 1723.61 MBps 141 | Encoder: 1296 bytes k = 100 m = 8 : 85.1384 usec, 1522.23 MBps 142 | Decoder: 1296 bytes k = 100 m = 8 : 83.0904 usec, 1559.75 MBps 143 | Encoder: 1296 bytes k = 100 m = 9 : 96.2561 usec, 1346.41 MBps 144 | Decoder: 1296 bytes k = 100 m = 9 : 95.3784 usec, 1358.8 MBps 145 | Encoder: 1296 bytes k = 100 m = 10 : 110.592 usec, 1171.87 MBps 146 | Decoder: 1296 bytes k = 100 m = 10 : 109.714 usec, 1181.25 MBps 147 | 148 | Encoder: 1296 bytes k = 100 m = 20 : 223.525 usec, 579.801 MBps 149 | Decoder: 1296 bytes k = 100 m = 20 : 209.481 usec, 618.671 MBps 150 | 151 | Encoder: 1296 bytes k = 100 m = 30 : 372.737 usec, 347.699 MBps 152 | Decoder: 1296 bytes k = 100 m = 30 : 322.707 usec, 401.603 MBps 153 | 154 | Encoder: 1296 bytes k = 100 m = 40 : 471.626 usec, 
274.794 MBps 155 | Decoder: 1296 bytes k = 100 m = 40 : 434.762 usec, 298.094 MBps 156 | 157 | Encoder: 1296 bytes k = 100 m = 50 : 592.751 usec, 218.642 MBps 158 | Decoder: 1296 bytes k = 100 m = 50 : 545.939 usec, 237.389 MBps 159 | ~~~ 160 | (These performance numbers are out of date and not well calibrated - Decoding now takes the same time as encoding within a few microseconds thanks to the new matrix solver.) 161 | 162 | Longhair Library Results: 163 | 164 | Note that I hand-optimized the MemXOR.cpp implementation on this PC to run faster than what is available on github, so this is a fair comparison. 165 | 166 | ~~~ 167 | Encoded k=100 data blocks with m=1 recovery blocks in 4.09607 usec : 31640.1 MB/s 168 | + Decoded 1 erasures in 5.85144 usec : 22148.4 MB/s 169 | Encoded k=100 data blocks with m=2 recovery blocks in 41.5452 usec : 3119.5 MB/s 170 | + Decoded 2 erasures in 43.5931 usec : 2972.94 MB/s 171 | Encoded k=100 data blocks with m=3 recovery blocks in 80.7498 usec : 1604.96 MB/s 172 | + Decoded 3 erasures in 86.6013 usec : 1496.51 MB/s 173 | Encoded k=100 data blocks with m=4 recovery blocks in 123.465 usec : 1049.69 MB/s 174 | + Decoded 4 erasures in 127.854 usec : 1013.66 MB/s 175 | Encoded k=100 data blocks with m=5 recovery blocks in 76.9464 usec : 1684.29 MB/s 176 | + Decoded 5 erasures in 88.6493 usec : 1461.94 MB/s 177 | Encoded k=100 data blocks with m=6 recovery blocks in 87.7717 usec : 1476.56 MB/s 178 | + Decoded 6 erasures in 100.352 usec : 1291.45 MB/s 179 | Encoded k=100 data blocks with m=7 recovery blocks in 103.863 usec : 1247.8 MB/s 180 | + Decoded 7 erasures in 127.269 usec : 1018.32 MB/s 181 | Encoded k=100 data blocks with m=8 recovery blocks in 118.784 usec : 1091.05 MB/s 182 | + Decoded 8 erasures in 145.701 usec : 889.494 MB/s 183 | Encoded k=100 data blocks with m=9 recovery blocks in 146.871 usec : 882.406 MB/s 184 | + Decoded 9 erasures in 158.574 usec : 817.284 MB/s 185 | Encoded k=100 data blocks with m=10 recovery 
blocks in 156.819 usec : 826.433 MB/s 186 | + Decoded 10 erasures in 181.102 usec : 715.619 MB/s 187 | 188 | Encoded k=100 data blocks with m=20 recovery blocks in 282.039 usec : 459.511 MB/s 189 | + Decoded 20 erasures in 370.103 usec : 350.172 MB/s 190 | 191 | Encoded k=100 data blocks with m=30 recovery blocks in 428.618 usec : 302.367 MB/s 192 | + Decoded 30 erasures in 614.693 usec : 210.837 MB/s 193 | 194 | Encoded k=100 data blocks with m=40 recovery blocks in 562.323 usec : 230.472 MB/s 195 | + Decoded 40 erasures in 855.188 usec : 151.546 MB/s 196 | 197 | Encoded k=100 data blocks with m=50 recovery blocks in 727.041 usec : 178.257 MB/s 198 | + Decoded 50 erasures in 1181.11 usec : 109.727 MB/s 199 | ~~~ 200 | 201 | Results Discussion: 202 | 203 | For m=1 they are both running the same kind of code, so they're basically the same. 204 | 205 | For m=2 and m=3, CM256 is 2.5x faster. 206 | 207 | For m=4, CM256 is 3x faster in this case. Longhair could use more tuning. Back when I wrote it, the right time to switch to the Windowed decoder was at m=5, but on my new PC it seems like m=4 is a better time to do it. CM256 only has one mode so it doesn't require any tuning for best performance. 208 | 209 | For m=5...30, CM256 performance is not quite 2x faster, maybe 1.7x or so. 210 | 211 | For m>30, CM256 is at least 2x faster. 212 | 213 | 214 | #### Comparisons with Other Libraries 215 | 216 | The approach taken in CM256 is similar to the Intel Storage Acceleration Library (ISA-L) available here: 217 | 218 | https://01.org/intel%C2%AE-storage-acceleration-library-open-source-version/downloads 219 | 220 | ISA-L more aggressively optimizes the matrix multiplication operation, which is the most expensive step of encoding. 221 | 222 | CM256 takes better advantage of the m=1 case and the first recovery symbol, which is also possible with the Vandermonde matrices supported by ISA-L. 223 | 224 | ISA-L uses a O(N^3) Gaussian elimination solver for decoding. 
The CM256 decoder solves the linear system using a fast O(N^2) LDU-decomposition algorithm from "Pivoting and Backward Stability of Fast Algorithms for Solving Cauchy Linear Equations" (T. Boros, T. Kailath, V. Olshevsky), which was hand-optimized for memory accesses. 225 | 226 | 227 | #### Credits 228 | 229 | This software was written entirely by myself ( Christopher A. Taylor ). If you 230 | find it useful and would like to buy me a coffee, consider [tipping](https://www.gittip.com/catid/). 231 | -------------------------------------------------------------------------------- /include/cm256.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of CM256 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #ifndef CM256_H 30 | #define CM256_H 31 | 32 | #include "gf256.h" 33 | 34 | #include 35 | 36 | // Library version 37 | #define CM256_VERSION 2 38 | 39 | 40 | #ifdef __cplusplus 41 | extern "C" { 42 | #endif 43 | 44 | /* 45 | * Verify binary compatibility with the API on startup. 46 | * 47 | * Example: 48 | * if (cm256_init()) exit(1); 49 | * 50 | * Returns 0 on success, and any other code indicates failure. 51 | */ 52 | extern int cm256_init_(int version); 53 | #define cm256_init() cm256_init_(CM256_VERSION) 54 | 55 | 56 | // Encoder parameters 57 | typedef struct cm256_encoder_params_t { 58 | // Original block count < 256 59 | int OriginalCount; 60 | 61 | // Recovery block count < 256 62 | int RecoveryCount; 63 | 64 | // Number of bytes per block (all blocks are the same size in bytes) 65 | int BlockBytes; 66 | } cm256_encoder_params; 67 | 68 | // Descriptor for data block 69 | typedef struct cm256_block_t { 70 | // Pointer to data received. 71 | void* Block; 72 | 73 | // Block index. 74 | // For original data, it will be in the range 75 | // [0..(originalCount-1)] inclusive. 76 | // For recovery data, the first one's Index must be originalCount, 77 | // and it will be in the range 78 | // [originalCount..(originalCount+recoveryCount-1)] inclusive. 79 | unsigned char Index; 80 | // Ignored during encoding, required during decoding. 
81 | } cm256_block; 82 | 83 | 84 | // Compute the value to put in the Index member of cm256_block 85 | static inline unsigned char cm256_get_recovery_block_index(cm256_encoder_params params, int recoveryBlockIndex) 86 | { 87 | assert(recoveryBlockIndex >= 0 && recoveryBlockIndex < params.RecoveryCount); 88 | return (unsigned char)(params.OriginalCount + recoveryBlockIndex); 89 | } 90 | static inline unsigned char cm256_get_original_block_index(cm256_encoder_params params, int originalBlockIndex) 91 | { 92 | assert(originalBlockIndex >= 0 && originalBlockIndex < params.OriginalCount); 93 | return (unsigned char)(originalBlockIndex); 94 | } 95 | 96 | 97 | /* 98 | * Cauchy MDS GF(256) encode 99 | * 100 | * This produces a set of recovery blocks that should be transmitted after the 101 | * original data blocks. 102 | * 103 | * It takes in 'originalCount' equal-sized blocks and produces 'recoveryCount' 104 | * equally-sized recovery blocks. 105 | * 106 | * The input 'originals' array allows more natural usage of the library. 107 | * The output recovery blocks are stored end-to-end in 'recoveryBlocks'. 108 | * 'recoveryBlocks' should have recoveryCount * blockBytes bytes available. 109 | * 110 | * Precondition: originalCount + recoveryCount <= 256 111 | * 112 | * When transmitting the data, the block index of the data should be sent, 113 | * and the recovery block index is also needed. The decoder should also 114 | * be provided with the values of originalCount, recoveryCount and blockBytes. 115 | * 116 | * Example wire format: 117 | * [originalCount(1 byte)] [recoveryCount(1 byte)] 118 | * [blockIndex(1 byte)] [blockData(blockBytes bytes)] 119 | * 120 | * Be careful not to mix blocks from different encoders. 
121 | * 122 | * It is possible to support variable-length data by including the original 123 | * data length at the front of each message in 2 bytes, such that when it is 124 | * recovered after a loss the data length is available in the block data and 125 | * the remaining bytes of padding can be neglected. 126 | * 127 | * Returns 0 on success, and any other code indicates failure. 128 | */ 129 | extern int cm256_encode( 130 | cm256_encoder_params params, // Encoder parameters 131 | cm256_block* originals, // Array of pointers to original blocks 132 | void* recoveryBlocks); // Output recovery blocks end-to-end 133 | 134 | // Encode one block. 135 | // Note: This function does not validate input, use with care. 136 | extern void cm256_encode_block( 137 | cm256_encoder_params params, // Encoder parameters 138 | cm256_block* originals, // Array of pointers to original blocks 139 | int recoveryBlockIndex, // Return value from cm256_get_recovery_block_index() 140 | void* recoveryBlock); // Output recovery block 141 | 142 | /* 143 | * Cauchy MDS GF(256) decode 144 | * 145 | * This recovers the original data from the recovery data in the provided 146 | * blocks. There should be 'originalCount' blocks in the provided array. 147 | * Recovery will always be possible if that many blocks are received. 148 | * 149 | * Provide the same values for 'originalCount', 'recoveryCount', and 150 | * 'blockBytes' used by the encoder. 151 | * 152 | * The block Index should be set to the block index of the original data, 153 | * as described in the cm256_block struct comments above. 154 | * 155 | * Recovery blocks will be replaced with original data and the Index 156 | * will be updated to indicate the original block that was recovered. 157 | * 158 | * Returns 0 on success, and any other code indicates failure. 
159 | */ 160 | extern int cm256_decode( 161 | cm256_encoder_params params, // Encoder parameters 162 | cm256_block* blocks); // Array of 'originalCount' blocks as described above 163 | 164 | 165 | #ifdef __cplusplus 166 | } 167 | #endif 168 | 169 | 170 | #endif // CM256_H 171 | -------------------------------------------------------------------------------- /include/gf256.h: -------------------------------------------------------------------------------- 1 | /** \file 2 | \brief GF(256) Main C API Header 3 | \copyright Copyright (c) 2017 Christopher A. Taylor. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of GF256 nor the names of its contributors may be 14 | used to endorse or promote products derived from this software without 15 | specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | #ifndef CAT_GF256_H 31 | #define CAT_GF256_H 32 | 33 | /** \page GF256 GF(256) Math Module 34 | 35 | This module provides efficient implementations of bulk 36 | GF(2^^8) math operations over memory buffers. 37 | 38 | Addition is done over the base field in GF(2) meaning 39 | that addition is XOR between memory buffers. 40 | 41 | Multiplication is performed using table lookups via 42 | SIMD instructions. This is somewhat slower than XOR, 43 | but fast enough to not become a major bottleneck when 44 | used sparingly. 
45 | */ 46 | 47 | #include // uint32_t etc 48 | #include // memcpy, memset 49 | 50 | /// Library header version 51 | #define GF256_VERSION 2 52 | 53 | //------------------------------------------------------------------------------ 54 | // Platform/Architecture 55 | 56 | #if defined(__ARM_ARCH) || defined(__ARM_NEON) || defined(__ARM_NEON__) 57 | #if !defined IOS 58 | #define LINUX_ARM 59 | #endif 60 | #endif 61 | 62 | #if defined(ANDROID) || defined(IOS) || defined(LINUX_ARM) || defined(__powerpc__) || defined(__s390__) 63 | #define GF256_TARGET_MOBILE 64 | #endif // ANDROID 65 | 66 | #if defined(__AVX2__) && (!defined (_MSC_VER) || _MSC_VER >= 1900) 67 | #define GF256_TRY_AVX2 /* 256-bit */ 68 | #include 69 | #define GF256_ALIGN_BYTES 32 70 | #else // __AVX2__ 71 | #define GF256_ALIGN_BYTES 16 72 | #endif // __AVX2__ 73 | 74 | #if !defined(GF256_TARGET_MOBILE) 75 | #include // SSSE3: _mm_shuffle_epi8 76 | #include // SSE2 77 | #endif // GF256_TARGET_MOBILE 78 | 79 | #if defined(__ARM_NEON) || defined(__ARM_NEON__) 80 | #include 81 | #define GF256_TRY_NEON 82 | #endif 83 | 84 | // Compiler-specific 128-bit SIMD register keyword 85 | #if defined(GF256_TARGET_MOBILE) 86 | #if defined(GF256_TRY_NEON) 87 | #define GF256_M128 uint8x16_t 88 | #else 89 | #define GF256_M128 uint64_t 90 | #endif // GF256_TRY_NEON 91 | #else // GF256_TARGET_MOBILE 92 | #define GF256_M128 __m128i 93 | #endif // GF256_TARGET_MOBILE 94 | 95 | // Compiler-specific 256-bit SIMD register keyword 96 | #ifdef GF256_TRY_AVX2 97 | #define GF256_M256 __m256i 98 | #endif 99 | 100 | // Compiler-specific C++11 restrict keyword 101 | #define GF256_RESTRICT __restrict 102 | 103 | // Compiler-specific force inline keyword 104 | #ifdef _MSC_VER 105 | #define GF256_FORCE_INLINE inline __forceinline 106 | #else 107 | #define GF256_FORCE_INLINE inline __attribute__((always_inline)) 108 | #endif 109 | 110 | // Compiler-specific alignment keyword 111 | // Note: Alignment only matters for ARM NEON where it should 
be 16 112 | #ifdef _MSC_VER 113 | #define GF256_ALIGNED __declspec(align(GF256_ALIGN_BYTES)) 114 | #else // _MSC_VER 115 | #define GF256_ALIGNED __attribute__((aligned(GF256_ALIGN_BYTES))) 116 | #endif // _MSC_VER 117 | 118 | #ifdef __cplusplus 119 | extern "C" { 120 | #endif // __cplusplus 121 | 122 | 123 | //------------------------------------------------------------------------------ 124 | // Portability 125 | 126 | /// Swap two memory buffers in-place 127 | extern void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes); 128 | 129 | 130 | //------------------------------------------------------------------------------ 131 | // GF(256) Context 132 | 133 | #ifdef _MSC_VER 134 | #pragma warning(push) 135 | #pragma warning(disable: 4324) // warning C4324: 'gf256_ctx' : structure was padded due to __declspec(align()) 136 | #endif // _MSC_VER 137 | 138 | /// The context object stores tables required to perform library calculations 139 | struct gf256_ctx 140 | { 141 | /// We require memory to be aligned since the SIMD instructions benefit from 142 | /// or require aligned accesses to the table data. 
143 | struct 144 | { 145 | GF256_ALIGNED GF256_M128 TABLE_LO_Y[256]; 146 | GF256_ALIGNED GF256_M128 TABLE_HI_Y[256]; 147 | } MM128; 148 | #ifdef GF256_TRY_AVX2 149 | struct 150 | { 151 | GF256_ALIGNED GF256_M256 TABLE_LO_Y[256]; 152 | GF256_ALIGNED GF256_M256 TABLE_HI_Y[256]; 153 | } MM256; 154 | #endif // GF256_TRY_AVX2 155 | 156 | /// Mul/Div/Inv/Sqr tables 157 | uint8_t GF256_MUL_TABLE[256 * 256]; 158 | uint8_t GF256_DIV_TABLE[256 * 256]; 159 | uint8_t GF256_INV_TABLE[256]; 160 | uint8_t GF256_SQR_TABLE[256]; 161 | 162 | /// Log/Exp tables 163 | uint16_t GF256_LOG_TABLE[256]; 164 | uint8_t GF256_EXP_TABLE[512 * 2 + 1]; 165 | 166 | /// Polynomial used 167 | unsigned Polynomial; 168 | }; 169 | 170 | #ifdef _MSC_VER 171 | #pragma warning(pop) 172 | #endif // _MSC_VER 173 | 174 | extern gf256_ctx GF256Ctx; 175 | 176 | 177 | //------------------------------------------------------------------------------ 178 | // Initialization 179 | 180 | /** 181 | Initialize a context, filling in the tables. 182 | 183 | Thread-safety / Usage Notes: 184 | 185 | It is perfectly safe and encouraged to use a gf256_ctx object from multiple 186 | threads. The gf256_init() is relatively expensive and should only be done 187 | once, though it will take less than a millisecond. 188 | 189 | The gf256_ctx object must be aligned to 16 byte boundary. 190 | Simply tag the object with GF256_ALIGNED to achieve this. 191 | 192 | Example: 193 | static GF256_ALIGNED gf256_ctx TheGF256Context; 194 | gf256_init(&TheGF256Context, 0); 195 | 196 | Returns 0 on success and other values on failure. 
197 | */ 198 | extern int gf256_init_(int version); 199 | #define gf256_init() gf256_init_(GF256_VERSION) 200 | 201 | 202 | //------------------------------------------------------------------------------ 203 | // Math Operations 204 | 205 | /// return x + y 206 | static GF256_FORCE_INLINE uint8_t gf256_add(uint8_t x, uint8_t y) 207 | { 208 | return (uint8_t)(x ^ y); 209 | } 210 | 211 | /// return x * y 212 | /// For repeated multiplication by a constant, it is faster to put the constant in y. 213 | static GF256_FORCE_INLINE uint8_t gf256_mul(uint8_t x, uint8_t y) 214 | { 215 | return GF256Ctx.GF256_MUL_TABLE[((unsigned)y << 8) + x]; 216 | } 217 | 218 | /// return x / y 219 | /// Memory-access optimized for constant divisors in y. 220 | static GF256_FORCE_INLINE uint8_t gf256_div(uint8_t x, uint8_t y) 221 | { 222 | return GF256Ctx.GF256_DIV_TABLE[((unsigned)y << 8) + x]; 223 | } 224 | 225 | /// return 1 / x 226 | static GF256_FORCE_INLINE uint8_t gf256_inv(uint8_t x) 227 | { 228 | return GF256Ctx.GF256_INV_TABLE[x]; 229 | } 230 | 231 | /// return x * x 232 | static GF256_FORCE_INLINE uint8_t gf256_sqr(uint8_t x) 233 | { 234 | return GF256Ctx.GF256_SQR_TABLE[x]; 235 | } 236 | 237 | 238 | //------------------------------------------------------------------------------ 239 | // Bulk Memory Math Operations 240 | 241 | /// Performs "x[] += y[]" bulk memory XOR operation 242 | extern void gf256_add_mem(void * GF256_RESTRICT vx, 243 | const void * GF256_RESTRICT vy, int bytes); 244 | 245 | /// Performs "z[] += x[] + y[]" bulk memory operation 246 | extern void gf256_add2_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 247 | const void * GF256_RESTRICT vy, int bytes); 248 | 249 | /// Performs "z[] = x[] + y[]" bulk memory operation 250 | extern void gf256_addset_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 251 | const void * GF256_RESTRICT vy, int bytes); 252 | 253 | /// Performs "z[] = x[] * y" bulk memory operation 254 | extern void 
gf256_mul_mem(void * GF256_RESTRICT vz, 255 | const void * GF256_RESTRICT vx, uint8_t y, int bytes); 256 | 257 | /// Performs "z[] += x[] * y" bulk memory operation 258 | extern void gf256_muladd_mem(void * GF256_RESTRICT vz, uint8_t y, 259 | const void * GF256_RESTRICT vx, int bytes); 260 | 261 | /// Performs "z[] = x[] / y" bulk memory operation 262 | static GF256_FORCE_INLINE void gf256_div_mem(void * GF256_RESTRICT vz, 263 | const void * GF256_RESTRICT vx, uint8_t y, int bytes) 264 | { 265 | // Multiply by the inverse of y from GF256_INV_TABLE; y == 1 is its own inverse, so the lookup is skipped 266 | gf256_mul_mem(vz, vx, y == 1 ? (uint8_t)1 : GF256Ctx.GF256_INV_TABLE[y], bytes); 267 | } 268 | 269 | 270 | //------------------------------------------------------------------------------ 271 | // Misc Operations 272 | 273 | #ifdef __cplusplus 274 | } 275 | #endif // __cplusplus 276 | 277 | #endif // CAT_GF256_H 278 | -------------------------------------------------------------------------------- /src/cm256.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of CM256 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 
15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #include "cm256.h" 30 | 31 | 32 | /* 33 | GF(256) Cauchy Matrix Overview 34 | 35 | As described on Wikipedia, each element of a normal Cauchy matrix is defined as: 36 | 37 | a_ij = 1 / (x_i - y_j) 38 | The arrays x_i and y_j are vector parameters of the matrix. 39 | The values in x_i cannot be reused in y_j. 40 | 41 | Moving beyond Wikipedia... 42 | 43 | (1) Number of rows (R) is the range of i, and number of columns (C) is the range of j. 44 | 45 | (2) Being able to select x_i and y_j makes Cauchy matrices more flexible in practice 46 | than Vandermonde matrices, which only have one parameter per row. 47 | 48 | (3) Cauchy matrices are always invertible, AKA always full rank, AKA when treated as 49 | a linear system y = M*x, the linear system has a single solution. 50 | 51 | (4) A Cauchy matrix concatenated below a square CxC identity matrix always has rank C, 52 | meaning that any R rows can be eliminated from the concatenated matrix and the 53 | matrix will still be invertible. This is how Reed-Solomon erasure codes work. 54 | 55 | (5) Any row or column can be multiplied by non-zero values, and the resulting matrix 56 | is still full rank. 
This is true for any matrix, since it is effectively the same 57 | as pre and post multiplying by diagonal matrices, which are always invertible. 58 | 59 | (6) Matrix elements with a value of 1 are much faster to operate on than other values. 60 | For instance a matrix of [1, 1, 1, 1, 1] is invertible and much faster for various 61 | purposes than [2, 2, 2, 2, 2]. 62 | 63 | (7) For GF(256) matrices, the symbols in x_i and y_j are selected from the numbers 64 | 0...255, and so the number of rows + number of columns may not exceed 256. 65 | Note that values in x_i and y_j may not be reused as stated above. 66 | 67 | In summary, Cauchy matrices 68 | are preferred over Vandermonde matrices. (2) 69 | are great for MDS erasure codes. (3) and (4) 70 | should be optimized to include more 1 elements. (5) and (6) 71 | have a limited size in GF(256), rows+cols <= 256. (7) 72 | */ 73 | 74 | 75 | //----------------------------------------------------------------------------- 76 | // Initialization 77 | 78 | extern "C" int cm256_init_(int version) 79 | { 80 | if (version != CM256_VERSION) 81 | { 82 | // User's header does not match library version 83 | return -10; 84 | } 85 | 86 | // Return error code from GF(256) init if required 87 | return gf256_init(); 88 | } 89 | 90 | 91 | /* 92 | Selected Cauchy Matrix Form 93 | 94 | The matrix consists of elements a_ij, where i = row, j = column. 95 | a_ij = 1 / (x_i - y_j), where x_i and y_j are sets of GF(256) values 96 | that do not intersect. 97 | 98 | We select x_i and y_j to just be incrementing numbers for the 99 | purposes of this library. Further optimizations may yield matrices 100 | with more 1 elements, but the benefit seems relatively small. 101 | 102 | The x_i values range from originalCount...(originalCount + recoveryCount - 1). 103 | The y_j values range from 0...(originalCount - 1). 104 | 105 | We then improve the Cauchy matrix by dividing each column by the 106 | first row element of that column. 
The result is an invertible 107 | matrix that has all 1 elements in the first row. This is equivalent 108 | to a rotated Vandermonde matrix, so we could have used one of those. 109 | 110 | The advantage of doing this is that operations involving the first 111 | row will be extremely fast (just memory XOR), so the decoder can 112 | be optimized to take advantage of the shortcut when the first 113 | recovery row can be used. 114 | 115 | First row element of Cauchy matrix for each column: 116 | a_0j = 1 / (x_0 - y_j) = 1 / (x_0 - y_j) 117 | 118 | Our Cauchy matrix sets first row to ones, so: 119 | a_ij = (1 / (x_i - y_j)) / a_0j 120 | a_ij = (y_j - x_0) / (x_i - y_j) 121 | a_ij = (y_j + x_0) div (x_i + y_j) in GF(256) 122 | */ 123 | 124 | // This function generates each matrix element based on x_i, x_0, y_j 125 | // Note that for x_i == x_0, this will return 1, so it is better to unroll out the first row. 126 | static GF256_FORCE_INLINE unsigned char GetMatrixElement(unsigned char x_i, unsigned char x_0, unsigned char y_j) 127 | { 128 | return gf256_div(gf256_add(y_j, x_0), gf256_add(x_i, y_j)); 129 | } 130 | 131 | 132 | //----------------------------------------------------------------------------- 133 | // Encoding 134 | 135 | extern "C" void cm256_encode_block( 136 | cm256_encoder_params params, // Encoder parameters 137 | cm256_block* originals, // Array of pointers to original blocks 138 | int recoveryBlockIndex, // Return value from cm256_get_recovery_block_index() 139 | void* recoveryBlock) // Output recovery block 140 | { 141 | // If only one block of input data, 142 | if (params.OriginalCount == 1) 143 | { 144 | // No meaningful operation here, degenerate to outputting the same data each time. 
145 | 146 | memcpy(recoveryBlock, originals[0].Block, params.BlockBytes); 147 | return; 148 | } 149 | // else OriginalCount >= 2: 150 | 151 | // Unroll first row of recovery matrix: 152 | // The matrix we generate for the first row is all ones, 153 | // so it is merely a parity of the original data. 154 | if (recoveryBlockIndex == params.OriginalCount) 155 | { 156 | gf256_addset_mem(recoveryBlock, originals[0].Block, originals[1].Block, params.BlockBytes); 157 | for (int j = 2; j < params.OriginalCount; ++j) 158 | { 159 | gf256_add_mem(recoveryBlock, originals[j].Block, params.BlockBytes); 160 | } 161 | return; 162 | } 163 | 164 | // TBD: Faster algorithms seem to exist for computing this matrix-vector product. 165 | 166 | // Start the x_0 values arbitrarily from the original count. 167 | const uint8_t x_0 = static_cast(params.OriginalCount); 168 | 169 | // For other rows: 170 | { 171 | const uint8_t x_i = static_cast(recoveryBlockIndex); 172 | 173 | // Unroll first operation for speed 174 | { 175 | const uint8_t y_0 = 0; 176 | const uint8_t matrixElement = GetMatrixElement(x_i, x_0, y_0); 177 | 178 | gf256_mul_mem(recoveryBlock, originals[0].Block, matrixElement, params.BlockBytes); 179 | } 180 | 181 | // For each original data column, 182 | for (int j = 1; j < params.OriginalCount; ++j) 183 | { 184 | const uint8_t y_j = static_cast(j); 185 | const uint8_t matrixElement = GetMatrixElement(x_i, x_0, y_j); 186 | 187 | gf256_muladd_mem(recoveryBlock, matrixElement, originals[j].Block, params.BlockBytes); 188 | } 189 | } 190 | } 191 | 192 | extern "C" int cm256_encode( 193 | cm256_encoder_params params, // Encoder params 194 | cm256_block* originals, // Array of pointers to original blocks 195 | void* recoveryBlocks) // Output recovery blocks end-to-end 196 | { 197 | // Validate input: 198 | if (params.OriginalCount <= 0 || 199 | params.RecoveryCount <= 0 || 200 | params.BlockBytes <= 0) 201 | { 202 | return -1; 203 | } 204 | if (params.OriginalCount + 
params.RecoveryCount > 256) 205 | { 206 | return -2; 207 | } 208 | if (!originals || !recoveryBlocks) 209 | { 210 | return -3; 211 | } 212 | 213 | uint8_t* recoveryBlock = static_cast(recoveryBlocks); 214 | 215 | for (int block = 0; block < params.RecoveryCount; ++block, recoveryBlock += params.BlockBytes) 216 | { 217 | cm256_encode_block(params, originals, (params.OriginalCount + block), recoveryBlock); 218 | } 219 | 220 | return 0; 221 | } 222 | 223 | 224 | //----------------------------------------------------------------------------- 225 | // Decoding 226 | 227 | struct CM256Decoder 228 | { 229 | // Encode parameters 230 | cm256_encoder_params Params; 231 | 232 | // Recovery blocks 233 | cm256_block* Recovery[256]; 234 | int RecoveryCount; 235 | 236 | // Original blocks 237 | cm256_block* Original[256]; 238 | int OriginalCount; 239 | 240 | // Row indices that were erased 241 | uint8_t ErasuresIndices[256]; 242 | 243 | // Initialize the decoder 244 | bool Initialize(cm256_encoder_params& params, cm256_block* blocks); 245 | 246 | // Decode m=1 case 247 | void DecodeM1(); 248 | 249 | // Decode for m>1 case 250 | void Decode(); 251 | 252 | // Generate the LU decomposition of the matrix 253 | void GenerateLDUDecomposition(uint8_t* matrix_L, uint8_t* diag_D, uint8_t* matrix_U); 254 | }; 255 | 256 | bool CM256Decoder::Initialize(cm256_encoder_params& params, cm256_block* blocks) 257 | { 258 | Params = params; 259 | 260 | cm256_block* block = blocks; 261 | OriginalCount = 0; 262 | RecoveryCount = 0; 263 | 264 | // Initialize erasures to zeros 265 | for (int ii = 0; ii < params.OriginalCount; ++ii) 266 | { 267 | ErasuresIndices[ii] = 0; 268 | } 269 | 270 | // For each input block, 271 | for (int ii = 0; ii < params.OriginalCount; ++ii, ++block) 272 | { 273 | int row = block->Index; 274 | 275 | // If it is an original block, 276 | if (row < params.OriginalCount) 277 | { 278 | Original[OriginalCount++] = block; 279 | 280 | if (ErasuresIndices[row] != 0) 281 | { 282 | // 
Error out if two row indices repeat 283 | return false; 284 | } 285 | 286 | ErasuresIndices[row] = 1; 287 | } 288 | else 289 | { 290 | Recovery[RecoveryCount++] = block; 291 | } 292 | } 293 | 294 | // Identify erasures 295 | for (int ii = 0, indexCount = 0; ii < 256; ++ii) 296 | { 297 | if (!ErasuresIndices[ii]) 298 | { 299 | ErasuresIndices[indexCount] = static_cast( ii ); 300 | 301 | if (++indexCount >= RecoveryCount) 302 | { 303 | break; 304 | } 305 | } 306 | } 307 | 308 | return true; 309 | } 310 | 311 | void CM256Decoder::DecodeM1() 312 | { 313 | // XOR all other blocks into the recovery block 314 | uint8_t* outBlock = static_cast(Recovery[0]->Block); 315 | const uint8_t* inBlock = nullptr; 316 | 317 | // For each block, 318 | for (int ii = 0; ii < OriginalCount; ++ii) 319 | { 320 | const uint8_t* inBlock2 = static_cast(Original[ii]->Block); 321 | 322 | if (!inBlock) 323 | { 324 | inBlock = inBlock2; 325 | } 326 | else 327 | { 328 | // outBlock ^= inBlock ^ inBlock2 329 | gf256_add2_mem(outBlock, inBlock, inBlock2, Params.BlockBytes); 330 | inBlock = nullptr; 331 | } 332 | } 333 | 334 | // Complete XORs 335 | if (inBlock) 336 | { 337 | gf256_add_mem(outBlock, inBlock, Params.BlockBytes); 338 | } 339 | 340 | // Recover the index it corresponds to 341 | Recovery[0]->Index = ErasuresIndices[0]; 342 | } 343 | 344 | // Generate the LU decomposition of the matrix 345 | void CM256Decoder::GenerateLDUDecomposition(uint8_t* matrix_L, uint8_t* diag_D, uint8_t* matrix_U) 346 | { 347 | // Schur-type-direct-Cauchy algorithm 2.5 from 348 | // "Pivoting and Backward Stability of Fast Algorithms for Solving Cauchy Linear Equations" 349 | // T. Boros, T. Kailath, V. Olshevsky 350 | // Modified for practical use. I folded the diagonal parts of U/L matrices into the 351 | // diagonal one to reduce the number of multiplications to perform against the input data, 352 | // and organized the triangle matrices in memory to allow for faster SSE3 GF multiplications. 
353 | 354 | // Matrix size NxN 355 | const int N = RecoveryCount; 356 | 357 | // Generators 358 | uint8_t g[256], b[256]; 359 | for (int i = 0; i < N; ++i) 360 | { 361 | g[i] = 1; 362 | b[i] = 1; 363 | } 364 | 365 | // Temporary buffer for rotated row of U matrix 366 | // This allows for faster GF bulk multiplication 367 | uint8_t rotated_row_U[256]; 368 | uint8_t* last_U = matrix_U + ((N - 1) * N) / 2 - 1; 369 | int firstOffset_U = 0; 370 | 371 | // Start the x_0 values arbitrarily from the original count. 372 | const uint8_t x_0 = static_cast(Params.OriginalCount); 373 | 374 | // Unrolling k = 0 just makes it slower for some reason. 375 | for (int k = 0; k < N - 1; ++k) 376 | { 377 | const uint8_t x_k = Recovery[k]->Index; 378 | const uint8_t y_k = ErasuresIndices[k]; 379 | 380 | // D_kk = (x_k + y_k) 381 | // L_kk = g[k] / (x_k + y_k) 382 | // U_kk = b[k] * (x_0 + y_k) / (x_k + y_k) 383 | const uint8_t D_kk = gf256_add(x_k, y_k); 384 | const uint8_t L_kk = gf256_div(g[k], D_kk); 385 | const uint8_t U_kk = gf256_mul(gf256_div(b[k], D_kk), gf256_add(x_0, y_k)); 386 | 387 | // diag_D[k] = D_kk * L_kk * U_kk 388 | diag_D[k] = gf256_mul(D_kk, gf256_mul(L_kk, U_kk)); 389 | 390 | // Computing the k-th row of L and U 391 | uint8_t* row_L = matrix_L; 392 | uint8_t* row_U = rotated_row_U; 393 | for (int j = k + 1; j < N; ++j) 394 | { 395 | const uint8_t x_j = Recovery[j]->Index; 396 | const uint8_t y_j = ErasuresIndices[j]; 397 | 398 | // L_jk = g[j] / (x_j + y_k) 399 | // U_kj = b[j] / (x_k + y_j) 400 | const uint8_t L_jk = gf256_div(g[j], gf256_add(x_j, y_k)); 401 | const uint8_t U_kj = gf256_div(b[j], gf256_add(x_k, y_j)); 402 | 403 | *matrix_L++ = L_jk; 404 | *row_U++ = U_kj; 405 | 406 | // g[j] = g[j] * (x_j + x_k) / (x_j + y_k) 407 | // b[j] = b[j] * (y_j + y_k) / (y_j + x_k) 408 | g[j] = gf256_mul(g[j], gf256_div(gf256_add(x_j, x_k), gf256_add(x_j, y_k))); 409 | b[j] = gf256_mul(b[j], gf256_div(gf256_add(y_j, y_k), gf256_add(y_j, x_k))); 410 | } 411 | 412 | // Do 
these row/column divisions in bulk for speed. 413 | // L_jk /= L_kk 414 | // U_kj /= U_kk 415 | const int count = N - (k + 1); 416 | gf256_div_mem(row_L, row_L, L_kk, count); 417 | gf256_div_mem(rotated_row_U, rotated_row_U, U_kk, count); 418 | 419 | // Copy U matrix row into place in memory. 420 | uint8_t* output_U = last_U + firstOffset_U; 421 | row_U = rotated_row_U; 422 | for (int j = k + 1; j < N; ++j) 423 | { 424 | *output_U = *row_U++; 425 | output_U -= j; 426 | } 427 | firstOffset_U -= k + 2; 428 | } 429 | 430 | // Multiply diagonal matrix into U 431 | uint8_t* row_U = matrix_U; 432 | for (int j = N - 1; j > 0; --j) 433 | { 434 | const uint8_t y_j = ErasuresIndices[j]; 435 | const int count = j; 436 | 437 | gf256_mul_mem(row_U, row_U, gf256_add(x_0, y_j), count); 438 | row_U += count; 439 | } 440 | 441 | const uint8_t x_n = Recovery[N - 1]->Index; 442 | const uint8_t y_n = ErasuresIndices[N - 1]; 443 | 444 | // D_nn = 1 / (x_n + y_n) 445 | // L_nn = g[N-1] 446 | // U_nn = b[N-1] * (x_0 + y_n) 447 | const uint8_t L_nn = g[N - 1]; 448 | const uint8_t U_nn = gf256_mul(b[N - 1], gf256_add(x_0, y_n)); 449 | 450 | // diag_D[N-1] = L_nn * D_nn * U_nn 451 | diag_D[N - 1] = gf256_div(gf256_mul(L_nn, U_nn), gf256_add(x_n, y_n)); 452 | } 453 | 454 | void CM256Decoder::Decode() 455 | { 456 | // Matrix size is NxN, where N is the number of recovery blocks used. 457 | const int N = RecoveryCount; 458 | 459 | // Start the x_0 values arbitrarily from the original count. 
460 | const uint8_t x_0 = static_cast(Params.OriginalCount); 461 | 462 | // Eliminate original data from the recovery rows 463 | for (int originalIndex = 0; originalIndex < OriginalCount; ++originalIndex) 464 | { 465 | const uint8_t* inBlock = static_cast(Original[originalIndex]->Block); 466 | const uint8_t inRow = Original[originalIndex]->Index; 467 | 468 | for (int recoveryIndex = 0; recoveryIndex < N; ++recoveryIndex) 469 | { 470 | uint8_t* outBlock = static_cast(Recovery[recoveryIndex]->Block); 471 | const uint8_t x_i = Recovery[recoveryIndex]->Index; 472 | const uint8_t y_j = inRow; 473 | const uint8_t matrixElement = GetMatrixElement(x_i, x_0, y_j); 474 | 475 | gf256_muladd_mem(outBlock, matrixElement, inBlock, Params.BlockBytes); 476 | } 477 | } 478 | 479 | // Allocate matrix: use the stack buffer unless N * N bytes exceed StackAllocSize, otherwise allocate on the heap 480 | static const int StackAllocSize = 2048; 481 | uint8_t stackMatrix[StackAllocSize]; 482 | uint8_t* dynamicMatrix = nullptr; 483 | uint8_t* matrix = stackMatrix; 484 | const int requiredSpace = N * N; 485 | if (requiredSpace > StackAllocSize) 486 | { 487 | dynamicMatrix = new uint8_t[requiredSpace]; 488 | matrix = dynamicMatrix; 489 | } 490 | 491 | /* 492 | Compute matrix decomposition: 493 | 494 | G = L * D * U 495 | 496 | L is lower-triangular, diagonal is all ones. 497 | D is a diagonal matrix. 498 | U is upper-triangular, diagonal is all ones. The buffer holds U first ((N - 1) * N / 2 bytes), then D (N bytes), then L. 499 | */ 500 | uint8_t* matrix_U = matrix; 501 | uint8_t* diag_D = matrix_U + (N - 1) * N / 2; 502 | uint8_t* matrix_L = diag_D + N; 503 | GenerateLDUDecomposition(matrix_L, diag_D, matrix_U); 504 | 505 | /* 506 | Eliminate lower left triangle. 507 | */ 508 | // For each column, 509 | for (int j = 0; j < N - 1; ++j) 510 | { 511 | const void* block_j = Recovery[j]->Block; 512 | 513 | // For each row, 514 | for (int i = j + 1; i < N; ++i) 515 | { 516 | void* block_i = Recovery[i]->Block; 517 | const uint8_t c_ij = *matrix_L++; // Matrix elements are stored column-first, top-down. 
518 | 519 | gf256_muladd_mem(block_i, c_ij, block_j, Params.BlockBytes); 520 | } 521 | } 522 | 523 | /* 524 | Eliminate diagonal. 525 | */ 526 | for (int i = 0; i < N; ++i) 527 | { 528 | void* block = Recovery[i]->Block; 529 | 530 | Recovery[i]->Index = ErasuresIndices[i]; 531 | 532 | gf256_div_mem(block, block, diag_D[i], Params.BlockBytes); 533 | } 534 | 535 | /* 536 | Eliminate upper right triangle. 537 | */ 538 | for (int j = N - 1; j >= 1; --j) 539 | { 540 | const void* block_j = Recovery[j]->Block; 541 | 542 | for (int i = j - 1; i >= 0; --i) 543 | { 544 | void* block_i = Recovery[i]->Block; 545 | const uint8_t c_ij = *matrix_U++; // Matrix elements are stored column-first, bottom-up. 546 | 547 | gf256_muladd_mem(block_i, c_ij, block_j, Params.BlockBytes); 548 | } 549 | } 550 | 551 | delete[] dynamicMatrix; 552 | } 553 | 554 | extern "C" int cm256_decode( 555 | cm256_encoder_params params, // Encoder params 556 | cm256_block* blocks) // Array of 'originalCount' blocks as described above 557 | { 558 | if (params.OriginalCount <= 0 || 559 | params.RecoveryCount <= 0 || 560 | params.BlockBytes <= 0) 561 | { 562 | return -1; 563 | } 564 | if (params.OriginalCount + params.RecoveryCount > 256) 565 | { 566 | return -2; 567 | } 568 | if (!blocks) 569 | { 570 | return -3; 571 | } 572 | 573 | // If there is only one block, 574 | if (params.OriginalCount == 1) 575 | { 576 | // It is the same block repeated 577 | blocks[0].Index = 0; 578 | return 0; 579 | } 580 | 581 | CM256Decoder state; 582 | if (!state.Initialize(params, blocks)) 583 | { 584 | return -5; 585 | } 586 | 587 | // If nothing is erased, 588 | if (state.RecoveryCount <= 0) 589 | { 590 | return 0; 591 | } 592 | 593 | // If m=1, 594 | if (params.RecoveryCount == 1) 595 | { 596 | state.DecodeM1(); 597 | return 0; 598 | } 599 | 600 | // Decode for m>1 601 | state.Decode(); 602 | return 0; 603 | } 604 | -------------------------------------------------------------------------------- /src/gf256.cpp: 
-------------------------------------------------------------------------------- 1 | /** \file 2 | \brief GF(256) Main C API Source 3 | \copyright Copyright (c) 2017 Christopher A. Taylor. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of GF256 nor the names of its contributors may be 14 | used to endorse or promote products derived from this software without 15 | specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | #include "gf256.h" 31 | 32 | #ifdef LINUX_ARM 33 | #include 34 | #include 35 | #include 36 | #include 37 | #endif 38 | 39 | //------------------------------------------------------------------------------ 40 | // Detect host byte order. 
41 | // This check works with GCC and LLVM; assume little-endian byte order when 42 | // using any other compiler. 43 | // The result is verified during initialization. 44 | // 45 | #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) \ 46 | && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 47 | #define GF256_IS_BIG_ENDIAN 48 | #endif 49 | 50 | //------------------------------------------------------------------------------ 51 | // Workaround for ARMv7 that doesn't provide vqtbl1_* 52 | // This comes from linux-raid (https://www.spinics.net/lists/raid/msg58403.html) 53 | // 54 | #ifdef GF256_TRY_NEON 55 | #if __ARM_ARCH <= 7 && !defined(__aarch64__) 56 | static GF256_FORCE_INLINE uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) 57 | { 58 | union { 59 | uint8x16_t val; 60 | uint8x8x2_t pair; 61 | } __a = { a }; 62 | 63 | return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)), 64 | vtbl2_u8(__a.pair, vget_high_u8(b))); 65 | } 66 | #endif 67 | #endif 68 | 69 | //------------------------------------------------------------------------------ 70 | // Self-Test 71 | // 72 | // This is executed during initialization to make sure the library is working 73 | 74 | static const unsigned kTestBufferBytes = 32 + 16 + 8 + 4 + 2 + 1; 75 | static const unsigned kTestBufferAllocated = 64; 76 | struct SelfTestBuffersT 77 | { 78 | GF256_ALIGNED uint8_t A[kTestBufferAllocated]; 79 | GF256_ALIGNED uint8_t B[kTestBufferAllocated]; 80 | GF256_ALIGNED uint8_t C[kTestBufferAllocated]; 81 | }; 82 | static GF256_ALIGNED SelfTestBuffersT m_SelfTestBuffers; 83 | 84 | static bool gf256_self_test() 85 | { 86 | if ((uintptr_t)m_SelfTestBuffers.A % GF256_ALIGN_BYTES != 0) 87 | return false; 88 | if ((uintptr_t)m_SelfTestBuffers.A % GF256_ALIGN_BYTES != 0) 89 | return false; 90 | if ((uintptr_t)m_SelfTestBuffers.B % GF256_ALIGN_BYTES != 0) 91 | return false; 92 | if ((uintptr_t)m_SelfTestBuffers.C % GF256_ALIGN_BYTES != 0) 93 | return false; 94 | 95 | // Check multiplication/division 96 | for 
(unsigned i = 0; i < 256; ++i) 97 | { 98 | for (unsigned j = 0; j < 256; ++j) 99 | { 100 | uint8_t prod = gf256_mul((uint8_t)i, (uint8_t)j); 101 | if (i != 0 && j != 0) 102 | { 103 | uint8_t div1 = gf256_div(prod, (uint8_t)i); 104 | if (div1 != j) 105 | return false; 106 | uint8_t div2 = gf256_div(prod, (uint8_t)j); 107 | if (div2 != i) 108 | return false; 109 | } 110 | else if (prod != 0) 111 | return false; 112 | if (j == 1 && prod != i) 113 | return false; 114 | } 115 | } 116 | 117 | // Check for overruns 118 | m_SelfTestBuffers.A[kTestBufferBytes] = 0x5a; 119 | m_SelfTestBuffers.B[kTestBufferBytes] = 0x5a; 120 | m_SelfTestBuffers.C[kTestBufferBytes] = 0x5a; 121 | 122 | // Test gf256_add_mem() 123 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 124 | { 125 | m_SelfTestBuffers.A[i] = 0x1f; 126 | m_SelfTestBuffers.B[i] = 0xf7; 127 | } 128 | gf256_add_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, kTestBufferBytes); 129 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 130 | if (m_SelfTestBuffers.A[i] != (0x1f ^ 0xf7)) 131 | return false; 132 | 133 | // Test gf256_add2_mem() 134 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 135 | { 136 | m_SelfTestBuffers.A[i] = 0x1f; 137 | m_SelfTestBuffers.B[i] = 0xf7; 138 | m_SelfTestBuffers.C[i] = 0x71; 139 | } 140 | gf256_add2_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, m_SelfTestBuffers.C, kTestBufferBytes); 141 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 142 | if (m_SelfTestBuffers.A[i] != (0x1f ^ 0xf7 ^ 0x71)) 143 | return false; 144 | 145 | // Test gf256_addset_mem() 146 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 147 | { 148 | m_SelfTestBuffers.A[i] = 0x55; 149 | m_SelfTestBuffers.B[i] = 0xaa; 150 | m_SelfTestBuffers.C[i] = 0x6c; 151 | } 152 | gf256_addset_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, m_SelfTestBuffers.C, kTestBufferBytes); 153 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 154 | if (m_SelfTestBuffers.A[i] != (0xaa ^ 0x6c)) 155 | return false; 156 | 157 | // Test gf256_muladd_mem() 
158 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 159 | { 160 | m_SelfTestBuffers.A[i] = 0xff; 161 | m_SelfTestBuffers.B[i] = 0xaa; 162 | } 163 | const uint8_t expectedMulAdd = gf256_mul(0xaa, 0x6c); 164 | gf256_muladd_mem(m_SelfTestBuffers.A, 0x6c, m_SelfTestBuffers.B, kTestBufferBytes); 165 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 166 | if (m_SelfTestBuffers.A[i] != (expectedMulAdd ^ 0xff)) 167 | return false; 168 | 169 | // Test gf256_mul_mem() 170 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 171 | { 172 | m_SelfTestBuffers.A[i] = 0xff; 173 | m_SelfTestBuffers.B[i] = 0x55; 174 | } 175 | const uint8_t expectedMul = gf256_mul(0xa2, 0x55); 176 | gf256_mul_mem(m_SelfTestBuffers.A, m_SelfTestBuffers.B, 0xa2, kTestBufferBytes); 177 | for (unsigned i = 0; i < kTestBufferBytes; ++i) 178 | if (m_SelfTestBuffers.A[i] != expectedMul) 179 | return false; 180 | 181 | if (m_SelfTestBuffers.A[kTestBufferBytes] != 0x5a) 182 | return false; 183 | if (m_SelfTestBuffers.B[kTestBufferBytes] != 0x5a) 184 | return false; 185 | if (m_SelfTestBuffers.C[kTestBufferBytes] != 0x5a) 186 | return false; 187 | 188 | return true; 189 | } 190 | 191 | 192 | //------------------------------------------------------------------------------ 193 | // Runtime CPU Architecture Check 194 | // 195 | // Feature checks stolen shamelessly from 196 | // https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/runtime.c 197 | 198 | #if defined(HAVE_ANDROID_GETCPUFEATURES) 199 | #include 200 | #endif 201 | 202 | #if defined(GF256_TRY_NEON) 203 | # if defined(IOS) && (defined(__ARM_NEON) || defined(__ARM_NEON__)) 204 | // Requires iPhone 5S or newer 205 | static const bool CpuHasNeon = true; 206 | static const bool CpuHasNeon64 = true; 207 | # else // ANDROID or LINUX_ARM 208 | # if defined(__aarch64__) 209 | static bool CpuHasNeon = true; // if AARCH64, then we have NEON for sure... 
210 | static bool CpuHasNeon64 = true; // And we have ASIMD 211 | # else 212 | static bool CpuHasNeon = false; // if not, then we have to check at runtime. 213 | static bool CpuHasNeon64 = false; // And we don't have ASIMD 214 | # endif 215 | # endif 216 | #endif 217 | 218 | #if !defined(GF256_TARGET_MOBILE) 219 | 220 | #ifdef _MSC_VER 221 | #include // __cpuid 222 | #pragma warning(disable: 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX 223 | #endif 224 | 225 | #ifdef GF256_TRY_AVX2 226 | static bool CpuHasAVX2 = false; 227 | #endif 228 | static bool CpuHasSSSE3 = false; 229 | 230 | #define CPUID_EBX_AVX2 0x00000020 231 | #define CPUID_ECX_SSSE3 0x00000200 232 | 233 | static void _cpuid(unsigned int cpu_info[4U], const unsigned int cpu_info_type) 234 | { 235 | #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) 236 | __cpuid((int *) cpu_info, cpu_info_type); 237 | #else //if defined(HAVE_CPUID) 238 | cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; 239 | # ifdef __i386__ 240 | __asm__ __volatile__ ("pushfl; pushfl; " 241 | "popl %0; " 242 | "movl %0, %1; xorl %2, %0; " 243 | "pushl %0; " 244 | "popfl; pushfl; popl %0; popfl" : 245 | "=&r" (cpu_info[0]), "=&r" (cpu_info[1]) : 246 | "i" (0x200000)); 247 | if (((cpu_info[0] ^ cpu_info[1]) & 0x200000) == 0) { 248 | return; /* LCOV_EXCL_LINE */ 249 | } 250 | # endif 251 | # ifdef __i386__ 252 | __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : 253 | "=a" (cpu_info[0]), "=&r" (cpu_info[1]), 254 | "=c" (cpu_info[2]), "=d" (cpu_info[3]) : 255 | "0" (cpu_info_type), "2" (0U)); 256 | # elif defined(__x86_64__) 257 | __asm__ __volatile__ ("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" : 258 | "=a" (cpu_info[0]), "=&r" (cpu_info[1]), 259 | "=c" (cpu_info[2]), "=d" (cpu_info[3]) : 260 | "0" (cpu_info_type), "2" (0U)); 261 | # else 262 | __asm__ __volatile__ ("cpuid" : 263 | "=a" (cpu_info[0]), "=b" (cpu_info[1]), 264 | "=c" (cpu_info[2]), 
"=d" (cpu_info[3]) : 265 | "0" (cpu_info_type), "2" (0U)); 266 | # endif 267 | #endif 268 | } 269 | 270 | #else 271 | #if defined(LINUX_ARM) 272 | static void checkLinuxARMNeonCapabilities( bool& cpuHasNeon ) 273 | { 274 | auto cpufile = open("/proc/self/auxv", O_RDONLY); 275 | Elf32_auxv_t auxv; 276 | if (cpufile >= 0) 277 | { 278 | const auto size_auxv_t = sizeof(Elf32_auxv_t); 279 | while (read(cpufile, &auxv, size_auxv_t) == size_auxv_t) 280 | { 281 | if (auxv.a_type == AT_HWCAP) 282 | { 283 | cpuHasNeon = (auxv.a_un.a_val & 4096) != 0; 284 | break; 285 | } 286 | } 287 | close(cpufile); 288 | } 289 | else 290 | { 291 | cpuHasNeon = false; 292 | } 293 | } 294 | #endif 295 | #endif // defined(GF256_TARGET_MOBILE) 296 | 297 | static void gf256_architecture_init() 298 | { 299 | #if defined(GF256_TRY_NEON) 300 | 301 | // Check for NEON support on Android platform 302 | #if defined(HAVE_ANDROID_GETCPUFEATURES) 303 | AndroidCpuFamily family = android_getCpuFamily(); 304 | if (family == ANDROID_CPU_FAMILY_ARM) 305 | { 306 | if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) 307 | CpuHasNeon = true; 308 | } 309 | else if (family == ANDROID_CPU_FAMILY_ARM64) 310 | { 311 | CpuHasNeon = true; 312 | if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) 313 | CpuHasNeon64 = true; 314 | } 315 | #endif 316 | 317 | #if defined(LINUX_ARM) 318 | // Check for NEON support on other ARM/Linux platforms 319 | checkLinuxARMNeonCapabilities(CpuHasNeon); 320 | #endif 321 | 322 | #endif //GF256_TRY_NEON 323 | 324 | #if !defined(GF256_TARGET_MOBILE) 325 | unsigned int cpu_info[4]; 326 | 327 | _cpuid(cpu_info, 1); 328 | CpuHasSSSE3 = ((cpu_info[2] & CPUID_ECX_SSSE3) != 0); 329 | 330 | #if defined(GF256_TRY_AVX2) 331 | _cpuid(cpu_info, 7); 332 | CpuHasAVX2 = ((cpu_info[1] & CPUID_EBX_AVX2) != 0); 333 | #endif // GF256_TRY_AVX2 334 | 335 | // When AVX2 and SSSE3 are unavailable, Siamese takes 4x longer to decode 336 | // and 2.6x longer to encode. 
Encoding requires a lot more simple XOR ops 337 | // so it is still pretty fast. Decoding is usually really quick because 338 | // average loss rates are low, but when needed it requires a lot more 339 | // GF multiplies requiring table lookups which is slower. 340 | 341 | #endif // GF256_TARGET_MOBILE 342 | } 343 | 344 | 345 | //------------------------------------------------------------------------------ 346 | // Context Object 347 | 348 | // Context object for GF(2^^8) math 349 | GF256_ALIGNED gf256_ctx GF256Ctx; 350 | static bool Initialized = false; 351 | 352 | 353 | //------------------------------------------------------------------------------ 354 | // Generator Polynomial 355 | 356 | // There are only 16 irreducible polynomials for GF(2^^8) 357 | static const int GF256_GEN_POLY_COUNT = 16; 358 | static const uint8_t GF256_GEN_POLY[GF256_GEN_POLY_COUNT] = { 359 | 0x8e, 0x95, 0x96, 0xa6, 0xaf, 0xb1, 0xb2, 0xb4, 360 | 0xb8, 0xc3, 0xc6, 0xd4, 0xe1, 0xe7, 0xf3, 0xfa 361 | }; 362 | 363 | static const int kDefaultPolynomialIndex = 3; 364 | 365 | // Select which polynomial to use 366 | static void gf256_poly_init(int polynomialIndex) 367 | { 368 | if (polynomialIndex < 0 || polynomialIndex >= GF256_GEN_POLY_COUNT) 369 | polynomialIndex = kDefaultPolynomialIndex; 370 | 371 | GF256Ctx.Polynomial = (GF256_GEN_POLY[polynomialIndex] << 1) | 1; 372 | } 373 | 374 | 375 | //------------------------------------------------------------------------------ 376 | // Exponential and Log Tables 377 | 378 | // Construct EXP and LOG tables from polynomial 379 | static void gf256_explog_init() 380 | { 381 | unsigned poly = GF256Ctx.Polynomial; 382 | uint8_t* exptab = GF256Ctx.GF256_EXP_TABLE; 383 | uint16_t* logtab = GF256Ctx.GF256_LOG_TABLE; 384 | 385 | logtab[0] = 512; 386 | exptab[0] = 1; 387 | for (unsigned jj = 1; jj < 255; ++jj) 388 | { 389 | unsigned next = (unsigned)exptab[jj - 1] * 2; 390 | if (next >= 256) 391 | next ^= poly; 392 | 393 | exptab[jj] = static_cast( next ); 
394 | logtab[exptab[jj]] = static_cast( jj ); 395 | } 396 | exptab[255] = exptab[0]; 397 | logtab[exptab[255]] = 255; 398 | for (unsigned jj = 256; jj < 2 * 255; ++jj) 399 | exptab[jj] = exptab[jj % 255]; 400 | exptab[2 * 255] = 1; 401 | for (unsigned jj = 2 * 255 + 1; jj < 4 * 255; ++jj) 402 | exptab[jj] = 0; 403 | } 404 | 405 | 406 | //------------------------------------------------------------------------------ 407 | // Multiply and Divide Tables 408 | 409 | // Initialize MUL and DIV tables using LOG and EXP tables 410 | static void gf256_muldiv_init() 411 | { 412 | // Allocate table memory 65KB x 2 413 | uint8_t* m = GF256Ctx.GF256_MUL_TABLE; 414 | uint8_t* d = GF256Ctx.GF256_DIV_TABLE; 415 | 416 | // Unroll y = 0 subtable 417 | for (int x = 0; x < 256; ++x) 418 | m[x] = d[x] = 0; 419 | 420 | // For each other y value: 421 | for (int y = 1; y < 256; ++y) 422 | { 423 | // Calculate log(y) for mult and 255 - log(y) for div 424 | const uint8_t log_y = static_cast(GF256Ctx.GF256_LOG_TABLE[y]); 425 | const uint8_t log_yn = 255 - log_y; 426 | 427 | // Next subtable 428 | m += 256, d += 256; 429 | 430 | // Unroll x = 0 431 | m[0] = 0, d[0] = 0; 432 | 433 | // Calculate x * y, x / y 434 | for (int x = 1; x < 256; ++x) 435 | { 436 | uint16_t log_x = GF256Ctx.GF256_LOG_TABLE[x]; 437 | 438 | m[x] = GF256Ctx.GF256_EXP_TABLE[log_x + log_y]; 439 | d[x] = GF256Ctx.GF256_EXP_TABLE[log_x + log_yn]; 440 | } 441 | } 442 | } 443 | 444 | 445 | //------------------------------------------------------------------------------ 446 | // Inverse Table 447 | 448 | // Initialize INV table using DIV table 449 | static void gf256_inv_init() 450 | { 451 | for (int x = 0; x < 256; ++x) 452 | GF256Ctx.GF256_INV_TABLE[x] = gf256_div(1, static_cast(x)); 453 | } 454 | 455 | 456 | //------------------------------------------------------------------------------ 457 | // Square Table 458 | 459 | // Initialize SQR table using MUL table 460 | static void gf256_sqr_init() 461 | { 462 | for (int x = 0; 
x < 256; ++x) 463 | GF256Ctx.GF256_SQR_TABLE[x] = gf256_mul(static_cast(x), static_cast(x)); 464 | } 465 | 466 | 467 | //------------------------------------------------------------------------------ 468 | // Multiply and Add Memory Tables 469 | 470 | /* 471 | Fast algorithm to compute m[1..8] = a[1..8] * b in GF(256) 472 | using SSE3 SIMD instruction set: 473 | 474 | Consider z = x * y in GF(256). 475 | This operation can be performed bit-by-bit. Usefully, the partial product 476 | of each bit is combined linearly with the rest. This means that the 8-bit 477 | number x can be split into its high and low 4 bits, and partial products 478 | can be formed from each half. Then the halves can be linearly combined: 479 | 480 | z = x[0..3] * y + x[4..7] * y 481 | 482 | The multiplication of each half can be done efficiently via table lookups, 483 | and the addition in GF(256) is XOR. There must be two tables that map 16 484 | input elements for the low or high 4 bits of x to the two partial products. 485 | Each value for y has a different set of two tables: 486 | 487 | z = TABLE_LO_y(x[0..3]) xor TABLE_HI_y(x[4..7]) 488 | 489 | This means that we need 16 * 2 * 256 = 8192 bytes for precomputed tables. 490 | 491 | Computing z[] = x[] * y can be performed 16 bytes at a time by using the 492 | 128-bit register operations supported by modern processors. 493 | 494 | This is efficiently realized in SSE3 using the _mm_shuffle_epi8() function 495 | provided by Visual Studio 2010 or newer in . This function 496 | uses the low bits to do a table lookup on each byte. 
Unfortunately the
    high bit of each mask byte has the special feature that it clears the
    output byte when it is set, so we need to make sure it's cleared by masking
    off the high 4 bits of each byte before using it:

        clr_mask = _mm_set1_epi8(0x0f) = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

    For the low half of the partial product, clear the high 4 bits of each byte
    and perform the table lookup:

        p_lo = _mm_and_si128(x, clr_mask)
        p_lo = _mm_shuffle_epi8(p_lo, TABLE_LO_y)

    For the high half of the partial product, shift the high 4 bits of each
    byte into the low 4 bits and clear the high 4 bits of each byte, and then
    perform the table lookup:

        p_hi = _mm_srli_epi64(x, 4)
        p_hi = _mm_and_si128(p_hi, clr_mask)
        p_hi = _mm_shuffle_epi8(p_hi, TABLE_HI_y)

    Finally add the two partial products to form the product, recalling that
    addition is XOR in a Galois field:

        result = _mm_xor_si128(p_lo, p_hi)

    This crunches 16 bytes of x at a time, and the result can be stored in z.
*/

/*
    Intrinsic reference:

    SSSE3, VS2010+, tmmintrin.h:

    GF256_M128 _mm_shuffle_epi8(GF256_M128 a, GF256_M128 mask);
        Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3) instruction pshufb. This instruction shuffles the 16 bytes of a 128-bit parameter.

        Pseudo-code for PSHUFB (with 128 bit operands):

            for i = 0 to 15 {
                 if (SRC[(i * 8)+7] = 1 ) then
                      DEST[(i*8)+7..(i*8)+0] <- 0;
                  else
                      index[3..0] <- SRC[(i*8)+3 .. (i*8)+0];
                      DEST[(i*8)+7..(i*8)+0] <- DEST[(index*8+7)..(index*8+0)];
                 endif
            }

    SSE2, VS2008+, emmintrin.h:

    GF256_M128 _mm_slli_epi64 (GF256_M128 a, int count);
        Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while shifting in zeros.
548 | GF256_M128 _mm_srli_epi64 (GF256_M128 a, int count); 549 | Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. 550 | GF256_M128 _mm_set1_epi8 (char b); 551 | Sets the 16 signed 8-bit integer values to b. 552 | GF256_M128 _mm_and_si128 (GF256_M128 a, GF256_M128 b); 553 | Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. 554 | GF256_M128 _mm_xor_si128 ( GF256_M128 a, GF256_M128 b); 555 | Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. 556 | */ 557 | 558 | // Initialize the multiplication tables using gf256_mul() 559 | static void gf256_mul_mem_init() 560 | { 561 | // Reuse aligned self test buffers to load table data 562 | uint8_t* lo = m_SelfTestBuffers.A; 563 | uint8_t* hi = m_SelfTestBuffers.B; 564 | 565 | for (int y = 0; y < 256; ++y) 566 | { 567 | // TABLE_LO_Y maps 0..15 to 8-bit partial product based on y. 568 | for (unsigned char x = 0; x < 16; ++x) 569 | { 570 | lo[x] = gf256_mul(x, static_cast( y )); 571 | hi[x] = gf256_mul(x << 4, static_cast( y )); 572 | } 573 | 574 | #if defined(GF256_TRY_NEON) 575 | if (CpuHasNeon) 576 | { 577 | GF256Ctx.MM128.TABLE_LO_Y[y] = vld1q_u8(lo); 578 | GF256Ctx.MM128.TABLE_HI_Y[y] = vld1q_u8(hi); 579 | } 580 | #elif !defined(GF256_TARGET_MOBILE) 581 | const GF256_M128 table_lo = _mm_loadu_si128((GF256_M128*)lo); 582 | const GF256_M128 table_hi = _mm_loadu_si128((GF256_M128*)hi); 583 | _mm_storeu_si128(GF256Ctx.MM128.TABLE_LO_Y + y, table_lo); 584 | _mm_storeu_si128(GF256Ctx.MM128.TABLE_HI_Y + y, table_hi); 585 | # ifdef GF256_TRY_AVX2 586 | if (CpuHasAVX2) 587 | { 588 | const GF256_M256 table_lo2 = _mm256_broadcastsi128_si256(table_lo); 589 | const GF256_M256 table_hi2 = _mm256_broadcastsi128_si256(table_hi); 590 | _mm256_storeu_si256(GF256Ctx.MM256.TABLE_LO_Y + y, table_lo2); 591 | _mm256_storeu_si256(GF256Ctx.MM256.TABLE_HI_Y + y, table_hi2); 592 | } 593 | # endif // GF256_TRY_AVX2 594 | #endif // 
GF256_TARGET_MOBILE 595 | } 596 | } 597 | 598 | 599 | //------------------------------------------------------------------------------ 600 | // Initialization 601 | 602 | #ifdef GF256_IS_BIG_ENDIAN 603 | static unsigned char kEndianTestData[4] = { 1, 2, 3, 4 }; 604 | #else 605 | static unsigned char kEndianTestData[4] = { 4, 3, 2, 1 }; 606 | #endif 607 | 608 | union UnionType 609 | { 610 | uint32_t IntValue; 611 | char CharArray[4]; 612 | }; 613 | 614 | static bool IsExpectedEndian() 615 | { 616 | UnionType type; 617 | for (unsigned i = 0; i < 4; ++i) 618 | type.CharArray[i] = kEndianTestData[i]; 619 | return 0x01020304 == type.IntValue; 620 | } 621 | 622 | extern "C" int gf256_init_(int version) 623 | { 624 | if (version != GF256_VERSION) 625 | return -1; // User's header does not match library version. 626 | 627 | // Avoid multiple initialization 628 | if (Initialized) 629 | return 0; 630 | Initialized = true; 631 | 632 | if (!IsExpectedEndian()) 633 | return -2; // Unexpected byte order. 
634 | 635 | gf256_architecture_init(); 636 | gf256_poly_init(kDefaultPolynomialIndex); 637 | gf256_explog_init(); 638 | gf256_muldiv_init(); 639 | gf256_inv_init(); 640 | gf256_sqr_init(); 641 | gf256_mul_mem_init(); 642 | 643 | if (!gf256_self_test()) 644 | return -3; // Self-test failed (perhaps untested configuration) 645 | 646 | return 0; 647 | } 648 | 649 | 650 | //------------------------------------------------------------------------------ 651 | // Operations 652 | 653 | extern "C" void gf256_add_mem(void * GF256_RESTRICT vx, 654 | const void * GF256_RESTRICT vy, int bytes) 655 | { 656 | GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 657 | const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 658 | 659 | #if defined(GF256_TARGET_MOBILE) 660 | # if defined(GF256_TRY_NEON) 661 | // Handle multiples of 64 bytes 662 | if (CpuHasNeon) 663 | { 664 | while (bytes >= 64) 665 | { 666 | GF256_M128 x0 = vld1q_u8((uint8_t*) x16); 667 | GF256_M128 x1 = vld1q_u8((uint8_t*)(x16 + 1) ); 668 | GF256_M128 x2 = vld1q_u8((uint8_t*)(x16 + 2) ); 669 | GF256_M128 x3 = vld1q_u8((uint8_t*)(x16 + 3) ); 670 | GF256_M128 y0 = vld1q_u8((uint8_t*)y16); 671 | GF256_M128 y1 = vld1q_u8((uint8_t*)(y16 + 1)); 672 | GF256_M128 y2 = vld1q_u8((uint8_t*)(y16 + 2)); 673 | GF256_M128 y3 = vld1q_u8((uint8_t*)(y16 + 3)); 674 | 675 | vst1q_u8((uint8_t*)x16, veorq_u8(x0, y0)); 676 | vst1q_u8((uint8_t*)(x16 + 1), veorq_u8(x1, y1)); 677 | vst1q_u8((uint8_t*)(x16 + 2), veorq_u8(x2, y2)); 678 | vst1q_u8((uint8_t*)(x16 + 3), veorq_u8(x3, y3)); 679 | 680 | bytes -= 64, x16 += 4, y16 += 4; 681 | } 682 | 683 | // Handle multiples of 16 bytes 684 | while (bytes >= 16) 685 | { 686 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 687 | GF256_M128 y0 = vld1q_u8((uint8_t*)y16); 688 | 689 | vst1q_u8((uint8_t*)x16, veorq_u8(x0, y0)); 690 | 691 | bytes -= 16, ++x16, ++y16; 692 | } 693 | } 694 | else 695 | # endif // GF256_TRY_NEON 696 | { 697 | uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x16); 698 | 
const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y16); 699 | 700 | const unsigned count = (unsigned)bytes / 8; 701 | for (unsigned ii = 0; ii < count; ++ii) 702 | x8[ii] ^= y8[ii]; 703 | 704 | x16 = reinterpret_cast(x8 + count); 705 | y16 = reinterpret_cast(y8 + count); 706 | 707 | bytes -= (count * 8); 708 | } 709 | #else // GF256_TARGET_MOBILE 710 | # if defined(GF256_TRY_AVX2) 711 | if (CpuHasAVX2) 712 | { 713 | GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 714 | const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast(y16); 715 | 716 | while (bytes >= 128) 717 | { 718 | GF256_M256 x0 = _mm256_loadu_si256(x32); 719 | GF256_M256 y0 = _mm256_loadu_si256(y32); 720 | x0 = _mm256_xor_si256(x0, y0); 721 | GF256_M256 x1 = _mm256_loadu_si256(x32 + 1); 722 | GF256_M256 y1 = _mm256_loadu_si256(y32 + 1); 723 | x1 = _mm256_xor_si256(x1, y1); 724 | GF256_M256 x2 = _mm256_loadu_si256(x32 + 2); 725 | GF256_M256 y2 = _mm256_loadu_si256(y32 + 2); 726 | x2 = _mm256_xor_si256(x2, y2); 727 | GF256_M256 x3 = _mm256_loadu_si256(x32 + 3); 728 | GF256_M256 y3 = _mm256_loadu_si256(y32 + 3); 729 | x3 = _mm256_xor_si256(x3, y3); 730 | 731 | _mm256_storeu_si256(x32, x0); 732 | _mm256_storeu_si256(x32 + 1, x1); 733 | _mm256_storeu_si256(x32 + 2, x2); 734 | _mm256_storeu_si256(x32 + 3, x3); 735 | 736 | bytes -= 128, x32 += 4, y32 += 4; 737 | } 738 | 739 | // Handle multiples of 32 bytes 740 | while (bytes >= 32) 741 | { 742 | // x[i] = x[i] xor y[i] 743 | _mm256_storeu_si256(x32, 744 | _mm256_xor_si256( 745 | _mm256_loadu_si256(x32), 746 | _mm256_loadu_si256(y32))); 747 | 748 | bytes -= 32, ++x32, ++y32; 749 | } 750 | 751 | x16 = reinterpret_cast(x32); 752 | y16 = reinterpret_cast(y32); 753 | } 754 | else 755 | # endif // GF256_TRY_AVX2 756 | { 757 | while (bytes >= 64) 758 | { 759 | GF256_M128 x0 = _mm_loadu_si128(x16); 760 | GF256_M128 y0 = _mm_loadu_si128(y16); 761 | x0 = _mm_xor_si128(x0, y0); 762 | GF256_M128 x1 = _mm_loadu_si128(x16 + 1); 763 | GF256_M128 y1 = 
_mm_loadu_si128(y16 + 1); 764 | x1 = _mm_xor_si128(x1, y1); 765 | GF256_M128 x2 = _mm_loadu_si128(x16 + 2); 766 | GF256_M128 y2 = _mm_loadu_si128(y16 + 2); 767 | x2 = _mm_xor_si128(x2, y2); 768 | GF256_M128 x3 = _mm_loadu_si128(x16 + 3); 769 | GF256_M128 y3 = _mm_loadu_si128(y16 + 3); 770 | x3 = _mm_xor_si128(x3, y3); 771 | 772 | _mm_storeu_si128(x16, x0); 773 | _mm_storeu_si128(x16 + 1, x1); 774 | _mm_storeu_si128(x16 + 2, x2); 775 | _mm_storeu_si128(x16 + 3, x3); 776 | 777 | bytes -= 64, x16 += 4, y16 += 4; 778 | } 779 | } 780 | #endif // GF256_TARGET_MOBILE 781 | 782 | #if !defined(GF256_TARGET_MOBILE) 783 | // Handle multiples of 16 bytes 784 | while (bytes >= 16) 785 | { 786 | // x[i] = x[i] xor y[i] 787 | _mm_storeu_si128(x16, 788 | _mm_xor_si128( 789 | _mm_loadu_si128(x16), 790 | _mm_loadu_si128(y16))); 791 | 792 | bytes -= 16, ++x16, ++y16; 793 | } 794 | #endif 795 | 796 | uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 797 | const uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 798 | 799 | // Handle a block of 8 bytes 800 | const int eight = bytes & 8; 801 | if (eight) 802 | { 803 | uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 804 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 805 | *x8 ^= *y8; 806 | } 807 | 808 | // Handle a block of 4 bytes 809 | const int four = bytes & 4; 810 | if (four) 811 | { 812 | uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 813 | const uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 814 | *x4 ^= *y4; 815 | } 816 | 817 | // Handle final bytes 818 | const int offset = eight + four; 819 | switch (bytes & 3) 820 | { 821 | case 3: x1[offset + 2] ^= y1[offset + 2]; 822 | case 2: x1[offset + 1] ^= y1[offset + 1]; 823 | case 1: x1[offset] ^= y1[offset]; 824 | default: 825 | break; 826 | } 827 | } 828 | 829 | extern "C" void gf256_add2_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 830 | const void * GF256_RESTRICT vy, int bytes) 831 | { 832 | GF256_M128 * 
GF256_RESTRICT z16 = reinterpret_cast(vz); 833 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 834 | const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 835 | 836 | #if defined(GF256_TARGET_MOBILE) 837 | # if defined(GF256_TRY_NEON) 838 | // Handle multiples of 64 bytes 839 | if (CpuHasNeon) 840 | { 841 | // Handle multiples of 16 bytes 842 | while (bytes >= 16) 843 | { 844 | // z[i] = z[i] xor x[i] xor y[i] 845 | vst1q_u8((uint8_t*)z16, 846 | veorq_u8( 847 | vld1q_u8((uint8_t*)z16), 848 | veorq_u8( 849 | vld1q_u8((uint8_t*)x16), 850 | vld1q_u8((uint8_t*)y16)))); 851 | 852 | bytes -= 16, ++x16, ++y16, ++z16; 853 | } 854 | } 855 | else 856 | # endif // GF256_TRY_NEON 857 | { 858 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z16); 859 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x16); 860 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y16); 861 | 862 | const unsigned count = (unsigned)bytes / 8; 863 | for (unsigned ii = 0; ii < count; ++ii) 864 | z8[ii] ^= x8[ii] ^ y8[ii]; 865 | 866 | z16 = reinterpret_cast(z8 + count); 867 | x16 = reinterpret_cast(x8 + count); 868 | y16 = reinterpret_cast(y8 + count); 869 | 870 | bytes -= (count * 8); 871 | } 872 | #else // GF256_TARGET_MOBILE 873 | # if defined(GF256_TRY_AVX2) 874 | if (CpuHasAVX2) 875 | { 876 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(z16); 877 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 878 | const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast(y16); 879 | 880 | const unsigned count = bytes / 32; 881 | for (unsigned i = 0; i < count; ++i) 882 | { 883 | _mm256_storeu_si256(z32 + i, 884 | _mm256_xor_si256( 885 | _mm256_loadu_si256(z32 + i), 886 | _mm256_xor_si256( 887 | _mm256_loadu_si256(x32 + i), 888 | _mm256_loadu_si256(y32 + i)))); 889 | } 890 | 891 | bytes -= count * 32; 892 | z16 = reinterpret_cast(z32 + count); 893 | x16 = reinterpret_cast(x32 + count); 894 | y16 = reinterpret_cast(y32 + count); 895 | } 896 | # endif // 
GF256_TRY_AVX2 897 | 898 | // Handle multiples of 16 bytes 899 | while (bytes >= 16) 900 | { 901 | // z[i] = z[i] xor x[i] xor y[i] 902 | _mm_storeu_si128(z16, 903 | _mm_xor_si128( 904 | _mm_loadu_si128(z16), 905 | _mm_xor_si128( 906 | _mm_loadu_si128(x16), 907 | _mm_loadu_si128(y16)))); 908 | 909 | bytes -= 16, ++x16, ++y16, ++z16; 910 | } 911 | #endif // GF256_TARGET_MOBILE 912 | 913 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 914 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 915 | const uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 916 | 917 | // Handle a block of 8 bytes 918 | const int eight = bytes & 8; 919 | if (eight) 920 | { 921 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 922 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 923 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 924 | *z8 ^= *x8 ^ *y8; 925 | } 926 | 927 | // Handle a block of 4 bytes 928 | const int four = bytes & 4; 929 | if (four) 930 | { 931 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1 + eight); 932 | const uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 933 | const uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 934 | *z4 ^= *x4 ^ *y4; 935 | } 936 | 937 | // Handle final bytes 938 | const int offset = eight + four; 939 | switch (bytes & 3) 940 | { 941 | case 3: z1[offset + 2] ^= x1[offset + 2] ^ y1[offset + 2]; 942 | case 2: z1[offset + 1] ^= x1[offset + 1] ^ y1[offset + 1]; 943 | case 1: z1[offset] ^= x1[offset] ^ y1[offset]; 944 | default: 945 | break; 946 | } 947 | } 948 | 949 | extern "C" void gf256_addset_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, 950 | const void * GF256_RESTRICT vy, int bytes) 951 | { 952 | GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast(vz); 953 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 954 | const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 955 | 956 | #if defined(GF256_TARGET_MOBILE) 957 | # if 
defined(GF256_TRY_NEON) 958 | // Handle multiples of 64 bytes 959 | if (CpuHasNeon) 960 | { 961 | while (bytes >= 64) 962 | { 963 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 964 | GF256_M128 x1 = vld1q_u8((uint8_t*)(x16 + 1)); 965 | GF256_M128 x2 = vld1q_u8((uint8_t*)(x16 + 2)); 966 | GF256_M128 x3 = vld1q_u8((uint8_t*)(x16 + 3)); 967 | GF256_M128 y0 = vld1q_u8((uint8_t*)(y16)); 968 | GF256_M128 y1 = vld1q_u8((uint8_t*)(y16 + 1)); 969 | GF256_M128 y2 = vld1q_u8((uint8_t*)(y16 + 2)); 970 | GF256_M128 y3 = vld1q_u8((uint8_t*)(y16 + 3)); 971 | 972 | vst1q_u8((uint8_t*)z16, veorq_u8(x0, y0)); 973 | vst1q_u8((uint8_t*)(z16 + 1), veorq_u8(x1, y1)); 974 | vst1q_u8((uint8_t*)(z16 + 2), veorq_u8(x2, y2)); 975 | vst1q_u8((uint8_t*)(z16 + 3), veorq_u8(x3, y3)); 976 | 977 | bytes -= 64, x16 += 4, y16 += 4, z16 += 4; 978 | } 979 | 980 | // Handle multiples of 16 bytes 981 | while (bytes >= 16) 982 | { 983 | // z[i] = x[i] xor y[i] 984 | vst1q_u8((uint8_t*)z16, 985 | veorq_u8( 986 | vld1q_u8((uint8_t*)x16), 987 | vld1q_u8((uint8_t*)y16))); 988 | 989 | bytes -= 16, ++x16, ++y16, ++z16; 990 | } 991 | } 992 | else 993 | # endif // GF256_TRY_NEON 994 | { 995 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z16); 996 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x16); 997 | const uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y16); 998 | 999 | const unsigned count = (unsigned)bytes / 8; 1000 | for (unsigned ii = 0; ii < count; ++ii) 1001 | z8[ii] = x8[ii] ^ y8[ii]; 1002 | 1003 | x16 = reinterpret_cast(x8 + count); 1004 | y16 = reinterpret_cast(y8 + count); 1005 | z16 = reinterpret_cast(z8 + count); 1006 | 1007 | bytes -= (count * 8); 1008 | } 1009 | #else // GF256_TARGET_MOBILE 1010 | # if defined(GF256_TRY_AVX2) 1011 | if (CpuHasAVX2) 1012 | { 1013 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(z16); 1014 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 1015 | const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast(y16); 1016 | 1017 | const unsigned 
count = bytes / 32; 1018 | for (unsigned i = 0; i < count; ++i) 1019 | { 1020 | _mm256_storeu_si256(z32 + i, 1021 | _mm256_xor_si256( 1022 | _mm256_loadu_si256(x32 + i), 1023 | _mm256_loadu_si256(y32 + i))); 1024 | } 1025 | 1026 | bytes -= count * 32; 1027 | z16 = reinterpret_cast(z32 + count); 1028 | x16 = reinterpret_cast(x32 + count); 1029 | y16 = reinterpret_cast(y32 + count); 1030 | } 1031 | else 1032 | # endif // GF256_TRY_AVX2 1033 | { 1034 | // Handle multiples of 64 bytes 1035 | while (bytes >= 64) 1036 | { 1037 | GF256_M128 x0 = _mm_loadu_si128(x16); 1038 | GF256_M128 x1 = _mm_loadu_si128(x16 + 1); 1039 | GF256_M128 x2 = _mm_loadu_si128(x16 + 2); 1040 | GF256_M128 x3 = _mm_loadu_si128(x16 + 3); 1041 | GF256_M128 y0 = _mm_loadu_si128(y16); 1042 | GF256_M128 y1 = _mm_loadu_si128(y16 + 1); 1043 | GF256_M128 y2 = _mm_loadu_si128(y16 + 2); 1044 | GF256_M128 y3 = _mm_loadu_si128(y16 + 3); 1045 | 1046 | _mm_storeu_si128(z16, _mm_xor_si128(x0, y0)); 1047 | _mm_storeu_si128(z16 + 1, _mm_xor_si128(x1, y1)); 1048 | _mm_storeu_si128(z16 + 2, _mm_xor_si128(x2, y2)); 1049 | _mm_storeu_si128(z16 + 3, _mm_xor_si128(x3, y3)); 1050 | 1051 | bytes -= 64, x16 += 4, y16 += 4, z16 += 4; 1052 | } 1053 | } 1054 | 1055 | // Handle multiples of 16 bytes 1056 | while (bytes >= 16) 1057 | { 1058 | // z[i] = x[i] xor y[i] 1059 | _mm_storeu_si128(z16, 1060 | _mm_xor_si128( 1061 | _mm_loadu_si128(x16), 1062 | _mm_loadu_si128(y16))); 1063 | 1064 | bytes -= 16, ++x16, ++y16, ++z16; 1065 | } 1066 | #endif // GF256_TARGET_MOBILE 1067 | 1068 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 1069 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1070 | const uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 1071 | 1072 | // Handle a block of 8 bytes 1073 | const int eight = bytes & 8; 1074 | if (eight) 1075 | { 1076 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 1077 | const uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 1078 | const uint64_t * GF256_RESTRICT 
y8 = reinterpret_cast(y1); 1079 | *z8 = *x8 ^ *y8; 1080 | } 1081 | 1082 | // Handle a block of 4 bytes 1083 | const int four = bytes & 4; 1084 | if (four) 1085 | { 1086 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1 + eight); 1087 | const uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 1088 | const uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 1089 | *z4 = *x4 ^ *y4; 1090 | } 1091 | 1092 | // Handle final bytes 1093 | const int offset = eight + four; 1094 | switch (bytes & 3) 1095 | { 1096 | case 3: z1[offset + 2] = x1[offset + 2] ^ y1[offset + 2]; 1097 | case 2: z1[offset + 1] = x1[offset + 1] ^ y1[offset + 1]; 1098 | case 1: z1[offset] = x1[offset] ^ y1[offset]; 1099 | default: 1100 | break; 1101 | } 1102 | } 1103 | 1104 | extern "C" void gf256_mul_mem(void * GF256_RESTRICT vz, const void * GF256_RESTRICT vx, uint8_t y, int bytes) 1105 | { 1106 | // Use a single if-statement to handle special cases 1107 | if (y <= 1) 1108 | { 1109 | if (y == 0) 1110 | memset(vz, 0, bytes); 1111 | else if (vz != vx) 1112 | memcpy(vz, vx, bytes); 1113 | return; 1114 | } 1115 | 1116 | GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast(vz); 1117 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 1118 | 1119 | #if defined(GF256_TARGET_MOBILE) 1120 | #if defined(GF256_TRY_NEON) 1121 | if (bytes >= 16 && CpuHasNeon) 1122 | { 1123 | // Partial product tables; see above 1124 | const GF256_M128 table_lo_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_LO_Y + y)); 1125 | const GF256_M128 table_hi_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_HI_Y + y)); 1126 | 1127 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1128 | const GF256_M128 clr_mask = vdupq_n_u8(0x0f); 1129 | 1130 | // Handle multiples of 16 bytes 1131 | do 1132 | { 1133 | // See above comments for details 1134 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 1135 | GF256_M128 l0 = vandq_u8(x0, clr_mask); 1136 | x0 = vshrq_n_u8(x0, 4); 1137 | GF256_M128 h0 = vandq_u8(x0, clr_mask); 1138 | 
l0 = vqtbl1q_u8(table_lo_y, l0); 1139 | h0 = vqtbl1q_u8(table_hi_y, h0); 1140 | vst1q_u8((uint8_t*)z16, veorq_u8(l0, h0)); 1141 | 1142 | bytes -= 16, ++x16, ++z16; 1143 | } while (bytes >= 16); 1144 | } 1145 | #endif 1146 | #else 1147 | # if defined(GF256_TRY_AVX2) 1148 | if (bytes >= 32 && CpuHasAVX2) 1149 | { 1150 | // Partial product tables; see above 1151 | const GF256_M256 table_lo_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_LO_Y + y); 1152 | const GF256_M256 table_hi_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_HI_Y + y); 1153 | 1154 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1155 | const GF256_M256 clr_mask = _mm256_set1_epi8(0x0f); 1156 | 1157 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(vz); 1158 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(vx); 1159 | 1160 | // Handle multiples of 32 bytes 1161 | do 1162 | { 1163 | // See above comments for details 1164 | GF256_M256 x0 = _mm256_loadu_si256(x32); 1165 | GF256_M256 l0 = _mm256_and_si256(x0, clr_mask); 1166 | x0 = _mm256_srli_epi64(x0, 4); 1167 | GF256_M256 h0 = _mm256_and_si256(x0, clr_mask); 1168 | l0 = _mm256_shuffle_epi8(table_lo_y, l0); 1169 | h0 = _mm256_shuffle_epi8(table_hi_y, h0); 1170 | _mm256_storeu_si256(z32, _mm256_xor_si256(l0, h0)); 1171 | 1172 | bytes -= 32, ++x32, ++z32; 1173 | } while (bytes >= 32); 1174 | 1175 | z16 = reinterpret_cast(z32); 1176 | x16 = reinterpret_cast(x32); 1177 | } 1178 | # endif // GF256_TRY_AVX2 1179 | if (bytes >= 16 && CpuHasSSSE3) 1180 | { 1181 | // Partial product tables; see above 1182 | const GF256_M128 table_lo_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_LO_Y + y); 1183 | const GF256_M128 table_hi_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_HI_Y + y); 1184 | 1185 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1186 | const GF256_M128 clr_mask = _mm_set1_epi8(0x0f); 1187 | 1188 | // Handle multiples of 16 bytes 1189 | do 1190 | { 1191 | // See above comments for details 1192 | GF256_M128 x0 = _mm_loadu_si128(x16); 1193 | GF256_M128 l0 = 
_mm_and_si128(x0, clr_mask); 1194 | x0 = _mm_srli_epi64(x0, 4); 1195 | GF256_M128 h0 = _mm_and_si128(x0, clr_mask); 1196 | l0 = _mm_shuffle_epi8(table_lo_y, l0); 1197 | h0 = _mm_shuffle_epi8(table_hi_y, h0); 1198 | _mm_storeu_si128(z16, _mm_xor_si128(l0, h0)); 1199 | 1200 | bytes -= 16, ++x16, ++z16; 1201 | } while (bytes >= 16); 1202 | } 1203 | #endif 1204 | 1205 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 1206 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1207 | const uint8_t * GF256_RESTRICT table = GF256Ctx.GF256_MUL_TABLE + ((unsigned)y << 8); 1208 | 1209 | // Handle blocks of 8 bytes 1210 | while (bytes >= 8) 1211 | { 1212 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 1213 | #ifdef GF256_IS_BIG_ENDIAN 1214 | uint64_t word = (uint64_t)table[x1[0]] << 56; 1215 | word |= (uint64_t)table[x1[1]] << 48; 1216 | word |= (uint64_t)table[x1[2]] << 40; 1217 | word |= (uint64_t)table[x1[3]] << 32; 1218 | word |= (uint64_t)table[x1[4]] << 24; 1219 | word |= (uint64_t)table[x1[5]] << 16; 1220 | word |= (uint64_t)table[x1[6]] << 8; 1221 | word |= (uint64_t)table[x1[7]]; 1222 | #else 1223 | uint64_t word = table[x1[0]]; 1224 | word |= (uint64_t)table[x1[1]] << 8; 1225 | word |= (uint64_t)table[x1[2]] << 16; 1226 | word |= (uint64_t)table[x1[3]] << 24; 1227 | word |= (uint64_t)table[x1[4]] << 32; 1228 | word |= (uint64_t)table[x1[5]] << 40; 1229 | word |= (uint64_t)table[x1[6]] << 48; 1230 | word |= (uint64_t)table[x1[7]] << 56; 1231 | #endif 1232 | *z8 = word; 1233 | 1234 | bytes -= 8, x1 += 8, z1 += 8; 1235 | } 1236 | 1237 | // Handle a block of 4 bytes 1238 | const int four = bytes & 4; 1239 | if (four) 1240 | { 1241 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1); 1242 | #ifdef GF256_IS_BIG_ENDIAN 1243 | uint32_t word = (uint32_t)table[x1[0]] << 24; 1244 | word |= (uint32_t)table[x1[1]] << 16; 1245 | word |= (uint32_t)table[x1[2]] << 8; 1246 | word |= (uint32_t)table[x1[3]]; 1247 | #else 1248 | uint32_t word = table[x1[0]]; 1249 | 
word |= (uint32_t)table[x1[1]] << 8; 1250 | word |= (uint32_t)table[x1[2]] << 16; 1251 | word |= (uint32_t)table[x1[3]] << 24; 1252 | #endif 1253 | *z4 = word; 1254 | } 1255 | 1256 | // Handle single bytes 1257 | const int offset = four; 1258 | switch (bytes & 3) 1259 | { 1260 | case 3: z1[offset + 2] = table[x1[offset + 2]]; 1261 | case 2: z1[offset + 1] = table[x1[offset + 1]]; 1262 | case 1: z1[offset] = table[x1[offset]]; 1263 | default: 1264 | break; 1265 | } 1266 | } 1267 | 1268 | extern "C" void gf256_muladd_mem(void * GF256_RESTRICT vz, uint8_t y, 1269 | const void * GF256_RESTRICT vx, int bytes) 1270 | { 1271 | // Use a single if-statement to handle special cases 1272 | if (y <= 1) 1273 | { 1274 | if (y == 1) 1275 | gf256_add_mem(vz, vx, bytes); 1276 | return; 1277 | } 1278 | 1279 | GF256_M128 * GF256_RESTRICT z16 = reinterpret_cast(vz); 1280 | const GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 1281 | 1282 | #if defined(GF256_TARGET_MOBILE) 1283 | #if defined(GF256_TRY_NEON) 1284 | if (bytes >= 16 && CpuHasNeon) 1285 | { 1286 | // Partial product tables; see above 1287 | const GF256_M128 table_lo_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_LO_Y + y)); 1288 | const GF256_M128 table_hi_y = vld1q_u8((uint8_t*)(GF256Ctx.MM128.TABLE_HI_Y + y)); 1289 | 1290 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1291 | const GF256_M128 clr_mask = vdupq_n_u8(0x0f); 1292 | 1293 | // Handle multiples of 16 bytes 1294 | do 1295 | { 1296 | // See above comments for details 1297 | GF256_M128 x0 = vld1q_u8((uint8_t*)x16); 1298 | GF256_M128 l0 = vandq_u8(x0, clr_mask); 1299 | 1300 | // x0 = vshrq_n_u8(x0, 4); 1301 | x0 = (GF256_M128)vshrq_n_u64( (uint64x2_t)x0, 4); 1302 | GF256_M128 h0 = vandq_u8(x0, clr_mask); 1303 | l0 = vqtbl1q_u8(table_lo_y, l0); 1304 | h0 = vqtbl1q_u8(table_hi_y, h0); 1305 | const GF256_M128 p0 = veorq_u8(l0, h0); 1306 | const GF256_M128 z0 = vld1q_u8((uint8_t*)z16); 1307 | vst1q_u8((uint8_t*)z16, veorq_u8(p0, z0)); 1308 | bytes -= 16, 
++x16, ++z16; 1309 | } while (bytes >= 16); 1310 | } 1311 | #endif 1312 | #else // GF256_TARGET_MOBILE 1313 | # if defined(GF256_TRY_AVX2) 1314 | if (bytes >= 32 && CpuHasAVX2) 1315 | { 1316 | // Partial product tables; see above 1317 | const GF256_M256 table_lo_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_LO_Y + y); 1318 | const GF256_M256 table_hi_y = _mm256_loadu_si256(GF256Ctx.MM256.TABLE_HI_Y + y); 1319 | 1320 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1321 | const GF256_M256 clr_mask = _mm256_set1_epi8(0x0f); 1322 | 1323 | GF256_M256 * GF256_RESTRICT z32 = reinterpret_cast(z16); 1324 | const GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast(x16); 1325 | 1326 | // On my Reed Solomon codec, the encoder unit test runs in 640 usec without and 550 usec with the optimization (86% of the original time) 1327 | const unsigned count = bytes / 64; 1328 | for (unsigned i = 0; i < count; ++i) 1329 | { 1330 | // See above comments for details 1331 | GF256_M256 x0 = _mm256_loadu_si256(x32 + i * 2); 1332 | GF256_M256 l0 = _mm256_and_si256(x0, clr_mask); 1333 | x0 = _mm256_srli_epi64(x0, 4); 1334 | const GF256_M256 z0 = _mm256_loadu_si256(z32 + i * 2); 1335 | GF256_M256 h0 = _mm256_and_si256(x0, clr_mask); 1336 | l0 = _mm256_shuffle_epi8(table_lo_y, l0); 1337 | h0 = _mm256_shuffle_epi8(table_hi_y, h0); 1338 | const GF256_M256 p0 = _mm256_xor_si256(l0, h0); 1339 | _mm256_storeu_si256(z32 + i * 2, _mm256_xor_si256(p0, z0)); 1340 | 1341 | GF256_M256 x1 = _mm256_loadu_si256(x32 + i * 2 + 1); 1342 | GF256_M256 l1 = _mm256_and_si256(x1, clr_mask); 1343 | x1 = _mm256_srli_epi64(x1, 4); 1344 | const GF256_M256 z1 = _mm256_loadu_si256(z32 + i * 2 + 1); 1345 | GF256_M256 h1 = _mm256_and_si256(x1, clr_mask); 1346 | l1 = _mm256_shuffle_epi8(table_lo_y, l1); 1347 | h1 = _mm256_shuffle_epi8(table_hi_y, h1); 1348 | const GF256_M256 p1 = _mm256_xor_si256(l1, h1); 1349 | _mm256_storeu_si256(z32 + i * 2 + 1, _mm256_xor_si256(p1, z1)); 1350 | } 1351 | bytes -= count * 64; 1352 | z32 += 
count * 2; 1353 | x32 += count * 2; 1354 | 1355 | if (bytes >= 32) 1356 | { 1357 | GF256_M256 x0 = _mm256_loadu_si256(x32); 1358 | GF256_M256 l0 = _mm256_and_si256(x0, clr_mask); 1359 | x0 = _mm256_srli_epi64(x0, 4); 1360 | GF256_M256 h0 = _mm256_and_si256(x0, clr_mask); 1361 | l0 = _mm256_shuffle_epi8(table_lo_y, l0); 1362 | h0 = _mm256_shuffle_epi8(table_hi_y, h0); 1363 | const GF256_M256 p0 = _mm256_xor_si256(l0, h0); 1364 | const GF256_M256 z0 = _mm256_loadu_si256(z32); 1365 | _mm256_storeu_si256(z32, _mm256_xor_si256(p0, z0)); 1366 | 1367 | bytes -= 32; 1368 | z32++; 1369 | x32++; 1370 | } 1371 | 1372 | z16 = reinterpret_cast(z32); 1373 | x16 = reinterpret_cast(x32); 1374 | } 1375 | # endif // GF256_TRY_AVX2 1376 | if (bytes >= 16 && CpuHasSSSE3) 1377 | { 1378 | // Partial product tables; see above 1379 | const GF256_M128 table_lo_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_LO_Y + y); 1380 | const GF256_M128 table_hi_y = _mm_loadu_si128(GF256Ctx.MM128.TABLE_HI_Y + y); 1381 | 1382 | // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 1383 | const GF256_M128 clr_mask = _mm_set1_epi8(0x0f); 1384 | 1385 | // This unroll seems to provide about 7% speed boost when AVX2 is disabled 1386 | while (bytes >= 32) 1387 | { 1388 | bytes -= 32; 1389 | 1390 | GF256_M128 x1 = _mm_loadu_si128(x16 + 1); 1391 | GF256_M128 l1 = _mm_and_si128(x1, clr_mask); 1392 | x1 = _mm_srli_epi64(x1, 4); 1393 | GF256_M128 h1 = _mm_and_si128(x1, clr_mask); 1394 | l1 = _mm_shuffle_epi8(table_lo_y, l1); 1395 | h1 = _mm_shuffle_epi8(table_hi_y, h1); 1396 | const GF256_M128 z1 = _mm_loadu_si128(z16 + 1); 1397 | 1398 | GF256_M128 x0 = _mm_loadu_si128(x16); 1399 | GF256_M128 l0 = _mm_and_si128(x0, clr_mask); 1400 | x0 = _mm_srli_epi64(x0, 4); 1401 | GF256_M128 h0 = _mm_and_si128(x0, clr_mask); 1402 | l0 = _mm_shuffle_epi8(table_lo_y, l0); 1403 | h0 = _mm_shuffle_epi8(table_hi_y, h0); 1404 | const GF256_M128 z0 = _mm_loadu_si128(z16); 1405 | 1406 | const GF256_M128 p1 = _mm_xor_si128(l1, h1); 1407 | 
_mm_storeu_si128(z16 + 1, _mm_xor_si128(p1, z1)); 1408 | 1409 | const GF256_M128 p0 = _mm_xor_si128(l0, h0); 1410 | _mm_storeu_si128(z16, _mm_xor_si128(p0, z0)); 1411 | 1412 | x16 += 2, z16 += 2; 1413 | } 1414 | 1415 | // Handle multiples of 16 bytes 1416 | while (bytes >= 16) 1417 | { 1418 | // See above comments for details 1419 | GF256_M128 x0 = _mm_loadu_si128(x16); 1420 | GF256_M128 l0 = _mm_and_si128(x0, clr_mask); 1421 | x0 = _mm_srli_epi64(x0, 4); 1422 | GF256_M128 h0 = _mm_and_si128(x0, clr_mask); 1423 | l0 = _mm_shuffle_epi8(table_lo_y, l0); 1424 | h0 = _mm_shuffle_epi8(table_hi_y, h0); 1425 | const GF256_M128 p0 = _mm_xor_si128(l0, h0); 1426 | const GF256_M128 z0 = _mm_loadu_si128(z16); 1427 | _mm_storeu_si128(z16, _mm_xor_si128(p0, z0)); 1428 | 1429 | bytes -= 16, ++x16, ++z16; 1430 | } 1431 | } 1432 | #endif // GF256_TARGET_MOBILE 1433 | 1434 | uint8_t * GF256_RESTRICT z1 = reinterpret_cast(z16); 1435 | const uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1436 | const uint8_t * GF256_RESTRICT table = GF256Ctx.GF256_MUL_TABLE + ((unsigned)y << 8); 1437 | 1438 | // Handle blocks of 8 bytes 1439 | while (bytes >= 8) 1440 | { 1441 | uint64_t * GF256_RESTRICT z8 = reinterpret_cast(z1); 1442 | #ifdef GF256_IS_BIG_ENDIAN 1443 | uint64_t word = (uint64_t)table[x1[0]] << 56; 1444 | word |= (uint64_t)table[x1[1]] << 48; 1445 | word |= (uint64_t)table[x1[2]] << 40; 1446 | word |= (uint64_t)table[x1[3]] << 32; 1447 | word |= (uint64_t)table[x1[4]] << 24; 1448 | word |= (uint64_t)table[x1[5]] << 16; 1449 | word |= (uint64_t)table[x1[6]] << 8; 1450 | word |= (uint64_t)table[x1[7]]; 1451 | #else 1452 | uint64_t word = table[x1[0]]; 1453 | word |= (uint64_t)table[x1[1]] << 8; 1454 | word |= (uint64_t)table[x1[2]] << 16; 1455 | word |= (uint64_t)table[x1[3]] << 24; 1456 | word |= (uint64_t)table[x1[4]] << 32; 1457 | word |= (uint64_t)table[x1[5]] << 40; 1458 | word |= (uint64_t)table[x1[6]] << 48; 1459 | word |= (uint64_t)table[x1[7]] << 56; 1460 | #endif 1461 | 
*z8 ^= word; 1462 | 1463 | bytes -= 8, x1 += 8, z1 += 8; 1464 | } 1465 | 1466 | // Handle a block of 4 bytes 1467 | const int four = bytes & 4; 1468 | if (four) 1469 | { 1470 | uint32_t * GF256_RESTRICT z4 = reinterpret_cast(z1); 1471 | #ifdef GF256_IS_BIG_ENDIAN 1472 | uint32_t word = (uint32_t)table[x1[0]] << 24; 1473 | word |= (uint32_t)table[x1[1]] << 16; 1474 | word |= (uint32_t)table[x1[2]] << 8; 1475 | word |= (uint32_t)table[x1[3]]; 1476 | #else 1477 | uint32_t word = table[x1[0]]; 1478 | word |= (uint32_t)table[x1[1]] << 8; 1479 | word |= (uint32_t)table[x1[2]] << 16; 1480 | word |= (uint32_t)table[x1[3]] << 24; 1481 | #endif 1482 | *z4 ^= word; 1483 | } 1484 | 1485 | // Handle single bytes 1486 | const int offset = four; 1487 | switch (bytes & 3) 1488 | { 1489 | case 3: z1[offset + 2] ^= table[x1[offset + 2]]; 1490 | case 2: z1[offset + 1] ^= table[x1[offset + 1]]; 1491 | case 1: z1[offset] ^= table[x1[offset]]; 1492 | default: 1493 | break; 1494 | } 1495 | } 1496 | 1497 | extern "C" void gf256_memswap(void * GF256_RESTRICT vx, void * GF256_RESTRICT vy, int bytes) 1498 | { 1499 | #if defined(GF256_TARGET_MOBILE) 1500 | uint64_t * GF256_RESTRICT x16 = reinterpret_cast(vx); 1501 | uint64_t * GF256_RESTRICT y16 = reinterpret_cast(vy); 1502 | 1503 | const unsigned count = (unsigned)bytes / 8; 1504 | for (unsigned ii = 0; ii < count; ++ii) 1505 | { 1506 | const uint64_t temp = x16[ii]; 1507 | x16[ii] = y16[ii]; 1508 | y16[ii] = temp; 1509 | } 1510 | 1511 | x16 += count; 1512 | y16 += count; 1513 | bytes -= count * 8; 1514 | #else 1515 | GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast(vx); 1516 | GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast(vy); 1517 | 1518 | // Handle blocks of 16 bytes 1519 | while (bytes >= 16) 1520 | { 1521 | GF256_M128 x0 = _mm_loadu_si128(x16); 1522 | GF256_M128 y0 = _mm_loadu_si128(y16); 1523 | _mm_storeu_si128(x16, y0); 1524 | _mm_storeu_si128(y16, x0); 1525 | 1526 | bytes -= 16, ++x16, ++y16; 1527 | } 1528 | #endif 1529 | 1530 
| uint8_t * GF256_RESTRICT x1 = reinterpret_cast(x16); 1531 | uint8_t * GF256_RESTRICT y1 = reinterpret_cast(y16); 1532 | 1533 | // Handle a block of 8 bytes 1534 | const int eight = bytes & 8; 1535 | if (eight) 1536 | { 1537 | uint64_t * GF256_RESTRICT x8 = reinterpret_cast(x1); 1538 | uint64_t * GF256_RESTRICT y8 = reinterpret_cast(y1); 1539 | 1540 | uint64_t temp = *x8; 1541 | *x8 = *y8; 1542 | *y8 = temp; 1543 | } 1544 | 1545 | // Handle a block of 4 bytes 1546 | const int four = bytes & 4; 1547 | if (four) 1548 | { 1549 | uint32_t * GF256_RESTRICT x4 = reinterpret_cast(x1 + eight); 1550 | uint32_t * GF256_RESTRICT y4 = reinterpret_cast(y1 + eight); 1551 | 1552 | uint32_t temp = *x4; 1553 | *x4 = *y4; 1554 | *y4 = temp; 1555 | } 1556 | 1557 | // Handle final bytes 1558 | const int offset = eight + four; 1559 | uint8_t temp; 1560 | switch (bytes & 3) 1561 | { 1562 | case 3: temp = x1[offset + 2]; x1[offset + 2] = y1[offset + 2]; y1[offset + 2] = temp; 1563 | case 2: temp = x1[offset + 1]; x1[offset + 1] = y1[offset + 1]; y1[offset + 1] = temp; 1564 | case 1: temp = x1[offset]; x1[offset] = y1[offset]; y1[offset] = temp; 1565 | default: 1566 | break; 1567 | } 1568 | } 1569 | -------------------------------------------------------------------------------- /unit_test/Debug/matrix_test.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/cm256/634d1e36b095dbba937586a7e9843d828762dae4/unit_test/Debug/matrix_test.exe -------------------------------------------------------------------------------- /unit_test/Release/matrix_test.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/cm256/634d1e36b095dbba937586a7e9843d828762dae4/unit_test/Release/matrix_test.exe -------------------------------------------------------------------------------- /unit_test/SiameseTools.cpp: 
-------------------------------------------------------------------------------- 1 | /** \file 2 | \brief Siamese FEC Implementation: Tools 3 | \copyright Copyright (c) 2017 Christopher A. Taylor. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of Siamese nor the names of its contributors may be 14 | used to endorse or promote products derived from this software without 15 | specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | #include "SiameseTools.h" 31 | 32 | #ifdef _WIN32 33 | #ifndef NOMINMAX 34 | #define NOMINMAX 35 | #endif 36 | #include 37 | #elif __MACH__ 38 | #include 39 | #include 40 | #include 41 | 42 | extern mach_port_t clock_port; 43 | #else 44 | #include 45 | #include 46 | #endif 47 | 48 | namespace siamese { 49 | 50 | 51 | //------------------------------------------------------------------------------ 52 | // Timing 53 | 54 | #ifdef _WIN32 55 | // Precomputed frequency inverse 56 | static double PerfFrequencyInverseUsec = 0.; 57 | static double PerfFrequencyInverseMsec = 0.; 58 | 59 | static void InitPerfFrequencyInverse() 60 | { 61 | LARGE_INTEGER freq = {}; 62 | if (!::QueryPerformanceFrequency(&freq) || freq.QuadPart == 0) 63 | return; 64 | const double invFreq = 1. / (double)freq.QuadPart; 65 | PerfFrequencyInverseUsec = 1000000. * invFreq; 66 | PerfFrequencyInverseMsec = 1000. * invFreq; 67 | SIAMESE_DEBUG_ASSERT(PerfFrequencyInverseUsec > 0.); 68 | SIAMESE_DEBUG_ASSERT(PerfFrequencyInverseMsec > 0.); 69 | } 70 | #elif __MACH__ 71 | static bool m_clock_serv_init = false; 72 | static clock_serv_t m_clock_serv = 0; 73 | 74 | static void InitClockServ() 75 | { 76 | m_clock_serv_init = true; 77 | host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &m_clock_serv); 78 | } 79 | #endif // _WIN32 80 | 81 | uint64_t GetTimeUsec() 82 | { 83 | #ifdef _WIN32 84 | LARGE_INTEGER timeStamp = {}; 85 | if (!::QueryPerformanceCounter(&timeStamp)) 86 | return 0; 87 | if (PerfFrequencyInverseUsec == 0.) 
88 | InitPerfFrequencyInverse(); 89 | return (uint64_t)(PerfFrequencyInverseUsec * timeStamp.QuadPart); 90 | #elif __MACH__ 91 | if (!m_clock_serv_init) 92 | InitClockServ(); 93 | 94 | mach_timespec_t tv; 95 | clock_get_time(m_clock_serv, &tv); 96 | 97 | return 1000000 * tv.tv_sec + tv.tv_nsec / 1000; 98 | #else 99 | struct timeval tv; 100 | gettimeofday(&tv, nullptr); 101 | return 1000000 * tv.tv_sec + tv.tv_usec; 102 | #endif 103 | } 104 | 105 | uint64_t GetTimeMsec() 106 | { 107 | #ifdef _WIN32 108 | LARGE_INTEGER timeStamp = {}; 109 | if (!::QueryPerformanceCounter(&timeStamp)) 110 | return 0; 111 | if (PerfFrequencyInverseMsec == 0.) 112 | InitPerfFrequencyInverse(); 113 | return (uint64_t)(PerfFrequencyInverseMsec * timeStamp.QuadPart); 114 | #else 115 | // TBD: Optimize this? 116 | return GetTimeUsec() / 1000; 117 | #endif 118 | } 119 | 120 | 121 | } // namespace siamese 122 | -------------------------------------------------------------------------------- /unit_test/SiameseTools.h: -------------------------------------------------------------------------------- 1 | /** \file 2 | \brief Siamese FEC Implementation: Tools 3 | \copyright Copyright (c) 2017 Christopher A. Taylor. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of Siamese nor the names of its contributors may be 14 | used to endorse or promote products derived from this software without 15 | specific prior written permission. 
16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | #pragma once 31 | 32 | /** 33 | Tools: 34 | 35 | + System headers 36 | + Debug breakpoints/asserts 37 | + Compiler-specific code wrappers 38 | + PCGRandom implementation 39 | + Microsecond timing 40 | + Windowed minimum/maximum 41 | */ 42 | 43 | #include // uint32_t 44 | #include // memcpy 45 | #include // std::nothrow 46 | 47 | 48 | //------------------------------------------------------------------------------ 49 | // Portability macros 50 | 51 | // Compiler-specific debug break 52 | #if defined(_DEBUG) || defined(DEBUG) 53 | #define SIAMESE_DEBUG 54 | #ifdef _WIN32 55 | #define SIAMESE_DEBUG_BREAK() __debugbreak() 56 | #else 57 | #define SIAMESE_DEBUG_BREAK() __builtin_trap() 58 | #endif 59 | #define SIAMESE_DEBUG_ASSERT(cond) { if (!(cond)) { SIAMESE_DEBUG_BREAK(); } } 60 | #else 61 | #define SIAMESE_DEBUG_BREAK() do {} while (false); 62 | #define SIAMESE_DEBUG_ASSERT(cond) do {} while (false); 63 | #endif 64 | 65 | // Compiler-specific force inline keyword 66 | #ifdef _MSC_VER 67 | #define SIAMESE_FORCE_INLINE inline __forceinline 68 | #else 69 | #define SIAMESE_FORCE_INLINE inline __attribute__((always_inline)) 70 | #endif 71 | 72 | 73 | 
namespace siamese { 74 | 75 | 76 | //------------------------------------------------------------------------------ 77 | // PCG PRNG 78 | 79 | /// From http://www.pcg-random.org/ 80 | class PCGRandom 81 | { 82 | public: 83 | void Seed(uint64_t y, uint64_t x = 0) 84 | { 85 | State = 0; 86 | Inc = (y << 1u) | 1u; 87 | Next(); 88 | State += x; 89 | Next(); 90 | } 91 | 92 | uint32_t Next() 93 | { 94 | const uint64_t oldstate = State; 95 | State = oldstate * UINT64_C(6364136223846793005) + Inc; 96 | const uint32_t xorshifted = (uint32_t)(((oldstate >> 18) ^ oldstate) >> 27); 97 | const uint32_t rot = oldstate >> 59; 98 | return (xorshifted >> rot) | (xorshifted << ((uint32_t)(-(int32_t)rot) & 31)); 99 | } 100 | 101 | uint64_t State = 0, Inc = 0; 102 | }; 103 | 104 | 105 | //------------------------------------------------------------------------------ 106 | // Timing 107 | 108 | /// Microsecond-accurate platform independent high-resolution timer 109 | uint64_t GetTimeUsec(); 110 | 111 | /// Millisecond-accurate platform independent high-resolution timer 112 | uint64_t GetTimeMsec(); 113 | 114 | 115 | //------------------------------------------------------------------------------ 116 | // WindowedMinMax 117 | 118 | template struct WindowedMinCompare 119 | { 120 | SIAMESE_FORCE_INLINE bool operator()(const T x, const T y) const 121 | { 122 | return x <= y; 123 | } 124 | }; 125 | 126 | template struct WindowedMaxCompare 127 | { 128 | SIAMESE_FORCE_INLINE bool operator()(const T x, const T y) const 129 | { 130 | return x >= y; 131 | } 132 | }; 133 | 134 | /// Templated class that calculates a running windowed minimum or maximum with 135 | /// a fixed time and resource cost. 
136 | template class WindowedMinMax 137 | { 138 | public: 139 | typedef uint64_t TimeT; 140 | CompareT Compare; 141 | 142 | struct Sample 143 | { 144 | /// Sample value 145 | T Value; 146 | 147 | /// Timestamp of data collection 148 | TimeT Timestamp; 149 | 150 | 151 | /// Default values and initializing constructor 152 | explicit Sample(T value = 0, TimeT timestamp = 0) 153 | : Value(value) 154 | , Timestamp(timestamp) 155 | { 156 | } 157 | 158 | /// Check if a timeout expired 159 | inline bool TimeoutExpired(TimeT now, TimeT timeout) 160 | { 161 | return (TimeT)(now - Timestamp) > timeout; 162 | } 163 | }; 164 | 165 | 166 | static const unsigned kSampleCount = 3; 167 | 168 | Sample Samples[kSampleCount]; 169 | 170 | 171 | bool IsValid() const 172 | { 173 | return Samples[0].Value != 0; ///< ish 174 | } 175 | 176 | T GetBest() const 177 | { 178 | return Samples[0].Value; 179 | } 180 | 181 | void Reset(const Sample sample = Sample()) 182 | { 183 | Samples[0] = Samples[1] = Samples[2] = sample; 184 | } 185 | 186 | void Update(T value, TimeT timestamp, const TimeT windowLengthTime) 187 | { 188 | const Sample sample(value, timestamp); 189 | 190 | // On the first sample, new best sample, or if window length has expired: 191 | if (!IsValid() || 192 | Compare(value, Samples[0].Value) || 193 | Samples[2].TimeoutExpired(sample.Timestamp, windowLengthTime)) 194 | { 195 | Reset(sample); 196 | return; 197 | } 198 | 199 | // Insert the new value into the sorted array 200 | if (Compare(value, Samples[1].Value)) 201 | Samples[2] = Samples[1] = sample; 202 | else if (Compare(value, Samples[2].Value)) 203 | Samples[2] = sample; 204 | 205 | // Expire best if it has been the best for a long time 206 | if (Samples[0].TimeoutExpired(sample.Timestamp, windowLengthTime)) 207 | { 208 | // Also expire the next best if needed 209 | if (Samples[1].TimeoutExpired(sample.Timestamp, windowLengthTime)) 210 | { 211 | Samples[0] = Samples[2]; 212 | Samples[1] = sample; 213 | } 214 | else 215 | { 
216 | Samples[0] = Samples[1]; 217 | Samples[1] = Samples[2]; 218 | } 219 | Samples[2] = sample; 220 | return; 221 | } 222 | 223 | // Quarter of window has gone by without a better value - Use the second-best 224 | if (Samples[1].Value == Samples[0].Value && 225 | Samples[1].TimeoutExpired(sample.Timestamp, windowLengthTime / 4)) 226 | { 227 | Samples[2] = Samples[1] = sample; 228 | return; 229 | } 230 | 231 | // Half the window has gone by without a better value - Use the third-best one 232 | if (Samples[2].Value == Samples[1].Value && 233 | Samples[2].TimeoutExpired(sample.Timestamp, windowLengthTime / 2)) 234 | { 235 | Samples[2] = sample; 236 | } 237 | } 238 | }; 239 | 240 | 241 | } // namespace siamese 242 | -------------------------------------------------------------------------------- /unit_test/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 Christopher A. Taylor. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of CM256 nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without 14 | specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #ifdef _MSC_VER 30 | #pragma warning(push) 31 | #pragma warning(disable: 4530) // warning C4530: C++ exception handler used, but unwind semantics are not enabled. Specify /EHsc 32 | #endif 33 | 34 | #include 35 | using namespace std; 36 | 37 | #ifdef _MSC_VER 38 | #pragma warning(pop) 39 | #endif 40 | 41 | #include "cm256.h" 42 | #include "SiameseTools.h" 43 | 44 | #ifdef _WIN32 45 | #define WIN32_LEAN_AND_MEAN 46 | #include 47 | #endif 48 | 49 | #include 50 | #include 51 | 52 | void initializeBlocks(cm256_block originals[256], int blockCount, int blockBytes) 53 | { 54 | for (int i = 0; i < blockCount; ++i) 55 | { 56 | for (int j = 0; j < blockBytes; ++j) 57 | { 58 | const uint8_t expected = (uint8_t)(i + j * 13); 59 | uint8_t* data = (uint8_t*)originals[i].Block; 60 | data[j] = expected; 61 | } 62 | } 63 | } 64 | 65 | bool validateSolution(cm256_block_t* blocks, int blockCount, int blockBytes) 66 | { 67 | uint8_t seen[256] = { 0 }; 68 | 69 | for (int i = 0; i < blockCount; ++i) 70 | { 71 | uint8_t index = blocks[i].Index; 72 | 73 | if (index >= blockCount) 74 | { 75 | return false; 76 | } 77 | 78 | if (seen[index]) 79 | { 80 | return false; 81 | } 82 | 83 | seen[index] = 1; 84 | 85 | for (int j = 0; j < blockBytes; ++j) 86 | { 87 | const uint8_t expected = (uint8_t)(index + j * 13); 88 | uint8_t* blockData = (uint8_t*)blocks[i].Block; 89 | if (blockData[j] != expected) 90 | { 91 | return false; 92 | } 93 | 
} 94 | } 95 | 96 | return true; 97 | } 98 | 99 | 100 | 101 | bool ExampleFileUsage() 102 | { 103 | if (cm256_init()) 104 | { 105 | return false; 106 | } 107 | 108 | cm256_encoder_params params; 109 | 110 | // Number of bytes per file block 111 | params.BlockBytes = 1296; 112 | 113 | // Number of blocks 114 | params.OriginalCount = 100; 115 | 116 | // Number of additional recovery blocks generated by encoder 117 | params.RecoveryCount = 30; 118 | 119 | // Size of the original file 120 | static const int OriginalFileBytes = params.OriginalCount * params.BlockBytes; 121 | 122 | // Allocate and fill the original file data 123 | uint8_t* originalFileData = new uint8_t[OriginalFileBytes]; 124 | for (int i = 0; i < OriginalFileBytes; ++i) 125 | { 126 | originalFileData[i] = (uint8_t)i; 127 | } 128 | 129 | // Pointers to data 130 | cm256_block blocks[256]; 131 | for (int i = 0; i < params.OriginalCount; ++i) 132 | { 133 | blocks[i].Block = originalFileData + i * params.BlockBytes; 134 | } 135 | 136 | // Recovery data 137 | uint8_t* recoveryBlocks = new uint8_t[params.RecoveryCount * params.BlockBytes]; 138 | 139 | // Generate recovery data 140 | if (cm256_encode(params, blocks, recoveryBlocks)) 141 | { 142 | return false; 143 | } 144 | 145 | // Initialize the indices 146 | for (int i = 0; i < params.OriginalCount; ++i) 147 | { 148 | blocks[i].Index = cm256_get_original_block_index(params, i); 149 | } 150 | 151 | //// Simulate loss of data, substituting a recovery block in its place //// 152 | for (int i = 0; i < params.RecoveryCount && i < params.OriginalCount; ++i) 153 | { 154 | blocks[i].Block = recoveryBlocks + params.BlockBytes * i; // First recovery block 155 | blocks[i].Index = cm256_get_recovery_block_index(params, i); // First recovery block index 156 | } 157 | //// Simulate loss of data, substituting a recovery block in its place //// 158 | 159 | if (cm256_decode(params, blocks)) 160 | { 161 | return false; 162 | } 163 | 164 | for (int i = 0; i < 
params.RecoveryCount && i < params.OriginalCount; ++i) 165 | { 166 | uint8_t* block = (uint8_t*)blocks[i].Block; 167 | int index = blocks[i].Index; 168 | 169 | for (int j = 0; j < params.BlockBytes; ++j) 170 | { 171 | const uint8_t expected = (uint8_t)(j + index * params.BlockBytes); 172 | if (block[j] != expected) 173 | { 174 | return false; 175 | } 176 | } 177 | } 178 | 179 | delete[] originalFileData; 180 | delete[] recoveryBlocks; 181 | 182 | return true; 183 | } 184 | 185 | bool CheckMemSwap() 186 | { 187 | unsigned char buffa[16 + 8 + 4 + 3]; 188 | memset(buffa, 1, sizeof(buffa)); 189 | unsigned char buffb[16 + 8 + 4 + 3]; 190 | memset(buffb, 2, sizeof(buffb)); 191 | 192 | gf256_memswap(buffa, buffb, (int)sizeof(buffa)); 193 | 194 | for (int i = 0; i < (int)sizeof(buffa); ++i) 195 | { 196 | if (buffa[i] != 2) 197 | { 198 | return false; 199 | } 200 | if (buffb[i] != 1) 201 | { 202 | return false; 203 | } 204 | } 205 | 206 | gf256_memswap(buffa, buffb, (int)sizeof(buffa)); 207 | 208 | for (int i = 0; i < (int)sizeof(buffa); ++i) 209 | { 210 | if (buffa[i] != 1) 211 | { 212 | return false; 213 | } 214 | if (buffb[i] != 2) 215 | { 216 | return false; 217 | } 218 | } 219 | 220 | return true; 221 | } 222 | 223 | bool FinerPerfTimingTest() 224 | { 225 | #ifdef _WIN32 226 | ::SetPriorityClass(::GetCurrentProcess(), REALTIME_PRIORITY_CLASS); 227 | ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL); 228 | #endif 229 | 230 | std::this_thread::sleep_for(std::chrono::milliseconds(1000)); 231 | 232 | if (cm256_init()) 233 | { 234 | return false; 235 | } 236 | 237 | cm256_block blocks[256]; 238 | 239 | uint64_t tsum = 0; 240 | 241 | cm256_encoder_params params; 242 | params.BlockBytes = 1296; 243 | params.OriginalCount = 100; 244 | params.RecoveryCount = 30; 245 | 246 | unsigned char* orig_data = new unsigned char[256 * params.BlockBytes]; 247 | unsigned char* recoveryData = new unsigned char[256 * params.BlockBytes]; 248 | 249 | const int trials = 
1000; 250 | for (int trial = 0; trial < trials; ++trial) 251 | { 252 | for (int i = 0; i < params.BlockBytes * params.OriginalCount; ++i) 253 | { 254 | orig_data[i] = (uint8_t)i; 255 | } 256 | 257 | for (int i = 0; i < params.OriginalCount; ++i) 258 | { 259 | blocks[i].Block = orig_data + i * params.BlockBytes; 260 | } 261 | 262 | if (cm256_encode(params, blocks, recoveryData)) 263 | { 264 | return false; 265 | } 266 | 267 | // Initialize the indices 268 | for (int i = 0; i < params.OriginalCount; ++i) 269 | { 270 | blocks[i].Index = cm256_get_original_block_index(params, i); 271 | } 272 | 273 | //// Simulate loss of data, substituting a recovery block in its place //// 274 | for (int i = 0; i < params.RecoveryCount && i < params.OriginalCount; ++i) 275 | { 276 | blocks[i].Block = recoveryData + params.BlockBytes * i; // First recovery block 277 | blocks[i].Index = cm256_get_recovery_block_index(params, i); // First recovery block index 278 | } 279 | //// Simulate loss of data, substituting a recovery block in its place //// 280 | 281 | const uint64_t t0 = siamese::GetTimeUsec(); 282 | 283 | if (cm256_decode(params, blocks)) 284 | { 285 | return false; 286 | } 287 | 288 | const uint64_t t1 = siamese::GetTimeUsec(); 289 | tsum += t1 - t0; 290 | 291 | for (int i = 0; i < params.RecoveryCount && i < params.OriginalCount; ++i) 292 | { 293 | uint8_t* block = (uint8_t*)blocks[i].Block; 294 | int index = blocks[i].Index; 295 | 296 | for (int j = 0; j < params.BlockBytes; ++j) 297 | { 298 | const uint8_t expected = (uint8_t)(j + index * params.BlockBytes); 299 | if (block[j] != expected) 300 | { 301 | return false; 302 | } 303 | } 304 | } 305 | } 306 | 307 | const double opusec = tsum / static_cast( trials ); 308 | const double mbps = (params.BlockBytes * params.OriginalCount / opusec); 309 | 310 | cout << opusec << " usec, " << mbps << " MBps" << endl; 311 | 312 | #ifdef _WIN32 313 | ::SetThreadPriority(::GetCurrentThread(), THREAD_PRIORITY_NORMAL); 314 | 
::SetPriorityClass(::GetCurrentProcess(), NORMAL_PRIORITY_CLASS); 315 | #endif 316 | 317 | return true; 318 | } 319 | 320 | 321 | bool BulkPerfTesting() 322 | { 323 | if (cm256_init()) 324 | { 325 | return false; 326 | } 327 | 328 | static const int MaxBlockBytes = 10000; // multiple of 10 329 | 330 | unsigned char* orig_data = new unsigned char[256 * MaxBlockBytes]; 331 | 332 | unsigned char* recoveryData = new unsigned char[256 * MaxBlockBytes]; 333 | 334 | cm256_block blocks[256]; 335 | 336 | for (int blockBytes = 8 * 162; blockBytes <= MaxBlockBytes; blockBytes *= 10) 337 | { 338 | for (int originalCount = 1; originalCount < 256; ++originalCount) 339 | { 340 | for (int recoveryCount = 1; recoveryCount <= 1 + originalCount / 2 && recoveryCount <= 256 - originalCount; ++recoveryCount) 341 | { 342 | cm256_encoder_params params; 343 | params.BlockBytes = blockBytes; 344 | params.OriginalCount = originalCount; 345 | params.RecoveryCount = recoveryCount; 346 | 347 | for (int i = 0; i < 256; ++i) 348 | { 349 | blocks[i].Block = orig_data + i * MaxBlockBytes; 350 | } 351 | 352 | initializeBlocks(blocks, originalCount, blockBytes); 353 | 354 | { 355 | const uint64_t t0 = siamese::GetTimeUsec(); 356 | 357 | if (cm256_encode(params, blocks, recoveryData)) 358 | { 359 | cout << "Encoder error" << endl; 360 | return false; 361 | } 362 | 363 | const uint64_t t1 = siamese::GetTimeUsec(); 364 | const int dt_usec = (int)static_cast( t1 - t0 ); 365 | 366 | const double opusec = dt_usec; 367 | const double mbps = (params.BlockBytes * params.OriginalCount / opusec); 368 | 369 | cout << "Encoder: " << blockBytes << " bytes k = " << originalCount << " m = " << recoveryCount << " : " << opusec << " usec, " << mbps << " MBps" << endl; 370 | } 371 | 372 | // Fill in indices 373 | for (int i = 0; i < originalCount; ++i) 374 | { 375 | blocks[i].Index = cm256_get_original_block_index(params, i); 376 | } 377 | 378 | for (int ii = 0; ii < recoveryCount; ++ii) 379 | { 380 | int erasure_index 
= recoveryCount - ii - 1; 381 | blocks[ii].Block = recoveryData + erasure_index * blockBytes; 382 | blocks[ii].Index = cm256_get_recovery_block_index(params, erasure_index); 383 | } 384 | 385 | { 386 | const uint64_t t0 = siamese::GetTimeUsec(); 387 | 388 | if (cm256_decode(params, blocks)) 389 | { 390 | cout << "Decoder error" << endl; 391 | return false; 392 | } 393 | 394 | const uint64_t t1 = siamese::GetTimeUsec(); 395 | const int dt_usec = (int)static_cast( t1 - t0 ); 396 | 397 | const double opusec = dt_usec; 398 | const double mbps = (params.BlockBytes * params.OriginalCount / opusec); 399 | 400 | cout << "Decoder: " << blockBytes << " bytes k = " << originalCount << " m = " << recoveryCount << " : " << opusec << " usec, " << mbps << " MBps" << endl; 401 | } 402 | 403 | if (!validateSolution(blocks, originalCount, blockBytes)) 404 | { 405 | cout << "Solution invalid" << endl; 406 | return false; 407 | } 408 | } 409 | } 410 | } 411 | 412 | return true; 413 | } 414 | 415 | 416 | int main() 417 | { 418 | #if 1 419 | if (!ExampleFileUsage()) 420 | { 421 | exit(1); 422 | } 423 | #endif 424 | #if 1 425 | if (!CheckMemSwap()) 426 | { 427 | exit(4); 428 | } 429 | #endif 430 | #if 1 431 | if (!FinerPerfTimingTest()) 432 | { 433 | exit(2); 434 | } 435 | #endif 436 | #if 1 437 | if (!BulkPerfTesting()) 438 | { 439 | exit(3); 440 | } 441 | #endif 442 | 443 | return 0; 444 | } 445 | -------------------------------------------------------------------------------- /unit_test/matrix_test.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.40629.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matrix_test", "matrix_test.vcxproj", "{7685F962-9880-43F5-A10E-B2A1E0E09465}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | 
Debug|Win32 = Debug|Win32 11 | Debug|x64 = Debug|x64 12 | Release|Win32 = Release|Win32 13 | Release|x64 = Release|x64 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {7685F962-9880-43F5-A10E-B2A1E0E09465}.Debug|Win32.ActiveCfg = Debug|Win32 17 | {7685F962-9880-43F5-A10E-B2A1E0E09465}.Debug|Win32.Build.0 = Debug|Win32 18 | {7685F962-9880-43F5-A10E-B2A1E0E09465}.Debug|x64.ActiveCfg = Debug|x64 19 | {7685F962-9880-43F5-A10E-B2A1E0E09465}.Debug|x64.Build.0 = Debug|x64 20 | {7685F962-9880-43F5-A10E-B2A1E0E09465}.Release|Win32.ActiveCfg = Release|Win32 21 | {7685F962-9880-43F5-A10E-B2A1E0E09465}.Release|Win32.Build.0 = Release|Win32 22 | {7685F962-9880-43F5-A10E-B2A1E0E09465}.Release|x64.ActiveCfg = Release|x64 23 | {7685F962-9880-43F5-A10E-B2A1E0E09465}.Release|x64.Build.0 = Release|x64 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /unit_test/matrix_test.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | {7685F962-9880-43F5-A10E-B2A1E0E09465} 23 | matrix_test 24 | 25 | 26 | 27 | Application 28 | true 29 | v120 30 | MultiByte 31 | 32 | 33 | Application 34 | true 35 | v120 36 | MultiByte 37 | 38 | 39 | Application 40 | false 41 | v120 42 | true 43 | MultiByte 44 | 45 | 46 | Application 47 | false 48 | v120 49 | true 50 | MultiByte 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | Level4 72 | Disabled 73 | true 74 | true 75 | false 76 | true 77 | MultiThreadedDebug 78 | false 79 | false 80 | 81 | 82 | true 83 | 84 | 85 | 86 | 87 | Level4 88 | Disabled 89 | true 90 | true 91 | false 92 | true 93 | 
MultiThreadedDebug 94 | false 95 | false 96 | 97 | 98 | true 99 | 100 | 101 | 102 | 103 | Level4 104 | Full 105 | true 106 | true 107 | true 108 | true 109 | AnySuitable 110 | Size 111 | true 112 | false 113 | false 114 | MultiThreaded 115 | false 116 | false 117 | 118 | 119 | true 120 | true 121 | true 122 | 123 | 124 | 125 | 126 | Level4 127 | Full 128 | true 129 | true 130 | true 131 | true 132 | AnySuitable 133 | Size 134 | true 135 | false 136 | false 137 | MultiThreaded 138 | false 139 | false 140 | 141 | 142 | true 143 | true 144 | true 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /unit_test/matrix_test.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | Source Files 26 | 27 | 28 | 29 | 30 | Source Files 31 | 32 | 33 | Source Files 34 | 35 | 36 | -------------------------------------------------------------------------------- /unit_test/x64/Debug/matrix_test.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catid/cm256/634d1e36b095dbba937586a7e9843d828762dae4/unit_test/x64/Debug/matrix_test.exe -------------------------------------------------------------------------------- /unit_test/x64/Release/matrix_test.exe: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/catid/cm256/634d1e36b095dbba937586a7e9843d828762dae4/unit_test/x64/Release/matrix_test.exe --------------------------------------------------------------------------------