├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE ├── README.md ├── example └── main.cpp ├── lzsse2 ├── lzsse2.cpp ├── lzsse2.h └── lzsse2_platform.h ├── lzsse4 ├── lzsse4.cpp ├── lzsse4.h └── lzsse4_platform.h ├── lzsse8 ├── lzsse8.cpp ├── lzsse8.h └── lzsse8_platform.h ├── premake4.exe └── premake4.lua /.gitignore: -------------------------------------------------------------------------------- 1 | bin/* 2 | gmake/* 3 | vs2013/* 4 | vs2015/* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "simde"] 2 | path = simde 3 | url = https://github.com/simd-everywhere/simde-no-tests.git 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | sudo: false 3 | branches: 4 | except: 5 | - /^(wip\/)?(appveyor|msvc|mingw|windows)(\-.+)?$/ 6 | matrix: 7 | include: 8 | ### 9 | ## Linux builds using various versions of GCC. 10 | ### 11 | - env: C_COMPILER=gcc-7 12 | addons: 13 | apt: 14 | sources: 15 | - ubuntu-toolchain-r-test 16 | - premake4 17 | packages: 18 | - gcc-7 19 | - g++-7 20 | - premake4 21 | # - env: C_COMPILER=gcc-6 22 | # addons: 23 | # apt: 24 | # sources: 25 | # - ubuntu-toolchain-r-test 26 | # - premake4 27 | # packages: 28 | # - gcc-6 29 | # - g++-6 30 | # - premake4 31 | - env: C_COMPILER=gcc-5 32 | addons: 33 | apt: 34 | sources: 35 | - ubuntu-toolchain-r-test 36 | - premake4 37 | packages: 38 | - gcc-5 39 | - g++-5 40 | - premake4 41 | # - env: C_COMPILER=gcc-4.9 42 | # addons: 43 | # apt: 44 | # sources: 45 | # - ubuntu-toolchain-r-test 46 | # - premake4 47 | # packages: 48 | # - gcc-4.9 49 | # - g++-4.9 50 | # - premake4 51 | - env: C_COMPILER=gcc-4.8 52 | addons: 53 | apt: 54 | sources: 55 | - ubuntu-toolchain-r-test 56 | - premake4 57 | packages: 58 | - gcc-4.8 59 | - g++-4.8 60 | - premake4 61 | # - env: C_COMPILER=gcc-4.7 62 | # addons: 63 | # apt: 64 | # sources: 65 | # - ubuntu-toolchain-r-test 66 | # - premake4 67 | # packages: 68 | # - gcc-4.7 69 | # - g++-4.7 70 | # - premake4 71 | - env: C_COMPILER=gcc-4.6 72 | addons: 73 | apt: 74 | sources: 75 | - ubuntu-toolchain-r-test 76 | - premake4 77 | packages: 78 | - gcc-4.6 79 | - g++-4.6 80 | - premake4 81 | # - os: linux 82 | # env: C_COMPILER=gcc-4.5 83 | # addons: 84 | # apt: 85 | # sources: 86 | # - ubuntu-toolchain-r-test 87 | # - premake4 88 | # packages: 89 | # - gcc-4.5 90 | # - g++-4.5 91 | # - premake4 92 | - env: C_COMPILER=gcc-4.4 93 | addons: 94 | apt: 95 | sources: 96 | - ubuntu-toolchain-r-test 97 | - premake4 98 | packages: 99 | - gcc-4.4 100 | - g++-4.4 101 | - premake4 102 | 103 | ### 104 | ## clang on Linux 105 | ### 106 | - env: C_COMPILER=clang-3.9 107 | addons: 108 | apt: 109 | sources: 110 | - llvm-toolchain-precise-3.9 111 | - ubuntu-toolchain-r-test 112 | - premake4 113 | packages: 114 | - clang-3.9 115 | - premake4 116 | # - env: C_COMPILER=clang-3.8 117 | # addons: 118 | # apt: 119 | # sources: 120 | # - llvm-toolchain-precise-3.8 121 | # - ubuntu-toolchain-r-test 122 | # - premake4 123 | # packages: 124 | # - clang-3.8 125 | # - premake4 126 | - env: C_COMPILER=clang-3.7 127 | addons: 128 | apt: 129 | sources: 130 | - llvm-toolchain-precise-3.7 131 | - ubuntu-toolchain-r-test 132 | - premake4 133 | packages: 134 | - clang-3.7 135 | - premake4 136 | # - env: C_COMPILER=clang-3.6 137 | # addons: 138 | # apt: 139 
| # sources: 140 | # - llvm-toolchain-precise-3.6 141 | # - ubuntu-toolchain-r-test 142 | # - premake4 143 | # packages: 144 | # - clang-3.6 145 | # - premake4 146 | - env: C_COMPILER=clang-3.5 147 | addons: 148 | apt: 149 | sources: 150 | - llvm-toolchain-precise-3.5 151 | - ubuntu-toolchain-r-test 152 | - premake4 153 | packages: 154 | - clang-3.5 155 | - premake4 156 | 157 | ### 158 | ## PGI 159 | ### 160 | - env: C_COMPILER=pgcc ENABLE_OPENMP=y 161 | addons: 162 | apt: 163 | sources: 164 | - premake4 165 | packages: 166 | - premake4 167 | 168 | ### 169 | ## OS X 170 | ### 171 | # - os: osx 172 | 173 | before_install: 174 | ### 175 | ## If we use the matrix to set CC/CXX Travis, overwrites the values, 176 | ## so instead we use C/CXX_COMPILER, then copy the values to CC/CXX 177 | ## here (after Travis has set CC/CXX). 178 | ### 179 | - if [ "${C_COMPILER}" = "pgcc" ]; then wget -q -O /dev/stdout 'https://raw.githubusercontent.com/nemequ/pgi-travis/master/install-pgi.sh' | /bin/sh; fi 180 | - if [ -n "${C_COMPILER}" ]; then export CC="${C_COMPILER}"; fi 181 | - premake4 gmake 182 | 183 | script: 184 | - cd gmake && make 185 | 186 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Conor Stokes 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LZSSE-SIMDe 2 | 3 | [LZSSE](https://github.com/ConorStokes/LZSSE/) has a hard dependency on SSE4.1 which prevents it from working on other architectures, or even x86/x86_64 machines without support for the SSE4.1 instruction set. According to the [Steam Hardware Survey](http://store.steampowered.com/hwsurvey), SSE4.1 currently has just under 90% penetration, and of course that is only for machines with Steam installed (which is a pretty big bias). 
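To make that "hard dependency" concrete: upstream LZSSE assumes it may emit SSE4.1 instructions, so a build typically guards on the compiler's SSE4.1 macro along these lines (a hypothetical sketch, not the actual contents of LZSSE's platform headers):

```c
/* Hypothetical sketch of a hard SSE4.1 dependency -- not LZSSE's actual code.
 * The translation unit refuses to build unless the compiler has been told it
 * may emit SSE4.1 (e.g. with -msse4.1), and the result can only run on CPUs
 * that implement those instructions. */
#if !defined( __SSE4_1__ )
#error "SSE4.1 support is required (compile with -msse4.1 or equivalent)"
#endif

#include <smmintrin.h> /* SSE4.1 intrinsics such as _mm_blendv_epi8 */
```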
4 | 5 | This is a fork of [LZSSE](https://github.com/ConorStokes/LZSSE/) which uses [SIMDe](https://github.com/nemequ/simde) to allow for LZSSE (de)compression on platforms where SSE4.1 is not supported, including other architectures (such as ARM). 6 | 7 | Note that, with the default block size from the example program, LZSSE-SIMDe will not work on 32-bit architectures due to memory requirements. Reducing the block size resolves the issue, and the code has been tested on ARM and x86. [PAE](https://en.wikipedia.org/wiki/Physical_Address_Extension) should also work, but has not been tested. 8 | 9 | For machines with SSE4.1 support there should be no performance impact. The SSE4.1 intrinsics will be called, and the compiler should be capable of optimizing away any overhead associated with SIMDe. 10 | 11 | For machines which don't natively support the instructions used, SIMDe will emulate them using other SIMD APIs or, if that fails, portable fallbacks (see the include sketch at the end of this introduction). 12 | 13 | Note that a mix of the two is quite possible; for example, a CPU may support SSSE3 but not SSE4.1, in which case SSE4.1 functions will be emulated but SSSE3 and earlier instructions will be used. 14 | 15 | I'll try to keep this up to date with LZSSE, but I will not accept changes to this repository that aren't directly related to porting to SIMDe. If you find a bug, please file it with LZSSE or SIMDe, whichever would be more appropriate. 16 |
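Integrating SIMDe into a project like this is mostly a matter of swapping the native intrinsic headers for SIMDe's. A minimal sketch of the idea (hypothetical; the actual `lzsse2_platform.h`/`lzsse4_platform.h`/`lzsse8_platform.h` changes are not reproduced in this excerpt):

```c
/* Hypothetical sketch, not the actual lzsse*_platform.h. Defining
 * SIMDE_ENABLE_NATIVE_ALIASES before including SIMDe lets the existing
 * _mm_* names and the __m128i type resolve to SIMDe's implementations,
 * which use native SSE4.1 where available, another SIMD ISA (e.g. NEON)
 * where possible, or portable fallbacks otherwise. */
#define SIMDE_ENABLE_NATIVE_ALIASES
#include <simde/x86/sse4.1.h>

/* Existing LZSSE code can then keep calling the usual intrinsics: */
static inline __m128i SelectBytes( __m128i a, __m128i b, __m128i mask )
{
    return _mm_blendv_epi8( a, b, mask ); /* emulated where SSE4.1 is absent */
}
```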
17 | ## SIMDe Performance 18 | 19 | This is based on some testing with g++ 10 using [raspbian-jessie-lite-20151121](https://github.com/nemequ/squash-corpus/blob/master/data/raspbian-jessie-lite-20151121.tar.xz). Results are the average wall-clock time across 5 runs. 20 | 21 | As you read this, please keep two things in mind: 22 | 23 | First, this isn't likely to be the same for your code. Performance will depend heavily on which functions you use, and what SIMDe's options for fallbacks are. SIMDe is usually very easy to integrate into your project, so you should really run your own tests using your code and your data. 24 | 25 | Second, SIMDe should never make your code slower, only more portable. It doesn't really make sense to think of SIMDe as a performance hit since the alternative is that the code doesn't work at all; in that sense, SIMDe represents an infinite performance improvement. 26 | 27 | That said, if you are currently maintaining a portable fallback and an SSE version, there is an excellent chance that SIMDe will be significantly faster than your portable fallback. 28 | 29 | Now that that's out of the way, let's get to some data. 30 | 31 | If provided the same compiler flags (in this case, `-msse4.1 -O3`), results for LZSSE and LZSSE-SIMDe are effectively the same. So **SIMDe doesn't make things worse**, which is *very* important: 32 | 33 | | Library | Variant | Compress | Decompress | 34 | | ----------- | ------- | -------- | ---------- | 35 | | LZSSE | LZSSE2 | 87.18 s | 0.55 s | 36 | | LZSSE-SIMDe | LZSSE2 | 86.33 s | 0.55 s | 37 | | LZSSE | LZSSE4 | 73.83 s | 0.47 s | 38 | | LZSSE-SIMDe | LZSSE4 | 73.48 s | 0.47 s | 39 | | LZSSE | LZSSE8 | 79.16 s | 0.45 s | 40 | | LZSSE-SIMDe | LZSSE8 | 79.28 s | 0.44 s | 41 | 42 | Things get a bit more interesting if we compile without SSE 4.1 support, forcing SIMDe to use portable implementations of the SSE 4.1 functions that LZSSE relies on: 43 | 44 | | Flags | Variant | Compress | Decompress | 45 | | -------- | ------- | -------- | ---------- | 46 | | -msse2 | LZSSE2 | 86.40 s | 13.70 s | 47 | | -msse2 | LZSSE4 | 74.14 s | 10.78 s | 48 | | -msse2 | LZSSE8 | 76.76 s | 10.07 s | 49 | | -mssse3 | LZSSE2 | 86.47 s | 0.55 s | 50 | | -mssse3 | LZSSE4 | 73.01 s | 0.48 s | 51 | | -mssse3 | LZSSE8 | 78.81 s | 0.45 s | 52 | | -msse4.1 | LZSSE2 | 86.33 s | 0.55 s | 53 | | -msse4.1 | LZSSE4 | 73.48 s | 0.47 s | 54 | | -msse4.1 | LZSSE8 | 79.28 s | 0.44 s | 55 | 56 | Remember, there are no numbers here for upstream LZSSE since it simply doesn't work at all. As you can see, moving from SSE2 to SSSE3 provides a *huge* performance increase for decompression; that's because SSSE3 supports the `_mm_shuffle_epi8` function, which we don't have a very good way to emulate on previous versions of SSE. That makes it a great example of how SIMDe performance can vary wildly depending on the functions you use and the platform you're targeting. For what it's worth, AArch64 does have a good way to emulate it (`vqtbl1q_s8`), as does AltiVec (`vec_perm`), and ARMv7 has a decent option with a couple of `vtbl2_s8` calls. 57 | 58 | In the worst case we can force SIMDe to always use the portable fallbacks and rely exclusively on the compiler to auto-vectorize by passing `-DSIMDE_NO_NATIVE`. To be clear, you should never do this; it's really only there to help us test the fallbacks. In this case, flags like `-msse4.1` only tell the compiler (GCC in this case) which extensions it is allowed to use; they are completely ignored by SIMDe. 59 | 60 | | Flags | Variant | Compress | Decompress | 61 | | -------- | ------- | -------- | ---------- | 62 | | -msse2 | LZSSE2 | 169.52 s | 13.72 s | 63 | | -msse2 | LZSSE4 | 138.56 s | 10.82 s | 64 | | -msse2 | LZSSE8 | 140.77 s | 10.08 s | 65 | | -mssse3 | LZSSE2 | 169.48 s | 13.71 s | 66 | | -mssse3 | LZSSE4 | 138.29 s | 10.80 s | 67 | | -mssse3 | LZSSE8 | 140.71 s | 10.06 s | 68 | | -msse4.1 | LZSSE2 | 169.41 s | 1.99 s | 69 | | -msse4.1 | LZSSE4 | 138.28 s | 1.50 s | 70 | | -msse4.1 | LZSSE8 | 140.02 s | 1.46 s | 71 | 72 | Notice that this time there is not a significant change in compression speed with SSSE3. That's because the compiler isn't smart enough to recognize that it should compile our portable implementation of `_mm_shuffle_epi8` to a `PSHUFB` instruction (there is a rough sketch of what that fallback has to do at the end of this section), which is a good example of why SIMDe generally outperforms non-vectorized fallbacks by a wide margin, even on architectures other than the one the original SIMD implementation targets. That said, with SSE 4.1 the compiler was able to use `PSHUFLW`/`PSHUFHW` (from SSE2) with a blend from SSE4.1, recovering *most* of the performance. 73 | 74 | It's tempting to think of this a bit like running the code on ARM, WASM, POWER, etc., but that's not accurate; on ARM SIMDe can use NEON to implement the SSE functions, and the code tends to be much faster. Similarly, on WASM we can use WASM SIMD, and on POWER we can use AltiVec/VSX. 75 |
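To make the `_mm_shuffle_epi8` point above concrete, here is roughly what the portable fallback has to do for every 16-byte block (an illustrative scalar sketch of the PSHUFB semantics, not SIMDe's actual implementation):

```c
#include <stdint.h>

/* Scalar equivalent of _mm_shuffle_epi8 (PSHUFB): each output byte is zero if
 * the selector's top bit is set, otherwise it is the source byte picked by the
 * selector's low four bits. Compilers rarely turn this loop back into a single
 * PSHUFB, which is why the SSE2-only decompression numbers above are so much
 * slower. */
static void ShuffleEpi8Scalar( uint8_t dst[ 16 ], const uint8_t src[ 16 ], const uint8_t selector[ 16 ] )
{
    for ( int i = 0; i < 16; ++i )
    {
        dst[ i ] = ( selector[ i ] & 0x80 ) ? 0 : src[ selector[ i ] & 0x0F ];
    }
}
```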
76 | # LZSSE 77 | An [LZSS](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Storer%E2%80%93Szymanski) variant designed for a branchless SSE decompression implementation. 78 | 79 | Three variants: 80 | - LZSSE2, for highly compressible files with short literal runs. 81 | - LZSSE4, for a more balanced mix of literals and matches. 82 | - LZSSE8, for less compressible data with longer runs of matches. 83 | 84 | All three variants have an optimal parser implementation, which uses a quite strong match finder (very similar to LzFind) combined with a Storer-Szymanski style parse. LZSSE4 and LZSSE8 also have "fast" compressor implementations, which use simple hash-table-based matching and a greedy parse. 85 | 86 | Currently LZSSE8 is the recommended variant for the general case, as it performs well on most data (and you have the option of both optimal parse and fast compression). LZSSE2 is recommended if you are only compressing text, especially heavily compressible text, but it is slower and doesn't compress as well on less compressible data and binaries. 87 | 88 | The code is approaching production readiness, and LZSSE2 and LZSSE8 have received a reasonable amount of testing. 89 | 90 | See these blog posts, [An LZ Codec Designed for SSE Decompression](http://conorstokes.github.io/compression/2016/02/15/an-LZ-codec-designed-for-SSE-decompression) and [Compressor Improvements and LZSSE2 vs LZSSE8](http://conorstokes.github.io/compression/2016/02/24/compressor-improvements-and-lzsse2-vs-lzsse8), for a description of how the compression algorithm and implementation work. There are also benchmarks, but these may not be up to date (in particular, the figures in the initial blog post no longer represent compression performance). 91 | -------------------------------------------------------------------------------- /example/main.cpp: -------------------------------------------------------------------------------- 1 | // LZSSE.cpp : Defines the entry point for the console application.
2 | // 3 | #define _CRT_SECURE_NO_WARNINGS 1 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "../lzsse2/lzsse2.h" 10 | #include "../lzsse4/lzsse4.h" 11 | #include "../lzsse8/lzsse8.h" 12 | 13 | static const uint32_t MAGIC_NUMBER = 0x28F19732; 14 | 15 | void DisplayUsage() 16 | { 17 | printf( "Usage:\n" ); 18 | printf( " lzsse [args] input_file output_file\n" ); 19 | printf( "\n" ); 20 | printf( "Arguments:\n" ); 21 | printf( " -2 Compress in lzsse2 mode (default)\n" ); 22 | printf( " -4 Compress in lzsse4 mode\n" ); 23 | printf( " -8 Compress in lzsse8 mode\n" ); 24 | printf( " -f Optimal parse (default)\n" ); 25 | printf( " -o Fast parse (not available for lzsse2)\n" ); 26 | printf( " -d Decompress\n" ); 27 | printf( " -lN Compression level for optimal parse, where N is 1 to 17 (default 16)\n" ); 28 | printf( " -bN Block size in KiB, default 131,072\n" ); 29 | printf( "\n" ); 30 | } 31 | 32 | static size_t CompressorFastLZSSE4( LZSSE4_FastParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int ) 33 | { 34 | return LZSSE4_CompressFast( state, input, inputLength, output, outputLength ); 35 | } 36 | 37 | static size_t CompressorFastLZSSE8( LZSSE8_FastParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int ) 38 | { 39 | return LZSSE8_CompressFast( state, input, inputLength, output, outputLength ); 40 | } 41 | 42 | template 43 | void Compress( FILE* inputFile, FILE* outputFile, uint64_t blockSize, uint8_t mode, unsigned int level, State* state, size_t (*compressor)( State*, const void*, size_t, void*, size_t, unsigned int ) ) 44 | { 45 | if ( state == nullptr ) 46 | { 47 | printf( "Couldn't allocate parse state\n" ); 48 | exit( 1 ); 49 | } 50 | 51 | if ( fwrite( &MAGIC_NUMBER, sizeof( uint32_t ), 1, outputFile ) == 0 ) 52 | { 53 | printf( "Couldn't write magic number\n" ); 54 | exit( 1 ); 55 | } 56 | 57 | if ( fwrite( &mode, sizeof( uint8_t ), 1, outputFile ) == 0 ) 58 | { 59 | printf( "Couldn't write stream type\n" ); 60 | exit( 1 ); 61 | } 62 | 63 | if ( fwrite( &blockSize, sizeof( uint64_t ), 1, outputFile ) == 0 ) 64 | { 65 | printf( "Couldn't write block size\n" ); 66 | exit( 1 ); 67 | } 68 | 69 | size_t typedBlockSize = static_cast< size_t >( blockSize ); 70 | uint8_t* inputBuffer = reinterpret_cast< uint8_t* >( malloc( typedBlockSize ) ); 71 | uint8_t* outputBuffer = reinterpret_cast< uint8_t* >( malloc( typedBlockSize ) ); 72 | 73 | if ( inputBuffer == nullptr || outputBuffer == nullptr ) 74 | { 75 | printf( "Couldn't allocate buffer memory\n" ); 76 | exit( 1 ); 77 | } 78 | 79 | for ( ;; ) 80 | { 81 | size_t readSize = fread( inputBuffer, 1, blockSize, inputFile ); 82 | 83 | if ( readSize == 0 ) 84 | { 85 | break; 86 | } 87 | 88 | size_t compressedSize = compressor( state, inputBuffer, readSize, outputBuffer, typedBlockSize, level ); 89 | 90 | if ( compressedSize == 0 ) 91 | { 92 | printf( "Compression function failed\n" ); 93 | exit( 1 ); 94 | } 95 | 96 | uint32_t compressedLength = static_cast< uint32_t >( compressedSize ); 97 | uint32_t uncompressedLength = static_cast< uint32_t >( readSize ); 98 | 99 | if ( fwrite( &uncompressedLength, sizeof( uint32_t ), 1, outputFile ) < 1 ) 100 | { 101 | break; 102 | } 103 | 104 | if ( fwrite( &compressedLength, sizeof( uint32_t ), 1, outputFile ) < 1 ) 105 | { 106 | printf( "Error writing compressed length from block\n" ); 107 | exit( 1 ); 108 | } 109 | 110 | if ( fwrite( outputBuffer, 1, compressedLength, outputFile ) != 
compressedLength ) 111 | { 112 | printf( "Error writing block\n" ); 113 | exit( 1 ); 114 | } 115 | } 116 | 117 | free( inputBuffer ); 118 | free( outputBuffer ); 119 | } 120 | 121 | void Decompress( FILE* inputFile, FILE* outputFile ) 122 | { 123 | uint32_t magicNumber; 124 | uint64_t blockSize = 128 * 1024 * 1024; 125 | 126 | if ( fread( &magicNumber, sizeof( uint32_t ), 1, inputFile ) < 1 || magicNumber != MAGIC_NUMBER ) 127 | { 128 | printf( "Couldn't read magic number, or magic number incorrect\n" ); 129 | exit( 1 ); 130 | } 131 | 132 | uint8_t streamType; 133 | 134 | if ( fread( &streamType, sizeof( uint8_t ), 1, inputFile ) < 1 ) 135 | { 136 | printf( "Couldn't read stream type\n" ); 137 | exit( 1 ); 138 | } 139 | 140 | if ( fread( &blockSize, sizeof( uint64_t ), 1, inputFile ) < 1 ) 141 | { 142 | printf( "Couldn't read block size\n" ); 143 | exit( 1 ); 144 | } 145 | 146 | uint8_t* inputBuffer = reinterpret_cast< uint8_t* >( malloc( static_cast< size_t >( blockSize ) ) ); 147 | uint8_t* outputBuffer = reinterpret_cast< uint8_t* >( malloc( static_cast< size_t >( blockSize ) ) ); 148 | 149 | if ( inputBuffer == nullptr || outputBuffer == nullptr ) 150 | { 151 | printf( "Couldn't allocate buffer memory\n" ); 152 | exit( 1 ); 153 | } 154 | 155 | size_t( *decompressor )( const void*, size_t, void*, size_t ); 156 | 157 | switch ( streamType ) 158 | { 159 | case 2: 160 | 161 | decompressor = LZSSE2_Decompress; 162 | break; 163 | 164 | case 4: 165 | 166 | decompressor = LZSSE4_Decompress; 167 | break; 168 | 169 | case 8: 170 | 171 | decompressor = LZSSE8_Decompress; 172 | break; 173 | 174 | default: 175 | 176 | printf( "Invalid stream type\n" ); 177 | exit( 1 ); 178 | 179 | } 180 | 181 | memset( inputBuffer, 0, blockSize ); 182 | memset( outputBuffer, 0, blockSize ); 183 | 184 | for ( ;; ) 185 | { 186 | uint32_t compressedLength; 187 | uint32_t uncompressedLength; 188 | 189 | if ( fread( &uncompressedLength, sizeof( uint32_t ), 1, inputFile ) < 1 ) 190 | { 191 | break; 192 | } 193 | 194 | if ( fread( &compressedLength, sizeof( uint32_t ), 1, inputFile ) < 1 ) 195 | { 196 | printf( "Error reading compressed length from block\n" ); 197 | exit( 1 ); 198 | } 199 | 200 | if ( fread( inputBuffer, 1, compressedLength, inputFile ) != compressedLength ) 201 | { 202 | printf( "Error reading block\n" ); 203 | exit( 1 ); 204 | } 205 | 206 | size_t decompressedSize = 0; 207 | 208 | decompressedSize = 209 | decompressor( inputBuffer, 210 | compressedLength, 211 | outputBuffer, 212 | uncompressedLength ); 213 | 214 | if ( decompressedSize != size_t( uncompressedLength ) ) 215 | { 216 | printf( "Error in decompression stream\n" ); 217 | exit( 1 ); 218 | } 219 | 220 | if ( fwrite( outputBuffer, 1, uncompressedLength, outputFile ) != uncompressedLength ) 221 | { 222 | printf( "Couldn't write block to output file\n" ); 223 | exit( 1 ); 224 | } 225 | } 226 | 227 | free( inputBuffer ); 228 | free( outputBuffer ); 229 | } 230 | 231 | int main( int argc, const char** argv ) 232 | { 233 | bool decompression = false; 234 | bool optimal = true; 235 | uint64_t blockSize = 128 * 1024 * 1024; 236 | uint8_t mode = 2; 237 | unsigned int level = 16; 238 | 239 | if ( argc < 3 ) 240 | { 241 | DisplayUsage(); 242 | exit( 1 ); 243 | } 244 | 245 | for ( int argIndex = 1; argIndex < argc - 2; ++argIndex ) 246 | { 247 | const char* arg = argv[ argIndex ]; 248 | 249 | if ( arg[ 0 ] == '-' ) 250 | { 251 | switch ( arg[ 1 ] ) 252 | { 253 | case 'd': 254 | 255 | decompression = true; 256 | break; 257 | 258 | case '2': 259 | 260 | 
mode = 2; 261 | break; 262 | 263 | case '4': 264 | 265 | mode = 4; 266 | break; 267 | 268 | case '8': 269 | 270 | mode = 8; 271 | break; 272 | 273 | case 'l': 274 | 275 | level = static_cast< unsigned int >( strtoul( arg + 2, nullptr, 10 ) ); 276 | break; 277 | 278 | case 'b': 279 | 280 | blockSize = strtoull( arg + 2, nullptr, 10 ) * 1024; 281 | break; 282 | 283 | case 'o': 284 | 285 | optimal = true; 286 | break; 287 | 288 | case 'f': 289 | 290 | optimal = false; 291 | break; 292 | 293 | } 294 | } 295 | } 296 | 297 | FILE* inputFile = fopen( argv[ argc - 2 ], "rb" ); 298 | 299 | if ( inputFile == nullptr ) 300 | { 301 | perror( argv[ argc - 2 ] ); 302 | exit( 1 ); 303 | } 304 | 305 | FILE* outputFile = fopen( argv[ argc - 1 ], "wb+" ); 306 | 307 | if ( outputFile == nullptr ) 308 | { 309 | perror( argv[ argc - 2 ] ); 310 | exit( 1 ); 311 | } 312 | 313 | if ( decompression ) 314 | { 315 | Decompress( inputFile, outputFile ); 316 | } 317 | else 318 | { 319 | switch ( mode ) 320 | { 321 | case 2: 322 | { 323 | LZSSE2_OptimalParseState* state = LZSSE2_MakeOptimalParseState( static_cast< size_t >( blockSize ) ); 324 | 325 | Compress( inputFile, outputFile, blockSize, mode, level, state, LZSSE2_CompressOptimalParse ); 326 | 327 | LZSSE2_FreeOptimalParseState( state ); 328 | 329 | break; 330 | } 331 | 332 | case 4: 333 | { 334 | if ( optimal ) 335 | { 336 | LZSSE4_OptimalParseState* state = LZSSE4_MakeOptimalParseState( static_cast( blockSize ) ); 337 | 338 | Compress( inputFile, outputFile, blockSize, mode, level, state, LZSSE4_CompressOptimalParse ); 339 | 340 | LZSSE4_FreeOptimalParseState( state ); 341 | } 342 | else 343 | { 344 | LZSSE4_FastParseState* state = LZSSE4_MakeFastParseState(); 345 | 346 | Compress( inputFile, outputFile, blockSize, mode, level, state, CompressorFastLZSSE4 ); 347 | 348 | LZSSE4_FreeFastParseState( state ); 349 | } 350 | 351 | break; 352 | } 353 | 354 | case 8: 355 | { 356 | if ( optimal ) 357 | { 358 | LZSSE8_OptimalParseState* state = LZSSE8_MakeOptimalParseState( static_cast( blockSize ) ); 359 | 360 | Compress( inputFile, outputFile, blockSize, mode, level, state, LZSSE8_CompressOptimalParse ); 361 | 362 | LZSSE8_FreeOptimalParseState( state ); 363 | } 364 | else 365 | { 366 | LZSSE8_FastParseState* state = LZSSE8_MakeFastParseState(); 367 | 368 | Compress( inputFile, outputFile, blockSize, mode, level, state, CompressorFastLZSSE8 ); 369 | 370 | LZSSE8_FreeFastParseState( state ); 371 | } 372 | 373 | break; 374 | } 375 | 376 | default: 377 | 378 | printf( "Invalid stream type\n" ); 379 | exit( 1 ); 380 | 381 | } 382 | } 383 | 384 | fclose( inputFile ); 385 | fclose( outputFile ); 386 | 387 | return 0; 388 | } 389 | 390 | -------------------------------------------------------------------------------- /lzsse2/lzsse2.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "lzsse2_platform.h" 32 | #include "lzsse2.h" 33 | 34 | #pragma warning ( disable : 4127 ) 35 | 36 | namespace 37 | { 38 | // Constants - most of these should not be changed without corresponding code changes because it will break many things in unpredictable ways. 39 | const uint32_t WINDOW_BITS = 16; 40 | const uint32_t MIN_MATCH_LENGTH = 3; 41 | const uint32_t LZ_WINDOW_SIZE = 1 << WINDOW_BITS; 42 | const uint32_t LZ_WINDOW_MASK = LZ_WINDOW_SIZE - 1; 43 | const uint32_t OPTIMAL_HASH_BITS = 20; 44 | const uint32_t OPTIMAL_BUCKETS_COUNT = 1 << OPTIMAL_HASH_BITS; 45 | const uint32_t OPTIMAL_HASH_MASK = OPTIMAL_BUCKETS_COUNT - 1; 46 | const uint32_t MIN_COMPRESSION_SIZE = 32; 47 | const uint32_t END_PADDING_LITERALS = 16; 48 | const int32_t NO_MATCH = -1; 49 | const int32_t EMPTY_NODE = -1; 50 | const uint32_t MIN_LITERAL_COUNT = 8; 51 | const uint32_t HASH_MULTIPLIER = 4013; 52 | const uint32_t HASH_REMOVAL_MULTIPLIER = HASH_MULTIPLIER * HASH_MULTIPLIER; 53 | const uint32_t CONTROL_BITS = 4; 54 | const uint32_t LITERAL_BITS = 8; 55 | const uint32_t OFFSET_BITS = 16; 56 | const uint32_t BASE_MATCH_BITS = OFFSET_BITS + CONTROL_BITS; 57 | const uint32_t SINGLE_LITERAL_COST = CONTROL_BITS + LITERAL_BITS; 58 | const uint32_t DOUBLE_LITERAL_COST = SINGLE_LITERAL_COST + LITERAL_BITS; 59 | const uint32_t EXTENDED_MATCH_BOUND = ( 1 << CONTROL_BITS ) - 1; 60 | const uint32_t CONTROL_BLOCK_SIZE = sizeof( __m128i ); 61 | const uint32_t CONTROLS_PER_BLOCK = 32; 62 | const uint32_t LITERALS_PER_CONTROL = 2; 63 | const uint32_t MAX_INPUT_PER_CONTROL = 2; 64 | const size_t OUTPUT_BUFFER_SAFE = EXTENDED_MATCH_BOUND * CONTROLS_PER_BLOCK; 65 | const size_t INPUT_BUFFER_SAFE = MAX_INPUT_PER_CONTROL * CONTROLS_PER_BLOCK; 66 | const uint16_t INITIAL_OFFSET = MIN_MATCH_LENGTH; 67 | const size_t SKIP_MATCH_LENGTH = 128; 68 | const uint32_t NO_SKIP_LEVEL = 17; 69 | } 70 | 71 | struct Arrival 72 | { 73 | size_t cost; 74 | int32_t from; 75 | int32_t to; 76 | uint16_t offset; 77 | }; 78 | 79 | struct TreeNode 80 | { 81 | int32_t children[ 2 ]; 82 | }; 83 | 84 | struct LZSSE2_OptimalParseState 85 | { 86 | // Note, we should really replace this with a BST, hash chaining works but is *slooooooooooooooow* for optimal parse. 
87 | int32_t roots[ OPTIMAL_BUCKETS_COUNT ]; 88 | 89 | TreeNode window[ LZ_WINDOW_SIZE ]; 90 | 91 | Arrival* arrivals; 92 | 93 | size_t bufferSize; 94 | }; 95 | 96 | 97 | LZSSE2_OptimalParseState* LZSSE2_MakeOptimalParseState( size_t bufferSize ) 98 | { 99 | if ( bufferSize > 0 && ( SIZE_MAX / sizeof( Arrival ) ) < bufferSize ) 100 | { 101 | return nullptr; 102 | } 103 | 104 | LZSSE2_OptimalParseState* result = reinterpret_cast< LZSSE2_OptimalParseState* >( ::malloc( sizeof( LZSSE2_OptimalParseState ) ) ); 105 | 106 | result->bufferSize = bufferSize; 107 | 108 | if ( result != nullptr ) 109 | { 110 | result->arrivals = reinterpret_cast< Arrival* >( ::malloc( sizeof( Arrival ) * bufferSize ) ); 111 | 112 | if ( result->arrivals == nullptr ) 113 | { 114 | LZSSE2_FreeOptimalParseState( result ); 115 | 116 | result = nullptr; 117 | } 118 | } 119 | 120 | return result; 121 | } 122 | 123 | 124 | void LZSSE2_FreeOptimalParseState( LZSSE2_OptimalParseState* toFree ) 125 | { 126 | ::free( toFree->arrivals ); 127 | 128 | toFree->arrivals = nullptr; 129 | 130 | ::free( toFree ); 131 | } 132 | 133 | 134 | inline uint32_t CalculateHash( const uint8_t* inputCursor ) 135 | { 136 | return ( uint32_t( inputCursor[ 0 ] ) * HASH_MULTIPLIER * HASH_MULTIPLIER + uint32_t( inputCursor[ 1 ] ) * HASH_MULTIPLIER + uint32_t( inputCursor[ 2 ] ) ) & OPTIMAL_HASH_MASK; 137 | } 138 | 139 | 140 | struct Match 141 | { 142 | size_t length; 143 | int32_t position; 144 | uint16_t offset; 145 | }; 146 | 147 | 148 | inline Match SearchAndUpdateFinder( LZSSE2_OptimalParseState& state, const uint8_t* input, const uint8_t* inputCursor, const uint8_t* inputEnd, uint32_t cutOff ) 149 | { 150 | Match result; 151 | 152 | int32_t position = static_cast( inputCursor - input ); 153 | 154 | result.position = NO_MATCH; 155 | result.length = MIN_MATCH_LENGTH; 156 | result.offset = 0; 157 | 158 | size_t lengthToEnd = inputEnd - inputCursor; 159 | int32_t lastPosition = position - ( LZ_WINDOW_SIZE - 1 ); 160 | uint32_t hash = CalculateHash( inputCursor ); 161 | 162 | lastPosition = lastPosition > 0 ? lastPosition : 0; 163 | 164 | int32_t treeCursor = state.roots[ hash ]; 165 | 166 | state.roots[ hash ] = position; 167 | 168 | int32_t* left = &state.window[ position & LZ_WINDOW_MASK ].children[ 1 ]; 169 | int32_t* right = &state.window[ position & LZ_WINDOW_MASK ].children[ 0 ]; 170 | size_t leftLength = 0; 171 | size_t rightLength = 0; 172 | 173 | for ( ;; ) 174 | { 175 | if ( cutOff-- == 0 || treeCursor < lastPosition ) 176 | { 177 | *left = *right = EMPTY_NODE; 178 | break; 179 | } 180 | 181 | TreeNode& currentNode = state.window[ treeCursor & LZ_WINDOW_MASK ]; 182 | const uint8_t* key = input + treeCursor; 183 | size_t matchLength = leftLength < rightLength ? leftLength : rightLength; 184 | 185 | uint16_t matchOffset = static_cast( position - treeCursor ); 186 | size_t maxLength = matchOffset <= ( EXTENDED_MATCH_BOUND + 1 ) && matchOffset < lengthToEnd ? matchOffset : lengthToEnd; 187 | 188 | while ( matchLength < lengthToEnd ) 189 | { 190 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 191 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( key + matchLength ) ); 192 | 193 | unsigned long matchBytes; 194 | 195 | _BitScanForward( &matchBytes, ( static_cast( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 196 | 197 | matchLength += matchBytes; 198 | 199 | if ( matchBytes != 16 ) 200 | { 201 | break; 202 | } 203 | } 204 | 205 | matchLength = matchLength < lengthToEnd ? 
matchLength : lengthToEnd; 206 | 207 | size_t truncatedMatchLength = matchLength < maxLength ? matchLength : maxLength; 208 | 209 | if ( truncatedMatchLength >= result.length ) 210 | { 211 | result.length = truncatedMatchLength; 212 | result.offset = matchOffset; 213 | result.position = treeCursor; 214 | } 215 | 216 | if ( matchLength == lengthToEnd ) 217 | { 218 | *left = currentNode.children[ 1 ]; 219 | *right = currentNode.children[ 0 ]; 220 | break; 221 | } 222 | 223 | if ( inputCursor[ matchLength ] < key[ matchLength ] || ( matchLength == lengthToEnd ) ) 224 | { 225 | *left = treeCursor; 226 | left = currentNode.children; 227 | treeCursor = *left; 228 | leftLength = matchLength; 229 | } 230 | else 231 | { 232 | *right = treeCursor; 233 | right = currentNode.children + 1; 234 | treeCursor = *right; 235 | rightLength = matchLength; 236 | } 237 | } 238 | 239 | // Special RLE overlapping match case, the LzFind style match above doesn't work very well with our 240 | // restriction of overlapping matches having offsets of at least 16. 241 | // Suffix array seems like a better option to handling this. 242 | { 243 | // Note, we're detecting long RLE here, but if we have an offset too close, we'll sacrifice a fair 244 | // amount of decompression performance to load-hit-stores. 245 | int32_t matchPosition = position - ( sizeof( __m128i ) * 2 ); 246 | 247 | if ( matchPosition >= 0 ) 248 | { 249 | uint16_t matchOffset = static_cast( position - matchPosition ); 250 | const uint8_t* key = input + matchPosition; 251 | size_t matchLength = 0; 252 | 253 | while ( matchLength < lengthToEnd ) 254 | { 255 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 256 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( key + matchLength ) ); 257 | 258 | unsigned long matchBytes; 259 | 260 | _BitScanForward( &matchBytes, ( static_cast( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 261 | 262 | matchLength += matchBytes; 263 | 264 | if ( matchBytes != 16 ) 265 | { 266 | break; 267 | } 268 | 269 | } 270 | 271 | matchLength = matchLength < lengthToEnd ? matchLength : lengthToEnd; 272 | 273 | if ( matchLength >= result.length ) 274 | { 275 | result.length = matchLength; 276 | result.offset = matchOffset; 277 | result.position = matchPosition; 278 | } 279 | } 280 | } 281 | 282 | return result; 283 | } 284 | 285 | 286 | size_t LZSSE2_CompressOptimalParse( LZSSE2_OptimalParseState* state, const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength, unsigned int level ) 287 | { 288 | if ( outputLength < inputLength || state->bufferSize < inputLength ) 289 | { 290 | // error case, output buffer not large enough. 
291 | return 0; 292 | } 293 | 294 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 295 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 296 | 297 | if ( inputLength < MIN_COMPRESSION_SIZE ) 298 | { 299 | memcpy( output, input, inputLength ); 300 | 301 | return inputLength; 302 | } 303 | 304 | const uint8_t* inputCursor = input; 305 | const uint8_t* inputEnd = input + inputLength; 306 | Arrival* arrivalWatermark = state->arrivals; 307 | Arrival* arrival = state->arrivals; 308 | uint32_t cutOff = 1 << level; 309 | 310 | for ( int32_t* rootCursor = state->roots, *end = rootCursor + OPTIMAL_BUCKETS_COUNT; rootCursor < end; rootCursor += 4 ) 311 | { 312 | rootCursor[ 0 ] = EMPTY_NODE; 313 | rootCursor[ 1 ] = EMPTY_NODE; 314 | rootCursor[ 2 ] = EMPTY_NODE; 315 | rootCursor[ 3 ] = EMPTY_NODE; 316 | } 317 | 318 | for ( uint32_t where = 0; where < MIN_MATCH_LENGTH; ++where ) 319 | { 320 | /*Match dummy = */ SearchAndUpdateFinder( *state, input, inputCursor, inputEnd - END_PADDING_LITERALS, cutOff ); 321 | 322 | ++inputCursor; 323 | } 324 | 325 | arrival->cost = LITERAL_BITS * MIN_MATCH_LENGTH; 326 | arrival->from = -1; 327 | arrival->offset = 0; 328 | 329 | // loop through each character and project forward the matches at that character to calculate the cheapest 330 | // path of arrival for each individual character. 331 | for ( const uint8_t* earlyEnd = inputEnd - END_PADDING_LITERALS; inputCursor < earlyEnd; ++inputCursor, ++arrival ) 332 | { 333 | uint32_t lengthToEnd = static_cast< uint32_t >( earlyEnd - inputCursor ); 334 | int32_t currentPosition = static_cast< int32_t >( inputCursor - input ); 335 | Arrival* literalFirst = arrival + 1; 336 | Arrival* literalSecond = arrival + 2; 337 | size_t arrivalCost = arrival->cost; 338 | 339 | // NOTE - we currently assume only 2 literals filled in here, because the minimum match length is 3. 340 | // If we wanted to go with a higher minimum match length, we would need to fill in more literals before hand. 341 | // Also, because there is a maximum of 2 literals per control block assumed. 342 | 343 | // project forward the cost of a single literal 344 | if ( literalFirst > arrivalWatermark || literalFirst->cost > ( arrival->cost + SINGLE_LITERAL_COST ) ) 345 | { 346 | literalFirst->cost = arrival->cost + SINGLE_LITERAL_COST; 347 | literalFirst->from = currentPosition; 348 | literalFirst->offset = 0; 349 | 350 | arrivalWatermark = literalFirst > arrivalWatermark ? literalFirst : arrivalWatermark; 351 | } 352 | 353 | // project forward the cost of two literals 354 | if ( lengthToEnd > 1 ) 355 | { 356 | if ( literalSecond > arrivalWatermark || literalFirst->cost > ( arrival->cost + DOUBLE_LITERAL_COST ) ) 357 | { 358 | literalSecond->cost = arrival->cost + DOUBLE_LITERAL_COST; 359 | literalSecond->from = currentPosition; 360 | literalSecond->offset = 0; 361 | 362 | arrivalWatermark = literalSecond > arrivalWatermark ? 
literalSecond : arrivalWatermark; 363 | } 364 | } 365 | else 366 | { 367 | continue; 368 | } 369 | 370 | Match match = SearchAndUpdateFinder( *state, input, inputCursor, earlyEnd, cutOff ); 371 | 372 | if ( match.position != NO_MATCH ) 373 | { 374 | for ( size_t matchedLength = MIN_MATCH_LENGTH, end = match.length + 1; matchedLength < end; ++matchedLength ) 375 | { 376 | Arrival* matchArrival = arrival + matchedLength; 377 | size_t matchCost = arrivalCost + BASE_MATCH_BITS; 378 | 379 | if ( matchedLength > EXTENDED_MATCH_BOUND ) 380 | { 381 | matchCost += ( ( matchedLength - 1 ) / EXTENDED_MATCH_BOUND ) * CONTROL_BITS; 382 | } 383 | 384 | if ( matchArrival > arrivalWatermark || matchArrival->cost > matchCost ) 385 | { 386 | matchArrival->cost = matchCost; 387 | matchArrival->from = currentPosition; 388 | matchArrival->offset = match.offset; 389 | 390 | arrivalWatermark = matchArrival > arrivalWatermark ? matchArrival : arrivalWatermark; 391 | } 392 | } 393 | 394 | if ( match.length > SKIP_MATCH_LENGTH && level < NO_SKIP_LEVEL ) 395 | { 396 | arrival += match.length - LITERALS_PER_CONTROL; 397 | inputCursor += match.length - LITERALS_PER_CONTROL; 398 | } 399 | } 400 | } 401 | 402 | // If this would cost more to encode than it would if it were just literals, encode it with no control blocks, 403 | // just literals 404 | if ( ( arrivalWatermark->cost + END_PADDING_LITERALS * LITERAL_BITS + CONTROLS_PER_BLOCK * CONTROL_BITS ) > ( inputLength * LITERAL_BITS ) ) 405 | { 406 | memcpy( output, input, inputLength ); 407 | 408 | return inputLength; 409 | } 410 | 411 | Arrival* previousPathNode; 412 | 413 | // now trace the actual optimal parse path back, connecting the nodes in the other direction. 414 | for ( const Arrival* pathNode = arrivalWatermark; pathNode->from > 0; pathNode = previousPathNode ) 415 | { 416 | previousPathNode = state->arrivals + ( pathNode->from - MIN_MATCH_LENGTH ); 417 | 418 | previousPathNode->to = static_cast( ( pathNode - state->arrivals ) + MIN_MATCH_LENGTH ); 419 | } 420 | 421 | uint8_t* outputCursor = output; 422 | 423 | memcpy( outputCursor, input, MIN_MATCH_LENGTH ); 424 | 425 | outputCursor += MIN_MATCH_LENGTH; 426 | 427 | uint8_t* currentControlBlock = outputCursor; 428 | uint32_t currentControlCount = 0; 429 | uint32_t totalControlCount = 0; 430 | 431 | outputCursor += CONTROL_BLOCK_SIZE; 432 | 433 | Arrival* nextPathNode; 434 | 435 | size_t totalPathLength = MIN_MATCH_LENGTH; 436 | uint16_t previousOffset = INITIAL_OFFSET; 437 | 438 | bool lastControlIsNop = false; 439 | 440 | // Now walk forwards again and actually write out the data. 
441 | for ( const Arrival* pathNode = state->arrivals; pathNode < arrivalWatermark; pathNode = nextPathNode ) 442 | { 443 | int32_t currentPosition = static_cast< int32_t >( ( pathNode - state->arrivals ) + MIN_MATCH_LENGTH ); 444 | 445 | nextPathNode = state->arrivals + ( pathNode->to - MIN_MATCH_LENGTH ); 446 | 447 | size_t pathDistance = nextPathNode - pathNode; 448 | 449 | totalPathLength += pathDistance; 450 | 451 | lastControlIsNop = false; 452 | 453 | if ( pathDistance < MIN_MATCH_LENGTH ) 454 | { 455 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 456 | { 457 | currentControlBlock = outputCursor; 458 | outputCursor += CONTROL_BLOCK_SIZE; 459 | currentControlCount = 0; 460 | } 461 | 462 | if ( ( currentControlCount & 1 ) == 0 ) 463 | { 464 | currentControlBlock[ currentControlCount >> 1 ] = 465 | ( static_cast( pathDistance ) - 1 ); 466 | } 467 | else 468 | { 469 | currentControlBlock[ currentControlCount >> 1 ] |= 470 | ( static_cast< uint8_t >( pathDistance ) - 1 ) << CONTROL_BITS; 471 | } 472 | 473 | // output the literals. 474 | for ( int32_t where = 0; where < pathDistance; ++where ) 475 | { 476 | const uint8_t* currentInput = input + currentPosition + where; 477 | 478 | outputCursor[ where ] = *currentInput ^ *( currentInput - previousOffset ); 479 | } 480 | 481 | outputCursor += pathDistance; 482 | 483 | ++totalControlCount; 484 | ++currentControlCount; 485 | } 486 | else 487 | { 488 | size_t toEncode = pathDistance - 1; //note, we always subtract one here, because the first control block of the match encodes this way 489 | 490 | // make sure the control block for the first part of the match has been allocated 491 | // note, this is idempontent if we have not actually incremented the control count and we try this again. 492 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 493 | { 494 | currentControlBlock = outputCursor; 495 | outputCursor += CONTROL_BLOCK_SIZE; 496 | currentControlCount = 0; 497 | } 498 | 499 | // output the offset (after control block containing the first control nibble for this match). 
500 | *reinterpret_cast< uint16_t* >( outputCursor ) = nextPathNode->offset ^ previousOffset; 501 | 502 | previousOffset = nextPathNode->offset; 503 | 504 | outputCursor += sizeof( uint16_t ); 505 | 506 | for ( ;; ) 507 | { 508 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 509 | { 510 | currentControlBlock = outputCursor; 511 | outputCursor += CONTROL_BLOCK_SIZE; 512 | currentControlCount = 0; 513 | } 514 | 515 | if ( toEncode >= EXTENDED_MATCH_BOUND ) 516 | { 517 | if ( ( currentControlCount & 1 ) == 0 ) 518 | { 519 | currentControlBlock[ currentControlCount >> 1 ] = 520 | static_cast( EXTENDED_MATCH_BOUND ); 521 | } 522 | else 523 | { 524 | currentControlBlock[ currentControlCount >> 1 ] |= 525 | static_cast< uint8_t >( EXTENDED_MATCH_BOUND ) << CONTROL_BITS; 526 | } 527 | 528 | toEncode -= EXTENDED_MATCH_BOUND; 529 | 530 | ++totalControlCount; 531 | ++currentControlCount; 532 | 533 | } 534 | else 535 | { 536 | if ( ( currentControlCount & 1 ) == 0 ) 537 | { 538 | currentControlBlock[ currentControlCount >> 1 ] = 539 | static_cast( toEncode ); 540 | } 541 | else 542 | { 543 | currentControlBlock[ currentControlCount >> 1 ] |= 544 | static_cast< uint8_t >( toEncode ) << CONTROL_BITS; 545 | } 546 | 547 | if ( toEncode == 0 && currentControlCount == 0 ) 548 | { 549 | lastControlIsNop = true; 550 | } 551 | 552 | ++totalControlCount; 553 | ++currentControlCount; 554 | 555 | break; 556 | } 557 | } 558 | } 559 | } 560 | 561 | if ( lastControlIsNop ) 562 | { 563 | outputCursor -= CONTROL_BLOCK_SIZE; 564 | } 565 | 566 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 567 | 568 | // copy remaining literals 569 | memcpy( outputCursor, inputCursor, remainingLiterals ); 570 | 571 | outputCursor += remainingLiterals; 572 | 573 | return outputCursor - output; 574 | } 575 | 576 | 577 | size_t LZSSE2_Decompress( const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength ) 578 | { 579 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 580 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 581 | 582 | // Length it not work compressing, just copy initial values 583 | if ( outputLength == inputLength ) 584 | { 585 | memcpy( output, input, outputLength ); 586 | 587 | return inputLength; 588 | } 589 | 590 | const uint8_t* inputCursor = input; 591 | uint8_t* outputCursor = output; 592 | 593 | // The offset starts off as the minimum match length. We actually need it at least two 594 | // characters back because we need them to be set to xor out the literals from the match data. 595 | size_t offset = INITIAL_OFFSET; 596 | __m128i previousCarryHi = _mm_setzero_si128(); 597 | 598 | *( outputCursor++ ) = *( inputCursor++ ); 599 | *( outputCursor++ ) = *( inputCursor++ ); 600 | *( outputCursor++ ) = *( inputCursor++ ); 601 | 602 | // What these macros do: 603 | // Decode a single literal run or match run for a single control nibble. 604 | // How they do it: 605 | // - Read the *unaligned* input (in the case of LZSSE-F - twice, for LZSSE-O we read once) - one goes into an SSE register, 606 | // because it could either be literals or an offset (or nothing at all). The low byte of streamBytesRead controls how much we advance 607 | // the input cursor. 
608 | // - Used a contived set of casts to sign extend the "read offset" control mask and then use it to mask the input word, 609 | // which is then xor'd against the offset, for a "branchless" conditional move into the offset which 610 | // has been carried over from the previous literal/match block. Note, this ends up doing better than a cmov on most 611 | // modern processors. But we need to pre-xor the input offset. 612 | // - We then load the match data from output buffer (offset back from the current output point). Unconditional load here. 613 | // - We broadcast the "from literal" control mask from the current least significant byte of the SSE register using a shuffle epi-8 614 | // - We mask the literals with that SSE register wide mask. 615 | // - The literals have been pre-xor'd with the data read in as match data, so we use an xor to branchlessly choose between the two. 616 | // In this case, it ends up a better option than a blendv on most processors. 617 | // - Store the block. We store all 16 bytes of the SSE register (due to some constraints in the format of the data, we won't 618 | // go past the end of the buffer), but we may overlap this. 619 | // - bytesOut controls how much we advance the output cursor. 620 | // - We use 8 bit shifts to advance all the controls up to the next byte. There is some variable sized register trickery that 621 | // x86/x64 is great for as long as we don't anger the register renamer. 622 | 623 | #define DECODE_STEP( HILO, CHECKMATCH, CHECKBUFFERS ) \ 624 | { \ 625 | \ 626 | uint64_t inputWord = *reinterpret_cast( inputCursor ); \ 627 | __m128i literals = _mm_cvtsi64_si128( inputWord ); \ 628 | \ 629 | offset ^= static_cast( static_cast( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 630 | \ 631 | readOffsetHalf##HILO >>= 8; \ 632 | \ 633 | const uint8_t* matchPointer = outputCursor - offset; \ 634 | \ 635 | if ( CHECKMATCH && matchPointer < output ) \ 636 | goto MATCH_UNDERFLOW; \ 637 | \ 638 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 639 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 640 | \ 641 | literals = _mm_and_si128( fromLiteral, literals ); \ 642 | \ 643 | fromLiteral##HILO = _mm_srli_si128( fromLiteral##HILO, 1 ); \ 644 | \ 645 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 646 | \ 647 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 648 | \ 649 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 650 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 651 | \ 652 | bytesOutHalf##HILO >>= 8; \ 653 | streamBytesReadHalf##HILO >>= 8; \ 654 | \ 655 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 656 | goto BUFFER_END; \ 657 | } 658 | 659 | #define DECODE_STEP_HALF( HILO, CHECKMATCH, CHECKBUFFERS ) \ 660 | { \ 661 | size_t inputWord = *reinterpret_cast< const uint16_t* >( inputCursor ); \ 662 | __m128i literals = _mm_cvtsi64_si128( inputWord ); \ 663 | \ 664 | offset ^= static_cast< size_t >( static_cast< ptrdiff_t >( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 665 | \ 666 | const uint8_t* matchPointer = outputCursor - offset; \ 667 | \ 668 | if ( CHECKMATCH && matchPointer < output ) \ 669 | goto MATCH_UNDERFLOW; \ 670 | \ 671 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 672 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 673 | \ 674 | literals = _mm_and_si128( 
fromLiteral, literals ); \ 675 | \ 676 | fromLiteral##HILO = _mm_srli_si128( fromLiteral##HILO, 1 ); \ 677 | \ 678 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 679 | \ 680 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 681 | \ 682 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 683 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 684 | \ 685 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 686 | goto BUFFER_END; \ 687 | } 688 | 689 | #define DECODE_STEP_END( HILO, CHECKMATCH, CHECKBUFFERS ) \ 690 | { \ 691 | size_t inputWord = *reinterpret_cast< const uint16_t* >( inputCursor ); \ 692 | __m128i literals = _mm_cvtsi64_si128( inputWord ); \ 693 | \ 694 | offset ^= static_cast< size_t >( static_cast< ptrdiff_t >( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 695 | \ 696 | const uint8_t* matchPointer = outputCursor - offset; \ 697 | \ 698 | if ( CHECKMATCH && matchPointer < output ) \ 699 | goto MATCH_UNDERFLOW; \ 700 | \ 701 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 702 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 703 | \ 704 | literals = _mm_and_si128( fromLiteral, literals ); \ 705 | \ 706 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 707 | \ 708 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 709 | \ 710 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 711 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 712 | \ 713 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 714 | goto BUFFER_END; \ 715 | } 716 | 717 | #define DECODE_STEP_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP( Lo, CHECKMATCH, CHECKBUFFERS ) 718 | #define DECODE_STEP_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP( Hi, CHECKMATCH, CHECKBUFFERS ) 719 | #define DECODE_STEP_HALF_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_HALF( Lo, CHECKMATCH, CHECKBUFFERS ) 720 | #define DECODE_STEP_HALF_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_HALF( Hi, CHECKMATCH, CHECKBUFFERS ) 721 | #define DECODE_STEP_END_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_END( Lo, CHECKMATCH, CHECKBUFFERS ) 722 | #define DECODE_STEP_END_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_END( Hi, CHECKMATCH, CHECKBUFFERS ) 723 | 724 | __m128i nibbleMask = _mm_set1_epi8( 0xF ); 725 | __m128i literalsPerControl = _mm_set1_epi8( LITERALS_PER_CONTROL ); 726 | 727 | // Note, we use this block here because it allows the "fake" inputEarlyEnd/outputEarlyEnd not to cause register spills 728 | // in the decompression loops. And yes, that did actually happen. 729 | { 730 | #pragma warning ( push ) 731 | #pragma warning ( disable : 4101 ) 732 | 733 | const uint8_t* inputEarlyEnd; //= ( input + inputLength ) - END_PADDING_LITERALS; 734 | uint8_t* outputEarlyEnd;// = ( output + outputLength ) - END_PADDING_LITERALS; 735 | 736 | #pragma warning ( pop ) 737 | 738 | // "Safe" ends to the buffer, before the input/output cursors hit these, we can loop without overflow checks. 
739 | const uint8_t* inputSafeEnd = ( input + inputLength ) - INPUT_BUFFER_SAFE; 740 | uint8_t* outputSafeEnd = ( output + outputLength ) - OUTPUT_BUFFER_SAFE; 741 | 742 | // Decoding loop with offset output buffer underflow test, but no buffer overflow tests, assumed to end at a safe distance 743 | // from overflows 744 | while ( ( outputCursor - output ) < LZ_WINDOW_SIZE && outputCursor < outputSafeEnd && inputCursor < inputSafeEnd ) 745 | { 746 | // load the control block 747 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); 748 | 749 | // split the control block into high and low nibbles. 750 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 751 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 752 | 753 | // Here we are testing if the runs will be literals or matches. Note that if the carries are set from the previous operation 754 | // this will essentially be ignored later on. 755 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 756 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 757 | 758 | // Work out the carry for the low nibbles (which will be used with the high controls to put them into 759 | // match without offset read mode). 760 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 761 | 762 | // The carry for the high nibbles is used with the low controls, but needs one byte from the previous iteration. We save 763 | // the calculated carry to use that byte next iteration. 764 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 765 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); 766 | 767 | previousCarryHi = carryHi; 768 | 769 | // I want 128 set bits please. 770 | __m128i allSet = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 771 | 772 | // Calcualting the bytes to output to the stream. Basically, we are subtracting negative one from the control value if the 773 | // carry is not set. This works because the masks produced by comparisons are the equivalent to negative one, which 774 | // make this a conditional increment. 775 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_xor_si128( shiftedCarryHi, allSet ) ); 776 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_xor_si128( carryLo, allSet ) ); 777 | 778 | // Calculate the number of bytes to read per control. 779 | // In the case the carry is set, no bytes. Otherwise, the offset size (2 bytes) for matches or the number of output bytes for literals. 780 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_min_epi8( literalsPerControl, bytesOutLo ) ); 781 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_min_epi8( literalsPerControl, bytesOutHi ) ); 782 | 783 | // Masks to read the offset (or keep the previous one) - set in the case that this is not a literal and the carry is not set 784 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), allSet ); 785 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), allSet ); 786 | 787 | // Masks whether we are reading literals - set if the carry is not set and these are literals. 788 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 789 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 790 | 791 | // Advance the input past the control block. 
792 | inputCursor += CONTROL_BLOCK_SIZE; 793 | 794 | { 795 | // Pull out the bottom halves off the SSE registers from before - we want these 796 | // things in GPRs for the more linear logic. 797 | uint64_t bytesOutHalfLo = static_cast( _mm_cvtsi128_si64( bytesOutLo ) ); 798 | uint64_t bytesOutHalfHi = static_cast( _mm_cvtsi128_si64( bytesOutHi ) ); 799 | 800 | uint64_t streamBytesReadHalfLo = static_cast( _mm_cvtsi128_si64( streamBytesReadLo ) ); 801 | uint64_t streamBytesReadHalfHi = static_cast( _mm_cvtsi128_si64( streamBytesReadHi ) ); 802 | 803 | uint64_t readOffsetHalfLo = static_cast( _mm_cvtsi128_si64( readOffsetLo ) ); 804 | uint64_t readOffsetHalfHi = static_cast( _mm_cvtsi128_si64( readOffsetHi ) ); 805 | 806 | DECODE_STEP_LO( true, false ); 807 | DECODE_STEP_HI( true, false ); 808 | DECODE_STEP_LO( true, false ); 809 | DECODE_STEP_HI( true, false ); 810 | 811 | DECODE_STEP_LO( true, false ); 812 | DECODE_STEP_HI( true, false ); 813 | DECODE_STEP_LO( true, false ); 814 | DECODE_STEP_HI( true, false ); 815 | 816 | DECODE_STEP_LO( true, false ); 817 | DECODE_STEP_HI( true, false ); 818 | DECODE_STEP_LO( true, false ); 819 | DECODE_STEP_HI( true, false ); 820 | 821 | DECODE_STEP_LO( true, false ); 822 | DECODE_STEP_HI( true, false ); 823 | DECODE_STEP_HALF_LO( true, false ); 824 | DECODE_STEP_HALF_HI( true, false ); 825 | } 826 | 827 | { 828 | // Now the top halves. 829 | uint64_t bytesOutHalfLo = static_cast( _mm_extract_epi64( bytesOutLo, 1 ) ); 830 | uint64_t bytesOutHalfHi = static_cast( _mm_extract_epi64( bytesOutHi, 1 ) ); 831 | 832 | uint64_t streamBytesReadHalfLo = static_cast( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 833 | uint64_t streamBytesReadHalfHi = static_cast( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 834 | 835 | uint64_t readOffsetHalfLo = static_cast( _mm_extract_epi64( readOffsetLo, 1 ) ); 836 | uint64_t readOffsetHalfHi = static_cast( _mm_extract_epi64( readOffsetHi, 1 ) ); 837 | 838 | DECODE_STEP_LO( true, false ); 839 | DECODE_STEP_HI( true, false ); 840 | DECODE_STEP_LO( true, false ); 841 | DECODE_STEP_HI( true, false ); 842 | 843 | DECODE_STEP_LO( true, false ); 844 | DECODE_STEP_HI( true, false ); 845 | DECODE_STEP_LO( true, false ); 846 | DECODE_STEP_HI( true, false ); 847 | 848 | DECODE_STEP_LO( true, false ); 849 | DECODE_STEP_HI( true, false ); 850 | DECODE_STEP_LO( true, false ); 851 | DECODE_STEP_HI( true, false ); 852 | 853 | DECODE_STEP_LO( true, false ); 854 | DECODE_STEP_HI( true, false ); 855 | DECODE_STEP_END_LO( true, false ); 856 | DECODE_STEP_END_HI( true, false ); 857 | } 858 | } 859 | 860 | // Decoding loop with no buffer checks, but will end at a safe distance from the end of the buffers. 861 | // Note, when we get here we have already reached the point in the output buffer which is *past* where we can underflow 862 | // due to a bad match offset. 
863 | while ( outputCursor < outputSafeEnd && inputCursor < inputSafeEnd ) 864 | { 865 | // This code is the same as the loop above, see comments there 866 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); 867 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 868 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 869 | 870 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 871 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 872 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 873 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 874 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); // where we take the carry from the previous hi values 875 | 876 | previousCarryHi = carryHi; 877 | 878 | __m128i neg1 = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 879 | 880 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_xor_si128( shiftedCarryHi, neg1 ) ); 881 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_xor_si128( carryLo, neg1 ) ); 882 | 883 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_min_epi8( literalsPerControl, bytesOutLo ) ); 884 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_min_epi8( literalsPerControl, bytesOutHi ) ); 885 | 886 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), neg1 ); 887 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), neg1 ); 888 | 889 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 890 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 891 | 892 | inputCursor += CONTROL_BLOCK_SIZE; 893 | 894 | { 895 | uint64_t bytesOutHalfLo = static_cast( _mm_cvtsi128_si64( bytesOutLo ) ); 896 | uint64_t bytesOutHalfHi = static_cast( _mm_cvtsi128_si64( bytesOutHi ) ); 897 | 898 | uint64_t streamBytesReadHalfLo = static_cast( _mm_cvtsi128_si64( streamBytesReadLo ) ); 899 | uint64_t streamBytesReadHalfHi = static_cast( _mm_cvtsi128_si64( streamBytesReadHi ) ); 900 | 901 | uint64_t readOffsetHalfLo = static_cast( _mm_cvtsi128_si64( readOffsetLo ) ); 902 | uint64_t readOffsetHalfHi = static_cast( _mm_cvtsi128_si64( readOffsetHi ) ); 903 | 904 | DECODE_STEP_LO( false, false ); 905 | DECODE_STEP_HI( false, false ); 906 | DECODE_STEP_LO( false, false ); 907 | DECODE_STEP_HI( false, false ); 908 | 909 | DECODE_STEP_LO( false, false ); 910 | DECODE_STEP_HI( false, false ); 911 | DECODE_STEP_LO( false, false ); 912 | DECODE_STEP_HI( false, false ); 913 | 914 | DECODE_STEP_LO( false, false ); 915 | DECODE_STEP_HI( false, false ); 916 | DECODE_STEP_LO( false, false ); 917 | DECODE_STEP_HI( false, false ); 918 | 919 | DECODE_STEP_LO( false, false ); 920 | DECODE_STEP_HI( false, false ); 921 | DECODE_STEP_HALF_LO( false, false ); 922 | DECODE_STEP_HALF_HI( false, false ); 923 | } 924 | 925 | { 926 | uint64_t bytesOutHalfLo = static_cast( _mm_extract_epi64( bytesOutLo, 1 ) ); 927 | uint64_t bytesOutHalfHi = static_cast( _mm_extract_epi64( bytesOutHi, 1 ) ); 928 | 929 | uint64_t streamBytesReadHalfLo = static_cast( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 930 | uint64_t streamBytesReadHalfHi = static_cast( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 931 | 932 | uint64_t readOffsetHalfLo = static_cast( _mm_extract_epi64( readOffsetLo, 1 ) ); 933 | uint64_t readOffsetHalfHi = static_cast( _mm_extract_epi64( readOffsetHi, 1 ) ); 934 | 935 | DECODE_STEP_LO( false, false 
); 936 | DECODE_STEP_HI( false, false ); 937 | DECODE_STEP_LO( false, false ); 938 | DECODE_STEP_HI( false, false ); 939 | 940 | DECODE_STEP_LO( false, false ); 941 | DECODE_STEP_HI( false, false ); 942 | DECODE_STEP_LO( false, false ); 943 | DECODE_STEP_HI( false, false ); 944 | 945 | DECODE_STEP_LO( false, false ); 946 | DECODE_STEP_HI( false, false ); 947 | DECODE_STEP_LO( false, false ); 948 | DECODE_STEP_HI( false, false ); 949 | 950 | DECODE_STEP_LO( false, false ); 951 | DECODE_STEP_HI( false, false ); 952 | DECODE_STEP_END_LO( false, false ); 953 | DECODE_STEP_END_HI( false, false ); 954 | } 955 | } 956 | } 957 | 958 | // Decoding loop with all buffer checks. 959 | { 960 | const uint8_t* inputEarlyEnd; 961 | uint8_t* outputEarlyEnd; 962 | inputEarlyEnd = (( input + inputLength ) - END_PADDING_LITERALS); 963 | outputEarlyEnd = ( output + outputLength ) - END_PADDING_LITERALS; 964 | 965 | while ( outputCursor < outputEarlyEnd && inputCursor < inputEarlyEnd ) 966 | { 967 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); 968 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 969 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 970 | 971 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 972 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 973 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 974 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 975 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); // where we take the carry from the previous hi values 976 | 977 | previousCarryHi = carryHi; 978 | 979 | __m128i neg1 = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 980 | 981 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_xor_si128( shiftedCarryHi, neg1 ) ); 982 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_xor_si128( carryLo, neg1 ) ); 983 | 984 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_min_epi8( literalsPerControl, bytesOutLo ) ); 985 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_min_epi8( literalsPerControl, bytesOutHi ) ); 986 | 987 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), neg1 ); 988 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), neg1 ); 989 | 990 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 991 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 992 | 993 | inputCursor += CONTROL_BLOCK_SIZE; 994 | 995 | if ( inputCursor > inputEarlyEnd ) 996 | goto BUFFER_END; 997 | 998 | { 999 | uint64_t bytesOutHalfLo = static_cast( _mm_cvtsi128_si64( bytesOutLo ) ); 1000 | uint64_t bytesOutHalfHi = static_cast( _mm_cvtsi128_si64( bytesOutHi ) ); 1001 | 1002 | uint64_t streamBytesReadHalfLo = static_cast( _mm_cvtsi128_si64( streamBytesReadLo ) ); 1003 | uint64_t streamBytesReadHalfHi = static_cast( _mm_cvtsi128_si64( streamBytesReadHi ) ); 1004 | 1005 | uint64_t readOffsetHalfLo = static_cast( _mm_cvtsi128_si64( readOffsetLo ) ); 1006 | uint64_t readOffsetHalfHi = static_cast( _mm_cvtsi128_si64( readOffsetHi ) ); 1007 | 1008 | DECODE_STEP_LO( true, true ); 1009 | DECODE_STEP_HI( true, true ); 1010 | DECODE_STEP_LO( true, true ); 1011 | DECODE_STEP_HI( true, true ); 1012 | 1013 | DECODE_STEP_LO( true, true ); 1014 | DECODE_STEP_HI( true, true ); 1015 | DECODE_STEP_LO( true, true ); 1016 | DECODE_STEP_HI( true, true ); 1017 | 1018 | DECODE_STEP_LO( 
true, true ); 1019 | DECODE_STEP_HI( true, true ); 1020 | DECODE_STEP_LO( true, true ); 1021 | DECODE_STEP_HI( true, true ); 1022 | 1023 | DECODE_STEP_LO( true, true ); 1024 | DECODE_STEP_HI( true, true ); 1025 | DECODE_STEP_HALF_LO( true, true ); 1026 | DECODE_STEP_HALF_HI( true, true ); 1027 | } 1028 | 1029 | { 1030 | uint64_t bytesOutHalfLo = static_cast( _mm_extract_epi64( bytesOutLo, 1 ) ); 1031 | uint64_t bytesOutHalfHi = static_cast( _mm_extract_epi64( bytesOutHi, 1 ) ); 1032 | 1033 | uint64_t streamBytesReadHalfLo = static_cast( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 1034 | uint64_t streamBytesReadHalfHi = static_cast( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 1035 | 1036 | uint64_t readOffsetHalfLo = static_cast( _mm_extract_epi64( readOffsetLo, 1 ) ); 1037 | uint64_t readOffsetHalfHi = static_cast( _mm_extract_epi64( readOffsetHi, 1 ) ); 1038 | 1039 | DECODE_STEP_LO( true, true ); 1040 | DECODE_STEP_HI( true, true ); 1041 | DECODE_STEP_LO( true, true ); 1042 | DECODE_STEP_HI( true, true ); 1043 | 1044 | DECODE_STEP_LO( true, true ); 1045 | DECODE_STEP_HI( true, true ); 1046 | DECODE_STEP_LO( true, true ); 1047 | DECODE_STEP_HI( true, true ); 1048 | 1049 | DECODE_STEP_LO( true, true ); 1050 | DECODE_STEP_HI( true, true ); 1051 | DECODE_STEP_LO( true, true ); 1052 | DECODE_STEP_HI( true, true ); 1053 | 1054 | DECODE_STEP_LO( true, true ); 1055 | DECODE_STEP_HI( true, true ); 1056 | DECODE_STEP_END_LO( true, true ); 1057 | DECODE_STEP_END_HI( true, true ); 1058 | } 1059 | } 1060 | 1061 | BUFFER_END: 1062 | 1063 | // When we get here, we have either advanced the right amount on both cursors 1064 | // or something bad happened, so leave it as is, so we can tell where 1065 | // the error happened. 1066 | if ( inputCursor == inputEarlyEnd && outputCursor == outputEarlyEnd ) 1067 | { 1068 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 1069 | 1070 | // copy any trailing literals 1071 | memcpy( outputCursor, inputCursor, remainingLiterals ); 1072 | 1073 | outputCursor += remainingLiterals; 1074 | } 1075 | } 1076 | 1077 | MATCH_UNDERFLOW: 1078 | 1079 | return outputCursor - output; 1080 | } 1081 | -------------------------------------------------------------------------------- /lzsse2/lzsse2.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE2_H__ 27 | #define LZSSE2_H__ 28 | 29 | #pragma once 30 | 31 | /* LZSSE2 - x64/SSE targeted codec for better performance with high compression ratio data/more optimal compressors. 32 | * Supports minimum 3 byte matches, maximum 16 bytes of match per control word and 2 byte literal runs per control word. 33 | */ 34 | 35 | #ifdef __cplusplus 36 | extern "C" 37 | { 38 | #endif 39 | 40 | /* Re-usable parse state object for compression. */ 41 | typedef struct LZSSE2_OptimalParseState LZSSE2_OptimalParseState; 42 | 43 | /* Allocate the parse state for compression - returns null on failure. Note 44 | Buffersize has to be greater or equal to any inputLength used with LZSSE2_CompressOptimalParse */ 45 | LZSSE2_OptimalParseState* LZSSE2_MakeOptimalParseState( size_t bufferSize ); 46 | 47 | /* De-allocate the parse state for compression */ 48 | void LZSSE2_FreeOptimalParseState( LZSSE2_OptimalParseState* toFree ); 49 | 50 | /* "Optimal" compression routine. 51 | * Will compress data into LZSSE2 format, uses hash BST matching to find matches and run an optimal parse (high relative memory usage). Requires SSE 4.1. 52 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 53 | * input : Buffer containing uncompressed data to be compressed. May not be null. 54 | * inputLength : Length of the compressed data in the input buffer - note should be under 2GB. 55 | * output : Buffer that will receive the compressed output. 56 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 57 | * The compressed data should never be longer than inputLength, as in this case the data is stored raw. 58 | * level : The compression level to use for this file 1->17, 17 is highest compression, 0 is least 59 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 60 | * concurrently. 61 | * 62 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength). 63 | */ 64 | size_t LZSSE2_CompressOptimalParse( LZSSE2_OptimalParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int level ); 65 | 66 | /* Decompression routine. 67 | * This routine will decompress data in the LZSSE2 format and currently requires SSE 4.1 and is targeted at x64. 68 | * It will perform poorly on x86 due to hunger for registers. 69 | * input : Buffer containing compressed input block. May not be null. 70 | * inputLength : Length of the compressed data in the input buffer - note, this should be under 2GB 71 | * output : Buffer that will received the de-compressed output. Note, that this needs to be at least outputLength long. 72 | * May not be null. 
73 | * outputLength : The length of the compressed output - note, this should be under 2GB 74 | * 75 | * Provided that input and output are valid pointers to buffers of at least their specified size, this routine 76 | * should be memory safe - both match pointer checks and input/output buffer checks exist. 77 | * 78 | * Returns the size of the decompressed data, which will be less than outputLength in the event of an error (number of bytes 79 | * will indicate where in the output stream the error occured). 80 | * 81 | * Note that this data is not hash verified, errors that occur are either from a misformed stream or bad buffer sizes. 82 | * Remember, corrupt data can still be valid to decompress. 83 | */ 84 | size_t LZSSE2_Decompress( const void* input, size_t inputLength, void* output, size_t outputLength ); 85 | 86 | #ifdef __cplusplus 87 | } 88 | #endif 89 | 90 | #endif /* -- LZSSE2_H__ */ 91 | -------------------------------------------------------------------------------- /lzsse2/lzsse2_platform.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Brian Marshall 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE2_PLATFORM_H__ 27 | #define LZSSE2_PLATFORM_H__ 28 | 29 | #pragma once 30 | 31 | /* 32 | Compiler/Platform detection based on the table from: 33 | https://blogs.msdn.microsoft.com/vcblog/2015/12/04/clang-with-microsoft-codegen-in-vs-2015-update-1/ 34 | */ 35 | 36 | #ifdef _MSC_VER 37 | 38 | /* 39 | Microsoft Visual Studio Support. 40 | C1xx/C2, Clang/C2 and Clang/LLVM all support the Microsoft header files and _BitScanForward 41 | 42 | Note: if you receive errors with the intrinsics make sure that you have SSE4.1 support enabled. 43 | For example with Clang include "-msse4.1" on the command line 44 | */ 45 | #include 46 | 47 | #else /* _MSC_VER */ 48 | 49 | #ifdef __GNUC__ 50 | 51 | /* 52 | GCC 53 | */ 54 | 55 | #define SIMDE_ENABLE_NATIVE_ALIASES 56 | #include "../simde/x86/sse4.1.h" 57 | /* _BitScanForward is Visual Studio specific. 
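   On GCC/Clang we map it to __builtin_ctz, which yields the index of the lowest set bit.
   Unlike the intrinsic, this mapping discards the "found a bit" return value and is undefined
   for m == 0; the codecs guard against that by OR-ing the compare mask with 0x10000 before
   calling, so the argument is never zero.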
*/ 58 | #define _BitScanForward(x, m) *(x) = __builtin_ctz(m) 59 | 60 | #else 61 | 62 | /* 63 | If you hit the error below, then add detection for your compiler/platform to this header file. 64 | */ 65 | #error Platform not supported 66 | 67 | #endif /* __GNUC__ */ 68 | #endif /* _MSC_VER */ 69 | 70 | #endif /* -- LZSSE2_PLATFORM_H__ */ 71 | -------------------------------------------------------------------------------- /lzsse4/lzsse4.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "lzsse4_platform.h" 32 | #include "lzsse4.h" 33 | 34 | #pragma warning ( disable : 4127 ) 35 | 36 | namespace 37 | { 38 | // Constants - most of these should not be changed without corresponding code changes because it will break many things in unpredictable ways. 
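    // For orientation (a summary implied by the values below, not a separate specification):
    // offsets are 16 bits wide, which gives the 64KiB sliding window; each 16 byte control
    // block packs 32 four-bit controls, two per byte. In LZSSE4 a control of 0..3 encodes a
    // literal run of 1..4 bytes, 4..15 encodes a match of that many bytes, and a value of 15
    // additionally marks the next control as an extension of the same match.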
39 | const uint32_t WINDOW_BITS = 16; 40 | const uint32_t MIN_MATCH_LENGTH = 4; 41 | const uint32_t LZ_WINDOW_SIZE = 1 << WINDOW_BITS; 42 | const uint32_t LZ_WINDOW_MASK = LZ_WINDOW_SIZE - 1; 43 | const uint32_t FAST_HASH_BITS = 20; // You can change this - more bits = more matches, less bits = more cache hits 44 | const uint32_t FAST_BUCKETS_COUNT = 1 << FAST_HASH_BITS; 45 | const uint32_t FAST_HASH_MASK = FAST_BUCKETS_COUNT - 1; 46 | const uint32_t MIN_COMPRESSION_SIZE = 32; 47 | const uint32_t END_PADDING_LITERALS = 16; 48 | const int32_t NO_MATCH = -1; 49 | const int32_t EMPTY_NODE = -1; 50 | const uint32_t MIN_LITERAL_COUNT = 8; 51 | const uint32_t CONTROL_BITS = 4; 52 | const uint32_t LITERAL_BITS = 8; 53 | const uint32_t OFFSET_BITS = 16; 54 | const uint32_t BASE_MATCH_BITS = OFFSET_BITS + CONTROL_BITS; 55 | const uint32_t OFFSET_SIZE = 2; 56 | const uint32_t EXTENDED_MATCH_BOUND = ( 1 << CONTROL_BITS ) - 1; 57 | const uint32_t CONTROL_BLOCK_SIZE = sizeof( __m128i ); 58 | const uint32_t CONTROLS_PER_BLOCK = 32; 59 | const uint32_t LITERALS_PER_CONTROL = 4; 60 | const uint32_t MAX_INPUT_PER_CONTROL = 4; 61 | const size_t OUTPUT_BUFFER_SAFE = EXTENDED_MATCH_BOUND * CONTROLS_PER_BLOCK; 62 | const size_t INPUT_BUFFER_SAFE = MAX_INPUT_PER_CONTROL * CONTROLS_PER_BLOCK; 63 | const uint16_t INITIAL_OFFSET = MIN_MATCH_LENGTH; 64 | const uint32_t OPTIMAL_HASH_BITS = 20; 65 | const uint32_t OPTIMAL_BUCKETS_COUNT = 1 << OPTIMAL_HASH_BITS; 66 | const uint32_t OPTIMAL_HASH_MASK = OPTIMAL_BUCKETS_COUNT - 1; 67 | const size_t SKIP_MATCH_LENGTH = 128; 68 | const uint32_t NO_SKIP_LEVEL = 17; 69 | } 70 | 71 | 72 | struct LZSSE4_FastParseState 73 | { 74 | int32_t buckets[ FAST_BUCKETS_COUNT ]; // stores the first matching position, we can then look at the rest of the matches by tracing through the window. 75 | }; 76 | 77 | 78 | LZSSE4_FastParseState* LZSSE4_MakeFastParseState() 79 | { 80 | return new LZSSE4_FastParseState(); 81 | } 82 | 83 | 84 | void LZSSE4_FreeFastParseState( LZSSE4_FastParseState* toFree ) 85 | { 86 | delete toFree; 87 | } 88 | 89 | 90 | inline void SetHash( LZSSE4_FastParseState* state, uint32_t hash, const uint8_t* input, const uint8_t* inputCursor ) 91 | { 92 | int32_t position = static_cast( inputCursor - input ); 93 | 94 | state->buckets[ hash & FAST_HASH_MASK ] = position; 95 | } 96 | 97 | // Simple fast hash function - actually what is used in snappy and derivatives 98 | // There's probably better, but I haven't spent time focusing on this area yet. 99 | inline uint32_t HashFast( const uint8_t* inputCursor ) 100 | { 101 | return *reinterpret_cast( inputCursor ) * 0x1e35a7bd >> ( 32 - FAST_HASH_BITS ); 102 | } 103 | 104 | size_t LZSSE4_CompressFast( LZSSE4_FastParseState* state, const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength ) 105 | { 106 | if ( outputLength < inputLength ) 107 | { 108 | // error case, output buffer not large enough. 
109 | return 0; 110 | } 111 | 112 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 113 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 114 | 115 | if ( inputLength < MIN_COMPRESSION_SIZE ) 116 | { 117 | memcpy( output, input, inputLength ); 118 | 119 | return inputLength; 120 | } 121 | 122 | const uint8_t* inputCursor = input; 123 | const uint8_t* inputEnd = input + inputLength; 124 | const uint8_t* inputEarlyEnd = inputEnd - END_PADDING_LITERALS; 125 | uint8_t* outputCursor = output; 126 | uint8_t* outputEarlyEnd = ( output + outputLength ) - END_PADDING_LITERALS; 127 | uint32_t hash = 0; 128 | 129 | // initialize hash to empty 130 | for ( int32_t* where = state->buckets, *end = state->buckets + FAST_BUCKETS_COUNT; where < end; where += 4 ) 131 | { 132 | where[ 0 ] = -1; 133 | where[ 1 ] = -1; 134 | where[ 2 ] = -1; 135 | where[ 3 ] = -1; 136 | } 137 | 138 | // initial literals that wont be compressed 139 | for ( uint32_t where = 0; where < MIN_MATCH_LENGTH; ++where ) 140 | { 141 | hash = HashFast( inputCursor ); 142 | 143 | SetHash( state, hash, input, inputCursor ); 144 | 145 | *( outputCursor++ ) = *( inputCursor++ ); 146 | } 147 | 148 | uint8_t* currentControlBlock = outputCursor; 149 | uint32_t currentControlCount = 0; 150 | uint16_t previousOffset = INITIAL_OFFSET; 151 | size_t literalsToFlush = 0; 152 | 153 | outputCursor += CONTROL_BLOCK_SIZE; 154 | 155 | bool lastControlIsNop = false; 156 | 157 | // Loop through the data until we hit the end of one of the buffers (minus the end padding literals) 158 | while ( inputCursor < inputEarlyEnd && outputCursor <= outputEarlyEnd ) 159 | { 160 | lastControlIsNop = false; 161 | 162 | hash = HashFast( inputCursor ); 163 | 164 | int matchPosition = state->buckets[ hash & FAST_HASH_MASK ]; 165 | int currentPosition = static_cast< int32_t >( inputCursor - input ); 166 | uint32_t matchLength = 0; 167 | uint16_t matchOffset = static_cast< uint16_t >( currentPosition - matchPosition ); 168 | 169 | // If we had a hit in the hash and it wasn't outside the window. 170 | if ( matchPosition >= 0 && ( currentPosition - matchPosition ) < ( LZ_WINDOW_SIZE - 1 ) ) 171 | { 172 | const uint8_t* matchCandidate = input + matchPosition; 173 | uint32_t lengthToEnd = static_cast< uint32_t >( inputEarlyEnd - inputCursor ); 174 | // Here we limit the hash length to prevent overlap matches with offset less than 16 bytes 175 | uint32_t maxLength = matchOffset <= ( EXTENDED_MATCH_BOUND + 1 ) && matchOffset < lengthToEnd ? matchOffset : lengthToEnd; 176 | 177 | // Find how long the match is 16 bytes at a time. 178 | while ( matchLength < maxLength ) 179 | { 180 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 181 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( matchCandidate + matchLength ) ); 182 | 183 | unsigned long matchBytes; 184 | 185 | // Finds the number of equal bytes at the start of the 16 186 | _BitScanForward( &matchBytes, ( static_cast< unsigned long >( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 187 | 188 | matchLength += matchBytes; 189 | 190 | if ( matchBytes != sizeof( __m128i ) ) 191 | { 192 | break; 193 | } 194 | } 195 | 196 | matchLength = matchLength < maxLength ? matchLength : maxLength; 197 | } 198 | 199 | // If we have at least the minimum match length (4 bytes) 200 | if ( matchLength >= MIN_MATCH_LENGTH ) 201 | { 202 | // Do we have literals to flush before the match? 
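            // (Note on the literal encoding used in the flush below: each literal byte is stored
            // xor'd with the byte previousOffset back in the window. The decoder unconditionally
            // loads "match" data from outputCursor - offset and xors it with the stream bytes, so
            // the two xors cancel for literals and the same branchless path serves both literals
            // and matches.)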
203 | if ( literalsToFlush > 0 ) 204 | { 205 | // Start a new control block if we need one. 206 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 207 | { 208 | currentControlBlock = outputCursor; 209 | outputCursor += CONTROL_BLOCK_SIZE; 210 | 211 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 212 | 213 | currentControlCount = 0; 214 | 215 | // Would be larger than compressed size, get out! 216 | if ( outputCursor > outputEarlyEnd ) 217 | { 218 | break; 219 | } 220 | } 221 | 222 | currentControlBlock[ currentControlCount >> 1 ] = 223 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( static_cast( literalsToFlush - 1 ) << 4 ); 224 | 225 | // flush the literals. 226 | // note the xor against the data that would be read in the match. 227 | for ( uint32_t where = 0; where < literalsToFlush; ++where ) 228 | { 229 | const uint8_t* currentInput = inputCursor - ( literalsToFlush - where ); 230 | 231 | *( outputCursor++ ) = *currentInput ^ *( currentInput - previousOffset ); 232 | } 233 | 234 | ++currentControlCount; 235 | 236 | literalsToFlush = 0; 237 | 238 | // Would be larger than compressed size, get out! 239 | if ( outputCursor > outputEarlyEnd ) 240 | { 241 | break; 242 | } 243 | } 244 | 245 | // Start a new control block if the previous one is full. 246 | // Note this is done before the offset is written out - the offset 247 | // is always written after the control block containing the first 248 | // control in the match. 249 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 250 | { 251 | currentControlBlock = outputCursor; 252 | outputCursor += CONTROL_BLOCK_SIZE; 253 | 254 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 255 | 256 | currentControlCount = 0; 257 | 258 | if ( outputCursor > outputEarlyEnd ) 259 | { 260 | break; 261 | } 262 | } 263 | 264 | // The match length value we are encoding. 265 | size_t toEncode = matchLength; 266 | 267 | // Write the offset out - note the xor with the previous offset. 268 | *reinterpret_cast< uint16_t* >( outputCursor ) = matchOffset ^ previousOffset; 269 | 270 | previousOffset = matchOffset; 271 | outputCursor += sizeof( uint16_t ); 272 | 273 | for ( ;; ) 274 | { 275 | // Check if we need to start a new control block 276 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 277 | { 278 | currentControlBlock = outputCursor; 279 | outputCursor += CONTROL_BLOCK_SIZE; 280 | 281 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 282 | 283 | currentControlCount = 0; 284 | 285 | if ( outputCursor > outputEarlyEnd ) 286 | { 287 | break; 288 | } 289 | } 290 | 291 | // If the encode size is greater than we can hold in a control, write out a full match length 292 | // control, subtract full control value from the amount to encode and loop around again. 293 | if ( toEncode >= EXTENDED_MATCH_BOUND ) 294 | { 295 | currentControlBlock[ currentControlCount >> 1 ] = 296 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( static_cast( EXTENDED_MATCH_BOUND ) << 4 ); 297 | 298 | toEncode -= EXTENDED_MATCH_BOUND; 299 | 300 | ++currentControlCount; 301 | } 302 | else // Write out the remaining match length control. Could potentially be zero. 
303 | { 304 | currentControlBlock[ currentControlCount >> 1 ] = 305 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( static_cast( toEncode ) << 4 ); 306 | 307 | if ( currentControlCount == 0 && toEncode == 0 ) 308 | { 309 | lastControlIsNop = true; 310 | } 311 | 312 | ++currentControlCount; 313 | 314 | break; 315 | } 316 | } 317 | 318 | // Update the value into the hash for future matches. 319 | SetHash( state, hash, input, inputCursor ); 320 | 321 | ++inputCursor; 322 | 323 | // Hash all the other values in the match too. 324 | for ( const uint8_t* nextArrival = inputCursor + matchLength - 1; inputCursor < nextArrival; ++inputCursor ) 325 | { 326 | hash = HashFast( inputCursor ); 327 | SetHash( state, hash, input, inputCursor ); 328 | } 329 | } 330 | else 331 | { 332 | // One more literal to write out. 333 | ++literalsToFlush; 334 | 335 | // If we have reached the maximum number of literals allowed in the control, flush them out. 336 | if ( literalsToFlush == LITERALS_PER_CONTROL ) 337 | { 338 | // Check if the control block is full and we need start a new one. 339 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 340 | { 341 | currentControlBlock = outputCursor; 342 | outputCursor += CONTROL_BLOCK_SIZE; 343 | 344 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 345 | 346 | currentControlCount = 0; 347 | 348 | if ( outputCursor > outputEarlyEnd ) 349 | { 350 | break; 351 | } 352 | } 353 | 354 | currentControlBlock[ currentControlCount >> 1 ] = 355 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( ( static_cast( LITERALS_PER_CONTROL - 1 ) ) << 4 ); 356 | 357 | ++currentControlCount; 358 | 359 | *reinterpret_cast< uint32_t* >( outputCursor ) = 360 | *reinterpret_cast< const uint32_t* >( inputCursor - 3 ) ^ 361 | *reinterpret_cast< const uint32_t* >( ( inputCursor - 3 ) - previousOffset ); 362 | 363 | outputCursor += 4; 364 | 365 | //*( outputCursor++ ) = *( inputCursor - 3 ) ^ *( ( inputCursor - 3 ) - previousOffset ); 366 | //*( outputCursor++ ) = *( inputCursor - 2 ) ^ *( ( inputCursor - 2 ) - previousOffset ); 367 | //*( outputCursor++ ) = *( inputCursor - 1 ) ^ *( ( inputCursor - 1 ) - previousOffset ); 368 | //*( outputCursor++ ) = *inputCursor ^ *( inputCursor - previousOffset ); 369 | 370 | if ( outputCursor > outputEarlyEnd ) 371 | { 372 | break; 373 | } 374 | 375 | literalsToFlush = 0; 376 | } 377 | 378 | // Update the hash with this byte 379 | SetHash( state, hash, input, inputCursor ); 380 | 381 | ++inputCursor; 382 | } 383 | } 384 | 385 | // If we would create a compression output bigger than or equal to the input, just copy the input to the output and return equal size. 386 | if ( ( ( outputCursor + literalsToFlush + ( currentControlCount == CONTROLS_PER_BLOCK ? CONTROL_BLOCK_SIZE : 0 ) ) ) >= output + inputLength - END_PADDING_LITERALS ) 387 | { 388 | memcpy( output, input, inputLength ); 389 | 390 | outputCursor = output + inputLength; 391 | } 392 | else 393 | { 394 | // Flush any remaining literals. 
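        // (Packing reminder: each control block byte holds two 4-bit controls. The writes here
        // shift the byte's previous contents into the low nibble and put the new control in the
        // high nibble, so the earlier of a pair ends up in the low nibble - the order the decoder
        // consumes them. The fix-up just below handles a stream ending on an odd control count,
        // where the last control still needs shifting down into the low nibble.)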
395 | if ( literalsToFlush > 0 ) 396 | { 397 | lastControlIsNop = false; 398 | 399 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 400 | { 401 | currentControlBlock = outputCursor; 402 | outputCursor += CONTROL_BLOCK_SIZE; 403 | 404 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 405 | 406 | currentControlCount = 0; 407 | } 408 | 409 | currentControlBlock[ currentControlCount >> 1 ] = 410 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( static_cast( literalsToFlush - 1 ) << 4 ); 411 | 412 | for ( uint32_t where = 0; where < literalsToFlush; ++where ) 413 | { 414 | const uint8_t* currentInput = inputCursor - ( literalsToFlush - where ); 415 | 416 | *( outputCursor++ ) = *currentInput ^ *( currentInput - previousOffset ); 417 | } 418 | 419 | ++currentControlCount; 420 | } 421 | 422 | // Need to finish off shifting the final control block into the low nibble if there is no second nibble 423 | if ( ( currentControlCount & 1 ) > 0 ) 424 | { 425 | currentControlBlock[ currentControlCount >> 1 ] >>= 4; 426 | } 427 | 428 | if ( lastControlIsNop ) 429 | { 430 | outputCursor -= CONTROL_BLOCK_SIZE; 431 | } 432 | 433 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 434 | 435 | // copy remaining literals 436 | memcpy( outputCursor, inputCursor, remainingLiterals ); 437 | 438 | outputCursor += remainingLiterals; 439 | } 440 | 441 | // Return the size of the compressed data. 442 | return outputCursor - output; 443 | } 444 | 445 | 446 | struct Arrival 447 | { 448 | size_t cost; 449 | int32_t from; 450 | int32_t to; 451 | uint16_t offset; 452 | }; 453 | 454 | struct TreeNode 455 | { 456 | int32_t children[ 2 ]; 457 | }; 458 | 459 | struct LZSSE4_OptimalParseState 460 | { 461 | // Note, we should really replace this with a BST, hash chaining works but is *slooooooooooooooow* for optimal parse. 
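    // Concretely: roots[] holds one tree root per hash bucket, window[] holds the left/right
    // child links for the last LZ_WINDOW_SIZE positions (indexed by position & LZ_WINDOW_MASK),
    // and arrivals[] is the per-position cost/offset table the optimal parse fills in.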
462 | int32_t roots[ OPTIMAL_BUCKETS_COUNT ]; 463 | 464 | TreeNode window[ LZ_WINDOW_SIZE ]; 465 | 466 | Arrival* arrivals; 467 | 468 | size_t bufferSize; 469 | }; 470 | 471 | 472 | LZSSE4_OptimalParseState* LZSSE4_MakeOptimalParseState( size_t bufferSize ) 473 | { 474 | if ( bufferSize > 0 && ( SIZE_MAX / sizeof( Arrival ) ) < bufferSize ) 475 | { 476 | return nullptr; 477 | } 478 | 479 | LZSSE4_OptimalParseState* result = reinterpret_cast< LZSSE4_OptimalParseState* >( ::malloc( sizeof( LZSSE4_OptimalParseState ) ) ); 480 | 481 | result->bufferSize = bufferSize; 482 | 483 | if ( result != nullptr ) 484 | { 485 | result->arrivals = reinterpret_cast< Arrival* >( ::malloc( sizeof( Arrival ) * bufferSize ) ); 486 | 487 | if ( result->arrivals == nullptr ) 488 | { 489 | LZSSE4_FreeOptimalParseState( result ); 490 | 491 | result = nullptr; 492 | } 493 | } 494 | 495 | return result; 496 | } 497 | 498 | 499 | void LZSSE4_FreeOptimalParseState( LZSSE4_OptimalParseState* toFree ) 500 | { 501 | ::free( toFree->arrivals ); 502 | 503 | toFree->arrivals = nullptr; 504 | 505 | ::free( toFree ); 506 | } 507 | 508 | 509 | inline uint32_t HashOptimal( const uint8_t* inputCursor ) 510 | { 511 | return *reinterpret_cast( inputCursor ) * 0x1e35a7bd >> ( 32 - OPTIMAL_HASH_BITS ); 512 | } 513 | 514 | 515 | struct Match 516 | { 517 | size_t length; 518 | int32_t position; 519 | uint16_t offset; 520 | }; 521 | 522 | 523 | inline Match SearchAndUpdateFinder( LZSSE4_OptimalParseState& state, const uint8_t* input, const uint8_t* inputCursor, const uint8_t* inputEnd, uint32_t cutOff ) 524 | { 525 | Match result; 526 | 527 | int32_t position = static_cast( inputCursor - input ); 528 | 529 | result.position = NO_MATCH; 530 | result.length = MIN_MATCH_LENGTH; 531 | result.offset = 0; 532 | 533 | size_t lengthToEnd = inputEnd - inputCursor; 534 | int32_t lastPosition = position - ( LZ_WINDOW_SIZE - 1 ); 535 | uint32_t hash = HashOptimal( inputCursor ); 536 | 537 | lastPosition = lastPosition > 0 ? lastPosition : 0; 538 | 539 | int32_t treeCursor = state.roots[ hash ]; 540 | 541 | state.roots[ hash ] = position; 542 | 543 | int32_t* left = &state.window[ position & LZ_WINDOW_MASK ].children[ 1 ]; 544 | int32_t* right = &state.window[ position & LZ_WINDOW_MASK ].children[ 0 ]; 545 | size_t leftLength = 0; 546 | size_t rightLength = 0; 547 | 548 | for ( ;; ) 549 | { 550 | if ( cutOff-- == 0 || treeCursor < lastPosition ) 551 | { 552 | *left = *right = EMPTY_NODE; 553 | break; 554 | } 555 | 556 | TreeNode& currentNode = state.window[ treeCursor & LZ_WINDOW_MASK ]; 557 | const uint8_t* key = input + treeCursor; 558 | size_t matchLength = leftLength < rightLength ? leftLength : rightLength; 559 | 560 | uint16_t matchOffset = static_cast< uint16_t >( position - treeCursor ); 561 | size_t maxLength = matchOffset <= ( EXTENDED_MATCH_BOUND + 1 ) && matchOffset < lengthToEnd ? matchOffset : lengthToEnd; 562 | 563 | while ( matchLength < lengthToEnd ) 564 | { 565 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 566 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( key + matchLength ) ); 567 | 568 | unsigned long matchBytes; 569 | 570 | _BitScanForward( &matchBytes, ( static_cast( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 571 | 572 | matchLength += matchBytes; 573 | 574 | if ( matchBytes != 16 ) 575 | { 576 | break; 577 | } 578 | } 579 | 580 | matchLength = matchLength < lengthToEnd ? 
matchLength : lengthToEnd; 581 | 582 | size_t truncatedMatchLength = matchLength < maxLength ? matchLength : maxLength; 583 | 584 | if ( truncatedMatchLength >= result.length && matchOffset >= LITERALS_PER_CONTROL ) 585 | { 586 | result.length = truncatedMatchLength; 587 | result.offset = matchOffset; 588 | result.position = treeCursor; 589 | } 590 | 591 | if ( matchLength == lengthToEnd ) 592 | { 593 | *left = currentNode.children[ 1 ]; 594 | *right = currentNode.children[ 0 ]; 595 | break; 596 | } 597 | 598 | if ( inputCursor[ matchLength ] < key[ matchLength ] || ( matchLength == lengthToEnd ) ) 599 | { 600 | *left = treeCursor; 601 | left = currentNode.children; 602 | treeCursor = *left; 603 | leftLength = matchLength; 604 | } 605 | else 606 | { 607 | *right = treeCursor; 608 | right = currentNode.children + 1; 609 | treeCursor = *right; 610 | rightLength = matchLength; 611 | } 612 | } 613 | 614 | // Special RLE overlapping match case, the LzFind style match above doesn't work very well with our 615 | // restriction of overlapping matches having offsets of at least 16. 616 | // Suffix array seems like a better option to handling this. 617 | { 618 | // Note, we're detecting long RLE here, but if we have an offset too close, we'll sacrifice a fair 619 | // amount of decompression performance to load-hit-stores. 620 | int32_t matchPosition = position - ( sizeof( __m128i ) * 2 ); 621 | 622 | if ( matchPosition >= 0 ) 623 | { 624 | uint16_t matchOffset = static_cast( position - matchPosition ); 625 | const uint8_t* key = input + matchPosition; 626 | size_t matchLength = 0; 627 | 628 | while ( matchLength < lengthToEnd ) 629 | { 630 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 631 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( key + matchLength ) ); 632 | 633 | unsigned long matchBytes; 634 | 635 | _BitScanForward( &matchBytes, ( static_cast( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 636 | 637 | matchLength += matchBytes; 638 | 639 | if ( matchBytes != 16 ) 640 | { 641 | break; 642 | } 643 | 644 | } 645 | 646 | matchLength = matchLength < lengthToEnd ? matchLength : lengthToEnd; 647 | 648 | if ( matchLength >= result.length ) 649 | { 650 | result.length = matchLength; 651 | result.offset = matchOffset; 652 | result.position = matchPosition; 653 | } 654 | } 655 | } 656 | 657 | return result; 658 | } 659 | 660 | 661 | size_t LZSSE4_CompressOptimalParse( LZSSE4_OptimalParseState* state, const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength, unsigned int level ) 662 | { 663 | if ( outputLength < inputLength || state->bufferSize < inputLength ) 664 | { 665 | // error case, output buffer not large enough. 
666 | return 0; 667 | } 668 | 669 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 670 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 671 | 672 | if ( inputLength < MIN_COMPRESSION_SIZE ) 673 | { 674 | memcpy( output, input, inputLength ); 675 | 676 | return inputLength; 677 | } 678 | 679 | const uint8_t* inputCursor = input; 680 | const uint8_t* inputEnd = input + inputLength; 681 | Arrival* arrivalWatermark = state->arrivals; 682 | Arrival* arrival = state->arrivals; 683 | uint32_t cutOff = 1 << level; 684 | 685 | for ( int32_t* rootCursor = state->roots, *end = rootCursor + OPTIMAL_BUCKETS_COUNT; rootCursor < end; rootCursor += 4 ) 686 | { 687 | rootCursor[ 0 ] = EMPTY_NODE; 688 | rootCursor[ 1 ] = EMPTY_NODE; 689 | rootCursor[ 2 ] = EMPTY_NODE; 690 | rootCursor[ 3 ] = EMPTY_NODE; 691 | } 692 | 693 | for ( uint32_t where = 0; where < MIN_MATCH_LENGTH; ++where ) 694 | { 695 | SearchAndUpdateFinder( *state, input, inputCursor, inputEnd - END_PADDING_LITERALS, cutOff ); 696 | 697 | ++inputCursor; 698 | } 699 | 700 | arrival->cost = LITERAL_BITS * LITERALS_PER_CONTROL; 701 | arrival->from = -1; 702 | arrival->offset = 0; 703 | 704 | // loop through each character and project forward the matches at that character to calculate the cheapest 705 | // path of arrival for each individual character. 706 | for ( const uint8_t* earlyEnd = inputEnd - END_PADDING_LITERALS; inputCursor < earlyEnd; ++inputCursor, ++arrival ) 707 | { 708 | uint32_t lengthToEnd = static_cast< uint32_t >( earlyEnd - inputCursor ); 709 | int32_t currentPosition = static_cast< int32_t >( inputCursor - input ); 710 | size_t literalsForward = LITERALS_PER_CONTROL < lengthToEnd ? LITERALS_PER_CONTROL : lengthToEnd; 711 | size_t arrivalCost = arrival->cost; 712 | 713 | // NOTE - we currently assume only 2 literals filled in here, because the minimum match length is 3. 714 | // If we wanted to go with a higher minimum match length, we would need to fill in more literals before hand. 715 | // Also, because there is a maximum of 2 literals per control block assumed. 716 | 717 | // project forward the cost of a single literal 718 | 719 | for ( size_t where = 1; where <= literalsForward; ++where ) 720 | { 721 | Arrival* literalArrival = arrival + where; 722 | size_t literalCost = arrivalCost + CONTROL_BITS + ( where * LITERAL_BITS ); 723 | 724 | if ( literalArrival > arrivalWatermark || literalArrival->cost > literalCost ) 725 | { 726 | literalArrival->cost = literalCost; 727 | literalArrival->from = currentPosition; 728 | literalArrival->offset = 0; 729 | 730 | arrivalWatermark = literalArrival > arrivalWatermark ? literalArrival : arrivalWatermark; 731 | } 732 | } 733 | 734 | Match match = SearchAndUpdateFinder( *state, input, inputCursor, earlyEnd, cutOff ); 735 | 736 | if ( match.position != NO_MATCH ) 737 | { 738 | for ( size_t matchedLength = MIN_MATCH_LENGTH, end = match.length + 1; matchedLength < end; ++matchedLength ) 739 | { 740 | Arrival* matchArrival = arrival + matchedLength; 741 | size_t matchCost = arrivalCost + BASE_MATCH_BITS; 742 | 743 | if ( matchedLength >= EXTENDED_MATCH_BOUND ) 744 | { 745 | matchCost += ( matchedLength / EXTENDED_MATCH_BOUND ) * CONTROL_BITS; 746 | } 747 | 748 | if ( matchArrival > arrivalWatermark || matchArrival->cost > matchCost ) 749 | { 750 | matchArrival->cost = matchCost; 751 | matchArrival->from = currentPosition; 752 | matchArrival->offset = match.offset; 753 | 754 | arrivalWatermark = matchArrival > arrivalWatermark ? 
matchArrival : arrivalWatermark; 755 | } 756 | } 757 | 758 | if ( match.length > SKIP_MATCH_LENGTH && level < NO_SKIP_LEVEL ) 759 | { 760 | arrival += match.length - LITERALS_PER_CONTROL; 761 | inputCursor += match.length - LITERALS_PER_CONTROL; 762 | } 763 | } 764 | } 765 | 766 | // If this would cost more to encode than it would if it were just literals, encode it with no control blocks, 767 | // just literals 768 | if ( ( arrivalWatermark->cost + END_PADDING_LITERALS * LITERAL_BITS + CONTROLS_PER_BLOCK * CONTROL_BITS ) > ( inputLength * LITERAL_BITS ) ) 769 | { 770 | memcpy( output, input, inputLength ); 771 | 772 | return inputLength; 773 | } 774 | 775 | Arrival* previousPathNode; 776 | 777 | // now trace the actual optimal parse path back, connecting the nodes in the other direction. 778 | for ( const Arrival* pathNode = arrivalWatermark; pathNode->from > 0; pathNode = previousPathNode ) 779 | { 780 | previousPathNode = state->arrivals + ( pathNode->from - LITERALS_PER_CONTROL ); 781 | 782 | previousPathNode->to = static_cast( ( pathNode - state->arrivals ) + LITERALS_PER_CONTROL ); 783 | } 784 | 785 | uint8_t* outputCursor = output; 786 | 787 | memcpy( outputCursor, input, MIN_MATCH_LENGTH ); 788 | 789 | outputCursor += MIN_MATCH_LENGTH; 790 | 791 | uint8_t* currentControlBlock = outputCursor; 792 | uint32_t currentControlCount = 0; 793 | uint32_t totalControlCount = 0; 794 | 795 | outputCursor += CONTROL_BLOCK_SIZE; 796 | 797 | Arrival* nextPathNode; 798 | 799 | size_t totalPathLength = MIN_MATCH_LENGTH; 800 | uint16_t previousOffset = INITIAL_OFFSET; 801 | 802 | bool lastControlIsNop = false; 803 | 804 | // Now walk forwards again and actually write out the data. 805 | for ( const Arrival* pathNode = state->arrivals; pathNode < arrivalWatermark; pathNode = nextPathNode ) 806 | { 807 | int32_t currentPosition = static_cast< int32_t >( ( pathNode - state->arrivals ) + LITERALS_PER_CONTROL ); 808 | 809 | nextPathNode = state->arrivals + ( pathNode->to - LITERALS_PER_CONTROL ); 810 | 811 | size_t pathDistance = nextPathNode - pathNode; 812 | 813 | totalPathLength += pathDistance; 814 | 815 | lastControlIsNop = false; 816 | 817 | if ( nextPathNode->offset == 0 ) 818 | { 819 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 820 | { 821 | currentControlBlock = outputCursor; 822 | outputCursor += CONTROL_BLOCK_SIZE; 823 | currentControlCount = 0; 824 | } 825 | 826 | if ( ( currentControlCount & 1 ) == 0 ) 827 | { 828 | currentControlBlock[ currentControlCount >> 1 ] = 829 | ( static_cast( pathDistance ) - 1 ); 830 | } 831 | else 832 | { 833 | currentControlBlock[ currentControlCount >> 1 ] |= 834 | ( static_cast< uint8_t >( pathDistance ) - 1 ) << CONTROL_BITS; 835 | } 836 | 837 | // output the literals. 838 | for ( int32_t where = 0; where < pathDistance; ++where ) 839 | { 840 | const uint8_t* currentInput = input + currentPosition + where; 841 | 842 | outputCursor[ where ] = *currentInput ^ *( currentInput - previousOffset ); 843 | } 844 | 845 | outputCursor += pathDistance; 846 | 847 | ++totalControlCount; 848 | ++currentControlCount; 849 | } 850 | else 851 | { 852 | // Check if we need to start a new control block 853 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 854 | { 855 | currentControlBlock = outputCursor; 856 | outputCursor += CONTROL_BLOCK_SIZE; 857 | 858 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), _mm_setzero_si128() ); 859 | 860 | currentControlCount = 0; 861 | } 862 | 863 | // Write the offset out - note the xor with the previous offset. 
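                // (Offsets are delta coded by xor: the stream stores newOffset ^ previousOffset, and
                // the decoder xors the 16 bit word it reads (masked to zero when no offset follows)
                // into its running offset. Controls that reuse the previous offset contribute
                // nothing, so "keep the old offset" and "load a new one" share one branchless path.)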
864 | *reinterpret_cast< uint16_t* >( outputCursor ) = nextPathNode->offset ^ previousOffset; 865 | 866 | previousOffset = nextPathNode->offset; 867 | outputCursor += sizeof( uint16_t ); 868 | 869 | if ( pathDistance < EXTENDED_MATCH_BOUND ) 870 | { 871 | if ( ( currentControlCount & 1 ) == 0 ) 872 | { 873 | currentControlBlock[ currentControlCount >> 1 ] = 874 | static_cast( pathDistance ); 875 | } 876 | else 877 | { 878 | currentControlBlock[ currentControlCount >> 1 ] |= 879 | static_cast< uint8_t >( pathDistance ) << CONTROL_BITS; 880 | } 881 | 882 | ++currentControlCount; 883 | } 884 | else 885 | { 886 | if ( ( currentControlCount & 1 ) == 0 ) 887 | { 888 | currentControlBlock[ currentControlCount >> 1 ] = 889 | static_cast( EXTENDED_MATCH_BOUND ); 890 | } 891 | else 892 | { 893 | currentControlBlock[ currentControlCount >> 1 ] |= 894 | static_cast< uint8_t >( EXTENDED_MATCH_BOUND ) << CONTROL_BITS; 895 | } 896 | 897 | ++currentControlCount; 898 | 899 | size_t toEncode = pathDistance - EXTENDED_MATCH_BOUND; 900 | 901 | for ( ;; ) 902 | { 903 | // Check if we need to start a new control block 904 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 905 | { 906 | currentControlBlock = outputCursor; 907 | outputCursor += CONTROL_BLOCK_SIZE; 908 | 909 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), _mm_setzero_si128() ); 910 | 911 | currentControlCount = 0; 912 | } 913 | 914 | // If the encode size is greater than we can hold in a control, write out a full match length 915 | // control, subtract full control value from the amount to encode and loop around again. 916 | if ( toEncode >= EXTENDED_MATCH_BOUND ) 917 | { 918 | if ( ( currentControlCount & 1 ) == 0 ) 919 | { 920 | currentControlBlock[ currentControlCount >> 1 ] = 921 | static_cast( EXTENDED_MATCH_BOUND ); 922 | } 923 | else 924 | { 925 | currentControlBlock[ currentControlCount >> 1 ] |= 926 | static_cast< uint8_t >( EXTENDED_MATCH_BOUND ) << CONTROL_BITS; 927 | } 928 | 929 | toEncode -= EXTENDED_MATCH_BOUND; 930 | 931 | ++currentControlCount; 932 | } 933 | else // Write out the remaining match length control. Could potentially be zero. 
934 | { 935 | if ( ( currentControlCount & 1 ) == 0 ) 936 | { 937 | currentControlBlock[ currentControlCount >> 1 ] = 938 | static_cast( toEncode ); 939 | } 940 | else 941 | { 942 | currentControlBlock[ currentControlCount >> 1 ] |= 943 | static_cast< uint8_t >( toEncode ) << CONTROL_BITS; 944 | } 945 | 946 | if ( toEncode == 0 && currentControlCount == 0 ) 947 | { 948 | lastControlIsNop = true; 949 | } 950 | 951 | ++currentControlCount; 952 | 953 | break; 954 | } 955 | } 956 | } 957 | } 958 | } 959 | 960 | if ( lastControlIsNop ) 961 | { 962 | outputCursor -= CONTROL_BLOCK_SIZE; 963 | } 964 | 965 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 966 | 967 | // copy remaining literals 968 | memcpy( outputCursor, inputCursor, remainingLiterals ); 969 | 970 | outputCursor += remainingLiterals; 971 | 972 | return outputCursor - output; 973 | } 974 | 975 | 976 | size_t LZSSE4_Decompress( const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength ) 977 | { 978 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 979 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 980 | 981 | // Data was not compressible, just copy initial values 982 | if ( outputLength == inputLength ) 983 | { 984 | memcpy( output, input, outputLength ); 985 | 986 | return inputLength; 987 | } 988 | 989 | const uint8_t* inputCursor = input; 990 | uint8_t* outputCursor = output; 991 | 992 | // The offset starts off as the minimum match length. We actually need it least four 993 | // characters back because we need them to be set to xor out the literals from the match data. 994 | size_t offset = INITIAL_OFFSET; 995 | __m128i previousCarryHi = _mm_setzero_si128(); 996 | 997 | // Copy the initial literals to the output. 998 | for ( uint32_t where = 0; where < MIN_MATCH_LENGTH; ++where ) 999 | { 1000 | *( outputCursor++ ) = *( inputCursor++ ); 1001 | } 1002 | 1003 | // Let me be clear, I am usually anti-macro, but they work for this particular (very unusual) case. 1004 | // DECODE_STEP is a regular decoding step, DECODE_STEP_HALF and DECODE_STEP_END are because the compiler couldn't 1005 | // seem to remove some of the dead code where values were updated and then never used. 1006 | 1007 | // What these macros do: 1008 | // Decode a single literal run or match run for a single control nibble. 1009 | // How they do it: 1010 | // - Read the *unaligned* input (in the case of LZSSE-F - twice), it goes into both a regular variable and an SSE register, 1011 | // because it could either be literals or an offset (or nothing at all). The low byte of streamBytesRead controls how much we advance 1012 | // the input cursor. 1013 | // - Used a contived set of casts to sign extend the "read offset" control mask and then use it to mask the input word, 1014 | // which is then xor'd against the offset, for a "branchless" conditional move into the offset which 1015 | // has been carried over from the previous literal/match block. Note, this ends up doing better than a cmov on most 1016 | // modern processors. But we need to pre-xor the input offset. 1017 | // - We then load the match data from output buffer (offset back from the current output point). Unconditional load here. 1018 | // - We broadcast the "from literal" control mask from the current least significant byte of the SSE register using a shuffle epi-8 1019 | // - We mask the literals with that SSE register wide mask. 
1020 | // - The literals have been pre-xor'd with the data read in as match data, so we use an xor to branchlessly choose between the two. 1021 | // In this case, it ends up a better option than a blendv on most processors. 1022 | // - Store the block. We store all 16 bytes of the SSE register (due to some constraints in the format of the data, we won't 1023 | // go past the end of the buffer), but we may overlap this. 1024 | // - bytesOut controls how much we advance the output cursor. 1025 | // - We use 8 bit shifts to advance all the controls up to the next byte. There is some variable sized register trickery that 1026 | // x86/x64 is great for as long as we don't anger the register renamer. 1027 | 1028 | #define DECODE_STEP( HILO, CHECKMATCH, CHECKBUFFERS ) \ 1029 | { \ 1030 | size_t inputWord = *reinterpret_cast( inputCursor ); \ 1031 | __m128i literals = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); \ 1032 | \ 1033 | offset ^= static_cast( static_cast( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 1034 | \ 1035 | readOffsetHalf##HILO >>= 8; \ 1036 | \ 1037 | const uint8_t* matchPointer = reinterpret_cast( outputCursor - offset ); \ 1038 | \ 1039 | if ( CHECKMATCH && matchPointer < output ) \ 1040 | goto MATCH_UNDERFLOW; \ 1041 | \ 1042 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 1043 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 1044 | \ 1045 | literals = _mm_and_si128( literals, fromLiteral ); \ 1046 | \ 1047 | fromLiteral##HILO = _mm_srli_si128( fromLiteral##HILO, 1 ); \ 1048 | \ 1049 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 1050 | \ 1051 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 1052 | \ 1053 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 1054 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 1055 | \ 1056 | bytesOutHalf##HILO >>= 8; \ 1057 | streamBytesReadHalf##HILO >>= 8; \ 1058 | \ 1059 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 1060 | goto BUFFER_END; \ 1061 | } 1062 | 1063 | #define DECODE_STEP_HALF( HILO, CHECKMATCH, CHECKBUFFERS ) \ 1064 | { \ 1065 | size_t inputWord = *reinterpret_cast( inputCursor ); \ 1066 | __m128i literals = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); \ 1067 | \ 1068 | offset ^= static_cast( static_cast( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 1069 | \ 1070 | const uint8_t* matchPointer = reinterpret_cast( outputCursor - offset ); \ 1071 | \ 1072 | if ( CHECKMATCH && matchPointer < output ) \ 1073 | goto MATCH_UNDERFLOW; \ 1074 | \ 1075 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 1076 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 1077 | \ 1078 | literals = _mm_and_si128( literals, fromLiteral ); \ 1079 | \ 1080 | fromLiteral##HILO = _mm_srli_si128( fromLiteral##HILO, 1 ); \ 1081 | \ 1082 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 1083 | \ 1084 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 1085 | \ 1086 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 1087 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 1088 | \ 1089 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 1090 | goto BUFFER_END; \ 1091 | } 1092 | 1093 | #define DECODE_STEP_END( HILO, CHECKMATCH, CHECKBUFFERS ) \ 1094 | { \ 1095 | size_t 
inputWord = *reinterpret_cast( inputCursor ); \ 1096 | __m128i literals = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); \ 1097 | \ 1098 | offset ^= static_cast( static_cast( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 1099 | \ 1100 | const uint8_t* matchPointer = reinterpret_cast( outputCursor - offset ); \ 1101 | \ 1102 | if ( CHECKMATCH && matchPointer < output ) \ 1103 | goto MATCH_UNDERFLOW; \ 1104 | \ 1105 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 1106 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 1107 | \ 1108 | literals = _mm_and_si128( literals, fromLiteral ); \ 1109 | \ 1110 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 1111 | \ 1112 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 1113 | \ 1114 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 1115 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 1116 | \ 1117 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 1118 | goto BUFFER_END; \ 1119 | } 1120 | 1121 | #define DECODE_STEP_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP( Lo, CHECKMATCH, CHECKBUFFERS ) 1122 | #define DECODE_STEP_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP( Hi, CHECKMATCH, CHECKBUFFERS ) 1123 | #define DECODE_STEP_HALF_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_HALF( Lo, CHECKMATCH, CHECKBUFFERS ) 1124 | #define DECODE_STEP_HALF_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_HALF( Hi, CHECKMATCH, CHECKBUFFERS ) 1125 | #define DECODE_STEP_END_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_END( Lo, CHECKMATCH, CHECKBUFFERS ) 1126 | #define DECODE_STEP_END_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_END( Hi, CHECKMATCH, CHECKBUFFERS ) 1127 | 1128 | __m128i nibbleMask = _mm_set1_epi8( 0xF ); 1129 | __m128i offsetSize = _mm_set1_epi8( OFFSET_SIZE ); 1130 | 1131 | // Note, we use this block here because it allows the "fake" inputEarlyEnd/outputEarlyEnd not to cause register spills 1132 | // in the decompression loops. And yes, that did actually happen. 1133 | { 1134 | 1135 | #pragma warning ( push ) 1136 | #pragma warning ( disable : 4101 ) 1137 | 1138 | // These variables are not actually ever used in this block, because we use 1139 | // a constant conditional expression to take out the branches that would hit them. 1140 | // But unfortunately, we need them to compile. 1141 | const uint8_t* inputEarlyEnd; 1142 | uint8_t* outputEarlyEnd; 1143 | 1144 | #pragma warning ( pop ) 1145 | 1146 | // "Safe" ends to the buffer, before the input/output cursors hit these, we can loop without overflow checks. 
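        // (The margins come straight from the constants at the top of the file: a control block is
        // 32 controls, each consuming at most MAX_INPUT_PER_CONTROL input bytes and emitting at most
        // EXTENDED_MATCH_BOUND output bytes, so stopping INPUT_BUFFER_SAFE / OUTPUT_BUFFER_SAFE short
        // of the real ends lets an entire unrolled block run with no per-step bounds checks.)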
1147 | const uint8_t* inputSafeEnd = ( input + inputLength ) - INPUT_BUFFER_SAFE; 1148 | uint8_t* outputSafeEnd = ( output + outputLength ) - OUTPUT_BUFFER_SAFE; 1149 | 1150 | // Decoding loop with offset output buffer underflow test, but no buffer overflow tests, assumed to end at a safe distance 1151 | // from overflows 1152 | while ( ( outputCursor - output ) < LZ_WINDOW_SIZE && outputCursor < outputSafeEnd && inputCursor < inputSafeEnd ) 1153 | { 1154 | // load the control block 1155 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast<const __m128i*>( inputCursor ) ); 1156 | 1157 | // split the control block into high and low nibbles 1158 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 1159 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 1160 | 1161 | // Note, the carries are set when the nibble is at its highest value, 15, meaning the operation after will 1162 | // be an extension of the current match operation. 1163 | 1164 | // Work out the carry for the low nibbles (which will be used with the high controls to put them into 1165 | // match without offset read mode). 1166 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 1167 | 1168 | // The carry for the high nibbles is used with the low controls, but needs one byte from the previous iteration. We save 1169 | // the calculated carry to use that byte next iteration. 1170 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 1171 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); 1172 | 1173 | previousCarryHi = carryHi; 1174 | 1175 | // We make the implicit assumption that the maximum number of literals per control here is twice the offset size (4 vs 2); 1176 | // we are doing this here to save keeping the value around (spilling or fetching it each time) 1177 | __m128i literalsPerControl = _mm_add_epi8( offsetSize, offsetSize ); 1178 | 1179 | // Here we are testing if the runs will be literals or matches. Note that if the carries are set from the previous operation 1180 | // this will essentially be ignored later on. 1181 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 1182 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 1183 | 1184 | // Here we're calculating the number of bytes that will be output - we are actually subtracting negative one from the control 1185 | // (handy trick where comparison result masks are negative one) if carry is not set and it is a literal. 1186 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_andnot_si128( shiftedCarryHi, isLiteralLo ) ); 1187 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_andnot_si128( carryLo, isLiteralHi ) ); 1188 | 1189 | // Calculate the number of bytes to read per control. 1190 | // In the case the carry is set, no bytes. Otherwise, the offset size (2 bytes) for matches or the number of output bytes for literals. 1191 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_blendv_epi8( offsetSize, bytesOutLo, isLiteralLo ) ); 1192 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_blendv_epi8( offsetSize, bytesOutHi, isLiteralHi ) ); 1193 | 1194 | // I want 128 set bits please.
1195 | __m128i allSet = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 1196 | 1197 | // Masks to read the offset (or keep the previous one) - set in the case that this is not a literal and the carry is not set 1198 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), allSet ); 1199 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), allSet ); 1200 | 1201 | // Masks whether we are reading literals - set if the carry is not set and these are literals. 1202 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 1203 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 1204 | 1205 | // Advance the input past the control block 1206 | inputCursor += CONTROL_BLOCK_SIZE; 1207 | 1208 | { 1209 | // Pull out the bottom halves of the SSE registers from before - we want these 1210 | // things in GPRs for the more linear logic. 1211 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutLo ) ); 1212 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutHi ) ); 1213 | 1214 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadLo ) ); 1215 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadHi ) ); 1216 | 1217 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetLo ) ); 1218 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetHi ) ); 1219 | 1220 | DECODE_STEP_LO( true, false ); 1221 | DECODE_STEP_HI( true, false ); 1222 | DECODE_STEP_LO( true, false ); 1223 | DECODE_STEP_HI( true, false ); 1224 | 1225 | DECODE_STEP_LO( true, false ); 1226 | DECODE_STEP_HI( true, false ); 1227 | DECODE_STEP_LO( true, false ); 1228 | DECODE_STEP_HI( true, false ); 1229 | 1230 | DECODE_STEP_LO( true, false ); 1231 | DECODE_STEP_HI( true, false ); 1232 | DECODE_STEP_LO( true, false ); 1233 | DECODE_STEP_HI( true, false ); 1234 | 1235 | DECODE_STEP_LO( true, false ); 1236 | DECODE_STEP_HI( true, false ); 1237 | DECODE_STEP_HALF_LO( true, false ); 1238 | DECODE_STEP_HALF_HI( true, false ); 1239 | } 1240 | 1241 | { 1242 | // Now the top halves. 1243 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_extract_epi64( bytesOutLo, 1 ) ); 1244 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_extract_epi64( bytesOutHi, 1 ) ); 1245 | 1246 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 1247 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 1248 | 1249 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_extract_epi64( readOffsetLo, 1 ) ); 1250 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_extract_epi64( readOffsetHi, 1 ) ); 1251 | 1252 | DECODE_STEP_LO( true, false ); 1253 | DECODE_STEP_HI( true, false ); 1254 | DECODE_STEP_LO( true, false ); 1255 | DECODE_STEP_HI( true, false ); 1256 | 1257 | DECODE_STEP_LO( true, false ); 1258 | DECODE_STEP_HI( true, false ); 1259 | DECODE_STEP_LO( true, false ); 1260 | DECODE_STEP_HI( true, false ); 1261 | 1262 | DECODE_STEP_LO( true, false ); 1263 | DECODE_STEP_HI( true, false ); 1264 | DECODE_STEP_LO( true, false ); 1265 | DECODE_STEP_HI( true, false ); 1266 | 1267 | DECODE_STEP_LO( true, false ); 1268 | DECODE_STEP_HI( true, false ); 1269 | DECODE_STEP_END_LO( true, false ); 1270 | DECODE_STEP_END_HI( true, false ); 1271 | } 1272 | } 1273 | 1274 | // Decoding loop with no buffer checks, but will end at a safe distance from the end of the buffers.
1275 | // Note, when we get here we have already reached the point in the output buffer which is *past* where we can underflow 1276 | // due to a bad match offset. 1277 | while ( outputCursor < outputSafeEnd && inputCursor < inputSafeEnd ) 1278 | { 1279 | // This code is the same as the loop above, see comments there 1280 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast<const __m128i*>( inputCursor ) ); 1281 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 1282 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 1283 | 1284 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 1285 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 1286 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); // where we take the carry from the previous hi values 1287 | 1288 | previousCarryHi = carryHi; 1289 | 1290 | __m128i literalsPerControl = _mm_add_epi8( offsetSize, offsetSize ); 1291 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 1292 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 1293 | 1294 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_andnot_si128( shiftedCarryHi, isLiteralLo ) ); 1295 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_andnot_si128( carryLo, isLiteralHi ) ); 1296 | 1297 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_blendv_epi8( offsetSize, bytesOutLo, isLiteralLo ) ); 1298 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_blendv_epi8( offsetSize, bytesOutHi, isLiteralHi ) ); 1299 | 1300 | __m128i neg1 = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 1301 | 1302 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), neg1 ); 1303 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), neg1 ); 1304 | 1305 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 1306 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 1307 | 1308 | inputCursor += CONTROL_BLOCK_SIZE; 1309 | 1310 | { 1311 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutLo ) ); 1312 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutHi ) ); 1313 | 1314 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadLo ) ); 1315 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadHi ) ); 1316 | 1317 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetLo ) ); 1318 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetHi ) ); 1319 | 1320 | DECODE_STEP_LO( false, false ); 1321 | DECODE_STEP_HI( false, false ); 1322 | DECODE_STEP_LO( false, false ); 1323 | DECODE_STEP_HI( false, false ); 1324 | 1325 | DECODE_STEP_LO( false, false ); 1326 | DECODE_STEP_HI( false, false ); 1327 | DECODE_STEP_LO( false, false ); 1328 | DECODE_STEP_HI( false, false ); 1329 | 1330 | DECODE_STEP_LO( false, false ); 1331 | DECODE_STEP_HI( false, false ); 1332 | DECODE_STEP_LO( false, false ); 1333 | DECODE_STEP_HI( false, false ); 1334 | 1335 | DECODE_STEP_LO( false, false ); 1336 | DECODE_STEP_HI( false, false ); 1337 | DECODE_STEP_HALF_LO( false, false ); 1338 | DECODE_STEP_HALF_HI( false, false ); 1339 | } 1340 | 1341 | { 1342 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_extract_epi64( bytesOutLo, 1 ) ); 1343 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_extract_epi64( bytesOutHi, 1 ) ); 1344 | 1345 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>(
_mm_extract_epi64( streamBytesReadLo, 1 ) ); 1346 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 1347 | 1348 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_extract_epi64( readOffsetLo, 1 ) ); 1349 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_extract_epi64( readOffsetHi, 1 ) ); 1350 | 1351 | DECODE_STEP_LO( false, false ); 1352 | DECODE_STEP_HI( false, false ); 1353 | DECODE_STEP_LO( false, false ); 1354 | DECODE_STEP_HI( false, false ); 1355 | 1356 | DECODE_STEP_LO( false, false ); 1357 | DECODE_STEP_HI( false, false ); 1358 | DECODE_STEP_LO( false, false ); 1359 | DECODE_STEP_HI( false, false ); 1360 | 1361 | DECODE_STEP_LO( false, false ); 1362 | DECODE_STEP_HI( false, false ); 1363 | DECODE_STEP_LO( false, false ); 1364 | DECODE_STEP_HI( false, false ); 1365 | 1366 | DECODE_STEP_LO( false, false ); 1367 | DECODE_STEP_HI( false, false ); 1368 | DECODE_STEP_END_LO( false, false ); 1369 | DECODE_STEP_END_HI( false, false ); 1370 | } 1371 | } 1372 | } 1373 | 1374 | // Decoding loop with all buffer checks. 1375 | { 1376 | const uint8_t* inputEarlyEnd; 1377 | uint8_t* outputEarlyEnd; 1378 | inputEarlyEnd = ( input + inputLength ) - END_PADDING_LITERALS; 1379 | outputEarlyEnd = ( output + outputLength ) - END_PADDING_LITERALS; 1380 | 1381 | while ( outputCursor < outputEarlyEnd && inputCursor < inputEarlyEnd ) 1382 | { 1383 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast<const __m128i*>( inputCursor ) ); 1384 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 1385 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 1386 | 1387 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 1388 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 1389 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); 1390 | 1391 | previousCarryHi = carryHi; 1392 | 1393 | __m128i literalsPerControl = _mm_add_epi8( offsetSize, offsetSize ); 1394 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 1395 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 1396 | 1397 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_andnot_si128( shiftedCarryHi, isLiteralLo ) ); 1398 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_andnot_si128( carryLo, isLiteralHi ) ); 1399 | 1400 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_blendv_epi8( offsetSize, bytesOutLo, isLiteralLo ) ); 1401 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_blendv_epi8( offsetSize, bytesOutHi, isLiteralHi ) ); 1402 | 1403 | __m128i neg1 = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 1404 | 1405 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), neg1 ); 1406 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), neg1 ); 1407 | 1408 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 1409 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 1410 | 1411 | inputCursor += CONTROL_BLOCK_SIZE; 1412 | 1413 | if ( inputCursor > inputEarlyEnd ) 1414 | goto BUFFER_END; 1415 | 1416 | { 1417 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutLo ) ); 1418 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutHi ) ); 1419 | 1420 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadLo ) ); 1421 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadHi ) ); 1422 | 1423 |
uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetLo ) ); 1424 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetHi ) ); 1425 | 1426 | DECODE_STEP_LO( true, true ); 1427 | DECODE_STEP_HI( true, true ); 1428 | DECODE_STEP_LO( true, true ); 1429 | DECODE_STEP_HI( true, true ); 1430 | 1431 | DECODE_STEP_LO( true, true ); 1432 | DECODE_STEP_HI( true, true ); 1433 | DECODE_STEP_LO( true, true ); 1434 | DECODE_STEP_HI( true, true ); 1435 | 1436 | DECODE_STEP_LO( true, true ); 1437 | DECODE_STEP_HI( true, true ); 1438 | DECODE_STEP_LO( true, true ); 1439 | DECODE_STEP_HI( true, true ); 1440 | 1441 | DECODE_STEP_LO( true, true ); 1442 | DECODE_STEP_HI( true, true ); 1443 | DECODE_STEP_HALF_LO( true, true ); 1444 | DECODE_STEP_HALF_HI( true, true ); 1445 | } 1446 | 1447 | { 1448 | // Now the top halves. 1449 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_extract_epi64( bytesOutLo, 1 ) ); 1450 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_extract_epi64( bytesOutHi, 1 ) ); 1451 | 1452 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 1453 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 1454 | 1455 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_extract_epi64( readOffsetLo, 1 ) ); 1456 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_extract_epi64( readOffsetHi, 1 ) ); 1457 | 1458 | DECODE_STEP_LO( true, true ); 1459 | DECODE_STEP_HI( true, true ); 1460 | DECODE_STEP_LO( true, true ); 1461 | DECODE_STEP_HI( true, true ); 1462 | 1463 | DECODE_STEP_LO( true, true ); 1464 | DECODE_STEP_HI( true, true ); 1465 | DECODE_STEP_LO( true, true ); 1466 | DECODE_STEP_HI( true, true ); 1467 | 1468 | DECODE_STEP_LO( true, true ); 1469 | DECODE_STEP_HI( true, true ); 1470 | DECODE_STEP_LO( true, true ); 1471 | DECODE_STEP_HI( true, true ); 1472 | 1473 | DECODE_STEP_LO( true, true ); 1474 | DECODE_STEP_HI( true, true ); 1475 | DECODE_STEP_END_LO( true, true ); 1476 | DECODE_STEP_END_HI( true, true ); 1477 | } 1478 | } 1479 | 1480 | BUFFER_END: 1481 | 1482 | // When we get here, we have either advanced the right amount on both cursors 1483 | // or something bad happened, so leave it as is, so we can tell where 1484 | // the error happened. 1485 | if ( inputCursor == inputEarlyEnd && outputCursor == outputEarlyEnd ) 1486 | { 1487 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 1488 | 1489 | // copy any trailing literals 1490 | memcpy( outputCursor, inputCursor, remainingLiterals ); 1491 | 1492 | outputCursor += remainingLiterals; 1493 | } 1494 | } 1495 | 1496 | MATCH_UNDERFLOW: 1497 | 1498 | return outputCursor - output; 1499 | } 1500 | -------------------------------------------------------------------------------- /lzsse4/lzsse4.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution.
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE4_H__ 27 | #define LZSSE4_H__ 28 | 29 | #pragma once 30 | 31 | /* LZSSE4 - x64/SSE targeted codec for better performance with lower compression ratio data/less optimal compressors. 32 | * Supports minimum 4 byte matches, maximum 15 bytes of match per control word and 4 byte literal runs per control word. 33 | */ 34 | 35 | #ifdef __cplusplus 36 | extern "C" 37 | { 38 | #endif 39 | 40 | /* Re-usable parse state object for compression. */ 41 | typedef struct LZSSE4_FastParseState LZSSE4_FastParseState; 42 | 43 | /* Allocate the parse state for compression - returns null on failure */ 44 | LZSSE4_FastParseState* LZSSE4_MakeFastParseState(); 45 | 46 | /* De-allocate the parse state for compression */ 47 | void LZSSE4_FreeFastParseState( LZSSE4_FastParseState* toFree ); 48 | 49 | /* Re-usable parse state object for compression. */ 50 | typedef struct LZSSE4_OptimalParseState LZSSE4_OptimalParseState; 51 | 52 | /* Allocate the parse state for compression - returns null on failure. Note 53 | bufferSize has to be greater than or equal to any inputLength used with LZSSE4_CompressOptimalParse */ 54 | LZSSE4_OptimalParseState* LZSSE4_MakeOptimalParseState( size_t bufferSize ); 55 | 56 | /* De-allocate the parse state for compression */ 57 | void LZSSE4_FreeOptimalParseState( LZSSE4_OptimalParseState* toFree ); 58 | 59 | 60 | /* "Optimal" compression routine. 61 | * Will compress data into LZSSE4 format, uses hash BST matching to find matches and run an optimal parse (high relative memory usage). Requires SSE 4.1. 62 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 63 | * input : Buffer containing uncompressed data to be compressed. May not be null. 64 | * inputLength : Length of the uncompressed data in the input buffer - note, should be under 2GB. 65 | * output : Buffer that will receive the compressed output. 66 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 67 | * the compressed data should never be longer than inputLength, as in this case the data is stored raw. 68 | * level : The compression level to use for this file, 1->17, 17 delivers the highest compression, 1 delivers the least. 69 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 70 | * concurrently. 71 | * 72 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength).
73 | */ 74 | size_t LZSSE4_CompressOptimalParse( LZSSE4_OptimalParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int level ); 75 | 76 | 77 | /* "Fast" compression routine. 78 | * Will compress data into LZSSE4 format, uses a simple single entry hash/greedy matching to find matches. Requires SSE 4.1. 79 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 80 | * input : Buffer containing uncompressed data to be compressed. May not be null. 81 | * inputLength : Length of the uncompressed data in the input buffer - note, should be under 2GB. 82 | * output : Buffer that will receive the compressed output. 83 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 84 | * the compressed data should never be longer than inputLength, as in this case the data is stored raw. 85 | * 86 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 87 | * concurrently. 88 | * 89 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength). 90 | */ 91 | size_t LZSSE4_CompressFast( LZSSE4_FastParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength ); 92 | 93 | /* Decompression routine. 94 | * This routine will decompress data in the LZSSE4 format and currently requires SSE 4.1 and is targeted at x64. 95 | * It will perform poorly on x86 due to hunger for registers. 96 | * input : Buffer containing compressed input block. May not be null. 97 | * inputLength : Length of the compressed data in the input buffer - note, this should be under 2GB 98 | * output : Buffer that will receive the de-compressed output. Note that this needs to be at least outputLength long. 99 | * May not be null. 100 | * outputLength : The length of the decompressed output - note, this should be under 2GB 101 | * 102 | * Provided that input and output are valid pointers to buffers of at least their specified size, this routine 103 | * should be memory safe - both match pointer checks and input/output buffer checks exist. 104 | * 105 | * Returns the size of the decompressed data, which will be less than outputLength in the event of an error (number of bytes 106 | * will indicate where in the output stream the error occurred). 107 | * 108 | * Note that this data is not hash verified, errors that occur are either from a malformed stream or bad buffer sizes. 109 | * Remember, corrupt data can still be valid to decompress. 110 | */ 111 | size_t LZSSE4_Decompress( const void* input, size_t inputLength, void* output, size_t outputLength ); 112 | 113 | #ifdef __cplusplus 114 | } 115 | #endif 116 | 117 | #endif /* -- LZSSE4_H__ */ 118 | -------------------------------------------------------------------------------- /lzsse4/lzsse4_platform.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Brian Marshall 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2.
Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE4_PLATFORM_H__ 27 | #define LZSSE4_PLATFORM_H__ 28 | 29 | #pragma once 30 | 31 | /* 32 | Compiler/Platform detection based on the table from: 33 | https://blogs.msdn.microsoft.com/vcblog/2015/12/04/clang-with-microsoft-codegen-in-vs-2015-update-1/ 34 | */ 35 | 36 | #ifdef _MSC_VER 37 | 38 | /* 39 | Microsoft Visual Studio Support. 40 | C1xx/C2, Clang/C2 and Clang/LLVM all support the Microsoft header files and _BitScanForward 41 | 42 | Note: if you receive errors with the intrinsics make sure that you have SSE4.1 support enabled. 43 | For example with Clang include "-msse4.1" on the command line 44 | */ 45 | #include <intrin.h> 46 | 47 | #else /* _MSC_VER */ 48 | 49 | #ifdef __GNUC__ 50 | 51 | /* 52 | GCC 53 | */ 54 | 55 | #define SIMDE_ENABLE_NATIVE_ALIASES 56 | #include "../simde/x86/sse4.1.h" 57 | /* _BitScanForward is Visual Studio specific. */ 58 | #define _BitScanForward(x, m) *(x) = __builtin_ctz(m) 59 | 60 | #else 61 | 62 | /* 63 | If you hit the error below, then add detection for your compiler/platform to this header file. 64 | */ 65 | #error Platform not supported 66 | 67 | #endif /* __GNUC__ */ 68 | #endif /* _MSC_VER */ 69 | 70 | #endif /* -- LZSSE4_PLATFORM_H__ */ 71 | -------------------------------------------------------------------------------- /lzsse8/lzsse8.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE8_H__ 27 | #define LZSSE8_H__ 28 | 29 | #pragma once 30 | 31 | /* LZSSE8 - x64/SSE targeted codec for better performance with lower compression ratio data/less optimal compressors. 32 | * Supports minimum 4 byte matches, maximum 15 bytes of match per control word and 8 byte literal runs per control word. 33 | */ 34 | 35 | #ifdef __cplusplus 36 | extern "C" 37 | { 38 | #endif 39 | 40 | /* Re-usable parse state object for compression. */ 41 | typedef struct LZSSE8_FastParseState LZSSE8_FastParseState; 42 | 43 | /* Allocate the parse state for compression - returns null on failure */ 44 | LZSSE8_FastParseState* LZSSE8_MakeFastParseState(); 45 | 46 | /* De-allocate the parse state for compression */ 47 | void LZSSE8_FreeFastParseState( LZSSE8_FastParseState* toFree ); 48 | 49 | /* Re-usable parse state object for compression. */ 50 | typedef struct LZSSE8_OptimalParseState LZSSE8_OptimalParseState; 51 | 52 | /* Allocate the parse state for compression - returns null on failure. Note 53 | bufferSize has to be greater than or equal to any inputLength used with LZSSE8_CompressOptimalParse */ 54 | LZSSE8_OptimalParseState* LZSSE8_MakeOptimalParseState( size_t bufferSize ); 55 | 56 | /* De-allocate the parse state for compression */ 57 | void LZSSE8_FreeOptimalParseState( LZSSE8_OptimalParseState* toFree ); 58 | 59 | 60 | /* "Optimal" compression routine. 61 | * Will compress data into LZSSE8 format, uses hash BST matching to find matches and run an optimal parse (high relative memory usage). Requires SSE 4.1. 62 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 63 | * input : Buffer containing uncompressed data to be compressed. May not be null. 64 | * inputLength : Length of the uncompressed data in the input buffer - note, should be under 2GB. 65 | * output : Buffer that will receive the compressed output. 66 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 67 | * the compressed data should never be longer than inputLength, as in this case the data is stored raw. 68 | * level : The compression level to use for this file, 1->17, 17 delivers the highest compression, 1 delivers the least. 69 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 70 | * concurrently. 71 | * 72 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength). 73 | */ 74 | size_t LZSSE8_CompressOptimalParse( LZSSE8_OptimalParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int level ); 75 | 76 | 77 | /* "Fast" compression routine. 78 | * Will compress data into LZSSE8 format, uses a simple single entry hash/greedy matching to find matches. Requires SSE 4.1.
79 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 80 | * input : Buffer containing uncompressed data to be compressed. May not be null. 81 | * inputLength : Length of the uncompressed data in the input buffer - note, should be under 2GB. 82 | * output : Buffer that will receive the compressed output. 83 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 84 | * the compressed data should never be longer than inputLength, as in this case the data is stored raw. 85 | * 86 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 87 | * concurrently. 88 | * 89 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength). 90 | */ 91 | size_t LZSSE8_CompressFast( LZSSE8_FastParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength ); 92 | 93 | /* Decompression routine. 94 | * This routine will decompress data in the LZSSE8 format and currently requires SSE 4.1 and is targeted at x64. 95 | * It will perform poorly on x86 due to hunger for registers. 96 | * input : Buffer containing compressed input block. May not be null. 97 | * inputLength : Length of the compressed data in the input buffer - note, this should be under 2GB 98 | * output : Buffer that will receive the de-compressed output. Note that this needs to be at least outputLength long. 99 | * May not be null. 100 | * outputLength : The length of the decompressed output - note, this should be under 2GB 101 | * 102 | * Provided that input and output are valid pointers to buffers of at least their specified size, this routine 103 | * should be memory safe - both match pointer checks and input/output buffer checks exist. 104 | * 105 | * Returns the size of the decompressed data, which will be less than outputLength in the event of an error (number of bytes 106 | * will indicate where in the output stream the error occurred). 107 | * 108 | * Note that this data is not hash verified, errors that occur are either from a malformed stream or bad buffer sizes. 109 | * Remember, corrupt data can still be valid to decompress. 110 | */ 111 | size_t LZSSE8_Decompress( const void* input, size_t inputLength, void* output, size_t outputLength ); 112 | 113 | #ifdef __cplusplus 114 | } 115 | #endif 116 | 117 | #endif /* -- LZSSE8_H__ */ 118 | -------------------------------------------------------------------------------- /lzsse8/lzsse8_platform.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Brian Marshall 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE8_PLATFORM_H__ 27 | #define LZSSE8_PLATFORM_H__ 28 | 29 | #pragma once 30 | 31 | /* 32 | Compiler/Platform detection based on the table from: 33 | https://blogs.msdn.microsoft.com/vcblog/2015/12/04/clang-with-microsoft-codegen-in-vs-2015-update-1/ 34 | */ 35 | 36 | #ifdef _MSC_VER 37 | 38 | /* 39 | Microsoft Visual Studio Support. 40 | C1xx/C2, Clang/C2 and Clang/LLVM all support the Microsoft header files and _BitScanForward 41 | 42 | Note: if you receive errors with the intrinsics make sure that you have SSE4.1 support enabled. 43 | For example with Clang include "-msse4.1" on the command line 44 | */ 45 | #include <intrin.h> 46 | 47 | #else /* _MSC_VER */ 48 | 49 | #ifdef __GNUC__ 50 | 51 | /* 52 | GCC 53 | */ 54 | 55 | #define SIMDE_ENABLE_NATIVE_ALIASES 56 | #include "../simde/x86/sse4.1.h" 57 | /* _BitScanForward is Visual Studio specific. */ 58 | #define _BitScanForward(x, m) *(x) = __builtin_ctz(m) 59 | 60 | #else 61 | 62 | /* 63 | If you hit the error below, then add detection for your compiler/platform to this header file. 64 | */ 65 | #error Platform not supported 66 | 67 | #endif /* __GNUC__ */ 68 | #endif /* _MSC_VER */ 69 | 70 | #endif /* -- LZSSE8_PLATFORM_H__ */ 71 | -------------------------------------------------------------------------------- /premake4.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nemequ/LZSSE-SIMDe/b4ba34fa8c9411b1a69ad34ace775817fd4ce03e/premake4.exe -------------------------------------------------------------------------------- /premake4.lua: -------------------------------------------------------------------------------- 1 | solution "lzsse" 2 | configurations { "Debug", "Release" } 3 | platforms { "x64" } 4 | includedirs { "include" } 5 | flags { "NoPCH" } 6 | location ( _ACTION ) 7 | configuration { "gmake" } 8 | buildoptions { "-std=c++11" } 9 | 10 | project "lzsse" 11 | language "C++" 12 | kind "ConsoleApp" 13 | files { "lzsse2/*.cpp", "lzsse2/*.c", "lzsse2/*.h", "lzsse4/*.cpp", "lzsse4/*.c", "lzsse4/*.h", "lzsse8/*.cpp", "lzsse8/*.c", "lzsse8/*.h", "example/*.cpp", "example/*.c", "example/*.h" } 14 | 15 | configuration "Debug*" 16 | flags { "Symbols" } 17 | 18 | configuration "Release*" 19 | flags { "OptimizeSpeed" } 20 | 21 | configuration { "x64", "Debug" } 22 | targetdir "bin/64/debug" 23 | 24 | configuration { "x64", "Release" } 25 | targetdir "bin/64/release" 26 | --------------------------------------------------------------------------------
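A minimal round-trip sketch against the LZSSE8 entry points declared in lzsse8/lzsse8.h above. The function names and signatures come from that header; the include path, buffer sizes, the compression level of 17, and the error handling are illustrative assumptions rather than the repository's own example program (example/main.cpp in the tree is the authoritative usage).

#include <cstdio>
#include <cstring>
#include <vector>
#include "lzsse8/lzsse8.h"   // assumed include path, relative to the repository root

int main()
{
    const char source[] = "an example buffer, an example buffer, an example buffer";
    size_t     inputLength = sizeof( source );

    // Per the header, outputLength should be at least inputLength; incompressible data is stored raw.
    std::vector< char > compressed( inputLength );
    std::vector< char > decompressed( inputLength );

    // The parse state is allocated once and re-used; bufferSize must cover any inputLength used with it.
    LZSSE8_OptimalParseState* state = LZSSE8_MakeOptimalParseState( inputLength );

    if ( state == nullptr )
        return 1;

    // Level 17 is documented as the highest compression level.
    size_t compressedSize = LZSSE8_CompressOptimalParse( state, source, inputLength, compressed.data(), compressed.size(), 17 );

    LZSSE8_FreeOptimalParseState( state );

    if ( compressedSize == 0 )
        return 1; // error, e.g. the output buffer was smaller than the input

    size_t decompressedSize = LZSSE8_Decompress( compressed.data(), compressedSize, decompressed.data(), inputLength );

    // A result shorter than the expected output length indicates where in the stream decoding failed.
    if ( decompressedSize != inputLength || std::memcmp( source, decompressed.data(), inputLength ) != 0 )
        return 1;

    std::printf( "round trip ok: %zu -> %zu -> %zu bytes\n", inputLength, compressedSize, decompressedSize );
    return 0;
}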