├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE ├── README.md ├── example └── main.cpp ├── lzsse2 ├── lzsse2.cpp ├── lzsse2.h └── lzsse2_platform.h ├── lzsse4 ├── lzsse4.cpp ├── lzsse4.h └── lzsse4_platform.h ├── lzsse8 ├── lzsse8.cpp ├── lzsse8.h └── lzsse8_platform.h ├── premake4.exe └── premake4.lua /.gitignore: -------------------------------------------------------------------------------- 1 | bin/* 2 | gmake/* 3 | vs2013/* 4 | vs2015/* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "simde"] 2 | path = simde 3 | url = https://github.com/simd-everywhere/simde-no-tests.git 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | sudo: false 3 | branches: 4 | except: 5 | - /^(wip\/)?(appveyor|msvc|mingw|windows)(\-.+)?$/ 6 | matrix: 7 | include: 8 | ### 9 | ## Linux builds using various versions of GCC. 10 | ### 11 | - env: C_COMPILER=gcc-7 12 | addons: 13 | apt: 14 | sources: 15 | - ubuntu-toolchain-r-test 16 | - premake4 17 | packages: 18 | - gcc-7 19 | - g++-7 20 | - premake4 21 | # - env: C_COMPILER=gcc-6 22 | # addons: 23 | # apt: 24 | # sources: 25 | # - ubuntu-toolchain-r-test 26 | # - premake4 27 | # packages: 28 | # - gcc-6 29 | # - g++-6 30 | # - premake4 31 | - env: C_COMPILER=gcc-5 32 | addons: 33 | apt: 34 | sources: 35 | - ubuntu-toolchain-r-test 36 | - premake4 37 | packages: 38 | - gcc-5 39 | - g++-5 40 | - premake4 41 | # - env: C_COMPILER=gcc-4.9 42 | # addons: 43 | # apt: 44 | # sources: 45 | # - ubuntu-toolchain-r-test 46 | # - premake4 47 | # packages: 48 | # - gcc-4.9 49 | # - g++-4.9 50 | # - premake4 51 | - env: C_COMPILER=gcc-4.8 52 | addons: 53 | apt: 54 | sources: 55 | - ubuntu-toolchain-r-test 56 | - premake4 57 | packages: 58 | - gcc-4.8 59 | - g++-4.8 60 | - premake4 61 | # - env: C_COMPILER=gcc-4.7 62 | # addons: 63 | # apt: 64 | # sources: 65 | # - ubuntu-toolchain-r-test 66 | # - premake4 67 | # packages: 68 | # - gcc-4.7 69 | # - g++-4.7 70 | # - premake4 71 | - env: C_COMPILER=gcc-4.6 72 | addons: 73 | apt: 74 | sources: 75 | - ubuntu-toolchain-r-test 76 | - premake4 77 | packages: 78 | - gcc-4.6 79 | - g++-4.6 80 | - premake4 81 | # - os: linux 82 | # env: C_COMPILER=gcc-4.5 83 | # addons: 84 | # apt: 85 | # sources: 86 | # - ubuntu-toolchain-r-test 87 | # - premake4 88 | # packages: 89 | # - gcc-4.5 90 | # - g++-4.5 91 | # - premake4 92 | - env: C_COMPILER=gcc-4.4 93 | addons: 94 | apt: 95 | sources: 96 | - ubuntu-toolchain-r-test 97 | - premake4 98 | packages: 99 | - gcc-4.4 100 | - g++-4.4 101 | - premake4 102 | 103 | ### 104 | ## clang on Linux 105 | ### 106 | - env: C_COMPILER=clang-3.9 107 | addons: 108 | apt: 109 | sources: 110 | - llvm-toolchain-precise-3.9 111 | - ubuntu-toolchain-r-test 112 | - premake4 113 | packages: 114 | - clang-3.9 115 | - premake4 116 | # - env: C_COMPILER=clang-3.8 117 | # addons: 118 | # apt: 119 | # sources: 120 | # - llvm-toolchain-precise-3.8 121 | # - ubuntu-toolchain-r-test 122 | # - premake4 123 | # packages: 124 | # - clang-3.8 125 | # - premake4 126 | - env: C_COMPILER=clang-3.7 127 | addons: 128 | apt: 129 | sources: 130 | - llvm-toolchain-precise-3.7 131 | - ubuntu-toolchain-r-test 132 | - premake4 133 | packages: 134 | - clang-3.7 135 | - premake4 136 | # - env: C_COMPILER=clang-3.6 137 | # addons: 138 | # apt: 139 
| # sources: 140 | # - llvm-toolchain-precise-3.6 141 | # - ubuntu-toolchain-r-test 142 | # - premake4 143 | # packages: 144 | # - clang-3.6 145 | # - premake4 146 | - env: C_COMPILER=clang-3.5 147 | addons: 148 | apt: 149 | sources: 150 | - llvm-toolchain-precise-3.5 151 | - ubuntu-toolchain-r-test 152 | - premake4 153 | packages: 154 | - clang-3.5 155 | - premake4 156 | 157 | ### 158 | ## PGI 159 | ### 160 | - env: C_COMPILER=pgcc ENABLE_OPENMP=y 161 | addons: 162 | apt: 163 | sources: 164 | - premake4 165 | packages: 166 | - premake4 167 | 168 | ### 169 | ## OS X 170 | ### 171 | # - os: osx 172 | 173 | before_install: 174 | ### 175 | ## If we use the matrix to set CC/CXX Travis, overwrites the values, 176 | ## so instead we use C/CXX_COMPILER, then copy the values to CC/CXX 177 | ## here (after Travis has set CC/CXX). 178 | ### 179 | - if [ "${C_COMPILER}" = "pgcc" ]; then wget -q -O /dev/stdout 'https://raw.githubusercontent.com/nemequ/pgi-travis/master/install-pgi.sh' | /bin/sh; fi 180 | - if [ -n "${C_COMPILER}" ]; then export CC="${C_COMPILER}"; fi 181 | - premake4 gmake 182 | 183 | script: 184 | - cd gmake && make 185 | 186 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Conor Stokes 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LZSSE-SIMDe 2 | 3 | [LZSSE](https://github.com/ConorStokes/LZSSE/) has a hard dependency on SSE4.1 which prevents it from working on other architectures, or even x86/x86_64 machines without support for the SSE4.1 instruction set. According to the [Steam Hardware Survey](http://store.steampowered.com/hwsurvey), SSE4.1 currently has just under 90% penetration, and of course that is only for machines with Steam installed (which is a pretty big bias). 
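To make that "hard dependency" concrete: upstream LZSSE assumes it may emit SSE4.1 instructions, so a build typically guards on the compiler's SSE4.1 macro along these lines (a hypothetical sketch, not the actual contents of LZSSE's platform headers):

```c
/* Hypothetical sketch of a hard SSE4.1 dependency -- not LZSSE's actual code.
 * The translation unit refuses to build unless the compiler has been told it
 * may emit SSE4.1 (e.g. with -msse4.1), and the result can only run on CPUs
 * that implement those instructions. */
#if !defined( __SSE4_1__ )
#error "SSE4.1 support is required (compile with -msse4.1 or equivalent)"
#endif

#include <smmintrin.h> /* SSE4.1 intrinsics such as _mm_blendv_epi8 */
```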
4 | 5 | This is a fork of [LZSSE](https://github.com/ConorStokes/LZSSE/) which uses [SIMDe](https://github.com/nemequ/simde) to allow for LZSSE (de)compression on platforms where SSE4.1 is not supported, including other architectures (such as ARM). 6 | 7 | Note that, with the default block size from the example program, LZSSE-SIMDe will not work on 32-bit architectures due to memory requirements. Reducing the block size resolves the issue, and the code has been tested on ARM and x86. [PAE](https://en.wikipedia.org/wiki/Physical_Address_Extension) should also work, but has not been tested. 8 | 9 | For machines with SSE4.1 support there should be no performance impact. The SSE4.1 intrinsics will be called, and the compiler should be capable of optimizing away any overhead associated with SIMDe. 10 | 11 | For machines which don't natively support the instructions used, SIMDe will emulate them using other SIMD APIs or, if that fails, portable fallbacks (see the include sketch at the end of this introduction). 12 | 13 | Note that a mix of the two is quite possible; for example, a CPU may support SSSE3 but not SSE4.1, in which case SSE4.1 functions will be emulated but SSSE3 and earlier instructions will be used. 14 | 15 | I'll try to keep this up to date with LZSSE, but I will not accept changes to this repository that aren't directly related to porting to SIMDe. If you find a bug, please file it with LZSSE or SIMDe, whichever would be more appropriate. 16 |
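Integrating SIMDe into a project like this is mostly a matter of swapping the native intrinsic headers for SIMDe's. A minimal sketch of the idea (hypothetical; the actual `lzsse2_platform.h`/`lzsse4_platform.h`/`lzsse8_platform.h` changes are not reproduced in this excerpt):

```c
/* Hypothetical sketch, not the actual lzsse*_platform.h. Defining
 * SIMDE_ENABLE_NATIVE_ALIASES before including SIMDe lets the existing
 * _mm_* names and the __m128i type resolve to SIMDe's implementations,
 * which use native SSE4.1 where available, another SIMD ISA (e.g. NEON)
 * where possible, or portable fallbacks otherwise. */
#define SIMDE_ENABLE_NATIVE_ALIASES
#include <simde/x86/sse4.1.h>

/* Existing LZSSE code can then keep calling the usual intrinsics: */
static inline __m128i SelectBytes( __m128i a, __m128i b, __m128i mask )
{
    return _mm_blendv_epi8( a, b, mask ); /* emulated where SSE4.1 is absent */
}
```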
17 | ## SIMDe Performance 18 | 19 | This is based on some testing with g++ 10 using [raspbian-jessie-lite-20151121](https://github.com/nemequ/squash-corpus/blob/master/data/raspbian-jessie-lite-20151121.tar.xz). Results are the average wall-clock time across 5 runs. 20 | 21 | As you read this, please keep two things in mind: 22 | 23 | First, this isn't likely to be the same for your code. Performance will depend heavily on which functions you use, and what SIMDe's options for fallbacks are. SIMDe is usually very easy to integrate into your project, so you should really run your own tests using your code and your data. 24 | 25 | Second, SIMDe should never make your code slower, only more portable. It doesn't really make sense to think of SIMDe as a performance hit since the alternative is that the code doesn't work at all; in that sense, SIMDe represents an infinite performance improvement. 26 | 27 | That said, if you are currently maintaining a portable fallback and an SSE version, there is an excellent chance that SIMDe will be significantly faster than your portable fallback. 28 | 29 | Now that that's out of the way, let's get to some data. 30 | 31 | If provided the same compiler flags (in this case, `-msse4.1 -O3`), results for LZSSE and LZSSE-SIMDe are effectively the same. So **SIMDe doesn't make things worse**, which is *very* important: 32 | 33 | | Library | Variant | Compress | Decompress | 34 | | ----------- | ------- | -------- | ---------- | 35 | | LZSSE | LZSSE2 | 87.18 s | 0.55 s | 36 | | LZSSE-SIMDe | LZSSE2 | 86.33 s | 0.55 s | 37 | | LZSSE | LZSSE4 | 73.83 s | 0.47 s | 38 | | LZSSE-SIMDe | LZSSE4 | 73.48 s | 0.47 s | 39 | | LZSSE | LZSSE8 | 79.16 s | 0.45 s | 40 | | LZSSE-SIMDe | LZSSE8 | 79.28 s | 0.44 s | 41 | 42 | Things get a bit more interesting if we compile without SSE 4.1 support, forcing SIMDe to use portable implementations of the SSE 4.1 functions that LZSSE relies on: 43 | 44 | | Flags | Variant | Compress | Decompress | 45 | | -------- | ------- | -------- | ---------- | 46 | | -msse2 | LZSSE2 | 86.40 s | 13.70 s | 47 | | -msse2 | LZSSE4 | 74.14 s | 10.78 s | 48 | | -msse2 | LZSSE8 | 76.76 s | 10.07 s | 49 | | -mssse3 | LZSSE2 | 86.47 s | 0.55 s | 50 | | -mssse3 | LZSSE4 | 73.01 s | 0.48 s | 51 | | -mssse3 | LZSSE8 | 78.81 s | 0.45 s | 52 | | -msse4.1 | LZSSE2 | 86.33 s | 0.55 s | 53 | | -msse4.1 | LZSSE4 | 73.48 s | 0.47 s | 54 | | -msse4.1 | LZSSE8 | 79.28 s | 0.44 s | 55 | 56 | Remember, there are no numbers here for upstream LZSSE since it simply doesn't work at all. As you can see, moving from SSE2 to SSSE3 provides a *huge* performance increase for decompression; that's because SSSE3 supports the `_mm_shuffle_epi8` function, which we don't have a very good way to emulate on previous versions of SSE. That makes it a great example of how SIMDe performance can vary wildly depending on the functions you use and the platform you're targeting. For what it's worth, AArch64 does have a good way to emulate it (`vqtbl1q_s8`), as does AltiVec (`vec_perm`), and ARMv7 has a decent option with a couple of `vtbl2_s8` calls. 57 | 58 | In the worst case we can force SIMDe to always use the portable fallbacks and rely exclusively on the compiler to auto-vectorize by passing `-DSIMDE_NO_NATIVE`. To be clear, you should never do this; it's really only there to help us test the fallbacks. In this case, flags like `-msse4.1` only tell the compiler (GCC in this case) which extensions it is allowed to use; they are completely ignored by SIMDe. 59 | 60 | | Flags | Variant | Compress | Decompress | 61 | | -------- | ------- | -------- | ---------- | 62 | | -msse2 | LZSSE2 | 169.52 s | 13.72 s | 63 | | -msse2 | LZSSE4 | 138.56 s | 10.82 s | 64 | | -msse2 | LZSSE8 | 140.77 s | 10.08 s | 65 | | -mssse3 | LZSSE2 | 169.48 s | 13.71 s | 66 | | -mssse3 | LZSSE4 | 138.29 s | 10.80 s | 67 | | -mssse3 | LZSSE8 | 140.71 s | 10.06 s | 68 | | -msse4.1 | LZSSE2 | 169.41 s | 1.99 s | 69 | | -msse4.1 | LZSSE4 | 138.28 s | 1.50 s | 70 | | -msse4.1 | LZSSE8 | 140.02 s | 1.46 s | 71 | 72 | Notice that this time there is not a significant change in compression speed with SSSE3. That's because the compiler isn't smart enough to recognize that it should compile our portable implementation of `_mm_shuffle_epi8` to a `PSHUFB` instruction (there is a rough sketch of what that fallback has to do at the end of this section), which is a good example of why SIMDe generally outperforms non-vectorized fallbacks by a wide margin, even on architectures other than the one the original SIMD implementation targets. That said, with SSE 4.1 the compiler was able to use `PSHUFLW`/`PSHUFHW` (from SSE2) with a blend from SSE4.1, recovering *most* of the performance. 73 | 74 | It's tempting to think of this a bit like running the code on ARM, WASM, POWER, etc., but that's not accurate; on ARM SIMDe can use NEON to implement the SSE functions, and the code tends to be much faster. Similarly, on WASM we can use WASM SIMD, and on POWER we can use AltiVec/VSX. 75 |
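To make the `_mm_shuffle_epi8` point above concrete, here is roughly what the portable fallback has to do for every 16-byte block (an illustrative scalar sketch of the PSHUFB semantics, not SIMDe's actual implementation):

```c
#include <stdint.h>

/* Scalar equivalent of _mm_shuffle_epi8 (PSHUFB): each output byte is zero if
 * the selector's top bit is set, otherwise it is the source byte picked by the
 * selector's low four bits. Compilers rarely turn this loop back into a single
 * PSHUFB, which is why the SSE2-only decompression numbers above are so much
 * slower. */
static void ShuffleEpi8Scalar( uint8_t dst[ 16 ], const uint8_t src[ 16 ], const uint8_t selector[ 16 ] )
{
    for ( int i = 0; i < 16; ++i )
    {
        dst[ i ] = ( selector[ i ] & 0x80 ) ? 0 : src[ selector[ i ] & 0x0F ];
    }
}
```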
76 | # LZSSE 77 | An [LZSS](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Storer%E2%80%93Szymanski) variant designed for a branchless SSE decompression implementation. 78 | 79 | Three variants: 80 | - LZSSE2, for highly compressible files with short literal runs. 81 | - LZSSE4, for a more balanced mix of literals and matches. 82 | - LZSSE8, for less compressible data with longer runs of matches. 83 | 84 | All three variants have an optimal parser implementation, which uses a quite strong match finder (very similar to LzFind) combined with a Storer-Szymanski style parse. LZSSE4 and LZSSE8 also have "fast" compressor implementations, which use simple hash-table-based matching and a greedy parse. 85 | 86 | Currently LZSSE8 is the recommended variant for the general case, as it performs well on most data (and you have the option of both optimal parse and fast compression). LZSSE2 is recommended if you are only compressing text, especially heavily compressible text, but it is slower and doesn't compress as well on less compressible data and binaries. 87 | 88 | The code is approaching production readiness, and LZSSE2 and LZSSE8 have received a reasonable amount of testing. 89 | 90 | See these blog posts, [An LZ Codec Designed for SSE Decompression](http://conorstokes.github.io/compression/2016/02/15/an-LZ-codec-designed-for-SSE-decompression) and [Compressor Improvements and LZSSE2 vs LZSSE8](http://conorstokes.github.io/compression/2016/02/24/compressor-improvements-and-lzsse2-vs-lzsse8), for a description of how the compression algorithm and implementation work. There are also benchmarks, but these may not be up to date (in particular, the figures in the initial blog post no longer represent compression performance). 91 | -------------------------------------------------------------------------------- /example/main.cpp: -------------------------------------------------------------------------------- 1 | // LZSSE.cpp : Defines the entry point for the console application.
2 | // 3 | #define _CRT_SECURE_NO_WARNINGS 1 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "../lzsse2/lzsse2.h" 10 | #include "../lzsse4/lzsse4.h" 11 | #include "../lzsse8/lzsse8.h" 12 | 13 | static const uint32_t MAGIC_NUMBER = 0x28F19732; 14 | 15 | void DisplayUsage() 16 | { 17 | printf( "Usage:\n" ); 18 | printf( " lzsse [args] input_file output_file\n" ); 19 | printf( "\n" ); 20 | printf( "Arguments:\n" ); 21 | printf( " -2 Compress in lzsse2 mode (default)\n" ); 22 | printf( " -4 Compress in lzsse4 mode\n" ); 23 | printf( " -8 Compress in lzsse8 mode\n" ); 24 | printf( " -f Optimal parse (default)\n" ); 25 | printf( " -o Fast parse (not available for lzsse2)\n" ); 26 | printf( " -d Decompress\n" ); 27 | printf( " -lN Compression level for optimal parse, where N is 1 to 17 (default 16)\n" ); 28 | printf( " -bN Block size in KiB, default 131,072\n" ); 29 | printf( "\n" ); 30 | } 31 | 32 | static size_t CompressorFastLZSSE4( LZSSE4_FastParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int ) 33 | { 34 | return LZSSE4_CompressFast( state, input, inputLength, output, outputLength ); 35 | } 36 | 37 | static size_t CompressorFastLZSSE8( LZSSE8_FastParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int ) 38 | { 39 | return LZSSE8_CompressFast( state, input, inputLength, output, outputLength ); 40 | } 41 | 42 | template 43 | void Compress( FILE* inputFile, FILE* outputFile, uint64_t blockSize, uint8_t mode, unsigned int level, State* state, size_t (*compressor)( State*, const void*, size_t, void*, size_t, unsigned int ) ) 44 | { 45 | if ( state == nullptr ) 46 | { 47 | printf( "Couldn't allocate parse state\n" ); 48 | exit( 1 ); 49 | } 50 | 51 | if ( fwrite( &MAGIC_NUMBER, sizeof( uint32_t ), 1, outputFile ) == 0 ) 52 | { 53 | printf( "Couldn't write magic number\n" ); 54 | exit( 1 ); 55 | } 56 | 57 | if ( fwrite( &mode, sizeof( uint8_t ), 1, outputFile ) == 0 ) 58 | { 59 | printf( "Couldn't write stream type\n" ); 60 | exit( 1 ); 61 | } 62 | 63 | if ( fwrite( &blockSize, sizeof( uint64_t ), 1, outputFile ) == 0 ) 64 | { 65 | printf( "Couldn't write block size\n" ); 66 | exit( 1 ); 67 | } 68 | 69 | size_t typedBlockSize = static_cast< size_t >( blockSize ); 70 | uint8_t* inputBuffer = reinterpret_cast< uint8_t* >( malloc( typedBlockSize ) ); 71 | uint8_t* outputBuffer = reinterpret_cast< uint8_t* >( malloc( typedBlockSize ) ); 72 | 73 | if ( inputBuffer == nullptr || outputBuffer == nullptr ) 74 | { 75 | printf( "Couldn't allocate buffer memory\n" ); 76 | exit( 1 ); 77 | } 78 | 79 | for ( ;; ) 80 | { 81 | size_t readSize = fread( inputBuffer, 1, blockSize, inputFile ); 82 | 83 | if ( readSize == 0 ) 84 | { 85 | break; 86 | } 87 | 88 | size_t compressedSize = compressor( state, inputBuffer, readSize, outputBuffer, typedBlockSize, level ); 89 | 90 | if ( compressedSize == 0 ) 91 | { 92 | printf( "Compression function failed\n" ); 93 | exit( 1 ); 94 | } 95 | 96 | uint32_t compressedLength = static_cast< uint32_t >( compressedSize ); 97 | uint32_t uncompressedLength = static_cast< uint32_t >( readSize ); 98 | 99 | if ( fwrite( &uncompressedLength, sizeof( uint32_t ), 1, outputFile ) < 1 ) 100 | { 101 | break; 102 | } 103 | 104 | if ( fwrite( &compressedLength, sizeof( uint32_t ), 1, outputFile ) < 1 ) 105 | { 106 | printf( "Error writing compressed length from block\n" ); 107 | exit( 1 ); 108 | } 109 | 110 | if ( fwrite( outputBuffer, 1, compressedLength, outputFile ) != 
compressedLength ) 111 | { 112 | printf( "Error writing block\n" ); 113 | exit( 1 ); 114 | } 115 | } 116 | 117 | free( inputBuffer ); 118 | free( outputBuffer ); 119 | } 120 | 121 | void Decompress( FILE* inputFile, FILE* outputFile ) 122 | { 123 | uint32_t magicNumber; 124 | uint64_t blockSize = 128 * 1024 * 1024; 125 | 126 | if ( fread( &magicNumber, sizeof( uint32_t ), 1, inputFile ) < 1 || magicNumber != MAGIC_NUMBER ) 127 | { 128 | printf( "Couldn't read magic number, or magic number incorrect\n" ); 129 | exit( 1 ); 130 | } 131 | 132 | uint8_t streamType; 133 | 134 | if ( fread( &streamType, sizeof( uint8_t ), 1, inputFile ) < 1 ) 135 | { 136 | printf( "Couldn't read stream type\n" ); 137 | exit( 1 ); 138 | } 139 | 140 | if ( fread( &blockSize, sizeof( uint64_t ), 1, inputFile ) < 1 ) 141 | { 142 | printf( "Couldn't read block size\n" ); 143 | exit( 1 ); 144 | } 145 | 146 | uint8_t* inputBuffer = reinterpret_cast< uint8_t* >( malloc( static_cast< size_t >( blockSize ) ) ); 147 | uint8_t* outputBuffer = reinterpret_cast< uint8_t* >( malloc( static_cast< size_t >( blockSize ) ) ); 148 | 149 | if ( inputBuffer == nullptr || outputBuffer == nullptr ) 150 | { 151 | printf( "Couldn't allocate buffer memory\n" ); 152 | exit( 1 ); 153 | } 154 | 155 | size_t( *decompressor )( const void*, size_t, void*, size_t ); 156 | 157 | switch ( streamType ) 158 | { 159 | case 2: 160 | 161 | decompressor = LZSSE2_Decompress; 162 | break; 163 | 164 | case 4: 165 | 166 | decompressor = LZSSE4_Decompress; 167 | break; 168 | 169 | case 8: 170 | 171 | decompressor = LZSSE8_Decompress; 172 | break; 173 | 174 | default: 175 | 176 | printf( "Invalid stream type\n" ); 177 | exit( 1 ); 178 | 179 | } 180 | 181 | memset( inputBuffer, 0, blockSize ); 182 | memset( outputBuffer, 0, blockSize ); 183 | 184 | for ( ;; ) 185 | { 186 | uint32_t compressedLength; 187 | uint32_t uncompressedLength; 188 | 189 | if ( fread( &uncompressedLength, sizeof( uint32_t ), 1, inputFile ) < 1 ) 190 | { 191 | break; 192 | } 193 | 194 | if ( fread( &compressedLength, sizeof( uint32_t ), 1, inputFile ) < 1 ) 195 | { 196 | printf( "Error reading compressed length from block\n" ); 197 | exit( 1 ); 198 | } 199 | 200 | if ( fread( inputBuffer, 1, compressedLength, inputFile ) != compressedLength ) 201 | { 202 | printf( "Error reading block\n" ); 203 | exit( 1 ); 204 | } 205 | 206 | size_t decompressedSize = 0; 207 | 208 | decompressedSize = 209 | decompressor( inputBuffer, 210 | compressedLength, 211 | outputBuffer, 212 | uncompressedLength ); 213 | 214 | if ( decompressedSize != size_t( uncompressedLength ) ) 215 | { 216 | printf( "Error in decompression stream\n" ); 217 | exit( 1 ); 218 | } 219 | 220 | if ( fwrite( outputBuffer, 1, uncompressedLength, outputFile ) != uncompressedLength ) 221 | { 222 | printf( "Couldn't write block to output file\n" ); 223 | exit( 1 ); 224 | } 225 | } 226 | 227 | free( inputBuffer ); 228 | free( outputBuffer ); 229 | } 230 | 231 | int main( int argc, const char** argv ) 232 | { 233 | bool decompression = false; 234 | bool optimal = true; 235 | uint64_t blockSize = 128 * 1024 * 1024; 236 | uint8_t mode = 2; 237 | unsigned int level = 16; 238 | 239 | if ( argc < 3 ) 240 | { 241 | DisplayUsage(); 242 | exit( 1 ); 243 | } 244 | 245 | for ( int argIndex = 1; argIndex < argc - 2; ++argIndex ) 246 | { 247 | const char* arg = argv[ argIndex ]; 248 | 249 | if ( arg[ 0 ] == '-' ) 250 | { 251 | switch ( arg[ 1 ] ) 252 | { 253 | case 'd': 254 | 255 | decompression = true; 256 | break; 257 | 258 | case '2': 259 | 260 | 
mode = 2; 261 | break; 262 | 263 | case '4': 264 | 265 | mode = 4; 266 | break; 267 | 268 | case '8': 269 | 270 | mode = 8; 271 | break; 272 | 273 | case 'l': 274 | 275 | level = static_cast< unsigned int >( strtoul( arg + 2, nullptr, 10 ) ); 276 | break; 277 | 278 | case 'b': 279 | 280 | blockSize = strtoull( arg + 2, nullptr, 10 ) * 1024; 281 | break; 282 | 283 | case 'o': 284 | 285 | optimal = true; 286 | break; 287 | 288 | case 'f': 289 | 290 | optimal = false; 291 | break; 292 | 293 | } 294 | } 295 | } 296 | 297 | FILE* inputFile = fopen( argv[ argc - 2 ], "rb" ); 298 | 299 | if ( inputFile == nullptr ) 300 | { 301 | perror( argv[ argc - 2 ] ); 302 | exit( 1 ); 303 | } 304 | 305 | FILE* outputFile = fopen( argv[ argc - 1 ], "wb+" ); 306 | 307 | if ( outputFile == nullptr ) 308 | { 309 | perror( argv[ argc - 2 ] ); 310 | exit( 1 ); 311 | } 312 | 313 | if ( decompression ) 314 | { 315 | Decompress( inputFile, outputFile ); 316 | } 317 | else 318 | { 319 | switch ( mode ) 320 | { 321 | case 2: 322 | { 323 | LZSSE2_OptimalParseState* state = LZSSE2_MakeOptimalParseState( static_cast< size_t >( blockSize ) ); 324 | 325 | Compress( inputFile, outputFile, blockSize, mode, level, state, LZSSE2_CompressOptimalParse ); 326 | 327 | LZSSE2_FreeOptimalParseState( state ); 328 | 329 | break; 330 | } 331 | 332 | case 4: 333 | { 334 | if ( optimal ) 335 | { 336 | LZSSE4_OptimalParseState* state = LZSSE4_MakeOptimalParseState( static_cast( blockSize ) ); 337 | 338 | Compress( inputFile, outputFile, blockSize, mode, level, state, LZSSE4_CompressOptimalParse ); 339 | 340 | LZSSE4_FreeOptimalParseState( state ); 341 | } 342 | else 343 | { 344 | LZSSE4_FastParseState* state = LZSSE4_MakeFastParseState(); 345 | 346 | Compress( inputFile, outputFile, blockSize, mode, level, state, CompressorFastLZSSE4 ); 347 | 348 | LZSSE4_FreeFastParseState( state ); 349 | } 350 | 351 | break; 352 | } 353 | 354 | case 8: 355 | { 356 | if ( optimal ) 357 | { 358 | LZSSE8_OptimalParseState* state = LZSSE8_MakeOptimalParseState( static_cast( blockSize ) ); 359 | 360 | Compress( inputFile, outputFile, blockSize, mode, level, state, LZSSE8_CompressOptimalParse ); 361 | 362 | LZSSE8_FreeOptimalParseState( state ); 363 | } 364 | else 365 | { 366 | LZSSE8_FastParseState* state = LZSSE8_MakeFastParseState(); 367 | 368 | Compress( inputFile, outputFile, blockSize, mode, level, state, CompressorFastLZSSE8 ); 369 | 370 | LZSSE8_FreeFastParseState( state ); 371 | } 372 | 373 | break; 374 | } 375 | 376 | default: 377 | 378 | printf( "Invalid stream type\n" ); 379 | exit( 1 ); 380 | 381 | } 382 | } 383 | 384 | fclose( inputFile ); 385 | fclose( outputFile ); 386 | 387 | return 0; 388 | } 389 | 390 | -------------------------------------------------------------------------------- /lzsse2/lzsse2.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "lzsse2_platform.h" 32 | #include "lzsse2.h" 33 | 34 | #pragma warning ( disable : 4127 ) 35 | 36 | namespace 37 | { 38 | // Constants - most of these should not be changed without corresponding code changes because it will break many things in unpredictable ways. 39 | const uint32_t WINDOW_BITS = 16; 40 | const uint32_t MIN_MATCH_LENGTH = 3; 41 | const uint32_t LZ_WINDOW_SIZE = 1 << WINDOW_BITS; 42 | const uint32_t LZ_WINDOW_MASK = LZ_WINDOW_SIZE - 1; 43 | const uint32_t OPTIMAL_HASH_BITS = 20; 44 | const uint32_t OPTIMAL_BUCKETS_COUNT = 1 << OPTIMAL_HASH_BITS; 45 | const uint32_t OPTIMAL_HASH_MASK = OPTIMAL_BUCKETS_COUNT - 1; 46 | const uint32_t MIN_COMPRESSION_SIZE = 32; 47 | const uint32_t END_PADDING_LITERALS = 16; 48 | const int32_t NO_MATCH = -1; 49 | const int32_t EMPTY_NODE = -1; 50 | const uint32_t MIN_LITERAL_COUNT = 8; 51 | const uint32_t HASH_MULTIPLIER = 4013; 52 | const uint32_t HASH_REMOVAL_MULTIPLIER = HASH_MULTIPLIER * HASH_MULTIPLIER; 53 | const uint32_t CONTROL_BITS = 4; 54 | const uint32_t LITERAL_BITS = 8; 55 | const uint32_t OFFSET_BITS = 16; 56 | const uint32_t BASE_MATCH_BITS = OFFSET_BITS + CONTROL_BITS; 57 | const uint32_t SINGLE_LITERAL_COST = CONTROL_BITS + LITERAL_BITS; 58 | const uint32_t DOUBLE_LITERAL_COST = SINGLE_LITERAL_COST + LITERAL_BITS; 59 | const uint32_t EXTENDED_MATCH_BOUND = ( 1 << CONTROL_BITS ) - 1; 60 | const uint32_t CONTROL_BLOCK_SIZE = sizeof( __m128i ); 61 | const uint32_t CONTROLS_PER_BLOCK = 32; 62 | const uint32_t LITERALS_PER_CONTROL = 2; 63 | const uint32_t MAX_INPUT_PER_CONTROL = 2; 64 | const size_t OUTPUT_BUFFER_SAFE = EXTENDED_MATCH_BOUND * CONTROLS_PER_BLOCK; 65 | const size_t INPUT_BUFFER_SAFE = MAX_INPUT_PER_CONTROL * CONTROLS_PER_BLOCK; 66 | const uint16_t INITIAL_OFFSET = MIN_MATCH_LENGTH; 67 | const size_t SKIP_MATCH_LENGTH = 128; 68 | const uint32_t NO_SKIP_LEVEL = 17; 69 | } 70 | 71 | struct Arrival 72 | { 73 | size_t cost; 74 | int32_t from; 75 | int32_t to; 76 | uint16_t offset; 77 | }; 78 | 79 | struct TreeNode 80 | { 81 | int32_t children[ 2 ]; 82 | }; 83 | 84 | struct LZSSE2_OptimalParseState 85 | { 86 | // Note, we should really replace this with a BST, hash chaining works but is *slooooooooooooooow* for optimal parse. 
87 | int32_t roots[ OPTIMAL_BUCKETS_COUNT ]; 88 | 89 | TreeNode window[ LZ_WINDOW_SIZE ]; 90 | 91 | Arrival* arrivals; 92 | 93 | size_t bufferSize; 94 | }; 95 | 96 | 97 | LZSSE2_OptimalParseState* LZSSE2_MakeOptimalParseState( size_t bufferSize ) 98 | { 99 | if ( bufferSize > 0 && ( SIZE_MAX / sizeof( Arrival ) ) < bufferSize ) 100 | { 101 | return nullptr; 102 | } 103 | 104 | LZSSE2_OptimalParseState* result = reinterpret_cast< LZSSE2_OptimalParseState* >( ::malloc( sizeof( LZSSE2_OptimalParseState ) ) ); 105 | 106 | result->bufferSize = bufferSize; 107 | 108 | if ( result != nullptr ) 109 | { 110 | result->arrivals = reinterpret_cast< Arrival* >( ::malloc( sizeof( Arrival ) * bufferSize ) ); 111 | 112 | if ( result->arrivals == nullptr ) 113 | { 114 | LZSSE2_FreeOptimalParseState( result ); 115 | 116 | result = nullptr; 117 | } 118 | } 119 | 120 | return result; 121 | } 122 | 123 | 124 | void LZSSE2_FreeOptimalParseState( LZSSE2_OptimalParseState* toFree ) 125 | { 126 | ::free( toFree->arrivals ); 127 | 128 | toFree->arrivals = nullptr; 129 | 130 | ::free( toFree ); 131 | } 132 | 133 | 134 | inline uint32_t CalculateHash( const uint8_t* inputCursor ) 135 | { 136 | return ( uint32_t( inputCursor[ 0 ] ) * HASH_MULTIPLIER * HASH_MULTIPLIER + uint32_t( inputCursor[ 1 ] ) * HASH_MULTIPLIER + uint32_t( inputCursor[ 2 ] ) ) & OPTIMAL_HASH_MASK; 137 | } 138 | 139 | 140 | struct Match 141 | { 142 | size_t length; 143 | int32_t position; 144 | uint16_t offset; 145 | }; 146 | 147 | 148 | inline Match SearchAndUpdateFinder( LZSSE2_OptimalParseState& state, const uint8_t* input, const uint8_t* inputCursor, const uint8_t* inputEnd, uint32_t cutOff ) 149 | { 150 | Match result; 151 | 152 | int32_t position = static_cast( inputCursor - input ); 153 | 154 | result.position = NO_MATCH; 155 | result.length = MIN_MATCH_LENGTH; 156 | result.offset = 0; 157 | 158 | size_t lengthToEnd = inputEnd - inputCursor; 159 | int32_t lastPosition = position - ( LZ_WINDOW_SIZE - 1 ); 160 | uint32_t hash = CalculateHash( inputCursor ); 161 | 162 | lastPosition = lastPosition > 0 ? lastPosition : 0; 163 | 164 | int32_t treeCursor = state.roots[ hash ]; 165 | 166 | state.roots[ hash ] = position; 167 | 168 | int32_t* left = &state.window[ position & LZ_WINDOW_MASK ].children[ 1 ]; 169 | int32_t* right = &state.window[ position & LZ_WINDOW_MASK ].children[ 0 ]; 170 | size_t leftLength = 0; 171 | size_t rightLength = 0; 172 | 173 | for ( ;; ) 174 | { 175 | if ( cutOff-- == 0 || treeCursor < lastPosition ) 176 | { 177 | *left = *right = EMPTY_NODE; 178 | break; 179 | } 180 | 181 | TreeNode& currentNode = state.window[ treeCursor & LZ_WINDOW_MASK ]; 182 | const uint8_t* key = input + treeCursor; 183 | size_t matchLength = leftLength < rightLength ? leftLength : rightLength; 184 | 185 | uint16_t matchOffset = static_cast( position - treeCursor ); 186 | size_t maxLength = matchOffset <= ( EXTENDED_MATCH_BOUND + 1 ) && matchOffset < lengthToEnd ? matchOffset : lengthToEnd; 187 | 188 | while ( matchLength < lengthToEnd ) 189 | { 190 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 191 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( key + matchLength ) ); 192 | 193 | unsigned long matchBytes; 194 | 195 | _BitScanForward( &matchBytes, ( static_cast( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 196 | 197 | matchLength += matchBytes; 198 | 199 | if ( matchBytes != 16 ) 200 | { 201 | break; 202 | } 203 | } 204 | 205 | matchLength = matchLength < lengthToEnd ? 
matchLength : lengthToEnd; 206 | 207 | size_t truncatedMatchLength = matchLength < maxLength ? matchLength : maxLength; 208 | 209 | if ( truncatedMatchLength >= result.length ) 210 | { 211 | result.length = truncatedMatchLength; 212 | result.offset = matchOffset; 213 | result.position = treeCursor; 214 | } 215 | 216 | if ( matchLength == lengthToEnd ) 217 | { 218 | *left = currentNode.children[ 1 ]; 219 | *right = currentNode.children[ 0 ]; 220 | break; 221 | } 222 | 223 | if ( inputCursor[ matchLength ] < key[ matchLength ] || ( matchLength == lengthToEnd ) ) 224 | { 225 | *left = treeCursor; 226 | left = currentNode.children; 227 | treeCursor = *left; 228 | leftLength = matchLength; 229 | } 230 | else 231 | { 232 | *right = treeCursor; 233 | right = currentNode.children + 1; 234 | treeCursor = *right; 235 | rightLength = matchLength; 236 | } 237 | } 238 | 239 | // Special RLE overlapping match case, the LzFind style match above doesn't work very well with our 240 | // restriction of overlapping matches having offsets of at least 16. 241 | // Suffix array seems like a better option to handling this. 242 | { 243 | // Note, we're detecting long RLE here, but if we have an offset too close, we'll sacrifice a fair 244 | // amount of decompression performance to load-hit-stores. 245 | int32_t matchPosition = position - ( sizeof( __m128i ) * 2 ); 246 | 247 | if ( matchPosition >= 0 ) 248 | { 249 | uint16_t matchOffset = static_cast( position - matchPosition ); 250 | const uint8_t* key = input + matchPosition; 251 | size_t matchLength = 0; 252 | 253 | while ( matchLength < lengthToEnd ) 254 | { 255 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 256 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( key + matchLength ) ); 257 | 258 | unsigned long matchBytes; 259 | 260 | _BitScanForward( &matchBytes, ( static_cast( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 261 | 262 | matchLength += matchBytes; 263 | 264 | if ( matchBytes != 16 ) 265 | { 266 | break; 267 | } 268 | 269 | } 270 | 271 | matchLength = matchLength < lengthToEnd ? matchLength : lengthToEnd; 272 | 273 | if ( matchLength >= result.length ) 274 | { 275 | result.length = matchLength; 276 | result.offset = matchOffset; 277 | result.position = matchPosition; 278 | } 279 | } 280 | } 281 | 282 | return result; 283 | } 284 | 285 | 286 | size_t LZSSE2_CompressOptimalParse( LZSSE2_OptimalParseState* state, const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength, unsigned int level ) 287 | { 288 | if ( outputLength < inputLength || state->bufferSize < inputLength ) 289 | { 290 | // error case, output buffer not large enough. 
291 | return 0; 292 | } 293 | 294 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 295 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 296 | 297 | if ( inputLength < MIN_COMPRESSION_SIZE ) 298 | { 299 | memcpy( output, input, inputLength ); 300 | 301 | return inputLength; 302 | } 303 | 304 | const uint8_t* inputCursor = input; 305 | const uint8_t* inputEnd = input + inputLength; 306 | Arrival* arrivalWatermark = state->arrivals; 307 | Arrival* arrival = state->arrivals; 308 | uint32_t cutOff = 1 << level; 309 | 310 | for ( int32_t* rootCursor = state->roots, *end = rootCursor + OPTIMAL_BUCKETS_COUNT; rootCursor < end; rootCursor += 4 ) 311 | { 312 | rootCursor[ 0 ] = EMPTY_NODE; 313 | rootCursor[ 1 ] = EMPTY_NODE; 314 | rootCursor[ 2 ] = EMPTY_NODE; 315 | rootCursor[ 3 ] = EMPTY_NODE; 316 | } 317 | 318 | for ( uint32_t where = 0; where < MIN_MATCH_LENGTH; ++where ) 319 | { 320 | /*Match dummy = */ SearchAndUpdateFinder( *state, input, inputCursor, inputEnd - END_PADDING_LITERALS, cutOff ); 321 | 322 | ++inputCursor; 323 | } 324 | 325 | arrival->cost = LITERAL_BITS * MIN_MATCH_LENGTH; 326 | arrival->from = -1; 327 | arrival->offset = 0; 328 | 329 | // loop through each character and project forward the matches at that character to calculate the cheapest 330 | // path of arrival for each individual character. 331 | for ( const uint8_t* earlyEnd = inputEnd - END_PADDING_LITERALS; inputCursor < earlyEnd; ++inputCursor, ++arrival ) 332 | { 333 | uint32_t lengthToEnd = static_cast< uint32_t >( earlyEnd - inputCursor ); 334 | int32_t currentPosition = static_cast< int32_t >( inputCursor - input ); 335 | Arrival* literalFirst = arrival + 1; 336 | Arrival* literalSecond = arrival + 2; 337 | size_t arrivalCost = arrival->cost; 338 | 339 | // NOTE - we currently assume only 2 literals filled in here, because the minimum match length is 3. 340 | // If we wanted to go with a higher minimum match length, we would need to fill in more literals before hand. 341 | // Also, because there is a maximum of 2 literals per control block assumed. 342 | 343 | // project forward the cost of a single literal 344 | if ( literalFirst > arrivalWatermark || literalFirst->cost > ( arrival->cost + SINGLE_LITERAL_COST ) ) 345 | { 346 | literalFirst->cost = arrival->cost + SINGLE_LITERAL_COST; 347 | literalFirst->from = currentPosition; 348 | literalFirst->offset = 0; 349 | 350 | arrivalWatermark = literalFirst > arrivalWatermark ? literalFirst : arrivalWatermark; 351 | } 352 | 353 | // project forward the cost of two literals 354 | if ( lengthToEnd > 1 ) 355 | { 356 | if ( literalSecond > arrivalWatermark || literalFirst->cost > ( arrival->cost + DOUBLE_LITERAL_COST ) ) 357 | { 358 | literalSecond->cost = arrival->cost + DOUBLE_LITERAL_COST; 359 | literalSecond->from = currentPosition; 360 | literalSecond->offset = 0; 361 | 362 | arrivalWatermark = literalSecond > arrivalWatermark ? 
literalSecond : arrivalWatermark; 363 | } 364 | } 365 | else 366 | { 367 | continue; 368 | } 369 | 370 | Match match = SearchAndUpdateFinder( *state, input, inputCursor, earlyEnd, cutOff ); 371 | 372 | if ( match.position != NO_MATCH ) 373 | { 374 | for ( size_t matchedLength = MIN_MATCH_LENGTH, end = match.length + 1; matchedLength < end; ++matchedLength ) 375 | { 376 | Arrival* matchArrival = arrival + matchedLength; 377 | size_t matchCost = arrivalCost + BASE_MATCH_BITS; 378 | 379 | if ( matchedLength > EXTENDED_MATCH_BOUND ) 380 | { 381 | matchCost += ( ( matchedLength - 1 ) / EXTENDED_MATCH_BOUND ) * CONTROL_BITS; 382 | } 383 | 384 | if ( matchArrival > arrivalWatermark || matchArrival->cost > matchCost ) 385 | { 386 | matchArrival->cost = matchCost; 387 | matchArrival->from = currentPosition; 388 | matchArrival->offset = match.offset; 389 | 390 | arrivalWatermark = matchArrival > arrivalWatermark ? matchArrival : arrivalWatermark; 391 | } 392 | } 393 | 394 | if ( match.length > SKIP_MATCH_LENGTH && level < NO_SKIP_LEVEL ) 395 | { 396 | arrival += match.length - LITERALS_PER_CONTROL; 397 | inputCursor += match.length - LITERALS_PER_CONTROL; 398 | } 399 | } 400 | } 401 | 402 | // If this would cost more to encode than it would if it were just literals, encode it with no control blocks, 403 | // just literals 404 | if ( ( arrivalWatermark->cost + END_PADDING_LITERALS * LITERAL_BITS + CONTROLS_PER_BLOCK * CONTROL_BITS ) > ( inputLength * LITERAL_BITS ) ) 405 | { 406 | memcpy( output, input, inputLength ); 407 | 408 | return inputLength; 409 | } 410 | 411 | Arrival* previousPathNode; 412 | 413 | // now trace the actual optimal parse path back, connecting the nodes in the other direction. 414 | for ( const Arrival* pathNode = arrivalWatermark; pathNode->from > 0; pathNode = previousPathNode ) 415 | { 416 | previousPathNode = state->arrivals + ( pathNode->from - MIN_MATCH_LENGTH ); 417 | 418 | previousPathNode->to = static_cast( ( pathNode - state->arrivals ) + MIN_MATCH_LENGTH ); 419 | } 420 | 421 | uint8_t* outputCursor = output; 422 | 423 | memcpy( outputCursor, input, MIN_MATCH_LENGTH ); 424 | 425 | outputCursor += MIN_MATCH_LENGTH; 426 | 427 | uint8_t* currentControlBlock = outputCursor; 428 | uint32_t currentControlCount = 0; 429 | uint32_t totalControlCount = 0; 430 | 431 | outputCursor += CONTROL_BLOCK_SIZE; 432 | 433 | Arrival* nextPathNode; 434 | 435 | size_t totalPathLength = MIN_MATCH_LENGTH; 436 | uint16_t previousOffset = INITIAL_OFFSET; 437 | 438 | bool lastControlIsNop = false; 439 | 440 | // Now walk forwards again and actually write out the data. 
441 | for ( const Arrival* pathNode = state->arrivals; pathNode < arrivalWatermark; pathNode = nextPathNode ) 442 | { 443 | int32_t currentPosition = static_cast< int32_t >( ( pathNode - state->arrivals ) + MIN_MATCH_LENGTH ); 444 | 445 | nextPathNode = state->arrivals + ( pathNode->to - MIN_MATCH_LENGTH ); 446 | 447 | size_t pathDistance = nextPathNode - pathNode; 448 | 449 | totalPathLength += pathDistance; 450 | 451 | lastControlIsNop = false; 452 | 453 | if ( pathDistance < MIN_MATCH_LENGTH ) 454 | { 455 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 456 | { 457 | currentControlBlock = outputCursor; 458 | outputCursor += CONTROL_BLOCK_SIZE; 459 | currentControlCount = 0; 460 | } 461 | 462 | if ( ( currentControlCount & 1 ) == 0 ) 463 | { 464 | currentControlBlock[ currentControlCount >> 1 ] = 465 | ( static_cast( pathDistance ) - 1 ); 466 | } 467 | else 468 | { 469 | currentControlBlock[ currentControlCount >> 1 ] |= 470 | ( static_cast< uint8_t >( pathDistance ) - 1 ) << CONTROL_BITS; 471 | } 472 | 473 | // output the literals. 474 | for ( int32_t where = 0; where < pathDistance; ++where ) 475 | { 476 | const uint8_t* currentInput = input + currentPosition + where; 477 | 478 | outputCursor[ where ] = *currentInput ^ *( currentInput - previousOffset ); 479 | } 480 | 481 | outputCursor += pathDistance; 482 | 483 | ++totalControlCount; 484 | ++currentControlCount; 485 | } 486 | else 487 | { 488 | size_t toEncode = pathDistance - 1; //note, we always subtract one here, because the first control block of the match encodes this way 489 | 490 | // make sure the control block for the first part of the match has been allocated 491 | // note, this is idempontent if we have not actually incremented the control count and we try this again. 492 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 493 | { 494 | currentControlBlock = outputCursor; 495 | outputCursor += CONTROL_BLOCK_SIZE; 496 | currentControlCount = 0; 497 | } 498 | 499 | // output the offset (after control block containing the first control nibble for this match). 
500 | *reinterpret_cast< uint16_t* >( outputCursor ) = nextPathNode->offset ^ previousOffset; 501 | 502 | previousOffset = nextPathNode->offset; 503 | 504 | outputCursor += sizeof( uint16_t ); 505 | 506 | for ( ;; ) 507 | { 508 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 509 | { 510 | currentControlBlock = outputCursor; 511 | outputCursor += CONTROL_BLOCK_SIZE; 512 | currentControlCount = 0; 513 | } 514 | 515 | if ( toEncode >= EXTENDED_MATCH_BOUND ) 516 | { 517 | if ( ( currentControlCount & 1 ) == 0 ) 518 | { 519 | currentControlBlock[ currentControlCount >> 1 ] = 520 | static_cast( EXTENDED_MATCH_BOUND ); 521 | } 522 | else 523 | { 524 | currentControlBlock[ currentControlCount >> 1 ] |= 525 | static_cast< uint8_t >( EXTENDED_MATCH_BOUND ) << CONTROL_BITS; 526 | } 527 | 528 | toEncode -= EXTENDED_MATCH_BOUND; 529 | 530 | ++totalControlCount; 531 | ++currentControlCount; 532 | 533 | } 534 | else 535 | { 536 | if ( ( currentControlCount & 1 ) == 0 ) 537 | { 538 | currentControlBlock[ currentControlCount >> 1 ] = 539 | static_cast( toEncode ); 540 | } 541 | else 542 | { 543 | currentControlBlock[ currentControlCount >> 1 ] |= 544 | static_cast< uint8_t >( toEncode ) << CONTROL_BITS; 545 | } 546 | 547 | if ( toEncode == 0 && currentControlCount == 0 ) 548 | { 549 | lastControlIsNop = true; 550 | } 551 | 552 | ++totalControlCount; 553 | ++currentControlCount; 554 | 555 | break; 556 | } 557 | } 558 | } 559 | } 560 | 561 | if ( lastControlIsNop ) 562 | { 563 | outputCursor -= CONTROL_BLOCK_SIZE; 564 | } 565 | 566 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 567 | 568 | // copy remaining literals 569 | memcpy( outputCursor, inputCursor, remainingLiterals ); 570 | 571 | outputCursor += remainingLiterals; 572 | 573 | return outputCursor - output; 574 | } 575 | 576 | 577 | size_t LZSSE2_Decompress( const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength ) 578 | { 579 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 580 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 581 | 582 | // Length it not work compressing, just copy initial values 583 | if ( outputLength == inputLength ) 584 | { 585 | memcpy( output, input, outputLength ); 586 | 587 | return inputLength; 588 | } 589 | 590 | const uint8_t* inputCursor = input; 591 | uint8_t* outputCursor = output; 592 | 593 | // The offset starts off as the minimum match length. We actually need it at least two 594 | // characters back because we need them to be set to xor out the literals from the match data. 595 | size_t offset = INITIAL_OFFSET; 596 | __m128i previousCarryHi = _mm_setzero_si128(); 597 | 598 | *( outputCursor++ ) = *( inputCursor++ ); 599 | *( outputCursor++ ) = *( inputCursor++ ); 600 | *( outputCursor++ ) = *( inputCursor++ ); 601 | 602 | // What these macros do: 603 | // Decode a single literal run or match run for a single control nibble. 604 | // How they do it: 605 | // - Read the *unaligned* input (in the case of LZSSE-F - twice, for LZSSE-O we read once) - one goes into an SSE register, 606 | // because it could either be literals or an offset (or nothing at all). The low byte of streamBytesRead controls how much we advance 607 | // the input cursor. 
608 | // - Used a contived set of casts to sign extend the "read offset" control mask and then use it to mask the input word, 609 | // which is then xor'd against the offset, for a "branchless" conditional move into the offset which 610 | // has been carried over from the previous literal/match block. Note, this ends up doing better than a cmov on most 611 | // modern processors. But we need to pre-xor the input offset. 612 | // - We then load the match data from output buffer (offset back from the current output point). Unconditional load here. 613 | // - We broadcast the "from literal" control mask from the current least significant byte of the SSE register using a shuffle epi-8 614 | // - We mask the literals with that SSE register wide mask. 615 | // - The literals have been pre-xor'd with the data read in as match data, so we use an xor to branchlessly choose between the two. 616 | // In this case, it ends up a better option than a blendv on most processors. 617 | // - Store the block. We store all 16 bytes of the SSE register (due to some constraints in the format of the data, we won't 618 | // go past the end of the buffer), but we may overlap this. 619 | // - bytesOut controls how much we advance the output cursor. 620 | // - We use 8 bit shifts to advance all the controls up to the next byte. There is some variable sized register trickery that 621 | // x86/x64 is great for as long as we don't anger the register renamer. 622 | 623 | #define DECODE_STEP( HILO, CHECKMATCH, CHECKBUFFERS ) \ 624 | { \ 625 | \ 626 | uint64_t inputWord = *reinterpret_cast( inputCursor ); \ 627 | __m128i literals = _mm_cvtsi64_si128( inputWord ); \ 628 | \ 629 | offset ^= static_cast( static_cast( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 630 | \ 631 | readOffsetHalf##HILO >>= 8; \ 632 | \ 633 | const uint8_t* matchPointer = outputCursor - offset; \ 634 | \ 635 | if ( CHECKMATCH && matchPointer < output ) \ 636 | goto MATCH_UNDERFLOW; \ 637 | \ 638 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 639 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 640 | \ 641 | literals = _mm_and_si128( fromLiteral, literals ); \ 642 | \ 643 | fromLiteral##HILO = _mm_srli_si128( fromLiteral##HILO, 1 ); \ 644 | \ 645 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 646 | \ 647 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 648 | \ 649 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 650 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 651 | \ 652 | bytesOutHalf##HILO >>= 8; \ 653 | streamBytesReadHalf##HILO >>= 8; \ 654 | \ 655 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 656 | goto BUFFER_END; \ 657 | } 658 | 659 | #define DECODE_STEP_HALF( HILO, CHECKMATCH, CHECKBUFFERS ) \ 660 | { \ 661 | size_t inputWord = *reinterpret_cast< const uint16_t* >( inputCursor ); \ 662 | __m128i literals = _mm_cvtsi64_si128( inputWord ); \ 663 | \ 664 | offset ^= static_cast< size_t >( static_cast< ptrdiff_t >( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 665 | \ 666 | const uint8_t* matchPointer = outputCursor - offset; \ 667 | \ 668 | if ( CHECKMATCH && matchPointer < output ) \ 669 | goto MATCH_UNDERFLOW; \ 670 | \ 671 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 672 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 673 | \ 674 | literals = _mm_and_si128( 
fromLiteral, literals ); \ 675 | \ 676 | fromLiteral##HILO = _mm_srli_si128( fromLiteral##HILO, 1 ); \ 677 | \ 678 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 679 | \ 680 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 681 | \ 682 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 683 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 684 | \ 685 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 686 | goto BUFFER_END; \ 687 | } 688 | 689 | #define DECODE_STEP_END( HILO, CHECKMATCH, CHECKBUFFERS ) \ 690 | { \ 691 | size_t inputWord = *reinterpret_cast< const uint16_t* >( inputCursor ); \ 692 | __m128i literals = _mm_cvtsi64_si128( inputWord ); \ 693 | \ 694 | offset ^= static_cast< size_t >( static_cast< ptrdiff_t >( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 695 | \ 696 | const uint8_t* matchPointer = outputCursor - offset; \ 697 | \ 698 | if ( CHECKMATCH && matchPointer < output ) \ 699 | goto MATCH_UNDERFLOW; \ 700 | \ 701 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 702 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 703 | \ 704 | literals = _mm_and_si128( fromLiteral, literals ); \ 705 | \ 706 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 707 | \ 708 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 709 | \ 710 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 711 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 712 | \ 713 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 714 | goto BUFFER_END; \ 715 | } 716 | 717 | #define DECODE_STEP_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP( Lo, CHECKMATCH, CHECKBUFFERS ) 718 | #define DECODE_STEP_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP( Hi, CHECKMATCH, CHECKBUFFERS ) 719 | #define DECODE_STEP_HALF_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_HALF( Lo, CHECKMATCH, CHECKBUFFERS ) 720 | #define DECODE_STEP_HALF_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_HALF( Hi, CHECKMATCH, CHECKBUFFERS ) 721 | #define DECODE_STEP_END_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_END( Lo, CHECKMATCH, CHECKBUFFERS ) 722 | #define DECODE_STEP_END_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_END( Hi, CHECKMATCH, CHECKBUFFERS ) 723 | 724 | __m128i nibbleMask = _mm_set1_epi8( 0xF ); 725 | __m128i literalsPerControl = _mm_set1_epi8( LITERALS_PER_CONTROL ); 726 | 727 | // Note, we use this block here because it allows the "fake" inputEarlyEnd/outputEarlyEnd not to cause register spills 728 | // in the decompression loops. And yes, that did actually happen. 729 | { 730 | #pragma warning ( push ) 731 | #pragma warning ( disable : 4101 ) 732 | 733 | const uint8_t* inputEarlyEnd; //= ( input + inputLength ) - END_PADDING_LITERALS; 734 | uint8_t* outputEarlyEnd;// = ( output + outputLength ) - END_PADDING_LITERALS; 735 | 736 | #pragma warning ( pop ) 737 | 738 | // "Safe" ends to the buffer, before the input/output cursors hit these, we can loop without overflow checks. 
739 | const uint8_t* inputSafeEnd = ( input + inputLength ) - INPUT_BUFFER_SAFE; 740 | uint8_t* outputSafeEnd = ( output + outputLength ) - OUTPUT_BUFFER_SAFE; 741 | 742 | // Decoding loop with offset output buffer underflow test, but no buffer overflow tests, assumed to end at a safe distance 743 | // from overflows 744 | while ( ( outputCursor - output ) < LZ_WINDOW_SIZE && outputCursor < outputSafeEnd && inputCursor < inputSafeEnd ) 745 | { 746 | // load the control block 747 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); 748 | 749 | // split the control block into high and low nibbles. 750 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 751 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 752 | 753 | // Here we are testing if the runs will be literals or matches. Note that if the carries are set from the previous operation 754 | // this will essentially be ignored later on. 755 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 756 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 757 | 758 | // Work out the carry for the low nibbles (which will be used with the high controls to put them into 759 | // match without offset read mode). 760 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 761 | 762 | // The carry for the high nibbles is used with the low controls, but needs one byte from the previous iteration. We save 763 | // the calculated carry to use that byte next iteration. 764 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 765 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); 766 | 767 | previousCarryHi = carryHi; 768 | 769 | // I want 128 set bits please. 770 | __m128i allSet = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 771 | 772 | // Calcualting the bytes to output to the stream. Basically, we are subtracting negative one from the control value if the 773 | // carry is not set. This works because the masks produced by comparisons are the equivalent to negative one, which 774 | // make this a conditional increment. 775 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_xor_si128( shiftedCarryHi, allSet ) ); 776 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_xor_si128( carryLo, allSet ) ); 777 | 778 | // Calculate the number of bytes to read per control. 779 | // In the case the carry is set, no bytes. Otherwise, the offset size (2 bytes) for matches or the number of output bytes for literals. 780 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_min_epi8( literalsPerControl, bytesOutLo ) ); 781 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_min_epi8( literalsPerControl, bytesOutHi ) ); 782 | 783 | // Masks to read the offset (or keep the previous one) - set in the case that this is not a literal and the carry is not set 784 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), allSet ); 785 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), allSet ); 786 | 787 | // Masks whether we are reading literals - set if the carry is not set and these are literals. 788 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 789 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 790 | 791 | // Advance the input past the control block. 
792 | inputCursor += CONTROL_BLOCK_SIZE; 793 | 794 | { 795 | // Pull out the bottom halves off the SSE registers from before - we want these 796 | // things in GPRs for the more linear logic. 797 | uint64_t bytesOutHalfLo = static_cast( _mm_cvtsi128_si64( bytesOutLo ) ); 798 | uint64_t bytesOutHalfHi = static_cast( _mm_cvtsi128_si64( bytesOutHi ) ); 799 | 800 | uint64_t streamBytesReadHalfLo = static_cast( _mm_cvtsi128_si64( streamBytesReadLo ) ); 801 | uint64_t streamBytesReadHalfHi = static_cast( _mm_cvtsi128_si64( streamBytesReadHi ) ); 802 | 803 | uint64_t readOffsetHalfLo = static_cast( _mm_cvtsi128_si64( readOffsetLo ) ); 804 | uint64_t readOffsetHalfHi = static_cast( _mm_cvtsi128_si64( readOffsetHi ) ); 805 | 806 | DECODE_STEP_LO( true, false ); 807 | DECODE_STEP_HI( true, false ); 808 | DECODE_STEP_LO( true, false ); 809 | DECODE_STEP_HI( true, false ); 810 | 811 | DECODE_STEP_LO( true, false ); 812 | DECODE_STEP_HI( true, false ); 813 | DECODE_STEP_LO( true, false ); 814 | DECODE_STEP_HI( true, false ); 815 | 816 | DECODE_STEP_LO( true, false ); 817 | DECODE_STEP_HI( true, false ); 818 | DECODE_STEP_LO( true, false ); 819 | DECODE_STEP_HI( true, false ); 820 | 821 | DECODE_STEP_LO( true, false ); 822 | DECODE_STEP_HI( true, false ); 823 | DECODE_STEP_HALF_LO( true, false ); 824 | DECODE_STEP_HALF_HI( true, false ); 825 | } 826 | 827 | { 828 | // Now the top halves. 829 | uint64_t bytesOutHalfLo = static_cast( _mm_extract_epi64( bytesOutLo, 1 ) ); 830 | uint64_t bytesOutHalfHi = static_cast( _mm_extract_epi64( bytesOutHi, 1 ) ); 831 | 832 | uint64_t streamBytesReadHalfLo = static_cast( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 833 | uint64_t streamBytesReadHalfHi = static_cast( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 834 | 835 | uint64_t readOffsetHalfLo = static_cast( _mm_extract_epi64( readOffsetLo, 1 ) ); 836 | uint64_t readOffsetHalfHi = static_cast( _mm_extract_epi64( readOffsetHi, 1 ) ); 837 | 838 | DECODE_STEP_LO( true, false ); 839 | DECODE_STEP_HI( true, false ); 840 | DECODE_STEP_LO( true, false ); 841 | DECODE_STEP_HI( true, false ); 842 | 843 | DECODE_STEP_LO( true, false ); 844 | DECODE_STEP_HI( true, false ); 845 | DECODE_STEP_LO( true, false ); 846 | DECODE_STEP_HI( true, false ); 847 | 848 | DECODE_STEP_LO( true, false ); 849 | DECODE_STEP_HI( true, false ); 850 | DECODE_STEP_LO( true, false ); 851 | DECODE_STEP_HI( true, false ); 852 | 853 | DECODE_STEP_LO( true, false ); 854 | DECODE_STEP_HI( true, false ); 855 | DECODE_STEP_END_LO( true, false ); 856 | DECODE_STEP_END_HI( true, false ); 857 | } 858 | } 859 | 860 | // Decoding loop with no buffer checks, but will end at a safe distance from the end of the buffers. 861 | // Note, when we get here we have already reached the point in the output buffer which is *past* where we can underflow 862 | // due to a bad match offset. 
863 | while ( outputCursor < outputSafeEnd && inputCursor < inputSafeEnd ) 864 | { 865 | // This code is the same as the loop above, see comments there 866 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); 867 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 868 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 869 | 870 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 871 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 872 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 873 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 874 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); // where we take the carry from the previous hi values 875 | 876 | previousCarryHi = carryHi; 877 | 878 | __m128i neg1 = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 879 | 880 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_xor_si128( shiftedCarryHi, neg1 ) ); 881 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_xor_si128( carryLo, neg1 ) ); 882 | 883 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_min_epi8( literalsPerControl, bytesOutLo ) ); 884 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_min_epi8( literalsPerControl, bytesOutHi ) ); 885 | 886 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), neg1 ); 887 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), neg1 ); 888 | 889 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 890 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 891 | 892 | inputCursor += CONTROL_BLOCK_SIZE; 893 | 894 | { 895 | uint64_t bytesOutHalfLo = static_cast( _mm_cvtsi128_si64( bytesOutLo ) ); 896 | uint64_t bytesOutHalfHi = static_cast( _mm_cvtsi128_si64( bytesOutHi ) ); 897 | 898 | uint64_t streamBytesReadHalfLo = static_cast( _mm_cvtsi128_si64( streamBytesReadLo ) ); 899 | uint64_t streamBytesReadHalfHi = static_cast( _mm_cvtsi128_si64( streamBytesReadHi ) ); 900 | 901 | uint64_t readOffsetHalfLo = static_cast( _mm_cvtsi128_si64( readOffsetLo ) ); 902 | uint64_t readOffsetHalfHi = static_cast( _mm_cvtsi128_si64( readOffsetHi ) ); 903 | 904 | DECODE_STEP_LO( false, false ); 905 | DECODE_STEP_HI( false, false ); 906 | DECODE_STEP_LO( false, false ); 907 | DECODE_STEP_HI( false, false ); 908 | 909 | DECODE_STEP_LO( false, false ); 910 | DECODE_STEP_HI( false, false ); 911 | DECODE_STEP_LO( false, false ); 912 | DECODE_STEP_HI( false, false ); 913 | 914 | DECODE_STEP_LO( false, false ); 915 | DECODE_STEP_HI( false, false ); 916 | DECODE_STEP_LO( false, false ); 917 | DECODE_STEP_HI( false, false ); 918 | 919 | DECODE_STEP_LO( false, false ); 920 | DECODE_STEP_HI( false, false ); 921 | DECODE_STEP_HALF_LO( false, false ); 922 | DECODE_STEP_HALF_HI( false, false ); 923 | } 924 | 925 | { 926 | uint64_t bytesOutHalfLo = static_cast( _mm_extract_epi64( bytesOutLo, 1 ) ); 927 | uint64_t bytesOutHalfHi = static_cast( _mm_extract_epi64( bytesOutHi, 1 ) ); 928 | 929 | uint64_t streamBytesReadHalfLo = static_cast( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 930 | uint64_t streamBytesReadHalfHi = static_cast( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 931 | 932 | uint64_t readOffsetHalfLo = static_cast( _mm_extract_epi64( readOffsetLo, 1 ) ); 933 | uint64_t readOffsetHalfHi = static_cast( _mm_extract_epi64( readOffsetHi, 1 ) ); 934 | 935 | DECODE_STEP_LO( false, false 
); 936 | DECODE_STEP_HI( false, false ); 937 | DECODE_STEP_LO( false, false ); 938 | DECODE_STEP_HI( false, false ); 939 | 940 | DECODE_STEP_LO( false, false ); 941 | DECODE_STEP_HI( false, false ); 942 | DECODE_STEP_LO( false, false ); 943 | DECODE_STEP_HI( false, false ); 944 | 945 | DECODE_STEP_LO( false, false ); 946 | DECODE_STEP_HI( false, false ); 947 | DECODE_STEP_LO( false, false ); 948 | DECODE_STEP_HI( false, false ); 949 | 950 | DECODE_STEP_LO( false, false ); 951 | DECODE_STEP_HI( false, false ); 952 | DECODE_STEP_END_LO( false, false ); 953 | DECODE_STEP_END_HI( false, false ); 954 | } 955 | } 956 | } 957 | 958 | // Decoding loop with all buffer checks. 959 | { 960 | const uint8_t* inputEarlyEnd; 961 | uint8_t* outputEarlyEnd; 962 | inputEarlyEnd = (( input + inputLength ) - END_PADDING_LITERALS); 963 | outputEarlyEnd = ( output + outputLength ) - END_PADDING_LITERALS; 964 | 965 | while ( outputCursor < outputEarlyEnd && inputCursor < inputEarlyEnd ) 966 | { 967 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); 968 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 969 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 970 | 971 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 972 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 973 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 974 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 975 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); // where we take the carry from the previous hi values 976 | 977 | previousCarryHi = carryHi; 978 | 979 | __m128i neg1 = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 980 | 981 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_xor_si128( shiftedCarryHi, neg1 ) ); 982 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_xor_si128( carryLo, neg1 ) ); 983 | 984 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_min_epi8( literalsPerControl, bytesOutLo ) ); 985 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_min_epi8( literalsPerControl, bytesOutHi ) ); 986 | 987 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), neg1 ); 988 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), neg1 ); 989 | 990 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 991 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 992 | 993 | inputCursor += CONTROL_BLOCK_SIZE; 994 | 995 | if ( inputCursor > inputEarlyEnd ) 996 | goto BUFFER_END; 997 | 998 | { 999 | uint64_t bytesOutHalfLo = static_cast( _mm_cvtsi128_si64( bytesOutLo ) ); 1000 | uint64_t bytesOutHalfHi = static_cast( _mm_cvtsi128_si64( bytesOutHi ) ); 1001 | 1002 | uint64_t streamBytesReadHalfLo = static_cast( _mm_cvtsi128_si64( streamBytesReadLo ) ); 1003 | uint64_t streamBytesReadHalfHi = static_cast( _mm_cvtsi128_si64( streamBytesReadHi ) ); 1004 | 1005 | uint64_t readOffsetHalfLo = static_cast( _mm_cvtsi128_si64( readOffsetLo ) ); 1006 | uint64_t readOffsetHalfHi = static_cast( _mm_cvtsi128_si64( readOffsetHi ) ); 1007 | 1008 | DECODE_STEP_LO( true, true ); 1009 | DECODE_STEP_HI( true, true ); 1010 | DECODE_STEP_LO( true, true ); 1011 | DECODE_STEP_HI( true, true ); 1012 | 1013 | DECODE_STEP_LO( true, true ); 1014 | DECODE_STEP_HI( true, true ); 1015 | DECODE_STEP_LO( true, true ); 1016 | DECODE_STEP_HI( true, true ); 1017 | 1018 | DECODE_STEP_LO( 
true, true ); 1019 | DECODE_STEP_HI( true, true ); 1020 | DECODE_STEP_LO( true, true ); 1021 | DECODE_STEP_HI( true, true ); 1022 | 1023 | DECODE_STEP_LO( true, true ); 1024 | DECODE_STEP_HI( true, true ); 1025 | DECODE_STEP_HALF_LO( true, true ); 1026 | DECODE_STEP_HALF_HI( true, true ); 1027 | } 1028 | 1029 | { 1030 | uint64_t bytesOutHalfLo = static_cast( _mm_extract_epi64( bytesOutLo, 1 ) ); 1031 | uint64_t bytesOutHalfHi = static_cast( _mm_extract_epi64( bytesOutHi, 1 ) ); 1032 | 1033 | uint64_t streamBytesReadHalfLo = static_cast( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 1034 | uint64_t streamBytesReadHalfHi = static_cast( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 1035 | 1036 | uint64_t readOffsetHalfLo = static_cast( _mm_extract_epi64( readOffsetLo, 1 ) ); 1037 | uint64_t readOffsetHalfHi = static_cast( _mm_extract_epi64( readOffsetHi, 1 ) ); 1038 | 1039 | DECODE_STEP_LO( true, true ); 1040 | DECODE_STEP_HI( true, true ); 1041 | DECODE_STEP_LO( true, true ); 1042 | DECODE_STEP_HI( true, true ); 1043 | 1044 | DECODE_STEP_LO( true, true ); 1045 | DECODE_STEP_HI( true, true ); 1046 | DECODE_STEP_LO( true, true ); 1047 | DECODE_STEP_HI( true, true ); 1048 | 1049 | DECODE_STEP_LO( true, true ); 1050 | DECODE_STEP_HI( true, true ); 1051 | DECODE_STEP_LO( true, true ); 1052 | DECODE_STEP_HI( true, true ); 1053 | 1054 | DECODE_STEP_LO( true, true ); 1055 | DECODE_STEP_HI( true, true ); 1056 | DECODE_STEP_END_LO( true, true ); 1057 | DECODE_STEP_END_HI( true, true ); 1058 | } 1059 | } 1060 | 1061 | BUFFER_END: 1062 | 1063 | // When we get here, we have either advanced the right amount on both cursors 1064 | // or something bad happened, so leave it as is, so we can tell where 1065 | // the error happened. 1066 | if ( inputCursor == inputEarlyEnd && outputCursor == outputEarlyEnd ) 1067 | { 1068 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 1069 | 1070 | // copy any trailing literals 1071 | memcpy( outputCursor, inputCursor, remainingLiterals ); 1072 | 1073 | outputCursor += remainingLiterals; 1074 | } 1075 | } 1076 | 1077 | MATCH_UNDERFLOW: 1078 | 1079 | return outputCursor - output; 1080 | } 1081 | -------------------------------------------------------------------------------- /lzsse2/lzsse2.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE2_H__ 27 | #define LZSSE2_H__ 28 | 29 | #pragma once 30 | 31 | /* LZSSE2 - x64/SSE targeted codec for better performance with high compression ratio data/more optimal compressors. 32 | * Supports minimum 3 byte matches, maximum 16 bytes of match per control word and 2 byte literal runs per control word. 33 | */ 34 | 35 | #ifdef __cplusplus 36 | extern "C" 37 | { 38 | #endif 39 | 40 | /* Re-usable parse state object for compression. */ 41 | typedef struct LZSSE2_OptimalParseState LZSSE2_OptimalParseState; 42 | 43 | /* Allocate the parse state for compression - returns null on failure. Note 44 | Buffersize has to be greater or equal to any inputLength used with LZSSE2_CompressOptimalParse */ 45 | LZSSE2_OptimalParseState* LZSSE2_MakeOptimalParseState( size_t bufferSize ); 46 | 47 | /* De-allocate the parse state for compression */ 48 | void LZSSE2_FreeOptimalParseState( LZSSE2_OptimalParseState* toFree ); 49 | 50 | /* "Optimal" compression routine. 51 | * Will compress data into LZSSE2 format, uses hash BST matching to find matches and run an optimal parse (high relative memory usage). Requires SSE 4.1. 52 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 53 | * input : Buffer containing uncompressed data to be compressed. May not be null. 54 | * inputLength : Length of the compressed data in the input buffer - note should be under 2GB. 55 | * output : Buffer that will receive the compressed output. 56 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 57 | * The compressed data should never be longer than inputLength, as in this case the data is stored raw. 58 | * level : The compression level to use for this file 1->17, 17 is highest compression, 0 is least 59 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 60 | * concurrently. 61 | * 62 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength). 63 | */ 64 | size_t LZSSE2_CompressOptimalParse( LZSSE2_OptimalParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int level ); 65 | 66 | /* Decompression routine. 67 | * This routine will decompress data in the LZSSE2 format and currently requires SSE 4.1 and is targeted at x64. 68 | * It will perform poorly on x86 due to hunger for registers. 69 | * input : Buffer containing compressed input block. May not be null. 70 | * inputLength : Length of the compressed data in the input buffer - note, this should be under 2GB 71 | * output : Buffer that will received the de-compressed output. Note, that this needs to be at least outputLength long. 72 | * May not be null. 
73 | * outputLength : The length of the compressed output - note, this should be under 2GB 74 | * 75 | * Provided that input and output are valid pointers to buffers of at least their specified size, this routine 76 | * should be memory safe - both match pointer checks and input/output buffer checks exist. 77 | * 78 | * Returns the size of the decompressed data, which will be less than outputLength in the event of an error (number of bytes 79 | * will indicate where in the output stream the error occured). 80 | * 81 | * Note that this data is not hash verified, errors that occur are either from a misformed stream or bad buffer sizes. 82 | * Remember, corrupt data can still be valid to decompress. 83 | */ 84 | size_t LZSSE2_Decompress( const void* input, size_t inputLength, void* output, size_t outputLength ); 85 | 86 | #ifdef __cplusplus 87 | } 88 | #endif 89 | 90 | #endif /* -- LZSSE2_H__ */ 91 | -------------------------------------------------------------------------------- /lzsse2/lzsse2_platform.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Brian Marshall 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE2_PLATFORM_H__ 27 | #define LZSSE2_PLATFORM_H__ 28 | 29 | #pragma once 30 | 31 | /* 32 | Compiler/Platform detection based on the table from: 33 | https://blogs.msdn.microsoft.com/vcblog/2015/12/04/clang-with-microsoft-codegen-in-vs-2015-update-1/ 34 | */ 35 | 36 | #ifdef _MSC_VER 37 | 38 | /* 39 | Microsoft Visual Studio Support. 40 | C1xx/C2, Clang/C2 and Clang/LLVM all support the Microsoft header files and _BitScanForward 41 | 42 | Note: if you receive errors with the intrinsics make sure that you have SSE4.1 support enabled. 43 | For example with Clang include "-msse4.1" on the command line 44 | */ 45 | #include 46 | 47 | #else /* _MSC_VER */ 48 | 49 | #ifdef __GNUC__ 50 | 51 | /* 52 | GCC 53 | */ 54 | 55 | #define SIMDE_ENABLE_NATIVE_ALIASES 56 | #include "../simde/x86/sse4.1.h" 57 | /* _BitScanForward is Visual Studio specific. 
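   On GCC/Clang we map it to __builtin_ctz, which yields the index of the lowest set bit.
   Unlike the intrinsic, this mapping discards the "found a bit" return value and is undefined
   for m == 0; the codecs guard against that by OR-ing the compare mask with 0x10000 before
   calling, so the argument is never zero.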
*/ 58 | #define _BitScanForward(x, m) *(x) = __builtin_ctz(m) 59 | 60 | #else 61 | 62 | /* 63 | If you hit the error below, then add detection for your compiler/platform to this header file. 64 | */ 65 | #error Platform not supported 66 | 67 | #endif /* __GNUC__ */ 68 | #endif /* _MSC_VER */ 69 | 70 | #endif /* -- LZSSE2_PLATFORM_H__ */ 71 | -------------------------------------------------------------------------------- /lzsse4/lzsse4.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "lzsse4_platform.h" 32 | #include "lzsse4.h" 33 | 34 | #pragma warning ( disable : 4127 ) 35 | 36 | namespace 37 | { 38 | // Constants - most of these should not be changed without corresponding code changes because it will break many things in unpredictable ways. 
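    // For orientation (a summary implied by the values below, not a separate specification):
    // offsets are 16 bits wide, which gives the 64KiB sliding window; each 16 byte control
    // block packs 32 four-bit controls, two per byte. In LZSSE4 a control of 0..3 encodes a
    // literal run of 1..4 bytes, 4..15 encodes a match of that many bytes, and a value of 15
    // additionally marks the next control as an extension of the same match.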
39 | const uint32_t WINDOW_BITS = 16; 40 | const uint32_t MIN_MATCH_LENGTH = 4; 41 | const uint32_t LZ_WINDOW_SIZE = 1 << WINDOW_BITS; 42 | const uint32_t LZ_WINDOW_MASK = LZ_WINDOW_SIZE - 1; 43 | const uint32_t FAST_HASH_BITS = 20; // You can change this - more bits = more matches, less bits = more cache hits 44 | const uint32_t FAST_BUCKETS_COUNT = 1 << FAST_HASH_BITS; 45 | const uint32_t FAST_HASH_MASK = FAST_BUCKETS_COUNT - 1; 46 | const uint32_t MIN_COMPRESSION_SIZE = 32; 47 | const uint32_t END_PADDING_LITERALS = 16; 48 | const int32_t NO_MATCH = -1; 49 | const int32_t EMPTY_NODE = -1; 50 | const uint32_t MIN_LITERAL_COUNT = 8; 51 | const uint32_t CONTROL_BITS = 4; 52 | const uint32_t LITERAL_BITS = 8; 53 | const uint32_t OFFSET_BITS = 16; 54 | const uint32_t BASE_MATCH_BITS = OFFSET_BITS + CONTROL_BITS; 55 | const uint32_t OFFSET_SIZE = 2; 56 | const uint32_t EXTENDED_MATCH_BOUND = ( 1 << CONTROL_BITS ) - 1; 57 | const uint32_t CONTROL_BLOCK_SIZE = sizeof( __m128i ); 58 | const uint32_t CONTROLS_PER_BLOCK = 32; 59 | const uint32_t LITERALS_PER_CONTROL = 4; 60 | const uint32_t MAX_INPUT_PER_CONTROL = 4; 61 | const size_t OUTPUT_BUFFER_SAFE = EXTENDED_MATCH_BOUND * CONTROLS_PER_BLOCK; 62 | const size_t INPUT_BUFFER_SAFE = MAX_INPUT_PER_CONTROL * CONTROLS_PER_BLOCK; 63 | const uint16_t INITIAL_OFFSET = MIN_MATCH_LENGTH; 64 | const uint32_t OPTIMAL_HASH_BITS = 20; 65 | const uint32_t OPTIMAL_BUCKETS_COUNT = 1 << OPTIMAL_HASH_BITS; 66 | const uint32_t OPTIMAL_HASH_MASK = OPTIMAL_BUCKETS_COUNT - 1; 67 | const size_t SKIP_MATCH_LENGTH = 128; 68 | const uint32_t NO_SKIP_LEVEL = 17; 69 | } 70 | 71 | 72 | struct LZSSE4_FastParseState 73 | { 74 | int32_t buckets[ FAST_BUCKETS_COUNT ]; // stores the first matching position, we can then look at the rest of the matches by tracing through the window. 75 | }; 76 | 77 | 78 | LZSSE4_FastParseState* LZSSE4_MakeFastParseState() 79 | { 80 | return new LZSSE4_FastParseState(); 81 | } 82 | 83 | 84 | void LZSSE4_FreeFastParseState( LZSSE4_FastParseState* toFree ) 85 | { 86 | delete toFree; 87 | } 88 | 89 | 90 | inline void SetHash( LZSSE4_FastParseState* state, uint32_t hash, const uint8_t* input, const uint8_t* inputCursor ) 91 | { 92 | int32_t position = static_cast( inputCursor - input ); 93 | 94 | state->buckets[ hash & FAST_HASH_MASK ] = position; 95 | } 96 | 97 | // Simple fast hash function - actually what is used in snappy and derivatives 98 | // There's probably better, but I haven't spent time focusing on this area yet. 99 | inline uint32_t HashFast( const uint8_t* inputCursor ) 100 | { 101 | return *reinterpret_cast( inputCursor ) * 0x1e35a7bd >> ( 32 - FAST_HASH_BITS ); 102 | } 103 | 104 | size_t LZSSE4_CompressFast( LZSSE4_FastParseState* state, const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength ) 105 | { 106 | if ( outputLength < inputLength ) 107 | { 108 | // error case, output buffer not large enough. 
109 | return 0; 110 | } 111 | 112 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 113 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 114 | 115 | if ( inputLength < MIN_COMPRESSION_SIZE ) 116 | { 117 | memcpy( output, input, inputLength ); 118 | 119 | return inputLength; 120 | } 121 | 122 | const uint8_t* inputCursor = input; 123 | const uint8_t* inputEnd = input + inputLength; 124 | const uint8_t* inputEarlyEnd = inputEnd - END_PADDING_LITERALS; 125 | uint8_t* outputCursor = output; 126 | uint8_t* outputEarlyEnd = ( output + outputLength ) - END_PADDING_LITERALS; 127 | uint32_t hash = 0; 128 | 129 | // initialize hash to empty 130 | for ( int32_t* where = state->buckets, *end = state->buckets + FAST_BUCKETS_COUNT; where < end; where += 4 ) 131 | { 132 | where[ 0 ] = -1; 133 | where[ 1 ] = -1; 134 | where[ 2 ] = -1; 135 | where[ 3 ] = -1; 136 | } 137 | 138 | // initial literals that wont be compressed 139 | for ( uint32_t where = 0; where < MIN_MATCH_LENGTH; ++where ) 140 | { 141 | hash = HashFast( inputCursor ); 142 | 143 | SetHash( state, hash, input, inputCursor ); 144 | 145 | *( outputCursor++ ) = *( inputCursor++ ); 146 | } 147 | 148 | uint8_t* currentControlBlock = outputCursor; 149 | uint32_t currentControlCount = 0; 150 | uint16_t previousOffset = INITIAL_OFFSET; 151 | size_t literalsToFlush = 0; 152 | 153 | outputCursor += CONTROL_BLOCK_SIZE; 154 | 155 | bool lastControlIsNop = false; 156 | 157 | // Loop through the data until we hit the end of one of the buffers (minus the end padding literals) 158 | while ( inputCursor < inputEarlyEnd && outputCursor <= outputEarlyEnd ) 159 | { 160 | lastControlIsNop = false; 161 | 162 | hash = HashFast( inputCursor ); 163 | 164 | int matchPosition = state->buckets[ hash & FAST_HASH_MASK ]; 165 | int currentPosition = static_cast< int32_t >( inputCursor - input ); 166 | uint32_t matchLength = 0; 167 | uint16_t matchOffset = static_cast< uint16_t >( currentPosition - matchPosition ); 168 | 169 | // If we had a hit in the hash and it wasn't outside the window. 170 | if ( matchPosition >= 0 && ( currentPosition - matchPosition ) < ( LZ_WINDOW_SIZE - 1 ) ) 171 | { 172 | const uint8_t* matchCandidate = input + matchPosition; 173 | uint32_t lengthToEnd = static_cast< uint32_t >( inputEarlyEnd - inputCursor ); 174 | // Here we limit the hash length to prevent overlap matches with offset less than 16 bytes 175 | uint32_t maxLength = matchOffset <= ( EXTENDED_MATCH_BOUND + 1 ) && matchOffset < lengthToEnd ? matchOffset : lengthToEnd; 176 | 177 | // Find how long the match is 16 bytes at a time. 178 | while ( matchLength < maxLength ) 179 | { 180 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 181 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( matchCandidate + matchLength ) ); 182 | 183 | unsigned long matchBytes; 184 | 185 | // Finds the number of equal bytes at the start of the 16 186 | _BitScanForward( &matchBytes, ( static_cast< unsigned long >( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 187 | 188 | matchLength += matchBytes; 189 | 190 | if ( matchBytes != sizeof( __m128i ) ) 191 | { 192 | break; 193 | } 194 | } 195 | 196 | matchLength = matchLength < maxLength ? matchLength : maxLength; 197 | } 198 | 199 | // If we have at least the minimum match length (4 bytes) 200 | if ( matchLength >= MIN_MATCH_LENGTH ) 201 | { 202 | // Do we have literals to flush before the match? 
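            // (Note on the literal encoding used in the flush below: each literal byte is stored
            // xor'd with the byte previousOffset back in the window. The decoder unconditionally
            // loads "match" data from outputCursor - offset and xors it with the stream bytes, so
            // the two xors cancel for literals and the same branchless path serves both literals
            // and matches.)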
203 | if ( literalsToFlush > 0 ) 204 | { 205 | // Start a new control block if we need one. 206 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 207 | { 208 | currentControlBlock = outputCursor; 209 | outputCursor += CONTROL_BLOCK_SIZE; 210 | 211 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 212 | 213 | currentControlCount = 0; 214 | 215 | // Would be larger than compressed size, get out! 216 | if ( outputCursor > outputEarlyEnd ) 217 | { 218 | break; 219 | } 220 | } 221 | 222 | currentControlBlock[ currentControlCount >> 1 ] = 223 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( static_cast( literalsToFlush - 1 ) << 4 ); 224 | 225 | // flush the literals. 226 | // note the xor against the data that would be read in the match. 227 | for ( uint32_t where = 0; where < literalsToFlush; ++where ) 228 | { 229 | const uint8_t* currentInput = inputCursor - ( literalsToFlush - where ); 230 | 231 | *( outputCursor++ ) = *currentInput ^ *( currentInput - previousOffset ); 232 | } 233 | 234 | ++currentControlCount; 235 | 236 | literalsToFlush = 0; 237 | 238 | // Would be larger than compressed size, get out! 239 | if ( outputCursor > outputEarlyEnd ) 240 | { 241 | break; 242 | } 243 | } 244 | 245 | // Start a new control block if the previous one is full. 246 | // Note this is done before the offset is written out - the offset 247 | // is always written after the control block containing the first 248 | // control in the match. 249 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 250 | { 251 | currentControlBlock = outputCursor; 252 | outputCursor += CONTROL_BLOCK_SIZE; 253 | 254 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 255 | 256 | currentControlCount = 0; 257 | 258 | if ( outputCursor > outputEarlyEnd ) 259 | { 260 | break; 261 | } 262 | } 263 | 264 | // The match length value we are encoding. 265 | size_t toEncode = matchLength; 266 | 267 | // Write the offset out - note the xor with the previous offset. 268 | *reinterpret_cast< uint16_t* >( outputCursor ) = matchOffset ^ previousOffset; 269 | 270 | previousOffset = matchOffset; 271 | outputCursor += sizeof( uint16_t ); 272 | 273 | for ( ;; ) 274 | { 275 | // Check if we need to start a new control block 276 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 277 | { 278 | currentControlBlock = outputCursor; 279 | outputCursor += CONTROL_BLOCK_SIZE; 280 | 281 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 282 | 283 | currentControlCount = 0; 284 | 285 | if ( outputCursor > outputEarlyEnd ) 286 | { 287 | break; 288 | } 289 | } 290 | 291 | // If the encode size is greater than we can hold in a control, write out a full match length 292 | // control, subtract full control value from the amount to encode and loop around again. 293 | if ( toEncode >= EXTENDED_MATCH_BOUND ) 294 | { 295 | currentControlBlock[ currentControlCount >> 1 ] = 296 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( static_cast( EXTENDED_MATCH_BOUND ) << 4 ); 297 | 298 | toEncode -= EXTENDED_MATCH_BOUND; 299 | 300 | ++currentControlCount; 301 | } 302 | else // Write out the remaining match length control. Could potentially be zero. 
303 | { 304 | currentControlBlock[ currentControlCount >> 1 ] = 305 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( static_cast( toEncode ) << 4 ); 306 | 307 | if ( currentControlCount == 0 && toEncode == 0 ) 308 | { 309 | lastControlIsNop = true; 310 | } 311 | 312 | ++currentControlCount; 313 | 314 | break; 315 | } 316 | } 317 | 318 | // Update the value into the hash for future matches. 319 | SetHash( state, hash, input, inputCursor ); 320 | 321 | ++inputCursor; 322 | 323 | // Hash all the other values in the match too. 324 | for ( const uint8_t* nextArrival = inputCursor + matchLength - 1; inputCursor < nextArrival; ++inputCursor ) 325 | { 326 | hash = HashFast( inputCursor ); 327 | SetHash( state, hash, input, inputCursor ); 328 | } 329 | } 330 | else 331 | { 332 | // One more literal to write out. 333 | ++literalsToFlush; 334 | 335 | // If we have reached the maximum number of literals allowed in the control, flush them out. 336 | if ( literalsToFlush == LITERALS_PER_CONTROL ) 337 | { 338 | // Check if the control block is full and we need start a new one. 339 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 340 | { 341 | currentControlBlock = outputCursor; 342 | outputCursor += CONTROL_BLOCK_SIZE; 343 | 344 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 345 | 346 | currentControlCount = 0; 347 | 348 | if ( outputCursor > outputEarlyEnd ) 349 | { 350 | break; 351 | } 352 | } 353 | 354 | currentControlBlock[ currentControlCount >> 1 ] = 355 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( ( static_cast( LITERALS_PER_CONTROL - 1 ) ) << 4 ); 356 | 357 | ++currentControlCount; 358 | 359 | *reinterpret_cast< uint32_t* >( outputCursor ) = 360 | *reinterpret_cast< const uint32_t* >( inputCursor - 3 ) ^ 361 | *reinterpret_cast< const uint32_t* >( ( inputCursor - 3 ) - previousOffset ); 362 | 363 | outputCursor += 4; 364 | 365 | //*( outputCursor++ ) = *( inputCursor - 3 ) ^ *( ( inputCursor - 3 ) - previousOffset ); 366 | //*( outputCursor++ ) = *( inputCursor - 2 ) ^ *( ( inputCursor - 2 ) - previousOffset ); 367 | //*( outputCursor++ ) = *( inputCursor - 1 ) ^ *( ( inputCursor - 1 ) - previousOffset ); 368 | //*( outputCursor++ ) = *inputCursor ^ *( inputCursor - previousOffset ); 369 | 370 | if ( outputCursor > outputEarlyEnd ) 371 | { 372 | break; 373 | } 374 | 375 | literalsToFlush = 0; 376 | } 377 | 378 | // Update the hash with this byte 379 | SetHash( state, hash, input, inputCursor ); 380 | 381 | ++inputCursor; 382 | } 383 | } 384 | 385 | // If we would create a compression output bigger than or equal to the input, just copy the input to the output and return equal size. 386 | if ( ( ( outputCursor + literalsToFlush + ( currentControlCount == CONTROLS_PER_BLOCK ? CONTROL_BLOCK_SIZE : 0 ) ) ) >= output + inputLength - END_PADDING_LITERALS ) 387 | { 388 | memcpy( output, input, inputLength ); 389 | 390 | outputCursor = output + inputLength; 391 | } 392 | else 393 | { 394 | // Flush any remaining literals. 
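        // (Packing reminder: each control block byte holds two 4-bit controls. The writes here
        // shift the byte's previous contents into the low nibble and put the new control in the
        // high nibble, so the earlier of a pair ends up in the low nibble - the order the decoder
        // consumes them. The fix-up just below handles a stream ending on an odd control count,
        // where the last control still needs shifting down into the low nibble.)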
395 | if ( literalsToFlush > 0 ) 396 | { 397 | lastControlIsNop = false; 398 | 399 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 400 | { 401 | currentControlBlock = outputCursor; 402 | outputCursor += CONTROL_BLOCK_SIZE; 403 | 404 | _mm_storeu_si128( reinterpret_cast< __m128i* >( outputCursor ), _mm_setzero_si128() ); 405 | 406 | currentControlCount = 0; 407 | } 408 | 409 | currentControlBlock[ currentControlCount >> 1 ] = 410 | ( currentControlBlock[ currentControlCount >> 1 ] >> 4 ) | ( static_cast( literalsToFlush - 1 ) << 4 ); 411 | 412 | for ( uint32_t where = 0; where < literalsToFlush; ++where ) 413 | { 414 | const uint8_t* currentInput = inputCursor - ( literalsToFlush - where ); 415 | 416 | *( outputCursor++ ) = *currentInput ^ *( currentInput - previousOffset ); 417 | } 418 | 419 | ++currentControlCount; 420 | } 421 | 422 | // Need to finish off shifting the final control block into the low nibble if there is no second nibble 423 | if ( ( currentControlCount & 1 ) > 0 ) 424 | { 425 | currentControlBlock[ currentControlCount >> 1 ] >>= 4; 426 | } 427 | 428 | if ( lastControlIsNop ) 429 | { 430 | outputCursor -= CONTROL_BLOCK_SIZE; 431 | } 432 | 433 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 434 | 435 | // copy remaining literals 436 | memcpy( outputCursor, inputCursor, remainingLiterals ); 437 | 438 | outputCursor += remainingLiterals; 439 | } 440 | 441 | // Return the size of the compressed data. 442 | return outputCursor - output; 443 | } 444 | 445 | 446 | struct Arrival 447 | { 448 | size_t cost; 449 | int32_t from; 450 | int32_t to; 451 | uint16_t offset; 452 | }; 453 | 454 | struct TreeNode 455 | { 456 | int32_t children[ 2 ]; 457 | }; 458 | 459 | struct LZSSE4_OptimalParseState 460 | { 461 | // Note, we should really replace this with a BST, hash chaining works but is *slooooooooooooooow* for optimal parse. 
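    // Concretely: roots[] holds one tree root per hash bucket, window[] holds the left/right
    // child links for the last LZ_WINDOW_SIZE positions (indexed by position & LZ_WINDOW_MASK),
    // and arrivals[] is the per-position cost/offset table the optimal parse fills in.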
462 | int32_t roots[ OPTIMAL_BUCKETS_COUNT ]; 463 | 464 | TreeNode window[ LZ_WINDOW_SIZE ]; 465 | 466 | Arrival* arrivals; 467 | 468 | size_t bufferSize; 469 | }; 470 | 471 | 472 | LZSSE4_OptimalParseState* LZSSE4_MakeOptimalParseState( size_t bufferSize ) 473 | { 474 | if ( bufferSize > 0 && ( SIZE_MAX / sizeof( Arrival ) ) < bufferSize ) 475 | { 476 | return nullptr; 477 | } 478 | 479 | LZSSE4_OptimalParseState* result = reinterpret_cast< LZSSE4_OptimalParseState* >( ::malloc( sizeof( LZSSE4_OptimalParseState ) ) ); 480 | 481 | result->bufferSize = bufferSize; 482 | 483 | if ( result != nullptr ) 484 | { 485 | result->arrivals = reinterpret_cast< Arrival* >( ::malloc( sizeof( Arrival ) * bufferSize ) ); 486 | 487 | if ( result->arrivals == nullptr ) 488 | { 489 | LZSSE4_FreeOptimalParseState( result ); 490 | 491 | result = nullptr; 492 | } 493 | } 494 | 495 | return result; 496 | } 497 | 498 | 499 | void LZSSE4_FreeOptimalParseState( LZSSE4_OptimalParseState* toFree ) 500 | { 501 | ::free( toFree->arrivals ); 502 | 503 | toFree->arrivals = nullptr; 504 | 505 | ::free( toFree ); 506 | } 507 | 508 | 509 | inline uint32_t HashOptimal( const uint8_t* inputCursor ) 510 | { 511 | return *reinterpret_cast( inputCursor ) * 0x1e35a7bd >> ( 32 - OPTIMAL_HASH_BITS ); 512 | } 513 | 514 | 515 | struct Match 516 | { 517 | size_t length; 518 | int32_t position; 519 | uint16_t offset; 520 | }; 521 | 522 | 523 | inline Match SearchAndUpdateFinder( LZSSE4_OptimalParseState& state, const uint8_t* input, const uint8_t* inputCursor, const uint8_t* inputEnd, uint32_t cutOff ) 524 | { 525 | Match result; 526 | 527 | int32_t position = static_cast( inputCursor - input ); 528 | 529 | result.position = NO_MATCH; 530 | result.length = MIN_MATCH_LENGTH; 531 | result.offset = 0; 532 | 533 | size_t lengthToEnd = inputEnd - inputCursor; 534 | int32_t lastPosition = position - ( LZ_WINDOW_SIZE - 1 ); 535 | uint32_t hash = HashOptimal( inputCursor ); 536 | 537 | lastPosition = lastPosition > 0 ? lastPosition : 0; 538 | 539 | int32_t treeCursor = state.roots[ hash ]; 540 | 541 | state.roots[ hash ] = position; 542 | 543 | int32_t* left = &state.window[ position & LZ_WINDOW_MASK ].children[ 1 ]; 544 | int32_t* right = &state.window[ position & LZ_WINDOW_MASK ].children[ 0 ]; 545 | size_t leftLength = 0; 546 | size_t rightLength = 0; 547 | 548 | for ( ;; ) 549 | { 550 | if ( cutOff-- == 0 || treeCursor < lastPosition ) 551 | { 552 | *left = *right = EMPTY_NODE; 553 | break; 554 | } 555 | 556 | TreeNode& currentNode = state.window[ treeCursor & LZ_WINDOW_MASK ]; 557 | const uint8_t* key = input + treeCursor; 558 | size_t matchLength = leftLength < rightLength ? leftLength : rightLength; 559 | 560 | uint16_t matchOffset = static_cast< uint16_t >( position - treeCursor ); 561 | size_t maxLength = matchOffset <= ( EXTENDED_MATCH_BOUND + 1 ) && matchOffset < lengthToEnd ? matchOffset : lengthToEnd; 562 | 563 | while ( matchLength < lengthToEnd ) 564 | { 565 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 566 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( key + matchLength ) ); 567 | 568 | unsigned long matchBytes; 569 | 570 | _BitScanForward( &matchBytes, ( static_cast( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 571 | 572 | matchLength += matchBytes; 573 | 574 | if ( matchBytes != 16 ) 575 | { 576 | break; 577 | } 578 | } 579 | 580 | matchLength = matchLength < lengthToEnd ? 
matchLength : lengthToEnd; 581 | 582 | size_t truncatedMatchLength = matchLength < maxLength ? matchLength : maxLength; 583 | 584 | if ( truncatedMatchLength >= result.length && matchOffset >= LITERALS_PER_CONTROL ) 585 | { 586 | result.length = truncatedMatchLength; 587 | result.offset = matchOffset; 588 | result.position = treeCursor; 589 | } 590 | 591 | if ( matchLength == lengthToEnd ) 592 | { 593 | *left = currentNode.children[ 1 ]; 594 | *right = currentNode.children[ 0 ]; 595 | break; 596 | } 597 | 598 | if ( inputCursor[ matchLength ] < key[ matchLength ] || ( matchLength == lengthToEnd ) ) 599 | { 600 | *left = treeCursor; 601 | left = currentNode.children; 602 | treeCursor = *left; 603 | leftLength = matchLength; 604 | } 605 | else 606 | { 607 | *right = treeCursor; 608 | right = currentNode.children + 1; 609 | treeCursor = *right; 610 | rightLength = matchLength; 611 | } 612 | } 613 | 614 | // Special RLE overlapping match case, the LzFind style match above doesn't work very well with our 615 | // restriction of overlapping matches having offsets of at least 16. 616 | // Suffix array seems like a better option to handling this. 617 | { 618 | // Note, we're detecting long RLE here, but if we have an offset too close, we'll sacrifice a fair 619 | // amount of decompression performance to load-hit-stores. 620 | int32_t matchPosition = position - ( sizeof( __m128i ) * 2 ); 621 | 622 | if ( matchPosition >= 0 ) 623 | { 624 | uint16_t matchOffset = static_cast( position - matchPosition ); 625 | const uint8_t* key = input + matchPosition; 626 | size_t matchLength = 0; 627 | 628 | while ( matchLength < lengthToEnd ) 629 | { 630 | __m128i input16 = _mm_loadu_si128( reinterpret_cast( inputCursor + matchLength ) ); 631 | __m128i match16 = _mm_loadu_si128( reinterpret_cast( key + matchLength ) ); 632 | 633 | unsigned long matchBytes; 634 | 635 | _BitScanForward( &matchBytes, ( static_cast( ~_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ) ) ) | 0x10000 ) ); 636 | 637 | matchLength += matchBytes; 638 | 639 | if ( matchBytes != 16 ) 640 | { 641 | break; 642 | } 643 | 644 | } 645 | 646 | matchLength = matchLength < lengthToEnd ? matchLength : lengthToEnd; 647 | 648 | if ( matchLength >= result.length ) 649 | { 650 | result.length = matchLength; 651 | result.offset = matchOffset; 652 | result.position = matchPosition; 653 | } 654 | } 655 | } 656 | 657 | return result; 658 | } 659 | 660 | 661 | size_t LZSSE4_CompressOptimalParse( LZSSE4_OptimalParseState* state, const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength, unsigned int level ) 662 | { 663 | if ( outputLength < inputLength || state->bufferSize < inputLength ) 664 | { 665 | // error case, output buffer not large enough. 
666 | return 0; 667 | } 668 | 669 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 670 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 671 | 672 | if ( inputLength < MIN_COMPRESSION_SIZE ) 673 | { 674 | memcpy( output, input, inputLength ); 675 | 676 | return inputLength; 677 | } 678 | 679 | const uint8_t* inputCursor = input; 680 | const uint8_t* inputEnd = input + inputLength; 681 | Arrival* arrivalWatermark = state->arrivals; 682 | Arrival* arrival = state->arrivals; 683 | uint32_t cutOff = 1 << level; 684 | 685 | for ( int32_t* rootCursor = state->roots, *end = rootCursor + OPTIMAL_BUCKETS_COUNT; rootCursor < end; rootCursor += 4 ) 686 | { 687 | rootCursor[ 0 ] = EMPTY_NODE; 688 | rootCursor[ 1 ] = EMPTY_NODE; 689 | rootCursor[ 2 ] = EMPTY_NODE; 690 | rootCursor[ 3 ] = EMPTY_NODE; 691 | } 692 | 693 | for ( uint32_t where = 0; where < MIN_MATCH_LENGTH; ++where ) 694 | { 695 | SearchAndUpdateFinder( *state, input, inputCursor, inputEnd - END_PADDING_LITERALS, cutOff ); 696 | 697 | ++inputCursor; 698 | } 699 | 700 | arrival->cost = LITERAL_BITS * LITERALS_PER_CONTROL; 701 | arrival->from = -1; 702 | arrival->offset = 0; 703 | 704 | // loop through each character and project forward the matches at that character to calculate the cheapest 705 | // path of arrival for each individual character. 706 | for ( const uint8_t* earlyEnd = inputEnd - END_PADDING_LITERALS; inputCursor < earlyEnd; ++inputCursor, ++arrival ) 707 | { 708 | uint32_t lengthToEnd = static_cast< uint32_t >( earlyEnd - inputCursor ); 709 | int32_t currentPosition = static_cast< int32_t >( inputCursor - input ); 710 | size_t literalsForward = LITERALS_PER_CONTROL < lengthToEnd ? LITERALS_PER_CONTROL : lengthToEnd; 711 | size_t arrivalCost = arrival->cost; 712 | 713 | // NOTE - we currently assume only 2 literals filled in here, because the minimum match length is 3. 714 | // If we wanted to go with a higher minimum match length, we would need to fill in more literals before hand. 715 | // Also, because there is a maximum of 2 literals per control block assumed. 716 | 717 | // project forward the cost of a single literal 718 | 719 | for ( size_t where = 1; where <= literalsForward; ++where ) 720 | { 721 | Arrival* literalArrival = arrival + where; 722 | size_t literalCost = arrivalCost + CONTROL_BITS + ( where * LITERAL_BITS ); 723 | 724 | if ( literalArrival > arrivalWatermark || literalArrival->cost > literalCost ) 725 | { 726 | literalArrival->cost = literalCost; 727 | literalArrival->from = currentPosition; 728 | literalArrival->offset = 0; 729 | 730 | arrivalWatermark = literalArrival > arrivalWatermark ? literalArrival : arrivalWatermark; 731 | } 732 | } 733 | 734 | Match match = SearchAndUpdateFinder( *state, input, inputCursor, earlyEnd, cutOff ); 735 | 736 | if ( match.position != NO_MATCH ) 737 | { 738 | for ( size_t matchedLength = MIN_MATCH_LENGTH, end = match.length + 1; matchedLength < end; ++matchedLength ) 739 | { 740 | Arrival* matchArrival = arrival + matchedLength; 741 | size_t matchCost = arrivalCost + BASE_MATCH_BITS; 742 | 743 | if ( matchedLength >= EXTENDED_MATCH_BOUND ) 744 | { 745 | matchCost += ( matchedLength / EXTENDED_MATCH_BOUND ) * CONTROL_BITS; 746 | } 747 | 748 | if ( matchArrival > arrivalWatermark || matchArrival->cost > matchCost ) 749 | { 750 | matchArrival->cost = matchCost; 751 | matchArrival->from = currentPosition; 752 | matchArrival->offset = match.offset; 753 | 754 | arrivalWatermark = matchArrival > arrivalWatermark ? 
matchArrival : arrivalWatermark; 755 | } 756 | } 757 | 758 | if ( match.length > SKIP_MATCH_LENGTH && level < NO_SKIP_LEVEL ) 759 | { 760 | arrival += match.length - LITERALS_PER_CONTROL; 761 | inputCursor += match.length - LITERALS_PER_CONTROL; 762 | } 763 | } 764 | } 765 | 766 | // If this would cost more to encode than it would if it were just literals, encode it with no control blocks, 767 | // just literals 768 | if ( ( arrivalWatermark->cost + END_PADDING_LITERALS * LITERAL_BITS + CONTROLS_PER_BLOCK * CONTROL_BITS ) > ( inputLength * LITERAL_BITS ) ) 769 | { 770 | memcpy( output, input, inputLength ); 771 | 772 | return inputLength; 773 | } 774 | 775 | Arrival* previousPathNode; 776 | 777 | // now trace the actual optimal parse path back, connecting the nodes in the other direction. 778 | for ( const Arrival* pathNode = arrivalWatermark; pathNode->from > 0; pathNode = previousPathNode ) 779 | { 780 | previousPathNode = state->arrivals + ( pathNode->from - LITERALS_PER_CONTROL ); 781 | 782 | previousPathNode->to = static_cast( ( pathNode - state->arrivals ) + LITERALS_PER_CONTROL ); 783 | } 784 | 785 | uint8_t* outputCursor = output; 786 | 787 | memcpy( outputCursor, input, MIN_MATCH_LENGTH ); 788 | 789 | outputCursor += MIN_MATCH_LENGTH; 790 | 791 | uint8_t* currentControlBlock = outputCursor; 792 | uint32_t currentControlCount = 0; 793 | uint32_t totalControlCount = 0; 794 | 795 | outputCursor += CONTROL_BLOCK_SIZE; 796 | 797 | Arrival* nextPathNode; 798 | 799 | size_t totalPathLength = MIN_MATCH_LENGTH; 800 | uint16_t previousOffset = INITIAL_OFFSET; 801 | 802 | bool lastControlIsNop = false; 803 | 804 | // Now walk forwards again and actually write out the data. 805 | for ( const Arrival* pathNode = state->arrivals; pathNode < arrivalWatermark; pathNode = nextPathNode ) 806 | { 807 | int32_t currentPosition = static_cast< int32_t >( ( pathNode - state->arrivals ) + LITERALS_PER_CONTROL ); 808 | 809 | nextPathNode = state->arrivals + ( pathNode->to - LITERALS_PER_CONTROL ); 810 | 811 | size_t pathDistance = nextPathNode - pathNode; 812 | 813 | totalPathLength += pathDistance; 814 | 815 | lastControlIsNop = false; 816 | 817 | if ( nextPathNode->offset == 0 ) 818 | { 819 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 820 | { 821 | currentControlBlock = outputCursor; 822 | outputCursor += CONTROL_BLOCK_SIZE; 823 | currentControlCount = 0; 824 | } 825 | 826 | if ( ( currentControlCount & 1 ) == 0 ) 827 | { 828 | currentControlBlock[ currentControlCount >> 1 ] = 829 | ( static_cast( pathDistance ) - 1 ); 830 | } 831 | else 832 | { 833 | currentControlBlock[ currentControlCount >> 1 ] |= 834 | ( static_cast< uint8_t >( pathDistance ) - 1 ) << CONTROL_BITS; 835 | } 836 | 837 | // output the literals. 838 | for ( int32_t where = 0; where < pathDistance; ++where ) 839 | { 840 | const uint8_t* currentInput = input + currentPosition + where; 841 | 842 | outputCursor[ where ] = *currentInput ^ *( currentInput - previousOffset ); 843 | } 844 | 845 | outputCursor += pathDistance; 846 | 847 | ++totalControlCount; 848 | ++currentControlCount; 849 | } 850 | else 851 | { 852 | // Check if we need to start a new control block 853 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 854 | { 855 | currentControlBlock = outputCursor; 856 | outputCursor += CONTROL_BLOCK_SIZE; 857 | 858 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), _mm_setzero_si128() ); 859 | 860 | currentControlCount = 0; 861 | } 862 | 863 | // Write the offset out - note the xor with the previous offset. 
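                // (Offsets are delta coded by xor: the stream stores newOffset ^ previousOffset, and
                // the decoder xors the 16 bit word it reads (masked to zero when no offset follows)
                // into its running offset. Controls that reuse the previous offset contribute
                // nothing, so "keep the old offset" and "load a new one" share one branchless path.)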
864 | *reinterpret_cast< uint16_t* >( outputCursor ) = nextPathNode->offset ^ previousOffset; 865 | 866 | previousOffset = nextPathNode->offset; 867 | outputCursor += sizeof( uint16_t ); 868 | 869 | if ( pathDistance < EXTENDED_MATCH_BOUND ) 870 | { 871 | if ( ( currentControlCount & 1 ) == 0 ) 872 | { 873 | currentControlBlock[ currentControlCount >> 1 ] = 874 | static_cast( pathDistance ); 875 | } 876 | else 877 | { 878 | currentControlBlock[ currentControlCount >> 1 ] |= 879 | static_cast< uint8_t >( pathDistance ) << CONTROL_BITS; 880 | } 881 | 882 | ++currentControlCount; 883 | } 884 | else 885 | { 886 | if ( ( currentControlCount & 1 ) == 0 ) 887 | { 888 | currentControlBlock[ currentControlCount >> 1 ] = 889 | static_cast( EXTENDED_MATCH_BOUND ); 890 | } 891 | else 892 | { 893 | currentControlBlock[ currentControlCount >> 1 ] |= 894 | static_cast< uint8_t >( EXTENDED_MATCH_BOUND ) << CONTROL_BITS; 895 | } 896 | 897 | ++currentControlCount; 898 | 899 | size_t toEncode = pathDistance - EXTENDED_MATCH_BOUND; 900 | 901 | for ( ;; ) 902 | { 903 | // Check if we need to start a new control block 904 | if ( currentControlCount == CONTROLS_PER_BLOCK ) 905 | { 906 | currentControlBlock = outputCursor; 907 | outputCursor += CONTROL_BLOCK_SIZE; 908 | 909 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), _mm_setzero_si128() ); 910 | 911 | currentControlCount = 0; 912 | } 913 | 914 | // If the encode size is greater than we can hold in a control, write out a full match length 915 | // control, subtract full control value from the amount to encode and loop around again. 916 | if ( toEncode >= EXTENDED_MATCH_BOUND ) 917 | { 918 | if ( ( currentControlCount & 1 ) == 0 ) 919 | { 920 | currentControlBlock[ currentControlCount >> 1 ] = 921 | static_cast( EXTENDED_MATCH_BOUND ); 922 | } 923 | else 924 | { 925 | currentControlBlock[ currentControlCount >> 1 ] |= 926 | static_cast< uint8_t >( EXTENDED_MATCH_BOUND ) << CONTROL_BITS; 927 | } 928 | 929 | toEncode -= EXTENDED_MATCH_BOUND; 930 | 931 | ++currentControlCount; 932 | } 933 | else // Write out the remaining match length control. Could potentially be zero. 
934 | { 935 | if ( ( currentControlCount & 1 ) == 0 ) 936 | { 937 | currentControlBlock[ currentControlCount >> 1 ] = 938 | static_cast( toEncode ); 939 | } 940 | else 941 | { 942 | currentControlBlock[ currentControlCount >> 1 ] |= 943 | static_cast< uint8_t >( toEncode ) << CONTROL_BITS; 944 | } 945 | 946 | if ( toEncode == 0 && currentControlCount == 0 ) 947 | { 948 | lastControlIsNop = true; 949 | } 950 | 951 | ++currentControlCount; 952 | 953 | break; 954 | } 955 | } 956 | } 957 | } 958 | } 959 | 960 | if ( lastControlIsNop ) 961 | { 962 | outputCursor -= CONTROL_BLOCK_SIZE; 963 | } 964 | 965 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 966 | 967 | // copy remaining literals 968 | memcpy( outputCursor, inputCursor, remainingLiterals ); 969 | 970 | outputCursor += remainingLiterals; 971 | 972 | return outputCursor - output; 973 | } 974 | 975 | 976 | size_t LZSSE4_Decompress( const void* inputChar, size_t inputLength, void* outputChar, size_t outputLength ) 977 | { 978 | const uint8_t* input = reinterpret_cast< const uint8_t* >( inputChar ); 979 | uint8_t* output = reinterpret_cast< uint8_t* >( outputChar ); 980 | 981 | // Data was not compressible, just copy initial values 982 | if ( outputLength == inputLength ) 983 | { 984 | memcpy( output, input, outputLength ); 985 | 986 | return inputLength; 987 | } 988 | 989 | const uint8_t* inputCursor = input; 990 | uint8_t* outputCursor = output; 991 | 992 | // The offset starts off as the minimum match length. We actually need it least four 993 | // characters back because we need them to be set to xor out the literals from the match data. 994 | size_t offset = INITIAL_OFFSET; 995 | __m128i previousCarryHi = _mm_setzero_si128(); 996 | 997 | // Copy the initial literals to the output. 998 | for ( uint32_t where = 0; where < MIN_MATCH_LENGTH; ++where ) 999 | { 1000 | *( outputCursor++ ) = *( inputCursor++ ); 1001 | } 1002 | 1003 | // Let me be clear, I am usually anti-macro, but they work for this particular (very unusual) case. 1004 | // DECODE_STEP is a regular decoding step, DECODE_STEP_HALF and DECODE_STEP_END are because the compiler couldn't 1005 | // seem to remove some of the dead code where values were updated and then never used. 1006 | 1007 | // What these macros do: 1008 | // Decode a single literal run or match run for a single control nibble. 1009 | // How they do it: 1010 | // - Read the *unaligned* input (in the case of LZSSE-F - twice), it goes into both a regular variable and an SSE register, 1011 | // because it could either be literals or an offset (or nothing at all). The low byte of streamBytesRead controls how much we advance 1012 | // the input cursor. 1013 | // - Used a contived set of casts to sign extend the "read offset" control mask and then use it to mask the input word, 1014 | // which is then xor'd against the offset, for a "branchless" conditional move into the offset which 1015 | // has been carried over from the previous literal/match block. Note, this ends up doing better than a cmov on most 1016 | // modern processors. But we need to pre-xor the input offset. 1017 | // - We then load the match data from output buffer (offset back from the current output point). Unconditional load here. 1018 | // - We broadcast the "from literal" control mask from the current least significant byte of the SSE register using a shuffle epi-8 1019 | // - We mask the literals with that SSE register wide mask. 
1020 | // - The literals have been pre-xor'd with the data read in as match data, so we use an xor to branchlessly choose between the two. 1021 | // In this case, it ends up a better option than a blendv on most processors. 1022 | // - Store the block. We store all 16 bytes of the SSE register (due to some constraints in the format of the data, we won't 1023 | // go past the end of the buffer), but we may overlap this. 1024 | // - bytesOut controls how much we advance the output cursor. 1025 | // - We use 8 bit shifts to advance all the controls up to the next byte. There is some variable sized register trickery that 1026 | // x86/x64 is great for as long as we don't anger the register renamer. 1027 | 1028 | #define DECODE_STEP( HILO, CHECKMATCH, CHECKBUFFERS ) \ 1029 | { \ 1030 | size_t inputWord = *reinterpret_cast( inputCursor ); \ 1031 | __m128i literals = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); \ 1032 | \ 1033 | offset ^= static_cast( static_cast( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 1034 | \ 1035 | readOffsetHalf##HILO >>= 8; \ 1036 | \ 1037 | const uint8_t* matchPointer = reinterpret_cast( outputCursor - offset ); \ 1038 | \ 1039 | if ( CHECKMATCH && matchPointer < output ) \ 1040 | goto MATCH_UNDERFLOW; \ 1041 | \ 1042 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 1043 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 1044 | \ 1045 | literals = _mm_and_si128( literals, fromLiteral ); \ 1046 | \ 1047 | fromLiteral##HILO = _mm_srli_si128( fromLiteral##HILO, 1 ); \ 1048 | \ 1049 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 1050 | \ 1051 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 1052 | \ 1053 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 1054 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 1055 | \ 1056 | bytesOutHalf##HILO >>= 8; \ 1057 | streamBytesReadHalf##HILO >>= 8; \ 1058 | \ 1059 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 1060 | goto BUFFER_END; \ 1061 | } 1062 | 1063 | #define DECODE_STEP_HALF( HILO, CHECKMATCH, CHECKBUFFERS ) \ 1064 | { \ 1065 | size_t inputWord = *reinterpret_cast( inputCursor ); \ 1066 | __m128i literals = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); \ 1067 | \ 1068 | offset ^= static_cast( static_cast( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 1069 | \ 1070 | const uint8_t* matchPointer = reinterpret_cast( outputCursor - offset ); \ 1071 | \ 1072 | if ( CHECKMATCH && matchPointer < output ) \ 1073 | goto MATCH_UNDERFLOW; \ 1074 | \ 1075 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 1076 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 1077 | \ 1078 | literals = _mm_and_si128( literals, fromLiteral ); \ 1079 | \ 1080 | fromLiteral##HILO = _mm_srli_si128( fromLiteral##HILO, 1 ); \ 1081 | \ 1082 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 1083 | \ 1084 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 1085 | \ 1086 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 1087 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 1088 | \ 1089 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 1090 | goto BUFFER_END; \ 1091 | } 1092 | 1093 | #define DECODE_STEP_END( HILO, CHECKMATCH, CHECKBUFFERS ) \ 1094 | { \ 1095 | size_t 
inputWord = *reinterpret_cast( inputCursor ); \ 1096 | __m128i literals = _mm_loadu_si128( reinterpret_cast( inputCursor ) ); \ 1097 | \ 1098 | offset ^= static_cast( static_cast( static_cast( readOffsetHalf##HILO ) ) ) & inputWord; \ 1099 | \ 1100 | const uint8_t* matchPointer = reinterpret_cast( outputCursor - offset ); \ 1101 | \ 1102 | if ( CHECKMATCH && matchPointer < output ) \ 1103 | goto MATCH_UNDERFLOW; \ 1104 | \ 1105 | __m128i fromLiteral = _mm_shuffle_epi8( fromLiteral##HILO, _mm_setzero_si128() ); \ 1106 | __m128i matchData = _mm_loadu_si128( reinterpret_cast( matchPointer ) ); \ 1107 | \ 1108 | literals = _mm_and_si128( literals, fromLiteral ); \ 1109 | \ 1110 | __m128i toStore = _mm_xor_si128( matchData, literals ); \ 1111 | \ 1112 | _mm_storeu_si128( reinterpret_cast<__m128i*>( outputCursor ), toStore ); \ 1113 | \ 1114 | outputCursor += static_cast< uint8_t >( bytesOutHalf##HILO ); \ 1115 | inputCursor += static_cast< uint8_t >( streamBytesReadHalf##HILO ); \ 1116 | \ 1117 | if ( CHECKBUFFERS && ( outputCursor >= outputEarlyEnd || inputCursor > inputEarlyEnd ) ) \ 1118 | goto BUFFER_END; \ 1119 | } 1120 | 1121 | #define DECODE_STEP_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP( Lo, CHECKMATCH, CHECKBUFFERS ) 1122 | #define DECODE_STEP_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP( Hi, CHECKMATCH, CHECKBUFFERS ) 1123 | #define DECODE_STEP_HALF_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_HALF( Lo, CHECKMATCH, CHECKBUFFERS ) 1124 | #define DECODE_STEP_HALF_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_HALF( Hi, CHECKMATCH, CHECKBUFFERS ) 1125 | #define DECODE_STEP_END_LO(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_END( Lo, CHECKMATCH, CHECKBUFFERS ) 1126 | #define DECODE_STEP_END_HI(CHECKMATCH, CHECKBUFFERS ) DECODE_STEP_END( Hi, CHECKMATCH, CHECKBUFFERS ) 1127 | 1128 | __m128i nibbleMask = _mm_set1_epi8( 0xF ); 1129 | __m128i offsetSize = _mm_set1_epi8( OFFSET_SIZE ); 1130 | 1131 | // Note, we use this block here because it allows the "fake" inputEarlyEnd/outputEarlyEnd not to cause register spills 1132 | // in the decompression loops. And yes, that did actually happen. 1133 | { 1134 | 1135 | #pragma warning ( push ) 1136 | #pragma warning ( disable : 4101 ) 1137 | 1138 | // These variables are not actually ever used in this block, because we use 1139 | // a constant conditional expression to take out the branches that would hit them. 1140 | // But unfortunately, we need them to compile. 1141 | const uint8_t* inputEarlyEnd; 1142 | uint8_t* outputEarlyEnd; 1143 | 1144 | #pragma warning ( pop ) 1145 | 1146 | // "Safe" ends to the buffer, before the input/output cursors hit these, we can loop without overflow checks. 
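        // (The margins come straight from the constants at the top of the file: a control block is
        // 32 controls, each consuming at most MAX_INPUT_PER_CONTROL input bytes and emitting at most
        // EXTENDED_MATCH_BOUND output bytes, so stopping INPUT_BUFFER_SAFE / OUTPUT_BUFFER_SAFE short
        // of the real ends lets an entire unrolled block run with no per-step bounds checks.)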
1147 | const uint8_t* inputSafeEnd = ( input + inputLength ) - INPUT_BUFFER_SAFE; 1148 | uint8_t* outputSafeEnd = ( output + outputLength ) - OUTPUT_BUFFER_SAFE; 1149 | 1150 | // Decoding loop with offset output buffer underflow test, but no buffer overflow tests, assumed to end at a safe distance 1151 | // from overflows 1152 | while ( ( outputCursor - output ) < LZ_WINDOW_SIZE && outputCursor < outputSafeEnd && inputCursor < inputSafeEnd ) 1153 | { 1154 | // load the control block 1155 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast<const __m128i*>( inputCursor ) ); 1156 | 1157 | // split the control block into high and low nibbles 1158 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 1159 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 1160 | 1161 | // Note, the carries are set when the nibble is at its highest value, 15, meaning the operation after will 1162 | // be an extension of the current match operation. 1163 | 1164 | // Work out the carry for the low nibbles (which will be used with the high controls to put them into 1165 | // match without offset read mode). 1166 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 1167 | 1168 | // The carry for the high nibbles is used with the low controls, but needs one byte from the previous iteration. We save 1169 | // the calculated carry to use that byte next iteration. 1170 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 1171 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); 1172 | 1173 | previousCarryHi = carryHi; 1174 | 1175 | // We make the implicit assumption that the maximum number of literals per control here is twice the offset size (4 vs 2); 1176 | // we are doing this here to save keeping the value around (spilling or fetching it each time) 1177 | __m128i literalsPerControl = _mm_add_epi8( offsetSize, offsetSize ); 1178 | 1179 | // Here we are testing if the runs will be literals or matches. Note that if the carries are set from the previous operation 1180 | // this will essentially be ignored later on. 1181 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 1182 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 1183 | 1184 | // Here we're calculating the number of bytes that will be output - we are actually subtracting negative one from the control 1185 | // (handy trick where comparison result masks are negative one) if carry is not set and it is a literal. 1186 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_andnot_si128( shiftedCarryHi, isLiteralLo ) ); 1187 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_andnot_si128( carryLo, isLiteralHi ) ); 1188 | 1189 | // Calculate the number of bytes to read per control. 1190 | // In the case the carry is set, no bytes. Otherwise, the offset size (2 bytes) for matches or the number of output bytes for literals. 1191 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_blendv_epi8( offsetSize, bytesOutLo, isLiteralLo ) ); 1192 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_blendv_epi8( offsetSize, bytesOutHi, isLiteralHi ) ); 1193 | 1194 | // I want 128 set bits please.
1195 | __m128i allSet = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 1196 | 1197 | // Masks to read the offset (or keep the previous one) - set in the case that this is not a literal and the carry is not set 1198 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), allSet ); 1199 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), allSet ); 1200 | 1201 | // Masks whether we are reading literals - set if the carry is not set and these are literals. 1202 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 1203 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 1204 | 1205 | // Advance the input past the control block 1206 | inputCursor += CONTROL_BLOCK_SIZE; 1207 | 1208 | { 1209 | // Pull out the bottom halves of the SSE registers from before - we want these 1210 | // things in GPRs for the more linear logic. 1211 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutLo ) ); 1212 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutHi ) ); 1213 | 1214 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadLo ) ); 1215 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadHi ) ); 1216 | 1217 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetLo ) ); 1218 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetHi ) ); 1219 | 1220 | DECODE_STEP_LO( true, false ); 1221 | DECODE_STEP_HI( true, false ); 1222 | DECODE_STEP_LO( true, false ); 1223 | DECODE_STEP_HI( true, false ); 1224 | 1225 | DECODE_STEP_LO( true, false ); 1226 | DECODE_STEP_HI( true, false ); 1227 | DECODE_STEP_LO( true, false ); 1228 | DECODE_STEP_HI( true, false ); 1229 | 1230 | DECODE_STEP_LO( true, false ); 1231 | DECODE_STEP_HI( true, false ); 1232 | DECODE_STEP_LO( true, false ); 1233 | DECODE_STEP_HI( true, false ); 1234 | 1235 | DECODE_STEP_LO( true, false ); 1236 | DECODE_STEP_HI( true, false ); 1237 | DECODE_STEP_HALF_LO( true, false ); 1238 | DECODE_STEP_HALF_HI( true, false ); 1239 | } 1240 | 1241 | { 1242 | // Now the top halves. 1243 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_extract_epi64( bytesOutLo, 1 ) ); 1244 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_extract_epi64( bytesOutHi, 1 ) ); 1245 | 1246 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 1247 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 1248 | 1249 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_extract_epi64( readOffsetLo, 1 ) ); 1250 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_extract_epi64( readOffsetHi, 1 ) ); 1251 | 1252 | DECODE_STEP_LO( true, false ); 1253 | DECODE_STEP_HI( true, false ); 1254 | DECODE_STEP_LO( true, false ); 1255 | DECODE_STEP_HI( true, false ); 1256 | 1257 | DECODE_STEP_LO( true, false ); 1258 | DECODE_STEP_HI( true, false ); 1259 | DECODE_STEP_LO( true, false ); 1260 | DECODE_STEP_HI( true, false ); 1261 | 1262 | DECODE_STEP_LO( true, false ); 1263 | DECODE_STEP_HI( true, false ); 1264 | DECODE_STEP_LO( true, false ); 1265 | DECODE_STEP_HI( true, false ); 1266 | 1267 | DECODE_STEP_LO( true, false ); 1268 | DECODE_STEP_HI( true, false ); 1269 | DECODE_STEP_END_LO( true, false ); 1270 | DECODE_STEP_END_HI( true, false ); 1271 | } 1272 | } 1273 | 1274 | // Decoding loop with no buffer checks, but will end at a safe distance from the end of the buffers.
1275 | // Note, when we get here we have already reached the point in the output buffer which is *past* where we can underflow 1276 | // due to a bad match offset. 1277 | while ( outputCursor < outputSafeEnd && inputCursor < inputSafeEnd ) 1278 | { 1279 | // This code is the same as the loop above, see comments there 1280 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast<const __m128i*>( inputCursor ) ); 1281 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 1282 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 1283 | 1284 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 1285 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 1286 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); // where we take the carry from the previous hi values 1287 | 1288 | previousCarryHi = carryHi; 1289 | 1290 | __m128i literalsPerControl = _mm_add_epi8( offsetSize, offsetSize ); 1291 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 1292 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 1293 | 1294 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_andnot_si128( shiftedCarryHi, isLiteralLo ) ); 1295 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_andnot_si128( carryLo, isLiteralHi ) ); 1296 | 1297 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_blendv_epi8( offsetSize, bytesOutLo, isLiteralLo ) ); 1298 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_blendv_epi8( offsetSize, bytesOutHi, isLiteralHi ) ); 1299 | 1300 | __m128i neg1 = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 1301 | 1302 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), neg1 ); 1303 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), neg1 ); 1304 | 1305 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 1306 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 1307 | 1308 | inputCursor += CONTROL_BLOCK_SIZE; 1309 | 1310 | { 1311 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutLo ) ); 1312 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutHi ) ); 1313 | 1314 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadLo ) ); 1315 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadHi ) ); 1316 | 1317 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetLo ) ); 1318 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetHi ) ); 1319 | 1320 | DECODE_STEP_LO( false, false ); 1321 | DECODE_STEP_HI( false, false ); 1322 | DECODE_STEP_LO( false, false ); 1323 | DECODE_STEP_HI( false, false ); 1324 | 1325 | DECODE_STEP_LO( false, false ); 1326 | DECODE_STEP_HI( false, false ); 1327 | DECODE_STEP_LO( false, false ); 1328 | DECODE_STEP_HI( false, false ); 1329 | 1330 | DECODE_STEP_LO( false, false ); 1331 | DECODE_STEP_HI( false, false ); 1332 | DECODE_STEP_LO( false, false ); 1333 | DECODE_STEP_HI( false, false ); 1334 | 1335 | DECODE_STEP_LO( false, false ); 1336 | DECODE_STEP_HI( false, false ); 1337 | DECODE_STEP_HALF_LO( false, false ); 1338 | DECODE_STEP_HALF_HI( false, false ); 1339 | } 1340 | 1341 | { 1342 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_extract_epi64( bytesOutLo, 1 ) ); 1343 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_extract_epi64( bytesOutHi, 1 ) ); 1344 | 1345 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>(
_mm_extract_epi64( streamBytesReadLo, 1 ) ); 1346 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 1347 | 1348 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_extract_epi64( readOffsetLo, 1 ) ); 1349 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_extract_epi64( readOffsetHi, 1 ) ); 1350 | 1351 | DECODE_STEP_LO( false, false ); 1352 | DECODE_STEP_HI( false, false ); 1353 | DECODE_STEP_LO( false, false ); 1354 | DECODE_STEP_HI( false, false ); 1355 | 1356 | DECODE_STEP_LO( false, false ); 1357 | DECODE_STEP_HI( false, false ); 1358 | DECODE_STEP_LO( false, false ); 1359 | DECODE_STEP_HI( false, false ); 1360 | 1361 | DECODE_STEP_LO( false, false ); 1362 | DECODE_STEP_HI( false, false ); 1363 | DECODE_STEP_LO( false, false ); 1364 | DECODE_STEP_HI( false, false ); 1365 | 1366 | DECODE_STEP_LO( false, false ); 1367 | DECODE_STEP_HI( false, false ); 1368 | DECODE_STEP_END_LO( false, false ); 1369 | DECODE_STEP_END_HI( false, false ); 1370 | } 1371 | } 1372 | } 1373 | 1374 | // Decoding loop with all buffer checks. 1375 | { 1376 | const uint8_t* inputEarlyEnd; 1377 | uint8_t* outputEarlyEnd; 1378 | inputEarlyEnd = ( input + inputLength ) - END_PADDING_LITERALS; 1379 | outputEarlyEnd = ( output + outputLength ) - END_PADDING_LITERALS; 1380 | 1381 | while ( outputCursor < outputEarlyEnd && inputCursor < inputEarlyEnd ) 1382 | { 1383 | __m128i controlBlock = _mm_loadu_si128( reinterpret_cast<const __m128i*>( inputCursor ) ); 1384 | __m128i controlHi = _mm_and_si128( _mm_srli_epi32( controlBlock, CONTROL_BITS ), nibbleMask ); 1385 | __m128i controlLo = _mm_and_si128( controlBlock, nibbleMask ); 1386 | 1387 | __m128i carryLo = _mm_cmpeq_epi8( controlLo, nibbleMask ); 1388 | __m128i carryHi = _mm_cmpeq_epi8( controlHi, nibbleMask ); 1389 | __m128i shiftedCarryHi = _mm_alignr_epi8( carryHi, previousCarryHi, 15 ); 1390 | 1391 | previousCarryHi = carryHi; 1392 | 1393 | __m128i literalsPerControl = _mm_add_epi8( offsetSize, offsetSize ); 1394 | __m128i isLiteralHi = _mm_cmplt_epi8( controlHi, literalsPerControl ); 1395 | __m128i isLiteralLo = _mm_cmplt_epi8( controlLo, literalsPerControl ); 1396 | 1397 | __m128i bytesOutLo = _mm_sub_epi8( controlLo, _mm_andnot_si128( shiftedCarryHi, isLiteralLo ) ); 1398 | __m128i bytesOutHi = _mm_sub_epi8( controlHi, _mm_andnot_si128( carryLo, isLiteralHi ) ); 1399 | 1400 | __m128i streamBytesReadLo = _mm_andnot_si128( shiftedCarryHi, _mm_blendv_epi8( offsetSize, bytesOutLo, isLiteralLo ) ); 1401 | __m128i streamBytesReadHi = _mm_andnot_si128( carryLo, _mm_blendv_epi8( offsetSize, bytesOutHi, isLiteralHi ) ); 1402 | 1403 | __m128i neg1 = _mm_cmpeq_epi8( shiftedCarryHi, shiftedCarryHi ); 1404 | 1405 | __m128i readOffsetLo = _mm_xor_si128( _mm_or_si128( isLiteralLo, shiftedCarryHi ), neg1 ); 1406 | __m128i readOffsetHi = _mm_xor_si128( _mm_or_si128( isLiteralHi, carryLo ), neg1 ); 1407 | 1408 | __m128i fromLiteralLo = _mm_andnot_si128( shiftedCarryHi, isLiteralLo ); 1409 | __m128i fromLiteralHi = _mm_andnot_si128( carryLo, isLiteralHi ); 1410 | 1411 | inputCursor += CONTROL_BLOCK_SIZE; 1412 | 1413 | if ( inputCursor > inputEarlyEnd ) 1414 | goto BUFFER_END; 1415 | 1416 | { 1417 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutLo ) ); 1418 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( bytesOutHi ) ); 1419 | 1420 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadLo ) ); 1421 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( streamBytesReadHi ) ); 1422 | 1423 |
uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetLo ) ); 1424 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_cvtsi128_si64( readOffsetHi ) ); 1425 | 1426 | DECODE_STEP_LO( true, true ); 1427 | DECODE_STEP_HI( true, true ); 1428 | DECODE_STEP_LO( true, true ); 1429 | DECODE_STEP_HI( true, true ); 1430 | 1431 | DECODE_STEP_LO( true, true ); 1432 | DECODE_STEP_HI( true, true ); 1433 | DECODE_STEP_LO( true, true ); 1434 | DECODE_STEP_HI( true, true ); 1435 | 1436 | DECODE_STEP_LO( true, true ); 1437 | DECODE_STEP_HI( true, true ); 1438 | DECODE_STEP_LO( true, true ); 1439 | DECODE_STEP_HI( true, true ); 1440 | 1441 | DECODE_STEP_LO( true, true ); 1442 | DECODE_STEP_HI( true, true ); 1443 | DECODE_STEP_HALF_LO( true, true ); 1444 | DECODE_STEP_HALF_HI( true, true ); 1445 | } 1446 | 1447 | { 1448 | // Now the top halves. 1449 | uint64_t bytesOutHalfLo = static_cast<uint64_t>( _mm_extract_epi64( bytesOutLo, 1 ) ); 1450 | uint64_t bytesOutHalfHi = static_cast<uint64_t>( _mm_extract_epi64( bytesOutHi, 1 ) ); 1451 | 1452 | uint64_t streamBytesReadHalfLo = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadLo, 1 ) ); 1453 | uint64_t streamBytesReadHalfHi = static_cast<uint64_t>( _mm_extract_epi64( streamBytesReadHi, 1 ) ); 1454 | 1455 | uint64_t readOffsetHalfLo = static_cast<uint64_t>( _mm_extract_epi64( readOffsetLo, 1 ) ); 1456 | uint64_t readOffsetHalfHi = static_cast<uint64_t>( _mm_extract_epi64( readOffsetHi, 1 ) ); 1457 | 1458 | DECODE_STEP_LO( true, true ); 1459 | DECODE_STEP_HI( true, true ); 1460 | DECODE_STEP_LO( true, true ); 1461 | DECODE_STEP_HI( true, true ); 1462 | 1463 | DECODE_STEP_LO( true, true ); 1464 | DECODE_STEP_HI( true, true ); 1465 | DECODE_STEP_LO( true, true ); 1466 | DECODE_STEP_HI( true, true ); 1467 | 1468 | DECODE_STEP_LO( true, true ); 1469 | DECODE_STEP_HI( true, true ); 1470 | DECODE_STEP_LO( true, true ); 1471 | DECODE_STEP_HI( true, true ); 1472 | 1473 | DECODE_STEP_LO( true, true ); 1474 | DECODE_STEP_HI( true, true ); 1475 | DECODE_STEP_END_LO( true, true ); 1476 | DECODE_STEP_END_HI( true, true ); 1477 | } 1478 | } 1479 | 1480 | BUFFER_END: 1481 | 1482 | // When we get here, we have either advanced the right amount on both cursors 1483 | // or something bad happened, so leave it as is, so we can tell where 1484 | // the error happened. 1485 | if ( inputCursor == inputEarlyEnd && outputCursor == outputEarlyEnd ) 1486 | { 1487 | size_t remainingLiterals = ( input + inputLength ) - inputCursor; 1488 | 1489 | // copy any trailing literals 1490 | memcpy( outputCursor, inputCursor, remainingLiterals ); 1491 | 1492 | outputCursor += remainingLiterals; 1493 | } 1494 | } 1495 | 1496 | MATCH_UNDERFLOW: 1497 | 1498 | return outputCursor - output; 1499 | } 1500 | -------------------------------------------------------------------------------- /lzsse4/lzsse4.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution.
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE4_H__ 27 | #define LZSSE4_H__ 28 | 29 | #pragma once 30 | 31 | /* LZSSE4 - x64/SSE targeted codec for better performance with lower compression ratio data/less optimal compressors. 32 | * Supports minimum 4 byte matches, maximum 15 bytes of match per control word and 4 byte literal runs per control word. 33 | */ 34 | 35 | #ifdef __cplusplus 36 | extern "C" 37 | { 38 | #endif 39 | 40 | /* Re-usable parse state object for compression. */ 41 | typedef struct LZSSE4_FastParseState LZSSE4_FastParseState; 42 | 43 | /* Allocate the parse state for compression - returns null on failure */ 44 | LZSSE4_FastParseState* LZSSE4_MakeFastParseState(); 45 | 46 | /* De-allocate the parse state for compression */ 47 | void LZSSE4_FreeFastParseState( LZSSE4_FastParseState* toFree ); 48 | 49 | /* Re-usable parse state object for compression. */ 50 | typedef struct LZSSE4_OptimalParseState LZSSE4_OptimalParseState; 51 | 52 | /* Allocate the parse state for compression - returns null on failure. Note 53 | bufferSize has to be greater than or equal to any inputLength used with LZSSE4_CompressOptimalParse */ 54 | LZSSE4_OptimalParseState* LZSSE4_MakeOptimalParseState( size_t bufferSize ); 55 | 56 | /* De-allocate the parse state for compression */ 57 | void LZSSE4_FreeOptimalParseState( LZSSE4_OptimalParseState* toFree ); 58 | 59 | 60 | /* "Optimal" compression routine. 61 | * Will compress data into LZSSE4 format, uses hash BST matching to find matches and run an optimal parse (high relative memory usage). Requires SSE 4.1. 62 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 63 | * input : Buffer containing uncompressed data to be compressed. May not be null. 64 | * inputLength : Length of the uncompressed data in the input buffer - note, should be under 2GB. 65 | * output : Buffer that will receive the compressed output. 66 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 67 | * the compressed data should never be longer than inputLength, as in this case the data is stored raw. 68 | * level : The compression level to use for this file, 1->17, 17 delivers the highest compression, 1 delivers the least. 69 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 70 | * concurrently. 71 | * 72 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength).
73 | */ 74 | size_t LZSSE4_CompressOptimalParse( LZSSE4_OptimalParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int level ); 75 | 76 | 77 | /* "Fast" compression routine. 78 | * Will compress data into LZSSE4 format, uses a simple single entry hash/greedy matching to find matches. Requires SSE 4.1. 79 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 80 | * input : Buffer containing uncompressed data to be compressed. May not be null. 81 | * inputLength : Length of the uncompressed data in the input buffer - note, should be under 2GB. 82 | * output : Buffer that will receive the compressed output. 83 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 84 | * the compressed data should never be longer than inputLength, as in this case the data is stored raw. 85 | * 86 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 87 | * concurrently. 88 | * 89 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength). 90 | */ 91 | size_t LZSSE4_CompressFast( LZSSE4_FastParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength ); 92 | 93 | /* Decompression routine. 94 | * This routine will decompress data in the LZSSE4 format and currently requires SSE 4.1 and is targeted at x64. 95 | * It will perform poorly on x86 due to hunger for registers. 96 | * input : Buffer containing compressed input block. May not be null. 97 | * inputLength : Length of the compressed data in the input buffer - note, this should be under 2GB 98 | * output : Buffer that will receive the de-compressed output. Note that this needs to be at least outputLength long. 99 | * May not be null. 100 | * outputLength : The length of the decompressed output - note, this should be under 2GB 101 | * 102 | * Provided that input and output are valid pointers to buffers of at least their specified size, this routine 103 | * should be memory safe - both match pointer checks and input/output buffer checks exist. 104 | * 105 | * Returns the size of the decompressed data, which will be less than outputLength in the event of an error (number of bytes 106 | * will indicate where in the output stream the error occurred). 107 | * 108 | * Note that this data is not hash verified, errors that occur are either from a malformed stream or bad buffer sizes. 109 | * Remember, corrupt data can still be valid to decompress. 110 | */ 111 | size_t LZSSE4_Decompress( const void* input, size_t inputLength, void* output, size_t outputLength ); 112 | 113 | #ifdef __cplusplus 114 | } 115 | #endif 116 | 117 | #endif /* -- LZSSE4_H__ */ 118 | -------------------------------------------------------------------------------- /lzsse4/lzsse4_platform.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Brian Marshall 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2.
Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE4_PLATFORM_H__ 27 | #define LZSSE4_PLATFORM_H__ 28 | 29 | #pragma once 30 | 31 | /* 32 | Compiler/Platform detection based on the table from: 33 | https://blogs.msdn.microsoft.com/vcblog/2015/12/04/clang-with-microsoft-codegen-in-vs-2015-update-1/ 34 | */ 35 | 36 | #ifdef _MSC_VER 37 | 38 | /* 39 | Microsoft Visual Studio Support. 40 | C1xx/C2, Clang/C2 and Clang/LLVM all support the Microsoft header files and _BitScanForward 41 | 42 | Note: if you receive errors with the intrinsics make sure that you have SSE4.1 support enabled. 43 | For example with Clang include "-msse4.1" on the command line 44 | */ 45 | #include <intrin.h> 46 | 47 | #else /* _MSC_VER */ 48 | 49 | #ifdef __GNUC__ 50 | 51 | /* 52 | GCC 53 | */ 54 | 55 | #define SIMDE_ENABLE_NATIVE_ALIASES 56 | #include "../simde/x86/sse4.1.h" 57 | /* _BitScanForward is Visual Studio specific. */ 58 | #define _BitScanForward(x, m) *(x) = __builtin_ctz(m) 59 | 60 | #else 61 | 62 | /* 63 | If you hit the error below, then add detection for your compiler/platform to this header file. 64 | */ 65 | #error Platform not supported 66 | 67 | #endif /* __GNUC__ */ 68 | #endif /* _MSC_VER */ 69 | 70 | #endif /* -- LZSSE4_PLATFORM_H__ */ 71 | -------------------------------------------------------------------------------- /lzsse8/lzsse8.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Conor Stokes 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE8_H__ 27 | #define LZSSE8_H__ 28 | 29 | #pragma once 30 | 31 | /* LZSSE8 - x64/SSE targeted codec for better performance with lower compression ratio data/less optimal compressors. 32 | * Supports minimum 4 byte matches, maximum 15 bytes of match per control word and 8 byte literal runs per control word. 33 | */ 34 | 35 | #ifdef __cplusplus 36 | extern "C" 37 | { 38 | #endif 39 | 40 | /* Re-usable parse state object for compression. */ 41 | typedef struct LZSSE8_FastParseState LZSSE8_FastParseState; 42 | 43 | /* Allocate the parse state for compression - returns null on failure */ 44 | LZSSE8_FastParseState* LZSSE8_MakeFastParseState(); 45 | 46 | /* De-allocate the parse state for compression */ 47 | void LZSSE8_FreeFastParseState( LZSSE8_FastParseState* toFree ); 48 | 49 | /* Re-usable parse state object for compression. */ 50 | typedef struct LZSSE8_OptimalParseState LZSSE8_OptimalParseState; 51 | 52 | /* Allocate the parse state for compression - returns null on failure. Note 53 | bufferSize has to be greater than or equal to any inputLength used with LZSSE8_CompressOptimalParse */ 54 | LZSSE8_OptimalParseState* LZSSE8_MakeOptimalParseState( size_t bufferSize ); 55 | 56 | /* De-allocate the parse state for compression */ 57 | void LZSSE8_FreeOptimalParseState( LZSSE8_OptimalParseState* toFree ); 58 | 59 | 60 | /* "Optimal" compression routine. 61 | * Will compress data into LZSSE8 format, uses hash BST matching to find matches and run an optimal parse (high relative memory usage). Requires SSE 4.1. 62 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 63 | * input : Buffer containing uncompressed data to be compressed. May not be null. 64 | * inputLength : Length of the uncompressed data in the input buffer - note, should be under 2GB. 65 | * output : Buffer that will receive the compressed output. 66 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 67 | * the compressed data should never be longer than inputLength, as in this case the data is stored raw. 68 | * level : The compression level to use for this file, 1->17, 17 delivers the highest compression, 1 delivers the least. 69 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 70 | * concurrently. 71 | * 72 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength). 73 | */ 74 | size_t LZSSE8_CompressOptimalParse( LZSSE8_OptimalParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength, unsigned int level ); 75 | 76 | 77 | /* "Fast" compression routine. 78 | * Will compress data into LZSSE8 format, uses a simple single entry hash/greedy matching to find matches. Requires SSE 4.1.
79 | * state : Contains the hash table for matching, passed as a parameter so that allocations can be re-used. 80 | * input : Buffer containing uncompressed data to be compressed. May not be null. 81 | * inputLength : Length of the uncompressed data in the input buffer - note, should be under 2GB. 82 | * output : Buffer that will receive the compressed output. 83 | * outputLength : The length reserved in the buffer for compressed data. This should be at least inputLength. Note, 84 | * the compressed data should never be longer than inputLength, as in this case the data is stored raw. 85 | * 86 | * Thread Safety - state can not be used on multiple threads with calls running concurrently. Can run multiple threads with separate state 87 | * concurrently. 88 | * 89 | * Returns the size of the compressed data, or 0 in the case of error (e.g. outputLength is less than inputLength). 90 | */ 91 | size_t LZSSE8_CompressFast( LZSSE8_FastParseState* state, const void* input, size_t inputLength, void* output, size_t outputLength ); 92 | 93 | /* Decompression routine. 94 | * This routine will decompress data in the LZSSE8 format and currently requires SSE 4.1 and is targeted at x64. 95 | * It will perform poorly on x86 due to hunger for registers. 96 | * input : Buffer containing compressed input block. May not be null. 97 | * inputLength : Length of the compressed data in the input buffer - note, this should be under 2GB 98 | * output : Buffer that will receive the de-compressed output. Note that this needs to be at least outputLength long. 99 | * May not be null. 100 | * outputLength : The length of the decompressed output - note, this should be under 2GB 101 | * 102 | * Provided that input and output are valid pointers to buffers of at least their specified size, this routine 103 | * should be memory safe - both match pointer checks and input/output buffer checks exist. 104 | * 105 | * Returns the size of the decompressed data, which will be less than outputLength in the event of an error (number of bytes 106 | * will indicate where in the output stream the error occurred). 107 | * 108 | * Note that this data is not hash verified, errors that occur are either from a malformed stream or bad buffer sizes. 109 | * Remember, corrupt data can still be valid to decompress. 110 | */ 111 | size_t LZSSE8_Decompress( const void* input, size_t inputLength, void* output, size_t outputLength ); 112 | 113 | #ifdef __cplusplus 114 | } 115 | #endif 116 | 117 | #endif /* -- LZSSE8_H__ */ 118 | -------------------------------------------------------------------------------- /lzsse8/lzsse8_platform.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, Brian Marshall 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | #ifndef LZSSE8_PLATFORM_H__ 27 | #define LZSSE8_PLATFORM_H__ 28 | 29 | #pragma once 30 | 31 | /* 32 | Compiler/Platform detection based on the table from: 33 | https://blogs.msdn.microsoft.com/vcblog/2015/12/04/clang-with-microsoft-codegen-in-vs-2015-update-1/ 34 | */ 35 | 36 | #ifdef _MSC_VER 37 | 38 | /* 39 | Microsoft Visual Studio Support. 40 | C1xx/C2, Clang/C2 and Clang/LLVM all support the Microsoft header files and _BitScanForward 41 | 42 | Note: if you receive errors with the intrinsics make sure that you have SSE4.1 support enabled. 43 | For example with Clang include "-msse4.1" on the command line 44 | */ 45 | #include <intrin.h> 46 | 47 | #else /* _MSC_VER */ 48 | 49 | #ifdef __GNUC__ 50 | 51 | /* 52 | GCC 53 | */ 54 | 55 | #define SIMDE_ENABLE_NATIVE_ALIASES 56 | #include "../simde/x86/sse4.1.h" 57 | /* _BitScanForward is Visual Studio specific. */ 58 | #define _BitScanForward(x, m) *(x) = __builtin_ctz(m) 59 | 60 | #else 61 | 62 | /* 63 | If you hit the error below, then add detection for your compiler/platform to this header file. 64 | */ 65 | #error Platform not supported 66 | 67 | #endif /* __GNUC__ */ 68 | #endif /* _MSC_VER */ 69 | 70 | #endif /* -- LZSSE8_PLATFORM_H__ */ 71 | -------------------------------------------------------------------------------- /premake4.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nemequ/LZSSE-SIMDe/b4ba34fa8c9411b1a69ad34ace775817fd4ce03e/premake4.exe -------------------------------------------------------------------------------- /premake4.lua: -------------------------------------------------------------------------------- 1 | solution "lzsse" 2 | configurations { "Debug", "Release" } 3 | platforms { "x64" } 4 | includedirs { "include" } 5 | flags { "NoPCH" } 6 | location ( _ACTION ) 7 | configuration { "gmake" } 8 | buildoptions { "-std=c++11" } 9 | 10 | project "lzsse" 11 | language "C++" 12 | kind "ConsoleApp" 13 | files { "lzsse2/*.cpp", "lzsse2/*.c", "lzsse2/*.h", "lzsse4/*.cpp", "lzsse4/*.c", "lzsse4/*.h", "lzsse8/*.cpp", "lzsse8/*.c", "lzsse8/*.h", "example/*.cpp", "example/*.c", "example/*.h" } 14 | 15 | configuration "Debug*" 16 | flags { "Symbols" } 17 | 18 | configuration "Release*" 19 | flags { "OptimizeSpeed" } 20 | 21 | configuration { "x64", "Debug" } 22 | targetdir "bin/64/debug" 23 | 24 | configuration { "x64", "Release" } 25 | targetdir "bin/64/release" 26 | --------------------------------------------------------------------------------
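A minimal round-trip sketch against the LZSSE8 entry points declared in lzsse8/lzsse8.h above. The function names and signatures come from that header; the include path, buffer sizes, the compression level of 17, and the error handling are illustrative assumptions rather than the repository's own example program (example/main.cpp in the tree is the authoritative usage).

#include <cstdio>
#include <cstring>
#include <vector>
#include "lzsse8/lzsse8.h"   // assumed include path, relative to the repository root

int main()
{
    const char source[] = "an example buffer, an example buffer, an example buffer";
    size_t     inputLength = sizeof( source );

    // Per the header, outputLength should be at least inputLength; incompressible data is stored raw.
    std::vector< char > compressed( inputLength );
    std::vector< char > decompressed( inputLength );

    // The parse state is allocated once and re-used; bufferSize must cover any inputLength used with it.
    LZSSE8_OptimalParseState* state = LZSSE8_MakeOptimalParseState( inputLength );

    if ( state == nullptr )
        return 1;

    // Level 17 is documented as the highest compression level.
    size_t compressedSize = LZSSE8_CompressOptimalParse( state, source, inputLength, compressed.data(), compressed.size(), 17 );

    LZSSE8_FreeOptimalParseState( state );

    if ( compressedSize == 0 )
        return 1; // error, e.g. the output buffer was smaller than the input

    size_t decompressedSize = LZSSE8_Decompress( compressed.data(), compressedSize, decompressed.data(), inputLength );

    // A result shorter than the expected output length indicates where in the stream decoding failed.
    if ( decompressedSize != inputLength || std::memcmp( source, decompressed.data(), inputLength ) != 0 )
        return 1;

    std::printf( "round trip ok: %zu -> %zu -> %zu bytes\n", inputLength, compressedSize, decompressedSize );
    return 0;
}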