├── tools ├── msc_test.bat ├── afl-fuzz │ ├── compress │ │ ├── inputs │ │ │ └── 0 │ │ └── fuzz.c │ ├── decompress │ │ └── inputs │ │ │ └── 0 │ ├── Makefile │ └── prepare_for_fuzz.sh ├── windows_test.sh ├── mips_test.sh ├── arm_test.sh └── make-windows-releases ├── .gitignore ├── lib ├── lz_hash.h ├── x86_cpu_features.h ├── lz_extend.h ├── xpack_constants.h ├── xpack_common.h ├── x86_cpu_features.c ├── unaligned.h ├── xpack_common.c ├── hc_matchfinder.h ├── decompress_impl.h ├── xpack_decompress.c └── xpack_compress.c ├── programs ├── detect.sh ├── tgetopt.c ├── prog_util.h ├── benchmark.c ├── prog_util.c └── xpack.c ├── COPYING ├── Makefile.msc ├── common ├── compiler_msc.h ├── compiler_gcc.h └── common_defs.h ├── README.md ├── libxpack.h └── Makefile /tools/msc_test.bat: -------------------------------------------------------------------------------- 1 | nmake /f Makefile.msc clean 2 | nmake /f Makefile.msc 3 | copy /y *.exe j:\exe\ 4 | -------------------------------------------------------------------------------- /tools/afl-fuzz/compress/inputs/0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebiggers/xpack/HEAD/tools/afl-fuzz/compress/inputs/0 -------------------------------------------------------------------------------- /tools/afl-fuzz/decompress/inputs/0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebiggers/xpack/HEAD/tools/afl-fuzz/decompress/inputs/0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.dll 3 | *.exe 4 | *.exp 5 | *.lib 6 | *.o 7 | *.obj 8 | *.so 9 | /.lib-cflags 10 | /.prog-cflags 11 | /programs/config.h 12 | /benchmark 13 | /xpack 14 | /xunpack 15 | tags 16 | cscope* 17 | -------------------------------------------------------------------------------- 
/tools/windows_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | make -j CC=i686-w64-mingw32-gcc 6 | cp -vf *.exe /j/exe/ 7 | make -j CC=x86_64-w64-mingw32-gcc 8 | cp -vf *.exe /j/exe64/ 9 | 10 | sudo systemctl restart smbd 11 | -------------------------------------------------------------------------------- /tools/afl-fuzz/Makefile: -------------------------------------------------------------------------------- 1 | SRC := $(wildcard */*.c) 2 | EXE := $(SRC:.c=) 3 | 4 | CFLAGS := -O2 -s 5 | LDLIBS := -lxpack 6 | LDFLAGS := -L../.. 7 | CPPFLAGS := -I../.. 8 | 9 | all:$(EXE) 10 | 11 | clean: 12 | rm -f $(EXE) 13 | -------------------------------------------------------------------------------- /tools/afl-fuzz/prepare_for_fuzz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | AFL_HARDEN=1 make CC=afl-gcc -C ../../ 6 | 7 | make clean 8 | AFL_HARDEN=1 make CC=afl-gcc 9 | 10 | for dir in $(find . 
-mindepth 1 -maxdepth 1 -type d); do 11 | rm -rf /tmp/$dir 12 | cp -va $dir /tmp/$dir 13 | mkdir -p /tmp/$dir/outputs 14 | done 15 | -------------------------------------------------------------------------------- /tools/mips_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | TOOLCHAIN_DIR=$HOME/src/ddwrt-toolchains/toolchain-mips_34kc_gcc-5.1.0_musl-1.1.9 6 | 7 | make -j benchmark \ 8 | CC="$TOOLCHAIN_DIR/bin/mips-openwrt-linux-musl-gcc" \ 9 | CFLAGS="-DNEED_PRINTF" 10 | 11 | scp benchmark $HOME/data/test root@dd-wrt: 12 | ssh root@dd-wrt ./benchmark "$@" test 13 | -------------------------------------------------------------------------------- /tools/arm_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | NDKDIR=/opt/android-ndk 6 | 7 | make -j benchmark \ 8 | CC="$NDKDIR/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-gcc" \ 9 | CFLAGS="--sysroot=$NDKDIR/platforms/android-12/arch-arm -march=armv7-a -fPIC -pie -mfpu=neon -mfloat-abi=softfp" 10 | 11 | adb push benchmark /data/local/tmp 12 | adb push $HOME/data/testdata /data/local/tmp 13 | adb shell /data/local/tmp/benchmark "$@" /data/local/tmp/testdata 14 | -------------------------------------------------------------------------------- /tools/make-windows-releases: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eu 4 | 5 | for arch in 'i686' 'x86_64'; do 6 | make -j CC=${arch}-w64-mingw32-gcc CFLAGS="-Werror" 7 | dir=xpack-$(git describe --tags | tr -d v)-windows-${arch}-bin 8 | rm -rf $dir ${dir}.zip 9 | mkdir $dir 10 | cp libxpack.dll libxpack.lib libxpack.h *.exe $dir 11 | ${arch}-w64-mingw32-strip ${dir}/libxpack.dll ${dir}/*.exe 12 | for file in COPYING; do 13 | sed < $file > ${dir}/${file}.txt -e 's/$/\r/g' 14 | done 15 | for file in README.md; do 16 | sed < 
$file > ${dir}/${file} -e 's/$/\r/g' 17 | done 18 | (cd ${dir} && zip -r ../${dir}.zip .) 19 | done 20 | -------------------------------------------------------------------------------- /lib/lz_hash.h: -------------------------------------------------------------------------------- 1 | /* 2 | * lz_hash.h - hashing for Lempel-Ziv matchfinding 3 | */ 4 | 5 | #ifndef LIB_LZ_HASH_H 6 | #define LIB_LZ_HASH_H 7 | 8 | #include "common_defs.h" 9 | 10 | /* 11 | * The hash function: given a sequence prefix held in the low-order bits of a 12 | * 32-bit value, multiply by a carefully-chosen large constant. Discard any 13 | * bits of the product that don't fit in a 32-bit value, but take the 14 | * next-highest @num_bits bits of the product as the hash value, as those have 15 | * the most randomness. 16 | */ 17 | static forceinline u32 18 | lz_hash(u32 seq, unsigned num_bits) 19 | { 20 | return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); 21 | } 22 | 23 | #endif /* LIB_LZ_HASH_H */ 24 | -------------------------------------------------------------------------------- /programs/detect.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -z "$CC" ]; then 4 | CC=cc 5 | fi 6 | 7 | echo "/* THIS FILE WAS AUTOMATICALLY GENERATED. DO NOT EDIT. */" 8 | echo "#ifndef _CONFIG_H" 9 | echo "#define _CONFIG_H" 10 | 11 | tmpfile="$(mktemp -t xpack_config.XXXXXXXX)" 12 | trap "rm -f \"$tmpfile\"" EXIT 13 | 14 | check_function() { 15 | funcname="$1" 16 | macro="HAVE_$(echo $funcname | tr a-z A-Z)" 17 | echo "int main() { $funcname(); }" > "$tmpfile" 18 | echo 19 | echo "/* Is the $funcname() function available? 
*/" 20 | if $CC -x c $tmpfile -o /dev/null > /dev/null 2>&1; then 21 | echo "#define $macro 1" 22 | else 23 | echo "/* $macro is not set */" 24 | fi 25 | } 26 | 27 | check_function clock_gettime 28 | check_function futimens 29 | check_function futimes 30 | 31 | echo 32 | echo "#endif /* _CONFIG_H */" 33 | -------------------------------------------------------------------------------- /tools/afl-fuzz/compress/fuzz.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | int main(int argc, char **argv) 9 | { 10 | struct xpack_decompressor *d; 11 | struct xpack_compressor *c; 12 | int ret; 13 | int fd = open(argv[1], O_RDONLY); 14 | struct stat stbuf; 15 | assert(fd >= 0); 16 | ret = fstat(fd, &stbuf); 17 | assert(!ret); 18 | 19 | char in[stbuf.st_size]; 20 | ret = read(fd, in, sizeof in); 21 | assert(ret == sizeof in); 22 | 23 | c = xpack_alloc_compressor(stbuf.st_size, 6); 24 | d = xpack_alloc_decompressor(); 25 | 26 | char out[sizeof(in)]; 27 | char checkarray[sizeof(in)]; 28 | 29 | size_t csize = xpack_compress(c, in,sizeof in, out, sizeof out); 30 | if (csize) { 31 | enum decompress_result res; 32 | res = xpack_decompress(d, out, csize, checkarray, sizeof in, NULL); 33 | assert(!res); 34 | assert(!memcmp(in, checkarray, sizeof in)); 35 | } 36 | 37 | xpack_free_compressor(c); 38 | xpack_free_decompressor(d); 39 | return 0; 40 | } 41 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright 2016 Eric Biggers 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation files 5 | (the "Software"), to deal in the Software without restriction, 6 | including without limitation the rights to use, copy, modify, merge, 7 | publish, distribute, sublicense, 
and/or sell copies of the Software, 8 | and to permit persons to whom the Software is furnished to do so, 9 | subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 18 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 19 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile.msc: -------------------------------------------------------------------------------- 1 | # 2 | # Makefile for the Microsoft toolchain 3 | # 4 | # Usage: 5 | # nmake /f Makefile.msc 6 | # 7 | 8 | CC = cl 9 | LD = link 10 | AR = lib 11 | CFLAGS = /MD /O2 -I. 
-Icommon 12 | LDFLAGS = 13 | 14 | STATICLIB = libxpackstatic.lib 15 | SHAREDLIB = libxpack.dll 16 | IMPLIB = libxpack.lib 17 | 18 | LIB_OBJ = lib/x86_cpu_features.obj \ 19 | lib/xpack_compress.obj \ 20 | lib/xpack_decompress.obj \ 21 | lib/xpack_common.obj 22 | 23 | PROG_COMMON_OBJ = programs/prog_util.obj \ 24 | programs/tgetopt.obj \ 25 | $(STATICLIB) 26 | 27 | PROG_CFLAGS = $(CFLAGS) -Iprograms 28 | 29 | PROGRAMS = benchmark.exe xpack.exe xunpack.exe 30 | 31 | all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) $(PROGRAMS) 32 | 33 | .c.obj: 34 | $(CC) -c /Fo$@ $(CFLAGS) $** 35 | 36 | $(STATICLIB): $(LIB_OBJ) 37 | $(AR) $(ARFLAGS) -out:$@ $(LIB_OBJ) 38 | 39 | $(SHAREDLIB): $(LIB_OBJ) 40 | $(LD) $(LDFLAGS) -out:$@ -dll -implib:$(IMPLIB) $(LIB_OBJ) 41 | 42 | $(IMPLIB): $(SHAREDLIB) 43 | 44 | benchmark.exe:programs/benchmark.obj $(PROG_COMMON_OBJ) 45 | $(LD) $(LDFLAGS) -out:$@ $** 46 | 47 | xpack.exe:programs/xpack.obj $(PROG_COMMON_OBJ) 48 | $(LD) $(LDFLAGS) -out:$@ $** 49 | 50 | xunpack.exe:xpack.exe 51 | copy $** $@ 52 | 53 | clean: 54 | -del *.dll *.exe *.exp *.lib lib\*.obj programs\*.obj 2>nul 55 | -------------------------------------------------------------------------------- /lib/x86_cpu_features.h: -------------------------------------------------------------------------------- 1 | /* 2 | * x86_cpu_features.h - feature detection for x86 processors 3 | */ 4 | 5 | #ifndef LIB_X86_CPU_FEATURES_H 6 | #define LIB_X86_CPU_FEATURES_H 7 | 8 | #include "common_defs.h" 9 | 10 | #if defined(__x86_64__) && COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 11 | # define X86_CPU_FEATURES_ENABLED 1 12 | #else 13 | # define X86_CPU_FEATURES_ENABLED 0 14 | #endif 15 | 16 | #if X86_CPU_FEATURES_ENABLED 17 | 18 | #define X86_CPU_FEATURE_SSE 0x00000001 19 | #define X86_CPU_FEATURE_SSE2 0x00000002 20 | #define X86_CPU_FEATURE_SSE3 0x00000004 21 | #define X86_CPU_FEATURE_SSSE3 0x00000008 22 | #define X86_CPU_FEATURE_SSE4_1 0x00000010 23 | #define X86_CPU_FEATURE_SSE4_2 0x00000020 24 | #define 
X86_CPU_FEATURE_AVX 0x00000040 25 | #define X86_CPU_FEATURE_BMI 0x00000080 26 | #define X86_CPU_FEATURE_AVX2 0x00000100 27 | #define X86_CPU_FEATURE_BMI2 0x00000200 28 | 29 | #define X86_CPU_FEATURES_KNOWN 0x80000000 30 | 31 | extern u32 _x86_cpu_features; 32 | 33 | extern void 34 | x86_setup_cpu_features(void); 35 | 36 | /* Does the processor have the specified feature? */ 37 | static forceinline bool 38 | x86_have_cpu_feature(u32 feature) 39 | { 40 | if (_x86_cpu_features == 0) 41 | x86_setup_cpu_features(); 42 | return _x86_cpu_features & feature; 43 | } 44 | 45 | #endif /* X86_CPU_FEATURES_ENABLED */ 46 | 47 | #endif /* LIB_X86_CPU_FEATURES_H */ 48 | -------------------------------------------------------------------------------- /lib/lz_extend.h: -------------------------------------------------------------------------------- 1 | /* 2 | * lz_extend.h - fast match extension for Lempel-Ziv matchfinding 3 | */ 4 | 5 | #ifndef LIB_LZ_EXTEND_H 6 | #define LIB_LZ_EXTEND_H 7 | 8 | #include "unaligned.h" 9 | 10 | /* 11 | * Return the number of bytes at @matchptr that match the bytes at @strptr, up 12 | * to a maximum of @max_len. Initially, @start_len bytes are matched. 
13 | */ 14 | static forceinline u32 15 | lz_extend(const u8 * const strptr, const u8 * const matchptr, 16 | const u32 start_len, const u32 max_len) 17 | { 18 | u32 len = start_len; 19 | machine_word_t v_word; 20 | 21 | if (UNALIGNED_ACCESS_IS_FAST) { 22 | 23 | if (likely(max_len - len >= 4 * WORDBYTES)) { 24 | 25 | #define COMPARE_WORD_STEP \ 26 | v_word = load_word_unaligned(&matchptr[len]) ^ \ 27 | load_word_unaligned(&strptr[len]); \ 28 | if (v_word != 0) \ 29 | goto word_differs; \ 30 | len += WORDBYTES; \ 31 | 32 | COMPARE_WORD_STEP 33 | COMPARE_WORD_STEP 34 | COMPARE_WORD_STEP 35 | COMPARE_WORD_STEP 36 | #undef COMPARE_WORD_STEP 37 | } 38 | 39 | while (len + WORDBYTES <= max_len) { 40 | v_word = load_word_unaligned(&matchptr[len]) ^ 41 | load_word_unaligned(&strptr[len]); 42 | if (v_word != 0) 43 | goto word_differs; 44 | len += WORDBYTES; 45 | } 46 | } 47 | 48 | while (len < max_len && matchptr[len] == strptr[len]) 49 | len++; 50 | return len; 51 | 52 | word_differs: 53 | if (CPU_IS_LITTLE_ENDIAN()) 54 | len += (bsfw(v_word) >> 3); 55 | else 56 | len += (8 * WORDBYTES - 1 - bsrw(v_word)) >> 3; 57 | return len; 58 | } 59 | 60 | #endif /* LIB_LZ_EXTEND_H */ 61 | -------------------------------------------------------------------------------- /lib/xpack_constants.h: -------------------------------------------------------------------------------- 1 | /* 2 | * xpack_constants.h - constants for the XPACK compression format 3 | */ 4 | 5 | #ifndef LIB_XPACK_CONSTANTS_H 6 | #define LIB_XPACK_CONSTANTS_H 7 | 8 | #define MIN_MATCH_LEN 2 9 | #define NUM_REPS 3 10 | 11 | #define BLOCKTYPE_VERBATIM 1 12 | #define BLOCKTYPE_ALIGNED 2 13 | #define BLOCKTYPE_UNCOMPRESSED 3 14 | 15 | #define NUM_BLOCKTYPE_BITS 3 16 | #define NUM_BLOCKSIZE_BITS 20 17 | #define DEFAULT_BLOCK_SIZE 32768 18 | 19 | #define NUM_ALIGNED_BITS 3 20 | 21 | #define LITERAL_ALPHABET_SIZE 256 22 | #define LITRUNLEN_ALPHABET_SIZE 16 23 | #define LENGTH_ALPHABET_SIZE 64 24 | #define MAX_OFFSET_ALPHABET_SIZE 
32 25 | #define ALIGNED_ALPHABET_SIZE (1 << NUM_ALIGNED_BITS) 26 | 27 | #define MAX_ALPHABET_SIZE LITERAL_ALPHABET_SIZE 28 | 29 | #define MAX_LOG2_NUM_LITERAL_STATES 10 30 | #define MAX_LOG2_NUM_LITRUNLEN_STATES 9 31 | #define MAX_LOG2_NUM_LENGTH_STATES 9 32 | #define MAX_LOG2_NUM_OFFSET_STATES 9 33 | #define MAX_LOG2_NUM_ALIGNED_STATES 7 34 | 35 | #define MAX_LOG2_NUM_STATES MAX_LOG2_NUM_LITERAL_STATES 36 | #define MAX_NUM_STATES (1 << MAX_LOG2_NUM_STATES) 37 | 38 | #define NUM_LITERAL_STREAMS 2 39 | 40 | #define MAGIC_FILESIZE 12000000 41 | 42 | #define CODEBITS 4 43 | #define MAX_EXTRA_CODEBITS ((1 << CODEBITS) - 3) 44 | #define ZEROCODE1 ((1 << CODEBITS) - 2) 45 | #define ZEROCODE2 ((1 << CODEBITS) - 1) 46 | #define ZEROCODE1_NBITS 2 47 | #define ZEROCODE2_NBITS 7 48 | #define ZEROCODE1_MIN 1 49 | #define ZEROCODE1_MAX (ZEROCODE1_MIN + (1 << ZEROCODE1_NBITS) - 1) 50 | #define ZEROCODE2_MIN (ZEROCODE1_MAX + 1) 51 | #define ZEROCODE2_MAX (ZEROCODE2_MIN + (1 << ZEROCODE2_NBITS) - 1) 52 | 53 | #endif /* LIB_XPACK_CONSTANTS_H */ 54 | -------------------------------------------------------------------------------- /lib/xpack_common.h: -------------------------------------------------------------------------------- 1 | #ifndef LIB_XPACK_COMMON_H 2 | #define LIB_XPACK_COMMON_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "common_defs.h" 9 | #include "unaligned.h" 10 | #include "xpack_constants.h" 11 | 12 | #include "libxpack.h" 13 | 14 | #ifdef ENABLE_PREPROCESSING 15 | extern void preprocess(void *data, u32 size); 16 | extern void postprocess(void *data, u32 size); 17 | #endif 18 | 19 | /* 20 | * Given the number of states, return the corresponding state generator, which 21 | * is the amount by which we will step through the states when assigning symbols 22 | * to states. We require a value such that every state will be visited exactly 23 | * once after num_states steps. 
Mathematically, we require a generator of the 24 | * cyclic group consisting of the set of integers {0...num_states - 1} and the 25 | * group operation of addition modulo num_states. By a well-known theorem, the 26 | * generators are the set of integers relatively prime to num_states. In this 27 | * case, since num_states is a power of 2, its prime factors are all 2's; 28 | * therefore, the generators are all numbers that do not have 2 as a prime 29 | * factor, i.e. all odd numbers. 30 | * 31 | * The number '1' is always a valid choice, but a poor one because it is 32 | * advantageous to distribute each symbol's states more evenly. The value we 33 | * actually use that works well in practice is five-eighths the number of states 34 | * plus 3. But use | instead of + to guarantee an odd number if num_states <= 35 | * 8. Also, it is okay to use a value greater than num_states because we have 36 | * to mod with num_states after each addition anyway. 37 | * 38 | * Note: it is essential that the encoder and decoder always choose the same 39 | * generator as each other for a given num_states! If you were to change this 40 | * formula, then you would change the on-disk compression format. 41 | */ 42 | static forceinline unsigned 43 | get_state_generator(unsigned num_states) 44 | { 45 | return (num_states >> 1) | (num_states >> 3) | 3; 46 | } 47 | 48 | /* Initialize the recent offsets queue. 
*/ 49 | static forceinline void 50 | init_recent_offsets(u32 recent_offsets[NUM_REPS]) 51 | { 52 | unsigned i; 53 | 54 | for (i = 0; i < NUM_REPS; i++) 55 | recent_offsets[i] = 1 + i; 56 | } 57 | 58 | #endif /* LIB_XPACK_COMMON_H */ 59 | -------------------------------------------------------------------------------- /common/compiler_msc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * compiler_msc.h - definitions for the Microsoft C Compiler 3 | */ 4 | 5 | #define BUILDING_LIBXPACK 6 | 7 | #define LIBEXPORT __declspec(dllexport) 8 | 9 | /* 10 | * Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h. 11 | * Beware: the below replacement isn't fully standard, since normally any value 12 | * != 0 should be implicitly cast to a bool with value 1... but that doesn't 13 | * happen if bool is really just an 'int'. 14 | */ 15 | typedef int bool; 16 | #define true 1 17 | #define false 0 18 | #define __bool_true_false_are_defined 1 19 | 20 | /* Define ssize_t */ 21 | #ifdef _WIN64 22 | typedef long long ssize_t; 23 | #else 24 | typedef int ssize_t; 25 | #endif 26 | 27 | /* 28 | * Old versions (e.g. VS2010) of MSC have stdint.h but not the C99 header 29 | * inttypes.h. Work around this by defining the PRI* macros ourselves. 30 | */ 31 | #include 32 | #define PRIu8 "hhu" 33 | #define PRIu16 "hu" 34 | #define PRIu32 "u" 35 | #define PRIu64 "llu" 36 | #define PRIi8 "hhi" 37 | #define PRIi16 "hi" 38 | #define PRIi32 "i" 39 | #define PRIi64 "lli" 40 | #define PRIx8 "hhx" 41 | #define PRIx16 "hx" 42 | #define PRIx32 "x" 43 | #define PRIx64 "llx" 44 | 45 | /* Assume a little endian architecture with fast unaligned access */ 46 | #define CPU_IS_LITTLE_ENDIAN() 1 47 | #define UNALIGNED_ACCESS_IS_FAST 1 48 | 49 | /* __restrict has nonstandard behavior; don't use it */ 50 | #define restrict 51 | 52 | /* ... 
but we can use __inline and __forceinline */ 53 | #define inline __inline 54 | #define forceinline __forceinline 55 | 56 | /* Byte swap functions */ 57 | #define bswap16 _byteswap_ushort 58 | #define bswap32 _byteswap_ulong 59 | #define bswap64 _byteswap_uint64 60 | 61 | /* Bit scan functions (32-bit) */ 62 | 63 | static forceinline unsigned 64 | bsr32(uint32_t n) 65 | { 66 | _BitScanReverse(&n, n); 67 | return n; 68 | } 69 | #define bsr32 bsr32 70 | 71 | static forceinline unsigned 72 | bsf32(uint32_t n) 73 | { 74 | _BitScanForward(&n, n); 75 | return n; 76 | } 77 | #define bsf32 bsf32 78 | 79 | #ifdef _M_X64 /* Bit scan functions (64-bit) */ 80 | 81 | static forceinline unsigned 82 | bsr64(uint64_t n) 83 | { 84 | _BitScanReverse64(&n, n); 85 | return n; 86 | } 87 | #define bsr64 bsr64 88 | 89 | static forceinline unsigned 90 | bsf64(uint64_t n) 91 | { 92 | _BitScanForward64(&n, n); 93 | return n; 94 | } 95 | #define bsf64 bsf64 96 | 97 | #endif /* _M_X64 */ 98 | -------------------------------------------------------------------------------- /common/compiler_gcc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * compiler_gcc.h - definitions for the GNU C Compiler. Currently this also 3 | * handles clang and the Intel C Compiler. 
4 | */ 5 | 6 | #define GCC_PREREQ(major, minor) \ 7 | (!defined(__clang__) && !defined(__INTEL_COMPILER) && \ 8 | (__GNUC__ > (major) || \ 9 | (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) 10 | 11 | #ifndef __has_attribute 12 | # define __has_attribute(attribute) 0 13 | #endif 14 | #ifndef __has_feature 15 | # define __has_feature(feature) 0 16 | #endif 17 | #ifndef __has_builtin 18 | # define __has_builtin(builtin) 0 19 | #endif 20 | 21 | #ifdef _WIN32 22 | # define LIBEXPORT __declspec(dllexport) 23 | #else 24 | # define LIBEXPORT __attribute__((visibility("default"))) 25 | #endif 26 | 27 | #define inline inline 28 | #define forceinline inline __attribute__((always_inline)) 29 | #define restrict __restrict__ 30 | #define likely(expr) __builtin_expect(!!(expr), 1) 31 | #define unlikely(expr) __builtin_expect(!!(expr), 0) 32 | #define prefetchr(addr) __builtin_prefetch((addr), 0) 33 | #define prefetchw(addr) __builtin_prefetch((addr), 1) 34 | 35 | #define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ 36 | (GCC_PREREQ(4, 4) || __has_attribute(target)) 37 | 38 | #define COMPILER_SUPPORTS_BMI2_TARGET \ 39 | (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ 40 | (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))) 41 | 42 | /* Newer gcc supports __BYTE_ORDER__. Older gcc doesn't. 
*/ 43 | #ifdef __BYTE_ORDER__ 44 | # define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) 45 | #endif 46 | 47 | #if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) 48 | # define bswap16 __builtin_bswap16 49 | #endif 50 | 51 | #if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) 52 | # define bswap32 __builtin_bswap32 53 | #endif 54 | 55 | #if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) 56 | # define bswap64 __builtin_bswap64 57 | #endif 58 | 59 | #if defined(__x86_64__) || defined(__i386__) || defined(__ARM_FEATURE_UNALIGNED) 60 | # define UNALIGNED_ACCESS_IS_FAST 1 61 | #endif 62 | 63 | /* With gcc, we can access unaligned memory through 'packed' structures. */ 64 | #define DEFINE_UNALIGNED_TYPE(type) \ 65 | \ 66 | struct type##unaligned { \ 67 | type v; \ 68 | } __attribute__((packed)); \ 69 | \ 70 | static forceinline type \ 71 | load_##type##_unaligned(const void *p) \ 72 | { \ 73 | return ((const struct type##unaligned *)p)->v; \ 74 | } \ 75 | \ 76 | static forceinline void \ 77 | store_##type##_unaligned(type v, void *p) \ 78 | { \ 79 | ((struct type##unaligned *)p)->v = v; \ 80 | } 81 | 82 | #define bsr32(n) (31 - __builtin_clz(n)) 83 | #define bsr64(n) (63 - __builtin_clzll(n)) 84 | #define bsf32(n) __builtin_ctz(n) 85 | #define bsf64(n) __builtin_ctzll(n) 86 | -------------------------------------------------------------------------------- /programs/tgetopt.c: -------------------------------------------------------------------------------- 1 | /* 2 | * tgetopt.c - portable replacement for GNU getopt() 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to 
permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "prog_util.h" 29 | 30 | tchar *toptarg; 31 | int toptind = 1, topterr = 1, toptopt; 32 | 33 | /* 34 | * This is a simple implementation of getopt(). It can be compiled with either 35 | * 'char' or 'wchar_t' as the character type. 
36 | * 37 | * Do *not* use this implementation if you need any of the following features, 38 | * as they are not supported: 39 | * - Optional arguments 40 | * - Long options 41 | * - Option-related arguments retained in argv, not nulled out 42 | * - '+' and '-' characters in optstring 43 | */ 44 | int 45 | tgetopt(int argc, tchar *argv[], const tchar *optstring) 46 | { 47 | static tchar empty[1]; 48 | static tchar *nextchar; 49 | static bool done; 50 | 51 | if (toptind == 1) { 52 | /* Starting to scan a new argument vector */ 53 | nextchar = NULL; 54 | done = false; 55 | } 56 | 57 | while (!done && (nextchar != NULL || toptind < argc)) { 58 | if (nextchar == NULL) { 59 | /* Scanning a new argument */ 60 | tchar *arg = argv[toptind++]; 61 | if (arg[0] == '-' && arg[1] != '\0') { 62 | if (arg[1] == '-' && arg[2] == '\0') { 63 | /* All args after "--" are nonoptions */ 64 | argv[toptind - 1] = NULL; 65 | done = true; 66 | } else { 67 | /* Start of short option characters */ 68 | nextchar = &arg[1]; 69 | } 70 | } 71 | } else { 72 | /* More short options in previous arg */ 73 | tchar opt = *nextchar; 74 | tchar *p = tstrchr(optstring, opt); 75 | if (p == NULL) { 76 | if (topterr) 77 | msg("invalid option -- '%"TC"'", opt); 78 | toptopt = opt; 79 | return '?'; 80 | } 81 | /* 'opt' is a valid short option character */ 82 | nextchar++; 83 | if (*(p + 1) == ':') { 84 | /* 'opt' requires an argument */ 85 | if (*nextchar != '\0') { 86 | /* Optarg is in same argv argument */ 87 | toptarg = nextchar; 88 | } else if (toptind < argc) { 89 | /* Optarg is next argv argument */ 90 | argv[toptind - 1] = NULL; 91 | toptarg = argv[toptind++]; 92 | } else { 93 | if (topterr && *optstring != ':') { 94 | msg("option requires an " 95 | "argument -- '%"TC"'", opt); 96 | } 97 | toptopt = opt; 98 | opt = (*optstring == ':') ? 
':' : '?'; 99 | } 100 | nextchar = empty; 101 | } 102 | if (*nextchar == '\0') { 103 | argv[toptind - 1] = NULL; 104 | nextchar = NULL; 105 | } 106 | return opt; 107 | } 108 | } 109 | 110 | /* Done scanning. Move all nonoptions to the end, set optind to the 111 | * index of the first nonoption, and return -1. */ 112 | toptind = argc; 113 | while (--argc > 0) 114 | if (argv[argc] != NULL) 115 | argv[--toptind] = argv[argc]; 116 | done = true; 117 | return -1; 118 | } 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | XPACK is an experimental compression format. It is intended to have better 4 | performance than DEFLATE as implemented in the zlib library and also produce a 5 | notably better compression ratio on most inputs. The format is not yet stable. 6 | 7 | XPACK has been inspired by the DEFLATE, LZX, and Zstandard formats, among 8 | others. Originally envisioned as a DEFLATE replacement, it won't necessarily 9 | see a lot of additional development since other solutions such as Zstandard seem 10 | to have gotten much closer to that goal first (great job to those involved!). 11 | But I am releasing the code anyway for anyone who may find it useful. 12 | 13 | # Format overview 14 | 15 | Like many other common compression formats, XPACK is based on the LZ77 method 16 | (decomposition into literals and length/offset copy commands) with a number of 17 | tricks on top. 
Features include: 18 | 19 | * Increased sliding window, or "dictionary", size (like LZX and Zstd) 20 | * Entropy encoding with finite state entropy (FSE) codes, also known as 21 | table-based asymmetric numeral systems (tANS) (like Zstd) 22 | * Minimum match length of 2 (like LZX) 23 | * Lowest three bits of match offsets can be entropy-encoded (like LZX) 24 | * Aligned and verbatim blocks (like LZX) 25 | * Recent match offsets queue with three entries (like LZX) 26 | * Literals packed separately from matches, and with two FSE streams (like older 27 | Zstd versions) 28 | * Literal runs (like Zstd) 29 | * Concise FSE header (state count list) representation 30 | * Decoder reads in forwards direction, encoder writes in backwards direction 31 | * Optional preprocessing step for x86 machine code (like LZX) 32 | 33 | # Implementation overview 34 | 35 | libxpack is a library containing an optimized, portable implementation of an 36 | XPACK compressor and decompressor. Features currently include: 37 | 38 | * Whole-buffer compression and decompression only 39 | * Multiple compression levels 40 | * Fast hash chains-based matchfinder 41 | * Greedy and lazy parsers 42 | * Decompressor automatically uses Intel BMI2 instructions when supported 43 | 44 | In addition, the following command-line programs using libxpack are provided: 45 | 46 | * xpack (or xunpack), a program which behaves like a standard UNIX command-line 47 | compressor such as gzip (or gunzip). The command-line interface should be 48 | compatible enough that xpack can be used as a drop-in gzip replacement in many 49 | cases --- though the on-disk format is incompatible, of course. 50 | * benchmark, a program for benchmarking in-memory compression and decompression 51 | 52 | Note that currently, all the programs internally use "chunks", as the library 53 | does not yet support streaming. This will worsen the compression ratio 54 | slightly, compared to what is possible. 
55 | 56 | All files may be modified and/or redistributed under the terms of the MIT 57 | license. There is NO WARRANTY, to the extent permitted by law. See the COPYING 58 | file for details. 59 | 60 | # Building 61 | 62 | ## For UNIX 63 | 64 | Just run `make`. You need GNU Make and either GCC or Clang. GCC is recommended 65 | because it builds slightly faster binaries. There is no `make install` yet; 66 | just copy the file(s) to where you want. 67 | 68 | By default, all targets are built, including the library and programs. `make 69 | help` shows the available targets. There are also several options which can be 70 | set on the `make` command line. See the Makefile for details. 71 | 72 | ## For Windows 73 | 74 | MinGW (GCC) is the recommended compiler to use when building binaries for 75 | Windows. MinGW can be used on either Windows or Linux. Use a command like: 76 | 77 | $ make CC=x86_64-w64-mingw32-gcc 78 | 79 | Windows binaries prebuilt with MinGW may also be downloaded from 80 | https://github.com/ebiggers/xpack/releases. 81 | 82 | Alternatively, a separate Makefile, `Makefile.msc`, is provided for the tools 83 | that come with Visual Studio, for those who strongly prefer that toolchain. 84 | 85 | As usual, 64-bit binaries are faster than 32-bit binaries and should be 86 | preferred whenever possible. 87 | -------------------------------------------------------------------------------- /lib/x86_cpu_features.c: -------------------------------------------------------------------------------- 1 | /* 2 | * x86_cpu_features.c - feature detection for x86 processors 3 | */ 4 | 5 | #include "x86_cpu_features.h" 6 | 7 | #if X86_CPU_FEATURES_ENABLED 8 | 9 | #define DEBUG 0 10 | 11 | #if DEBUG 12 | # include 13 | #endif 14 | 15 | u32 _x86_cpu_features = 0; 16 | 17 | /* 18 | * With old GCC versions we have to manually save and restore the x86_32 PIC 19 | * register (ebx). 
See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 20 | */ 21 | #if defined(__i386__) && defined(__PIC__) 22 | # define EBX_CONSTRAINT "=r" 23 | #else 24 | # define EBX_CONSTRAINT "=b" 25 | #endif 26 | 27 | /* Execute the CPUID instruction. */ 28 | static inline void 29 | cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) 30 | { 31 | __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" 32 | "cpuid \n" 33 | ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" 34 | : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) 35 | : "a" (leaf), "c" (subleaf)); 36 | } 37 | 38 | /* Read an extended control register. */ 39 | static inline u64 40 | read_xcr(u32 index) 41 | { 42 | u32 edx, eax; 43 | 44 | /* 45 | * Execute the "xgetbv" instruction. Old versions of binutils do not 46 | * recognize this instruction, so list the raw bytes instead. 47 | */ 48 | __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index)); 49 | 50 | return ((u64)edx << 32) | eax; 51 | } 52 | 53 | #define IS_SET(reg, bit) ((reg) & ((u32)1 << (bit))) 54 | 55 | /* Initialize _x86_cpu_features with bits for interesting processor features. 
*/ 56 | void 57 | x86_setup_cpu_features(void) 58 | { 59 | u32 features = 0; 60 | u32 dummy1, dummy2, dummy3, dummy4; 61 | u32 max_function; 62 | u32 features_1, features_2, features_3, features_4; 63 | bool os_saves_ymm_regs = false; 64 | 65 | /* Get maximum supported function */ 66 | cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4); 67 | if (max_function < 1) 68 | goto out; 69 | 70 | /* Standard feature flags */ 71 | cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1); 72 | 73 | if (IS_SET(features_1, 25)) 74 | features |= X86_CPU_FEATURE_SSE; 75 | 76 | if (IS_SET(features_1, 26)) 77 | features |= X86_CPU_FEATURE_SSE2; 78 | 79 | if (IS_SET(features_2, 0)) 80 | features |= X86_CPU_FEATURE_SSE3; 81 | 82 | if (IS_SET(features_2, 9)) 83 | features |= X86_CPU_FEATURE_SSSE3; 84 | 85 | if (IS_SET(features_2, 19)) 86 | features |= X86_CPU_FEATURE_SSE4_1; 87 | 88 | if (IS_SET(features_2, 20)) 89 | features |= X86_CPU_FEATURE_SSE4_2; 90 | 91 | if (IS_SET(features_2, 27)) /* OSXSAVE set? 
*/ 92 | if ((read_xcr(0) & 0x6) == 0x6) 93 | os_saves_ymm_regs = true; 94 | 95 | if (os_saves_ymm_regs && IS_SET(features_2, 28)) 96 | features |= X86_CPU_FEATURE_AVX; 97 | 98 | if (max_function < 7) 99 | goto out; 100 | 101 | /* Extended feature flags */ 102 | cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4); 103 | 104 | if (IS_SET(features_3, 3)) 105 | features |= X86_CPU_FEATURE_BMI; 106 | 107 | if (os_saves_ymm_regs && IS_SET(features_3, 5)) 108 | features |= X86_CPU_FEATURE_AVX2; 109 | 110 | if (IS_SET(features_3, 8)) 111 | features |= X86_CPU_FEATURE_BMI2; 112 | 113 | out: 114 | 115 | #if DEBUG 116 | printf("Detected x86 CPU features: "); 117 | if (features & X86_CPU_FEATURE_SSE) 118 | printf("SSE "); 119 | if (features & X86_CPU_FEATURE_SSE2) 120 | printf("SSE2 "); 121 | if (features & X86_CPU_FEATURE_SSE3) 122 | printf("SSE3 "); 123 | if (features & X86_CPU_FEATURE_SSSE3) 124 | printf("SSSE3 "); 125 | if (features & X86_CPU_FEATURE_SSE4_1) 126 | printf("SSE4.1 "); 127 | if (features & X86_CPU_FEATURE_SSE4_2) 128 | printf("SSE4.2 "); 129 | if (features & X86_CPU_FEATURE_BMI) 130 | printf("BMI "); 131 | if (features & X86_CPU_FEATURE_AVX) 132 | printf("AVX "); 133 | if (features & X86_CPU_FEATURE_BMI2) 134 | printf("BMI2 "); 135 | if (features & X86_CPU_FEATURE_AVX2) 136 | printf("AVX2 "); 137 | printf("\n"); 138 | #endif /* DEBUG */ 139 | 140 | _x86_cpu_features = features | X86_CPU_FEATURES_KNOWN; 141 | } 142 | 143 | #endif /* X86_CPU_FEATURES_ENABLED */ 144 | -------------------------------------------------------------------------------- /lib/unaligned.h: -------------------------------------------------------------------------------- 1 | /* 2 | * unaligned.h - inline functions for unaligned memory accesses 3 | */ 4 | 5 | #ifndef LIB_UNALIGNED_H 6 | #define LIB_UNALIGNED_H 7 | 8 | #include "common_defs.h" 9 | 10 | /* 11 | * Naming note: 12 | * 13 | * {load,store}_*_unaligned() deal with raw bytes without endianness conversion. 
14 | * {get,put}_unaligned_*() deal with a specific endianness. 15 | */ 16 | 17 | DEFINE_UNALIGNED_TYPE(u16) 18 | DEFINE_UNALIGNED_TYPE(u32) 19 | DEFINE_UNALIGNED_TYPE(u64) 20 | DEFINE_UNALIGNED_TYPE(machine_word_t) 21 | 22 | #define load_word_unaligned load_machine_word_t_unaligned 23 | #define store_word_unaligned store_machine_word_t_unaligned 24 | 25 | /***** Unaligned loads *****/ 26 | 27 | static forceinline u16 28 | get_unaligned_le16(const u8 *p) 29 | { 30 | if (UNALIGNED_ACCESS_IS_FAST) 31 | return le16_bswap(load_u16_unaligned(p)); 32 | else 33 | return ((u16)p[1] << 8) | p[0]; 34 | } 35 | 36 | static forceinline u32 37 | get_unaligned_le32(const u8 *p) 38 | { 39 | if (UNALIGNED_ACCESS_IS_FAST) 40 | return le32_bswap(load_u32_unaligned(p)); 41 | else 42 | return ((u32)p[3] << 24) | ((u32)p[2] << 16) | 43 | ((u32)p[1] << 8) | p[0]; 44 | } 45 | 46 | static forceinline u64 47 | get_unaligned_le64(const u8 *p) 48 | { 49 | if (UNALIGNED_ACCESS_IS_FAST) 50 | return le64_bswap(load_u64_unaligned(p)); 51 | else 52 | return ((u64)p[7] << 56) | ((u64)p[6] << 48) | 53 | ((u64)p[5] << 40) | ((u64)p[4] << 32) | 54 | ((u64)p[3] << 24) | ((u64)p[2] << 16) | 55 | ((u64)p[1] << 8) | p[0]; 56 | } 57 | 58 | static forceinline machine_word_t 59 | get_unaligned_leword(const u8 *p) 60 | { 61 | STATIC_ASSERT(WORDBYTES == 4 || WORDBYTES == 8); 62 | if (WORDBYTES == 4) 63 | return get_unaligned_le32(p); 64 | else 65 | return get_unaligned_le64(p); 66 | } 67 | 68 | /***** Unaligned stores *****/ 69 | 70 | static forceinline void 71 | put_unaligned_le16(u16 v, u8 *p) 72 | { 73 | if (UNALIGNED_ACCESS_IS_FAST) { 74 | store_u16_unaligned(le16_bswap(v), p); 75 | } else { 76 | p[0] = (u8)(v >> 0); 77 | p[1] = (u8)(v >> 8); 78 | } 79 | } 80 | 81 | static forceinline void 82 | put_unaligned_le32(u32 v, u8 *p) 83 | { 84 | if (UNALIGNED_ACCESS_IS_FAST) { 85 | store_u32_unaligned(le32_bswap(v), p); 86 | } else { 87 | p[0] = (u8)(v >> 0); 88 | p[1] = (u8)(v >> 8); 89 | p[2] = (u8)(v >> 16); 
90 | p[3] = (u8)(v >> 24); 91 | } 92 | } 93 | 94 | static forceinline void 95 | put_unaligned_le64(u64 v, u8 *p) 96 | { 97 | if (UNALIGNED_ACCESS_IS_FAST) { 98 | store_u64_unaligned(le64_bswap(v), p); 99 | } else { 100 | p[0] = (u8)(v >> 0); 101 | p[1] = (u8)(v >> 8); 102 | p[2] = (u8)(v >> 16); 103 | p[3] = (u8)(v >> 24); 104 | p[4] = (u8)(v >> 32); 105 | p[5] = (u8)(v >> 40); 106 | p[6] = (u8)(v >> 48); 107 | p[7] = (u8)(v >> 56); 108 | } 109 | } 110 | 111 | static forceinline void 112 | put_unaligned_leword(machine_word_t v, u8 *p) 113 | { 114 | STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); 115 | if (WORDBITS == 32) 116 | put_unaligned_le32(v, p); 117 | else 118 | put_unaligned_le64(v, p); 119 | } 120 | 121 | /***** 24-bit loads *****/ 122 | 123 | /* 124 | * Given a 32-bit value that was loaded with the platform's native endianness, 125 | * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 126 | * bits contain the first 3 bytes, arranged in octets in a platform-dependent 127 | * order, at the memory location from which the input 32-bit value was loaded. 128 | */ 129 | static forceinline u32 130 | loaded_u32_to_u24(u32 v) 131 | { 132 | if (CPU_IS_LITTLE_ENDIAN()) 133 | return v & 0xFFFFFF; 134 | else 135 | return v >> 8; 136 | } 137 | 138 | /* 139 | * Load the next 3 bytes from the memory location @p into the 24 low-order bits 140 | * of a 32-bit value. The order in which the 3 bytes will be arranged as octets 141 | * in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES 142 | * bytes must be available at @p; note that this may be more than 3. 
143 | */ 144 | static forceinline u32 145 | load_u24_unaligned(const u8 *p) 146 | { 147 | #if UNALIGNED_ACCESS_IS_FAST 148 | # define LOAD_U24_REQUIRED_NBYTES 4 149 | return loaded_u32_to_u24(load_u32_unaligned(p)); 150 | #else 151 | # define LOAD_U24_REQUIRED_NBYTES 3 152 | if (CPU_IS_LITTLE_ENDIAN()) 153 | return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); 154 | else 155 | return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); 156 | #endif 157 | } 158 | 159 | #endif /* LIB_UNALIGNED_H */ 160 | -------------------------------------------------------------------------------- /programs/prog_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * prog_util.h - utility functions for programs 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 
26 | */ 27 | 28 | #ifndef PROGRAMS_PROG_UTIL_H 29 | #define PROGRAMS_PROG_UTIL_H 30 | 31 | #ifdef HAVE_CONFIG_H 32 | # include "config.h" 33 | #endif 34 | 35 | #include "libxpack.h" 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | #include "common_defs.h" 43 | 44 | #ifdef __GNUC__ 45 | # define _printf(str_idx, args_idx) \ 46 | __attribute__((format(printf, str_idx, args_idx))) 47 | #else 48 | # define _printf(str_idx, args_idx) 49 | #endif 50 | 51 | #ifdef _WIN32 52 | 53 | /* 54 | * Definitions for Windows builds. Mainly, 'tchar' is defined to be the 2-byte 55 | * 'wchar_t' type instead of 'char'. This is the only "easy" way I know of to 56 | * get full Unicode support on Windows... 57 | */ 58 | 59 | #include 60 | extern int wmain(int argc, wchar_t **argv); 61 | # define tmain wmain 62 | # define tchar wchar_t 63 | # define _T(text) L##text 64 | # define T(text) _T(text) 65 | # define TS "ls" 66 | # define TC "lc" 67 | # define tmemcpy wmemcpy 68 | # define topen _wopen 69 | # define tstrchr wcschr 70 | # define tstrcmp wcscmp 71 | # define tstrcpy wcscpy 72 | # define tstrlen wcslen 73 | # define tstrrchr wcsrchr 74 | # define tstrtoul wcstoul 75 | # define tstrxcmp wcsicmp 76 | # define tunlink _wunlink 77 | # define tutimbuf _utimbuf 78 | # define tutime _wutime 79 | # ifdef _MSC_VER 80 | /* Standard file descriptors? What are those? */ 81 | # define STDIN_FILENO 0 82 | # define STDOUT_FILENO 1 83 | # define STDERR_FILENO 2 84 | 85 | /* Fix the broken stat-related definitions. 
*/ 86 | # define fstat _fstat 87 | # define stat _stat 88 | # define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) 89 | # define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) 90 | # endif 91 | # define O_NOFOLLOW 0 92 | 93 | #else /* _WIN32 */ 94 | 95 | /* Standard definitions for everyone else */ 96 | 97 | # define tmain main 98 | # define tchar char 99 | # define T(text) text 100 | # define TS "s" 101 | # define TC "c" 102 | # define tmemcpy memcpy 103 | # define topen open 104 | # define tstrchr strchr 105 | # define tstrcmp strcmp 106 | # define tstrcpy strcpy 107 | # define tstrlen strlen 108 | # define tstrrchr strrchr 109 | # define tstrtoul strtoul 110 | # define tstrxcmp strcmp 111 | # define tunlink unlink 112 | # define tutimbuf utimbuf 113 | # define tutime utime 114 | # define O_BINARY 0 115 | 116 | #endif /* !_WIN32 */ 117 | 118 | extern const tchar *program_invocation_name; 119 | 120 | extern void _printf(1, 2) msg(const char *fmt, ...); 121 | extern void _printf(1, 2) msg_errno(const char *fmt, ...); 122 | 123 | extern void *xmalloc(size_t size); 124 | 125 | extern u64 current_time(void); 126 | 127 | extern const tchar *get_filename(const tchar *path); 128 | 129 | struct file_stream { 130 | int fd; 131 | tchar *name; 132 | bool is_standard_stream; 133 | }; 134 | 135 | extern int xopen_for_read(const tchar *path, struct file_stream *strm); 136 | extern int xopen_for_write(const tchar *path, bool force, 137 | struct file_stream *strm); 138 | 139 | extern ssize_t xread(struct file_stream *strm, void *buf, size_t count); 140 | extern int skip_bytes(struct file_stream *strm, size_t count); 141 | extern int full_write(struct file_stream *strm, const void *buf, size_t count); 142 | 143 | extern int xclose(struct file_stream *strm); 144 | 145 | extern u32 parse_chunk_size(const tchar *arg); 146 | extern int parse_compression_level(const tchar *arg); 147 | 148 | extern struct xpack_compressor *alloc_compressor(u32 chunk_size, int level); 149 | extern struct 
xpack_decompressor *alloc_decompressor(void); 150 | 151 | /* tgetopt.c */ 152 | 153 | extern tchar *toptarg; 154 | extern int toptind, topterr, toptopt; 155 | 156 | extern int tgetopt(int argc, tchar *argv[], const tchar *optstring); 157 | 158 | #endif /* PROGRAMS_PROG_UTIL_H */ 159 | -------------------------------------------------------------------------------- /libxpack.h: -------------------------------------------------------------------------------- 1 | /* 2 | * libxpack.h - public header for libxpack 3 | */ 4 | 5 | #ifndef LIBXPACK_H 6 | #define LIBXPACK_H 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | #include 13 | 14 | /* Microsoft C / Visual Studio garbage. If you want to link to the DLL version 15 | * of libxpack, then #define LIBXPACK_DLL. */ 16 | #ifdef _MSC_VER 17 | # ifdef BUILDING_LIBXPACK 18 | # define LIBXPACKAPI __declspec(dllexport) 19 | # elif defined(LIBXPACK_DLL) 20 | # define LIBXPACKAPI __declspec(dllimport) 21 | # endif 22 | #endif 23 | #ifndef LIBXPACKAPI 24 | # define LIBXPACKAPI 25 | #endif 26 | 27 | /* ========================================================================== */ 28 | /* Compression */ 29 | /* ========================================================================== */ 30 | 31 | struct xpack_compressor; 32 | 33 | /* 34 | * xpack_alloc_compressor() allocates a new compressor. 35 | * 36 | * 'max_buffer_size' is the maximum size of any buffer which will be compressed 37 | * by the compressor. This specifies the maximum allowed value for the 38 | * 'uncompressed_size' parameter of xpack_compress() when called using this 39 | * compressor. 40 | * 41 | * 'compression_level' is the compression level on a zlib-like scale (1 = 42 | * fastest, 6 = medium/default, 9 = slowest). 43 | * 44 | * Returns a pointer to the new compressor, or NULL if out of memory or the 45 | * maximum buffer size or compression level is not supported. 
46 | */ 47 | LIBXPACKAPI struct xpack_compressor * 48 | xpack_alloc_compressor(size_t max_buffer_size, int compression_level); 49 | 50 | /* 51 | * xpack_compress() compresses a buffer of data. The function attempts to 52 | * compress 'in_nbytes' bytes of data located at 'in' and write the results to 53 | * 'out', which has space for 'out_nbytes_avail' bytes. The return value is the 54 | * compressed size in bytes, or 0 if the data could not be compressed to 55 | * 'out_nbytes_avail' bytes or fewer. 56 | */ 57 | LIBXPACKAPI size_t 58 | xpack_compress(struct xpack_compressor *compressor, 59 | const void *in, size_t in_nbytes, 60 | void *out, size_t out_nbytes_avail); 61 | 62 | /* 63 | * xpack_free_compressor() frees a compressor allocated with 64 | * xpack_alloc_compressor(). If NULL is passed, then no action is taken. 65 | */ 66 | LIBXPACKAPI void 67 | xpack_free_compressor(struct xpack_compressor *compressor); 68 | 69 | /* ========================================================================== */ 70 | /* Decompression */ 71 | /* ========================================================================== */ 72 | 73 | struct xpack_decompressor; 74 | 75 | /* 76 | * xpack_alloc_decompressor() allocates a new decompressor. 77 | * 78 | * Returns a pointer to the new decompressor, or NULL if out of memory. 
79 | */ 80 | LIBXPACKAPI struct xpack_decompressor * 81 | xpack_alloc_decompressor(void); 82 | 83 | /* Result of a call to xpack_decompress() */ 84 | enum decompress_result { 85 | 86 | /* Decompression was successful */ 87 | DECOMPRESS_SUCCESS = 0, 88 | 89 | /* Decompressed failed because the compressed data was invalid, corrupt, 90 | * or otherwise unsupported */ 91 | DECOMPRESS_BAD_DATA = 1, 92 | 93 | /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have 94 | * decompressed to fewer than 'out_nbytes_avail' bytes */ 95 | DECOMPRESS_SHORT_OUTPUT = 2, 96 | 97 | /* The data would have decompressed to more than 'out_nbytes_avail' 98 | * bytes */ 99 | DECOMPRESS_INSUFFICIENT_SPACE = 3, 100 | }; 101 | 102 | /* 103 | * xpack_decompress() decompresses 'in_nbytes' bytes of compressed data at 'in' 104 | * and writes the uncompressed data to 'out', which is a buffer of at least 105 | * 'out_nbytes_avail' bytes. If decompression was successful, then 0 106 | * (DECOMPRESS_SUCCESS) is returned; otherwise, a nonzero result code such as 107 | * DECOMPRESS_BAD_DATA is returned. If a nonzero result code is returned, then 108 | * the contents of the output buffer are undefined. 109 | * 110 | * xpack_decompress() can be used in cases where the actual uncompressed size is 111 | * known (recommended) or unknown (not recommended): 112 | * 113 | * - If the actual uncompressed size is known, then pass the actual 114 | * uncompressed size as 'out_nbytes_avail' and pass NULL for 115 | * 'actual_out_nbytes_ret'. This makes xpack_decompress() fail with 116 | * DECOMPRESS_SHORT_OUTPUT if the data decompressed to fewer than the 117 | * specified number of bytes. 118 | * 119 | * - If the actual uncompressed size is unknown, then provide a non-NULL 120 | * 'actual_out_nbytes_ret' and provide a buffer with some size 121 | * 'out_nbytes_avail' that you think is large enough to hold all the 122 | * uncompressed data. 
In this case, if the data decompresses to less than 123 | * or equal to 'out_nbytes_avail' bytes, then xpack_decompress() will write 124 | * the actual uncompressed size to *actual_out_nbytes_ret and return 0 125 | * (DECOMPRESS_SUCCESS). Otherwise, it will return 126 | * DECOMPRESS_INSUFFICIENT_SPACE if the provided buffer was not large enough 127 | * but no other problems were encountered, or another nonzero result code if 128 | * decompression failed for another reason. 129 | */ 130 | LIBXPACKAPI enum decompress_result 131 | xpack_decompress(struct xpack_decompressor *decompressor, 132 | const void *in, size_t in_nbytes, 133 | void *out, size_t out_nbytes_avail, 134 | size_t *actual_out_nbytes_ret); 135 | 136 | /* 137 | * xpack_free_decompressor() frees a decompressor allocated with 138 | * xpack_alloc_decompressor(). If NULL is passed, no action is taken. 139 | */ 140 | LIBXPACKAPI void 141 | xpack_free_decompressor(struct xpack_decompressor *decompressor); 142 | 143 | 144 | #ifdef __cplusplus 145 | } 146 | #endif 147 | 148 | #endif /* LIBXPACK_H */ 149 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Use 'make help' to list available targets. 3 | # 4 | # Define V=1 to enable "verbose" mode, showing all executed commands. 5 | # 6 | # Define DECOMPRESSION_ONLY to omit all compression code, building a 7 | # decompression-only library. If doing this, you must also build a specific 8 | # library target such as 'libxpack.a', as the programs will no longer compile. 9 | # 10 | # TODO: ENABLE_PREPROCESSING option 11 | # 12 | ############################################################################## 13 | 14 | #### Common compiler flags. 15 | #### Flags given here are not intended to be overridden, but you can add more 16 | #### by defining CFLAGS in the environment or on the 'make' command line. 
17 | 18 | cc-option = $(shell if $(CC) $(1) -c -x c /dev/null -o /dev/null \ 19 | 1>&2 2>/dev/null; then echo $(1); fi) 20 | 21 | override CFLAGS := \ 22 | $(CFLAGS) -O2 -fomit-frame-pointer -std=gnu89 -I. -Icommon \ 23 | -Wall -Wundef \ 24 | $(call cc-option,-Wdeclaration-after-statement) \ 25 | $(call cc-option,-Wmissing-prototypes) \ 26 | $(call cc-option,-Wstrict-prototypes) \ 27 | $(call cc-option,-Wvla) 28 | 29 | ############################################################################## 30 | 31 | STATIC_LIB_SUFFIX := .a 32 | SHARED_LIB_SUFFIX := .so 33 | PROG_SUFFIX := 34 | PROG_CFLAGS := 35 | PIC_REQUIRED := 1 36 | HARD_LINKS := 1 37 | 38 | # Compiling for Windows with MinGW? 39 | ifneq ($(findstring -mingw,$(CC)),) 40 | ifeq ($(AR),ar) 41 | AR := $(patsubst %-gcc,%-ar,$(CC)) 42 | endif 43 | STATIC_LIB_SUFFIX := .lib 44 | SHARED_LIB_SUFFIX := .dll 45 | PROG_SUFFIX := .exe 46 | PROG_CFLAGS := -static -municode 47 | PIC_REQUIRED := 48 | HARD_LINKS := 49 | endif 50 | 51 | ############################################################################## 52 | 53 | #### Quiet make is enabled by default. Define V=1 to disable. 
54 | 55 | ifneq ($(findstring s,$(MAKEFLAGS)),s) 56 | ifneq ($(V),1) 57 | QUIET_CC = @echo ' CC ' $@; 58 | QUIET_CCLD = @echo ' CCLD ' $@; 59 | QUIET_AR = @echo ' AR ' $@; 60 | QUIET_LN = @echo ' LN ' $@; 61 | QUIET_CP = @echo ' CP ' $@; 62 | QUIET_GEN = @echo ' GEN ' $@; 63 | endif 64 | endif 65 | 66 | ############################################################################## 67 | 68 | COMMON_HEADERS := $(wildcard common/*.h) 69 | ALL_TARGETS := 70 | 71 | #### Library 72 | 73 | STATIC_LIB := libxpack$(STATIC_LIB_SUFFIX) 74 | SHARED_LIB := libxpack$(SHARED_LIB_SUFFIX) 75 | 76 | LIB_CFLAGS += $(CFLAGS) -fvisibility=hidden -D_ANSI_SOURCE 77 | 78 | DECOMPRESSION_ONLY := 79 | ifdef DECOMPRESSION_ONLY 80 | LIB_CFLAGS += -DDECOMPRESSION_ONLY=1 81 | endif 82 | 83 | ENABLE_PREPROCESSING := 84 | ifdef ENABLE_PREPROCESSING 85 | LIB_CFLAGS += -DENABLE_PREPROCESSING=1 86 | endif 87 | 88 | LIB_HEADERS := $(wildcard lib/*.h) 89 | 90 | LIB_SRC := lib/x86_cpu_features.c \ 91 | lib/xpack_common.c \ 92 | lib/xpack_compress.c \ 93 | lib/xpack_decompress.c 94 | 95 | LIB_OBJ := $(LIB_SRC:.c=.o) 96 | LIB_PIC_OBJ := $(LIB_SRC:.c=.pic.o) 97 | ifdef PIC_REQUIRED 98 | SHLIB_OBJ := $(LIB_PIC_OBJ) 99 | else 100 | SHLIB_OBJ := $(LIB_OBJ) 101 | endif 102 | 103 | # Compile position dependent library object files 104 | $(LIB_OBJ): %.o: %.c $(LIB_HEADERS) $(COMMON_HEADERS) .lib-cflags 105 | $(QUIET_CC) $(CC) -o $@ -c $(LIB_CFLAGS) $< 106 | 107 | # Compile position independent library object files 108 | $(LIB_PIC_OBJ): %.pic.o: %.c $(LIB_HEADERS) $(COMMON_HEADERS) .lib-cflags 109 | $(QUIET_CC) $(CC) -o $@ -c $(LIB_CFLAGS) -fPIC $< 110 | 111 | # Link shared library 112 | $(SHARED_LIB):$(SHLIB_OBJ) 113 | $(QUIET_CCLD) $(CC) -o $@ $(LDFLAGS) $(LIB_CFLAGS) -shared $+ 114 | 115 | ALL_TARGETS += $(SHARED_LIB) 116 | 117 | # Create static library 118 | $(STATIC_LIB):$(LIB_OBJ) 119 | $(QUIET_AR) $(AR) cr $@ $+ 120 | 121 | ALL_TARGETS += $(STATIC_LIB) 122 | 123 | # Rebuild if CC or LIB_CFLAGS changed 124 
| .lib-cflags: FORCE 125 | @flags='$(CC):$(LIB_CFLAGS)'; \ 126 | if [ "$$flags" != "`cat $@ 2>/dev/null`" ]; then \ 127 | [ -e $@ ] && echo "Rebuilding library due to new compiler flags"; \ 128 | echo "$$flags" > $@; \ 129 | fi 130 | 131 | ############################################################################## 132 | 133 | #### Programs 134 | 135 | PROG_CFLAGS += $(CFLAGS) \ 136 | -D_DEFAULT_SOURCE \ 137 | -D_FILE_OFFSET_BITS=64 \ 138 | -DHAVE_CONFIG_H 139 | 140 | PROG_COMMON_HEADERS := programs/prog_util.h programs/config.h 141 | PROG_COMMON_SRC := programs/prog_util.c programs/tgetopt.c 142 | PROG_SPECIFIC_SRC := programs/xpack.c programs/benchmark.c 143 | 144 | PROG_COMMON_OBJ := $(PROG_COMMON_SRC:.c=.o) 145 | PROG_SPECIFIC_OBJ := $(PROG_SPECIFIC_SRC:.c=.o) 146 | PROG_OBJ := $(PROG_COMMON_OBJ) $(PROG_SPECIFIC_OBJ) 147 | 148 | # Generate autodetected configuration header 149 | programs/config.h:programs/detect.sh .prog-cflags 150 | $(QUIET_GEN) CC=$(CC) $< > $@ 151 | 152 | # Compile program object files 153 | $(PROG_OBJ): %.o: %.c $(PROG_COMMON_HEADERS) $(COMMON_HEADERS) .prog-cflags 154 | $(QUIET_CC) $(CC) -o $@ -c $(PROG_CFLAGS) $< 155 | 156 | # Link benchmark program 157 | benchmark$(PROG_SUFFIX):programs/benchmark.o $(PROG_COMMON_OBJ) $(STATIC_LIB) 158 | $(QUIET_CCLD) $(CC) -o $@ $(LDFLAGS) $(PROG_CFLAGS) $+ 159 | 160 | ALL_TARGETS += benchmark$(PROG_SUFFIX) 161 | 162 | # Link xpack program 163 | xpack$(PROG_SUFFIX):programs/xpack.o $(PROG_COMMON_OBJ) $(STATIC_LIB) 164 | $(QUIET_CCLD) $(CC) -o $@ $(LDFLAGS) $(PROG_CFLAGS) $+ 165 | 166 | ALL_TARGETS += xpack$(PROG_SUFFIX) 167 | 168 | ifdef HARD_LINKS 169 | # Hard link xunpack to xpack 170 | xunpack$(PROG_SUFFIX):xpack$(PROG_SUFFIX) 171 | $(QUIET_LN) ln -f $< $@ 172 | else 173 | # No hard links; copy xpack to xunpack 174 | xunpack$(PROG_SUFFIX):xpack$(PROG_SUFFIX) 175 | $(QUIET_CP) cp -f $< $@ 176 | endif 177 | 178 | ALL_TARGETS += xunpack$(PROG_SUFFIX) 179 | 180 | # Rebuild if CC or PROG_CFLAGS changed 
181 | .prog-cflags: FORCE 182 | @flags='$(CC):$(PROG_CFLAGS)'; \ 183 | if [ "$$flags" != "`cat $@ 2>/dev/null`" ]; then \ 184 | [ -e $@ ] && echo "Rebuilding programs due to new compiler flags"; \ 185 | echo "$$flags" > $@; \ 186 | fi 187 | 188 | ############################################################################## 189 | 190 | all:$(ALL_TARGETS) 191 | 192 | help: 193 | @echo "Available targets:" 194 | @echo "------------------" 195 | @for target in $(ALL_TARGETS); do \ 196 | echo -e "$$target"; \ 197 | done 198 | 199 | clean: 200 | rm -f *.a *.dll *.exe *.exp *.lib *.so \ 201 | lib/*.o lib/*.obj programs/*.o programs/*.obj \ 202 | benchmark xpack xunpack programs/config.h \ 203 | .lib-cflags .prog-cflags 204 | 205 | realclean: clean 206 | rm -f tags cscope* 207 | 208 | FORCE: 209 | 210 | .PHONY: all help clean realclean 211 | 212 | .DEFAULT_GOAL = all 213 | -------------------------------------------------------------------------------- /lib/xpack_common.c: -------------------------------------------------------------------------------- 1 | /* 2 | * xpack_common.c - common code for XPACK compression and decompression 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 
17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifdef ENABLE_PREPROCESSING 29 | 30 | #include 31 | 32 | #ifdef __SSE2__ 33 | # include 34 | #endif 35 | 36 | #ifdef __AVX2__ 37 | # include 38 | #endif 39 | 40 | #include "xpack_common.h" 41 | #include "unaligned.h" 42 | 43 | static void 44 | do_translate_target(void *target, s32 input_pos) 45 | { 46 | s32 abs_offset, rel_offset; 47 | 48 | rel_offset = get_unaligned_le32(target); 49 | if (rel_offset >= -input_pos && rel_offset < MAGIC_FILESIZE) { 50 | if (rel_offset < MAGIC_FILESIZE - input_pos) { 51 | /* "good translation" */ 52 | abs_offset = rel_offset + input_pos; 53 | } else { 54 | /* "compensating translation" */ 55 | abs_offset = rel_offset - MAGIC_FILESIZE; 56 | } 57 | put_unaligned_le32(abs_offset, target); 58 | } 59 | } 60 | 61 | static void 62 | undo_translate_target(void *target, s32 input_pos) 63 | { 64 | s32 abs_offset, rel_offset; 65 | 66 | abs_offset = get_unaligned_le32(target); 67 | if (abs_offset >= 0) { 68 | if (abs_offset < MAGIC_FILESIZE) { 69 | /* "good translation" */ 70 | rel_offset = abs_offset - input_pos; 71 | put_unaligned_le32(rel_offset, target); 72 | } 73 | } else { 74 | if (abs_offset >= -input_pos) { 75 | /* "compensating translation" */ 76 | rel_offset = abs_offset + MAGIC_FILESIZE; 77 | put_unaligned_le32(rel_offset, target); 78 | } 79 | } 80 | } 81 | 82 | static void 83 | e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) 84 | { 85 | 86 | #if !defined(__SSE2__) && 
!defined(__AVX2__) 87 | /* 88 | * A worthwhile optimization is to push the end-of-buffer check into the 89 | * relatively rare E8 case. This is possible if we replace the last six 90 | * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte 91 | * before reaching end-of-buffer. In addition, this scheme guarantees 92 | * that no translation can begin following an E8 byte in the last 10 93 | * bytes because a 4-byte offset containing E8 as its high byte is a 94 | * large negative number that is not valid for translation. That is 95 | * exactly what we need. 96 | */ 97 | u8 *tail; 98 | u8 saved_bytes[6]; 99 | u8 *p; 100 | 101 | if (size <= 10) 102 | return; 103 | 104 | tail = &data[size - 6]; 105 | memcpy(saved_bytes, tail, 6); 106 | memset(tail, 0xE8, 6); 107 | p = data; 108 | for (;;) { 109 | while (*p != 0xE8) 110 | p++; 111 | if (p >= tail) 112 | break; 113 | (*process_target)(p + 1, p - data); 114 | p += 5; 115 | } 116 | memcpy(tail, saved_bytes, 6); 117 | #else 118 | /* SSE2 or AVX-2 optimized version for x86_64 */ 119 | 120 | u8 *p = data; 121 | u64 valid_mask = ~0; 122 | 123 | if (size <= 10) 124 | return; 125 | #ifdef __AVX2__ 126 | # define ALIGNMENT_REQUIRED 32 127 | #else 128 | # define ALIGNMENT_REQUIRED 16 129 | #endif 130 | 131 | /* Process one byte at a time until the pointer is properly aligned. */ 132 | while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) { 133 | if (p >= data + size - 10) 134 | return; 135 | if (*p == 0xE8 && (valid_mask & 1)) { 136 | (*process_target)(p + 1, p - data); 137 | valid_mask &= ~0x1F; 138 | } 139 | p++; 140 | valid_mask >>= 1; 141 | valid_mask |= (u64)1 << 63; 142 | } 143 | 144 | if (data + size - p >= 64) { 145 | 146 | /* Vectorized processing */ 147 | 148 | /* Note: we use a "trap" E8 byte to eliminate the need to check 149 | * for end-of-buffer in the inner loop. This byte is carefully 150 | * positioned so that it will never be changed by a previous 151 | * translation before it is detected. 
*/ 152 | 153 | u8 *trap = p + ((data + size - p) & ~31) - 32 + 4; 154 | u8 saved_byte = *trap; 155 | *trap = 0xE8; 156 | 157 | for (;;) { 158 | u32 e8_mask; 159 | u8 *orig_p = p; 160 | #ifdef __AVX2__ 161 | const __m256i e8_bytes = _mm256_set1_epi8(0xE8); 162 | for (;;) { 163 | __m256i bytes = *(const __m256i *)p; 164 | __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); 165 | e8_mask = _mm256_movemask_epi8(cmpresult); 166 | if (e8_mask) 167 | break; 168 | p += 32; 169 | } 170 | #else 171 | const __m128i e8_bytes = _mm_set1_epi8(0xE8); 172 | for (;;) { 173 | /* Read the next 32 bytes of data and test them 174 | * for E8 bytes. */ 175 | __m128i bytes1 = *(const __m128i *)p; 176 | __m128i bytes2 = *(const __m128i *)(p + 16); 177 | __m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes); 178 | __m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes); 179 | u32 mask1 = _mm_movemask_epi8(cmpresult1); 180 | u32 mask2 = _mm_movemask_epi8(cmpresult2); 181 | /* The masks have a bit set for each E8 byte. 182 | * We stay in this fast inner loop as long as 183 | * there are no E8 bytes. */ 184 | if (mask1 | mask2) { 185 | e8_mask = mask1 | (mask2 << 16); 186 | break; 187 | } 188 | p += 32; 189 | } 190 | #endif 191 | 192 | /* Did we pass over data with no E8 bytes? */ 193 | if (p != orig_p) 194 | valid_mask = ~0; 195 | 196 | /* Are we nearing end-of-buffer? */ 197 | if (p == trap - 4) 198 | break; 199 | 200 | /* Process the E8 bytes. However, the AND with 201 | * 'valid_mask' ensures we never process an E8 byte that 202 | * was itself part of a translation target. */ 203 | while ((e8_mask &= valid_mask)) { 204 | unsigned bit = bsf32(e8_mask); 205 | (*process_target)(p + bit + 1, p + bit - data); 206 | valid_mask &= ~((u64)0x1F << bit); 207 | } 208 | 209 | valid_mask >>= 32; 210 | valid_mask |= 0xFFFFFFFF00000000; 211 | p += 32; 212 | } 213 | 214 | *trap = saved_byte; 215 | } 216 | 217 | /* Approaching the end of the buffer; process one byte a time. 
*/ 218 | while (p < data + size - 10) { 219 | if (*p == 0xE8 && (valid_mask & 1)) { 220 | (*process_target)(p + 1, p - data); 221 | valid_mask &= ~0x1F; 222 | } 223 | p++; 224 | valid_mask >>= 1; 225 | valid_mask |= (u64)1 << 63; 226 | } 227 | #endif /* __SSE2__ || __AVX2__ */ 228 | } 229 | 230 | void 231 | preprocess(void *data, u32 size) 232 | { 233 | e8_filter(data, size, do_translate_target); 234 | } 235 | 236 | void 237 | postprocess(void *data, u32 size) 238 | { 239 | e8_filter(data, size, undo_translate_target); 240 | } 241 | 242 | #endif /* ENABLE_PREPROCESSING */ 243 | -------------------------------------------------------------------------------- /programs/benchmark.c: -------------------------------------------------------------------------------- 1 | /* 2 | * benchmark.c - a compression testing and benchmark program 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "prog_util.h" 29 | 30 | static const tchar *const optstring = T("123456789hL:s:V"); 31 | 32 | static void 33 | show_usage(FILE *fp) 34 | { 35 | fprintf(fp, 36 | "Usage: %"TS" [-123456789hV] [-L LVL] [-s SIZE] [FILE]...\n" 37 | "Benchmark XPACK compression and decompression on the specified FILEs.\n" 38 | "\n" 39 | "Options:\n" 40 | " -1 fastest (worst) compression\n" 41 | " -9 slowest (best) compression\n" 42 | " -h print this help\n" 43 | " -L LVL compression level [1-9] (default 6)\n" 44 | " -s SIZE chunk size (default 524288)\n" 45 | " -V show version and legal information\n", 46 | program_invocation_name); 47 | } 48 | 49 | static void 50 | show_version(void) 51 | { 52 | printf( 53 | "XPACK compression benchmark program, experimental version\n" 54 | "Copyright 2016 Eric Biggers\n" 55 | "\n" 56 | "This program is free software which may be modified and/or redistributed\n" 57 | "under the terms of the MIT license. There is NO WARRANTY, to the extent\n" 58 | "permitted by law. 
See the COPYING file for details.\n" 59 | ); 60 | } 61 | 62 | static int 63 | do_benchmark(struct file_stream *in, void *original_buf, void *compressed_buf, 64 | void *decompressed_buf, u32 chunk_size, 65 | struct xpack_compressor *compressor, 66 | struct xpack_decompressor *decompressor) 67 | { 68 | u64 total_uncompressed_size = 0; 69 | u64 total_compressed_size = 0; 70 | u64 total_compress_time = 0; 71 | u64 total_decompress_time = 0; 72 | ssize_t ret; 73 | 74 | while ((ret = xread(in, original_buf, chunk_size)) > 0) { 75 | u32 original_size = ret; 76 | u32 compressed_size; 77 | u64 start_time; 78 | enum decompress_result result; 79 | 80 | total_uncompressed_size += original_size; 81 | 82 | /* Compress the chunk of data. */ 83 | start_time = current_time(); 84 | compressed_size = xpack_compress(compressor, 85 | original_buf, 86 | original_size, 87 | compressed_buf, 88 | original_size - 1); 89 | total_compress_time += current_time() - start_time; 90 | 91 | if (compressed_size) { 92 | /* Successfully compressed the chunk of data. */ 93 | 94 | /* Decompress the data we just compressed and compare 95 | * the result with the original. */ 96 | start_time = current_time(); 97 | result = xpack_decompress(decompressor, 98 | compressed_buf, 99 | compressed_size, 100 | decompressed_buf, 101 | original_size, 102 | NULL); 103 | total_decompress_time += current_time() - start_time; 104 | 105 | if (result != DECOMPRESS_SUCCESS) { 106 | msg("%"TS": failed to decompress data", 107 | in->name); 108 | return -1; 109 | } 110 | 111 | if (memcmp(original_buf, decompressed_buf, 112 | original_size) != 0) 113 | { 114 | msg("%"TS": data did not decompress to " 115 | "original", in->name); 116 | return -1; 117 | } 118 | 119 | total_compressed_size += compressed_size; 120 | } else { 121 | /* Compression did not make the chunk smaller. 
*/ 122 | total_compressed_size += original_size; 123 | } 124 | } 125 | 126 | if (ret < 0) 127 | return ret; 128 | 129 | if (total_uncompressed_size == 0) { 130 | printf("\tFile was empty.\n"); 131 | return 0; 132 | } 133 | 134 | if (total_compress_time == 0) 135 | total_compress_time = 1; 136 | if (total_decompress_time == 0) 137 | total_decompress_time = 1; 138 | 139 | printf("\tCompressed %"PRIu64 " => %"PRIu64" bytes (%u.%03u%%)\n", 140 | total_uncompressed_size, total_compressed_size, 141 | (unsigned int)(total_compressed_size * 100 / 142 | total_uncompressed_size), 143 | (unsigned int)(total_compressed_size * 100000 / 144 | total_uncompressed_size % 1000)); 145 | printf("\tCompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n", 146 | total_compress_time / 1000000, 147 | 1000 * total_uncompressed_size / total_compress_time); 148 | printf("\tDecompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n", 149 | total_decompress_time / 1000000, 150 | 1000 * total_uncompressed_size / total_decompress_time); 151 | 152 | return 0; 153 | } 154 | 155 | int 156 | tmain(int argc, tchar *argv[]) 157 | { 158 | u32 chunk_size = 524288; 159 | int compression_level = 6; 160 | void *original_buf = NULL; 161 | void *compressed_buf = NULL; 162 | void *decompressed_buf = NULL; 163 | struct xpack_compressor *compressor = NULL; 164 | struct xpack_decompressor *decompressor = NULL; 165 | tchar *default_file_list[] = { NULL }; 166 | int opt_char; 167 | int i; 168 | int ret; 169 | 170 | program_invocation_name = get_filename(argv[0]); 171 | 172 | while ((opt_char = tgetopt(argc, argv, optstring)) != -1) { 173 | switch (opt_char) { 174 | case '1': 175 | case '2': 176 | case '3': 177 | case '4': 178 | case '5': 179 | case '6': 180 | case '7': 181 | case '8': 182 | case '9': 183 | compression_level = opt_char - '0'; 184 | break; 185 | case 'h': 186 | show_usage(stdout); 187 | return 0; 188 | case 'L': 189 | compression_level = parse_compression_level(toptarg); 190 | if (compression_level <= 0) 191 | 
return 1; 192 | break; 193 | case 's': 194 | chunk_size = parse_chunk_size(toptarg); 195 | if (chunk_size == 0) 196 | return 1; 197 | break; 198 | case 'V': 199 | show_version(); 200 | return 0; 201 | default: 202 | show_usage(stderr); 203 | return 1; 204 | } 205 | } 206 | 207 | argc -= toptind; 208 | argv += toptind; 209 | 210 | original_buf = xmalloc(chunk_size); 211 | compressed_buf = xmalloc(chunk_size - 1); 212 | decompressed_buf = xmalloc(chunk_size); 213 | compressor = alloc_compressor(chunk_size, compression_level); 214 | decompressor = alloc_decompressor(); 215 | 216 | ret = -1; 217 | if (original_buf == NULL || compressed_buf == NULL || 218 | decompressed_buf == NULL || 219 | compressor == NULL || decompressor == NULL) 220 | goto out; 221 | 222 | if (argc == 0) { 223 | argv = default_file_list; 224 | argc = ARRAY_LEN(default_file_list); 225 | } else { 226 | for (i = 0; i < argc; i++) 227 | if (argv[i][0] == '-' && argv[i][1] == '\0') 228 | argv[i] = NULL; 229 | } 230 | 231 | printf("Benchmarking XPACK compression:\n"); 232 | printf("\tChunk size: %"PRIu32"\n", chunk_size); 233 | printf("\tCompression level: %d\n", compression_level); 234 | 235 | for (i = 0; i < argc; i++) { 236 | struct file_stream in; 237 | 238 | ret = xopen_for_read(argv[i], &in); 239 | if (ret != 0) 240 | goto out; 241 | 242 | printf("Processing %"TS"...\n", in.name); 243 | 244 | ret = do_benchmark(&in, original_buf, compressed_buf, 245 | decompressed_buf, chunk_size, compressor, 246 | decompressor); 247 | xclose(&in); 248 | if (ret != 0) 249 | goto out; 250 | } 251 | ret = 0; 252 | out: 253 | xpack_free_decompressor(decompressor); 254 | xpack_free_compressor(compressor); 255 | free(decompressed_buf); 256 | free(compressed_buf); 257 | free(original_buf); 258 | return -ret; 259 | } 260 | -------------------------------------------------------------------------------- /common/common_defs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 
common_defs.h 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef COMMON_COMMON_DEFS_H 29 | #define COMMON_COMMON_DEFS_H 30 | 31 | #ifdef __GNUC__ 32 | # include "compiler_gcc.h" 33 | #elif defined(_MSC_VER) 34 | # include "compiler_msc.h" 35 | #else 36 | # pragma message("Unrecognized compiler. Please add a header file for your compiler. 
Compilation will proceed, but performance may suffer!") 37 | #endif 38 | 39 | /* ========================================================================== */ 40 | /* Type definitions */ 41 | /* ========================================================================== */ 42 | 43 | #include /* size_t */ 44 | 45 | #ifndef __bool_true_false_are_defined 46 | # include /* bool */ 47 | #endif 48 | 49 | /* Fixed-width integer types */ 50 | #ifndef PRIu32 51 | # include 52 | #endif 53 | typedef uint8_t u8; 54 | typedef uint16_t u16; 55 | typedef uint32_t u32; 56 | typedef uint64_t u64; 57 | typedef int8_t s8; 58 | typedef int16_t s16; 59 | typedef int32_t s32; 60 | typedef int64_t s64; 61 | 62 | /* 63 | * Word type of the target architecture. Use 'size_t' instead of 'unsigned 64 | * long' to account for platforms such as Windows that use 32-bit 'unsigned 65 | * long' on 64-bit architectures. 66 | */ 67 | typedef size_t machine_word_t; 68 | 69 | /* Number of bytes in a word */ 70 | #define WORDBYTES ((int)sizeof(machine_word_t)) 71 | 72 | /* Number of bits in a word */ 73 | #define WORDBITS (8 * WORDBYTES) 74 | 75 | /* ========================================================================== */ 76 | /* Optional compiler features */ 77 | /* ========================================================================== */ 78 | 79 | /* LIBEXPORT - annotate a function that is part of the library API */ 80 | #ifndef LIBEXPORT 81 | # define LIBEXPORT 82 | #endif 83 | 84 | /* inline - suggest that a function be inlined */ 85 | #ifndef inline 86 | # define inline 87 | #endif 88 | 89 | /* forceinline - force a function to be inlined, if possible */ 90 | #ifndef forceinline 91 | # define forceinline inline 92 | #endif 93 | 94 | /* restrict - annotate a non-aliased pointer */ 95 | #ifndef restrict 96 | # define restrict 97 | #endif 98 | 99 | /* likely(expr) - hint that an expression is usually true */ 100 | #ifndef likely 101 | # define likely(expr) (expr) 102 | #endif 103 | 104 | /* 
unlikely(expr) - hint that an expression is usually false */ 105 | #ifndef unlikely 106 | # define unlikely(expr) (expr) 107 | #endif 108 | 109 | /* prefetchr(addr) - prefetch into L1 cache for read */ 110 | #ifndef prefetchr 111 | # define prefetchr(addr) 112 | #endif 113 | 114 | /* prefetchw(addr) - prefetch into L1 cache for write */ 115 | #ifndef prefetchw 116 | # define prefetchw(addr) 117 | #endif 118 | 119 | /* Does the compiler support the 'target' function attribute? */ 120 | #ifndef COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 121 | # define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 122 | #endif 123 | 124 | /* Does the compiler support __attribute__((target("bmi2")))? */ 125 | #ifndef COMPILER_SUPPORTS_BMI2_TARGET 126 | # define COMPILER_SUPPORTS_BMI2_TARGET 0 127 | #endif 128 | 129 | /* ========================================================================== */ 130 | /* Miscellaneous macros */ 131 | /* ========================================================================== */ 132 | 133 | #define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) 134 | #define MIN(a, b) ((a) <= (b) ? (a) : (b)) 135 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) 136 | #define MAX3(a, b, c) MAX((a), MAX((b), (c))) 137 | #define MAX4(a, b, c, d) MAX((a), MAX3((b), (c), (d))) 138 | #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) 139 | #define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) 140 | 141 | /* ========================================================================== */ 142 | /* Endianness handling */ 143 | /* ========================================================================== */ 144 | 145 | /* 146 | * CPU_IS_LITTLE_ENDIAN() - a macro which evaluates to 1 if the CPU is little 147 | * endian or 0 if it is big endian. The macro should be defined in a way such 148 | * that the compiler can evaluate it at compilation time. If not defined, a 149 | * fallback is used. 
150 | */ 151 | #ifndef CPU_IS_LITTLE_ENDIAN 152 | static forceinline int CPU_IS_LITTLE_ENDIAN(void) 153 | { 154 | union { 155 | unsigned int v; 156 | unsigned char b; 157 | } u; 158 | u.v = 1; 159 | return u.b; 160 | } 161 | #endif 162 | 163 | /* bswap16(n) - swap the bytes of a 16-bit integer */ 164 | #ifndef bswap16 165 | static forceinline u16 bswap16(u16 n) 166 | { 167 | return (n << 8) | (n >> 8); 168 | } 169 | #endif 170 | 171 | /* bswap32(n) - swap the bytes of a 32-bit integer */ 172 | #ifndef bswap32 173 | static forceinline u32 bswap32(u32 n) 174 | { 175 | return ((n & 0x000000FF) << 24) | 176 | ((n & 0x0000FF00) << 8) | 177 | ((n & 0x00FF0000) >> 8) | 178 | ((n & 0xFF000000) >> 24); 179 | } 180 | #endif 181 | 182 | /* bswap64(n) - swap the bytes of a 64-bit integer */ 183 | #ifndef bswap64 184 | static forceinline u64 bswap64(u64 n) 185 | { 186 | return ((n & 0x00000000000000FF) << 56) | 187 | ((n & 0x000000000000FF00) << 40) | 188 | ((n & 0x0000000000FF0000) << 24) | 189 | ((n & 0x00000000FF000000) << 8) | 190 | ((n & 0x000000FF00000000) >> 8) | 191 | ((n & 0x0000FF0000000000) >> 24) | 192 | ((n & 0x00FF000000000000) >> 40) | 193 | ((n & 0xFF00000000000000) >> 56); 194 | } 195 | #endif 196 | 197 | #define le16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap16(n)) 198 | #define le32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap32(n)) 199 | #define le64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap64(n)) 200 | 201 | /* ========================================================================== */ 202 | /* Unaligned memory accesses */ 203 | /* ========================================================================== */ 204 | 205 | /* 206 | * UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses 207 | * can be performed efficiently on the target platform. 
208 | */ 209 | #ifndef UNALIGNED_ACCESS_IS_FAST 210 | # define UNALIGNED_ACCESS_IS_FAST 0 211 | #endif 212 | 213 | /* 214 | * DEFINE_UNALIGNED_TYPE(type) - a macro that, given an integer type 'type', 215 | * defines load_type_unaligned(addr) and store_type_unaligned(v, addr) functions 216 | * which load and store variables of type 'type' from/to unaligned memory 217 | * addresses. If not defined, a fallback is used. 218 | */ 219 | #ifndef DEFINE_UNALIGNED_TYPE 220 | 221 | /* 222 | * Although memcpy() may seem inefficient, it *usually* gets optimized 223 | * appropriately by modern compilers. It's portable and may be the best we can 224 | * do for a fallback... 225 | */ 226 | #include 227 | 228 | #define DEFINE_UNALIGNED_TYPE(type) \ 229 | \ 230 | static forceinline type \ 231 | load_##type##_unaligned(const void *p) \ 232 | { \ 233 | type v; \ 234 | memcpy(&v, p, sizeof(v)); \ 235 | return v; \ 236 | } \ 237 | \ 238 | static forceinline void \ 239 | store_##type##_unaligned(type v, void *p) \ 240 | { \ 241 | memcpy(p, &v, sizeof(v)); \ 242 | } 243 | 244 | #endif /* !DEFINE_UNALIGNED_TYPE */ 245 | 246 | /* ========================================================================== */ 247 | /* Bit scan functions */ 248 | /* ========================================================================== */ 249 | 250 | /* 251 | * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least 252 | * significant end) of the *most* significant 1 bit in the input value. The 253 | * input value must be nonzero! 
254 | */ 255 | 256 | #ifndef bsr32 257 | static forceinline unsigned 258 | bsr32(u32 n) 259 | { 260 | unsigned i = 0; 261 | while ((n >>= 1) != 0) 262 | i++; 263 | return i; 264 | } 265 | #endif 266 | 267 | #ifndef bsr64 268 | static forceinline unsigned 269 | bsr64(u64 n) 270 | { 271 | unsigned i = 0; 272 | while ((n >>= 1) != 0) 273 | i++; 274 | return i; 275 | } 276 | #endif 277 | 278 | static forceinline unsigned 279 | bsrw(machine_word_t n) 280 | { 281 | STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); 282 | if (WORDBITS == 32) 283 | return bsr32(n); 284 | else 285 | return bsr64(n); 286 | } 287 | 288 | /* 289 | * Bit Scan Forward (BSF) - find the 0-based index (relative to the least 290 | * significant end) of the *least* significant 1 bit in the input value. The 291 | * input value must be nonzero! 292 | */ 293 | 294 | #ifndef bsf32 295 | static forceinline unsigned 296 | bsf32(u32 n) 297 | { 298 | unsigned i = 0; 299 | while ((n & 1) == 0) { 300 | i++; 301 | n >>= 1; 302 | } 303 | return i; 304 | } 305 | #endif 306 | 307 | #ifndef bsf64 308 | static forceinline unsigned 309 | bsf64(u64 n) 310 | { 311 | unsigned i = 0; 312 | while ((n & 1) == 0) { 313 | i++; 314 | n >>= 1; 315 | } 316 | return i; 317 | } 318 | #endif 319 | 320 | static forceinline unsigned 321 | bsfw(machine_word_t n) 322 | { 323 | STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); 324 | if (WORDBITS == 32) 325 | return bsf32(n); 326 | else 327 | return bsf64(n); 328 | } 329 | 330 | #endif /* COMMON_COMMON_DEFS_H */ 331 | -------------------------------------------------------------------------------- /programs/prog_util.c: -------------------------------------------------------------------------------- 1 | /* 2 | * prog_util.c - utility functions for programs 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software 
without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "prog_util.h" 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #ifdef _WIN32 35 | # include 36 | #else 37 | # include 38 | # include 39 | #endif 40 | 41 | /* The invocation name of the program (filename component only) */ 42 | const tchar *program_invocation_name; 43 | 44 | static void 45 | do_msg(const char *format, bool with_errno, va_list va) 46 | { 47 | int saved_errno = errno; 48 | 49 | fprintf(stderr, "%"TS": ", program_invocation_name); 50 | vfprintf(stderr, format, va); 51 | if (with_errno) 52 | fprintf(stderr, ": %s\n", strerror(saved_errno)); 53 | else 54 | fprintf(stderr, "\n"); 55 | 56 | errno = saved_errno; 57 | } 58 | 59 | /* Print a message to standard error */ 60 | void 61 | msg(const char *format, ...) 62 | { 63 | va_list va; 64 | 65 | va_start(va, format); 66 | do_msg(format, false, va); 67 | va_end(va); 68 | } 69 | 70 | /* Print a message to standard error, including a description of errno */ 71 | void 72 | msg_errno(const char *format, ...) 
73 | { 74 | va_list va; 75 | 76 | va_start(va, format); 77 | do_msg(format, true, va); 78 | va_end(va); 79 | } 80 | 81 | /* malloc() wrapper */ 82 | void * 83 | xmalloc(size_t size) 84 | { 85 | void *p = malloc(size); 86 | if (p == NULL && size == 0) 87 | p = malloc(1); 88 | if (p == NULL) 89 | msg("Out of memory"); 90 | return p; 91 | } 92 | 93 | /* 94 | * Retrieve the current time in nanoseconds since a start time which is fixed 95 | * for the duration of program execution but is otherwise unspecified 96 | */ 97 | u64 98 | current_time(void) 99 | { 100 | #ifdef _WIN32 101 | FILETIME ft; 102 | GetSystemTimeAsFileTime(&ft); 103 | return 100 * (((u64)ft.dwHighDateTime << 32) | ft.dwLowDateTime); 104 | #elif defined(HAVE_CLOCK_GETTIME) 105 | struct timespec ts; 106 | clock_gettime(CLOCK_MONOTONIC, &ts); 107 | return (1000000000 * (u64)ts.tv_sec) + ts.tv_nsec; 108 | #else 109 | struct timeval tv; 110 | gettimeofday(&tv, NULL); 111 | return (1000000000 * (u64)tv.tv_sec) + (1000 * (u64)tv.tv_usec); 112 | #endif 113 | } 114 | 115 | /* 116 | * Retrieve a pointer to the filename component of the specified path. 117 | * 118 | * Note: this does not modify the path. Therefore, it is not guaranteed to work 119 | * properly for directories, since a path to a directory might have trailing 120 | * slashes. 
121 | */ 122 | const tchar * 123 | get_filename(const tchar *path) 124 | { 125 | const tchar *slash = tstrrchr(path, '/'); 126 | #ifdef _WIN32 127 | const tchar *backslash = tstrrchr(path, '\\'); 128 | if (backslash != NULL && (slash == NULL || backslash > slash)) 129 | slash = backslash; 130 | #endif 131 | if (slash != NULL) 132 | return slash + 1; 133 | return path; 134 | } 135 | 136 | /* Create a copy of 'path' surrounded by double quotes */ 137 | static tchar * 138 | quote_path(const tchar *path) 139 | { 140 | size_t len = tstrlen(path); 141 | tchar *result; 142 | 143 | result = xmalloc((1 + len + 1 + 1) * sizeof(tchar)); 144 | if (result == NULL) 145 | return NULL; 146 | result[0] = '"'; 147 | tmemcpy(&result[1], path, len); 148 | result[1 + len] = '"'; 149 | result[1 + len + 1] = '\0'; 150 | return result; 151 | } 152 | 153 | /* Open a file for reading, or set up standard input for reading */ 154 | int 155 | xopen_for_read(const tchar *path, struct file_stream *strm) 156 | { 157 | if (path == NULL) { 158 | strm->is_standard_stream = true; 159 | strm->name = T("standard input"); 160 | strm->fd = STDIN_FILENO; 161 | #ifdef _WIN32 162 | _setmode(strm->fd, O_BINARY); 163 | #endif 164 | return 0; 165 | } 166 | 167 | strm->is_standard_stream = false; 168 | 169 | strm->name = quote_path(path); 170 | if (strm->name == NULL) 171 | return -1; 172 | 173 | strm->fd = topen(path, O_RDONLY | O_BINARY | O_NOFOLLOW); 174 | if (strm->fd < 0) { 175 | msg_errno("Can't open %"TS" for reading", strm->name); 176 | free(strm->name); 177 | return -1; 178 | } 179 | 180 | return 0; 181 | } 182 | 183 | /* Open a file for writing, or set up standard output for writing */ 184 | int 185 | xopen_for_write(const tchar *path, bool overwrite, struct file_stream *strm) 186 | { 187 | if (path == NULL) { 188 | strm->is_standard_stream = true; 189 | strm->name = T("standard output"); 190 | strm->fd = STDOUT_FILENO; 191 | #ifdef _WIN32 192 | _setmode(strm->fd, O_BINARY); 193 | #endif 194 | return 
0; 195 | } 196 | 197 | strm->is_standard_stream = false; 198 | 199 | strm->name = quote_path(path); 200 | if (strm->name == NULL) 201 | goto err; 202 | retry: 203 | strm->fd = topen(path, O_WRONLY | O_BINARY | O_NOFOLLOW | 204 | O_CREAT | O_EXCL, 0644); 205 | if (strm->fd < 0) { 206 | if (errno != EEXIST) { 207 | msg_errno("Can't open %"TS" for writing", strm->name); 208 | goto err; 209 | } 210 | if (!overwrite) { 211 | if (!isatty(STDERR_FILENO) || !isatty(STDIN_FILENO)) { 212 | msg("%"TS" already exists; use -f to overwrite", 213 | strm->name); 214 | goto err; 215 | } 216 | fprintf(stderr, "%"TS": %"TS" already exists; " 217 | "overwrite? (y/n) ", 218 | program_invocation_name, strm->name); 219 | if (getchar() != 'y') { 220 | msg("Not overwriting."); 221 | goto err; 222 | } 223 | } 224 | if (tunlink(path) != 0) { 225 | msg_errno("Unable to delete %"TS, strm->name); 226 | goto err; 227 | } 228 | goto retry; 229 | } 230 | 231 | return 0; 232 | 233 | err: 234 | free(strm->name); 235 | return -1; 236 | } 237 | 238 | /* 239 | * Read from a file, returning the full count to indicate all bytes were read, a 240 | * short count (possibly 0) to indicate EOF, or -1 to indicate error. 
241 | */ 242 | ssize_t 243 | xread(struct file_stream *strm, void *buf, size_t count) 244 | { 245 | char *p = buf; 246 | size_t orig_count = count; 247 | 248 | while (count != 0) { 249 | ssize_t res = read(strm->fd, p, MIN(count, INT_MAX)); 250 | if (res == 0) 251 | break; 252 | if (res < 0) { 253 | msg_errno("Error reading from %"TS, strm->name); 254 | return -1; 255 | } 256 | p += res; 257 | count -= res; 258 | } 259 | return orig_count - count; 260 | } 261 | 262 | /* Skip over 'count' bytes from a file, returning 0 on success or -1 on error */ 263 | int 264 | skip_bytes(struct file_stream *strm, size_t count) 265 | { 266 | size_t bufsize; 267 | char *buffer; 268 | ssize_t ret; 269 | 270 | if (count == 0) 271 | return 0; 272 | 273 | bufsize = MIN(count, 4096); 274 | buffer = xmalloc(bufsize); 275 | if (buffer == NULL) 276 | return -1; 277 | do { 278 | size_t n = MIN(count, bufsize); 279 | ret = xread(strm, buffer, n); 280 | if (ret < 0) 281 | goto out; 282 | if (ret != n) { 283 | msg("%"TS": unexpected end-of-file", strm->name); 284 | ret = -1; 285 | goto out; 286 | } 287 | count -= ret; 288 | } while (count != 0); 289 | ret = 0; 290 | out: 291 | free(buffer); 292 | return ret; 293 | } 294 | 295 | /* Write to a file, returning 0 if all bytes were written or -1 on error */ 296 | int 297 | full_write(struct file_stream *strm, const void *buf, size_t count) 298 | { 299 | const char *p = buf; 300 | 301 | while (count != 0) { 302 | ssize_t res = write(strm->fd, p, MIN(count, INT_MAX)); 303 | if (res <= 0) { 304 | msg_errno("Error writing to %"TS, strm->name); 305 | return -1; 306 | } 307 | p += res; 308 | count -= res; 309 | } 310 | return 0; 311 | } 312 | 313 | /* Close a file, returning 0 on success or -1 on error */ 314 | int 315 | xclose(struct file_stream *strm) 316 | { 317 | int ret = 0; 318 | if (strm->fd >= 0 && !strm->is_standard_stream) { 319 | if (close(strm->fd) != 0) { 320 | msg_errno("Error closing %"TS, strm->name); 321 | ret = -1; 322 | } 323 | 
free(strm->name); 324 | } 325 | strm->fd = -1; 326 | strm->name = NULL; 327 | return ret; 328 | } 329 | 330 | /* 331 | * Parse the chunk size given on the command line, returning the chunk size on 332 | * success or 0 on error 333 | */ 334 | u32 335 | parse_chunk_size(const tchar *arg) 336 | { 337 | tchar *tmp; 338 | unsigned long chunk_size = tstrtoul(arg, &tmp, 10); 339 | 340 | if (chunk_size < 1024 || chunk_size > 67108864 || *tmp != '\0') { 341 | msg("Invalid chunk size : \"%"TS"\". " 342 | "Must be an integer in the range [1024, 67108864]", arg); 343 | return 0; 344 | } 345 | 346 | return chunk_size; 347 | } 348 | 349 | /* 350 | * Parse the compression level given on the command line, returning the 351 | * compression level on success or 0 on error 352 | */ 353 | int 354 | parse_compression_level(const tchar *arg) 355 | { 356 | tchar *tmp; 357 | unsigned long level = tstrtoul(arg, &tmp, 10); 358 | 359 | if (level < 1 || level > 9 || *tmp != '\0') { 360 | msg("Invalid compression level: \"%"TS"\". 
" 361 | "Must be an integer in the range [1, 9].", arg); 362 | return 0; 363 | } 364 | 365 | return level; 366 | } 367 | 368 | /* Allocate a new XPACK compressor */ 369 | struct xpack_compressor * 370 | alloc_compressor(u32 chunk_size, int level) 371 | { 372 | struct xpack_compressor *c; 373 | 374 | c = xpack_alloc_compressor(chunk_size, level); 375 | if (c == NULL) { 376 | msg_errno("Unable to allocate compressor with " 377 | "chunk size %"PRIu32" and compression level %d", 378 | chunk_size, level); 379 | } 380 | return c; 381 | } 382 | 383 | /* Allocate a new XPACK decompressor */ 384 | struct xpack_decompressor * 385 | alloc_decompressor(void) 386 | { 387 | struct xpack_decompressor *d; 388 | 389 | d = xpack_alloc_decompressor(); 390 | if (d == NULL) 391 | msg_errno("Unable to allocate decompressor"); 392 | 393 | return d; 394 | } 395 | -------------------------------------------------------------------------------- /lib/hc_matchfinder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists 3 | * 4 | * --------------------------------------------------------------------------- 5 | * 6 | * Algorithm 7 | * 8 | * This is a Hash Chains (hc) based matchfinder. 9 | * 10 | * The main data structure is a hash table where each hash bucket contains a 11 | * linked list (or "chain") of sequences whose first 4 bytes share the same hash 12 | * code. Each sequence is identified by its starting position in the input 13 | * buffer. 14 | * 15 | * The algorithm processes the input buffer sequentially. At each byte 16 | * position, the hash code of the first 4 bytes of the sequence beginning at 17 | * that position (the sequence being matched against) is computed. This 18 | * identifies the hash bucket to use for that position. Then, this hash 19 | * bucket's linked list is searched for matches. 
Then, a new linked list node 20 | * is created to represent the current sequence and is prepended to the list. 21 | * 22 | * This algorithm has several useful properties: 23 | * 24 | * - It only finds true Lempel-Ziv matches; i.e., those where the matching 25 | * sequence occurs prior to the sequence being matched against. 26 | * 27 | * - The sequences in each linked list are always sorted by decreasing starting 28 | * position. Therefore, the closest (smallest offset) matches are found 29 | * first, which in many compression formats tend to be the cheapest to encode. 30 | * 31 | * - Although fast running time is not guaranteed due to the possibility of the 32 | * lists getting very long, the worst degenerate behavior can be easily 33 | * prevented by capping the number of nodes searched at each position. 34 | * 35 | * - If the compressor decides not to search for matches at a certain position, 36 | * then that position can be quickly inserted without searching the list. 37 | * 38 | * - The algorithm is adaptable to sliding windows: just store the positions 39 | * relative to a "base" value that is updated from time to time, and stop 40 | * searching each list when the sequences get too far away. 41 | * 42 | * --------------------------------------------------------------------------- 43 | * 44 | * Notes on usage 45 | * 46 | * The number of bytes that must be allocated for a given 'struct 47 | * hc_matchfinder' must be gotten by calling hc_matchfinder_size(). 48 | * 49 | * ---------------------------------------------------------------------------- 50 | * 51 | * Optimizations 52 | * 53 | * The main hash table and chains handle length 4+ matches. Length 3 matches 54 | * are handled by a separate hash table with no chains. This works well for 55 | * typical "greedy" or "lazy"-style compressors, where length 3 matches are 56 | * often only helpful if they have small offsets. 
Instead of searching a full 57 | * chain for length 3+ matches, the algorithm just checks for one close length 3 58 | * match, then focuses on finding length 4+ matches. 59 | * 60 | * The longest_match() and skip_positions() functions are inlined into the 61 | * compressors that use them. This isn't just about saving the overhead of a 62 | * function call. These functions are intended to be called from the inner 63 | * loops of compressors, where giving the compiler more control over register 64 | * allocation is very helpful. There is also significant benefit to be gained 65 | * from allowing the CPU to predict branches independently at each call site. 66 | * For example, "lazy"-style compressors can be written with two calls to 67 | * longest_match(), each of which starts with a different 'best_len' and 68 | * therefore has significantly different performance characteristics. 69 | * 70 | * Although any hash function can be used, a multiplicative hash is fast and 71 | * works well. 72 | * 73 | * On some processors, it is significantly faster to extend matches by whole 74 | * words (32 or 64 bits) instead of by individual bytes. For this to be the 75 | * case, the processor must implement unaligned memory accesses efficiently and 76 | * must have either a fast "find first set bit" instruction or a fast "find last 77 | * set bit" instruction, depending on the processor's endianness. 78 | * 79 | * The code uses one loop for finding the first match and one loop for finding a 80 | * longer match. Each of these loops is tuned for its respective task and in 81 | * combination are faster than a single generalized loop that handles both 82 | * tasks. 83 | * 84 | * The code also uses a tight inner loop that only compares the last and first 85 | * bytes of a potential match. It is only when these bytes match that a full 86 | * match extension is attempted. 
87 | * 88 | * ---------------------------------------------------------------------------- 89 | */ 90 | 91 | #include 92 | 93 | #include "lz_extend.h" 94 | #include "lz_hash.h" 95 | #include "unaligned.h" 96 | 97 | #define HC_MATCHFINDER_HASH3_ORDER 15 98 | #define HC_MATCHFINDER_HASH4_ORDER 16 99 | 100 | struct hc_matchfinder { 101 | 102 | /* The hash table for finding length 3 matches */ 103 | u32 hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; 104 | 105 | /* The hash table which contains the first nodes of the linked lists for 106 | * finding length 4+ matches */ 107 | u32 hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; 108 | 109 | /* The "next node" references for the linked lists. The "next node" of 110 | * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ 111 | u32 next_tab[]; 112 | }; 113 | 114 | /* 115 | * Return the number of bytes that must be allocated for a 'hc_matchfinder' that 116 | * can work with buffers up to the specified size. 117 | */ 118 | static forceinline size_t 119 | hc_matchfinder_size(size_t max_bufsize) 120 | { 121 | return sizeof(struct hc_matchfinder) + (max_bufsize * sizeof(u32)); 122 | } 123 | 124 | /* Prepare the matchfinder for a new input buffer. */ 125 | static forceinline void 126 | hc_matchfinder_init(struct hc_matchfinder *mf) 127 | { 128 | memset(mf, 0, sizeof(*mf)); 129 | } 130 | 131 | /* 132 | * Find the longest match longer than 'best_len' bytes. 133 | * 134 | * @mf 135 | * The matchfinder structure. 136 | * @in_begin 137 | * Pointer to the beginning of the input buffer. 138 | * @cur_pos 139 | * The current position in the input buffer (the position of the sequence 140 | * being matched against). 141 | * @best_len 142 | * Require a match longer than this length. 143 | * @max_len 144 | * The maximum permissible match length at this position. 145 | * @nice_len 146 | * Stop searching if a match of at least this length is found. 147 | * Must be <= @max_len. 
148 | * @max_search_depth 149 | * Limit on the number of potential matches to consider. Must be >= 1. 150 | * @next_hashes 151 | * The precomputed hash codes for the sequence beginning at @in_next. 152 | * These will be used and then updated with the precomputed hashcodes for 153 | * the sequence beginning at @in_next + 1. 154 | * @offset_ret 155 | * If a match is found, its offset is returned in this location. 156 | * 157 | * Return the length of the match found, or 'best_len' if no match longer than 158 | * 'best_len' was found. 159 | */ 160 | static forceinline u32 161 | hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, 162 | const u8 * const restrict in_begin, 163 | const ptrdiff_t cur_pos, 164 | u32 best_len, 165 | const u32 max_len, 166 | const u32 nice_len, 167 | const u32 max_search_depth, 168 | u32 next_hashes[restrict 2], 169 | u32 * const restrict offset_ret) 170 | { 171 | const u8 *in_next = in_begin + cur_pos; 172 | u32 depth_remaining = max_search_depth; 173 | const u8 *best_matchptr = in_next; 174 | u32 cur_node3, cur_node4; 175 | u32 hash3, hash4; 176 | u32 next_seq3, next_seq4; 177 | u32 seq4; 178 | const u8 *matchptr; 179 | u32 len; 180 | 181 | if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */ 182 | goto out; 183 | 184 | /* Get the precomputed hash codes */ 185 | hash3 = next_hashes[0]; 186 | hash4 = next_hashes[1]; 187 | 188 | /* From the hash buckets, get the first node of each linked list. */ 189 | cur_node3 = mf->hash3_tab[hash3]; 190 | cur_node4 = mf->hash4_tab[hash4]; 191 | 192 | /* Update for length 3 matches. This replaces the singleton node in the 193 | * 'hash3' bucket with the node for the current sequence. */ 194 | mf->hash3_tab[hash3] = cur_pos; 195 | 196 | /* Update for length 4 matches. This prepends the node for the current 197 | * sequence to the linked list in the 'hash4' bucket. 
*/ 198 | mf->hash4_tab[hash4] = cur_pos; 199 | mf->next_tab[cur_pos] = cur_node4; 200 | 201 | /* Compute the next hash codes */ 202 | next_seq4 = load_u32_unaligned(in_next + 1); 203 | next_seq3 = loaded_u32_to_u24(next_seq4); 204 | next_hashes[0] = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER); 205 | next_hashes[1] = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER); 206 | prefetchw(&mf->hash3_tab[next_hashes[0]]); 207 | prefetchw(&mf->hash4_tab[next_hashes[1]]); 208 | 209 | if (best_len < 4) { /* No match of length >= 4 found yet? */ 210 | 211 | /* Check for a length 3 match if needed */ 212 | 213 | if (!cur_node3) 214 | goto out; 215 | 216 | seq4 = load_u32_unaligned(in_next); 217 | 218 | if (best_len < 3) { 219 | matchptr = &in_begin[cur_node3]; 220 | if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { 221 | best_len = 3; 222 | best_matchptr = matchptr; 223 | } 224 | } 225 | 226 | /* Check for a length 4 match */ 227 | 228 | if (!cur_node4) 229 | goto out; 230 | 231 | for (;;) { 232 | /* No length 4 match found yet. Check the first 4 bytes. */ 233 | matchptr = &in_begin[cur_node4]; 234 | 235 | if (load_u32_unaligned(matchptr) == seq4) 236 | break; 237 | 238 | /* The first 4 bytes did not match. Keep trying. */ 239 | cur_node4 = mf->next_tab[cur_node4]; 240 | if (!cur_node4 || !--depth_remaining) 241 | goto out; 242 | } 243 | 244 | /* Found a match of length >= 4. Extend it to its full length. */ 245 | best_matchptr = matchptr; 246 | best_len = lz_extend(in_next, best_matchptr, 4, max_len); 247 | if (best_len >= nice_len) 248 | goto out; 249 | cur_node4 = mf->next_tab[cur_node4]; 250 | if (!cur_node4 || !--depth_remaining) 251 | goto out; 252 | } else { 253 | if (!cur_node4 || best_len >= nice_len) 254 | goto out; 255 | } 256 | 257 | /* Check for matches of length >= 5 */ 258 | 259 | for (;;) { 260 | for (;;) { 261 | matchptr = &in_begin[cur_node4]; 262 | 263 | /* Already found a length 4 match. 
Try for a longer 264 | * match; start by checking either the last 4 bytes and 265 | * the first 4 bytes, or the last byte. (The last byte, 266 | * the one which would extend the match length by 1, is 267 | * the most important.) */ 268 | #if UNALIGNED_ACCESS_IS_FAST 269 | if ((load_u32_unaligned(matchptr + best_len - 3) == 270 | load_u32_unaligned(in_next + best_len - 3)) && 271 | (load_u32_unaligned(matchptr) == 272 | load_u32_unaligned(in_next))) 273 | #else 274 | if (matchptr[best_len] == in_next[best_len]) 275 | #endif 276 | break; 277 | 278 | /* Continue to the next node in the list */ 279 | cur_node4 = mf->next_tab[cur_node4]; 280 | if (!cur_node4 || !--depth_remaining) 281 | goto out; 282 | } 283 | 284 | #if UNALIGNED_ACCESS_IS_FAST 285 | len = 4; 286 | #else 287 | len = 0; 288 | #endif 289 | len = lz_extend(in_next, matchptr, len, max_len); 290 | if (len > best_len) { 291 | /* This is the new longest match */ 292 | best_len = len; 293 | best_matchptr = matchptr; 294 | if (best_len >= nice_len) 295 | goto out; 296 | } 297 | 298 | /* Continue to the next node in the list */ 299 | cur_node4 = mf->next_tab[cur_node4]; 300 | if (!cur_node4 || !--depth_remaining) 301 | goto out; 302 | } 303 | out: 304 | *offset_ret = in_next - best_matchptr; 305 | return best_len; 306 | } 307 | 308 | /* 309 | * Advance the matchfinder, but don't search for matches. 310 | * 311 | * @mf 312 | * The matchfinder structure. 313 | * @in_begin 314 | * Pointer to the beginning of the input buffer. 315 | * @cur_pos 316 | * The current position in the input buffer (the position of the sequence 317 | * being matched against). 318 | * @end_pos 319 | * The length of the input buffer. 320 | * @next_hashes 321 | * The precomputed hash codes for the sequence beginning at @in_next. 322 | * These will be used and then updated with the precomputed hashcodes for 323 | * the sequence beginning at @in_next + @count. 324 | * @count 325 | * The number of bytes to advance. Must be > 0. 
326 | * 327 | * Returns @in_next + @count. 328 | */ 329 | static forceinline const u8 * 330 | hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf, 331 | const u8 * const restrict in_begin, 332 | const ptrdiff_t cur_pos, 333 | const ptrdiff_t end_pos, 334 | const u32 count, 335 | u32 next_hashes[restrict 2]) 336 | { 337 | const u8 *in_next = in_begin + cur_pos; 338 | const u8 * const stop_ptr = in_next + count; 339 | 340 | if (likely(count + 5 <= end_pos - cur_pos)) { 341 | u32 hash3, hash4; 342 | u32 next_seq3, next_seq4; 343 | 344 | hash3 = next_hashes[0]; 345 | hash4 = next_hashes[1]; 346 | do { 347 | mf->hash3_tab[hash3] = in_next - in_begin; 348 | mf->next_tab[in_next - in_begin] = mf->hash4_tab[hash4]; 349 | mf->hash4_tab[hash4] = in_next - in_begin; 350 | 351 | next_seq4 = load_u32_unaligned(++in_next); 352 | next_seq3 = loaded_u32_to_u24(next_seq4); 353 | hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER); 354 | hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER); 355 | 356 | } while (in_next != stop_ptr); 357 | 358 | prefetchw(&mf->hash3_tab[hash3]); 359 | prefetchw(&mf->hash4_tab[hash4]); 360 | next_hashes[0] = hash3; 361 | next_hashes[1] = hash4; 362 | } 363 | 364 | return stop_ptr; 365 | } 366 | -------------------------------------------------------------------------------- /lib/decompress_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * decompress_impl.h - XPACK decompression implementation 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, 
subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * This is the actual decompression routine, lifted out of xpack_decompress.c so
 * that it can be compiled with different target instruction sets.
 */

/*
 * Decompress a buffer of XPACK-compressed data (one or more blocks).
 *
 * @d			decompressor state: FSE state counts and decode tables
 * @in, @in_nbytes	the compressed input buffer
 * @out, @out_nbytes_avail	the output buffer for the uncompressed data
 * @actual_out_nbytes_ret	if non-NULL, receives the actual uncompressed
 *				size; if NULL, the output must fill the buffer
 *				exactly or DECOMPRESS_SHORT_OUTPUT is returned
 *
 * Returns DECOMPRESS_SUCCESS, DECOMPRESS_INSUFFICIENT_SPACE, or
 * DECOMPRESS_SHORT_OUTPUT; SAFETY_CHECK() failures return early with a
 * bad-data result (macro defined by the includer — see xpack_decompress.c).
 */
static enum decompress_result ATTRIBUTES
FUNCNAME(struct xpack_decompressor * restrict d,
	 const void * restrict in, size_t in_nbytes,
	 void * restrict out, size_t out_nbytes_avail,
	 size_t *actual_out_nbytes_ret)
{
	const u8 *in_next = in;
	const u8 * const in_end = in_next + in_nbytes;
	u8 *out_next = out;
	u8 * const out_end = out_next + out_nbytes_avail;
	u8 *out_block_end;
	u32 recent_offsets[NUM_REPS];
#ifdef ENABLE_PREPROCESSING
	unsigned preprocessed = 0;
#endif
	machine_word_t bitbuf = 0;	/* bitstream buffer (used by ENSURE_BITS/POP_BITS) */
	unsigned bitsleft = 0;		/* number of valid bits in 'bitbuf' */
	size_t overrun_count = 0;	/* presumably tracks virtual bytes read past in_end by ENSURE_BITS — verify in macro */
	unsigned num_padding_bits;
	unsigned num_state_counts;
	unsigned is_final_block;
	unsigned block_type;
	size_t block_usize;
	s32 num_literals;	/* signed: goes negative on corrupt run lengths, caught by SAFETY_CHECK */
	u8 *literals;
	u8 *lits;
	u8 *lits_end;
	unsigned order;
	const u8 *extra_bytes;
	u32 num_extra_bytes;
	unsigned log2_num_literal_states;
	unsigned log2_num_litrunlen_states;
	unsigned log2_num_length_states;
	unsigned log2_num_offset_states;
	unsigned log2_num_aligned_states;
#if NUM_LITERAL_STREAMS == 2
	unsigned literal_state_1;
	unsigned literal_state_2;
#else
	unsigned literal_state;
#endif
	unsigned litrunlen_state;
	unsigned length_state;
	unsigned offset_state;
	unsigned aligned_state;
	unsigned i;
	u32 sym;	/* scratch — presumably referenced by DECODE_SYMBOL(); verify in macro */
	u32 bits;	/* scratch — presumably referenced by the bitstream macros; verify */

	init_recent_offsets(recent_offsets);

next_block:
	/* Starting to decompress the next block */

	ENSURE_BITS(1 + NUM_BLOCKTYPE_BITS + 1 + NUM_BLOCKSIZE_BITS);

	/* "final block" flag */
	is_final_block = POP_BITS(1);

	/* block type */
	block_type = POP_BITS(NUM_BLOCKTYPE_BITS);

	/* block uncompressed size */
	if (POP_BITS(1))
		block_usize = DEFAULT_BLOCK_SIZE;
	else
		block_usize = POP_BITS(NUM_BLOCKSIZE_BITS);

	SAFETY_CHECK(block_type == BLOCKTYPE_ALIGNED ||
		     block_type == BLOCKTYPE_VERBATIM);

	if (unlikely(block_usize > out_end - out_next))
		return DECOMPRESS_INSUFFICIENT_SPACE;

	SAFETY_CHECK(block_usize > 0);

	out_block_end = out_next + block_usize;

	/* Read the FSE state counts for each alphabet. */
	ENSURE_BITS(20);
	log2_num_literal_states = POP_BITS(4);
	log2_num_litrunlen_states = POP_BITS(4);
	log2_num_length_states = POP_BITS(4);
	log2_num_offset_states = POP_BITS(4);
	if (block_type == BLOCKTYPE_ALIGNED)
		log2_num_aligned_states = POP_BITS(4);
	else
		log2_num_aligned_states = 0;

	SAFETY_CHECK(log2_num_literal_states <= MAX_LOG2_NUM_LITERAL_STATES &&
		     log2_num_litrunlen_states <= MAX_LOG2_NUM_LITRUNLEN_STATES &&
		     log2_num_length_states <= MAX_LOG2_NUM_LENGTH_STATES &&
		     log2_num_offset_states <= MAX_LOG2_NUM_OFFSET_STATES &&
		     log2_num_aligned_states <= MAX_LOG2_NUM_ALIGNED_STATES);

	/* The loop below indexes d->state_counts[] straight through all
	 * alphabets, so the aligned counts must directly follow state_counts
	 * in memory. */
#ifndef _MSC_VER
	STATIC_ASSERT(offsetof(struct xpack_decompressor,
			       aligned_state_counts[ALIGNED_ALPHABET_SIZE]) ==
		      offsetof(struct xpack_decompressor, state_counts) +
		      sizeof(d->state_counts));
#endif

	num_state_counts = ARRAY_LEN(d->state_counts);
	if (block_type != BLOCKTYPE_ALIGNED)
		num_state_counts -= ALIGNED_ALPHABET_SIZE;

	for (i = 0; i < num_state_counts; ) {
		unsigned code;

		ENSURE_BITS(CODEBITS + MAX_EXTRA_CODEBITS);

		code = POP_BITS(CODEBITS);

		if (code < ZEROCODE1) {
			/* single nonzero count */
			d->state_counts[i++] = (1 << code) + POP_BITS(code);
		} else {
			unsigned num_zeroes;

			if (code == ZEROCODE1) {
				/* a few zeroes */
				num_zeroes = ZEROCODE1_MIN +
					     POP_BITS(ZEROCODE1_NBITS);
			} else {
				/* many zeroes */
				num_zeroes = ZEROCODE2_MIN +
					     POP_BITS(ZEROCODE2_NBITS);
			}
			SAFETY_CHECK(num_zeroes <= num_state_counts - i);
			do {
				d->state_counts[i++] = 0;
			} while (--num_zeroes);
		}
	}

#ifdef ENABLE_PREPROCESSING
	/* 0xE8 is the x86 CALL opcode; a nonzero count for it means the
	 * compressor may have preprocessed the data — postprocess at the end. */
	preprocessed |= d->literal_state_counts[0xE8];
#endif

	/* Prepare the extra_bytes pointer. */

	ENSURE_BITS(5);
	order = POP_BITS(5);
	STATIC_ASSERT(CAN_ENSURE(25));
	SAFETY_CHECK(order <= 25);
	ENSURE_BITS(order);
	num_extra_bytes = ((u32)1 << order) + POP_BITS(order) - 1;
	ALIGN_INPUT();
	SAFETY_CHECK(num_extra_bytes < in_end - in_next);
	extra_bytes = in_next;
	in_next += num_extra_bytes;

	/* Set up the FSE symbol input stream.  The first byte carries padding:
	 * a terminating 1 bit marks where the real data begins. */
	SAFETY_CHECK(*in_next != 0);
	num_padding_bits = 1 + bsf32(*in_next);
	bitbuf = *in_next++ >> num_padding_bits;
	bitsleft = 8 - num_padding_bits;

	/* Decode the literals.  They are decoded into the *end* of the block's
	 * output region and copied forward during literal-run decoding. */

	ENSURE_BITS(5);
	order = POP_BITS(5);
	STATIC_ASSERT(CAN_ENSURE(25));
	SAFETY_CHECK(order <= 25);
	ENSURE_BITS(order);
	num_literals = ((u32)1 << order) + POP_BITS(order) - 1;
	SAFETY_CHECK(num_literals <= out_block_end - out_next);
	literals = out_block_end - num_literals;

	SAFETY_CHECK(build_fse_decode_table(d->literal_decode_table,
					    d->literal_state_counts,
					    LITERAL_ALPHABET_SIZE,
					    log2_num_literal_states));

#if NUM_LITERAL_STREAMS == 2
	/* Two interleaved FSE streams: literals alternate between the states. */
	ENSURE_BITS(2 * MAX_LOG2_NUM_LITERAL_STATES);
	literal_state_1 = POP_BITS(log2_num_literal_states);
	literal_state_2 = POP_BITS(log2_num_literal_states);
	lits = literals;
	lits_end = literals + (num_literals & ~1);
	while (lits != lits_end) {
		ENSURE_BITS(2 * MAX_LOG2_NUM_LITERAL_STATES);
		*lits++ = DECODE_SYMBOL(literal_state_1, d->literal_decode_table);
		*lits++ = DECODE_SYMBOL(literal_state_2, d->literal_decode_table);
	}
	/* Odd literal count leaves one literal for stream 1
	 * (literals + num_literals == out_block_end by construction). */
	if (lits_end != out_block_end) {
		ENSURE_BITS(MAX_LOG2_NUM_LITERAL_STATES);
		*lits++ = DECODE_SYMBOL(literal_state_1, d->literal_decode_table);
	}
	SAFETY_CHECK(literal_state_1 == 0 && literal_state_2 == 0);
#else
	ENSURE_BITS(MAX_LOG2_NUM_LITERAL_STATES);
	literal_state = POP_BITS(log2_num_literal_states);
	lits = literals;
	lits_end = literals + num_literals;
	while (lits != lits_end) {
		ENSURE_BITS(MAX_LOG2_NUM_LITERAL_STATES);
		*lits++ = DECODE_SYMBOL(literal_state, d->literal_decode_table);
	}
	SAFETY_CHECK(literal_state == 0);
#endif

	/* Prepare to decode literal runs and matches */

	ENSURE_BITS(MAX_LOG2_NUM_LITRUNLEN_STATES + MAX_LOG2_NUM_LENGTH_STATES);
	litrunlen_state = POP_BITS(log2_num_litrunlen_states);
	length_state = POP_BITS(log2_num_length_states);

	ENSURE_BITS(MAX_LOG2_NUM_OFFSET_STATES + MAX_LOG2_NUM_ALIGNED_STATES);
	offset_state = POP_BITS(log2_num_offset_states);
	aligned_state = 0;
	if (block_type == BLOCKTYPE_ALIGNED)
		aligned_state = POP_BITS(log2_num_aligned_states);

	SAFETY_CHECK(build_fse_decode_table(d->litrunlen_decode_table,
					    d->litrunlen_state_counts,
					    LITRUNLEN_ALPHABET_SIZE,
					    log2_num_litrunlen_states));

	SAFETY_CHECK(build_fse_decode_table(d->length_decode_table,
					    d->length_state_counts,
					    LENGTH_ALPHABET_SIZE,
					    log2_num_length_states));

	SAFETY_CHECK(build_fse_decode_table(d->offset_decode_table,
					    d->offset_state_counts,
					    MAX_OFFSET_ALPHABET_SIZE,
					    log2_num_offset_states));

	if (block_type == BLOCKTYPE_ALIGNED) {
		SAFETY_CHECK(build_fse_decode_table(d->aligned_decode_table,
						    d->aligned_state_counts,
						    ALIGNED_ALPHABET_SIZE,
						    log2_num_aligned_states));
	}

	/* Decode literal runs and matches */

	for (;;) {
		u32 litrunlen;
		u32 length;
		u32 offset;
		unsigned offset_sym;

		STATIC_ASSERT(MAX_LOG2_NUM_LITRUNLEN_STATES +
			      MAX_LOG2_NUM_LENGTH_STATES +
			      MAX_LOG2_NUM_OFFSET_STATES <= 32);
		if (CAN_ENSURE(32))
			ENSURE_BITS(32);
		else
			ENSURE_BITS(MAX_LOG2_NUM_LITRUNLEN_STATES +
				    MAX_LOG2_NUM_LENGTH_STATES);

		/* BEGIN decode literal run */

		/* Decode the literal run length and copy the literals. */
		litrunlen = DECODE_SYMBOL(litrunlen_state,
					  d->litrunlen_decode_table);

#if 0	/* Unoptimized version */
		if (litrunlen == LITRUNLEN_ALPHABET_SIZE - 1) {
			SAFETY_CHECK(extra_bytes < in_end);
			litrunlen += *extra_bytes++;
			if (litrunlen == 0xFF + LITRUNLEN_ALPHABET_SIZE - 1) {
				SAFETY_CHECK(in_end - extra_bytes >= 3);
				litrunlen += (u32)*extra_bytes++ << 0;
				litrunlen += (u32)*extra_bytes++ << 8;
				litrunlen += (u32)*extra_bytes++ << 16;
			}
		}
		num_literals -= litrunlen;
		SAFETY_CHECK(num_literals >= 0);
		SAFETY_CHECK(out_next <= literals);
		while (litrunlen--)
			*out_next++ = *literals++;
		if (out_next == out_block_end) /* End of block? */
			break;
#else
		/* Any non-escape run fits in one 16-byte copy */
		STATIC_ASSERT(LITRUNLEN_ALPHABET_SIZE - 2 <= 15);
		if (UNALIGNED_ACCESS_IS_FAST &&
		    likely(num_literals >= 16 && literals - out_next >= 16 &&
			   litrunlen != LITRUNLEN_ALPHABET_SIZE - 1))
		{
			/* Fast case */
			copy_16_bytes_unaligned(literals, out_next);
			out_next += litrunlen;
			literals += litrunlen;
			num_literals -= litrunlen;
		} else {
			/* Slow case */
			const u32 cutoff = LITRUNLEN_ALPHABET_SIZE - 1;
			if (litrunlen == cutoff) {
				SAFETY_CHECK(extra_bytes < in_end);
				litrunlen += *extra_bytes++;
				if (litrunlen == 0xFF + cutoff) {
					SAFETY_CHECK(in_end - extra_bytes >= 3);
					litrunlen += (u32)*extra_bytes++ << 0;
					litrunlen += (u32)*extra_bytes++ << 8;
					litrunlen += (u32)*extra_bytes++ << 16;
				}
			}

			num_literals -= litrunlen;
			SAFETY_CHECK(num_literals >= 0);

			if (UNALIGNED_ACCESS_IS_FAST &&
			    likely(litrunlen + WORDBYTES <= literals - out_next &&
				   num_literals >= WORDBYTES))
			{
				const u8 *src = literals;
				u8 *dst = out_next;

				out_next += litrunlen;
				literals += litrunlen;
				do {
					copy_word_unaligned(src, dst);
					src += WORDBYTES;
					dst += WORDBYTES;
					litrunlen -= WORDBYTES;
				} while ((s32)litrunlen > 0);
			} else {
				while (litrunlen--)
					*out_next++ = *literals++;
			}

			if (out_next == out_block_end) /* End of block? */
				break;
		}
#endif
		/* END decode literal run */

		/* BEGIN decode match */

		/* Decode the length symbol */

		length = DECODE_SYMBOL(length_state, d->length_decode_table);

		/* Decode the offset symbol */

		if (!CAN_ENSURE(32))
			ENSURE_BITS(MAX_LOG2_NUM_OFFSET_STATES);
		offset_sym = DECODE_SYMBOL(offset_state, d->offset_decode_table);

		/* Decode the rest of the offset */

		if (offset_sym >= NUM_REPS) {

			/* Explicit offset */

			unsigned offset_log2 = offset_sym - NUM_REPS;

			offset = (u32)1 << offset_log2;

			if (block_type == BLOCKTYPE_ALIGNED &&
			    offset_log2 >= NUM_ALIGNED_BITS)
			{
				/* Low NUM_ALIGNED_BITS come from the aligned
				 * alphabet, the rest directly from the
				 * bitstream. */
				ENSURE_BITS(MAX_LOG2_NUM_ALIGNED_STATES +
					    offset_log2 - NUM_ALIGNED_BITS);

				offset += DECODE_SYMBOL(aligned_state,
							d->aligned_decode_table);
				offset += POP_BITS(offset_log2 -
						   NUM_ALIGNED_BITS) <<
					  NUM_ALIGNED_BITS;
			} else {
				ENSURE_BITS(offset_log2);
				offset += POP_BITS(offset_log2);
			}

			/* Shift the recent-offsets queue down by one */
			STATIC_ASSERT(NUM_REPS >= 1 && NUM_REPS <= 4);
		#if NUM_REPS >= 4
			recent_offsets[3] = recent_offsets[2];
		#endif
		#if NUM_REPS >= 3
			recent_offsets[2] = recent_offsets[1];
		#endif
		#if NUM_REPS >= 2
			recent_offsets[1] = recent_offsets[0];
		#endif
		} else {
			/* Repeat offset */
			offset = recent_offsets[offset_sym];
			recent_offsets[offset_sym] = recent_offsets[0];
		}

		recent_offsets[0] = offset;

		SAFETY_CHECK(offset <= out_next - (u8 *)out);

		/* Decode the remainder of the length and copy the match. */

		length += MIN_MATCH_LEN;

		if (UNALIGNED_ACCESS_IS_FAST && length <= 16 &&
		    offset >= length && literals - out_next >= 16)
		{
			/*
			 * Fast case: short length, no overlap, and we aren't
			 * getting too close to the literals portion of the
			 * output buffer.
			 */
			copy_16_bytes_unaligned(out_next - offset, out_next);
		} else {
			/*
			 * "Slow case" (but still very important): long length,
			 * or small offset, or we're getting close to the
			 * literals portion of the output buffer.
			 */
			const u32 cutoff = LENGTH_ALPHABET_SIZE - 1 + MIN_MATCH_LEN;
			const u8 *src;
			u8 *dst, *end;
			if (length == cutoff) {
				SAFETY_CHECK(extra_bytes < in_end);
				length += *extra_bytes++;
				if (length == 0xFF + cutoff) {
					SAFETY_CHECK(in_end - extra_bytes >= 3);
					length += (u32)*extra_bytes++ << 0;
					length += (u32)*extra_bytes++ << 8;
					length += (u32)*extra_bytes++ << 16;
				}
			}

			SAFETY_CHECK(length <= literals - out_next);

			src = out_next - offset;
			dst = out_next;
			end = out_next + length;

			if (UNALIGNED_ACCESS_IS_FAST &&
			    likely(literals - end >= WORDBYTES)) {
				if (offset >= WORDBYTES) {
					/* Word-at-a-time copy is safe: each
					 * word is fully written before it could
					 * be read back. */
					copy_word_unaligned(src, dst);
					src += WORDBYTES;
					dst += WORDBYTES;
					if (dst < end) {
						do {
							copy_word_unaligned(src, dst);
							src += WORDBYTES;
							dst += WORDBYTES;
						} while (dst < end);
					}
				} else if (offset == 1) {
					/* Run of a single byte: broadcast it */
					machine_word_t v = repeat_byte(*(dst - 1));
					do {
						store_word_unaligned(v, dst);
						src += WORDBYTES;
						dst += WORDBYTES;
					} while (dst < end);
				} else {
					/* Overlapping with 1 < offset < word:
					 * must copy byte by byte */
					do {
						*dst++ = *src++;
					} while (dst < end);
				}
			} else {
				do {
					*dst++ = *src++;
				} while (dst < end);
			}
		}

		out_next += length;

		/* END decode match */
	}

	/* A valid stream ends with every FSE state back at zero. */
	SAFETY_CHECK(litrunlen_state == 0 && length_state == 0 &&
		     offset_state == 0 && aligned_state == 0);

	ALIGN_INPUT();

	/* Finished decompressing a block. */
	if (!is_final_block)
		goto next_block;

	/* That was the final block. */

#ifdef ENABLE_PREPROCESSING
	/* Postprocess the data if needed. */
	if (preprocessed)
		postprocess(out, out_nbytes_avail);
#endif

	if (actual_out_nbytes_ret) {
		*actual_out_nbytes_ret = out_next - (u8 *)out;
	} else {
		if (out_next != out_end)
			return DECOMPRESS_SHORT_OUTPUT;
	}
	return DECOMPRESS_SUCCESS;
}
-------------------------------------------------------------------------------- /programs/xpack.c: --------------------------------------------------------------------------------
/*
 * xpack.c - a file compression and decompression program
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "prog_util.h"

/* NOTE(review): the include targets below were lost in extraction and have
 * been reconstructed from the functions used in this file (fstat()/S_ISREG()
 * need <sys/stat.h>, etc.) — verify against upstream. */
#include <errno.h>
#include <sys/stat.h>
#ifdef _WIN32
#  include <sys/utime.h>
#else
#  include <sys/time.h>
#  include <unistd.h>
#  include <utime.h>
#endif

/* Parsed command-line options */
struct options {
	bool to_stdout;		/* -c: write to standard output */
	bool decompress;	/* -d: decompress instead of compress */
	bool force;		/* -f: overwrite existing output files */
	bool keep;		/* -k: don't delete input files */
	int compression_level;	/* -1..-9 / -L */
	u32 chunk_size;		/* -s */
	const tchar *suffix;	/* -S: output filename suffix */
};

static const tchar *const optstring = T("123456789cdfhkL:s:S:V");

/* Print the usage summary to the given stream. */
static void
show_usage(FILE *fp)
{
	fprintf(fp,
"Usage: %"TS" [-123456789cdfhkV] [-L LVL] [-s SIZE] [-S SUF] [FILE]...\n"
"Compress or decompress the specified FILEs.\n"
"\n"
"Options:\n"
"  -1        fastest (worst) compression\n"
"  -9        slowest (best) compression\n"
"  -c        write to standard output\n"
"  -d        decompress\n"
"  -f        overwrite existing output files\n"
"  -h        print this help\n"
"  -k        don't delete input files\n"
"  -L LVL    compression level [1-9] (default 6)\n"
"  -s SIZE   chunk size (default 524288)\n"
"  -S SUF    use suffix .SUF instead of .xpack\n"
"  -V        show version and legal information\n"
"\n"
"NOTICE: this program is currently experimental, and the on-disk format\n"
"is not yet stable!\n",
	program_invocation_name);
}

/* Print version and license information to standard output. */
static void
show_version(void)
{
	printf(
"xpack compression program, experimental version\n"
"Copyright 2016 Eric Biggers\n"
"\n"
"This program is free software which may be modified and/or redistributed\n"
"under the terms of the MIT license.  There is NO WARRANTY, to the extent\n"
"permitted by law.  See the COPYING file for details.\n"
	);
}

/* Was the program invoked in decompression mode?  (Determined from the
 * executable name, gzip/gunzip style.) */
static bool
is_xunpack(void)
{
	if (tstrxcmp(program_invocation_name, T("xunpack")) == 0)
		return true;
#ifdef _WIN32
	if (tstrxcmp(program_invocation_name, T("xunpack.exe")) == 0)
		return true;
#endif
	return false;
}

/* Return a pointer to the final '.SUF' extension of 'path' if it matches
 * 'suffix', or NULL if it doesn't. */
static const tchar *
get_suffix(const tchar *path, const tchar *suffix)
{
	const tchar *dot = tstrrchr(get_filename(path), '.');

	if (dot != NULL && tstrxcmp(dot + 1, suffix) == 0)
		return dot;
	return NULL;
}

/* Does 'path' end with the extension '.SUF'? */
static bool
has_suffix(const tchar *path, const tchar *suffix)
{
	return get_suffix(path, suffix) != NULL;
}

/* On-disk per-file header; all multi-byte fields are little endian on disk
 * (see bswap_file_header()). */
struct xpack_file_header {
#define XPACK_MAGIC "XPACK\0\0\0"
	char magic[8];
	u32 chunk_size;
	u16 header_size;
	u8 version;
	u8 compression_level;
};

/* On-disk per-chunk header; stored_size == original_size means the chunk was
 * stored uncompressed. */
struct xpack_chunk_header {
	u32 stored_size;
	u32 original_size;
};

/* Convert the file header between native and little-endian byte order
 * (le*_bswap are their own inverses). */
static void
bswap_file_header(struct xpack_file_header *hdr)
{
	STATIC_ASSERT(sizeof(struct xpack_file_header) == 16);

	hdr->chunk_size = le32_bswap(hdr->chunk_size);
	hdr->header_size = le16_bswap(hdr->header_size);
}

/* Convert the chunk header between native and little-endian byte order. */
static void
bswap_chunk_header(struct xpack_chunk_header *hdr)
{
	STATIC_ASSERT(sizeof(struct xpack_chunk_header) == 8);

	hdr->stored_size = le32_bswap(hdr->stored_size);
	hdr->original_size = le32_bswap(hdr->original_size);
}

/* Write the xpack file header to 'out'.  Returns 0 on success or a negative
 * value on write error (from full_write()). */
static int
write_file_header(struct file_stream *out, u32 chunk_size, int compression_level)
{
	struct xpack_file_header hdr;

	memcpy(hdr.magic, XPACK_MAGIC, sizeof(hdr.magic));
	hdr.chunk_size = chunk_size;
	hdr.header_size = sizeof(hdr);
	hdr.version = 1;
	hdr.compression_level = compression_level;

	bswap_file_header(&hdr);
	return full_write(out, &hdr, sizeof(hdr));
}

/* Write a chunk header to 'out'.  Returns 0 on success or a negative value on
 * write error. */
static int
write_chunk_header(struct file_stream *out, u32 stored_size, u32 original_size)
{
	struct xpack_chunk_header hdr;

	hdr.stored_size = stored_size;
	hdr.original_size = original_size;

	bswap_chunk_header(&hdr);
	return full_write(out, &hdr, sizeof(hdr));
}

/*
 * Compress 'in' to 'out', chunk by chunk.  A chunk is stored compressed only
 * if compression actually shrinks it (the compressed buffer and compression
 * bound are deliberately 'size - 1'); otherwise it is stored raw.
 *
 * Returns 0 on success (end-of-input reached), negative on error.
 */
static int
do_compress(struct xpack_compressor *compressor, struct file_stream *in,
	    struct file_stream *out, u32 chunk_size)
{
	void *original_buf = xmalloc(chunk_size);
	void *compressed_buf = xmalloc(chunk_size - 1);
	ssize_t ret = -1;

	if (original_buf == NULL || compressed_buf == NULL)
		goto out;

	while ((ret = xread(in, original_buf, chunk_size)) > 0) {
		u32 original_size = ret;
		u32 compressed_size;
		void *stored_buf;
		u32 stored_size;

		compressed_size = xpack_compress(compressor,
						 original_buf,
						 original_size,
						 compressed_buf,
						 original_size - 1);
		if (compressed_size == 0) {
			/* Store the chunk uncompressed */
			stored_buf = original_buf;
			stored_size = original_size;
		} else {
			/* Store the chunk compressed */
			stored_buf = compressed_buf;
			stored_size = compressed_size;
		}

		ret = write_chunk_header(out, stored_size, original_size);
		if (ret != 0)
			goto out;

		ret = full_write(out, stored_buf, stored_size);
		if (ret != 0)
			goto out;
	}
out:
	free(compressed_buf);
	free(original_buf);
	return ret;
}

/*
 * Decompress 'in' to 'out', chunk by chunk, validating each chunk header
 * against 'chunk_size' from the file header.
 *
 * Returns 0 on success, negative on read/write error or corrupt data.
 */
static int
do_decompress(struct xpack_decompressor *decompressor, struct file_stream *in,
	      struct file_stream *out, u32 chunk_size)
{
	void *original_buf = xmalloc(chunk_size);
	void *compressed_buf = xmalloc(chunk_size - 1);
	ssize_t ret = -1;
	struct xpack_chunk_header chunk_hdr;

	if (original_buf == NULL || compressed_buf == NULL)
		goto out;

	while ((ret = xread(in, &chunk_hdr, sizeof(chunk_hdr)))
	       == sizeof(chunk_hdr))
	{
		u32 original_size;
		u32 stored_size;
		enum decompress_result result;

		bswap_chunk_header(&chunk_hdr);
		original_size = chunk_hdr.original_size;
		stored_size = chunk_hdr.stored_size;

		/* Sanity-check the sizes; a compressed chunk is always
		 * strictly smaller than its original data. */
		if (original_size < 1 || original_size > chunk_size ||
		    stored_size < 1 || stored_size > original_size) {
			msg("%"TS": file corrupt", in->name);
			ret = -1;
			goto out;
		}

		/* stored_size == original_size means stored uncompressed */
		ret = xread(in, (stored_size == original_size) ?
			    original_buf : compressed_buf, stored_size);
		if (ret < 0)
			goto out;

		if (ret != stored_size) {
			msg("%"TS": unexpected end-of-file", in->name);
			ret = -1;
			goto out;
		}

		if (stored_size != original_size) {
			/* Chunk was stored compressed */
			result = xpack_decompress(decompressor,
						  compressed_buf, stored_size,
						  original_buf, original_size,
						  NULL);
			if (result != DECOMPRESS_SUCCESS) {
				msg("%"TS": data corrupt", in->name);
				ret = -1;
				goto out;
			}
		}

		ret = full_write(out, original_buf, original_size);
		if (ret != 0)
			goto out;
	}
	/* A positive leftover read here means a truncated chunk header */
	if (ret > 0) {
		msg("%"TS": unexpected end-of-file", in->name);
		ret = -1;
	}
out:
	free(compressed_buf);
	free(original_buf);
	return ret;
}

/* Stat the open input file and skip non-regular files (continues below) */
static int
stat_file(struct file_stream *in, struct stat *stbuf, bool allow_hard_links)
{
	if (fstat(in->fd, stbuf) != 0) {
		msg("%"TS": unable to stat file", in->name);
		return -1;
	}

	if (!S_ISREG(stbuf->st_mode) && !in->is_standard_stream) {
		msg("%"TS" is %s -- skipping",
		    in->name, S_ISDIR(stbuf->st_mode) ?
"a directory" : 303 | "not a regular file"); 304 | return -2; 305 | } 306 | 307 | if (stbuf->st_nlink > 1 && !allow_hard_links) { 308 | msg("%"TS" has multiple hard links -- skipping " 309 | "(use -f to process anyway)", in->name); 310 | return -2; 311 | } 312 | 313 | return 0; 314 | } 315 | 316 | static void 317 | restore_mode(struct file_stream *out, const struct stat *stbuf) 318 | { 319 | #ifndef _WIN32 320 | if (fchmod(out->fd, stbuf->st_mode) != 0) 321 | msg_errno("%"TS": unable to preserve mode", out->name); 322 | #endif 323 | } 324 | 325 | static void 326 | restore_owner_and_group(struct file_stream *out, const struct stat *stbuf) 327 | { 328 | #ifndef _WIN32 329 | if (fchown(out->fd, stbuf->st_uid, stbuf->st_gid) != 0) { 330 | msg_errno("%"TS": unable to preserve owner and group", 331 | out->name); 332 | } 333 | #endif 334 | } 335 | 336 | static void 337 | restore_timestamps(struct file_stream *out, const tchar *newpath, 338 | const struct stat *stbuf) 339 | { 340 | int ret; 341 | #if defined(HAVE_FUTIMENS) 342 | struct timespec times[2] = { 343 | stbuf->st_atim, stbuf->st_mtim, 344 | }; 345 | ret = futimens(out->fd, times); 346 | #elif defined(HAVE_FUTIMES) 347 | struct timeval times[2] = { 348 | { stbuf->st_atim.tv_sec, stbuf->st_atim.tv_nsec / 1000, }, 349 | { stbuf->st_mtim.tv_sec, stbuf->st_mtim.tv_nsec / 1000, }, 350 | }; 351 | ret = futimes(out->fd, times); 352 | #else /* HAVE_FUTIMES */ 353 | struct tutimbuf times = { 354 | stbuf->st_atime, stbuf->st_mtime, 355 | }; 356 | ret = tutime(newpath, ×); 357 | #endif /* !HAVE_FUTIMES */ 358 | if (ret != 0) 359 | msg_errno("%"TS": unable to preserve timestamps", out->name); 360 | } 361 | 362 | static void 363 | restore_metadata(struct file_stream *out, const tchar *newpath, 364 | const struct stat *stbuf) 365 | { 366 | restore_mode(out, stbuf); 367 | restore_owner_and_group(out, stbuf); 368 | restore_timestamps(out, newpath, stbuf); 369 | } 370 | 371 | static int 372 | decompress_file(struct 
xpack_decompressor *decompressor, const tchar *path, 373 | const struct options *options) 374 | { 375 | tchar *newpath = NULL; 376 | struct file_stream in; 377 | struct file_stream out; 378 | struct xpack_file_header hdr; 379 | struct stat stbuf; 380 | int ret; 381 | int ret2; 382 | 383 | if (path != NULL && !options->to_stdout) { 384 | const tchar *suffix = get_suffix(path, options->suffix); 385 | if (suffix == NULL) { 386 | msg("\"%"TS"\" does not end with the .%"TS" suffix -- " 387 | "skipping", path, options->suffix); 388 | ret = -2; 389 | goto out; 390 | } 391 | newpath = xmalloc((suffix - path + 1) * sizeof(tchar)); 392 | tmemcpy(newpath, path, suffix - path); 393 | newpath[suffix - path] = '\0'; 394 | } 395 | 396 | ret = xopen_for_read(path, &in); 397 | if (ret != 0) 398 | goto out_free_newpath; 399 | 400 | if (!options->force && isatty(in.fd)) { 401 | msg("Refusing to read compressed data from terminal. " 402 | "Use -f to override.\nFor help, use -h."); 403 | ret = -1; 404 | goto out_close_in; 405 | } 406 | 407 | ret = stat_file(&in, &stbuf, options->force || newpath == NULL); 408 | if (ret != 0) 409 | goto out_close_in; 410 | 411 | ret = xread(&in, &hdr, sizeof(hdr)); 412 | if (ret < 0) 413 | goto out_close_in; 414 | if (ret != sizeof(hdr)) { 415 | msg("%"TS": not in XPACK format", in.name); 416 | ret = -1; 417 | goto out_close_in; 418 | } 419 | bswap_file_header(&hdr); 420 | 421 | if (memcmp(hdr.magic, XPACK_MAGIC, sizeof(hdr.magic)) != 0) { 422 | msg("%"TS": not in XPACK format", in.name); 423 | ret = -1; 424 | goto out_close_in; 425 | } 426 | 427 | if (hdr.version != 1) { 428 | msg("%"TS": unsupported version (%d)", in.name, hdr.version); 429 | ret = -1; 430 | goto out_close_in; 431 | } 432 | 433 | if (hdr.header_size < sizeof(hdr)) { 434 | msg("%"TS": incorrect header size (%"PRIu16")", in.name, 435 | hdr.header_size); 436 | ret = -1; 437 | goto out_close_in; 438 | } 439 | 440 | if (hdr.chunk_size < 1024 || hdr.chunk_size > 67108864) { 441 | 
msg("%"TS": unsupported chunk size (%"PRIu32")", in.name, 442 | hdr.chunk_size); 443 | ret = -1; 444 | goto out_close_in; 445 | } 446 | 447 | ret = skip_bytes(&in, hdr.header_size - sizeof(hdr)); 448 | if (ret != 0) 449 | goto out_close_in; 450 | 451 | ret = xopen_for_write(newpath, options->force, &out); 452 | if (ret != 0) 453 | goto out_close_in; 454 | 455 | ret = do_decompress(decompressor, &in, &out, hdr.chunk_size); 456 | if (ret != 0) 457 | goto out_close_out; 458 | 459 | if (path != NULL && newpath != NULL) 460 | restore_metadata(&out, newpath, &stbuf); 461 | ret = 0; 462 | out_close_out: 463 | ret2 = xclose(&out); 464 | if (ret == 0) 465 | ret = ret2; 466 | if (ret != 0 && newpath != NULL) 467 | tunlink(newpath); 468 | out_close_in: 469 | xclose(&in); 470 | if (ret == 0 && path != NULL && newpath != NULL && !options->keep) 471 | tunlink(path); 472 | out_free_newpath: 473 | free(newpath); 474 | out: 475 | return ret; 476 | } 477 | 478 | static int 479 | compress_file(struct xpack_compressor *compressor, const tchar *path, 480 | const struct options *options) 481 | { 482 | tchar *newpath = NULL; 483 | struct file_stream in; 484 | struct file_stream out; 485 | struct stat stbuf; 486 | int ret; 487 | int ret2; 488 | 489 | if (path != NULL && !options->to_stdout) { 490 | size_t path_nchars, suffix_nchars; 491 | 492 | if (!options->force && has_suffix(path, options->suffix)) { 493 | msg("%"TS": already has .%"TS" suffix -- skipping", 494 | path, options->suffix); 495 | ret = -2; 496 | goto out; 497 | } 498 | path_nchars = tstrlen(path); 499 | suffix_nchars = tstrlen(options->suffix); 500 | newpath = xmalloc((path_nchars + 1 + suffix_nchars + 1) * 501 | sizeof(tchar)); 502 | tmemcpy(newpath, path, path_nchars); 503 | newpath[path_nchars] = '.'; 504 | tmemcpy(&newpath[path_nchars + 1], options->suffix, 505 | suffix_nchars + 1); 506 | } 507 | 508 | ret = xopen_for_read(path, &in); 509 | if (ret != 0) 510 | goto out_free_newpath; 511 | 512 | ret = stat_file(&in, 
&stbuf, options->force || newpath == NULL); 513 | if (ret != 0) 514 | goto out_close_in; 515 | 516 | ret = xopen_for_write(newpath, options->force, &out); 517 | if (ret != 0) 518 | goto out_close_in; 519 | 520 | if (!options->force && isatty(out.fd)) { 521 | msg("Refusing to write compressed data to terminal. " 522 | "Use -f to override.\nFor help, use -h."); 523 | ret = -1; 524 | goto out_close_out; 525 | } 526 | 527 | ret = write_file_header(&out, options->chunk_size, 528 | options->compression_level); 529 | if (ret != 0) 530 | goto out_close_out; 531 | 532 | ret = do_compress(compressor, &in, &out, options->chunk_size); 533 | if (ret != 0) 534 | goto out_close_out; 535 | 536 | if (path != NULL && newpath != NULL) 537 | restore_metadata(&out, newpath, &stbuf); 538 | ret = 0; 539 | out_close_out: 540 | ret2 = xclose(&out); 541 | if (ret == 0) 542 | ret = ret2; 543 | if (ret != 0 && newpath != NULL) 544 | tunlink(newpath); 545 | out_close_in: 546 | xclose(&in); 547 | if (ret == 0 && path != NULL && newpath != NULL && !options->keep) 548 | tunlink(path); 549 | out_free_newpath: 550 | free(newpath); 551 | out: 552 | return ret; 553 | } 554 | 555 | int 556 | tmain(int argc, tchar *argv[]) 557 | { 558 | struct options options; 559 | tchar *default_file_list[] = { NULL }; 560 | int opt_char; 561 | int i; 562 | int ret; 563 | 564 | program_invocation_name = get_filename(argv[0]); 565 | 566 | options.to_stdout = false; 567 | options.decompress = is_xunpack(); 568 | options.force = false; 569 | options.keep = false; 570 | options.compression_level = 6; 571 | options.chunk_size = 524288; 572 | options.suffix = T("xpack"); 573 | 574 | while ((opt_char = tgetopt(argc, argv, optstring)) != -1) { 575 | switch (opt_char) { 576 | case '1': 577 | case '2': 578 | case '3': 579 | case '4': 580 | case '5': 581 | case '6': 582 | case '7': 583 | case '8': 584 | case '9': 585 | options.compression_level = opt_char - '0'; 586 | break; 587 | case 'c': 588 | options.to_stdout = true; 589 | 
break; 590 | case 'd': 591 | options.decompress = true; 592 | break; 593 | case 'f': 594 | options.force = true; 595 | break; 596 | case 'h': 597 | show_usage(stdout); 598 | return 0; 599 | case 'k': 600 | options.keep = true; 601 | break; 602 | case 'L': 603 | options.compression_level = 604 | parse_compression_level(toptarg); 605 | if (options.compression_level <= 0) 606 | return 1; 607 | break; 608 | case 's': 609 | options.chunk_size = parse_chunk_size(toptarg); 610 | if (options.chunk_size == 0) 611 | return 1; 612 | break; 613 | case 'S': 614 | options.suffix = toptarg; 615 | break; 616 | case 'V': 617 | show_version(); 618 | return 0; 619 | default: 620 | show_usage(stderr); 621 | return 1; 622 | } 623 | } 624 | 625 | argv += toptind; 626 | argc -= toptind; 627 | 628 | if (argc == 0) { 629 | argv = default_file_list; 630 | argc = ARRAY_LEN(default_file_list); 631 | } else { 632 | for (i = 0; i < argc; i++) 633 | if (argv[i][0] == '-' && argv[i][1] == '\0') 634 | argv[i] = NULL; 635 | } 636 | 637 | ret = 0; 638 | if (options.decompress) { 639 | struct xpack_decompressor *d; 640 | 641 | d = alloc_decompressor(); 642 | if (d == NULL) 643 | return 1; 644 | 645 | for (i = 0; i < argc; i++) 646 | ret |= -decompress_file(d, argv[i], &options); 647 | 648 | xpack_free_decompressor(d); 649 | } else { 650 | struct xpack_compressor *c; 651 | 652 | c = alloc_compressor(options.chunk_size, 653 | options.compression_level); 654 | if (c == NULL) 655 | return 1; 656 | 657 | for (i = 0; i < argc; i++) 658 | ret |= -compress_file(c, argv[i], &options); 659 | 660 | xpack_free_compressor(c); 661 | } 662 | 663 | /* 664 | * If ret=0, there were no warnings or errors. Exit with status 0. 665 | * If ret=2, there was at least one warning. Exit with status 2. 666 | * Else, there was at least one error. Exit with status 1. 
667 | */ 668 | if (ret != 0 && ret != 2) 669 | ret = 1; 670 | 671 | return ret; 672 | } 673 | -------------------------------------------------------------------------------- /lib/xpack_decompress.c: -------------------------------------------------------------------------------- 1 | /* 2 | * xpack_decompress.c - decompressor for the XPACK compression format 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifdef __SSE2__ 29 | # include 30 | #endif 31 | 32 | #include "xpack_common.h" 33 | #include "x86_cpu_features.h" 34 | 35 | /* 36 | * If the expression passed to SAFETY_CHECK() evaluates to false, then the 37 | * decompression routine immediately returns DECOMPRESS_BAD_DATA, indicating the 38 | * compressed data is invalid. 
39 | * 40 | * Theoretically, these checks could be disabled for specialized applications 41 | * where all input to the decompressor will be trusted. 42 | */ 43 | #if 0 44 | # pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") 45 | # define SAFETY_CHECK(expr) (void)(expr) 46 | #else 47 | # define SAFETY_CHECK(expr) if (unlikely(!(expr))) return DECOMPRESS_BAD_DATA 48 | #endif 49 | 50 | /* 51 | * An entry in a FSE decode table. The index of the entry in the table is the 52 | * state with which the entry is associated. 53 | * 54 | * For efficiency we sometimes access this struct as the u32 value and sometimes 55 | * as the individual fields. 56 | */ 57 | typedef struct { 58 | union { 59 | u32 entry; 60 | struct { /* for big endian systems */ 61 | u16 destination_range_start; 62 | u8 num_bits; 63 | u8 symbol; 64 | } be; 65 | struct { /* for little endian systems */ 66 | u8 symbol; 67 | u8 num_bits; 68 | u16 destination_range_start; 69 | } le; 70 | }; 71 | } fse_decode_entry_t; 72 | 73 | /* 74 | * DECODE_SYMBOL() - Macro to decode a FSE-encoded symbol. The decoded symbol 75 | * is obtained from the decode table entry for the current state. The state is 76 | * then updated to the next state, which is obtained by indexing the current 77 | * state's "destination range" with the next 'num_bits' bits of input data. 78 | */ 79 | #if 1 /* Optimized version which accesses the entry as a u32 */ 80 | #define DECODE_SYMBOL(state, decode_table) \ 81 | ( \ 82 | sym = decode_table[state].entry, \ 83 | state = (sym >> 16) + POP_BITS((sym >> 8) & 0xFF), \ 84 | sym & 0xFF \ 85 | ) 86 | 87 | #else /* Unoptimized version which accesses individual struct members */ 88 | #define DECODE_SYMBOL(state, decode_table) \ 89 | ( \ 90 | sym = CPU_IS_LITTLE_ENDIAN() ? \ 91 | decode_table[state].le.symbol : \ 92 | decode_table[state].be.symbol, \ 93 | state = CPU_IS_LITTLE_ENDIAN() ? 
\ 94 | decode_table[state].le.destination_range_start + \ 95 | POP_BITS(decode_table[state].le.num_bits) : \ 96 | decode_table[state].be.destination_range_start + \ 97 | POP_BITS(decode_table[state].be.num_bits), \ 98 | sym \ 99 | ) 100 | #endif 101 | 102 | /* 103 | * Build the FSE decode table for an alphabet. 104 | * 105 | * @decode_table [out] 106 | * The decode table to build. 107 | * @state_counts [in but invalidated] 108 | * An array which provides, for each symbol in the alphabet, the number of 109 | * states which should be assigned to that symbol. 110 | * @alphabet_size [in] 111 | * The number of symbols in the alphabet. 112 | * @log2_num_states [in] 113 | * The log base 2 of the number of states, which is also the number of 114 | * entries in the decode table being built. 115 | * 116 | * Returns true if the state counts were valid or false if they were not. 117 | */ 118 | static bool 119 | build_fse_decode_table(fse_decode_entry_t decode_table[], u16 state_counts[], 120 | unsigned alphabet_size, unsigned log2_num_states) 121 | { 122 | /* 123 | * Assign a symbol to each state such that each symbol 'sym' gets 124 | * assigned to exactly 'state_counts[sym]' states. To do this, assign 125 | * states to symbols in order of increasing symbol value while visiting 126 | * all states in a special order. 127 | */ 128 | const unsigned num_states = 1 << log2_num_states; 129 | const unsigned state_generator = get_state_generator(num_states); 130 | const unsigned state_mask = num_states - 1; 131 | unsigned state = 0; 132 | u32 total_count = 0; 133 | unsigned sym; 134 | 135 | for (sym = 0; sym < alphabet_size; sym++) { 136 | unsigned count = state_counts[sym]; 137 | if (count == 0) /* Unused symbol? 
*/ 138 | continue; 139 | total_count += count; 140 | do { 141 | decode_table[state].entry = sym; 142 | state = (state + state_generator) & state_mask; 143 | } while (--count); 144 | } 145 | 146 | /* 147 | * Verify that the sum of the state counts really was 148 | * 2**log2_num_states. With a bad input, the sum might be lower than 149 | * expected (in which case not all states were visited) or higher than 150 | * expected (in which case some states were visited multiple times). 151 | * Both cases are strictly forbidden. 152 | */ 153 | if (unlikely(total_count != num_states)) 154 | return false; 155 | 156 | /* 157 | * Now, set 'num_bits' and 'destination_range_start' for each decode 158 | * table entry. This works as follows. First, a little background: 159 | * given a symbol that is assigned 'count' states out of a total of 160 | * 'num_states' states, the entropy, in bits, of an occurrence of that 161 | * symbol is: 162 | * 163 | * log2(1/probability) 164 | * = log2(1/(count/num_states)) 165 | * = log2(num_states/count) 166 | * = log2(num_states) - log2(count) 167 | * 168 | * This may be a non-integer value. The rounded-down value is: 169 | * 170 | * min_bits = floor(log2(num_states) - log2(count)) 171 | * = log2(num_states) - ceil(log2(count)) 172 | * 173 | * With finite state entropy coding, we will sometimes code the symbol 174 | * using 'min_bits' bits and sometimes using 'min_bits + 1' bits. Each 175 | * of the symbol's 'count' states will be associated with one of these 176 | * two choices of 'num_bits'. In addition, each state will point to a 177 | * "destination range" of length '2**num_bits'. The destination range 178 | * is the range of states which the encoder may have been in prior to 179 | * encoding the symbol and entering a given state. 180 | * 181 | * The precise mapping of a symbol's states to bit counts and 182 | * destination ranges is defined as follows. 
For some 'X < count', the 183 | * numerically first 'X' states are each assigned 'min_bits + 1' bits 184 | * and are mapped consecutively to a series of destination ranges that 185 | * ends with state 'num_states - 1'. The remaining 'count - X' states 186 | * are each assigned 'min_bits' bits and are mapped consecutively to a 187 | * series of destination ranges that starts with state 0. Since the 188 | * destination ranges must exactly cover all 'num_states' states (this 189 | * is required, in general, for encoding to have been possible), we can 190 | * solve for 'X': 191 | * 192 | * (2**(min_bits+1))X + (2**min_bits)(count - X) = num_states 193 | * (2**min_bits)(2X + count - X) = num_states 194 | * (2**min_bits)(X + count) = num_states 195 | * X + count = num_states / (2**min_bits) 196 | * X = num_states / (2**min_bits) - count 197 | * 198 | * As an example, with num_states = 256 and count = 23, then min_bits = 199 | * log2(256) - ceil(log2(23)) = 8 - 5 = 3. So each of the symbol's 23 200 | * states will be assigned 3 ('min_bits') or 4 ('min_bits + 1') bits. 201 | * Processing the 23 states in ascending numerical order, the first X 202 | * states will each be assigned 4 bits and the next 23 - X states will 203 | * each be assigned 3 bits. X is: 204 | * 205 | * X = num_states / (2**min_bits) - count 206 | * = 256 / (2**3) - 23 207 | * = 9 208 | * 209 | * Hence, the first 9 states will each be assigned 4 bits and have 210 | * destination ranges covering the last 9 * 2**4 = 144 of the 256 211 | * states, and the remaining 23 - 9 = 14 states will each be assigned 3 212 | * bits and have destination ranges covering the first 14 * 2**3 = 112 213 | * of the 256 states. 214 | * 215 | * There are a few possible implementations for actually computing 216 | * 'num_bits' and 'destination_range_start' for each of a symbol's 217 | * states. What we do is iterate through *all* states in ascending 218 | * order. 
This interleaves states for different symbols but guarantees 219 | * that all states for each symbol are visited in ascending order. 220 | * 'state_counts[sym]' is re-used as a counter which is incremented each 221 | * time after a state for symbol 'sym' is visited. 'X' is just the 222 | * distance between the initial value of 'state_counts[sym]' and the 223 | * closest power of 2 greater than or equal to 'state_counts[sym]'. 224 | * When the counter reaches this power of 2, then the number of bits 225 | * required, as computed by 'log2(num_states) - floor(log2(counter))', 226 | * decreases from 'min_bits + 1' to 'min_bits'. In addition, the 227 | * destination range start for each state is easily computed from the 228 | * value of the counter and num_bits at that state. 229 | */ 230 | for (state = 0; state < num_states; state++) { 231 | 232 | u32 sym = decode_table[state].entry; 233 | u32 counter = state_counts[sym]++; 234 | unsigned num_bits = log2_num_states - bsr32(counter); 235 | u32 destination_range_start = (counter << num_bits) - num_states; 236 | 237 | if (CPU_IS_LITTLE_ENDIAN()) { 238 | decode_table[state].le.num_bits = num_bits; 239 | decode_table[state].le.destination_range_start = destination_range_start; 240 | } else { 241 | decode_table[state].be.num_bits = num_bits; 242 | decode_table[state].be.destination_range_start = destination_range_start; 243 | } 244 | } 245 | 246 | return true; 247 | } 248 | 249 | /* Copy a word from @src to @dst, making no assumptions about alignment. */ 250 | static forceinline void 251 | copy_word_unaligned(const u8 *src, u8 *dst) 252 | { 253 | store_word_unaligned(load_word_unaligned(src), dst); 254 | } 255 | 256 | /* Copy 16 bytes from @src to @dst, making no assumptions about alignment. 
*/ 257 | static forceinline void 258 | copy_16_bytes_unaligned(const u8 *src, u8 *dst) 259 | { 260 | #ifdef __SSE2__ 261 | __m128i v = _mm_loadu_si128((const __m128i *)src); 262 | _mm_storeu_si128((__m128i *)dst, v); 263 | #else 264 | STATIC_ASSERT(WORDBYTES == 4 || WORDBYTES == 8); 265 | if (WORDBYTES == 4) { 266 | copy_word_unaligned(src + 0, dst + 0); 267 | copy_word_unaligned(src + 4, dst + 4); 268 | copy_word_unaligned(src + 8, dst + 8); 269 | copy_word_unaligned(src + 12, dst + 12); 270 | } else { 271 | copy_word_unaligned(src + 0, dst + 0); 272 | copy_word_unaligned(src + 8, dst + 8); 273 | } 274 | #endif 275 | } 276 | 277 | /* Build a word which consists of the byte @b repeated. */ 278 | static forceinline machine_word_t 279 | repeat_byte(u8 b) 280 | { 281 | machine_word_t v; 282 | 283 | STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); 284 | 285 | v = b; 286 | v |= v << 8; 287 | v |= v << 16; 288 | v |= v << ((WORDBITS == 64) ? 32 : 0); 289 | return v; 290 | } 291 | 292 | 293 | /****************************************************************************** 294 | * Input bitstream * 295 | ******************************************************************************/ 296 | 297 | /* 298 | * The state of the "input bitstream" consists of the following variables: 299 | * 300 | * - in_next: pointer to the next unread byte in the input buffer 301 | * 302 | * - in_end: pointer just past the end of the input buffer 303 | * 304 | * - bitbuf: a word-sized variable containing bits that have been read from 305 | * the input buffer. The buffered bits are right-aligned 306 | * (they're the low-order bits). 307 | * 308 | * - bitsleft: number of bits in 'bitbuf' that are valid. 309 | * 310 | * To make it easier for the compiler to optimize the code by keeping variables 311 | * in registers, these are declared as normal variables and manipulated using 312 | * macros. 
313 | */ 314 | 315 | /* 316 | * The maximum number of bits that can be requested to be in the bitbuffer 317 | * variable. This is the maximum value of 'n' that can be passed to 318 | * ENSURE_BITS(n). 319 | * 320 | * This not equal to WORDBITS because we never read less than one byte at a 321 | * time. If the bitbuffer variable contains more than (WORDBITS - 8) bits, then 322 | * we can't read another byte without first consuming some bits. So the maximum 323 | * count we can ensure is (WORDBITS - 7). 324 | */ 325 | #define MAX_ENSURE (WORDBITS - 7) 326 | 327 | /* 328 | * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if 329 | * 'n' is too large to be passed to ENSURE_BITS(n). Note: if 'n' is a compile 330 | * time constant, then this expression will be a compile-type constant. 331 | * Therefore, CAN_ENSURE() can be used choose between alternative 332 | * implementations at compile time. 333 | */ 334 | #define CAN_ENSURE(n) ((n) <= MAX_ENSURE) 335 | 336 | /* 337 | * Fill the bitbuffer variable, reading one byte at a time. 338 | * 339 | * Note: if we would overrun the input buffer, we just don't read anything, 340 | * leaving the bits as 0 but marking them as filled. This makes the 341 | * implementation simpler because this removes the need to distinguish between 342 | * "real" overruns and overruns that occur because of our own lookahead during 343 | * decompression. The disadvantage is that a "real" overrun can go undetected, 344 | * and the decompressor may return a success status rather than the expected 345 | * failure status if one occurs. However, this is not too important because 346 | * even if this specific case were to be handled "correctly", one could easily 347 | * come up with a different case where the compressed data would be corrupted in 348 | * such a way that fully retains its validity from the point of view of the 349 | * decompressor. 
Users should run a checksum against the decompressed data if 350 | * they wish to detect corruptions. 351 | */ 352 | #define FILL_BITS_BYTEWISE() \ 353 | do { \ 354 | do { \ 355 | if (likely(in_next != in_end)) \ 356 | bitbuf |= (machine_word_t)*in_next++ << bitsleft; \ 357 | else \ 358 | overrun_count++; \ 359 | bitsleft += 8; \ 360 | } while (bitsleft <= WORDBITS - 8); \ 361 | } while (0) 362 | 363 | /* 364 | * Fill the bitbuffer variable by reading the next word from the input buffer. 365 | * This can be significantly faster than FILL_BITS_BYTEWISE(). However, for 366 | * this to work correctly, the word must be interpreted in little-endian format. 367 | * In addition, the memory access may be unaligned. Therefore, this method is 368 | * most efficient on little-endian architectures that support fast unaligned 369 | * access, such as x86 and x86_64. 370 | */ 371 | #define FILL_BITS_WORDWISE() \ 372 | do { \ 373 | bitbuf |= get_unaligned_leword(in_next) << bitsleft; \ 374 | in_next += (WORDBITS - bitsleft) >> 3; \ 375 | bitsleft += (WORDBITS - bitsleft) & ~7; \ 376 | } while (0) 377 | 378 | /* 379 | * Load more bits from the input buffer until the specified number of bits is 380 | * present in the bitbuffer variable. 'n' must be <= MAX_ENSURE. 381 | */ 382 | #define ENSURE_BITS(n) \ 383 | do { \ 384 | if (bitsleft < (n)) { \ 385 | if (UNALIGNED_ACCESS_IS_FAST && \ 386 | likely(in_end - in_next >= WORDBYTES)) \ 387 | FILL_BITS_WORDWISE(); \ 388 | else \ 389 | FILL_BITS_BYTEWISE(); \ 390 | } \ 391 | } while (0) 392 | 393 | /* Remove and return the next 'n' bits from the bitbuffer variable. */ 394 | #define POP_BITS(n) \ 395 | ( \ 396 | bits = (u32)bitbuf & (((u32)1 << (n)) - 1), \ 397 | bitbuf >>= (n), \ 398 | bitsleft -= (n), \ 399 | bits \ 400 | ) 401 | 402 | /* 403 | * Align the input to the next byte boundary, discarding any remaining bits in 404 | * the current byte. 
 *
 * Note that if the bitbuffer variable currently contains more than 8 bits, then
 * we must rewind 'in_next', effectively putting those bits back.  Only the bits
 * in what would be the "current" byte if we were reading one byte at a time can
 * be actually discarded.
 */
#define ALIGN_INPUT()							\
do {									\
	/* Don't rewind over implicit zero bytes appended past the	\
	 * end of input (counted in 'overrun_count'). */		\
	in_next -= (bitsleft >> 3) - MIN(overrun_count, bitsleft >> 3);	\
	bitbuf = 0;							\
	bitsleft = 0;							\
} while (0)


/* The main decompressor structure */
struct xpack_decompressor {

	/*
	 * The FSE decoding table for each alphabet.  The literal table can be
	 * in union with the other tables because all literal symbols are
	 * decoded first.
	 */
	union {
		fse_decode_entry_t literal_decode_table
			[1 << MAX_LOG2_NUM_LITERAL_STATES];
		struct {
			fse_decode_entry_t litrunlen_decode_table
				[1 << MAX_LOG2_NUM_LITRUNLEN_STATES];
			fse_decode_entry_t length_decode_table
				[1 << MAX_LOG2_NUM_LENGTH_STATES];
			fse_decode_entry_t offset_decode_table
				[1 << MAX_LOG2_NUM_OFFSET_STATES];
			fse_decode_entry_t aligned_decode_table
				[1 << MAX_LOG2_NUM_ALIGNED_STATES];
		};
	};

	/* The FSE state counts for each alphabet */
	union {
		u16 state_counts[LITERAL_ALPHABET_SIZE +
				 LITRUNLEN_ALPHABET_SIZE +
				 LENGTH_ALPHABET_SIZE +
				 MAX_OFFSET_ALPHABET_SIZE +
				 ALIGNED_ALPHABET_SIZE];
		struct {
			u16 literal_state_counts[LITERAL_ALPHABET_SIZE];
			u16 litrunlen_state_counts[LITRUNLEN_ALPHABET_SIZE];
			u16 length_state_counts[LENGTH_ALPHABET_SIZE];
			u16 offset_state_counts[MAX_OFFSET_ALPHABET_SIZE];
			u16 aligned_state_counts[ALIGNED_ALPHABET_SIZE];
		};
	};
};

/* Instantiate the portable (default) decompressor from the template. */
#define FUNCNAME xpack_decompress_default
#define ATTRIBUTES
#include "decompress_impl.h"
#undef FUNCNAME
#undef ATTRIBUTES

/*
 * If the compiler supports it and the build isn't already targeting BMI2,
 * also instantiate a BMI2-accelerated decompressor, to be selected at runtime.
 */
#if X86_CPU_FEATURES_ENABLED && \
	COMPILER_SUPPORTS_BMI2_TARGET && !defined(__BMI2__)
#  define FUNCNAME xpack_decompress_bmi2
#  define ATTRIBUTES __attribute__((target("bmi2")))
#  include "decompress_impl.h"
#  undef FUNCNAME
#  undef ATTRIBUTES
#  define DISPATCH_ENABLED 1
#else
#  define DISPATCH_ENABLED 0
#endif

#if DISPATCH_ENABLED

static enum decompress_result
dispatch(struct xpack_decompressor *d, const void *in, size_t in_nbytes,
	 void *out, size_t out_nbytes_avail, size_t *actual_out_nbytes_ret);

typedef enum decompress_result (*decompress_func_t)
	(struct xpack_decompressor *d, const void *in, size_t in_nbytes,
	 void *out, size_t out_nbytes_avail, size_t *actual_out_nbytes_ret);

/*
 * Points at dispatch() until the first call; dispatch() then replaces it with
 * the best implementation for this CPU so later calls go there directly.
 */
static decompress_func_t decompress_impl = dispatch;

/*
 * First-call trampoline: probe CPU features once, cache the chosen
 * implementation in 'decompress_impl', and forward this call to it.
 */
static enum decompress_result
dispatch(struct xpack_decompressor *d, const void *in, size_t in_nbytes,
	 void *out, size_t out_nbytes_avail, size_t *actual_out_nbytes_ret)
{
	decompress_func_t f = xpack_decompress_default;
#if X86_CPU_FEATURES_ENABLED
	if (x86_have_cpu_feature(X86_CPU_FEATURE_BMI2))
		f = xpack_decompress_bmi2;
#endif
	decompress_impl = f;
	return (*f)(d, in, in_nbytes, out, out_nbytes_avail,
		    actual_out_nbytes_ret);
}
#endif /* DISPATCH_ENABLED */

/*
 * This is the main decompression routine.  See libxpack.h for the
 * documentation.
 *
 * Note that the real code is in decompress_impl.h.  The part here just handles
 * calling the appropriate implementation depending on the CPU features at
 */
LIBEXPORT enum decompress_result
xpack_decompress(struct xpack_decompressor *d, const void *in, size_t in_nbytes,
		 void *out, size_t out_nbytes_avail,
		 size_t *actual_out_nbytes_ret)
{
#if DISPATCH_ENABLED
	/* Indirect call; resolves to the best implementation after first use. */
	return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes_avail,
				  actual_out_nbytes_ret);
#else
	return xpack_decompress_default(d, in, in_nbytes, out, out_nbytes_avail,
					actual_out_nbytes_ret);
#endif
}

/* Allocate a decompressor.  Returns NULL on allocation failure. */
LIBEXPORT struct xpack_decompressor *
xpack_alloc_decompressor(void)
{
	return malloc(sizeof(struct xpack_decompressor));
}

/* Free a decompressor allocated with xpack_alloc_decompressor().  NULL is ok. */
LIBEXPORT void
xpack_free_decompressor(struct xpack_decompressor *d)
{
	free(d);
}
-------------------------------------------------------------------------------- /lib/xpack_compress.c: --------------------------------------------------------------------------------
/*
 * xpack_compress.c - compressor for the XPACK compression format
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef DECOMPRESSION_ONLY 29 | 30 | #ifdef __SSE2__ 31 | # include 32 | #endif 33 | #ifdef __SSE4_1__ 34 | # include 35 | #endif 36 | 37 | #include "hc_matchfinder.h" 38 | #include "lz_extend.h" 39 | #include "xpack_common.h" 40 | 41 | /* 42 | * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes, 43 | * except if the last block has to be shorter. 44 | */ 45 | #define MIN_BLOCK_LENGTH 10000 46 | 47 | /* 48 | * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but 49 | * the final size might be larger due to matches extending beyond the end of the 50 | * block. Specifically: 51 | * 52 | * - The greedy parser may choose an arbitrarily long match starting at the 53 | * SOFT_MAX_BLOCK_LENGTH'th byte. 54 | * 55 | * - The lazy parser may choose a sequence of literals starting at the 56 | * SOFT_MAX_BLOCK_LENGTH'th byte when it sees a sequence of increasing good 57 | * matches. The final match may be of arbitrary length. The length of the 58 | * literal sequence is approximately limited by the "nice match length" 59 | * parameter. The actual limit is related to match scores and may be 60 | * slightly different. We overestimate the limit as EXTRA_LITERAL_SPACE. 
 */
#define SOFT_MAX_BLOCK_LENGTH	300000
#define EXTRA_LITERAL_SPACE	512

/* Holds the symbols and extra offset bits needed to represent a match */
struct match {
	u8 litrunlen_sym;	/* symbol for the preceding literal run length */
	u8 length_sym;		/* symbol for the match length */
	u8 offset_sym;		/* symbol for the match offset */
	u32 extra_offset_bits;	/* verbatim extra offset bits */
};

/* Frequency counters for each alphabet */
struct freqs {
	u32 literal[LITERAL_ALPHABET_SIZE];
	u32 litrunlen[LITRUNLEN_ALPHABET_SIZE];
	u32 length[LENGTH_ALPHABET_SIZE];
	u32 offset[MAX_OFFSET_ALPHABET_SIZE];
	u32 aligned[ALIGNED_ALPHABET_SIZE];
};

/*
 * Finite State Entropy encoding information for a symbol.
 * See build_fse_encoding_tables() and encode_symbol() for how the two fields
 * are computed and used.
 */
struct fse_symbol_encoding_info {
	u32 adjusted_num_states_in_big_ranges;
	s32 next_states_begin;
};

/* Finite State Entropy encoding information for each alphabet */
struct codes {
	struct fse_symbol_encoding_info literal_sym_encinfo[LITERAL_ALPHABET_SIZE];
	struct fse_symbol_encoding_info litrunlen_sym_encinfo[LITRUNLEN_ALPHABET_SIZE];
	struct fse_symbol_encoding_info length_sym_encinfo[LENGTH_ALPHABET_SIZE];
	struct fse_symbol_encoding_info offset_sym_encinfo[MAX_OFFSET_ALPHABET_SIZE];
	struct fse_symbol_encoding_info aligned_sym_encinfo[ALIGNED_ALPHABET_SIZE];

	u16 literal_next_statesx[1 << MAX_LOG2_NUM_LITERAL_STATES];
	u16 litrunlen_next_statesx[1 << MAX_LOG2_NUM_LITRUNLEN_STATES];
	u16 length_next_statesx[1 << MAX_LOG2_NUM_LENGTH_STATES];
	u16 offset_next_statesx[1 << MAX_LOG2_NUM_OFFSET_STATES];
	u16 aligned_next_statesx[1 << MAX_LOG2_NUM_ALIGNED_STATES];

	unsigned log2_num_literal_states;
	unsigned log2_num_litrunlen_states;
	unsigned log2_num_length_states;
	unsigned log2_num_offset_states;
	unsigned log2_num_aligned_states;

	/* State counts per alphabet; union gives a flat view over all of them. */
	union {
		u16 state_counts[LITERAL_ALPHABET_SIZE +
				 LITRUNLEN_ALPHABET_SIZE +
				 LENGTH_ALPHABET_SIZE +
				 MAX_OFFSET_ALPHABET_SIZE +
				 ALIGNED_ALPHABET_SIZE];
		struct {
			u16 literal_state_counts[LITERAL_ALPHABET_SIZE];
			u16 litrunlen_state_counts[LITRUNLEN_ALPHABET_SIZE];
			u16 length_state_counts[LENGTH_ALPHABET_SIZE];
			u16 offset_state_counts[MAX_OFFSET_ALPHABET_SIZE];
			u16 aligned_state_counts[ALIGNED_ALPHABET_SIZE];
		};
	};
};

/* Block split statistics.  See "Block splitting algorithm" below. */
#define NUM_LITERAL_OBSERVATION_TYPES 8
#define NUM_MATCH_OBSERVATION_TYPES 2
#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
struct block_split_stats {
	u32 new_observations[NUM_OBSERVATION_TYPES];
	u32 observations[NUM_OBSERVATION_TYPES];
	u32 num_new_observations;
	u32 num_observations;
};

/* The main compressor structure */
struct xpack_compressor {

	unsigned nice_match_length;	/* stop searching at this match length */
	unsigned max_search_depth;	/* matchfinder search depth limit */
	u8 *in_buffer;			/* buffered copy of the input */
	size_t in_nbytes;
	size_t max_buffer_size;
	/* compression-level-specific implementation (greedy/lazy parse) */
	size_t (*impl)(struct xpack_compressor *, void *, size_t);

	struct freqs freqs;
	struct block_split_stats split_stats;
	struct codes codes;

	/* scratch space used by build_fse_encoding_tables() */
	unsigned cumul_state_counts[MAX_ALPHABET_SIZE];
	u8 state_to_symbol[MAX_NUM_STATES];

	u32 num_literals;
	u32 num_matches;
	u32 num_extra_bytes;

	u8 literals[SOFT_MAX_BLOCK_LENGTH + EXTRA_LITERAL_SPACE];
	struct match matches[DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH, MIN_MATCH_LEN) + 1];
	/* worst-case space for length/litrunlen escape bytes */
	u8 extra_bytes[6 + /* extra for actual block length > soft max */
		       MAX4(1 * DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     MIN_MATCH_LEN + LENGTH_ALPHABET_SIZE - 1),
			    3 * DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     MIN_MATCH_LEN + LENGTH_ALPHABET_SIZE - 1 + 0xFF),
			    1 * DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     LITRUNLEN_ALPHABET_SIZE - 1),
			    3 * DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     LITRUNLEN_ALPHABET_SIZE - 1 + 0xFF))];

	/* Hash chains matchfinder (MUST BE LAST!!!) */
	struct hc_matchfinder hc_mf;
};

/* Return the log base 2 of 'n', rounded up to the nearest integer. */
static forceinline unsigned
ilog2_ceil(u32 n)
{
	if (n <= 1)
		return 0;
	return 1 + bsr32(n - 1);
}

/* Select the log2(num_states) to use for an alphabet. */
static unsigned
select_log2_num_states(u32 total_freq, unsigned num_used_syms,
		       unsigned max_log2_num_states)
{
	unsigned num_states = 1 << max_log2_num_states; /* Default value */

	/*
	 * If there are not many symbols to be encoded, then it's not helpful to
	 * use many states.
	 */
	num_states = MIN(num_states, total_freq / 4);

	/*
	 * There must be at least as many states as distinct used symbols.
	 * Note: we're guaranteed num_used_syms > 0 here because of the earlier
	 * check, which implies that this calculation produces num_states > 0.
	 */
	num_states = MAX(num_states, num_used_syms);

	return ilog2_ceil(num_states);
}

/*
 * Remove states from symbols until the correct number of states is used.
 * Each pass removes a progressively larger fraction (1/8, 1/4, 1/2, then all
 * but one) of each multi-state symbol's states until the overrun is gone.
 */
static void
adjust_state_counts(u16 state_counts[], unsigned num_states_overrun,
		    unsigned alphabet_size)
{
	unsigned shift;
	unsigned sym;
	unsigned n;

	for (shift = 3; num_states_overrun != 0; shift--) {
		for (sym = 0; sym < alphabet_size; sym++) {
			if (state_counts[sym] > 1) {
				n = MIN((state_counts[sym] - 1) >> shift,
					num_states_overrun);
				state_counts[sym] -= n;
				num_states_overrun -= n;
				if (num_states_overrun == 0)
					break;
			}
		}
	}
}

/*
 * Determine how many states to assign to each symbol.
 *
 * Basically, for each symbol 'sym' we need to take the real number
 *
 *	freqs[sym] * (num_states / total_freq)
 *
 * and round it up or down to the nearest integer as appropriate to make all the
 * state_counts[] sum to num_states, while still approximating the real entropy
 * well.  However, this implementation does *not* compute the entropy-optimal
 * state counts.
 */
static void
compute_state_counts(const u32 freqs[], const u32 total_freq,
		     u16 state_counts[], const unsigned alphabet_size,
		     const unsigned log2_num_states)
{
	signed int remaining_states = 1 << log2_num_states;
	unsigned max_state_count = 0;
	unsigned sym_with_max_state_count = 0;
	unsigned sym = 0;

	/* Disabled SSE2/SSE4.1 experiment; the scalar loop below is the one
	 * actually in use. */
#if 0
	const float scale_factor = (float)(1 << log2_num_states) / (float)total_freq;
	const __m128 v_scale_factor = _mm_set1_ps(scale_factor);
	const __m128i v_lowcount_cutoff = _mm_set1_epi16(0x7FFF - (u16)(0.5 / scale_factor));
	__m128i v_num_states_used = _mm_set1_epi16(0);
	__m128i v_max_state_count = _mm_set1_epi16(0);
	__m128i v_sym_with_max_state_count = _mm_set1_epi16(0);
	__m128i v_syms = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);

	/* Process 8 freqs at a time */
	for (; sym < (alphabet_size & ~7); sym += 8) {

		/* Load the next freqs. */
		__m128i v_freq1 = _mm_loadu_si128((const __m128i *)&freqs[sym + 0]);
		__m128i v_freq2 = _mm_loadu_si128((const __m128i *)&freqs[sym + 4]);

		/* Prepare adjustment for the 'state_count == 0 && freq != 0' case */
		__m128i v_freqpack_saturated = _mm_packs_epi32(v_freq1, v_freq2);
		__m128i v_freqpack_saturated_adjusted = _mm_add_epi16(v_freqpack_saturated,
								      v_lowcount_cutoff);
		__m128i v_negative_adjustment = _mm_cmpgt_epi16(v_freqpack_saturated_adjusted,
								v_lowcount_cutoff);

		/* Compute: state_count = round(count * (num_states / total_freq)) */
		__m128 v_freqf1 = _mm_cvtepi32_ps(v_freq1);
		__m128 v_freqf2 = _mm_cvtepi32_ps(v_freq2);
		__m128 v_mul1 = _mm_mul_ps(v_freqf1, v_scale_factor);
		__m128 v_mul2 = _mm_mul_ps(v_freqf2, v_scale_factor);
		__m128i v_muli1 = _mm_cvtps_epi32(v_mul1);
		__m128i v_muli2 = _mm_cvtps_epi32(v_mul2);
		__m128i v_state_count = _mm_packs_epi32(v_muli1, v_muli2);

		/* If state_count == 0 but freq != 0, set state_count=1. */
		v_state_count = _mm_sub_epi16(v_state_count, v_negative_adjustment);

		/* Save the state counts */
		_mm_storeu_si128((__m128i *)&state_counts[sym], v_state_count);

		/* Update num_states_used */
		v_num_states_used = _mm_add_epi16(v_num_states_used, v_state_count);

		/* Update max_state_count */
		v_max_state_count = _mm_max_epi16(v_max_state_count, v_state_count);

		/* Update sym_with_max_state_count */
		__m128i v_is_new_max = _mm_cmpeq_epi16(v_state_count, v_max_state_count);
#ifdef __SSE4_1__
		v_sym_with_max_state_count = _mm_blendv_epi8(v_sym_with_max_state_count,
							     v_syms, v_is_new_max);
#else
		__m128i v_old_syms_to_keep = _mm_andnot_si128(v_is_new_max, v_sym_with_max_state_count);
		__m128i v_new_syms_to_set = _mm_and_si128(v_is_new_max, v_syms);
		v_sym_with_max_state_count = _mm_or_si128(v_old_syms_to_keep, v_new_syms_to_set);
#endif

		v_syms = _mm_add_epi32(v_syms, _mm_set1_epi16(8));
	}

	for (int i = 0; i < 8; i++) {
		remaining_states -= ((__v8hi)v_num_states_used)[i];
		if (((__v8hi)v_max_state_count)[i] > max_state_count) {
			max_state_count = ((__v8hi)v_max_state_count)[i];
			sym_with_max_state_count = ((__v8hi)v_sym_with_max_state_count)[i];
		}
	}
#endif /* __SSE2__ */

	/* Fixed-point scale: freq * highprec_step approximates
	 * freq * 2^31 / total_freq. */
	const u32 highprec_step = ((u32)1 << 31) / total_freq;
	const unsigned shift = 31 - log2_num_states - 1;

	for (; sym < alphabet_size; sym++) {
		/*
		 * Rescale the frequency.  Round up if the fractional part is
		 * greater than or equal to 0.5.  Otherwise, round down.
		 */
		unsigned state_count =
			(((freqs[sym] * highprec_step) >> shift) + 1) >> 1;

		/* Every used symbol must get at least one state. */
		if (state_count == 0 && freqs[sym] != 0)
			state_count = 1;

		state_counts[sym] = state_count;
		remaining_states -= state_count;

		if (state_count > max_state_count) {
			max_state_count = state_count;
			sym_with_max_state_count = sym;
		}
	}

	/*
	 * If there are still states to assign, assign them to the most common
	 * symbol.  Or if we assigned more states than were actually available,
	 * then either subtract from the most common symbol (for minor overruns)
	 * or use the slower adjustment algorithm (for major overruns).
	 */
	if (-remaining_states < (signed int)(max_state_count >> 2)) {
		state_counts[sym_with_max_state_count] += remaining_states;
	} else {
		adjust_state_counts(state_counts, -remaining_states,
				    alphabet_size);
	}
}

/* Build the FSE encoding tables for an alphabet, given the state counts. */
static void
build_fse_encoding_tables(struct xpack_compressor *c,
			  struct fse_symbol_encoding_info sym_encinfo[],
			  u16 next_statesx[],
			  const u16 state_counts[],
			  const unsigned alphabet_size,
			  const unsigned log2_num_states)
{
	const unsigned num_states = 1 << log2_num_states;
	const unsigned state_generator = get_state_generator(num_states);
	const unsigned state_mask = num_states - 1;
	unsigned cumul_total;
	unsigned sym;
	unsigned state;
	unsigned count;
	unsigned max_bits;

	/*
	 * Build sym_encinfo[], which provides encoding information for each
	 * used symbol.  At the same time, build cumul_state_counts[], which for
	 * each symbol provides the total state count of the symbols that
	 * numerically precede it.
	 */
	cumul_total = 0;
	for (sym = 0; sym < alphabet_size; sym++) {

		count = state_counts[sym];

		if (count == 0) /* Unused symbol? */
			continue;

		c->cumul_state_counts[sym] = cumul_total;

		/*
		 * Each encoding of this symbol requires either 'min_bits' or
		 * 'max_bits = min_bits + 1' bits, where 'min_bits' is the
		 * entropy of this symbol rounded down to the nearest integer:
		 *
		 *	min_bits = floor(log2(1/probability))
		 *	min_bits = floor(log2(1/(count/num_states)))
		 *	min_bits = floor(log2(num_states/count))
		 *	min_bits = floor(log2(num_states) - log2(count))
		 *	min_bits = log2(num_states) - ceil(log2(count))
		 */
		max_bits = log2_num_states - ilog2_ceil(count) + 1;

		/*
		 * Save a value that makes it possible to branchlessly find the
		 * num_bits for a given state.  See encode_symbol() for details.
		 */
		sym_encinfo[sym].adjusted_num_states_in_big_ranges =
			((u32)max_bits << MAX_LOG2_NUM_STATES) -
			((u32)count << max_bits);

		/*
		 * When we need to encode an instance of this symbol, we'll have
		 * a "current state".  We'll need to find which destination
		 * range the current state is in, and which state --- the "next
		 * state" from the encoder's point of view but the "previous
		 * state" from the decoder's point of view --- maps to that
		 * destination range.  How can we do this efficiently?
		 *
		 * The solution relies on these facts:
		 *
		 *	- We'll know the number of bits to use.  Consequently,
		 *	  we'll know the length of the destination range.
		 *	- The 'min_bits' destination ranges all precede the
		 *	  'max_bits' destination ranges.
		 *
		 * What we'll do is maintain the state adjusted upwards by
		 * 'num_states'.  Then, we'll right-shift it by the number of
		 * bits that need to be used.  If 'min_bits' were required, then
		 * the result will be 'num_states >> min_bits' plus the index of
		 * the destination range in the list of 'min_bits' destination
		 * ranges.  But if 'max_bits' were required, then the result
		 * will be 'num_states >> min_bits' minus the number of
		 * 'max_bits' destination ranges, plus the index of the
		 * destination range in the list of 'max_bits' destination
		 * ranges.  Result: we map states to consecutive integers, each
		 * of which identifies a destination range.  We can use these
		 * integers as indices into a lookup table for the next state.
		 *
		 * Below, 'cumul_total' is the index at which the entries will
		 * actually begin in 'next_statesx[]'.  'count' is the beginning
		 * of the sequence of destination range identifiers.  This is
		 * 'num_states >> min_bits' minus the number of 'max_bits'
		 * destination ranges, which is also the same as the number of
		 * states (or destination ranges).  Note that the result of the
		 * subtraction may be a negative number.
		 */
		sym_encinfo[sym].next_states_begin = (s32)cumul_total - (s32)count;

		cumul_total += count;
	}

	/* Assign states to symbols, spreading them with the state generator. */
	state = 0;
	for (sym = 0; sym < alphabet_size; sym++) {
		count = state_counts[sym];
		while (count--) {
			c->state_to_symbol[state] = sym;
			state = (state + state_generator) & state_mask;
		}
	}

	/*
	 * Build next_statesx[].  This array maps symbol occurrences in the
	 * state table, ordered primarily by increasing symbol value and
	 * secondarily by increasing state, to their states, adjusted upwards by
	 * num_states.
	 */
	for (state = 0; state < num_states; state++) {
		unsigned symbol = c->state_to_symbol[state];
		unsigned position = c->cumul_state_counts[symbol]++;
		next_statesx[position] = num_states + state;
	}
}

/*
 * Choose the FSE state counts for the specified alphabet, where each symbol has
 * the frequency given in @freqs.  Returns log2 of the number of states chosen.
 */
static unsigned
choose_state_counts(const u32 freqs[], unsigned alphabet_size,
		    unsigned max_log2_num_states, u16 state_counts[])
{
	u32 total_freq = 0;
	unsigned num_used_syms = 0;
	unsigned log2_num_states;
	unsigned sym;

	/* Compute the total frequency and the number of used symbols. */
	for (sym = 0; sym < alphabet_size; sym++) {
		if (freqs[sym] != 0) {
			num_used_syms++;
			total_freq += freqs[sym];
		}
	}

	/*
	 * If no symbols from this alphabet were used, then output a code that
	 * contains an arbitrary unused symbol.
	 */
	if (total_freq == 0) {
		state_counts[0] = 1;
		for (sym = 1; sym < alphabet_size; sym++)
			state_counts[sym] = 0;
		return 0;
	}

	/* Select the number of states to use. */
	log2_num_states = select_log2_num_states(total_freq, num_used_syms,
						 max_log2_num_states);

	/* Decide how many states to assign to each symbol. */
	compute_state_counts(freqs, total_freq, state_counts,
			     alphabet_size, log2_num_states);

	return log2_num_states;
}

/* Output stream for header (writes in forwards direction) */
struct header_ostream {
	machine_word_t bitbuf;	/* pending bits not yet written */
	unsigned bitcount;	/* number of valid bits in bitbuf */
	u8 *begin;		/* start of the output buffer */
	u8 *next;		/* current write position */
	u8 *end;		/* end of the output buffer */
};

/* Initialize a header output stream to write into [out, out + out_nbytes_avail). */
static void
header_ostream_init(struct header_ostream *os,
		    void *out, size_t out_nbytes_avail)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->begin = out;
	os->next = os->begin;
	os->end = os->next + out_nbytes_avail;
}

/*
 * Append 'num_bits' bits to the header stream.  On overflow of the output
 * buffer, 'next' is clamped to 'end' so that header_ostream_flush() reports
 * failure.
 */
static void
header_ostream_write_bits(struct header_ostream *os,
			  machine_word_t bits, unsigned num_bits)
{
	/*
	 * We only flush 'bitbuf' when it completely fills up.  This improves
	 * performance.
	 */
	os->bitbuf |= bits << os->bitcount;
	os->bitcount += num_bits;
	if (os->bitcount >= WORDBITS) {
		if (os->end - os->next >= WORDBYTES) {
			put_unaligned_leword(os->bitbuf, os->next);
			os->next += WORDBYTES;
		} else {
			os->next = os->end;
		}
		os->bitcount -= WORDBITS;
		/* Keep the bits that didn't fit in the flushed word. */
		os->bitbuf = bits >> (num_bits - os->bitcount);
	}
}

/*
 * Flush remaining header bits a byte at a time.  Returns the number of bytes
 * written, or 0 if the output buffer was completely filled (treated as
 * overflow).
 */
static size_t
header_ostream_flush(struct header_ostream *os)
{
	while ((int)os->bitcount > 0) {
		if (os->next != os->end)
			*os->next++ = os->bitbuf;
		os->bitcount -= 8;
		os->bitbuf >>= 8;
	}

	if (os->next == os->end) /* overflow? */
		return 0;

	return os->next - os->begin;
}

/*
 * Output the state counts.  Return the number of bytes written, or 0 if the
 * output buffer is too small.
 */
static void
write_state_counts(struct header_ostream *os,
		   const u16 state_counts[], unsigned num_state_counts)
{
	unsigned sym = 0;

	while (sym < num_state_counts) {
		unsigned count = state_counts[sym++];
		unsigned bits;
		unsigned num_bits;

		if (count == 0) {
			/* Run of zero counts: encode with zero-run codes. */
			unsigned start = sym - 1;
			unsigned num_zeroes;

			while (sym < num_state_counts && state_counts[sym] == 0)
				sym++;
			num_zeroes = sym - start;

			/* Long runs: emit ZEROCODE2 chunks first. */
			while (num_zeroes >= ZEROCODE2_MIN) {
				/* NOTE(review): this inner 'count' shadows the
				 * outer 'count'; intentional but fragile. */
				unsigned count = MIN(num_zeroes, ZEROCODE2_MAX);
				bits = ((count - ZEROCODE2_MIN) << CODEBITS) | ZEROCODE2;
				num_bits = ZEROCODE2_NBITS + CODEBITS;
				header_ostream_write_bits(os, bits, num_bits);
				num_zeroes -= count;
			}

			if (num_zeroes < ZEROCODE1_MIN)
				continue;

			bits = ((num_zeroes - ZEROCODE1_MIN) << CODEBITS) | ZEROCODE1;
			num_bits = ZEROCODE1_NBITS + CODEBITS;
		} else {
			/* Nonzero count: send its bit-length, then the count
			 * with its leading 1 bit stripped. */
			unsigned order = bsr32(count);
			bits = ((count ^ (1 << order)) << CODEBITS) | order;
			num_bits = order + CODEBITS;
		}
		header_ostream_write_bits(os, bits, num_bits);
	}
}

/* Output stream for encoded symbols (writes in backwards direction) */
struct symbol_ostream {
	machine_word_t bitbuf;	/* pending bits not yet written */
	unsigned bitcount;	/* number of valid bits in bitbuf */
	u8 *begin;		/* start of the output buffer */
	u8 *next;		/* current write position (moves backwards) */
	u8 *end;		/* end of the output buffer */
};

/* Initialize a backwards symbol stream over [buffer, buffer + size). */
static void
symbol_ostream_init(struct symbol_ostream *os, void *buffer, size_t size)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->begin = buffer;
	os->end = os->begin + size;
	/* Start one word before the end (clamped for tiny buffers). */
	os->next = os->end - MIN(WORDBYTES, size);
}

/*
 * Add bits to the bitbuffer variable, without flushing.  The caller must ensure
 * there is enough space.
 */
static forceinline void
symbol_ostream_add_bits(struct symbol_ostream *os, machine_word_t bits, unsigned num_bits)
{
	os->bitbuf = (os->bitbuf << num_bits) | bits;
	os->bitcount += num_bits;
}

/*
 * Flush bits from the bitbuffer variable to the output buffer.  After calling
 * this, the bitbuffer variable is guaranteed to contain fewer than 8 bits.
 */
static forceinline void
symbol_ostream_flush_bits(struct symbol_ostream *os)
{
	machine_word_t bits = os->bitbuf <<
			((WORDBITS - os->bitcount) & (WORDBITS - 1));

	put_unaligned_leword(bits, os->next);
	/* Move backwards, clamped so 'next' never passes 'begin'. */
	os->next -= MIN(os->next - os->begin, os->bitcount >> 3);
	os->bitcount &= 7;
}

/*
 * Flush any remaining bits to the output buffer and terminate the bitstream.
 * Return the total number of bytes written to the output buffer, or 0 if there
 * was not enough space available in the output buffer to write everything.
 */
static size_t
symbol_ostream_flush(struct symbol_ostream *os)
{
	symbol_ostream_flush_bits(os);

	if (os->next == os->begin) /* Not enough space? */
		return 0;

	/*
	 * Terminate the last byte with a '1' bit so that the decoder knows
	 * where to start from.
	 */
	os->bitbuf <<= 8 - os->bitcount;
	os->bitbuf |= (1 << (7 - os->bitcount));
	os->next += WORDBYTES - 1;
	*os->next = (u8)os->bitbuf;

	return os->end - os->next;
}

/*
 * Emit the final state of a stream, biased down by num_states so it fits in
 * log2_num_states bits; the decoder reads it first as its initial state.
 */
static forceinline void
encode_initial_state(struct symbol_ostream *os, unsigned initial_statex,
		     unsigned log2_num_states)
{
	symbol_ostream_add_bits(os, initial_statex - (1 << log2_num_states),
				log2_num_states);
	symbol_ostream_flush_bits(os);
}

/* Encode a symbol using Finite State Entropy encoding */
static forceinline unsigned
encode_symbol(unsigned symbol, unsigned cur_statex, struct symbol_ostream *os,
	      const struct fse_symbol_encoding_info sym_encinfo[],
	      const u16 next_statesx[])
{
	unsigned num_bits;

	/*
	 * Calculate the number of bits required to encode this symbol when in
	 * the current state.  'adjusted_num_states_in_big_ranges' was set to
	 * (max_bits << MAX_LOG2_NUM_STATES) - 2*num_states + (number of states
	 * in max_bits destination ranges).  If we add cur_statex (which is
	 * num_states plus the current state) to this value, then we get a
	 * number less than max_bits << MAX_LOG2_NUM_STATES iff the current
	 * state is in a min_bits destination range (as opposed to a 'max_bits =
	 * min_bits + 1' destination range).  Then the correct num_bits, which
	 * is always either min_bits or max_bits, is simply that value right
	 * shifted by MAX_LOG2_NUM_STATES.
	 */
	num_bits = (sym_encinfo[symbol].adjusted_num_states_in_big_ranges +
		    cur_statex) >> MAX_LOG2_NUM_STATES;

	/* Output the appropriate number of bits of the state. */
	symbol_ostream_add_bits(os, cur_statex & ((1 << num_bits) - 1), num_bits);

	/* Look up the next state using the high bits of the current state. */
	return next_statesx[sym_encinfo[symbol].next_states_begin +
			    (cur_statex >> num_bits)];
}

/*
 * Encode the matches and literals.  Note that the encoding order is backwards
 * from the decoding order!
 */
static size_t
encode_items(const struct xpack_compressor *c, void *out, size_t out_nbytes_avail,
	     bool is_aligned_block)
{
	struct symbol_ostream os;
	size_t nbytes;
	unsigned order;
	unsigned litrunlen_statex;
	unsigned length_statex;
	unsigned offset_statex;
	unsigned aligned_statex;
#if NUM_LITERAL_STREAMS == 2
	unsigned literal_statex_1;
	unsigned literal_statex_2;
#else
	unsigned literal_statex;
#endif
	s32 i;

	symbol_ostream_init(&os, out, out_nbytes_avail);

	/* Encode the matches and literal run lengths */

	litrunlen_statex = 1 << c->codes.log2_num_litrunlen_states;
	length_statex = 1 << c->codes.log2_num_length_states;
	offset_statex = 1 << c->codes.log2_num_offset_states;
	aligned_statex = 1 << c->codes.log2_num_aligned_states;

	i = c->num_matches - 1;
	if (i >= 0 && c->matches[i].offset_sym == MAX_OFFSET_ALPHABET_SIZE) {
		/* Terminating literal run length, with no following match */
		litrunlen_statex = encode_symbol(c->matches[i].litrunlen_sym,
						 litrunlen_statex,
						 &os,
						 c->codes.litrunlen_sym_encinfo,
						 c->codes.litrunlen_next_statesx);
		symbol_ostream_flush_bits(&os);
		i--;
	}

	/* Iterate backwards so the decoder sees items in forwards order. */
	for (; i >= 0; i--) {

		const struct match *match = &c->matches[i];

		if (match->offset_sym >= NUM_REPS) {

			unsigned offset_log2 = match->offset_sym - NUM_REPS;

			if (is_aligned_block && offset_log2 >= NUM_ALIGNED_BITS) {
				/* High extra bits verbatim, low bits via the
				 * aligned-offset FSE code. */
				symbol_ostream_add_bits(&os,
					match->extra_offset_bits >> NUM_ALIGNED_BITS,
					offset_log2 - NUM_ALIGNED_BITS);
				aligned_statex = encode_symbol(match->extra_offset_bits & (ALIGNED_ALPHABET_SIZE - 1),
							       aligned_statex,
							       &os,
							       c->codes.aligned_sym_encinfo,
							       c->codes.aligned_next_statesx);
			} else {
				symbol_ostream_add_bits(&os, match->extra_offset_bits, offset_log2);
			}
			symbol_ostream_flush_bits(&os);
		}

		offset_statex = encode_symbol(match->offset_sym,
					      offset_statex,
					      &os,
					      c->codes.offset_sym_encinfo,
					      c->codes.offset_next_statesx);
		symbol_ostream_flush_bits(&os);

		length_statex = encode_symbol(match->length_sym,
					      length_statex,
					      &os,
					      c->codes.length_sym_encinfo,
					      c->codes.length_next_statesx);

		litrunlen_statex = encode_symbol(match->litrunlen_sym,
						 litrunlen_statex,
						 &os,
						 c->codes.litrunlen_sym_encinfo,
						 c->codes.litrunlen_next_statesx);
		symbol_ostream_flush_bits(&os);
	}

	/* Encode the inital states for matches and literal run lengths */

	if (is_aligned_block)
		encode_initial_state(&os, aligned_statex, c->codes.log2_num_aligned_states);
	encode_initial_state(&os, offset_statex, c->codes.log2_num_offset_states);
	encode_initial_state(&os, length_statex, c->codes.log2_num_length_states);
	encode_initial_state(&os, litrunlen_statex, c->codes.log2_num_litrunlen_states);

	/* Encode the literals */

#if NUM_LITERAL_STREAMS == 2
	/* Two interleaved literal streams: even/odd positions alternate. */
	literal_statex_1 = 1 << c->codes.log2_num_literal_states;
	literal_statex_2 = 1 << c->codes.log2_num_literal_states;

	for (i = c->num_literals - 1; i >= 1; i -= 2) {

		literal_statex_1 = encode_symbol(c->literals[i],
						 literal_statex_1,
						 &os,
						 c->codes.literal_sym_encinfo,
						 c->codes.literal_next_statesx);

		literal_statex_2 = encode_symbol(c->literals[i - 1],
						 literal_statex_2,
						 &os,
						 c->codes.literal_sym_encinfo,
						 c->codes.literal_next_statesx);

		symbol_ostream_flush_bits(&os);
	}

	if (c->num_literals & 1) {
		literal_statex_1 = encode_symbol(c->literals[0],
						 literal_statex_1,
						 &os,
						 c->codes.literal_sym_encinfo,
						 c->codes.literal_next_statesx);
		symbol_ostream_flush_bits(&os);

		/* last state the encoder used is state_1
		 * => first state the encoder will see is state_1
		 * => numbering will be the same
		 * => encoder must output state_2, then state_1 */
		encode_initial_state(&os, literal_statex_2, c->codes.log2_num_literal_states);
		encode_initial_state(&os, literal_statex_1, c->codes.log2_num_literal_states);
	} else {
		/* Reversed numbering */
		encode_initial_state(&os, literal_statex_1, c->codes.log2_num_literal_states);
		encode_initial_state(&os, literal_statex_2, c->codes.log2_num_literal_states);
	}

#else /* NUM_LITERAL_STREAMS == 2 */

	literal_statex = 1 << c->codes.log2_num_literal_states;

	for (i = c->num_literals - 1; i >= 0; i--) {

		literal_statex = encode_symbol(c->literals[i],
					       literal_statex,
					       &os,
					       c->codes.literal_sym_encinfo,
					       c->codes.literal_next_statesx);

		symbol_ostream_flush_bits(&os);
	}

	encode_initial_state(&os, literal_statex, c->codes.log2_num_literal_states);
#endif /* NUM_LITERAL_STREAMS != 2 */

	/* Literal count, sent as (bit-length, mantissa) of num_literals + 1 */
	order = bsr32(c->num_literals + 1);
	symbol_ostream_add_bits(&os, (c->num_literals + 1) -
				((u32)1 << order), order);
	symbol_ostream_add_bits(&os, order, 5);

	nbytes = symbol_ostream_flush(&os);
	if (nbytes == 0)
		return 0;

	/*
	 * We wrote the data at the end of the output space going backwards.
	 * Now move the data to the beginning.
	 */
	memmove(out, os.next, nbytes);

	return nbytes;
}

/*
 * Write the block size field: a single '1' bit for the default block size,
 * otherwise a '0' bit followed by the size in NUM_BLOCKSIZE_BITS bits.
 */
static void
write_block_size(struct header_ostream *os, u32 block_size)
{
	u32 bits;
	int num_bits;

	if (block_size == DEFAULT_BLOCK_SIZE) {
		bits = 1;
		num_bits = 1;
	} else {
		bits = block_size << 1;
		num_bits = 1 + NUM_BLOCKSIZE_BITS;
	}

	header_ostream_write_bits(os, bits, num_bits);
}

/* Heuristic for using ALIGNED blocks */
static int
choose_block_type(struct xpack_compressor *c)
{
	u32 min_count = -1;	/* i.e. UINT32_MAX */
	u32 max_count = 0;

	unsigned sym;

	for (sym = 0; sym < ALIGNED_ALPHABET_SIZE; sym++) {
		min_count = MIN(min_count, c->freqs.aligned[sym]);
		max_count = MAX(max_count, c->freqs.aligned[sym]);
	}

	if (min_count * 3 < max_count) /* unbalanced? */
		return BLOCKTYPE_ALIGNED;
	else
		return BLOCKTYPE_VERBATIM;
}

/******************************************************************************/

/*
 * Block splitting algorithm.  The problem is to decide when it is worthwhile to
 * start a new block with new entropy codes.  There is a theoretically optimal
 * solution: recursively consider every possible block split, considering the
 * exact cost of each block, and choose the minimum cost approach.  But this is
 * far too slow.  Instead, as an approximation, we can count symbols and after
 * every N symbols, compare the expected distribution of symbols based on the
 * previous data with the actual distribution.  If they differ "by enough", then
 * start a new block.
 *
 * As an optimization and heuristic, we don't distinguish between every symbol
 * but rather we combine many symbols into a single "observation type".
For 953 | * literals we only look at the high bits and low bits, and for matches we only 954 | * look at whether the match is long or not. The assumption is that for typical 955 | * "real" data, places that are good block boundaries will tend to be noticable 956 | * based only on changes in these aggregate frequencies, without looking for 957 | * subtle differences in individual symbols. For example, a change from ASCII 958 | * bytes to non-ASCII bytes, or from few matches (generally less compressible) 959 | * to many matches (generally more compressible), would be easily noticed based 960 | * on the aggregates. 961 | * 962 | * For determining whether the frequency distributions are "different enough" to 963 | * start a new block, the simply heuristic of splitting when the sum of absolute 964 | * differences exceeds a constant seems to be good enough. We also add a number 965 | * proportional to the block size so that the algorithm is more likely to end 966 | * large blocks than small blocks. This reflects the general expectation that 967 | * it will become increasingly beneficial to start a new block as the current 968 | * blocks grows larger. 969 | * 970 | * Finally, for an approximation, it is not strictly necessary that the exact 971 | * symbols being used are considered. With "near-optimal parsing", for example, 972 | * the actual symbols that will be used are unknown until after the block 973 | * boundary is chosen and the block has been optimized. Since the final choices 974 | * cannot be used, we can use preliminary "greedy" choices instead. 975 | */ 976 | 977 | /* Initialize the block split statistics when starting a new block. 
 */
static void
init_block_split_stats(struct block_split_stats *stats)
{
	int i;

	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->new_observations[i] = 0;
		stats->observations[i] = 0;
	}
	stats->num_new_observations = 0;
	stats->num_observations = 0;
}

/* Literal observation.  Heuristic: use the top 2 bits and low 1 bit of the
 * literal, for 8 possible literal observation types. */
static forceinline void
observe_literal(struct block_split_stats *stats, u8 lit)
{
	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
	stats->num_new_observations++;
}

/* Match observation.  Heuristic: use one observation type for "short match" and
 * one observation type for "long match" (length >= 9). */
static forceinline void
observe_match(struct block_split_stats *stats, unsigned length)
{
	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
	stats->num_new_observations++;
}

/*
 * Compare the new observations against the accumulated ones.  Returns true if
 * the distributions differ enough that the caller should end the block now;
 * otherwise folds the new observations into the accumulated totals and
 * returns false.
 */
static bool
do_end_block_check(struct block_split_stats *stats, u32 block_size)
{
	int i;

	if (stats->num_observations > 0) {

		/* Note: to avoid slow divisions, we do not divide by
		 * 'num_observations', but rather do all math with the numbers
		 * multiplied by 'num_observations'. */
		u32 total_delta = 0;
		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			u32 expected = stats->observations[i] * stats->num_new_observations;
			u32 actual = stats->new_observations[i] * stats->num_observations;
			u32 delta = (actual > expected) ? actual - expected :
							  expected - actual;
			total_delta += delta;
		}

		/* Ready to end the block?  The 'block_size >> 12' term biases
		 * the check towards ending larger blocks. */
		if (total_delta + (block_size >> 12) * stats->num_observations >=
		    200 * stats->num_observations)
			return true;
	}

	/* Not ending the block: merge the new observations into the totals. */
	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->num_observations += stats->new_observations[i];
		stats->observations[i] += stats->new_observations[i];
		stats->new_observations[i] = 0;
	}
	stats->num_new_observations = 0;
	return false;
}

/*
 * Should the current block be ended?  Only runs the (relatively expensive)
 * distribution check once at least 512 new observations have accumulated, the
 * block has reached MIN_BLOCK_LENGTH, and at least 16384 input bytes remain.
 */
static forceinline bool
should_end_block(struct block_split_stats *stats,
		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
{
	/* Ready to check block split statistics? */
	if (stats->num_new_observations < 512 ||
	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
	    in_end - in_next < 16384)
		return false;

	return do_end_block_check(stats, in_next - in_block_begin);
}

/******************************************************************************/

/* Reset the per-block state: symbol frequencies, recorded items, extra bytes,
 * and block split statistics. */
static void
begin_block(struct xpack_compressor *c)
{
	memset(&c->freqs, 0, sizeof(c->freqs));
	c->num_literals = 0;
	c->num_matches = 0;
	c->num_extra_bytes = 0;
	init_block_split_stats(&c->split_stats);
}

/* Record one literal byte and tally its frequency. */
static void
record_literal(struct xpack_compressor *c, u8 literal)
{
	c->literals[c->num_literals++] = literal;
	c->freqs.literal[literal]++;
}

/*
 * Record the length of the literal run preceding a match.  Run lengths too
 * large for the litrunlen alphabet overflow into c->extra_bytes: one byte if
 * the excess is < 0xFF, otherwise an 0xFF marker byte followed by 3 more bytes
 * (little-endian) of the remaining excess.
 */
static void
record_litrunlen(struct xpack_compressor *c, struct match *match, u32 litrunlen)
{
	unsigned litrunlen_sym;

	if (litrunlen >= LITRUNLEN_ALPHABET_SIZE - 1) {
		u32 v = litrunlen - (LITRUNLEN_ALPHABET_SIZE - 1);
		if (v < 0xFF) {
			c->extra_bytes[c->num_extra_bytes++] = v;
		} else {
			v -= 0xFF;
			c->extra_bytes[c->num_extra_bytes++] = 0xFF;
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 0);
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 8);
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 16);
		}
		litrunlen_sym = LITRUNLEN_ALPHABET_SIZE - 1;
	} else {
		litrunlen_sym = litrunlen;
	}

	match->litrunlen_sym = litrunlen_sym;
	c->freqs.litrunlen[litrunlen_sym]++;
}

/*
 * Record a match length (biased by MIN_MATCH_LEN).  Lengths too large for the
 * length alphabet overflow into c->extra_bytes using the same scheme as
 * record_litrunlen().
 */
static void
record_length(struct xpack_compressor *c, struct match *match, u32 length)
{
	unsigned length_sym;

	length -= MIN_MATCH_LEN;

	if (length >= LENGTH_ALPHABET_SIZE - 1) {
		u32 v = length - (LENGTH_ALPHABET_SIZE - 1);
		if (v < 0xFF) {
			c->extra_bytes[c->num_extra_bytes++] = v;
		} else {
			v -= 0xFF;
			c->extra_bytes[c->num_extra_bytes++] = 0xFF;
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 0);
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 8);
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 16);
		}
		length_sym = LENGTH_ALPHABET_SIZE - 1;
	} else {
		length_sym = length;
	}

	match->length_sym = length_sym;
	c->freqs.length[length_sym]++;
}

/*
 * Record a match with an explicit (non-repeat) offset.  The offset symbol is
 * NUM_REPS + floor(log2(offset)); the remaining bits of the offset are saved
 * as extra bits.  Also tallies the "aligned" symbol (low bits of the offset)
 * when there are at least NUM_ALIGNED_BITS extra bits.
 */
static void
record_explicit_offset(struct xpack_compressor *c, struct match *match,
		       u32 offset)
{
	unsigned offset_log2 = bsr32(offset);
	unsigned offset_sym = NUM_REPS + offset_log2;

	match->offset_sym = offset_sym;
	c->freqs.offset[offset_sym]++;
	match->extra_offset_bits = offset - ((u32)1 << offset_log2);
	if (offset_log2 >= NUM_ALIGNED_BITS)
		c->freqs.aligned[offset & (ALIGNED_ALPHABET_SIZE - 1)]++;
}

/* Record a match that reuses recent offset 'rep_idx' (offset symbols
 * 0..NUM_REPS-1 are the repeat-offset slots). */
static void
record_repeat_offset(struct xpack_compressor *c, struct match *match,
		     unsigned rep_idx)
{
	match->offset_sym = rep_idx;
	c->freqs.offset[rep_idx]++;
}

/*
 * Write one complete compressed block to 'out': the block header, the extra
 * bytes, and the FSE-encoded items.  Returns the total number of bytes
 * written, or 0 if 'out_nbytes_avail' was too small.
 */
static size_t
write_block(struct xpack_compressor *c, void *out, size_t out_nbytes_avail,
	    u32 block_size, u32 last_litrunlen, bool is_final_block)
{
	struct header_ostream os;
	size_t header_size;
	size_t items_size;
	int block_type;
	unsigned num_state_counts;
	unsigned order;

	/* Final litrunlen: stored as a pseudo-match whose offset_sym is the
	 * out-of-range sentinel MAX_OFFSET_ALPHABET_SIZE */
	record_litrunlen(c, &c->matches[c->num_matches], last_litrunlen);
	c->matches[c->num_matches].offset_sym = MAX_OFFSET_ALPHABET_SIZE;
	c->num_matches++;

	/* Choose the block type */
	block_type = choose_block_type(c);

	header_ostream_init(&os, out, out_nbytes_avail);

	/* Output the "final block" flag */
	header_ostream_write_bits(&os, is_final_block, 1);

	/* Output the block type */
	header_ostream_write_bits(&os, block_type, NUM_BLOCKTYPE_BITS);

	/* Output the block size */
	write_block_size(&os, block_size);

	/* Compute FSE state counts for each alphabet */

	c->codes.log2_num_literal_states =
		choose_state_counts(c->freqs.literal,
				    LITERAL_ALPHABET_SIZE,
				    MAX_LOG2_NUM_LITERAL_STATES,
				    c->codes.literal_state_counts);

	c->codes.log2_num_litrunlen_states =
		choose_state_counts(c->freqs.litrunlen,
				    LITRUNLEN_ALPHABET_SIZE,
				    MAX_LOG2_NUM_LITRUNLEN_STATES,
				    c->codes.litrunlen_state_counts);

	c->codes.log2_num_length_states =
		choose_state_counts(c->freqs.length,
				    LENGTH_ALPHABET_SIZE,
				    MAX_LOG2_NUM_LENGTH_STATES,
				    c->codes.length_state_counts);

	c->codes.log2_num_offset_states =
		choose_state_counts(c->freqs.offset,
				    MAX_OFFSET_ALPHABET_SIZE,
				    MAX_LOG2_NUM_OFFSET_STATES,
				    c->codes.offset_state_counts);

	if (block_type == BLOCKTYPE_ALIGNED) {
		c->codes.log2_num_aligned_states =
			choose_state_counts(c->freqs.aligned,
					    ALIGNED_ALPHABET_SIZE,
					    MAX_LOG2_NUM_ALIGNED_STATES,
					    c->codes.aligned_state_counts);
	}

	/* Output the FSE state counts for each alphabet */
	header_ostream_write_bits(&os, c->codes.log2_num_literal_states, 4);
	header_ostream_write_bits(&os, c->codes.log2_num_litrunlen_states, 4);
	header_ostream_write_bits(&os, c->codes.log2_num_length_states, 4);
	header_ostream_write_bits(&os, c->codes.log2_num_offset_states, 4);
	if (block_type == BLOCKTYPE_ALIGNED)
		header_ostream_write_bits(&os, c->codes.log2_num_aligned_states, 4);

	/* The per-alphabet state count arrays must be contiguous so they can
	 * be written with a single write_state_counts() call, with the aligned
	 * counts last (so they can be dropped for non-ALIGNED blocks). */
#ifndef _MSC_VER
	STATIC_ASSERT(offsetof(struct codes,
			       aligned_state_counts[ALIGNED_ALPHABET_SIZE]) ==
		      offsetof(struct codes, state_counts) + sizeof(c->codes.state_counts));
#endif
	num_state_counts = ARRAY_LEN(c->codes.state_counts);
	if (block_type != BLOCKTYPE_ALIGNED)
		num_state_counts -= ALIGNED_ALPHABET_SIZE;

	write_state_counts(&os, c->codes.state_counts, num_state_counts);

	/* Output the number of extra bytes (variable-length: 5-bit 'order'
	 * plus the low 'order' bits of num_extra_bytes + 1) */
	order = bsr32(c->num_extra_bytes + 1);
	header_ostream_write_bits(&os, order, 5);
	header_ostream_write_bits(&os,
				  (c->num_extra_bytes + 1) - ((u32)1 << order),
				  order);

	/* Align to the next byte boundary */
	header_size = header_ostream_flush(&os);
	if (header_size == 0)
		return 0;

	/* Add the extra bytes */
	if (c->num_extra_bytes >= out_nbytes_avail - header_size)
		return 0;
	memcpy((u8 *)out + header_size, c->extra_bytes, c->num_extra_bytes);
	header_size += c->num_extra_bytes;

	/* Build the FSE encoding tables for each alphabet */

	build_fse_encoding_tables(c, c->codes.literal_sym_encinfo,
				  c->codes.literal_next_statesx,
				  c->codes.literal_state_counts,
				  LITERAL_ALPHABET_SIZE,
				  c->codes.log2_num_literal_states);

	build_fse_encoding_tables(c, c->codes.litrunlen_sym_encinfo,
				  c->codes.litrunlen_next_statesx,
				  c->codes.litrunlen_state_counts,
				  LITRUNLEN_ALPHABET_SIZE,
				  c->codes.log2_num_litrunlen_states);

	build_fse_encoding_tables(c, c->codes.length_sym_encinfo,
				  c->codes.length_next_statesx,
				  c->codes.length_state_counts,
				  LENGTH_ALPHABET_SIZE,
				  c->codes.log2_num_length_states);

	build_fse_encoding_tables(c, c->codes.offset_sym_encinfo,
				  c->codes.offset_next_statesx,
				  c->codes.offset_state_counts,
				  MAX_OFFSET_ALPHABET_SIZE,
				  c->codes.log2_num_offset_states);

	if (block_type == BLOCKTYPE_ALIGNED) {
		build_fse_encoding_tables(c, c->codes.aligned_sym_encinfo,
					  c->codes.aligned_next_statesx,
					  c->codes.aligned_state_counts,
					  ALIGNED_ALPHABET_SIZE,
					  c->codes.log2_num_aligned_states);
	}

	/* Encode the items */

	items_size = encode_items(c, (u8 *)out + header_size,
				  out_nbytes_avail - header_size,
				  block_type == BLOCKTYPE_ALIGNED);
	if (items_size == 0)
		return 0;

	return header_size + items_size;
}

/*
 * Greedy parser: at each position take the longest match found (or a literal),
 * splitting the input into blocks via should_end_block().  Returns the
 * compressed size, or 0 if the output buffer was too small.
 */
static size_t
compress_greedy(struct xpack_compressor *c, void *out, size_t out_nbytes_avail)
{
	u8 * const out_begin = out;
	u8 * out_next = out_begin;
	u8 * const out_end = out_begin + out_nbytes_avail;
	const u8 * const in_begin = c->in_buffer;
	const u8 * in_next = in_begin;
	const u8 * const in_end = in_begin + c->in_nbytes;
	u32 max_len = MIN(c->in_nbytes, UINT32_MAX);
	u32 nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};
	u32 recent_offsets[NUM_REPS];

	init_recent_offsets(recent_offsets);
	hc_matchfinder_init(&c->hc_mf);

	do {
		/* Starting a new block */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next +
			MIN(SOFT_MAX_BLOCK_LENGTH, in_end - in_next);
		u32 length;
		u32 offset;
		size_t nbytes;
		u32 litrunlen = 0;

		begin_block(c);

		do {
			/* Near the end of the input, clamp the match limits */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(max_len, nice_len);
			}

			/* Find the longest match at the current position. */

			length = hc_matchfinder_longest_match(&c->hc_mf,
							      in_begin,
							      in_next - in_begin,
#if MIN_MATCH_LEN == 4
							      3,
#else
							      2,
#endif
							      max_len,
							      nice_len,
							      c->max_search_depth,
							      next_hashes,
							      &offset);
#if MIN_MATCH_LEN == 4
			if (length < 4) {
#else
			if (length < 3 || (length == 3 && offset >= 4096)) {
#endif
				/* Literal */
				observe_literal(&c->split_stats, *in_next);
				record_literal(c, *in_next);
				in_next++;
				litrunlen++;
			} else {
				/* Match.  Check the recent offsets first; a
				 * matching slot is swapped to the front. */
				struct match *match = &c->matches[c->num_matches++];

				STATIC_ASSERT(NUM_REPS >= 1 && NUM_REPS <= 4);

				observe_match(&c->split_stats, length);

				if (offset == recent_offsets[0]) {
					record_repeat_offset(c, match, 0);
				}
#if NUM_REPS >= 2
				else if (offset == recent_offsets[1]) {
					recent_offsets[1] = recent_offsets[0];
					record_repeat_offset(c, match, 1);
				}
#endif
#if NUM_REPS >= 3
				else if (offset == recent_offsets[2]) {
					recent_offsets[2] = recent_offsets[0];
					record_repeat_offset(c, match, 2);
				}
#endif
#if NUM_REPS >= 4
				else if (offset == recent_offsets[3]) {
					recent_offsets[3] = recent_offsets[0];
					record_repeat_offset(c, match, 3);
				}
#endif
				else {
					/* New explicit offset: shift the queue */
					record_explicit_offset(c, match, offset);
#if NUM_REPS >= 4
					recent_offsets[3] = recent_offsets[2];
#endif
#if NUM_REPS >= 3
					recent_offsets[2] = recent_offsets[1];
#endif
#if NUM_REPS >= 2
					recent_offsets[1] = recent_offsets[0];
#endif
				}
				recent_offsets[0] = offset;
				record_litrunlen(c, match, litrunlen);
				record_length(c, match, length);

				/* Advance past the match, indexing the skipped
				 * positions in the matchfinder */
				in_next = hc_matchfinder_skip_positions(&c->hc_mf,
									in_begin,
									in_next + 1 - in_begin,
									in_end - in_begin,
									length - 1,
									next_hashes);
				litrunlen = 0;
			}
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		nbytes = write_block(c, out_next, out_end - out_next,
				     in_next - in_block_begin, litrunlen,
				     in_next == in_end);
		if (nbytes == 0)
			return 0;

		out_next += nbytes;

	} while (in_next != in_end);

	return out_next - out_begin;
}

/*
 * Given a pointer to the current byte sequence and the current list of recent
 * match offsets, find the longest repeat offset match.
 *
 * If no match of at least MIN_MATCH_LEN bytes is found, then return 0.
 *
 * If a match of at least MIN_MATCH_LEN bytes is found, then return its length
 * and set *rep_max_idx_ret to the index of its offset in @queue.
 */
static u32
find_longest_repeat_offset_match(const u8 * const in_next,
				 const u32 max_len,
				 const u32 recent_offsets[],
				 unsigned *rep_max_idx_ret)
{
	/* Choose the initial-bytes loader that covers MIN_MATCH_LEN bytes */
#if MIN_MATCH_LEN == 2
#  define load_initial load_u16_unaligned
#elif MIN_MATCH_LEN == 3
#  define load_initial load_u24_unaligned
#elif MIN_MATCH_LEN == 4
#  define load_initial load_u32_unaligned
#else
#  error "unsupported MIN_MATCH_LEN"
#endif
	const u32 next_bytes = load_initial(in_next);
	const u8 *matchptr;
	u32 rep_len;
	u32 rep_max_len;
	unsigned rep_max_idx;

	STATIC_ASSERT(NUM_REPS >= 1 && NUM_REPS <= 4);

	matchptr = in_next - recent_offsets[0];
	if (load_initial(matchptr) == next_bytes)
		rep_max_len = lz_extend(in_next, matchptr, MIN_MATCH_LEN, max_len);
	else
		rep_max_len = 0;
	rep_max_idx = 0;

#if NUM_REPS >= 2
	matchptr = in_next - recent_offsets[1];
	if (load_initial(matchptr) == next_bytes) {
		rep_len = lz_extend(in_next, matchptr, MIN_MATCH_LEN, max_len);
		if (rep_len > rep_max_len) {
			rep_max_len = rep_len;
			rep_max_idx = 1;
		}
	}
#endif

#if NUM_REPS >= 3
	matchptr = in_next - recent_offsets[2];
	if (load_initial(matchptr) == next_bytes) {
		rep_len = lz_extend(in_next, matchptr, MIN_MATCH_LEN, max_len);
		if (rep_len > rep_max_len) {
			rep_max_len = rep_len;
			rep_max_idx = 2;
		}
	}
#endif

#if NUM_REPS >= 4
	matchptr = in_next - recent_offsets[3];
	if (load_initial(matchptr) == next_bytes) {
		rep_len = lz_extend(in_next, matchptr, MIN_MATCH_LEN, max_len);
		if (rep_len > rep_max_len) {
			rep_max_len = rep_len;
			rep_max_idx = 3;
		}
	}
#endif

	*rep_max_idx_ret = rep_max_idx;
	return rep_max_len;
}

/* Fast heuristic scoring for lazy parsing: how "good" is this match?
 * Longer is better; closer offsets get a small bonus. */
static forceinline u32
explicit_offset_match_score(u32 len, u32 adjusted_offset)
{
	u32 score = len;

	if (adjusted_offset < 4096)
		score++;

	if (adjusted_offset < 256)
		score++;

	return score;
}

/* Repeat offset matches get a fixed +3 bonus over their length.
 * ('rep_idx' is currently unused in the score.) */
static forceinline u32
repeat_offset_match_score(u32 rep_len, unsigned rep_idx)
{
	return rep_len + 3;
}

/*
 * Lazy parser: like the greedy parser, but before committing to a match it
 * also searches the next position for a better match (explicit or repeat
 * offset), preferring the higher-scoring one.  Returns the compressed size, or
 * 0 if the output buffer was too small.
 */
static size_t
compress_lazy(struct xpack_compressor *c, void *out, size_t out_nbytes_avail)
{
	u8 * const out_begin = out;
	u8 * out_next = out_begin;
	u8 * const out_end = out_begin + out_nbytes_avail;
	const u8 * const in_begin = c->in_buffer;
	const u8 * in_next = in_begin;
	const u8 * const in_end = in_begin + c->in_nbytes;
	u32 max_len = MIN(c->in_nbytes, UINT32_MAX);
	u32 nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};
	u32 recent_offsets[NUM_REPS];

	init_recent_offsets(recent_offsets);
	hc_matchfinder_init(&c->hc_mf);

	do {
		/* Starting a new block */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(SOFT_MAX_BLOCK_LENGTH, in_end - in_next);
		u32 cur_len;
		u32 cur_offset;
		u32 cur_offset_data;
		u32 cur_score;
		u32 next_len;
		u32 next_offset;
		u32 next_offset_data;
		u32 next_score;
		u32 rep_max_len;
		unsigned rep_max_idx;
		u32 rep_score;
		u32 skip_len;
		u32 litrunlen = 0;
		size_t nbytes;
		struct match *match;

		begin_block(c);

		do {
			/* Near the end of the input, clamp the match limits */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(max_len, nice_len);
			}

			/* Find the longest match at the
			   current position. */

			cur_len = hc_matchfinder_longest_match(&c->hc_mf,
							       in_begin,
							       in_next - in_begin,
#if MIN_MATCH_LEN == 4
							       3,
#else
							       2,
#endif
							       max_len,
							       nice_len,
							       c->max_search_depth,
							       next_hashes,
							       &cur_offset);
#if MIN_MATCH_LEN == 4
			if (cur_len < 4) {
#else
			if (cur_len < 3 || (cur_len == 3 && cur_offset >= 4096)) {
#endif
				/*
				 * There was no match found, or the only match
				 * found was a distant length 3 match.  Output a
				 * literal.
				 */
				observe_literal(&c->split_stats, *in_next);
				record_literal(c, *in_next);
				in_next++;
				litrunlen++;
				continue;
			}

			observe_match(&c->split_stats, cur_len);

			/* A match that exactly repeats the most recent offset
			 * is always taken immediately. */
			if (cur_offset == recent_offsets[0]) {
				in_next++;
				cur_offset_data = 0;
				skip_len = cur_len - 1;
				goto choose_cur_match;
			}

			/* 'cur_offset_data' packs the offset choice: values
			 * < NUM_REPS are repeat indices; otherwise it is the
			 * explicit offset biased by NUM_REPS - 1. */
			cur_offset_data = cur_offset + (NUM_REPS - 1);
			cur_score = explicit_offset_match_score(cur_len, cur_offset_data);

			/* Consider a repeat offset match. */
			rep_max_len = find_longest_repeat_offset_match(in_next,
								       in_end - in_next,
								       recent_offsets,
								       &rep_max_idx);
			in_next++;

			if (rep_max_len >= 3 &&
			    (rep_score = repeat_offset_match_score(rep_max_len,
								   rep_max_idx)) >= cur_score)
			{
				cur_len = rep_max_len;
				cur_offset_data = rep_max_idx;
				skip_len = rep_max_len - 1;
				goto choose_cur_match;
			}

		have_cur_match:

			/* We have a match at the current position. */

			/* If we have a very long match, choose it immediately. */
			if (cur_len >= nice_len) {
				skip_len = cur_len - 1;
				goto choose_cur_match;
			}

			/* See if there's a better match at the next position. */

			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(max_len, nice_len);
			}

			next_len = hc_matchfinder_longest_match(&c->hc_mf,
								in_begin,
								in_next - in_begin,
#if MIN_MATCH_LEN == 2
								cur_len - 2,
#else
								cur_len - 1,
#endif
								max_len,
								nice_len,
								c->max_search_depth / 2,
								next_hashes,
								&next_offset);

#if MIN_MATCH_LEN == 2
			if (next_len <= cur_len - 2) {
#else
			if (next_len <= cur_len - 1) {
#endif
				/* No sufficiently longer match at the next
				 * position; keep the current match. */
				in_next++;
				skip_len = cur_len - 2;
				goto choose_cur_match;
			}

			next_offset_data = next_offset + (NUM_REPS - 1);
			next_score = explicit_offset_match_score(next_len, next_offset_data);

			rep_max_len = find_longest_repeat_offset_match(in_next,
								       in_end - in_next,
								       recent_offsets,
								       &rep_max_idx);
			in_next++;

			if (rep_max_len >= 3 &&
			    (rep_score = repeat_offset_match_score(rep_max_len,
								   rep_max_idx)) >= next_score)
			{

				if (rep_score > cur_score) {
					/*
					 * The next match is better, and it's a
					 * repeat offset match.
					 */
					record_literal(c, *(in_next - 2));
					litrunlen++;
					cur_len = rep_max_len;
					cur_offset_data = rep_max_idx;
					skip_len = cur_len - 1;
					goto choose_cur_match;
				}
			} else {
				if (next_score > cur_score) {
					/*
					 * The next match is better, and it's an
					 * explicit offset match.
					 */
					record_literal(c, *(in_next - 2));
					litrunlen++;
					cur_len = next_len;
					cur_offset_data = next_offset_data;
					cur_score = next_score;
					goto have_cur_match;
				}
			}

			/* The original match was better. */
			skip_len = cur_len - 2;

		choose_cur_match:
			/* Commit to the chosen match: record it and update
			 * the recent offsets queue. */
			match = &c->matches[c->num_matches++];
			if (cur_offset_data < NUM_REPS) {
				u32 offset;

				record_repeat_offset(c, match, cur_offset_data);

				/* Move the used repeat offset to the front */
				offset = recent_offsets[cur_offset_data];
				recent_offsets[cur_offset_data] = recent_offsets[0];
				recent_offsets[0] = offset;
			} else {
				record_explicit_offset(c, match,
						       cur_offset_data - (NUM_REPS - 1));
				STATIC_ASSERT(NUM_REPS >= 1 && NUM_REPS <= 4);
#if NUM_REPS >= 4
				recent_offsets[3] = recent_offsets[2];
#endif
#if NUM_REPS >= 3
				recent_offsets[2] = recent_offsets[1];
#endif
#if NUM_REPS >= 2
				recent_offsets[1] = recent_offsets[0];
#endif
				recent_offsets[0] = cur_offset_data - (NUM_REPS - 1);
			}
			record_litrunlen(c, match, litrunlen);
			record_length(c, match, cur_len);
			litrunlen = 0;

			in_next = hc_matchfinder_skip_positions(&c->hc_mf,
								in_begin,
								in_next - in_begin,
								in_end - in_begin,
								skip_len,
								next_hashes);
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		nbytes = write_block(c, out_next, out_end - out_next,
				     in_next - in_block_begin, litrunlen,
				     in_next == in_end);
		if (nbytes == 0)
			return 0;

		out_next += nbytes;

	} while (in_next != in_end);

	return out_next - out_begin;
}

/*
 * Allocate a compressor for buffers of up to 'max_buffer_size' bytes at the
 * given compression level (1-9: greedy parsing for 1-3, lazy for 4-9).
 * Returns NULL on allocation failure or invalid level.
 */
LIBEXPORT struct xpack_compressor *
xpack_alloc_compressor(size_t max_buffer_size, int compression_level)
{
	struct xpack_compressor *c;

	/* The matchfinder is a flexible trailing member sized for
	 * 'max_buffer_size' */
	c = malloc(offsetof(struct xpack_compressor, hc_mf) +
		   hc_matchfinder_size(max_buffer_size));
	if (!c)
		goto err0;

#ifdef ENABLE_PREPROCESSING
	c->in_buffer = malloc(max_buffer_size);
	if
 (!c->in_buffer)
		goto err1;
#endif

	c->max_buffer_size = max_buffer_size;

	/* Map the compression level to a parser and search parameters */
	switch (compression_level) {
	case 1:
		c->impl = compress_greedy;
		c->max_search_depth = 1;
		c->nice_match_length = MIN_MATCH_LEN;
		break;
	case 2:
		c->impl = compress_greedy;
		c->max_search_depth = 8;
		c->nice_match_length = 8;
		break;
	case 3:
		c->impl = compress_greedy;
		c->max_search_depth = 16;
		c->nice_match_length = 16;
		break;
	case 4:
		c->impl = compress_lazy;
		c->max_search_depth = 8;
		c->nice_match_length = 12;
		break;
	case 5:
		c->impl = compress_lazy;
		c->max_search_depth = 16;
		c->nice_match_length = 24;
		break;
	case 6:
		c->impl = compress_lazy;
		c->max_search_depth = 32;
		c->nice_match_length = 48;
		break;
	case 7:
		c->impl = compress_lazy;
		c->max_search_depth = 64;
		c->nice_match_length = 96;
		break;
	case 8:
		c->impl = compress_lazy;
		c->max_search_depth = 128;
		c->nice_match_length = 192;
		break;
	case 9:
		c->impl = compress_lazy;
		c->max_search_depth = 256;
		c->nice_match_length = 384;
		STATIC_ASSERT(EXTRA_LITERAL_SPACE >= 384 * 4 / 3);
		break;
	default:
		goto err2;
	}

	/* max_search_depth == 0 is invalid */
	if (c->max_search_depth < 1)
		c->max_search_depth = 1;

	return c;

err2:
#ifdef ENABLE_PREPROCESSING
	free(c->in_buffer);
err1:
#endif
	free(c);
err0:
	return NULL;
}

/*
 * Compress 'in_nbytes' bytes of data from 'in' into 'out'.  Returns the
 * compressed size, or 0 if the data could not be compressed to fit in
 * 'out_nbytes_avail' bytes (or was too small/large to bother with).
 */
LIBEXPORT size_t
xpack_compress(struct xpack_compressor *c, const void *in, size_t in_nbytes,
	       void *out, size_t out_nbytes_avail)
{
	/* Don't bother trying to compress very small inputs. */
	if (in_nbytes < 100)
		return 0;

	/* Safety check: the matchfinder was sized for max_buffer_size */
	if (unlikely(in_nbytes > c->max_buffer_size))
		return 0;

#ifdef ENABLE_PREPROCESSING
	/* Copy the input data into the internal buffer and preprocess it. */
	memcpy(c->in_buffer, in, in_nbytes);
	c->in_nbytes = in_nbytes;
	preprocess(c->in_buffer, in_nbytes);
#else
	/* Preprocessing is disabled.  No internal buffer is needed. */
	c->in_buffer = (void *)in;
	c->in_nbytes = in_nbytes;
#endif

	return (*c->impl)(c, out, out_nbytes_avail);
}

/* Free a compressor allocated by xpack_alloc_compressor() (NULL is a no-op). */
LIBEXPORT void
xpack_free_compressor(struct xpack_compressor *c)
{
	if (c) {
#ifdef ENABLE_PREPROCESSING
		free(c->in_buffer);
#endif
		free(c);
	}
}

#endif /* !DECOMPRESSION_ONLY */