├── tools ├── msc_test.bat ├── afl-fuzz │ ├── compress │ │ ├── inputs │ │ │ └── 0 │ │ └── fuzz.c │ ├── decompress │ │ └── inputs │ │ │ └── 0 │ ├── Makefile │ └── prepare_for_fuzz.sh ├── windows_test.sh ├── mips_test.sh ├── arm_test.sh └── make-windows-releases ├── .gitignore ├── lib ├── lz_hash.h ├── x86_cpu_features.h ├── lz_extend.h ├── xpack_constants.h ├── xpack_common.h ├── x86_cpu_features.c ├── unaligned.h ├── xpack_common.c ├── hc_matchfinder.h ├── decompress_impl.h ├── xpack_decompress.c └── xpack_compress.c ├── programs ├── detect.sh ├── tgetopt.c ├── prog_util.h ├── benchmark.c ├── prog_util.c └── xpack.c ├── COPYING ├── Makefile.msc ├── common ├── compiler_msc.h ├── compiler_gcc.h └── common_defs.h ├── README.md ├── libxpack.h └── Makefile /tools/msc_test.bat: -------------------------------------------------------------------------------- 1 | nmake /f Makefile.msc clean 2 | nmake /f Makefile.msc 3 | copy /y *.exe j:\exe\ 4 | -------------------------------------------------------------------------------- /tools/afl-fuzz/compress/inputs/0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebiggers/xpack/HEAD/tools/afl-fuzz/compress/inputs/0 -------------------------------------------------------------------------------- /tools/afl-fuzz/decompress/inputs/0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebiggers/xpack/HEAD/tools/afl-fuzz/decompress/inputs/0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.dll 3 | *.exe 4 | *.exp 5 | *.lib 6 | *.o 7 | *.obj 8 | *.so 9 | /.lib-cflags 10 | /.prog-cflags 11 | /programs/config.h 12 | /benchmark 13 | /xpack 14 | /xunpack 15 | tags 16 | cscope* 17 | -------------------------------------------------------------------------------- 
/tools/windows_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | make -j CC=i686-w64-mingw32-gcc 6 | cp -vf *.exe /j/exe/ 7 | make -j CC=x86_64-w64-mingw32-gcc 8 | cp -vf *.exe /j/exe64/ 9 | 10 | sudo systemctl restart smbd 11 | -------------------------------------------------------------------------------- /tools/afl-fuzz/Makefile: -------------------------------------------------------------------------------- 1 | SRC := $(wildcard */*.c) 2 | EXE := $(SRC:.c=) 3 | 4 | CFLAGS := -O2 -s 5 | LDLIBS := -lxpack 6 | LDFLAGS := -L../.. 7 | CPPFLAGS := -I../.. 8 | 9 | all:$(EXE) 10 | 11 | clean: 12 | rm -f $(EXE) 13 | -------------------------------------------------------------------------------- /tools/afl-fuzz/prepare_for_fuzz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | AFL_HARDEN=1 make CC=afl-gcc -C ../../ 6 | 7 | make clean 8 | AFL_HARDEN=1 make CC=afl-gcc 9 | 10 | for dir in $(find . 
-mindepth 1 -maxdepth 1 -type d); do 11 | rm -rf /tmp/$dir 12 | cp -va $dir /tmp/$dir 13 | mkdir -p /tmp/$dir/outputs 14 | done 15 | -------------------------------------------------------------------------------- /tools/mips_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | TOOLCHAIN_DIR=$HOME/src/ddwrt-toolchains/toolchain-mips_34kc_gcc-5.1.0_musl-1.1.9 6 | 7 | make -j benchmark \ 8 | CC="$TOOLCHAIN_DIR/bin/mips-openwrt-linux-musl-gcc" \ 9 | CFLAGS="-DNEED_PRINTF" 10 | 11 | scp benchmark $HOME/data/test root@dd-wrt: 12 | ssh root@dd-wrt ./benchmark "$@" test 13 | -------------------------------------------------------------------------------- /tools/arm_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | NDKDIR=/opt/android-ndk 6 | 7 | make -j benchmark \ 8 | CC="$NDKDIR/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-gcc" \ 9 | CFLAGS="--sysroot=$NDKDIR/platforms/android-12/arch-arm -march=armv7-a -fPIC -pie -mfpu=neon -mfloat-abi=softfp" 10 | 11 | adb push benchmark /data/local/tmp 12 | adb push $HOME/data/testdata /data/local/tmp 13 | adb shell /data/local/tmp/benchmark "$@" /data/local/tmp/testdata 14 | -------------------------------------------------------------------------------- /tools/make-windows-releases: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eu 4 | 5 | for arch in 'i686' 'x86_64'; do 6 | make -j CC=${arch}-w64-mingw32-gcc CFLAGS="-Werror" 7 | dir=xpack-$(git describe --tags | tr -d v)-windows-${arch}-bin 8 | rm -rf $dir ${dir}.zip 9 | mkdir $dir 10 | cp libxpack.dll libxpack.lib libxpack.h *.exe $dir 11 | ${arch}-w64-mingw32-strip ${dir}/libxpack.dll ${dir}/*.exe 12 | for file in COPYING; do 13 | sed < $file > ${dir}/${file}.txt -e 's/$/\r/g' 14 | done 15 | for file in README.md; do 16 | sed < 
$file > ${dir}/${file} -e 's/$/\r/g' 17 | done 18 | (cd ${dir} && zip -r ../${dir}.zip .) 19 | done 20 | -------------------------------------------------------------------------------- /lib/lz_hash.h: -------------------------------------------------------------------------------- 1 | /* 2 | * lz_hash.h - hashing for Lempel-Ziv matchfinding 3 | */ 4 | 5 | #ifndef LIB_LZ_HASH_H 6 | #define LIB_LZ_HASH_H 7 | 8 | #include "common_defs.h" 9 | 10 | /* 11 | * The hash function: given a sequence prefix held in the low-order bits of a 12 | * 32-bit value, multiply by a carefully-chosen large constant. Discard any 13 | * bits of the product that don't fit in a 32-bit value, but take the 14 | * next-highest @num_bits bits of the product as the hash value, as those have 15 | * the most randomness. 16 | */ 17 | static forceinline u32 18 | lz_hash(u32 seq, unsigned num_bits) 19 | { 20 | return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); 21 | } 22 | 23 | #endif /* LIB_LZ_HASH_H */ 24 | -------------------------------------------------------------------------------- /programs/detect.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -z "$CC" ]; then 4 | CC=cc 5 | fi 6 | 7 | echo "/* THIS FILE WAS AUTOMATICALLY GENERATED. DO NOT EDIT. */" 8 | echo "#ifndef _CONFIG_H" 9 | echo "#define _CONFIG_H" 10 | 11 | tmpfile="$(mktemp -t xpack_config.XXXXXXXX)" 12 | trap "rm -f \"$tmpfile\"" EXIT 13 | 14 | check_function() { 15 | funcname="$1" 16 | macro="HAVE_$(echo $funcname | tr a-z A-Z)" 17 | echo "int main() { $funcname(); }" > "$tmpfile" 18 | echo 19 | echo "/* Is the $funcname() function available? 
*/" 20 | if $CC -x c $tmpfile -o /dev/null > /dev/null 2>&1; then 21 | echo "#define $macro 1" 22 | else 23 | echo "/* $macro is not set */" 24 | fi 25 | } 26 | 27 | check_function clock_gettime 28 | check_function futimens 29 | check_function futimes 30 | 31 | echo 32 | echo "#endif /* _CONFIG_H */" 33 | -------------------------------------------------------------------------------- /tools/afl-fuzz/compress/fuzz.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | int main(int argc, char **argv) 9 | { 10 | struct xpack_decompressor *d; 11 | struct xpack_compressor *c; 12 | int ret; 13 | int fd = open(argv[1], O_RDONLY); 14 | struct stat stbuf; 15 | assert(fd >= 0); 16 | ret = fstat(fd, &stbuf); 17 | assert(!ret); 18 | 19 | char in[stbuf.st_size]; 20 | ret = read(fd, in, sizeof in); 21 | assert(ret == sizeof in); 22 | 23 | c = xpack_alloc_compressor(stbuf.st_size, 6); 24 | d = xpack_alloc_decompressor(); 25 | 26 | char out[sizeof(in)]; 27 | char checkarray[sizeof(in)]; 28 | 29 | size_t csize = xpack_compress(c, in,sizeof in, out, sizeof out); 30 | if (csize) { 31 | enum decompress_result res; 32 | res = xpack_decompress(d, out, csize, checkarray, sizeof in, NULL); 33 | assert(!res); 34 | assert(!memcmp(in, checkarray, sizeof in)); 35 | } 36 | 37 | xpack_free_compressor(c); 38 | xpack_free_decompressor(d); 39 | return 0; 40 | } 41 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright 2016 Eric Biggers 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation files 5 | (the "Software"), to deal in the Software without restriction, 6 | including without limitation the rights to use, copy, modify, merge, 7 | publish, distribute, sublicense, 
and/or sell copies of the Software, 8 | and to permit persons to whom the Software is furnished to do so, 9 | subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 18 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 19 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile.msc: -------------------------------------------------------------------------------- 1 | # 2 | # Makefile for the Microsoft toolchain 3 | # 4 | # Usage: 5 | # nmake /f Makefile.msc 6 | # 7 | 8 | CC = cl 9 | LD = link 10 | AR = lib 11 | CFLAGS = /MD /O2 -I. 
-Icommon 12 | LDFLAGS = 13 | 14 | STATICLIB = libxpackstatic.lib 15 | SHAREDLIB = libxpack.dll 16 | IMPLIB = libxpack.lib 17 | 18 | LIB_OBJ = lib/x86_cpu_features.obj \ 19 | lib/xpack_compress.obj \ 20 | lib/xpack_decompress.obj \ 21 | lib/xpack_common.obj 22 | 23 | PROG_COMMON_OBJ = programs/prog_util.obj \ 24 | programs/tgetopt.obj \ 25 | $(STATICLIB) 26 | 27 | PROG_CFLAGS = $(CFLAGS) -Iprograms 28 | 29 | PROGRAMS = benchmark.exe xpack.exe xunpack.exe 30 | 31 | all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) $(PROGRAMS) 32 | 33 | .c.obj: 34 | $(CC) -c /Fo$@ $(CFLAGS) $** 35 | 36 | $(STATICLIB): $(LIB_OBJ) 37 | $(AR) $(ARFLAGS) -out:$@ $(LIB_OBJ) 38 | 39 | $(SHAREDLIB): $(LIB_OBJ) 40 | $(LD) $(LDFLAGS) -out:$@ -dll -implib:$(IMPLIB) $(LIB_OBJ) 41 | 42 | $(IMPLIB): $(SHAREDLIB) 43 | 44 | benchmark.exe:programs/benchmark.obj $(PROG_COMMON_OBJ) 45 | $(LD) $(LDFLAGS) -out:$@ $** 46 | 47 | xpack.exe:programs/xpack.obj $(PROG_COMMON_OBJ) 48 | $(LD) $(LDFLAGS) -out:$@ $** 49 | 50 | xunpack.exe:xpack.exe 51 | copy $** $@ 52 | 53 | clean: 54 | -del *.dll *.exe *.exp *.lib lib\*.obj programs\*.obj 2>nul 55 | -------------------------------------------------------------------------------- /lib/x86_cpu_features.h: -------------------------------------------------------------------------------- 1 | /* 2 | * x86_cpu_features.h - feature detection for x86 processors 3 | */ 4 | 5 | #ifndef LIB_X86_CPU_FEATURES_H 6 | #define LIB_X86_CPU_FEATURES_H 7 | 8 | #include "common_defs.h" 9 | 10 | #if defined(__x86_64__) && COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 11 | # define X86_CPU_FEATURES_ENABLED 1 12 | #else 13 | # define X86_CPU_FEATURES_ENABLED 0 14 | #endif 15 | 16 | #if X86_CPU_FEATURES_ENABLED 17 | 18 | #define X86_CPU_FEATURE_SSE 0x00000001 19 | #define X86_CPU_FEATURE_SSE2 0x00000002 20 | #define X86_CPU_FEATURE_SSE3 0x00000004 21 | #define X86_CPU_FEATURE_SSSE3 0x00000008 22 | #define X86_CPU_FEATURE_SSE4_1 0x00000010 23 | #define X86_CPU_FEATURE_SSE4_2 0x00000020 24 | #define 
X86_CPU_FEATURE_AVX 0x00000040 25 | #define X86_CPU_FEATURE_BMI 0x00000080 26 | #define X86_CPU_FEATURE_AVX2 0x00000100 27 | #define X86_CPU_FEATURE_BMI2 0x00000200 28 | 29 | #define X86_CPU_FEATURES_KNOWN 0x80000000 30 | 31 | extern u32 _x86_cpu_features; 32 | 33 | extern void 34 | x86_setup_cpu_features(void); 35 | 36 | /* Does the processor have the specified feature? */ 37 | static forceinline bool 38 | x86_have_cpu_feature(u32 feature) 39 | { 40 | if (_x86_cpu_features == 0) 41 | x86_setup_cpu_features(); 42 | return _x86_cpu_features & feature; 43 | } 44 | 45 | #endif /* X86_CPU_FEATURES_ENABLED */ 46 | 47 | #endif /* LIB_X86_CPU_FEATURES_H */ 48 | -------------------------------------------------------------------------------- /lib/lz_extend.h: -------------------------------------------------------------------------------- 1 | /* 2 | * lz_extend.h - fast match extension for Lempel-Ziv matchfinding 3 | */ 4 | 5 | #ifndef LIB_LZ_EXTEND_H 6 | #define LIB_LZ_EXTEND_H 7 | 8 | #include "unaligned.h" 9 | 10 | /* 11 | * Return the number of bytes at @matchptr that match the bytes at @strptr, up 12 | * to a maximum of @max_len. Initially, @start_len bytes are matched. 
13 | */ 14 | static forceinline u32 15 | lz_extend(const u8 * const strptr, const u8 * const matchptr, 16 | const u32 start_len, const u32 max_len) 17 | { 18 | u32 len = start_len; 19 | machine_word_t v_word; 20 | 21 | if (UNALIGNED_ACCESS_IS_FAST) { 22 | 23 | if (likely(max_len - len >= 4 * WORDBYTES)) { 24 | 25 | #define COMPARE_WORD_STEP \ 26 | v_word = load_word_unaligned(&matchptr[len]) ^ \ 27 | load_word_unaligned(&strptr[len]); \ 28 | if (v_word != 0) \ 29 | goto word_differs; \ 30 | len += WORDBYTES; \ 31 | 32 | COMPARE_WORD_STEP 33 | COMPARE_WORD_STEP 34 | COMPARE_WORD_STEP 35 | COMPARE_WORD_STEP 36 | #undef COMPARE_WORD_STEP 37 | } 38 | 39 | while (len + WORDBYTES <= max_len) { 40 | v_word = load_word_unaligned(&matchptr[len]) ^ 41 | load_word_unaligned(&strptr[len]); 42 | if (v_word != 0) 43 | goto word_differs; 44 | len += WORDBYTES; 45 | } 46 | } 47 | 48 | while (len < max_len && matchptr[len] == strptr[len]) 49 | len++; 50 | return len; 51 | 52 | word_differs: 53 | if (CPU_IS_LITTLE_ENDIAN()) 54 | len += (bsfw(v_word) >> 3); 55 | else 56 | len += (8 * WORDBYTES - 1 - bsrw(v_word)) >> 3; 57 | return len; 58 | } 59 | 60 | #endif /* LIB_LZ_EXTEND_H */ 61 | -------------------------------------------------------------------------------- /lib/xpack_constants.h: -------------------------------------------------------------------------------- 1 | /* 2 | * xpack_constants.h - constants for the XPACK compression format 3 | */ 4 | 5 | #ifndef LIB_XPACK_CONSTANTS_H 6 | #define LIB_XPACK_CONSTANTS_H 7 | 8 | #define MIN_MATCH_LEN 2 9 | #define NUM_REPS 3 10 | 11 | #define BLOCKTYPE_VERBATIM 1 12 | #define BLOCKTYPE_ALIGNED 2 13 | #define BLOCKTYPE_UNCOMPRESSED 3 14 | 15 | #define NUM_BLOCKTYPE_BITS 3 16 | #define NUM_BLOCKSIZE_BITS 20 17 | #define DEFAULT_BLOCK_SIZE 32768 18 | 19 | #define NUM_ALIGNED_BITS 3 20 | 21 | #define LITERAL_ALPHABET_SIZE 256 22 | #define LITRUNLEN_ALPHABET_SIZE 16 23 | #define LENGTH_ALPHABET_SIZE 64 24 | #define MAX_OFFSET_ALPHABET_SIZE 
32 25 | #define ALIGNED_ALPHABET_SIZE (1 << NUM_ALIGNED_BITS) 26 | 27 | #define MAX_ALPHABET_SIZE LITERAL_ALPHABET_SIZE 28 | 29 | #define MAX_LOG2_NUM_LITERAL_STATES 10 30 | #define MAX_LOG2_NUM_LITRUNLEN_STATES 9 31 | #define MAX_LOG2_NUM_LENGTH_STATES 9 32 | #define MAX_LOG2_NUM_OFFSET_STATES 9 33 | #define MAX_LOG2_NUM_ALIGNED_STATES 7 34 | 35 | #define MAX_LOG2_NUM_STATES MAX_LOG2_NUM_LITERAL_STATES 36 | #define MAX_NUM_STATES (1 << MAX_LOG2_NUM_STATES) 37 | 38 | #define NUM_LITERAL_STREAMS 2 39 | 40 | #define MAGIC_FILESIZE 12000000 41 | 42 | #define CODEBITS 4 43 | #define MAX_EXTRA_CODEBITS ((1 << CODEBITS) - 3) 44 | #define ZEROCODE1 ((1 << CODEBITS) - 2) 45 | #define ZEROCODE2 ((1 << CODEBITS) - 1) 46 | #define ZEROCODE1_NBITS 2 47 | #define ZEROCODE2_NBITS 7 48 | #define ZEROCODE1_MIN 1 49 | #define ZEROCODE1_MAX (ZEROCODE1_MIN + (1 << ZEROCODE1_NBITS) - 1) 50 | #define ZEROCODE2_MIN (ZEROCODE1_MAX + 1) 51 | #define ZEROCODE2_MAX (ZEROCODE2_MIN + (1 << ZEROCODE2_NBITS) - 1) 52 | 53 | #endif /* LIB_XPACK_CONSTANTS_H */ 54 | -------------------------------------------------------------------------------- /lib/xpack_common.h: -------------------------------------------------------------------------------- 1 | #ifndef LIB_XPACK_COMMON_H 2 | #define LIB_XPACK_COMMON_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "common_defs.h" 9 | #include "unaligned.h" 10 | #include "xpack_constants.h" 11 | 12 | #include "libxpack.h" 13 | 14 | #ifdef ENABLE_PREPROCESSING 15 | extern void preprocess(void *data, u32 size); 16 | extern void postprocess(void *data, u32 size); 17 | #endif 18 | 19 | /* 20 | * Given the number of states, return the corresponding state generator, which 21 | * is the amount by which we will step through the states when assigning symbols 22 | * to states. We require a value such that every state will be visited exactly 23 | * once after num_states steps. 
Mathematically, we require a generator of the 24 | * cyclic group consisting of the set of integers {0...num_states - 1} and the 25 | * group operation of addition modulo num_states. By a well-known theorem, the 26 | * generators are the set of integers relatively prime to num_states. In this 27 | * case, since num_states is a power of 2, its prime factors are all 2's; 28 | * therefore, the generators are all numbers that do not have 2 as a prime 29 | * factor, i.e. all odd numbers. 30 | * 31 | * The number '1' is always a valid choice, but a poor one because it is 32 | * advantageous to distribute each symbol's states more evenly. The value we 33 | * actually use that works well in practice is five-eighths the number of states 34 | * plus 3. But use | instead of + to guarantee an odd number if num_states <= 35 | * 8. Also, it is okay to use a value greater than num_states because we have 36 | * to mod with num_states after each addition anyway. 37 | * 38 | * Note: it is essential that the encoder and decoder always choose the same 39 | * generator as each other for a given num_states! If you were to change this 40 | * formula, then you would change the on-disk compression format. 41 | */ 42 | static forceinline unsigned 43 | get_state_generator(unsigned num_states) 44 | { 45 | return (num_states >> 1) | (num_states >> 3) | 3; 46 | } 47 | 48 | /* Initialize the recent offsets queue. 
*/ 49 | static forceinline void 50 | init_recent_offsets(u32 recent_offsets[NUM_REPS]) 51 | { 52 | unsigned i; 53 | 54 | for (i = 0; i < NUM_REPS; i++) 55 | recent_offsets[i] = 1 + i; 56 | } 57 | 58 | #endif /* LIB_XPACK_COMMON_H */ 59 | -------------------------------------------------------------------------------- /common/compiler_msc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * compiler_msc.h - definitions for the Microsoft C Compiler 3 | */ 4 | 5 | #define BUILDING_LIBXPACK 6 | 7 | #define LIBEXPORT __declspec(dllexport) 8 | 9 | /* 10 | * Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h. 11 | * Beware: the below replacement isn't fully standard, since normally any value 12 | * != 0 should be implicitly cast to a bool with value 1... but that doesn't 13 | * happen if bool is really just an 'int'. 14 | */ 15 | typedef int bool; 16 | #define true 1 17 | #define false 0 18 | #define __bool_true_false_are_defined 1 19 | 20 | /* Define ssize_t */ 21 | #ifdef _WIN64 22 | typedef long long ssize_t; 23 | #else 24 | typedef int ssize_t; 25 | #endif 26 | 27 | /* 28 | * Old versions (e.g. VS2010) of MSC have stdint.h but not the C99 header 29 | * inttypes.h. Work around this by defining the PRI* macros ourselves. 30 | */ 31 | #include 32 | #define PRIu8 "hhu" 33 | #define PRIu16 "hu" 34 | #define PRIu32 "u" 35 | #define PRIu64 "llu" 36 | #define PRIi8 "hhi" 37 | #define PRIi16 "hi" 38 | #define PRIi32 "i" 39 | #define PRIi64 "lli" 40 | #define PRIx8 "hhx" 41 | #define PRIx16 "hx" 42 | #define PRIx32 "x" 43 | #define PRIx64 "llx" 44 | 45 | /* Assume a little endian architecture with fast unaligned access */ 46 | #define CPU_IS_LITTLE_ENDIAN() 1 47 | #define UNALIGNED_ACCESS_IS_FAST 1 48 | 49 | /* __restrict has nonstandard behavior; don't use it */ 50 | #define restrict 51 | 52 | /* ... 
but we can use __inline and __forceinline */ 53 | #define inline __inline 54 | #define forceinline __forceinline 55 | 56 | /* Byte swap functions */ 57 | #define bswap16 _byteswap_ushort 58 | #define bswap32 _byteswap_ulong 59 | #define bswap64 _byteswap_uint64 60 | 61 | /* Bit scan functions (32-bit) */ 62 | 63 | static forceinline unsigned 64 | bsr32(uint32_t n) 65 | { 66 | _BitScanReverse(&n, n); 67 | return n; 68 | } 69 | #define bsr32 bsr32 70 | 71 | static forceinline unsigned 72 | bsf32(uint32_t n) 73 | { 74 | _BitScanForward(&n, n); 75 | return n; 76 | } 77 | #define bsf32 bsf32 78 | 79 | #ifdef _M_X64 /* Bit scan functions (64-bit) */ 80 | 81 | static forceinline unsigned 82 | bsr64(uint64_t n) 83 | { 84 | _BitScanReverse64(&n, n); 85 | return n; 86 | } 87 | #define bsr64 bsr64 88 | 89 | static forceinline unsigned 90 | bsf64(uint64_t n) 91 | { 92 | _BitScanForward64(&n, n); 93 | return n; 94 | } 95 | #define bsf64 bsf64 96 | 97 | #endif /* _M_X64 */ 98 | -------------------------------------------------------------------------------- /common/compiler_gcc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * compiler_gcc.h - definitions for the GNU C Compiler. Currently this also 3 | * handles clang and the Intel C Compiler. 
4 | */ 5 | 6 | #define GCC_PREREQ(major, minor) \ 7 | (!defined(__clang__) && !defined(__INTEL_COMPILER) && \ 8 | (__GNUC__ > (major) || \ 9 | (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) 10 | 11 | #ifndef __has_attribute 12 | # define __has_attribute(attribute) 0 13 | #endif 14 | #ifndef __has_feature 15 | # define __has_feature(feature) 0 16 | #endif 17 | #ifndef __has_builtin 18 | # define __has_builtin(builtin) 0 19 | #endif 20 | 21 | #ifdef _WIN32 22 | # define LIBEXPORT __declspec(dllexport) 23 | #else 24 | # define LIBEXPORT __attribute__((visibility("default"))) 25 | #endif 26 | 27 | #define inline inline 28 | #define forceinline inline __attribute__((always_inline)) 29 | #define restrict __restrict__ 30 | #define likely(expr) __builtin_expect(!!(expr), 1) 31 | #define unlikely(expr) __builtin_expect(!!(expr), 0) 32 | #define prefetchr(addr) __builtin_prefetch((addr), 0) 33 | #define prefetchw(addr) __builtin_prefetch((addr), 1) 34 | 35 | #define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ 36 | (GCC_PREREQ(4, 4) || __has_attribute(target)) 37 | 38 | #define COMPILER_SUPPORTS_BMI2_TARGET \ 39 | (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ 40 | (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))) 41 | 42 | /* Newer gcc supports __BYTE_ORDER__. Older gcc doesn't. 
*/ 43 | #ifdef __BYTE_ORDER__ 44 | # define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) 45 | #endif 46 | 47 | #if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) 48 | # define bswap16 __builtin_bswap16 49 | #endif 50 | 51 | #if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) 52 | # define bswap32 __builtin_bswap32 53 | #endif 54 | 55 | #if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) 56 | # define bswap64 __builtin_bswap64 57 | #endif 58 | 59 | #if defined(__x86_64__) || defined(__i386__) || defined(__ARM_FEATURE_UNALIGNED) 60 | # define UNALIGNED_ACCESS_IS_FAST 1 61 | #endif 62 | 63 | /* With gcc, we can access unaligned memory through 'packed' structures. */ 64 | #define DEFINE_UNALIGNED_TYPE(type) \ 65 | \ 66 | struct type##unaligned { \ 67 | type v; \ 68 | } __attribute__((packed)); \ 69 | \ 70 | static forceinline type \ 71 | load_##type##_unaligned(const void *p) \ 72 | { \ 73 | return ((const struct type##unaligned *)p)->v; \ 74 | } \ 75 | \ 76 | static forceinline void \ 77 | store_##type##_unaligned(type v, void *p) \ 78 | { \ 79 | ((struct type##unaligned *)p)->v = v; \ 80 | } 81 | 82 | #define bsr32(n) (31 - __builtin_clz(n)) 83 | #define bsr64(n) (63 - __builtin_clzll(n)) 84 | #define bsf32(n) __builtin_ctz(n) 85 | #define bsf64(n) __builtin_ctzll(n) 86 | -------------------------------------------------------------------------------- /programs/tgetopt.c: -------------------------------------------------------------------------------- 1 | /* 2 | * tgetopt.c - portable replacement for GNU getopt() 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to 
permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "prog_util.h" 29 | 30 | tchar *toptarg; 31 | int toptind = 1, topterr = 1, toptopt; 32 | 33 | /* 34 | * This is a simple implementation of getopt(). It can be compiled with either 35 | * 'char' or 'wchar_t' as the character type. 
36 | * 37 | * Do *not* use this implementation if you need any of the following features, 38 | * as they are not supported: 39 | * - Optional arguments 40 | * - Long options 41 | * - Option-related arguments retained in argv, not nulled out 42 | * - '+' and '-' characters in optstring 43 | */ 44 | int 45 | tgetopt(int argc, tchar *argv[], const tchar *optstring) 46 | { 47 | static tchar empty[1]; 48 | static tchar *nextchar; 49 | static bool done; 50 | 51 | if (toptind == 1) { 52 | /* Starting to scan a new argument vector */ 53 | nextchar = NULL; 54 | done = false; 55 | } 56 | 57 | while (!done && (nextchar != NULL || toptind < argc)) { 58 | if (nextchar == NULL) { 59 | /* Scanning a new argument */ 60 | tchar *arg = argv[toptind++]; 61 | if (arg[0] == '-' && arg[1] != '\0') { 62 | if (arg[1] == '-' && arg[2] == '\0') { 63 | /* All args after "--" are nonoptions */ 64 | argv[toptind - 1] = NULL; 65 | done = true; 66 | } else { 67 | /* Start of short option characters */ 68 | nextchar = &arg[1]; 69 | } 70 | } 71 | } else { 72 | /* More short options in previous arg */ 73 | tchar opt = *nextchar; 74 | tchar *p = tstrchr(optstring, opt); 75 | if (p == NULL) { 76 | if (topterr) 77 | msg("invalid option -- '%"TC"'", opt); 78 | toptopt = opt; 79 | return '?'; 80 | } 81 | /* 'opt' is a valid short option character */ 82 | nextchar++; 83 | if (*(p + 1) == ':') { 84 | /* 'opt' requires an argument */ 85 | if (*nextchar != '\0') { 86 | /* Optarg is in same argv argument */ 87 | toptarg = nextchar; 88 | } else if (toptind < argc) { 89 | /* Optarg is next argv argument */ 90 | argv[toptind - 1] = NULL; 91 | toptarg = argv[toptind++]; 92 | } else { 93 | if (topterr && *optstring != ':') { 94 | msg("option requires an " 95 | "argument -- '%"TC"'", opt); 96 | } 97 | toptopt = opt; 98 | opt = (*optstring == ':') ? 
':' : '?'; 99 | } 100 | nextchar = empty; 101 | } 102 | if (*nextchar == '\0') { 103 | argv[toptind - 1] = NULL; 104 | nextchar = NULL; 105 | } 106 | return opt; 107 | } 108 | } 109 | 110 | /* Done scanning. Move all nonoptions to the end, set optind to the 111 | * index of the first nonoption, and return -1. */ 112 | toptind = argc; 113 | while (--argc > 0) 114 | if (argv[argc] != NULL) 115 | argv[--toptind] = argv[argc]; 116 | done = true; 117 | return -1; 118 | } 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | XPACK is an experimental compression format. It is intended to have better 4 | performance than DEFLATE as implemented in the zlib library and also produce a 5 | notably better compression ratio on most inputs. The format is not yet stable. 6 | 7 | XPACK has been inspired by the DEFLATE, LZX, and Zstandard formats, among 8 | others. Originally envisioned as a DEFLATE replacement, it won't necessarily 9 | see a lot of additional development since other solutions such as Zstandard seem 10 | to have gotten much closer to that goal first (great job to those involved!). 11 | But I am releasing the code anyway for anyone who may find it useful. 12 | 13 | # Format overview 14 | 15 | Like many other common compression formats, XPACK is based on the LZ77 method 16 | (decomposition into literals and length/offset copy commands) with a number of 17 | tricks on top. 
Features include: 18 | 19 | * Increased sliding window, or "dictionary", size (like LZX and Zstd) 20 | * Entropy encoding with finite state entropy (FSE) codes, also known as 21 | table-based asymmetric numeral systems (tANS) (like Zstd) 22 | * Minimum match length of 2 (like LZX) 23 | * Lowest three bits of match offsets can be entropy-encoded (like LZX) 24 | * Aligned and verbatim blocks (like LZX) 25 | * Recent match offsets queue with three entries (like LZX) 26 | * Literals packed separately from matches, and with two FSE streams (like older 27 | Zstd versions) 28 | * Literal runs (like Zstd) 29 | * Concise FSE header (state count list) representation 30 | * Decoder reads in forwards direction, encoder writes in backwards direction 31 | * Optional preprocessing step for x86 machine code (like LZX) 32 | 33 | # Implementation overview 34 | 35 | libxpack is a library containing an optimized, portable implementation of an 36 | XPACK compressor and decompressor. Features currently include: 37 | 38 | * Whole-buffer compression and decompression only 39 | * Multiple compression levels 40 | * Fast hash chains-based matchfinder 41 | * Greedy and lazy parsers 42 | * Decompressor automatically uses Intel BMI2 instructions when supported 43 | 44 | In addition, the following command-line programs using libxpack are provided: 45 | 46 | * xpack (or xunpack), a program which behaves like a standard UNIX command-line 47 | compressor such as gzip (or gunzip). The command-line interface should be 48 | compatible enough that xpack can be used as a drop-in gzip replacement in many 49 | cases --- though the on-disk format is incompatible, of course. 50 | * benchmark, a program for benchmarking in-memory compression and decompression 51 | 52 | Note that currently, all the programs internally use "chunks", as the library 53 | does not yet support streaming. This will worsen the compression ratio 54 | slightly, compared to what is possible. 
55 | 56 | All files may be modified and/or redistributed under the terms of the MIT 57 | license. There is NO WARRANTY, to the extent permitted by law. See the COPYING 58 | file for details. 59 | 60 | # Building 61 | 62 | ## For UNIX 63 | 64 | Just run `make`. You need GNU Make and either GCC or Clang. GCC is recommended 65 | because it builds slightly faster binaries. There is no `make install` yet; 66 | just copy the file(s) to where you want. 67 | 68 | By default, all targets are built, including the library and programs. `make 69 | help` shows the available targets. There are also several options which can be 70 | set on the `make` command line. See the Makefile for details. 71 | 72 | ## For Windows 73 | 74 | MinGW (GCC) is the recommended compiler to use when building binaries for 75 | Windows. MinGW can be used on either Windows or Linux. Use a command like: 76 | 77 | $ make CC=x86_64-w64-mingw32-gcc 78 | 79 | Windows binaries prebuilt with MinGW may also be downloaded from 80 | https://github.com/ebiggers/xpack/releases. 81 | 82 | Alternatively, a separate Makefile, `Makefile.msc`, is provided for the tools 83 | that come with Visual Studio, for those who strongly prefer that toolchain. 84 | 85 | As usual, 64-bit binaries are faster than 32-bit binaries and should be 86 | preferred whenever possible. 87 | -------------------------------------------------------------------------------- /lib/x86_cpu_features.c: -------------------------------------------------------------------------------- 1 | /* 2 | * x86_cpu_features.c - feature detection for x86 processors 3 | */ 4 | 5 | #include "x86_cpu_features.h" 6 | 7 | #if X86_CPU_FEATURES_ENABLED 8 | 9 | #define DEBUG 0 10 | 11 | #if DEBUG 12 | # include 13 | #endif 14 | 15 | u32 _x86_cpu_features = 0; 16 | 17 | /* 18 | * With old GCC versions we have to manually save and restore the x86_32 PIC 19 | * register (ebx). 
See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 20 | */ 21 | #if defined(__i386__) && defined(__PIC__) 22 | # define EBX_CONSTRAINT "=r" 23 | #else 24 | # define EBX_CONSTRAINT "=b" 25 | #endif 26 | 27 | /* Execute the CPUID instruction. */ 28 | static inline void 29 | cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) 30 | { 31 | __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" 32 | "cpuid \n" 33 | ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" 34 | : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) 35 | : "a" (leaf), "c" (subleaf)); 36 | } 37 | 38 | /* Read an extended control register. */ 39 | static inline u64 40 | read_xcr(u32 index) 41 | { 42 | u32 edx, eax; 43 | 44 | /* 45 | * Execute the "xgetbv" instruction. Old versions of binutils do not 46 | * recognize this instruction, so list the raw bytes instead. 47 | */ 48 | __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index)); 49 | 50 | return ((u64)edx << 32) | eax; 51 | } 52 | 53 | #define IS_SET(reg, bit) ((reg) & ((u32)1 << (bit))) 54 | 55 | /* Initialize _x86_cpu_features with bits for interesting processor features. 
*/ 56 | void 57 | x86_setup_cpu_features(void) 58 | { 59 | u32 features = 0; 60 | u32 dummy1, dummy2, dummy3, dummy4; 61 | u32 max_function; 62 | u32 features_1, features_2, features_3, features_4; 63 | bool os_saves_ymm_regs = false; 64 | 65 | /* Get maximum supported function */ 66 | cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4); 67 | if (max_function < 1) 68 | goto out; 69 | 70 | /* Standard feature flags */ 71 | cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1); 72 | 73 | if (IS_SET(features_1, 25)) 74 | features |= X86_CPU_FEATURE_SSE; 75 | 76 | if (IS_SET(features_1, 26)) 77 | features |= X86_CPU_FEATURE_SSE2; 78 | 79 | if (IS_SET(features_2, 0)) 80 | features |= X86_CPU_FEATURE_SSE3; 81 | 82 | if (IS_SET(features_2, 9)) 83 | features |= X86_CPU_FEATURE_SSSE3; 84 | 85 | if (IS_SET(features_2, 19)) 86 | features |= X86_CPU_FEATURE_SSE4_1; 87 | 88 | if (IS_SET(features_2, 20)) 89 | features |= X86_CPU_FEATURE_SSE4_2; 90 | 91 | if (IS_SET(features_2, 27)) /* OSXSAVE set? 
*/ 92 | if ((read_xcr(0) & 0x6) == 0x6) 93 | os_saves_ymm_regs = true; 94 | 95 | if (os_saves_ymm_regs && IS_SET(features_2, 28)) 96 | features |= X86_CPU_FEATURE_AVX; 97 | 98 | if (max_function < 7) 99 | goto out; 100 | 101 | /* Extended feature flags */ 102 | cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4); 103 | 104 | if (IS_SET(features_3, 3)) 105 | features |= X86_CPU_FEATURE_BMI; 106 | 107 | if (os_saves_ymm_regs && IS_SET(features_3, 5)) 108 | features |= X86_CPU_FEATURE_AVX2; 109 | 110 | if (IS_SET(features_3, 8)) 111 | features |= X86_CPU_FEATURE_BMI2; 112 | 113 | out: 114 | 115 | #if DEBUG 116 | printf("Detected x86 CPU features: "); 117 | if (features & X86_CPU_FEATURE_SSE) 118 | printf("SSE "); 119 | if (features & X86_CPU_FEATURE_SSE2) 120 | printf("SSE2 "); 121 | if (features & X86_CPU_FEATURE_SSE3) 122 | printf("SSE3 "); 123 | if (features & X86_CPU_FEATURE_SSSE3) 124 | printf("SSSE3 "); 125 | if (features & X86_CPU_FEATURE_SSE4_1) 126 | printf("SSE4.1 "); 127 | if (features & X86_CPU_FEATURE_SSE4_2) 128 | printf("SSE4.2 "); 129 | if (features & X86_CPU_FEATURE_BMI) 130 | printf("BMI "); 131 | if (features & X86_CPU_FEATURE_AVX) 132 | printf("AVX "); 133 | if (features & X86_CPU_FEATURE_BMI2) 134 | printf("BMI2 "); 135 | if (features & X86_CPU_FEATURE_AVX2) 136 | printf("AVX2 "); 137 | printf("\n"); 138 | #endif /* DEBUG */ 139 | 140 | _x86_cpu_features = features | X86_CPU_FEATURES_KNOWN; 141 | } 142 | 143 | #endif /* X86_CPU_FEATURES_ENABLED */ 144 | -------------------------------------------------------------------------------- /lib/unaligned.h: -------------------------------------------------------------------------------- 1 | /* 2 | * unaligned.h - inline functions for unaligned memory accesses 3 | */ 4 | 5 | #ifndef LIB_UNALIGNED_H 6 | #define LIB_UNALIGNED_H 7 | 8 | #include "common_defs.h" 9 | 10 | /* 11 | * Naming note: 12 | * 13 | * {load,store}_*_unaligned() deal with raw bytes without endianness conversion. 
14 | * {get,put}_unaligned_*() deal with a specific endianness. 15 | */ 16 | 17 | DEFINE_UNALIGNED_TYPE(u16) 18 | DEFINE_UNALIGNED_TYPE(u32) 19 | DEFINE_UNALIGNED_TYPE(u64) 20 | DEFINE_UNALIGNED_TYPE(machine_word_t) 21 | 22 | #define load_word_unaligned load_machine_word_t_unaligned 23 | #define store_word_unaligned store_machine_word_t_unaligned 24 | 25 | /***** Unaligned loads *****/ 26 | 27 | static forceinline u16 28 | get_unaligned_le16(const u8 *p) 29 | { 30 | if (UNALIGNED_ACCESS_IS_FAST) 31 | return le16_bswap(load_u16_unaligned(p)); 32 | else 33 | return ((u16)p[1] << 8) | p[0]; 34 | } 35 | 36 | static forceinline u32 37 | get_unaligned_le32(const u8 *p) 38 | { 39 | if (UNALIGNED_ACCESS_IS_FAST) 40 | return le32_bswap(load_u32_unaligned(p)); 41 | else 42 | return ((u32)p[3] << 24) | ((u32)p[2] << 16) | 43 | ((u32)p[1] << 8) | p[0]; 44 | } 45 | 46 | static forceinline u64 47 | get_unaligned_le64(const u8 *p) 48 | { 49 | if (UNALIGNED_ACCESS_IS_FAST) 50 | return le64_bswap(load_u64_unaligned(p)); 51 | else 52 | return ((u64)p[7] << 56) | ((u64)p[6] << 48) | 53 | ((u64)p[5] << 40) | ((u64)p[4] << 32) | 54 | ((u64)p[3] << 24) | ((u64)p[2] << 16) | 55 | ((u64)p[1] << 8) | p[0]; 56 | } 57 | 58 | static forceinline machine_word_t 59 | get_unaligned_leword(const u8 *p) 60 | { 61 | STATIC_ASSERT(WORDBYTES == 4 || WORDBYTES == 8); 62 | if (WORDBYTES == 4) 63 | return get_unaligned_le32(p); 64 | else 65 | return get_unaligned_le64(p); 66 | } 67 | 68 | /***** Unaligned stores *****/ 69 | 70 | static forceinline void 71 | put_unaligned_le16(u16 v, u8 *p) 72 | { 73 | if (UNALIGNED_ACCESS_IS_FAST) { 74 | store_u16_unaligned(le16_bswap(v), p); 75 | } else { 76 | p[0] = (u8)(v >> 0); 77 | p[1] = (u8)(v >> 8); 78 | } 79 | } 80 | 81 | static forceinline void 82 | put_unaligned_le32(u32 v, u8 *p) 83 | { 84 | if (UNALIGNED_ACCESS_IS_FAST) { 85 | store_u32_unaligned(le32_bswap(v), p); 86 | } else { 87 | p[0] = (u8)(v >> 0); 88 | p[1] = (u8)(v >> 8); 89 | p[2] = (u8)(v >> 16); 
90 | p[3] = (u8)(v >> 24); 91 | } 92 | } 93 | 94 | static forceinline void 95 | put_unaligned_le64(u64 v, u8 *p) 96 | { 97 | if (UNALIGNED_ACCESS_IS_FAST) { 98 | store_u64_unaligned(le64_bswap(v), p); 99 | } else { 100 | p[0] = (u8)(v >> 0); 101 | p[1] = (u8)(v >> 8); 102 | p[2] = (u8)(v >> 16); 103 | p[3] = (u8)(v >> 24); 104 | p[4] = (u8)(v >> 32); 105 | p[5] = (u8)(v >> 40); 106 | p[6] = (u8)(v >> 48); 107 | p[7] = (u8)(v >> 56); 108 | } 109 | } 110 | 111 | static forceinline void 112 | put_unaligned_leword(machine_word_t v, u8 *p) 113 | { 114 | STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); 115 | if (WORDBITS == 32) 116 | put_unaligned_le32(v, p); 117 | else 118 | put_unaligned_le64(v, p); 119 | } 120 | 121 | /***** 24-bit loads *****/ 122 | 123 | /* 124 | * Given a 32-bit value that was loaded with the platform's native endianness, 125 | * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 126 | * bits contain the first 3 bytes, arranged in octets in a platform-dependent 127 | * order, at the memory location from which the input 32-bit value was loaded. 128 | */ 129 | static forceinline u32 130 | loaded_u32_to_u24(u32 v) 131 | { 132 | if (CPU_IS_LITTLE_ENDIAN()) 133 | return v & 0xFFFFFF; 134 | else 135 | return v >> 8; 136 | } 137 | 138 | /* 139 | * Load the next 3 bytes from the memory location @p into the 24 low-order bits 140 | * of a 32-bit value. The order in which the 3 bytes will be arranged as octets 141 | * in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES 142 | * bytes must be available at @p; note that this may be more than 3. 
143 | */ 144 | static forceinline u32 145 | load_u24_unaligned(const u8 *p) 146 | { 147 | #if UNALIGNED_ACCESS_IS_FAST 148 | # define LOAD_U24_REQUIRED_NBYTES 4 149 | return loaded_u32_to_u24(load_u32_unaligned(p)); 150 | #else 151 | # define LOAD_U24_REQUIRED_NBYTES 3 152 | if (CPU_IS_LITTLE_ENDIAN()) 153 | return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); 154 | else 155 | return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); 156 | #endif 157 | } 158 | 159 | #endif /* LIB_UNALIGNED_H */ 160 | -------------------------------------------------------------------------------- /programs/prog_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * prog_util.h - utility functions for programs 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 
26 | */ 27 | 28 | #ifndef PROGRAMS_PROG_UTIL_H 29 | #define PROGRAMS_PROG_UTIL_H 30 | 31 | #ifdef HAVE_CONFIG_H 32 | # include "config.h" 33 | #endif 34 | 35 | #include "libxpack.h" 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | #include "common_defs.h" 43 | 44 | #ifdef __GNUC__ 45 | # define _printf(str_idx, args_idx) \ 46 | __attribute__((format(printf, str_idx, args_idx))) 47 | #else 48 | # define _printf(str_idx, args_idx) 49 | #endif 50 | 51 | #ifdef _WIN32 52 | 53 | /* 54 | * Definitions for Windows builds. Mainly, 'tchar' is defined to be the 2-byte 55 | * 'wchar_t' type instead of 'char'. This is the only "easy" way I know of to 56 | * get full Unicode support on Windows... 57 | */ 58 | 59 | #include 60 | extern int wmain(int argc, wchar_t **argv); 61 | # define tmain wmain 62 | # define tchar wchar_t 63 | # define _T(text) L##text 64 | # define T(text) _T(text) 65 | # define TS "ls" 66 | # define TC "lc" 67 | # define tmemcpy wmemcpy 68 | # define topen _wopen 69 | # define tstrchr wcschr 70 | # define tstrcmp wcscmp 71 | # define tstrcpy wcscpy 72 | # define tstrlen wcslen 73 | # define tstrrchr wcsrchr 74 | # define tstrtoul wcstoul 75 | # define tstrxcmp wcsicmp 76 | # define tunlink _wunlink 77 | # define tutimbuf _utimbuf 78 | # define tutime _wutime 79 | # ifdef _MSC_VER 80 | /* Standard file descriptors? What are those? */ 81 | # define STDIN_FILENO 0 82 | # define STDOUT_FILENO 1 83 | # define STDERR_FILENO 2 84 | 85 | /* Fix the broken stat-related definitions. 
*/ 86 | # define fstat _fstat 87 | # define stat _stat 88 | # define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) 89 | # define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) 90 | # endif 91 | # define O_NOFOLLOW 0 92 | 93 | #else /* _WIN32 */ 94 | 95 | /* Standard definitions for everyone else */ 96 | 97 | # define tmain main 98 | # define tchar char 99 | # define T(text) text 100 | # define TS "s" 101 | # define TC "c" 102 | # define tmemcpy memcpy 103 | # define topen open 104 | # define tstrchr strchr 105 | # define tstrcmp strcmp 106 | # define tstrcpy strcpy 107 | # define tstrlen strlen 108 | # define tstrrchr strrchr 109 | # define tstrtoul strtoul 110 | # define tstrxcmp strcmp 111 | # define tunlink unlink 112 | # define tutimbuf utimbuf 113 | # define tutime utime 114 | # define O_BINARY 0 115 | 116 | #endif /* !_WIN32 */ 117 | 118 | extern const tchar *program_invocation_name; 119 | 120 | extern void _printf(1, 2) msg(const char *fmt, ...); 121 | extern void _printf(1, 2) msg_errno(const char *fmt, ...); 122 | 123 | extern void *xmalloc(size_t size); 124 | 125 | extern u64 current_time(void); 126 | 127 | extern const tchar *get_filename(const tchar *path); 128 | 129 | struct file_stream { 130 | int fd; 131 | tchar *name; 132 | bool is_standard_stream; 133 | }; 134 | 135 | extern int xopen_for_read(const tchar *path, struct file_stream *strm); 136 | extern int xopen_for_write(const tchar *path, bool force, 137 | struct file_stream *strm); 138 | 139 | extern ssize_t xread(struct file_stream *strm, void *buf, size_t count); 140 | extern int skip_bytes(struct file_stream *strm, size_t count); 141 | extern int full_write(struct file_stream *strm, const void *buf, size_t count); 142 | 143 | extern int xclose(struct file_stream *strm); 144 | 145 | extern u32 parse_chunk_size(const tchar *arg); 146 | extern int parse_compression_level(const tchar *arg); 147 | 148 | extern struct xpack_compressor *alloc_compressor(u32 chunk_size, int level); 149 | extern struct 
xpack_decompressor *alloc_decompressor(void); 150 | 151 | /* tgetopt.c */ 152 | 153 | extern tchar *toptarg; 154 | extern int toptind, topterr, toptopt; 155 | 156 | extern int tgetopt(int argc, tchar *argv[], const tchar *optstring); 157 | 158 | #endif /* PROGRAMS_PROG_UTIL_H */ 159 | -------------------------------------------------------------------------------- /libxpack.h: -------------------------------------------------------------------------------- 1 | /* 2 | * libxpack.h - public header for libxpack 3 | */ 4 | 5 | #ifndef LIBXPACK_H 6 | #define LIBXPACK_H 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | #include 13 | 14 | /* Microsoft C / Visual Studio garbage. If you want to link to the DLL version 15 | * of libxpack, then #define LIBXPACK_DLL. */ 16 | #ifdef _MSC_VER 17 | # ifdef BUILDING_LIBXPACK 18 | # define LIBXPACKAPI __declspec(dllexport) 19 | # elif defined(LIBXPACK_DLL) 20 | # define LIBXPACKAPI __declspec(dllimport) 21 | # endif 22 | #endif 23 | #ifndef LIBXPACKAPI 24 | # define LIBXPACKAPI 25 | #endif 26 | 27 | /* ========================================================================== */ 28 | /* Compression */ 29 | /* ========================================================================== */ 30 | 31 | struct xpack_compressor; 32 | 33 | /* 34 | * xpack_alloc_compressor() allocates a new compressor. 35 | * 36 | * 'max_buffer_size' is the maximum size of any buffer which will be compressed 37 | * by the compressor. This specifies the maximum allowed value for the 38 | * 'uncompressed_size' parameter of xpack_compress() when called using this 39 | * compressor. 40 | * 41 | * 'compression_level' is the compression level on a zlib-like scale (1 = 42 | * fastest, 6 = medium/default, 9 = slowest). 43 | * 44 | * Returns a pointer to the new compressor, or NULL if out of memory or the 45 | * maximum buffer size or compression level is not supported. 
46 | */ 47 | LIBXPACKAPI struct xpack_compressor * 48 | xpack_alloc_compressor(size_t max_buffer_size, int compression_level); 49 | 50 | /* 51 | * xpack_compress() compresses a buffer of data. The function attempts to 52 | * compress 'in_nbytes' bytes of data located at 'in' and write the results to 53 | * 'out', which has space for 'out_nbytes_avail' bytes. The return value is the 54 | * compressed size in bytes, or 0 if the data could not be compressed to 55 | * 'out_nbytes_avail' bytes or fewer. 56 | */ 57 | LIBXPACKAPI size_t 58 | xpack_compress(struct xpack_compressor *compressor, 59 | const void *in, size_t in_nbytes, 60 | void *out, size_t out_nbytes_avail); 61 | 62 | /* 63 | * xpack_free_compressor() frees a compressor allocated with 64 | * xpack_alloc_compressor(). If NULL is passed, then no action is taken. 65 | */ 66 | LIBXPACKAPI void 67 | xpack_free_compressor(struct xpack_compressor *compressor); 68 | 69 | /* ========================================================================== */ 70 | /* Decompression */ 71 | /* ========================================================================== */ 72 | 73 | struct xpack_decompressor; 74 | 75 | /* 76 | * xpack_alloc_decompressor() allocates a new decompressor. 77 | * 78 | * Returns a pointer to the new decompressor, or NULL if out of memory. 
79 | */ 80 | LIBXPACKAPI struct xpack_decompressor * 81 | xpack_alloc_decompressor(void); 82 | 83 | /* Result of a call to xpack_decompress() */ 84 | enum decompress_result { 85 | 86 | /* Decompression was successful */ 87 | DECOMPRESS_SUCCESS = 0, 88 | 89 | /* Decompressed failed because the compressed data was invalid, corrupt, 90 | * or otherwise unsupported */ 91 | DECOMPRESS_BAD_DATA = 1, 92 | 93 | /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have 94 | * decompressed to fewer than 'out_nbytes_avail' bytes */ 95 | DECOMPRESS_SHORT_OUTPUT = 2, 96 | 97 | /* The data would have decompressed to more than 'out_nbytes_avail' 98 | * bytes */ 99 | DECOMPRESS_INSUFFICIENT_SPACE = 3, 100 | }; 101 | 102 | /* 103 | * xpack_decompress() decompresses 'in_nbytes' bytes of compressed data at 'in' 104 | * and writes the uncompressed data to 'out', which is a buffer of at least 105 | * 'out_nbytes_avail' bytes. If decompression was successful, then 0 106 | * (DECOMPRESS_SUCCESS) is returned; otherwise, a nonzero result code such as 107 | * DECOMPRESS_BAD_DATA is returned. If a nonzero result code is returned, then 108 | * the contents of the output buffer are undefined. 109 | * 110 | * xpack_decompress() can be used in cases where the actual uncompressed size is 111 | * known (recommended) or unknown (not recommended): 112 | * 113 | * - If the actual uncompressed size is known, then pass the actual 114 | * uncompressed size as 'out_nbytes_avail' and pass NULL for 115 | * 'actual_out_nbytes_ret'. This makes xpack_decompress() fail with 116 | * DECOMPRESS_SHORT_OUTPUT if the data decompressed to fewer than the 117 | * specified number of bytes. 118 | * 119 | * - If the actual uncompressed size is unknown, then provide a non-NULL 120 | * 'actual_out_nbytes_ret' and provide a buffer with some size 121 | * 'out_nbytes_avail' that you think is large enough to hold all the 122 | * uncompressed data. 
In this case, if the data decompresses to less than 123 | * or equal to 'out_nbytes_avail' bytes, then xpack_decompress() will write 124 | * the actual uncompressed size to *actual_out_nbytes_ret and return 0 125 | * (DECOMPRESS_SUCCESS). Otherwise, it will return 126 | * DECOMPRESS_INSUFFICIENT_SPACE if the provided buffer was not large enough 127 | * but no other problems were encountered, or another nonzero result code if 128 | * decompression failed for another reason. 129 | */ 130 | LIBXPACKAPI enum decompress_result 131 | xpack_decompress(struct xpack_decompressor *decompressor, 132 | const void *in, size_t in_nbytes, 133 | void *out, size_t out_nbytes_avail, 134 | size_t *actual_out_nbytes_ret); 135 | 136 | /* 137 | * xpack_free_decompressor() frees a decompressor allocated with 138 | * xpack_alloc_decompressor(). If NULL is passed, no action is taken. 139 | */ 140 | LIBXPACKAPI void 141 | xpack_free_decompressor(struct xpack_decompressor *decompressor); 142 | 143 | 144 | #ifdef __cplusplus 145 | } 146 | #endif 147 | 148 | #endif /* LIBXPACK_H */ 149 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Use 'make help' to list available targets. 3 | # 4 | # Define V=1 to enable "verbose" mode, showing all executed commands. 5 | # 6 | # Define DECOMPRESSION_ONLY to omit all compression code, building a 7 | # decompression-only library. If doing this, you must also build a specific 8 | # library target such as 'libxpack.a', as the programs will no longer compile. 9 | # 10 | # TODO: ENABLE_PREPROCESSING option 11 | # 12 | ############################################################################## 13 | 14 | #### Common compiler flags. 15 | #### Flags given here are not intended to be overridden, but you can add more 16 | #### by defining CFLAGS in the environment or on the 'make' command line. 
17 | 18 | cc-option = $(shell if $(CC) $(1) -c -x c /dev/null -o /dev/null \ 19 | 1>&2 2>/dev/null; then echo $(1); fi) 20 | 21 | override CFLAGS := \ 22 | $(CFLAGS) -O2 -fomit-frame-pointer -std=gnu89 -I. -Icommon \ 23 | -Wall -Wundef \ 24 | $(call cc-option,-Wdeclaration-after-statement) \ 25 | $(call cc-option,-Wmissing-prototypes) \ 26 | $(call cc-option,-Wstrict-prototypes) \ 27 | $(call cc-option,-Wvla) 28 | 29 | ############################################################################## 30 | 31 | STATIC_LIB_SUFFIX := .a 32 | SHARED_LIB_SUFFIX := .so 33 | PROG_SUFFIX := 34 | PROG_CFLAGS := 35 | PIC_REQUIRED := 1 36 | HARD_LINKS := 1 37 | 38 | # Compiling for Windows with MinGW? 39 | ifneq ($(findstring -mingw,$(CC)),) 40 | ifeq ($(AR),ar) 41 | AR := $(patsubst %-gcc,%-ar,$(CC)) 42 | endif 43 | STATIC_LIB_SUFFIX := .lib 44 | SHARED_LIB_SUFFIX := .dll 45 | PROG_SUFFIX := .exe 46 | PROG_CFLAGS := -static -municode 47 | PIC_REQUIRED := 48 | HARD_LINKS := 49 | endif 50 | 51 | ############################################################################## 52 | 53 | #### Quiet make is enabled by default. Define V=1 to disable. 
54 | 55 | ifneq ($(findstring s,$(MAKEFLAGS)),s) 56 | ifneq ($(V),1) 57 | QUIET_CC = @echo ' CC ' $@; 58 | QUIET_CCLD = @echo ' CCLD ' $@; 59 | QUIET_AR = @echo ' AR ' $@; 60 | QUIET_LN = @echo ' LN ' $@; 61 | QUIET_CP = @echo ' CP ' $@; 62 | QUIET_GEN = @echo ' GEN ' $@; 63 | endif 64 | endif 65 | 66 | ############################################################################## 67 | 68 | COMMON_HEADERS := $(wildcard common/*.h) 69 | ALL_TARGETS := 70 | 71 | #### Library 72 | 73 | STATIC_LIB := libxpack$(STATIC_LIB_SUFFIX) 74 | SHARED_LIB := libxpack$(SHARED_LIB_SUFFIX) 75 | 76 | LIB_CFLAGS += $(CFLAGS) -fvisibility=hidden -D_ANSI_SOURCE 77 | 78 | DECOMPRESSION_ONLY := 79 | ifdef DECOMPRESSION_ONLY 80 | LIB_CFLAGS += -DDECOMPRESSION_ONLY=1 81 | endif 82 | 83 | ENABLE_PREPROCESSING := 84 | ifdef ENABLE_PREPROCESSING 85 | LIB_CFLAGS += -DENABLE_PREPROCESSING=1 86 | endif 87 | 88 | LIB_HEADERS := $(wildcard lib/*.h) 89 | 90 | LIB_SRC := lib/x86_cpu_features.c \ 91 | lib/xpack_common.c \ 92 | lib/xpack_compress.c \ 93 | lib/xpack_decompress.c 94 | 95 | LIB_OBJ := $(LIB_SRC:.c=.o) 96 | LIB_PIC_OBJ := $(LIB_SRC:.c=.pic.o) 97 | ifdef PIC_REQUIRED 98 | SHLIB_OBJ := $(LIB_PIC_OBJ) 99 | else 100 | SHLIB_OBJ := $(LIB_OBJ) 101 | endif 102 | 103 | # Compile position dependent library object files 104 | $(LIB_OBJ): %.o: %.c $(LIB_HEADERS) $(COMMON_HEADERS) .lib-cflags 105 | $(QUIET_CC) $(CC) -o $@ -c $(LIB_CFLAGS) $< 106 | 107 | # Compile position independent library object files 108 | $(LIB_PIC_OBJ): %.pic.o: %.c $(LIB_HEADERS) $(COMMON_HEADERS) .lib-cflags 109 | $(QUIET_CC) $(CC) -o $@ -c $(LIB_CFLAGS) -fPIC $< 110 | 111 | # Link shared library 112 | $(SHARED_LIB):$(SHLIB_OBJ) 113 | $(QUIET_CCLD) $(CC) -o $@ $(LDFLAGS) $(LIB_CFLAGS) -shared $+ 114 | 115 | ALL_TARGETS += $(SHARED_LIB) 116 | 117 | # Create static library 118 | $(STATIC_LIB):$(LIB_OBJ) 119 | $(QUIET_AR) $(AR) cr $@ $+ 120 | 121 | ALL_TARGETS += $(STATIC_LIB) 122 | 123 | # Rebuild if CC or LIB_CFLAGS changed 124 
| .lib-cflags: FORCE 125 | @flags='$(CC):$(LIB_CFLAGS)'; \ 126 | if [ "$$flags" != "`cat $@ 2>/dev/null`" ]; then \ 127 | [ -e $@ ] && echo "Rebuilding library due to new compiler flags"; \ 128 | echo "$$flags" > $@; \ 129 | fi 130 | 131 | ############################################################################## 132 | 133 | #### Programs 134 | 135 | PROG_CFLAGS += $(CFLAGS) \ 136 | -D_DEFAULT_SOURCE \ 137 | -D_FILE_OFFSET_BITS=64 \ 138 | -DHAVE_CONFIG_H 139 | 140 | PROG_COMMON_HEADERS := programs/prog_util.h programs/config.h 141 | PROG_COMMON_SRC := programs/prog_util.c programs/tgetopt.c 142 | PROG_SPECIFIC_SRC := programs/xpack.c programs/benchmark.c 143 | 144 | PROG_COMMON_OBJ := $(PROG_COMMON_SRC:.c=.o) 145 | PROG_SPECIFIC_OBJ := $(PROG_SPECIFIC_SRC:.c=.o) 146 | PROG_OBJ := $(PROG_COMMON_OBJ) $(PROG_SPECIFIC_OBJ) 147 | 148 | # Generate autodetected configuration header 149 | programs/config.h:programs/detect.sh .prog-cflags 150 | $(QUIET_GEN) CC=$(CC) $< > $@ 151 | 152 | # Compile program object files 153 | $(PROG_OBJ): %.o: %.c $(PROG_COMMON_HEADERS) $(COMMON_HEADERS) .prog-cflags 154 | $(QUIET_CC) $(CC) -o $@ -c $(PROG_CFLAGS) $< 155 | 156 | # Link benchmark program 157 | benchmark$(PROG_SUFFIX):programs/benchmark.o $(PROG_COMMON_OBJ) $(STATIC_LIB) 158 | $(QUIET_CCLD) $(CC) -o $@ $(LDFLAGS) $(PROG_CFLAGS) $+ 159 | 160 | ALL_TARGETS += benchmark$(PROG_SUFFIX) 161 | 162 | # Link xpack program 163 | xpack$(PROG_SUFFIX):programs/xpack.o $(PROG_COMMON_OBJ) $(STATIC_LIB) 164 | $(QUIET_CCLD) $(CC) -o $@ $(LDFLAGS) $(PROG_CFLAGS) $+ 165 | 166 | ALL_TARGETS += xpack$(PROG_SUFFIX) 167 | 168 | ifdef HARD_LINKS 169 | # Hard link xunpack to xpack 170 | xunpack$(PROG_SUFFIX):xpack$(PROG_SUFFIX) 171 | $(QUIET_LN) ln -f $< $@ 172 | else 173 | # No hard links; copy xpack to xunpack 174 | xunpack$(PROG_SUFFIX):xpack$(PROG_SUFFIX) 175 | $(QUIET_CP) cp -f $< $@ 176 | endif 177 | 178 | ALL_TARGETS += xunpack$(PROG_SUFFIX) 179 | 180 | # Rebuild if CC or PROG_CFLAGS changed 
181 | .prog-cflags: FORCE 182 | @flags='$(CC):$(PROG_CFLAGS)'; \ 183 | if [ "$$flags" != "`cat $@ 2>/dev/null`" ]; then \ 184 | [ -e $@ ] && echo "Rebuilding programs due to new compiler flags"; \ 185 | echo "$$flags" > $@; \ 186 | fi 187 | 188 | ############################################################################## 189 | 190 | all:$(ALL_TARGETS) 191 | 192 | help: 193 | @echo "Available targets:" 194 | @echo "------------------" 195 | @for target in $(ALL_TARGETS); do \ 196 | echo -e "$$target"; \ 197 | done 198 | 199 | clean: 200 | rm -f *.a *.dll *.exe *.exp *.lib *.so \ 201 | lib/*.o lib/*.obj programs/*.o programs/*.obj \ 202 | benchmark xpack xunpack programs/config.h \ 203 | .lib-cflags .prog-cflags 204 | 205 | realclean: clean 206 | rm -f tags cscope* 207 | 208 | FORCE: 209 | 210 | .PHONY: all help clean realclean 211 | 212 | .DEFAULT_GOAL = all 213 | -------------------------------------------------------------------------------- /lib/xpack_common.c: -------------------------------------------------------------------------------- 1 | /* 2 | * xpack_common.c - common code for XPACK compression and decompression 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 
17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifdef ENABLE_PREPROCESSING 29 | 30 | #include 31 | 32 | #ifdef __SSE2__ 33 | # include 34 | #endif 35 | 36 | #ifdef __AVX2__ 37 | # include 38 | #endif 39 | 40 | #include "xpack_common.h" 41 | #include "unaligned.h" 42 | 43 | static void 44 | do_translate_target(void *target, s32 input_pos) 45 | { 46 | s32 abs_offset, rel_offset; 47 | 48 | rel_offset = get_unaligned_le32(target); 49 | if (rel_offset >= -input_pos && rel_offset < MAGIC_FILESIZE) { 50 | if (rel_offset < MAGIC_FILESIZE - input_pos) { 51 | /* "good translation" */ 52 | abs_offset = rel_offset + input_pos; 53 | } else { 54 | /* "compensating translation" */ 55 | abs_offset = rel_offset - MAGIC_FILESIZE; 56 | } 57 | put_unaligned_le32(abs_offset, target); 58 | } 59 | } 60 | 61 | static void 62 | undo_translate_target(void *target, s32 input_pos) 63 | { 64 | s32 abs_offset, rel_offset; 65 | 66 | abs_offset = get_unaligned_le32(target); 67 | if (abs_offset >= 0) { 68 | if (abs_offset < MAGIC_FILESIZE) { 69 | /* "good translation" */ 70 | rel_offset = abs_offset - input_pos; 71 | put_unaligned_le32(rel_offset, target); 72 | } 73 | } else { 74 | if (abs_offset >= -input_pos) { 75 | /* "compensating translation" */ 76 | rel_offset = abs_offset + MAGIC_FILESIZE; 77 | put_unaligned_le32(rel_offset, target); 78 | } 79 | } 80 | } 81 | 82 | static void 83 | e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) 84 | { 85 | 86 | #if !defined(__SSE2__) && 
!defined(__AVX2__) 87 | /* 88 | * A worthwhile optimization is to push the end-of-buffer check into the 89 | * relatively rare E8 case. This is possible if we replace the last six 90 | * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte 91 | * before reaching end-of-buffer. In addition, this scheme guarantees 92 | * that no translation can begin following an E8 byte in the last 10 93 | * bytes because a 4-byte offset containing E8 as its high byte is a 94 | * large negative number that is not valid for translation. That is 95 | * exactly what we need. 96 | */ 97 | u8 *tail; 98 | u8 saved_bytes[6]; 99 | u8 *p; 100 | 101 | if (size <= 10) 102 | return; 103 | 104 | tail = &data[size - 6]; 105 | memcpy(saved_bytes, tail, 6); 106 | memset(tail, 0xE8, 6); 107 | p = data; 108 | for (;;) { 109 | while (*p != 0xE8) 110 | p++; 111 | if (p >= tail) 112 | break; 113 | (*process_target)(p + 1, p - data); 114 | p += 5; 115 | } 116 | memcpy(tail, saved_bytes, 6); 117 | #else 118 | /* SSE2 or AVX-2 optimized version for x86_64 */ 119 | 120 | u8 *p = data; 121 | u64 valid_mask = ~0; 122 | 123 | if (size <= 10) 124 | return; 125 | #ifdef __AVX2__ 126 | # define ALIGNMENT_REQUIRED 32 127 | #else 128 | # define ALIGNMENT_REQUIRED 16 129 | #endif 130 | 131 | /* Process one byte at a time until the pointer is properly aligned. */ 132 | while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) { 133 | if (p >= data + size - 10) 134 | return; 135 | if (*p == 0xE8 && (valid_mask & 1)) { 136 | (*process_target)(p + 1, p - data); 137 | valid_mask &= ~0x1F; 138 | } 139 | p++; 140 | valid_mask >>= 1; 141 | valid_mask |= (u64)1 << 63; 142 | } 143 | 144 | if (data + size - p >= 64) { 145 | 146 | /* Vectorized processing */ 147 | 148 | /* Note: we use a "trap" E8 byte to eliminate the need to check 149 | * for end-of-buffer in the inner loop. This byte is carefully 150 | * positioned so that it will never be changed by a previous 151 | * translation before it is detected. 
*/ 152 | 153 | u8 *trap = p + ((data + size - p) & ~31) - 32 + 4; 154 | u8 saved_byte = *trap; 155 | *trap = 0xE8; 156 | 157 | for (;;) { 158 | u32 e8_mask; 159 | u8 *orig_p = p; 160 | #ifdef __AVX2__ 161 | const __m256i e8_bytes = _mm256_set1_epi8(0xE8); 162 | for (;;) { 163 | __m256i bytes = *(const __m256i *)p; 164 | __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); 165 | e8_mask = _mm256_movemask_epi8(cmpresult); 166 | if (e8_mask) 167 | break; 168 | p += 32; 169 | } 170 | #else 171 | const __m128i e8_bytes = _mm_set1_epi8(0xE8); 172 | for (;;) { 173 | /* Read the next 32 bytes of data and test them 174 | * for E8 bytes. */ 175 | __m128i bytes1 = *(const __m128i *)p; 176 | __m128i bytes2 = *(const __m128i *)(p + 16); 177 | __m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes); 178 | __m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes); 179 | u32 mask1 = _mm_movemask_epi8(cmpresult1); 180 | u32 mask2 = _mm_movemask_epi8(cmpresult2); 181 | /* The masks have a bit set for each E8 byte. 182 | * We stay in this fast inner loop as long as 183 | * there are no E8 bytes. */ 184 | if (mask1 | mask2) { 185 | e8_mask = mask1 | (mask2 << 16); 186 | break; 187 | } 188 | p += 32; 189 | } 190 | #endif 191 | 192 | /* Did we pass over data with no E8 bytes? */ 193 | if (p != orig_p) 194 | valid_mask = ~0; 195 | 196 | /* Are we nearing end-of-buffer? */ 197 | if (p == trap - 4) 198 | break; 199 | 200 | /* Process the E8 bytes. However, the AND with 201 | * 'valid_mask' ensures we never process an E8 byte that 202 | * was itself part of a translation target. */ 203 | while ((e8_mask &= valid_mask)) { 204 | unsigned bit = bsf32(e8_mask); 205 | (*process_target)(p + bit + 1, p + bit - data); 206 | valid_mask &= ~((u64)0x1F << bit); 207 | } 208 | 209 | valid_mask >>= 32; 210 | valid_mask |= 0xFFFFFFFF00000000; 211 | p += 32; 212 | } 213 | 214 | *trap = saved_byte; 215 | } 216 | 217 | /* Approaching the end of the buffer; process one byte a time. 
*/ 218 | while (p < data + size - 10) { 219 | if (*p == 0xE8 && (valid_mask & 1)) { 220 | (*process_target)(p + 1, p - data); 221 | valid_mask &= ~0x1F; 222 | } 223 | p++; 224 | valid_mask >>= 1; 225 | valid_mask |= (u64)1 << 63; 226 | } 227 | #endif /* __SSE2__ || __AVX2__ */ 228 | } 229 | 230 | void 231 | preprocess(void *data, u32 size) 232 | { 233 | e8_filter(data, size, do_translate_target); 234 | } 235 | 236 | void 237 | postprocess(void *data, u32 size) 238 | { 239 | e8_filter(data, size, undo_translate_target); 240 | } 241 | 242 | #endif /* ENABLE_PREPROCESSING */ 243 | -------------------------------------------------------------------------------- /programs/benchmark.c: -------------------------------------------------------------------------------- 1 | /* 2 | * benchmark.c - a compression testing and benchmark program 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "prog_util.h" 29 | 30 | static const tchar *const optstring = T("123456789hL:s:V"); 31 | 32 | static void 33 | show_usage(FILE *fp) 34 | { 35 | fprintf(fp, 36 | "Usage: %"TS" [-123456789hV] [-L LVL] [-s SIZE] [FILE]...\n" 37 | "Benchmark XPACK compression and decompression on the specified FILEs.\n" 38 | "\n" 39 | "Options:\n" 40 | " -1 fastest (worst) compression\n" 41 | " -9 slowest (best) compression\n" 42 | " -h print this help\n" 43 | " -L LVL compression level [1-9] (default 6)\n" 44 | " -s SIZE chunk size (default 524288)\n" 45 | " -V show version and legal information\n", 46 | program_invocation_name); 47 | } 48 | 49 | static void 50 | show_version(void) 51 | { 52 | printf( 53 | "XPACK compression benchmark program, experimental version\n" 54 | "Copyright 2016 Eric Biggers\n" 55 | "\n" 56 | "This program is free software which may be modified and/or redistributed\n" 57 | "under the terms of the MIT license. There is NO WARRANTY, to the extent\n" 58 | "permitted by law. 
See the COPYING file for details.\n" 59 | ); 60 | } 61 | 62 | static int 63 | do_benchmark(struct file_stream *in, void *original_buf, void *compressed_buf, 64 | void *decompressed_buf, u32 chunk_size, 65 | struct xpack_compressor *compressor, 66 | struct xpack_decompressor *decompressor) 67 | { 68 | u64 total_uncompressed_size = 0; 69 | u64 total_compressed_size = 0; 70 | u64 total_compress_time = 0; 71 | u64 total_decompress_time = 0; 72 | ssize_t ret; 73 | 74 | while ((ret = xread(in, original_buf, chunk_size)) > 0) { 75 | u32 original_size = ret; 76 | u32 compressed_size; 77 | u64 start_time; 78 | enum decompress_result result; 79 | 80 | total_uncompressed_size += original_size; 81 | 82 | /* Compress the chunk of data. */ 83 | start_time = current_time(); 84 | compressed_size = xpack_compress(compressor, 85 | original_buf, 86 | original_size, 87 | compressed_buf, 88 | original_size - 1); 89 | total_compress_time += current_time() - start_time; 90 | 91 | if (compressed_size) { 92 | /* Successfully compressed the chunk of data. */ 93 | 94 | /* Decompress the data we just compressed and compare 95 | * the result with the original. */ 96 | start_time = current_time(); 97 | result = xpack_decompress(decompressor, 98 | compressed_buf, 99 | compressed_size, 100 | decompressed_buf, 101 | original_size, 102 | NULL); 103 | total_decompress_time += current_time() - start_time; 104 | 105 | if (result != DECOMPRESS_SUCCESS) { 106 | msg("%"TS": failed to decompress data", 107 | in->name); 108 | return -1; 109 | } 110 | 111 | if (memcmp(original_buf, decompressed_buf, 112 | original_size) != 0) 113 | { 114 | msg("%"TS": data did not decompress to " 115 | "original", in->name); 116 | return -1; 117 | } 118 | 119 | total_compressed_size += compressed_size; 120 | } else { 121 | /* Compression did not make the chunk smaller. 
*/ 122 | total_compressed_size += original_size; 123 | } 124 | } 125 | 126 | if (ret < 0) 127 | return ret; 128 | 129 | if (total_uncompressed_size == 0) { 130 | printf("\tFile was empty.\n"); 131 | return 0; 132 | } 133 | 134 | if (total_compress_time == 0) 135 | total_compress_time = 1; 136 | if (total_decompress_time == 0) 137 | total_decompress_time = 1; 138 | 139 | printf("\tCompressed %"PRIu64 " => %"PRIu64" bytes (%u.%03u%%)\n", 140 | total_uncompressed_size, total_compressed_size, 141 | (unsigned int)(total_compressed_size * 100 / 142 | total_uncompressed_size), 143 | (unsigned int)(total_compressed_size * 100000 / 144 | total_uncompressed_size % 1000)); 145 | printf("\tCompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n", 146 | total_compress_time / 1000000, 147 | 1000 * total_uncompressed_size / total_compress_time); 148 | printf("\tDecompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n", 149 | total_decompress_time / 1000000, 150 | 1000 * total_uncompressed_size / total_decompress_time); 151 | 152 | return 0; 153 | } 154 | 155 | int 156 | tmain(int argc, tchar *argv[]) 157 | { 158 | u32 chunk_size = 524288; 159 | int compression_level = 6; 160 | void *original_buf = NULL; 161 | void *compressed_buf = NULL; 162 | void *decompressed_buf = NULL; 163 | struct xpack_compressor *compressor = NULL; 164 | struct xpack_decompressor *decompressor = NULL; 165 | tchar *default_file_list[] = { NULL }; 166 | int opt_char; 167 | int i; 168 | int ret; 169 | 170 | program_invocation_name = get_filename(argv[0]); 171 | 172 | while ((opt_char = tgetopt(argc, argv, optstring)) != -1) { 173 | switch (opt_char) { 174 | case '1': 175 | case '2': 176 | case '3': 177 | case '4': 178 | case '5': 179 | case '6': 180 | case '7': 181 | case '8': 182 | case '9': 183 | compression_level = opt_char - '0'; 184 | break; 185 | case 'h': 186 | show_usage(stdout); 187 | return 0; 188 | case 'L': 189 | compression_level = parse_compression_level(toptarg); 190 | if (compression_level <= 0) 191 | 
return 1; 192 | break; 193 | case 's': 194 | chunk_size = parse_chunk_size(toptarg); 195 | if (chunk_size == 0) 196 | return 1; 197 | break; 198 | case 'V': 199 | show_version(); 200 | return 0; 201 | default: 202 | show_usage(stderr); 203 | return 1; 204 | } 205 | } 206 | 207 | argc -= toptind; 208 | argv += toptind; 209 | 210 | original_buf = xmalloc(chunk_size); 211 | compressed_buf = xmalloc(chunk_size - 1); 212 | decompressed_buf = xmalloc(chunk_size); 213 | compressor = alloc_compressor(chunk_size, compression_level); 214 | decompressor = alloc_decompressor(); 215 | 216 | ret = -1; 217 | if (original_buf == NULL || compressed_buf == NULL || 218 | decompressed_buf == NULL || 219 | compressor == NULL || decompressor == NULL) 220 | goto out; 221 | 222 | if (argc == 0) { 223 | argv = default_file_list; 224 | argc = ARRAY_LEN(default_file_list); 225 | } else { 226 | for (i = 0; i < argc; i++) 227 | if (argv[i][0] == '-' && argv[i][1] == '\0') 228 | argv[i] = NULL; 229 | } 230 | 231 | printf("Benchmarking XPACK compression:\n"); 232 | printf("\tChunk size: %"PRIu32"\n", chunk_size); 233 | printf("\tCompression level: %d\n", compression_level); 234 | 235 | for (i = 0; i < argc; i++) { 236 | struct file_stream in; 237 | 238 | ret = xopen_for_read(argv[i], &in); 239 | if (ret != 0) 240 | goto out; 241 | 242 | printf("Processing %"TS"...\n", in.name); 243 | 244 | ret = do_benchmark(&in, original_buf, compressed_buf, 245 | decompressed_buf, chunk_size, compressor, 246 | decompressor); 247 | xclose(&in); 248 | if (ret != 0) 249 | goto out; 250 | } 251 | ret = 0; 252 | out: 253 | xpack_free_decompressor(decompressor); 254 | xpack_free_compressor(compressor); 255 | free(decompressed_buf); 256 | free(compressed_buf); 257 | free(original_buf); 258 | return -ret; 259 | } 260 | -------------------------------------------------------------------------------- /common/common_defs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 
common_defs.h 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef COMMON_COMMON_DEFS_H 29 | #define COMMON_COMMON_DEFS_H 30 | 31 | #ifdef __GNUC__ 32 | # include "compiler_gcc.h" 33 | #elif defined(_MSC_VER) 34 | # include "compiler_msc.h" 35 | #else 36 | # pragma message("Unrecognized compiler. Please add a header file for your compiler. 
Compilation will proceed, but performance may suffer!") 37 | #endif 38 | 39 | /* ========================================================================== */ 40 | /* Type definitions */ 41 | /* ========================================================================== */ 42 | 43 | #include /* size_t */ 44 | 45 | #ifndef __bool_true_false_are_defined 46 | # include /* bool */ 47 | #endif 48 | 49 | /* Fixed-width integer types */ 50 | #ifndef PRIu32 51 | # include 52 | #endif 53 | typedef uint8_t u8; 54 | typedef uint16_t u16; 55 | typedef uint32_t u32; 56 | typedef uint64_t u64; 57 | typedef int8_t s8; 58 | typedef int16_t s16; 59 | typedef int32_t s32; 60 | typedef int64_t s64; 61 | 62 | /* 63 | * Word type of the target architecture. Use 'size_t' instead of 'unsigned 64 | * long' to account for platforms such as Windows that use 32-bit 'unsigned 65 | * long' on 64-bit architectures. 66 | */ 67 | typedef size_t machine_word_t; 68 | 69 | /* Number of bytes in a word */ 70 | #define WORDBYTES ((int)sizeof(machine_word_t)) 71 | 72 | /* Number of bits in a word */ 73 | #define WORDBITS (8 * WORDBYTES) 74 | 75 | /* ========================================================================== */ 76 | /* Optional compiler features */ 77 | /* ========================================================================== */ 78 | 79 | /* LIBEXPORT - annotate a function that is part of the library API */ 80 | #ifndef LIBEXPORT 81 | # define LIBEXPORT 82 | #endif 83 | 84 | /* inline - suggest that a function be inlined */ 85 | #ifndef inline 86 | # define inline 87 | #endif 88 | 89 | /* forceinline - force a function to be inlined, if possible */ 90 | #ifndef forceinline 91 | # define forceinline inline 92 | #endif 93 | 94 | /* restrict - annotate a non-aliased pointer */ 95 | #ifndef restrict 96 | # define restrict 97 | #endif 98 | 99 | /* likely(expr) - hint that an expression is usually true */ 100 | #ifndef likely 101 | # define likely(expr) (expr) 102 | #endif 103 | 104 | /* 
unlikely(expr) - hint that an expression is usually false */ 105 | #ifndef unlikely 106 | # define unlikely(expr) (expr) 107 | #endif 108 | 109 | /* prefetchr(addr) - prefetch into L1 cache for read */ 110 | #ifndef prefetchr 111 | # define prefetchr(addr) 112 | #endif 113 | 114 | /* prefetchw(addr) - prefetch into L1 cache for write */ 115 | #ifndef prefetchw 116 | # define prefetchw(addr) 117 | #endif 118 | 119 | /* Does the compiler support the 'target' function attribute? */ 120 | #ifndef COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 121 | # define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 122 | #endif 123 | 124 | /* Does the compiler support __attribute__((target("bmi2")))? */ 125 | #ifndef COMPILER_SUPPORTS_BMI2_TARGET 126 | # define COMPILER_SUPPORTS_BMI2_TARGET 0 127 | #endif 128 | 129 | /* ========================================================================== */ 130 | /* Miscellaneous macros */ 131 | /* ========================================================================== */ 132 | 133 | #define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) 134 | #define MIN(a, b) ((a) <= (b) ? (a) : (b)) 135 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) 136 | #define MAX3(a, b, c) MAX((a), MAX((b), (c))) 137 | #define MAX4(a, b, c, d) MAX((a), MAX3((b), (c), (d))) 138 | #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) 139 | #define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) 140 | 141 | /* ========================================================================== */ 142 | /* Endianness handling */ 143 | /* ========================================================================== */ 144 | 145 | /* 146 | * CPU_IS_LITTLE_ENDIAN() - a macro which evaluates to 1 if the CPU is little 147 | * endian or 0 if it is big endian. The macro should be defined in a way such 148 | * that the compiler can evaluate it at compilation time. If not defined, a 149 | * fallback is used. 
150 | */ 151 | #ifndef CPU_IS_LITTLE_ENDIAN 152 | static forceinline int CPU_IS_LITTLE_ENDIAN(void) 153 | { 154 | union { 155 | unsigned int v; 156 | unsigned char b; 157 | } u; 158 | u.v = 1; 159 | return u.b; 160 | } 161 | #endif 162 | 163 | /* bswap16(n) - swap the bytes of a 16-bit integer */ 164 | #ifndef bswap16 165 | static forceinline u16 bswap16(u16 n) 166 | { 167 | return (n << 8) | (n >> 8); 168 | } 169 | #endif 170 | 171 | /* bswap32(n) - swap the bytes of a 32-bit integer */ 172 | #ifndef bswap32 173 | static forceinline u32 bswap32(u32 n) 174 | { 175 | return ((n & 0x000000FF) << 24) | 176 | ((n & 0x0000FF00) << 8) | 177 | ((n & 0x00FF0000) >> 8) | 178 | ((n & 0xFF000000) >> 24); 179 | } 180 | #endif 181 | 182 | /* bswap64(n) - swap the bytes of a 64-bit integer */ 183 | #ifndef bswap64 184 | static forceinline u64 bswap64(u64 n) 185 | { 186 | return ((n & 0x00000000000000FF) << 56) | 187 | ((n & 0x000000000000FF00) << 40) | 188 | ((n & 0x0000000000FF0000) << 24) | 189 | ((n & 0x00000000FF000000) << 8) | 190 | ((n & 0x000000FF00000000) >> 8) | 191 | ((n & 0x0000FF0000000000) >> 24) | 192 | ((n & 0x00FF000000000000) >> 40) | 193 | ((n & 0xFF00000000000000) >> 56); 194 | } 195 | #endif 196 | 197 | #define le16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap16(n)) 198 | #define le32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap32(n)) 199 | #define le64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap64(n)) 200 | 201 | /* ========================================================================== */ 202 | /* Unaligned memory accesses */ 203 | /* ========================================================================== */ 204 | 205 | /* 206 | * UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses 207 | * can be performed efficiently on the target platform. 
208 | */ 209 | #ifndef UNALIGNED_ACCESS_IS_FAST 210 | # define UNALIGNED_ACCESS_IS_FAST 0 211 | #endif 212 | 213 | /* 214 | * DEFINE_UNALIGNED_TYPE(type) - a macro that, given an integer type 'type', 215 | * defines load_type_unaligned(addr) and store_type_unaligned(v, addr) functions 216 | * which load and store variables of type 'type' from/to unaligned memory 217 | * addresses. If not defined, a fallback is used. 218 | */ 219 | #ifndef DEFINE_UNALIGNED_TYPE 220 | 221 | /* 222 | * Although memcpy() may seem inefficient, it *usually* gets optimized 223 | * appropriately by modern compilers. It's portable and may be the best we can 224 | * do for a fallback... 225 | */ 226 | #include 227 | 228 | #define DEFINE_UNALIGNED_TYPE(type) \ 229 | \ 230 | static forceinline type \ 231 | load_##type##_unaligned(const void *p) \ 232 | { \ 233 | type v; \ 234 | memcpy(&v, p, sizeof(v)); \ 235 | return v; \ 236 | } \ 237 | \ 238 | static forceinline void \ 239 | store_##type##_unaligned(type v, void *p) \ 240 | { \ 241 | memcpy(p, &v, sizeof(v)); \ 242 | } 243 | 244 | #endif /* !DEFINE_UNALIGNED_TYPE */ 245 | 246 | /* ========================================================================== */ 247 | /* Bit scan functions */ 248 | /* ========================================================================== */ 249 | 250 | /* 251 | * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least 252 | * significant end) of the *most* significant 1 bit in the input value. The 253 | * input value must be nonzero! 
254 | */ 255 | 256 | #ifndef bsr32 257 | static forceinline unsigned 258 | bsr32(u32 n) 259 | { 260 | unsigned i = 0; 261 | while ((n >>= 1) != 0) 262 | i++; 263 | return i; 264 | } 265 | #endif 266 | 267 | #ifndef bsr64 268 | static forceinline unsigned 269 | bsr64(u64 n) 270 | { 271 | unsigned i = 0; 272 | while ((n >>= 1) != 0) 273 | i++; 274 | return i; 275 | } 276 | #endif 277 | 278 | static forceinline unsigned 279 | bsrw(machine_word_t n) 280 | { 281 | STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); 282 | if (WORDBITS == 32) 283 | return bsr32(n); 284 | else 285 | return bsr64(n); 286 | } 287 | 288 | /* 289 | * Bit Scan Forward (BSF) - find the 0-based index (relative to the least 290 | * significant end) of the *least* significant 1 bit in the input value. The 291 | * input value must be nonzero! 292 | */ 293 | 294 | #ifndef bsf32 295 | static forceinline unsigned 296 | bsf32(u32 n) 297 | { 298 | unsigned i = 0; 299 | while ((n & 1) == 0) { 300 | i++; 301 | n >>= 1; 302 | } 303 | return i; 304 | } 305 | #endif 306 | 307 | #ifndef bsf64 308 | static forceinline unsigned 309 | bsf64(u64 n) 310 | { 311 | unsigned i = 0; 312 | while ((n & 1) == 0) { 313 | i++; 314 | n >>= 1; 315 | } 316 | return i; 317 | } 318 | #endif 319 | 320 | static forceinline unsigned 321 | bsfw(machine_word_t n) 322 | { 323 | STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); 324 | if (WORDBITS == 32) 325 | return bsf32(n); 326 | else 327 | return bsf64(n); 328 | } 329 | 330 | #endif /* COMMON_COMMON_DEFS_H */ 331 | -------------------------------------------------------------------------------- /programs/prog_util.c: -------------------------------------------------------------------------------- 1 | /* 2 | * prog_util.c - utility functions for programs 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software 
without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "prog_util.h" 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #ifdef _WIN32 35 | # include 36 | #else 37 | # include 38 | # include 39 | #endif 40 | 41 | /* The invocation name of the program (filename component only) */ 42 | const tchar *program_invocation_name; 43 | 44 | static void 45 | do_msg(const char *format, bool with_errno, va_list va) 46 | { 47 | int saved_errno = errno; 48 | 49 | fprintf(stderr, "%"TS": ", program_invocation_name); 50 | vfprintf(stderr, format, va); 51 | if (with_errno) 52 | fprintf(stderr, ": %s\n", strerror(saved_errno)); 53 | else 54 | fprintf(stderr, "\n"); 55 | 56 | errno = saved_errno; 57 | } 58 | 59 | /* Print a message to standard error */ 60 | void 61 | msg(const char *format, ...) 62 | { 63 | va_list va; 64 | 65 | va_start(va, format); 66 | do_msg(format, false, va); 67 | va_end(va); 68 | } 69 | 70 | /* Print a message to standard error, including a description of errno */ 71 | void 72 | msg_errno(const char *format, ...) 
73 | { 74 | va_list va; 75 | 76 | va_start(va, format); 77 | do_msg(format, true, va); 78 | va_end(va); 79 | } 80 | 81 | /* malloc() wrapper */ 82 | void * 83 | xmalloc(size_t size) 84 | { 85 | void *p = malloc(size); 86 | if (p == NULL && size == 0) 87 | p = malloc(1); 88 | if (p == NULL) 89 | msg("Out of memory"); 90 | return p; 91 | } 92 | 93 | /* 94 | * Retrieve the current time in nanoseconds since a start time which is fixed 95 | * for the duration of program execution but is otherwise unspecified 96 | */ 97 | u64 98 | current_time(void) 99 | { 100 | #ifdef _WIN32 101 | FILETIME ft; 102 | GetSystemTimeAsFileTime(&ft); 103 | return 100 * (((u64)ft.dwHighDateTime << 32) | ft.dwLowDateTime); 104 | #elif defined(HAVE_CLOCK_GETTIME) 105 | struct timespec ts; 106 | clock_gettime(CLOCK_MONOTONIC, &ts); 107 | return (1000000000 * (u64)ts.tv_sec) + ts.tv_nsec; 108 | #else 109 | struct timeval tv; 110 | gettimeofday(&tv, NULL); 111 | return (1000000000 * (u64)tv.tv_sec) + (1000 * (u64)tv.tv_usec); 112 | #endif 113 | } 114 | 115 | /* 116 | * Retrieve a pointer to the filename component of the specified path. 117 | * 118 | * Note: this does not modify the path. Therefore, it is not guaranteed to work 119 | * properly for directories, since a path to a directory might have trailing 120 | * slashes. 
121 | */ 122 | const tchar * 123 | get_filename(const tchar *path) 124 | { 125 | const tchar *slash = tstrrchr(path, '/'); 126 | #ifdef _WIN32 127 | const tchar *backslash = tstrrchr(path, '\\'); 128 | if (backslash != NULL && (slash == NULL || backslash > slash)) 129 | slash = backslash; 130 | #endif 131 | if (slash != NULL) 132 | return slash + 1; 133 | return path; 134 | } 135 | 136 | /* Create a copy of 'path' surrounded by double quotes */ 137 | static tchar * 138 | quote_path(const tchar *path) 139 | { 140 | size_t len = tstrlen(path); 141 | tchar *result; 142 | 143 | result = xmalloc((1 + len + 1 + 1) * sizeof(tchar)); 144 | if (result == NULL) 145 | return NULL; 146 | result[0] = '"'; 147 | tmemcpy(&result[1], path, len); 148 | result[1 + len] = '"'; 149 | result[1 + len + 1] = '\0'; 150 | return result; 151 | } 152 | 153 | /* Open a file for reading, or set up standard input for reading */ 154 | int 155 | xopen_for_read(const tchar *path, struct file_stream *strm) 156 | { 157 | if (path == NULL) { 158 | strm->is_standard_stream = true; 159 | strm->name = T("standard input"); 160 | strm->fd = STDIN_FILENO; 161 | #ifdef _WIN32 162 | _setmode(strm->fd, O_BINARY); 163 | #endif 164 | return 0; 165 | } 166 | 167 | strm->is_standard_stream = false; 168 | 169 | strm->name = quote_path(path); 170 | if (strm->name == NULL) 171 | return -1; 172 | 173 | strm->fd = topen(path, O_RDONLY | O_BINARY | O_NOFOLLOW); 174 | if (strm->fd < 0) { 175 | msg_errno("Can't open %"TS" for reading", strm->name); 176 | free(strm->name); 177 | return -1; 178 | } 179 | 180 | return 0; 181 | } 182 | 183 | /* Open a file for writing, or set up standard output for writing */ 184 | int 185 | xopen_for_write(const tchar *path, bool overwrite, struct file_stream *strm) 186 | { 187 | if (path == NULL) { 188 | strm->is_standard_stream = true; 189 | strm->name = T("standard output"); 190 | strm->fd = STDOUT_FILENO; 191 | #ifdef _WIN32 192 | _setmode(strm->fd, O_BINARY); 193 | #endif 194 | return 
0; 195 | } 196 | 197 | strm->is_standard_stream = false; 198 | 199 | strm->name = quote_path(path); 200 | if (strm->name == NULL) 201 | goto err; 202 | retry: 203 | strm->fd = topen(path, O_WRONLY | O_BINARY | O_NOFOLLOW | 204 | O_CREAT | O_EXCL, 0644); 205 | if (strm->fd < 0) { 206 | if (errno != EEXIST) { 207 | msg_errno("Can't open %"TS" for writing", strm->name); 208 | goto err; 209 | } 210 | if (!overwrite) { 211 | if (!isatty(STDERR_FILENO) || !isatty(STDIN_FILENO)) { 212 | msg("%"TS" already exists; use -f to overwrite", 213 | strm->name); 214 | goto err; 215 | } 216 | fprintf(stderr, "%"TS": %"TS" already exists; " 217 | "overwrite? (y/n) ", 218 | program_invocation_name, strm->name); 219 | if (getchar() != 'y') { 220 | msg("Not overwriting."); 221 | goto err; 222 | } 223 | } 224 | if (tunlink(path) != 0) { 225 | msg_errno("Unable to delete %"TS, strm->name); 226 | goto err; 227 | } 228 | goto retry; 229 | } 230 | 231 | return 0; 232 | 233 | err: 234 | free(strm->name); 235 | return -1; 236 | } 237 | 238 | /* 239 | * Read from a file, returning the full count to indicate all bytes were read, a 240 | * short count (possibly 0) to indicate EOF, or -1 to indicate error. 
241 | */ 242 | ssize_t 243 | xread(struct file_stream *strm, void *buf, size_t count) 244 | { 245 | char *p = buf; 246 | size_t orig_count = count; 247 | 248 | while (count != 0) { 249 | ssize_t res = read(strm->fd, p, MIN(count, INT_MAX)); 250 | if (res == 0) 251 | break; 252 | if (res < 0) { 253 | msg_errno("Error reading from %"TS, strm->name); 254 | return -1; 255 | } 256 | p += res; 257 | count -= res; 258 | } 259 | return orig_count - count; 260 | } 261 | 262 | /* Skip over 'count' bytes from a file, returning 0 on success or -1 on error */ 263 | int 264 | skip_bytes(struct file_stream *strm, size_t count) 265 | { 266 | size_t bufsize; 267 | char *buffer; 268 | ssize_t ret; 269 | 270 | if (count == 0) 271 | return 0; 272 | 273 | bufsize = MIN(count, 4096); 274 | buffer = xmalloc(bufsize); 275 | if (buffer == NULL) 276 | return -1; 277 | do { 278 | size_t n = MIN(count, bufsize); 279 | ret = xread(strm, buffer, n); 280 | if (ret < 0) 281 | goto out; 282 | if (ret != n) { 283 | msg("%"TS": unexpected end-of-file", strm->name); 284 | ret = -1; 285 | goto out; 286 | } 287 | count -= ret; 288 | } while (count != 0); 289 | ret = 0; 290 | out: 291 | free(buffer); 292 | return ret; 293 | } 294 | 295 | /* Write to a file, returning 0 if all bytes were written or -1 on error */ 296 | int 297 | full_write(struct file_stream *strm, const void *buf, size_t count) 298 | { 299 | const char *p = buf; 300 | 301 | while (count != 0) { 302 | ssize_t res = write(strm->fd, p, MIN(count, INT_MAX)); 303 | if (res <= 0) { 304 | msg_errno("Error writing to %"TS, strm->name); 305 | return -1; 306 | } 307 | p += res; 308 | count -= res; 309 | } 310 | return 0; 311 | } 312 | 313 | /* Close a file, returning 0 on success or -1 on error */ 314 | int 315 | xclose(struct file_stream *strm) 316 | { 317 | int ret = 0; 318 | if (strm->fd >= 0 && !strm->is_standard_stream) { 319 | if (close(strm->fd) != 0) { 320 | msg_errno("Error closing %"TS, strm->name); 321 | ret = -1; 322 | } 323 | 
free(strm->name); 324 | } 325 | strm->fd = -1; 326 | strm->name = NULL; 327 | return ret; 328 | } 329 | 330 | /* 331 | * Parse the chunk size given on the command line, returning the chunk size on 332 | * success or 0 on error 333 | */ 334 | u32 335 | parse_chunk_size(const tchar *arg) 336 | { 337 | tchar *tmp; 338 | unsigned long chunk_size = tstrtoul(arg, &tmp, 10); 339 | 340 | if (chunk_size < 1024 || chunk_size > 67108864 || *tmp != '\0') { 341 | msg("Invalid chunk size : \"%"TS"\". " 342 | "Must be an integer in the range [1024, 67108864]", arg); 343 | return 0; 344 | } 345 | 346 | return chunk_size; 347 | } 348 | 349 | /* 350 | * Parse the compression level given on the command line, returning the 351 | * compression level on success or 0 on error 352 | */ 353 | int 354 | parse_compression_level(const tchar *arg) 355 | { 356 | tchar *tmp; 357 | unsigned long level = tstrtoul(arg, &tmp, 10); 358 | 359 | if (level < 1 || level > 9 || *tmp != '\0') { 360 | msg("Invalid compression level: \"%"TS"\". 
" 361 | "Must be an integer in the range [1, 9].", arg); 362 | return 0; 363 | } 364 | 365 | return level; 366 | } 367 | 368 | /* Allocate a new XPACK compressor */ 369 | struct xpack_compressor * 370 | alloc_compressor(u32 chunk_size, int level) 371 | { 372 | struct xpack_compressor *c; 373 | 374 | c = xpack_alloc_compressor(chunk_size, level); 375 | if (c == NULL) { 376 | msg_errno("Unable to allocate compressor with " 377 | "chunk size %"PRIu32" and compression level %d", 378 | chunk_size, level); 379 | } 380 | return c; 381 | } 382 | 383 | /* Allocate a new XPACK decompressor */ 384 | struct xpack_decompressor * 385 | alloc_decompressor(void) 386 | { 387 | struct xpack_decompressor *d; 388 | 389 | d = xpack_alloc_decompressor(); 390 | if (d == NULL) 391 | msg_errno("Unable to allocate decompressor"); 392 | 393 | return d; 394 | } 395 | -------------------------------------------------------------------------------- /lib/hc_matchfinder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists 3 | * 4 | * --------------------------------------------------------------------------- 5 | * 6 | * Algorithm 7 | * 8 | * This is a Hash Chains (hc) based matchfinder. 9 | * 10 | * The main data structure is a hash table where each hash bucket contains a 11 | * linked list (or "chain") of sequences whose first 4 bytes share the same hash 12 | * code. Each sequence is identified by its starting position in the input 13 | * buffer. 14 | * 15 | * The algorithm processes the input buffer sequentially. At each byte 16 | * position, the hash code of the first 4 bytes of the sequence beginning at 17 | * that position (the sequence being matched against) is computed. This 18 | * identifies the hash bucket to use for that position. Then, this hash 19 | * bucket's linked list is searched for matches. 
Then, a new linked list node 20 | * is created to represent the current sequence and is prepended to the list. 21 | * 22 | * This algorithm has several useful properties: 23 | * 24 | * - It only finds true Lempel-Ziv matches; i.e., those where the matching 25 | * sequence occurs prior to the sequence being matched against. 26 | * 27 | * - The sequences in each linked list are always sorted by decreasing starting 28 | * position. Therefore, the closest (smallest offset) matches are found 29 | * first, which in many compression formats tend to be the cheapest to encode. 30 | * 31 | * - Although fast running time is not guaranteed due to the possibility of the 32 | * lists getting very long, the worst degenerate behavior can be easily 33 | * prevented by capping the number of nodes searched at each position. 34 | * 35 | * - If the compressor decides not to search for matches at a certain position, 36 | * then that position can be quickly inserted without searching the list. 37 | * 38 | * - The algorithm is adaptable to sliding windows: just store the positions 39 | * relative to a "base" value that is updated from time to time, and stop 40 | * searching each list when the sequences get too far away. 41 | * 42 | * --------------------------------------------------------------------------- 43 | * 44 | * Notes on usage 45 | * 46 | * The number of bytes that must be allocated for a given 'struct 47 | * hc_matchfinder' must be gotten by calling hc_matchfinder_size(). 48 | * 49 | * ---------------------------------------------------------------------------- 50 | * 51 | * Optimizations 52 | * 53 | * The main hash table and chains handle length 4+ matches. Length 3 matches 54 | * are handled by a separate hash table with no chains. This works well for 55 | * typical "greedy" or "lazy"-style compressors, where length 3 matches are 56 | * often only helpful if they have small offsets. 
Instead of searching a full 57 | * chain for length 3+ matches, the algorithm just checks for one close length 3 58 | * match, then focuses on finding length 4+ matches. 59 | * 60 | * The longest_match() and skip_positions() functions are inlined into the 61 | * compressors that use them. This isn't just about saving the overhead of a 62 | * function call. These functions are intended to be called from the inner 63 | * loops of compressors, where giving the compiler more control over register 64 | * allocation is very helpful. There is also significant benefit to be gained 65 | * from allowing the CPU to predict branches independently at each call site. 66 | * For example, "lazy"-style compressors can be written with two calls to 67 | * longest_match(), each of which starts with a different 'best_len' and 68 | * therefore has significantly different performance characteristics. 69 | * 70 | * Although any hash function can be used, a multiplicative hash is fast and 71 | * works well. 72 | * 73 | * On some processors, it is significantly faster to extend matches by whole 74 | * words (32 or 64 bits) instead of by individual bytes. For this to be the 75 | * case, the processor must implement unaligned memory accesses efficiently and 76 | * must have either a fast "find first set bit" instruction or a fast "find last 77 | * set bit" instruction, depending on the processor's endianness. 78 | * 79 | * The code uses one loop for finding the first match and one loop for finding a 80 | * longer match. Each of these loops is tuned for its respective task and in 81 | * combination are faster than a single generalized loop that handles both 82 | * tasks. 83 | * 84 | * The code also uses a tight inner loop that only compares the last and first 85 | * bytes of a potential match. It is only when these bytes match that a full 86 | * match extension is attempted. 
87 | * 88 | * ---------------------------------------------------------------------------- 89 | */ 90 | 91 | #include 92 | 93 | #include "lz_extend.h" 94 | #include "lz_hash.h" 95 | #include "unaligned.h" 96 | 97 | #define HC_MATCHFINDER_HASH3_ORDER 15 98 | #define HC_MATCHFINDER_HASH4_ORDER 16 99 | 100 | struct hc_matchfinder { 101 | 102 | /* The hash table for finding length 3 matches */ 103 | u32 hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; 104 | 105 | /* The hash table which contains the first nodes of the linked lists for 106 | * finding length 4+ matches */ 107 | u32 hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; 108 | 109 | /* The "next node" references for the linked lists. The "next node" of 110 | * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ 111 | u32 next_tab[]; 112 | }; 113 | 114 | /* 115 | * Return the number of bytes that must be allocated for a 'hc_matchfinder' that 116 | * can work with buffers up to the specified size. 117 | */ 118 | static forceinline size_t 119 | hc_matchfinder_size(size_t max_bufsize) 120 | { 121 | return sizeof(struct hc_matchfinder) + (max_bufsize * sizeof(u32)); 122 | } 123 | 124 | /* Prepare the matchfinder for a new input buffer. */ 125 | static forceinline void 126 | hc_matchfinder_init(struct hc_matchfinder *mf) 127 | { 128 | memset(mf, 0, sizeof(*mf)); 129 | } 130 | 131 | /* 132 | * Find the longest match longer than 'best_len' bytes. 133 | * 134 | * @mf 135 | * The matchfinder structure. 136 | * @in_begin 137 | * Pointer to the beginning of the input buffer. 138 | * @cur_pos 139 | * The current position in the input buffer (the position of the sequence 140 | * being matched against). 141 | * @best_len 142 | * Require a match longer than this length. 143 | * @max_len 144 | * The maximum permissible match length at this position. 145 | * @nice_len 146 | * Stop searching if a match of at least this length is found. 147 | * Must be <= @max_len. 
148 | * @max_search_depth 149 | * Limit on the number of potential matches to consider. Must be >= 1. 150 | * @next_hashes 151 | * The precomputed hash codes for the sequence beginning at @in_next. 152 | * These will be used and then updated with the precomputed hashcodes for 153 | * the sequence beginning at @in_next + 1. 154 | * @offset_ret 155 | * If a match is found, its offset is returned in this location. 156 | * 157 | * Return the length of the match found, or 'best_len' if no match longer than 158 | * 'best_len' was found. 159 | */ 160 | static forceinline u32 161 | hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, 162 | const u8 * const restrict in_begin, 163 | const ptrdiff_t cur_pos, 164 | u32 best_len, 165 | const u32 max_len, 166 | const u32 nice_len, 167 | const u32 max_search_depth, 168 | u32 next_hashes[restrict 2], 169 | u32 * const restrict offset_ret) 170 | { 171 | const u8 *in_next = in_begin + cur_pos; 172 | u32 depth_remaining = max_search_depth; 173 | const u8 *best_matchptr = in_next; 174 | u32 cur_node3, cur_node4; 175 | u32 hash3, hash4; 176 | u32 next_seq3, next_seq4; 177 | u32 seq4; 178 | const u8 *matchptr; 179 | u32 len; 180 | 181 | if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */ 182 | goto out; 183 | 184 | /* Get the precomputed hash codes */ 185 | hash3 = next_hashes[0]; 186 | hash4 = next_hashes[1]; 187 | 188 | /* From the hash buckets, get the first node of each linked list. */ 189 | cur_node3 = mf->hash3_tab[hash3]; 190 | cur_node4 = mf->hash4_tab[hash4]; 191 | 192 | /* Update for length 3 matches. This replaces the singleton node in the 193 | * 'hash3' bucket with the node for the current sequence. */ 194 | mf->hash3_tab[hash3] = cur_pos; 195 | 196 | /* Update for length 4 matches. This prepends the node for the current 197 | * sequence to the linked list in the 'hash4' bucket. 
*/ 198 | mf->hash4_tab[hash4] = cur_pos; 199 | mf->next_tab[cur_pos] = cur_node4; 200 | 201 | /* Compute the next hash codes */ 202 | next_seq4 = load_u32_unaligned(in_next + 1); 203 | next_seq3 = loaded_u32_to_u24(next_seq4); 204 | next_hashes[0] = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER); 205 | next_hashes[1] = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER); 206 | prefetchw(&mf->hash3_tab[next_hashes[0]]); 207 | prefetchw(&mf->hash4_tab[next_hashes[1]]); 208 | 209 | if (best_len < 4) { /* No match of length >= 4 found yet? */ 210 | 211 | /* Check for a length 3 match if needed */ 212 | 213 | if (!cur_node3) 214 | goto out; 215 | 216 | seq4 = load_u32_unaligned(in_next); 217 | 218 | if (best_len < 3) { 219 | matchptr = &in_begin[cur_node3]; 220 | if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { 221 | best_len = 3; 222 | best_matchptr = matchptr; 223 | } 224 | } 225 | 226 | /* Check for a length 4 match */ 227 | 228 | if (!cur_node4) 229 | goto out; 230 | 231 | for (;;) { 232 | /* No length 4 match found yet. Check the first 4 bytes. */ 233 | matchptr = &in_begin[cur_node4]; 234 | 235 | if (load_u32_unaligned(matchptr) == seq4) 236 | break; 237 | 238 | /* The first 4 bytes did not match. Keep trying. */ 239 | cur_node4 = mf->next_tab[cur_node4]; 240 | if (!cur_node4 || !--depth_remaining) 241 | goto out; 242 | } 243 | 244 | /* Found a match of length >= 4. Extend it to its full length. */ 245 | best_matchptr = matchptr; 246 | best_len = lz_extend(in_next, best_matchptr, 4, max_len); 247 | if (best_len >= nice_len) 248 | goto out; 249 | cur_node4 = mf->next_tab[cur_node4]; 250 | if (!cur_node4 || !--depth_remaining) 251 | goto out; 252 | } else { 253 | if (!cur_node4 || best_len >= nice_len) 254 | goto out; 255 | } 256 | 257 | /* Check for matches of length >= 5 */ 258 | 259 | for (;;) { 260 | for (;;) { 261 | matchptr = &in_begin[cur_node4]; 262 | 263 | /* Already found a length 4 match. 
Try for a longer 264 | * match; start by checking either the last 4 bytes and 265 | * the first 4 bytes, or the last byte. (The last byte, 266 | * the one which would extend the match length by 1, is 267 | * the most important.) */ 268 | #if UNALIGNED_ACCESS_IS_FAST 269 | if ((load_u32_unaligned(matchptr + best_len - 3) == 270 | load_u32_unaligned(in_next + best_len - 3)) && 271 | (load_u32_unaligned(matchptr) == 272 | load_u32_unaligned(in_next))) 273 | #else 274 | if (matchptr[best_len] == in_next[best_len]) 275 | #endif 276 | break; 277 | 278 | /* Continue to the next node in the list */ 279 | cur_node4 = mf->next_tab[cur_node4]; 280 | if (!cur_node4 || !--depth_remaining) 281 | goto out; 282 | } 283 | 284 | #if UNALIGNED_ACCESS_IS_FAST 285 | len = 4; 286 | #else 287 | len = 0; 288 | #endif 289 | len = lz_extend(in_next, matchptr, len, max_len); 290 | if (len > best_len) { 291 | /* This is the new longest match */ 292 | best_len = len; 293 | best_matchptr = matchptr; 294 | if (best_len >= nice_len) 295 | goto out; 296 | } 297 | 298 | /* Continue to the next node in the list */ 299 | cur_node4 = mf->next_tab[cur_node4]; 300 | if (!cur_node4 || !--depth_remaining) 301 | goto out; 302 | } 303 | out: 304 | *offset_ret = in_next - best_matchptr; 305 | return best_len; 306 | } 307 | 308 | /* 309 | * Advance the matchfinder, but don't search for matches. 310 | * 311 | * @mf 312 | * The matchfinder structure. 313 | * @in_begin 314 | * Pointer to the beginning of the input buffer. 315 | * @cur_pos 316 | * The current position in the input buffer (the position of the sequence 317 | * being matched against). 318 | * @end_pos 319 | * The length of the input buffer. 320 | * @next_hashes 321 | * The precomputed hash codes for the sequence beginning at @in_next. 322 | * These will be used and then updated with the precomputed hashcodes for 323 | * the sequence beginning at @in_next + @count. 324 | * @count 325 | * The number of bytes to advance. Must be > 0. 
326 | * 327 | * Returns @in_next + @count. 328 | */ 329 | static forceinline const u8 * 330 | hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf, 331 | const u8 * const restrict in_begin, 332 | const ptrdiff_t cur_pos, 333 | const ptrdiff_t end_pos, 334 | const u32 count, 335 | u32 next_hashes[restrict 2]) 336 | { 337 | const u8 *in_next = in_begin + cur_pos; 338 | const u8 * const stop_ptr = in_next + count; 339 | 340 | if (likely(count + 5 <= end_pos - cur_pos)) { 341 | u32 hash3, hash4; 342 | u32 next_seq3, next_seq4; 343 | 344 | hash3 = next_hashes[0]; 345 | hash4 = next_hashes[1]; 346 | do { 347 | mf->hash3_tab[hash3] = in_next - in_begin; 348 | mf->next_tab[in_next - in_begin] = mf->hash4_tab[hash4]; 349 | mf->hash4_tab[hash4] = in_next - in_begin; 350 | 351 | next_seq4 = load_u32_unaligned(++in_next); 352 | next_seq3 = loaded_u32_to_u24(next_seq4); 353 | hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER); 354 | hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER); 355 | 356 | } while (in_next != stop_ptr); 357 | 358 | prefetchw(&mf->hash3_tab[hash3]); 359 | prefetchw(&mf->hash4_tab[hash4]); 360 | next_hashes[0] = hash3; 361 | next_hashes[1] = hash4; 362 | } 363 | 364 | return stop_ptr; 365 | } 366 | -------------------------------------------------------------------------------- /lib/decompress_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * decompress_impl.h - XPACK decompression implementation 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, 
subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * This is the actual decompression routine, lifted out of xpack_decompress.c so
 * that it can be compiled with different target instruction sets.
 */

/*
 * Decompress a buffer of XPACK-compressed data (one or more blocks).
 *
 * @d			decompressor state: FSE state counts and decode tables
 * @in, @in_nbytes	the compressed input buffer
 * @out, @out_nbytes_avail	the output buffer for the uncompressed data
 * @actual_out_nbytes_ret	if non-NULL, receives the actual uncompressed
 *				size; if NULL, the output must fill the buffer
 *				exactly or DECOMPRESS_SHORT_OUTPUT is returned
 *
 * Returns DECOMPRESS_SUCCESS, DECOMPRESS_INSUFFICIENT_SPACE, or
 * DECOMPRESS_SHORT_OUTPUT; SAFETY_CHECK() failures return early with a
 * bad-data result (macro defined by the includer — see xpack_decompress.c).
 */
static enum decompress_result ATTRIBUTES
FUNCNAME(struct xpack_decompressor * restrict d,
	 const void * restrict in, size_t in_nbytes,
	 void * restrict out, size_t out_nbytes_avail,
	 size_t *actual_out_nbytes_ret)
{
	const u8 *in_next = in;
	const u8 * const in_end = in_next + in_nbytes;
	u8 *out_next = out;
	u8 * const out_end = out_next + out_nbytes_avail;
	u8 *out_block_end;
	u32 recent_offsets[NUM_REPS];
#ifdef ENABLE_PREPROCESSING
	unsigned preprocessed = 0;
#endif
	machine_word_t bitbuf = 0;	/* bitstream buffer (used by ENSURE_BITS/POP_BITS) */
	unsigned bitsleft = 0;		/* number of valid bits in 'bitbuf' */
	size_t overrun_count = 0;	/* presumably tracks virtual bytes read past in_end by ENSURE_BITS — verify in macro */
	unsigned num_padding_bits;
	unsigned num_state_counts;
	unsigned is_final_block;
	unsigned block_type;
	size_t block_usize;
	s32 num_literals;	/* signed: goes negative on corrupt run lengths, caught by SAFETY_CHECK */
	u8 *literals;
	u8 *lits;
	u8 *lits_end;
	unsigned order;
	const u8 *extra_bytes;
	u32 num_extra_bytes;
	unsigned log2_num_literal_states;
	unsigned log2_num_litrunlen_states;
	unsigned log2_num_length_states;
	unsigned log2_num_offset_states;
	unsigned log2_num_aligned_states;
#if NUM_LITERAL_STREAMS == 2
	unsigned literal_state_1;
	unsigned literal_state_2;
#else
	unsigned literal_state;
#endif
	unsigned litrunlen_state;
	unsigned length_state;
	unsigned offset_state;
	unsigned aligned_state;
	unsigned i;
	u32 sym;	/* scratch — presumably referenced by DECODE_SYMBOL(); verify in macro */
	u32 bits;	/* scratch — presumably referenced by the bitstream macros; verify */

	init_recent_offsets(recent_offsets);

next_block:
	/* Starting to decompress the next block */

	ENSURE_BITS(1 + NUM_BLOCKTYPE_BITS + 1 + NUM_BLOCKSIZE_BITS);

	/* "final block" flag */
	is_final_block = POP_BITS(1);

	/* block type */
	block_type = POP_BITS(NUM_BLOCKTYPE_BITS);

	/* block uncompressed size */
	if (POP_BITS(1))
		block_usize = DEFAULT_BLOCK_SIZE;
	else
		block_usize = POP_BITS(NUM_BLOCKSIZE_BITS);

	SAFETY_CHECK(block_type == BLOCKTYPE_ALIGNED ||
		     block_type == BLOCKTYPE_VERBATIM);

	if (unlikely(block_usize > out_end - out_next))
		return DECOMPRESS_INSUFFICIENT_SPACE;

	SAFETY_CHECK(block_usize > 0);

	out_block_end = out_next + block_usize;

	/* Read the FSE state counts for each alphabet. */
	ENSURE_BITS(20);
	log2_num_literal_states = POP_BITS(4);
	log2_num_litrunlen_states = POP_BITS(4);
	log2_num_length_states = POP_BITS(4);
	log2_num_offset_states = POP_BITS(4);
	if (block_type == BLOCKTYPE_ALIGNED)
		log2_num_aligned_states = POP_BITS(4);
	else
		log2_num_aligned_states = 0;

	SAFETY_CHECK(log2_num_literal_states <= MAX_LOG2_NUM_LITERAL_STATES &&
		     log2_num_litrunlen_states <= MAX_LOG2_NUM_LITRUNLEN_STATES &&
		     log2_num_length_states <= MAX_LOG2_NUM_LENGTH_STATES &&
		     log2_num_offset_states <= MAX_LOG2_NUM_OFFSET_STATES &&
		     log2_num_aligned_states <= MAX_LOG2_NUM_ALIGNED_STATES);

	/* The loop below indexes d->state_counts[] straight through all
	 * alphabets, so the aligned counts must directly follow state_counts
	 * in memory. */
#ifndef _MSC_VER
	STATIC_ASSERT(offsetof(struct xpack_decompressor,
			       aligned_state_counts[ALIGNED_ALPHABET_SIZE]) ==
		      offsetof(struct xpack_decompressor, state_counts) +
		      sizeof(d->state_counts));
#endif

	num_state_counts = ARRAY_LEN(d->state_counts);
	if (block_type != BLOCKTYPE_ALIGNED)
		num_state_counts -= ALIGNED_ALPHABET_SIZE;

	for (i = 0; i < num_state_counts; ) {
		unsigned code;

		ENSURE_BITS(CODEBITS + MAX_EXTRA_CODEBITS);

		code = POP_BITS(CODEBITS);

		if (code < ZEROCODE1) {
			/* single nonzero count */
			d->state_counts[i++] = (1 << code) + POP_BITS(code);
		} else {
			unsigned num_zeroes;

			if (code == ZEROCODE1) {
				/* a few zeroes */
				num_zeroes = ZEROCODE1_MIN +
					     POP_BITS(ZEROCODE1_NBITS);
			} else {
				/* many zeroes */
				num_zeroes = ZEROCODE2_MIN +
					     POP_BITS(ZEROCODE2_NBITS);
			}
			SAFETY_CHECK(num_zeroes <= num_state_counts - i);
			do {
				d->state_counts[i++] = 0;
			} while (--num_zeroes);
		}
	}

#ifdef ENABLE_PREPROCESSING
	/* 0xE8 is the x86 CALL opcode; a nonzero count for it means the
	 * compressor may have preprocessed the data — postprocess at the end. */
	preprocessed |= d->literal_state_counts[0xE8];
#endif

	/* Prepare the extra_bytes pointer. */

	ENSURE_BITS(5);
	order = POP_BITS(5);
	STATIC_ASSERT(CAN_ENSURE(25));
	SAFETY_CHECK(order <= 25);
	ENSURE_BITS(order);
	num_extra_bytes = ((u32)1 << order) + POP_BITS(order) - 1;
	ALIGN_INPUT();
	SAFETY_CHECK(num_extra_bytes < in_end - in_next);
	extra_bytes = in_next;
	in_next += num_extra_bytes;

	/* Set up the FSE symbol input stream.  The first byte carries padding:
	 * a terminating 1 bit marks where the real data begins. */
	SAFETY_CHECK(*in_next != 0);
	num_padding_bits = 1 + bsf32(*in_next);
	bitbuf = *in_next++ >> num_padding_bits;
	bitsleft = 8 - num_padding_bits;

	/* Decode the literals.  They are decoded into the *end* of the block's
	 * output region and copied forward during literal-run decoding. */

	ENSURE_BITS(5);
	order = POP_BITS(5);
	STATIC_ASSERT(CAN_ENSURE(25));
	SAFETY_CHECK(order <= 25);
	ENSURE_BITS(order);
	num_literals = ((u32)1 << order) + POP_BITS(order) - 1;
	SAFETY_CHECK(num_literals <= out_block_end - out_next);
	literals = out_block_end - num_literals;

	SAFETY_CHECK(build_fse_decode_table(d->literal_decode_table,
					    d->literal_state_counts,
					    LITERAL_ALPHABET_SIZE,
					    log2_num_literal_states));

#if NUM_LITERAL_STREAMS == 2
	/* Two interleaved FSE streams: literals alternate between the states. */
	ENSURE_BITS(2 * MAX_LOG2_NUM_LITERAL_STATES);
	literal_state_1 = POP_BITS(log2_num_literal_states);
	literal_state_2 = POP_BITS(log2_num_literal_states);
	lits = literals;
	lits_end = literals + (num_literals & ~1);
	while (lits != lits_end) {
		ENSURE_BITS(2 * MAX_LOG2_NUM_LITERAL_STATES);
		*lits++ = DECODE_SYMBOL(literal_state_1, d->literal_decode_table);
		*lits++ = DECODE_SYMBOL(literal_state_2, d->literal_decode_table);
	}
	/* Odd literal count leaves one literal for stream 1
	 * (literals + num_literals == out_block_end by construction). */
	if (lits_end != out_block_end) {
		ENSURE_BITS(MAX_LOG2_NUM_LITERAL_STATES);
		*lits++ = DECODE_SYMBOL(literal_state_1, d->literal_decode_table);
	}
	SAFETY_CHECK(literal_state_1 == 0 && literal_state_2 == 0);
#else
	ENSURE_BITS(MAX_LOG2_NUM_LITERAL_STATES);
	literal_state = POP_BITS(log2_num_literal_states);
	lits = literals;
	lits_end = literals + num_literals;
	while (lits != lits_end) {
		ENSURE_BITS(MAX_LOG2_NUM_LITERAL_STATES);
		*lits++ = DECODE_SYMBOL(literal_state, d->literal_decode_table);
	}
	SAFETY_CHECK(literal_state == 0);
#endif

	/* Prepare to decode literal runs and matches */

	ENSURE_BITS(MAX_LOG2_NUM_LITRUNLEN_STATES + MAX_LOG2_NUM_LENGTH_STATES);
	litrunlen_state = POP_BITS(log2_num_litrunlen_states);
	length_state = POP_BITS(log2_num_length_states);

	ENSURE_BITS(MAX_LOG2_NUM_OFFSET_STATES + MAX_LOG2_NUM_ALIGNED_STATES);
	offset_state = POP_BITS(log2_num_offset_states);
	aligned_state = 0;
	if (block_type == BLOCKTYPE_ALIGNED)
		aligned_state = POP_BITS(log2_num_aligned_states);

	SAFETY_CHECK(build_fse_decode_table(d->litrunlen_decode_table,
					    d->litrunlen_state_counts,
					    LITRUNLEN_ALPHABET_SIZE,
					    log2_num_litrunlen_states));

	SAFETY_CHECK(build_fse_decode_table(d->length_decode_table,
					    d->length_state_counts,
					    LENGTH_ALPHABET_SIZE,
					    log2_num_length_states));

	SAFETY_CHECK(build_fse_decode_table(d->offset_decode_table,
					    d->offset_state_counts,
					    MAX_OFFSET_ALPHABET_SIZE,
					    log2_num_offset_states));

	if (block_type == BLOCKTYPE_ALIGNED) {
		SAFETY_CHECK(build_fse_decode_table(d->aligned_decode_table,
						    d->aligned_state_counts,
						    ALIGNED_ALPHABET_SIZE,
						    log2_num_aligned_states));
	}

	/* Decode literal runs and matches */

	for (;;) {
		u32 litrunlen;
		u32 length;
		u32 offset;
		unsigned offset_sym;

		STATIC_ASSERT(MAX_LOG2_NUM_LITRUNLEN_STATES +
			      MAX_LOG2_NUM_LENGTH_STATES +
			      MAX_LOG2_NUM_OFFSET_STATES <= 32);
		if (CAN_ENSURE(32))
			ENSURE_BITS(32);
		else
			ENSURE_BITS(MAX_LOG2_NUM_LITRUNLEN_STATES +
				    MAX_LOG2_NUM_LENGTH_STATES);

		/* BEGIN decode literal run */

		/* Decode the literal run length and copy the literals. */
		litrunlen = DECODE_SYMBOL(litrunlen_state,
					  d->litrunlen_decode_table);

#if 0	/* Unoptimized version */
		if (litrunlen == LITRUNLEN_ALPHABET_SIZE - 1) {
			SAFETY_CHECK(extra_bytes < in_end);
			litrunlen += *extra_bytes++;
			if (litrunlen == 0xFF + LITRUNLEN_ALPHABET_SIZE - 1) {
				SAFETY_CHECK(in_end - extra_bytes >= 3);
				litrunlen += (u32)*extra_bytes++ << 0;
				litrunlen += (u32)*extra_bytes++ << 8;
				litrunlen += (u32)*extra_bytes++ << 16;
			}
		}
		num_literals -= litrunlen;
		SAFETY_CHECK(num_literals >= 0);
		SAFETY_CHECK(out_next <= literals);
		while (litrunlen--)
			*out_next++ = *literals++;
		if (out_next == out_block_end) /* End of block? */
			break;
#else
		/* Any non-escape run fits in one 16-byte copy */
		STATIC_ASSERT(LITRUNLEN_ALPHABET_SIZE - 2 <= 15);
		if (UNALIGNED_ACCESS_IS_FAST &&
		    likely(num_literals >= 16 && literals - out_next >= 16 &&
			   litrunlen != LITRUNLEN_ALPHABET_SIZE - 1))
		{
			/* Fast case */
			copy_16_bytes_unaligned(literals, out_next);
			out_next += litrunlen;
			literals += litrunlen;
			num_literals -= litrunlen;
		} else {
			/* Slow case */
			const u32 cutoff = LITRUNLEN_ALPHABET_SIZE - 1;
			if (litrunlen == cutoff) {
				SAFETY_CHECK(extra_bytes < in_end);
				litrunlen += *extra_bytes++;
				if (litrunlen == 0xFF + cutoff) {
					SAFETY_CHECK(in_end - extra_bytes >= 3);
					litrunlen += (u32)*extra_bytes++ << 0;
					litrunlen += (u32)*extra_bytes++ << 8;
					litrunlen += (u32)*extra_bytes++ << 16;
				}
			}

			num_literals -= litrunlen;
			SAFETY_CHECK(num_literals >= 0);

			if (UNALIGNED_ACCESS_IS_FAST &&
			    likely(litrunlen + WORDBYTES <= literals - out_next &&
				   num_literals >= WORDBYTES))
			{
				const u8 *src = literals;
				u8 *dst = out_next;

				out_next += litrunlen;
				literals += litrunlen;
				do {
					copy_word_unaligned(src, dst);
					src += WORDBYTES;
					dst += WORDBYTES;
					litrunlen -= WORDBYTES;
				} while ((s32)litrunlen > 0);
			} else {
				while (litrunlen--)
					*out_next++ = *literals++;
			}

			if (out_next == out_block_end) /* End of block? */
				break;
		}
#endif
		/* END decode literal run */

		/* BEGIN decode match */

		/* Decode the length symbol */

		length = DECODE_SYMBOL(length_state, d->length_decode_table);

		/* Decode the offset symbol */

		if (!CAN_ENSURE(32))
			ENSURE_BITS(MAX_LOG2_NUM_OFFSET_STATES);
		offset_sym = DECODE_SYMBOL(offset_state, d->offset_decode_table);

		/* Decode the rest of the offset */

		if (offset_sym >= NUM_REPS) {

			/* Explicit offset */

			unsigned offset_log2 = offset_sym - NUM_REPS;

			offset = (u32)1 << offset_log2;

			if (block_type == BLOCKTYPE_ALIGNED &&
			    offset_log2 >= NUM_ALIGNED_BITS)
			{
				/* Low NUM_ALIGNED_BITS come from the aligned
				 * alphabet, the rest directly from the
				 * bitstream. */
				ENSURE_BITS(MAX_LOG2_NUM_ALIGNED_STATES +
					    offset_log2 - NUM_ALIGNED_BITS);

				offset += DECODE_SYMBOL(aligned_state,
							d->aligned_decode_table);
				offset += POP_BITS(offset_log2 -
						   NUM_ALIGNED_BITS) <<
					  NUM_ALIGNED_BITS;
			} else {
				ENSURE_BITS(offset_log2);
				offset += POP_BITS(offset_log2);
			}

			/* Shift the recent-offsets queue down by one */
			STATIC_ASSERT(NUM_REPS >= 1 && NUM_REPS <= 4);
		#if NUM_REPS >= 4
			recent_offsets[3] = recent_offsets[2];
		#endif
		#if NUM_REPS >= 3
			recent_offsets[2] = recent_offsets[1];
		#endif
		#if NUM_REPS >= 2
			recent_offsets[1] = recent_offsets[0];
		#endif
		} else {
			/* Repeat offset */
			offset = recent_offsets[offset_sym];
			recent_offsets[offset_sym] = recent_offsets[0];
		}

		recent_offsets[0] = offset;

		SAFETY_CHECK(offset <= out_next - (u8 *)out);

		/* Decode the remainder of the length and copy the match. */

		length += MIN_MATCH_LEN;

		if (UNALIGNED_ACCESS_IS_FAST && length <= 16 &&
		    offset >= length && literals - out_next >= 16)
		{
			/*
			 * Fast case: short length, no overlap, and we aren't
			 * getting too close to the literals portion of the
			 * output buffer.
			 */
			copy_16_bytes_unaligned(out_next - offset, out_next);
		} else {
			/*
			 * "Slow case" (but still very important): long length,
			 * or small offset, or we're getting close to the
			 * literals portion of the output buffer.
			 */
			const u32 cutoff = LENGTH_ALPHABET_SIZE - 1 + MIN_MATCH_LEN;
			const u8 *src;
			u8 *dst, *end;
			if (length == cutoff) {
				SAFETY_CHECK(extra_bytes < in_end);
				length += *extra_bytes++;
				if (length == 0xFF + cutoff) {
					SAFETY_CHECK(in_end - extra_bytes >= 3);
					length += (u32)*extra_bytes++ << 0;
					length += (u32)*extra_bytes++ << 8;
					length += (u32)*extra_bytes++ << 16;
				}
			}

			SAFETY_CHECK(length <= literals - out_next);

			src = out_next - offset;
			dst = out_next;
			end = out_next + length;

			if (UNALIGNED_ACCESS_IS_FAST &&
			    likely(literals - end >= WORDBYTES)) {
				if (offset >= WORDBYTES) {
					/* Word-at-a-time copy is safe: each
					 * word is fully written before it could
					 * be read back. */
					copy_word_unaligned(src, dst);
					src += WORDBYTES;
					dst += WORDBYTES;
					if (dst < end) {
						do {
							copy_word_unaligned(src, dst);
							src += WORDBYTES;
							dst += WORDBYTES;
						} while (dst < end);
					}
				} else if (offset == 1) {
					/* Run of a single byte: broadcast it */
					machine_word_t v = repeat_byte(*(dst - 1));
					do {
						store_word_unaligned(v, dst);
						src += WORDBYTES;
						dst += WORDBYTES;
					} while (dst < end);
				} else {
					/* Overlapping with 1 < offset < word:
					 * must copy byte by byte */
					do {
						*dst++ = *src++;
					} while (dst < end);
				}
			} else {
				do {
					*dst++ = *src++;
				} while (dst < end);
			}
		}

		out_next += length;

		/* END decode match */
	}

	/* A valid stream ends with every FSE state back at zero. */
	SAFETY_CHECK(litrunlen_state == 0 && length_state == 0 &&
		     offset_state == 0 && aligned_state == 0);

	ALIGN_INPUT();

	/* Finished decompressing a block. */
	if (!is_final_block)
		goto next_block;

	/* That was the final block. */

#ifdef ENABLE_PREPROCESSING
	/* Postprocess the data if needed. */
	if (preprocessed)
		postprocess(out, out_nbytes_avail);
#endif

	if (actual_out_nbytes_ret) {
		*actual_out_nbytes_ret = out_next - (u8 *)out;
	} else {
		if (out_next != out_end)
			return DECOMPRESS_SHORT_OUTPUT;
	}
	return DECOMPRESS_SUCCESS;
}
-------------------------------------------------------------------------------- /programs/xpack.c: --------------------------------------------------------------------------------
/*
 * xpack.c - a file compression and decompression program
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "prog_util.h"

/* NOTE(review): the include targets below were lost in extraction and have
 * been reconstructed from the functions used in this file (fstat()/S_ISREG()
 * need <sys/stat.h>, etc.) — verify against upstream. */
#include <errno.h>
#include <sys/stat.h>
#ifdef _WIN32
#  include <sys/utime.h>
#else
#  include <sys/time.h>
#  include <unistd.h>
#  include <utime.h>
#endif

/* Parsed command-line options */
struct options {
	bool to_stdout;		/* -c: write to standard output */
	bool decompress;	/* -d: decompress instead of compress */
	bool force;		/* -f: overwrite existing output files */
	bool keep;		/* -k: don't delete input files */
	int compression_level;	/* -1..-9 / -L */
	u32 chunk_size;		/* -s */
	const tchar *suffix;	/* -S: output filename suffix */
};

static const tchar *const optstring = T("123456789cdfhkL:s:S:V");

/* Print the usage summary to the given stream. */
static void
show_usage(FILE *fp)
{
	fprintf(fp,
"Usage: %"TS" [-123456789cdfhkV] [-L LVL] [-s SIZE] [-S SUF] [FILE]...\n"
"Compress or decompress the specified FILEs.\n"
"\n"
"Options:\n"
"  -1        fastest (worst) compression\n"
"  -9        slowest (best) compression\n"
"  -c        write to standard output\n"
"  -d        decompress\n"
"  -f        overwrite existing output files\n"
"  -h        print this help\n"
"  -k        don't delete input files\n"
"  -L LVL    compression level [1-9] (default 6)\n"
"  -s SIZE   chunk size (default 524288)\n"
"  -S SUF    use suffix .SUF instead of .xpack\n"
"  -V        show version and legal information\n"
"\n"
"NOTICE: this program is currently experimental, and the on-disk format\n"
"is not yet stable!\n",
	program_invocation_name);
}

/* Print version and license information to standard output. */
static void
show_version(void)
{
	printf(
"xpack compression program, experimental version\n"
"Copyright 2016 Eric Biggers\n"
"\n"
"This program is free software which may be modified and/or redistributed\n"
"under the terms of the MIT license.  There is NO WARRANTY, to the extent\n"
"permitted by law.  See the COPYING file for details.\n"
	);
}

/* Was the program invoked in decompression mode?  (Determined from the
 * executable name, gzip/gunzip style.) */
static bool
is_xunpack(void)
{
	if (tstrxcmp(program_invocation_name, T("xunpack")) == 0)
		return true;
#ifdef _WIN32
	if (tstrxcmp(program_invocation_name, T("xunpack.exe")) == 0)
		return true;
#endif
	return false;
}

/* Return a pointer to the final '.SUF' extension of 'path' if it matches
 * 'suffix', or NULL if it doesn't. */
static const tchar *
get_suffix(const tchar *path, const tchar *suffix)
{
	const tchar *dot = tstrrchr(get_filename(path), '.');

	if (dot != NULL && tstrxcmp(dot + 1, suffix) == 0)
		return dot;
	return NULL;
}

/* Does 'path' end with the extension '.SUF'? */
static bool
has_suffix(const tchar *path, const tchar *suffix)
{
	return get_suffix(path, suffix) != NULL;
}

/* On-disk per-file header; all multi-byte fields are little endian on disk
 * (see bswap_file_header()). */
struct xpack_file_header {
#define XPACK_MAGIC "XPACK\0\0\0"
	char magic[8];
	u32 chunk_size;
	u16 header_size;
	u8 version;
	u8 compression_level;
};

/* On-disk per-chunk header; stored_size == original_size means the chunk was
 * stored uncompressed. */
struct xpack_chunk_header {
	u32 stored_size;
	u32 original_size;
};

/* Convert the file header between native and little-endian byte order
 * (le*_bswap are their own inverses). */
static void
bswap_file_header(struct xpack_file_header *hdr)
{
	STATIC_ASSERT(sizeof(struct xpack_file_header) == 16);

	hdr->chunk_size = le32_bswap(hdr->chunk_size);
	hdr->header_size = le16_bswap(hdr->header_size);
}

/* Convert the chunk header between native and little-endian byte order. */
static void
bswap_chunk_header(struct xpack_chunk_header *hdr)
{
	STATIC_ASSERT(sizeof(struct xpack_chunk_header) == 8);

	hdr->stored_size = le32_bswap(hdr->stored_size);
	hdr->original_size = le32_bswap(hdr->original_size);
}

/* Write the xpack file header to 'out'.  Returns 0 on success or a negative
 * value on write error (from full_write()). */
static int
write_file_header(struct file_stream *out, u32 chunk_size, int compression_level)
{
	struct xpack_file_header hdr;

	memcpy(hdr.magic, XPACK_MAGIC, sizeof(hdr.magic));
	hdr.chunk_size = chunk_size;
	hdr.header_size = sizeof(hdr);
	hdr.version = 1;
	hdr.compression_level = compression_level;

	bswap_file_header(&hdr);
	return full_write(out, &hdr, sizeof(hdr));
}

/* Write a chunk header to 'out'.  Returns 0 on success or a negative value on
 * write error. */
static int
write_chunk_header(struct file_stream *out, u32 stored_size, u32 original_size)
{
	struct xpack_chunk_header hdr;

	hdr.stored_size = stored_size;
	hdr.original_size = original_size;

	bswap_chunk_header(&hdr);
	return full_write(out, &hdr, sizeof(hdr));
}

/*
 * Compress 'in' to 'out', chunk by chunk.  A chunk is stored compressed only
 * if compression actually shrinks it (the compressed buffer and compression
 * bound are deliberately 'size - 1'); otherwise it is stored raw.
 *
 * Returns 0 on success (end-of-input reached), negative on error.
 */
static int
do_compress(struct xpack_compressor *compressor, struct file_stream *in,
	    struct file_stream *out, u32 chunk_size)
{
	void *original_buf = xmalloc(chunk_size);
	void *compressed_buf = xmalloc(chunk_size - 1);
	ssize_t ret = -1;

	if (original_buf == NULL || compressed_buf == NULL)
		goto out;

	while ((ret = xread(in, original_buf, chunk_size)) > 0) {
		u32 original_size = ret;
		u32 compressed_size;
		void *stored_buf;
		u32 stored_size;

		compressed_size = xpack_compress(compressor,
						 original_buf,
						 original_size,
						 compressed_buf,
						 original_size - 1);
		if (compressed_size == 0) {
			/* Store the chunk uncompressed */
			stored_buf = original_buf;
			stored_size = original_size;
		} else {
			/* Store the chunk compressed */
			stored_buf = compressed_buf;
			stored_size = compressed_size;
		}

		ret = write_chunk_header(out, stored_size, original_size);
		if (ret != 0)
			goto out;

		ret = full_write(out, stored_buf, stored_size);
		if (ret != 0)
			goto out;
	}
out:
	free(compressed_buf);
	free(original_buf);
	return ret;
}

/*
 * Decompress 'in' to 'out', chunk by chunk, validating each chunk header
 * against 'chunk_size' from the file header.
 *
 * Returns 0 on success, negative on read/write error or corrupt data.
 */
static int
do_decompress(struct xpack_decompressor *decompressor, struct file_stream *in,
	      struct file_stream *out, u32 chunk_size)
{
	void *original_buf = xmalloc(chunk_size);
	void *compressed_buf = xmalloc(chunk_size - 1);
	ssize_t ret = -1;
	struct xpack_chunk_header chunk_hdr;

	if (original_buf == NULL || compressed_buf == NULL)
		goto out;

	while ((ret = xread(in, &chunk_hdr, sizeof(chunk_hdr)))
	       == sizeof(chunk_hdr))
	{
		u32 original_size;
		u32 stored_size;
		enum decompress_result result;

		bswap_chunk_header(&chunk_hdr);
		original_size = chunk_hdr.original_size;
		stored_size = chunk_hdr.stored_size;

		/* Sanity-check the sizes; a compressed chunk is always
		 * strictly smaller than its original data. */
		if (original_size < 1 || original_size > chunk_size ||
		    stored_size < 1 || stored_size > original_size) {
			msg("%"TS": file corrupt", in->name);
			ret = -1;
			goto out;
		}

		/* stored_size == original_size means stored uncompressed */
		ret = xread(in, (stored_size == original_size) ?
			    original_buf : compressed_buf, stored_size);
		if (ret < 0)
			goto out;

		if (ret != stored_size) {
			msg("%"TS": unexpected end-of-file", in->name);
			ret = -1;
			goto out;
		}

		if (stored_size != original_size) {
			/* Chunk was stored compressed */
			result = xpack_decompress(decompressor,
						  compressed_buf, stored_size,
						  original_buf, original_size,
						  NULL);
			if (result != DECOMPRESS_SUCCESS) {
				msg("%"TS": data corrupt", in->name);
				ret = -1;
				goto out;
			}
		}

		ret = full_write(out, original_buf, original_size);
		if (ret != 0)
			goto out;
	}
	/* A positive leftover read here means a truncated chunk header */
	if (ret > 0) {
		msg("%"TS": unexpected end-of-file", in->name);
		ret = -1;
	}
out:
	free(compressed_buf);
	free(original_buf);
	return ret;
}

/* Stat the open input file and skip non-regular files (continues below) */
static int
stat_file(struct file_stream *in, struct stat *stbuf, bool allow_hard_links)
{
	if (fstat(in->fd, stbuf) != 0) {
		msg("%"TS": unable to stat file", in->name);
		return -1;
	}

	if (!S_ISREG(stbuf->st_mode) && !in->is_standard_stream) {
		msg("%"TS" is %s -- skipping",
		    in->name, S_ISDIR(stbuf->st_mode) ?
"a directory" : 303 | "not a regular file"); 304 | return -2; 305 | } 306 | 307 | if (stbuf->st_nlink > 1 && !allow_hard_links) { 308 | msg("%"TS" has multiple hard links -- skipping " 309 | "(use -f to process anyway)", in->name); 310 | return -2; 311 | } 312 | 313 | return 0; 314 | } 315 | 316 | static void 317 | restore_mode(struct file_stream *out, const struct stat *stbuf) 318 | { 319 | #ifndef _WIN32 320 | if (fchmod(out->fd, stbuf->st_mode) != 0) 321 | msg_errno("%"TS": unable to preserve mode", out->name); 322 | #endif 323 | } 324 | 325 | static void 326 | restore_owner_and_group(struct file_stream *out, const struct stat *stbuf) 327 | { 328 | #ifndef _WIN32 329 | if (fchown(out->fd, stbuf->st_uid, stbuf->st_gid) != 0) { 330 | msg_errno("%"TS": unable to preserve owner and group", 331 | out->name); 332 | } 333 | #endif 334 | } 335 | 336 | static void 337 | restore_timestamps(struct file_stream *out, const tchar *newpath, 338 | const struct stat *stbuf) 339 | { 340 | int ret; 341 | #if defined(HAVE_FUTIMENS) 342 | struct timespec times[2] = { 343 | stbuf->st_atim, stbuf->st_mtim, 344 | }; 345 | ret = futimens(out->fd, times); 346 | #elif defined(HAVE_FUTIMES) 347 | struct timeval times[2] = { 348 | { stbuf->st_atim.tv_sec, stbuf->st_atim.tv_nsec / 1000, }, 349 | { stbuf->st_mtim.tv_sec, stbuf->st_mtim.tv_nsec / 1000, }, 350 | }; 351 | ret = futimes(out->fd, times); 352 | #else /* HAVE_FUTIMES */ 353 | struct tutimbuf times = { 354 | stbuf->st_atime, stbuf->st_mtime, 355 | }; 356 | ret = tutime(newpath, ×); 357 | #endif /* !HAVE_FUTIMES */ 358 | if (ret != 0) 359 | msg_errno("%"TS": unable to preserve timestamps", out->name); 360 | } 361 | 362 | static void 363 | restore_metadata(struct file_stream *out, const tchar *newpath, 364 | const struct stat *stbuf) 365 | { 366 | restore_mode(out, stbuf); 367 | restore_owner_and_group(out, stbuf); 368 | restore_timestamps(out, newpath, stbuf); 369 | } 370 | 371 | static int 372 | decompress_file(struct 
xpack_decompressor *decompressor, const tchar *path, 373 | const struct options *options) 374 | { 375 | tchar *newpath = NULL; 376 | struct file_stream in; 377 | struct file_stream out; 378 | struct xpack_file_header hdr; 379 | struct stat stbuf; 380 | int ret; 381 | int ret2; 382 | 383 | if (path != NULL && !options->to_stdout) { 384 | const tchar *suffix = get_suffix(path, options->suffix); 385 | if (suffix == NULL) { 386 | msg("\"%"TS"\" does not end with the .%"TS" suffix -- " 387 | "skipping", path, options->suffix); 388 | ret = -2; 389 | goto out; 390 | } 391 | newpath = xmalloc((suffix - path + 1) * sizeof(tchar)); 392 | tmemcpy(newpath, path, suffix - path); 393 | newpath[suffix - path] = '\0'; 394 | } 395 | 396 | ret = xopen_for_read(path, &in); 397 | if (ret != 0) 398 | goto out_free_newpath; 399 | 400 | if (!options->force && isatty(in.fd)) { 401 | msg("Refusing to read compressed data from terminal. " 402 | "Use -f to override.\nFor help, use -h."); 403 | ret = -1; 404 | goto out_close_in; 405 | } 406 | 407 | ret = stat_file(&in, &stbuf, options->force || newpath == NULL); 408 | if (ret != 0) 409 | goto out_close_in; 410 | 411 | ret = xread(&in, &hdr, sizeof(hdr)); 412 | if (ret < 0) 413 | goto out_close_in; 414 | if (ret != sizeof(hdr)) { 415 | msg("%"TS": not in XPACK format", in.name); 416 | ret = -1; 417 | goto out_close_in; 418 | } 419 | bswap_file_header(&hdr); 420 | 421 | if (memcmp(hdr.magic, XPACK_MAGIC, sizeof(hdr.magic)) != 0) { 422 | msg("%"TS": not in XPACK format", in.name); 423 | ret = -1; 424 | goto out_close_in; 425 | } 426 | 427 | if (hdr.version != 1) { 428 | msg("%"TS": unsupported version (%d)", in.name, hdr.version); 429 | ret = -1; 430 | goto out_close_in; 431 | } 432 | 433 | if (hdr.header_size < sizeof(hdr)) { 434 | msg("%"TS": incorrect header size (%"PRIu16")", in.name, 435 | hdr.header_size); 436 | ret = -1; 437 | goto out_close_in; 438 | } 439 | 440 | if (hdr.chunk_size < 1024 || hdr.chunk_size > 67108864) { 441 | 
msg("%"TS": unsupported chunk size (%"PRIu32")", in.name, 442 | hdr.chunk_size); 443 | ret = -1; 444 | goto out_close_in; 445 | } 446 | 447 | ret = skip_bytes(&in, hdr.header_size - sizeof(hdr)); 448 | if (ret != 0) 449 | goto out_close_in; 450 | 451 | ret = xopen_for_write(newpath, options->force, &out); 452 | if (ret != 0) 453 | goto out_close_in; 454 | 455 | ret = do_decompress(decompressor, &in, &out, hdr.chunk_size); 456 | if (ret != 0) 457 | goto out_close_out; 458 | 459 | if (path != NULL && newpath != NULL) 460 | restore_metadata(&out, newpath, &stbuf); 461 | ret = 0; 462 | out_close_out: 463 | ret2 = xclose(&out); 464 | if (ret == 0) 465 | ret = ret2; 466 | if (ret != 0 && newpath != NULL) 467 | tunlink(newpath); 468 | out_close_in: 469 | xclose(&in); 470 | if (ret == 0 && path != NULL && newpath != NULL && !options->keep) 471 | tunlink(path); 472 | out_free_newpath: 473 | free(newpath); 474 | out: 475 | return ret; 476 | } 477 | 478 | static int 479 | compress_file(struct xpack_compressor *compressor, const tchar *path, 480 | const struct options *options) 481 | { 482 | tchar *newpath = NULL; 483 | struct file_stream in; 484 | struct file_stream out; 485 | struct stat stbuf; 486 | int ret; 487 | int ret2; 488 | 489 | if (path != NULL && !options->to_stdout) { 490 | size_t path_nchars, suffix_nchars; 491 | 492 | if (!options->force && has_suffix(path, options->suffix)) { 493 | msg("%"TS": already has .%"TS" suffix -- skipping", 494 | path, options->suffix); 495 | ret = -2; 496 | goto out; 497 | } 498 | path_nchars = tstrlen(path); 499 | suffix_nchars = tstrlen(options->suffix); 500 | newpath = xmalloc((path_nchars + 1 + suffix_nchars + 1) * 501 | sizeof(tchar)); 502 | tmemcpy(newpath, path, path_nchars); 503 | newpath[path_nchars] = '.'; 504 | tmemcpy(&newpath[path_nchars + 1], options->suffix, 505 | suffix_nchars + 1); 506 | } 507 | 508 | ret = xopen_for_read(path, &in); 509 | if (ret != 0) 510 | goto out_free_newpath; 511 | 512 | ret = stat_file(&in, 
&stbuf, options->force || newpath == NULL); 513 | if (ret != 0) 514 | goto out_close_in; 515 | 516 | ret = xopen_for_write(newpath, options->force, &out); 517 | if (ret != 0) 518 | goto out_close_in; 519 | 520 | if (!options->force && isatty(out.fd)) { 521 | msg("Refusing to write compressed data to terminal. " 522 | "Use -f to override.\nFor help, use -h."); 523 | ret = -1; 524 | goto out_close_out; 525 | } 526 | 527 | ret = write_file_header(&out, options->chunk_size, 528 | options->compression_level); 529 | if (ret != 0) 530 | goto out_close_out; 531 | 532 | ret = do_compress(compressor, &in, &out, options->chunk_size); 533 | if (ret != 0) 534 | goto out_close_out; 535 | 536 | if (path != NULL && newpath != NULL) 537 | restore_metadata(&out, newpath, &stbuf); 538 | ret = 0; 539 | out_close_out: 540 | ret2 = xclose(&out); 541 | if (ret == 0) 542 | ret = ret2; 543 | if (ret != 0 && newpath != NULL) 544 | tunlink(newpath); 545 | out_close_in: 546 | xclose(&in); 547 | if (ret == 0 && path != NULL && newpath != NULL && !options->keep) 548 | tunlink(path); 549 | out_free_newpath: 550 | free(newpath); 551 | out: 552 | return ret; 553 | } 554 | 555 | int 556 | tmain(int argc, tchar *argv[]) 557 | { 558 | struct options options; 559 | tchar *default_file_list[] = { NULL }; 560 | int opt_char; 561 | int i; 562 | int ret; 563 | 564 | program_invocation_name = get_filename(argv[0]); 565 | 566 | options.to_stdout = false; 567 | options.decompress = is_xunpack(); 568 | options.force = false; 569 | options.keep = false; 570 | options.compression_level = 6; 571 | options.chunk_size = 524288; 572 | options.suffix = T("xpack"); 573 | 574 | while ((opt_char = tgetopt(argc, argv, optstring)) != -1) { 575 | switch (opt_char) { 576 | case '1': 577 | case '2': 578 | case '3': 579 | case '4': 580 | case '5': 581 | case '6': 582 | case '7': 583 | case '8': 584 | case '9': 585 | options.compression_level = opt_char - '0'; 586 | break; 587 | case 'c': 588 | options.to_stdout = true; 589 | 
break; 590 | case 'd': 591 | options.decompress = true; 592 | break; 593 | case 'f': 594 | options.force = true; 595 | break; 596 | case 'h': 597 | show_usage(stdout); 598 | return 0; 599 | case 'k': 600 | options.keep = true; 601 | break; 602 | case 'L': 603 | options.compression_level = 604 | parse_compression_level(toptarg); 605 | if (options.compression_level <= 0) 606 | return 1; 607 | break; 608 | case 's': 609 | options.chunk_size = parse_chunk_size(toptarg); 610 | if (options.chunk_size == 0) 611 | return 1; 612 | break; 613 | case 'S': 614 | options.suffix = toptarg; 615 | break; 616 | case 'V': 617 | show_version(); 618 | return 0; 619 | default: 620 | show_usage(stderr); 621 | return 1; 622 | } 623 | } 624 | 625 | argv += toptind; 626 | argc -= toptind; 627 | 628 | if (argc == 0) { 629 | argv = default_file_list; 630 | argc = ARRAY_LEN(default_file_list); 631 | } else { 632 | for (i = 0; i < argc; i++) 633 | if (argv[i][0] == '-' && argv[i][1] == '\0') 634 | argv[i] = NULL; 635 | } 636 | 637 | ret = 0; 638 | if (options.decompress) { 639 | struct xpack_decompressor *d; 640 | 641 | d = alloc_decompressor(); 642 | if (d == NULL) 643 | return 1; 644 | 645 | for (i = 0; i < argc; i++) 646 | ret |= -decompress_file(d, argv[i], &options); 647 | 648 | xpack_free_decompressor(d); 649 | } else { 650 | struct xpack_compressor *c; 651 | 652 | c = alloc_compressor(options.chunk_size, 653 | options.compression_level); 654 | if (c == NULL) 655 | return 1; 656 | 657 | for (i = 0; i < argc; i++) 658 | ret |= -compress_file(c, argv[i], &options); 659 | 660 | xpack_free_compressor(c); 661 | } 662 | 663 | /* 664 | * If ret=0, there were no warnings or errors. Exit with status 0. 665 | * If ret=2, there was at least one warning. Exit with status 2. 666 | * Else, there was at least one error. Exit with status 1. 
667 | */ 668 | if (ret != 0 && ret != 2) 669 | ret = 1; 670 | 671 | return ret; 672 | } 673 | -------------------------------------------------------------------------------- /lib/xpack_decompress.c: -------------------------------------------------------------------------------- 1 | /* 2 | * xpack_decompress.c - decompressor for the XPACK compression format 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifdef __SSE2__ 29 | # include 30 | #endif 31 | 32 | #include "xpack_common.h" 33 | #include "x86_cpu_features.h" 34 | 35 | /* 36 | * If the expression passed to SAFETY_CHECK() evaluates to false, then the 37 | * decompression routine immediately returns DECOMPRESS_BAD_DATA, indicating the 38 | * compressed data is invalid. 
39 | * 40 | * Theoretically, these checks could be disabled for specialized applications 41 | * where all input to the decompressor will be trusted. 42 | */ 43 | #if 0 44 | # pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") 45 | # define SAFETY_CHECK(expr) (void)(expr) 46 | #else 47 | # define SAFETY_CHECK(expr) if (unlikely(!(expr))) return DECOMPRESS_BAD_DATA 48 | #endif 49 | 50 | /* 51 | * An entry in a FSE decode table. The index of the entry in the table is the 52 | * state with which the entry is associated. 53 | * 54 | * For efficiency we sometimes access this struct as the u32 value and sometimes 55 | * as the individual fields. 56 | */ 57 | typedef struct { 58 | union { 59 | u32 entry; 60 | struct { /* for big endian systems */ 61 | u16 destination_range_start; 62 | u8 num_bits; 63 | u8 symbol; 64 | } be; 65 | struct { /* for little endian systems */ 66 | u8 symbol; 67 | u8 num_bits; 68 | u16 destination_range_start; 69 | } le; 70 | }; 71 | } fse_decode_entry_t; 72 | 73 | /* 74 | * DECODE_SYMBOL() - Macro to decode a FSE-encoded symbol. The decoded symbol 75 | * is obtained from the decode table entry for the current state. The state is 76 | * then updated to the next state, which is obtained by indexing the current 77 | * state's "destination range" with the next 'num_bits' bits of input data. 78 | */ 79 | #if 1 /* Optimized version which accesses the entry as a u32 */ 80 | #define DECODE_SYMBOL(state, decode_table) \ 81 | ( \ 82 | sym = decode_table[state].entry, \ 83 | state = (sym >> 16) + POP_BITS((sym >> 8) & 0xFF), \ 84 | sym & 0xFF \ 85 | ) 86 | 87 | #else /* Unoptimized version which accesses individual struct members */ 88 | #define DECODE_SYMBOL(state, decode_table) \ 89 | ( \ 90 | sym = CPU_IS_LITTLE_ENDIAN() ? \ 91 | decode_table[state].le.symbol : \ 92 | decode_table[state].be.symbol, \ 93 | state = CPU_IS_LITTLE_ENDIAN() ? 
\ 94 | decode_table[state].le.destination_range_start + \ 95 | POP_BITS(decode_table[state].le.num_bits) : \ 96 | decode_table[state].be.destination_range_start + \ 97 | POP_BITS(decode_table[state].be.num_bits), \ 98 | sym \ 99 | ) 100 | #endif 101 | 102 | /* 103 | * Build the FSE decode table for an alphabet. 104 | * 105 | * @decode_table [out] 106 | * The decode table to build. 107 | * @state_counts [in but invalidated] 108 | * An array which provides, for each symbol in the alphabet, the number of 109 | * states which should be assigned to that symbol. 110 | * @alphabet_size [in] 111 | * The number of symbols in the alphabet. 112 | * @log2_num_states [in] 113 | * The log base 2 of the number of states, which is also the number of 114 | * entries in the decode table being built. 115 | * 116 | * Returns true if the state counts were valid or false if they were not. 117 | */ 118 | static bool 119 | build_fse_decode_table(fse_decode_entry_t decode_table[], u16 state_counts[], 120 | unsigned alphabet_size, unsigned log2_num_states) 121 | { 122 | /* 123 | * Assign a symbol to each state such that each symbol 'sym' gets 124 | * assigned to exactly 'state_counts[sym]' states. To do this, assign 125 | * states to symbols in order of increasing symbol value while visiting 126 | * all states in a special order. 127 | */ 128 | const unsigned num_states = 1 << log2_num_states; 129 | const unsigned state_generator = get_state_generator(num_states); 130 | const unsigned state_mask = num_states - 1; 131 | unsigned state = 0; 132 | u32 total_count = 0; 133 | unsigned sym; 134 | 135 | for (sym = 0; sym < alphabet_size; sym++) { 136 | unsigned count = state_counts[sym]; 137 | if (count == 0) /* Unused symbol? 
*/ 138 | continue; 139 | total_count += count; 140 | do { 141 | decode_table[state].entry = sym; 142 | state = (state + state_generator) & state_mask; 143 | } while (--count); 144 | } 145 | 146 | /* 147 | * Verify that the sum of the state counts really was 148 | * 2**log2_num_states. With a bad input, the sum might be lower than 149 | * expected (in which case not all states were visited) or higher than 150 | * expected (in which case some states were visited multiple times). 151 | * Both cases are strictly forbidden. 152 | */ 153 | if (unlikely(total_count != num_states)) 154 | return false; 155 | 156 | /* 157 | * Now, set 'num_bits' and 'destination_range_start' for each decode 158 | * table entry. This works as follows. First, a little background: 159 | * given a symbol that is assigned 'count' states out of a total of 160 | * 'num_states' states, the entropy, in bits, of an occurrence of that 161 | * symbol is: 162 | * 163 | * log2(1/probability) 164 | * = log2(1/(count/num_states)) 165 | * = log2(num_states/count) 166 | * = log2(num_states) - log2(count) 167 | * 168 | * This may be a non-integer value. The rounded-down value is: 169 | * 170 | * min_bits = floor(log2(num_states) - log2(count)) 171 | * = log2(num_states) - ceil(log2(count)) 172 | * 173 | * With finite state entropy coding, we will sometimes code the symbol 174 | * using 'min_bits' bits and sometimes using 'min_bits + 1' bits. Each 175 | * of the symbol's 'count' states will be associated with one of these 176 | * two choices of 'num_bits'. In addition, each state will point to a 177 | * "destination range" of length '2**num_bits'. The destination range 178 | * is the range of states which the encoder may have been in prior to 179 | * encoding the symbol and entering a given state. 180 | * 181 | * The precise mapping of a symbol's states to bit counts and 182 | * destination ranges is defined as follows. 
For some 'X < count', the 183 | * numerically first 'X' states are each assigned 'min_bits + 1' bits 184 | * and are mapped consecutively to a series of destination ranges that 185 | * ends with state 'num_states - 1'. The remaining 'count - X' states 186 | * are each assigned 'min_bits' bits and are mapped consecutively to a 187 | * series of destination ranges that starts with state 0. Since the 188 | * destination ranges must exactly cover all 'num_states' states (this 189 | * is required, in general, for encoding to have been possible), we can 190 | * solve for 'X': 191 | * 192 | * (2**(min_bits+1))X + (2**min_bits)(count - X) = num_states 193 | * (2**min_bits)(2X + count - X) = num_states 194 | * (2**min_bits)(X + count) = num_states 195 | * X + count = num_states / (2**min_bits) 196 | * X = num_states / (2**min_bits) - count 197 | * 198 | * As an example, with num_states = 256 and count = 23, then min_bits = 199 | * log2(256) - ceil(log2(23)) = 8 - 5 = 3. So each of the symbol's 23 200 | * states will be assigned 3 ('min_bits') or 4 ('min_bits + 1') bits. 201 | * Processing the 23 states in ascending numerical order, the first X 202 | * states will each be assigned 4 bits and the next 23 - X states will 203 | * each be assigned 3 bits. X is: 204 | * 205 | * X = num_states / (2**min_bits) - count 206 | * = 256 / (2**3) - 23 207 | * = 9 208 | * 209 | * Hence, the first 9 states will each be assigned 4 bits and have 210 | * destination ranges covering the last 9 * 2**4 = 144 of the 256 211 | * states, and the remaining 23 - 9 = 14 states will each be assigned 3 212 | * bits and have destination ranges covering the first 14 * 2**3 = 112 213 | * of the 256 states. 214 | * 215 | * There are a few possible implementations for actually computing 216 | * 'num_bits' and 'destination_range_start' for each of a symbol's 217 | * states. What we do is iterate through *all* states in ascending 218 | * order. 
This interleaves states for different symbols but guarantees 219 | * that all states for each symbol are visited in ascending order. 220 | * 'state_counts[sym]' is re-used as a counter which is incremented each 221 | * time after a state for symbol 'sym' is visited. 'X' is just the 222 | * distance between the initial value of 'state_counts[sym]' and the 223 | * closest power of 2 greater than or equal to 'state_counts[sym]'. 224 | * When the counter reaches this power of 2, then the number of bits 225 | * required, as computed by 'log2(num_states) - floor(log2(counter))', 226 | * decreases from 'min_bits + 1' to 'min_bits'. In addition, the 227 | * destination range start for each state is easily computed from the 228 | * value of the counter and num_bits at that state. 229 | */ 230 | for (state = 0; state < num_states; state++) { 231 | 232 | u32 sym = decode_table[state].entry; 233 | u32 counter = state_counts[sym]++; 234 | unsigned num_bits = log2_num_states - bsr32(counter); 235 | u32 destination_range_start = (counter << num_bits) - num_states; 236 | 237 | if (CPU_IS_LITTLE_ENDIAN()) { 238 | decode_table[state].le.num_bits = num_bits; 239 | decode_table[state].le.destination_range_start = destination_range_start; 240 | } else { 241 | decode_table[state].be.num_bits = num_bits; 242 | decode_table[state].be.destination_range_start = destination_range_start; 243 | } 244 | } 245 | 246 | return true; 247 | } 248 | 249 | /* Copy a word from @src to @dst, making no assumptions about alignment. */ 250 | static forceinline void 251 | copy_word_unaligned(const u8 *src, u8 *dst) 252 | { 253 | store_word_unaligned(load_word_unaligned(src), dst); 254 | } 255 | 256 | /* Copy 16 bytes from @src to @dst, making no assumptions about alignment. 
*/ 257 | static forceinline void 258 | copy_16_bytes_unaligned(const u8 *src, u8 *dst) 259 | { 260 | #ifdef __SSE2__ 261 | __m128i v = _mm_loadu_si128((const __m128i *)src); 262 | _mm_storeu_si128((__m128i *)dst, v); 263 | #else 264 | STATIC_ASSERT(WORDBYTES == 4 || WORDBYTES == 8); 265 | if (WORDBYTES == 4) { 266 | copy_word_unaligned(src + 0, dst + 0); 267 | copy_word_unaligned(src + 4, dst + 4); 268 | copy_word_unaligned(src + 8, dst + 8); 269 | copy_word_unaligned(src + 12, dst + 12); 270 | } else { 271 | copy_word_unaligned(src + 0, dst + 0); 272 | copy_word_unaligned(src + 8, dst + 8); 273 | } 274 | #endif 275 | } 276 | 277 | /* Build a word which consists of the byte @b repeated. */ 278 | static forceinline machine_word_t 279 | repeat_byte(u8 b) 280 | { 281 | machine_word_t v; 282 | 283 | STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); 284 | 285 | v = b; 286 | v |= v << 8; 287 | v |= v << 16; 288 | v |= v << ((WORDBITS == 64) ? 32 : 0); 289 | return v; 290 | } 291 | 292 | 293 | /****************************************************************************** 294 | * Input bitstream * 295 | ******************************************************************************/ 296 | 297 | /* 298 | * The state of the "input bitstream" consists of the following variables: 299 | * 300 | * - in_next: pointer to the next unread byte in the input buffer 301 | * 302 | * - in_end: pointer just past the end of the input buffer 303 | * 304 | * - bitbuf: a word-sized variable containing bits that have been read from 305 | * the input buffer. The buffered bits are right-aligned 306 | * (they're the low-order bits). 307 | * 308 | * - bitsleft: number of bits in 'bitbuf' that are valid. 309 | * 310 | * To make it easier for the compiler to optimize the code by keeping variables 311 | * in registers, these are declared as normal variables and manipulated using 312 | * macros. 
313 | */ 314 | 315 | /* 316 | * The maximum number of bits that can be requested to be in the bitbuffer 317 | * variable. This is the maximum value of 'n' that can be passed to 318 | * ENSURE_BITS(n). 319 | * 320 | * This not equal to WORDBITS because we never read less than one byte at a 321 | * time. If the bitbuffer variable contains more than (WORDBITS - 8) bits, then 322 | * we can't read another byte without first consuming some bits. So the maximum 323 | * count we can ensure is (WORDBITS - 7). 324 | */ 325 | #define MAX_ENSURE (WORDBITS - 7) 326 | 327 | /* 328 | * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if 329 | * 'n' is too large to be passed to ENSURE_BITS(n). Note: if 'n' is a compile 330 | * time constant, then this expression will be a compile-type constant. 331 | * Therefore, CAN_ENSURE() can be used choose between alternative 332 | * implementations at compile time. 333 | */ 334 | #define CAN_ENSURE(n) ((n) <= MAX_ENSURE) 335 | 336 | /* 337 | * Fill the bitbuffer variable, reading one byte at a time. 338 | * 339 | * Note: if we would overrun the input buffer, we just don't read anything, 340 | * leaving the bits as 0 but marking them as filled. This makes the 341 | * implementation simpler because this removes the need to distinguish between 342 | * "real" overruns and overruns that occur because of our own lookahead during 343 | * decompression. The disadvantage is that a "real" overrun can go undetected, 344 | * and the decompressor may return a success status rather than the expected 345 | * failure status if one occurs. However, this is not too important because 346 | * even if this specific case were to be handled "correctly", one could easily 347 | * come up with a different case where the compressed data would be corrupted in 348 | * such a way that fully retains its validity from the point of view of the 349 | * decompressor. 
Users should run a checksum against the decompressed data if 350 | * they wish to detect corruptions. 351 | */ 352 | #define FILL_BITS_BYTEWISE() \ 353 | do { \ 354 | do { \ 355 | if (likely(in_next != in_end)) \ 356 | bitbuf |= (machine_word_t)*in_next++ << bitsleft; \ 357 | else \ 358 | overrun_count++; \ 359 | bitsleft += 8; \ 360 | } while (bitsleft <= WORDBITS - 8); \ 361 | } while (0) 362 | 363 | /* 364 | * Fill the bitbuffer variable by reading the next word from the input buffer. 365 | * This can be significantly faster than FILL_BITS_BYTEWISE(). However, for 366 | * this to work correctly, the word must be interpreted in little-endian format. 367 | * In addition, the memory access may be unaligned. Therefore, this method is 368 | * most efficient on little-endian architectures that support fast unaligned 369 | * access, such as x86 and x86_64. 370 | */ 371 | #define FILL_BITS_WORDWISE() \ 372 | do { \ 373 | bitbuf |= get_unaligned_leword(in_next) << bitsleft; \ 374 | in_next += (WORDBITS - bitsleft) >> 3; \ 375 | bitsleft += (WORDBITS - bitsleft) & ~7; \ 376 | } while (0) 377 | 378 | /* 379 | * Load more bits from the input buffer until the specified number of bits is 380 | * present in the bitbuffer variable. 'n' must be <= MAX_ENSURE. 381 | */ 382 | #define ENSURE_BITS(n) \ 383 | do { \ 384 | if (bitsleft < (n)) { \ 385 | if (UNALIGNED_ACCESS_IS_FAST && \ 386 | likely(in_end - in_next >= WORDBYTES)) \ 387 | FILL_BITS_WORDWISE(); \ 388 | else \ 389 | FILL_BITS_BYTEWISE(); \ 390 | } \ 391 | } while (0) 392 | 393 | /* Remove and return the next 'n' bits from the bitbuffer variable. */ 394 | #define POP_BITS(n) \ 395 | ( \ 396 | bits = (u32)bitbuf & (((u32)1 << (n)) - 1), \ 397 | bitbuf >>= (n), \ 398 | bitsleft -= (n), \ 399 | bits \ 400 | ) 401 | 402 | /* 403 | * Align the input to the next byte boundary, discarding any remaining bits in 404 | * the current byte. 
 *
 * Note that if the bitbuffer variable currently contains more than 8 bits, then
 * we must rewind 'in_next', effectively putting those bits back.  Only the bits
 * in what would be the "current" byte if we were reading one byte at a time can
 * be actually discarded.
 */
#define ALIGN_INPUT()							\
do {									\
	/* Don't rewind over implicit zero bytes appended past the	\
	 * end of input (counted in 'overrun_count'). */		\
	in_next -= (bitsleft >> 3) - MIN(overrun_count, bitsleft >> 3);	\
	bitbuf = 0;							\
	bitsleft = 0;							\
} while (0)


/* The main decompressor structure */
struct xpack_decompressor {

	/*
	 * The FSE decoding table for each alphabet.  The literal table can be
	 * in union with the other tables because all literal symbols are
	 * decoded first.
	 */
	union {
		fse_decode_entry_t literal_decode_table
			[1 << MAX_LOG2_NUM_LITERAL_STATES];
		struct {
			fse_decode_entry_t litrunlen_decode_table
				[1 << MAX_LOG2_NUM_LITRUNLEN_STATES];
			fse_decode_entry_t length_decode_table
				[1 << MAX_LOG2_NUM_LENGTH_STATES];
			fse_decode_entry_t offset_decode_table
				[1 << MAX_LOG2_NUM_OFFSET_STATES];
			fse_decode_entry_t aligned_decode_table
				[1 << MAX_LOG2_NUM_ALIGNED_STATES];
		};
	};

	/* The FSE state counts for each alphabet */
	union {
		u16 state_counts[LITERAL_ALPHABET_SIZE +
				 LITRUNLEN_ALPHABET_SIZE +
				 LENGTH_ALPHABET_SIZE +
				 MAX_OFFSET_ALPHABET_SIZE +
				 ALIGNED_ALPHABET_SIZE];
		struct {
			u16 literal_state_counts[LITERAL_ALPHABET_SIZE];
			u16 litrunlen_state_counts[LITRUNLEN_ALPHABET_SIZE];
			u16 length_state_counts[LENGTH_ALPHABET_SIZE];
			u16 offset_state_counts[MAX_OFFSET_ALPHABET_SIZE];
			u16 aligned_state_counts[ALIGNED_ALPHABET_SIZE];
		};
	};
};

/* Instantiate the portable (default) decompressor from the template. */
#define FUNCNAME xpack_decompress_default
#define ATTRIBUTES
#include "decompress_impl.h"
#undef FUNCNAME
#undef ATTRIBUTES

/*
 * If the compiler supports it and the build isn't already targeting BMI2,
 * also instantiate a BMI2-accelerated decompressor, to be selected at runtime.
 */
#if X86_CPU_FEATURES_ENABLED && \
	COMPILER_SUPPORTS_BMI2_TARGET && !defined(__BMI2__)
#  define FUNCNAME xpack_decompress_bmi2
#  define ATTRIBUTES __attribute__((target("bmi2")))
#  include "decompress_impl.h"
#  undef FUNCNAME
#  undef ATTRIBUTES
#  define DISPATCH_ENABLED 1
#else
#  define DISPATCH_ENABLED 0
#endif

#if DISPATCH_ENABLED

static enum decompress_result
dispatch(struct xpack_decompressor *d, const void *in, size_t in_nbytes,
	 void *out, size_t out_nbytes_avail, size_t *actual_out_nbytes_ret);

typedef enum decompress_result (*decompress_func_t)
	(struct xpack_decompressor *d, const void *in, size_t in_nbytes,
	 void *out, size_t out_nbytes_avail, size_t *actual_out_nbytes_ret);

/*
 * Points at dispatch() until the first call; dispatch() then replaces it with
 * the best implementation for this CPU so later calls go there directly.
 */
static decompress_func_t decompress_impl = dispatch;

/*
 * First-call trampoline: probe CPU features once, cache the chosen
 * implementation in 'decompress_impl', and forward this call to it.
 */
static enum decompress_result
dispatch(struct xpack_decompressor *d, const void *in, size_t in_nbytes,
	 void *out, size_t out_nbytes_avail, size_t *actual_out_nbytes_ret)
{
	decompress_func_t f = xpack_decompress_default;
#if X86_CPU_FEATURES_ENABLED
	if (x86_have_cpu_feature(X86_CPU_FEATURE_BMI2))
		f = xpack_decompress_bmi2;
#endif
	decompress_impl = f;
	return (*f)(d, in, in_nbytes, out, out_nbytes_avail,
		    actual_out_nbytes_ret);
}
#endif /* DISPATCH_ENABLED */

/*
 * This is the main decompression routine.  See libxpack.h for the
 * documentation.
 *
 * Note that the real code is in decompress_impl.h.  The part here just handles
 * calling the appropriate implementation depending on the CPU features at
 */
LIBEXPORT enum decompress_result
xpack_decompress(struct xpack_decompressor *d, const void *in, size_t in_nbytes,
		 void *out, size_t out_nbytes_avail,
		 size_t *actual_out_nbytes_ret)
{
#if DISPATCH_ENABLED
	/* Indirect call; resolves to the best implementation after first use. */
	return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes_avail,
				  actual_out_nbytes_ret);
#else
	return xpack_decompress_default(d, in, in_nbytes, out, out_nbytes_avail,
					actual_out_nbytes_ret);
#endif
}

/* Allocate a decompressor.  Returns NULL on allocation failure. */
LIBEXPORT struct xpack_decompressor *
xpack_alloc_decompressor(void)
{
	return malloc(sizeof(struct xpack_decompressor));
}

/* Free a decompressor allocated with xpack_alloc_decompressor().  NULL is ok. */
LIBEXPORT void
xpack_free_decompressor(struct xpack_decompressor *d)
{
	free(d);
}
-------------------------------------------------------------------------------- /lib/xpack_compress.c: --------------------------------------------------------------------------------
/*
 * xpack_compress.c - compressor for the XPACK compression format
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef DECOMPRESSION_ONLY 29 | 30 | #ifdef __SSE2__ 31 | # include 32 | #endif 33 | #ifdef __SSE4_1__ 34 | # include 35 | #endif 36 | 37 | #include "hc_matchfinder.h" 38 | #include "lz_extend.h" 39 | #include "xpack_common.h" 40 | 41 | /* 42 | * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes, 43 | * except if the last block has to be shorter. 44 | */ 45 | #define MIN_BLOCK_LENGTH 10000 46 | 47 | /* 48 | * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but 49 | * the final size might be larger due to matches extending beyond the end of the 50 | * block. Specifically: 51 | * 52 | * - The greedy parser may choose an arbitrarily long match starting at the 53 | * SOFT_MAX_BLOCK_LENGTH'th byte. 54 | * 55 | * - The lazy parser may choose a sequence of literals starting at the 56 | * SOFT_MAX_BLOCK_LENGTH'th byte when it sees a sequence of increasing good 57 | * matches. The final match may be of arbitrary length. The length of the 58 | * literal sequence is approximately limited by the "nice match length" 59 | * parameter. The actual limit is related to match scores and may be 60 | * slightly different. We overestimate the limit as EXTRA_LITERAL_SPACE. 
 */
#define SOFT_MAX_BLOCK_LENGTH	300000
#define EXTRA_LITERAL_SPACE	512

/* Holds the symbols and extra offset bits needed to represent a match */
struct match {
	u8 litrunlen_sym;	/* symbol for the preceding literal run length */
	u8 length_sym;		/* symbol for the match length */
	u8 offset_sym;		/* symbol for the match offset */
	u32 extra_offset_bits;	/* verbatim extra offset bits */
};

/* Frequency counters for each alphabet */
struct freqs {
	u32 literal[LITERAL_ALPHABET_SIZE];
	u32 litrunlen[LITRUNLEN_ALPHABET_SIZE];
	u32 length[LENGTH_ALPHABET_SIZE];
	u32 offset[MAX_OFFSET_ALPHABET_SIZE];
	u32 aligned[ALIGNED_ALPHABET_SIZE];
};

/*
 * Finite State Entropy encoding information for a symbol.
 * See build_fse_encoding_tables() and encode_symbol() for how the two fields
 * are computed and used.
 */
struct fse_symbol_encoding_info {
	u32 adjusted_num_states_in_big_ranges;
	s32 next_states_begin;
};

/* Finite State Entropy encoding information for each alphabet */
struct codes {
	struct fse_symbol_encoding_info literal_sym_encinfo[LITERAL_ALPHABET_SIZE];
	struct fse_symbol_encoding_info litrunlen_sym_encinfo[LITRUNLEN_ALPHABET_SIZE];
	struct fse_symbol_encoding_info length_sym_encinfo[LENGTH_ALPHABET_SIZE];
	struct fse_symbol_encoding_info offset_sym_encinfo[MAX_OFFSET_ALPHABET_SIZE];
	struct fse_symbol_encoding_info aligned_sym_encinfo[ALIGNED_ALPHABET_SIZE];

	u16 literal_next_statesx[1 << MAX_LOG2_NUM_LITERAL_STATES];
	u16 litrunlen_next_statesx[1 << MAX_LOG2_NUM_LITRUNLEN_STATES];
	u16 length_next_statesx[1 << MAX_LOG2_NUM_LENGTH_STATES];
	u16 offset_next_statesx[1 << MAX_LOG2_NUM_OFFSET_STATES];
	u16 aligned_next_statesx[1 << MAX_LOG2_NUM_ALIGNED_STATES];

	unsigned log2_num_literal_states;
	unsigned log2_num_litrunlen_states;
	unsigned log2_num_length_states;
	unsigned log2_num_offset_states;
	unsigned log2_num_aligned_states;

	/* State counts per alphabet; union gives a flat view over all of them. */
	union {
		u16 state_counts[LITERAL_ALPHABET_SIZE +
				 LITRUNLEN_ALPHABET_SIZE +
				 LENGTH_ALPHABET_SIZE +
				 MAX_OFFSET_ALPHABET_SIZE +
				 ALIGNED_ALPHABET_SIZE];
		struct {
			u16 literal_state_counts[LITERAL_ALPHABET_SIZE];
			u16 litrunlen_state_counts[LITRUNLEN_ALPHABET_SIZE];
			u16 length_state_counts[LENGTH_ALPHABET_SIZE];
			u16 offset_state_counts[MAX_OFFSET_ALPHABET_SIZE];
			u16 aligned_state_counts[ALIGNED_ALPHABET_SIZE];
		};
	};
};

/* Block split statistics.  See "Block splitting algorithm" below. */
#define NUM_LITERAL_OBSERVATION_TYPES 8
#define NUM_MATCH_OBSERVATION_TYPES 2
#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
struct block_split_stats {
	u32 new_observations[NUM_OBSERVATION_TYPES];
	u32 observations[NUM_OBSERVATION_TYPES];
	u32 num_new_observations;
	u32 num_observations;
};

/* The main compressor structure */
struct xpack_compressor {

	unsigned nice_match_length;	/* stop searching at this match length */
	unsigned max_search_depth;	/* matchfinder search depth limit */
	u8 *in_buffer;			/* buffered copy of the input */
	size_t in_nbytes;
	size_t max_buffer_size;
	/* compression-level-specific implementation (greedy/lazy parse) */
	size_t (*impl)(struct xpack_compressor *, void *, size_t);

	struct freqs freqs;
	struct block_split_stats split_stats;
	struct codes codes;

	/* scratch space used by build_fse_encoding_tables() */
	unsigned cumul_state_counts[MAX_ALPHABET_SIZE];
	u8 state_to_symbol[MAX_NUM_STATES];

	u32 num_literals;
	u32 num_matches;
	u32 num_extra_bytes;

	u8 literals[SOFT_MAX_BLOCK_LENGTH + EXTRA_LITERAL_SPACE];
	struct match matches[DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH, MIN_MATCH_LEN) + 1];
	/* worst-case space for length/litrunlen escape bytes */
	u8 extra_bytes[6 + /* extra for actual block length > soft max */
		       MAX4(1 * DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     MIN_MATCH_LEN + LENGTH_ALPHABET_SIZE - 1),
			    3 * DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     MIN_MATCH_LEN + LENGTH_ALPHABET_SIZE - 1 + 0xFF),
			    1 * DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     LITRUNLEN_ALPHABET_SIZE - 1),
			    3 * DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
					     LITRUNLEN_ALPHABET_SIZE - 1 + 0xFF))];

	/* Hash chains matchfinder (MUST BE LAST!!!) */
	struct hc_matchfinder hc_mf;
};

/* Return the log base 2 of 'n', rounded up to the nearest integer. */
static forceinline unsigned
ilog2_ceil(u32 n)
{
	if (n <= 1)
		return 0;
	return 1 + bsr32(n - 1);
}

/* Select the log2(num_states) to use for an alphabet. */
static unsigned
select_log2_num_states(u32 total_freq, unsigned num_used_syms,
		       unsigned max_log2_num_states)
{
	unsigned num_states = 1 << max_log2_num_states; /* Default value */

	/*
	 * If there are not many symbols to be encoded, then it's not helpful to
	 * use many states.
	 */
	num_states = MIN(num_states, total_freq / 4);

	/*
	 * There must be at least as many states as distinct used symbols.
	 * Note: we're guaranteed num_used_syms > 0 here because of the earlier
	 * check, which implies that this calculation produces num_states > 0.
	 */
	num_states = MAX(num_states, num_used_syms);

	return ilog2_ceil(num_states);
}

/*
 * Remove states from symbols until the correct number of states is used.
 * Each pass removes a progressively larger fraction (1/8, 1/4, 1/2, then all
 * but one) of each multi-state symbol's states until the overrun is gone.
 */
static void
adjust_state_counts(u16 state_counts[], unsigned num_states_overrun,
		    unsigned alphabet_size)
{
	unsigned shift;
	unsigned sym;
	unsigned n;

	for (shift = 3; num_states_overrun != 0; shift--) {
		for (sym = 0; sym < alphabet_size; sym++) {
			if (state_counts[sym] > 1) {
				n = MIN((state_counts[sym] - 1) >> shift,
					num_states_overrun);
				state_counts[sym] -= n;
				num_states_overrun -= n;
				if (num_states_overrun == 0)
					break;
			}
		}
	}
}

/*
 * Determine how many states to assign to each symbol.
 *
 * Basically, for each symbol 'sym' we need to take the real number
 *
 *	freqs[sym] * (num_states / total_freq)
 *
 * and round it up or down to the nearest integer as appropriate to make all the
 * state_counts[] sum to num_states, while still approximating the real entropy
 * well.  However, this implementation does *not* compute the entropy-optimal
 * state counts.
 */
static void
compute_state_counts(const u32 freqs[], const u32 total_freq,
		     u16 state_counts[], const unsigned alphabet_size,
		     const unsigned log2_num_states)
{
	signed int remaining_states = 1 << log2_num_states;
	unsigned max_state_count = 0;
	unsigned sym_with_max_state_count = 0;
	unsigned sym = 0;

	/* Disabled SSE2/SSE4.1 experiment; the scalar loop below is the one
	 * actually in use. */
#if 0
	const float scale_factor = (float)(1 << log2_num_states) / (float)total_freq;
	const __m128 v_scale_factor = _mm_set1_ps(scale_factor);
	const __m128i v_lowcount_cutoff = _mm_set1_epi16(0x7FFF - (u16)(0.5 / scale_factor));
	__m128i v_num_states_used = _mm_set1_epi16(0);
	__m128i v_max_state_count = _mm_set1_epi16(0);
	__m128i v_sym_with_max_state_count = _mm_set1_epi16(0);
	__m128i v_syms = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);

	/* Process 8 freqs at a time */
	for (; sym < (alphabet_size & ~7); sym += 8) {

		/* Load the next freqs. */
		__m128i v_freq1 = _mm_loadu_si128((const __m128i *)&freqs[sym + 0]);
		__m128i v_freq2 = _mm_loadu_si128((const __m128i *)&freqs[sym + 4]);

		/* Prepare adjustment for the 'state_count == 0 && freq != 0' case */
		__m128i v_freqpack_saturated = _mm_packs_epi32(v_freq1, v_freq2);
		__m128i v_freqpack_saturated_adjusted = _mm_add_epi16(v_freqpack_saturated,
								      v_lowcount_cutoff);
		__m128i v_negative_adjustment = _mm_cmpgt_epi16(v_freqpack_saturated_adjusted,
								v_lowcount_cutoff);

		/* Compute: state_count = round(count * (num_states / total_freq)) */
		__m128 v_freqf1 = _mm_cvtepi32_ps(v_freq1);
		__m128 v_freqf2 = _mm_cvtepi32_ps(v_freq2);
		__m128 v_mul1 = _mm_mul_ps(v_freqf1, v_scale_factor);
		__m128 v_mul2 = _mm_mul_ps(v_freqf2, v_scale_factor);
		__m128i v_muli1 = _mm_cvtps_epi32(v_mul1);
		__m128i v_muli2 = _mm_cvtps_epi32(v_mul2);
		__m128i v_state_count = _mm_packs_epi32(v_muli1, v_muli2);

		/* If state_count == 0 but freq != 0, set state_count=1. */
		v_state_count = _mm_sub_epi16(v_state_count, v_negative_adjustment);

		/* Save the state counts */
		_mm_storeu_si128((__m128i *)&state_counts[sym], v_state_count);

		/* Update num_states_used */
		v_num_states_used = _mm_add_epi16(v_num_states_used, v_state_count);

		/* Update max_state_count */
		v_max_state_count = _mm_max_epi16(v_max_state_count, v_state_count);

		/* Update sym_with_max_state_count */
		__m128i v_is_new_max = _mm_cmpeq_epi16(v_state_count, v_max_state_count);
#ifdef __SSE4_1__
		v_sym_with_max_state_count = _mm_blendv_epi8(v_sym_with_max_state_count,
							     v_syms, v_is_new_max);
#else
		__m128i v_old_syms_to_keep = _mm_andnot_si128(v_is_new_max, v_sym_with_max_state_count);
		__m128i v_new_syms_to_set = _mm_and_si128(v_is_new_max, v_syms);
		v_sym_with_max_state_count = _mm_or_si128(v_old_syms_to_keep, v_new_syms_to_set);
#endif

		v_syms = _mm_add_epi32(v_syms, _mm_set1_epi16(8));
	}

	for (int i = 0; i < 8; i++) {
		remaining_states -= ((__v8hi)v_num_states_used)[i];
		if (((__v8hi)v_max_state_count)[i] > max_state_count) {
			max_state_count = ((__v8hi)v_max_state_count)[i];
			sym_with_max_state_count = ((__v8hi)v_sym_with_max_state_count)[i];
		}
	}
#endif /* __SSE2__ */

	/* Fixed-point scale: freq * highprec_step approximates
	 * freq * 2^31 / total_freq. */
	const u32 highprec_step = ((u32)1 << 31) / total_freq;
	const unsigned shift = 31 - log2_num_states - 1;

	for (; sym < alphabet_size; sym++) {
		/*
		 * Rescale the frequency.  Round up if the fractional part is
		 * greater than or equal to 0.5.  Otherwise, round down.
		 */
		unsigned state_count =
			(((freqs[sym] * highprec_step) >> shift) + 1) >> 1;

		/* Every used symbol must get at least one state. */
		if (state_count == 0 && freqs[sym] != 0)
			state_count = 1;

		state_counts[sym] = state_count;
		remaining_states -= state_count;

		if (state_count > max_state_count) {
			max_state_count = state_count;
			sym_with_max_state_count = sym;
		}
	}

	/*
	 * If there are still states to assign, assign them to the most common
	 * symbol.  Or if we assigned more states than were actually available,
	 * then either subtract from the most common symbol (for minor overruns)
	 * or use the slower adjustment algorithm (for major overruns).
	 */
	if (-remaining_states < (signed int)(max_state_count >> 2)) {
		state_counts[sym_with_max_state_count] += remaining_states;
	} else {
		adjust_state_counts(state_counts, -remaining_states,
				    alphabet_size);
	}
}

/* Build the FSE encoding tables for an alphabet, given the state counts. */
static void
build_fse_encoding_tables(struct xpack_compressor *c,
			  struct fse_symbol_encoding_info sym_encinfo[],
			  u16 next_statesx[],
			  const u16 state_counts[],
			  const unsigned alphabet_size,
			  const unsigned log2_num_states)
{
	const unsigned num_states = 1 << log2_num_states;
	const unsigned state_generator = get_state_generator(num_states);
	const unsigned state_mask = num_states - 1;
	unsigned cumul_total;
	unsigned sym;
	unsigned state;
	unsigned count;
	unsigned max_bits;

	/*
	 * Build sym_encinfo[], which provides encoding information for each
	 * used symbol.  At the same time, build cumul_state_counts[], which for
	 * each symbol provides the total state count of the symbols that
	 * numerically precede it.
	 */
	cumul_total = 0;
	for (sym = 0; sym < alphabet_size; sym++) {

		count = state_counts[sym];

		if (count == 0) /* Unused symbol? */
			continue;

		c->cumul_state_counts[sym] = cumul_total;

		/*
		 * Each encoding of this symbol requires either 'min_bits' or
		 * 'max_bits = min_bits + 1' bits, where 'min_bits' is the
		 * entropy of this symbol rounded down to the nearest integer:
		 *
		 *	min_bits = floor(log2(1/probability))
		 *	min_bits = floor(log2(1/(count/num_states)))
		 *	min_bits = floor(log2(num_states/count))
		 *	min_bits = floor(log2(num_states) - log2(count))
		 *	min_bits = log2(num_states) - ceil(log2(count))
		 */
		max_bits = log2_num_states - ilog2_ceil(count) + 1;

		/*
		 * Save a value that makes it possible to branchlessly find the
		 * num_bits for a given state.  See encode_symbol() for details.
		 */
		sym_encinfo[sym].adjusted_num_states_in_big_ranges =
			((u32)max_bits << MAX_LOG2_NUM_STATES) -
			((u32)count << max_bits);

		/*
		 * When we need to encode an instance of this symbol, we'll have
		 * a "current state".  We'll need to find which destination
		 * range the current state is in, and which state --- the "next
		 * state" from the encoder's point of view but the "previous
		 * state" from the decoder's point of view --- maps to that
		 * destination range.  How can we do this efficiently?
		 *
		 * The solution relies on these facts:
		 *
		 *	- We'll know the number of bits to use.  Consequently,
		 *	  we'll know the length of the destination range.
		 *	- The 'min_bits' destination ranges all precede the
		 *	  'max_bits' destination ranges.
		 *
		 * What we'll do is maintain the state adjusted upwards by
		 * 'num_states'.  Then, we'll right-shift it by the number of
		 * bits that need to be used.  If 'min_bits' were required, then
		 * the result will be 'num_states >> min_bits' plus the index of
		 * the destination range in the list of 'min_bits' destination
		 * ranges.  But if 'max_bits' were required, then the result
		 * will be 'num_states >> min_bits' minus the number of
		 * 'max_bits' destination ranges, plus the index of the
		 * destination range in the list of 'max_bits' destination
		 * ranges.  Result: we map states to consecutive integers, each
		 * of which identifies a destination range.  We can use these
		 * integers as indices into a lookup table for the next state.
		 *
		 * Below, 'cumul_total' is the index at which the entries will
		 * actually begin in 'next_statesx[]'.  'count' is the beginning
		 * of the sequence of destination range identifiers.  This is
		 * 'num_states >> min_bits' minus the number of 'max_bits'
		 * destination ranges, which is also the same as the number of
		 * states (or destination ranges).  Note that the result of the
		 * subtraction may be a negative number.
		 */
		sym_encinfo[sym].next_states_begin = (s32)cumul_total - (s32)count;

		cumul_total += count;
	}

	/* Assign states to symbols, spreading them with the state generator. */
	state = 0;
	for (sym = 0; sym < alphabet_size; sym++) {
		count = state_counts[sym];
		while (count--) {
			c->state_to_symbol[state] = sym;
			state = (state + state_generator) & state_mask;
		}
	}

	/*
	 * Build next_statesx[].  This array maps symbol occurrences in the
	 * state table, ordered primarily by increasing symbol value and
	 * secondarily by increasing state, to their states, adjusted upwards by
	 * num_states.
	 */
	for (state = 0; state < num_states; state++) {
		unsigned symbol = c->state_to_symbol[state];
		unsigned position = c->cumul_state_counts[symbol]++;
		next_statesx[position] = num_states + state;
	}
}

/*
 * Choose the FSE state counts for the specified alphabet, where each symbol has
 * the frequency given in @freqs.  Returns log2 of the number of states chosen.
 */
static unsigned
choose_state_counts(const u32 freqs[], unsigned alphabet_size,
		    unsigned max_log2_num_states, u16 state_counts[])
{
	u32 total_freq = 0;
	unsigned num_used_syms = 0;
	unsigned log2_num_states;
	unsigned sym;

	/* Compute the total frequency and the number of used symbols. */
	for (sym = 0; sym < alphabet_size; sym++) {
		if (freqs[sym] != 0) {
			num_used_syms++;
			total_freq += freqs[sym];
		}
	}

	/*
	 * If no symbols from this alphabet were used, then output a code that
	 * contains an arbitrary unused symbol.
	 */
	if (total_freq == 0) {
		state_counts[0] = 1;
		for (sym = 1; sym < alphabet_size; sym++)
			state_counts[sym] = 0;
		return 0;
	}

	/* Select the number of states to use. */
	log2_num_states = select_log2_num_states(total_freq, num_used_syms,
						 max_log2_num_states);

	/* Decide how many states to assign to each symbol. */
	compute_state_counts(freqs, total_freq, state_counts,
			     alphabet_size, log2_num_states);

	return log2_num_states;
}

/* Output stream for header (writes in forwards direction) */
struct header_ostream {
	machine_word_t bitbuf;	/* pending bits not yet written */
	unsigned bitcount;	/* number of valid bits in bitbuf */
	u8 *begin;		/* start of the output buffer */
	u8 *next;		/* current write position */
	u8 *end;		/* end of the output buffer */
};

/* Initialize a header output stream to write into [out, out + out_nbytes_avail). */
static void
header_ostream_init(struct header_ostream *os,
		    void *out, size_t out_nbytes_avail)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->begin = out;
	os->next = os->begin;
	os->end = os->next + out_nbytes_avail;
}

/*
 * Append 'num_bits' bits to the header stream.  On overflow of the output
 * buffer, 'next' is clamped to 'end' so that header_ostream_flush() reports
 * failure.
 */
static void
header_ostream_write_bits(struct header_ostream *os,
			  machine_word_t bits, unsigned num_bits)
{
	/*
	 * We only flush 'bitbuf' when it completely fills up.  This improves
	 * performance.
	 */
	os->bitbuf |= bits << os->bitcount;
	os->bitcount += num_bits;
	if (os->bitcount >= WORDBITS) {
		if (os->end - os->next >= WORDBYTES) {
			put_unaligned_leword(os->bitbuf, os->next);
			os->next += WORDBYTES;
		} else {
			os->next = os->end;
		}
		os->bitcount -= WORDBITS;
		/* Keep the bits that didn't fit in the flushed word. */
		os->bitbuf = bits >> (num_bits - os->bitcount);
	}
}

/*
 * Flush remaining header bits a byte at a time.  Returns the number of bytes
 * written, or 0 if the output buffer was completely filled (treated as
 * overflow).
 */
static size_t
header_ostream_flush(struct header_ostream *os)
{
	while ((int)os->bitcount > 0) {
		if (os->next != os->end)
			*os->next++ = os->bitbuf;
		os->bitcount -= 8;
		os->bitbuf >>= 8;
	}

	if (os->next == os->end) /* overflow? */
		return 0;

	return os->next - os->begin;
}

/*
 * Output the state counts.  Return the number of bytes written, or 0 if the
 * output buffer is too small.
 */
static void
write_state_counts(struct header_ostream *os,
		   const u16 state_counts[], unsigned num_state_counts)
{
	unsigned sym = 0;

	while (sym < num_state_counts) {
		unsigned count = state_counts[sym++];
		unsigned bits;
		unsigned num_bits;

		if (count == 0) {
			/* Run of zero counts: encode with zero-run codes. */
			unsigned start = sym - 1;
			unsigned num_zeroes;

			while (sym < num_state_counts && state_counts[sym] == 0)
				sym++;
			num_zeroes = sym - start;

			/* Long runs: emit ZEROCODE2 chunks first. */
			while (num_zeroes >= ZEROCODE2_MIN) {
				/* NOTE(review): this inner 'count' shadows the
				 * outer 'count'; intentional but fragile. */
				unsigned count = MIN(num_zeroes, ZEROCODE2_MAX);
				bits = ((count - ZEROCODE2_MIN) << CODEBITS) | ZEROCODE2;
				num_bits = ZEROCODE2_NBITS + CODEBITS;
				header_ostream_write_bits(os, bits, num_bits);
				num_zeroes -= count;
			}

			if (num_zeroes < ZEROCODE1_MIN)
				continue;

			bits = ((num_zeroes - ZEROCODE1_MIN) << CODEBITS) | ZEROCODE1;
			num_bits = ZEROCODE1_NBITS + CODEBITS;
		} else {
			/* Nonzero count: send its bit-length, then the count
			 * with its leading 1 bit stripped. */
			unsigned order = bsr32(count);
			bits = ((count ^ (1 << order)) << CODEBITS) | order;
			num_bits = order + CODEBITS;
		}
		header_ostream_write_bits(os, bits, num_bits);
	}
}

/* Output stream for encoded symbols (writes in backwards direction) */
struct symbol_ostream {
	machine_word_t bitbuf;	/* pending bits not yet written */
	unsigned bitcount;	/* number of valid bits in bitbuf */
	u8 *begin;		/* start of the output buffer */
	u8 *next;		/* current write position (moves backwards) */
	u8 *end;		/* end of the output buffer */
};

/* Initialize a backwards symbol stream over [buffer, buffer + size). */
static void
symbol_ostream_init(struct symbol_ostream *os, void *buffer, size_t size)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->begin = buffer;
	os->end = os->begin + size;
	/* Start one word before the end (clamped for tiny buffers). */
	os->next = os->end - MIN(WORDBYTES, size);
}

/*
 * Add bits to the bitbuffer variable, without flushing.  The caller must ensure
 * there is enough space.
 */
static forceinline void
symbol_ostream_add_bits(struct symbol_ostream *os, machine_word_t bits, unsigned num_bits)
{
	os->bitbuf = (os->bitbuf << num_bits) | bits;
	os->bitcount += num_bits;
}

/*
 * Flush bits from the bitbuffer variable to the output buffer.  After calling
 * this, the bitbuffer variable is guaranteed to contain fewer than 8 bits.
 */
static forceinline void
symbol_ostream_flush_bits(struct symbol_ostream *os)
{
	machine_word_t bits = os->bitbuf <<
			((WORDBITS - os->bitcount) & (WORDBITS - 1));

	put_unaligned_leword(bits, os->next);
	/* Move backwards, clamped so 'next' never passes 'begin'. */
	os->next -= MIN(os->next - os->begin, os->bitcount >> 3);
	os->bitcount &= 7;
}

/*
 * Flush any remaining bits to the output buffer and terminate the bitstream.
 * Return the total number of bytes written to the output buffer, or 0 if there
 * was not enough space available in the output buffer to write everything.
 */
static size_t
symbol_ostream_flush(struct symbol_ostream *os)
{
	symbol_ostream_flush_bits(os);

	if (os->next == os->begin) /* Not enough space? */
		return 0;

	/*
	 * Terminate the last byte with a '1' bit so that the decoder knows
	 * where to start from.
	 */
	os->bitbuf <<= 8 - os->bitcount;
	os->bitbuf |= (1 << (7 - os->bitcount));
	os->next += WORDBYTES - 1;
	*os->next = (u8)os->bitbuf;

	return os->end - os->next;
}

/*
 * Emit the final state of a stream, biased down by num_states so it fits in
 * log2_num_states bits; the decoder reads it first as its initial state.
 */
static forceinline void
encode_initial_state(struct symbol_ostream *os, unsigned initial_statex,
		     unsigned log2_num_states)
{
	symbol_ostream_add_bits(os, initial_statex - (1 << log2_num_states),
				log2_num_states);
	symbol_ostream_flush_bits(os);
}

/* Encode a symbol using Finite State Entropy encoding */
static forceinline unsigned
encode_symbol(unsigned symbol, unsigned cur_statex, struct symbol_ostream *os,
	      const struct fse_symbol_encoding_info sym_encinfo[],
	      const u16 next_statesx[])
{
	unsigned num_bits;

	/*
	 * Calculate the number of bits required to encode this symbol when in
	 * the current state.  'adjusted_num_states_in_big_ranges' was set to
	 * (max_bits << MAX_LOG2_NUM_STATES) - 2*num_states + (number of states
	 * in max_bits destination ranges).  If we add cur_statex (which is
	 * num_states plus the current state) to this value, then we get a
	 * number less than max_bits << MAX_LOG2_NUM_STATES iff the current
	 * state is in a min_bits destination range (as opposed to a 'max_bits =
	 * min_bits + 1' destination range).  Then the correct num_bits, which
	 * is always either min_bits or max_bits, is simply that value right
	 * shifted by MAX_LOG2_NUM_STATES.
	 */
	num_bits = (sym_encinfo[symbol].adjusted_num_states_in_big_ranges +
		    cur_statex) >> MAX_LOG2_NUM_STATES;

	/* Output the appropriate number of bits of the state. */
	symbol_ostream_add_bits(os, cur_statex & ((1 << num_bits) - 1), num_bits);

	/* Look up the next state using the high bits of the current state. */
	return next_statesx[sym_encinfo[symbol].next_states_begin +
			    (cur_statex >> num_bits)];
}

/*
 * Encode the matches and literals.  Note that the encoding order is backwards
 * from the decoding order!
 */
static size_t
encode_items(const struct xpack_compressor *c, void *out, size_t out_nbytes_avail,
	     bool is_aligned_block)
{
	struct symbol_ostream os;
	size_t nbytes;
	unsigned order;
	unsigned litrunlen_statex;
	unsigned length_statex;
	unsigned offset_statex;
	unsigned aligned_statex;
#if NUM_LITERAL_STREAMS == 2
	unsigned literal_statex_1;
	unsigned literal_statex_2;
#else
	unsigned literal_statex;
#endif
	s32 i;

	symbol_ostream_init(&os, out, out_nbytes_avail);

	/* Encode the matches and literal run lengths */

	litrunlen_statex = 1 << c->codes.log2_num_litrunlen_states;
	length_statex = 1 << c->codes.log2_num_length_states;
	offset_statex = 1 << c->codes.log2_num_offset_states;
	aligned_statex = 1 << c->codes.log2_num_aligned_states;

	i = c->num_matches - 1;
	if (i >= 0 && c->matches[i].offset_sym == MAX_OFFSET_ALPHABET_SIZE) {
		/* Terminating literal run length, with no following match */
		litrunlen_statex = encode_symbol(c->matches[i].litrunlen_sym,
						 litrunlen_statex,
						 &os,
						 c->codes.litrunlen_sym_encinfo,
						 c->codes.litrunlen_next_statesx);
		symbol_ostream_flush_bits(&os);
		i--;
	}

	/* Iterate backwards so the decoder sees items in forwards order. */
	for (; i >= 0; i--) {

		const struct match *match = &c->matches[i];

		if (match->offset_sym >= NUM_REPS) {

			unsigned offset_log2 = match->offset_sym - NUM_REPS;

			if (is_aligned_block && offset_log2 >= NUM_ALIGNED_BITS) {
				/* High extra bits verbatim, low bits via the
				 * aligned-offset FSE code. */
				symbol_ostream_add_bits(&os,
					match->extra_offset_bits >> NUM_ALIGNED_BITS,
					offset_log2 - NUM_ALIGNED_BITS);
				aligned_statex = encode_symbol(match->extra_offset_bits & (ALIGNED_ALPHABET_SIZE - 1),
							       aligned_statex,
							       &os,
							       c->codes.aligned_sym_encinfo,
							       c->codes.aligned_next_statesx);
			} else {
				symbol_ostream_add_bits(&os, match->extra_offset_bits, offset_log2);
			}
			symbol_ostream_flush_bits(&os);
		}

		offset_statex = encode_symbol(match->offset_sym,
					      offset_statex,
					      &os,
					      c->codes.offset_sym_encinfo,
					      c->codes.offset_next_statesx);
		symbol_ostream_flush_bits(&os);

		length_statex = encode_symbol(match->length_sym,
					      length_statex,
					      &os,
					      c->codes.length_sym_encinfo,
					      c->codes.length_next_statesx);

		litrunlen_statex = encode_symbol(match->litrunlen_sym,
						 litrunlen_statex,
						 &os,
						 c->codes.litrunlen_sym_encinfo,
						 c->codes.litrunlen_next_statesx);
		symbol_ostream_flush_bits(&os);
	}

	/* Encode the inital states for matches and literal run lengths */

	if (is_aligned_block)
		encode_initial_state(&os, aligned_statex, c->codes.log2_num_aligned_states);
	encode_initial_state(&os, offset_statex, c->codes.log2_num_offset_states);
	encode_initial_state(&os, length_statex, c->codes.log2_num_length_states);
	encode_initial_state(&os, litrunlen_statex, c->codes.log2_num_litrunlen_states);

	/* Encode the literals */

#if NUM_LITERAL_STREAMS == 2
	/* Two interleaved literal streams: even/odd positions alternate. */
	literal_statex_1 = 1 << c->codes.log2_num_literal_states;
	literal_statex_2 = 1 << c->codes.log2_num_literal_states;

	for (i = c->num_literals - 1; i >= 1; i -= 2) {

		literal_statex_1 = encode_symbol(c->literals[i],
						 literal_statex_1,
						 &os,
						 c->codes.literal_sym_encinfo,
						 c->codes.literal_next_statesx);

		literal_statex_2 = encode_symbol(c->literals[i - 1],
						 literal_statex_2,
						 &os,
						 c->codes.literal_sym_encinfo,
						 c->codes.literal_next_statesx);

		symbol_ostream_flush_bits(&os);
	}

	if (c->num_literals & 1) {
		literal_statex_1 = encode_symbol(c->literals[0],
						 literal_statex_1,
						 &os,
						 c->codes.literal_sym_encinfo,
						 c->codes.literal_next_statesx);
		symbol_ostream_flush_bits(&os);

		/* last state the encoder used is state_1
		 * => first state the encoder will see is state_1
		 * => numbering will be the same
		 * => encoder must output state_2, then state_1 */
		encode_initial_state(&os, literal_statex_2, c->codes.log2_num_literal_states);
		encode_initial_state(&os, literal_statex_1, c->codes.log2_num_literal_states);
	} else {
		/* Reversed numbering */
		encode_initial_state(&os, literal_statex_1, c->codes.log2_num_literal_states);
		encode_initial_state(&os, literal_statex_2, c->codes.log2_num_literal_states);
	}

#else /* NUM_LITERAL_STREAMS == 2 */

	literal_statex = 1 << c->codes.log2_num_literal_states;

	for (i = c->num_literals - 1; i >= 0; i--) {

		literal_statex = encode_symbol(c->literals[i],
					       literal_statex,
					       &os,
					       c->codes.literal_sym_encinfo,
					       c->codes.literal_next_statesx);

		symbol_ostream_flush_bits(&os);
	}

	encode_initial_state(&os, literal_statex, c->codes.log2_num_literal_states);
#endif /* NUM_LITERAL_STREAMS != 2 */

	/* Literal count, sent as (bit-length, mantissa) of num_literals + 1 */
	order = bsr32(c->num_literals + 1);
	symbol_ostream_add_bits(&os, (c->num_literals + 1) -
				((u32)1 << order), order);
	symbol_ostream_add_bits(&os, order, 5);

	nbytes = symbol_ostream_flush(&os);
	if (nbytes == 0)
		return 0;

	/*
	 * We wrote the data at the end of the output space going backwards.
	 * Now move the data to the beginning.
	 */
	memmove(out, os.next, nbytes);

	return nbytes;
}

/*
 * Write the block size field: a single '1' bit for the default block size,
 * otherwise a '0' bit followed by the size in NUM_BLOCKSIZE_BITS bits.
 */
static void
write_block_size(struct header_ostream *os, u32 block_size)
{
	u32 bits;
	int num_bits;

	if (block_size == DEFAULT_BLOCK_SIZE) {
		bits = 1;
		num_bits = 1;
	} else {
		bits = block_size << 1;
		num_bits = 1 + NUM_BLOCKSIZE_BITS;
	}

	header_ostream_write_bits(os, bits, num_bits);
}

/* Heuristic for using ALIGNED blocks */
static int
choose_block_type(struct xpack_compressor *c)
{
	u32 min_count = -1;	/* i.e. UINT32_MAX */
	u32 max_count = 0;

	unsigned sym;

	for (sym = 0; sym < ALIGNED_ALPHABET_SIZE; sym++) {
		min_count = MIN(min_count, c->freqs.aligned[sym]);
		max_count = MAX(max_count, c->freqs.aligned[sym]);
	}

	if (min_count * 3 < max_count) /* unbalanced? */
		return BLOCKTYPE_ALIGNED;
	else
		return BLOCKTYPE_VERBATIM;
}

/******************************************************************************/

/*
 * Block splitting algorithm.  The problem is to decide when it is worthwhile to
 * start a new block with new entropy codes.  There is a theoretically optimal
 * solution: recursively consider every possible block split, considering the
 * exact cost of each block, and choose the minimum cost approach.  But this is
 * far too slow.  Instead, as an approximation, we can count symbols and after
 * every N symbols, compare the expected distribution of symbols based on the
 * previous data with the actual distribution.  If they differ "by enough", then
 * start a new block.
 *
 * As an optimization and heuristic, we don't distinguish between every symbol
 * but rather we combine many symbols into a single "observation type".
For 953 | * literals we only look at the high bits and low bits, and for matches we only 954 | * look at whether the match is long or not. The assumption is that for typical 955 | * "real" data, places that are good block boundaries will tend to be noticable 956 | * based only on changes in these aggregate frequencies, without looking for 957 | * subtle differences in individual symbols. For example, a change from ASCII 958 | * bytes to non-ASCII bytes, or from few matches (generally less compressible) 959 | * to many matches (generally more compressible), would be easily noticed based 960 | * on the aggregates. 961 | * 962 | * For determining whether the frequency distributions are "different enough" to 963 | * start a new block, the simply heuristic of splitting when the sum of absolute 964 | * differences exceeds a constant seems to be good enough. We also add a number 965 | * proportional to the block size so that the algorithm is more likely to end 966 | * large blocks than small blocks. This reflects the general expectation that 967 | * it will become increasingly beneficial to start a new block as the current 968 | * blocks grows larger. 969 | * 970 | * Finally, for an approximation, it is not strictly necessary that the exact 971 | * symbols being used are considered. With "near-optimal parsing", for example, 972 | * the actual symbols that will be used are unknown until after the block 973 | * boundary is chosen and the block has been optimized. Since the final choices 974 | * cannot be used, we can use preliminary "greedy" choices instead. 975 | */ 976 | 977 | /* Initialize the block split statistics when starting a new block. 
 */
static void
init_block_split_stats(struct block_split_stats *stats)
{
	int i;

	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->new_observations[i] = 0;
		stats->observations[i] = 0;
	}
	stats->num_new_observations = 0;
	stats->num_observations = 0;
}

/* Literal observation.  Heuristic: use the top 2 bits and low 1 bit of the
 * literal, for 8 possible literal observation types. */
static forceinline void
observe_literal(struct block_split_stats *stats, u8 lit)
{
	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
	stats->num_new_observations++;
}

/* Match observation.  Heuristic: use one observation type for "short match" and
 * one observation type for "long match" (length >= 9). */
static forceinline void
observe_match(struct block_split_stats *stats, unsigned length)
{
	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
	stats->num_new_observations++;
}

/*
 * Compare the new observations against the accumulated ones.  Returns true if
 * the distributions differ enough that the caller should end the block now;
 * otherwise folds the new observations into the accumulated totals and
 * returns false.
 */
static bool
do_end_block_check(struct block_split_stats *stats, u32 block_size)
{
	int i;

	if (stats->num_observations > 0) {

		/* Note: to avoid slow divisions, we do not divide by
		 * 'num_observations', but rather do all math with the numbers
		 * multiplied by 'num_observations'. */
		u32 total_delta = 0;
		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			u32 expected = stats->observations[i] * stats->num_new_observations;
			u32 actual = stats->new_observations[i] * stats->num_observations;
			u32 delta = (actual > expected) ? actual - expected :
							  expected - actual;
			total_delta += delta;
		}

		/* Ready to end the block?  The 'block_size >> 12' term biases
		 * the check towards ending larger blocks. */
		if (total_delta + (block_size >> 12) * stats->num_observations >=
		    200 * stats->num_observations)
			return true;
	}

	/* Not ending the block: merge the new observations into the totals. */
	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->num_observations += stats->new_observations[i];
		stats->observations[i] += stats->new_observations[i];
		stats->new_observations[i] = 0;
	}
	stats->num_new_observations = 0;
	return false;
}

/*
 * Should the current block be ended?  Only runs the (relatively expensive)
 * distribution check once at least 512 new observations have accumulated, the
 * block has reached MIN_BLOCK_LENGTH, and at least 16384 input bytes remain.
 */
static forceinline bool
should_end_block(struct block_split_stats *stats,
		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
{
	/* Ready to check block split statistics? */
	if (stats->num_new_observations < 512 ||
	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
	    in_end - in_next < 16384)
		return false;

	return do_end_block_check(stats, in_next - in_block_begin);
}

/******************************************************************************/

/* Reset the per-block state: symbol frequencies, recorded items, extra bytes,
 * and block split statistics. */
static void
begin_block(struct xpack_compressor *c)
{
	memset(&c->freqs, 0, sizeof(c->freqs));
	c->num_literals = 0;
	c->num_matches = 0;
	c->num_extra_bytes = 0;
	init_block_split_stats(&c->split_stats);
}

/* Record one literal byte and tally its frequency. */
static void
record_literal(struct xpack_compressor *c, u8 literal)
{
	c->literals[c->num_literals++] = literal;
	c->freqs.literal[literal]++;
}

/*
 * Record the length of the literal run preceding a match.  Run lengths too
 * large for the litrunlen alphabet overflow into c->extra_bytes: one byte if
 * the excess is < 0xFF, otherwise an 0xFF marker byte followed by 3 more bytes
 * (little-endian) of the remaining excess.
 */
static void
record_litrunlen(struct xpack_compressor *c, struct match *match, u32 litrunlen)
{
	unsigned litrunlen_sym;

	if (litrunlen >= LITRUNLEN_ALPHABET_SIZE - 1) {
		u32 v = litrunlen - (LITRUNLEN_ALPHABET_SIZE - 1);
		if (v < 0xFF) {
			c->extra_bytes[c->num_extra_bytes++] = v;
		} else {
			v -= 0xFF;
			c->extra_bytes[c->num_extra_bytes++] = 0xFF;
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 0);
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 8);
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 16);
		}
		litrunlen_sym = LITRUNLEN_ALPHABET_SIZE - 1;
	} else {
		litrunlen_sym = litrunlen;
	}

	match->litrunlen_sym = litrunlen_sym;
	c->freqs.litrunlen[litrunlen_sym]++;
}

/*
 * Record a match length (biased by MIN_MATCH_LEN).  Lengths too large for the
 * length alphabet overflow into c->extra_bytes using the same scheme as
 * record_litrunlen().
 */
static void
record_length(struct xpack_compressor *c, struct match *match, u32 length)
{
	unsigned length_sym;

	length -= MIN_MATCH_LEN;

	if (length >= LENGTH_ALPHABET_SIZE - 1) {
		u32 v = length - (LENGTH_ALPHABET_SIZE - 1);
		if (v < 0xFF) {
			c->extra_bytes[c->num_extra_bytes++] = v;
		} else {
			v -= 0xFF;
			c->extra_bytes[c->num_extra_bytes++] = 0xFF;
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 0);
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 8);
			c->extra_bytes[c->num_extra_bytes++] = (u8)(v >> 16);
		}
		length_sym = LENGTH_ALPHABET_SIZE - 1;
	} else {
		length_sym = length;
	}

	match->length_sym = length_sym;
	c->freqs.length[length_sym]++;
}

/*
 * Record a match with an explicit (non-repeat) offset.  The offset symbol is
 * NUM_REPS + floor(log2(offset)); the remaining bits of the offset are saved
 * as extra bits.  Also tallies the "aligned" symbol (low bits of the offset)
 * when there are at least NUM_ALIGNED_BITS extra bits.
 */
static void
record_explicit_offset(struct xpack_compressor *c, struct match *match,
		       u32 offset)
{
	unsigned offset_log2 = bsr32(offset);
	unsigned offset_sym = NUM_REPS + offset_log2;

	match->offset_sym = offset_sym;
	c->freqs.offset[offset_sym]++;
	match->extra_offset_bits = offset - ((u32)1 << offset_log2);
	if (offset_log2 >= NUM_ALIGNED_BITS)
		c->freqs.aligned[offset & (ALIGNED_ALPHABET_SIZE - 1)]++;
}

/* Record a match that reuses recent offset 'rep_idx' (offset symbols
 * 0..NUM_REPS-1 are the repeat-offset slots). */
static void
record_repeat_offset(struct xpack_compressor *c, struct match *match,
		     unsigned rep_idx)
{
	match->offset_sym = rep_idx;
	c->freqs.offset[rep_idx]++;
}

/*
 * Write one complete compressed block to 'out': the block header, the extra
 * bytes, and the FSE-encoded items.  Returns the total number of bytes
 * written, or 0 if 'out_nbytes_avail' was too small.
 */
static size_t
write_block(struct xpack_compressor *c, void *out, size_t out_nbytes_avail,
	    u32 block_size, u32 last_litrunlen, bool is_final_block)
{
	struct header_ostream os;
	size_t header_size;
	size_t items_size;
	int block_type;
	unsigned num_state_counts;
	unsigned order;

	/* Final litrunlen: stored as a pseudo-match whose offset_sym is the
	 * out-of-range sentinel MAX_OFFSET_ALPHABET_SIZE */
	record_litrunlen(c, &c->matches[c->num_matches], last_litrunlen);
	c->matches[c->num_matches].offset_sym = MAX_OFFSET_ALPHABET_SIZE;
	c->num_matches++;

	/* Choose the block type */
	block_type = choose_block_type(c);

	header_ostream_init(&os, out, out_nbytes_avail);

	/* Output the "final block" flag */
	header_ostream_write_bits(&os, is_final_block, 1);

	/* Output the block type */
	header_ostream_write_bits(&os, block_type, NUM_BLOCKTYPE_BITS);

	/* Output the block size */
	write_block_size(&os, block_size);

	/* Compute FSE state counts for each alphabet */

	c->codes.log2_num_literal_states =
		choose_state_counts(c->freqs.literal,
				    LITERAL_ALPHABET_SIZE,
				    MAX_LOG2_NUM_LITERAL_STATES,
				    c->codes.literal_state_counts);

	c->codes.log2_num_litrunlen_states =
		choose_state_counts(c->freqs.litrunlen,
				    LITRUNLEN_ALPHABET_SIZE,
				    MAX_LOG2_NUM_LITRUNLEN_STATES,
				    c->codes.litrunlen_state_counts);

	c->codes.log2_num_length_states =
		choose_state_counts(c->freqs.length,
				    LENGTH_ALPHABET_SIZE,
				    MAX_LOG2_NUM_LENGTH_STATES,
				    c->codes.length_state_counts);

	c->codes.log2_num_offset_states =
		choose_state_counts(c->freqs.offset,
				    MAX_OFFSET_ALPHABET_SIZE,
				    MAX_LOG2_NUM_OFFSET_STATES,
				    c->codes.offset_state_counts);

	if (block_type == BLOCKTYPE_ALIGNED) {
		c->codes.log2_num_aligned_states =
			choose_state_counts(c->freqs.aligned,
					    ALIGNED_ALPHABET_SIZE,
					    MAX_LOG2_NUM_ALIGNED_STATES,
					    c->codes.aligned_state_counts);
	}

	/* Output the FSE state counts for each alphabet */
	header_ostream_write_bits(&os, c->codes.log2_num_literal_states, 4);
	header_ostream_write_bits(&os, c->codes.log2_num_litrunlen_states, 4);
	header_ostream_write_bits(&os, c->codes.log2_num_length_states, 4);
	header_ostream_write_bits(&os, c->codes.log2_num_offset_states, 4);
	if (block_type == BLOCKTYPE_ALIGNED)
		header_ostream_write_bits(&os, c->codes.log2_num_aligned_states, 4);

	/* The per-alphabet state count arrays must be contiguous so they can
	 * be written with a single write_state_counts() call, with the aligned
	 * counts last (so they can be dropped for non-ALIGNED blocks). */
#ifndef _MSC_VER
	STATIC_ASSERT(offsetof(struct codes,
			       aligned_state_counts[ALIGNED_ALPHABET_SIZE]) ==
		      offsetof(struct codes, state_counts) + sizeof(c->codes.state_counts));
#endif
	num_state_counts = ARRAY_LEN(c->codes.state_counts);
	if (block_type != BLOCKTYPE_ALIGNED)
		num_state_counts -= ALIGNED_ALPHABET_SIZE;

	write_state_counts(&os, c->codes.state_counts, num_state_counts);

	/* Output the number of extra bytes (variable-length: 5-bit 'order'
	 * plus the low 'order' bits of num_extra_bytes + 1) */
	order = bsr32(c->num_extra_bytes + 1);
	header_ostream_write_bits(&os, order, 5);
	header_ostream_write_bits(&os,
				  (c->num_extra_bytes + 1) - ((u32)1 << order),
				  order);

	/* Align to the next byte boundary */
	header_size = header_ostream_flush(&os);
	if (header_size == 0)
		return 0;

	/* Add the extra bytes */
	if (c->num_extra_bytes >= out_nbytes_avail - header_size)
		return 0;
	memcpy((u8 *)out + header_size, c->extra_bytes, c->num_extra_bytes);
	header_size += c->num_extra_bytes;

	/* Build the FSE encoding tables for each alphabet */

	build_fse_encoding_tables(c, c->codes.literal_sym_encinfo,
				  c->codes.literal_next_statesx,
				  c->codes.literal_state_counts,
				  LITERAL_ALPHABET_SIZE,
				  c->codes.log2_num_literal_states);

	build_fse_encoding_tables(c, c->codes.litrunlen_sym_encinfo,
				  c->codes.litrunlen_next_statesx,
				  c->codes.litrunlen_state_counts,
				  LITRUNLEN_ALPHABET_SIZE,
				  c->codes.log2_num_litrunlen_states);

	build_fse_encoding_tables(c, c->codes.length_sym_encinfo,
				  c->codes.length_next_statesx,
				  c->codes.length_state_counts,
				  LENGTH_ALPHABET_SIZE,
				  c->codes.log2_num_length_states);

	build_fse_encoding_tables(c, c->codes.offset_sym_encinfo,
				  c->codes.offset_next_statesx,
				  c->codes.offset_state_counts,
				  MAX_OFFSET_ALPHABET_SIZE,
				  c->codes.log2_num_offset_states);

	if (block_type == BLOCKTYPE_ALIGNED) {
		build_fse_encoding_tables(c, c->codes.aligned_sym_encinfo,
					  c->codes.aligned_next_statesx,
					  c->codes.aligned_state_counts,
					  ALIGNED_ALPHABET_SIZE,
					  c->codes.log2_num_aligned_states);
	}

	/* Encode the items */

	items_size = encode_items(c, (u8 *)out + header_size,
				  out_nbytes_avail - header_size,
				  block_type == BLOCKTYPE_ALIGNED);
	if (items_size == 0)
		return 0;

	return header_size + items_size;
}

/*
 * Greedy parser: at each position take the longest match found (or a literal),
 * splitting the input into blocks via should_end_block().  Returns the
 * compressed size, or 0 if the output buffer was too small.
 */
static size_t
compress_greedy(struct xpack_compressor *c, void *out, size_t out_nbytes_avail)
{
	u8 * const out_begin = out;
	u8 * out_next = out_begin;
	u8 * const out_end = out_begin + out_nbytes_avail;
	const u8 * const in_begin = c->in_buffer;
	const u8 * in_next = in_begin;
	const u8 * const in_end = in_begin + c->in_nbytes;
	u32 max_len = MIN(c->in_nbytes, UINT32_MAX);
	u32 nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};
	u32 recent_offsets[NUM_REPS];

	init_recent_offsets(recent_offsets);
	hc_matchfinder_init(&c->hc_mf);

	do {
		/* Starting a new block */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next +
			MIN(SOFT_MAX_BLOCK_LENGTH, in_end - in_next);
		u32 length;
		u32 offset;
		size_t nbytes;
		u32 litrunlen = 0;

		begin_block(c);

		do {
			/* Near the end of the input, clamp the match limits */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(max_len, nice_len);
			}

			/* Find the longest match at the current position. */

			length = hc_matchfinder_longest_match(&c->hc_mf,
							      in_begin,
							      in_next - in_begin,
#if MIN_MATCH_LEN == 4
							      3,
#else
							      2,
#endif
							      max_len,
							      nice_len,
							      c->max_search_depth,
							      next_hashes,
							      &offset);
#if MIN_MATCH_LEN == 4
			if (length < 4) {
#else
			if (length < 3 || (length == 3 && offset >= 4096)) {
#endif
				/* Literal */
				observe_literal(&c->split_stats, *in_next);
				record_literal(c, *in_next);
				in_next++;
				litrunlen++;
			} else {
				/* Match.  Check the recent offsets first; a
				 * matching slot is swapped to the front. */
				struct match *match = &c->matches[c->num_matches++];

				STATIC_ASSERT(NUM_REPS >= 1 && NUM_REPS <= 4);

				observe_match(&c->split_stats, length);

				if (offset == recent_offsets[0]) {
					record_repeat_offset(c, match, 0);
				}
#if NUM_REPS >= 2
				else if (offset == recent_offsets[1]) {
					recent_offsets[1] = recent_offsets[0];
					record_repeat_offset(c, match, 1);
				}
#endif
#if NUM_REPS >= 3
				else if (offset == recent_offsets[2]) {
					recent_offsets[2] = recent_offsets[0];
					record_repeat_offset(c, match, 2);
				}
#endif
#if NUM_REPS >= 4
				else if (offset == recent_offsets[3]) {
					recent_offsets[3] = recent_offsets[0];
					record_repeat_offset(c, match, 3);
				}
#endif
				else {
					/* New explicit offset: shift the queue */
					record_explicit_offset(c, match, offset);
#if NUM_REPS >= 4
					recent_offsets[3] = recent_offsets[2];
#endif
#if NUM_REPS >= 3
					recent_offsets[2] = recent_offsets[1];
#endif
#if NUM_REPS >= 2
					recent_offsets[1] = recent_offsets[0];
#endif
				}
				recent_offsets[0] = offset;
				record_litrunlen(c, match, litrunlen);
				record_length(c, match, length);

				/* Advance past the match, indexing the skipped
				 * positions in the matchfinder */
				in_next = hc_matchfinder_skip_positions(&c->hc_mf,
									in_begin,
									in_next + 1 - in_begin,
									in_end - in_begin,
									length - 1,
									next_hashes);
				litrunlen = 0;
			}
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		nbytes = write_block(c, out_next, out_end - out_next,
				     in_next - in_block_begin, litrunlen,
				     in_next == in_end);
		if (nbytes == 0)
			return 0;

		out_next += nbytes;

	} while (in_next != in_end);

	return out_next - out_begin;
}

/*
 * Given a pointer to the current byte sequence and the current list of recent
 * match offsets, find the longest repeat offset match.
 *
 * If no match of at least MIN_MATCH_LEN bytes is found, then return 0.
 *
 * If a match of at least MIN_MATCH_LEN bytes is found, then return its length
 * and set *rep_max_idx_ret to the index of its offset in @queue.
 */
static u32
find_longest_repeat_offset_match(const u8 * const in_next,
				 const u32 max_len,
				 const u32 recent_offsets[],
				 unsigned *rep_max_idx_ret)
{
	/* Choose the initial-bytes loader that covers MIN_MATCH_LEN bytes */
#if MIN_MATCH_LEN == 2
#  define load_initial load_u16_unaligned
#elif MIN_MATCH_LEN == 3
#  define load_initial load_u24_unaligned
#elif MIN_MATCH_LEN == 4
#  define load_initial load_u32_unaligned
#else
#  error "unsupported MIN_MATCH_LEN"
#endif
	const u32 next_bytes = load_initial(in_next);
	const u8 *matchptr;
	u32 rep_len;
	u32 rep_max_len;
	unsigned rep_max_idx;

	STATIC_ASSERT(NUM_REPS >= 1 && NUM_REPS <= 4);

	matchptr = in_next - recent_offsets[0];
	if (load_initial(matchptr) == next_bytes)
		rep_max_len = lz_extend(in_next, matchptr, MIN_MATCH_LEN, max_len);
	else
		rep_max_len = 0;
	rep_max_idx = 0;

#if NUM_REPS >= 2
	matchptr = in_next - recent_offsets[1];
	if (load_initial(matchptr) == next_bytes) {
		rep_len = lz_extend(in_next, matchptr, MIN_MATCH_LEN, max_len);
		if (rep_len > rep_max_len) {
			rep_max_len = rep_len;
			rep_max_idx = 1;
		}
	}
#endif

#if NUM_REPS >= 3
	matchptr = in_next - recent_offsets[2];
	if (load_initial(matchptr) == next_bytes) {
		rep_len = lz_extend(in_next, matchptr, MIN_MATCH_LEN, max_len);
		if (rep_len > rep_max_len) {
			rep_max_len = rep_len;
			rep_max_idx = 2;
		}
	}
#endif

#if NUM_REPS >= 4
	matchptr = in_next - recent_offsets[3];
	if (load_initial(matchptr) == next_bytes) {
		rep_len = lz_extend(in_next, matchptr, MIN_MATCH_LEN, max_len);
		if (rep_len > rep_max_len) {
			rep_max_len = rep_len;
			rep_max_idx = 3;
		}
	}
#endif

	*rep_max_idx_ret = rep_max_idx;
	return rep_max_len;
}

/* Fast heuristic scoring for lazy parsing: how "good" is this match?
 * Longer is better; closer offsets get a small bonus. */
static forceinline u32
explicit_offset_match_score(u32 len, u32 adjusted_offset)
{
	u32 score = len;

	if (adjusted_offset < 4096)
		score++;

	if (adjusted_offset < 256)
		score++;

	return score;
}

/* Repeat offset matches get a fixed +3 bonus over their length.
 * ('rep_idx' is currently unused in the score.) */
static forceinline u32
repeat_offset_match_score(u32 rep_len, unsigned rep_idx)
{
	return rep_len + 3;
}

/*
 * Lazy parser: like the greedy parser, but before committing to a match it
 * also searches the next position for a better match (explicit or repeat
 * offset), preferring the higher-scoring one.  Returns the compressed size, or
 * 0 if the output buffer was too small.
 */
static size_t
compress_lazy(struct xpack_compressor *c, void *out, size_t out_nbytes_avail)
{
	u8 * const out_begin = out;
	u8 * out_next = out_begin;
	u8 * const out_end = out_begin + out_nbytes_avail;
	const u8 * const in_begin = c->in_buffer;
	const u8 * in_next = in_begin;
	const u8 * const in_end = in_begin + c->in_nbytes;
	u32 max_len = MIN(c->in_nbytes, UINT32_MAX);
	u32 nice_len = MIN(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};
	u32 recent_offsets[NUM_REPS];

	init_recent_offsets(recent_offsets);
	hc_matchfinder_init(&c->hc_mf);

	do {
		/* Starting a new block */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + MIN(SOFT_MAX_BLOCK_LENGTH, in_end - in_next);
		u32 cur_len;
		u32 cur_offset;
		u32 cur_offset_data;
		u32 cur_score;
		u32 next_len;
		u32 next_offset;
		u32 next_offset_data;
		u32 next_score;
		u32 rep_max_len;
		unsigned rep_max_idx;
		u32 rep_score;
		u32 skip_len;
		u32 litrunlen = 0;
		size_t nbytes;
		struct match *match;

		begin_block(c);

		do {
			/* Near the end of the input, clamp the match limits */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(max_len, nice_len);
			}

			/* Find the longest match at the
			   current position. */

			cur_len = hc_matchfinder_longest_match(&c->hc_mf,
							       in_begin,
							       in_next - in_begin,
#if MIN_MATCH_LEN == 4
							       3,
#else
							       2,
#endif
							       max_len,
							       nice_len,
							       c->max_search_depth,
							       next_hashes,
							       &cur_offset);
#if MIN_MATCH_LEN == 4
			if (cur_len < 4) {
#else
			if (cur_len < 3 || (cur_len == 3 && cur_offset >= 4096)) {
#endif
				/*
				 * There was no match found, or the only match
				 * found was a distant length 3 match.  Output a
				 * literal.
				 */
				observe_literal(&c->split_stats, *in_next);
				record_literal(c, *in_next);
				in_next++;
				litrunlen++;
				continue;
			}

			observe_match(&c->split_stats, cur_len);

			/* A match that exactly repeats the most recent offset
			 * is always taken immediately. */
			if (cur_offset == recent_offsets[0]) {
				in_next++;
				cur_offset_data = 0;
				skip_len = cur_len - 1;
				goto choose_cur_match;
			}

			/* 'cur_offset_data' packs the offset choice: values
			 * < NUM_REPS are repeat indices; otherwise it is the
			 * explicit offset biased by NUM_REPS - 1. */
			cur_offset_data = cur_offset + (NUM_REPS - 1);
			cur_score = explicit_offset_match_score(cur_len, cur_offset_data);

			/* Consider a repeat offset match. */
			rep_max_len = find_longest_repeat_offset_match(in_next,
								       in_end - in_next,
								       recent_offsets,
								       &rep_max_idx);
			in_next++;

			if (rep_max_len >= 3 &&
			    (rep_score = repeat_offset_match_score(rep_max_len,
								   rep_max_idx)) >= cur_score)
			{
				cur_len = rep_max_len;
				cur_offset_data = rep_max_idx;
				skip_len = rep_max_len - 1;
				goto choose_cur_match;
			}

		have_cur_match:

			/* We have a match at the current position. */

			/* If we have a very long match, choose it immediately. */
			if (cur_len >= nice_len) {
				skip_len = cur_len - 1;
				goto choose_cur_match;
			}

			/* See if there's a better match at the next position. */

			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = MIN(max_len, nice_len);
			}

			next_len = hc_matchfinder_longest_match(&c->hc_mf,
								in_begin,
								in_next - in_begin,
#if MIN_MATCH_LEN == 2
								cur_len - 2,
#else
								cur_len - 1,
#endif
								max_len,
								nice_len,
								c->max_search_depth / 2,
								next_hashes,
								&next_offset);

#if MIN_MATCH_LEN == 2
			if (next_len <= cur_len - 2) {
#else
			if (next_len <= cur_len - 1) {
#endif
				/* No sufficiently longer match at the next
				 * position; keep the current match. */
				in_next++;
				skip_len = cur_len - 2;
				goto choose_cur_match;
			}

			next_offset_data = next_offset + (NUM_REPS - 1);
			next_score = explicit_offset_match_score(next_len, next_offset_data);

			rep_max_len = find_longest_repeat_offset_match(in_next,
								       in_end - in_next,
								       recent_offsets,
								       &rep_max_idx);
			in_next++;

			if (rep_max_len >= 3 &&
			    (rep_score = repeat_offset_match_score(rep_max_len,
								   rep_max_idx)) >= next_score)
			{

				if (rep_score > cur_score) {
					/*
					 * The next match is better, and it's a
					 * repeat offset match.
					 */
					record_literal(c, *(in_next - 2));
					litrunlen++;
					cur_len = rep_max_len;
					cur_offset_data = rep_max_idx;
					skip_len = cur_len - 1;
					goto choose_cur_match;
				}
			} else {
				if (next_score > cur_score) {
					/*
					 * The next match is better, and it's an
					 * explicit offset match.
					 */
					record_literal(c, *(in_next - 2));
					litrunlen++;
					cur_len = next_len;
					cur_offset_data = next_offset_data;
					cur_score = next_score;
					goto have_cur_match;
				}
			}

			/* The original match was better. */
			skip_len = cur_len - 2;

		choose_cur_match:
			/* Commit to the chosen match: record it and update
			 * the recent offsets queue. */
			match = &c->matches[c->num_matches++];
			if (cur_offset_data < NUM_REPS) {
				u32 offset;

				record_repeat_offset(c, match, cur_offset_data);

				/* Move the used repeat offset to the front */
				offset = recent_offsets[cur_offset_data];
				recent_offsets[cur_offset_data] = recent_offsets[0];
				recent_offsets[0] = offset;
			} else {
				record_explicit_offset(c, match,
						       cur_offset_data - (NUM_REPS - 1));
				STATIC_ASSERT(NUM_REPS >= 1 && NUM_REPS <= 4);
#if NUM_REPS >= 4
				recent_offsets[3] = recent_offsets[2];
#endif
#if NUM_REPS >= 3
				recent_offsets[2] = recent_offsets[1];
#endif
#if NUM_REPS >= 2
				recent_offsets[1] = recent_offsets[0];
#endif
				recent_offsets[0] = cur_offset_data - (NUM_REPS - 1);
			}
			record_litrunlen(c, match, litrunlen);
			record_length(c, match, cur_len);
			litrunlen = 0;

			in_next = hc_matchfinder_skip_positions(&c->hc_mf,
								in_begin,
								in_next - in_begin,
								in_end - in_begin,
								skip_len,
								next_hashes);
		} while (in_next < in_max_block_end &&
			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));

		nbytes = write_block(c, out_next, out_end - out_next,
				     in_next - in_block_begin, litrunlen,
				     in_next == in_end);
		if (nbytes == 0)
			return 0;

		out_next += nbytes;

	} while (in_next != in_end);

	return out_next - out_begin;
}

/*
 * Allocate a compressor for buffers of up to 'max_buffer_size' bytes at the
 * given compression level (1-9: greedy parsing for 1-3, lazy for 4-9).
 * Returns NULL on allocation failure or invalid level.
 */
LIBEXPORT struct xpack_compressor *
xpack_alloc_compressor(size_t max_buffer_size, int compression_level)
{
	struct xpack_compressor *c;

	/* The matchfinder is a flexible trailing member sized for
	 * 'max_buffer_size' */
	c = malloc(offsetof(struct xpack_compressor, hc_mf) +
		   hc_matchfinder_size(max_buffer_size));
	if (!c)
		goto err0;

#ifdef ENABLE_PREPROCESSING
	c->in_buffer = malloc(max_buffer_size);
	if
 (!c->in_buffer)
		goto err1;
#endif

	c->max_buffer_size = max_buffer_size;

	/* Map the compression level to a parser and search parameters */
	switch (compression_level) {
	case 1:
		c->impl = compress_greedy;
		c->max_search_depth = 1;
		c->nice_match_length = MIN_MATCH_LEN;
		break;
	case 2:
		c->impl = compress_greedy;
		c->max_search_depth = 8;
		c->nice_match_length = 8;
		break;
	case 3:
		c->impl = compress_greedy;
		c->max_search_depth = 16;
		c->nice_match_length = 16;
		break;
	case 4:
		c->impl = compress_lazy;
		c->max_search_depth = 8;
		c->nice_match_length = 12;
		break;
	case 5:
		c->impl = compress_lazy;
		c->max_search_depth = 16;
		c->nice_match_length = 24;
		break;
	case 6:
		c->impl = compress_lazy;
		c->max_search_depth = 32;
		c->nice_match_length = 48;
		break;
	case 7:
		c->impl = compress_lazy;
		c->max_search_depth = 64;
		c->nice_match_length = 96;
		break;
	case 8:
		c->impl = compress_lazy;
		c->max_search_depth = 128;
		c->nice_match_length = 192;
		break;
	case 9:
		c->impl = compress_lazy;
		c->max_search_depth = 256;
		c->nice_match_length = 384;
		STATIC_ASSERT(EXTRA_LITERAL_SPACE >= 384 * 4 / 3);
		break;
	default:
		goto err2;
	}

	/* max_search_depth == 0 is invalid */
	if (c->max_search_depth < 1)
		c->max_search_depth = 1;

	return c;

err2:
#ifdef ENABLE_PREPROCESSING
	free(c->in_buffer);
err1:
#endif
	free(c);
err0:
	return NULL;
}

/*
 * Compress 'in_nbytes' bytes of data from 'in' into 'out'.  Returns the
 * compressed size, or 0 if the data could not be compressed to fit in
 * 'out_nbytes_avail' bytes (or was too small/large to bother with).
 */
LIBEXPORT size_t
xpack_compress(struct xpack_compressor *c, const void *in, size_t in_nbytes,
	       void *out, size_t out_nbytes_avail)
{
	/* Don't bother trying to compress very small inputs. */
	if (in_nbytes < 100)
		return 0;

	/* Safety check: the matchfinder was sized for max_buffer_size */
	if (unlikely(in_nbytes > c->max_buffer_size))
		return 0;

#ifdef ENABLE_PREPROCESSING
	/* Copy the input data into the internal buffer and preprocess it. */
	memcpy(c->in_buffer, in, in_nbytes);
	c->in_nbytes = in_nbytes;
	preprocess(c->in_buffer, in_nbytes);
#else
	/* Preprocessing is disabled.  No internal buffer is needed. */
	c->in_buffer = (void *)in;
	c->in_nbytes = in_nbytes;
#endif

	return (*c->impl)(c, out, out_nbytes_avail);
}

/* Free a compressor allocated by xpack_alloc_compressor() (NULL is a no-op). */
LIBEXPORT void
xpack_free_compressor(struct xpack_compressor *c)
{
	if (c) {
#ifdef ENABLE_PREPROCESSING
		free(c->in_buffer);
#endif
		free(c);
	}
}

#endif /* !DECOMPRESSION_ONLY */