├── .cirrus.yml ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CMakeLists.txt ├── COPYING ├── NEWS.md ├── README.md ├── common_defs.h ├── lib ├── adler32.c ├── arm │ ├── adler32_impl.h │ ├── cpu_features.c │ ├── cpu_features.h │ ├── crc32_impl.h │ ├── crc32_pmull_helpers.h │ ├── crc32_pmull_wide.h │ └── matchfinder_impl.h ├── bt_matchfinder.h ├── cpu_features_common.h ├── crc32.c ├── crc32_multipliers.h ├── crc32_tables.h ├── decompress_template.h ├── deflate_compress.c ├── deflate_compress.h ├── deflate_constants.h ├── deflate_decompress.c ├── gzip_compress.c ├── gzip_constants.h ├── gzip_decompress.c ├── hc_matchfinder.h ├── ht_matchfinder.h ├── lib_common.h ├── matchfinder_common.h ├── riscv │ └── matchfinder_impl.h ├── utils.c ├── x86 │ ├── adler32_impl.h │ ├── adler32_template.h │ ├── cpu_features.c │ ├── cpu_features.h │ ├── crc32_impl.h │ ├── crc32_pclmul_template.h │ ├── decompress_impl.h │ └── matchfinder_impl.h ├── zlib_compress.c ├── zlib_constants.h └── zlib_decompress.c ├── libdeflate-config.cmake.in ├── libdeflate.h ├── libdeflate.pc.in ├── programs ├── CMakeLists.txt ├── benchmark.c ├── checksum.c ├── config.h.in ├── gzip.c ├── prog_util.c ├── prog_util.h ├── test_checksums.c ├── test_custom_malloc.c ├── test_incomplete_codes.c ├── test_invalid_streams.c ├── test_litrunlen_overflow.c ├── test_overread.c ├── test_slow_decompression.c ├── test_trailing_bytes.c ├── test_util.c ├── test_util.h └── tgetopt.c └── scripts ├── android_build.sh ├── android_tests.sh ├── benchmark.sh ├── checksum.sh ├── checksum_benchmarks.sh ├── cmake-helper.sh ├── deflate_benchmarks.sh ├── exec_tests.sh ├── gen-crc32-consts.py ├── gen-release-archives.sh ├── gen_bitreverse_tab.py ├── gen_default_litlen_costs.py ├── gen_offset_slot_map.py ├── gzip_tests.sh ├── libFuzzer ├── .gitignore ├── deflate_compress │ ├── corpus │ │ └── 0 │ └── fuzz.c ├── deflate_decompress │ ├── corpus │ │ └── 0 │ └── fuzz.c ├── fuzz.sh ├── gzip_decompress │ ├── corpus │ │ └── 0 │ └── fuzz.c 
└── zlib_decompress │ ├── corpus │ └── 0 │ └── fuzz.c ├── run_tests.sh ├── toolchain-i686-w64-mingw32.cmake └── toolchain-x86_64-w64-mingw32.cmake /.cirrus.yml: -------------------------------------------------------------------------------- 1 | task: 2 | freebsd_instance: 3 | matrix: 4 | - image_family: freebsd-14-2 5 | install_script: pkg install -y cmake 6 | script: 7 | - cmake -B build -DLIBDEFLATE_BUILD_TESTS=1 8 | - cmake --build build 9 | - ctest --test-dir build 10 | # Direct compilation without official build system 11 | - cc -O2 -Wall -Werror lib/*.c lib/*/*.c programs/gzip.c programs/prog_util.c programs/tgetopt.c -o libdeflate-gzip 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build* 2 | /libdeflate-*-windows-* 3 | /libdeflate-*.tar.gz 4 | cscope* 5 | tags 6 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright 2016 Eric Biggers 2 | Copyright 2024 Google LLC 3 | 4 | Permission is hereby granted, free of charge, to any person 5 | obtaining a copy of this software and associated documentation files 6 | (the "Software"), to deal in the Software without restriction, 7 | including without limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of the Software, 9 | and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /lib/adler32.c: -------------------------------------------------------------------------------- 1 | /* 2 | * adler32.c - Adler-32 checksum algorithm 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 
26 | */ 27 | 28 | #include "lib_common.h" 29 | 30 | /* The Adler-32 divisor, or "base", value */ 31 | #define DIVISOR 65521 32 | 33 | /* 34 | * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility 35 | * of s2 overflowing when it is represented as an unsigned 32-bit integer. This 36 | * value was computed using the following Python script: 37 | * 38 | * divisor = 65521 39 | * count = 0 40 | * s1 = divisor - 1 41 | * s2 = divisor - 1 42 | * while True: 43 | * s1 += 0xFF 44 | * s2 += s1 45 | * if s2 > 0xFFFFFFFF: 46 | * break 47 | * count += 1 48 | * print(count) 49 | * 50 | * Note that to get the correct worst-case value, we must assume that every byte 51 | * has value 0xFF and that s1 and s2 started with the highest possible values 52 | * modulo the divisor. 53 | */ 54 | #define MAX_CHUNK_LEN 5552 55 | 56 | /* 57 | * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n, 58 | * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither 59 | * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes 60 | * already processed after the last reduction must not exceed MAX_CHUNK_LEN. 61 | * 62 | * This uses only portable C code. This is used as a fallback when a vectorized 63 | * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform. 64 | * 65 | * Some of the vectorized implementations also use this to handle the end of the 66 | * data when the data isn't evenly divisible by the length the vectorized code 67 | * works on. To avoid compiler errors about target-specific option mismatches 68 | * when this is used in that way, this is a macro rather than a function. 69 | * 70 | * Although this is unvectorized, this does include an optimization where the 71 | * main loop processes four bytes at a time using a strategy similar to that 72 | * used by vectorized implementations. 
This provides increased instruction- 73 | * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'. 74 | */ 75 | #define ADLER32_CHUNK(s1, s2, p, n) \ 76 | do { \ 77 | if (n >= 4) { \ 78 | u32 s1_sum = 0; \ 79 | u32 byte_0_sum = 0; \ 80 | u32 byte_1_sum = 0; \ 81 | u32 byte_2_sum = 0; \ 82 | u32 byte_3_sum = 0; \ 83 | \ 84 | do { \ 85 | s1_sum += s1; \ 86 | s1 += p[0] + p[1] + p[2] + p[3]; \ 87 | byte_0_sum += p[0]; \ 88 | byte_1_sum += p[1]; \ 89 | byte_2_sum += p[2]; \ 90 | byte_3_sum += p[3]; \ 91 | p += 4; \ 92 | n -= 4; \ 93 | } while (n >= 4); \ 94 | s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + \ 95 | (2 * byte_2_sum) + byte_3_sum; \ 96 | } \ 97 | for (; n; n--, p++) { \ 98 | s1 += *p; \ 99 | s2 += s1; \ 100 | } \ 101 | s1 %= DIVISOR; \ 102 | s2 %= DIVISOR; \ 103 | } while (0) 104 | 105 | static u32 MAYBE_UNUSED 106 | adler32_generic(u32 adler, const u8 *p, size_t len) 107 | { 108 | u32 s1 = adler & 0xFFFF; 109 | u32 s2 = adler >> 16; 110 | 111 | while (len) { 112 | size_t n = MIN(len, MAX_CHUNK_LEN & ~3); 113 | 114 | len -= n; 115 | ADLER32_CHUNK(s1, s2, p, n); 116 | } 117 | 118 | return (s2 << 16) | s1; 119 | } 120 | 121 | /* Include architecture-specific implementation(s) if available. */ 122 | #undef DEFAULT_IMPL 123 | #undef arch_select_adler32_func 124 | typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len); 125 | #if defined(ARCH_ARM32) || defined(ARCH_ARM64) 126 | # include "arm/adler32_impl.h" 127 | #elif defined(ARCH_X86_32) || defined(ARCH_X86_64) 128 | # include "x86/adler32_impl.h" 129 | #endif 130 | 131 | #ifndef DEFAULT_IMPL 132 | # define DEFAULT_IMPL adler32_generic 133 | #endif 134 | 135 | #ifdef arch_select_adler32_func 136 | static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len); 137 | 138 | static volatile adler32_func_t adler32_impl = dispatch_adler32; 139 | 140 | /* Choose the best implementation at runtime. 
*/ 141 | static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len) 142 | { 143 | adler32_func_t f = arch_select_adler32_func(); 144 | 145 | if (f == NULL) 146 | f = DEFAULT_IMPL; 147 | 148 | adler32_impl = f; 149 | return f(adler, p, len); 150 | } 151 | #else 152 | /* The best implementation is statically known, so call it directly. */ 153 | #define adler32_impl DEFAULT_IMPL 154 | #endif 155 | 156 | LIBDEFLATEAPI u32 157 | libdeflate_adler32(u32 adler, const void *buffer, size_t len) 158 | { 159 | if (buffer == NULL) /* Return initial value. */ 160 | return 1; 161 | return adler32_impl(adler, buffer, len); 162 | } 163 | -------------------------------------------------------------------------------- /lib/arm/cpu_features.c: -------------------------------------------------------------------------------- 1 | /* 2 | * arm/cpu_features.c - feature detection for ARM CPUs 3 | * 4 | * Copyright 2018 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | /* 29 | * ARM CPUs don't have a standard way for unprivileged programs to detect CPU 30 | * features. But an OS-specific way can be used when available. 31 | */ 32 | 33 | #ifdef __APPLE__ 34 | # undef _ANSI_SOURCE 35 | # undef _DARWIN_C_SOURCE 36 | # define _DARWIN_C_SOURCE /* for sysctlbyname() */ 37 | #endif 38 | 39 | #include "../cpu_features_common.h" /* must be included first */ 40 | #include "cpu_features.h" 41 | 42 | #ifdef ARM_CPU_FEATURES_KNOWN 43 | /* Runtime ARM CPU feature detection is supported. */ 44 | 45 | #ifdef __linux__ 46 | /* 47 | * On Linux, arm32 and arm64 CPU features can be detected by reading the 48 | * AT_HWCAP and AT_HWCAP2 values from /proc/self/auxv. 49 | * 50 | * Ideally we'd use the C library function getauxval(), but it's not guaranteed 51 | * to be available: it was only added to glibc in 2.16, and in Android it was 52 | * added to API level 18 for arm32 and level 21 for arm64. 
53 | */ 54 | 55 | #include <errno.h> 56 | #include <fcntl.h> 57 | #include <string.h> 58 | #include <unistd.h> 59 | 60 | #define AT_HWCAP 16 61 | #define AT_HWCAP2 26 62 | 63 | static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) 64 | { 65 | int fd; 66 | unsigned long auxbuf[32]; 67 | int filled = 0; 68 | int i; 69 | 70 | fd = open("/proc/self/auxv", O_RDONLY); 71 | if (fd < 0) 72 | return; 73 | 74 | for (;;) { 75 | do { 76 | int ret = read(fd, &((char *)auxbuf)[filled], 77 | sizeof(auxbuf) - filled); 78 | if (ret <= 0) { 79 | if (ret < 0 && errno == EINTR) 80 | continue; 81 | goto out; 82 | } 83 | filled += ret; 84 | } while (filled < 2 * sizeof(long)); 85 | 86 | i = 0; 87 | do { 88 | unsigned long type = auxbuf[i]; 89 | unsigned long value = auxbuf[i + 1]; 90 | 91 | if (type == AT_HWCAP) 92 | *hwcap = value; 93 | else if (type == AT_HWCAP2) 94 | *hwcap2 = value; 95 | i += 2; 96 | filled -= 2 * sizeof(long); 97 | } while (filled >= 2 * sizeof(long)); 98 | 99 | memmove(auxbuf, &auxbuf[i], filled); 100 | } 101 | out: 102 | close(fd); 103 | } 104 | 105 | static u32 query_arm_cpu_features(void) 106 | { 107 | u32 features = 0; 108 | unsigned long hwcap = 0; 109 | unsigned long hwcap2 = 0; 110 | 111 | scan_auxv(&hwcap, &hwcap2); 112 | 113 | #ifdef ARCH_ARM32 114 | STATIC_ASSERT(sizeof(long) == 4); 115 | if (hwcap & (1 << 12)) /* HWCAP_NEON */ 116 | features |= ARM_CPU_FEATURE_NEON; 117 | #else 118 | STATIC_ASSERT(sizeof(long) == 8); 119 | if (hwcap & (1 << 1)) /* HWCAP_ASIMD */ 120 | features |= ARM_CPU_FEATURE_NEON; 121 | if (hwcap & (1 << 4)) /* HWCAP_PMULL */ 122 | features |= ARM_CPU_FEATURE_PMULL; 123 | if (hwcap & (1 << 7)) /* HWCAP_CRC32 */ 124 | features |= ARM_CPU_FEATURE_CRC32; 125 | if (hwcap & (1 << 17)) /* HWCAP_SHA3 */ 126 | features |= ARM_CPU_FEATURE_SHA3; 127 | if (hwcap & (1 << 20)) /* HWCAP_ASIMDDP */ 128 | features |= ARM_CPU_FEATURE_DOTPROD; 129 | #endif 130 | return features; 131 | } 132 | 133 | #elif defined(__APPLE__) 134 | /* On Apple platforms, arm64 CPU features can be 
detected via sysctlbyname(). */ 135 | 136 | #include <sys/types.h> 137 | #include <sys/sysctl.h> 138 | #include <TargetConditionals.h> 139 | 140 | static const struct { 141 | const char *name; 142 | u32 feature; 143 | } feature_sysctls[] = { 144 | { "hw.optional.neon", ARM_CPU_FEATURE_NEON }, 145 | { "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON }, 146 | { "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL }, 147 | { "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 }, 148 | { "hw.optional.armv8_2_sha3", ARM_CPU_FEATURE_SHA3 }, 149 | { "hw.optional.arm.FEAT_SHA3", ARM_CPU_FEATURE_SHA3 }, 150 | { "hw.optional.arm.FEAT_DotProd", ARM_CPU_FEATURE_DOTPROD }, 151 | }; 152 | 153 | static u32 query_arm_cpu_features(void) 154 | { 155 | u32 features = 0; 156 | size_t i; 157 | 158 | for (i = 0; i < ARRAY_LEN(feature_sysctls); i++) { 159 | const char *name = feature_sysctls[i].name; 160 | u32 val = 0; 161 | size_t valsize = sizeof(val); 162 | 163 | if (sysctlbyname(name, &val, &valsize, NULL, 0) == 0 && 164 | valsize == sizeof(val) && val == 1) 165 | features |= feature_sysctls[i].feature; 166 | } 167 | return features; 168 | } 169 | #elif defined(_WIN32) 170 | 171 | #include <windows.h> 172 | 173 | #ifndef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE /* added in Windows SDK 20348 */ 174 | # define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE 43 175 | #endif 176 | 177 | static u32 query_arm_cpu_features(void) 178 | { 179 | u32 features = ARM_CPU_FEATURE_NEON; 180 | 181 | if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) 182 | features |= ARM_CPU_FEATURE_PMULL; 183 | if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) 184 | features |= ARM_CPU_FEATURE_CRC32; 185 | if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) 186 | features |= ARM_CPU_FEATURE_DOTPROD; 187 | 188 | /* FIXME: detect SHA3 support too. 
*/ 189 | 190 | return features; 191 | } 192 | #else 193 | #error "unhandled case" 194 | #endif 195 | 196 | static const struct cpu_feature arm_cpu_feature_table[] = { 197 | {ARM_CPU_FEATURE_NEON, "neon"}, 198 | {ARM_CPU_FEATURE_PMULL, "pmull"}, 199 | {ARM_CPU_FEATURE_PREFER_PMULL, "prefer_pmull"}, 200 | {ARM_CPU_FEATURE_CRC32, "crc32"}, 201 | {ARM_CPU_FEATURE_SHA3, "sha3"}, 202 | {ARM_CPU_FEATURE_DOTPROD, "dotprod"}, 203 | }; 204 | 205 | volatile u32 libdeflate_arm_cpu_features = 0; 206 | 207 | void libdeflate_init_arm_cpu_features(void) 208 | { 209 | u32 features = query_arm_cpu_features(); 210 | 211 | /* 212 | * On the Apple M1 processor, crc32 instructions max out at about 25.5 213 | * GB/s in the best case of using a 3-way or greater interleaved chunked 214 | * implementation, whereas a pmull-based implementation achieves 68 GB/s 215 | * provided that the stride length is large enough (about 10+ vectors 216 | * with eor3, or 12+ without). 217 | * 218 | * Assume that crc32 instructions are preferable in other cases. 
219 | */ 220 | #if (defined(__APPLE__) && TARGET_OS_OSX) || defined(TEST_SUPPORT__DO_NOT_USE) 221 | features |= ARM_CPU_FEATURE_PREFER_PMULL; 222 | #endif 223 | 224 | disable_cpu_features_for_testing(&features, arm_cpu_feature_table, 225 | ARRAY_LEN(arm_cpu_feature_table)); 226 | 227 | libdeflate_arm_cpu_features = features | ARM_CPU_FEATURES_KNOWN; 228 | } 229 | 230 | #endif /* ARM_CPU_FEATURES_KNOWN */ 231 | -------------------------------------------------------------------------------- /lib/arm/cpu_features.h: -------------------------------------------------------------------------------- 1 | /* 2 | * arm/cpu_features.h - feature detection for ARM CPUs 3 | * 4 | * Copyright 2018 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 
26 | */ 27 | 28 | #ifndef LIB_ARM_CPU_FEATURES_H 29 | #define LIB_ARM_CPU_FEATURES_H 30 | 31 | #include "../lib_common.h" 32 | 33 | #if defined(ARCH_ARM32) || defined(ARCH_ARM64) 34 | 35 | #define ARM_CPU_FEATURE_NEON (1 << 0) 36 | #define ARM_CPU_FEATURE_PMULL (1 << 1) 37 | /* 38 | * PREFER_PMULL indicates that the CPU has very high pmull throughput, and so 39 | * the 12x wide pmull-based CRC-32 implementation is likely to be faster than an 40 | * implementation based on the crc32 instructions. 41 | */ 42 | #define ARM_CPU_FEATURE_PREFER_PMULL (1 << 2) 43 | #define ARM_CPU_FEATURE_CRC32 (1 << 3) 44 | #define ARM_CPU_FEATURE_SHA3 (1 << 4) 45 | #define ARM_CPU_FEATURE_DOTPROD (1 << 5) 46 | 47 | #if !defined(FREESTANDING) && \ 48 | (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \ 49 | (defined(__linux__) || \ 50 | (defined(__APPLE__) && defined(ARCH_ARM64)) || \ 51 | (defined(_WIN32) && defined(ARCH_ARM64))) 52 | /* Runtime ARM CPU feature detection is supported. */ 53 | # define ARM_CPU_FEATURES_KNOWN (1U << 31) 54 | extern volatile u32 libdeflate_arm_cpu_features; 55 | 56 | void libdeflate_init_arm_cpu_features(void); 57 | 58 | static inline u32 get_arm_cpu_features(void) 59 | { 60 | if (libdeflate_arm_cpu_features == 0) 61 | libdeflate_init_arm_cpu_features(); 62 | return libdeflate_arm_cpu_features; 63 | } 64 | #else 65 | static inline u32 get_arm_cpu_features(void) { return 0; } 66 | #endif 67 | 68 | /* NEON */ 69 | #if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(ARCH_ARM64)) 70 | # define HAVE_NEON(features) 1 71 | # define HAVE_NEON_NATIVE 1 72 | #else 73 | # define HAVE_NEON(features) ((features) & ARM_CPU_FEATURE_NEON) 74 | # define HAVE_NEON_NATIVE 0 75 | #endif 76 | /* 77 | * With both gcc and clang, NEON intrinsics require that the main target has 78 | * NEON enabled already. Exception: with gcc 6.1 and later (r230411 for arm32, 79 | * r226563 for arm64), hardware floating point support is sufficient. 
80 | */ 81 | #if (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \ 82 | (HAVE_NEON_NATIVE || (GCC_PREREQ(6, 1) && defined(__ARM_FP))) 83 | # define HAVE_NEON_INTRIN 1 84 | # include <arm_neon.h> 85 | #else 86 | # define HAVE_NEON_INTRIN 0 87 | #endif 88 | 89 | /* PMULL */ 90 | #ifdef __ARM_FEATURE_CRYPTO 91 | # define HAVE_PMULL(features) 1 92 | #else 93 | # define HAVE_PMULL(features) ((features) & ARM_CPU_FEATURE_PMULL) 94 | #endif 95 | #if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ 96 | (GCC_PREREQ(7, 1) || defined(__clang__) || defined(_MSC_VER)) && \ 97 | CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ 98 | # define HAVE_PMULL_INTRIN 1 99 | /* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */ 100 | # ifdef _MSC_VER 101 | # define compat_vmull_p64(a, b) vmull_p64(vcreate_p64(a), vcreate_p64(b)) 102 | # else 103 | # define compat_vmull_p64(a, b) vmull_p64((a), (b)) 104 | # endif 105 | #else 106 | # define HAVE_PMULL_INTRIN 0 107 | #endif 108 | 109 | /* CRC32 */ 110 | #ifdef __ARM_FEATURE_CRC32 111 | # define HAVE_CRC32(features) 1 112 | #else 113 | # define HAVE_CRC32(features) ((features) & ARM_CPU_FEATURE_CRC32) 114 | #endif 115 | #if defined(ARCH_ARM64) && \ 116 | (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) 117 | # define HAVE_CRC32_INTRIN 1 118 | # if defined(__GNUC__) || defined(__clang__) 119 | # include <arm_acle.h> 120 | # endif 121 | /* 122 | * Use an inline assembly fallback for clang 15 and earlier, which only 123 | * defined the crc32 intrinsics when crc32 is enabled in the main target. 
124 | */ 125 | # if defined(__clang__) && !CLANG_PREREQ(16, 0, 16000000) && \ 126 | !defined(__ARM_FEATURE_CRC32) 127 | # undef __crc32b 128 | # define __crc32b(a, b) \ 129 | ({ uint32_t res; \ 130 | __asm__("crc32b %w0, %w1, %w2" \ 131 | : "=r" (res) : "r" (a), "r" (b)); \ 132 | res; }) 133 | # undef __crc32h 134 | # define __crc32h(a, b) \ 135 | ({ uint32_t res; \ 136 | __asm__("crc32h %w0, %w1, %w2" \ 137 | : "=r" (res) : "r" (a), "r" (b)); \ 138 | res; }) 139 | # undef __crc32w 140 | # define __crc32w(a, b) \ 141 | ({ uint32_t res; \ 142 | __asm__("crc32w %w0, %w1, %w2" \ 143 | : "=r" (res) : "r" (a), "r" (b)); \ 144 | res; }) 145 | # undef __crc32d 146 | # define __crc32d(a, b) \ 147 | ({ uint32_t res; \ 148 | __asm__("crc32x %w0, %w1, %2" \ 149 | : "=r" (res) : "r" (a), "r" (b)); \ 150 | res; }) 151 | # pragma clang diagnostic ignored "-Wgnu-statement-expression" 152 | # endif 153 | #else 154 | # define HAVE_CRC32_INTRIN 0 155 | #endif 156 | 157 | /* SHA3 (needed for the eor3 instruction) */ 158 | #ifdef __ARM_FEATURE_SHA3 159 | # define HAVE_SHA3(features) 1 160 | #else 161 | # define HAVE_SHA3(features) ((features) & ARM_CPU_FEATURE_SHA3) 162 | #endif 163 | #if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ 164 | (GCC_PREREQ(9, 1) /* r268049 */ || \ 165 | CLANG_PREREQ(7, 0, 10010463) /* r338010 */) 166 | # define HAVE_SHA3_INTRIN 1 167 | /* 168 | * Use an inline assembly fallback for clang 15 and earlier, which only 169 | * defined the sha3 intrinsics when sha3 is enabled in the main target. 
170 | */ 171 | # if defined(__clang__) && !CLANG_PREREQ(16, 0, 16000000) && \ 172 | !defined(__ARM_FEATURE_SHA3) 173 | # undef veor3q_u8 174 | # define veor3q_u8(a, b, c) \ 175 | ({ uint8x16_t res; \ 176 | __asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b" \ 177 | : "=w" (res) : "w" (a), "w" (b), "w" (c)); \ 178 | res; }) 179 | # pragma clang diagnostic ignored "-Wgnu-statement-expression" 180 | # endif 181 | #else 182 | # define HAVE_SHA3_INTRIN 0 183 | #endif 184 | 185 | /* dotprod */ 186 | #ifdef __ARM_FEATURE_DOTPROD 187 | # define HAVE_DOTPROD(features) 1 188 | #else 189 | # define HAVE_DOTPROD(features) ((features) & ARM_CPU_FEATURE_DOTPROD) 190 | #endif 191 | #if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ 192 | (GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || defined(_MSC_VER)) 193 | # define HAVE_DOTPROD_INTRIN 1 194 | /* 195 | * Use an inline assembly fallback for clang 15 and earlier, which only 196 | * defined the dotprod intrinsics when dotprod is enabled in the main target. 
197 | */ 198 | # if defined(__clang__) && !CLANG_PREREQ(16, 0, 16000000) && \ 199 | !defined(__ARM_FEATURE_DOTPROD) 200 | # undef vdotq_u32 201 | # define vdotq_u32(a, b, c) \ 202 | ({ uint32x4_t res = (a); \ 203 | __asm__("udot %0.4s, %1.16b, %2.16b" \ 204 | : "+w" (res) : "w" (b), "w" (c)); \ 205 | res; }) 206 | # pragma clang diagnostic ignored "-Wgnu-statement-expression" 207 | # endif 208 | #else 209 | # define HAVE_DOTPROD_INTRIN 0 210 | #endif 211 | 212 | #endif /* ARCH_ARM32 || ARCH_ARM64 */ 213 | 214 | #endif /* LIB_ARM_CPU_FEATURES_H */ 215 | -------------------------------------------------------------------------------- /lib/arm/crc32_pmull_helpers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * arm/crc32_pmull_helpers.h - helper functions for CRC-32 folding with PMULL 3 | * 4 | * Copyright 2022 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | /* 29 | * This file is a "template" for instantiating helper functions for CRC folding 30 | * with pmull instructions. It accepts the following parameters: 31 | * 32 | * SUFFIX: 33 | * Name suffix to append to all instantiated functions. 34 | * ATTRIBUTES: 35 | * Target function attributes to use. 36 | * ENABLE_EOR3: 37 | * Use the eor3 instruction (from the sha3 extension). 38 | */ 39 | 40 | /* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */ 41 | #undef u32_to_bytevec 42 | static forceinline ATTRIBUTES uint8x16_t 43 | ADD_SUFFIX(u32_to_bytevec)(u32 a) 44 | { 45 | return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0)); 46 | } 47 | #define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec) 48 | 49 | /* Load two 64-bit values into a vector. */ 50 | #undef load_multipliers 51 | static forceinline ATTRIBUTES poly64x2_t 52 | ADD_SUFFIX(load_multipliers)(const u64 p[2]) 53 | { 54 | return vreinterpretq_p64_u64(vld1q_u64(p)); 55 | } 56 | #define load_multipliers ADD_SUFFIX(load_multipliers) 57 | 58 | /* Do carryless multiplication of the low halves of two vectors. */ 59 | #undef clmul_low 60 | static forceinline ATTRIBUTES uint8x16_t 61 | ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b) 62 | { 63 | return vreinterpretq_u8_p128( 64 | compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), 65 | vgetq_lane_p64(b, 0))); 66 | } 67 | #define clmul_low ADD_SUFFIX(clmul_low) 68 | 69 | /* Do carryless multiplication of the high halves of two vectors. 
*/ 70 | #undef clmul_high 71 | static forceinline ATTRIBUTES uint8x16_t 72 | ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b) 73 | { 74 | #ifdef __clang__ 75 | /* 76 | * Use inline asm to ensure that pmull2 is really used. This works 77 | * around clang bug https://github.com/llvm/llvm-project/issues/52868. 78 | */ 79 | uint8x16_t res; 80 | 81 | __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b)); 82 | return res; 83 | #else 84 | return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b)); 85 | #endif 86 | } 87 | #define clmul_high ADD_SUFFIX(clmul_high) 88 | 89 | #undef eor3 90 | static forceinline ATTRIBUTES uint8x16_t 91 | ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c) 92 | { 93 | #if ENABLE_EOR3 94 | return veor3q_u8(a, b, c); 95 | #else 96 | return veorq_u8(veorq_u8(a, b), c); 97 | #endif 98 | } 99 | #define eor3 ADD_SUFFIX(eor3) 100 | 101 | #undef fold_vec 102 | static forceinline ATTRIBUTES uint8x16_t 103 | ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers) 104 | { 105 | uint8x16_t a = clmul_low(src, multipliers); 106 | uint8x16_t b = clmul_high(src, multipliers); 107 | 108 | return eor3(a, b, dst); 109 | } 110 | #define fold_vec ADD_SUFFIX(fold_vec) 111 | 112 | /* 113 | * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the 114 | * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the 115 | * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes, 116 | * respectively. Then fold x0 into x1 and return the result. Assumes that 117 | * 'p + len - 16' is in-bounds. 118 | */ 119 | #undef fold_partial_vec 120 | static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t 121 | ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len, 122 | poly64x2_t multipliers_1) 123 | { 124 | /* 125 | * vqtbl1q_u8(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. 126 | * vqtbl1q_u8(v, shift_tab[len+16..len+31]) right shifts v by len bytes. 
127 | */ 128 | static const u8 shift_tab[48] = { 129 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 130 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 131 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 132 | 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 133 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 134 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 135 | }; 136 | const uint8x16_t lshift = vld1q_u8(&shift_tab[len]); 137 | const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]); 138 | uint8x16_t x0, x1, bsl_mask; 139 | 140 | /* x0 = v left-shifted by '16 - len' bytes */ 141 | x0 = vqtbl1q_u8(v, lshift); 142 | 143 | /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */ 144 | bsl_mask = vreinterpretq_u8_s8( 145 | vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7)); 146 | 147 | /* 148 | * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len' 149 | * bytes) followed by the remaining data. 150 | */ 151 | x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */, 152 | vld1q_u8(p + len - 16), vqtbl1q_u8(v, rshift)); 153 | 154 | return fold_vec(x0, x1, multipliers_1); 155 | } 156 | #define fold_partial_vec ADD_SUFFIX(fold_partial_vec) 157 | -------------------------------------------------------------------------------- /lib/arm/crc32_pmull_wide.h: -------------------------------------------------------------------------------- 1 | /* 2 | * arm/crc32_pmull_wide.h - gzip CRC-32 with PMULL (extra-wide version) 3 | * 4 | * Copyright 2022 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | 
* conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | /* 29 | * This file is a "template" for instantiating PMULL-based crc32_arm functions. 30 | * The "parameters" are: 31 | * 32 | * SUFFIX: 33 | * Name suffix to append to all instantiated functions. 34 | * ATTRIBUTES: 35 | * Target function attributes to use. 36 | * ENABLE_EOR3: 37 | * Use the eor3 instruction (from the sha3 extension). 38 | * 39 | * This is the extra-wide version; it uses an unusually large stride length of 40 | * 12, and it assumes that crc32 instructions are available too. It's intended 41 | * for powerful CPUs that support both pmull and crc32 instructions, but where 42 | * throughput of pmull and xor (given enough instructions issued in parallel) is 43 | * significantly higher than that of crc32, thus making the crc32 instructions 44 | * (counterintuitively) not actually the fastest way to compute the CRC-32. The 45 | * Apple M1 processor is an example of such a CPU. 
46 | */ 47 | 48 | #include "crc32_pmull_helpers.h" 49 | /* Update the CRC-32 value crc with the len bytes at p and return the result. Inputs shorter than 64 bytes are handled entirely by the crc32-instruction tail. Other inputs shorter than 3 * 192 bytes are folded at most 4 vectors (64 bytes) at a time without aligning p. Longer inputs first advance p to a 16-byte boundary using crc32 instructions, then fold 12 vectors (192 bytes) at a time. In both folding paths the final 128-bit value is reduced with crc32 instructions and any leftover bytes go through the tail. */ 50 | static ATTRIBUTES u32 51 | ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) 52 | { 53 | uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; 54 | 55 | if (len < 3 * 192) { 56 | static const u64 _aligned_attribute(16) mults[3][2] = { 57 | { CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */ 58 | { CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */ 59 | { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */ 60 | }; 61 | poly64x2_t multipliers_4, multipliers_2, multipliers_1; 62 | 63 | if (len < 64) 64 | goto tail; 65 | multipliers_4 = load_multipliers(mults[0]); 66 | multipliers_2 = load_multipliers(mults[1]); 67 | multipliers_1 = load_multipliers(mults[2]); 68 | /* 69 | * Short length; don't bother aligning the pointer, and fold 70 | * 64 bytes (4 vectors) at a time, at most. 71 | */ 72 | v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc)); 73 | v1 = vld1q_u8(p + 16); 74 | v2 = vld1q_u8(p + 32); 75 | v3 = vld1q_u8(p + 48); 76 | p += 64; 77 | len -= 64; 78 | while (len >= 64) { 79 | v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_4); 80 | v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_4); 81 | v2 = fold_vec(v2, vld1q_u8(p + 32), multipliers_4); 82 | v3 = fold_vec(v3, vld1q_u8(p + 48), multipliers_4); 83 | p += 64; 84 | len -= 64; 85 | } 86 | v0 = fold_vec(v0, v2, multipliers_2); 87 | v1 = fold_vec(v1, v3, multipliers_2); 88 | if (len >= 32) { 89 | v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_2); 90 | v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_2); 91 | p += 32; 92 | len -= 32; 93 | } 94 | v0 = fold_vec(v0, v1, multipliers_1); 95 | } else { 96 | static const u64 _aligned_attribute(16) mults[4][2] = { 97 | { CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */ 98 | { CRC32_X799_MODG, CRC32_X735_MODG }, /* 6 vecs */ 99 | { CRC32_X415_MODG, CRC32_X351_MODG }, /* 3 vecs */ 100 | { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */ 101 | }; 102 | const poly64x2_t multipliers_12 = 
load_multipliers(mults[0]); 103 | const poly64x2_t multipliers_6 = load_multipliers(mults[1]); 104 | const poly64x2_t multipliers_3 = load_multipliers(mults[2]); 105 | const poly64x2_t multipliers_1 = load_multipliers(mults[3]); 106 | const size_t align = -(uintptr_t)p & 15; 107 | const uint8x16_t *vp; 108 | 109 | /* Align p to the next 16-byte boundary. The loads go through the get_unaligned_le*() helpers, as in the tail below, instead of casting the const u8 pointer to a wider integer type. */ 110 | if (align) { 111 | if (align & 1) 112 | crc = __crc32b(crc, *p++); 113 | if (align & 2) { 114 | crc = __crc32h(crc, get_unaligned_le16(p)); 115 | p += 2; 116 | } 117 | if (align & 4) { 118 | crc = __crc32w(crc, get_unaligned_le32(p)); 119 | p += 4; 120 | } 121 | if (align & 8) { 122 | crc = __crc32d(crc, get_unaligned_le64(p)); 123 | p += 8; 124 | } 125 | len -= align; 126 | } 127 | vp = (const uint8x16_t *)p; 128 | v0 = veorq_u8(*vp++, u32_to_bytevec(crc)); 129 | v1 = *vp++; 130 | v2 = *vp++; 131 | v3 = *vp++; 132 | v4 = *vp++; 133 | v5 = *vp++; 134 | v6 = *vp++; 135 | v7 = *vp++; 136 | v8 = *vp++; 137 | v9 = *vp++; 138 | v10 = *vp++; 139 | v11 = *vp++; 140 | len -= 192; 141 | /* Fold 192 bytes (12 vectors) at a time. */ 142 | do { 143 | v0 = fold_vec(v0, *vp++, multipliers_12); 144 | v1 = fold_vec(v1, *vp++, multipliers_12); 145 | v2 = fold_vec(v2, *vp++, multipliers_12); 146 | v3 = fold_vec(v3, *vp++, multipliers_12); 147 | v4 = fold_vec(v4, *vp++, multipliers_12); 148 | v5 = fold_vec(v5, *vp++, multipliers_12); 149 | v6 = fold_vec(v6, *vp++, multipliers_12); 150 | v7 = fold_vec(v7, *vp++, multipliers_12); 151 | v8 = fold_vec(v8, *vp++, multipliers_12); 152 | v9 = fold_vec(v9, *vp++, multipliers_12); 153 | v10 = fold_vec(v10, *vp++, multipliers_12); 154 | v11 = fold_vec(v11, *vp++, multipliers_12); 155 | len -= 192; 156 | } while (len >= 192); 157 | 158 | /* 159 | * Fewer than 192 bytes left. Fold v0-v11 down to just v0, 160 | * while processing up to 144 more bytes. 
161 | */ 162 | v0 = fold_vec(v0, v6, multipliers_6); 163 | v1 = fold_vec(v1, v7, multipliers_6); 164 | v2 = fold_vec(v2, v8, multipliers_6); 165 | v3 = fold_vec(v3, v9, multipliers_6); 166 | v4 = fold_vec(v4, v10, multipliers_6); 167 | v5 = fold_vec(v5, v11, multipliers_6); 168 | if (len >= 96) { 169 | v0 = fold_vec(v0, *vp++, multipliers_6); 170 | v1 = fold_vec(v1, *vp++, multipliers_6); 171 | v2 = fold_vec(v2, *vp++, multipliers_6); 172 | v3 = fold_vec(v3, *vp++, multipliers_6); 173 | v4 = fold_vec(v4, *vp++, multipliers_6); 174 | v5 = fold_vec(v5, *vp++, multipliers_6); 175 | len -= 96; 176 | } 177 | v0 = fold_vec(v0, v3, multipliers_3); 178 | v1 = fold_vec(v1, v4, multipliers_3); 179 | v2 = fold_vec(v2, v5, multipliers_3); 180 | if (len >= 48) { 181 | v0 = fold_vec(v0, *vp++, multipliers_3); 182 | v1 = fold_vec(v1, *vp++, multipliers_3); 183 | v2 = fold_vec(v2, *vp++, multipliers_3); 184 | len -= 48; 185 | } 186 | v0 = fold_vec(v0, v1, multipliers_1); 187 | v0 = fold_vec(v0, v2, multipliers_1); 188 | p = (const u8 *)vp; 189 | } 190 | /* Reduce 128 to 32 bits using crc32 instructions. */ 191 | crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0)); 192 | crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1)); 193 | tail: 194 | /* Finish up the remainder using crc32 instructions. 
*/ 195 | if (len & 32) { 196 | crc = __crc32d(crc, get_unaligned_le64(p + 0)); 197 | crc = __crc32d(crc, get_unaligned_le64(p + 8)); 198 | crc = __crc32d(crc, get_unaligned_le64(p + 16)); 199 | crc = __crc32d(crc, get_unaligned_le64(p + 24)); 200 | p += 32; 201 | } 202 | if (len & 16) { 203 | crc = __crc32d(crc, get_unaligned_le64(p + 0)); 204 | crc = __crc32d(crc, get_unaligned_le64(p + 8)); 205 | p += 16; 206 | } 207 | if (len & 8) { 208 | crc = __crc32d(crc, get_unaligned_le64(p)); 209 | p += 8; 210 | } 211 | if (len & 4) { 212 | crc = __crc32w(crc, get_unaligned_le32(p)); 213 | p += 4; 214 | } 215 | if (len & 2) { 216 | crc = __crc32h(crc, get_unaligned_le16(p)); 217 | p += 2; 218 | } 219 | if (len & 1) 220 | crc = __crc32b(crc, *p); 221 | return crc; 222 | } 223 | 224 | #undef SUFFIX 225 | #undef ATTRIBUTES 226 | #undef ENABLE_EOR3 227 | -------------------------------------------------------------------------------- /lib/arm/matchfinder_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * arm/matchfinder_impl.h - ARM implementations of matchfinder functions 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 
17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef LIB_ARM_MATCHFINDER_IMPL_H 29 | #define LIB_ARM_MATCHFINDER_IMPL_H 30 | 31 | #include "cpu_features.h" 32 | 33 | #if HAVE_NEON_NATIVE /* NEON versions of the matchfinder helpers. matchfinder_init_neon() stores MATCHFINDER_INITVAL into every 16-bit entry of the size-byte array at data, four 128-bit vectors (64 bytes) per iteration. The do/while runs at least once and stops only when size reaches exactly 0, so size is presumably always a nonzero multiple of MATCHFINDER_SIZE_ALIGNMENT (verify against callers). */ 34 | static forceinline void 35 | matchfinder_init_neon(mf_pos_t *data, size_t size) 36 | { 37 | int16x8_t *p = (int16x8_t *)data; 38 | int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL); 39 | 40 | STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); 41 | STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); 42 | STATIC_ASSERT(sizeof(mf_pos_t) == 2); 43 | 44 | do { 45 | p[0] = v; 46 | p[1] = v; 47 | p[2] = v; 48 | p[3] = v; 49 | p += 4; 50 | size -= 4 * sizeof(*p); /* four vectors were written this iteration */ 51 | } while (size != 0); 52 | } 53 | #define matchfinder_init matchfinder_init_neon 54 | /* matchfinder_rebase_neon() adds the 16-bit value of -MATCHFINDER_WINDOW_SIZE to every entry of the size-byte array at data. vqaddq_s16 is a saturating add, so entries clamp at the minimum s16 value instead of wrapping. The same size requirements as matchfinder_init_neon() apply. */ 55 | static forceinline void 56 | matchfinder_rebase_neon(mf_pos_t *data, size_t size) 57 | { 58 | int16x8_t *p = (int16x8_t *)data; 59 | int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE); 60 | 61 | STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); 62 | STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); 63 | STATIC_ASSERT(sizeof(mf_pos_t) == 2); 64 | 65 | do { 66 | p[0] = vqaddq_s16(p[0], v); 67 | p[1] = vqaddq_s16(p[1], v); 68 | p[2] = vqaddq_s16(p[2], v); 69 | p[3] = vqaddq_s16(p[3], v); 70 | p += 4; 71 | size -= 4 * sizeof(*p); /* four vectors were rebased this iteration */ 72 | } while (size != 0); 73 | } 74 | #define matchfinder_rebase matchfinder_rebase_neon 75 | 76 | #endif /* HAVE_NEON_NATIVE */ 77 | 78 | #endif /* 
LIB_ARM_MATCHFINDER_IMPL_H */ 79 | -------------------------------------------------------------------------------- /lib/cpu_features_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * cpu_features_common.h - code shared by all lib/$arch/cpu_features.c 3 | * 4 | * Copyright 2020 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 
26 | */ 27 | 28 | #ifndef LIB_CPU_FEATURES_COMMON_H 29 | #define LIB_CPU_FEATURES_COMMON_H 30 | 31 | #if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) 32 | /* for strdup() and strtok_r() */ 33 | # undef _ANSI_SOURCE 34 | # ifndef __APPLE__ 35 | # undef _GNU_SOURCE 36 | # define _GNU_SOURCE 37 | # endif 38 | # include <stdio.h> 39 | # include <stdlib.h> 40 | # include <string.h> 41 | #endif 42 | 43 | #include "lib_common.h" 44 | /* One entry of a CPU feature table: the feature bit mask and the name used to refer to it. */ 45 | struct cpu_feature { 46 | u32 bit; 47 | const char *name; 48 | }; 49 | 50 | #if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) 51 | /* Disable any features that are listed in $LIBDEFLATE_DISABLE_CPU_FEATURES. The variable is read as a comma-separated list of names; the bit of each matching feature_table entry is cleared in *features. A name that matches no entry is reported on stderr and the process aborts, as it does if strdup() fails. Nothing happens when the variable is unset. */ 52 | static inline void 53 | disable_cpu_features_for_testing(u32 *features, 54 | const struct cpu_feature *feature_table, 55 | size_t feature_table_length) 56 | { 57 | char *env_value, *strbuf, *p, *saveptr = NULL; 58 | size_t i; 59 | 60 | env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES"); 61 | if (!env_value) 62 | return; 63 | strbuf = strdup(env_value); /* strtok_r() modifies its input, so work on a private copy */ 64 | if (!strbuf) 65 | abort(); 66 | p = strtok_r(strbuf, ",", &saveptr); 67 | while (p) { 68 | for (i = 0; i < feature_table_length; i++) { 69 | if (strcmp(p, feature_table[i].name) == 0) { 70 | *features &= ~feature_table[i].bit; 71 | break; 72 | } 73 | } 74 | if (i == feature_table_length) { 75 | fprintf(stderr, 76 | "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n", 77 | p); 78 | abort(); 79 | } 80 | p = strtok_r(NULL, ",", &saveptr); 81 | } 82 | free(strbuf); 83 | } 84 | #else /* TEST_SUPPORT__DO_NOT_USE */ 85 | static inline void 86 | disable_cpu_features_for_testing(u32 *features, 87 | const struct cpu_feature *feature_table, 88 | size_t feature_table_length) 89 | { /* test support is not compiled in: intentionally a no-op */ 90 | } 91 | #endif /* !TEST_SUPPORT__DO_NOT_USE */ 92 | 93 | #endif /* LIB_CPU_FEATURES_COMMON_H */ 94 | -------------------------------------------------------------------------------- /lib/deflate_compress.h: -------------------------------------------------------------------------------- 
1 | #ifndef LIB_DEFLATE_COMPRESS_H 2 | #define LIB_DEFLATE_COMPRESS_H 3 | 4 | #include "lib_common.h" 5 | 6 | /* 7 | * DEFLATE compression is private to deflate_compress.c, but we do need to be 8 | * able to query the compression level for zlib and gzip header generation. 9 | */ 10 | 11 | struct libdeflate_compressor; 12 | 13 | unsigned int libdeflate_get_compression_level(struct libdeflate_compressor *c); 14 | 15 | #endif /* LIB_DEFLATE_COMPRESS_H */ 16 | -------------------------------------------------------------------------------- /lib/deflate_constants.h: -------------------------------------------------------------------------------- 1 | /* 2 | * deflate_constants.h - constants for the DEFLATE compression format 3 | */ 4 | 5 | #ifndef LIB_DEFLATE_CONSTANTS_H 6 | #define LIB_DEFLATE_CONSTANTS_H 7 | 8 | /* Valid block types */ 9 | #define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 10 | #define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 11 | #define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 12 | 13 | /* Minimum and maximum supported match lengths (in bytes) */ 14 | #define DEFLATE_MIN_MATCH_LEN 3 15 | #define DEFLATE_MAX_MATCH_LEN 258 16 | 17 | /* Maximum supported match offset (in bytes) */ 18 | #define DEFLATE_MAX_MATCH_OFFSET 32768 19 | 20 | /* log2 of DEFLATE_MAX_MATCH_OFFSET */ 21 | #define DEFLATE_WINDOW_ORDER 15 22 | 23 | /* Number of symbols in each Huffman code. Note: for the literal/length 24 | * and offset codes, these are actually the maximum values; a given block 25 | * might use fewer symbols. 
*/ 26 | #define DEFLATE_NUM_PRECODE_SYMS 19 27 | #define DEFLATE_NUM_LITLEN_SYMS 288 28 | #define DEFLATE_NUM_OFFSET_SYMS 32 29 | 30 | /* The maximum number of symbols across all codes */ 31 | #define DEFLATE_MAX_NUM_SYMS 288 32 | 33 | /* Division of symbols in the literal/length code */ 34 | #define DEFLATE_NUM_LITERALS 256 35 | #define DEFLATE_END_OF_BLOCK 256 36 | #define DEFLATE_FIRST_LEN_SYM 257 37 | 38 | /* Maximum codeword length, in bits, within each Huffman code */ 39 | #define DEFLATE_MAX_PRE_CODEWORD_LEN 7 40 | #define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 41 | #define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 42 | 43 | /* The maximum codeword length across all codes */ 44 | #define DEFLATE_MAX_CODEWORD_LEN 15 45 | 46 | /* Maximum possible overrun when decoding codeword lengths */ 47 | #define DEFLATE_MAX_LENS_OVERRUN 137 48 | 49 | /* 50 | * Maximum number of extra bits that may be required to represent a match 51 | * length or offset. 52 | */ 53 | #define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 54 | #define DEFLATE_MAX_EXTRA_OFFSET_BITS 13 55 | 56 | #endif /* LIB_DEFLATE_CONSTANTS_H */ 57 | -------------------------------------------------------------------------------- /lib/gzip_compress.c: -------------------------------------------------------------------------------- 1 | /* 2 | * gzip_compress.c - compress with a gzip wrapper 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial 
portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "deflate_compress.h" 29 | #include "gzip_constants.h" 30 | /* Compress in_nbytes bytes at in into a gzip member at out: a 10-byte header with no optional fields, the DEFLATE stream, then the CRC32 and ISIZE footer. Returns the total number of bytes written, or 0 if out_nbytes_avail does not exceed GZIP_MIN_OVERHEAD or libdeflate_deflate_compress() returns 0. */ 31 | LIBDEFLATEAPI size_t 32 | libdeflate_gzip_compress(struct libdeflate_compressor *c, 33 | const void *in, size_t in_nbytes, 34 | void *out, size_t out_nbytes_avail) 35 | { 36 | u8 *out_next = out; 37 | unsigned compression_level; 38 | u8 xfl; 39 | size_t deflate_size; 40 | 41 | if (out_nbytes_avail <= GZIP_MIN_OVERHEAD) 42 | return 0; 43 | 44 | /* ID1 */ 45 | *out_next++ = GZIP_ID1; 46 | /* ID2 */ 47 | *out_next++ = GZIP_ID2; 48 | /* CM */ 49 | *out_next++ = GZIP_CM_DEFLATE; 50 | /* FLG */ 51 | *out_next++ = 0; 52 | /* MTIME */ 53 | put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next); 54 | out_next += 4; 55 | /* XFL: levels below 2 are flagged as fastest, levels 8 and above as slowest */ 56 | xfl = 0; 57 | compression_level = libdeflate_get_compression_level(c); 58 | if (compression_level < 2) 59 | xfl |= GZIP_XFL_FASTEST_COMPRESSION; 60 | else if (compression_level >= 8) 61 | xfl |= GZIP_XFL_SLOWEST_COMPRESSION; 62 | *out_next++ = xfl; 63 | /* OS */ 64 | *out_next++ = GZIP_OS_UNKNOWN; /* no specific OS is recorded */ 65 | 66 | /* Compressed data; the space for header and footer is held back from the DEFLATE output budget */ 67 | deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, 68 | out_nbytes_avail - GZIP_MIN_OVERHEAD); 69 | if (deflate_size == 0) 70 | return 0; 71 | out_next += deflate_size; 72 | 73 | /* CRC32 */ 74 | put_unaligned_le32(libdeflate_crc32(0, in, in_nbytes), out_next); 75 | out_next += 4; 76 | 77 | /* ISIZE: the input size truncated to 32 bits */ 78 | put_unaligned_le32((u32)in_nbytes, out_next); 79 | 
out_next += 4; 80 | 81 | return out_next - (u8 *)out; 82 | } 83 | /* Return the DEFLATE bound for in_nbytes bytes of input plus GZIP_MIN_OVERHEAD bytes of header and footer. */ 84 | LIBDEFLATEAPI size_t 85 | libdeflate_gzip_compress_bound(struct libdeflate_compressor *c, 86 | size_t in_nbytes) 87 | { 88 | return GZIP_MIN_OVERHEAD + 89 | libdeflate_deflate_compress_bound(c, in_nbytes); 90 | } 91 | -------------------------------------------------------------------------------- /lib/gzip_constants.h: -------------------------------------------------------------------------------- 1 | /* 2 | * gzip_constants.h - constants for the gzip wrapper format 3 | */ 4 | 5 | #ifndef LIB_GZIP_CONSTANTS_H 6 | #define LIB_GZIP_CONSTANTS_H 7 | /* Sizes in bytes of the fixed header and of the footer */ 8 | #define GZIP_MIN_HEADER_SIZE 10 9 | #define GZIP_FOOTER_SIZE 8 10 | #define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) 11 | /* Magic bytes ID1 and ID2 */ 12 | #define GZIP_ID1 0x1F 13 | #define GZIP_ID2 0x8B 14 | /* CM (compression method) value */ 15 | #define GZIP_CM_DEFLATE 8 16 | /* Bits of the FLG byte */ 17 | #define GZIP_FTEXT 0x01 18 | #define GZIP_FHCRC 0x02 19 | #define GZIP_FEXTRA 0x04 20 | #define GZIP_FNAME 0x08 21 | #define GZIP_FCOMMENT 0x10 22 | #define GZIP_FRESERVED 0xE0 23 | 24 | #define GZIP_MTIME_UNAVAILABLE 0 25 | /* Values for the XFL byte */ 26 | #define GZIP_XFL_SLOWEST_COMPRESSION 0x02 27 | #define GZIP_XFL_FASTEST_COMPRESSION 0x04 28 | /* Values for the OS byte */ 29 | #define GZIP_OS_FAT 0 30 | #define GZIP_OS_AMIGA 1 31 | #define GZIP_OS_VMS 2 32 | #define GZIP_OS_UNIX 3 33 | #define GZIP_OS_VM_CMS 4 34 | #define GZIP_OS_ATARI_TOS 5 35 | #define GZIP_OS_HPFS 6 36 | #define GZIP_OS_MACINTOSH 7 37 | #define GZIP_OS_Z_SYSTEM 8 38 | #define GZIP_OS_CP_M 9 39 | #define GZIP_OS_TOPS_20 10 40 | #define GZIP_OS_NTFS 11 41 | #define GZIP_OS_QDOS 12 42 | #define GZIP_OS_RISCOS 13 43 | #define GZIP_OS_UNKNOWN 255 44 | 45 | #endif /* LIB_GZIP_CONSTANTS_H */ 46 | -------------------------------------------------------------------------------- /lib/gzip_decompress.c: -------------------------------------------------------------------------------- 1 | /* 2 | * gzip_decompress.c - decompress with a gzip wrapper 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * 
Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 
26 | */ 27 | 28 | #include "lib_common.h" 29 | #include "gzip_constants.h" 30 | /* Decompress one gzip member of at most in_nbytes bytes at in into out. The header is validated and its optional fields are skipped, the DEFLATE stream is decompressed, and the CRC32 and ISIZE footer fields are checked against the produced output. Returns LIBDEFLATE_BAD_DATA for a malformed header or a footer mismatch, otherwise the result of libdeflate_deflate_decompress_ex(). On success *actual_in_nbytes_ret, if non-NULL, receives the number of input bytes consumed including header and footer. */ 31 | LIBDEFLATEAPI enum libdeflate_result 32 | libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, 33 | const void *in, size_t in_nbytes, 34 | void *out, size_t out_nbytes_avail, 35 | size_t *actual_in_nbytes_ret, 36 | size_t *actual_out_nbytes_ret) 37 | { 38 | const u8 *in_next = in; 39 | const u8 * const in_end = in_next + in_nbytes; 40 | u8 flg; 41 | size_t actual_in_nbytes; 42 | size_t actual_out_nbytes; 43 | enum libdeflate_result result; 44 | 45 | if (in_nbytes < GZIP_MIN_OVERHEAD) 46 | return LIBDEFLATE_BAD_DATA; 47 | 48 | /* ID1 */ 49 | if (*in_next++ != GZIP_ID1) 50 | return LIBDEFLATE_BAD_DATA; 51 | /* ID2 */ 52 | if (*in_next++ != GZIP_ID2) 53 | return LIBDEFLATE_BAD_DATA; 54 | /* CM */ 55 | if (*in_next++ != GZIP_CM_DEFLATE) 56 | return LIBDEFLATE_BAD_DATA; /* FLG */ 57 | flg = *in_next++; 58 | /* MTIME */ 59 | in_next += 4; 60 | /* XFL */ 61 | in_next += 1; 62 | /* OS */ 63 | in_next += 1; 64 | 65 | if (flg & GZIP_FRESERVED) 66 | return LIBDEFLATE_BAD_DATA; 67 | /* Each optional field below is skipped only after checking that at least GZIP_FOOTER_SIZE bytes remain after it. */ 68 | /* Extra field */ 69 | if (flg & GZIP_FEXTRA) { 70 | u16 xlen = get_unaligned_le16(in_next); 71 | in_next += 2; 72 | 73 | if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE) 74 | return LIBDEFLATE_BAD_DATA; 75 | 76 | in_next += xlen; 77 | } 78 | 79 | /* Original file name (zero terminated) */ 80 | if (flg & GZIP_FNAME) { 81 | while (*in_next++ != 0 && in_next != in_end) 82 | ; 83 | if (in_end - in_next < GZIP_FOOTER_SIZE) 84 | return LIBDEFLATE_BAD_DATA; 85 | } 86 | 87 | /* File comment (zero terminated) */ 88 | if (flg & GZIP_FCOMMENT) { 89 | while (*in_next++ != 0 && in_next != in_end) 90 | ; 91 | if (in_end - in_next < GZIP_FOOTER_SIZE) 92 | return LIBDEFLATE_BAD_DATA; 93 | } 94 | 95 | /* CRC16 for gzip header (skipped, not verified) */ 96 | if (flg & GZIP_FHCRC) { 97 | in_next += 2; 98 | if (in_end - in_next < GZIP_FOOTER_SIZE) 99 | return LIBDEFLATE_BAD_DATA; 100 | } 101 | 102 | /* Compressed data; the last GZIP_FOOTER_SIZE input bytes are withheld from the DEFLATE decoder */ 103 | result = 
libdeflate_deflate_decompress_ex(d, in_next, 104 | in_end - GZIP_FOOTER_SIZE - in_next, 105 | out, out_nbytes_avail, 106 | &actual_in_nbytes, 107 | actual_out_nbytes_ret); 108 | if (result != LIBDEFLATE_SUCCESS) 109 | return result; 110 | /* With a NULL actual_out_nbytes_ret the output length is taken to be exactly out_nbytes_avail. */ 111 | if (actual_out_nbytes_ret) 112 | actual_out_nbytes = *actual_out_nbytes_ret; 113 | else 114 | actual_out_nbytes = out_nbytes_avail; 115 | 116 | in_next += actual_in_nbytes; 117 | 118 | /* CRC32 */ 119 | if (libdeflate_crc32(0, out, actual_out_nbytes) != 120 | get_unaligned_le32(in_next)) 121 | return LIBDEFLATE_BAD_DATA; 122 | in_next += 4; 123 | 124 | /* ISIZE: compared against the output length truncated to 32 bits */ 125 | if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) 126 | return LIBDEFLATE_BAD_DATA; 127 | in_next += 4; 128 | 129 | if (actual_in_nbytes_ret) 130 | *actual_in_nbytes_ret = in_next - (u8 *)in; 131 | 132 | return LIBDEFLATE_SUCCESS; 133 | } 134 | /* Same as libdeflate_gzip_decompress_ex() but without reporting the number of input bytes consumed. */ 135 | LIBDEFLATEAPI enum libdeflate_result 136 | libdeflate_gzip_decompress(struct libdeflate_decompressor *d, 137 | const void *in, size_t in_nbytes, 138 | void *out, size_t out_nbytes_avail, 139 | size_t *actual_out_nbytes_ret) 140 | { 141 | return libdeflate_gzip_decompress_ex(d, in, in_nbytes, 142 | out, out_nbytes_avail, 143 | NULL, actual_out_nbytes_ret); 144 | } 145 | -------------------------------------------------------------------------------- /lib/ht_matchfinder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ht_matchfinder.h - Lempel-Ziv matchfinding with a hash table 3 | * 4 | * Copyright 2022 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, 
subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | * 27 | * --------------------------------------------------------------------------- 28 | * 29 | * This is a Hash Table (ht) matchfinder. 30 | * 31 | * This is a variant of the Hash Chains (hc) matchfinder that is optimized for 32 | * very fast compression. The ht_matchfinder stores the hash chains inline in 33 | * the hash table, whereas the hc_matchfinder stores them in a separate array. 34 | * Storing the hash chains inline is the faster method when max_search_depth 35 | * (the maximum chain length) is very small. It is not appropriate when 36 | * max_search_depth is larger, as then it uses too much memory. 37 | * 38 | * Due to its focus on speed, the ht_matchfinder doesn't support length 3 39 | * matches. It also doesn't allow max_search_depth to vary at runtime; it is 40 | * fixed at build time as HT_MATCHFINDER_BUCKET_SIZE. 41 | * 42 | * See hc_matchfinder.h for more information. 
43 | */ 44 | 45 | #ifndef LIB_HT_MATCHFINDER_H 46 | #define LIB_HT_MATCHFINDER_H 47 | 48 | #include "matchfinder_common.h" 49 | 50 | #define HT_MATCHFINDER_HASH_ORDER 15 51 | #define HT_MATCHFINDER_BUCKET_SIZE 2 52 | 53 | #define HT_MATCHFINDER_MIN_MATCH_LEN 4 54 | /* Minimum value of max_len for ht_matchfinder_longest_match() */ 55 | #define HT_MATCHFINDER_REQUIRED_NBYTES 5 56 | /* Hash table of 2^HT_MATCHFINDER_HASH_ORDER buckets holding HT_MATCHFINDER_BUCKET_SIZE positions each. Insertion writes the newest position to slot 0 and moves older ones to higher slots. */ 57 | struct MATCHFINDER_ALIGNED ht_matchfinder { 58 | mf_pos_t hash_tab[1UL << HT_MATCHFINDER_HASH_ORDER] 59 | [HT_MATCHFINDER_BUCKET_SIZE]; 60 | }; 61 | /* Initialize the whole table by passing it to matchfinder_init(). */ 62 | static forceinline void 63 | ht_matchfinder_init(struct ht_matchfinder *mf) 64 | { 65 | STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); 66 | 67 | matchfinder_init((mf_pos_t *)mf, sizeof(*mf)); 68 | } 69 | /* Rebase every stored position by passing the whole table to matchfinder_rebase(). The callers below do this when the current position reaches MATCHFINDER_WINDOW_SIZE and then advance the base pointer by the same amount. */ 70 | static forceinline void 71 | ht_matchfinder_slide_window(struct ht_matchfinder *mf) 72 | { 73 | matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); 74 | } 75 | /* Insert the position of in_next into the bucket selected by *next_hash and search the entries previously in that bucket for a match of at least 4 bytes. Returns the length of the best match found, or 0 if none, and stores its distance in *offset_ret (0 when no match was found). On return *next_hash holds the hash of the 4 bytes at in_next + 1, and *in_base_p has been advanced if the window was slid. Presumably *next_hash must hold the hash for in_next on entry (verify against callers). */ 76 | /* Note: max_len must be >= HT_MATCHFINDER_REQUIRED_NBYTES */ 77 | static forceinline u32 78 | ht_matchfinder_longest_match(struct ht_matchfinder * const mf, 79 | const u8 ** const in_base_p, 80 | const u8 * const in_next, 81 | const u32 max_len, 82 | const u32 nice_len, 83 | u32 * const next_hash, 84 | u32 * const offset_ret) 85 | { 86 | u32 best_len = 0; 87 | const u8 *best_matchptr = in_next; 88 | u32 cur_pos = in_next - *in_base_p; 89 | const u8 *in_base; 90 | mf_pos_t cutoff; 91 | u32 hash; 92 | u32 seq; 93 | mf_pos_t cur_node; 94 | const u8 *matchptr; 95 | #if HT_MATCHFINDER_BUCKET_SIZE > 1 96 | mf_pos_t to_insert; 97 | u32 len; 98 | #endif 99 | #if HT_MATCHFINDER_BUCKET_SIZE > 2 100 | int i; 101 | #endif 102 | 103 | /* This is assumed throughout this function. 
*/ 104 | STATIC_ASSERT(HT_MATCHFINDER_MIN_MATCH_LEN == 4); 105 | /* Slide the window once the current position reaches the window size. */ 106 | if (cur_pos == MATCHFINDER_WINDOW_SIZE) { 107 | ht_matchfinder_slide_window(mf); 108 | *in_base_p += MATCHFINDER_WINDOW_SIZE; 109 | cur_pos = 0; 110 | } 111 | in_base = *in_base_p; 112 | cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; 113 | /* Use the hash supplied for this position and compute the one for the next position; that reads 4 bytes starting at in_next + 1, hence the 5 required bytes. */ 114 | hash = *next_hash; 115 | STATIC_ASSERT(HT_MATCHFINDER_REQUIRED_NBYTES == 5); 116 | *next_hash = lz_hash(get_unaligned_le32(in_next + 1), 117 | HT_MATCHFINDER_HASH_ORDER); 118 | seq = load_u32_unaligned(in_next); 119 | prefetchw(&mf->hash_tab[*next_hash]); 120 | #if HT_MATCHFINDER_BUCKET_SIZE == 1 121 | /* Hand-unrolled version for BUCKET_SIZE == 1 */ 122 | cur_node = mf->hash_tab[hash][0]; 123 | mf->hash_tab[hash][0] = cur_pos; 124 | if (cur_node <= cutoff) 125 | goto out; 126 | matchptr = &in_base[cur_node]; 127 | if (load_u32_unaligned(matchptr) == seq) { 128 | best_len = lz_extend(in_next, matchptr, 4, max_len); 129 | best_matchptr = matchptr; 130 | } 131 | #elif HT_MATCHFINDER_BUCKET_SIZE == 2 132 | /* 133 | * Hand-unrolled version for BUCKET_SIZE == 2. The logic here also 134 | * differs slightly in that it copies the first entry to the second even 135 | * if nice_len is reached on the first, as this can be slightly faster. 
136 | */ 137 | cur_node = mf->hash_tab[hash][0]; 138 | mf->hash_tab[hash][0] = cur_pos; 139 | if (cur_node <= cutoff) 140 | goto out; 141 | matchptr = &in_base[cur_node]; 142 | /* Move the old first entry into the second slot; cur_node becomes the old second entry. */ 143 | to_insert = cur_node; 144 | cur_node = mf->hash_tab[hash][1]; 145 | mf->hash_tab[hash][1] = to_insert; 146 | 147 | if (load_u32_unaligned(matchptr) == seq) { 148 | best_len = lz_extend(in_next, matchptr, 4, max_len); 149 | best_matchptr = matchptr; 150 | if (cur_node <= cutoff || best_len >= nice_len) 151 | goto out; 152 | matchptr = &in_base[cur_node]; /* The second candidate is extended only if it also agrees on the 4 bytes at offset best_len - 3, which include the byte just past the current best match. */ 153 | if (load_u32_unaligned(matchptr) == seq && 154 | load_u32_unaligned(matchptr + best_len - 3) == 155 | load_u32_unaligned(in_next + best_len - 3)) { 156 | len = lz_extend(in_next, matchptr, 4, max_len); 157 | if (len > best_len) { 158 | best_len = len; 159 | best_matchptr = matchptr; 160 | } 161 | } 162 | } else { 163 | if (cur_node <= cutoff) 164 | goto out; 165 | matchptr = &in_base[cur_node]; 166 | if (load_u32_unaligned(matchptr) == seq) { 167 | best_len = lz_extend(in_next, matchptr, 4, max_len); 168 | best_matchptr = matchptr; 169 | } 170 | } 171 | #else 172 | /* Generic version for HT_MATCHFINDER_BUCKET_SIZE > 2 */ 173 | to_insert = cur_pos; 174 | for (i = 0; i < HT_MATCHFINDER_BUCKET_SIZE; i++) { 175 | cur_node = mf->hash_tab[hash][i]; 176 | mf->hash_tab[hash][i] = to_insert; 177 | if (cur_node <= cutoff) 178 | goto out; 179 | matchptr = &in_base[cur_node]; 180 | if (load_u32_unaligned(matchptr) == seq) { 181 | len = lz_extend(in_next, matchptr, 4, max_len); 182 | if (len > best_len) { 183 | best_len = len; 184 | best_matchptr = matchptr; 185 | if (best_len >= nice_len) 186 | goto out; 187 | } 188 | } 189 | to_insert = cur_node; 190 | } 191 | #endif 192 | out: 193 | *offset_ret = in_next - best_matchptr; 194 | return best_len; 195 | } 196 | /* Insert count consecutive positions starting at in_next into the hash table without searching for matches, leaving the hash for the position after the last one in *next_hash. Returns without doing anything when count + HT_MATCHFINDER_REQUIRED_NBYTES exceeds the bytes remaining before in_end. NOTE(review): the do/while below assumes count >= 1. */ 197 | static forceinline void 198 | ht_matchfinder_skip_bytes(struct ht_matchfinder * const mf, 199 | const u8 ** const in_base_p, 200 | const u8 *in_next, 201 | const u8 * const in_end, 
202 | const u32 count, 203 | u32 * const next_hash) 204 | { 205 | s32 cur_pos = in_next - *in_base_p; 206 | u32 hash; 207 | u32 remaining = count; 208 | int i; 209 | 210 | if (unlikely(count + HT_MATCHFINDER_REQUIRED_NBYTES > in_end - in_next)) 211 | return; 212 | /* Slide the window up front if any inserted position would reach the window size; cur_pos is signed because it can become negative here. */ 213 | if (cur_pos + count - 1 >= MATCHFINDER_WINDOW_SIZE) { 214 | ht_matchfinder_slide_window(mf); 215 | *in_base_p += MATCHFINDER_WINDOW_SIZE; 216 | cur_pos -= MATCHFINDER_WINDOW_SIZE; 217 | } 218 | 219 | hash = *next_hash; 220 | do { /* shift the bucket entries up by one slot, then store the new position in slot 0 */ 221 | for (i = HT_MATCHFINDER_BUCKET_SIZE - 1; i > 0; i--) 222 | mf->hash_tab[hash][i] = mf->hash_tab[hash][i - 1]; 223 | mf->hash_tab[hash][0] = cur_pos; 224 | 225 | hash = lz_hash(get_unaligned_le32(++in_next), 226 | HT_MATCHFINDER_HASH_ORDER); 227 | cur_pos++; 228 | } while (--remaining); 229 | 230 | prefetchw(&mf->hash_tab[hash]); 231 | *next_hash = hash; 232 | } 233 | 234 | #endif /* LIB_HT_MATCHFINDER_H */ 235 | -------------------------------------------------------------------------------- /lib/lib_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * lib_common.h - internal header included by all library code 3 | */ 4 | 5 | #ifndef LIB_LIB_COMMON_H 6 | #define LIB_LIB_COMMON_H 7 | 8 | #ifdef LIBDEFLATE_H 9 | /* 10 | * When building the library, LIBDEFLATEAPI needs to be defined properly before 11 | * including libdeflate.h. 12 | */ 13 | # error "lib_common.h must always be included before libdeflate.h" 14 | #endif 15 | 16 | #if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) 17 | # define LIBDEFLATE_EXPORT_SYM __declspec(dllexport) 18 | #elif defined(__GNUC__) 19 | # define LIBDEFLATE_EXPORT_SYM __attribute__((visibility("default"))) 20 | #else 21 | # define LIBDEFLATE_EXPORT_SYM 22 | #endif 23 | 24 | /* 25 | * On i386, gcc assumes that the stack is 16-byte aligned at function entry. 26 | * However, some compilers (e.g. MSVC) and programming languages (e.g. 
Delphi) 27 | * only guarantee 4-byte alignment when calling functions. This is mainly an 28 | * issue on Windows, but it has been seen on Linux too. Work around this ABI 29 | * incompatibility by realigning the stack pointer when entering libdeflate. 30 | * This prevents crashes in SSE/AVX code. 31 | */ 32 | #if defined(__GNUC__) && defined(__i386__) 33 | # define LIBDEFLATE_ALIGN_STACK __attribute__((force_align_arg_pointer)) 34 | #else 35 | # define LIBDEFLATE_ALIGN_STACK 36 | #endif 37 | 38 | #define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK 39 | 40 | #include "../common_defs.h" 41 | 42 | typedef void *(*malloc_func_t)(size_t); 43 | typedef void (*free_func_t)(void *); 44 | 45 | extern malloc_func_t libdeflate_default_malloc_func; 46 | extern free_func_t libdeflate_default_free_func; 47 | 48 | void *libdeflate_aligned_malloc(malloc_func_t malloc_func, 49 | size_t alignment, size_t size); 50 | void libdeflate_aligned_free(free_func_t free_func, void *ptr); 51 | 52 | #ifdef FREESTANDING 53 | /* 54 | * With -ffreestanding, may be missing, and we must provide 55 | * implementations of memset(), memcpy(), memmove(), and memcmp(). 56 | * See https://gcc.gnu.org/onlinedocs/gcc/Standards.html 57 | * 58 | * Also, -ffreestanding disables interpreting calls to these functions as 59 | * built-ins. E.g., calling memcpy(&v, p, WORDBYTES) will make a function call, 60 | * not be optimized to a single load instruction. For performance reasons we 61 | * don't want that. So, declare these functions as macros that expand to the 62 | * corresponding built-ins. This approach is recommended in the gcc man page. 63 | * We still need the actual function definitions in case gcc calls them. 
64 | */ 65 | void *memset(void *s, int c, size_t n); 66 | #define memset(s, c, n) __builtin_memset((s), (c), (n)) 67 | 68 | void *memcpy(void *dest, const void *src, size_t n); 69 | #define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) 70 | 71 | void *memmove(void *dest, const void *src, size_t n); 72 | #define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) 73 | 74 | int memcmp(const void *s1, const void *s2, size_t n); 75 | #define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n)) 76 | 77 | #undef LIBDEFLATE_ENABLE_ASSERTIONS 78 | #else 79 | # include 80 | /* 81 | * To prevent false positive static analyzer warnings, ensure that assertions 82 | * are visible to the static analyzer. 83 | */ 84 | # ifdef __clang_analyzer__ 85 | # define LIBDEFLATE_ENABLE_ASSERTIONS 86 | # endif 87 | #endif 88 | 89 | /* 90 | * Runtime assertion support. Don't enable this in production builds; it may 91 | * hurt performance significantly. 92 | */ 93 | #ifdef LIBDEFLATE_ENABLE_ASSERTIONS 94 | NORETURN void 95 | libdeflate_assertion_failed(const char *expr, const char *file, int line); 96 | #define ASSERT(expr) { if (unlikely(!(expr))) \ 97 | libdeflate_assertion_failed(#expr, __FILE__, __LINE__); } 98 | #else 99 | #define ASSERT(expr) (void)(expr) 100 | #endif 101 | 102 | #define CONCAT_IMPL(a, b) a##b 103 | #define CONCAT(a, b) CONCAT_IMPL(a, b) 104 | #define ADD_SUFFIX(name) CONCAT(name, SUFFIX) 105 | 106 | #endif /* LIB_LIB_COMMON_H */ 107 | -------------------------------------------------------------------------------- /lib/matchfinder_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * matchfinder_common.h - common code for Lempel-Ziv matchfinding 3 | */ 4 | 5 | #ifndef LIB_MATCHFINDER_COMMON_H 6 | #define LIB_MATCHFINDER_COMMON_H 7 | 8 | #include "lib_common.h" 9 | 10 | #ifndef MATCHFINDER_WINDOW_ORDER 11 | # error "MATCHFINDER_WINDOW_ORDER must be defined!" 
12 | #endif 13 | 14 | /* 15 | * Given a 32-bit value that was loaded with the platform's native endianness, 16 | * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 17 | * bits contain the first 3 bytes, arranged in octets in a platform-dependent 18 | * order, at the memory location from which the input 32-bit value was loaded. 19 | */ 20 | static forceinline u32 21 | loaded_u32_to_u24(u32 v) 22 | { 23 | if (CPU_IS_LITTLE_ENDIAN()) 24 | return v & 0xFFFFFF; 25 | else 26 | return v >> 8; 27 | } 28 | 29 | /* 30 | * Load the next 3 bytes from @p into the 24 low-order bits of a 32-bit value. 31 | * The order in which the 3 bytes will be arranged as octets in the 24 bits is 32 | * platform-dependent. At least 4 bytes (not 3) must be available at @p. 33 | */ 34 | static forceinline u32 35 | load_u24_unaligned(const u8 *p) 36 | { 37 | #if UNALIGNED_ACCESS_IS_FAST 38 | return loaded_u32_to_u24(load_u32_unaligned(p)); 39 | #else 40 | if (CPU_IS_LITTLE_ENDIAN()) 41 | return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); 42 | else 43 | return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); 44 | #endif 45 | } 46 | 47 | #define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER) 48 | 49 | typedef s16 mf_pos_t; 50 | 51 | #define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE) 52 | 53 | /* 54 | * This is the memory address alignment, in bytes, required for the matchfinder 55 | * buffers by the architecture-specific implementations of matchfinder_init() 56 | * and matchfinder_rebase(). "Matchfinder buffer" means an entire struct 57 | * hc_matchfinder, bt_matchfinder, or ht_matchfinder; the next_tab field of 58 | * struct hc_matchfinder; or the child_tab field of struct bt_matchfinder. 59 | * 60 | * This affects how the entire 'struct deflate_compressor' is allocated, since 61 | * the matchfinder structures are embedded inside it. 
62 | * 63 | * Currently the maximum memory address alignment required is 32 bytes, needed 64 | * by the AVX-2 matchfinder functions. 65 | */ 66 | #define MATCHFINDER_MEM_ALIGNMENT 32 67 | 68 | /* 69 | * This declares a size, in bytes, that is guaranteed to divide the sizes of the 70 | * matchfinder buffers (where "matchfinder buffers" is as defined for 71 | * MATCHFINDER_MEM_ALIGNMENT). The architecture-specific implementations of 72 | * matchfinder_init() and matchfinder_rebase() take advantage of this value. 73 | * 74 | * Currently the maximum size alignment required is 128 bytes, needed by 75 | * the AVX-2 matchfinder functions. However, the RISC-V Vector Extension 76 | * matchfinder functions can, in principle, take advantage of a larger size 77 | * alignment. Therefore, we set this to 1024, which still easily divides the 78 | * actual sizes that result from the current matchfinder struct definitions. 79 | * This value can safely be changed to any power of two that is >= 128. 80 | */ 81 | #define MATCHFINDER_SIZE_ALIGNMENT 1024 82 | 83 | #undef matchfinder_init 84 | #undef matchfinder_rebase 85 | #ifdef _aligned_attribute 86 | # define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) 87 | # if defined(ARCH_ARM32) || defined(ARCH_ARM64) 88 | # include "arm/matchfinder_impl.h" 89 | # elif defined(ARCH_RISCV) 90 | # include "riscv/matchfinder_impl.h" 91 | # elif defined(ARCH_X86_32) || defined(ARCH_X86_64) 92 | # include "x86/matchfinder_impl.h" 93 | # endif 94 | #else 95 | # define MATCHFINDER_ALIGNED 96 | #endif 97 | 98 | /* 99 | * Initialize the hash table portion of the matchfinder. 100 | * 101 | * Essentially, this is an optimized memset(). 102 | * 103 | * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and 104 | * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. 
105 | */ 106 | #ifndef matchfinder_init 107 | static forceinline void 108 | matchfinder_init(mf_pos_t *data, size_t size) 109 | { 110 | size_t num_entries = size / sizeof(*data); 111 | size_t i; 112 | 113 | for (i = 0; i < num_entries; i++) 114 | data[i] = MATCHFINDER_INITVAL; 115 | } 116 | #endif 117 | 118 | /* 119 | * Slide the matchfinder by MATCHFINDER_WINDOW_SIZE bytes. 120 | * 121 | * This must be called just after each MATCHFINDER_WINDOW_SIZE bytes have been 122 | * run through the matchfinder. 123 | * 124 | * This subtracts MATCHFINDER_WINDOW_SIZE bytes from each entry in the given 125 | * array, making the entries be relative to the current position rather than the 126 | * position MATCHFINDER_WINDOW_SIZE bytes prior. To avoid integer underflows, 127 | * entries that would become less than -MATCHFINDER_WINDOW_SIZE stay at 128 | * -MATCHFINDER_WINDOW_SIZE, keeping them permanently out of bounds. 129 | * 130 | * The given array must contain all matchfinder data that is position-relative: 131 | * the hash table(s) as well as any hash chain or binary tree links. Its 132 | * address must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and its size 133 | * must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. 134 | */ 135 | #ifndef matchfinder_rebase 136 | static forceinline void 137 | matchfinder_rebase(mf_pos_t *data, size_t size) 138 | { 139 | size_t num_entries = size / sizeof(*data); 140 | size_t i; 141 | 142 | if (MATCHFINDER_WINDOW_SIZE == 32768) { 143 | /* 144 | * Branchless version for 32768-byte windows. Clear all bits if 145 | * the value was already negative, then set the sign bit. This 146 | * is equivalent to subtracting 32768 with signed saturation. 
147 | */ 148 | for (i = 0; i < num_entries; i++) 149 | data[i] = 0x8000 | (data[i] & ~(data[i] >> 15)); 150 | } else { 151 | for (i = 0; i < num_entries; i++) { 152 | if (data[i] >= 0) 153 | data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; 154 | else 155 | data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; 156 | } 157 | } 158 | } 159 | #endif 160 | 161 | /* 162 | * The hash function: given a sequence prefix held in the low-order bits of a 163 | * 32-bit value, multiply by a carefully-chosen large constant. Discard any 164 | * bits of the product that don't fit in a 32-bit value, but take the 165 | * next-highest @num_bits bits of the product as the hash value, as those have 166 | * the most randomness. 167 | */ 168 | static forceinline u32 169 | lz_hash(u32 seq, unsigned num_bits) 170 | { 171 | return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); 172 | } 173 | 174 | /* 175 | * Return the number of bytes at @matchptr that match the bytes at @strptr, up 176 | * to a maximum of @max_len. Initially, @start_len bytes are matched. 
177 | */ 178 | static forceinline u32 179 | lz_extend(const u8 * const strptr, const u8 * const matchptr, 180 | const u32 start_len, const u32 max_len) 181 | { 182 | u32 len = start_len; 183 | machine_word_t v_word; 184 | 185 | if (UNALIGNED_ACCESS_IS_FAST) { 186 | 187 | if (likely(max_len - len >= 4 * WORDBYTES)) { 188 | 189 | #define COMPARE_WORD_STEP \ 190 | v_word = load_word_unaligned(&matchptr[len]) ^ \ 191 | load_word_unaligned(&strptr[len]); \ 192 | if (v_word != 0) \ 193 | goto word_differs; \ 194 | len += WORDBYTES; \ 195 | 196 | COMPARE_WORD_STEP 197 | COMPARE_WORD_STEP 198 | COMPARE_WORD_STEP 199 | COMPARE_WORD_STEP 200 | #undef COMPARE_WORD_STEP 201 | } 202 | 203 | while (len + WORDBYTES <= max_len) { 204 | v_word = load_word_unaligned(&matchptr[len]) ^ 205 | load_word_unaligned(&strptr[len]); 206 | if (v_word != 0) 207 | goto word_differs; 208 | len += WORDBYTES; 209 | } 210 | } 211 | 212 | while (len < max_len && matchptr[len] == strptr[len]) 213 | len++; 214 | return len; 215 | 216 | word_differs: 217 | if (CPU_IS_LITTLE_ENDIAN()) 218 | len += (bsfw(v_word) >> 3); 219 | else 220 | len += (WORDBITS - 1 - bsrw(v_word)) >> 3; 221 | return len; 222 | } 223 | 224 | #endif /* LIB_MATCHFINDER_COMMON_H */ 225 | -------------------------------------------------------------------------------- /lib/riscv/matchfinder_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * riscv/matchfinder_impl.h - RISC-V implementations of matchfinder functions 3 | * 4 | * Copyright 2024 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to 
do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef LIB_RISCV_MATCHFINDER_IMPL_H 29 | #define LIB_RISCV_MATCHFINDER_IMPL_H 30 | 31 | #if defined(ARCH_RISCV) && defined(__riscv_vector) 32 | #include 33 | 34 | /* 35 | * Return the maximum number of 16-bit (mf_pos_t) elements that fit in 8 RISC-V 36 | * vector registers and also evenly divide the sizes of the matchfinder buffers. 37 | */ 38 | static forceinline size_t 39 | riscv_matchfinder_vl(void) 40 | { 41 | const size_t vl = __riscv_vsetvlmax_e16m8(); 42 | 43 | STATIC_ASSERT(sizeof(mf_pos_t) == sizeof(s16)); 44 | /* 45 | * MATCHFINDER_SIZE_ALIGNMENT is a power of 2, as is 'vl' because the 46 | * RISC-V Vector Extension requires that the vector register length 47 | * (VLEN) be a power of 2. Thus, a simple MIN() gives the correct 48 | * answer here; rounding to a power of 2 is not required. 
49 | */ 50 | STATIC_ASSERT((MATCHFINDER_SIZE_ALIGNMENT & 51 | (MATCHFINDER_SIZE_ALIGNMENT - 1)) == 0); 52 | ASSERT((vl & (vl - 1)) == 0); 53 | return MIN(vl, MATCHFINDER_SIZE_ALIGNMENT / sizeof(mf_pos_t)); 54 | } 55 | 56 | /* matchfinder_init() optimized using the RISC-V Vector Extension */ 57 | static forceinline void 58 | matchfinder_init_rvv(mf_pos_t *p, size_t size) 59 | { 60 | const size_t vl = riscv_matchfinder_vl(); 61 | const vint16m8_t v = __riscv_vmv_v_x_i16m8(MATCHFINDER_INITVAL, vl); 62 | 63 | ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0); 64 | do { 65 | __riscv_vse16_v_i16m8(p, v, vl); 66 | p += vl; 67 | size -= vl * sizeof(p[0]); 68 | } while (size != 0); 69 | } 70 | #define matchfinder_init matchfinder_init_rvv 71 | 72 | /* matchfinder_rebase() optimized using the RISC-V Vector Extension */ 73 | static forceinline void 74 | matchfinder_rebase_rvv(mf_pos_t *p, size_t size) 75 | { 76 | const size_t vl = riscv_matchfinder_vl(); 77 | 78 | ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0); 79 | do { 80 | vint16m8_t v = __riscv_vle16_v_i16m8(p, vl); 81 | 82 | /* 83 | * This should generate the vsadd.vx instruction 84 | * (Vector Saturating Add, integer vector-scalar) 85 | */ 86 | v = __riscv_vsadd_vx_i16m8(v, (s16)-MATCHFINDER_WINDOW_SIZE, 87 | vl); 88 | __riscv_vse16_v_i16m8(p, v, vl); 89 | p += vl; 90 | size -= vl * sizeof(p[0]); 91 | } while (size != 0); 92 | } 93 | #define matchfinder_rebase matchfinder_rebase_rvv 94 | 95 | #endif /* ARCH_RISCV && __riscv_vector */ 96 | 97 | #endif /* LIB_RISCV_MATCHFINDER_IMPL_H */ 98 | -------------------------------------------------------------------------------- /lib/utils.c: -------------------------------------------------------------------------------- 1 | /* 2 | * utils.c - utility functions for libdeflate 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the 
"Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 
26 | */ 27 | 28 | #include "lib_common.h" 29 | 30 | #ifdef FREESTANDING 31 | # define malloc NULL 32 | # define free NULL 33 | #else 34 | # include 35 | #endif 36 | 37 | malloc_func_t libdeflate_default_malloc_func = malloc; 38 | free_func_t libdeflate_default_free_func = free; 39 | 40 | void * 41 | libdeflate_aligned_malloc(malloc_func_t malloc_func, 42 | size_t alignment, size_t size) 43 | { 44 | void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); 45 | 46 | if (ptr) { 47 | void *orig_ptr = ptr; 48 | 49 | ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); 50 | ((void **)ptr)[-1] = orig_ptr; 51 | } 52 | return ptr; 53 | } 54 | 55 | void 56 | libdeflate_aligned_free(free_func_t free_func, void *ptr) 57 | { 58 | (*free_func)(((void **)ptr)[-1]); 59 | } 60 | 61 | LIBDEFLATEAPI void 62 | libdeflate_set_memory_allocator(malloc_func_t malloc_func, 63 | free_func_t free_func) 64 | { 65 | libdeflate_default_malloc_func = malloc_func; 66 | libdeflate_default_free_func = free_func; 67 | } 68 | 69 | /* 70 | * Implementations of libc functions for freestanding library builds. 71 | * Normal library builds don't use these. Not optimized yet; usually the 72 | * compiler expands these functions and doesn't actually call them anyway. 
73 | */ 74 | #ifdef FREESTANDING 75 | #undef memset 76 | void * __attribute__((weak)) 77 | memset(void *s, int c, size_t n) 78 | { 79 | u8 *p = s; 80 | size_t i; 81 | 82 | for (i = 0; i < n; i++) 83 | p[i] = c; 84 | return s; 85 | } 86 | 87 | #undef memcpy 88 | void * __attribute__((weak)) 89 | memcpy(void *dest, const void *src, size_t n) 90 | { 91 | u8 *d = dest; 92 | const u8 *s = src; 93 | size_t i; 94 | 95 | for (i = 0; i < n; i++) 96 | d[i] = s[i]; 97 | return dest; 98 | } 99 | 100 | #undef memmove 101 | void * __attribute__((weak)) 102 | memmove(void *dest, const void *src, size_t n) 103 | { 104 | u8 *d = dest; 105 | const u8 *s = src; 106 | size_t i; 107 | 108 | if (d <= s) 109 | return memcpy(d, s, n); 110 | 111 | for (i = n; i > 0; i--) 112 | d[i - 1] = s[i - 1]; 113 | return dest; 114 | } 115 | 116 | #undef memcmp 117 | int __attribute__((weak)) 118 | memcmp(const void *s1, const void *s2, size_t n) 119 | { 120 | const u8 *p1 = s1; 121 | const u8 *p2 = s2; 122 | size_t i; 123 | 124 | for (i = 0; i < n; i++) { 125 | if (p1[i] != p2[i]) 126 | return (int)p1[i] - (int)p2[i]; 127 | } 128 | return 0; 129 | } 130 | #endif /* FREESTANDING */ 131 | 132 | #ifdef LIBDEFLATE_ENABLE_ASSERTIONS 133 | #include 134 | #include 135 | NORETURN void 136 | libdeflate_assertion_failed(const char *expr, const char *file, int line) 137 | { 138 | fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line); 139 | abort(); 140 | } 141 | #endif /* LIBDEFLATE_ENABLE_ASSERTIONS */ 142 | -------------------------------------------------------------------------------- /lib/x86/adler32_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in 
the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef LIB_X86_ADLER32_IMPL_H 29 | #define LIB_X86_ADLER32_IMPL_H 30 | 31 | #include "cpu_features.h" 32 | 33 | /* SSE2 and AVX2 implementations. Used on older CPUs. */ 34 | #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) 35 | # define adler32_x86_sse2 adler32_x86_sse2 36 | # define SUFFIX _sse2 37 | # define ATTRIBUTES _target_attribute("sse2") 38 | # define VL 16 39 | # define USE_VNNI 0 40 | # define USE_AVX512 0 41 | # include "adler32_template.h" 42 | 43 | # define adler32_x86_avx2 adler32_x86_avx2 44 | # define SUFFIX _avx2 45 | # define ATTRIBUTES _target_attribute("avx2") 46 | # define VL 32 47 | # define USE_VNNI 0 48 | # define USE_AVX512 0 49 | # include "adler32_template.h" 50 | #endif 51 | 52 | /* 53 | * AVX-VNNI implementation. This is used on CPUs that have AVX2 and AVX-VNNI 54 | * but don't have AVX-512, for example Intel Alder Lake. 
55 | * 56 | * Unusually for a new CPU feature, gcc added support for the AVX-VNNI 57 | * intrinsics (in gcc 11.1) slightly before binutils added support for 58 | * assembling AVX-VNNI instructions (in binutils 2.36). Distros can reasonably 59 | * have gcc 11 with binutils 2.35. Because of this issue, we check for gcc 12 60 | * instead of gcc 11. (libdeflate supports direct compilation without a 61 | * configure step, so checking the binutils version is not always an option.) 62 | */ 63 | #if (GCC_PREREQ(12, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)) && \ 64 | !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX_VNNI) 65 | # define adler32_x86_avx2_vnni adler32_x86_avx2_vnni 66 | # define SUFFIX _avx2_vnni 67 | # define ATTRIBUTES _target_attribute("avx2,avxvnni") 68 | # define VL 32 69 | # define USE_VNNI 1 70 | # define USE_AVX512 0 71 | # include "adler32_template.h" 72 | #endif 73 | 74 | #if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \ 75 | !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX512VNNI) 76 | /* 77 | * AVX512VNNI implementation using 256-bit vectors. This is very similar to the 78 | * AVX-VNNI implementation but takes advantage of masking and more registers. 79 | * This is used on certain older Intel CPUs, specifically Ice Lake and Tiger 80 | * Lake, which support AVX512VNNI but downclock a bit too eagerly when ZMM 81 | * registers are used. 82 | */ 83 | # define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni 84 | # define SUFFIX _avx512_vl256_vnni 85 | # define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni") 86 | # define VL 32 87 | # define USE_VNNI 1 88 | # define USE_AVX512 1 89 | # include "adler32_template.h" 90 | 91 | /* 92 | * AVX512VNNI implementation using 512-bit vectors. This is used on CPUs that 93 | * have a good AVX-512 implementation including AVX512VNNI. 
94 | */ 95 | # define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni 96 | # define SUFFIX _avx512_vl512_vnni 97 | # define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") 98 | # define VL 64 99 | # define USE_VNNI 1 100 | # define USE_AVX512 1 101 | # include "adler32_template.h" 102 | #endif 103 | 104 | static inline adler32_func_t 105 | arch_select_adler32_func(void) 106 | { 107 | const u32 features MAYBE_UNUSED = get_x86_cpu_features(); 108 | 109 | #ifdef adler32_x86_avx512_vl512_vnni 110 | if ((features & X86_CPU_FEATURE_ZMM) && 111 | HAVE_AVX512BW(features) && HAVE_AVX512VNNI(features)) 112 | return adler32_x86_avx512_vl512_vnni; 113 | #endif 114 | #ifdef adler32_x86_avx512_vl256_vnni 115 | if (HAVE_AVX512BW(features) && HAVE_AVX512VL(features) && 116 | HAVE_AVX512VNNI(features)) 117 | return adler32_x86_avx512_vl256_vnni; 118 | #endif 119 | #ifdef adler32_x86_avx2_vnni 120 | if (HAVE_AVX2(features) && HAVE_AVXVNNI(features)) 121 | return adler32_x86_avx2_vnni; 122 | #endif 123 | #ifdef adler32_x86_avx2 124 | if (HAVE_AVX2(features)) 125 | return adler32_x86_avx2; 126 | #endif 127 | #ifdef adler32_x86_sse2 128 | if (HAVE_SSE2(features)) 129 | return adler32_x86_sse2; 130 | #endif 131 | return NULL; 132 | } 133 | #define arch_select_adler32_func arch_select_adler32_func 134 | 135 | #endif /* LIB_X86_ADLER32_IMPL_H */ 136 | -------------------------------------------------------------------------------- /lib/x86/cpu_features.c: -------------------------------------------------------------------------------- 1 | /* 2 | * x86/cpu_features.c - feature detection for x86 CPUs 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or 
sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "../cpu_features_common.h" /* must be included first */ 29 | #include "cpu_features.h" 30 | 31 | #ifdef X86_CPU_FEATURES_KNOWN 32 | /* Runtime x86 CPU feature detection is supported. */ 33 | 34 | /* Execute the CPUID instruction. */ 35 | static inline void 36 | cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) 37 | { 38 | #ifdef _MSC_VER 39 | int result[4]; 40 | 41 | __cpuidex(result, leaf, subleaf); 42 | *a = result[0]; 43 | *b = result[1]; 44 | *c = result[2]; 45 | *d = result[3]; 46 | #else 47 | __asm__ volatile("cpuid" : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) 48 | : "a" (leaf), "c" (subleaf)); 49 | #endif 50 | } 51 | 52 | /* Read an extended control register. */ 53 | static inline u64 54 | read_xcr(u32 index) 55 | { 56 | #ifdef _MSC_VER 57 | return _xgetbv(index); 58 | #else 59 | u32 d, a; 60 | 61 | /* 62 | * Execute the "xgetbv" instruction. Old versions of binutils do not 63 | * recognize this instruction, so list the raw bytes instead. 64 | * 65 | * This must be 'volatile' to prevent this code from being moved out 66 | * from under the check for OSXSAVE. 
67 | */ 68 | __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : 69 | "=d" (d), "=a" (a) : "c" (index)); 70 | 71 | return ((u64)d << 32) | a; 72 | #endif 73 | } 74 | 75 | static const struct cpu_feature x86_cpu_feature_table[] = { 76 | {X86_CPU_FEATURE_SSE2, "sse2"}, 77 | {X86_CPU_FEATURE_PCLMULQDQ, "pclmulqdq"}, 78 | {X86_CPU_FEATURE_AVX, "avx"}, 79 | {X86_CPU_FEATURE_AVX2, "avx2"}, 80 | {X86_CPU_FEATURE_BMI2, "bmi2"}, 81 | {X86_CPU_FEATURE_ZMM, "zmm"}, 82 | {X86_CPU_FEATURE_AVX512BW, "avx512bw"}, 83 | {X86_CPU_FEATURE_AVX512VL, "avx512vl"}, 84 | {X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"}, 85 | {X86_CPU_FEATURE_AVX512VNNI, "avx512_vnni"}, 86 | {X86_CPU_FEATURE_AVXVNNI, "avx_vnni"}, 87 | }; 88 | 89 | volatile u32 libdeflate_x86_cpu_features = 0; 90 | 91 | static inline bool 92 | os_supports_avx512(u64 xcr0) 93 | { 94 | #ifdef __APPLE__ 95 | /* 96 | * The Darwin kernel had a bug where it could corrupt the opmask 97 | * registers. See 98 | * https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259 99 | * Darwin also does not initially set the XCR0 bits for AVX512, but they 100 | * are set if the thread tries to use AVX512 anyway. Thus, to safely 101 | * and consistently use AVX512 on macOS we'd need to check the kernel 102 | * version as well as detect AVX512 support using a macOS-specific 103 | * method. We don't bother with this, especially given Apple's 104 | * transition to arm64. 105 | */ 106 | return false; 107 | #else 108 | return (xcr0 & 0xe6) == 0xe6; 109 | #endif 110 | } 111 | 112 | /* 113 | * Don't use 512-bit vectors (ZMM registers) on Intel CPUs before Rocket Lake 114 | * and Sapphire Rapids, due to the overly-eager downclocking which can reduce 115 | * the performance of workloads that use ZMM registers only occasionally. 
116 | */ /* Returns false only for GenuineIntel CPUs of family 6 whose model is 85, 106, 108, 126, 140 or 141; returns true for everything else, and unconditionally when TEST_SUPPORT__DO_NOT_USE is defined. */ 117 | static inline bool 118 | allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model) 119 | { 120 | #ifdef TEST_SUPPORT__DO_NOT_USE 121 | return true; 122 | #endif 123 | if (memcmp(manufacturer, "GenuineIntel", 12) != 0) 124 | return true; 125 | if (family != 6) 126 | return true; 127 | switch (model) { 128 | case 85: /* Skylake (Server), Cascade Lake, Cooper Lake */ 129 | case 106: /* Ice Lake (Server) */ 130 | case 108: /* Ice Lake (Server) */ 131 | case 126: /* Ice Lake (Client) */ 132 | case 140: /* Tiger Lake */ 133 | case 141: /* Tiger Lake */ 134 | return false; 135 | } 136 | return true; 137 | } 138 | 139 | /* Initialize libdeflate_x86_cpu_features. */ 140 | void libdeflate_init_x86_cpu_features(void) 141 | { 142 | u32 max_leaf; 143 | u32 manufacturer[3]; 144 | u32 family, model; 145 | u32 a, b, c, d; 146 | u64 xcr0 = 0; 147 | u32 features = 0; 148 | /* xcr0 and features both start at 0, so each early jump to the out label publishes a mask with no optional feature bits. */ 149 | /* EAX=0: Highest Function Parameter and Manufacturer ID */ 150 | cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2], 151 | &manufacturer[1]); /* NOTE(review): the last three outputs go to manufacturer[0], [2], [1]; presumably cpuid() (defined outside this view) yields EBX, ECX, EDX in that order so the array spells the vendor string compared in allow_512bit_vectors() - confirm. */ 152 | if (max_leaf < 1) 153 | goto out; 154 | 155 | /* EAX=1: Processor Info and Feature Bits */ 156 | cpuid(1, 0, &a, &b, &c, &d); 157 | family = (a >> 8) & 0xf; 158 | model = (a >> 4) & 0xf; 159 | if (family == 6 || family == 0xf) 160 | model += (a >> 12) & 0xf0; 161 | if (family == 0xf) 162 | family += (a >> 20) & 0xff; /* model gains bits 16-19 of EAX as its high nibble for family 6 or 0xf; family gains bits 20-27 of EAX only for family 0xf. */ 163 | if (d & (1 << 26)) 164 | features |= X86_CPU_FEATURE_SSE2; 165 | /* 166 | * No known CPUs have pclmulqdq without sse4.1, so in practice code 167 | * targeting pclmulqdq can use sse4.1 instructions. But to be safe, 168 | * explicitly check for both the pclmulqdq and sse4.1 bits. 
169 | */ 170 | if ((c & (1 << 1)) && (c & (1 << 19))) 171 | features |= X86_CPU_FEATURE_PCLMULQDQ; 172 | if (c & (1 << 27)) 173 | xcr0 = read_xcr(0); /* When bit 27 of ECX is clear, xcr0 keeps its initial 0 and every XCR0-gated check below fails. */ 174 | if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) 175 | features |= X86_CPU_FEATURE_AVX; 176 | 177 | if (max_leaf < 7) 178 | goto out; 179 | 180 | /* EAX=7, ECX=0: Extended Features */ 181 | cpuid(7, 0, &a, &b, &c, &d); 182 | if (b & (1 << 8)) 183 | features |= X86_CPU_FEATURE_BMI2; 184 | if ((xcr0 & 0x6) == 0x6) { 185 | if (b & (1 << 5)) 186 | features |= X86_CPU_FEATURE_AVX2; 187 | if (c & (1 << 10)) 188 | features |= X86_CPU_FEATURE_VPCLMULQDQ; 189 | } 190 | if (os_supports_avx512(xcr0)) { 191 | if (allow_512bit_vectors(manufacturer, family, model)) 192 | features |= X86_CPU_FEATURE_ZMM; /* ZMM is decided from XCR0 and the CPU-model allow-list only; no CPUID AVX-512 bit is consulted for it here. */ 193 | if (b & (1 << 30)) 194 | features |= X86_CPU_FEATURE_AVX512BW; 195 | if (b & (1U << 31)) 196 | features |= X86_CPU_FEATURE_AVX512VL; 197 | if (c & (1 << 11)) 198 | features |= X86_CPU_FEATURE_AVX512VNNI; 199 | } 200 | 201 | /* EAX=7, ECX=1: Extended Features */ 202 | cpuid(7, 1, &a, &b, &c, &d); 203 | if ((a & (1 << 4)) && ((xcr0 & 0x6) == 0x6)) 204 | features |= X86_CPU_FEATURE_AVXVNNI; 205 | /* Reached by fall-through and by both early exits. disable_cpu_features_for_testing() (defined elsewhere; presumably clears bits for tests) sees the mask first, then it is stored with X86_CPU_FEATURES_KNOWN set so the stored value is never 0. */ 206 | out: 207 | disable_cpu_features_for_testing(&features, x86_cpu_feature_table, 208 | ARRAY_LEN(x86_cpu_feature_table)); 209 | 210 | libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN; 211 | } 212 | 213 | #endif /* X86_CPU_FEATURES_KNOWN */ 214 | -------------------------------------------------------------------------------- /lib/x86/cpu_features.h: -------------------------------------------------------------------------------- 1 | /* 2 | * x86/cpu_features.h - feature detection for x86 CPUs 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, 
merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef LIB_X86_CPU_FEATURES_H 29 | #define LIB_X86_CPU_FEATURES_H 30 | 31 | #include "../lib_common.h" 32 | 33 | #if defined(ARCH_X86_32) || defined(ARCH_X86_64) 34 | 35 | #define X86_CPU_FEATURE_SSE2 (1 << 0) 36 | #define X86_CPU_FEATURE_PCLMULQDQ (1 << 1) 37 | #define X86_CPU_FEATURE_AVX (1 << 2) 38 | #define X86_CPU_FEATURE_AVX2 (1 << 3) 39 | #define X86_CPU_FEATURE_BMI2 (1 << 4) 40 | /* 41 | * ZMM indicates whether 512-bit vectors (zmm registers) should be used. On 42 | * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU and 43 | * operating system support AVX-512. On these CPUs, we may still use AVX-512 44 | * instructions, but only with xmm and ymm registers. 
45 | */ 46 | #define X86_CPU_FEATURE_ZMM (1 << 5) 47 | #define X86_CPU_FEATURE_AVX512BW (1 << 6) 48 | #define X86_CPU_FEATURE_AVX512VL (1 << 7) 49 | #define X86_CPU_FEATURE_VPCLMULQDQ (1 << 8) 50 | #define X86_CPU_FEATURE_AVX512VNNI (1 << 9) 51 | #define X86_CPU_FEATURE_AVXVNNI (1 << 10) 52 | 53 | #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) 54 | /* Runtime x86 CPU feature detection is supported. */ 55 | # define X86_CPU_FEATURES_KNOWN (1U << 31) 56 | extern volatile u32 libdeflate_x86_cpu_features; 57 | 58 | void libdeflate_init_x86_cpu_features(void); 59 | /* Returns the feature mask, first calling libdeflate_init_x86_cpu_features() if the cached value is still 0. The initializer always sets X86_CPU_FEATURES_KNOWN, so the cached value is nonzero once it has run. */ 60 | static inline u32 get_x86_cpu_features(void) 61 | { 62 | if (libdeflate_x86_cpu_features == 0) 63 | libdeflate_init_x86_cpu_features(); 64 | return libdeflate_x86_cpu_features; 65 | } 66 | /* 67 | * x86 intrinsics are also supported. Include the headers needed to use them. 68 | * Normally just immintrin.h suffices. With clang in MSVC compatibility mode, 69 | * immintrin.h incorrectly skips including sub-headers, so include those too. 
70 | */ /* NOTE(review): the header names on the following #include and __has_include lines were missing from this copy and have been restored. immintrin.h follows from the comment above; the clang sub-header names should be checked against the upstream file. */ 71 | # include <immintrin.h> 72 | # if defined(_MSC_VER) && defined(__clang__) 73 | # include <tmmintrin.h> 74 | # include <smmintrin.h> 75 | # include <wmmintrin.h> 76 | # include <avxintrin.h> 77 | # include <avx2intrin.h> 78 | # include <avx512fintrin.h> 79 | # include <avx512bwintrin.h> 80 | # include <avx512vlintrin.h> 81 | # if __has_include(<avx512vlbwintrin.h>) 82 | # include <avx512vlbwintrin.h> 83 | # endif 84 | # if __has_include(<vpclmulqdqintrin.h>) 85 | # include <vpclmulqdqintrin.h> 86 | # endif 87 | # if __has_include(<avx512vnniintrin.h>) 88 | # include <avx512vnniintrin.h> 89 | # endif 90 | # if __has_include(<avx512vlvnniintrin.h>) 91 | # include <avx512vlvnniintrin.h> 92 | # endif 93 | # if __has_include(<avxvnniintrin.h>) 94 | # include <avxvnniintrin.h> 95 | # endif 96 | # endif 97 | #else /* No runtime detection with this compiler: report an empty feature mask. */ 98 | static inline u32 get_x86_cpu_features(void) { return 0; } 99 | #endif 100 | /* Each HAVE_<X>(features) below expands to constant 1 when the compiler predefines the matching target macro, and otherwise tests the X86_CPU_FEATURE_<X> bit of the runtime mask. */ 101 | #if defined(__SSE2__) || \ 102 | (defined(_MSC_VER) && \ 103 | (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) 104 | # define HAVE_SSE2(features) 1 105 | # define HAVE_SSE2_NATIVE 1 106 | #else 107 | # define HAVE_SSE2(features) ((features) & X86_CPU_FEATURE_SSE2) 108 | # define HAVE_SSE2_NATIVE 0 109 | #endif 110 | 111 | #if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \ 112 | (defined(_MSC_VER) && defined(__AVX2__)) 113 | # define HAVE_PCLMULQDQ(features) 1 114 | #else 115 | # define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ) 116 | #endif 117 | 118 | #ifdef __AVX__ 119 | # define HAVE_AVX(features) 1 120 | #else 121 | # define HAVE_AVX(features) ((features) & X86_CPU_FEATURE_AVX) 122 | #endif 123 | 124 | #ifdef __AVX2__ 125 | # define HAVE_AVX2(features) 1 126 | #else 127 | # define HAVE_AVX2(features) ((features) & X86_CPU_FEATURE_AVX2) 128 | #endif 129 | 130 | #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) 131 | # define HAVE_BMI2(features) 1 132 | # define HAVE_BMI2_NATIVE 1 133 | #else 134 | # define HAVE_BMI2(features) ((features) & X86_CPU_FEATURE_BMI2) 135 | # define HAVE_BMI2_NATIVE 0 136 | #endif 137 | 138 | #ifdef __AVX512BW__ 139 | # define HAVE_AVX512BW(features) 1 140 | #else 141 | # define HAVE_AVX512BW(features) ((features) & X86_CPU_FEATURE_AVX512BW) 142 | #endif 143 | 144 | #ifdef __AVX512VL__ 145 | # define 
HAVE_AVX512VL(features) 1 146 | #else 147 | # define HAVE_AVX512VL(features) ((features) & X86_CPU_FEATURE_AVX512VL) 148 | #endif 149 | 150 | #ifdef __VPCLMULQDQ__ 151 | # define HAVE_VPCLMULQDQ(features) 1 152 | #else 153 | # define HAVE_VPCLMULQDQ(features) ((features) & X86_CPU_FEATURE_VPCLMULQDQ) 154 | #endif 155 | 156 | #ifdef __AVX512VNNI__ 157 | # define HAVE_AVX512VNNI(features) 1 158 | #else 159 | # define HAVE_AVX512VNNI(features) ((features) & X86_CPU_FEATURE_AVX512VNNI) 160 | #endif 161 | 162 | #ifdef __AVXVNNI__ 163 | # define HAVE_AVXVNNI(features) 1 164 | #else 165 | # define HAVE_AVXVNNI(features) ((features) & X86_CPU_FEATURE_AVXVNNI) 166 | #endif 167 | 168 | #endif /* ARCH_X86_32 || ARCH_X86_64 */ 169 | 170 | #endif /* LIB_X86_CPU_FEATURES_H */ 171 | -------------------------------------------------------------------------------- /lib/x86/crc32_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * x86/crc32_impl.h - x86 implementations of the gzip CRC-32 algorithm 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef LIB_X86_CRC32_IMPL_H 29 | #define LIB_X86_CRC32_IMPL_H 30 | 31 | #include "cpu_features.h" 32 | 33 | /* 34 | * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. 35 | * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes. 36 | */ 37 | static const u8 MAYBE_UNUSED shift_tab[48] = { 38 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 39 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 40 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 41 | 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 42 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 43 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 44 | }; 45 | 46 | #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) 47 | /* 48 | * PCLMULQDQ implementation. This targets PCLMULQDQ+SSE4.1, since in practice 49 | * all CPUs that support PCLMULQDQ also support SSE4.1. 50 | */ 51 | # define crc32_x86_pclmulqdq crc32_x86_pclmulqdq 52 | # define SUFFIX _pclmulqdq 53 | # define ATTRIBUTES _target_attribute("pclmul,sse4.1") 54 | # define VL 16 55 | # define USE_AVX512 0 56 | # include "crc32_pclmul_template.h" 57 | 58 | /* 59 | * PCLMULQDQ/AVX implementation. Same as above, but this is compiled with AVX 60 | * enabled so that the compiler can generate VEX-coded instructions which can be 61 | * slightly more efficient. It still uses 128-bit vectors. 62 | */ 63 | # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx 64 | # define SUFFIX _pclmulqdq_avx 65 | # define ATTRIBUTES _target_attribute("pclmul,avx") 66 | # define VL 16 67 | # define USE_AVX512 0 68 | # include "crc32_pclmul_template.h" 69 | #endif 70 | 71 | /* 72 | * VPCLMULQDQ/AVX2 implementation. 
This is used on CPUs that have AVX2 and 73 | * VPCLMULQDQ but don't have AVX-512, for example Intel Alder Lake. 74 | * 75 | * Currently this can't be enabled with MSVC because MSVC has a bug where it 76 | * incorrectly assumes that VPCLMULQDQ implies AVX-512: 77 | * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785 78 | * 79 | * gcc 8.1 and 8.2 had a similar bug where they assumed that 80 | * _mm256_clmulepi64_epi128() always needed AVX512. It's fixed in gcc 8.3. 81 | * 82 | * _mm256_zextsi128_si256() requires gcc 10. 83 | */ 84 | #if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \ 85 | !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ) 86 | # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 87 | # define SUFFIX _vpclmulqdq_avx2 88 | # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") 89 | # define VL 32 90 | # define USE_AVX512 0 91 | # include "crc32_pclmul_template.h" 92 | #endif 93 | 94 | #if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \ 95 | !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ) 96 | /* 97 | * VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar 98 | * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog 99 | * instruction and more registers. This is used on certain older Intel CPUs, 100 | * specifically Ice Lake and Tiger Lake, which support VPCLMULQDQ and AVX512 but 101 | * downclock a bit too eagerly when ZMM registers are used. 102 | * 103 | * _mm256_zextsi128_si256() requires gcc 10. 104 | */ 105 | # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 106 | # define SUFFIX _vpclmulqdq_avx512_vl256 107 | # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl") 108 | # define VL 32 109 | # define USE_AVX512 1 110 | # include "crc32_pclmul_template.h" 111 | 112 | /* 113 | * VPCLMULQDQ/AVX512 implementation using 512-bit vectors. 
This is used on CPUs 114 | * that have a good AVX-512 implementation including VPCLMULQDQ. 115 | * 116 | * _mm512_zextsi128_si512() requires gcc 10. 117 | */ 118 | # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 119 | # define SUFFIX _vpclmulqdq_avx512_vl512 120 | # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl") 121 | # define VL 64 122 | # define USE_AVX512 1 123 | # include "crc32_pclmul_template.h" 124 | #endif 125 | 126 | static inline crc32_func_t 127 | arch_select_crc32_func(void) 128 | { 129 | const u32 features MAYBE_UNUSED = get_x86_cpu_features(); 130 | 131 | #ifdef crc32_x86_vpclmulqdq_avx512_vl512 132 | if ((features & X86_CPU_FEATURE_ZMM) && 133 | HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && 134 | HAVE_AVX512BW(features) && HAVE_AVX512VL(features)) 135 | return crc32_x86_vpclmulqdq_avx512_vl512; 136 | #endif 137 | #ifdef crc32_x86_vpclmulqdq_avx512_vl256 138 | if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && 139 | HAVE_AVX512BW(features) && HAVE_AVX512VL(features)) 140 | return crc32_x86_vpclmulqdq_avx512_vl256; 141 | #endif 142 | #ifdef crc32_x86_vpclmulqdq_avx2 143 | if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && 144 | HAVE_AVX2(features)) 145 | return crc32_x86_vpclmulqdq_avx2; 146 | #endif 147 | #ifdef crc32_x86_pclmulqdq_avx 148 | if (HAVE_PCLMULQDQ(features) && HAVE_AVX(features)) 149 | return crc32_x86_pclmulqdq_avx; 150 | #endif 151 | #ifdef crc32_x86_pclmulqdq 152 | if (HAVE_PCLMULQDQ(features)) 153 | return crc32_x86_pclmulqdq; 154 | #endif 155 | return NULL; 156 | } 157 | #define arch_select_crc32_func arch_select_crc32_func 158 | 159 | #endif /* LIB_X86_CRC32_IMPL_H */ 160 | -------------------------------------------------------------------------------- /lib/x86/decompress_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef LIB_X86_DECOMPRESS_IMPL_H 2 | #define LIB_X86_DECOMPRESS_IMPL_H 3 | 4 | 
#include "cpu_features.h" 5 | 6 | /* 7 | * BMI2 optimized decompression function. 8 | * 9 | * With gcc and clang we just compile the whole function with 10 | * __attribute__((target("bmi2"))), and the compiler uses bmi2 automatically. 11 | * 12 | * With MSVC, there is no target function attribute, but it's still possible to 13 | * use bmi2 intrinsics explicitly. Currently we mostly don't, but there's a 14 | * case in which we do (see below), so we at least take advantage of that. 15 | * However, MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() 16 | * intrinsics. It seems to be fixed in VS2022. Hence, use MSVC_PREREQ(1930). 17 | */ 18 | #if defined(__GNUC__) || defined(__clang__) || MSVC_PREREQ(1930) 19 | # define deflate_decompress_bmi2 deflate_decompress_bmi2 20 | # define FUNCNAME deflate_decompress_bmi2 21 | # define ATTRIBUTES _target_attribute("bmi2") 22 | /* 23 | * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the 24 | * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic 25 | * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)'; 26 | * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'. 27 | * Nevertheless, their implementation using the bzhi intrinsic is identical, 28 | * as the bzhi instruction truncates the count to 8 bits implicitly. 
29 | */ 30 | # ifndef __clang__ 31 | # ifdef ARCH_X86_64 32 | # define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count)) 33 | # define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count)) 34 | # else 35 | # define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count)) 36 | # define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count)) 37 | # endif 38 | # endif 39 | # include "../decompress_template.h" 40 | #endif 41 | /* When the BMI2 variant was built and HAVE_BMI2_NATIVE is nonzero, it becomes DEFAULT_IMPL and no runtime selector is defined. */ 42 | #if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE 43 | #define DEFAULT_IMPL deflate_decompress_bmi2 44 | #else /* Runtime selector: returns deflate_decompress_bmi2 if it was built and HAVE_BMI2() holds for get_x86_cpu_features(), otherwise NULL. */ 45 | static inline decompress_func_t 46 | arch_select_decompress_func(void) 47 | { 48 | #ifdef deflate_decompress_bmi2 49 | if (HAVE_BMI2(get_x86_cpu_features())) 50 | return deflate_decompress_bmi2; 51 | #endif 52 | return NULL; 53 | } 54 | #define arch_select_decompress_func arch_select_decompress_func 55 | #endif 56 | 57 | #endif /* LIB_X86_DECOMPRESS_IMPL_H */ 58 | -------------------------------------------------------------------------------- /lib/x86/matchfinder_impl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * x86/matchfinder_impl.h - x86 implementations of matchfinder functions 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 
17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef LIB_X86_MATCHFINDER_IMPL_H 29 | #define LIB_X86_MATCHFINDER_IMPL_H 30 | 31 | #include "cpu_features.h" 32 | 33 | #ifdef __AVX2__ /* Stores MATCHFINDER_INITVAL into every 16-bit entry of the size-byte buffer at data, four __m256i stores (128 bytes) per iteration. The loop body runs before size is tested and the loop ends only when size reaches exactly 0, so size has to be a nonzero multiple of 128. NOTE(review): the plain __m256i stores presumably also need data aligned to 32 bytes, which the MATCHFINDER_MEM_ALIGNMENT assertion suggests callers guarantee - confirm. */ 34 | static forceinline void 35 | matchfinder_init_avx2(mf_pos_t *data, size_t size) 36 | { 37 | __m256i *p = (__m256i *)data; 38 | __m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL); 39 | 40 | STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); 41 | STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); 42 | STATIC_ASSERT(sizeof(mf_pos_t) == 2); 43 | 44 | do { 45 | p[0] = v; 46 | p[1] = v; 47 | p[2] = v; 48 | p[3] = v; 49 | p += 4; 50 | size -= 4 * sizeof(*p); 51 | } while (size != 0); 52 | } 53 | #define matchfinder_init matchfinder_init_avx2 54 | /* Adds (u16)-MATCHFINDER_WINDOW_SIZE to every 16-bit entry with signed saturation (_mm256_adds_epi16), four vectors per iteration. The same size precondition as matchfinder_init_avx2() applies. */ 55 | static forceinline void 56 | matchfinder_rebase_avx2(mf_pos_t *data, size_t size) 57 | { 58 | __m256i *p = (__m256i *)data; 59 | __m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); 60 | 61 | STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); 62 | STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); 63 | STATIC_ASSERT(sizeof(mf_pos_t) == 2); 64 | 65 | do { 66 | /* PADDSW: Add Packed Signed Integers With Signed Saturation */ 67 | p[0] = _mm256_adds_epi16(p[0], v); 68 | p[1] = _mm256_adds_epi16(p[1], v); 69 | p[2] = _mm256_adds_epi16(p[2], v); 70 | p[3] = _mm256_adds_epi16(p[3], v); 71 | p += 4; 72 | size -= 4 * sizeof(*p); 73 | } while (size != 0); 74 | } 75 | #define matchfinder_rebase 
matchfinder_rebase_avx2 76 | 77 | #elif HAVE_SSE2_NATIVE /* SSE2 counterpart of the AVX2 initializer, using 128-bit vectors: 64 bytes are written per iteration, so size has to be a nonzero multiple of 64. */ 78 | static forceinline void 79 | matchfinder_init_sse2(mf_pos_t *data, size_t size) 80 | { 81 | __m128i *p = (__m128i *)data; 82 | __m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL); 83 | 84 | STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); 85 | STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); 86 | STATIC_ASSERT(sizeof(mf_pos_t) == 2); 87 | 88 | do { 89 | p[0] = v; 90 | p[1] = v; 91 | p[2] = v; 92 | p[3] = v; 93 | p += 4; 94 | size -= 4 * sizeof(*p); 95 | } while (size != 0); 96 | } 97 | #define matchfinder_init matchfinder_init_sse2 98 | /* SSE2 counterpart of the AVX2 rebase: saturating add of (u16)-MATCHFINDER_WINDOW_SIZE to every 16-bit entry via _mm_adds_epi16, 64 bytes per iteration. */ 99 | static forceinline void 100 | matchfinder_rebase_sse2(mf_pos_t *data, size_t size) 101 | { 102 | __m128i *p = (__m128i *)data; 103 | __m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); 104 | 105 | STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); 106 | STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); 107 | STATIC_ASSERT(sizeof(mf_pos_t) == 2); 108 | 109 | do { 110 | /* PADDSW: Add Packed Signed Integers With Signed Saturation */ 111 | p[0] = _mm_adds_epi16(p[0], v); 112 | p[1] = _mm_adds_epi16(p[1], v); 113 | p[2] = _mm_adds_epi16(p[2], v); 114 | p[3] = _mm_adds_epi16(p[3], v); 115 | p += 4; 116 | size -= 4 * sizeof(*p); 117 | } while (size != 0); 118 | } 119 | #define matchfinder_rebase matchfinder_rebase_sse2 120 | #endif /* HAVE_SSE2_NATIVE */ 121 | 122 | #endif /* LIB_X86_MATCHFINDER_IMPL_H */ 123 | -------------------------------------------------------------------------------- /lib/zlib_compress.c: -------------------------------------------------------------------------------- 1 | /* 2 | * zlib_compress.c - compress with a zlib wrapper 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, 
including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "deflate_compress.h" 29 | #include "zlib_constants.h" 30 | /* Wraps a DEFLATE stream in the zlib container: a 2-byte big-endian header, the compressed data, then a 4-byte big-endian Adler-32 of the input. Returns the total number of bytes written to out, or 0 when out_nbytes_avail is not greater than ZLIB_MIN_OVERHEAD or when libdeflate_deflate_compress() returns 0. */ 31 | LIBDEFLATEAPI size_t 32 | libdeflate_zlib_compress(struct libdeflate_compressor *c, 33 | const void *in, size_t in_nbytes, 34 | void *out, size_t out_nbytes_avail) 35 | { 36 | u8 *out_next = out; 37 | u16 hdr; 38 | unsigned compression_level; 39 | unsigned level_hint; 40 | size_t deflate_size; 41 | 42 | if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) 43 | return 0; 44 | 45 | /* 2 byte header: CMF and FLG */ 46 | hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); 47 | compression_level = libdeflate_get_compression_level(c); 48 | if (compression_level < 2) 49 | level_hint = ZLIB_FASTEST_COMPRESSION; 50 | else if (compression_level < 6) 51 | level_hint = ZLIB_FAST_COMPRESSION; 52 | else if (compression_level < 8) 53 | level_hint = ZLIB_DEFAULT_COMPRESSION; 54 | else 55 | level_hint = ZLIB_SLOWEST_COMPRESSION; /* Levels below 2, 2-5, 6-7, and 8 or above select the four hint values; the hint is placed in bits 6-7 of the header. */ 56 | hdr |= level_hint << 6; 57 | hdr |= 31 - (hdr % 31); /* The low five bits of hdr are still zero here, so OR-ing in 31 - (hdr % 31) acts as an addition and makes hdr a multiple of 31 (the FCHECK field of the zlib header). */ 58 | 59 | put_unaligned_be16(hdr, out_next); 60 | out_next += 2; 61 | 62 | /* 
Compressed data */ 63 | deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, 64 | out_nbytes_avail - ZLIB_MIN_OVERHEAD); /* The capacity passed down excludes ZLIB_MIN_OVERHEAD, leaving room for the 2-byte header already written and the 4-byte checksum below. */ 65 | if (deflate_size == 0) 66 | return 0; 67 | out_next += deflate_size; 68 | 69 | /* ADLER32 */ 70 | put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next); 71 | out_next += 4; 72 | 73 | return out_next - (u8 *)out; 74 | } 75 | /* Returns libdeflate_deflate_compress_bound(c, in_nbytes) plus ZLIB_MIN_OVERHEAD for the header and checksum. */ 76 | LIBDEFLATEAPI size_t 77 | libdeflate_zlib_compress_bound(struct libdeflate_compressor *c, 78 | size_t in_nbytes) 79 | { 80 | return ZLIB_MIN_OVERHEAD + 81 | libdeflate_deflate_compress_bound(c, in_nbytes); 82 | } 83 | -------------------------------------------------------------------------------- /lib/zlib_constants.h: -------------------------------------------------------------------------------- 1 | /* 2 | * zlib_constants.h - constants for the zlib wrapper format 3 | */ 4 | 5 | #ifndef LIB_ZLIB_CONSTANTS_H 6 | #define LIB_ZLIB_CONSTANTS_H 7 | 8 | #define ZLIB_MIN_HEADER_SIZE 2 9 | #define ZLIB_FOOTER_SIZE 4 10 | #define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) 11 | 12 | #define ZLIB_CM_DEFLATE 8 13 | 14 | #define ZLIB_CINFO_32K_WINDOW 7 15 | 16 | #define ZLIB_FASTEST_COMPRESSION 0 17 | #define ZLIB_FAST_COMPRESSION 1 18 | #define ZLIB_DEFAULT_COMPRESSION 2 19 | #define ZLIB_SLOWEST_COMPRESSION 3 20 | 21 | #endif /* LIB_ZLIB_CONSTANTS_H */ 22 | -------------------------------------------------------------------------------- /lib/zlib_decompress.c: -------------------------------------------------------------------------------- 1 | /* 2 | * zlib_decompress.c - decompress with a zlib wrapper 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or 
sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "lib_common.h" 29 | #include "zlib_constants.h" 30 | /* Decodes one zlib stream: checks the 2-byte header, inflates the body with libdeflate_deflate_decompress_ex(), then compares the stored big-endian Adler-32 with one computed over the output. Returns LIBDEFLATE_BAD_DATA for input shorter than ZLIB_MIN_OVERHEAD, a rejected header field or a checksum mismatch; any non-success result of the inflate call is returned unchanged. On success *actual_in_nbytes_ret, if non-NULL, receives the number of input bytes consumed including header and checksum. */ 31 | LIBDEFLATEAPI enum libdeflate_result 32 | libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d, 33 | const void *in, size_t in_nbytes, 34 | void *out, size_t out_nbytes_avail, 35 | size_t *actual_in_nbytes_ret, 36 | size_t *actual_out_nbytes_ret) 37 | { 38 | const u8 *in_next = in; 39 | const u8 * const in_end = in_next + in_nbytes; 40 | u16 hdr; 41 | size_t actual_in_nbytes; 42 | size_t actual_out_nbytes; 43 | enum libdeflate_result result; 44 | 45 | if (in_nbytes < ZLIB_MIN_OVERHEAD) 46 | return LIBDEFLATE_BAD_DATA; 47 | 48 | /* 2 byte header: CMF and FLG */ 49 | hdr = get_unaligned_be16(in_next); 50 | in_next += 2; 51 | 52 | /* FCHECK */ 53 | if ((hdr % 31) != 0) 54 | return LIBDEFLATE_BAD_DATA; 55 | 56 | /* CM */ 57 | if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE) 58 | return LIBDEFLATE_BAD_DATA; 59 | 60 | /* CINFO */ 61 | if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW) 62 | return LIBDEFLATE_BAD_DATA; 63 | 64 | /* FDICT */ 65 | if ((hdr >> 5) & 1) 66 | return LIBDEFLATE_BAD_DATA; /* A set FDICT bit (bit 5 of the header) is rejected rather than handled. */ 67 | 68 | /* Compressed data */ 69 | result = libdeflate_deflate_decompress_ex(d, 
in_next, 70 | in_end - ZLIB_FOOTER_SIZE - in_next, 71 | out, out_nbytes_avail, 72 | &actual_in_nbytes, actual_out_nbytes_ret); /* The input length passed down stops ZLIB_FOOTER_SIZE bytes before in_end, so the 4-byte checksum read below stays inside the input as long as actual_in_nbytes does not exceed that length (presumably guaranteed by the callee). */ 73 | if (result != LIBDEFLATE_SUCCESS) 74 | return result; 75 | 76 | if (actual_out_nbytes_ret) 77 | actual_out_nbytes = *actual_out_nbytes_ret; 78 | else 79 | actual_out_nbytes = out_nbytes_avail; /* NOTE(review): with a NULL actual_out_nbytes_ret the checksum covers the whole out buffer; presumably the inflate call only succeeds in that mode when it filled the buffer exactly - confirm in deflate_decompress.c. */ 80 | 81 | in_next += actual_in_nbytes; 82 | 83 | /* ADLER32 */ 84 | if (libdeflate_adler32(1, out, actual_out_nbytes) != 85 | get_unaligned_be32(in_next)) 86 | return LIBDEFLATE_BAD_DATA; 87 | in_next += 4; 88 | 89 | if (actual_in_nbytes_ret) 90 | *actual_in_nbytes_ret = in_next - (u8 *)in; 91 | 92 | return LIBDEFLATE_SUCCESS; 93 | } 94 | /* Same as libdeflate_zlib_decompress_ex() with actual_in_nbytes_ret passed as NULL. */ 95 | LIBDEFLATEAPI enum libdeflate_result 96 | libdeflate_zlib_decompress(struct libdeflate_decompressor *d, 97 | const void *in, size_t in_nbytes, 98 | void *out, size_t out_nbytes_avail, 99 | size_t *actual_out_nbytes_ret) 100 | { 101 | return libdeflate_zlib_decompress_ex(d, in, in_nbytes, 102 | out, out_nbytes_avail, 103 | NULL, actual_out_nbytes_ret); 104 | } 105 | -------------------------------------------------------------------------------- /libdeflate-config.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | include("${CMAKE_CURRENT_LIST_DIR}/libdeflate-targets.cmake") 4 | -------------------------------------------------------------------------------- /libdeflate.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=${prefix} 3 | includedir=@CMAKE_PKGCONFIG_INCLUDEDIR@ 4 | libdir=@CMAKE_PKGCONFIG_LIBDIR@ 5 | 6 | Name: libdeflate 7 | Description: Fast implementation of DEFLATE, zlib, and gzip 8 | Version: @PROJECT_VERSION@ 9 | Libs: -L${libdir} -ldeflate 10 | Cflags: -I${includedir} 11 | 12 | # Note: this library's public header allows LIBDEFLATE_DLL to be defined when 13 | # linking to the DLL on Windows, to make __declspec(dllimport) be used. 
14 | # However, the only way to define a shared-library-only flag in a pkgconfig file 15 | # is to use the weird workaround of unconditionally defining it in Cflags, then 16 | # undefining it in Cflags.private. Just don't bother with this, since 17 | # __declspec(dllimport) is optional anyway. It is a very minor performance 18 | # optimization that is irrelevant for most use cases of libdeflate. 19 | -------------------------------------------------------------------------------- /programs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(CheckSymbolExists) 2 | 3 | # Check for the availability of OS functionality and generate the config.h file. 4 | # 5 | # Keep CMAKE_REQUIRED_DEFINITIONS in sync with what prog_util.h does. 6 | if(LINUX) 7 | set(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L) 8 | elseif(APPLE) 9 | set(CMAKE_REQUIRED_DEFINITIONS -D_DARWIN_C_SOURCE -U_POSIX_C_SOURCE) 10 | else() 11 | set(CMAKE_REQUIRED_DEFINITIONS -U_POSIX_C_SOURCE) 12 | endif() 13 | check_symbol_exists(clock_gettime "time.h" HAVE_CLOCK_GETTIME) 14 | check_symbol_exists(futimens "fcntl.h;sys/stat.h" HAVE_FUTIMENS) 15 | check_symbol_exists(posix_fadvise "fcntl.h" HAVE_POSIX_FADVISE) 16 | check_symbol_exists(posix_madvise "sys/mman.h" HAVE_POSIX_MADVISE) 17 | check_c_source_compiles("#include <sys/types.h> 18 | #include <sys/stat.h> 19 | int main() { struct stat st; (void)st.st_atim; }" 20 | HAVE_STAT_NANOSECOND_PRECISION) # Sets HAVE_STAT_NANOSECOND_PRECISION when struct stat has an st_atim member. NOTE(review): the two header names in the snippet were missing from this copy and have been restored; sys/stat.h is required for struct stat, sys/types.h should be checked against the upstream file. 21 | configure_file(config.h.in config.h) 22 | 23 | # Build a utility library for the programs. This library is not installed. 
24 | add_library(libdeflate_prog_utils STATIC prog_util.c tgetopt.c ../common_defs.h) 25 | set_target_properties(libdeflate_prog_utils PROPERTIES 26 | OUTPUT_NAME deflate_prog_utils) 27 | if(LIBDEFLATE_USE_SHARED_LIB) 28 | target_link_libraries(libdeflate_prog_utils PUBLIC libdeflate_shared) 29 | else() 30 | target_link_libraries(libdeflate_prog_utils PUBLIC libdeflate_static) 31 | endif() 32 | target_include_directories(libdeflate_prog_utils PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) 33 | target_compile_definitions(libdeflate_prog_utils PUBLIC HAVE_CONFIG_H) 34 | if(WIN32) 35 | if(MINGW) 36 | target_compile_options(libdeflate_prog_utils PUBLIC -municode) 37 | target_link_libraries(libdeflate_prog_utils PUBLIC -municode) 38 | else() 39 | target_compile_definitions(libdeflate_prog_utils PUBLIC UNICODE _UNICODE) 40 | endif() 41 | endif() 42 | 43 | # Build and install libdeflate-gzip and its alias libdeflate-gunzip. 44 | if(LIBDEFLATE_BUILD_GZIP) 45 | add_executable(libdeflate-gzip gzip.c) 46 | target_link_libraries(libdeflate-gzip PRIVATE libdeflate_prog_utils) 47 | install(TARGETS libdeflate-gzip DESTINATION ${CMAKE_INSTALL_BINDIR}) 48 | if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14") 49 | # Install libdeflate-gunzip as a hard link to libdeflate-gzip. 50 | # Fall back to a copy if hard links are unsupported. 51 | # 52 | # Note: on Windows, prepending DESTDIR like this doesn't work correctly 53 | # when ${CMAKE_INSTALL_FULL_BINDIR} includes a drive letter. But that 54 | # is fine since DESTDIR is unsupported on Windows anyway, according to 55 | # the CMake documentation. 
56 | set(GZIP "${CMAKE_INSTALL_FULL_BINDIR}/libdeflate-gzip${CMAKE_EXECUTABLE_SUFFIX}") 57 | set(GUNZIP "${CMAKE_INSTALL_FULL_BINDIR}/libdeflate-gunzip${CMAKE_EXECUTABLE_SUFFIX}") 58 | install(CODE "message(\"-- Installing: \$ENV{DESTDIR}${GUNZIP}\")") 59 | install(CODE "file(CREATE_LINK \"\$ENV{DESTDIR}${GZIP}\" 60 | \"\$ENV{DESTDIR}${GUNZIP}\" COPY_ON_ERROR)") 61 | else() 62 | # The cmake version is too old to support file(CREATE_LINK). 63 | # Just compile gzip.c again to build libdeflate-gunzip. 64 | add_executable(libdeflate-gunzip gzip.c) 65 | target_link_libraries(libdeflate-gunzip PRIVATE libdeflate_prog_utils) 66 | install(TARGETS libdeflate-gunzip DESTINATION ${CMAKE_INSTALL_BINDIR}) 67 | endif() 68 | endif() 69 | 70 | # Build the test programs, if requested. 71 | if(LIBDEFLATE_BUILD_TESTS) 72 | 73 | # The test programs depend on zlib for comparison tests. 74 | find_package(ZLIB REQUIRED) 75 | 76 | # Build a utility library for the test programs. 77 | add_library(libdeflate_test_utils STATIC test_util.c) 78 | set_target_properties(libdeflate_test_utils PROPERTIES 79 | OUTPUT_NAME deflate_test_utils) 80 | target_link_libraries(libdeflate_test_utils PUBLIC 81 | libdeflate_prog_utils ZLIB::ZLIB) 82 | 83 | # Build the benchmark and checksum programs. 84 | add_executable(benchmark benchmark.c) 85 | target_link_libraries(benchmark PRIVATE libdeflate_test_utils) 86 | add_executable(checksum checksum.c) 87 | target_link_libraries(checksum PRIVATE libdeflate_test_utils) 88 | 89 | # Build the unit test programs and register them with CTest. 
90 | set(UNIT_TEST_PROGS 91 | test_checksums 92 | test_custom_malloc 93 | test_incomplete_codes 94 | test_invalid_streams 95 | test_litrunlen_overflow 96 | test_overread 97 | test_slow_decompression 98 | test_trailing_bytes 99 | ) 100 | foreach(PROG ${UNIT_TEST_PROGS}) 101 | add_executable(${PROG} ${PROG}.c) 102 | target_link_libraries(${PROG} PRIVATE libdeflate_test_utils) 103 | add_test(NAME ${PROG} COMMAND ${PROG}) 104 | endforeach() 105 | endif() 106 | -------------------------------------------------------------------------------- /programs/checksum.c: -------------------------------------------------------------------------------- 1 | /* 2 | * checksum.c - Adler-32 and CRC-32 checksumming program 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 
26 | */ 27 | 28 | #include "test_util.h" 29 | 30 | static const tchar *const optstring = T("Ahm:s:tZ"); 31 | 32 | static void 33 | show_usage(FILE *fp) 34 | { 35 | fprintf(fp, 36 | "Usage: %"TS" [-A] [-h] [-m ALIGN] [-s SIZE] [-t] [-Z] [FILE]...\n" 37 | "Calculate Adler-32 or CRC-32 checksums of the specified FILEs.\n" 38 | "\n" 39 | "Options:\n" 40 | " -A use Adler-32 (default is CRC-32)\n" 41 | " -h print this help\n" 42 | " -m ALIGN misalign the buffer by ALIGN bytes\n" 43 | " -s SIZE chunk size in bytes\n" 44 | " -t show checksum speed, excluding I/O\n" 45 | " -Z use zlib implementation instead of libdeflate\n", 46 | prog_invocation_name); 47 | } 48 | 49 | typedef u32 (*cksum_fn_t)(u32, const void *, size_t); 50 | 51 | static u32 52 | adler32_libdeflate(u32 adler, const void *buf, size_t len) 53 | { 54 | return libdeflate_adler32(adler, buf, len); 55 | } 56 | 57 | static u32 58 | crc32_libdeflate(u32 crc, const void *buf, size_t len) 59 | { 60 | return libdeflate_crc32(crc, buf, len); 61 | } 62 | 63 | static u32 64 | adler32_zlib(u32 adler, const void *buf, size_t len) 65 | { 66 | return adler32(adler, buf, len); 67 | } 68 | 69 | static u32 70 | crc32_zlib(u32 crc, const void *buf, size_t len) 71 | { 72 | return crc32(crc, buf, len); 73 | } 74 | 75 | static int 76 | checksum_stream(struct file_stream *in, cksum_fn_t cksum, u32 *sum, 77 | void *buf, size_t bufsize, u64 *size_ret, u64 *elapsed_ret) 78 | { 79 | u64 size = 0; 80 | u64 elapsed = 0; 81 | 82 | for (;;) { 83 | ssize_t ret; 84 | u64 start_time; 85 | 86 | ret = xread(in, buf, bufsize); 87 | if (ret < 0) 88 | return ret; 89 | if (ret == 0) 90 | break; 91 | 92 | size += ret; 93 | start_time = timer_ticks(); 94 | *sum = cksum(*sum, buf, ret); 95 | elapsed += timer_ticks() - start_time; 96 | } 97 | 98 | if (elapsed == 0) 99 | elapsed = 1; 100 | *size_ret = size; 101 | *elapsed_ret = elapsed; 102 | return 0; 103 | } 104 | 105 | int 106 | tmain(int argc, tchar *argv[]) 107 | { 108 | bool use_adler32 = false; 
109 | bool use_zlib_impl = false; 110 | bool do_timing = false; 111 | void *orig_buf = NULL; 112 | void *buf; 113 | size_t misalignment = 0; 114 | size_t bufsize = 131072; 115 | tchar *default_file_list[] = { NULL }; 116 | cksum_fn_t cksum; 117 | int opt_char; 118 | int i; 119 | int ret; 120 | 121 | begin_program(argv); 122 | 123 | while ((opt_char = tgetopt(argc, argv, optstring)) != -1) { 124 | switch (opt_char) { 125 | case 'A': 126 | use_adler32 = true; 127 | break; 128 | case 'h': 129 | show_usage(stdout); 130 | return 0; 131 | case 'm': 132 | misalignment = tstrtoul(toptarg, NULL, 10); 133 | if (misalignment >= 4096) { 134 | msg("invalid misalignment: \"%"TS"\"", toptarg); 135 | return 1; 136 | } 137 | break; 138 | case 's': 139 | bufsize = tstrtoul(toptarg, NULL, 10); 140 | if (bufsize == 0 || bufsize > SIZE_MAX / 2) { 141 | msg("invalid chunk size: \"%"TS"\"", toptarg); 142 | return 1; 143 | } 144 | break; 145 | case 't': 146 | do_timing = true; 147 | break; 148 | case 'Z': 149 | use_zlib_impl = true; 150 | break; 151 | default: 152 | show_usage(stderr); 153 | return 1; 154 | } 155 | } 156 | 157 | argc -= toptind; 158 | argv += toptind; 159 | 160 | if (use_adler32) { 161 | if (use_zlib_impl) 162 | cksum = adler32_zlib; 163 | else 164 | cksum = adler32_libdeflate; 165 | } else { 166 | if (use_zlib_impl) 167 | cksum = crc32_zlib; 168 | else 169 | cksum = crc32_libdeflate; 170 | } 171 | 172 | orig_buf = xmalloc(bufsize + 4096 + misalignment); 173 | if (orig_buf == NULL) 174 | return 1; 175 | buf = (u8 *)orig_buf + (-(uintptr_t)orig_buf % 4096) + misalignment; 176 | 177 | if (argc == 0) { 178 | argv = default_file_list; 179 | argc = ARRAY_LEN(default_file_list); 180 | } else { 181 | for (i = 0; i < argc; i++) 182 | if (argv[i][0] == '-' && argv[i][1] == '\0') 183 | argv[i] = NULL; 184 | } 185 | 186 | for (i = 0; i < argc; i++) { 187 | struct file_stream in; 188 | u32 sum = cksum(0, NULL, 0); 189 | u64 size = 0; 190 | u64 elapsed = 0; 191 | 192 | ret = 
xopen_for_read(argv[i], true, &in); 193 | if (ret != 0) 194 | goto out; 195 | 196 | ret = checksum_stream(&in, cksum, &sum, buf, bufsize, 197 | &size, &elapsed); 198 | if (ret == 0) { 199 | if (do_timing) { 200 | printf("%08"PRIx32"\t%"TS"\t" 201 | "%"PRIu64" ms\t%"PRIu64" MB/s\n", 202 | sum, in.name, timer_ticks_to_ms(elapsed), 203 | timer_MB_per_s(size, elapsed)); 204 | } else { 205 | printf("%08"PRIx32"\t%"TS"\t\n", sum, in.name); 206 | } 207 | } 208 | 209 | xclose(&in); 210 | 211 | if (ret != 0) 212 | goto out; 213 | } 214 | ret = 0; 215 | out: 216 | free(orig_buf); 217 | return -ret; 218 | } 219 | -------------------------------------------------------------------------------- /programs/config.h.in: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | /* Is the clock_gettime() function available? */ 5 | #cmakedefine HAVE_CLOCK_GETTIME 6 | 7 | /* Is the futimens() function available? */ 8 | #cmakedefine HAVE_FUTIMENS 9 | 10 | /* Is the posix_fadvise() function available? */ 11 | #cmakedefine HAVE_POSIX_FADVISE 12 | 13 | /* Is the posix_madvise() function available? */ 14 | #cmakedefine HAVE_POSIX_MADVISE 15 | 16 | /* Does stat() provide nanosecond-precision timestamps? 
*/ 17 | #cmakedefine HAVE_STAT_NANOSECOND_PRECISION 18 | 19 | #endif /* CONFIG_H */ 20 | -------------------------------------------------------------------------------- /programs/prog_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * prog_util.h - common header for the programs; must be included first 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef PROGRAMS_PROG_UTIL_H 29 | #define PROGRAMS_PROG_UTIL_H 30 | 31 | /* 32 | * This header provides some utility functions and macros for the programs. It 33 | * also defines some macros that control the behavior of system headers, and for 34 | * that reason it must be included before any system header. 35 | * 36 | * The latter part could be handled in this directory's CMakeLists.txt instead. 
37 | * We put as much as possible here, directly in the source, to make it easier to 38 | * build the programs using other build systems (or "no build system"). 39 | * 40 | * Note: CMakeLists.txt does do some dynamic feature detection, which can't be 41 | * done in the source code. For that reason, it duplicates some of the logic 42 | * that defines macros like _GNU_SOURCE. Keep this logic in sync. 43 | */ 44 | 45 | #ifdef _WIN32 46 | 47 | /* 48 | * To keep the code similar on all platforms, sometimes we intentionally use 49 | * the "deprecated" non-underscore-prefixed variants of functions in msvcrt. 50 | */ 51 | # undef _CRT_NONSTDC_NO_DEPRECATE 52 | # define _CRT_NONSTDC_NO_DEPRECATE 1 53 | 54 | /* 55 | * Similarly, to match other platforms we intentionally use the "non-secure" 56 | * variants, which aren't actually any less secure when used properly. 57 | */ 58 | # undef _CRT_SECURE_NO_WARNINGS 59 | # define _CRT_SECURE_NO_WARNINGS 1 60 | 61 | #else 62 | 63 | /* Needed to work with files >= 2 GiB on 32-bit systems */ 64 | # undef _FILE_OFFSET_BITS 65 | # define _FILE_OFFSET_BITS 64 66 | 67 | /* Note: when making changes here, update programs/CMakeLists.txt too. */ 68 | # if defined(__linux__) 69 | /* 70 | * May be needed for clock_gettime(), posix_fadvise(), posix_madvise(), 71 | * futimens(), and MAP_ANONYMOUS, depending on the C library version. 72 | */ 73 | # undef _GNU_SOURCE 74 | # define _GNU_SOURCE 75 | # undef _POSIX_C_SOURCE 76 | # define _POSIX_C_SOURCE 200809L 77 | # elif defined(__APPLE__) 78 | /* Needed for O_NOFOLLOW and MAP_ANON */ 79 | # undef _DARWIN_C_SOURCE 80 | # define _DARWIN_C_SOURCE 81 | # undef _POSIX_C_SOURCE 82 | # elif defined(__sun) 83 | /* Needed for futimens() */ 84 | # undef __EXTENSIONS__ 85 | # define __EXTENSIONS__ 86 | # undef _POSIX_C_SOURCE 87 | # else 88 | /* 89 | * Else assume that nothing else is needed. Don't use _POSIX_C_SOURCE on 90 | * BSD, since it causes anything non-POSIX, such as MAP_ANON, to be hidden. 
91 | */ 92 | # undef _POSIX_C_SOURCE 93 | # endif 94 | #endif 95 | 96 | #ifdef HAVE_CONFIG_H 97 | # include "config.h" 98 | #endif 99 | 100 | #include "../common_defs.h" 101 | 102 | #include 103 | #include 104 | #include 105 | #include 106 | #include 107 | #ifndef _WIN32 108 | # include 109 | #endif 110 | 111 | #if defined(__GNUC__) || __has_attribute(format) 112 | # define _printf(str_idx, args_idx) \ 113 | __attribute__((format(printf, str_idx, args_idx))) 114 | #else 115 | # define _printf(str_idx, args_idx) 116 | #endif 117 | 118 | #ifdef _WIN32 119 | 120 | /* 121 | * Definitions for Windows builds. Mainly, 'tchar' is defined to be the 2-byte 122 | * 'wchar_t' type instead of 'char'. This is the only "easy" way I know of to 123 | * get full Unicode support on Windows... 124 | */ 125 | 126 | #include 127 | #include 128 | int wmain(int argc, wchar_t **argv); 129 | # define tmain wmain 130 | # define tchar wchar_t 131 | # define _T(text) L##text 132 | # define T(text) _T(text) 133 | # define TS "ls" 134 | # define TC "lc" 135 | # define tmemcpy wmemcpy 136 | # define topen _wopen 137 | # define tstrchr wcschr 138 | # define tstrcmp wcscmp 139 | # define tstrlen wcslen 140 | # define tstrrchr wcsrchr 141 | # define tstrtoul wcstoul 142 | # define tstrxcmp wcsicmp 143 | # define tunlink _wunlink 144 | # define tutimbuf __utimbuf64 145 | # define tutime _wutime64 146 | # define tstat _wstat64 147 | # define tfstat _fstat64 148 | # define stat_t struct _stat64 149 | # ifdef _MSC_VER 150 | # define STDIN_FILENO 0 151 | # define STDOUT_FILENO 1 152 | # define STDERR_FILENO 2 153 | # define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) 154 | # define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) 155 | # endif 156 | 157 | #else /* _WIN32 */ 158 | 159 | /* Standard definitions for everyone else */ 160 | 161 | # define tmain main 162 | # define tchar char 163 | # define T(text) text 164 | # define TS "s" 165 | # define TC "c" 166 | # define tmemcpy memcpy 167 | # define topen open 168 | 
# define tstrchr strchr 169 | # define tstrcmp strcmp 170 | # define tstrlen strlen 171 | # define tstrrchr strrchr 172 | # define tstrtoul strtoul 173 | # define tstrxcmp strcmp 174 | # define tunlink unlink 175 | # define tutimbuf utimbuf 176 | # define tutime utime 177 | # define tstat stat 178 | # define tfstat fstat 179 | # define stat_t struct stat 180 | 181 | #endif /* !_WIN32 */ 182 | 183 | extern const tchar *prog_invocation_name; 184 | extern bool suppress_warnings; 185 | 186 | void _printf(1, 2) msg(const char *fmt, ...); 187 | void _printf(1, 2) msg_errno(const char *fmt, ...); 188 | void _printf(1, 2) warn(const char *fmt, ...); 189 | 190 | void *xmalloc(size_t size); 191 | 192 | void begin_program(tchar *argv[]); 193 | 194 | struct file_stream { 195 | int fd; 196 | tchar *name; 197 | bool is_standard_stream; 198 | void *mmap_token; 199 | void *mmap_mem; 200 | size_t mmap_size; 201 | }; 202 | 203 | int xopen_for_read(const tchar *path, bool symlink_ok, 204 | struct file_stream *strm); 205 | int xopen_for_write(const tchar *path, bool force, struct file_stream *strm); 206 | int map_file_contents(struct file_stream *strm, u64 size); 207 | 208 | ssize_t xread(struct file_stream *strm, void *buf, size_t count); 209 | int full_write(struct file_stream *strm, const void *buf, size_t count); 210 | 211 | int xclose(struct file_stream *strm); 212 | 213 | int parse_compression_level(tchar opt_char, const tchar *arg); 214 | 215 | struct libdeflate_compressor *alloc_compressor(int level); 216 | struct libdeflate_decompressor *alloc_decompressor(void); 217 | 218 | /* tgetopt.c */ 219 | 220 | extern tchar *toptarg; 221 | extern int toptind, topterr, toptopt; 222 | 223 | int tgetopt(int argc, tchar *argv[], const tchar *optstring); 224 | 225 | #endif /* PROGRAMS_PROG_UTIL_H */ 226 | -------------------------------------------------------------------------------- /programs/test_checksums.c: 
-------------------------------------------------------------------------------- 1 | /* 2 | * test_checksums.c 3 | * 4 | * Verify that libdeflate's Adler-32 and CRC-32 functions produce the same 5 | * results as their zlib equivalents. 6 | */ 7 | 8 | #include "test_util.h" 9 | 10 | #include <stdlib.h> 11 | #include <time.h> 12 | 13 | static unsigned int rng_seed; 14 | 15 | typedef u32 (*cksum_fn_t)(u32, const void *, size_t); 16 | 17 | static u32 18 | adler32_libdeflate(u32 adler, const void *buf, size_t len) 19 | { 20 | return libdeflate_adler32(adler, buf, len); 21 | } 22 | 23 | static u32 24 | crc32_libdeflate(u32 crc, const void *buf, size_t len) 25 | { 26 | return libdeflate_crc32(crc, buf, len); 27 | } 28 | 29 | static u32 30 | adler32_zlib(u32 adler, const void *buf, size_t len) 31 | { 32 | return adler32(adler, buf, len); 33 | } 34 | 35 | static u32 36 | crc32_zlib(u32 crc, const void *buf, size_t len) 37 | { 38 | return crc32(crc, buf, len); 39 | } 40 | 41 | static u32 42 | select_initial_crc(void) 43 | { 44 | if (rand() & 1) 45 | return 0; 46 | return ((u32)rand() << 16) | rand(); 47 | } 48 | 49 | static u32 50 | select_initial_adler(void) 51 | { 52 | u32 lo, hi; 53 | 54 | if (rand() & 1) 55 | return 1; 56 | 57 | lo = (rand() % 4 == 0 ? 65520 : rand() % 65521); 58 | hi = (rand() % 4 == 0 ? 
65520 : rand() % 65521); 59 | return (hi << 16) | lo; 60 | } 61 | 62 | static void 63 | test_initial_values(cksum_fn_t cksum, u32 expected) 64 | { 65 | ASSERT(cksum(0, NULL, 0) == expected); 66 | if (cksum != adler32_zlib) /* broken */ 67 | ASSERT(cksum(0, NULL, 1) == expected); 68 | ASSERT(cksum(0, NULL, 1234) == expected); 69 | ASSERT(cksum(1234, NULL, 0) == expected); 70 | ASSERT(cksum(1234, NULL, 1234) == expected); 71 | } 72 | 73 | static void 74 | test_multipart(const u8 *buffer, size_t size, const char *name, 75 | cksum_fn_t cksum, u32 v, u32 expected) 76 | { 77 | size_t division = rand() % (size + 1); 78 | v = cksum(v, buffer, division); 79 | v = cksum(v, buffer + division, size - division); 80 | if (v != expected) { 81 | fprintf(stderr, "%s checksum failed multipart test\n", name); 82 | ASSERT(0); 83 | } 84 | } 85 | 86 | static void 87 | test_checksums(const void *buffer, size_t size, const char *name, 88 | cksum_fn_t cksum1, cksum_fn_t cksum2, u32 initial_value) 89 | { 90 | u32 v1 = cksum1(initial_value, buffer, size); 91 | u32 v2 = cksum2(initial_value, buffer, size); 92 | 93 | if (v1 != v2) { 94 | fprintf(stderr, "%s checksum mismatch\n", name); 95 | fprintf(stderr, "initial_value=0x%08"PRIx32", buffer=%p, " 96 | "size=%zu, buffer=", initial_value, buffer, size); 97 | for (size_t i = 0; i < MIN(size, 256); i++) 98 | fprintf(stderr, "%02x", ((const u8 *)buffer)[i]); 99 | if (size > 256) 100 | fprintf(stderr, "..."); 101 | fprintf(stderr, "\n"); 102 | ASSERT(0); 103 | } 104 | 105 | if ((rand() & 15) == 0) { 106 | test_multipart(buffer, size, name, cksum1, initial_value, v1); 107 | test_multipart(buffer, size, name, cksum2, initial_value, v1); 108 | } 109 | } 110 | 111 | static void 112 | test_crc32(const void *buffer, size_t size, u32 initial_value) 113 | { 114 | test_checksums(buffer, size, "CRC-32", 115 | crc32_libdeflate, crc32_zlib, initial_value); 116 | } 117 | 118 | static void 119 | test_adler32(const void *buffer, size_t size, u32 initial_value) 
120 | { 121 | test_checksums(buffer, size, "Adler-32", 122 | adler32_libdeflate, adler32_zlib, initial_value); 123 | } 124 | 125 | static void test_random_buffers(u8 *buf_start, u8 *buf_end, size_t limit, 126 | u32 num_iter) 127 | { 128 | for (u32 i = 0; i < num_iter; i++) { 129 | size_t start = rand() % limit; 130 | size_t len = rand() % (limit - start); 131 | u32 a0 = select_initial_adler(); 132 | u32 c0 = select_initial_crc(); 133 | 134 | for (size_t j = start; j < start + len; j++) 135 | buf_start[j] = rand(); 136 | 137 | /* Test with chosen size and alignment */ 138 | test_adler32(&buf_start[start], len, a0); 139 | test_crc32(&buf_start[start], len, c0); 140 | 141 | /* Test with chosen size, with guard page before input buffer */ 142 | memmove(buf_start, &buf_start[start], len); 143 | test_adler32(buf_start, len, a0); 144 | test_crc32(buf_start, len, c0); 145 | 146 | /* Test with chosen size, with guard page after input buffer */ 147 | memmove(buf_end - len, buf_start, len); 148 | test_adler32(buf_end - len, len, a0); 149 | test_crc32(buf_end - len, len, c0); 150 | } 151 | } 152 | 153 | int 154 | tmain(int argc, tchar *argv[]) 155 | { 156 | u8 *buf_start, *buf_end; 157 | 158 | begin_program(argv); 159 | 160 | alloc_guarded_buffer(262144, &buf_start, &buf_end); 161 | 162 | rng_seed = time(NULL); 163 | srand(rng_seed); 164 | 165 | test_initial_values(adler32_libdeflate, 1); 166 | test_initial_values(adler32_zlib, 1); 167 | test_initial_values(crc32_libdeflate, 0); 168 | test_initial_values(crc32_zlib, 0); 169 | 170 | /* Test different buffer sizes and alignments */ 171 | test_random_buffers(buf_start, buf_end, 256, 5000); 172 | test_random_buffers(buf_start, buf_end, 1024, 500); 173 | test_random_buffers(buf_start, buf_end, 32768, 50); 174 | test_random_buffers(buf_start, buf_end, 262144, 50); 175 | 176 | /* 177 | * Test Adler-32 overflow cases. 
For example, given all 0xFF bytes and 178 | * the highest possible initial (s1, s2) of (65520, 65520), then s2 if 179 | * stored as a 32-bit unsigned integer will overflow if > 5552 bytes are 180 | * processed. Implementations must make sure to reduce s2 modulo 65521 181 | * before that point. Also, some implementations make use of 16-bit 182 | * counters which can overflow earlier. 183 | */ 184 | memset(buf_start, 0xFF, 32768); 185 | for (u32 i = 0; i < 20; i++) { 186 | u32 initial_value; 187 | 188 | if (i == 0) 189 | initial_value = ((u32)65520 << 16) | 65520; 190 | else 191 | initial_value = select_initial_adler(); 192 | 193 | test_adler32(buf_start, 5553, initial_value); 194 | test_adler32(buf_start, rand() % 32769, initial_value); 195 | buf_start[rand() % 32768] = 0xFE; 196 | } 197 | 198 | free_guarded_buffer(buf_start, buf_end); 199 | return 0; 200 | } 201 | -------------------------------------------------------------------------------- /programs/test_custom_malloc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * test_custom_malloc.c 3 | * 4 | * Test the support for custom memory allocators. 5 | * Also test injecting allocation failures. 6 | */ 7 | 8 | #include "test_util.h" 9 | 10 | static int malloc_count = 0; 11 | static int free_count = 0; 12 | 13 | static void *do_malloc(size_t size) 14 | { 15 | malloc_count++; 16 | return malloc(size); 17 | } 18 | 19 | static void *do_fail_malloc(size_t size) 20 | { 21 | malloc_count++; 22 | return NULL; 23 | } 24 | 25 | static void do_free(void *ptr) 26 | { 27 | free_count++; 28 | free(ptr); 29 | } 30 | 31 | static void reset_state(void) 32 | { 33 | libdeflate_set_memory_allocator(malloc, free); 34 | malloc_count = 0; 35 | free_count = 0; 36 | } 37 | 38 | /* Test that the custom allocator is actually used when requested. 
*/ 39 | static void do_custom_memalloc_test(bool global) 40 | { 41 | static const struct libdeflate_options options = { 42 | .sizeof_options = sizeof(options), 43 | .malloc_func = do_malloc, 44 | .free_func = do_free, 45 | }; 46 | int level; 47 | struct libdeflate_compressor *c; 48 | struct libdeflate_decompressor *d; 49 | 50 | if (global) 51 | libdeflate_set_memory_allocator(do_malloc, do_free); 52 | 53 | for (level = 0; level <= 12; level++) { 54 | malloc_count = free_count = 0; 55 | if (global) 56 | c = libdeflate_alloc_compressor(level); 57 | else 58 | c = libdeflate_alloc_compressor_ex(level, &options); 59 | ASSERT(c != NULL); 60 | ASSERT(malloc_count == 1); 61 | ASSERT(free_count == 0); 62 | libdeflate_free_compressor(c); 63 | ASSERT(malloc_count == 1); 64 | ASSERT(free_count == 1); 65 | } 66 | 67 | malloc_count = free_count = 0; 68 | if (global) 69 | d = libdeflate_alloc_decompressor(); 70 | else 71 | d = libdeflate_alloc_decompressor_ex(&options); 72 | ASSERT(d != NULL); 73 | ASSERT(malloc_count == 1); 74 | ASSERT(free_count == 0); 75 | libdeflate_free_decompressor(d); 76 | ASSERT(malloc_count == 1); 77 | ASSERT(free_count == 1); 78 | 79 | reset_state(); 80 | } 81 | 82 | #define offsetofend(type, field) \ 83 | (offsetof(type, field) + sizeof(((type *)NULL)->field)) 84 | 85 | /* Test some edge cases involving libdeflate_options. */ 86 | static void do_options_test(void) 87 | { 88 | struct libdeflate_options options = { 0 }; 89 | struct libdeflate_compressor *c; 90 | struct libdeflate_decompressor *d; 91 | /* Size in libdeflate v1.19 */ 92 | size_t min_size = offsetofend(struct libdeflate_options, free_func); 93 | 94 | /* sizeof_options must be at least the minimum size. 
*/ 95 | for (; options.sizeof_options < min_size; 96 | options.sizeof_options++) { 97 | c = libdeflate_alloc_compressor_ex(6, &options); 98 | ASSERT(c == NULL); 99 | d = libdeflate_alloc_decompressor_ex(&options); 100 | ASSERT(d == NULL); 101 | } 102 | 103 | /* NULL malloc_func and free_func means "use the global allocator". */ 104 | options.sizeof_options = min_size; 105 | malloc_count = free_count = 0; 106 | libdeflate_set_memory_allocator(do_malloc, do_free); 107 | c = libdeflate_alloc_compressor_ex(6, &options); 108 | libdeflate_free_compressor(c); 109 | ASSERT(malloc_count == 1); 110 | ASSERT(free_count == 1); 111 | d = libdeflate_alloc_decompressor_ex(&options); 112 | libdeflate_free_decompressor(d); 113 | ASSERT(malloc_count == 2); 114 | ASSERT(free_count == 2); 115 | 116 | reset_state(); 117 | } 118 | 119 | /* Test injecting memory allocation failures. */ 120 | static void do_fault_injection_test(void) 121 | { 122 | int level; 123 | struct libdeflate_compressor *c; 124 | struct libdeflate_decompressor *d; 125 | 126 | libdeflate_set_memory_allocator(do_fail_malloc, do_free); 127 | 128 | for (level = 0; level <= 12; level++) { 129 | malloc_count = free_count = 0; 130 | c = libdeflate_alloc_compressor(level); 131 | ASSERT(c == NULL); 132 | ASSERT(malloc_count == 1); 133 | ASSERT(free_count == 0); 134 | } 135 | 136 | malloc_count = free_count = 0; 137 | d = libdeflate_alloc_decompressor(); 138 | ASSERT(d == NULL); 139 | ASSERT(malloc_count == 1); 140 | ASSERT(free_count == 0); 141 | 142 | reset_state(); 143 | } 144 | 145 | int 146 | tmain(int argc, tchar *argv[]) 147 | { 148 | begin_program(argv); 149 | 150 | do_custom_memalloc_test(true); 151 | do_custom_memalloc_test(false); 152 | do_options_test(); 153 | do_fault_injection_test(); 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /programs/test_invalid_streams.c: -------------------------------------------------------------------------------- 1 | 
/* 2 | * test_invalid_streams.c 3 | * 4 | * Test that invalid DEFLATE streams are rejected with LIBDEFLATE_BAD_DATA. 5 | * 6 | * This isn't actually very important, since DEFLATE doesn't have built-in error 7 | * detection, so corruption of a DEFLATE stream can only be reliably detected 8 | * using a separate checksum anyway. As long as the DEFLATE decompressor 9 | * handles all streams safely (no crashes, etc.), in practice it is fine for it 10 | * to automatically remap invalid streams to valid streams, instead of returning 11 | * an error. Corruption detection is the responsibility of the zlib or gzip 12 | * layer, or the application when an external checksum is used. 13 | * 14 | * Nevertheless, to reduce surprises when people intentionally compare zlib's 15 | * and libdeflate's handling of invalid DEFLATE streams, libdeflate implements 16 | * zlib's strict behavior when decoding DEFLATE, except when it would have a 17 | * significant performance cost. 18 | */ 19 | 20 | #include "test_util.h" 21 | 22 | static void 23 | assert_decompression_error(const u8 *in, size_t in_nbytes) 24 | { 25 | struct libdeflate_decompressor *d; 26 | z_stream z; 27 | u8 out[128]; 28 | const size_t out_nbytes_avail = sizeof(out); 29 | size_t actual_out_nbytes; 30 | enum libdeflate_result res; 31 | 32 | /* libdeflate */ 33 | d = libdeflate_alloc_decompressor(); 34 | ASSERT(d != NULL); 35 | res = libdeflate_deflate_decompress(d, in, in_nbytes, 36 | out, out_nbytes_avail, 37 | &actual_out_nbytes); 38 | ASSERT(res == LIBDEFLATE_BAD_DATA); 39 | libdeflate_free_decompressor(d); 40 | 41 | /* zlib, as a control */ 42 | memset(&z, 0, sizeof(z)); 43 | res = inflateInit2(&z, -15); 44 | ASSERT(res == Z_OK); 45 | z.next_in = (void *)in; 46 | z.avail_in = in_nbytes; 47 | z.next_out = (void *)out; 48 | z.avail_out = out_nbytes_avail; 49 | res = inflate(&z, Z_FINISH); 50 | ASSERT(res == Z_DATA_ERROR); 51 | inflateEnd(&z); 52 | } 53 | 54 | /* 55 | * Test that DEFLATE decompression returns an error if a 
block header contains 56 | * too many encoded litlen and offset codeword lengths. 57 | */ 58 | static void 59 | test_too_many_codeword_lengths(void) 60 | { 61 | u8 in[128]; 62 | struct output_bitstream os = { .next = in, .end = in + sizeof(in) }; 63 | int i; 64 | 65 | ASSERT(put_bits(&os, 1, 1)); /* BFINAL: 1 */ 66 | ASSERT(put_bits(&os, 2, 2)); /* BTYPE: DYNAMIC_HUFFMAN */ 67 | 68 | /* 69 | * Litlen code: 70 | * litlensym_255 len=1 codeword=0 71 | * litlensym_256 (end-of-block) len=1 codeword=1 72 | * Offset code: 73 | * (empty) 74 | * 75 | * Litlen and offset codeword lengths: 76 | * [0..254] = 0 presym_{18,18} 77 | * [255] = 1 presym_1 78 | * [256] = 1 presym_1 79 | * [257...] = 0 presym_18 [TOO MANY] 80 | * 81 | * Precode: 82 | * presym_1 len=1 codeword=0 83 | * presym_18 len=1 codeword=1 84 | */ 85 | 86 | ASSERT(put_bits(&os, 0, 5)); /* num_litlen_syms: 0 + 257 */ 87 | ASSERT(put_bits(&os, 0, 5)); /* num_offset_syms: 0 + 1 */ 88 | ASSERT(put_bits(&os, 14, 4)); /* num_explicit_precode_lens: 14 + 4 */ 89 | 90 | /* 91 | * Precode codeword lengths: order is 92 | * [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15] 93 | */ 94 | for (i = 0; i < 2; i++) /* presym_{16,17}: len=0 */ 95 | ASSERT(put_bits(&os, 0, 3)); 96 | ASSERT(put_bits(&os, 1, 3)); /* presym_18: len=1 */ 97 | ASSERT(put_bits(&os, 0, 3)); /* presym_0: len=0 */ 98 | for (i = 0; i < 13; i++) /* presym_{8,...,14}: len=0 */ 99 | ASSERT(put_bits(&os, 0, 3)); 100 | ASSERT(put_bits(&os, 1, 3)); /* presym_1: len=1 */ 101 | 102 | /* Litlen and offset codeword lengths */ 103 | ASSERT(put_bits(&os, 0x1, 1) && /* presym_18, 128 zeroes */ 104 | put_bits(&os, 117, 7)); 105 | ASSERT(put_bits(&os, 0x1, 1) && /* presym_18, 127 zeroes */ 106 | put_bits(&os, 116, 7)); 107 | ASSERT(put_bits(&os, 0x0, 1)); /* presym_1 */ 108 | ASSERT(put_bits(&os, 0x0, 1)); /* presym_1 */ 109 | ASSERT(put_bits(&os, 0x1, 1) && /* presym_18, 128 zeroes [TOO MANY] */ 110 | put_bits(&os, 117, 7)); 111 | 112 | /* Literal */ 113 
| ASSERT(put_bits(&os, 0x0, 1)); /* litlensym_255: 1-bit codeword 0, per the litlen code above */ 114 | 115 | /* End of block */ 116 | ASSERT(put_bits(&os, 0x1, 1)); /* litlensym_256 */ 117 | 118 | ASSERT(flush_bits(&os)); 119 | 120 | assert_decompression_error(in, os.next - in); 121 | } 122 | 123 | int 124 | tmain(int argc, tchar *argv[]) 125 | { 126 | begin_program(argv); 127 | 128 | test_too_many_codeword_lengths(); 129 | return 0; 130 | } 131 | -------------------------------------------------------------------------------- /programs/test_litrunlen_overflow.c: -------------------------------------------------------------------------------- 1 | /* 2 | * test_litrunlen_overflow.c 3 | * 4 | * Regression test for commit f2f0df727444 ("deflate_compress: fix corruption 5 | * with long literal run"). Try to compress a file longer than 65535 bytes 6 | * where no 2-byte sequence (3 would be sufficient) is repeated <= 32768 bytes 7 | * apart, and the distribution of bytes remains constant throughout, and yet not 8 | * all bytes are used so the data is still slightly compressible. There will be 9 | * no matches in this data, but the compressor should still output a compressed 10 | * block, and this block should contain more than 65535 consecutive literals, 11 | * which triggered the bug. 12 | * 13 | * Note: on random data, this situation is extremely unlikely if the compressor 14 | * uses all matches it finds, since random data will on average have a 3-byte 15 | * match every (256**3)/32768 = 512 bytes. 
16 | */ 17 | 18 | #include "test_util.h" 19 | 20 | int 21 | tmain(int argc, tchar *argv[]) 22 | { 23 | const int data_size = 2 * 250 * 251; 24 | u8 *orig_data, *compressed_data, *decompressed_data; 25 | int i, stride, multiple, j = 0; 26 | struct libdeflate_decompressor *d; 27 | static const int levels[] = { 3, 6, 12 }; 28 | 29 | begin_program(argv); 30 | 31 | orig_data = xmalloc(data_size); 32 | compressed_data = xmalloc(data_size); 33 | decompressed_data = xmalloc(data_size); 34 | 35 | for (i = 0; i < 2; i++) { 36 | for (stride = 1; stride < 251; stride++) { 37 | for (multiple = 0; multiple < 251; multiple++) 38 | orig_data[j++] = (stride * multiple) % 251; 39 | } 40 | } 41 | ASSERT(j == data_size); 42 | 43 | d = libdeflate_alloc_decompressor(); 44 | ASSERT(d != NULL); 45 | 46 | for (i = 0; i < ARRAY_LEN(levels); i++) { 47 | struct libdeflate_compressor *c; 48 | size_t csize; 49 | enum libdeflate_result res; 50 | 51 | c = libdeflate_alloc_compressor(levels[i]); 52 | ASSERT(c != NULL); 53 | 54 | csize = libdeflate_deflate_compress(c, orig_data, data_size, 55 | compressed_data, data_size); 56 | ASSERT(csize > 0 && csize < data_size); 57 | 58 | res = libdeflate_deflate_decompress(d, compressed_data, csize, 59 | decompressed_data, 60 | data_size, NULL); 61 | ASSERT(res == LIBDEFLATE_SUCCESS); 62 | ASSERT(memcmp(orig_data, decompressed_data, data_size) == 0); 63 | 64 | libdeflate_free_compressor(c); 65 | } 66 | 67 | libdeflate_free_decompressor(d); 68 | free(orig_data); 69 | free(compressed_data); 70 | free(decompressed_data); 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /programs/test_overread.c: -------------------------------------------------------------------------------- 1 | /* 2 | * test_overread.c 3 | * 4 | * Test that the decompressor doesn't produce an unbounded amount of output if 5 | * it runs out of input, even when implicit zeroes appended to the input would 6 | * continue producing output 
(as is the case when the input ends during a 7 | * DYNAMIC_HUFFMAN block where a literal has an all-zeroes codeword). 8 | * 9 | * This is a regression test for commit 3f21ec9d6121 ("deflate_decompress: error 10 | * out if overread count gets too large"). 11 | */ 12 | 13 | #include "test_util.h" 14 | 15 | static void 16 | generate_test_input(struct output_bitstream *os) 17 | { 18 | int i; 19 | 20 | put_bits(os, 0, 1); /* BFINAL: 0 */ 21 | put_bits(os, 2, 2); /* BTYPE: DYNAMIC_HUFFMAN */ 22 | 23 | /* 24 | * Write the Huffman codes. 25 | * 26 | * Litlen code: 27 | * litlensym_0 (0) len=1 codeword=0 28 | * litlensym_256 (end-of-block) len=1 codeword=1 29 | * Offset code: 30 | * offsetsym_0 (unused) len=1 codeword=0 31 | * 32 | * Litlen and offset codeword lengths: 33 | * [0] = 1 presym_1 34 | * [1..255] = 0 presym_{18,18} 35 | * [256] = 1 presym_1 36 | * [257] = 1 presym_1 37 | * 38 | * Precode: 39 | * presym_1 len=1 codeword=0 40 | * presym_18 len=1 codeword=1 41 | */ 42 | put_bits(os, 0, 5); /* num_litlen_syms: 0 + 257 */ 43 | put_bits(os, 0, 5); /* num_offset_syms: 0 + 1 */ 44 | put_bits(os, 14, 4); /* num_explicit_precode_lens: 14 + 4 */ 45 | /* 46 | * Precode codeword lengths: order is 47 | * [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15] 48 | */ 49 | put_bits(os, 0, 3); /* presym_16: len=0 */ 50 | put_bits(os, 0, 3); /* presym_17: len=0 */ 51 | put_bits(os, 1, 3); /* presym_18: len=1 */ 52 | for (i = 0; i < 14; i++) /* presym_{0,...,14}: len=0 */ 53 | put_bits(os, 0, 3); 54 | put_bits(os, 1, 3); /* presym_1: len=1 */ 55 | 56 | /* Litlen and offset codeword lengths */ 57 | put_bits(os, 0, 1); /* presym_1 */ 58 | put_bits(os, 1, 1); /* presym_18 ... */ 59 | put_bits(os, 117, 7); /* ... 11 + 117 zeroes */ 60 | put_bits(os, 1, 1); /* presym_18 ... */ 61 | put_bits(os, 116, 7); /* ... 
11 + 116 zeroes */ 62 | put_bits(os, 0, 1); /* presym_1 */ 63 | put_bits(os, 0, 1); /* presym_1 */ 64 | 65 | /* Implicit zeroes would generate endless literals from here. */ 66 | 67 | ASSERT(flush_bits(os)); 68 | } 69 | 70 | int 71 | tmain(int argc, tchar *argv[]) 72 | { 73 | u8 cdata[16]; 74 | u8 udata[256]; 75 | struct output_bitstream os = 76 | { .next = cdata, .end = cdata + sizeof(cdata) }; 77 | struct libdeflate_decompressor *d; 78 | enum libdeflate_result res; 79 | size_t actual_out_nbytes; 80 | 81 | begin_program(argv); 82 | 83 | generate_test_input(&os); 84 | d = libdeflate_alloc_decompressor(); 85 | ASSERT(d != NULL); 86 | 87 | res = libdeflate_deflate_decompress(d, cdata, os.next - cdata, 88 | udata, sizeof(udata), 89 | &actual_out_nbytes); 90 | /* Before the fix, the result was LIBDEFLATE_INSUFFICIENT_SPACE here. */ 91 | ASSERT(res == LIBDEFLATE_BAD_DATA); 92 | 93 | libdeflate_free_decompressor(d); 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /programs/test_trailing_bytes.c: -------------------------------------------------------------------------------- 1 | /* 2 | * test_trailing_bytes.c 3 | * 4 | * Test that decompression correctly stops at the end of the first DEFLATE, 5 | * zlib, or gzip stream, and doesn't process any additional trailing bytes. 
6 | */ 7 | 8 | #include "test_util.h" 9 | 10 | static const struct { 11 | size_t (*compress)(struct libdeflate_compressor *compressor, 12 | const void *in, size_t in_nbytes, 13 | void *out, size_t out_nbytes_avail); 14 | enum libdeflate_result (*decompress)( 15 | struct libdeflate_decompressor *decompressor, 16 | const void *in, size_t in_nbytes, 17 | void *out, size_t out_nbytes_avail, 18 | size_t *actual_out_nbytes_ret); 19 | enum libdeflate_result (*decompress_ex)( 20 | struct libdeflate_decompressor *decompressor, 21 | const void *in, size_t in_nbytes, 22 | void *out, size_t out_nbytes_avail, 23 | size_t *actual_in_nbytes_ret, 24 | size_t *actual_out_nbytes_ret); 25 | } codecs[] = { 26 | { 27 | .compress = libdeflate_deflate_compress, 28 | .decompress = libdeflate_deflate_decompress, 29 | .decompress_ex = libdeflate_deflate_decompress_ex, 30 | }, { 31 | .compress = libdeflate_zlib_compress, 32 | .decompress = libdeflate_zlib_decompress, 33 | .decompress_ex = libdeflate_zlib_decompress_ex, 34 | }, { 35 | .compress = libdeflate_gzip_compress, 36 | .decompress = libdeflate_gzip_decompress, 37 | .decompress_ex = libdeflate_gzip_decompress_ex, 38 | } 39 | }; 40 | 41 | int 42 | tmain(int argc, tchar *argv[]) 43 | { 44 | const size_t original_nbytes = 32768; 45 | const size_t compressed_nbytes_total = 32768; 46 | /* 47 | * Don't use the full buffer for compressed data, because we want to 48 | * test whether decompression can deal with additional trailing bytes. 49 | * 50 | * Note: we can't use a guarded buffer (i.e. a buffer where the byte 51 | * after compressed_nbytes is unmapped) because the decompressor may 52 | * read a few bytes beyond the end of the stream (but ultimately not 53 | * actually use those bytes) as long as they are within the buffer. 
54 | */ 55 | const size_t compressed_nbytes_avail = 30000; 56 | size_t i; 57 | u8 *original; 58 | u8 *compressed; 59 | u8 *decompressed; 60 | struct libdeflate_compressor *c; 61 | struct libdeflate_decompressor *d; 62 | size_t compressed_nbytes; 63 | enum libdeflate_result res; 64 | size_t actual_compressed_nbytes; 65 | size_t actual_decompressed_nbytes; 66 | 67 | begin_program(argv); 68 | 69 | ASSERT(compressed_nbytes_avail < compressed_nbytes_total); 70 | 71 | /* Prepare some dummy data to compress */ 72 | original = xmalloc(original_nbytes); 73 | ASSERT(original != NULL); 74 | for (i = 0; i < original_nbytes; i++) 75 | original[i] = (i % 123) + (i % 1023); 76 | 77 | compressed = xmalloc(compressed_nbytes_total); 78 | ASSERT(compressed != NULL); 79 | memset(compressed, 0, compressed_nbytes_total); 80 | 81 | decompressed = xmalloc(original_nbytes); 82 | ASSERT(decompressed != NULL); 83 | 84 | c = libdeflate_alloc_compressor(6); 85 | ASSERT(c != NULL); 86 | 87 | d = libdeflate_alloc_decompressor(); 88 | ASSERT(d != NULL); 89 | 90 | for (i = 0; i < ARRAY_LEN(codecs); i++) { 91 | compressed_nbytes = codecs[i].compress(c, original, 92 | original_nbytes, 93 | compressed, 94 | compressed_nbytes_avail); 95 | ASSERT(compressed_nbytes > 0); 96 | ASSERT(compressed_nbytes <= compressed_nbytes_avail); 97 | 98 | /* Test decompress() of stream that fills the whole buffer */ 99 | actual_decompressed_nbytes = 0; 100 | memset(decompressed, 0, original_nbytes); 101 | res = codecs[i].decompress(d, compressed, compressed_nbytes, 102 | decompressed, original_nbytes, 103 | &actual_decompressed_nbytes); 104 | ASSERT(res == LIBDEFLATE_SUCCESS); 105 | ASSERT(actual_decompressed_nbytes == original_nbytes); 106 | ASSERT(memcmp(decompressed, original, original_nbytes) == 0); 107 | 108 | /* Test decompress_ex() of stream that fills the whole buffer */ 109 | actual_compressed_nbytes = actual_decompressed_nbytes = 0; 110 | memset(decompressed, 0, original_nbytes); 111 | res = 
codecs[i].decompress_ex(d, compressed, compressed_nbytes, 112 | decompressed, original_nbytes, 113 | &actual_compressed_nbytes, 114 | &actual_decompressed_nbytes); 115 | ASSERT(res == LIBDEFLATE_SUCCESS); 116 | ASSERT(actual_compressed_nbytes == compressed_nbytes); 117 | ASSERT(actual_decompressed_nbytes == original_nbytes); 118 | ASSERT(memcmp(decompressed, original, original_nbytes) == 0); 119 | 120 | /* Test decompress() of stream with trailing bytes */ 121 | actual_decompressed_nbytes = 0; 122 | memset(decompressed, 0, original_nbytes); 123 | res = codecs[i].decompress(d, compressed, 124 | compressed_nbytes_total, 125 | decompressed, original_nbytes, 126 | &actual_decompressed_nbytes); 127 | ASSERT(res == LIBDEFLATE_SUCCESS); 128 | ASSERT(actual_decompressed_nbytes == original_nbytes); 129 | ASSERT(memcmp(decompressed, original, original_nbytes) == 0); 130 | 131 | /* Test decompress_ex() of stream with trailing bytes */ 132 | actual_compressed_nbytes = actual_decompressed_nbytes = 0; 133 | memset(decompressed, 0, original_nbytes); 134 | res = codecs[i].decompress_ex(d, compressed, 135 | compressed_nbytes_total, 136 | decompressed, original_nbytes, 137 | &actual_compressed_nbytes, 138 | &actual_decompressed_nbytes); 139 | ASSERT(res == LIBDEFLATE_SUCCESS); 140 | ASSERT(actual_compressed_nbytes == compressed_nbytes); 141 | ASSERT(actual_decompressed_nbytes == original_nbytes); 142 | ASSERT(memcmp(decompressed, original, original_nbytes) == 0); 143 | } 144 | 145 | free(original); 146 | free(compressed); 147 | free(decompressed); 148 | libdeflate_free_compressor(c); 149 | libdeflate_free_decompressor(d); 150 | return 0; 151 | } 152 | -------------------------------------------------------------------------------- /programs/test_util.c: -------------------------------------------------------------------------------- 1 | /* 2 | * test_util.c - utility functions for test programs 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of 
charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "test_util.h" 29 | 30 | #include 31 | #include 32 | #ifdef _WIN32 33 | # include 34 | #else 35 | # include 36 | # include 37 | # include 38 | #endif 39 | 40 | #ifndef MAP_ANONYMOUS 41 | # define MAP_ANONYMOUS MAP_ANON 42 | #endif 43 | 44 | /* Abort with an error message */ 45 | NORETURN void 46 | assertion_failed(const char *expr, const char *file, int line) 47 | { 48 | msg("Assertion failed: %s at %s:%d", expr, file, line); 49 | abort(); 50 | } 51 | 52 | void 53 | begin_performance_test(void) 54 | { 55 | /* Skip performance tests by default, since they can be flaky. 
*/ 56 | if (getenv("INCLUDE_PERF_TESTS") == NULL) 57 | exit(0); 58 | } 59 | 60 | static size_t 61 | get_page_size(void) 62 | { 63 | #ifdef _WIN32 64 | SYSTEM_INFO info; 65 | 66 | GetSystemInfo(&info); 67 | return info.dwPageSize; 68 | #else 69 | return sysconf(_SC_PAGESIZE); 70 | #endif 71 | } 72 | 73 | /* Allocate a buffer with guard pages */ 74 | void 75 | alloc_guarded_buffer(size_t size, u8 **start_ret, u8 **end_ret) 76 | { 77 | const size_t pagesize = get_page_size(); 78 | const size_t nr_pages = (size + pagesize - 1) / pagesize; 79 | u8 *base_addr; 80 | u8 *start, *end; 81 | #ifdef _WIN32 82 | DWORD oldProtect; 83 | #endif 84 | 85 | *start_ret = NULL; 86 | *end_ret = NULL; 87 | 88 | #ifdef _WIN32 89 | /* Allocate buffer and guard pages with no access. */ 90 | base_addr = VirtualAlloc(NULL, (nr_pages + 2) * pagesize, 91 | MEM_COMMIT | MEM_RESERVE, PAGE_NOACCESS); 92 | if (!base_addr) { 93 | msg("Unable to allocate memory (VirtualAlloc): Windows error %u", 94 | (unsigned int)GetLastError()); 95 | ASSERT(0); 96 | } 97 | start = base_addr + pagesize; 98 | end = start + (nr_pages * pagesize); 99 | 100 | /* Grant read+write access to just the buffer. */ 101 | if (!VirtualProtect(start, end - start, PAGE_READWRITE, &oldProtect)) { 102 | msg("Unable to protect memory (VirtualProtect): Windows error %u", 103 | (unsigned int)GetLastError()); 104 | VirtualFree(base_addr, 0, MEM_RELEASE); 105 | ASSERT(0); 106 | } 107 | #else 108 | /* Allocate buffer and guard pages. */ 109 | base_addr = mmap(NULL, (nr_pages + 2) * pagesize, PROT_READ|PROT_WRITE, 110 | MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); 111 | if (base_addr == (u8 *)MAP_FAILED) { 112 | msg_errno("Unable to allocate memory (anonymous mmap)"); 113 | ASSERT(0); 114 | } 115 | start = base_addr + pagesize; 116 | end = start + (nr_pages * pagesize); 117 | 118 | /* Unmap the guard pages. 
*/ 119 | munmap(base_addr, pagesize); 120 | munmap(end, pagesize); 121 | #endif 122 | *start_ret = start; 123 | *end_ret = end; 124 | } 125 | 126 | /* Free a buffer that was allocated by alloc_guarded_buffer() */ 127 | void 128 | free_guarded_buffer(u8 *start, u8 *end) 129 | { 130 | if (!start) 131 | return; 132 | #ifdef _WIN32 133 | VirtualFree(start - get_page_size(), 0, MEM_RELEASE); 134 | #else 135 | munmap(start, end - start); 136 | #endif 137 | } 138 | 139 | /* 140 | * Return the number of timer ticks that have elapsed since some unspecified 141 | * point fixed at the start of program execution 142 | */ 143 | u64 144 | timer_ticks(void) 145 | { 146 | #ifdef _WIN32 147 | LARGE_INTEGER count; 148 | 149 | QueryPerformanceCounter(&count); 150 | return count.QuadPart; 151 | #elif defined(HAVE_CLOCK_GETTIME) || \ 152 | /* fallback detection method for direct compilation */ \ 153 | (!defined(HAVE_CONFIG_H) && defined(CLOCK_MONOTONIC)) 154 | struct timespec ts; 155 | 156 | clock_gettime(CLOCK_MONOTONIC, &ts); 157 | return (1000000000 * (u64)ts.tv_sec) + ts.tv_nsec; 158 | #else 159 | struct timeval tv; 160 | 161 | gettimeofday(&tv, NULL); 162 | return (1000000 * (u64)tv.tv_sec) + tv.tv_usec; 163 | #endif 164 | } 165 | 166 | /* 167 | * Return the number of timer ticks per second 168 | */ 169 | static u64 170 | timer_frequency(void) 171 | { 172 | #ifdef _WIN32 173 | LARGE_INTEGER freq; 174 | 175 | QueryPerformanceFrequency(&freq); 176 | return freq.QuadPart; 177 | #elif defined(HAVE_CLOCK_GETTIME) || \ 178 | /* fallback detection method for direct compilation */ \ 179 | (!defined(HAVE_CONFIG_H) && defined(CLOCK_MONOTONIC)) 180 | return 1000000000; 181 | #else 182 | return 1000000; 183 | #endif 184 | } 185 | 186 | /* 187 | * Convert a number of elapsed timer ticks to milliseconds 188 | */ 189 | u64 timer_ticks_to_ms(u64 ticks) 190 | { 191 | return ticks * 1000 / timer_frequency(); 192 | } 193 | 194 | /* 195 | * Convert a byte count and a number of elapsed timer ticks to 
MB/s 196 | */ 197 | u64 timer_MB_per_s(u64 bytes, u64 ticks) 198 | { 199 | return bytes * timer_frequency() / ticks / 1000000; 200 | } 201 | 202 | /* 203 | * Convert a byte count and a number of elapsed timer ticks to KB/s 204 | */ 205 | u64 timer_KB_per_s(u64 bytes, u64 ticks) 206 | { 207 | return bytes * timer_frequency() / ticks / 1000; 208 | } 209 | 210 | bool 211 | put_bits(struct output_bitstream *os, machine_word_t bits, int num_bits) 212 | { 213 | os->bitbuf |= bits << os->bitcount; 214 | os->bitcount += num_bits; 215 | while (os->bitcount >= 8) { 216 | if (os->next == os->end) 217 | return false; 218 | *os->next++ = os->bitbuf; 219 | os->bitcount -= 8; 220 | os->bitbuf >>= 8; 221 | } 222 | return true; 223 | } 224 | 225 | bool 226 | flush_bits(struct output_bitstream *os) 227 | { 228 | while (os->bitcount > 0) { 229 | if (os->next == os->end) 230 | return false; 231 | *os->next++ = os->bitbuf; 232 | os->bitcount -= 8; 233 | os->bitbuf >>= 8; 234 | } 235 | os->bitcount = 0; 236 | return true; 237 | } 238 | -------------------------------------------------------------------------------- /programs/test_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * test_util.h - utility functions for test programs 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 | * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 
17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #ifndef PROGRAMS_TEST_UTIL_H 29 | #define PROGRAMS_TEST_UTIL_H 30 | 31 | #include "prog_util.h" /* must be included first */ 32 | 33 | #include /* for comparison purposes */ 34 | 35 | NORETURN void 36 | assertion_failed(const char *expr, const char *file, int line); 37 | 38 | #define ASSERT(expr) { if (unlikely(!(expr))) \ 39 | assertion_failed(#expr, __FILE__, __LINE__); } 40 | 41 | void begin_performance_test(void); 42 | 43 | void alloc_guarded_buffer(size_t size, u8 **start_ret, u8 **end_ret); 44 | void free_guarded_buffer(u8 *start, u8 *end); 45 | 46 | u64 timer_ticks(void); 47 | u64 timer_ticks_to_ms(u64 ticks); 48 | u64 timer_MB_per_s(u64 bytes, u64 ticks); 49 | u64 timer_KB_per_s(u64 bytes, u64 ticks); 50 | 51 | struct output_bitstream { 52 | machine_word_t bitbuf; 53 | int bitcount; 54 | u8 *next; 55 | u8 *end; 56 | }; 57 | 58 | bool put_bits(struct output_bitstream *os, machine_word_t bits, int num_bits); 59 | bool flush_bits(struct output_bitstream *os); 60 | 61 | #endif /* PROGRAMS_TEST_UTIL_H */ 62 | -------------------------------------------------------------------------------- /programs/tgetopt.c: -------------------------------------------------------------------------------- 1 | /* 2 | * tgetopt.c - portable replacement for GNU getopt() 3 | * 4 | * Copyright 2016 Eric Biggers 5 | * 6 | * Permission is hereby granted, free of charge, to any person 7 | * obtaining a copy of this software and associated documentation 8 
| * files (the "Software"), to deal in the Software without 9 | * restriction, including without limitation the rights to use, 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following 13 | * conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be 16 | * included in all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | * OTHER DEALINGS IN THE SOFTWARE. 26 | */ 27 | 28 | #include "prog_util.h" 29 | 30 | tchar *toptarg; 31 | int toptind = 1, topterr = 1, toptopt; 32 | 33 | /* 34 | * This is a simple implementation of getopt(). It can be compiled with either 35 | * 'char' or 'wchar_t' as the character type. 
36 | * 37 | * Do *not* use this implementation if you need any of the following features, 38 | * as they are not supported: 39 | * - Long options 40 | * - Option-related arguments retained in argv, not nulled out 41 | * - '+' and '-' characters in optstring 42 | */ 43 | int 44 | tgetopt(int argc, tchar *argv[], const tchar *optstring) 45 | { 46 | static tchar empty[1]; 47 | static tchar *nextchar; 48 | static bool done; 49 | 50 | if (toptind == 1) { 51 | /* Starting to scan a new argument vector */ 52 | nextchar = NULL; 53 | done = false; 54 | } 55 | 56 | while (!done && (nextchar != NULL || toptind < argc)) { 57 | if (nextchar == NULL) { 58 | /* Scanning a new argument */ 59 | tchar *arg = argv[toptind++]; 60 | if (arg[0] == '-' && arg[1] != '\0') { 61 | if (arg[1] == '-' && arg[2] == '\0') { 62 | /* All args after "--" are nonoptions */ 63 | argv[toptind - 1] = NULL; 64 | done = true; 65 | } else { 66 | /* Start of short option characters */ 67 | nextchar = &arg[1]; 68 | } 69 | } 70 | } else { 71 | /* More short options in previous arg */ 72 | tchar opt = *nextchar; 73 | tchar *p = tstrchr(optstring, opt); 74 | if (p == NULL) { 75 | if (topterr) 76 | msg("invalid option -- '%"TC"'", opt); 77 | toptopt = opt; 78 | return '?'; 79 | } 80 | /* 'opt' is a valid short option character */ 81 | nextchar++; 82 | toptarg = NULL; 83 | if (*(p + 1) == ':') { 84 | /* 'opt' can take an argument */ 85 | if (*nextchar != '\0') { 86 | /* Optarg is in same argv argument */ 87 | toptarg = nextchar; 88 | nextchar = empty; 89 | } else if (toptind < argc && *(p + 2) != ':') { 90 | /* Optarg is next argv argument */ 91 | argv[toptind - 1] = NULL; 92 | toptarg = argv[toptind++]; 93 | } else if (*(p + 2) != ':') { 94 | if (topterr && *optstring != ':') { 95 | msg("option requires an " 96 | "argument -- '%"TC"'", opt); 97 | } 98 | toptopt = opt; 99 | opt = (*optstring == ':') ? 
':' : '?'; 100 | } 101 | } 102 | if (*nextchar == '\0') { 103 | argv[toptind - 1] = NULL; 104 | nextchar = NULL; 105 | } 106 | return opt; 107 | } 108 | } 109 | 110 | /* Done scanning. Move all nonoptions to the end, set optind to the 111 | * index of the first nonoption, and return -1. */ 112 | toptind = argc; 113 | while (--argc > 0) 114 | if (argv[argc] != NULL) 115 | argv[--toptind] = argv[argc]; 116 | done = true; 117 | return -1; 118 | } 119 | -------------------------------------------------------------------------------- /scripts/android_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu -o pipefail 4 | 5 | SCRIPTDIR="$(dirname "$0")" 6 | BUILDDIR="$SCRIPTDIR/../build" 7 | API_LEVEL=28 8 | ARCH=arm64 9 | CFLAGS=${CFLAGS:-} 10 | ENABLE_CRC=false 11 | ENABLE_CRYPTO=false 12 | NDKDIR=$HOME/android-ndk-r25b 13 | 14 | usage() { 15 | cat << EOF 16 | Usage: $0 [OPTION]... 17 | Build libdeflate for Android. 18 | 19 | --api-level=LEVEL Android API level to target (default: $API_LEVEL) 20 | --arch=ARCH Architecture: arm32|arm64|x86|x86_64 (default: $ARCH) 21 | --enable-crc Enable crc instructions 22 | --enable-crypto Enable crypto instructions 23 | --ndkdir=NDKDIR Android NDK directory (default: $NDKDIR) 24 | EOF 25 | } 26 | if ! 
options=$(getopt -o '' \ 27 | -l 'api-level:,arch:,enable-crc,enable-crypto,help,ndkdir:' -- "$@"); then 28 | usage 1>&2 29 | exit 1 30 | fi 31 | 32 | eval set -- "$options" 33 | 34 | while [ $# -gt 0 ]; do 35 | case "$1" in 36 | --api-level) 37 | API_LEVEL="$2" 38 | shift 39 | ;; 40 | --arch) 41 | ARCH="$2" 42 | shift 43 | ;; 44 | --enable-crc) 45 | ENABLE_CRC=true 46 | ;; 47 | --enable-crypto) 48 | ENABLE_CRYPTO=true 49 | ;; 50 | --help) 51 | usage 52 | exit 0 53 | ;; 54 | --ndkdir) 55 | NDKDIR="$2" 56 | shift 57 | ;; 58 | --) 59 | shift 60 | break 61 | ;; 62 | *) 63 | echo 1>&2 "Unknown option \"$1\"" 64 | usage 1>&2 65 | exit 1 66 | esac 67 | shift 68 | done 69 | 70 | case "$ARCH" in 71 | arm|arm32|aarch32|armeabi-v7a) 72 | ANDROID_ABI=armeabi-v7a 73 | if $ENABLE_CRC || $ENABLE_CRYPTO; then 74 | CFLAGS+=" -march=armv8-a" 75 | if $ENABLE_CRC; then 76 | CFLAGS+=" -mcrc" 77 | else 78 | CFLAGS+=" -mnocrc" 79 | fi 80 | if $ENABLE_CRYPTO; then 81 | CFLAGS+=" -mfpu=crypto-neon-fp-armv8" 82 | else 83 | CFLAGS+=" -mfpu=neon" 84 | fi 85 | fi 86 | ;; 87 | arm64|aarch64|arm64-v8a) 88 | ANDROID_ABI=arm64-v8a 89 | features="" 90 | if $ENABLE_CRC; then 91 | features+="+crc" 92 | fi 93 | if $ENABLE_CRYPTO; then 94 | features+="+crypto" 95 | fi 96 | if [ -n "$features" ]; then 97 | CFLAGS+=" -march=armv8-a$features" 98 | fi 99 | ;; 100 | x86) 101 | ANDROID_ABI=x86 102 | ;; 103 | x86_64) 104 | ANDROID_ABI=x86_64 105 | ;; 106 | *) 107 | echo 1>&2 "Unknown architecture: \"$ARCH\"" 108 | usage 1>&2 109 | exit 1 110 | esac 111 | 112 | "$SCRIPTDIR"/cmake-helper.sh -G Ninja \ 113 | -DCMAKE_TOOLCHAIN_FILE="$NDKDIR"/build/cmake/android.toolchain.cmake \ 114 | -DCMAKE_C_FLAGS="$CFLAGS" \ 115 | -DANDROID_ABI="$ANDROID_ABI" \ 116 | -DANDROID_PLATFORM="$API_LEVEL" \ 117 | -DLIBDEFLATE_BUILD_TESTS=1 118 | cmake --build "$BUILDDIR" 119 | -------------------------------------------------------------------------------- /scripts/android_tests.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Test libdeflate on a connected arm64 Android device. 4 | # Requires the Android NDK (release 19 or later) and adb. 5 | 6 | set -eu -o pipefail 7 | cd "$(dirname "$0")/.." 8 | 9 | if [ $# -ne 0 ]; then 10 | echo 1>&2 "Usage: $0" 11 | exit 2 12 | fi 13 | 14 | # Use NDKDIR if specified in environment, else use default value. 15 | : "${NDKDIR:=$HOME/android-ndk-r25b}" 16 | if [ ! -e "$NDKDIR" ]; then 17 | cat 1>&2 << EOF 18 | Android NDK was not found in NDKDIR=$NDKDIR! Set the 19 | environmental variable NDKDIR to the location of your Android NDK installation. 20 | EOF 21 | exit 1 22 | fi 23 | 24 | CLEANUP_CMDS=() 25 | cleanup() { 26 | for cmd in "${CLEANUP_CMDS[@]}"; do 27 | eval "$cmd" 28 | done 29 | } 30 | trap cleanup EXIT 31 | 32 | # Use TESTDATA if specified in environment, else generate it. 33 | if [ -z "${TESTDATA:-}" ]; then 34 | # Generate default TESTDATA file. 35 | TESTDATA=$(mktemp -t libdeflate_testdata.XXXXXXXXXX) 36 | export TESTDATA 37 | CLEANUP_CMDS+=("rm -f '$TESTDATA'") 38 | find . '(' -name '*.c' -o -name '*.h' -o -name '*.sh' ')' \ 39 | -exec cat '{}' ';' | head -c 1000000 > "$TESTDATA" 40 | fi 41 | 42 | TMPDIR=$(mktemp -d -t libdeflate_test.XXXXXXXXX) 43 | CLEANUP_CMDS+=("rm -r '$TMPDIR'") 44 | 45 | android_build_and_test() { 46 | echo "Running Android tests with $*" 47 | 48 | ./scripts/android_build.sh --ndkdir="$NDKDIR" "$@" > /dev/null 49 | adb push "$TESTDATA" ./scripts/exec_tests.sh \ 50 | ./build/programs/{benchmark,test_*} /data/local/tmp/ > /dev/null 51 | 52 | # Note: adb shell always returns 0, even if the shell command fails... 53 | adb shell "cd /data/local/tmp && WRAPPER= TESTDATA=$(basename "$TESTDATA") sh exec_tests.sh" \ 54 | > "$TMPDIR/adb.out" 55 | if ! grep -q "exec_tests finished successfully" "$TMPDIR/adb.out"; then 56 | echo 1>&2 "Android test failure! 
adb shell output:" 57 | cat "$TMPDIR/adb.out" 58 | exit 1 59 | fi 60 | } 61 | 62 | android_build_and_test --arch=arm32 63 | android_build_and_test --arch=arm32 --enable-crc 64 | android_build_and_test --arch=arm64 65 | android_build_and_test --arch=arm64 --enable-crc 66 | android_build_and_test --arch=arm64 --enable-crypto 67 | android_build_and_test --arch=arm64 --enable-crc --enable-crypto 68 | 69 | echo "Android tests passed" 70 | -------------------------------------------------------------------------------- /scripts/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPTDIR="$(dirname "$(realpath "$0")")" 6 | BUILDDIR="$SCRIPTDIR/../build" 7 | 8 | "$SCRIPTDIR"/cmake-helper.sh -DLIBDEFLATE_BUILD_TESTS=1 -G Ninja > /dev/null 9 | ninja -C "$BUILDDIR" --quiet benchmark 10 | "$BUILDDIR"/programs/benchmark "$@" 11 | -------------------------------------------------------------------------------- /scripts/checksum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPTDIR="$(dirname "$(realpath "$0")")" 6 | BUILDDIR="$SCRIPTDIR/../build" 7 | 8 | "$SCRIPTDIR"/cmake-helper.sh -DLIBDEFLATE_BUILD_TESTS=1 -G Ninja > /dev/null 9 | ninja -C "$BUILDDIR" --quiet checksum 10 | "$BUILDDIR"/programs/checksum "$@" 11 | -------------------------------------------------------------------------------- /scripts/checksum_benchmarks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu -o pipefail 4 | 5 | __have_cpu_feature() { 6 | local feature="$1" 7 | local tag 8 | case $ARCH in 9 | arm*|aarch*) 10 | tag="Features" 11 | ;; 12 | *) 13 | tag="flags" 14 | ;; 15 | esac 16 | grep -q "^$tag"$'[ \t]'"*:.*\<$feature\>" /proc/cpuinfo 17 | } 18 | 19 | have_cpu_features() { 20 | local feature 21 | for feature; do 22 | __have_cpu_feature "$feature" || return 1 23 | done 24 | 
} 25 | 26 | make_and_test() { 27 | # Build the checksum program and tests. Set the special test support 28 | # flag to get support for LIBDEFLATE_DISABLE_CPU_FEATURES. 29 | rm -rf build 30 | CFLAGS="$CFLAGS -DTEST_SUPPORT__DO_NOT_USE=1" \ 31 | cmake -B build -G Ninja -DLIBDEFLATE_BUILD_TESTS=1 \ 32 | "${EXTRA_CMAKE_FLAGS[@]}" > /dev/null 33 | cmake --build build > /dev/null 34 | 35 | # Run the checksum tests, for good measure. (This isn't actually part 36 | # of the benchmarking.) 37 | ./build/programs/test_checksums > /dev/null 38 | } 39 | 40 | __do_benchmark() { 41 | local impl="$1" speed 42 | shift 43 | local flags=("$@") 44 | 45 | speed=$(./build/programs/checksum "${CKSUM_FLAGS[@]}" \ 46 | "${flags[@]}" -t "$FILE" | \ 47 | grep -o '[0-9]\+ MB/s' | grep -o '[0-9]\+') 48 | printf "%-60s%-10s\n" "$CKSUM_NAME ($impl)" "$speed" 49 | } 50 | 51 | do_benchmark() { 52 | local impl="$1" 53 | 54 | CFLAGS="${EXTRA_CFLAGS[*]}" make_and_test 55 | if [ "$impl" = zlib ]; then 56 | __do_benchmark "$impl" "-Z" 57 | else 58 | __do_benchmark "libdeflate, $impl" 59 | if $ENABLE_32BIT; then 60 | CFLAGS="-m32 ${EXTRA_CFLAGS[*]}" make_and_test 61 | __do_benchmark "libdeflate, $impl, 32-bit" 62 | fi 63 | fi 64 | } 65 | 66 | sort_by_speed() { 67 | awk '{print $NF, $0}' | sort -nr | cut -f2- -d' ' 68 | } 69 | 70 | disable_cpu_feature() { 71 | LIBDEFLATE_DISABLE_CPU_FEATURES+=",$1" 72 | shift 73 | if (( $# > 0 )); then 74 | EXTRA_CFLAGS+=("$@") 75 | fi 76 | } 77 | 78 | cleanup() { 79 | if $USING_TMPFILE; then 80 | rm "$FILE" 81 | fi 82 | } 83 | 84 | ARCH="$(uname -m)" 85 | USING_TMPFILE=false 86 | EXTRA_CMAKE_FLAGS=() 87 | ENABLE_32BIT=false 88 | 89 | trap cleanup EXIT 90 | 91 | longopts="help" 92 | longopts+=",cmake-flag:" 93 | longopts+=",enable-32bit" 94 | 95 | usage() { 96 | echo "Usage: $0 [--cmake-flag=FLAG]... [--enable-32bit] [FILE]" 97 | } 98 | 99 | if ! 
options=$(getopt -o "" -l "$longopts" -- "$@"); then 100 | usage 1>&2 101 | exit 1 102 | fi 103 | eval set -- "$options" 104 | while (( $# >= 1 )); do 105 | case "$1" in 106 | --cmake-flag) 107 | EXTRA_CMAKE_FLAGS+=("$2") 108 | shift 109 | ;; 110 | --enable-32bit) 111 | ENABLE_32BIT=true 112 | ;; 113 | --help) 114 | usage 115 | exit 0 116 | ;; 117 | --) 118 | shift 119 | break 120 | ;; 121 | *) 122 | echo 1>&2 "Invalid option: '$1'" 123 | usage 1>&2 124 | exit 1 125 | ;; 126 | esac 127 | shift 128 | done 129 | 130 | if (( $# == 0 )); then 131 | # Generate default test data file. 132 | FILE=$(mktemp -t checksum_testdata.XXXXXXXXXX) 133 | USING_TMPFILE=true 134 | echo "Generating 250 MB test file: $FILE" 135 | head -c 250000000 /dev/urandom > "$FILE" 136 | elif (( $# == 1 )); then 137 | FILE="$1" 138 | else 139 | usage 1>&2 140 | exit 1 141 | fi 142 | 143 | cat << EOF 144 | Method Speed (MB/s) 145 | ------ ------------ 146 | EOF 147 | 148 | # CRC-32 149 | CKSUM_NAME="CRC-32" 150 | CKSUM_FLAGS=() 151 | EXTRA_CFLAGS=() 152 | export LIBDEFLATE_DISABLE_CPU_FEATURES="" 153 | { 154 | case $ARCH in 155 | i386|x86_64) 156 | if have_cpu_features vpclmulqdq pclmulqdq avx512bw avx512vl; then 157 | do_benchmark "VPCLMULQDQ/AVX512/VL512" 158 | disable_cpu_feature zmm 159 | do_benchmark "VPCLMULQDQ/AVX512/VL256" 160 | disable_cpu_feature avx512vl "-mno-avx512vl" 161 | disable_cpu_feature avx512bw "-mno-avx512bw" 162 | fi 163 | if have_cpu_features vpclmulqdq pclmulqdq avx2; then 164 | do_benchmark "VPCLMULQDQ/AVX2" 165 | disable_cpu_feature vpclmulqdq "-mno-vpclmulqdq" 166 | fi 167 | if have_cpu_features pclmulqdq avx; then 168 | do_benchmark "PCLMULQDQ/AVX" 169 | disable_cpu_feature avx "-mno-avx" 170 | fi 171 | if have_cpu_features pclmulqdq; then 172 | do_benchmark "PCLMULQDQ" 173 | disable_cpu_feature pclmulqdq "-mno-pclmul" 174 | fi 175 | ;; 176 | aarch*) 177 | EXTRA_CFLAGS=("-march=armv8-a") 178 | if have_cpu_features pmull crc32 sha3; then 179 | do_benchmark 
"pmullx12_crc_eor3" 180 | disable_cpu_feature sha3 181 | fi 182 | if have_cpu_features pmull crc32; then 183 | do_benchmark "pmullx12_crc" 184 | disable_cpu_feature prefer_pmull 185 | do_benchmark "crc_pmullcombine" 186 | fi 187 | if have_cpu_features crc32; then 188 | do_benchmark "crc" 189 | disable_cpu_feature crc32 190 | fi 191 | if have_cpu_features pmull; then 192 | do_benchmark "pmull4x" 193 | disable_cpu_feature pmull 194 | fi 195 | ;; 196 | esac 197 | do_benchmark "generic" 198 | do_benchmark "zlib" 199 | } | sort_by_speed 200 | 201 | # Adler-32 202 | CKSUM_NAME="Adler-32" 203 | CKSUM_FLAGS=(-A) 204 | EXTRA_CFLAGS=() 205 | export LIBDEFLATE_DISABLE_CPU_FEATURES="" 206 | echo 207 | { 208 | case $ARCH in 209 | i386|x86_64) 210 | if have_cpu_features avx512bw avx512_vnni; then 211 | do_benchmark "AVX512VNNI/VL512" 212 | disable_cpu_feature zmm 213 | if have_cpu_features avx512vl; then 214 | do_benchmark "AVX512VNNI/VL256" 215 | fi 216 | disable_cpu_feature avx512_vnni "-mno-avx512vnni" 217 | disable_cpu_feature avx512bw "-mno-avx512bw" 218 | fi 219 | if have_cpu_features avx2 avx_vnni; then 220 | do_benchmark "AVX-VNNI" 221 | disable_cpu_feature avx_vnni "-mno-avxvnni" 222 | fi 223 | if have_cpu_features avx2; then 224 | do_benchmark "AVX2" 225 | disable_cpu_feature avx2 "-mno-avx2" 226 | fi 227 | if have_cpu_features sse2; then 228 | do_benchmark "SSE2" 229 | disable_cpu_feature sse2 "-mno-sse2" 230 | fi 231 | ;; 232 | arm*) 233 | if have_cpu_features neon; then 234 | do_benchmark "NEON" 235 | disable_cpu_feature neon "-mfpu=vfpv3" 236 | fi 237 | ;; 238 | aarch*) 239 | EXTRA_CFLAGS=("-march=armv8-a") 240 | if have_cpu_features asimd asimddp; then 241 | do_benchmark "DOTPROD" 242 | disable_cpu_feature dotprod 243 | fi 244 | if have_cpu_features asimd; then 245 | do_benchmark "NEON" 246 | disable_cpu_feature neon 247 | EXTRA_CFLAGS=("-march=armv8-a+nosimd") 248 | fi 249 | ;; 250 | esac 251 | do_benchmark "generic" 252 | do_benchmark "zlib" 253 | } | 
sort_by_speed 254 | -------------------------------------------------------------------------------- /scripts/cmake-helper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # This script ensures that the 'build' directory has been created and configured 4 | # with the given CMake options and environment. 5 | 6 | set -e 7 | 8 | TOPDIR="$(dirname "$0")"/.. 9 | BUILDDIR="$TOPDIR"/build 10 | 11 | flags=$(env; echo "@CMAKEOPTS@=$*") 12 | if [ "$flags" != "$(cat "$BUILDDIR"/.flags 2>/dev/null || true)" ]; then 13 | rm -rf "$BUILDDIR"/CMakeCache.txt "$BUILDDIR"/CMakeFiles 14 | mkdir -p "$BUILDDIR" 15 | cmake -S "$TOPDIR" -B "$BUILDDIR" "$@" 16 | echo "$flags" > "$BUILDDIR"/.flags 17 | fi 18 | -------------------------------------------------------------------------------- /scripts/deflate_benchmarks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu -o pipefail 4 | topdir="$(dirname "$0")/.." 
5 | tmpfile=$(mktemp) 6 | trap 'rm -f $tmpfile' EXIT 7 | 8 | run_benchmark() 9 | { 10 | local best_ctime=1000000000 11 | local i 12 | 13 | for i in $(seq "$NUM_ITERATIONS"); do 14 | "$@" > "$tmpfile" 15 | csize=$(awk '/Compressed/{print $4}' "$tmpfile") 16 | ctime=$(awk '/Compression time/{print $3}' "$tmpfile") 17 | if (( ctime < best_ctime )); then 18 | best_ctime=$ctime 19 | fi 20 | : "$i" # make shellcheck happy 21 | done 22 | CSIZE=$csize 23 | CTIME=$best_ctime 24 | } 25 | 26 | multifile() 27 | { 28 | local file results cmd best em 29 | 30 | NUM_ITERATIONS=1 31 | 32 | echo "File | zlib -6 | zlib -9 | libdeflate -6 | libdeflate -9 | libdeflate -12" 33 | echo "-----|---------|---------|---------------|---------------|---------------" 34 | 35 | for file in "$@"; do 36 | echo -n "$(basename "$file")" 37 | results=() 38 | cmd=("$topdir/build/programs/benchmark" 39 | -s"$(stat -c "%s" "$file")" "$file") 40 | run_benchmark "${cmd[@]}" -Y -6 41 | results+=("$CSIZE") 42 | run_benchmark "${cmd[@]}" -Y -9 43 | results+=("$CSIZE") 44 | run_benchmark "${cmd[@]}" -6 45 | results+=("$CSIZE") 46 | run_benchmark "${cmd[@]}" -9 47 | results+=("$CSIZE") 48 | run_benchmark "${cmd[@]}" -12 49 | results+=("$CSIZE") 50 | best=2000000000 51 | for result in "${results[@]}"; do 52 | if (( result < best)); then 53 | best=$result 54 | fi 55 | done 56 | for result in "${results[@]}"; do 57 | if (( result == best )); then 58 | em="**" 59 | else 60 | em="" 61 | fi 62 | echo -n " | ${em}${result}${em}" 63 | done 64 | echo 65 | done 66 | } 67 | 68 | single_file() 69 | { 70 | local file=$1 71 | local usize args 72 | local include_old=false 73 | 74 | usize=$(stat -c "%s" "$file") 75 | : "${NUM_ITERATIONS:=3}" 76 | 77 | if [ -e "$topdir/benchmark-old" ]; then 78 | include_old=true 79 | fi 80 | echo -n "Level | libdeflate (new) " 81 | if $include_old; then 82 | echo -n "| libdeflate (old) " 83 | fi 84 | echo "| zlib" 85 | echo -n "------|------------------" 86 | if $include_old; then 87 | echo -n
"|------------------" 88 | fi 89 | echo "|-----" 90 | for level in {1..12}; do 91 | echo -n "$level" 92 | args=("$file" -s "$usize" "-$level") 93 | 94 | run_benchmark "$topdir/build/programs/benchmark" "${args[@]}" 95 | echo -n " | $CSIZE / $CTIME" 96 | 97 | if $include_old; then 98 | run_benchmark "$topdir/benchmark-old" "${args[@]}" 99 | echo -n " | $CSIZE / $CTIME" 100 | fi 101 | 102 | if (( level > 9 )); then 103 | echo -n " | N/A" 104 | else 105 | run_benchmark "$topdir/build/programs/benchmark" \ 106 | "${args[@]}" -Y 107 | echo -n " | $CSIZE / $CTIME" 108 | fi 109 | echo 110 | done 111 | } 112 | 113 | if (( $# > 1 )); then 114 | multifile "$@" 115 | elif (( $# == 1 )); then 116 | single_file "$@" 117 | else 118 | echo 1>&2 "Usage: $0 FILE..." 119 | fi 120 | -------------------------------------------------------------------------------- /scripts/exec_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Helper script used by run_tests.sh and android_tests.sh, 4 | # not intended to be run directly 5 | # 6 | 7 | set -eu 8 | 9 | DIR=${1:-.} 10 | 11 | cd "$DIR" 12 | 13 | run_cmd() { 14 | echo "$WRAPPER $*" 15 | $WRAPPER "$@" > /dev/null 16 | } 17 | 18 | for prog in ./test_*; do 19 | run_cmd "$prog" 20 | done 21 | 22 | for format in '' '-g' '-z'; do 23 | for ref_impl in '' '-Y' '-Z'; do 24 | run_cmd ./benchmark $format $ref_impl "$TESTDATA" 25 | done 26 | done 27 | for level in 0 1 3 7 9; do 28 | for ref_impl in '' '-Y'; do 29 | run_cmd ./benchmark -$level $ref_impl "$TESTDATA" 30 | done 31 | done 32 | for level in 0 1 3 7 9 12; do 33 | for ref_impl in '' '-Z'; do 34 | run_cmd ./benchmark -$level $ref_impl "$TESTDATA" 35 | done 36 | done 37 | 38 | echo "exec_tests finished successfully" # Needed for 'adb shell' 39 | -------------------------------------------------------------------------------- /scripts/gen-crc32-consts.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # This script generates constants for efficient computation of the gzip CRC-32. 4 | 5 | import sys 6 | 7 | # This is the generator polynomial G(x) of the gzip CRC-32, represented as an 8 | # int using the natural mapping between bits and polynomial coefficients. 9 | G = 0x104c11db7 10 | 11 | # XOR (add) an iterable of polynomials. 12 | def xor(iterable): 13 | res = 0 14 | for val in iterable: 15 | res ^= val 16 | return res 17 | 18 | # Multiply two polynomials. 19 | def clmul(a, b): 20 | return xor(a << i for i in range(b.bit_length()) if (b & (1 << i)) != 0) 21 | 22 | # Polynomial division floor(a / b). 23 | def div(a, b): 24 | q = 0 25 | while a.bit_length() >= b.bit_length(): 26 | q ^= 1 << (a.bit_length() - b.bit_length()) 27 | a ^= b << (a.bit_length() - b.bit_length()) 28 | return q 29 | 30 | # Reduce the polynomial 'a' modulo the polynomial 'b'. 31 | def reduce(a, b): 32 | return a ^ clmul(div(a, b), b) 33 | 34 | # Reverse the bits of a polynomial. 35 | def bitreverse(poly, num_bits): 36 | return xor(1 << (num_bits - 1 - i) for i in range(num_bits) 37 | if (poly & (1 << i)) != 0) 38 | 39 | # Compute x^d mod G. 40 | def x_to_the_d(d): 41 | if d < G.bit_length() - 1: 42 | return 1 << d 43 | t = x_to_the_d(d//2) 44 | t = clmul(t, t) 45 | if d % 2 != 0: 46 | t <<= 1 47 | return reduce(t, G) 48 | 49 | def gen_tables(): 50 | print('/*') 51 | print(' * crc32_tables.h - data tables for CRC-32 computation') 52 | print(' *') 53 | print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT.') 54 | print(' */') 55 | for n in [1, 8]: 56 | print('') 57 | print(f'static const u32 crc32_slice{n}_table[] MAYBE_UNUSED = {{') 58 | # The i'th table entry is the CRC-32 of the message consisting of byte 59 | # i % 256 followed by i // 256 zero bytes. 
60 | polys = [bitreverse(i % 256, 8) << (32 + 8*(i//256)) for i in range(256 * n)] 61 | polys = [bitreverse(reduce(poly, G), 32) for poly in polys] 62 | for i in range(0, len(polys), 4): 63 | print(f'\t0x{polys[i+0]:08x}, 0x{polys[i+1]:08x}, 0x{polys[i+2]:08x}, 0x{polys[i+3]:08x},') 64 | print('};') 65 | 66 | # Compute the constant multipliers needed for "folding" over various distances 67 | # with the gzip CRC-32. Each such multiplier is x^d mod G(x) for some distance 68 | # d, in bits, over which the folding is occurring. 69 | # 70 | # Folding works as follows: let A(x) be a polynomial (possibly reduced partially 71 | # or fully mod G(x)) for part of the message, and let B(x) be a polynomial 72 | # (possibly reduced partially or fully mod G(x)) for a later part of the 73 | # message. The unreduced combined polynomial is A(x)*x^d + B(x), where d is the 74 | # number of bits separating the two parts of the message plus len(B(x)). Since 75 | # mod G(x) can be applied at any point, x^d mod G(x) can be precomputed and used 76 | # instead of x^d unreduced. That allows the combined polynomial to be computed 77 | # relatively easily in a partially-reduced form A(x)*(x^d mod G(x)) + B(x), with 78 | # length max(len(A(x)) + 31, len(B(x))). This does require doing a polynomial 79 | # multiplication (carryless multiplication). 80 | # 81 | # "Folding" in this way can be used for the entire CRC computation except the 82 | # final reduction to 32 bits; this works well when CPU support for carryless 83 | # multiplication is available. It can also be used to combine CRCs of different 84 | # parts of the message that were computed using a different method. 85 | # 86 | # Note that the gzip CRC-32 uses bit-reversed polynomials. I.e., the low order 87 | # bits are really the high order polynomial coefficients. 
88 | def gen_multipliers(): 89 | print('/*') 90 | print(' * crc32_multipliers.h - constants for CRC-32 folding') 91 | print(' *') 92 | print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT.') 93 | print(' */') 94 | print('') 95 | 96 | # Compute the multipliers needed for CRC-32 folding with carryless 97 | # multiplication instructions that operate on the 64-bit halves of 128-bit 98 | # segments. Using the terminology from earlier, for each 64-bit fold 99 | # len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial multiplied by 100 | # a 32-bit one produces a 95-bit one. When A(x) is the low order polynomial 101 | # half of a 128-bit segments (high order physical half), the separation 102 | # between the message parts is the total length of the 128-bit segments 103 | # separating the values. When A(x) is the high order polynomial half, the 104 | # separation is 64 bits greater. 105 | for i in range(1, 33): 106 | sep_lo = 128 * (i - 1) 107 | sep_hi = sep_lo + 64 108 | len_B = 95 109 | for d in [sep_hi + len_B, # A(x) = high 64 polynomial bits (low 64 physical bits) 110 | sep_lo + len_B # A(x) = low 64 polynomial bits (high 64 physical bits) 111 | ]: 112 | poly = bitreverse(x_to_the_d(d), 32) 113 | print(f'#define CRC32_X{d}_MODG 0x{poly:08x} /* x^{d} mod G(x) */') 114 | print('') 115 | 116 | # Compute constants for the final 128 => 32 bit reduction. 117 | poly = bitreverse(div(1 << 95, G), 64) 118 | print(f'#define CRC32_BARRETT_CONSTANT_1 0x{poly:016x}ULL /* floor(x^95 / G(x)) */') 119 | poly = bitreverse(G, 33) 120 | print(f'#define CRC32_BARRETT_CONSTANT_2 0x{poly:016x}ULL /* G(x) */') 121 | 122 | # Compute multipliers for combining the CRCs of separate chunks. 
123 | print('') 124 | num_chunks = 4 125 | table_len = 129 126 | min_chunk_len = 128 127 | print(f'#define CRC32_NUM_CHUNKS {num_chunks}') 128 | print(f'#define CRC32_MIN_VARIABLE_CHUNK_LEN {min_chunk_len}UL') 129 | print(f'#define CRC32_MAX_VARIABLE_CHUNK_LEN {(table_len-1) * min_chunk_len}UL') 130 | print('') 131 | print('/* Multipliers for implementations that use a variable chunk length */') 132 | print('static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {') 133 | print('\t{ 0 /* unused row */ },') 134 | for i in range(1, table_len): 135 | chunk_len = i * min_chunk_len 136 | print(f'\t/* chunk_len={chunk_len} */') 137 | print('\t{ ', end='') 138 | for j in range(num_chunks - 1, 0, -1): 139 | d = (j * 8 * chunk_len) - 33 140 | poly = bitreverse(x_to_the_d(d), 32) 141 | print(f'0x{poly:08x} /* x^{d} mod G(x) */, ', end='') 142 | print('},') 143 | print('};') 144 | fixed_chunk_len = 32768 145 | print('') 146 | print('/* Multipliers for implementations that use a large fixed chunk length */') 147 | print(f'#define CRC32_FIXED_CHUNK_LEN {fixed_chunk_len}UL') 148 | for j in range(1, num_chunks): 149 | d = (j * 8 * fixed_chunk_len) - 33 150 | poly = bitreverse(x_to_the_d(d), 32) 151 | print(f'#define CRC32_FIXED_CHUNK_MULT_{j} 0x{poly:08x} /* x^{d} mod G(x) */') 152 | 153 | with open('lib/crc32_tables.h', 'w') as f: 154 | sys.stdout = f 155 | gen_tables() 156 | with open('lib/crc32_multipliers.h', 'w') as f: 157 | sys.stdout = f 158 | gen_multipliers() 159 | -------------------------------------------------------------------------------- /scripts/gen-release-archives.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu -o pipefail 4 | 5 | # This script generates source and binary archives that should be posted for 6 | # each new release of libdeflate. 
7 | 8 | prefix="libdeflate-$(git describe HEAD | sed 's/^v//')" 9 | 10 | # Generate source code archive libdeflate-*.tar.gz 11 | tarball="${prefix}.tar.gz" 12 | echo "Generating $tarball" 13 | git archive --format=tar --prefix="${prefix}/" HEAD \ 14 | | libdeflate-gzip -12 > "$tarball" 15 | 16 | # Generate Windows binary releases libdeflate-*-windows-*-bin.zip 17 | for arch in 'i686' 'x86_64'; do 18 | dir=${prefix}-windows-${arch}-bin 19 | zipfile="${dir}.zip" 20 | echo "Generating $zipfile" 21 | rm -rf build "$dir" "$zipfile" 22 | CFLAGS="-Werror" ${arch}-w64-mingw32-cmake -B build -G Ninja \ 23 | -DLIBDEFLATE_BUILD_TESTS=1 > /dev/null 24 | cmake --build build > /dev/null 25 | mkdir "$dir" 26 | cp libdeflate.h build/libdeflate.{dll,dll.a,a} \ 27 | build/programs/{benchmark,checksum}.exe "$dir" 28 | cp build/programs/libdeflate-gzip.exe "$dir"/gzip.exe 29 | cp build/programs/libdeflate-gzip.exe "$dir"/gunzip.exe 30 | ${arch}-w64-mingw32-strip "$dir"/libdeflate.dll "$dir"/*.exe 31 | for file in COPYING NEWS.md README.md; do 32 | sed < $file > "$dir/${file}.txt" -e 's/$/\r/g' 33 | done 34 | (cd "$dir" && zip -q -r "../${zipfile}" .) 35 | done 36 | 37 | echo "Successfully generated release archives" 38 | -------------------------------------------------------------------------------- /scripts/gen_bitreverse_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # This script computes a table that maps each byte to its bitwise reverse. 
4 | 5 | def reverse_byte(v): 6 | return sum(1 << (7 - bit) for bit in range(8) if (v & (1 << bit)) != 0) 7 | 8 | tab = [reverse_byte(v) for v in range(256)] 9 | 10 | print('static const u8 bitreverse_tab[256] = {') 11 | for i in range(0, len(tab), 8): 12 | print('\t', end='') 13 | for j, v in enumerate(tab[i:i+8]): 14 | print(f'0x{v:02x},', end='') 15 | if j == 7: 16 | print('') 17 | else: 18 | print(' ', end='') 19 | print('};') 20 | -------------------------------------------------------------------------------- /scripts/gen_default_litlen_costs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # This script computes the default litlen symbol costs for the near-optimal 4 | # parser. 5 | 6 | from math import log2 7 | 8 | BIT_COST = 16 # Must match BIT_COST in deflate_compress.c 9 | NUM_LEN_SLOTS = 29 10 | 11 | print("""static const struct { 12 | u8 used_lits_to_lit_cost[257]; 13 | u8 len_sym_cost; 14 | } default_litlen_costs[] = {""") 15 | MATCH_PROBS = [0.25, 0.50, 0.75] 16 | for i, match_prob in enumerate(MATCH_PROBS): 17 | len_prob = match_prob / NUM_LEN_SLOTS 18 | len_sym_cost = int(-log2(len_prob) * BIT_COST) 19 | if i == 0: 20 | print('\t{', end='') 21 | print(f' /* match_prob = {match_prob} */') 22 | print('\t\t.used_lits_to_lit_cost = {') 23 | 24 | j = 0 25 | for num_used_literals in range(0, 257): 26 | if num_used_literals == 0: 27 | num_used_literals = 1 28 | lit_prob = (1 - match_prob) / num_used_literals 29 | lit_cost = int(-log2(lit_prob) * BIT_COST) 30 | if j == 0: 31 | print('\t\t\t', end='') 32 | if j == 7 or num_used_literals == 256: 33 | print(f'{lit_cost},') 34 | j = 0 35 | else: 36 | print(f'{lit_cost}, ', end='') 37 | j += 1 38 | print('\t\t},') 39 | print(f'\t\t.len_sym_cost = {len_sym_cost},') 40 | if i < len(MATCH_PROBS) - 1: 41 | print('\t}, {', end='') 42 | else: 43 | print('\t},') 44 | print('};') 45 | 
-------------------------------------------------------------------------------- /scripts/gen_offset_slot_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # This script generates the deflate_offset_slot[] array, which maps 4 | # 'offset - 1 => offset_slot' for offset <= 256. 5 | 6 | DEFLATE_OFFSET_SLOT_BASE = [ 7 | 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , 8 | 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , 9 | 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , 10 | 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , 11 | ] 12 | 13 | offset_slot_map = [0] * 256 14 | offset_slot = -1 15 | for offset in range(1, len(offset_slot_map) + 1): 16 | if offset >= DEFLATE_OFFSET_SLOT_BASE[offset_slot + 1]: 17 | offset_slot += 1 18 | offset_slot_map[offset - 1] = offset_slot 19 | 20 | print(f'static const u8 deflate_offset_slot[{len(offset_slot_map)}] = {{') 21 | for i in range(0, len(offset_slot_map), 16): 22 | print('\t', end='') 23 | for j, v in enumerate(offset_slot_map[i:i+16]): 24 | print(f'{v},', end='') 25 | if j == 15: 26 | print('') 27 | else: 28 | print(' ', end='') 29 | print('};') 30 | -------------------------------------------------------------------------------- /scripts/libFuzzer/.gitignore: -------------------------------------------------------------------------------- 1 | */fuzz 2 | -------------------------------------------------------------------------------- /scripts/libFuzzer/deflate_compress/corpus/0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebiggers/libdeflate/6bb493615b0ef35c98fc4aa4ec04f448788db6a5/scripts/libFuzzer/deflate_compress/corpus/0 -------------------------------------------------------------------------------- /scripts/libFuzzer/deflate_compress/fuzz.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 
7 | #include 8 | #include 9 | 10 | static void 11 | alloc_guarded_buffer(size_t size, uint8_t **start_ret, uint8_t **end_ret) 12 | { 13 | const size_t pagesize = sysconf(_SC_PAGESIZE); 14 | const size_t nr_pages = (size + pagesize - 1) / pagesize; 15 | uint8_t *base_addr, *start, *end; 16 | 17 | /* Allocate buffer and guard pages. */ 18 | base_addr = mmap(NULL, (nr_pages + 2) * pagesize, PROT_READ|PROT_WRITE, 19 | MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); 20 | assert(base_addr != (uint8_t *)MAP_FAILED); 21 | start = base_addr + pagesize; 22 | end = start + (nr_pages * pagesize); 23 | 24 | /* Unmap the guard pages. */ 25 | munmap(base_addr, pagesize); 26 | munmap(end, pagesize); 27 | 28 | *start_ret = start; 29 | *end_ret = end; 30 | } 31 | 32 | static void 33 | free_guarded_buffer(uint8_t *start, uint8_t *end) 34 | { 35 | munmap(start, end - start); 36 | } 37 | 38 | /* Fuzz the DEFLATE compression and decompression round trip. */ 39 | int LLVMFuzzerTestOneInput(const uint8_t *in, size_t insize) 40 | { 41 | int level; 42 | bool use_bound; 43 | struct libdeflate_compressor *c; 44 | struct libdeflate_decompressor *d; 45 | size_t csize_avail; 46 | uint8_t *ubuf_start, *ubuf_end, *ubuf; 47 | uint8_t *cbuf_start, *cbuf_end, *cbuf; 48 | uint8_t *dbuf_start, *dbuf_end, *dbuf; 49 | size_t csize; 50 | enum libdeflate_result res; 51 | 52 | if (insize < 2) 53 | return 0; 54 | 55 | level = in[0] % 13; 56 | use_bound = in[1] % 2; 57 | in += 2; 58 | insize -= 2; 59 | 60 | c = libdeflate_alloc_compressor(level); 61 | d = libdeflate_alloc_decompressor(); 62 | 63 | /* Use guard pages to make all input/output buffer overflows segfault */ 64 | 65 | alloc_guarded_buffer(insize, &ubuf_start, &ubuf_end); 66 | ubuf = ubuf_end - insize; 67 | memcpy(ubuf, in, insize); 68 | 69 | csize_avail = use_bound ? 
libdeflate_deflate_compress_bound(c, insize) : 70 | insize; 71 | alloc_guarded_buffer(csize_avail, &cbuf_start, &cbuf_end); 72 | cbuf = cbuf_end - csize_avail; 73 | 74 | alloc_guarded_buffer(insize, &dbuf_start, &dbuf_end); 75 | dbuf = dbuf_end - insize; 76 | 77 | csize = libdeflate_deflate_compress(c, ubuf, insize, cbuf, csize_avail); 78 | if (csize != 0) { 79 | assert(csize <= csize_avail); 80 | memmove(cbuf_end - csize, cbuf, csize); 81 | res = libdeflate_deflate_decompress(d, cbuf_end - csize, csize, 82 | dbuf, insize, NULL); 83 | assert(res == LIBDEFLATE_SUCCESS); 84 | assert(memcmp(in, dbuf, insize) == 0); 85 | } else { 86 | assert(!use_bound); 87 | } 88 | 89 | libdeflate_free_compressor(c); 90 | libdeflate_free_decompressor(d); 91 | free_guarded_buffer(ubuf_start, ubuf_end); 92 | free_guarded_buffer(cbuf_start, cbuf_end); 93 | free_guarded_buffer(dbuf_start, dbuf_end); 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /scripts/libFuzzer/deflate_decompress/corpus/0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebiggers/libdeflate/6bb493615b0ef35c98fc4aa4ec04f448788db6a5/scripts/libFuzzer/deflate_decompress/corpus/0 -------------------------------------------------------------------------------- /scripts/libFuzzer/deflate_decompress/fuzz.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | static void 11 | alloc_guarded_buffer(size_t size, uint8_t **start_ret, uint8_t **end_ret) 12 | { 13 | const size_t pagesize = sysconf(_SC_PAGESIZE); 14 | const size_t nr_pages = (size + pagesize - 1) / pagesize; 15 | uint8_t *base_addr, *start, *end; 16 | 17 | /* Allocate buffer and guard pages. 
*/ 18 | base_addr = mmap(NULL, (nr_pages + 2) * pagesize, PROT_READ|PROT_WRITE, 19 | MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); 20 | assert(base_addr != (uint8_t *)MAP_FAILED); 21 | start = base_addr + pagesize; 22 | end = start + (nr_pages * pagesize); 23 | 24 | /* Unmap the guard pages. */ 25 | munmap(base_addr, pagesize); 26 | munmap(end, pagesize); 27 | 28 | *start_ret = start; 29 | *end_ret = end; 30 | } 31 | 32 | static void 33 | free_guarded_buffer(uint8_t *start, uint8_t *end) 34 | { 35 | munmap(start, end - start); 36 | } 37 | 38 | /* Fuzz DEFLATE decompression. */ 39 | int LLVMFuzzerTestOneInput(const uint8_t *in, size_t insize) 40 | { 41 | size_t outsize_avail = 3 * insize; 42 | uint8_t *cbuf_start, *cbuf_end, *cbuf; 43 | uint8_t *dbuf_start, *dbuf_end, *dbuf; 44 | struct libdeflate_decompressor *d; 45 | 46 | /* Use guard pages to make all input/output buffer overflows segfault */ 47 | 48 | alloc_guarded_buffer(insize, &cbuf_start, &cbuf_end); 49 | cbuf = cbuf_end - insize; 50 | memcpy(cbuf, in, insize); 51 | 52 | alloc_guarded_buffer(outsize_avail, &dbuf_start, &dbuf_end); 53 | dbuf = dbuf_end - outsize_avail; 54 | 55 | d = libdeflate_alloc_decompressor(); 56 | libdeflate_deflate_decompress(d, cbuf, insize, dbuf, outsize_avail, 57 | NULL); 58 | libdeflate_free_decompressor(d); 59 | free_guarded_buffer(cbuf_start, cbuf_end); 60 | free_guarded_buffer(dbuf_start, dbuf_end); 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /scripts/libFuzzer/fuzz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -u -o pipefail 4 | 5 | cd "$(dirname "$0")" 6 | 7 | read -r -a AVAILABLE_TARGETS < <(echo */fuzz.c | sed 's@/fuzz.c@@g') 8 | 9 | usage() 10 | { 11 | cat << EOF 12 | Usage: $0 [OPTION]... FUZZ_TARGET 13 | 14 | Fuzz libdeflate with LLVM's libFuzzer. 
15 | 16 | Options: 17 | --asan Enable AddressSanitizer 18 | --max-len=LEN Maximum length of generated inputs (default: $MAX_LEN) 19 | --msan Enable MemorySanitizer 20 | --time=SECONDS Stop after the given time has passed 21 | --ubsan Enable UndefinedBehaviorSanitizer 22 | 23 | Available fuzz targets: ${AVAILABLE_TARGETS[*]} 24 | EOF 25 | } 26 | 27 | die() 28 | { 29 | echo "$*" 1>&2 30 | exit 1 31 | } 32 | 33 | run_cmd() 34 | { 35 | echo "$*" 36 | "$@" 37 | } 38 | 39 | EXTRA_SANITIZERS= 40 | EXTRA_FUZZER_ARGS=() 41 | MAX_LEN=65536 42 | 43 | longopts_array=( 44 | asan 45 | help 46 | max-len: 47 | msan 48 | time: 49 | ubsan 50 | ) 51 | longopts=$(echo "${longopts_array[@]}" | tr ' ' ',') 52 | 53 | if ! options=$(getopt -o "" -l "$longopts" -- "$@"); then 54 | usage 1>&2 55 | exit 1 56 | fi 57 | eval set -- "$options" 58 | while true; do 59 | case "$1" in 60 | --asan) 61 | EXTRA_SANITIZERS+=",address" 62 | ;; 63 | --help) 64 | usage 65 | exit 0 66 | ;; 67 | --max-len) 68 | MAX_LEN=$2 69 | shift 70 | ;; 71 | --msan) 72 | EXTRA_SANITIZERS+=",memory" 73 | ;; 74 | --time) 75 | EXTRA_FUZZER_ARGS+=("-max_total_time=$2") 76 | shift 77 | ;; 78 | --ubsan) 79 | EXTRA_SANITIZERS+=",undefined" 80 | ;; 81 | --) 82 | shift 83 | break 84 | ;; 85 | *) 86 | echo 1>&2 "Invalid option '$1'" 87 | usage 1>&2 88 | exit 1 89 | esac 90 | shift 91 | done 92 | EXTRA_FUZZER_ARGS+=("-max_len=$MAX_LEN") 93 | 94 | if (( $# != 1 )); then 95 | echo 1>&2 "No fuzz target specified!" 96 | usage 1>&2 97 | exit 1 98 | fi 99 | TARGET=$1 100 | if [ ! -e "$TARGET/fuzz.c" ]; then 101 | echo 1>&2 "'$TARGET' is not a valid fuzz target!" 
102 | usage 1>&2 103 | exit 1 104 | fi 105 | run_cmd clang -g -O1 -fsanitize=fuzzer$EXTRA_SANITIZERS \ 106 | -Wall -Werror -DLIBDEFLATE_ENABLE_ASSERTIONS=1 -I ../../ \ 107 | ../../lib/*{,/*}.c "$TARGET/fuzz.c" -o "$TARGET/fuzz" 108 | run_cmd "$TARGET/fuzz" "${EXTRA_FUZZER_ARGS[@]}" "$TARGET/corpus" 109 | -------------------------------------------------------------------------------- /scripts/libFuzzer/gzip_decompress/corpus/0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebiggers/libdeflate/6bb493615b0ef35c98fc4aa4ec04f448788db6a5/scripts/libFuzzer/gzip_decompress/corpus/0 -------------------------------------------------------------------------------- /scripts/libFuzzer/gzip_decompress/fuzz.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | /* Fuzz gzip decompression. */ 6 | int LLVMFuzzerTestOneInput(const uint8_t *in, size_t insize) 7 | { 8 | size_t outsize_avail = 3 * insize; 9 | uint8_t *out; 10 | struct libdeflate_decompressor *d; 11 | 12 | out = malloc(outsize_avail); 13 | 14 | d = libdeflate_alloc_decompressor(); 15 | libdeflate_gzip_decompress(d, in, insize, out, outsize_avail, NULL); 16 | libdeflate_free_decompressor(d); 17 | free(out); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /scripts/libFuzzer/zlib_decompress/corpus/0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ebiggers/libdeflate/6bb493615b0ef35c98fc4aa4ec04f448788db6a5/scripts/libFuzzer/zlib_decompress/corpus/0 -------------------------------------------------------------------------------- /scripts/libFuzzer/zlib_decompress/fuzz.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | /* Fuzz zlib decompression. 
*/ 6 | int LLVMFuzzerTestOneInput(const uint8_t *in, size_t insize) 7 | { 8 | size_t outsize_avail = 3 * insize; 9 | uint8_t *out; 10 | struct libdeflate_decompressor *d; 11 | 12 | out = malloc(outsize_avail); 13 | 14 | d = libdeflate_alloc_decompressor(); 15 | libdeflate_zlib_decompress(d, in, insize, out, outsize_avail, NULL); 16 | libdeflate_free_decompressor(d); 17 | free(out); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /scripts/toolchain-i686-w64-mingw32.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_SYSTEM_NAME Windows) 2 | set(CMAKE_SYSTEM_PROCESSOR i686) 3 | set(CMAKE_C_COMPILER i686-w64-mingw32-gcc) 4 | set(CMAKE_FIND_ROOT_PATH /usr/i686-w64-mingw32) 5 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 6 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 7 | set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) 8 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 9 | -------------------------------------------------------------------------------- /scripts/toolchain-x86_64-w64-mingw32.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_SYSTEM_NAME Windows) 2 | set(CMAKE_SYSTEM_PROCESSOR x86_64) 3 | set(CMAKE_C_COMPILER x86_64-w64-mingw32-gcc) 4 | set(CMAKE_FIND_ROOT_PATH /usr/x86_64-w64-mingw32) 5 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 6 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 7 | set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) 8 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 9 | --------------------------------------------------------------------------------