├── .github └── workflows │ ├── c-cpp.yml │ └── release.yaml ├── CHANGES.md ├── LICENSE ├── Makefile ├── README.md ├── fuzzing ├── README.md ├── afl_compress_driver.c ├── afl_decompress_driver.c └── afl_roundtrip_driver.c ├── lzw-eddy.c ├── lzw.h ├── package.sh ├── run-tests.sh ├── test-ref.sh └── tests ├── AaAx64.txt ├── abra.txt.lzw ├── atsign.lzw └── zeros80000.lzw /.github/workflows/c-cpp.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: build 3 | 4 | on: 5 | push: 6 | branches: [master] 7 | pull_request: 8 | branches: [master] 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v2 16 | with: 17 | fetch-depth: 0 18 | - name: Build and test 19 | run: make test 20 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: release 3 | 4 | on: 5 | workflow_dispatch: 6 | push: 7 | tags: 8 | - 'v*' 9 | 10 | jobs: 11 | release: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v2 16 | with: 17 | fetch-depth: 0 18 | - name: Build linux package 19 | run: ./package.sh linux 20 | - name: Create release 21 | uses: actions/create-release@v1 22 | id: release 23 | env: 24 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 25 | with: 26 | draft: true 27 | prerelease: false 28 | tag_name: ${{ github.ref }} 29 | release_name: lzw-eddy ${{ github.ref }} 30 | body_path: CHANGES.md 31 | - name: Upload linux package 32 | uses: actions/upload-release-asset@v1 33 | env: 34 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 35 | with: 36 | upload_url: ${{ steps.release.outputs.upload_url }} 37 | asset_path: ./packages/lzw-eddy.linux-x86_64.tar.gz 38 | asset_name: lzw-eddy.linux-x86_64.tar.gz 39 | asset_content_type: application/gzip 40 | 
-------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # LZW single-header library 2 | 3 | Stable releases have even minor version (e.g 1.0.x, 1.2.x), odd numbered minors are development versions. 4 | 5 | ## Release 1.1.0-dev - next 6 | 7 | * Now works on Microsoft's compiler (v19.32.31332) 8 | 9 | ## Release 1.0.0 - 2022-06-19 10 | 11 | * Initial versioned release. 12 | 13 | ## Prior versions 14 | 15 | * For unversioned 'releases' going back to 2020, see the git log. 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, 2021, 2022 Eddy L O Jansson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | OPT=-O3 -fomit-frame-pointer -funroll-loops -fstrict-aliasing -march=native -mtune=native 2 | WARNFLAGS=-Wall -Wextra -Wshadow -Wstrict-aliasing -Wcast-qual -Wcast-align -Wpointer-arith -Wredundant-decls -Wfloat-equal -Wswitch-enum 3 | CWARNFLAGS=-Wstrict-prototypes -Wmissing-prototypes 4 | MISCFLAGS=-fvisibility=hidden -fstack-protector 5 | DEVFLAGS=-ggdb -DDEBUG -D_FORTIFY_SOURCE=3 -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function 6 | 7 | # 8 | # Some architecture specific flags 9 | # 10 | ARCH:=$(shell uname -m) 11 | ifeq ($(ARCH),x86_64) 12 | ARCHFLAGS=-fcf-protection -msse4.2 -mavx 13 | endif 14 | ifeq ($(ARCH),aarch64) 15 | ARCHFLAGS=-mbranch-protection=bti 16 | endif 17 | 18 | AFLCC?=afl-clang-fast 19 | 20 | YELLOW='\033[1;33m' 21 | NC='\033[0m' 22 | 23 | ifdef MEMCHECK 24 | TEST_PREFIX:=valgrind --tool=memcheck --leak-check=full --track-origins=yes 25 | endif 26 | 27 | ifdef PERF 28 | TEST_PREFIX:=perf stat 29 | endif 30 | 31 | # GCC only 32 | ifdef ANALYZER 33 | MISCFLAGS+=-fanalyzer 34 | endif 35 | 36 | # clang only 37 | ifdef SANITIZE 38 | MISCFLAGS+=-fsanitize=memory 39 | endif 40 | 41 | ifdef OPTIMIZED 42 | # On mingw, -static avoids dep on libssp-0.dll when built with -fstack-protector 43 | MISCFLAGS+=-DNDEBUG -Werror -static 44 | else 45 | MISCFLAGS+=$(DEVFLAGS) 46 | endif 47 | 48 | BITWIDTH?=12 49 | 50 | CFLAGS=-std=c11 $(OPT) $(CWARNFLAGS) $(WARNFLAGS) $(ARCHFLAGS) $(MISCFLAGS) -DLZW_MAX_CODE_WIDTH=$(BITWIDTH) 51 | CXXFLAGS=-std=gnu++17 -fno-rtti $(OPT) $(WARNFLAGS) $(ARCHFLAGS) $(MISCFLAGS) 52 | 53 | .PHONY: all clean test fuzz cppcheck backup 54 | 55 | all: lzw-eddy 56 | 57 | FORCE: 58 | 59 | build_const.h: FORCE 60 | @git show-ref --head --hash | head -n 1 | awk '{ printf "const char *build_hash = \"%s\";\n",$$1 }' > $@.tmp 61 | @if test -r $@ ; then \ 62 | (cmp $@.tmp $@ 
&& rm $@.tmp) || mv -f $@.tmp $@ ; \ 63 | else \ 64 | mv $@.tmp $@ ; \ 65 | fi 66 | @if test ! -s $@ ; then \ 67 | echo "Bare build, no build hash available." ; \ 68 | echo 'const char *build_hash = "";' > $@ ; \ 69 | fi 70 | 71 | lzw-eddy: lzw-eddy.c lzw.h build_const.h 72 | $(CC) $(CFLAGS) $< -o $@ 73 | 74 | afl-%: fuzzing/afl_*.c lzw.h 75 | $(AFLCC) $(CFLAGS) -I. fuzzing/afl_$(subst -,_,$*).c -o $@ 76 | 77 | fuzz-%: 78 | $(MAKE) afl-$* 79 | AFL_AUTORESUME=1 AFL_SKIP_CPUFREQ=1 afl-fuzz -m 16 -i tests -o findings -- ./afl-$* 80 | 81 | fuzz: fuzz-roundtrip-driver 82 | 83 | test: lzw-eddy 84 | ${TEST_PREFIX} ./run-tests.sh 85 | 86 | cppcheck: 87 | @cppcheck --verbose --error-exitcode=1 --enable=warning,performance,portability . 88 | 89 | backup: 90 | @echo -e $(YELLOW)Making backup$(NC) 91 | tar -cJf ../$(notdir $(CURDIR))-`date +"%Y-%m"`.tar.xz ../$(notdir $(CURDIR)) 92 | 93 | clean: 94 | @echo -e $(YELLOW)Cleaning$(NC) 95 | rm -f lzw-eddy build_const.h afl-*-driver core core.* 96 | rm -rf packages 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Simple LZW (Lempel-Ziv-Welch) Library, Compressor & Decompressor 3 | 4 | [![License](https://img.shields.io/github/license/eloj/lzw-eddy)](LICENSE) 5 | [![Release](https://img.shields.io/github/release/eloj/lzw-eddy/all.svg)](https://github.com/eloj/lzw-eddy/releases) 6 | [![Build status](https://github.com/eloj/lzw-eddy/workflows/build/badge.svg)](https://github.com/eloj/lzw-eddy/actions/workflows/c-cpp.yml) 7 | 8 | A single-header library and basic headerless compressor and decompressor. Supports variable length codes 9 | between 9 and 12 bits per default, but the upper bound is a compile-time constant that can be adjusted between 10 | 9 and 16 bits. 
11 | 12 | The algorithm implemented by this code was widely distributed in the old MS-DOS days in places 13 | like [Dr.Dobbs](https://marknelson.us/posts/1989/10/01/lzw-data-compression.html) and a popular book on compression, 14 | probably due to its use in GIF. This resulted in it being used in all sorts of places. 15 | 16 | There are many different and mutually incompatible ways to implement the LZW algorithm. 17 | Specifically the code in this repository was written to be [bit-compatible with Puzznic](https://www.giantbomb.com/profile/eloj/blog/technical-notes-on-the-level-format-of-puzznic-for/114881/) (MS-DOS), 18 | and as such does not represent an effort to write "the best" or "most compatible" LZW codec. 19 | 20 | Code developed using the note "[LZW and GIF explained](https://www.eecis.udel.edu/~amer/CISC651/lzw.and.gif.explained.html)" 21 | by Steve Blackstock as a reference. 22 | 23 | All code is provided under the [MIT License](LICENSE). 24 | 25 | ## Features 26 | 27 | * Single-Header Library. 28 | * Fixed memory requirements: 29 | * Uses ~16KiB for state/string table by default. 30 | * At least ~4KiB output buffer recommended, but can go _much_ lower in practice. 31 | * Low stack usage. 32 | * Compressor can be 'short-stroked' to limit decompression buffer size requirement. 33 | * Fast decompression. _Very_ slow compression. 34 | * Releases are: 35 | * [Valgrind](https://valgrind.org/) clean, 36 | * [scan-build](https://clang-analyzer.llvm.org/scan-build.html) clean, and 37 | * [AFL++](https://aflplus.plus/) clean (for some reasonable run-time). 38 | 39 | ## C interface 40 | 41 | ```c 42 | ssize_t lzw_decompress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen); 43 | ssize_t lzw_compress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen); 44 | const char *lzw_strerror(enum lzw_errors errnum); 45 | ``` 46 | 47 | * State must be zero-initialized. 
48 | * The return value is the number of bytes compressed or decompressed into `dest`. Once all input has been processed, `0` is returned. See [example](#example). 49 | * On error, a negative integer is returned. 50 | 51 | All input is assumed to be available at `src`; i.e. it is NOT allowed to switch `src` during encoding/decoding. A function 52 | to 'hand over' state to new input could be added, but I don't have the need. 53 | 54 | ## Security 55 | 56 | I would _not_ recommend using this code in a security-sensitive context. If you expose this code 57 | to possibly adversarial data, beware that you do so at your own risk. 58 | 59 | I make no guarantees, provide no warranties, and may not respond to security issues in a timely manner. 60 | 61 | This code was written simply to allow interoperability with an old MS-DOS game, nothing more. 62 | 63 | Thank you and enjoy. 64 | 65 | ## Usage 66 | 67 | In your code, define `LZW_EDDY_IMPLEMENTATION` and then `#include "lzw.h"`. This will give you a decoder/encoder _specific_ 68 | for 9-12 bit codes, giving a string table of 4096 entries. 69 | 70 | You can optionally define `LZW_MAX_CODE_WIDTH` to a value between 9 and 16 before including the header to 71 | change this compile-time default. Due to the way the dictionary is reconstructed during decompression, 72 | a decoder is only compatible with data generated for the _exact_ same size string table. 73 | 74 | 12-bit codes are probably the sweet spot for most applications. Larger codes mean more bits are needed to 75 | encode newer strings, and because the string table is larger, the dictionary doesn't adapt as fast as it 76 | would if it were smaller. This combination means that a larger table can result in a worse compression ratio. 77 | 78 | The encoder could theoretically be improved to flush or prune the existing string table if few long matches are made over 79 | some window, but no such adaptability is present. 
80 | 81 | ## CLI compressor 82 | 83 | `lzw-eddy` is a simple command-line compressor built using the library. 84 | 85 | ```bash 86 | lzw-eddy 1.1.0-dev <45bf69f1> 87 | Usage: ./lzw-eddy -c file|-d file -o outfile 88 | Compiled Configuration: 89 | LZW_MIN_CODE_WIDTH=9, LZW_MAX_CODE_WIDTH=12, LZW_MAX_CODES=4096, sizeof(lzw_state)=16440 90 | ``` 91 | 92 | You can pass `BITWIDTH=N` (with N between 9 and 16) to `make` to build it with a non-default string table size. 93 | 94 | ```bash 95 | $ make -B BITWIDTH=14 && ./lzw-eddy -c lzw.h -o /dev/null 96 | lzw-eddy 1.1.0-dev <45bf69f1> 97 | Compressing 'lzw.h', 14566 bytes. 98 | 6947 bytes written to output, reduction=52.31% (longest prefix=15). 99 | ``` 100 | 101 | ## Example 102 | 103 | ```c 104 | #define LZW_EDDY_IMPLEMENTATION 105 | #include "lzw.h" 106 | 107 | struct lzw_state state = { 0 }; 108 | 109 | size_t slen = /* length of the compressed data in bytes */; 110 | uint8_t *src = /* pointer to the compressed data */; 111 | uint8_t dest[4096]; 112 | 113 | ssize_t res, written = 0; 114 | while ((res = lzw_decompress(&state, src, slen, dest, sizeof(dest))) > 0) { 115 | // Process `res` bytes of output in `dest`, e.g: 116 | // fwrite(dest, res, 1, outfile); 117 | written += res; 118 | } 119 | if (res == 0) { 120 | printf("%zd bytes successfully decompressed.\n", written); 121 | } else if (res < 0) { 122 | fprintf(stderr, "Decompression error: %s (err:%zd)\n", lzw_strerror(res), res); 123 | } 124 | ``` 125 | 126 | ## Unlikely To Do 127 | 128 | * Add Google Benchmark. 129 | * Use hashing for lookups in `lzw_string_table_lookup`. 130 | * Support changing inputs during processing. 131 | * Gather/Scatter alternative interface. 
132 | -------------------------------------------------------------------------------- /fuzzing/README.md: -------------------------------------------------------------------------------- 1 | 2 | # LZW Fuzzing Drivers 3 | 4 | This directory contains short driver programs optimized for running the 5 | lzw compression and decompression code through a fuzzer, specifically [AFL - American Fuzzy Lop](https://lcamtuf.coredump.cx/afl/). 6 | 7 | ## Prerequisites 8 | 9 | You need to install AFL, or one of its forks. 10 | 11 | The original AFL hasn't seen updates in a long time; [AFL++](https://aflplus.plus/) should also work, but is untested at this time. 12 | 13 | ## Running 14 | 15 | Simply run `make fuzz` or `make fuzz-NAME-driver` (where NAME is `compress`, `decompress` or `roundtrip`) to build the corresponding driver and start AFL. 16 | 17 | NOTE: It's recommended to run fuzzing on /tmp (or another RAM disk), since 18 | AFL does a lot of disk writes. 19 | -------------------------------------------------------------------------------- /fuzzing/afl_compress_driver.c: -------------------------------------------------------------------------------- 1 | /* 2 | Compression console driver for use with afl-fuzz (fast mode) 3 | */ 4 | #include 5 | #include 6 | 7 | #define LZW_EDDY_IMPLEMENTATION 8 | #include "lzw.h" 9 | 10 | /* this lets the source compile without afl-clang-fast/lto */ 11 | #ifndef __AFL_FUZZ_TESTCASE_LEN 12 | 13 | ssize_t fuzz_len; 14 | unsigned char fuzz_buf[1024000]; 15 | 16 | #define __AFL_FUZZ_TESTCASE_LEN fuzz_len 17 | #define __AFL_FUZZ_TESTCASE_BUF fuzz_buf 18 | #define __AFL_FUZZ_INIT() void sync(void); 19 | #define __AFL_LOOP(x) \ 20 | ((fuzz_len = read(0, fuzz_buf, sizeof(fuzz_buf))) > 0 ? 
1 : 0) 21 | #define __AFL_INIT() sync() 22 | #endif 23 | 24 | __AFL_FUZZ_INIT(); 25 | 26 | #ifdef __clang__ 27 | #pragma clang optimize off 28 | #else 29 | #pragma GCC optimize("O0") 30 | #endif 31 | 32 | int main(int argc, char *argv[]) { 33 | struct lzw_state state; 34 | uint8_t dest[2048]; 35 | 36 | uint8_t *input = __AFL_FUZZ_TESTCASE_BUF; 37 | 38 | #ifdef __clang_major__ 39 | while (__AFL_LOOP(1000)) { 40 | #endif 41 | memset(&state, 0, sizeof(state)); 42 | 43 | ssize_t slen = __AFL_FUZZ_TESTCASE_LEN; 44 | if (slen > 0) { 45 | ssize_t res, written = 0; 46 | while ((res = lzw_compress(&state, input, slen, dest, sizeof(dest))) > 0) { 47 | written += res; 48 | } 49 | printf("compressed:%zd (res=%zd)\n", written, res); 50 | } 51 | #ifdef __clang_major__ 52 | } 53 | #endif 54 | return EXIT_SUCCESS; 55 | } 56 | -------------------------------------------------------------------------------- /fuzzing/afl_decompress_driver.c: -------------------------------------------------------------------------------- 1 | /* 2 | Decompression console driver for use with afl-fuzz (fast mode) 3 | */ 4 | #include 5 | #include 6 | 7 | #define LZW_EDDY_IMPLEMENTATION 8 | #include "lzw.h" 9 | 10 | /* this lets the source compile without afl-clang-fast/lto */ 11 | #ifndef __AFL_FUZZ_TESTCASE_LEN 12 | 13 | ssize_t fuzz_len; 14 | unsigned char fuzz_buf[1024000]; 15 | 16 | #define __AFL_FUZZ_TESTCASE_LEN fuzz_len 17 | #define __AFL_FUZZ_TESTCASE_BUF fuzz_buf 18 | #define __AFL_FUZZ_INIT() void sync(void); 19 | #define __AFL_LOOP(x) \ 20 | ((fuzz_len = read(0, fuzz_buf, sizeof(fuzz_buf))) > 0 ? 
1 : 0) 21 | #define __AFL_INIT() sync() 22 | #endif 23 | 24 | __AFL_FUZZ_INIT(); 25 | 26 | #ifdef __clang__ 27 | #pragma clang optimize off 28 | #else 29 | #pragma GCC optimize("O0") 30 | #endif 31 | 32 | int main(int argc, char *argv[]) { 33 | struct lzw_state state; 34 | uint8_t dest[2048]; 35 | 36 | uint8_t *input = __AFL_FUZZ_TESTCASE_BUF; 37 | 38 | #ifdef __clang_major__ 39 | while (__AFL_LOOP(1000)) { 40 | #endif 41 | memset(&state, 0, sizeof(state)); 42 | 43 | ssize_t slen = __AFL_FUZZ_TESTCASE_LEN; 44 | if (slen > 0) { 45 | ssize_t res, written = 0; 46 | while ((res = lzw_decompress(&state, input, slen, dest, sizeof(dest))) > 0) { 47 | written += res; 48 | } 49 | printf("decompressed:%zd (res=%zd)\n", written, res); 50 | } 51 | #ifdef __clang_major__ 52 | } 53 | #endif 54 | return EXIT_SUCCESS; 55 | } 56 | -------------------------------------------------------------------------------- /fuzzing/afl_roundtrip_driver.c: -------------------------------------------------------------------------------- 1 | /* 2 | Roundtrip input->compress->decompress driver for use with afl-fuzz (fast mode) 3 | 4 | This driver takes input, compresses it, then decompresses it, and 5 | then re-compresses it, checking that returned lengths and contents 6 | of input and output buffers agree. 7 | */ 8 | #include 9 | #include 10 | 11 | #define LZW_EDDY_IMPLEMENTATION 12 | #include "lzw.h" 13 | 14 | /* this lets the source compile without afl-clang-fast/lto */ 15 | #ifndef __AFL_FUZZ_TESTCASE_LEN 16 | 17 | ssize_t fuzz_len; 18 | unsigned char fuzz_buf[1024000]; 19 | 20 | #define __AFL_FUZZ_TESTCASE_LEN fuzz_len 21 | #define __AFL_FUZZ_TESTCASE_BUF fuzz_buf 22 | #define __AFL_FUZZ_INIT() void sync(void); 23 | #define __AFL_LOOP(x) \ 24 | ((fuzz_len = read(0, fuzz_buf, sizeof(fuzz_buf))) > 0 ? 
1 : 0) 25 | #define __AFL_INIT() sync() 26 | #endif 27 | 28 | __AFL_FUZZ_INIT(); 29 | 30 | #ifdef __clang__ 31 | #pragma clang optimize off 32 | #else 33 | #pragma GCC optimize("O0") 34 | #endif 35 | 36 | int main(int argc, char *argv[]) { 37 | struct lzw_state statec0; 38 | struct lzw_state stated0; 39 | size_t dest_size = 1UL << 19; // 512KiB 40 | uint8_t *decomp = malloc(dest_size*2); 41 | uint8_t *comp = decomp + dest_size; 42 | 43 | #ifdef __AFL_HAVE_MANUAL_CONTROL 44 | __AFL_INIT(); 45 | #endif 46 | 47 | uint8_t *input = __AFL_FUZZ_TESTCASE_BUF; 48 | 49 | #ifdef __clang_major__ 50 | while (__AFL_LOOP(5000)) { 51 | #endif 52 | memset(&statec0, 0, sizeof(struct lzw_state)); 53 | memset(&stated0, 0, sizeof(struct lzw_state)); 54 | 55 | ssize_t res; 56 | size_t comp_size = 0; 57 | size_t decomp_size = 0; 58 | 59 | size_t slen = __AFL_FUZZ_TESTCASE_LEN; 60 | if (input && slen > 0) { 61 | // Compress input from fuzzer. 62 | while ((res = lzw_compress(&statec0, input, slen, comp, dest_size)) > 0) { comp_size += res; }; 63 | printf("compressed:%zu (res=%zd)\n", comp_size, res); 64 | if (res < 0) { 65 | abort(); 66 | } 67 | 68 | // Decompress the compressed data... 69 | while ((res = lzw_decompress(&stated0, comp, comp_size, decomp, dest_size)) > 0) { decomp_size += res; }; 70 | printf("decompressed:%zu (res=%zd)\n", decomp_size, res); 71 | if (res < 0) { 72 | abort(); 73 | } 74 | 75 | // Verify input size vs decompressed size. 76 | if (slen != decomp_size) { 77 | abort(); 78 | } 79 | 80 | // Compare the decompressed data and the original input; should match obviously. 
81 | int comp0 = memcmp(input, decomp, slen); 82 | if (comp0 != 0) { 83 | abort(); 84 | } 85 | } 86 | 87 | #ifdef __clang_major__ 88 | } 89 | #endif 90 | free(decomp); 91 | return EXIT_SUCCESS; 92 | } 93 | -------------------------------------------------------------------------------- /lzw-eddy.c: -------------------------------------------------------------------------------- 1 | #define LZW_EDDY_IMPLEMENTATION 2 | // #define LZW_MAX_CODE_WIDTH 14 3 | #include "lzw.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "build_const.h" 14 | 15 | static const char *infile; 16 | static const char *outfile; 17 | static int compress = 0; 18 | static size_t maxlen = 0; 19 | 20 | static void print_version(void) { 21 | if (build_hash && *build_hash) { 22 | printf("%s <%.*s>\n", LZW_EDDY_VERSION, 8, build_hash); 23 | } else { 24 | printf("%s\n", LZW_EDDY_VERSION); 25 | } 26 | } 27 | 28 | static void print_banner(void) { 29 | printf("lzw-eddy "); 30 | print_version(); 31 | } 32 | 33 | static int parse_args(int argc, char **argv) { 34 | for (int i=1 ; i < argc ; ++i) { 35 | const char *arg = argv[i]; 36 | // "argv[argc] shall be a null pointer", section 5.1.2.2.1 37 | const char *value = argv[i+1]; 38 | 39 | if (arg && *arg == '-') { 40 | ++arg; 41 | if (value) { 42 | switch (*arg) { 43 | case 'c': 44 | compress = 1; 45 | infile = value; 46 | break; 47 | case 'd': 48 | /* fallthrough */ 49 | case 'x': 50 | compress = 0; 51 | infile = value; 52 | break; 53 | case 'o': 54 | outfile = value; 55 | break; 56 | case 'm': 57 | maxlen = atoi(value); 58 | break; 59 | } 60 | } else { 61 | if (*arg == 'v' || *arg == 'V' || strcmp(arg, "-version") == 0) { 62 | print_version(); 63 | exit(0); 64 | } 65 | } 66 | } 67 | } 68 | 69 | return 0; 70 | } 71 | 72 | static void lzw_compress_file(const char *srcfile, const char *destfile) { 73 | FILE *ifile = fopen(srcfile, "rb"); 74 | 75 | if (!ifile) { 76 | fprintf(stderr, "Error: 
%m\n"); 77 | return; 78 | } 79 | fseek(ifile, 0, SEEK_END); 80 | long slen = ftell(ifile); 81 | fseek(ifile, 0, SEEK_SET); 82 | 83 | printf("Compressing %zu bytes.\n", (size_t)slen); 84 | FILE *ofile = fopen(destfile, "wb"); 85 | if (ofile) { 86 | uint8_t *src = malloc(slen); 87 | if (!src) { 88 | fprintf(stderr, "ERROR: memory allocation of %ld bytes failed.\n", slen); 89 | exit(1); 90 | } 91 | uint8_t dest[4096]; 92 | 93 | struct lzw_state state = { 0 }; 94 | if (maxlen > 0) { 95 | state.longest_prefix_allowed = maxlen; 96 | printf("WARNING: Restricting maximum prefix length to %zu.\n", state.longest_prefix_allowed); 97 | } 98 | 99 | if ((fread(src, slen, 1, ifile) != 1) && (ferror(ifile) != 0)) { 100 | fprintf(stderr, "fread '%s': %s", srcfile, strerror(errno)); 101 | exit(EXIT_FAILURE); 102 | } 103 | 104 | ssize_t res, written = 0; 105 | while ((res = lzw_compress(&state, src, slen, dest, sizeof(dest))) > 0) { 106 | fwrite(dest, res, 1, ofile); 107 | written += res; 108 | } 109 | if (res == 0) { 110 | printf("%zd bytes written to output, reduction=%2.02f%% (longest prefix=%zu).\n", 111 | written, 112 | (1.0f - ((float)written/slen)) * 100.0f, 113 | state.longest_prefix); 114 | } else if (res < 0) { 115 | fprintf(stderr, "Compression returned error: %s (err: %zd)\n", lzw_strerror(res), res); 116 | } 117 | fclose(ofile); 118 | free(src); 119 | } else { 120 | fprintf(stderr, "Error: %m\n"); 121 | } 122 | fclose(ifile); 123 | } 124 | 125 | static void lzw_decompress_file(const char *srcfile, const char *destfile) { 126 | FILE *ifile = fopen(srcfile, "rb"); 127 | 128 | if (!ifile) { 129 | fprintf(stderr, "Error: %m\n"); 130 | return; 131 | } 132 | fseek(ifile, 0, SEEK_END); 133 | long slen = ftell(ifile); 134 | fseek(ifile, 0, SEEK_SET); 135 | 136 | if (slen > 0) { 137 | printf("Decompressing %zu bytes.\n", (size_t)slen); 138 | FILE *ofile = stdout; 139 | if (strcmp(destfile, "-") != 0) { 140 | ofile = fopen(destfile, "wb"); 141 | } 142 | if (ofile) { 143 | uint8_t 
dest[4096]; 144 | size_t dest_len = sizeof(dest); 145 | if (maxlen > 0 && maxlen + 1 < dest_len) { 146 | dest_len = maxlen + 1; 147 | printf("WARNING: Restricting output buffer to %zu bytes.\n", dest_len); 148 | } 149 | uint8_t *src = malloc(slen); 150 | if (!src) { 151 | fprintf(stderr, "ERROR: memory allocation of %ld bytes failed.\n", slen); 152 | exit(1); 153 | } 154 | 155 | if ((fread(src, slen, 1, ifile) != 1) && (ferror(ifile) != 0)) { 156 | fprintf(stderr, "fread '%s': %s", srcfile, strerror(errno)); 157 | exit(EXIT_FAILURE); 158 | } 159 | 160 | struct lzw_state state = { 0 }; 161 | 162 | ssize_t res, written = 0; 163 | // Returns 0 when done, otherwise number of bytes written to destination buffer. On error, < 0. 164 | while ((res = lzw_decompress(&state, src, slen, dest, dest_len)) > 0) { 165 | fwrite(dest, res, 1, ofile); 166 | written += res; 167 | } 168 | if (res == 0) { 169 | printf("%zd bytes written to output, expansion=%2.2f%% (longest prefix=%zu).\n", 170 | written, 171 | ((float)written/slen - 1.0f) * 100.0f, 172 | state.longest_prefix); 173 | } else if (res < 0) { 174 | fprintf(stderr, "Decompression returned error: %s (err: %zd)\n", lzw_strerror(res), res); 175 | } 176 | fclose(ofile); 177 | free(src); 178 | } else { 179 | fprintf(stderr, "Error: %m\n"); 180 | } 181 | } 182 | fclose(ifile); 183 | } 184 | 185 | int main(int argc, char *argv []) { 186 | parse_args(argc, argv); 187 | 188 | print_banner(); 189 | 190 | if (!infile || !outfile) { 191 | printf("Usage: %s -c file|-d file -o outfile\n", argv[0]); 192 | printf("Compiled Configuration:\n LZW_MIN_CODE_WIDTH=%d, LZW_MAX_CODE_WIDTH=%d, LZW_MAX_CODES=%lu, sizeof(lzw_state)=%zu\n", 193 | LZW_MIN_CODE_WIDTH, 194 | LZW_MAX_CODE_WIDTH, 195 | LZW_MAX_CODES, 196 | sizeof(struct lzw_state) 197 | ); 198 | return EXIT_SUCCESS; 199 | } 200 | 201 | if (compress) { 202 | lzw_compress_file(infile, outfile); 203 | } else { 204 | lzw_decompress_file(infile, outfile); 205 | } 206 | 207 | return EXIT_SUCCESS; 
208 | } 209 | -------------------------------------------------------------------------------- /lzw.h: -------------------------------------------------------------------------------- 1 | /* 2 | Variable-length code LZW compressor and decompressor for fixed-memory decoding. 3 | Copyright (c) 2020-2022, Eddy L O Jansson. Licensed under The MIT License. 4 | 5 | See https://github.com/eloj/lzw-eddy 6 | */ 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | #include 12 | #include 13 | 14 | #if defined(_MSC_VER) 15 | #include 16 | typedef SSIZE_T ssize_t; 17 | #else 18 | #include // for ssize_t 19 | #endif 20 | 21 | #define LZW_EDDY_MAJOR_VERSION 1 22 | #define LZW_EDDY_MINOR_VERSION 1 23 | #define LZW_EDDY_PATCH_VERSION 0 24 | #define LZW_EDDY_VERSION "1.1.0-dev" 25 | 26 | #define LZW_MIN_CODE_WIDTH 9 27 | // 9 to 16-bit codes should all work, but 12 is the default for a reason. 28 | // Going beyond 16-bit codes would require code changes. More isn't better either. 29 | #ifndef LZW_MAX_CODE_WIDTH 30 | #define LZW_MAX_CODE_WIDTH 12 31 | #endif 32 | #define LZW_MAX_CODES (1UL << LZW_MAX_CODE_WIDTH) 33 | 34 | enum lzw_errors { 35 | LZW_NOERROR = 0, 36 | LZW_DESTINATION_TOO_SMALL = -1, 37 | LZW_INVALID_CODE_STREAM = -2, 38 | LZW_STRING_TABLE_FULL = -3, 39 | }; 40 | 41 | // This type must be large enough for SYMBOL_BITS + LZW_MAX_CODE_WIDTH*2 bits. 42 | #if LZW_MAX_CODE_WIDTH > 12 43 | typedef uint64_t lzw_node; 44 | #else 45 | typedef uint32_t lzw_node; 46 | #endif 47 | typedef uint32_t bitres_t; 48 | typedef uint16_t code_t; 49 | typedef uint8_t sym_t; 50 | 51 | struct lzw_string_table { 52 | uint32_t code_width; 53 | code_t next_code; 54 | code_t prev_code; 55 | lzw_node node[LZW_MAX_CODES]; // 16K at 12-bit codes. 56 | }; 57 | 58 | struct lzw_state { 59 | struct lzw_string_table tree; 60 | 61 | // If we ever need more of these, change to a flag-word. 
62 | bool was_init; 63 | bool must_reset; 64 | 65 | size_t rptr; 66 | size_t wptr; 67 | // Bit reservoir, need room for LZW_MAX_CODE_WIDTH*2-1 bits. 68 | bitres_t bitres; 69 | uint32_t bitres_len; 70 | 71 | // Tracks the longest prefix used, which is equal to the minimum output buffer required for decompression. 72 | size_t longest_prefix; 73 | // Restrict the longest_prefix to this -- optimize for decode buffer size. 74 | size_t longest_prefix_allowed; 75 | }; 76 | 77 | // Translate error code to message. 78 | const char *lzw_strerror(enum lzw_errors errnum); 79 | 80 | /* 81 | Decompress `slen` bytes from `src` into `dest` of size `dlen`. 82 | 83 | Returns the number of bytes decompressed into `dest`. 84 | Once all input has been consumed, 0 is returned. 85 | On error, a negative integer is returned. 86 | 87 | Neither `src` nor `dest` may be NULL. 88 | 89 | `state` should be zero-initialized. 90 | 91 | `dlen` should be at least 4096 bytes, unless the input is known to 92 | require less. 93 | 94 | `LZW_DESTINATION_TOO_SMALL` will be returned if the output buffer is too small, in which case 95 | you'd have to restart from the beginning with a larger `dest`. 96 | 97 | All that said, even a file consisting of 80K zeros requires only 400 bytes, 98 | so we're being very conservative here. A 'normal' file may need only 128 bytes or so. 99 | */ 100 | ssize_t lzw_decompress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen); 101 | 102 | /* 103 | Compress `slen` bytes from `src` into `dest` of size `dlen`. 104 | 105 | Returns the number of bytes compressed into `dest`. 106 | Once all input has been consumed, 0 is returned. 107 | On error, a negative integer is returned. 108 | 109 | Neither `src` nor `dest` may be NULL. 110 | 111 | `state` should be zero-initialized. 
112 | */ 113 | ssize_t lzw_compress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen); 114 | 115 | #ifdef LZW_EDDY_IMPLEMENTATION 116 | 117 | /* 118 | Variable-length code LZW compressor and decompressor for fixed-memory decoding. 119 | Copyright (c) 2020-2022, Eddy L O Jansson. Licensed under The MIT License. 120 | 121 | See https://github.com/eloj/lzw-eddy 122 | */ 123 | #include 124 | #include 125 | #include 126 | #include 127 | #include 128 | 129 | #define SYMBOL_BITS 8 130 | #define SYMBOL_MASK ((1UL << SYMBOL_BITS)-1) 131 | #define PARENT_BITS LZW_MAX_CODE_WIDTH 132 | #define PARENT_SHIFT SYMBOL_BITS 133 | #define PARENT_MASK ((1UL << PARENT_BITS)-1) 134 | #define PREFIXLEN_BITS LZW_MAX_CODE_WIDTH 135 | #define PREFIXLEN_SHIFT (PARENT_BITS+SYMBOL_BITS) 136 | #define PREFIXLEN_MASK ((1UL << PREFIXLEN_BITS)-1) 137 | 138 | #define CODE_CLEAR (1UL << SYMBOL_BITS) 139 | #define CODE_EOF (CODE_CLEAR+1) 140 | #define CODE_FIRST (CODE_CLEAR+2) 141 | 142 | static_assert((LZW_MAX_CODE_WIDTH >= LZW_MIN_CODE_WIDTH), ""); 143 | static_assert(SYMBOL_BITS <= sizeof(sym_t)*8, "sym_t type too small"); 144 | static_assert((SYMBOL_BITS + PARENT_BITS + PREFIXLEN_BITS) <= sizeof(lzw_node)*8, "lzw_node type too small"); 145 | static_assert((LZW_MAX_CODE_WIDTH*2 - 1) < sizeof(bitres_t)*8, "bitres_t type too small"); 146 | 147 | static inline sym_t lzw_node_symbol(lzw_node node) { 148 | return node & SYMBOL_MASK; 149 | } 150 | 151 | static inline code_t lzw_node_parent(lzw_node node) { 152 | return (node >> PARENT_SHIFT) & PARENT_MASK; 153 | } 154 | 155 | static inline code_t lzw_node_prefix_len(lzw_node node) { 156 | return (node >> PREFIXLEN_SHIFT) & PREFIXLEN_MASK; 157 | } 158 | 159 | static inline lzw_node lzw_make_node(sym_t symbol, code_t parent, code_t len) { 160 | lzw_node node = ((lzw_node)len << PREFIXLEN_SHIFT) | ((lzw_node)parent << PARENT_SHIFT) | symbol; // Widen to lzw_node before shifting: code_t promotes to int, which overflows for large shifts. 161 | return node; 162 | } 163 | 164 | static inline uint32_t mask_from_width(uint32_t width) { 165 | 
return (1UL << width)-1; 166 | } 167 | 168 | static void lzw_reset(struct lzw_state *state) { 169 | state->tree.prev_code = CODE_EOF; 170 | state->tree.next_code = CODE_FIRST; 171 | state->tree.code_width = LZW_MIN_CODE_WIDTH; 172 | state->must_reset = false; 173 | } 174 | 175 | static void lzw_init(struct lzw_state *state) { 176 | for (size_t i=0 ; i < (1UL << SYMBOL_BITS) ; ++i) { 177 | state->tree.node[i] = lzw_make_node((sym_t)i, 0, 0); 178 | } 179 | state->rptr = 0; 180 | state->bitres = 0; 181 | state->bitres_len = 0; 182 | state->was_init = true; 183 | lzw_reset(state); 184 | } 185 | 186 | const char *lzw_strerror(enum lzw_errors errnum) { 187 | const char *errstr = "Unknown error"; 188 | 189 | switch (errnum) { 190 | case LZW_NOERROR: 191 | errstr = "No error"; 192 | break; 193 | case LZW_DESTINATION_TOO_SMALL: 194 | errstr = "Destination buffer too small"; 195 | break; 196 | case LZW_INVALID_CODE_STREAM: 197 | errstr = "Invalid code stream"; 198 | break; 199 | case LZW_STRING_TABLE_FULL: 200 | errstr = "String table full"; 201 | break; 202 | 203 | } 204 | return errstr; 205 | } 206 | 207 | ssize_t lzw_decompress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen) { 208 | if (state->was_init == false) 209 | lzw_init(state); 210 | 211 | // Keep local copies so that we can exit and continue without losing bits. 212 | uint32_t bitres = state->bitres; 213 | uint32_t bitres_len = state->bitres_len; 214 | 215 | uint32_t code = 0; 216 | size_t wptr = 0; 217 | 218 | while (state->rptr < slen) { 219 | // Fill bit-reservoir. 
220 | while ((bitres_len < state->tree.code_width) && (state->rptr < slen)) { 221 | bitres |= src[state->rptr++] << bitres_len; 222 | bitres_len += 8; 223 | } 224 | 225 | state->bitres = bitres; 226 | state->bitres_len = bitres_len; 227 | 228 | if (state->bitres_len < state->tree.code_width) { 229 | return LZW_INVALID_CODE_STREAM; 230 | } 231 | 232 | code = bitres & mask_from_width(state->tree.code_width); 233 | bitres >>= state->tree.code_width; 234 | bitres_len -= state->tree.code_width; 235 | 236 | if (code == CODE_CLEAR) { 237 | if (state->tree.next_code != CODE_FIRST) 238 | lzw_reset(state); 239 | continue; 240 | } else if (code == CODE_EOF) { 241 | break; 242 | } else if (state->must_reset) { 243 | // ERROR: Ran out of space in string table 244 | return LZW_STRING_TABLE_FULL; 245 | } 246 | 247 | if (code <= state->tree.next_code) { 248 | bool known_code = code < state->tree.next_code; 249 | code_t tcode = known_code ? code : state->tree.prev_code; 250 | size_t prefix_len = 1 + lzw_node_prefix_len(state->tree.node[tcode]); 251 | uint8_t symbol = 0; 252 | 253 | assert(prefix_len > 0); 254 | 255 | // Invalid state, invalid input. 256 | if (!known_code && state->tree.prev_code == CODE_EOF) { 257 | return LZW_INVALID_CODE_STREAM; 258 | } 259 | 260 | // Track longest prefix seen. 261 | if (prefix_len > state->longest_prefix) { 262 | state->longest_prefix = prefix_len; 263 | } 264 | 265 | // Check if prefix alone too large for output buffer. User could start over with a larger buffer. 266 | if (prefix_len + (known_code ? 0 : 1) > dlen) { 267 | return LZW_DESTINATION_TOO_SMALL; 268 | } 269 | 270 | // Check if room in output buffer, else return early. 271 | if (wptr + prefix_len + (known_code ? 
0 : 1) > dlen) { 272 | return wptr; 273 | } 274 | 275 | // Write out prefix to destination 276 | for (size_t i=0 ; i < prefix_len ; ++i) { 277 | symbol = lzw_node_symbol(state->tree.node[tcode]); 278 | dest[wptr + prefix_len - 1 - i] = symbol; 279 | tcode = lzw_node_parent(state->tree.node[tcode]); 280 | } 281 | wptr += prefix_len; 282 | 283 | // Add the first character of the prefix as a new code with prev_code as the parent. 284 | if (state->tree.prev_code != CODE_EOF) { 285 | if (!known_code) { 286 | assert(code == state->tree.next_code); 287 | assert(wptr < dlen); 288 | dest[wptr++] = symbol; // Special case for new codes. 289 | } 290 | 291 | state->tree.node[state->tree.next_code] = lzw_make_node(symbol, state->tree.prev_code, 1 + lzw_node_prefix_len(state->tree.node[state->tree.prev_code])); 292 | 293 | // TODO: Change to == 294 | if (state->tree.next_code >= mask_from_width(state->tree.code_width)) { 295 | if (state->tree.code_width == LZW_MAX_CODE_WIDTH) { 296 | // Out of bits in code, next code MUST be a reset! 297 | state->must_reset = true; 298 | state->tree.prev_code = code; 299 | continue; 300 | } 301 | ++state->tree.code_width; 302 | } 303 | state->tree.next_code++; 304 | } 305 | state->tree.prev_code = code; 306 | } else { 307 | // Desynchronized, probably corrupt/invalid input. 308 | return LZW_INVALID_CODE_STREAM; 309 | } 310 | } 311 | return wptr; 312 | } 313 | 314 | static bool lzw_string_table_lookup(struct lzw_state *state, uint8_t *prefix, size_t len, code_t *code) { 315 | // printf("Looking up prefix '%.*s' from %p to %p (len=%zu)\n", (int)(len), prefix, prefix, prefix+len, len); 316 | assert (len > 0); 317 | 318 | if (len == 1) { 319 | *code = state->tree.node[prefix[0]]; 320 | return true; 321 | } 322 | 323 | // PERF: This is slow, we should store an array of hashes to use as an initial comparison before walking the tree. 324 | // NOTE: It's imperative that we search newest to oldest. 
When limiting the prefix length, we'll 325 | // end up with duplicate prefixes, and only the newest code is valid for the decoder to stay in sync. 326 | for (size_t i=state->tree.next_code - 1 ; i >= CODE_FIRST ; --i) { 327 | assert(i < LZW_MAX_CODES); 328 | lzw_node node = state->tree.node[i]; 329 | 330 | if (len - 1 == lzw_node_prefix_len(node)) { 331 | for (size_t j=0 ; j < len ; ++j) { 332 | if (prefix[len-j-1] != lzw_node_symbol(node)) { 333 | break; 334 | } 335 | if (lzw_node_prefix_len(node) == 0) { 336 | *code = (code_t)i; 337 | assert(j == len - 1); 338 | return true; 339 | } 340 | node = state->tree.node[lzw_node_parent(node)]; 341 | } 342 | } 343 | } 344 | 345 | return false; 346 | } 347 | 348 | inline static void lzw_output_code(struct lzw_state *state, code_t code) { 349 | assert(state->bitres_len + state->tree.code_width <= sizeof(bitres_t)*8); // maybe increase size of bitres_t? 350 | state->bitres |= code << state->bitres_len; 351 | state->bitres_len += state->tree.code_width; 352 | state->tree.prev_code = code; 353 | 354 | // printf("\n", code, state->tree.code_width, state->bitres_len, sizeof(bitres_t)*8, state->bitres); 355 | } 356 | 357 | static void lzw_flush_reservoir(struct lzw_state *state, uint8_t *dest, bool final) { 358 | // SECURITY: We assume we have enough space left in dest! 359 | 360 | // Write codes to output. 
361 | while (state->bitres_len >= 8) { 362 | dest[state->wptr++] = state->bitres & 0xFF; 363 | state->bitres >>= 8; 364 | state->bitres_len -= 8; 365 | // printf("DEBUG: Flushed: %02x, reservoir:%02d/%zu:%02x\n", dest[state->wptr-1], state->bitres_len, sizeof(bitres_t)*8, state->bitres); 366 | } 367 | 368 | if (final && state->bitres_len > 0) { 369 | // printf("DEBUG: Flushing last %d bits.\n", state->bitres_len); 370 | dest[state->wptr++] = state->bitres; 371 | state->bitres = 0; 372 | state->bitres_len = 0; 373 | // printf("DEBUG: Flushed: %02x, reservoir:%02d/%zu:%02x\n", dest[state->wptr-1], state->bitres_len, sizeof(bitres_t)*8, state->bitres); 374 | } 375 | } 376 | 377 | ssize_t lzw_compress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen) { 378 | if (state->was_init == false) { 379 | lzw_init(state); 380 | lzw_output_code(state, CODE_CLEAR); 381 | } 382 | 383 | code_t code = CODE_EOF; 384 | size_t prefix_end = 0; 385 | state->wptr = 0; 386 | 387 | while (state->rptr + prefix_end < slen) { 388 | // Ensure we have enough space for flushing codes. 389 | if (state->wptr + (state->tree.code_width >> 3) + 1 + 2 + 2 > dlen) { // Also reserve bits for worst-case 16-bit CLEAR + EOF code 390 | return state->wptr; 391 | } 392 | 393 | ++prefix_end; 394 | // lookup prefix in string table 395 | bool overlong = ((state->longest_prefix_allowed > 0) && (prefix_end >= state->longest_prefix_allowed)); 396 | bool existing_code = lzw_string_table_lookup(state, src + state->rptr, prefix_end, &code); 397 | if (!existing_code || overlong) { 398 | assert(code != CODE_CLEAR); 399 | assert(code != CODE_EOF); 400 | 401 | uint8_t symbol = src[state->rptr + prefix_end - 1]; 402 | code_t parent = code; 403 | code_t parent_len = 1 + lzw_node_prefix_len(state->tree.node[parent]); 404 | 405 | // Output code _before_ we potentially change the bit-width. 406 | lzw_output_code(state, parent); 407 | 408 | // Handle code width expansion. 
409 | if (state->tree.next_code == (1UL << state->tree.code_width) 410 | #if LZW_MAX_CODE_WIDTH == 16 411 | || (state->tree.next_code == LZW_MAX_CODES - 1) /* special case for wrapping on 16-bit code_t */ 412 | #endif 413 | ) { 414 | if (state->tree.code_width < LZW_MAX_CODE_WIDTH) { 415 | // printf("DEBUG: Expanding bitwidth to %d\n", state->tree.code_width + 1); 416 | ++state->tree.code_width; 417 | } else { 418 | // printf("DEBUG: Max code-width reached -- Issuing clear/reset\n"); 419 | lzw_flush_reservoir(state, dest, false); 420 | lzw_output_code(state, CODE_CLEAR); 421 | lzw_reset(state); 422 | lzw_flush_reservoir(state, dest, false); 423 | state->tree.next_code = CODE_EOF; // XXX: Required for compatibility with puzznic. 424 | } 425 | } 426 | 427 | assert(state->tree.next_code < LZW_MAX_CODES); 428 | // printf("New prefix from src[%zu], adding symbol '%c' (%02x) as code %d /w parent %d\n", state->rptr + prefix_end, symbol, symbol, state->tree.next_code, parent); 429 | state->tree.node[state->tree.next_code++] = lzw_make_node(symbol, parent, parent_len); 430 | 431 | if (parent_len > state->longest_prefix) { 432 | state->longest_prefix = parent_len; 433 | } 434 | 435 | state->rptr += parent_len; 436 | prefix_end = 0; 437 | 438 | lzw_flush_reservoir(state, dest, false); 439 | } 440 | } 441 | if (prefix_end != 0) { 442 | // printf("DEBUG: Last prefix existed, writing existing code %d to stream\n", code); 443 | lzw_output_code(state, code); 444 | lzw_flush_reservoir(state, dest, false); 445 | state->rptr += prefix_end; 446 | prefix_end = 0; 447 | } 448 | 449 | // WARN: Problem with this is that we can't chain encodes, add 'final' flag to compression call? 450 | // NIGHTMARE: Handle zero-input 451 | if ((state->rptr + prefix_end == slen && state->tree.prev_code != CODE_EOF) 452 | // This happens to be true if we're called with slen=0, but only the first time as we now flush the bits. 
453 | || (state->wptr == 0 && state->bitres_len > 0)) { 454 | lzw_output_code(state, CODE_EOF); 455 | lzw_flush_reservoir(state, dest, true); 456 | } 457 | 458 | // if we didn't write anything, there shouldn't be any bits left in reservoir. 459 | assert(!(state->wptr == 0 && state->bitres_len > 0)); 460 | 461 | // printf("DEBUG: Returning %zu bytes written to caller.\n", state->wptr); 462 | 463 | return state->wptr; 464 | } 465 | #endif // LZW_EDDY_IMPLEMENTATION 466 | 467 | #ifdef __cplusplus 468 | } 469 | #endif 470 | -------------------------------------------------------------------------------- /package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Create a release package for a specific OS. 4 | # 5 | PROJECT=lzw-eddy 6 | OS=$1 7 | ARCH=${2:-`uname -m`} 8 | BP="packages/${OS}" 9 | RP="${BP}/${PROJECT}" 10 | FILES='lzw-eddy lzw.h LICENSE' 11 | 12 | if [ -z "${OS}" ]; then 13 | echo "Usage: $0 " 14 | exit 1 15 | fi 16 | 17 | if [[ "${OS}" == "linux" || "${OS}" == "msys2" ]]; then 18 | echo "Packaging for ${OS}-${ARCH}" 19 | OPTIMIZED=1 make -B lzw-eddy 20 | if [ $? -ne 0 ]; then 21 | echo "Build failed, packaging aborted." 22 | exit 1 23 | fi 24 | mkdir -p ${RP} 25 | cp ${FILES} ${RP} 26 | tar -C ${BP} -czvf packages/${PROJECT}.${OS}-${ARCH}.tar.gz ${PROJECT} 27 | if [ "${OS}" == "msys2" ]; then 28 | pushd . 29 | cd ${RP} && 7z -bd a ../../${PROJECT}.${OS}-${ARCH}.7z * 30 | popd 31 | fi 32 | rm -r ${BP} 33 | else 34 | echo "Unknown target OS: ${OS}" 35 | exit 2 36 | fi 37 | 38 | echo "Done." 
--------------------------------------------------------------------------------
/run-tests.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Regression tests for the lzw-eddy command-line tool. Exits non-zero on the
# first failure.
#
TMPFILED=$(mktemp)
TMPFILEC=$(mktemp)
trap 'rm -f "$TMPFILED" "$TMPFILEC"' EXIT
set -e

# rep COUNT STRING
# Print STRING COUNT times, with no separator. Prints nothing when COUNT is 0.
rep() {
	C=${1:-0}
	S=${2:-A}
	if [ "$C" -gt 0 ]; then
		printf "%.0s$S" $(seq 1 $C)
	fi
}

# testfile INFILE EOPT HASHD [HASHC]
# Decompress INFILE and compare the result against sha256 HASHD, then
# recompress it (passing the extra options in EOPT) and compare against HASHC,
# which defaults to the hash of INFILE itself.
function testfile {
	INFILE=$1
	EOPT=$2
	HASHD=$3
	HASHC=${4:-$(sha256sum -b "$INFILE" | cut -f 1 -d ' ')}
	# The commands are tested directly: with 'set -e' active, a separate
	# '$?' check after a failing command would never be reached.
	if ! ./lzw-eddy -d "$INFILE" -o "$TMPFILED"; then
		echo "TEST FAIL. (Decompression error)"
		exit 1
	fi
	if [ "$(sha256sum -b "$TMPFILED")" != "$HASHD *$TMPFILED" ]; then
		echo "Test failed -- Decompressed hash mismatch."
		exit 1
	fi
	# EOPT is deliberately unquoted so that e.g. "-m 254" becomes two arguments.
	if ! ./lzw-eddy $EOPT -c "$TMPFILED" -o "$TMPFILEC"; then
		echo "TEST FAIL. (Compression error)"
		exit 1
	fi
	if [ "$(sha256sum -b "$TMPFILEC")" != "$HASHC *$TMPFILEC" ]; then
		echo "Test failed. -- Compressed hash mismatch"
		exit 1
	fi
}

# testcheck INFILE
# Compress INFILE, decompress the result, and verify that the round trip
# reproduces the original content.
function testcheck {
	INFILE=$1
	HASHD=$(sha256sum "$INFILE" | cut -f 1 -d ' ')
	./lzw-eddy -c "$INFILE" -o "$TMPFILEC"
	./lzw-eddy -d "$TMPFILEC" -o "$TMPFILED"
	HASHC=$(sha256sum "$TMPFILED" | cut -f 1 -d ' ')
	if [ "$HASHD" != "$HASHC" ]; then
		echo "Test failed. -- Round-trip hash mismatch"
		exit 1
	fi
}

testfile tests/atsign.lzw "" c3641f8544d7c02f3580b07c0f9887f0c6a27ff5ab1d4a3e29caf197cfc299ae
testfile tests/abra.txt.lzw "" 3119a48c6843ee7dcc08312e97b1d8e3b241b082996afe761f8a045d493b7cef
testfile tests/zeros80000.lzw "" f8c784aa6b57396e7c5e094c34d079d8252473e46e2f60593a921dbebf941fcc
testfile tests/zeros80000.lzw "-m 254" f8c784aa6b57396e7c5e094c34d079d8252473e46e2f60593a921dbebf941fcc 58e6c0321a62f7f112e61dca779edc9be0e34cf0ee356f61df949c7aea839492
testcheck lzw.h
rep 65536 AaA >"$TMPFILED"
testcheck "$TMPFILED"
echo "All tests passed."
--------------------------------------------------------------------------------
/test-ref.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Script that verifies that our reference files decompress and recompress to the exact original bitstream.
# These files can not be redistributed though.
#
# Usage: test-ref.sh [datadir]   (datadir defaults to "tests")
#
DATA=${1:-tests}
if [[ ! -f "${DATA}/levelmp1.map" ]] || [[ ! -f "${DATA}/extra.puz" ]]; then
	echo "Sorry, you don't seem to have the required reference files."
	exit 1
fi
# Do not go on to test a stale binary if the build fails.
if ! make; then
	echo "Build failed."
	exit 1
fi
test "$(sha256sum -b "${DATA}/levelmp1.map" | cut -f 1 -d ' ')" = "8b57ee1e373c926182e47afd3d97477c07f98ad6dde076cdf3c3f703f250d46c" || { echo "Invalid input: hash mismatch for levelmp1.map"; exit 1; }
test "$(sha256sum -b "${DATA}/extra.puz" | cut -f 1 -d ' ')" = "f397e1e1b58d02ca6f469c8af0f5e50620621f267f48cb71af545f77d550607a" || { echo "Invalid input: hash mismatch for extra.puz"; exit 1; }
echo "Input verified OK."
# Each reference file: decompress, check the plain data, recompress, check the bitstream.
./lzw-eddy -d "${DATA}/levelmp1.map" -o .decomp1
test "$(sha256sum -b .decomp1 | cut -c1-64)" = "7183a6828de608f69563bff78eec25f9c86053d0ab1e36c6f998853717292f5c" || { echo "Decompression error: hash mismatch for levelmp1.decompressed"; exit 1; }
./lzw-eddy -c .decomp1 -o .comp1 >/dev/null
test "$(sha256sum -b .comp1)" = "8b57ee1e373c926182e47afd3d97477c07f98ad6dde076cdf3c3f703f250d46c *.comp1" || { echo "Compression error: hash mismatch for levelmp1.map"; exit 1; }
./lzw-eddy -d "${DATA}/extra.puz" -o .decomp1
test "$(sha256sum -b .decomp1 | cut -c1-64)" = "d1ce310a2496af792c0c527b86c7e0dc17c14e8eb508d3472a894845e15b1c99" || { echo "Decompression error: hash mismatch for extra.decompressed"; exit 1; }
./lzw-eddy -c .decomp1 -o .comp1 >/dev/null
test "$(sha256sum -b .comp1)" = "f397e1e1b58d02ca6f469c8af0f5e50620621f267f48cb71af545f77d550607a *.comp1" || { echo "Compression error: hash mismatch for extra.puz"; exit 1; }
echo "All Good!"
24 | rm .decomp1 .comp1 25 | -------------------------------------------------------------------------------- /tests/AaAx64.txt: -------------------------------------------------------------------------------- 1 | AaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaA -------------------------------------------------------------------------------- /tests/abra.txt.lzw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eloj/lzw-eddy/facf92c16c83f041369a960e84dc6998ab730517/tests/abra.txt.lzw -------------------------------------------------------------------------------- /tests/atsign.lzw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eloj/lzw-eddy/facf92c16c83f041369a960e84dc6998ab730517/tests/atsign.lzw -------------------------------------------------------------------------------- /tests/zeros80000.lzw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eloj/lzw-eddy/facf92c16c83f041369a960e84dc6998ab730517/tests/zeros80000.lzw --------------------------------------------------------------------------------