├── .github
    └── workflows
    │   ├── c-cpp.yml
    │   └── release.yaml
├── CHANGES.md
├── LICENSE
├── Makefile
├── README.md
├── fuzzing
    ├── README.md
    ├── afl_compress_driver.c
    ├── afl_decompress_driver.c
    └── afl_roundtrip_driver.c
├── lzw-eddy.c
├── lzw.h
├── package.sh
├── run-tests.sh
├── test-ref.sh
└── tests
    ├── AaAx64.txt
    ├── abra.txt.lzw
    ├── atsign.lzw
    └── zeros80000.lzw


/.github/workflows/c-cpp.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: build  # CI: build the project and run its test suite
 3 | 
 4 | on:
 5 |   push:
 6 |     branches: [master]
 7 |   pull_request:
 8 |     branches: [master]
 9 | 
10 | jobs:
11 |   build:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - name: Checkout
15 |         uses: actions/checkout@v4
16 |         with:
17 |           fetch-depth: 0  # 0 = fetch full history instead of a shallow clone
18 |       - name: Build and test
19 |         run: make test  # the Makefile `test` target builds lzw-eddy, then runs run-tests.sh
20 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: release  # Build the linux package and publish it as a draft GitHub release
 3 | 
 4 | on:
 5 |   workflow_dispatch:
 6 |   push:
 7 |     tags:
 8 |       - 'v*'
 9 | 
10 | jobs:
11 |   release:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - name: Checkout
15 |         uses: actions/checkout@v4
16 |         with:
17 |           fetch-depth: 0  # 0 = fetch full history instead of a shallow clone
18 |       - name: Build linux package
19 |         run: ./package.sh linux
20 |       - name: Create release
21 |         uses: actions/create-release@v1
22 |         id: release  # referenced below as steps.release.outputs.upload_url
23 |         env:
24 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
25 |         with:
26 |           draft: true
27 |           prerelease: false
28 |           tag_name: ${{ github.ref_name }}  # short ref (v1.0.0), not refs/tags/v1.0.0
29 |           release_name: lzw-eddy ${{ github.ref_name }}
30 |           body_path: CHANGES.md
31 |       - name: Upload linux package
32 |         uses: actions/upload-release-asset@v1
33 |         env:
34 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
35 |         with:
36 |           upload_url: ${{ steps.release.outputs.upload_url }}
37 |           asset_path: ./packages/lzw-eddy.linux-x86_64.tar.gz
38 |           asset_name: lzw-eddy.linux-x86_64.tar.gz
39 |           asset_content_type: application/gzip
40 | 


--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
 1 | # LZW single-header library
 2 | 
 3 | Stable releases have an even minor version (e.g. 1.0.x, 1.2.x); odd-numbered minors are development versions.
 4 | 
 5 | ## Release 1.1.0-dev - next
 6 | 
 7 | * Now works on Microsoft's compiler (v19.32.31332)
 8 | 
 9 | ## Release 1.0.0 - 2022-06-19
10 | 
11 | * Initial versioned release.
12 | 
13 | ## Prior versions
14 | 
15 | * For unversioned 'releases' going back to 2020, see the git log.
16 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020, 2021, 2022 Eddy L O Jansson
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Build rules for lzw-eddy: flag groups and build-mode switches first, then the targets.
 2 | OPT=-O3 -fomit-frame-pointer -funroll-loops -fstrict-aliasing -march=native -mtune=native
 3 | WARNFLAGS=-Wall -Wextra -Wshadow -Wstrict-aliasing -Wcast-qual -Wcast-align -Wpointer-arith -Wredundant-decls -Wfloat-equal -Wswitch-enum
 4 | CWARNFLAGS=-Wstrict-prototypes -Wmissing-prototypes
 5 | MISCFLAGS=-fvisibility=hidden -fstack-protector
 6 | DEVFLAGS=-ggdb -DDEBUG -D_FORTIFY_SOURCE=3 -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function
 7 | 
 8 | #
 9 | # Some architecture specific flags
10 | #
11 | ARCH:=$(shell uname -m)
12 | ifeq ($(ARCH),x86_64)
13 | ARCHFLAGS=-fcf-protection -msse4.2 -mavx
14 | endif
15 | ifeq ($(ARCH),aarch64)
16 | ARCHFLAGS=-mbranch-protection=bti
17 | endif
18 | 
19 | AFLCC?=afl-clang-fast
20 | 
21 | YELLOW='\033[1;33m'
22 | NC='\033[0m'
23 | 
24 | # MEMCHECK / PERF wrap the test run in valgrind or perf (PERF wins if both are set).
25 | ifdef MEMCHECK
26 | 	TEST_PREFIX:=valgrind --tool=memcheck --leak-check=full --track-origins=yes
27 | endif
28 | 
29 | ifdef PERF
30 | 	TEST_PREFIX:=perf stat
31 | endif
32 | 
33 | # GCC only
34 | ifdef ANALYZER
35 | 	MISCFLAGS+=-fanalyzer
36 | endif
37 | 
38 | # clang only
39 | ifdef SANITIZE
40 | 	MISCFLAGS+=-fsanitize=memory
41 | endif
42 | 
43 | ifdef OPTIMIZED
44 | # On mingw, -static avoids dep on libssp-0.dll when built with -fstack-protector
45 | 	MISCFLAGS+=-DNDEBUG -Werror -static
46 | else
47 | 	MISCFLAGS+=$(DEVFLAGS)
48 | endif
49 | 
50 | BITWIDTH?=12
51 | 
52 | CFLAGS=-std=c11 $(OPT) $(CWARNFLAGS) $(WARNFLAGS) $(ARCHFLAGS) $(MISCFLAGS) -DLZW_MAX_CODE_WIDTH=$(BITWIDTH)
53 | CXXFLAGS=-std=gnu++17 -fno-rtti $(OPT) $(WARNFLAGS) $(ARCHFLAGS) $(MISCFLAGS)
54 | 
55 | # Every target that does not name a file it produces is declared phony.
56 | .PHONY: all clean test fuzz cppcheck backup
57 | 
58 | all: lzw-eddy
59 | 
60 | FORCE:
61 | 
62 | # Regenerated on every run, but the file is only replaced when its content changed.
63 | build_const.h: FORCE
64 | 	@git show-ref --head --hash | head -n 1 | awk '{ printf "const char *build_hash = \"%s\";\n",$$1 }' > $@.tmp
65 | 	@if test -r $@ ; then \
66 | 		(cmp $@.tmp $@ && rm $@.tmp) || mv -f $@.tmp $@ ; \
67 | 	else \
68 | 		mv $@.tmp $@ ; \
69 | 	fi
70 | 	@if test ! -s $@ ; then \
71 | 		echo "Bare build, no build hash available." ; \
72 | 		echo 'const char *build_hash = "";' > $@ ; \
73 | 	fi
74 | 
75 | lzw-eddy: lzw-eddy.c lzw.h build_const.h
76 | 	$(CC) $(CFLAGS) $< -o $@
77 | 
78 | afl-%: fuzzing/afl_*.c lzw.h
79 | 	$(AFLCC) $(CFLAGS) -I. fuzzing/afl_$(subst -,_,$*).c -o $@
80 | 
81 | # $(MAKE) instead of a bare `make`, so command-line variables and flags reach the sub-make.
82 | fuzz-%:
83 | 	$(MAKE) afl-$*
84 | 	AFL_AUTORESUME=1 AFL_SKIP_CPUFREQ=1 afl-fuzz -m 16 -i tests -o findings -- ./afl-$*
85 | 
86 | fuzz: fuzz-roundtrip-driver
87 | 
88 | test: lzw-eddy
89 | 	${TEST_PREFIX} ./run-tests.sh
90 | 
91 | cppcheck:
92 | 	@cppcheck --verbose --error-exitcode=1 --enable=warning,performance,portability .
93 | 
94 | # printf '%b' is used for the colour escapes; `echo -e` is not portable across /bin/sh implementations.
95 | backup:
96 | 	@printf '%b\n' $(YELLOW)"Making backup"$(NC)
97 | 	tar -cJf ../$(notdir $(CURDIR))-`date +"%Y-%m"`.tar.xz ../$(notdir $(CURDIR))
98 | 
99 | clean:
100 | 	@printf '%b\n' $(YELLOW)"Cleaning"$(NC)
101 | 	rm -f lzw-eddy build_const.h afl-*-driver core core.*
102 | 	rm -rf packages
97 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Simple LZW (Lempel-Ziv-Welch) Library, Compressor & Decompressor
  3 | 
  4 | [![License](https://img.shields.io/github/license/eloj/lzw-eddy)](LICENSE)
  5 | [![Release](https://img.shields.io/github/release/eloj/lzw-eddy/all.svg)](https://github.com/eloj/lzw-eddy/releases)
  6 | [![Build status](https://github.com/eloj/lzw-eddy/workflows/build/badge.svg)](https://github.com/eloj/lzw-eddy/actions/workflows/c-cpp.yml)
  7 | 
  8 | A single-header library and basic headerless compressor and decompressor. Supports variable length codes
  9 | between 9 and 12 bits by default, but the upper bound is a compile-time constant that can be adjusted between
 10 | 9 and 16 bits.
 11 | 
 12 | The algorithm implemented by this code was widely distributed in the old MS-DOS days in places
 13 | like [Dr.Dobbs](https://marknelson.us/posts/1989/10/01/lzw-data-compression.html) and a popular book on compression,
 14 | probably due to its use in GIF. This resulted in it being used in all sorts of places.
 15 | 
 16 | There are many different and mutually incompatible ways to implement the LZW algorithm.
 17 | Specifically the code in this repository was written to be [bit-compatible with Puzznic](https://www.giantbomb.com/profile/eloj/blog/technical-notes-on-the-level-format-of-puzznic-for/114881/) (MS-DOS),
 18 | and as such does not represent an effort to write "the best" or "most compatible" LZW codec.
 19 | 
 20 | Code developed using the note "[LZW and GIF explained](https://www.eecis.udel.edu/~amer/CISC651/lzw.and.gif.explained.html)"
 21 | by Steve Blackstock as a reference.
 22 | 
 23 | All code is provided under the [MIT License](LICENSE).
 24 | 
 25 | ## Features
 26 | 
 27 | * Single-Header Library.
 28 | * Fixed memory requirements:
 29 | 	* Uses ~16KiB for state/string table by default.
 30 | 	* At least ~4KiB output buffer recommended, but can go _much_ lower in practice.
 31 | 	* Low stack usage.
 32 | * Compressor can be 'short-stroked' to limit decompression buffer size requirement.
 33 | * Fast decompression. _Very_ slow compression.
 34 | * Releases are:
 35 | 	* [Valgrind](https://valgrind.org/) clean,
 36 | 	* [scan-build](https://clang-analyzer.llvm.org/scan-build.html) clean, and
 37 | 	* [AFL++](https://aflplus.plus/) clean (for some reasonable run-time).
 38 | 
 39 | ## C interface
 40 | 
 41 | ```c
 42 | ssize_t lzw_decompress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen);
 43 | ssize_t lzw_compress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen);
 44 | const char *lzw_strerror(enum lzw_errors errnum);
 45 | ```
 46 | 
 47 | * State must be zero-initialized.
 48 | * The return value is the number of bytes compressed or decompressed into `dest`. Once all input has been processed, `0` is returned. See [example](#example).
 49 | * On error, a negative integer is returned.
 50 | 
 51 | All input is assumed to be available at `src`; i.e. it is NOT allowed to switch `src` during encoding/decoding. A function
 52 | to 'hand over' state to new input could be added, but I don't have the need.
 53 | 
 54 | ## Security
 55 | 
 56 | I would _not_ recommend using this code in a security-sensitive context. If you expose this code
 57 | to possibly adversarial data, beware that you do so at your own risk.
 58 | 
 59 | I make no guarantees, provide no warranties, and may not respond to security issues in a timely manner.
 60 | 
 61 | This code was written simply to allow interoperability with an old MS-DOS game, nothing more.
 62 | 
 63 | Thank you and enjoy.
 64 | 
 65 | ## Usage
 66 | 
 67 | In your code, define `LZW_EDDY_IMPLEMENTATION` and then `#include "lzw.h"`. This will give you a decoder/encoder _specific_
 68 | for 9-12 bit codes, giving a string table of 4096 entries.
 69 | 
 70 | You can optionally define `LZW_MAX_CODE_WIDTH` to a value between 9 and 16 before including the header to
 71 | change this compile-time default. Due to the way the dictionary is reconstructed during decompression,
 72 | a decoder is only compatible with data generated for the _exact_ same size string table.
 73 | 
 74 | 12-bit codes are probably the sweet spot for most applications. Larger codes mean more bits are needed to
 75 | encode newer strings, and because the string table is larger, the dictionary doesn't adapt as fast as it
 76 | would if it was smaller. This combination means that a larger table can result in worse compression ratio.
 77 | 
 78 | The encoder could theoretically be improved to flush or prune the existing string table if few long matches are made over
 79 | some window, but no such adaptability is present.
 80 | 
 81 | ## CLI compressor
 82 | 
 83 | `lzw-eddy` is a simple command-line compressor built using the library.
 84 | 
 85 | ```bash
 86 | lzw-eddy 1.1.0-dev <45bf69f1>
 87 | Usage: ./lzw-eddy -c file|-d file -o outfile
 88 | Compiled Configuration:
 89 |  LZW_MIN_CODE_WIDTH=9, LZW_MAX_CODE_WIDTH=12, LZW_MAX_CODES=4096, sizeof(lzw_state)=16440
 90 | ```
 91 | 
 92 | You can pass BITWIDTH=\<num\> to build it with a non-default string table size.
 93 | 
 94 | ```bash
 95 | $ make -B BITWIDTH=14 && ./lzw-eddy -c lzw.h -o /dev/null
 96 | lzw-eddy 1.1.0-dev <45bf69f1>
 97 | Compressing 'lzw.h', 14566 bytes.
 98 | 6947 bytes written to output, reduction=52.31% (longest prefix=15).
 99 | ```
100 | 
101 | ## Example
102 | 
103 | ```c
104 | 	#define LZW_EDDY_IMPLEMENTATION
105 | 	#include "lzw.h"
106 | 
107 | 	struct lzw_state state = { 0 };
108 | 
109 | 	size_t slen = <length of compressed data>;
110 | 	uint8_t *src = <compressed data>;
111 | 	uint8_t dest[4096];
112 | 
113 | 	ssize_t res, written = 0;
114 | 	while ((res = lzw_decompress(&state, src, slen, dest, sizeof(dest))) > 0) {
115 | 		// Process `res` bytes of output in `dest`, e.g:
116 | 		// fwrite(dest, res, 1, outfile);
117 | 		written += res;
118 | 	}
119 | 	if (res == 0) {
120 | 		printf("%zd bytes successfully decompressed.\n", written);
121 | 	} else if (res < 0) {
122 | 		fprintf(stderr, "Decompression error: %s (err:%zd)\n", lzw_strerror(res), res);
123 | 	}
124 | ```
125 | 
126 | ## Unlikely To Do
127 | 
128 | * Add Google Benchmark.
129 | * Use hashing for lookups in `lzw_string_table_lookup`.
130 | * Support changing inputs during processing.
131 | * Gather/Scatter alternative interface.
132 | 


--------------------------------------------------------------------------------
/fuzzing/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # LZW Fuzzing Drivers
 3 | 
 4 | This directory contains short driver programs optimized for running the
 5 | lzw compression and decompression code through a fuzzer, specifically [AFL - American Fuzzy Lop](https://lcamtuf.coredump.cx/afl/).
 6 | 
 7 | ## Prerequisites
 8 | 
 9 | You need to install AFL, or one of its forks.
10 | 
11 | The original AFL hasn't seen updates in a long time; [AFL++](https://aflplus.plus/) should also work, but is untested at this time.
12 | 
13 | ## Running
14 | 
15 | Simply run `make fuzz` or `make fuzz-<name>-driver` to build the corresponding driver and start AFL.
16 | 
17 | NOTE: It's recommended to run fuzzing on /tmp (or another RAM disk), since
18 | AFL does a lot of disk writes.
19 | 


--------------------------------------------------------------------------------
/fuzzing/afl_compress_driver.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | 	Compression console driver for use with afl-fuzz (fast mode)
 3 | */
 4 | #include <unistd.h>
 5 | #include <stdio.h>
 6 | 
 7 | #define LZW_EDDY_IMPLEMENTATION
 8 | #include "lzw.h"
 9 | 
10 | /* this lets the source compile without afl-clang-fast/lto */
11 | #ifndef __AFL_FUZZ_TESTCASE_LEN
12 | 
13 | ssize_t       fuzz_len;
14 | unsigned char fuzz_buf[1024000];
15 | 
16 | #define __AFL_FUZZ_TESTCASE_LEN fuzz_len
17 | #define __AFL_FUZZ_TESTCASE_BUF fuzz_buf
18 | #define __AFL_FUZZ_INIT() void sync(void);
19 | #define __AFL_LOOP(x) \
20 | 	((fuzz_len = read(0, fuzz_buf, sizeof(fuzz_buf))) > 0 ? 1 : 0)
21 | #define __AFL_INIT() sync()
22 | #endif
23 | 
24 | __AFL_FUZZ_INIT();
25 | 
26 | #ifdef __clang__
27 | #pragma clang optimize off
28 | #else
29 | #pragma GCC optimize("O0")
30 | #endif
31 | 
32 | /*
33 | 	Runs each fuzzer test case through lzw_compress() with a freshly zeroed
34 | 	state, calling it with a 2048-byte output buffer until it returns <= 0,
35 | 	then prints the byte total and the final return value.
36 | */
37 | int main(int argc, char *argv[]) {
38 | 	uint8_t outbuf[2048];
39 | 	struct lzw_state lzw;
40 | 
41 | 	uint8_t *testcase = __AFL_FUZZ_TESTCASE_BUF;
42 | 
43 | #ifdef __clang_major__
44 | 	while (__AFL_LOOP(1000)) {
45 | #endif
46 | 		const ssize_t testcase_len = __AFL_FUZZ_TESTCASE_LEN;
47 | 		memset(&lzw, 0, sizeof(lzw));
48 | 
49 | 		if (testcase_len > 0) {
50 | 			ssize_t total = 0;
51 | 			ssize_t res;
52 | 			for (;;) {
53 | 				res = lzw_compress(&lzw, testcase, testcase_len, outbuf, sizeof(outbuf));
54 | 				if (res <= 0) {
55 | 					break;
56 | 				}
57 | 				total += res;
58 | 			}
59 | 			printf("compressed:%zd (res=%zd)\n", total, res);
60 | 		}
61 | #ifdef __clang_major__
62 | 	}
63 | #endif
64 | 	return EXIT_SUCCESS;
65 | }
56 | 


--------------------------------------------------------------------------------
/fuzzing/afl_decompress_driver.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | 	Decompression console driver for use with afl-fuzz (fast mode)
 3 | */
 4 | #include <unistd.h>
 5 | #include <stdio.h>
 6 | 
 7 | #define LZW_EDDY_IMPLEMENTATION
 8 | #include "lzw.h"
 9 | 
10 | /* this lets the source compile without afl-clang-fast/lto */
11 | #ifndef __AFL_FUZZ_TESTCASE_LEN
12 | 
13 | ssize_t       fuzz_len;
14 | unsigned char fuzz_buf[1024000];
15 | 
16 | #define __AFL_FUZZ_TESTCASE_LEN fuzz_len
17 | #define __AFL_FUZZ_TESTCASE_BUF fuzz_buf
18 | #define __AFL_FUZZ_INIT() void sync(void);
19 | #define __AFL_LOOP(x) \
20 | 	((fuzz_len = read(0, fuzz_buf, sizeof(fuzz_buf))) > 0 ? 1 : 0)
21 | #define __AFL_INIT() sync()
22 | #endif
23 | 
24 | __AFL_FUZZ_INIT();
25 | 
26 | #ifdef __clang__
27 | #pragma clang optimize off
28 | #else
29 | #pragma GCC optimize("O0")
30 | #endif
31 | 
32 | /*
33 | 	Runs each fuzzer test case through lzw_decompress() with a freshly zeroed
34 | 	state, calling it with a 2048-byte output buffer until it returns <= 0,
35 | 	then prints the byte total and the final return value.
36 | */
37 | int main(int argc, char *argv[]) {
38 | 	uint8_t outbuf[2048];
39 | 	struct lzw_state lzw;
40 | 
41 | 	uint8_t *testcase = __AFL_FUZZ_TESTCASE_BUF;
42 | 
43 | #ifdef __clang_major__
44 | 	while (__AFL_LOOP(1000)) {
45 | #endif
46 | 		const ssize_t testcase_len = __AFL_FUZZ_TESTCASE_LEN;
47 | 		memset(&lzw, 0, sizeof(lzw));
48 | 
49 | 		if (testcase_len > 0) {
50 | 			ssize_t total = 0;
51 | 			ssize_t res;
52 | 			for (;;) {
53 | 				res = lzw_decompress(&lzw, testcase, testcase_len, outbuf, sizeof(outbuf));
54 | 				if (res <= 0) {
55 | 					break;
56 | 				}
57 | 				total += res;
58 | 			}
59 | 			printf("decompressed:%zd (res=%zd)\n", total, res);
60 | 		}
61 | #ifdef __clang_major__
62 | 	}
63 | #endif
64 | 	return EXIT_SUCCESS;
65 | }
56 | 


--------------------------------------------------------------------------------
/fuzzing/afl_roundtrip_driver.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | 	Roundtrip input->compress->decompress driver for use with afl-fuzz (fast mode)
 3 | 
 4 | 	This driver takes input, compresses it, then decompresses it,
 5 | 	checking that the lengths and contents of the original input
 6 | 	and the decompressed output agree.
 7 | */
 8 | #include <unistd.h>
 9 | #include <stdio.h>
10 | 
11 | #define LZW_EDDY_IMPLEMENTATION
12 | #include "lzw.h"
13 | 
14 | /* this lets the source compile without afl-clang-fast/lto */
15 | #ifndef __AFL_FUZZ_TESTCASE_LEN
16 | 
17 | ssize_t       fuzz_len;
18 | unsigned char fuzz_buf[1024000];
19 | 
20 | #define __AFL_FUZZ_TESTCASE_LEN fuzz_len
21 | #define __AFL_FUZZ_TESTCASE_BUF fuzz_buf
22 | #define __AFL_FUZZ_INIT() void sync(void);
23 | #define __AFL_LOOP(x) \
24 | 	((fuzz_len = read(0, fuzz_buf, sizeof(fuzz_buf))) > 0 ? 1 : 0)
25 | #define __AFL_INIT() sync()
26 | #endif
27 | 
28 | __AFL_FUZZ_INIT();
29 | 
30 | #ifdef __clang__
31 | #pragma clang optimize off
32 | #else
33 | #pragma GCC optimize("O0")
34 | #endif
35 | 
36 | /*
37 | 	Round-trips each fuzzer test case: compress it, decompress the result,
38 | 	and abort() if either step reports an error or if the decompressed
39 | 	length or contents differ from the original input.
40 | */
41 | int main(int argc, char *argv[]) {
42 | 	struct lzw_state statec0;
43 | 	struct lzw_state stated0;
44 | 	size_t dest_size = 1UL << 19; // 512KiB
45 | 	// One allocation split in half: decompression buffer first, compression buffer second.
46 | 	uint8_t *decomp = malloc(dest_size*2);
47 | 	if (!decomp) {
48 | 		// Bail out instead of doing pointer arithmetic on (and writing through) NULL.
49 | 		fprintf(stderr, "ERROR: memory allocation of %zu bytes failed.\n", dest_size*2);
50 | 		return EXIT_FAILURE;
51 | 	}
52 | 	uint8_t *comp = decomp + dest_size;
53 | 
54 | #ifdef __AFL_HAVE_MANUAL_CONTROL
55 | 	__AFL_INIT();
56 | #endif
57 | 
58 | 	uint8_t *input = __AFL_FUZZ_TESTCASE_BUF;
59 | 
60 | #ifdef __clang_major__
61 | 	while (__AFL_LOOP(5000)) {
62 | #endif
63 | 		// Both codec states must start zeroed for every test case.
64 | 		memset(&statec0, 0, sizeof(struct lzw_state));
65 | 		memset(&stated0, 0, sizeof(struct lzw_state));
66 | 
67 | 		ssize_t res;
68 | 		size_t comp_size = 0;
69 | 		size_t decomp_size = 0;
70 | 
71 | 		size_t slen = __AFL_FUZZ_TESTCASE_LEN;
72 | 		if (input && slen > 0) {
73 | 			// Compress input from fuzzer.
74 | 			// NOTE(review): every call is handed the start of `comp`, so this appears to
75 | 			// assume the whole result fits in a single dest_size chunk -- confirm for large inputs.
76 | 			while ((res = lzw_compress(&statec0, input, slen, comp, dest_size)) > 0) { comp_size += res; };
77 | 			printf("compressed:%zu (res=%zd)\n", comp_size, res);
78 | 			if (res < 0) {
79 | 				abort();
80 | 			}
81 | 
82 | 			// Decompress the compressed data...
83 | 			while ((res = lzw_decompress(&stated0, comp, comp_size, decomp, dest_size)) > 0) { decomp_size += res; };
84 | 			printf("decompressed:%zu (res=%zd)\n", decomp_size, res);
85 | 			if (res < 0) {
86 | 				abort();
87 | 			}
88 | 
89 | 			// Verify input size vs decompressed size.
90 | 			if (slen != decomp_size) {
91 | 				abort();
92 | 			}
93 | 
94 | 			// Compare the decompressed data and the original input; should match obviously.
95 | 			int comp0 = memcmp(input, decomp, slen);
96 | 			if (comp0 != 0) {
97 | 				abort();
98 | 			}
99 | 		}
100 | 
101 | #ifdef __clang_major__
102 | 	}
103 | #endif
104 | 	free(decomp);
105 | 	return EXIT_SUCCESS;
106 | }
93 | 


--------------------------------------------------------------------------------
/lzw-eddy.c:
--------------------------------------------------------------------------------
  1 | #define LZW_EDDY_IMPLEMENTATION
  2 | // #define LZW_MAX_CODE_WIDTH 14
  3 | #include "lzw.h"
  4 | 
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <string.h>
  8 | #include <stdint.h>
  9 | #include <assert.h>
 10 | #include <stdbool.h>
 11 | #include <errno.h>
 12 | 
 13 | #include "build_const.h"
 14 | 
 15 | static const char *infile;
 16 | static const char *outfile;
 17 | static int compress = 0;
 18 | static size_t maxlen = 0;
 19 | 
 20 | // Print the library version; a non-empty build hash is appended, cut to 8 characters.
 21 | static void print_version(void) {
 22 | 	const bool have_hash = (build_hash != NULL) && (build_hash[0] != '\0');
 23 | 	if (!have_hash) {
 24 | 		printf("%s\n", LZW_EDDY_VERSION);
 25 | 		return;
 26 | 	}
 27 | 	printf("%s <%.*s>\n", LZW_EDDY_VERSION, 8, build_hash);
 28 | }
 27 | 
 28 | // Print the program name, then the version line on the same row.
 29 | static void print_banner(void) {
 30 | 	fputs("lzw-eddy ", stdout);
 31 | 	print_version();
 32 | }
 32 | 
 33 | /*
 34 | 	Scan argv and record the recognised options in the file-level settings.
 35 | 
 36 | 	When another argument follows: -c <file> (compress), -d/-x <file>
 37 | 	(decompress), -o <file> (output) and -m <num> (maxlen) are handled.
 38 | 	When it is the last argument: anything starting with -v or -V, or
 39 | 	--version, prints the version and exits. Always returns 0.
 40 | */
 41 | static int parse_args(int argc, char **argv) {
 42 | 	for (int i = 1 ; i < argc ; ++i) {
 43 | 		const char *opt = argv[i];
 44 | 		// argv[argc] is guaranteed to be a null pointer, so peeking ahead is safe.
 45 | 		const char *next = argv[i+1];
 46 | 
 47 | 		if (!opt || *opt != '-') {
 48 | 			continue;
 49 | 		}
 50 | 		++opt; // step past the leading dash
 51 | 
 52 | 		if (!next) {
 53 | 			if (*opt == 'v' || *opt == 'V' || strcmp(opt, "-version") == 0) {
 54 | 				print_version();
 55 | 				exit(0);
 56 | 			}
 57 | 			continue;
 58 | 		}
 59 | 
 60 | 		const char flag = *opt;
 61 | 		if (flag == 'c') {
 62 | 			compress = 1;
 63 | 			infile = next;
 64 | 		} else if (flag == 'd' || flag == 'x') {
 65 | 			compress = 0;
 66 | 			infile = next;
 67 | 		} else if (flag == 'o') {
 68 | 			outfile = next;
 69 | 		} else if (flag == 'm') {
 70 | 			maxlen = atoi(next);
 71 | 		}
 72 | 	}
 73 | 
 74 | 	return 0;
 75 | }
 71 | 
 72 | /*
 73 | 	Compress the file `srcfile` into `destfile`.
 74 | 
 75 | 	The whole input is read into memory and passed to lzw_compress(), whose
 76 | 	output is written in chunks of up to 4096 bytes. Progress and a summary
 77 | 	go to stdout, errors to stderr. A non-zero `maxlen` is installed as the
 78 | 	state's `longest_prefix_allowed`.
 79 | */
 80 | static void lzw_compress_file(const char *srcfile, const char *destfile) {
 81 | 	FILE *ifile = fopen(srcfile, "rb");
 82 | 
 83 | 	if (!ifile) {
 84 | 		// strerror(errno) instead of the glibc-only "%m" conversion, for portability.
 85 | 		fprintf(stderr, "Error: %s\n", strerror(errno));
 86 | 		return;
 87 | 	}
 88 | 	fseek(ifile, 0, SEEK_END);
 89 | 	long slen = ftell(ifile);
 90 | 	if (slen < 0) {
 91 | 		// ftell() failed; don't let -1 turn into a huge size_t below.
 92 | 		fprintf(stderr, "Error: %s\n", strerror(errno));
 93 | 		fclose(ifile);
 94 | 		return;
 95 | 	}
 96 | 	fseek(ifile, 0, SEEK_SET);
 97 | 
 98 | 	printf("Compressing %zu bytes.\n", (size_t)slen);
 99 | 	FILE *ofile = fopen(destfile, "wb");
100 | 	if (ofile) {
101 | 		// malloc(0) may return NULL, so always request at least one byte.
102 | 		uint8_t *src = malloc(slen > 0 ? (size_t)slen : 1);
103 | 		if (!src) {
104 | 			fprintf(stderr, "ERROR: memory allocation of %ld bytes failed.\n", slen);
105 | 			exit(1);
106 | 		}
107 | 		uint8_t dest[4096];
108 | 
109 | 		struct lzw_state state = { 0 };
110 | 		if (maxlen > 0) {
111 | 			state.longest_prefix_allowed = maxlen;
112 | 			printf("WARNING: Restricting maximum prefix length to %zu.\n", state.longest_prefix_allowed);
113 | 		}
114 | 
115 | 		if ((fread(src, slen, 1, ifile) != 1) && (ferror(ifile) != 0)) {
116 | 			fprintf(stderr, "fread '%s': %s\n", srcfile, strerror(errno));
117 | 			exit(EXIT_FAILURE);
118 | 		}
119 | 
120 | 		ssize_t res, written = 0;
121 | 		// lzw_compress() returns the chunk size, 0 when done, or a negative error code.
122 | 		while ((res = lzw_compress(&state, src, slen, dest, sizeof(dest))) > 0) {
123 | 			if (fwrite(dest, res, 1, ofile) != 1) {
124 | 				// Stop on a failed write; res stays > 0 so no success summary is printed.
125 | 				fprintf(stderr, "fwrite '%s': %s\n", destfile, strerror(errno));
126 | 				break;
127 | 			}
128 | 			written += res;
129 | 		}
130 | 		if (res == 0) {
131 | 			printf("%zd bytes written to output, reduction=%2.02f%% (longest prefix=%zu).\n",
132 | 					written,
133 | 					(1.0f - ((float)written/slen)) * 100.0f,
134 | 					state.longest_prefix);
135 | 		} else if (res < 0) {
136 | 			fprintf(stderr, "Compression returned error: %s (err: %zd)\n", lzw_strerror(res), res);
137 | 		}
138 | 		fclose(ofile);
139 | 		free(src);
140 | 	} else {
141 | 		fprintf(stderr, "Error: %s\n", strerror(errno));
142 | 	}
143 | 	fclose(ifile);
144 | }
124 | 
125 | /*
126 | 	Decompress the file `srcfile` into `destfile` ("-" selects stdout).
127 | 
128 | 	The whole input is read into memory and passed to lzw_decompress(), whose
129 | 	output is written in chunks of up to 4096 bytes, or `maxlen + 1` bytes
130 | 	when `maxlen` is set and smaller. Nothing is done for an empty input.
131 | 	Progress and a summary go to stdout, errors to stderr.
132 | */
133 | static void lzw_decompress_file(const char *srcfile, const char *destfile) {
134 | 	FILE *ifile = fopen(srcfile, "rb");
135 | 
136 | 	if (!ifile) {
137 | 		// strerror(errno) instead of the glibc-only "%m" conversion, for portability.
138 | 		fprintf(stderr, "Error: %s\n", strerror(errno));
139 | 		return;
140 | 	}
141 | 	fseek(ifile, 0, SEEK_END);
142 | 	long slen = ftell(ifile);
143 | 	fseek(ifile, 0, SEEK_SET);
144 | 
145 | 	if (slen > 0) {
146 | 		printf("Decompressing %zu bytes.\n", (size_t)slen);
147 | 		FILE *ofile = stdout;
148 | 		if (strcmp(destfile, "-") != 0) {
149 | 			ofile = fopen(destfile, "wb");
150 | 		}
151 | 		if (ofile) {
152 | 			uint8_t dest[4096];
153 | 			size_t dest_len = sizeof(dest);
154 | 			if (maxlen > 0 && maxlen + 1 < dest_len) {
155 | 				dest_len = maxlen + 1;
156 | 				printf("WARNING: Restricting output buffer to %zu bytes.\n", dest_len);
157 | 			}
158 | 			uint8_t *src = malloc(slen);
159 | 			if (!src) {
160 | 				fprintf(stderr, "ERROR: memory allocation of %ld bytes failed.\n", slen);
161 | 				exit(1);
162 | 			}
163 | 
164 | 			if ((fread(src, slen, 1, ifile) != 1) && (ferror(ifile) != 0)) {
165 | 				fprintf(stderr, "fread '%s': %s\n", srcfile, strerror(errno));
166 | 				exit(EXIT_FAILURE);
167 | 			}
168 | 
169 | 			struct lzw_state state = { 0 };
170 | 
171 | 			ssize_t res, written = 0;
172 | 			// Returns 0 when done, otherwise number of bytes written to destination buffer. On error, < 0.
173 | 			while ((res = lzw_decompress(&state, src, slen, dest, dest_len)) > 0) {
174 | 				if (fwrite(dest, res, 1, ofile) != 1) {
175 | 					// Stop on a failed write; res stays > 0 so no success summary is printed.
176 | 					fprintf(stderr, "fwrite '%s': %s\n", destfile, strerror(errno));
177 | 					break;
178 | 				}
179 | 				written += res;
180 | 			}
181 | 			if (res == 0) {
182 | 				printf("%zd bytes written to output, expansion=%2.2f%% (longest prefix=%zu).\n",
183 | 					written,
184 | 					((float)written/slen - 1.0f) * 100.0f,
185 | 					state.longest_prefix);
186 | 			} else if (res < 0) {
187 | 				fprintf(stderr, "Decompression returned error: %s (err: %zd)\n", lzw_strerror(res), res);
188 | 			}
189 | 			fclose(ofile);
190 | 			free(src);
191 | 		} else {
192 | 			fprintf(stderr, "Error: %s\n", strerror(errno));
193 | 		}
194 | 	}
195 | 	fclose(ifile);
196 | }
184 | 
185 | /*
186 | 	Entry point: parse the options, print the banner, then either run the
187 | 	requested (de)compression or, when no input/output pair was given,
188 | 	print usage and the compiled-in configuration.
189 | */
190 | int main(int argc, char *argv []) {
191 | 	parse_args(argc, argv);
192 | 
193 | 	print_banner();
194 | 
195 | 	const bool have_files = (infile != NULL) && (outfile != NULL);
196 | 	if (have_files) {
197 | 		if (compress) {
198 | 			lzw_compress_file(infile, outfile);
199 | 		} else {
200 | 			lzw_decompress_file(infile, outfile);
201 | 		}
202 | 		return EXIT_SUCCESS;
203 | 	}
204 | 
205 | 	printf("Usage: %s -c file|-d file -o outfile\n", argv[0]);
206 | 	printf("Compiled Configuration:\n LZW_MIN_CODE_WIDTH=%d, LZW_MAX_CODE_WIDTH=%d, LZW_MAX_CODES=%lu, sizeof(lzw_state)=%zu\n",
207 | 		LZW_MIN_CODE_WIDTH,
208 | 		LZW_MAX_CODE_WIDTH,
209 | 		LZW_MAX_CODES,
210 | 		sizeof(struct lzw_state)
211 | 	);
212 | 	return EXIT_SUCCESS;
213 | }
209 | 


--------------------------------------------------------------------------------
/lzw.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | 	Variable-length code LZW compressor and decompressor for fixed-memory decoding.
  3 | 	Copyright (c) 2020-2022, Eddy L O Jansson. Licensed under The MIT License.
  4 | 
  5 | 	See https://github.com/eloj/lzw-eddy
  6 | */
  7 | #ifdef __cplusplus
  8 | extern "C" {
  9 | #endif
 10 | 
 11 | #include <stdint.h>
 12 | #include <stdbool.h>
 13 | 
 14 | #if defined(_MSC_VER)
 15 | #include <BaseTsd.h>
 16 | typedef SSIZE_T ssize_t;
 17 | #else
 18 | #include <sys/types.h> // for ssize_t
 19 | #endif
 20 | 
 21 | #define LZW_EDDY_MAJOR_VERSION 1
 22 | #define LZW_EDDY_MINOR_VERSION 1
 23 | #define LZW_EDDY_PATCH_VERSION 0
 24 | #define LZW_EDDY_VERSION "1.1.0-dev"
 25 | 
 26 | #define LZW_MIN_CODE_WIDTH 9
 27 | // 9 to 16-bit codes should all work, but 12 is the default for a reason.
 28 | // Going beyond 16-bit codes would require code changes. More isn't better either.
 29 | #ifndef LZW_MAX_CODE_WIDTH
 30 | #define LZW_MAX_CODE_WIDTH 12
 31 | #endif
 32 | #define LZW_MAX_CODES (1UL << LZW_MAX_CODE_WIDTH)
 33 | 
 34 | enum lzw_errors { // LZW_NOERROR is zero; every error code is negative.
 35 | 	LZW_NOERROR = 0,
 36 | 	LZW_DESTINATION_TOO_SMALL = -1, // per the lzw_decompress notes: the output buffer was too small
 37 | 	LZW_INVALID_CODE_STREAM = -2,
 38 | 	LZW_STRING_TABLE_FULL = -3,
 39 | };
 40 | 
 41 | // This type must be large enough for SYMBOL_BITS + LZW_MAX_CODE_WIDTH*2 bits.
 42 | #if LZW_MAX_CODE_WIDTH > 12
 43 | typedef uint64_t lzw_node;
 44 | #else
 45 | typedef uint32_t lzw_node;
 46 | #endif
 47 | typedef uint32_t bitres_t;
 48 | typedef uint16_t code_t;
 49 | typedef uint8_t sym_t;
 50 | 
 51 | struct lzw_string_table {
 52 | 	uint32_t code_width; // lzw_reset() sets this to LZW_MIN_CODE_WIDTH
 53 | 	code_t next_code; // lzw_reset() sets this to CODE_FIRST
 54 | 	code_t prev_code; // lzw_reset() sets this to CODE_EOF
 55 | 	lzw_node node[LZW_MAX_CODES]; // 16K at 12-bit codes. Each node packs a symbol, parent code and prefix length.
 56 | };
 57 | 
 58 | struct lzw_state {
 59 | 	struct lzw_string_table tree;
 60 | 
 61 | 	// If we ever need more of these, change to a flag-word.
 62 | 	bool	 was_init;
 63 | 	bool	 must_reset; // cleared by lzw_reset()
 64 | 
 65 | 	size_t rptr; // presumably the read offset into the source buffer -- TODO confirm
 66 | 	size_t wptr; // presumably the write offset into the destination buffer -- TODO confirm
 67 | 	// Bit reservoir, need room for LZW_MAX_CODE_WIDTH*2-1 bits.
 68 | 	bitres_t bitres;
 69 | 	uint32_t bitres_len;
 70 | 
 71 | 	// Tracks the longest prefix used, which is equal to the minimum output buffer required for decompression.
 72 | 	size_t longest_prefix;
 73 | 	// Restrict the longest_prefix to this -- optimize for decode buffer size.
 74 | 	size_t longest_prefix_allowed; // the CLI only sets this when a non-zero -m value is given
 75 | };
 76 | 
 77 | // Translate error code to message.
 78 | const char *lzw_strerror(enum lzw_errors errnum);
 79 | 
 80 | /*
 81 | 	Decompress `slen` bytes from `src` into `dest` of size `dlen`.
 82 | 
 83 | 	Returns the number of bytes decompressed into `dest`.
 84 | 	Once all input has been consumed, 0 is returned.
 85 | 	On error, a negative integer is returned.
 86 | 
 87 | 	Neither `src` nor `dest` may be NULL.
 88 | 
 89 | 	`state` should be zero-initialized.
 90 | 
 91 | 	`dlen` should be at least 4096 bytes, unless the input is known to
 92 | 	require less.
 93 | 
 94 | 	`LZW_DESTINATION_TOO_SMALL` will be returned if the output buffer is too small, in which case
 95 | 	you'd have to restart from the beginning with a larger `dest`.
 96 | 
 97 | 	All that said, even a file consisting of 80K zeros requires only 400 bytes,
 98 | 	so we're being very conservative here. A 'normal' file may need only 128 bytes or so.
 99 | */
100 | ssize_t lzw_decompress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen);
101 | 
102 | /*
103 | 	Compress `slen` bytes from `src` into `dest` of size `dlen`.
104 | 
105 | 	Returns the number of bytes compressed into `dest`.
106 | 	Once all input has been consumed, 0 is returned.
107 | 	On error, a negative integer is returned.
108 | 
109 | 	Neither `src` nor `dest` may be NULL.
110 | 
111 | 	`state` should be zero-initialized.
112 | */
113 | ssize_t lzw_compress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen);
114 | 
115 | #ifdef LZW_EDDY_IMPLEMENTATION
116 | 
117 | /*
118 | 	Variable-length code LZW compressor and decompressor for fixed-memory decoding.
119 | 	Copyright (c) 2020-2022, Eddy L O Jansson. Licensed under The MIT License.
120 | 
121 | 	See https://github.com/eloj/lzw-eddy
122 | */
123 | #include <stdlib.h>
124 | #include <string.h>
125 | #include <stdint.h>
126 | #include <assert.h>
127 | #include <stdbool.h>
128 | 
129 | #define SYMBOL_BITS 8
130 | #define SYMBOL_MASK ((1UL << SYMBOL_BITS)-1)
131 | #define PARENT_BITS LZW_MAX_CODE_WIDTH
132 | #define PARENT_SHIFT SYMBOL_BITS
133 | #define PARENT_MASK ((1UL << PARENT_BITS)-1)
134 | #define PREFIXLEN_BITS LZW_MAX_CODE_WIDTH
135 | #define PREFIXLEN_SHIFT (PARENT_BITS+SYMBOL_BITS)
136 | #define PREFIXLEN_MASK ((1UL << PREFIXLEN_BITS)-1)
137 | 
138 | #define CODE_CLEAR (1UL << SYMBOL_BITS)
139 | #define CODE_EOF (CODE_CLEAR+1)
140 | #define CODE_FIRST (CODE_CLEAR+2)
141 | 
142 | static_assert((LZW_MAX_CODE_WIDTH >= LZW_MIN_CODE_WIDTH), "");
143 | static_assert(SYMBOL_BITS <= sizeof(sym_t)*8, "sym_t type too small");
144 | static_assert((SYMBOL_BITS + PARENT_BITS + PREFIXLEN_BITS) <= sizeof(lzw_node)*8, "lzw_node type too small");
145 | static_assert((LZW_MAX_CODE_WIDTH*2 - 1) < sizeof(bitres_t)*8, "bitres_t type too small");
146 | 
147 | static inline sym_t lzw_node_symbol(lzw_node node) {
148 | 	return (sym_t)(node & SYMBOL_MASK); // The symbol lives in the low SYMBOL_BITS bits of the node.
149 | }
150 | 
151 | static inline code_t lzw_node_parent(lzw_node node) {
152 | 	return (code_t)((node >> PARENT_SHIFT) & PARENT_MASK); // Parent code is stored directly above the symbol bits.
153 | }
154 | 
155 | static inline code_t lzw_node_prefix_len(lzw_node node) {
156 | 	return (code_t)((node >> PREFIXLEN_SHIFT) & PREFIXLEN_MASK); // Prefix length is stored above the parent bits.
157 | }
158 | 
159 | static inline lzw_node lzw_make_node(sym_t symbol, code_t parent, code_t len) { // Pack prefix length, parent code and symbol into one node.
160 | 	// Widen to lzw_node before shifting: operands narrower than int are promoted to (signed) int, and shifting a value into its sign bit is undefined.
161 | 	return ((lzw_node)len << PREFIXLEN_SHIFT) | ((lzw_node)parent << PARENT_SHIFT) | (lzw_node)symbol;
162 | }
163 | 
164 | static inline uint32_t mask_from_width(uint32_t width) {
165 | 	return (uint32_t)((1UL << width) - 1UL); // All-ones mask covering the low `width` bits.
166 | }
167 | 
168 | static void lzw_reset(struct lzw_state *state) {
169 | 	state->must_reset = false;
170 | 	state->tree.code_width = LZW_MIN_CODE_WIDTH; // Codes go back to the minimum width.
171 | 	state->tree.next_code = CODE_FIRST; // First slot after the CLEAR and EOF codes.
172 | 	state->tree.prev_code = CODE_EOF; // CODE_EOF is what the decoder tests for to detect "no previous code".
173 | }
174 | 
175 | static void lzw_init(struct lzw_state *state) {
176 | 	// One-time setup: seed a root node per symbol, clear the stream cursors, then reset the code state.
177 | 	for (size_t sym = 0 ; sym <= SYMBOL_MASK ; ++sym)
178 | 		state->tree.node[sym] = lzw_make_node((sym_t)sym, 0, 0);
179 | 	state->was_init = true;
180 | 	state->bitres_len = 0;
181 | 	state->bitres = 0;
182 | 	state->rptr = 0;
183 | 	lzw_reset(state);
184 | }
185 | 
186 | const char *lzw_strerror(enum lzw_errors errnum) {
187 | 	// Every known error code returns its message straight from the switch.
188 | 	switch (errnum) {
189 | 		case LZW_NOERROR:
190 | 			return "No error";
191 | 
192 | 		case LZW_DESTINATION_TOO_SMALL:
193 | 			return "Destination buffer too small";
194 | 
195 | 		case LZW_INVALID_CODE_STREAM:
196 | 			return "Invalid code stream";
197 | 
198 | 		case LZW_STRING_TABLE_FULL:
199 | 			return "String table full";
200 | 	}
201 | 
202 | 	// There is no `default` label, so compilers can still warn about unhandled enumerators.
203 | 	// Any value not listed above ends up here.
204 | 	return "Unknown error";
205 | }
206 | 
207 | ssize_t lzw_decompress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen) {
208 | 	if (state->was_init == false) // First call on a zero-initialized state: build the root nodes and clear the cursors.
209 | 		lzw_init(state);
210 | 
211 | 	// Keep local copies so that we can exit and continue without losing bits.
212 | 	uint32_t bitres = state->bitres;
213 | 	uint32_t bitres_len = state->bitres_len;
214 | 
215 | 	uint32_t code = 0;
216 | 	size_t wptr = 0; // Bytes written to dest during this call; this is the normal return value.
217 | 
218 | 	while (state->rptr < slen) {
219 | 		// Fill bit-reservoir.
220 | 		while ((bitres_len < state->tree.code_width) && (state->rptr < slen)) {
221 | 			bitres |= src[state->rptr++] << bitres_len; // New bytes are placed above the bits already held.
222 | 			bitres_len += 8;
223 | 		}
224 | 
225 | 		state->bitres = bitres; // Saved before the code is extracted, so an early return below re-reads the same code on the next call.
226 | 		state->bitres_len = bitres_len;
227 | 
228 | 		if (state->bitres_len < state->tree.code_width) { // Input ran out in the middle of a code.
229 | 			return LZW_INVALID_CODE_STREAM;
230 | 		}
231 | 
232 | 		code = bitres & mask_from_width(state->tree.code_width);
233 | 		bitres >>= state->tree.code_width;
234 | 		bitres_len -= state->tree.code_width;
235 | 
236 | 		if (code == CODE_CLEAR) {
237 | 			if (state->tree.next_code != CODE_FIRST) // The reset is skipped while no codes have been added.
238 | 				lzw_reset(state);
239 | 			continue;
240 | 		} else if (code == CODE_EOF) {
241 | 			break;
242 | 		} else if (state->must_reset) {
243 | 			// ERROR: Ran out of space in string table
244 | 			return LZW_STRING_TABLE_FULL;
245 | 		}
246 | 
247 | 		if (code <= state->tree.next_code) {
248 | 			bool known_code = code < state->tree.next_code; // code == next_code names the entry that is about to be created.
249 | 			code_t tcode = known_code ? code : state->tree.prev_code; // That entry's string is prev_code's string plus its own first symbol.
250 | 			size_t prefix_len = 1 + lzw_node_prefix_len(state->tree.node[tcode]); // The stored length does not count the node's own symbol.
251 | 			uint8_t symbol = 0;
252 | 
253 | 			assert(prefix_len > 0);
254 | 
255 | 			// Invalid state, invalid input.
256 | 			if (!known_code && state->tree.prev_code == CODE_EOF) {
257 | 				return LZW_INVALID_CODE_STREAM;
258 | 			}
259 | 
260 | 			// Track longest prefix seen.
261 | 			if (prefix_len > state->longest_prefix) {
262 | 				state->longest_prefix = prefix_len;
263 | 			}
264 | 
265 | 			// Check if prefix alone too large for output buffer. User could start over with a larger buffer.
266 | 			if (prefix_len + (known_code ? 0 : 1) > dlen) {
267 | 				return LZW_DESTINATION_TOO_SMALL;
268 | 			}
269 | 
270 | 			// Check if room in output buffer, else return early.
271 | 			if (wptr + prefix_len + (known_code ? 0 : 1) > dlen) {
272 | 				return wptr;
273 | 			}
274 | 
275 | 			// Write out prefix to destination, last symbol first; afterwards `symbol` holds the first symbol of the string.
276 | 			for (size_t i=0 ; i < prefix_len ; ++i) {
277 | 				symbol = lzw_node_symbol(state->tree.node[tcode]);
278 | 				dest[wptr + prefix_len - 1 - i] = symbol;
279 | 				tcode = lzw_node_parent(state->tree.node[tcode]);
280 | 			}
281 | 			wptr += prefix_len;
282 | 
283 | 			// Add the first character of the prefix as a new code with prev_code as the parent.
284 | 			if (state->tree.prev_code != CODE_EOF) {
285 | 				if (!known_code) {
286 | 					assert(code == state->tree.next_code);
287 | 					assert(wptr < dlen);
288 | 					dest[wptr++] = symbol; // Special case for new codes.
289 | 				}
290 | 
291 | 				state->tree.node[state->tree.next_code] = lzw_make_node(symbol, state->tree.prev_code, 1 + lzw_node_prefix_len(state->tree.node[state->tree.prev_code]));
292 | 
293 | 				// TODO: Change to ==
294 | 				if (state->tree.next_code >= mask_from_width(state->tree.code_width)) {
295 | 					if (state->tree.code_width == LZW_MAX_CODE_WIDTH) {
296 | 						// Out of bits in code, next code MUST be a reset!
297 | 						state->must_reset = true;
298 | 						state->tree.prev_code = code;
299 | 						continue;
300 | 					}
301 | 					++state->tree.code_width;
302 | 				}
303 | 				state->tree.next_code++;
304 | 			}
305 | 			state->tree.prev_code = code;
306 | 		} else {
307 | 			// Desynchronized, probably corrupt/invalid input.
308 | 			return LZW_INVALID_CODE_STREAM;
309 | 		}
310 | 	}
311 | 	return wptr;
312 | }
313 | 
314 | static bool lzw_string_table_lookup(struct lzw_state *state, uint8_t *prefix, size_t len, code_t *code) {
315 | 	// Find the code for the `len` symbols at `prefix`. On a hit, store it in *code and return true; on a miss, leave *code untouched and return false.
316 | 	assert (len > 0);
317 | 	// printf("Looking up prefix '%.*s' from %p to %p (len=%zu)\n", (int)(len), prefix, prefix, prefix+len, len);
318 | 	if (len == 1) {
319 | 		*code = prefix[0]; // A single symbol is its own code; avoids narrowing a packed root node into a code_t.
320 | 		return true;
321 | 	}
322 | 
323 | 	// PERF: This is slow, we should store an array of hashes to use as an initial comparison before walking the tree.
324 | 	// NOTE: It's imperative that we search newest to oldest. When limiting the prefix length, we'll
325 | 	// end up with duplicate prefixes, and only the newest code is valid for the decoder to stay in sync.
326 | 	for (size_t i=state->tree.next_code - 1 ; i >= CODE_FIRST ; --i) {
327 | 		assert(i < LZW_MAX_CODES);
328 | 		lzw_node node = state->tree.node[i];
329 | 
330 | 		if (len - 1 == lzw_node_prefix_len(node)) { // Only entries whose string is exactly `len` symbols long can match.
331 | 			for (size_t j=0 ; j < len ; ++j) { // Compare from the last symbol backwards while following parent links.
332 | 				if (prefix[len-j-1] != lzw_node_symbol(node)) {
333 | 					break;
334 | 				}
335 | 				if (lzw_node_prefix_len(node) == 0) { // Reached the root with every symbol matching.
336 | 					*code = (code_t)i;
337 | 					assert(j == len - 1);
338 | 					return true;
339 | 				}
340 | 				node = state->tree.node[lzw_node_parent(node)];
341 | 			}
342 | 		}
343 | 	}
344 | 
345 | 	return false;
346 | }
347 | 
348 | inline static void lzw_output_code(struct lzw_state *state, code_t code) { // Append `code` to the bit reservoir at the current code width and record it as prev_code.
349 | 	assert(state->bitres_len + state->tree.code_width <= sizeof(bitres_t)*8); // maybe increase size of bitres_t?
350 | 	state->bitres |= (bitres_t)code << state->bitres_len; // Widen first: a narrow code_t would otherwise be shifted as a promoted (signed) int.
351 | 	state->bitres_len += state->tree.code_width;
352 | 	state->tree.prev_code = code;
353 | 
354 | 	// printf("<CODE:%d width=%d reservoir:%02d/%zu:%02x>\n", code, state->tree.code_width, state->bitres_len, sizeof(bitres_t)*8, state->bitres);
355 | }
356 | 
357 | static void lzw_flush_reservoir(struct lzw_state *state, uint8_t *dest, bool final) {
358 | 	// SECURITY: Nothing is bounds-checked here -- the caller must have reserved enough room in dest.
359 | 
360 | 	// Drain whole bytes from the bottom of the reservoir into dest.
361 | 	for ( ; state->bitres_len >= 8 ; state->bitres_len -= 8) {
362 | 		dest[state->wptr++] = state->bitres & 0xFF;
363 | 		state->bitres >>= 8;
364 | 		// NOTE: bitres_len is decremented by the loop header, i.e. after this point.
365 | 		// printf("DEBUG: Flushed: %02x, reservoir:%02d/%zu:%02x\n", dest[state->wptr-1], state->bitres_len, sizeof(bitres_t)*8, state->bitres);
366 | 	}
367 | 
368 | 	// On the final flush, the leftover (fewer than 8) bits are written out as one last byte.
369 | 	if (final && state->bitres_len > 0) {
370 | 		dest[state->wptr++] = state->bitres;
371 | 		state->bitres_len = 0;
372 | 		state->bitres = 0;
373 | 		// printf("DEBUG: Flushed: %02x, reservoir:%02d/%zu:%02x\n", dest[state->wptr-1], state->bitres_len, sizeof(bitres_t)*8, state->bitres);
374 | 	}
375 | }
376 | 
377 | ssize_t lzw_compress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen) {
378 | 	if (state->was_init == false) { // First call on a zero-initialized state.
379 | 		lzw_init(state);
380 | 		lzw_output_code(state, CODE_CLEAR); // The output stream opens with a CLEAR code.
381 | 	}
382 | 
383 | 	code_t code = CODE_EOF;
384 | 	size_t prefix_end = 0; // Length of the candidate prefix that starts at src[state->rptr].
385 | 	state->wptr = 0; // dest is written from its start on every call.
386 | 
387 | 	while (state->rptr + prefix_end < slen) {
388 | 		// Ensure we have enough space for flushing codes.
389 | 		if (state->wptr + (state->tree.code_width >> 3) + 1 + 2 + 2 > dlen) { // Also reserve bits for worst-case 16-bit CLEAR + EOF code
390 | 			return state->wptr;
391 | 		}
392 | 
393 | 		++prefix_end;
394 | 		// lookup prefix in string table
395 | 		bool overlong = ((state->longest_prefix_allowed > 0) && (prefix_end >= state->longest_prefix_allowed));
396 | 		bool existing_code = lzw_string_table_lookup(state, src + state->rptr, prefix_end, &code); // On a miss `code` keeps the match for the one-symbol-shorter prefix.
397 | 		if (!existing_code || overlong) {
398 | 			assert(code != CODE_CLEAR);
399 | 			assert(code != CODE_EOF);
400 | 
401 | 			uint8_t symbol = src[state->rptr + prefix_end - 1];
402 | 			code_t parent = code;
403 | 			code_t parent_len = 1 + lzw_node_prefix_len(state->tree.node[parent]); // Number of source symbols that `parent` stands for.
404 | 
405 | 			// Output code _before_ we potentially change the bit-width.
406 | 			lzw_output_code(state, parent);
407 | 
408 | 			// Handle code width expansion.
409 | 			if (state->tree.next_code == (1UL << state->tree.code_width)
410 | #if LZW_MAX_CODE_WIDTH == 16
411 | 				|| (state->tree.next_code == LZW_MAX_CODES - 1) /* special case for wrapping on 16-bit code_t */
412 | #endif
413 | 			) {
414 | 				if (state->tree.code_width < LZW_MAX_CODE_WIDTH) {
415 | 					// printf("DEBUG: Expanding bitwidth to %d\n", state->tree.code_width + 1);
416 | 					++state->tree.code_width;
417 | 				} else {
418 | 					// printf("DEBUG: Max code-width reached -- Issuing clear/reset\n");
419 | 					lzw_flush_reservoir(state, dest, false);
420 | 					lzw_output_code(state, CODE_CLEAR);
421 | 					lzw_reset(state);
422 | 					lzw_flush_reservoir(state, dest, false);
423 | 					state->tree.next_code = CODE_EOF; // XXX: Required for compatibility with puzznic.
424 | 				}
425 | 			}
426 | 
427 | 			assert(state->tree.next_code < LZW_MAX_CODES);
428 | 			// printf("New prefix from src[%zu], adding symbol '%c' (%02x) as code %d /w parent %d\n", state->rptr + prefix_end, symbol, symbol, state->tree.next_code, parent);
429 | 			state->tree.node[state->tree.next_code++] = lzw_make_node(symbol, parent, parent_len);
430 | 
431 | 			if (parent_len > state->longest_prefix) {
432 | 				state->longest_prefix = parent_len;
433 | 			}
434 | 
435 | 			state->rptr += parent_len; // Advance past the source symbols covered by the code just emitted.
436 | 			prefix_end = 0;
437 | 
438 | 			lzw_flush_reservoir(state, dest, false);
439 | 		}
440 | 	}
441 | 	if (prefix_end != 0) {
442 | 		// printf("DEBUG: Last prefix existed, writing existing code %d to stream\n", code);
443 | 		lzw_output_code(state, code);
444 | 		lzw_flush_reservoir(state, dest, false);
445 | 		state->rptr += prefix_end;
446 | 		prefix_end = 0;
447 | 	}
448 | 
449 | 	// WARN: Problem with this is that we can't chain encodes, add 'final' flag to compression call?
450 | 	// NIGHTMARE: Handle zero-input
451 | 	if ((state->rptr + prefix_end == slen && state->tree.prev_code != CODE_EOF)
452 | 		// This happens to be true if we're called with slen=0, but only the first time as we now flush the bits.
453 | 		|| (state->wptr == 0 && state->bitres_len > 0)) {
454 | 		lzw_output_code(state, CODE_EOF); // lzw_output_code also sets prev_code to CODE_EOF, so the first test above fails on later calls.
455 | 		lzw_flush_reservoir(state, dest, true);
456 | 	}
457 | 
458 | 	// if we didn't write anything, there shouldn't be any bits left in reservoir.
459 | 	assert(!(state->wptr == 0 && state->bitres_len > 0));
460 | 
461 | 	// printf("DEBUG: Returning %zu bytes written to caller.\n", state->wptr);
462 | 
463 | 	return state->wptr;
464 | }
465 | #endif // LZW_EDDY_IMPLEMENTATION
466 | 
467 | #ifdef __cplusplus
468 | }
469 | #endif
470 | 


--------------------------------------------------------------------------------
/package.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Create a release package for a specific OS.
 4 | # Usage: package.sh <operating-system> [architecture]   (architecture defaults to `uname -m`)
 5 | PROJECT=lzw-eddy
 6 | OS=$1
 7 | ARCH=${2:-$(uname -m)}
 8 | BP="packages/${OS}"
 9 | RP="${BP}/${PROJECT}"
10 | FILES='lzw-eddy lzw.h LICENSE'
11 | 
12 | if [ -z "${OS}" ]; then
13 | 	echo "Usage: $0 <operating-system> [architecture]"
14 | 	exit 1
15 | fi
16 | 
17 | if [[ "${OS}" == "linux" || "${OS}" == "msys2" ]]; then
18 | 	echo "Packaging for ${OS}-${ARCH}"
19 | 	# Rebuild unconditionally (-B); stop here rather than package a stale or missing binary.
20 | 	if ! OPTIMIZED=1 make -B lzw-eddy; then
21 | 		echo "Build failed, packaging aborted."
22 | 		exit 1
23 | 	fi
24 | 	mkdir -p "${RP}" || exit 1
25 | 	cp ${FILES} "${RP}" || exit 1	# FILES stays unquoted on purpose: it is a space-separated list.
26 | 	tar -C "${BP}" -czvf "packages/${PROJECT}.${OS}-${ARCH}.tar.gz" "${PROJECT}" || exit 1
27 | 	if [ "${OS}" == "msys2" ]; then
28 | 		pushd .
29 | 		cd "${RP}" && 7z -bd a "../../${PROJECT}.${OS}-${ARCH}.7z" *
30 | 		popd
31 | 	fi
32 | 	rm -r "${BP}"
33 | else
34 | 	echo "Unknown target OS: ${OS}"
35 | 	exit 2
36 | fi
37 | 
38 | echo "Done."
39 | 


--------------------------------------------------------------------------------
/run-tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | TMPFILED=$(mktemp)  # Scratch file, used as the decompressor's output.
 3 | TMPFILEC=$(mktemp)  # Scratch file, used as the compressor's output.
 4 | trap "{ rm $TMPFILED $TMPFILEC; }" EXIT  # Double quotes: the file names are expanded now, when the trap is installed.
 5 | set -e  # From here on, a failing command aborts the whole run.
 6 | 
 7 | rep() {  # Print string $2 (default "A") $1 times (default 0), without a trailing newline.
 8 | 	C=${1:-0}
 9 | 	S=${2:-A}
10 | 	if [ "$C" -gt 0 ]; then
11 | 		printf "%.0s$S" $(seq 1 $C)  # printf reuses the format per argument; %.0s prints nothing of the number, leaving just $S.
12 | 	fi
13 | }
14 | 
15 | function testfile {
16 | 	# Decompress INFILE and check its hash, then recompress with EOPT and check that hash too.
17 | 	local INFILE=$1 EOPT=$2 HASHD=$3
18 | 	# The expected compressed hash defaults to the hash of INFILE itself (bit-exact round trip).
19 | 	local HASHC=${4:-$(sha256sum -b "$INFILE" | cut -f 1 -d ' ')}
20 | 	# Run the tool inside `if !` so the message is printed; under `set -e` a separate `$?` check is never reached.
21 | 	if ! ./lzw-eddy -d "$INFILE" -o "$TMPFILED"; then
22 | 		echo "TEST FAIL. (Decompression error)"
23 | 		exit 1
24 | 	fi
25 | 	test "$(sha256sum -b "$TMPFILED")" = "$HASHD *$TMPFILED" || (echo "Test failed -- Decompressed hash mismatch." && exit 1)
26 | 	# EOPT is left unquoted on purpose: it may be empty or hold several words, such as "-m 254".
27 | 	if ! ./lzw-eddy $EOPT -c "$TMPFILED" -o "$TMPFILEC"; then
28 | 		echo "TEST FAIL. (Compression error)"
29 | 		exit 1
30 | 	fi
31 | 	test "$(sha256sum -b "$TMPFILEC")" = "$HASHC *$TMPFILEC" || (echo "Test failed. -- Compressed hash mismatch" && exit 1)
32 | }
33 | 
34 | function testcheck {
35 | 	local INFILE=$1  # Round-trip check: compress INFILE, decompress the result, and compare with the original's hash.
36 | 	local ORIG_HASH=$(sha256sum "$INFILE" | cut -f 1 -d ' ')  # Taken first: INFILE may be $TMPFILED, which is overwritten below.
37 | 	./lzw-eddy -c "$INFILE" -o "$TMPFILEC"
38 | 	./lzw-eddy -d "$TMPFILEC" -o "$TMPFILED"
39 | 	local ROUNDTRIP_HASH=$(sha256sum "$TMPFILED" | cut -f 1 -d ' ')
40 | 	test "$ORIG_HASH" = "$ROUNDTRIP_HASH" || (echo "Test failed. -- Round-trip hash mismatch" && exit 1)
41 | }
42 | 
43 | testfile tests/atsign.lzw "" c3641f8544d7c02f3580b07c0f9887f0c6a27ff5ab1d4a3e29caf197cfc299ae  # Args: input, extra compressor options, expected decompressed hash [, expected compressed hash].
44 | testfile tests/abra.txt.lzw "" 3119a48c6843ee7dcc08312e97b1d8e3b241b082996afe761f8a045d493b7cef
45 | testfile tests/zeros80000.lzw "" f8c784aa6b57396e7c5e094c34d079d8252473e46e2f60593a921dbebf941fcc
46 | testfile tests/zeros80000.lzw "-m 254" f8c784aa6b57396e7c5e094c34d079d8252473e46e2f60593a921dbebf941fcc 58e6c0321a62f7f112e61dca779edc9be0e34cf0ee356f61df949c7aea839492  # With -m 254 a separate compressed hash is expected instead of the input file's own.
47 | testcheck lzw.h
48 | rep 65536 AaA >$TMPFILED  # Generated input: "AaA" repeated 65536 times.
49 | testcheck $TMPFILED
50 | echo "All tests passed."
51 | 


--------------------------------------------------------------------------------
/test-ref.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Script that verifies that our reference files decompress and recompress to the exact original bitstream.
 4 | # These files can not be redistributed though. Usage: test-ref.sh [data-dir]   (data-dir defaults to "tests")
 5 | #
 6 | DATA=${1:-tests}
 7 | if [[ ! -f ${DATA}/levelmp1.map ]] || [[ ! -f ${DATA}/extra.puz ]]; then
 8 | 	echo "Sorry, you don't seem to have the required reference files."
 9 | 	exit 1
10 | fi
11 | make || exit 1  # Do not run the checks against a stale or missing binary when the build fails.
12 | test "$(sha256sum -b "${DATA}/levelmp1.map" | cut -f 1 -d ' ')" = "8b57ee1e373c926182e47afd3d97477c07f98ad6dde076cdf3c3f703f250d46c" || (echo "Invalid input: hash mismatch for levelmp1.map" && exit 1) || exit 1
13 | test "$(sha256sum -b "${DATA}/extra.puz" | cut -f 1 -d ' ')" = "f397e1e1b58d02ca6f469c8af0f5e50620621f267f48cb71af545f77d550607a" || (echo "Invalid input: hash mismatch for extra.puz" && exit 1) || exit 1
14 | echo "Input verified OK."
15 | ./lzw-eddy -d "${DATA}/levelmp1.map" -o .decomp1
16 | test "$(sha256sum -b .decomp1 | cut -c1-64)" = "7183a6828de608f69563bff78eec25f9c86053d0ab1e36c6f998853717292f5c" || (echo "Decompression error: hash mismatch for levelmp1.decompressed" && exit 1) || exit 1
17 | ./lzw-eddy -c .decomp1 -o .comp1 >/dev/null
18 | test "$(sha256sum -b .comp1)" = "8b57ee1e373c926182e47afd3d97477c07f98ad6dde076cdf3c3f703f250d46c *.comp1" || (echo "Compression error: hash mismatch for levelmp1.map" && exit 1) || exit 1
19 | ./lzw-eddy -d "${DATA}/extra.puz" -o .decomp1
20 | test "$(sha256sum -b .decomp1 | cut -c1-64)" = "d1ce310a2496af792c0c527b86c7e0dc17c14e8eb508d3472a894845e15b1c99" || (echo "Decompression error: hash mismatch for extra.decompressed" && exit 1) || exit 1
21 | ./lzw-eddy -c .decomp1 -o .comp1 >/dev/null
22 | test "$(sha256sum -b .comp1)" = "f397e1e1b58d02ca6f469c8af0f5e50620621f267f48cb71af545f77d550607a *.comp1" || (echo "Compression error: hash mismatch for extra.puz" && exit 1) || exit 1
23 | echo "All Good!"
24 | rm .decomp1 .comp1
25 | 


--------------------------------------------------------------------------------
/tests/AaAx64.txt:
--------------------------------------------------------------------------------
1 | AaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaAAaA


--------------------------------------------------------------------------------
/tests/abra.txt.lzw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eloj/lzw-eddy/facf92c16c83f041369a960e84dc6998ab730517/tests/abra.txt.lzw


--------------------------------------------------------------------------------
/tests/atsign.lzw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eloj/lzw-eddy/facf92c16c83f041369a960e84dc6998ab730517/tests/atsign.lzw


--------------------------------------------------------------------------------
/tests/zeros80000.lzw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eloj/lzw-eddy/facf92c16c83f041369a960e84dc6998ab730517/tests/zeros80000.lzw


--------------------------------------------------------------------------------