├── .gitignore ├── .gitmodules ├── test.h ├── Makefile ├── ungz.c ├── bench.c ├── README.md ├── assembler.c ├── test.c ├── pigz.h └── pigz.s /.gitignore: -------------------------------------------------------------------------------- 1 | *.bin 2 | *.bin.dSYM 3 | *.o 4 | pigz_s.h 5 | pigz_o.s 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "LuaJIT"] 2 | path = LuaJIT 3 | url = https://github.com/corsix/LuaJIT 4 | -------------------------------------------------------------------------------- /test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "pigz.h" 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #define PIGZ_READ_SIZE (PIGZ_WINDOW_SIZE - 260) 8 | 9 | typedef struct pigz_functions { 10 | void (*init)(pigz_state*, void*, pigz_reader); 11 | uint64_t (*available)(pigz_state*); 12 | const char* (*consume)(pigz_state*, uint64_t); 13 | uint8_t allow_bmi2; 14 | } pigz_functions; 15 | 16 | uint32_t run_all_pigz_test_cases(const pigz_functions* funcs); 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | DASM=LuaJIT/dynasm 2 | 3 | all: bench.bin ungz.bin 4 | 5 | clean: 6 | rm pigz_s.h 7 | rm pigz_o.s 8 | rm *.o 9 | rm *.bin 10 | 11 | pigz_s.h: pigz.s 12 | luajit $(DASM)/dynasm.lua -o pigz_s.h -F pigz.s 13 | 14 | test.o: test.c test.h pigz.h 15 | gcc -c -g -o test.o test.c 16 | 17 | assembler.bin: assembler.c pigz_s.h test.o 18 | gcc -o assembler.bin -g -I $(DASM) assembler.c test.o 19 | 20 | pigz.o: assembler.bin 21 | ./assembler.bin >/dev/null 22 | gcc -c -o pigz.o pigz_o.s 23 | 24 | ungz.bin: pigz.o pigz.h ungz.c 25 | gcc -o ungz.bin -O2 ungz.c pigz.o 26 | 27 | bench.bin: 
pigz.o pigz.h bench.c 28 | gcc -o bench.bin -O2 bench.c pigz.o 29 | -------------------------------------------------------------------------------- /ungz.c: -------------------------------------------------------------------------------- 1 | #include "pigz.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | static const char* read_from_stdin(void* opaque, uint64_t* len) { 8 | char* inbuf = (char*)opaque; 9 | for (;;) { 10 | ssize_t m = read(STDIN_FILENO, inbuf, 8192); 11 | if (m >= 0) { 12 | *len = (uint64_t)m; 13 | return inbuf; 14 | } else if (errno == EINTR || errno == EAGAIN) { 15 | continue; 16 | } else { 17 | fprintf(stderr, "Error %d reading from stdin\n", errno); 18 | exit(1); 19 | } 20 | } 21 | } 22 | 23 | int main() { 24 | pigz_state s; 25 | char inbuf[8192]; 26 | uint64_t n; 27 | pigz_init(&s, inbuf, read_from_stdin); 28 | while ((n = pigz_available(&s))) { 29 | const char* buf = pigz_consume(&s, n); 30 | do { 31 | ssize_t m = write(STDOUT_FILENO, buf, n); 32 | if (m <= 0) { 33 | if (errno == EINTR || errno == EAGAIN) { 34 | continue; 35 | } 36 | fprintf(stderr, "Error %d writing to stdout\n", errno); 37 | return 1; 38 | } 39 | buf += m; 40 | n -= m; 41 | } while (n); 42 | } 43 | if (s.status != PIGZ_STATUS_EOF) { 44 | fprintf(stderr, "Error %d inflating gzip stream\n", s.status); 45 | } 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /bench.c: -------------------------------------------------------------------------------- 1 | #include "pigz.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | typedef struct inputbuf { 11 | char* buf; 12 | uint32_t len; 13 | uint32_t n; 14 | } inputbuf_t; 15 | 16 | static const char* readbuf(void* opaque, uint64_t* len) { 17 | inputbuf_t* self = (inputbuf_t*)opaque; 18 | if (self->n) { 19 | --self->n; 20 | *len = self->len; 21 | return self->buf; 22 | } else { 23 | *len = 0; 24 | return NULL; 25 | } 
26 | } 27 | 28 | int main(int argc, const char** argv) { 29 | const char* filename = (argc >= 2) ? argv[1] : "bench.gz"; 30 | int fd = open(filename, O_RDONLY | O_CLOEXEC); 31 | struct stat st; 32 | inputbuf_t buf; 33 | pigz_state p; 34 | uint64_t n; 35 | if (fd < 0) { 36 | fprintf(stderr, "Cannot open input (%s): %s\n", filename, strerror(errno)); 37 | return 1; 38 | } 39 | if (fstat(fd, &st) != 0) { 40 | fprintf(stderr, "Cannot stat input (%s): %s\n", filename, strerror(errno)); 41 | return 1; 42 | } 43 | buf.buf = malloc(st.st_size); 44 | for (buf.len = 0; buf.len < st.st_size; ) { 45 | ssize_t m = read(fd, buf.buf + buf.len, st.st_size - buf.len); 46 | if (m <= 0) { 47 | if (errno == EINTR || errno == EAGAIN) { 48 | continue; 49 | } 50 | fprintf(stderr, "Cannot read input\n"); 51 | return 1; 52 | } 53 | buf.len += m; 54 | } 55 | buf.n = (argc >= 3) ? atoi(argv[2]) : 2; 56 | pigz_init(&p, &buf, readbuf); 57 | if (argc >= 4 && strcmp(argv[3], "--no-bmi2") == 0) { 58 | p.status &=~ (int8_t)1; 59 | } 60 | while ((n = pigz_available(&p))) { 61 | pigz_consume(&p, n); 62 | } 63 | if (p.status != PIGZ_STATUS_EOF) { 64 | fprintf(stderr, "Error %d inflating gzip stream\n", p.status); 65 | return 1; 66 | } 67 | return 0; 68 | } 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pigz - performant inflater for gzip files 2 | 3 | pigz is a library for decompressing (inflating) gzipped data. It is written in x86-64 assembly, and intended for use by C/C++ programs. 
It is generally faster than zlib, however unlike zlib: 4 | * pigz is not portable (only x86-64) 5 | * pigz does not do any kind of compression (only decompression) 6 | * pigz cannot operate on raw deflate streams (only gzip streams) 7 | * pigz is not API compatible with zlib 8 | 9 | ## The API 10 | 11 | The API is fully described in [pigz.h](https://github.com/corsix/pigz/blob/master/pigz.h), but the quick synopsis is: 12 | ```c 13 | typedef struct pigz_state { 14 | ... 15 | int8_t status; 16 | ... 17 | } pigz_state; 18 | 19 | typedef const char* (*pigz_reader)(void* opaque, uint64_t* len); 20 | void pigz_init(pigz_state* state, void* opaque, pigz_reader reader); 21 | 22 | uint64_t pigz_available(pigz_state* state); 23 | 24 | const char* pigz_consume(pigz_state* state, uint64_t len); 25 | 26 | // Error values for pigz_state::status 27 | #define PIGZ_STATUS_BAD_BITS -5 28 | #define PIGZ_STATUS_BAD_CRC -4 29 | #define PIGZ_STATUS_BAD_HEADER -3 30 | #define PIGZ_STATUS_UNEXPECTED_EOF -2 31 | #define PIGZ_STATUS_EOF -1 32 | ``` 33 | To begin, call `pigz_init`, passing a callback which will provide a gzip stream. Then call `pigz_available` and `pigz_consume` in a loop until `pigz_available` returns zero. Finally, check the `status` field to determine why decompression stopped. 
34 | 35 | A complete example is provided in [ungz.c](https://github.com/corsix/pigz/blob/master/ungz.c): 36 | ```c 37 | int main() { 38 | pigz_state s; 39 | char inbuf[8192]; 40 | uint64_t n; 41 | pigz_init(&s, inbuf, read_from_stdin); 42 | while ((n = pigz_available(&s))) { 43 | const char* buf = pigz_consume(&s, n); 44 | do { 45 | ssize_t m = write(STDOUT_FILENO, buf, n); 46 | if (m <= 0) { 47 | if (errno == EINTR || errno == EAGAIN) { 48 | continue; 49 | } 50 | fprintf(stderr, "Error %d writing to stdout\n", errno); 51 | return 1; 52 | } 53 | buf += m; 54 | n -= m; 55 | } while (n); 56 | } 57 | if (s.status != PIGZ_STATUS_EOF) { 58 | fprintf(stderr, "Error %d inflating gzip stream\n", s.status); 59 | } 60 | return 0; 61 | } 62 | 63 | static const char* read_from_stdin(void* opaque, uint64_t* len) { 64 | char* inbuf = (char*)opaque; 65 | for (;;) { 66 | ssize_t m = read(STDIN_FILENO, inbuf, 8192); 67 | if (m >= 0) { 68 | *len = (uint64_t)m; 69 | return inbuf; 70 | } else if (errno == EINTR || errno == EAGAIN) { 71 | continue; 72 | } else { 73 | fprintf(stderr, "Error %d reading from stdin\n", errno); 74 | exit(1); 75 | } 76 | } 77 | } 78 | ``` 79 | 80 | ## Building / using pigz 81 | 82 | The following commands will download pigz and build `pigz.o`. To use pigz in your project, include `pigz.h` and link against `pigz.o`. 83 | ``` 84 | git clone https://github.com/corsix/pigz 85 | cd pigz 86 | git submodule update --init 87 | make pigz.o 88 | ``` 89 | 90 | ## The code 91 | 92 | If reading gnarly x86-64 assembly code is your thing, look at [pigz.s](https://github.com/corsix/pigz/blob/master/pigz.s). The syntax is that of [DynASM](https://corsix.github.io/dynasm-doc/index.html). 
93 | -------------------------------------------------------------------------------- /assembler.c: -------------------------------------------------------------------------------- 1 | #include "dasm_proto.h" 2 | #include "dasm_x86.h" 3 | #include "test.h" 4 | #include "pigz_s.h" 5 | #include 6 | #ifdef _WIN32 7 | #include 8 | #else 9 | #include 10 | #if !defined(MAP_ANONYMOUS) && defined(MAP_ANON) 11 | #define MAP_ANONYMOUS MAP_ANON 12 | #endif 13 | #endif 14 | 15 | #ifdef _WIN32 16 | typedef enum fixup_fn_kind { 17 | win32_fixup_kind_export = 1, 18 | win32_fixup_kind_startproc = 2, 19 | win32_fixup_kind_endproc = 3, 20 | win32_fixup_kind_push = 0 << 8, 21 | win32_fixup_kind_save = 4 << 8, 22 | win32_fixup_kind_sub = 1 << 8, 23 | } fixup_fn_kind; 24 | #else 25 | typedef void (*fixup_fn_kind)(FILE*, void*); 26 | #endif 27 | 28 | typedef struct fixup { 29 | int lbl; 30 | int addr; 31 | fixup_fn_kind fn; 32 | void* arg; 33 | } fixup_t; 34 | 35 | static int compar_fixup(const void* lhs, const void* rhs) { 36 | const fixup_t* lhsF = (const fixup_t*)lhs; 37 | const fixup_t* rhsF = (const fixup_t*)rhs; 38 | if (lhsF->addr != rhsF->addr) { 39 | return lhsF->addr - rhsF->addr; 40 | } 41 | return lhsF->lbl - rhsF->lbl; 42 | } 43 | 44 | fixup_t fixups[100]; 45 | unsigned nfixups = 0; 46 | 47 | int alloc_fixup(fixup_fn_kind fn, void* arg) { 48 | int result = nfixups + 20; 49 | fixups[nfixups].lbl = result; 50 | fixups[nfixups].fn = fn; 51 | fixups[nfixups].arg = arg; 52 | ++nfixups; 53 | return result; 54 | } 55 | 56 | void asm_export_fixup(FILE* f, void* arg) { 57 | const char* name = (const char*)arg; 58 | #ifdef __linux__ 59 | fprintf(f, ".globl %s\n", name); 60 | fprintf(f, ".type %s, @function\n", name); 61 | fprintf(f, "%s:\n", name); 62 | #else 63 | fprintf(f, ".globl _%s\n", name); 64 | fprintf(f, "_%s:\n", name); 65 | #endif 66 | } 67 | 68 | int asm_export(const char* name) { 69 | #ifdef _WIN32 70 | return alloc_fixup(win32_fixup_kind_export, (void*)name); 71 | #else 
72 | return alloc_fixup(asm_export_fixup, (void*)name); 73 | #endif 74 | } 75 | 76 | #ifndef _WIN32 77 | void asm_cfi_fixup(FILE* f, void* arg) { 78 | const char* str = (const char*)arg; 79 | fprintf(f, ".cfi_%s\n", str); 80 | free(arg); 81 | } 82 | 83 | int asm_cfi(const char* fmt, ...) { 84 | char* buf = malloc(strlen(fmt) + 20); 85 | va_list args; 86 | va_start(args, fmt); 87 | vsprintf(buf, fmt, args); 88 | va_end(args); 89 | return alloc_fixup(asm_cfi_fixup, (void*)buf); 90 | } 91 | #endif 92 | 93 | int asm_cfi_startproc() { 94 | #ifdef _WIN32 95 | return alloc_fixup(win32_fixup_kind_startproc, 0); 96 | #else 97 | return asm_cfi("startproc"); 98 | #endif 99 | } 100 | 101 | int asm_cfi_endproc() { 102 | #ifdef _WIN32 103 | return alloc_fixup(win32_fixup_kind_endproc, 0); 104 | #else 105 | return asm_cfi("endproc"); 106 | #endif 107 | } 108 | 109 | static const uint8_t dwarf_reg_to_msvc_reg[16] = { 110 | 0, 2, 1, 3, 6, 7, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 111 | }; 112 | 113 | int asm_cfi_push(int offset, int reg) { 114 | #ifdef _WIN32 115 | (void)offset; 116 | return alloc_fixup(win32_fixup_kind_push, (void*)dwarf_reg_to_msvc_reg[reg]); 117 | #else 118 | return asm_cfi("def_cfa_offset %d\n.cfi_offset %d, -%d", offset, reg, offset); 119 | #endif 120 | } 121 | 122 | #if _WIN32 123 | int asm_cfi_save(int offset, int reg) { 124 | return alloc_fixup(win32_fixup_kind_save, (void*)(ptrdiff_t)(dwarf_reg_to_msvc_reg[reg] + offset * 2)); 125 | } 126 | #endif 127 | 128 | int asm_cfi_sub(int offset, int delta) { 129 | #if _WIN32 130 | (void)offset; 131 | return alloc_fixup(win32_fixup_kind_sub, (void*)(ptrdiff_t)(delta * 2)); 132 | #else 133 | (void)delta; 134 | return asm_cfi("def_cfa_offset %d", offset); 135 | #endif 136 | } 137 | 138 | void pigz_assemble(pigz_functions* result) { 139 | dasm_State* Dst; 140 | void* globs[glob__MAX]; 141 | size_t sz, i; 142 | #ifdef _WIN32 143 | unsigned nprocs = 0; 144 | #endif 145 | unsigned n; 146 | void* mem; 147 | const char* 
bprefix; 148 | FILE* f; 149 | dasm_init(&Dst, DASM_MAXSECTION); 150 | dasm_setupglobal(&Dst, globs, glob__MAX); 151 | dasm_setup(&Dst, actions); 152 | dasm_growpc(&Dst, 120); 153 | pigz_emit_asm(&Dst); 154 | dasm_link(&Dst, &sz); 155 | #ifdef _WIN32 156 | i = 0; 157 | for (n = 0; n < nfixups; ++n) { 158 | switch (fixups[n].fn) { 159 | case win32_fixup_kind_startproc: 160 | ++nprocs; 161 | i += 16; 162 | break; 163 | case win32_fixup_kind_endproc: 164 | i += (i & 2); 165 | break; 166 | case win32_fixup_kind_push: 167 | i += 2; 168 | break; 169 | case win32_fixup_kind_save: 170 | i += 4; 171 | break; 172 | case win32_fixup_kind_sub: 173 | i += 4; 174 | break; 175 | default: 176 | break; 177 | } 178 | } 179 | mem = VirtualAlloc(NULL, sz + i, MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE); 180 | #else 181 | mem = mmap(0, sz, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 182 | #endif 183 | dasm_encode(&Dst, mem); 184 | result->init = globs[glob_pigz_init]; 185 | result->available = globs[glob_pigz_available]; 186 | result->consume = pigz_consume; 187 | for (n = 0; n < nfixups; ++n) { 188 | fixups[n].addr = dasm_getpclabel(&Dst, fixups[n].lbl); 189 | } 190 | dasm_free(&Dst); 191 | qsort(fixups, nfixups, sizeof(fixup_t), compar_fixup); 192 | #ifdef _WIN32 193 | { 194 | uint32_t* rtfuncs = (uint32_t*)((char*)mem + sz); 195 | uint16_t* unwinds = (uint16_t*)(rtfuncs + nprocs * 3); 196 | for (n = 0; n < nfixups; ++n) { 197 | switch (fixups[n].fn) { 198 | case win32_fixup_kind_startproc: 199 | rtfuncs[0] = (uint32_t)fixups[n].addr; 200 | rtfuncs[2] = (uint32_t)((char*)unwinds - (char*)mem); 201 | unwinds += 2; 202 | break; 203 | case win32_fixup_kind_save: 204 | case win32_fixup_kind_sub: 205 | *unwinds++ = (uint16_t)((uint32_t)(size_t)fixups[n].arg / 16); 206 | // fallthrough 207 | case win32_fixup_kind_push: 208 | *unwinds++ = (uint16_t)(((uint32_t)fixups[n].addr - rtfuncs[0]) + (uint32_t)fixups[n].fn + ((uint32_t)(size_t)fixups[n].arg << 12)); 
209 | break; 210 | case win32_fixup_kind_endproc: { 211 | uint16_t *a = (uint16_t*)((char*)mem + rtfuncs[2]), *b; 212 | a[0] = (uint16_t)(1 + (unwinds[-1] << 8)); 213 | a[1] = (uint16_t)(unwinds - a - 2); 214 | b = unwinds - 1; 215 | if (a[1] & 1) { 216 | *unwinds++ = 0; 217 | } 218 | a += 2; 219 | for (; a < b; ++a, --b) { 220 | *a ^= *b; 221 | *b ^= *a; 222 | *a ^= *b; 223 | } 224 | rtfuncs[1] = (uint32_t)fixups[n].addr; 225 | rtfuncs += 3; 226 | break; } 227 | default: 228 | break; 229 | } 230 | } 231 | RtlAddFunctionTable((PRUNTIME_FUNCTION)((char*)mem + sz), nprocs, (DWORD64)mem); 232 | } 233 | #endif 234 | n = 0; 235 | f = fopen("pigz_o.s", "w"); 236 | fprintf(f, ".file \"pigz.s\"\n"); 237 | #ifdef __linux__ 238 | fprintf(f, ".section .note.GNU-stack, \"\", @progbits\n"); 239 | #endif 240 | fprintf(f, ".text\n"); 241 | fprintf(f, ".p2align 6"); 242 | bprefix = ".byte"; 243 | for (i = 0; i < sz; ++i) { 244 | if (n < nfixups && fixups[n].addr == (int)i) { 245 | fprintf(f, "\n"); 246 | do { 247 | #ifndef _WIN32 248 | fixups[n].fn(f, fixups[n].arg); 249 | #endif 250 | ++n; 251 | } while (n < nfixups && fixups[n].addr == (int)i); 252 | bprefix = ".byte"; 253 | } else if (!(i & 15)) { 254 | fprintf(f, "\n"); 255 | bprefix = ".byte"; 256 | } 257 | fprintf(f, "%s %d", bprefix, ((unsigned char*)mem)[i]); 258 | bprefix = ","; 259 | } 260 | fprintf(f, "\n.p2align 6\n"); 261 | fclose(f); 262 | } 263 | 264 | int main() { 265 | int result; 266 | pigz_functions asmf; 267 | pigz_assemble(&asmf); 268 | asmf.allow_bmi2 = 0; 269 | result = (run_all_pigz_test_cases(&asmf) != 0); 270 | if (result == 0) { 271 | pigz_state state; 272 | asmf.init(&state, 0, 0); 273 | if (state.status & 1) { 274 | asmf.allow_bmi2 = 1; 275 | result = (run_all_pigz_test_cases(&asmf) != 0); 276 | } 277 | } 278 | return result; 279 | } 280 | -------------------------------------------------------------------------------- /test.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "test.h" 5 | 6 | typedef struct verbatim_test_case { 7 | const char* name; 8 | const char* input; 9 | const char* output; 10 | uint32_t input_len; 11 | int32_t output_len; // or error code if output is NULL 12 | } verbatim_test_case_t; 13 | 14 | static verbatim_test_case_t verbatim_test_cases[] = { 15 | {"no input", NULL, "", 0, 0}, 16 | {"too short for magic", "\0", NULL, 1, PIGZ_STATUS_UNEXPECTED_EOF}, 17 | {"wrong magic", " ", NULL, 4, PIGZ_STATUS_BAD_HEADER}, 18 | {"too short for header", "\x1f\x8b\x08\x00\x00\x00\x00\x00", NULL, 8, PIGZ_STATUS_UNEXPECTED_EOF}, 19 | {"empty via litrl", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xf9\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00", "", 23, 0}, 20 | {"empty via fixed", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00", "", 20, 0}, 21 | {"empty via dnmic", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x05\xc0\x81\x00\x00\x00\x00\x00\x90\xff\x6b\x00\x00\x00\x00\x00\x00\x00\x00\x00", "", 30, 0}, 22 | {"double empty via fixed", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" 23 | "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00", "", 40, 0}, 24 | {"xyz via litrl", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x01\x03\x00\xfc\xff\x78\x79\x7a\x67\xba\x8e\xeb\x03\x00\x00\x00", "xyz", 26, 3}, 25 | {"bad litrl", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x01\x03\x00\x03\x00\x78\x79\x7a\x67\xba\x8e\xeb\x03\x00\x00\x00", NULL, 26, PIGZ_STATUS_BAD_BITS}, 26 | {"xyz via fixed", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xab\xa8\xac\x02\xfc\x67\xba\x8e\xeb\x03\x00\x00\x00", "xyz", 23, 3}, 27 | {"xyz bad crc", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xab\xa8\xac\x02\xfc\x67\xba\x8e\xea\x03\x00\x00\x00", NULL, 23, PIGZ_STATUS_BAD_CRC}, 28 | {"xyz bad length", 
"\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xab\xa8\xac\x02\xfc\x67\xba\x8e\xeb\x04\x00\x00\x00", NULL, 23, PIGZ_STATUS_BAD_BITS}, 29 | {"xyz via dnmic", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x05\x83\x81\x00\x00\x00\x00\x40\xb6\x51\x0f\xb0\x01\x67\xba\x8e\xeb\x03\x00\x00\x00", "xyz", 31, 3}, 30 | {"xyz three ways", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x01\x03\x00\xfc\xff\x78\x79\x7a\x67\xba\x8e\xeb\x03\x00\x00\x00" 31 | "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xab\xa8\xac\x02\xfc\x67\xba\x8e\xeb\x03\x00\x00\x00" 32 | "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x05\x83\x81\x00\x00\x00\x00\x40\xb6\x51\x0f\xb0\x01\x67\xba\x8e\xeb\x03\x00\x00\x00", "xyzxyzxyz", 80, 9}, 33 | {"incomplete dist code set", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x05\x82\x81\x00\x00\x00\x00\x40\xb6\x51\x0f\x00\x00\x00\x00\x00", NULL, 26, PIGZ_STATUS_BAD_BITS}, 34 | {"incomplete lit code set", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x05\x83\x81\x00\x00\x00\x00\x40\xba\xa9\x07\x00\x00\x00\x00", NULL, 25, PIGZ_STATUS_BAD_BITS}, 35 | {"overfull lit code set", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x05\xc0\x81\x00\x00\x00\x00\x00\x90\x6d\xd4\x03\x00\x00\x00\x00", NULL, 26, PIGZ_STATUS_BAD_BITS}, 36 | {"no EOB in lit code set", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x05\xc0\x81\x00\x00\x00\x00\x00\x90\x6d\xf2\x01\x00\x00\x00\x00", NULL, 26, PIGZ_STATUS_BAD_BITS}, 37 | {"hlit too large", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xfd\x00\x00\x00\x00\x00", NULL, 16, PIGZ_STATUS_BAD_BITS}, 38 | {"hdist too large", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\x05\x1f\x00\x00\x00\x00", NULL, 16, PIGZ_STATUS_BAD_BITS}, 39 | {"incomplete code length set", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xf5\xfe\xff\xff\xff\xff\xff\xff\xff\x03\x00\x00\x00\x00", NULL, 24, PIGZ_STATUS_BAD_BITS}, 40 | {"overfull code length set", "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xf5\xfe\x49\x92\x24\x49\x92\x24\x49\x02\x00\x00\x00\x00", NULL, 24, PIGZ_STATUS_BAD_BITS}, 41 | {"small text file", 
/* State for feeding a fixed input string to the decoder in equal-sized
 * chunks (the last chunk may be shorter). */
typedef struct verbatim_reader {
  const char* input;    /* next unread input byte */
  uint32_t input_len;   /* unread bytes remaining */
  uint32_t chunk_size;  /* maximum bytes handed out per call */
  char buf[4];          /* private copy used for chunks of at most 4 bytes */
} verbatim_reader_t;

/**
 * pigz_reader callback that hands out the next chunk of a verbatim_reader_t.
 *
 * Chunks of up to four bytes are copied into the reader's own `buf` and that
 * copy is returned; longer chunks are returned as a pointer straight into
 * the input. NOTE(review): the copy presumably ensures a tiny chunk is not
 * followed in memory by the rest of the input - confirm against pigz.s.
 *
 * @param opaque   The verbatim_reader_t.
 * @param[out] len Receives the chunk length; zero once the input is used up.
 * @return The chunk; when the length is zero, `opaque` itself is returned.
 */
static const char* verbatim_reader_fn(void* opaque, uint64_t* len) {
  verbatim_reader_t* rd = (verbatim_reader_t*)opaque;
  const char* chunk = rd->input;
  uint32_t take = rd->chunk_size;
  if (rd->input_len < take) {
    take = rd->input_len;
  }
  *len = take;
  if (take == 0) {
    return opaque;
  }
  rd->input += take;
  rd->input_len -= take;
  if (take > 4) {
    return chunk;
  }
  memcpy(rd->buf, chunk, take);
  return rd->buf;
}
/**
 * Reverse the order of the low `n` bits of `x`.
 *
 * Bit 0 of `x` becomes bit n-1 of the result, bit 1 becomes bit n-2, and so
 * on; bits of `x` at position n and above are ignored. With n == 0 the
 * result is 0.
 */
static uint32_t revbits(uint32_t x, uint32_t n) {
  uint32_t reversed = 0;
  uint32_t i;
  for (i = 0; i != n; ++i, x >>= 1) {
    /* After the shift the low bit is clear, so adding is the same as OR-ing. */
    reversed = (reversed << 1) + (x & 1u);
  }
  return reversed;
}
default: *len = 1; return ""; 171 | } 172 | } 173 | 174 | static uint32_t run_one_byte_test_cases(const pigz_functions* funcs) { 175 | uint32_t nfail = 0; 176 | pigz_state state; 177 | one_byte_reader_t reader; 178 | const char* result; 179 | int n, k; 180 | for (n = 0; n < 256; n++) { 181 | int32_t c = n; 182 | for (k = 0; k < 8; k++) { 183 | c = (0xedb88320L & -(c & 1)) ^ (int32_t)((uint32_t)c >> 1); 184 | } 185 | reader.crc = c ^ 0xff000000; 186 | reader.byteval = (char)(n ^ 255); 187 | reader.state = 0; 188 | printf("one_byte_lit-%d: ", n); 189 | call_init(funcs, &state, &reader, one_byte_reader_literal); 190 | result = funcs->available(&state) ? funcs->consume(&state, 1) : NULL; 191 | if (!result || *result != reader.byteval || funcs->available(&state) || state.status != PIGZ_STATUS_EOF) { 192 | printf("FAIL\n"); 193 | ++nfail; 194 | } else { 195 | printf("PASS\n"); 196 | } 197 | reader.crc = c ^ 0xff000000; 198 | reader.byteval = (char)(n ^ 255); 199 | reader.state = 0; 200 | printf("one_byte_static-%d: ", n); 201 | call_init(funcs, &state, &reader, one_byte_reader_static); 202 | result = funcs->available(&state) ? 
/* State for a reader that synthesises a gzip stream out of repeated
 * five-byte-header + payload records. */
typedef struct fixed_size_record_reader {
  uint32_t length_produced;  /* payload bytes handed out so far; also sent as the last 4 bytes */
  char initial[5 + 21];      /* first record: 5 header bytes, then payload */
  char record[5 + 21];       /* every later record, same layout */
  uint8_t state;             /* position in the state machine below */
  uint32_t crc;              /* running CRC of all payload bytes handed out */
  uint32_t crc_table[256];   /* byte-at-a-time CRC lookup table, filled by the caller */
} fixed_size_record_reader_t;

/**
 * pigz_reader callback producing, over successive calls:
 *   state 0: the 10-byte gzip header;
 *   state 1: `initial` once, then `record` repeatedly, until more than
 *            130000 payload bytes have been handed out;
 *   state 2: the five bytes 01 00 00 ff ff;
 *   state 3: the 4 bytes of the running CRC;
 *   state 4: the 4 bytes of the payload length;
 *   after that: end of input (length 0, NULL).
 *
 * The CRC and length are returned as the in-memory bytes of the fields, so
 * the stream is only well-formed on a little-endian host.
 *
 * Each record's payload length is the 16-bit value stored at byte offset 1
 * of the record. It is read with memcpy because that offset is not suitably
 * aligned for a uint16_t access through a cast pointer.
 *
 * @param opaque   The fixed_size_record_reader_t.
 * @param[out] len Receives the length of the returned chunk.
 * @return The chunk, or NULL at end of input.
 */
static const char* fixed_size_record_reader_fn(void* opaque, uint64_t* len) {
  fixed_size_record_reader_t* self = (fixed_size_record_reader_t*)opaque;
  if (self->state == 0) {
    *len = 10;
    self->state = 1;
    return "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00";
  }
  if (self->state == 1) {
    if (self->length_produced > 130000) {
      /* Enough payload: fall through to the state 2 handler below. */
      self->state = 2;
    } else {
      const char* result = self->length_produced ? self->record : self->initial;
      uint32_t crc = ~self->crc;
      uint16_t payload_len;
      uint32_t i;
      memcpy(&payload_len, result + 1, sizeof(payload_len));
      *len = 5 + payload_len;
      /* Fold the payload (not the 5 header bytes) into the running CRC. */
      for (i = 5; i < *len; ++i) {
        crc = self->crc_table[(crc ^ result[i]) & 0xff] ^ (crc >> 8);
      }
      self->crc = ~crc;
      self->length_produced += payload_len;
      return result;
    }
  }
  if (self->state == 2) {
    *len = 5;
    self->state = 3;
    return "\x01\x00\x00\xff\xff";
  }
  if (self->state == 3) {
    *len = 4;
    self->state = 4;
    return (const char*)&self->crc;
  }
  if (self->state == 4) {
    *len = 4;
    self->state = 5;
    return (const char*)&self->length_produced;
  }
  *len = 0;
  return NULL;
}
funcs->consume(state, avail), avail); 272 | buf += avail; 273 | len -= avail; 274 | } while (avail); 275 | return len; 276 | } 277 | 278 | static uint32_t run_fixed_size_record_test_cases(const pigz_functions* funcs) { 279 | uint32_t i, j, k; 280 | uint32_t nfail = 0; 281 | pigz_state state; 282 | fixed_size_record_reader_t reader; 283 | const char* result; 284 | char buf[21]; 285 | for (i = 0; i < 256; i++) { 286 | int32_t c = (int32_t)i; 287 | for (j = 0; j < 8; j++) { 288 | c = (0xedb88320L & -(c & 1)) ^ (int32_t)((uint32_t)c >> 1); 289 | } 290 | reader.crc_table[i] = (uint32_t)c; 291 | } 292 | reader.initial[0] = 0; 293 | reader.record[0] = 0; 294 | for (i = 1; i <= 21; ++i) { 295 | *(uint16_t*)(reader.initial + 1) = i; 296 | *(uint16_t*)(reader.initial + 3) = ~i; 297 | for (j = 1; j <= 21; ++j) { 298 | reader.length_produced = 0; 299 | reader.state = 0; 300 | reader.crc = 0; 301 | *(uint16_t*)(reader.record + 1) = j; 302 | *(uint16_t*)(reader.record + 3) = ~j; 303 | for (k = 0; k < i || k < j; ++k) { 304 | reader.initial[5 + k] = ~k; 305 | reader.record[5 + k] = 1 + k; 306 | } 307 | printf("fixed_size_record-%d-%d: ", (int)i, (int)j); 308 | call_init(funcs, &state, &reader, fixed_size_record_reader_fn); 309 | result = funcs->available(&state) >= i ? funcs->consume(&state, i) : NULL; 310 | if (!result || memcmp(result, reader.initial + 5, i)) { 311 | printf("FAIL\n"); 312 | ++nfail; 313 | goto next_test; 314 | } 315 | for (;;) { 316 | memset(buf, 'a', sizeof(buf)); 317 | result = readinto(&state, funcs, buf, j) ? 
NULL : buf; 318 | if (result == NULL && state.status == PIGZ_STATUS_EOF && reader.length_produced > 130000) { 319 | printf("PASS\n"); 320 | goto next_test; 321 | } 322 | if (!result || memcmp(result, reader.record + 5, j)) { 323 | printf("FAIL\n"); 324 | ++nfail; 325 | goto next_test; 326 | } 327 | } 328 | next_test:; 329 | } 330 | } 331 | return nfail; 332 | } 333 | 334 | uint32_t run_all_pigz_test_cases(const pigz_functions* funcs) { 335 | return run_verbatim_test_cases(funcs) 336 | + run_one_byte_test_cases(funcs) 337 | + run_fixed_size_record_test_cases(funcs) 338 | ; 339 | } 340 | -------------------------------------------------------------------------------- /pigz.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | /** 8 | * @brief Callback function used to provide chunks of gzipped input. 9 | * 10 | * Whenever @c pigz_available needs to fetch some more gzipped input in order 11 | * to make more uncompressed bytes available, it will call this callback. If 12 | * the end of input has been reached, zero should be stored to *len. 13 | * Otherwise, if the end of input has not been reached, at least one byte of 14 | * gzipped input must be provided. Once the callback has indicated that the 15 | * end of input has been reached for a given state, the callback will not be 16 | * called again (even if @c pigz_available is called again). 17 | * 18 | * @param opaque An opaque value, which is provided to @c pigz_init, and 19 | * is passed through verbatim to the callback. 20 | * @param[out] len A pointer into which the callback should store the number 21 | * of bytes of gzipped input which it has made available. If 22 | * the end of input has been reached, zero should be stored. 23 | * @return A pointer to the start of the chunk of input. 
This pointer (and the 24 | * *len bytes it refers to) must remain valid until the next 25 | * time the callback is called for the same state. If the end of input 26 | * has been reached, any value can be returned. 27 | */ 28 | typedef const char* (*pigz_reader)(void* opaque, uint64_t* len); 29 | 30 | /** 31 | * @brief The size of the circular buffer of uncompressed bytes in a @c pigz_state. 32 | * 33 | * This value is provided for documentation only. It cannot be changed. 34 | */ 35 | #define PIGZ_WINDOW_SIZE 65536 36 | 37 | /** 38 | * Value for @c pigz_state::status indicating that the input bitstream was 39 | * invalid. 40 | * 41 | * This value can arise from many different problems with the input, for 42 | * example:
    43 | *
  • A block with a @c BTYPE value of 11.
  • 44 | *
  • An uncompressed block whose @c LEN and @c NLEN fields are not 45 | * inverses of each other.
  • 46 | *
  • A gzip footer whose @c ISIZE field mismatches the number of uncompressed 47 | * bytes produced from the gzip file.
  • 48 | *
  • A backref with distance greater than the number of uncompressed bytes 49 | * produced so far.
  • 50 | *
  • A dynamic Huffman table having too few entries (or, equivalently, having 51 | * scope to reduce the lengths of some entries in the table).
  • 52 | *
  • A dynamic Huffman table having too many entries (for example, having 53 | * more than pow(2, N) entries of length @c N).
  • 54 | *
55 | */ 56 | #define PIGZ_STATUS_BAD_BITS -5 57 | 58 | /** 59 | * Value for @c pigz_state::status indicating that either a gzip header CRC 60 | * was incorrect, or a gzip footer CRC was incorrect. 61 | */ 62 | #define PIGZ_STATUS_BAD_CRC -4 63 | 64 | /** 65 | * Value for @c pigz_state::status indicating that a gzip file did not start 66 | * with the three-byte sequence 0x1F 0x8B 0x08, or a reserved bit was set in 67 | * the gzip header's @c FLAGS field. 68 | */ 69 | #define PIGZ_STATUS_BAD_HEADER -3 70 | 71 | /** 72 | * Value for @c pigz_state::status indicating that the end of input was reached, 73 | * but that this did not happen at a gzip file boundary (i.e. the input was - at 74 | * best - a truncated gzip file, rather than a complete gzip file). 75 | */ 76 | #define PIGZ_STATUS_UNEXPECTED_EOF -2 77 | 78 | /** 79 | * Value for @c pigz_state::status indicating that the end of input was reached, 80 | * and that this coincided with the end of a gzip file (i.e. the input was a 81 | * valid gzip file). 82 | * 83 | * This is the only negative value for @c pigz_state::status which doesn't 84 | * indicate a problem with the input. 85 | */ 86 | #define PIGZ_STATUS_EOF -1 87 | 88 | /** 89 | * @brief Structure containing all the state required for uncompressing a gzip stream. 90 | * 91 | * All fields should be considered opaque, with the exception of @c status, which 92 | * can be inspected at any time to determine whether the end of uncompressed data 93 | * has been reached or whether an error has occurred (see @c pigz_available). Other 94 | * fields are documented merely for the purpose of understanding the library's 95 | * implementation - users of the library should neither read nor write these fields. 96 | * 97 | * @c pigz_init should be used to initialise instances of this structure. 98 | */ 99 | typedef struct pigz_state { 100 | /** 101 | * A pointer to four bytes before the end of the current input chunk.
102 | */ 103 | const char* inend; 104 | 105 | /** 106 | * A pointer to the next byte of the current input chunk. Conceptually, 107 | * a byte of input will be read from this pointer, those eight bits will be 108 | * absorbed into @c bits, and this pointer will be advanced by one (in 109 | * practice, this is typically done four bytes at a time, rather than one 110 | * byte at a time). 111 | */ 112 | const char* input; 113 | 114 | /** 115 | * The total number of uncompressed bytes which @c pigz_consume has consumed 116 | * from the current gzip file. This value will not exceed @c writepos. 117 | */ 118 | uint64_t readpos; 119 | 120 | /** 121 | * The total number of uncompressed bytes which @c pigz_available has made 122 | * available (i.e. has uncompressed) from the current gzip file. If this is 123 | * equal to @c readpos when @c pigz_available is called, more input will get 124 | * uncompressed. This value will not exceed readpos + PIGZ_WINDOW_SIZE - 1. 125 | */ 126 | uint64_t writepos; 127 | 128 | /** 129 | * An opaque value which will be passed to @c reader. 130 | */ 131 | void* opaque; 132 | 133 | /** 134 | * Callback function which will be used to obtain the next chunk of input. 135 | * 136 | * @see pigz_reader 137 | */ 138 | pigz_reader reader; 139 | 140 | /** 141 | * To users of the library, an extra return value from @c pigz_available: if 142 | * @c pigz_available returns a non-zero value, then this field will be 143 | * non-negative. If @c pigz_available returns zero, then this field will be 144 | * negative, and will indicate why @c pigz_available returned zero (by being 145 | * one of @c PIGZ_STATUS_BAD_BITS, @c PIGZ_STATUS_BAD_CRC, 146 | * @c PIGZ_STATUS_BAD_HEADER, @c PIGZ_STATUS_UNEXPECTED_EOF, or 147 | * @c PIGZ_STATUS_EOF). 148 | * 149 | * Internal to the library, there are three kinds of non-negative value in 150 | * this field:
    151 | *
  • Values in the range 0 through 11 are used to denote the current block 152 | * type and whether the current block is the final block (the low bit is 153 | * set if the CPU supports BMI2, the next bit is @c BFINAL, the next two 154 | * bits are @c BTYPE).
  • 155 | *
  • The values 12 and 13 denote being at a gzip file boundary - the 156 | * @c CRC32 and @c ISIZE fields of the previous gzip file have been 157 | * verified, but the header of the next gzip file has yet to be looked 158 | * at. The value 13 is used if (and only if) the CPU supports BMI2.
  • 159 | *
  • Values in the range 64 through 127 denote that an error has occurred, 160 | * but some uncompressed bytes have yet to be consumed, so said error 161 | * has not yet been revealed. Once the bytes have been consumed, it'll 162 | * be revealed by toggling the sign bit.
  • 163 | *
164 | */ 165 | int8_t status; 166 | 167 | /** 168 | * The number of bits which have been read from the input, but not yet 169 | * been used. Always in the range 0 through 63. 170 | */ 171 | uint8_t nbits; 172 | 173 | union { 174 | /** 175 | * When uncompressing an uncompressed-block, the number of literal bytes 176 | * remaining in the block. 177 | * 178 | * This field is valid when @c status is in the range 0 through 3 (i.e. 179 | * @c BTYPE is 00). 180 | */ 181 | uint16_t litlen; 182 | struct { 183 | /** 184 | * When uncompressing a compressed block, log2 of the size of the 185 | * first-level table in @c litcodes. Always in the range 1 through 9. 186 | * 187 | * This field is valid when @c status is in the range 4 through 11 (i.e. 188 | * @c BTYPE is 01 or 10). 189 | */ 190 | uint8_t litbits; 191 | 192 | /** 193 | * When uncompressing a compressed block, log2 of the size of the 194 | * first-level table in @c distcodes. Always in the range 0 through 6. 195 | * 196 | * This field is valid when @c status is in the range 4 through 11 (i.e. 197 | * @c BTYPE is 01 or 10). 198 | */ 199 | uint8_t distbits; 200 | }; 201 | }; 202 | 203 | /** 204 | * The CRC32 (without post-conditioning) of bytes [0, writepos) of 205 | * the uncompressed bytes from the current gzip file. 206 | */ 207 | uint32_t crc; 208 | 209 | /** 210 | * Between 0 and 63 bits which have been read from the input, but not yet 211 | * been used. Starting at the LSB, bits 0 through nbits - 1 are 212 | * valid. Bits nbits through 63 are zero. 213 | */ 214 | uint64_t bits; 215 | 216 | /** 217 | * @brief Circular buffer of uncompressed bytes from the current gzip file. 218 | * 219 | * The next uncompressed byte to be consumed by @c pigz_consume is at index 220 | * readpos % PIGZ_WINDOW_SIZE. The next byte to be overwritten by 221 | * more uncompressed data is at writepos % PIGZ_WINDOW_SIZE.
222 | */ 223 | char window[PIGZ_WINDOW_SIZE]; 224 | 225 | /** 226 | * @brief Lookup tables for interpreting Huffman-encoded distance codes. 227 | * 228 | * Each entry is of the form (VAL << 16) | (NBITS << 8) | (KIND << 6) | NXBITS, 229 | * where @c NXBITS is a 6-bit value, @c KIND is a 2-bit value, @c NBITS is 230 | * an 8-bit value, and @c VAL is a 16-bit value. @c NBITS gives the number 231 | * of bits which should be dropped from the input bitstream. @c NXBITS gives 232 | * the number of bits which should be consumed from the input bitstream, 233 | * treated as an NXBITS-bit integer, and added to @c VAL. The meaning of 234 | * @c VAL then depends on @c KIND:
    235 | *
  • If @c KIND is 0, @c VAL gives a distance value.
  • 236 | *
  • If @c KIND is 1 or 2, the bitstream is bad.
  • 237 | *
  • If @c KIND is 3, @c VAL gives the index of another entry in @c distcodes. 238 | * In this case, the @c NXBITS bits are peeked rather than consumed.
  • 239 | *
240 | * 241 | * The first pow(2, distbits) entries are called the first-level 242 | * table. This first-level table is followed by zero or more second-level 243 | * tables. Second-level tables are only used if there are distance codes which 244 | * are encoded using more than six bits; the leading six bits will lead to an 245 | * entry in the first-level table with a @c KIND of 3 and a @c VAL (before 246 | * adding in @c NXBITS bits) which gives the offset of the start of a 247 | * second-level table, the size of which is pow(2, NXBITS) (hence 248 | * once the @c NXBITS bits are added to @c VAL, the result is the offset 249 | * of a particular element in the referenced second-level table). 250 | */ 251 | uint32_t distcodes[592]; 252 | 253 | /** 254 | * @brief Lookup tables for interpreting Huffman-encoded literal/length codes. 255 | * 256 | * Like @c distcodes, but with different meanings for @c KIND:
    257 | *
  • If @c KIND is 0, @c VAL gives a length value (which is then followed 258 | * by a Huffman-encoded distance code).
  • 259 | *
  • If @c KIND is 1, @c VAL gives a literal value.
  • 260 | *
  • If @c KIND is 2 and @c VAL is zero, end-of-block has been reached.
  • 261 | *
  • If @c KIND is 2 and @c VAL is non-zero, the bitstream is bad.
  • 262 | *
  • If @c KIND is 3, @c VAL gives the index of another entry in @c litcodes. 263 | * In this case, the @c NXBITS bits are peeked rather than consumed.
  • 264 | *
265 | * 266 | * If @c KIND is 1 or 2, @c NXBITS is always zero. The first pow(2, litbits) 267 | * entries are called the first-level table. As for @c distcodes, second-level 268 | * tables might follow, albeit with the cutoff at nine bits rather than six bits. 269 | */ 270 | uint32_t litcodes[852]; 271 | } pigz_state; 272 | 273 | /** 274 | * @brief Initialise (or re-initialise) a pigz_state structure. 275 | * 276 | * Following initialisation, the @c status field of @p state will be 277 | * non-negative. Initialisation does not call the @p reader function - 278 | * that will happen on the next call to @c pigz_available. Initialisation 279 | * does not allocate any resources, and cannot fail. There is no corresponding 280 | * de-initialisation function, as such a function is not required. 281 | * 282 | * If the CPU supports BMI2, pigz will use it. To prevent use of BMI2, clear 283 | * the low bit of the @c status field after calling @c pigz_init. 284 | * 285 | * @param state The state to initialise. 286 | * @param opaque An opaque value which will later be passed to @p reader. 287 | * @param reader The function which will be called by @c pigz_available to get 288 | * chunks of input (i.e. chunks of gzip file). 289 | */ 290 | void pigz_init(pigz_state* state, void* opaque, pigz_reader reader); 291 | 292 | /** 293 | * @brief Determine how many uncompressed bytes are available to be consumed. 294 | * 295 | * Following a call to @c pigz_available which returns @c N, it is permitted to 296 | * call @c pigz_consume passing @c N, or call @c pigz_consume multiple times 297 | * with values which sum to @c N. If bytes are available, @c pigz_available 298 | * will return a non-zero value, and the @c status field of @p state will 299 | * be set to some (unrelated) non-negative value. Otherwise, if the end of 300 | * input is reached, @c pigz_available will return zero, and the @c status 301 | * field of @p state will be set to @c PIGZ_STATUS_EOF (a negative value).
302 | * Otherwise, if an error is detected in the input stream, @c pigz_available 303 | * will return zero, and the @c status field of @p state will be set to some 304 | * other negative value describing the error (@c PIGZ_STATUS_UNEXPECTED_EOF, 305 | * @c PIGZ_STATUS_BAD_HEADER, @c PIGZ_STATUS_BAD_CRC, or 306 | * @c PIGZ_STATUS_BAD_BITS). 307 | * 308 | * If @c pigz_available returns zero for a given state, then subsequent calls 309 | * for that state will also return zero (i.e. once EOF is reached, more data 310 | * cannot be provided, and once an error occurs, it cannot be cleared). 311 | * 312 | * @c pigz_available will only return zero in case of EOF or error. It may 313 | * call the @c reader function previously provided to @c pigz_init in order 314 | * to fetch more compressed data, and then decompress that data in order to 315 | * make available some more uncompressed bytes. 316 | * 317 | * @param state A state previously initialised by @c pigz_init. 318 | * @return The maximum value which can be passed to @c pigz_consume for 319 | * @p state. Zero in case of EOF or error. 320 | */ 321 | uint64_t pigz_available(pigz_state* state); 322 | 323 | /** 324 | * @brief Read some number of uncompressed bytes. 325 | * 326 | * @param len The number of bytes to read. Must be less than or equal to the 327 | * result of @c pigz_available for @p state. 328 | * @return A pointer to the start of the uncompressed bytes (if say @c ptr is 329 | * returned, then the uncompressed bytes are the half-open range 330 | * [ptr, ptr + len)). This pointer remains valid until the 331 | * next call to @c pigz_available for @p state (after such a call, the 332 | * contents of the range might be replaced with other data). 
333 | */ 334 | static inline const char* pigz_consume(pigz_state* state, uint64_t len) { 335 | /* PIGZ_WINDOW_SIZE is a power of two, so masking readpos yields its index within the circular window. */ char* result = state->window + (state->readpos & (PIGZ_WINDOW_SIZE - 1)); 336 | state->readpos += len; 337 | return result; 338 | } 339 | 340 | #ifdef __cplusplus 341 | } 342 | #endif 343 | -------------------------------------------------------------------------------- /pigz.s: -------------------------------------------------------------------------------- 1 | |.arch x64 2 | |.section code 3 | |.globals glob_ 4 | |.actionlist actions 5 | |.define CRC 6 | 7 | // Different calling conventions between Win/x64 and POSIX/x64. 8 | |.if WIN 9 | |.define Rarg1, rcx 10 | |.define Rarg2, rdx 11 | |.define Rarg3, r8 12 | |.else 13 | |.define Rarg1, rdi 14 | |.define Rarg2, rsi 15 | |.define Rarg3, rdx 16 | |.endif 17 | 18 | // Register assignments for (the majority of) pigz_available 19 | // The first letter indicates bit width: R=64, E=32, W=16, B=8 20 | // Other registers are used transiently: rax, rbx, rcx, rdx, xmm0-3 21 | |.define Rstate, rbp 22 | |.define Elitmask, r12d // state->litbits if BMI2 else ((1 << state->litbits) - 1) 23 | |.define Edistmask, r13d // state->distbits if BMI2 else ((1 << state->distbits) - 1) 24 | |.define Rbits, r8 // state->bits (only the low Enbits bits are valid) 25 | |.define Ebits, r8d 26 | |.define Wbits, r8w 27 | |.define Bbits, r8b 28 | |.define Rnbits, rsi 29 | |.define Enbits, esi // state->nbits 30 | |.define Bnbits, r6b 31 | |.define Rinput, r9 // state->input 32 | |.define Rinend, r10 // state->inend 33 | |.define Einend, r10d 34 | |.define Binend, r10b 35 | |.define Rwritepos, r14 // state->writepos 36 | |.define Ewritepos, r14d 37 | |.define Wwritepos, r14w 38 | |.define Ecrc, edi // state->crc 39 | |.define Wcrc, di 40 | |.define Bcrc, r7b 41 | |.define Rwritegoal, r15 // state->readpos + PIGZ_READ_SIZE 42 | |.define Ewritegoal, r15d 43 | |.define Rcrc_table, r11 // &pigz_crc_table 44 | 45 | // Ensure that at least 32 bits of input are
available in Rbits 46 | |.macro needbits 47 | | // Pre-conditions: Usual pigz_available stack frame and register assignments 48 | | // Clobbers: rax, rcx, rdx, xmm0-15 49 | | cmp Enbits, 32 50 | | jae >9 51 | | lea rax, [>9] 52 | | mov ecx, Enbits 53 | | cmp Rinend, Rinput 54 | | jb ->need_reader_bits 55 | | // The length of the instructions between here and "9:" must match the N in 56 | | // the "sub rax, N" done by need_reader_bits (currently N=16). 57 | | mov eax, [Rinput] 58 | | add Rinput, 4 59 | | shl rax, cl 60 | | add Enbits, 32 61 | | or Rbits, rax 62 | |9: 63 | | // Post-condition: 32 <= Enbits < 64 64 | | // If this condition cannot be satisfied, pigz_available will return, and 65 | | // PIGZ_STATUS_EOF or PIGZ_STATUS_UNEXPECTED_EOF will be reported. 66 | |.endmacro 67 | 68 | // Stack frame layout for pigz_available 69 | typedef struct Stack { 70 | |.if WIN 71 | void* homespace[4]; 72 | |.endif 73 | uint16_t nper_lit[16]; 74 | uint16_t nper_dist[16]; 75 | void* need_reader_bits_continuation; 76 | uint32_t sorted_dist[30 + 1]; 77 | uint32_t pad[1]; 78 | } Stack; 79 | 80 | // Alternative fields for a pigz_state when its distcodes field is not being used.
81 | typedef struct AltState { 82 | char reserved[offsetof(pigz_state, distcodes) + 8]; 83 | uint16_t offset_lit[16]; 84 | uint8_t offset_dist[16]; 85 | uint8_t codelengths[286 + 30]; 86 | uint32_t sorted_lit[286 + 1]; 87 | } AltState; 88 | 89 | // DynASM type mappings 90 | |.type Stack, Stack, rsp 91 | |.type State, pigz_state, Rstate 92 | |.type AltState, AltState, Rstate 93 | 94 | #define bad_bits(n) (n) 95 | #define OFFSET_lit_ops (256*4*4) 96 | #define OFFSET_dist_ops (OFFSET_lit_ops+32*4) 97 | 98 | int asm_export(const char* name); 99 | int asm_cfi(const char* fmt, ...); 100 | int asm_cfi_startproc(); 101 | int asm_cfi_endproc(); 102 | int asm_cfi_push(int offset, int reg); 103 | int asm_cfi_save(int offset, int reg); 104 | int asm_cfi_sub(int offset, int delta); 105 | 106 | void pigz_emit_asm(Dst_DECL) { 107 | int i; 108 | int cfa = 8; 109 | 110 | |.code 111 | |.align 16 112 | |=>asm_export("pigz_available"): 113 | |->pigz_available: 114 | |=>asm_cfi_startproc(): 115 | // Pre-conditions: Rarg1=state, (rsp & 15) == 8 116 | // Fast-path: Do not allocate a stack frame until we know that we need it 117 | | mov r9, State:Rarg1->writepos 118 | | mov rdx, State:Rarg1->readpos 119 | | cmp dx, r9w 120 | | movzx eax, r9w 121 | | mov r8d, PIGZ_WINDOW_SIZE 122 | | movzx edx, dx 123 | | cmova eax, r8d 124 | | sub eax, edx 125 | | jz >1 126 | | ret 127 | |1: 128 | | test byte State:Rarg1->status, 0xC0 129 | | jz >1 130 | | or byte State:Rarg1->status, 0x80 131 | | ret 132 | |1: 133 | // End of fast-path: time to allocate the stack frame 134 | // Known: state->readpos == state->writepos (i.e. time to uncompress some more input) 135 | // Known: 0 <= state->status < 64 (i.e. 
no error has occurred) 136 | // Pre-conditions: r9=state->writepos 137 | | push rbx 138 | |=>asm_cfi_push(cfa += 8, 3): 139 | | push rbp 140 | |=>asm_cfi_push(cfa += 8, 6): 141 | | mov Rstate, Rarg1 142 | | push r12 143 | |=>asm_cfi_push(cfa += 8, 12): 144 | | push r13 145 | |=>asm_cfi_push(cfa += 8, 13): 146 | | mov Rinend, State:Rarg1->inend 147 | | push r14 148 | |=>asm_cfi_push(cfa += 8, 14): 149 | | mov Rwritepos, r9 150 | | mov Rinput, State:Rarg1->input 151 | | push r15 152 | |=>asm_cfi_push(cfa += 8, 15): 153 | | sub rsp, sizeof(Stack) 154 | |=>asm_cfi_sub(cfa += sizeof(Stack), sizeof(Stack)): 155 | |.if WIN 156 | | mov [rsp+cfa], rsi 157 | |=>asm_cfi_save(cfa, 4): 158 | | mov [rsp+cfa+8], rdi 159 | |=>asm_cfi_save(cfa + 8, 5): 160 | |.endif 161 | | mov Rbits, State->bits 162 | | movzx Enbits, byte State->nbits 163 | | mov Ecrc, State->crc 164 | | jmp ->load_Rcrc_table_status_dispatch 165 | 166 | |->return_from_available: 167 | // Pre-conditions: Usual pigz_available stack frame and register assignments 168 | | mov State->nbits, Bnbits 169 | | add rsp, sizeof(Stack) 170 | cfa -= sizeof(Stack); 171 | |.if not WIN 172 | |=>asm_cfi("remember_state"): 173 | |=>asm_cfi("def_cfa_offset %d", cfa): 174 | |.endif 175 | | mov State->bits, Rbits 176 | | pop r15 177 | cfa -= 8; 178 | |.if not WIN 179 | |=>asm_cfi("def_cfa_offset %d", cfa): 180 | |.endif 181 | | mov State->writepos, Rwritepos 182 | | pop r14 183 | cfa -= 8; 184 | |.if not WIN 185 | |=>asm_cfi("def_cfa_offset %d", cfa): 186 | |.endif 187 | | mov State->crc, Ecrc 188 | | pop r13 189 | cfa -= 8; 190 | |.if not WIN 191 | |=>asm_cfi("def_cfa_offset %d", cfa): 192 | |.endif 193 | | mov State->inend, Rinend 194 | | pop r12 195 | cfa -= 8; 196 | |.if not WIN 197 | |=>asm_cfi("def_cfa_offset %d", cfa): 198 | |.endif 199 | | mov Rarg1, Rstate 200 | | pop rbp 201 | cfa -= 8; 202 | |.if not WIN 203 | |=>asm_cfi("def_cfa_offset %d", cfa): 204 | |.endif 205 | | mov State:Rarg1->input, Rinput 206 | | pop rbx 207 | cfa 
-= 8; 208 | |.if not WIN 209 | |=>asm_cfi("def_cfa_offset %d", cfa): 210 | |.endif 211 | |.if WIN 212 | | mov rdi, [rsp+16] 213 | | mov rsi, [rsp+8] 214 | |.endif 215 | | jmp ->pigz_available 216 | |.if not WIN 217 | |=>asm_cfi("restore_state"): 218 | |.endif 219 | 220 | { 221 | // Start of main decompression loop 222 | // NB: The loop entry point is ->fetch_compressed_main_loop 223 | 224 | |.align 16 225 | |->lit_not_lit: 226 | // Pre-conditions: KIND == 2 (i.e. either end-of-block or error) 227 | // Pre-conditions: ebx contains VAL 228 | | test ebx, ebx 229 | | jz ->fetch_next_block 230 | |->bad_bits: 231 | | mov byte State->status, PIGZ_STATUS_BAD_BITS ^ 0x80 232 | | jmp ->return_from_available 233 | for (i = 0; i < 2; ++i) { 234 | | nop 235 | } 236 | |->lit_not_length: 237 | // Pre-conditions: flags set by "cmp cl, 64" 238 | // Pre-conditions: 1 <= KIND <= 2 239 | // Pre-conditions: cl contains (KIND << 6) 240 | // Pre-conditions: ebx contains VAL 241 | | jnz ->lit_not_lit 242 | // Known: KIND == 1 (i.e. 
VAL is a literal value) 243 | | movzx eax, Wwritepos 244 | | add Rwritepos, 1 245 | | mov [Rstate + eax*1 + offsetof(pigz_state, window)], bl 246 | |.if CRC 247 | | xor bl, Bcrc 248 | | shr Ecrc, 8 249 | | xor Ecrc, [Rcrc_table + ebx*4] 250 | |.endif 251 | |->fetch_compressed_main_loop: 252 | // Pre-conditions: Usual pigz_available stack frame and register assignments 253 | | cmp Rwritepos, Rwritegoal 254 | | jae ->return_from_available 255 | | needbits 256 | // Known: enough bits are available for any literal/length code 257 | | mov ebx, Elitmask 258 | | and ebx, Ebits 259 | { 260 | |->load_lit_code: 261 | // Pre-conditions: ebx is an index into state->litcodes 262 | // Replaces ebx with the VAL from litcodes 263 | | mov eax, [Rstate + ebx*4 + offsetof(pigz_state, litcodes)] 264 | | xor ebx, ebx 265 | | movzx ecx, ah 266 | // Known: cl contains NBITS 267 | | shr Rbits, cl 268 | | sub Enbits, ecx 269 | | inc ebx 270 | | movzx ecx, al 271 | // Known: cl contains (KIND << 6) | NXBITS 272 | | shl ebx, cl // NB: Ignores the high two bits of cl 273 | | dec ebx 274 | | shr eax, 16 275 | | and ebx, Ebits 276 | | add ebx, eax 277 | // Known: ebx contains VAL 278 | | cmp cl, 192 279 | | jae ->load_lit_code 280 | } 281 | | sub Enbits, ecx 282 | | shr Rbits, cl // NB: Ignores the high two bits of cl 283 | | and Enbits, 63 284 | | cmp cl, 64 285 | | jae ->lit_not_length 286 | // Known: KIND == 0 (i.e. 
VAL is a length value) 287 | | needbits 288 | // Known: enough bits are available for any distance code 289 | | mov edx, Edistmask 290 | | and edx, Ebits 291 | { 292 | |->load_dist_code: 293 | // Pre-conditions: ebx contains a length value 294 | // Pre-conditions: edx is an index into state->distcodes 295 | // Replaces edx with the VAL from distcodes 296 | | mov eax, [Rstate + edx*4 + offsetof(pigz_state, distcodes)] 297 | | xor edx, edx 298 | | movzx ecx, ah 299 | // Known: cl contains NBITS 300 | | shr Rbits, cl 301 | | sub Enbits, ecx 302 | | inc edx 303 | | movzx ecx, al 304 | // Known: cl contains (KIND << 6) | NXBITS 305 | | shl edx, cl // NB: Ignores the high two bits of cl 306 | | dec edx 307 | | shr eax, 16 308 | | and edx, Ebits 309 | | add edx, eax 310 | // Known: edx contains VAL 311 | | cmp cl, 192 312 | | jae ->load_dist_code 313 | } 314 | | test cl, 192 315 | | .byte 0x2E; jnz ->bad_bits 316 | // Known: KIND == 0 (i.e. VAL is a distance value) 317 | // Replace edx with (writepos - edx) & (PIGZ_WINDOW_SIZE - 1) 318 | | neg rdx 319 | | sub Enbits, ecx 320 | | shr Rbits, cl 321 | | add rdx, Rwritepos 322 | | js ->bad_bits // Distance is greater than writepos 323 | | movzx edx, dx 324 | | movzx ecx, Wwritepos 325 | // Start of backref-copy loop 326 | // Invariants: ebx contains number of bytes remaining to copy 327 | // Invariants: ecx contains the window index of the next byte to write 328 | // Invariants: edx contains the window index of the next byte to read 329 | | add Rwritepos, rbx 330 | |.if CRC 331 | | movd xmm0, esi // Temporarily spill esi 332 | |.endif 333 | | test bl, 3 334 | | jz ->backref_copy4 335 | { 336 | |->backref_copy: // One-byte-at-a-time backref-copy loop (at most three iterations) 337 | | .byte 0x40; movzx eax, byte [Rstate + edx*1 + offsetof(pigz_state, window)] 338 | | inc dx 339 | | mov [Rstate + ecx*1 + offsetof(pigz_state, window)], al 340 | |.if CRC 341 | | xor al, Bcrc 342 | | shr Ecrc, 8 343 | |.endif 344 | | inc cx 345 | 
|.if CRC 346 | | xor Ecrc, [Rcrc_table + eax*4] 347 | |.endif 348 | | .byte 0x81, 0xEB, 0x01, 0x00, 0x00, 0x00 // sub ebx, dword 1 349 | | .byte 0x2E; jz ->fetch_compressed_main_loop 350 | | .byte 0x40; test ebx, 3 351 | | jnz ->backref_copy 352 | } 353 | { 354 | |->backref_copy4: // Four-bytes-at-a-time backref-copy loop (at most 64 iterations) 355 | // Pre-conditions: 0 < ebx <= 256, (ebx & 3) == 0 356 | // The input might overlap the output, so reads from the window and writes to 357 | // the window are still done one byte at a time, but the unrolling massively 358 | // helps the CRC calculation (and also reduces ebx manipulations). 359 | for (i = 3; i >= 0; --i) { 360 | | movzx eax, byte [Rstate + edx*1 + offsetof(pigz_state, window)] 361 | | inc dx 362 | | mov [Rstate + ecx*1 + offsetof(pigz_state, window)], al 363 | |.if CRC 364 | if (i) { 365 | | xor al, Bcrc 366 | | shr Ecrc, 8 367 | } else { 368 | | xor eax, Ecrc 369 | } 370 | |.endif 371 | | inc cx 372 | |.if CRC 373 | if (i == 3) { 374 | | mov esi, [Rcrc_table + eax*4 + 256*4*i] 375 | } else if (i) { 376 | | xor esi, [Rcrc_table + eax*4 + 256*4*i] 377 | } else { 378 | | xor esi, [Rcrc_table + eax*4] 379 | | mov Ecrc, esi 380 | } 381 | |.endif 382 | } 383 | | sub ebx, 4 384 | | jnz ->backref_copy4 385 | } 386 | |.if CRC 387 | | movd esi, xmm0 // Restore esi (it was spilled before the loop) 388 | |.endif 389 | // End of backref-copy loop 390 | | jmp ->fetch_compressed_main_loop 391 | // End of main decompression loop 392 | } 393 | { 394 | // Slow-path of the needbits macro 395 | // Can be "called" from numerous places within pigz_available, though uses the 396 | // same stack frame as pigz_available, and the "return address" is in rax. 
397 | |.align 16 398 | |->need_reader_bits: 399 | // Pre-conditions: Usual pigz_available stack frame and register assignments 400 | // Pre-conditions: rax contains the "return address" at the end a needbits macro 401 | // Pre-conditions: ecx, rather than Enbits, contains state->nbits 402 | // Pre-conditions: 0 <= ecx < 32 403 | // Pre-conditions: less than four input bytes remain 404 | // Clobbers: rcx, rdx, xmm0-15 405 | | sub Rinend, Rinput 406 | | add Einend, 4 407 | |4: 408 | // Pre-conditions: Einend contains the number of input bytes remaining 409 | // Pre-conditions: 0 <= Einend < 4 410 | // Read two bytes of input, if possible 411 | | test Binend, 2 412 | | jz >1 413 | | movzx edx, word [Rinput] 414 | | add Rinput, 2 415 | | shl rdx, cl 416 | | add ecx, 16 417 | | sub Einend, 2 418 | | or Rbits, rdx 419 | |1: 420 | // Pre-conditions: 0 <= Einend < 2 421 | // Read one byte of input, if possible 422 | | test Binend, Binend 423 | | jz >2 424 | | movzx edx, byte [Rinput] 425 | | shl rdx, cl 426 | | add ecx, 8 427 | | xor Einend, Einend 428 | | or Rbits, rdx 429 | |2: 430 | // Pre-conditions: Einend == 0 431 | | test cl, 32 432 | | jnz >3 433 | // Call the reader function to get more input 434 | | mov Stack->need_reader_bits_continuation, rax 435 | | mov State->bits, Rbits 436 | | mov State->nbits, cl 437 | | mov State->crc, Ecrc 438 | | mov Rarg1, State->opaque 439 | | mov Rarg2, State 440 | | call aword State->reader 441 | | mov Rinend, [State] 442 | | mov Rinput, rax 443 | | mov rax, Stack->need_reader_bits_continuation 444 | | mov Rbits, State->bits 445 | | movzx ecx, byte State->nbits 446 | | mov Ecrc, State->crc 447 | | lea Rcrc_table, [->pigz_crc_table] 448 | | test Rinend, Rinend 449 | | jz ->unexpected_eof 450 | | cmp Rinend, 4 451 | | jb <4 452 | // Known: At least four bytes are available at Rinput 453 | | sub rax, 16 // The instructions before rax will load 32 bits from Rinput into Rbits 454 | |3: 455 | | lea Rinend, [Rinput + Rinend - 4] 456 | | 
mov Enbits, ecx 457 | | jmp rax 458 | |->unexpected_eof: 459 | // Pre-conditions: reader function returned a zero-length chunk 460 | | cmp byte State->status, 12 461 | | mov byte State->status, PIGZ_STATUS_UNEXPECTED_EOF ^ 0x80 462 | | jb >1 463 | | or rcx, Rwritepos 464 | | jnz >1 465 | // If status was 12/13 ("expecting gzip header"), and writepos was zero, and 466 | // nbits was zero, then EOF was expected. Otherwise, it was unexpected. 467 | | mov byte State->status, PIGZ_STATUS_EOF ^ 0x80 468 | |1: 469 | | jmp ->return_from_available 470 | // End of need_reader_bits 471 | } 472 | 473 | |->fetch_next_block_bmi2: 474 | | add Rinend, 11 475 | |->fetch_next_block: 476 | // Pre-conditions: Usual pigz_available stack frame and register assignments 477 | // Pre-conditions: A deflate block has just ended, and either another block 478 | // or a gzip footer follows. 479 | | needbits 480 | | movzx ebx, byte State->status 481 | | test bl, 2 482 | | jnz ->gz_tail 483 | |->fetch_next_block_got_bits: 484 | // Pre-conditions: A deflate block is expected. 485 | // Pre-conditions: 32 <= Enbits < 64 486 | | mov ebx, Ebits 487 | | and ebx, 7 488 | | shr Rbits, 3 489 | | sub Enbits, 3 490 | | cmp bl, 6 491 | | jae ->bad_bits // Invalid block type 492 | | shl ebx, 1 493 | | and byte State->status, 1 494 | | or State->status, bl 495 | | cmp bl, 4 496 | | jb ->prepare_uncompressed_block 497 | | cmp bl, 8 498 | | jb ->prepare_static_huffman_block 499 | { 500 | // Known: BTYPE==10 (the next block is "compressed with dynamic Huffman codes") 501 | // Read the dynamic Huffman codes, and prepare for a compressed block... 
(all 502 | // the code between here and "->status_dispatch" is dedicated to this task) 503 | | mov ebx, Ebits 504 | | shr Rbits, 10 505 | | mov eax, ebx 506 | | shr ebx, 5 507 | | and ebx, 31 508 | | and eax, 31 509 | | cmp bl, 30 510 | | jae ->bad_bits // Too many distance codes 511 | | mov bh, al 512 | | cmp bh, 30 513 | | jae ->bad_bits // Too many literal codes 514 | | add ebx, 0x00010101 515 | // Known: ebx contains (num-literal-codes << 8) | (num-distance-codes) 516 | // (so bl is num-distance-codes, and bh is num-literal-codes minus 256) 517 | | mov r12d, Ebits 518 | | shr Rbits, 4 519 | | and r12d, 15 520 | // Known: r12d contains num-code-length-codes minus 4 521 | // Accumulate code lengths for the code length alphabet into r13 (treating r13 as 19x3 bits) 522 | | mov r13d, Ebits 523 | | shr Rbits, 9 524 | | and r13d, 0x1ff 525 | | mov eax, Ebits 526 | | shr Rbits, 3 527 | | shl r13, 48 528 | | and eax, 7 529 | | or r13, rax 530 | | sub Enbits, 26 531 | { 532 | | test r12b, r12b 533 | | jz ->got_ccodelengths 534 | | mov64 r15, 0xf1e2d3c4b5a6978 // The order in which code lengths for the code length alphabet are given (after the first four) 535 | |->next_ccodelength: 536 | // Invariant: r12d contains number of code lengths for the code length alphabet still to read 537 | // Invariant: (r15 & 15) gives the index of the next code length 538 | | needbits 539 | | mov ecx, r15d 540 | | mov eax, Ebits 541 | | shr Rbits, 3 542 | | and ecx, 15 543 | | and eax, 7 544 | | lea ecx, [ecx+ecx*2] 545 | | sub Enbits, 3 546 | | shl rax, cl 547 | | shr r15, 4 548 | | or r13, rax 549 | | sub r12d, 1 550 | | jnz ->next_ccodelength 551 | |->got_ccodelengths: 552 | } 553 | // Known: r13 contains code lengths for the code length alphabet 554 | { 555 | // Accumulate into r15 the number of code length alphabet entries per code 556 | // length (treating r15 as 8x8 bits), and also put this into xmm0. 
557 | | xor r15d, r15d 558 | | lea rcx, [r13*8] 559 | | pxor xmm1, xmm1 560 | |->next_npercodelength: 561 | | ror r15, cl // NB: Ignores the high two bits of cl 562 | | add r15, 1 563 | | rol r15, cl // NB: Ignores the high two bits of cl 564 | | shr rcx, 3 565 | | and rcx, -8 566 | | jnz ->next_npercodelength 567 | | movd xmm0, r15 568 | } 569 | // Known: xmm1 == 0 570 | // Known: ecx == 0 571 | { 572 | // Turn r15 from sum into cumulative sum, and check that the code length 573 | // alphabet has a valid Huffman table. 574 | | shr r15, 8 575 | | lea eax, [ecx+2] 576 | | lea r12d, [ecx+6] 577 | | movzx edx, r15b 578 | | sub eax, edx 579 | | jl ->bad_bits 580 | |->next_code_npercodelength: 581 | | ror r15, 8 582 | | movzx ecx, r15b 583 | | add eax, eax 584 | | add r15, rdx 585 | | add edx, ecx 586 | | sub eax, ecx 587 | | jl ->bad_bits // Code length alphabet over-subscribed. 588 | | sub r12d, 1 589 | | jnz ->next_code_npercodelength 590 | | punpcklbw xmm0, xmm1 // Turn xmm0 from 8x8 bits to 8x16 bits (zero extend each uint8_t to uint16_t) 591 | | test eax, eax 592 | | jnz ->bad_bits // Code length alphabet under-subscribed. 
593 | } 594 | { 595 | // Create opcodes for code length alphabet 596 | | mov eax, 0x01000000 597 | | lea r12, Stack->sorted_dist 598 | | movdqa Stack->nper_lit, xmm0 599 | { 600 | |->next_codetable: 601 | | sub al, 1 602 | |.macro codetable_entry 603 | | mov ecx, r13d 604 | | shr r13, 3 605 | | and ecx, 7 606 | | mov ah, cl 607 | | shl ecx, 3 608 | | ror r15, cl 609 | | movzx edx, r15b 610 | | add r15, 1 611 | | mov [r12 + edx*4], eax 612 | | rol r15, cl 613 | |.endmacro 614 | | codetable_entry 615 | | cmp al, 240 616 | | jnz ->next_codetable 617 | } 618 | | mov eax, 0x03020000 619 | | codetable_entry 620 | | movd xmm0, ebx // Spill ebx 621 | | mov eax, 0x030300ff 622 | | codetable_entry 623 | | movd xmm1, Ecrc // Spill Ecrc 624 | | mov eax, 0x0b0700ff 625 | | codetable_entry 626 | | shr r15, 56 627 | | lea r13, Stack->nper_lit 628 | | mov [r12 + r15*4 + 1], al 629 | } 630 | { 631 | // Create 128-entry lookup table for code length alphabet 632 | | mov ecx, 7 633 | | add Rstate, offsetof(pigz_state, litcodes) 634 | | call ->make_tables 635 | | sub Rstate, offsetof(pigz_state, litcodes) 636 | | movd ebx, xmm0 // Restore ebx 637 | } 638 | { 639 | // Set r15d to the number of code lengths to read 640 | | movzx r15d, bl 641 | | mov eax, ebx 642 | | shr eax, 8 643 | | movd Ecrc, xmm1 // Restore Ecrc 644 | | add r15d, eax 645 | } 646 | { 647 | // Make sure that the first code length code does not rely on the current code length 648 | | mov eax, Ebits 649 | | and eax, 127 650 | | lea Rcrc_table, [->pigz_crc_table] 651 | | test byte [Rstate + eax*4 + offsetof(pigz_state, litcodes)], 128 652 | | jz ->bad_bits 653 | } 654 | { 655 | // Use the code length alphabet lookup table to decode the array of code lengths. 
656 | // Invariant: r12d = current code length (initially undefined) 657 | // Invariant: r13d = number of code lengths which have been decoded 658 | // Invariant: 0 <= r13d <= r15d 659 | | xor r13d, r13d 660 | |->next_codelength: 661 | | needbits 662 | | mov eax, Ebits 663 | | and eax, 127 664 | | mov ecx, [Rstate + eax*4 + offsetof(pigz_state, litcodes)] 665 | | movsx eax, cl 666 | | shr ecx, 8 667 | | mov edx, eax 668 | | sar eax, 6 669 | | or r12d, eax 670 | | shr Rbits, cl 671 | | sub r12d, edx 672 | | xor edx, edx 673 | | sub Enbits, ecx 674 | | shr ecx, 8 675 | | shrd edx, Ebits, cl 676 | | shr Rbits, cl 677 | | rol edx, cl 678 | | sub Enbits, ecx 679 | | add edx, r13d 680 | | shr ecx, 8 681 | | add edx, ecx 682 | | and Enbits, 63 683 | | cmp edx, r15d 684 | | ja ->bad_bits // More code lengths specified than expected. 685 | { 686 | |->store_next_codelength: 687 | | mov [Rstate + r13d + offsetof(AltState, codelengths)], r12b 688 | | add r13d, 1 689 | | cmp r13d, edx 690 | | jnz ->store_next_codelength 691 | } 692 | | cmp edx, r15d 693 | | jnz ->next_codelength 694 | } 695 | { 696 | // Count the number of literal codes per length, and the number of distance 697 | // codes per length. 
698 | // Reminder: ebx contains (num-literal-codes << 8) | (num-distance-codes) 699 | | xorps xmm0, xmm0 700 | | movaps [rsp + offsetof(Stack, nper_lit)], xmm0 701 | | mov ecx, ebx 702 | | movaps [rsp + offsetof(Stack, nper_lit) + 16], xmm0 703 | | xor eax, eax 704 | | cmp al, AltState->codelengths[256] 705 | | jz ->bad_bits // End-of-block is not in the literal alphabet 706 | | shr ecx, 8 707 | | movaps [rsp + offsetof(Stack, nper_dist)], xmm0 708 | | lea rdx, [Rstate + rcx + offsetof(AltState, codelengths)] 709 | | movaps [rsp + offsetof(Stack, nper_dist) + 16], xmm0 710 | { 711 | |->next_both_nper: 712 | | movzx r12d, byte [Rstate + rax + offsetof(AltState, codelengths)] 713 | | movzx r13d, byte [rdx + rax] 714 | | add eax, 1 715 | | add word [rsp + r12*2 + offsetof(Stack, nper_lit)], 1 716 | | add byte [rsp + r13*2 + offsetof(Stack, nper_dist)], 1 717 | | cmp al, bl 718 | | jnz ->next_both_nper 719 | } 720 | { 721 | |->next_one_nper: 722 | | movzx r12d, byte [Rstate + rax + offsetof(AltState, codelengths)] 723 | | add eax, 1 724 | | add word [rsp + r12*2 + offsetof(Stack, nper_lit)], 1 725 | | cmp eax, ecx 726 | | jnz ->next_one_nper 727 | } 728 | } 729 | { 730 | // Use counts in nper_lit to create cumulative sums in offset_lit, and 731 | // check that the literal alphabet has a valid Huffman table, and set 732 | // edx to the longest literal code length. 733 | | mov eax, 1 734 | | mov ecx, eax 735 | | xor edx, edx 736 | | xor r12d, r12d 737 | { 738 | |->next_offset_lit: 739 | | movzx r13d, word [rsp + rax*2 + offsetof(Stack, nper_lit)] 740 | | mov [Rstate + rax*2 + offsetof(AltState, offset_lit)], r12w 741 | | add ecx, ecx 742 | | test r13d, r13d 743 | | cmovnz edx, eax 744 | | add r12d, r13d 745 | | add eax, 1 746 | | sub ecx, r13d 747 | | jl ->bad_bits // Literal alphabet over-subscribed. 
748 | | cmp eax, 16 749 | | jnz ->next_offset_lit 750 | } 751 | | test ecx, ecx 752 | | jz >1 753 | | cmp edx, 1 754 | | jnz ->bad_bits // Literal alphabet under-subscribed. 755 | |1: 756 | } 757 | { 758 | // Use counts in nper_dist to create cumulative sums in offset_dist, and 759 | // check that the distance alphabet has a valid Huffman table, and set 760 | // r15d to the longest distance code length. 761 | | mov eax, 1 762 | | mov ecx, eax 763 | | xor r15d, r15d 764 | | xor r12d, r12d 765 | { 766 | |->next_offset_dist: 767 | | movzx r13d, byte [rsp + rax*2 + offsetof(Stack, nper_dist)] 768 | | mov [Rstate + rax + offsetof(AltState, offset_dist)], r12b 769 | | add ecx, ecx 770 | | test r13d, r13d 771 | | cmovnz r15d, eax 772 | | add r12d, r13d 773 | | add eax, 1 774 | | sub ecx, r13d 775 | | jl ->bad_bits // Distance alphabet over-subscribed. 776 | | cmp eax, 16 777 | | jnz ->next_offset_dist 778 | } 779 | | test ecx, ecx 780 | | jz >1 781 | | cmp r15d, 1 782 | | jnz ->bad_bits // Distance alphabet under-subscribed. 
783 | |1: 784 | } 785 | { 786 | // Set state->litbits and state->distbits 787 | | mov eax, 9 788 | | lea ecx, [eax-3] 789 | | cmp eax, edx 790 | | cmova eax, edx 791 | | cmp ecx, r15d 792 | | cmova ecx, r15d 793 | | mov State->litbits, al 794 | | mov State->distbits, cl 795 | } 796 | { 797 | // Create opcodes for the literal part of the literal alphabet 798 | | xor eax, eax 799 | |->next_sorted_lit: 800 | | movzx r12d, byte [Rstate + rax + offsetof(AltState, codelengths)] 801 | | test r12b, r12b 802 | | jz >1 803 | | mov edx, r12d 804 | | mov dh, al 805 | | shl edx, 8 806 | | movzx ecx, word [Rstate + r12*2 + offsetof(AltState, offset_lit)] 807 | | or edx, 64 808 | | mov [Rstate + rcx*4 + offsetof(AltState, sorted_lit)], edx 809 | | add ecx, 1 810 | | mov [Rstate + r12*2 + offsetof(AltState, offset_lit)], cx 811 | |1: 812 | | add al, 1 813 | | jnz ->next_sorted_lit 814 | } 815 | { 816 | // Create opcodes for the non-literal part of the literal alphabet 817 | |->next_sorted_lit_op: 818 | | movzx ecx, byte [Rstate + rax + offsetof(AltState, codelengths) + 256] 819 | | test cl, cl 820 | | jz >1 821 | | mov edx, [Rcrc_table + rax*4 + OFFSET_lit_ops] 822 | | movzx r12d, word [Rstate + rcx*2 + offsetof(AltState, offset_lit)] 823 | | mov dh, cl 824 | | mov [Rstate + r12*4 + offsetof(AltState, sorted_lit)], edx 825 | | add r12d, 1 826 | | mov [Rstate + rcx*2 + offsetof(AltState, offset_lit)], r12w 827 | |1: 828 | | add al, 1 829 | | cmp al, bh 830 | | jnz ->next_sorted_lit_op 831 | } 832 | { 833 | // Create opcodes for the distance alphabet 834 | | xor r13d, r13d 835 | | lea rax, [Rstate + rax + offsetof(AltState, codelengths) + 256] 836 | |->next_dist_op: 837 | | movzx ecx, byte [r13 + rax] 838 | | test cl, cl 839 | | jz >1 840 | | mov edx, [Rcrc_table + r13*4 + OFFSET_dist_ops] 841 | | movzx r12d, byte [Rstate + rcx + offsetof(AltState, offset_dist)] 842 | | mov dh, cl 843 | | mov [rsp + r12*4 + offsetof(Stack, sorted_dist)], edx 844 | | add r12d, 1 845 | | mov [Rstate + 
rcx + offsetof(AltState, offset_dist)], r12b 846 | |1: 847 | | add r13b, 1 848 | | cmp r13b, bl 849 | | jnz ->next_dist_op 850 | } 851 | { 852 | // Put sentinel entries at the end of the opcode arrays 853 | | movzx ecx, word AltState->offset_lit[15] 854 | | xor eax, eax 855 | | movzx edx, byte AltState->offset_dist[15] 856 | | not eax 857 | | mov [rbp + rcx*4 + offsetof(AltState, sorted_lit)], eax 858 | | mov [rsp + rdx*4 + offsetof(Stack, sorted_dist)], eax 859 | } 860 | { 861 | // Create the lookup tables for the literal alphabet and the distance alphabet 862 | | mov eax, 128 + (1 << 16) 863 | | mov State->litcodes[0], eax 864 | | movd xmm0, Ecrc // Spill Ecrc 865 | | mov State->litcodes[1], eax 866 | | mov State->distcodes[0], eax 867 | | movd xmm2, Rinend // Spill Rinend 868 | | mov State->distcodes[1], eax 869 | | movd xmm3, Rinput // Spill Rinput 870 | | lea r12, AltState->sorted_lit 871 | | movzx ecx, byte State->litbits 872 | | add Rstate, offsetof(pigz_state, litcodes) 873 | | test cl, cl 874 | | jz >1 875 | | lea r13, Stack->nper_lit 876 | | call ->make_tables 877 | |1: 878 | | lea r12, Stack->sorted_dist 879 | | movzx ecx, byte [Rstate + offsetof(pigz_state, distbits) - offsetof(pigz_state, litcodes)] 880 | | sub Rstate, offsetof(pigz_state, litcodes) - offsetof(pigz_state, distcodes) 881 | | test cl, cl 882 | | jz >1 883 | | lea r13, Stack->nper_dist 884 | | call ->make_tables 885 | |1: 886 | | movd Rinend, xmm2 // Restore Rinend 887 | | movd Rinput, xmm3 // Restore Rinput 888 | |->make_tables_tidyup_state_dispatch: 889 | | sub Rstate, offsetof(pigz_state, distcodes) 890 | | movd Ecrc, xmm0 // Restore Ecrc 891 | |->load_Rcrc_table_status_dispatch: 892 | | lea Rcrc_table, [->pigz_crc_table] 893 | } 894 | } 895 | |->status_dispatch: 896 | | movzx eax, byte State->status 897 | | mov Rwritegoal, State->readpos 898 | | add Rwritegoal, PIGZ_READ_SIZE 899 | | cmp eax, 4 900 | | jl ->fetch_uncompressed 901 | | cmp eax, 12 902 | | jge ->gz_head 903 | | movzx 
ecx, word State->litbits 904 | | test eax, 1 905 | | jz >1 906 | | sub Rinend, 11 907 | | mov Elitmask, ecx 908 | | cmp Rinend, Rinput 909 | | jb ->fetch_compressed_main_loop_bmi2 910 | | mov r13, [Rinput] 911 | | jmp ->fetch_compressed_main_loop_bmi2 912 | |1: 913 | | movzx Elitmask, byte [Rcrc_table + 114] // 1 914 | | mov Edistmask, Elitmask 915 | | shl Elitmask, cl 916 | | shr ecx, 8 917 | | shl Edistmask, cl 918 | | sub Elitmask, 1 919 | | sub Edistmask, 1 920 | | jmp ->fetch_compressed_main_loop 921 | |->gz_head: 922 | | cmp Rwritepos, State->readpos 923 | | jnz ->return_from_available 924 | { 925 | | xor Ewritepos, Ewritepos 926 | | mov State->readpos, Rwritepos 927 | | needbits 928 | | xor Ecrc, Ecrc 929 | | mov State->readpos, Rwritepos 930 | | mov eax, Ebits 931 | | not Ecrc 932 | | and eax, 0xe0ffffff 933 | | shld ebx, Ebits, 8 934 | | cmp eax, 0x00088b1f 935 | | jnz ->bad_gz_magic 936 | { 937 | | mov r12d, 10 938 | |->read_more_header_bytes: 939 | | needbits 940 | | movzx eax, Wwritepos 941 | | movzx ecx, Bbits 942 | | add Rwritepos, 1 943 | | shr Rbits, 8 944 | | xor cl, Bcrc 945 | | shr Ecrc, 8 946 | | xor Ecrc, [Rcrc_table + rcx*4] 947 | | sub Enbits, 8 948 | | cmp Ewritepos, r12d 949 | | jnz ->read_more_header_bytes 950 | | test bl, 4 951 | | jz >1 952 | | xor bl, 4 953 | | movzx eax, Wbits 954 | | lea r12d, [r12d+eax+2] 955 | | jmp ->read_more_header_bytes 956 | |1: 957 | } 958 | { 959 | | test bl, 24 960 | | jz >1 961 | |->read_next_strz: 962 | | mov eax, 8 963 | | test bl, al 964 | | setz cl 965 | | shl eax, cl 966 | | xor bl, al 967 | { 968 | |->read_more_strz: 969 | | needbits 970 | | movzx ecx, Bbits 971 | | mov eax, ecx 972 | | shr Rbits, 8 973 | | xor cl, Bcrc 974 | | shr Ecrc, 8 975 | | sub Enbits, 8 976 | | xor Ecrc, [Rcrc_table + rcx*4] 977 | | test al, al 978 | | jnz ->read_more_strz 979 | } 980 | | test bl, 24 981 | | jnz ->read_next_strz 982 | |1: 983 | } 984 | | test bl, 2 985 | | jz >1 986 | | needbits 987 | | not Ecrc 988 | | cmp 
Wcrc, Wbits 989 | | jnz ->bad_crc 990 | | shr Rbits, 16 991 | | sub Enbits, 16 992 | |1: 993 | | needbits 994 | | xor Ecrc, Ecrc 995 | | xor Ewritepos, Ewritepos 996 | | not Ecrc 997 | | jmp ->fetch_next_block_got_bits 998 | } 999 | | 1000 | |->bad_crc: 1001 | | mov byte State->status, PIGZ_STATUS_BAD_CRC ^ 0x80 1002 | |1: 1003 | | jmp ->return_from_available 1004 | |->bad_gz_magic: 1005 | | mov byte State->status, PIGZ_STATUS_BAD_HEADER ^ 0x80 1006 | | jmp <1 1007 | | 1008 | |->gz_tail: 1009 | | mov ecx, Enbits 1010 | | and ecx, 7 1011 | | shr Rbits, cl 1012 | | sub Enbits, ecx 1013 | | not Ecrc 1014 | | needbits 1015 | |.if CRC 1016 | | xor Ecrc, Ebits 1017 | | jnz ->bad_crc 1018 | |.endif 1019 | | shr Rbits, 32 1020 | | sub Enbits, 32 1021 | | needbits 1022 | | cmp Ewritepos, Ebits 1023 | | jnz ->bad_bits 1024 | | shr Rbits, 32 1025 | | sub Enbits, 32 1026 | | or byte State->status, 12 1027 | | jmp ->status_dispatch 1028 | 1029 | |->fetch_uncompressed: 1030 | | movzx ebx, word State->litlen 1031 | | sub Rwritegoal, Rwritepos 1032 | | jbe ->return_from_available 1033 | | test ebx, ebx 1034 | | jz ->fetch_next_block 1035 | | cmp ebx, Ewritegoal 1036 | | cmova ebx, Ewritegoal 1037 | | sub State->litlen, bx 1038 | { 1039 | |->fetch_uncompressed_main_loop: 1040 | | needbits 1041 | | movzx eax, Bbits 1042 | | movzx ecx, Wwritepos 1043 | | shr Rbits, 8 1044 | | inc Rwritepos 1045 | | sub Enbits, 8 1046 | | mov [Rstate + ecx + offsetof(pigz_state, window)], al 1047 | |.if CRC 1048 | | xor al, Bcrc 1049 | | shr Ecrc, 8 1050 | | xor Ecrc, [Rcrc_table + eax*4] 1051 | |.endif 1052 | | sub ebx, 1 1053 | | jnz ->fetch_uncompressed_main_loop 1054 | } 1055 | | cmp State->litlen, bx 1056 | | jz ->fetch_next_block 1057 | | jmp ->return_from_available 1058 | 1059 | |->prepare_uncompressed_block: 1060 | | mov ecx, Enbits 1061 | | and ecx, 7 1062 | | shr Rbits, cl 1063 | | sub Enbits, ecx 1064 | | movzx ebx, Wbits 1065 | | shr Rbits, 16 1066 | | sub Enbits, 16 1067 | | mov 
State->litlen, bx 1068 | | not ebx 1069 | | needbits 1070 | | cmp bx, Wbits 1071 | | jnz ->bad_bits 1072 | | shr Rbits, 16 1073 | | sub Enbits, 16 1074 | | jmp ->status_dispatch 1075 | 1076 | |->prepare_static_huffman_block: 1077 | | xor ecx, ecx 1078 | | mov dword Stack->nper_lit[8], 152 + (112 << 16) 1079 | | movd xmm0, Ecrc // Spill Ecrc 1080 | { 1081 | |1: 1082 | | mov eax, [Rcrc_table + ecx*4 + OFFSET_lit_ops] 1083 | | mov [Rstate + ecx*4 + offsetof(AltState, sorted_lit)], eax 1084 | | inc ecx 1085 | | cmp ecx, 24 1086 | | jnz <1 1087 | } 1088 | | mov dword Stack->nper_lit[6], 32 + (24 << 16) 1089 | { 1090 | |1: 1091 | | mov eax, [Rcrc_table + ecx*4 + OFFSET_lit_ops] 1092 | | mov [Rstate + ecx*4 + 144*4 + offsetof(AltState, sorted_lit)], eax 1093 | | inc ecx 1094 | | cmp ecx, 32 1095 | | jnz <1 1096 | } 1097 | | xor ecx, ecx 1098 | | lea r13, Stack->nper_lit 1099 | | mov byte State->litbits, 9 1100 | { 1101 | |1: 1102 | | lea eax, [ecx + (64 + (8 << 8)) << 16] 1103 | | rol eax, 16 1104 | | mov [Rstate + ecx*4 + 24*4 + offsetof(AltState, sorted_lit)], eax 1105 | | inc ecx 1106 | | cmp ecx, 144 1107 | | jnz <1 1108 | } 1109 | | lea r12, AltState->sorted_lit 1110 | | mov byte State->distbits, 5 1111 | { 1112 | |1: 1113 | | lea eax, [ecx + (64 + (9 << 8)) << 16] 1114 | | rol eax, 16 1115 | | mov [Rstate + ecx*4 + 32*4 + offsetof(AltState, sorted_lit)], eax 1116 | | add cl, 1 1117 | | jnz <1 1118 | } 1119 | | dec ecx 1120 | | mov AltState->sorted_lit[288], ecx 1121 | | add ecx, 10 1122 | | add Rstate, offsetof(pigz_state, litcodes) 1123 | | call ->make_tables 1124 | | mov cl, 5 1125 | | add r13, 2 1126 | | lea r12, [->dist_ops] 1127 | | sub Rstate, offsetof(pigz_state, litcodes) - offsetof(pigz_state, distcodes) 1128 | | call ->make_tables 1129 | | jmp ->make_tables_tidyup_state_dispatch 1130 | { 1131 | // Start of main decompression loop, when CPU has support for BMI2 1132 | // NB: The loop entry point is ->fetch_compressed_main_loop_bmi2 1133 | 1134 | |.align 16 
1135 | |->need_reader_bits_bmi2: 1136 | | cmp Enbits, 48 1137 | | jge ->got_reader_bits_bmi2 1138 | | lea rcx, [Rinend + 15] 1139 | | cmp Rinput, rcx 1140 | | jz >1 1141 | |2: 1142 | | movzx edx, byte [Rinput] 1143 | | add Rinput, 1 1144 | | shlx rdx, rdx, Rnbits 1145 | | add Enbits, 8 1146 | | or Rbits, rdx 1147 | | cmp Enbits, 48 1148 | | jge ->got_reader_bits_bmi2 1149 | | cmp Rinput, rcx 1150 | | jnz <2 1151 | |1: 1152 | // Call the reader function to get more input 1153 | | mov State->bits, Rbits 1154 | | mov State->nbits, Bnbits 1155 | | mov State->crc, Ecrc 1156 | | mov Rarg1, State->opaque 1157 | | mov Rarg2, State 1158 | | call aword State->reader 1159 | | mov Rinend, [State] 1160 | | mov Rinput, rax 1161 | | mov Rbits, State->bits 1162 | | movzx Enbits, byte State->nbits 1163 | | mov Ecrc, State->crc 1164 | | lea Rcrc_table, [->pigz_crc_table] 1165 | | cmp Rinend, 15 1166 | | lea Rinend, [Rinput + Rinend - 15] 1167 | | jb >3 1168 | | mov r13, [Rinput] 1169 | | jmp ->got_reader_bytes_bmi2 1170 | |3: 1171 | | lea rcx, [Rinend + 15] 1172 | | cmp Rinput, rcx 1173 | | jnz <2 1174 | | mov byte State->status, PIGZ_STATUS_UNEXPECTED_EOF ^ 0x80 1175 | | jmp ->return_from_available_bmi2 1176 | 1177 | |.align 16 1178 | |->lit_not_lit_bmi2: 1179 | // Pre-conditions: KIND == 2 (i.e. either end-of-block or error) 1180 | // Pre-conditions: ebx contains VAL 1181 | | test ebx, ebx 1182 | | jz ->fetch_next_block_bmi2 1183 | |->bad_bits_bmi2: 1184 | | mov byte State->status, PIGZ_STATUS_BAD_BITS ^ 0x80 1185 | |->return_from_available_bmi2: 1186 | | add Rinend, 11 1187 | | jmp ->return_from_available 1188 | |->lit_not_length_bmi2: 1189 | // Pre-conditions: flags set by "cmp al, 64" 1190 | // Pre-conditions: 1 <= KIND <= 2 1191 | // Pre-conditions: al contains (KIND << 6) 1192 | // Pre-conditions: ebx contains VAL 1193 | | .byte 0x2E; jnz ->lit_not_lit_bmi2 1194 | // Known: KIND == 1 (i.e. 
VAL is a literal value) 1195 | | movzx eax, Wwritepos 1196 | | add Rwritepos, 1 1197 | | .byte 0x40; mov [Rstate + eax*1 + offsetof(pigz_state, window)], bl 1198 | |.if CRC 1199 | | xor bl, Bcrc 1200 | | .byte 0x40; shr Ecrc, 8 1201 | | xor Ecrc, [Rcrc_table + ebx*4] 1202 | |.endif 1203 | |->fetch_compressed_main_loop_bmi2: 1204 | // Pre-conditions: Usual pigz_available stack frame and register assignments 1205 | | cmp Rwritepos, Rwritegoal 1206 | | jae ->return_from_available_bmi2 1207 | | cmp Rinend, Rinput 1208 | | jb ->need_reader_bits_bmi2 1209 | |->got_reader_bytes_bmi2: 1210 | | shlx rax, r13, Rnbits 1211 | | or Rbits, rax 1212 | | lea eax, [Enbits - 63] 1213 | | or Enbits, 56 1214 | | neg eax 1215 | | shr eax, 3 1216 | | add Rinput, rax 1217 | | mov r13, [Rinput] 1218 | |->got_reader_bits_bmi2: 1219 | // Known: enough bits are available for any literal/length code and then any distance code 1220 | | bzhi ebx, Ebits, Elitmask 1221 | { 1222 | |->load_lit_code_bmi2: 1223 | // Pre-conditions: ebx is an index into state->litcodes 1224 | // Replaces ebx with the VAL from litcodes 1225 | | mov eax, [Rstate + ebx*4 + offsetof(pigz_state, litcodes)] 1226 | // Known: al contains (KIND << 6) | NXBITS 1227 | | movzx ecx, ah 1228 | // Known: ecx contains NBITS 1229 | | mov ebx, eax 1230 | | and ebx, 63 1231 | // Known: ebx contains NXBITS 1232 | | shrx Rbits, Rbits, rcx 1233 | | sub Enbits, ecx 1234 | | rorx rcx, rax, 16 1235 | | bzhi ebx, Ebits, ebx 1236 | | add ebx, ecx 1237 | // Known: ebx contains VAL 1238 | | cmp al, 192 1239 | | jae ->load_lit_code_bmi2 1240 | } 1241 | | sub Enbits, eax 1242 | | shrx Rbits, Rbits, rax // NB: Ignores the high two bits of al 1243 | | and Enbits, 63 1244 | | cmp al, 64 1245 | | jae ->lit_not_length_bmi2 1246 | // Known: KIND == 0 (i.e. 
VAL is a length value) 1247 | | rorx edx, Elitmask, 8 1248 | | bzhi edx, Ebits, edx 1249 | { 1250 | |->load_dist_code_bmi2: 1251 | // Pre-conditions: ebx contains a length value 1252 | // Pre-conditions: edx is an index into state->distcodes 1253 | // Replaces edx with the VAL from distcodes 1254 | | mov eax, [Rstate + edx*4 + offsetof(pigz_state, distcodes)] 1255 | // Known: al contains (KIND << 6) | NXBITS 1256 | | movzx ecx, ah 1257 | // Known: ecx contains NBITS 1258 | | mov edx, eax 1259 | | and edx, 63 1260 | // Known: edx contains NXBITS 1261 | | shrx Rbits, Rbits, rcx 1262 | | sub Enbits, ecx 1263 | | rorx rcx, rax, 16 1264 | | bzhi edx, Ebits, edx 1265 | | add edx, ecx 1266 | // Known: edx contains VAL 1267 | | cmp al, 192 1268 | | jae ->load_dist_code_bmi2 1269 | } 1270 | | test al, 192 1271 | |9: 1272 | | .byte 0x2E; jnz ->bad_bits_bmi2 1273 | // Known: KIND == 0 (i.e. VAL is a distance value) 1274 | // Replace edx with (writepos - edx) & (PIGZ_WINDOW_SIZE - 1) 1275 | | neg rdx 1276 | | sub Bnbits, al 1277 | | shrx Rbits, Rbits, rax 1278 | | add rdx, Rwritepos 1279 | | .byte 0x2E; js <9 // Distance is greater than writepos 1280 | | movzx edx, dx 1281 | | movzx ecx, Wwritepos 1282 | // Start of backref-copy loop 1283 | // Invariants: ebx contains number of bytes remaining to copy 1284 | // Invariants: ecx contains the window index of the next byte to write 1285 | // Invariants: edx contains the window index of the next byte to read 1286 | | add Rwritepos, rbx 1287 | |.if CRC 1288 | | movd xmm0, esi // Temporarily spill esi 1289 | |.endif 1290 | | test bl, 3 1291 | | jz ->backref_copy4_bmi2 1292 | { 1293 | |->backref_copy_bmi2: // One-byte-at-a-time backref-copy loop (at most three iterations) 1294 | | .byte 0x40; movzx eax, byte [Rstate + edx*1 + offsetof(pigz_state, window)] 1295 | | inc dx 1296 | | mov [Rstate + ecx*1 + offsetof(pigz_state, window)], al 1297 | |.if CRC 1298 | | xor al, Bcrc 1299 | | shr Ecrc, 8 1300 | |.endif 1301 | | inc cx 1302 | |.if 
CRC 1303 | | xor Ecrc, [Rcrc_table + eax*4] 1304 | |.endif 1305 | | .byte 0x81, 0xEB, 0x01, 0x00, 0x00, 0x00 // sub ebx, dword 1 1306 | | .byte 0x2E; jz ->fetch_compressed_main_loop_bmi2 1307 | | .byte 0x40; test ebx, 3 1308 | | jnz ->backref_copy_bmi2 1309 | } 1310 | { 1311 | |->backref_copy4_bmi2: // Four-bytes-at-a-time backref-copy loop (at most 64 iterations) 1312 | // Pre-conditions: 0 < ebx <= 256, (ebx & 3) == 0 1313 | // The input might overlap the output, so reads from the window and writes to 1314 | // the window are still done one byte at a time, but the unrolling massively 1315 | // helps the CRC calculation (and also reduces ebx manipulations). 1316 | for (i = 3; i >= 0; --i) { 1317 | | movzx eax, byte [Rstate + edx*1 + offsetof(pigz_state, window)] 1318 | | inc dx 1319 | | mov [Rstate + ecx*1 + offsetof(pigz_state, window)], al 1320 | |.if CRC 1321 | if (i) { 1322 | | xor al, Bcrc 1323 | | shr Ecrc, 8 1324 | } else { 1325 | | xor eax, Ecrc 1326 | } 1327 | |.endif 1328 | | inc cx 1329 | |.if CRC 1330 | if (i == 3) { 1331 | | mov esi, [Rcrc_table + eax*4 + 256*4*i] 1332 | } else if (i) { 1333 | | xor esi, [Rcrc_table + eax*4 + 256*4*i] 1334 | } else { 1335 | | xor esi, [Rcrc_table + eax*4] 1336 | | mov Ecrc, esi 1337 | } 1338 | |.endif 1339 | } 1340 | | sub ebx, 4 1341 | | jnz ->backref_copy4_bmi2 1342 | } 1343 | |.if CRC 1344 | | movd esi, xmm0 // Restore esi (it was spilled before the loop) 1345 | |.endif 1346 | // End of backref-copy loop 1347 | | jmp ->fetch_compressed_main_loop_bmi2 1348 | // End of main decompression loop 1349 | } 1350 | |=>asm_cfi_endproc(): 1351 | 1352 | { 1353 | |.align 16 1354 | |->make_tables: // Builds a Huffman decode table: a root table of (1 << rootbits) entries at rbp, followed by second-level tables for codes longer than rootbits. 1355 | |.if not WIN 1356 | |=>asm_cfi_startproc(): 1357 | |.endif 1358 | | // input: rbp=table, r12=sorted, ecx=rootbits, r13=nper (sorted: 32-bit opcodes in increasing code-length order, code length in byte 1, ended by an entry with a negative length byte; nper: 16-bit count of codes per length, decremented here as codes are placed) 1359 | | // phase 1 clobbers: r15, r11, rdx, rdi, rbx, rax, r12 1360 | | // phase 2 clobbers: r9, r10 -- Ehuff (r15d, defined below) is the next code to assign, kept bit-reversed so that it indexes the table directly; Rnext_table (r11, defined below) is the end of the table being filled, which is also where the next second-level table gets allocated. 1361 | |.define Ehuff, r15d 1362 | |.define Rnext_table, r11 1363 | |.define Enext_table, r11d 1364 | |
xor Enext_table, Enext_table 1365 | | bts Enext_table, ecx 1366 | | movzx edx, byte [r12+1] 1367 | | lea Rnext_table, [rbp + Enext_table*4] 1368 | | xor Ehuff, Ehuff 1369 | { /* Setup above: Rnext_table = rbp + 4 * (1 << rootbits), the end of the root table; edx = code length of the first opcode; Ehuff = 0. Phase 1 (this block): place every opcode whose code length is at most rootbits into the root table. */ 1370 | |->next_huff: 1371 | | xor edi, edi 1372 | | bts edi, edx 1373 | | lea rbx, [rbp + Ehuff*4] 1374 | | mov eax, [r12] 1375 | | add r12, 4 1376 | { /* Replicate opcode eax at every (1 << edx)-th entry, from index Ehuff up to the end of the root table. */ 1377 | |->next_pos: 1378 | | mov [rbx], eax 1379 | | lea rbx, [rbx + edi*4] 1380 | | cmp rbx, Rnext_table 1381 | | jb ->next_pos 1382 | } /* The eight instructions up to add Ehuff, eax step Ehuff to the next code in bit-reversed order: the highest clear bit of Ehuff below bit edx is set and every bit above it is cleared. Then the count for this length is decremented; once it reaches zero the length of the next opcode is loaded into edx. */ 1383 | | dec edi 1384 | | xor edi, Ehuff 1385 | | xor eax, eax 1386 | | bsr edi, edi 1387 | | bts eax, edi 1388 | | lea edi, [eax-1] 1389 | | and Ehuff, edi 1390 | | add Ehuff, eax 1391 | | sub word [r13 + edx*2], 1 1392 | | jnz ->next_huff 1393 | | movzx edx, byte [r12+1] 1394 | | cmp dl, cl 1395 | | jbe ->next_huff 1396 | } /* Reached when the next length byte is above rootbits (unsigned compare). js tests the sign of (dl - cl), so a length byte such as the 0xff of an all-ones sentinel entry returns here; otherwise phase 2 starts. */ 1397 | | js ->make_tables_ret 1398 | | xor r10d, r10d 1399 | | shrd r10d, Ehuff, cl 1400 | { /* Phase 2: one pass of next_level2 per second-level table. r10d carries the low rootbits bits of Ehuff in its top bits; it is rotated down to form the root index and rotated back afterwards. */ 1401 | |->next_level2: 1402 | | xor eax, eax 1403 | | bts eax, edx 1404 | | mov edi, edx 1405 | | shr eax, cl 1406 | { /* Size the sub-table: eax starts as 1 << (edx - rootbits) free slots; each round subtracts the remaining count for length edi and, while slots are left over, doubles eax and moves on to the next length. */ 1407 | |->next_left: 1408 | | sub ax, [r13 + rdi*2] 1409 | | jle >1 1410 | | add eax, eax 1411 | | add edi, 1 1412 | | jmp ->next_left 1413 | |1: 1414 | } /* edi - rootbits is the index width of the sub-table. The sub-table is carved out at the old Rnext_table (kept in r9) and a link entry is written to root slot r10: byte 0 = 128+64 combined with the width (r7b is the low byte of rdi), byte 1 = rootbits, upper 16 bits = offset of the sub-table from rbp, counted in 4-byte entries. */ 1415 | | sub edi, ecx 1416 | | rol r10d, cl 1417 | | xor eax, eax 1418 | | mov r9, Rnext_table 1419 | | bts eax, edi 1420 | | lea Rnext_table, [Rnext_table+rax*4] 1421 | | mov rax, r9 1422 | | sub rax, rbp 1423 | | shl eax, 14 1424 | | mov al, 128+64 1425 | | mov ah, cl 1426 | | or al, r7b 1427 | | mov [rbp + r10*4], eax 1428 | | ror r10d, cl 1429 | { /* Fill the sub-table that starts at r9, one opcode per pass of next_level2_huff. */ 1430 | |->next_level2_huff: 1431 | | mov eax, [r12] 1432 | | add r12, 4 1433 | | xor edi, edi 1434 | | mov ebx, Ehuff 1435 | | sub ah, cl 1436 | | shr ebx, cl 1437 | | bts edi, edx 1438 | | lea rbx, [r9 + rbx*4] 1439 | | shr edi, cl 1440 | { /* Replicate eax (its length byte already reduced by rootbits) at stride (1 << edx) >> rootbits, starting at index Ehuff >> rootbits, up to the end of the sub-table. */ 1441 | |->next_level2_pos: 1442 | | mov [rbx], eax 1443 | | lea rbx, [rbx + edi*4] 1444 | | cmp rbx, Rnext_table 1445 | | jb ->next_level2_pos 1446 | } /* After the fill: decrement the count for length edx; when it reaches zero, load the length of the next opcode and return if its top bit is set (sentinel). Otherwise Ehuff is stepped in bit-reversed order as in phase 1, and the loop stays in this sub-table for as long as the low rootbits bits of Ehuff still equal r10d. */ 1447 | | sub word [r13 + rdx*2], 1 1448 | | jnz >1 1449 | | movzx edx, byte [r12+1] 1450 |
| test dl, dl 1451 | | jns >1 1452 | |->make_tables_ret: 1453 | | ret 1454 | |1: 1455 | | shl edi, cl 1456 | | dec edi 1457 | | xor edi, Ehuff 1458 | | xor eax, eax 1459 | | bsr edi, edi 1460 | | bts eax, edi 1461 | | lea edi, [eax-1] 1462 | | and Ehuff, edi 1463 | | xor edi, edi 1464 | | add Ehuff, eax 1465 | | shrd edi, Ehuff, cl 1466 | | cmp edi, r10d 1467 | | jz ->next_level2_huff 1468 | } /* The low rootbits bits of Ehuff changed, so the next code hangs off a different root slot: remember the new prefix in r10d and build another sub-table. */ 1469 | | mov r10d, edi 1470 | | jmp ->next_level2 1471 | } 1472 | |.if not WIN 1473 | |=>asm_cfi_endproc(): 1474 | |.endif 1475 | } 1476 | 1477 | { /* pigz_init: exported initialiser; Rarg1 = pigz_state pointer, Rarg2 = opaque reader argument, Rarg3 = reader callback. Zeroes the qword at offset 0 of the state plus readpos, writepos and bits, and stores opaque and reader. input is set to 4 while the qword at offset 0 stays 0 (NOTE(review): this looks like the encoding of an empty input buffer, but the consumer, needbits, is outside this section -- confirm). eax is then raised to 7 and ecx cleared for CPUID leaf 7 subleaf 0; bit 0 of bh (EBX bit 8, the BMI2 feature bit) is kept and OR-ed with 12, so status starts as 12 or 13. status_dispatch sends any status >= 12 to gz_head and later uses bit 0 to pick the BMI2 main loop. rbx is pushed and popped because CPUID overwrites it; WIN builds also preserve Rarg1 across CPUID (presumably it lives in a register CPUID clobbers there -- its definition is not in this section). NOTE(review): the maximum supported CPUID leaf is not checked before leaf 7 is queried. */ 1478 | |.align 16 1479 | |=>asm_export("pigz_init"): 1480 | |->pigz_init: 1481 | |.if not WIN 1482 | |=>asm_cfi_startproc(): 1483 | |.endif 1484 | | xor eax, eax 1485 | | mov [Rarg1], rax 1486 | | mov State:Rarg1->readpos, rax 1487 | | mov State:Rarg1->writepos, rax 1488 | | mov State:Rarg1->opaque, Rarg2 1489 | | mov State:Rarg1->reader, Rarg3 1490 | | mov State:Rarg1->bits, rax 1491 | | or al, 4 1492 | | mov State:Rarg1->input, rax 1493 | | or al, 3 1494 | |.if WIN 1495 | | push Rarg1 1496 | |.endif 1497 | | xor ecx, ecx 1498 | | push rbx 1499 | | cpuid 1500 | | movzx eax, bh 1501 | | pop rbx 1502 | |.if WIN 1503 | | pop Rarg1 1504 | |.endif 1505 | | and al, 1 1506 | | or al, 12 1507 | | mov State:Rarg1->status, eax 1508 | | ret 1509 | |.if not WIN 1510 | |=>asm_cfi_endproc(): 1511 | |.endif 1512 | } 1513 | 1514 | |.align 64 1515 | |->pigz_crc_table: 1516 | { /* Emits the CRC-32 lookup data as 4 x 256 dwords. The first loop builds the usual byte-wise table for the reflected polynomial 0xedb88320 (kept in tab). Each pass of the second loop advances every entry by one further zero byte, tab2[n] = (tab2[n] >> 8) ^ tab[tab2[n] & 0xff], and emits it, giving the three extra tables that the four-bytes-at-a-time backref copy addresses as Rcrc_table + 256*4*i. The counter i is declared outside this block. */ 1517 | int n, k; 1518 | uint32_t tab[256], tab2[256]; 1519 | for (n = 0; n < 256; n++) { 1520 | int32_t c = n; 1521 | for (k = 0; k < 8; k++) { 1522 | c = (0xedb88320L & -(c & 1)) ^ (int32_t)((uint32_t)c >> 1); 1523 | } 1524 | tab[n] = tab2[n] = (uint32_t)c; 1525 | |.dword c 1526 | } 1527 | for (i = 0; i < 3; i++) { 1528 | for (n = 0; n < 256; n++) { 1529 | tab2[n] = (tab2[n] >> 8) ^ tab[tab2[n] & 0xff]; 1530 | |.dword tab2[n] 1531 | } 1532 | } 1533 | } /* Opcode tables follow. Each dword is NXBITS + (KIND << 6) in byte 0, the code length in byte 1 and the base value from bit 16 up. lit_ops has 32 entries for literal/length symbols 256 to 287: end-of-block (KIND 2, value 0), the 29 length codes, and two invalid symbols (KIND 2, value 1). dist_ops has the 30 distance codes, two invalid symbols, and a final ~0 dword that serves as the end-of-list sentinel when make_tables reads dist_ops directly for static Huffman blocks. The code-length bytes given here (7 or 8, and 5) are the static Huffman lengths; the dynamic-block path overwrites that byte (mov dh, cl) with the transmitted length. */ 1534 | |->lit_ops: 1535 | |.dword 128 + (7 << 8), 0 + (7 << 8) + (3 << 16), 0 + (7 << 8) + (4 << 16) 1536 | |.dword 0 + (7
<< 8) + (5 << 16), 0 + (7 << 8) + (6 << 16) 1537 | |.dword 0 + (7 << 8) + (7 << 16), 0 + (7 << 8) + (8 << 16) 1538 | |.dword 0 + (7 << 8) + (9 << 16), 0 + (7 << 8) + (10 << 16) 1539 | |.dword 1 + (7 << 8) + (11 << 16), 1 + (7 << 8) + (13 << 16) 1540 | |.dword 1 + (7 << 8) + (15 << 16), 1 + (7 << 8) + (17 << 16) 1541 | |.dword 2 + (7 << 8) + (19 << 16), 2 + (7 << 8) + (23 << 16) 1542 | |.dword 2 + (7 << 8) + (27 << 16), 2 + (7 << 8) + (31 << 16) 1543 | |.dword 3 + (7 << 8) + (35 << 16), 3 + (7 << 8) + (43 << 16) 1544 | |.dword 3 + (7 << 8) + (51 << 16), 3 + (7 << 8) + (59 << 16) 1545 | |.dword 4 + (7 << 8) + (67 << 16), 4 + (7 << 8) + (83 << 16) 1546 | |.dword 4 + (7 << 8) + (99 << 16) 1547 | |.dword 4 + (8 << 8) + (115 << 16), 5 + (8 << 8) + (131 << 16) 1548 | |.dword 5 + (8 << 8) + (163 << 16), 5 + (8 << 8) + (195 << 16) 1549 | |.dword 5 + (8 << 8) + (227 << 16), 0 + (8 << 8) + (258 << 16) 1550 | |.dword 128 + (8 << 8) + (1 << 16), 128 + (8 << 8) + (1 << 16) 1551 | | 1552 | |->dist_ops: 1553 | |.dword 0 + (5 << 8) + (1 << 16), 0 + (5 << 8) + (2 << 16), 0 + (5 << 8) + (3 << 16) 1554 | |.dword 0 + (5 << 8) + (4 << 16), 1 + (5 << 8) + (5 << 16), 1 + (5 << 8) + (7 << 16) 1555 | |.dword 2 + (5 << 8) + (9 << 16), 2 + (5 << 8) + (13 << 16) 1556 | |.dword 3 + (5 << 8) + (17 << 16), 3 + (5 << 8) + (25 << 16) 1557 | |.dword 4 + (5 << 8) + (33 << 16), 4 + (5 << 8) + (49 << 16) 1558 | |.dword 5 + (5 << 8) + (65 << 16), 5 + (5 << 8) + (97 << 16) 1559 | |.dword 6 + (5 << 8) + (129 << 16), 6 + (5 << 8) + (193 << 16) 1560 | |.dword 7 + (5 << 8) + (257 << 16), 7 + (5 << 8) + (385 << 16) 1561 | |.dword 8 + (5 << 8) + (513 << 16), 8 + (5 << 8) + (769 << 16) 1562 | |.dword 9 + (5 << 8) + (1025 << 16), 9 + (5 << 8) + (1537 << 16) 1563 | |.dword 10 + (5 << 8) + (2049 << 16), 10 + (5 << 8) + (3073 << 16) 1564 | |.dword 11 + (5 << 8) + (4097 << 16), 11 + (5 << 8) + (6145 << 16) 1565 | |.dword 12 + (5 << 8) + (8193 << 16), 12 + (5 << 8) + (12289 << 16) 1566 | |.dword 13 + (5 << 8) + 
(16385 << 16), 13 + (5 << 8) + (24577 << 16) 1567 | |.dword 128 + (5 << 8) + (1 << 16), 128 + (5 << 8) + (1 << 16), ~0 1568 | } 1569 | --------------------------------------------------------------------------------