├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── src └── lz4dec.S └── test ├── main.c └── mock.c /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /lz4dec 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Siguza 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean 2 | 3 | all: lz4dec 4 | 5 | lz4dec: src/lz4dec.S test/main.c 6 | $(CC) -o $@ $^ -Wall -O3 $(CFLAGS) 7 | 8 | clean: 9 | rm -f lz4dec 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lz4dec 2 | 3 | LZ4 block decompressor written in arm64 assembly. 4 | 5 | ### Overview 6 | 7 | lz4dec was written with a focus on small code size and is primarily intended for self-extracting firmware images. It has no dependencies, doesn't use the stack, and only does single-byte memory accesses, so it should work in any environment, with or without MMU, with any memory alignment. It also supports overlapping input and output buffers, so long as the output buffer doesn't "catch up" (i.e. output is never written into parts of the input buffer that haven't been parsed yet). Thus when in doubt, align the input to the end of the available buffer. 8 | 9 | It has two configurations: 10 | - "Safe" 11 | This is the default and should be used whenever feasible. In this configuration, the input is considered untrusted and proper bounds checks are employed. 12 | - "Unsafe" 13 | This configuration can be enabled by defining the `UNSAFE_LZ4` macro and will eliminate all bounds checks. This must only be used with trusted input, e.g. when combined with a "known-good" compressed firmware image. 14 | 15 | At the time of writing, the code size of the safe configuration is 252 bytes while that of the unsafe configuration is 144 bytes. 16 | 17 | ### Usage 18 | 19 | The [`lz4dec.S`](https://github.com/Siguza/lz4dec/blob/master/src/lz4dec.S) file exports a single function called `lz4dec`. 
20 | 21 | If `UNSAFE_LZ4` is not defined, then the prototype of that function is: 22 | 23 | extern uint64_t lz4dec(const void *src, void *dst, uint64_t srcsz, uint64_t dstsz); 24 | 25 | If `UNSAFE_LZ4` is defined, then the prototype becomes: 26 | 27 | extern uint64_t lz4dec(const void *src, void *dst, uint64_t srcsz); 28 | 29 | The return value is the number of bytes written to the output buffer, or `0` if an error occurred. 30 | 31 | `srcsz` must be exactly the size of the compressed data; trailing bytes that are not LZ4 block data are not allowed. This is required because the size must be known in order to determine where an LZ4 block ends. 32 | 33 | ### Testing 34 | 35 | There is a test binary written in C that wraps around `lz4dec`. 36 | 37 | On arm64 hosts, it can be built like so: 38 | 39 | make # for the safe configuration 40 | CFLAGS='-DUNSAFE_LZ4' make # for the unsafe configuration 41 | 42 | And used like so: 43 | 44 | ./lz4dec < file.lz4 > file 45 | 46 | There are no command-line options; compressed data is read from standard input and the decompressed data is written to standard output. 47 | 48 | I also wrote a [companion binary](https://github.com/Siguza/lz4hc) to generate single-block LZ4 data as used by this test binary. 49 | 50 | ### License 51 | 52 | [MIT](https://github.com/Siguza/lz4dec/blob/master/LICENSE). 
53 | -------------------------------------------------------------------------------- /src/lz4dec.S: -------------------------------------------------------------------------------- 1 | #ifdef NO_GLOBAL 2 | .macro sym name 3 | L_\name: 4 | .endm 5 | #else 6 | # define CONCAT(x,y) x ## y 7 | .macro sym name 8 | .globl CONCAT(__USER_LABEL_PREFIX__,\name) 9 | CONCAT(__USER_LABEL_PREFIX__,\name): 10 | .endm 11 | #endif 12 | 13 | // If UNSAFE_LZ4 is not defined: 14 | // extern uint64_t lz4dec(const void *src, void *dst, uint64_t srcsz, uint64_t dstsz); 15 | 16 | // If UNSAFE_LZ4 is defined: 17 | // extern uint64_t lz4dec(const void *src, void *dst, uint64_t srcsz); 18 | 19 | .p2align 2 20 | sym lz4dec 21 | // Register allocation: 22 | // x0 next input byte ptr 23 | // x1 next output byte ptr 24 | // x2 end of input buffer 25 | // x3 end of output buffer 26 | // x4 literals length / matchlength 27 | // x5 scratch / offset 28 | // x6 scratch 29 | 30 | // x14 original start of output buffer 31 | // x15 return address 32 | mov x15, x30 33 | mov x14, x1 34 | // Calculate end of the buffer 35 | adds x2, x0, x2 36 | #ifndef UNSAFE_LZ4 37 | // And make sure it's sane 38 | b.cs Lerr 39 | adds x3, x1, x3 40 | b.cs Lerr 41 | #endif 42 | Lsequence: 43 | // New sequence 44 | #ifndef UNSAFE_LZ4 45 | cmp x0, x2 46 | b.hs Lerr 47 | #endif 48 | ldrb w4, [x0], 1 49 | and w5, w4, 0xf 50 | // Extract literals length 51 | ubfx w4, w4, 4, 4 52 | cbz w4, Lmatchlength 53 | bl Llongsz 54 | // Copy literals to output buffer 55 | #ifndef UNSAFE_LZ4 56 | // (x1 < x0 || x1 >= x2) && x0 < x2 && x1 < x3 && x4 <= (x2 - x0) && x4 <= (x3 - x1) 57 | cmp x1, x0 58 | ccmp x1, x2, 2, hs 59 | ccmp x0, x2, 2, hs 60 | ccmp x1, x3, 2, lo 61 | sub x6, x2, x0 62 | ccmp x4, x6, 2, lo 63 | sub x6, x3, x1 64 | ccmp x4, x6, 2, ls 65 | b.hi Lerr 66 | #endif 67 | Lliterals: 68 | ldrb w6, [x0], 1 69 | strb w6, [x1], 1 70 | sub x4, x4, 1 71 | cbnz x4, Lliterals 72 | Lmatchlength: 73 | // End of the block only happens if 
matchlength is zero *and* we're at the 74 | // end of the input stream. If we're not at the end of the input stream, 75 | // then a matchlength of 0 means a copy of 4 bytes. 76 | cmp w5, 0 77 | ccmp x0, x2, 0, eq 78 | b.hs Lend 79 | mov w4, w5 80 | // Offset 81 | #ifndef UNSAFE_LZ4 82 | sub x6, x2, x0 83 | cmp x6, 2 84 | b.lo Lerr 85 | #endif 86 | ldrb w5, [x0], 1 87 | ldrb w6, [x0], 1 88 | bfi w5, w6, 8, 8 89 | #ifndef UNSAFE_LZ4 90 | // Zero offset is invalid 91 | cbz w5, Lerr 92 | #endif 93 | // Extract matchlength 94 | bl Llongsz 95 | adds x4, x4, 4 96 | #ifndef UNSAFE_LZ4 97 | b.cs Lerr 98 | #endif 99 | // Copy match 100 | subs x5, x1, x5 101 | #ifndef UNSAFE_LZ4 102 | // Protect against underflow and OOB 103 | ccmp x5, x14, 0, hs 104 | sub x6, x3, x1 105 | ccmp x6, x4, 0, hs 106 | b.lo Lerr 107 | #endif 108 | Lmatch: 109 | ldrb w6, [x5], 1 110 | strb w6, [x1], 1 111 | sub x4, x4, 1 112 | cbnz x4, Lmatch 113 | b Lsequence 114 | 115 | Llongsz: 116 | // Extract more size bytes 117 | cmp w4, 0xf 118 | b.ne Ldonesz 119 | Lmoresz: 120 | #ifndef UNSAFE_LZ4 121 | cmp x0, x2 122 | b.hs Lerr 123 | #endif 124 | ldrb w6, [x0], 1 125 | adds x4, x4, x6 126 | #ifndef UNSAFE_LZ4 127 | b.cs Lerr 128 | #endif 129 | cmp w6, 0xff 130 | b.eq Lmoresz 131 | Ldonesz: 132 | ret 133 | 134 | #ifndef UNSAFE_LZ4 135 | Lerr: 136 | mov x1, x14 137 | #endif 138 | Lend: 139 | sub x0, x1, x14 140 | ret x15 141 | -------------------------------------------------------------------------------- /test/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #ifndef UNSAFE_LZ4 8 | extern uint64_t lz4dec(const void *src, void *dst, uint64_t srcsz, uint64_t dstsz); 9 | #else 10 | extern uint64_t lz4dec(const void *src, void *dst, uint64_t srcsz); 11 | #endif 12 | 13 | int main(void) 14 | { 15 | void *inbuf = NULL; 16 | size_t sz = 0x8000; 17 | size_t len = 0; 18 | while(1) 19 | { 20 | sz *= 2; 21 
| inbuf = realloc(inbuf, sz); 22 | if(!inbuf) 23 | { 24 | fprintf(stderr, "realloc: %s\n", strerror(errno)); 25 | return -1; 26 | } 27 | size_t want = sz - len; 28 | size_t have = fread((char*)inbuf + len, 1, sz - len, stdin); 29 | len += have; 30 | if(have < want) 31 | { 32 | if(feof(stdin)) 33 | { 34 | break; 35 | } 36 | fprintf(stderr, "fread: %s\n", strerror(errno)); 37 | return -1; 38 | } 39 | } 40 | 41 | // Docs say max compression ratio is around 250, so with 256 42 | // we should be good even if we don't know the actual output size? 43 | size_t outsz = len * 0x100; 44 | void *outbuf = malloc(outsz); 45 | if(!outbuf) 46 | { 47 | fprintf(stderr, "malloc: %s\n", strerror(errno)); 48 | return -1; 49 | } 50 | 51 | #ifndef UNSAFE_LZ4 52 | uint64_t outlen = lz4dec(inbuf, outbuf, len, outsz); 53 | #else 54 | uint64_t outlen = lz4dec(inbuf, outbuf, len); 55 | #endif 56 | 57 | if(!outlen) 58 | { 59 | fprintf(stderr, "lz4 error\n"); 60 | return -1; 61 | } 62 | 63 | size_t written = fwrite(outbuf, 1, outlen, stdout); 64 | if(written != outlen) 65 | { 66 | fprintf(stderr, "fwrite: %s\n", strerror(errno)); 67 | return -1; 68 | } 69 | 70 | fprintf(stderr, "Decompressed 0x%zx bytes to 0x%llx bytes\n", len, (unsigned long long)outlen); 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /test/mock.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // C version just to see if clang can produce better asm. 4 | // As of now, it hasn't. 
/*
 * C model of the safe (bounds-checked) configuration of the assembly
 * `lz4dec` routine. Local names x0..x5 and w6 mirror the registers used
 * there.
 *
 * src/srcsz: compressed LZ4 block and its exact size in bytes.
 * dst/dstsz: output buffer and its capacity in bytes.
 *
 * Returns the number of bytes written to dst, or 0 on any error (address
 * range wrap-around, truncated input, zero match offset, length overflow,
 * match reaching before dst, output overrun, or literal output landing
 * inside the not-yet-consumed part of the input).
 *
 * As in the assembly, a token whose literal-length nibble is 0 skips the
 * literal stage completely, including its bounds checks. Applying those
 * checks to an empty literal run would reject a block whose final token
 * has no literals, which the assembly accepts.
 */
uint64_t lz4dec(const void *src, void *dst, uint64_t srcsz, uint64_t dstsz)
{
    const uint8_t *x0 = src;
    uint8_t *x1 = dst;

    // One-past-the-end pointers; a wrapped address range is an error.
    uintptr_t x2p;
    if(__builtin_add_overflow((uintptr_t)x0, srcsz, &x2p)) return 0;
    const uint8_t *x2 = (const uint8_t*)x2p;
    uintptr_t x3p;
    if(__builtin_add_overflow((uintptr_t)x1, dstsz, &x3p)) return 0;
    uint8_t *x3 = (uint8_t*)x3p;

    while(1)
    {
        uint8_t w6;

        // Token: high nibble = literal length, low nibble = match length.
        if(x0 >= x2) return 0;
        uint8_t token = *x0++;
        uint64_t litlen = token >> 4;
        uint64_t matchlen = token & 0xf;

        if(litlen != 0)
        {
            // A nibble of 0xf is extended by following bytes until one is != 0xff.
            if(litlen == 0xf)
            {
                do
                {
                    if(x0 >= x2) return 0;
                    w6 = *x0++;
                    if(__builtin_add_overflow(litlen, w6, &litlen)) return 0;
                } while(w6 == 0xff);
            }
            // Output must not point into the unread input, and the literal
            // run must fit in both the remaining input and remaining output.
            if(!((x1 < x0 || x1 >= x2) && x0 < x2 && x1 < x3
                 && litlen <= (uint64_t)(x2 - x0) && litlen <= (uint64_t)(x3 - x1))) return 0;
            for(; litlen > 0; --litlen)
            {
                *x1++ = *x0++;
            }
        }

        // The block ends only when the match nibble is 0 *and* the input is
        // exhausted; otherwise a nibble of 0 means a 4-byte match.
        if(matchlen == 0 && x0 >= x2) return x1 - (uint8_t*)dst;

        // 16-bit little-endian match offset; 0 is invalid.
        if(x2 - x0 < 2) return 0;
        uint16_t offset = (uint16_t)x0[0] | ((uint16_t)x0[1] << 8);
        x0 += 2;
        if(!offset) return 0;

        if(matchlen == 0xf)
        {
            do
            {
                if(x0 >= x2) return 0;
                w6 = *x0++;
                if(__builtin_add_overflow(matchlen, w6, &matchlen)) return 0;
            } while(w6 == 0xff);
        }
        // Encoded match length is biased by 4.
        if(__builtin_add_overflow(matchlen, 4, &matchlen)) return 0;

        // The match source must start inside the already-written output,
        // and the copy must fit in the remaining output space.
        uintptr_t x5p;
        if(__builtin_sub_overflow((uintptr_t)x1, offset, &x5p)) return 0;
        uint8_t *x5 = (uint8_t*)x5p;
        if(x5 < (uint8_t*)dst || matchlen > (uint64_t)(x3 - x1)) return 0;

        // Byte-wise forward copy: source and destination may overlap.
        for(; matchlen > 0; --matchlen)
        {
            *x1++ = *x5++;
        }
    }
}