├── .gitignore ├── src └── zig │ ├── main.zig │ ├── decoder.zig │ ├── encoder.zig │ ├── line-compressor.zig │ ├── table.zig │ └── trainer.zig ├── .github └── workflows │ └── test.yml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /.zig-cache 2 | /zig-out 3 | -------------------------------------------------------------------------------- /src/zig/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | pub const decode = @import("decoder.zig").decode; 4 | pub const encode = @import("encoder.zig").encode; 5 | pub const Table = @import("table.zig").Table; 6 | pub const Trainer = @import("trainer.zig").Trainer; 7 | 8 | comptime { 9 | std.testing.refAllDecls(@This()); 10 | } 11 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: "Tests" 2 | 3 | on: 4 | push: 5 | schedule: 6 | - cron: "0 3 * * 5" 7 | workflow_dispatch: 8 | pull_request: 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: true 17 | - uses: mlugg/setup-zig@v2 18 | with: 19 | version: master 20 | - name: Formatting 21 | run: zig fmt --check src/**/*.zig 22 | - name: Tests 23 | run: zig build test -------------------------------------------------------------------------------- /src/zig/decoder.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const Table = @import("./table.zig").Table; 3 | 4 | pub fn decode(writer: *std.Io.Writer, data: []const u8, table: *const Table) !void { 5 | var i: usize = 0; 6 | while (i < data.len) { 7 | if (data[i] == 255) { 8 | try writer.writeByte(data[i + 1]); 9 | i += 2; 10 | } else { 11 | try writer.writeAll(table.lookup(data[i])); 12 | i += 1; 13 | } 14 | } 15 | } 16 | 17 | const testing = std.testing; 18 | 19 | test "decoding" { 20 | var tbl = Table.init(); 21 | tbl.insert("hello"); 22 | tbl.insert("world"); 23 | const data = [_]u8{ 0, 255, ' ', 1 }; 24 | 25 | var output: [100]u8 = undefined; 26 | var w = std.Io.Writer.fixed(&output); 27 | try decode(&w, &data, &tbl); 28 | 29 | try testing.expectEqualSlices(u8, "hello world", w.buffered()); 30 | } 31 | -------------------------------------------------------------------------------- /src/zig/encoder.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const Table = @import("./table.zig").Table; 3 | 4 | pub fn encode(writer: *std.Io.Writer, data: []const u8, tbl: *const Table) !void { 5 | var i: usize = 0; 6 | while (i < data.len) { 7 | if (tbl.findLongestSymbol(data[i..])) |sym| { 8 | try writer.writeByte(sym.index); 9 | i += sym.data.len; 10 | } else { 11 | try writer.writeByte(255); 12 | try writer.writeByte(data[i]); 13 | i += 1; 14 | } 15 | } 16 | } 17 | 18 | const testing = std.testing; 19 | 20 | test "encode" { 21 | var tbl = Table.init(); 22 | tbl.insert("hallo"); 23 | tbl.insert("hello"); 24 | tbl.insert("world"); 25 | tbl.buildIndex(); 26 | 27 | const data = "hello worldz"; 28 | 29 | var output: [100]u8 = undefined; 30 | var w = std.Io.Writer.fixed(&output); 31 | try encode(&w, data, &tbl); 32 | 33 | try testing.expectEqualSlices(u8, &[_]u8{ 1, 255, ' ', 2, 255, 'z' }, w.buffered()); 34 | } 35 | -------------------------------------------------------------------------------- /src/zig/line-compressor.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const minz = @import("main.zig"); 3 | 4 | const allocator = std.heap.c_allocator; 5 | 6 | pub fn main() !void { 7 | var args = std.process.args(); 8 | const binary_name = args.next() orelse unreachable; 9 | 10 | const file_name = args.next() orelse { 11 | std.debug.print("usage: {s} FILENAME\n", .{binary_name}); 12 | std.process.exit(0); 13 | }; 14 | 15 | var threaded: std.Io.Threaded = .init(allocator); 16 | defer threaded.deinit(); 17 | 18 | std.debug.print("Reading file: {s}\n", .{file_name}); 19 | 20 | var file = try std.fs.cwd().openFile(file_name, .{}); 21 | 22 | var buf: [1024]u8 = undefined; 23 | var r = file.reader(threaded.io(), &buf); 24 | const data = try r.interface.allocRemaining(allocator, .unlimited); 25 | defer allocator.free(data); 26 | 27 | var n: usize = 0; 28 | var lines = std.mem.splitAny(u8, data, "\n"); 29 | while (lines.next()) |line| { 30 | _ = line; 31 | n += 1; 32 | } 33 | 34 | std.debug.print("Read {} lines.\n", .{n}); 35 | 36 | var tbl = minz.Table.init(); 37 | 38 | var iter: usize = 1; 39 | const num_iter = 10; 40 | while (iter <= num_iter) : (iter += 1) { 41 | std.debug.print("Training on iteration {}\n", .{iter}); 42 | 43 | std.debug.print("Adding 1% of lines...\n", .{}); 44 | var t = minz.Trainer.init(&tbl); 45 | var i: usize = 0; 46 | lines = std.mem.splitAny(u8, data, "\n"); 47 | while (lines.next()) |line| { 48 | if (i % 100 == 0) { 49 | t.add(line); 50 | } 51 | i += 1; 52 | } 53 | std.debug.print("Building new table.\n", .{}); 54 | tbl = try t.build(allocator); 55 | } 56 | 57 | var uncompressed_size: usize = 0; 58 | var compressed_size: usize = 0; 59 | std.debug.print("Compressing...\n", .{}); 60 | 61 | var result = std.Io.Writer.Allocating.init(allocator); 62 | defer result.deinit(); 63 | 64 | lines = std.mem.splitAny(u8, data, "\n"); 65 | while (lines.next()) |line| { 66 | uncompressed_size += line.len; 67 | try minz.encode(&result.writer, line, &tbl); 68 | compressed_size += result.written().len; 69 | result.shrinkRetainingCapacity(0); 70 | } 71 | 72 | std.debug.print("Uncompressed: {}\nCompressed: {}\n", .{ uncompressed_size, compressed_size }); 73 | std.debug.print("Ratio: {d}\n", .{@as(f64, @floatFromInt(uncompressed_size)) / @as(f64, @floatFromInt(compressed_size))}); 74 | } 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # minz: A minimal compressor 2 | 3 | minz is a minimal string compressor based on the paper [FSST: Fast Random Access String Compression](http://www.vldb.org/pvldb/vol13/p2649-boncz.pdf). 4 | 5 | The compressed format is very simple: 6 | It uses a pre-computed dictionary of 255 entries, each word being at most 8 bytes long. 7 | Bytes 0x00 to 0xFE adds a word from the dictionary, while byte 0xFF is an escape character which adds the next character as-is. 8 | 9 | **Example:** If the dictionary contains `0x00 = hello` and `0x01 = world`, 10 | then `0x00 0xFF 0x20 0x01 0xFF 0x21` (six bytes) decompresses into `hello world!`. 11 | 12 | This has the following characteristics: 13 | 14 | * You'll have to build the dictionary on sample data before you can compress anything. 15 | * There's extremely little overhead in the compressed string. 16 | This makes it usable for compressing small strings (<200 bytes) directly. 17 | * The maximal compression ratio is 8x (since each word in the dictionary is at most 8 bytes long), but typical ratio seems to be around ~2x-3x. 18 | * This makes minz quite different from "classical" compression algorithms and it has different use cases. 19 | In a database system you can use minz to compress the individual _entries_ in an index, 20 | while with other compression schemes you typically have to compress a bigger block. 21 | This is what the authors of the paper mean by "random access string compression". 22 | 23 | ## Usage 24 | 25 | minz is currently provided as a **library in Zig**. 26 | There's no documentation and you'll have to look at the public functions and test cases. 27 | 28 | There's also a small command-line tool which reads in a file, trains a dictionary (from 1% of the lines), compresses each line separately, and then reports the total ratio: 29 | 30 | ``` 31 | $ zig build 32 | $ ./zig-out/bin/line-compressor access.log 33 | Reading file: access.log 34 | Read 689253 lines. 35 | Training... 36 | Compressing... 37 | Uncompressed: 135114557 38 | Compressed: 46209436 39 | Ratio: 2.9239603140795745 40 | ``` 41 | 42 | ## Current status 43 | 44 | This is just a learning project for me to personally learn the algorithm in the paper. 45 | It's not being used in any production systems, and I'm not actively developing it. 46 | 47 | In addition, the dictionary-training algorithm presented in the paper is actually a bit vague on the exact details. 48 | There is some choice in how you combine symbols and right now it doesn't seem to create an "optimal" dictionary according to human inspection. 49 | If you intend to use this for a "real" project you'll probably have to invest some more time. 50 | 51 | ## Roadmap / pending work 52 | 53 | - [ ] Improve training algorithm. 54 | - [ ] Command-line tool (for training/encoding/decoding). 55 | - [ ] Plain JavaScript encoder/decoder. 56 | - [ ] Optimized encoder using AVX512. 57 | - [ ] Integrate encoder/decoder as a native Node module. -------------------------------------------------------------------------------- /src/zig/table.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const assert = std.debug.assert; 3 | 4 | const Symbol = struct { 5 | index: u8, 6 | data: []const u8, 7 | }; 8 | 9 | const HIGH_BIT = 1 << 7; 10 | const PREFIX = [4]u8{ 'M' | HIGH_BIT, 'I' | HIGH_BIT, 'N' | HIGH_BIT, 'Z' | HIGH_BIT }; 11 | pub const MAX_SYMBOL = 8; 12 | 13 | pub const Table = struct { 14 | n: u8 = 0, 15 | lengths: [255]u8, 16 | symbols: [255][MAX_SYMBOL]u8, 17 | index: [257]u8, 18 | 19 | pub fn init() Table { 20 | return .{ 21 | .lengths = std.mem.zeroes([255]u8), 22 | .symbols = std.mem.zeroes([255][MAX_SYMBOL]u8), 23 | .index = std.mem.zeroes([257]u8), 24 | }; 25 | } 26 | 27 | /// insert adds a new entry into the table. It's up to the caller to ensure that: 28 | /// 1. `insert` must be ordered by the first character. You can't insert "world" before "hello". 29 | /// 2. The longest prefix must be inserted first. You can't insert "he" before "hello". 30 | pub fn insert(self: *Table, data: []const u8) void { 31 | assert(data.len > 0 and data.len <= MAX_SYMBOL); 32 | const idx = self.n; 33 | self.n += 1; 34 | self.lengths[idx] = @intCast(data.len); 35 | std.mem.copyForwards(u8, &self.symbols[idx], data); 36 | } 37 | 38 | /// buildIndex builds the index used for encoding. This must be called after all `insert` 39 | /// have been done in order for `findLongestSymbol` to work as intended. 40 | pub fn buildIndex(self: *Table) void { 41 | var idx: u8 = 0; 42 | var current_fst: usize = 0; 43 | 44 | while (idx < 255) : (idx += 1) { 45 | if (self.lengths[idx] == 0) { 46 | break; 47 | } 48 | 49 | const fst = self.symbols[idx][0]; 50 | while (current_fst <= fst) : (current_fst += 1) { 51 | self.index[current_fst] = idx; 52 | } 53 | } 54 | 55 | while (current_fst <= 256) : (current_fst += 1) { 56 | self.index[current_fst] = idx; 57 | } 58 | } 59 | 60 | pub fn lookup(self: *const Table, idx: u8) []const u8 { 61 | return self.symbols[idx][0..self.lengths[idx]]; 62 | } 63 | 64 | pub fn findLongestSymbol(self: *const Table, data: []const u8) ?Symbol { 65 | const fst = @as(usize, data[0]); 66 | var idx = self.index[fst]; 67 | const idx_stop = self.index[fst + 1]; 68 | while (idx < idx_stop) : (idx += 1) { 69 | const sym = self.lookup(idx); 70 | if (std.mem.startsWith(u8, data, sym)) { 71 | return Symbol{ 72 | .index = idx, 73 | .data = sym, 74 | }; 75 | } 76 | } 77 | return null; 78 | } 79 | 80 | pub fn findLongestMultiSymbol(self: *const Table, data: []const u8) ?Symbol { 81 | if (self.findLongestSymbol(data)) |sym| { 82 | if (sym.data.len > 1) return sym; 83 | } 84 | return null; 85 | } 86 | 87 | pub fn writeTo(self: *const Table, writer: *std.Io.Writer) !void { 88 | try writer.writeAll(&PREFIX); 89 | try writer.writeByte(1); 90 | try writer.writeByte(self.n); 91 | try writer.writeAll(self.lengths[0..self.n]); 92 | const bin_symbols: [*]const u8 = @ptrCast(&self.symbols); 93 | try writer.writeAll(bin_symbols[0 .. self.n * MAX_SYMBOL]); 94 | } 95 | 96 | pub fn readFrom(reader: *std.Io.Reader) !Table { 97 | var res = Table.init(); 98 | 99 | const prefix = try reader.take(PREFIX.len); 100 | if (!std.mem.eql(u8, &PREFIX, prefix)) return error.InvalidFormat; 101 | 102 | const version = try reader.takeByte(); 103 | if (version != 1) return error.InvalidFormat; 104 | 105 | const n = try reader.takeByte(); 106 | var lengths_writer = std.Io.Writer.fixed(&res.lengths); 107 | _ = try reader.stream(&lengths_writer, .limited(n)); 108 | 109 | const bin_symbols: [*]u8 = @ptrCast(&res.symbols); 110 | var bin_symbols_writer = std.Io.Writer.fixed(bin_symbols[0 .. n * MAX_SYMBOL]); 111 | _ = try reader.stream(&bin_symbols_writer, .unlimited); 112 | 113 | res.n = n; 114 | res.buildIndex(); 115 | return res; 116 | } 117 | }; 118 | 119 | const testing = std.testing; 120 | 121 | test "set and lookup" { 122 | var tbl = Table.init(); 123 | tbl.insert("hello"); 124 | tbl.insert("world"); 125 | 126 | try testing.expectEqualSlices(u8, "hello", tbl.lookup(0)); 127 | try testing.expectEqualSlices(u8, "world", tbl.lookup(1)); 128 | try testing.expectEqualSlices(u8, "", tbl.lookup(2)); 129 | } 130 | 131 | test "serialization" { 132 | var tbl = Table.init(); 133 | tbl.insert("hello"); 134 | tbl.insert("world"); 135 | 136 | var allocating = std.Io.Writer.Allocating.init(testing.allocator); 137 | defer allocating.deinit(); 138 | 139 | try tbl.writeTo(&allocating.writer); 140 | // Prefix, version, n, (lengths + symbols) * 2 entries. 141 | try testing.expectEqual(@as(usize, 4 + 1 + 1 + (1 + MAX_SYMBOL) * 2), allocating.written().len); 142 | 143 | var r = std.Io.Reader.fixed(allocating.written()); 144 | var tbl2 = try Table.readFrom(&r); 145 | try testing.expectEqualSlices(u8, "hello", tbl2.lookup(0)); 146 | try testing.expectEqualSlices(u8, "world", tbl2.lookup(1)); 147 | try testing.expectEqualSlices(u8, "", tbl2.lookup(2)); 148 | } 149 | -------------------------------------------------------------------------------- /src/zig/trainer.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | const encode = @import("./encoder.zig").encode; 4 | const table_ns = @import("table.zig"); 5 | const Table = table_ns.Table; 6 | const MAX_SYMBOL = table_ns.MAX_SYMBOL; 7 | 8 | pub const Trainer = struct { 9 | const Cand = struct { 10 | data: [MAX_SYMBOL]u8, 11 | len: u8, 12 | gain: usize, 13 | 14 | fn slice(cand: *const @This()) []const u8 { 15 | return cand.data[0..cand.len]; 16 | } 17 | }; 18 | 19 | // Note that the trainer uses a "code" which is a number between 0 and 512. 20 | // When this is between 0-255 it refers to a single byte. 21 | // When this is between 256-512 it refers to a symbol in the table. 22 | 23 | table: *const Table, 24 | count1: [512]usize, 25 | count2: [512][512]usize, 26 | 27 | pub fn init(table: *const Table) Trainer { 28 | return Trainer{ 29 | .table = table, 30 | .count1 = std.mem.zeroes([512]usize), 31 | .count2 = std.mem.zeroes([512][512]usize), 32 | }; 33 | } 34 | 35 | pub fn deinit(self: *Trainer) void { 36 | self.* = undefined; 37 | } 38 | 39 | pub fn add(self: *Trainer, text: []const u8) void { 40 | var pos: usize = 0; 41 | var prev: ?usize = null; 42 | 43 | while (pos < text.len) { 44 | if (self.table.findLongestMultiSymbol(text[pos..])) |sym| { 45 | const sym_idx = 256 + @as(usize, sym.index); 46 | 47 | self.count1[sym_idx] += 1; 48 | if (prev) |p| self.count2[p][sym_idx] += 1; 49 | 50 | pos += sym.data.len; 51 | prev = sym_idx; 52 | } else { 53 | const byte_idx = @as(usize, text[pos]); 54 | self.count1[byte_idx] += 1; 55 | if (prev) |p| self.count2[p][byte_idx] += 1; 56 | 57 | pos += 1; 58 | prev = byte_idx; 59 | } 60 | } 61 | } 62 | 63 | fn decode(self: *const Trainer, code: usize, buf: *[MAX_SYMBOL]u8, i: *u8) void { 64 | if (code < 256) { 65 | buf[i.*] = @intCast(code); 66 | i.* += 1; 67 | } else { 68 | const d = self.table.lookup(@intCast(code - 256)); 69 | const len = @min(@as(u8, @intCast(d.len)), MAX_SYMBOL - i.*); 70 | std.mem.copyForwards(u8, buf[i.*..], d[0..len]); 71 | i.* += len; 72 | } 73 | } 74 | 75 | pub fn build(self: *const Trainer, allocator: std.mem.Allocator) !Table { 76 | var cands = std.ArrayList(Cand).empty; 77 | defer cands.deinit(allocator); 78 | 79 | // The number of entries in the table. 80 | const m: usize = 256 + @as(usize, self.table.n); 81 | 82 | var code1: usize = 0; 83 | while (code1 < m) : (code1 += 1) { 84 | var cand: [MAX_SYMBOL]u8 = undefined; 85 | var i: u8 = 0; 86 | 87 | self.decode(code1, &cand, &i); 88 | 89 | if (self.count1[code1] > 0) { 90 | const gain1 = i * self.count1[code1]; 91 | try cands.append(allocator, Cand{ .data = cand, .len = i, .gain = gain1 }); 92 | } 93 | 94 | // If the first symbol is already of length 8 there's nothing to combine. 95 | if (i == MAX_SYMBOL) continue; 96 | 97 | var code2: usize = 0; 98 | while (code2 < m) : (code2 += 1) { 99 | var j = i; 100 | self.decode(code2, &cand, &j); 101 | 102 | if (self.count2[code1][code2] > 0) { 103 | const gain2 = j * self.count2[code1][code2]; 104 | try cands.append(allocator, Cand{ .data = cand, .len = j, .gain = gain2 }); 105 | } 106 | } 107 | } 108 | 109 | const sorting = struct { 110 | fn byGain(context: void, a: Cand, b: Cand) bool { 111 | _ = context; 112 | return a.gain > b.gain; 113 | } 114 | 115 | fn byData(context: void, a: Cand, b: Cand) bool { 116 | _ = context; 117 | return switch (std.math.order(a.data[0], b.data[0])) { 118 | .lt => true, 119 | .gt => false, 120 | .eq => std.mem.lessThan(u8, b.slice(), a.slice()), 121 | }; 122 | } 123 | }; 124 | 125 | if (cands.items.len > 255) { 126 | // Only keep the 255 best candidates (by gain). 127 | std.mem.sort(Cand, cands.items, {}, sorting.byGain); 128 | cands.shrinkRetainingCapacity(255); 129 | } 130 | 131 | std.mem.sort(Cand, cands.items, {}, sorting.byData); 132 | 133 | var res = Table.init(); 134 | for (cands.items) |cand| { 135 | res.insert(cand.slice()); 136 | } 137 | res.buildIndex(); 138 | return res; 139 | } 140 | }; 141 | 142 | const testing = std.testing; 143 | 144 | test "training" { 145 | const target = "tumcwitumvldb"; 146 | const expected_compression = [_]usize{ 26, 7, 4, 2 }; 147 | 148 | var tbl = Table.init(); 149 | 150 | for (expected_compression) |c| { 151 | var compressed: [100]u8 = undefined; 152 | var w = std.Io.Writer.fixed(&compressed); 153 | try encode(&w, target, &tbl); 154 | try testing.expectEqual(c, w.buffered().len); 155 | 156 | var t = try testing.allocator.create(Trainer); 157 | defer testing.allocator.destroy(t); 158 | 159 | t.* = Trainer.init(&tbl); 160 | defer t.deinit(); 161 | t.add(target); 162 | tbl = try t.build(testing.allocator); 163 | } 164 | } 165 | --------------------------------------------------------------------------------