├── .gitignore
├── src
    └── zig
    │   ├── main.zig
    │   ├── decoder.zig
    │   ├── encoder.zig
    │   ├── line-compressor.zig
    │   ├── table.zig
    │   └── trainer.zig
├── .github
    └── workflows
    │   └── test.yml
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | /.zig-cache
2 | /zig-out
3 | 


--------------------------------------------------------------------------------
/src/zig/main.zig:
--------------------------------------------------------------------------------
 1 | const std = @import("std");
 2 | 
 3 | pub const decode = @import("decoder.zig").decode;
 4 | pub const encode = @import("encoder.zig").encode;
 5 | pub const Table = @import("table.zig").Table;
 6 | pub const Trainer = @import("trainer.zig").Trainer;
 7 | 
 8 | // References every declaration of this file through `std.testing.refAllDecls`.
 9 | comptime {
10 |     const root = @This();
11 |     std.testing.refAllDecls(root);
12 | }
11 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: "Tests"
 2 | 
 3 | on:
 4 |   push:
 5 |   schedule:
 6 |   # Weekly run, every Friday at 03:00.
 7 |   - cron: "0 3 * * 5"
 8 |   workflow_dispatch:
 9 |   pull_request:
10 | 
11 | jobs:
12 |   test:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |         with:
17 |           submodules: true
18 |       - uses: mlugg/setup-zig@v2
19 |         with:
20 |           version: master
21 |       - name: Formatting
22 |         # Pass the directory and let `zig fmt` recurse on its own: the previous
23 |         # `src/**/*.zig` depended on the shell's glob expansion, where `**` only
24 |         # descends more than one level when globstar is enabled.
25 |         run: zig fmt --check src
26 |       - name: Tests
27 |         run: zig build test


--------------------------------------------------------------------------------
/src/zig/decoder.zig:
--------------------------------------------------------------------------------
 1 | const std = @import("std");
 2 | const Table = @import("./table.zig").Table;
 3 | 
 4 | /// Decodes `data` (as produced by `encode`) and writes the original bytes to `writer`.
 5 | ///
 6 | /// Every byte below 255 is replaced by the table entry with that index, while
 7 | /// the byte 255 is an escape marker: the byte following it is written verbatim.
 8 | ///
 9 | /// Returns `error.InvalidFormat` if `data` ends with an escape marker that has
10 | /// no byte after it. Errors from `writer` are passed through.
11 | pub fn decode(writer: *std.Io.Writer, data: []const u8, table: *const Table) !void {
12 |     var i: usize = 0;
13 |     while (i < data.len) {
14 |         const code = data[i];
15 |         if (code != 255) {
16 |             try writer.writeAll(table.lookup(code));
17 |             i += 1;
18 |             continue;
19 |         }
20 | 
21 |         // A trailing escape marker would make `data[i + 1]` read past the end of the input.
22 |         if (i + 1 >= data.len) return error.InvalidFormat;
23 |         try writer.writeByte(data[i + 1]);
24 |         i += 2;
25 |     }
26 | }
16 | 
17 | const testing = std.testing;
18 | 
19 | test "decoding" {
20 |     // Two dictionary words; the space between them is stored as an escaped byte.
21 |     var table = Table.init();
22 |     for ([_][]const u8{ "hello", "world" }) |word| table.insert(word);
23 | 
24 |     const encoded: []const u8 = &.{ 0, 255, ' ', 1 };
25 |     var buffer: [100]u8 = undefined;
26 |     var sink: std.Io.Writer = .fixed(&buffer);
27 |     try decode(&sink, encoded, &table);
28 | 
29 |     try testing.expectEqualSlices(u8, "hello world", sink.buffered());
30 | }
31 | 


--------------------------------------------------------------------------------
/src/zig/encoder.zig:
--------------------------------------------------------------------------------
 1 | const std = @import("std");
 2 | const Table = @import("./table.zig").Table;
 3 | 
 4 | /// Compresses `data` with the symbols of `tbl` and writes the result to `writer`.
 5 | ///
 6 | /// At every position the symbol returned by `tbl.findLongestSymbol` is emitted as
 7 | /// its one-byte table index. When no symbol matches, the escape marker 255 is
 8 | /// written, followed by the input byte itself.
 9 | pub fn encode(writer: *std.Io.Writer, data: []const u8, tbl: *const Table) !void {
10 |     var rest = data;
11 |     while (rest.len > 0) {
12 |         const sym = tbl.findLongestSymbol(rest) orelse {
13 |             // No table entry starts here: escape the byte.
14 |             try writer.writeByte(255);
15 |             try writer.writeByte(rest[0]);
16 |             rest = rest[1..];
17 |             continue;
18 |         };
19 |         try writer.writeByte(sym.index);
20 |         rest = rest[sym.data.len..];
21 |     }
22 | }
17 | 
18 | const testing = std.testing;
19 | 
20 | test "encode" {
21 |     var table = Table.init();
22 |     for ([_][]const u8{ "hallo", "hello", "world" }) |word| table.insert(word);
23 |     table.buildIndex();
24 | 
25 |     var buffer: [100]u8 = undefined;
26 |     var sink: std.Io.Writer = .fixed(&buffer);
27 |     try encode(&sink, "hello worldz", &table);
28 | 
29 |     // "hello" and "world" are table entries 1 and 2; ' ' and 'z' are escaped with 255.
30 |     const expected = [_]u8{ 1, 255, ' ', 2, 255, 'z' };
31 |     try testing.expectEqualSlices(u8, &expected, sink.buffered());
32 | }
35 | 


--------------------------------------------------------------------------------
/src/zig/line-compressor.zig:
--------------------------------------------------------------------------------
 1 | const std = @import("std");
 2 | const minz = @import("main.zig");
 3 | 
 4 | const allocator = std.heap.c_allocator;
 5 | 
 6 | /// Command-line entry point: reads the file named by the first argument, trains
 7 | /// a table on every 100th line over ten rounds, compresses each line on its own
 8 | /// and prints the total sizes together with the resulting ratio.
 9 | pub fn main() !void {
10 |     var args = std.process.args();
11 |     const binary_name = args.next() orelse unreachable;
12 | 
13 |     const file_name = args.next() orelse {
14 |         std.debug.print("usage: {s} FILENAME\n", .{binary_name});
15 |         // A missing argument is a usage error, so signal failure to the caller.
16 |         std.process.exit(1);
17 |     };
18 | 
19 |     var threaded: std.Io.Threaded = .init(allocator);
20 |     defer threaded.deinit();
21 | 
22 |     std.debug.print("Reading file: {s}\n", .{file_name});
23 | 
24 |     var file = try std.fs.cwd().openFile(file_name, .{});
25 |     // Release the file handle on every exit path.
26 |     defer file.close();
27 | 
28 |     var buf: [1024]u8 = undefined;
29 |     var r = file.reader(threaded.io(), &buf);
30 |     const data = try r.interface.allocRemaining(allocator, .unlimited);
31 |     defer allocator.free(data);
32 | 
33 |     var n: usize = 0;
34 |     var lines = std.mem.splitScalar(u8, data, '\n');
35 |     while (lines.next()) |_| n += 1;
36 | 
37 |     std.debug.print("Read {} lines.\n", .{n});
38 | 
39 |     var tbl = minz.Table.init();
40 | 
41 |     // Each round trains on the table produced by the previous round.
42 |     const num_iter = 10;
43 |     for (1..num_iter + 1) |iter| {
44 |         std.debug.print("Training on iteration {}\n", .{iter});
45 | 
46 |         std.debug.print("Adding 1% of lines...\n", .{});
47 |         var t = minz.Trainer.init(&tbl);
48 |         lines = std.mem.splitScalar(u8, data, '\n');
49 |         var i: usize = 0;
50 |         while (lines.next()) |line| : (i += 1) {
51 |             if (i % 100 == 0) t.add(line);
52 |         }
53 |         std.debug.print("Building new table.\n", .{});
54 |         tbl = try t.build(allocator);
55 |     }
56 | 
57 |     var uncompressed_size: usize = 0;
58 |     var compressed_size: usize = 0;
59 |     std.debug.print("Compressing...\n", .{});
60 | 
61 |     var result = std.Io.Writer.Allocating.init(allocator);
62 |     defer result.deinit();
63 | 
64 |     // Every line is compressed separately; the output buffer is reused between lines.
65 |     lines = std.mem.splitScalar(u8, data, '\n');
66 |     while (lines.next()) |line| {
67 |         uncompressed_size += line.len;
68 |         try minz.encode(&result.writer, line, &tbl);
69 |         compressed_size += result.written().len;
70 |         result.shrinkRetainingCapacity(0);
71 |     }
72 | 
73 |     std.debug.print("Uncompressed: {}\nCompressed:   {}\n", .{ uncompressed_size, compressed_size });
74 |     std.debug.print("Ratio: {d}\n", .{@as(f64, @floatFromInt(uncompressed_size)) / @as(f64, @floatFromInt(compressed_size))});
75 | }
75 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # minz: A minimal compressor
 2 | 
 3 | minz is a minimal string compressor based on the paper [FSST: Fast Random Access String Compression](http://www.vldb.org/pvldb/vol13/p2649-boncz.pdf).
 4 | 
 5 | The compressed format is very simple:
 6 | It uses a pre-computed dictionary of 255 entries, each word being at most 8 bytes long.
 7 | Each byte from 0x00 to 0xFE emits the corresponding word from the dictionary, while the byte 0xFF is an escape marker which emits the next byte as-is.
 8 | 
 9 | **Example:** If the dictionary contains `0x00 = hello` and `0x01 = world`,
10 | then `0x00 0xFF 0x20 0x01 0xFF 0x21` (six bytes) decompresses into `hello world!`.
11 | 
12 | This has the following characteristics:
13 | 
14 | * You'll have to build the dictionary on sample data before you can compress anything.
15 | * There's extremely little overhead in the compressed string. 
16 |   This makes it usable for compressing small strings (<200 bytes) directly.
17 | * The maximum compression ratio is 8x (since each word in the dictionary is at most 8 bytes long), but the typical ratio seems to be around 2x-3x.
18 | * This makes minz quite different from "classical" compression algorithms and it has different use cases.
19 |   In a database system you can use minz to compress the individual _entries_ in an index,
20 |   while with other compression schemes you typically have to compress a bigger block.
21 |   This is what the authors of the paper mean by "random access string compression".
22 | 
23 | ## Usage
24 | 
25 | minz is currently provided as a **library in Zig**.
26 | There's no documentation and you'll have to look at the public functions and test cases.
27 | 
28 | There's also a small command-line tool which reads in a file, trains a dictionary (from 1% of the lines), compresses each line separately, and then reports the total ratio:
29 | 
30 | ```
31 | $ zig build
32 | $ ./zig-out/bin/line-compressor access.log
33 | Reading file: access.log
34 | Read 689253 lines.
35 | Training...
36 | Compressing...
37 | Uncompressed: 135114557
38 | Compressed:   46209436
39 | Ratio: 2.9239603140795745
40 | ```
41 | 
42 | ## Current status
43 | 
44 | This is just a personal learning project for understanding the algorithm in the paper.
45 | It's not being used in any production systems, and I'm not actively developing it.
46 | 
47 | In addition, the dictionary-training algorithm presented in the paper is actually a bit vague on the exact details.
48 | There is some choice in how you combine symbols and right now it doesn't seem to create an "optimal" dictionary according to human inspection.
49 | If you intend to use this for a "real" project you'll probably have to invest some more time.
50 | 
51 | ## Roadmap / pending work
52 | 
53 | - [ ] Improve training algorithm.
54 | - [ ] Command-line tool (for training/encoding/decoding).
55 | - [ ] Plain JavaScript encoder/decoder.
56 | - [ ] Optimized encoder using AVX512.
57 | - [ ] Integrate encoder/decoder as a native Node module.


--------------------------------------------------------------------------------
/src/zig/table.zig:
--------------------------------------------------------------------------------
  1 | const std = @import("std");
  2 | const assert = std.debug.assert;
  3 | 
  4 | const Symbol = struct { // A table entry as returned by `Table.findLongestSymbol`.
  5 |     index: u8, // Position of the entry inside the table.
  6 |     data: []const u8, // Bytes of the entry; a slice into the table's own storage.
  7 | };
  8 | 
  9 | const HIGH_BIT = 1 << 7; // Mask for the most significant bit of a byte.
 10 | const PREFIX = [4]u8{ 'M' | HIGH_BIT, 'I' | HIGH_BIT, 'N' | HIGH_BIT, 'Z' | HIGH_BIT }; // "MINZ" with the high bit set; written first by `writeTo` and checked by `readFrom`.
 11 | pub const MAX_SYMBOL = 8; // Maximum length in bytes of a single table entry.
 12 | 
 13 | /// A dictionary of up to 255 symbols, each between 1 and `MAX_SYMBOL` bytes long.
 14 | ///
 15 | /// Symbols are addressed by their position in insertion order. `index` is a
 16 | /// lookup structure over the first byte of each symbol; it is only valid after
 17 | /// `buildIndex` has been called.
 18 | pub const Table = struct {
 19 |     /// Number of symbols inserted so far.
 20 |     n: u8 = 0,
 21 |     /// Length of each symbol; slots that were never filled have length 0.
 22 |     lengths: [255]u8,
 23 |     /// Symbol bytes; only the first `lengths[i]` bytes of `symbols[i]` are used.
 24 |     symbols: [255][MAX_SYMBOL]u8,
 25 |     /// Symbols starting with byte `b` occupy the positions from `index[b]` up to,
 26 |     /// but not including, `index[b + 1]`.
 27 |     index: [257]u8,
 28 | 
 29 |     /// Returns an empty table with all storage zeroed.
 30 |     pub fn init() Table {
 31 |         return .{
 32 |             .lengths = std.mem.zeroes([255]u8),
 33 |             .symbols = std.mem.zeroes([255][MAX_SYMBOL]u8),
 34 |             .index = std.mem.zeroes([257]u8),
 35 |         };
 36 |     }
 37 | 
 38 |     /// insert adds a new entry into the table. It's up to the caller to ensure that:
 39 |     /// 1. `insert` must be ordered by the first character. You can't insert "world" before "hello".
 40 |     /// 2. The longest prefix must be inserted first. You can't insert "he" before "hello".
 41 |     /// 3. `data` holds between 1 and `MAX_SYMBOL` bytes and the table is not full yet.
 42 |     pub fn insert(self: *Table, data: []const u8) void {
 43 |         assert(data.len > 0 and data.len <= MAX_SYMBOL);
 44 |         assert(self.n < self.lengths.len);
 45 |         const idx = self.n;
 46 |         self.n += 1;
 47 |         self.lengths[idx] = @intCast(data.len);
 48 |         std.mem.copyForwards(u8, &self.symbols[idx], data);
 49 |     }
 50 | 
 51 |     /// buildIndex builds the index used for encoding. This must be called after all `insert`
 52 |     /// have been done in order for `findLongestSymbol` to work as intended.
 53 |     pub fn buildIndex(self: *Table) void {
 54 |         // Next first-byte value whose start position has not been recorded yet.
 55 |         var next_byte: usize = 0;
 56 | 
 57 |         var idx: u8 = 0;
 58 |         while (idx < self.n) : (idx += 1) {
 59 |             const fst = self.symbols[idx][0];
 60 |             // Every byte value up to and including `fst` that has no symbol of its
 61 |             // own gets an empty range ending at the current symbol.
 62 |             while (next_byte <= fst) : (next_byte += 1) {
 63 |                 self.index[next_byte] = idx;
 64 |             }
 65 |         }
 66 | 
 67 |         // The remaining entries, including the sentinel at 256, point past the last symbol.
 68 |         while (next_byte < self.index.len) : (next_byte += 1) {
 69 |             self.index[next_byte] = self.n;
 70 |         }
 71 |     }
 72 | 
 73 |     /// Returns the bytes of the symbol at position `idx`, which must be below 255.
 74 |     /// A slot that was never filled yields an empty slice.
 75 |     pub fn lookup(self: *const Table, idx: u8) []const u8 {
 76 |         return self.symbols[idx][0..self.lengths[idx]];
 77 |     }
 78 | 
 79 |     /// Returns the first symbol, in table order, that `data` starts with, or null if
 80 |     /// there is none (which includes empty `data`). With the insertion order that
 81 |     /// `insert` asks for, the first match is also the longest one.
 82 |     pub fn findLongestSymbol(self: *const Table, data: []const u8) ?Symbol {
 83 |         if (data.len == 0) return null;
 84 | 
 85 |         const fst: usize = data[0];
 86 |         const idx_stop = self.index[fst + 1];
 87 |         var idx = self.index[fst];
 88 |         while (idx < idx_stop) : (idx += 1) {
 89 |             const sym = self.lookup(idx);
 90 |             if (std.mem.startsWith(u8, data, sym)) {
 91 |                 return .{ .index = idx, .data = sym };
 92 |             }
 93 |         }
 94 |         return null;
 95 |     }
 96 | 
 97 |     /// Like `findLongestSymbol`, but returns null when the match is a single byte.
 98 |     pub fn findLongestMultiSymbol(self: *const Table, data: []const u8) ?Symbol {
 99 |         const sym = self.findLongestSymbol(data) orelse return null;
100 |         return if (sym.data.len > 1) sym else null;
101 |     }
102 | 
103 |     /// Serializes the table: the 4-byte prefix, a version byte (1), the symbol
104 |     /// count, one length byte per symbol and finally `MAX_SYMBOL` bytes per symbol.
105 |     pub fn writeTo(self: *const Table, writer: *std.Io.Writer) !void {
106 |         try writer.writeAll(&PREFIX);
107 |         try writer.writeByte(1);
108 |         try writer.writeByte(self.n);
109 |         try writer.writeAll(self.lengths[0..self.n]);
110 |         // Slicing by symbol count avoids computing `n * MAX_SYMBOL` in `u8`, which
111 |         // overflows as soon as the table holds 32 or more symbols.
112 |         try writer.writeAll(std.mem.sliceAsBytes(self.symbols[0..self.n]));
113 |     }
114 | 
115 |     /// Reads a table in the format produced by `writeTo` and builds its index.
116 |     ///
117 |     /// Returns `error.InvalidFormat` for a wrong prefix, an unknown version or a
118 |     /// symbol length outside `1..MAX_SYMBOL`, and `error.EndOfStream` when the
119 |     /// input is shorter than the header announces.
120 |     pub fn readFrom(reader: *std.Io.Reader) !Table {
121 |         var res = Table.init();
122 | 
123 |         const prefix = try reader.take(PREFIX.len);
124 |         if (!std.mem.eql(u8, &PREFIX, prefix)) return error.InvalidFormat;
125 | 
126 |         const version = try reader.takeByte();
127 |         if (version != 1) return error.InvalidFormat;
128 | 
129 |         const n = try reader.takeByte();
130 | 
131 |         // `readSliceAll` fails on a short read instead of leaving entries half filled.
132 |         const lengths = res.lengths[0..n];
133 |         try reader.readSliceAll(lengths);
134 |         for (lengths) |len| {
135 |             // A length above MAX_SYMBOL would make `lookup` slice out of bounds.
136 |             if (len == 0 or len > MAX_SYMBOL) return error.InvalidFormat;
137 |         }
138 | 
139 |         try reader.readSliceAll(std.mem.sliceAsBytes(res.symbols[0..n]));
140 | 
141 |         res.n = n;
142 |         res.buildIndex();
143 |         return res;
144 |     }
145 | };
118 | 
119 | const testing = std.testing;
120 | 
121 | test "set and lookup" {
122 |     var table = Table.init();
123 |     table.insert("hello");
124 |     table.insert("world");
125 | 
126 |     // Slot 2 was never filled and therefore reads back as the empty string.
127 |     const expected = [_][]const u8{ "hello", "world", "" };
128 |     for (expected, 0..) |want, slot| {
129 |         try testing.expectEqualSlices(u8, want, table.lookup(@intCast(slot)));
130 |     }
131 | }
130 | 
131 | test "serialization" {
132 |     var original = Table.init();
133 |     original.insert("hello");
134 |     original.insert("world");
135 | 
136 |     var out: std.Io.Writer.Allocating = .init(testing.allocator);
137 |     defer out.deinit();
138 |     try original.writeTo(&out.writer);
139 | 
140 |     // Prefix, version, n, (lengths + symbols) * 2 entries.
141 |     const serialized = out.written();
142 |     const expected_len: usize = 4 + 1 + 1 + (1 + MAX_SYMBOL) * 2;
143 |     try testing.expectEqual(expected_len, serialized.len);
144 | 
145 |     // Reading the bytes back must give the same two entries and nothing more.
146 |     var input: std.Io.Reader = .fixed(serialized);
147 |     const restored = try Table.readFrom(&input);
148 |     try testing.expectEqualSlices(u8, "hello", restored.lookup(0));
149 |     try testing.expectEqualSlices(u8, "world", restored.lookup(1));
150 |     try testing.expectEqualSlices(u8, "", restored.lookup(2));
151 | }
149 | 


--------------------------------------------------------------------------------
/src/zig/trainer.zig:
--------------------------------------------------------------------------------
  1 | const std = @import("std");
  2 | 
  3 | const encode = @import("./encoder.zig").encode;
  4 | const table_ns = @import("table.zig");
  5 | const Table = table_ns.Table;
  6 | const MAX_SYMBOL = table_ns.MAX_SYMBOL;
  7 | 
  8 | /// Collects statistics over sample texts and derives a new table from them.
  9 | ///
 10 | /// A `Trainer` is tied to the table passed to `init`: `add` splits text using
 11 | /// that table and counts what it sees, `build` turns the counts into a new table.
 12 | pub const Trainer = struct {
 13 |     /// A candidate entry for the next table.
 14 |     const Cand = struct {
 15 |         data: [MAX_SYMBOL]u8,
 16 |         len: u8,
 17 |         /// Candidate length multiplied by the number of times it was counted.
 18 |         gain: usize,
 19 | 
 20 |         fn slice(cand: *const @This()) []const u8 {
 21 |             return cand.data[0..cand.len];
 22 |         }
 23 |     };
 24 | 
 25 |     // Note that the trainer uses a "code" which is a number between 0 and 510.
 26 |     // When this is between 0-255 it refers to a single byte.
 27 |     // When this is between 256-510 it refers to symbol `code - 256` in the table.
 28 | 
 29 |     table: *const Table,
 30 |     /// `count1[c]` is the number of times code `c` was seen by `add`.
 31 |     count1: [512]usize,
 32 |     /// `count2[a][b]` is the number of times code `b` directly followed code `a`.
 33 |     count2: [512][512]usize,
 34 | 
 35 |     /// Returns a trainer for `table` with all counters set to zero.
 36 |     pub fn init(table: *const Table) Trainer {
 37 |         return Trainer{
 38 |             .table = table,
 39 |             .count1 = std.mem.zeroes([512]usize),
 40 |             .count2 = std.mem.zeroes([512][512]usize),
 41 |         };
 42 |     }
 43 | 
 44 |     /// Invalidates the trainer; it owns no resources that need to be released.
 45 |     pub fn deinit(self: *Trainer) void {
 46 |         self.* = undefined;
 47 |     }
 48 | 
 49 |     /// Splits `text` into codes and updates the counters. At each position a table
 50 |     /// symbol longer than one byte is preferred; otherwise the single byte is used.
 51 |     /// Pairs are only counted within one call, never across calls.
 52 |     pub fn add(self: *Trainer, text: []const u8) void {
 53 |         var pos: usize = 0;
 54 |         var prev: ?usize = null;
 55 | 
 56 |         while (pos < text.len) {
 57 |             var code: usize = text[pos];
 58 |             var step: usize = 1;
 59 |             if (self.table.findLongestMultiSymbol(text[pos..])) |sym| {
 60 |                 code = 256 + @as(usize, sym.index);
 61 |                 step = sym.data.len;
 62 |             }
 63 | 
 64 |             self.count1[code] += 1;
 65 |             if (prev) |p| self.count2[p][code] += 1;
 66 | 
 67 |             pos += step;
 68 |             prev = code;
 69 |         }
 70 |     }
 71 | 
 72 |     /// Appends the bytes that `code` stands for to `buf` at offset `i.*` and advances
 73 |     /// `i.*`. Symbol bytes that do not fit into `buf` are dropped. For a single-byte
 74 |     /// code the caller has to make sure that `i.*` is below `MAX_SYMBOL`.
 75 |     fn decode(self: *const Trainer, code: usize, buf: *[MAX_SYMBOL]u8, i: *u8) void {
 76 |         if (code < 256) {
 77 |             buf[i.*] = @intCast(code);
 78 |             i.* += 1;
 79 |         } else {
 80 |             const d = self.table.lookup(@intCast(code - 256));
 81 |             const len = @min(@as(u8, @intCast(d.len)), MAX_SYMBOL - i.*);
 82 |             std.mem.copyForwards(u8, buf[i.*..], d[0..len]);
 83 |             i.* += len;
 84 |         }
 85 |     }
 86 | 
 87 |     /// Builds a new table out of the counted codes and code pairs.
 88 |     ///
 89 |     /// Every counted code and every counted pair (concatenated and cut off at
 90 |     /// `MAX_SYMBOL` bytes) becomes a candidate. The 255 candidates with the highest
 91 |     /// gain are kept and inserted in the order that `Table.insert` requires.
 92 |     /// `allocator` is only used for the temporary candidate list.
 93 |     pub fn build(self: *const Trainer, allocator: std.mem.Allocator) !Table {
 94 |         var cands = std.ArrayList(Cand).empty;
 95 |         defer cands.deinit(allocator);
 96 | 
 97 |         // Number of codes in use: all single bytes plus every symbol of the table.
 98 |         const m: usize = 256 + @as(usize, self.table.n);
 99 | 
100 |         for (0..m) |code1| {
101 |             const seen1 = self.count1[code1];
102 |             // A code that was never seen cannot start a counted pair either, because
103 |             // `add` only records a pair after it has counted the pair's first code.
104 |             if (seen1 == 0) continue;
105 | 
106 |             var cand: [MAX_SYMBOL]u8 = undefined;
107 |             var i: u8 = 0;
108 |             self.decode(code1, &cand, &i);
109 |             try cands.append(allocator, Cand{ .data = cand, .len = i, .gain = i * seen1 });
110 | 
111 |             // If the first symbol is already of length 8 there's nothing to combine.
112 |             if (i == MAX_SYMBOL) continue;
113 | 
114 |             for (0..m) |code2| {
115 |                 const seen2 = self.count2[code1][code2];
116 |                 // Decode only pairs that actually occurred.
117 |                 if (seen2 == 0) continue;
118 | 
119 |                 // Restart right after the first code; `cand[0..i]` is left untouched.
120 |                 var j = i;
121 |                 self.decode(code2, &cand, &j);
122 |                 try cands.append(allocator, Cand{ .data = cand, .len = j, .gain = j * seen2 });
123 |             }
124 |         }
125 | 
126 |         const sorting = struct {
127 |             /// Highest gain first.
128 |             fn byGain(context: void, a: Cand, b: Cand) bool {
129 |                 _ = context;
130 |                 return a.gain > b.gain;
131 |             }
132 | 
133 |             /// Ascending by first byte; within the same first byte descending, so
134 |             /// that a longer candidate comes before any of its prefixes.
135 |             fn byData(context: void, a: Cand, b: Cand) bool {
136 |                 _ = context;
137 |                 return switch (std.math.order(a.data[0], b.data[0])) {
138 |                     .lt => true,
139 |                     .gt => false,
140 |                     .eq => std.mem.lessThan(u8, b.slice(), a.slice()),
141 |                 };
142 |             }
143 |         };
144 | 
145 |         if (cands.items.len > 255) {
146 |             // Only keep the 255 best candidates (by gain).
147 |             std.mem.sort(Cand, cands.items, {}, sorting.byGain);
148 |             cands.shrinkRetainingCapacity(255);
149 |         }
150 | 
151 |         std.mem.sort(Cand, cands.items, {}, sorting.byData);
152 | 
153 |         var res = Table.init();
154 |         for (cands.items) |cand| {
155 |             res.insert(cand.slice());
156 |         }
157 |         res.buildIndex();
158 |         return res;
159 |     }
160 | };
141 | 
142 | const testing = std.testing;
143 | 
144 | test "training" {
145 |     const target = "tumcwitumvldb";
146 |     // Compressed size of `target` before each successive round of training.
147 |     const expected_compression = [_]usize{ 26, 7, 4, 2 };
148 | 
149 |     var tbl = Table.init();
150 | 
151 |     for (expected_compression) |expected_len| {
152 |         var scratch: [100]u8 = undefined;
153 |         var sink: std.Io.Writer = .fixed(&scratch);
154 |         try encode(&sink, target, &tbl);
155 |         try testing.expectEqual(expected_len, sink.buffered().len);
156 | 
157 |         // The trainer holds a 512x512 counter matrix, so it lives on the heap here.
158 |         const trainer = try testing.allocator.create(Trainer);
159 |         defer testing.allocator.destroy(trainer);
160 | 
161 |         trainer.* = Trainer.init(&tbl);
162 |         defer trainer.deinit();
163 |         trainer.add(target);
164 |         tbl = try trainer.build(testing.allocator);
165 |     }
166 | }
165 | 


--------------------------------------------------------------------------------