├── .github
│   └── workflows
│       └── test.yml
├── .gitignore
├── LICENSE.txt
├── README.md
├── build.zig
├── build.zig.zon
├── src
│   ├── CompactArray.zig
│   ├── DictArray.zig
│   ├── EliasFano.zig
│   ├── StringDict.zig
│   ├── darray.zig
│   ├── main.zig
│   ├── pthash.zig
│   ├── ribbon.zig
│   └── utils.zig
└── tools
    ├── zini-pthash
    │   └── main.zig
    ├── zini-ribbon
    │   └── main.zig
    └── zini-seqz
        └── main.zig

/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: "Tests"
2 | 
3 | on:
4 |   push:
5 |   schedule:
6 |     - cron: "0 3 * * 5"
7 |   workflow_dispatch:
8 |   pull_request:
9 | 
10 | jobs:
11 |   test:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v3
15 |         with:
16 |           path: zini
17 | 
18 |       - uses: goto-bus-stop/setup-zig@v1
19 |         with:
20 |           version: master
21 | 
22 |       - name: Formatting
23 |         run: zig fmt --check src/*.zig
24 |         working-directory: zini
25 | 
26 |       - name: Tests
27 |         run: zig build test
28 |         working-directory: zini
29 | 
30 |       - name: Build executables
31 |         run: zig build
32 |         working-directory: zini
33 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .zig-cache
2 | zig-out
3 | /coverage
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | BSD Zero Clause License
2 | 
3 | Copyright (c) 2022 Magnus Holm
4 | 
5 | Permission to use, copy, modify, and/or distribute this software for any
6 | purpose with or without fee is hereby granted.
7 | 
8 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
9 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10 | AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
11 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
13 | OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14 | PERFORMANCE OF THIS SOFTWARE.
15 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Zini
2 | 
3 | Zini (Zig + Mini) is a [Zig](https://ziglang.org/) library providing some succinct data structures:
4 | 
5 | - `zini.pthash`, a [**minimal perfect hash function**](https://en.wikipedia.org/wiki/Perfect_hash_function) construction algorithm, using less than 4 bits per element.
6 | - `zini.ribbon`, a **retrieval data structure** (sometimes called a "static function") construction algorithm, having less than 1% overhead.
7 | - `zini.CompactArray` stores n-bit numbers tightly packed, leaving no bits unused.
8 |   If the largest value in an array is `m`, then you only need `n = floor(log2(m)) + 1` bits per element.
9 |   E.g. if the largest value is 270, you will get 7x compression using CompactArray over `[]u64` as it stores each element using only 9 bits (and 64 divided by 9 is roughly 7).
10 | - `zini.DictArray` finds all distinct elements in the array, stores each once in a CompactArray (the dictionary), and creates a new CompactArray containing indexes into the dictionary.
11 |   This gives excellent compression if there's a lot of repetition in the original array.
12 | - `zini.EliasFano` stores increasing 64-bit numbers in a compact manner.
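  For example (a minimal sketch based on `src/EliasFano.zig`; `allocator` stands for any `std.mem.Allocator`):

  ```zig
  const std = @import("std");
  const zini = @import("zini");

  // The input must be sorted in increasing order:
  var ef = try zini.EliasFano.encode(allocator, &[_]u64{ 2, 3, 5, 7, 11 });
  defer ef.deinit(allocator);

  std.debug.assert(ef.get(2) == 5); // random access by index
  ```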
13 | - `zini.darray` provides constant-time support for the `select1(i)` operation, which returns the position of the _i_-th set bit in a `std.DynamicBitSetUnmanaged`.
14 | 
15 | ## Overview
16 | 
17 | ### PTHash, minimal perfect hash function
18 | 
19 | `zini.pthash` contains an implementation of [PTHash][pthash], a [minimal perfect hash function](https://en.wikipedia.org/wiki/Perfect_hash_function) construction algorithm.
20 | Given a set of `n` elements, with the only requirement being that you can hash them, it generates a hash function which maps each element to a distinct number between `0` and `n - 1`.
21 | The generated hash function is extremely small, typically consuming less than **4 _bits_ per element**, regardless of the size of the input type.
22 | The algorithm provides multiple parameters to tune, making it possible to optimize for (small) size, (short) construction time, or (short) lookup time.
23 | 
24 | To give a practical example:
25 | In ~0.6 seconds Zini was able to create a hash function for /usr/share/dict/words containing 235886 words.
26 | The resulting hash function required in total 865682 bits in memory.
27 | This corresponds to 108.2 kB in total, or 3.67 bits per word.
28 | In comparison, the original file was 2.49 MB and compressing it with `gzip -9` only gets it down to 754 kB (which you can't use directly in memory without decompressing it).
29 | It should of course be noted that the two don't store equivalent data: the generated hash function can't tell you whether a word is present in the list at all.
30 | The comparison is mainly useful to get a feeling for the magnitudes.
31 | 
32 | ### Bumped Ribbon Retrieval, a retrieval data structure
33 | 
34 | `zini.ribbon` contains an implementation of [Bumped Ribbon Retrieval][burr] (_BuRR_), a retrieval data structure.
35 | Given `n` keys (with the only requirement being that you can hash them), each of which has an `r`-bit value, we build a data structure which returns the value for each of the `n` keys.
36 | However, the keys themselves are not stored (we only use the hash), so if you ask for the value of an _unknown_ key you will get a seemingly random answer; there's no way of knowing whether the key was present in the original dataset or not.
37 | 
38 | The theoretical minimum space needed to store the _values_ is `n * r` bits (we have `n` `r`-bit values, after all).
39 | We use the term "overhead" to refer to how much _extra_ data we need beyond that.
40 | Bumped Ribbon Retrieval often has **less than 1% overhead**.
41 | 
42 | ## Usage
43 | 
44 | Zini is intended to be used as a library, but also ships the command-line tools `zini-pthash` and `zini-ribbon`.
45 | As the documentation is a bit lacking, it might be useful to look through `tools/zini-{pthash,ribbon}/main.zig` to understand how it's used.
46 | 
47 | ```
48 | USAGE
49 | ./zig-out/bin/zini-pthash [build | lookup]
50 | 
51 | COMMAND: build
52 | Builds hash function for plain text file.
53 | 
54 | -i, --input
55 | -o, --output
56 | -c
57 | -a, --alpha
58 | -s, --seed
59 | 
60 | COMMAND: lookup
61 | 
62 | -i, --input
63 | -k, --key
64 | -b, --benchmark
65 | ```
66 | 
67 | And here's an example run of using `zini-pthash`.
68 | 
69 | ```
70 | # Build zini-pthash:
71 | $ zig build -Doptimize=ReleaseSafe
72 | 
73 | # Build a hash function:
74 | $ ./zig-out/bin/zini-pthash build -i /usr/share/dict/words -o words.pth
75 | Reading /usr/share/dict/words...
76 | 
77 | Building hash function...
78 | 
79 | Successfully built hash function:
80 | seed: 12323441790160983030
81 | bits: 865554
82 | bits/n: 3.6693741892269993
83 | 
84 | Writing to words.pth
85 | 
86 | # Look up an index in the hash function:
87 | $ ./zig-out/bin/zini-pthash lookup -i words.pth --key hello
88 | Reading words.pth...
89 | 
90 | Successfully loaded hash function:
91 | seed: 12323441790160983030
92 | bits: 865554
93 | bits/n: 3.6693741892269993
94 | 
95 | Looking up key=hello:
96 | 112576
97 | ```
98 | 
99 | ## Acknowledgments
100 | 
101 | Zini is merely an implementation of existing algorithms and techniques already described in the literature:
102 | 
103 | - The [PTHash][pthash] algorithm is described by Giulio Ermanno Pibiri and Roberto Trani in arXiv:2104.10402.
104 |   - They also implemented PTHash as a C++ library under the MIT license.
105 |     Zini uses no code directly from that repository, but it has been an invaluable resource for understanding how to implement PTHash in practice.
106 | - The [BuRR][burr] data structure is described by Peter C. Dillinger, Lorenz Hübschle-Schneider, Peter Sanders and Stefan Walzer in arXiv:2109.01892.
107 | 
108 | [pthash]: https://arxiv.org/abs/2104.10402
109 | [burr]: https://arxiv.org/abs/2109.01892
110 | 
111 | ## License
112 | 
113 | Zini is licensed under the [0BSD license](https://spdx.org/licenses/0BSD.html).
--------------------------------------------------------------------------------
/build.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 | 
3 | pub fn build(b: *std.Build) !void {
4 |     const optimize = b.standardOptimizeOption(.{});
5 |     const target = b.standardTargetOptions(.{});
6 | 
7 |     const zini = b.addModule("zini", .{
8 |         .root_source_file = b.path("src/main.zig"),
9 |     });
10 | 
11 |     const tests = b.addTest(.{
12 |         .root_source_file = b.path("src/main.zig"),
13 |         .target = target,
14 |         .optimize = optimize,
15 |     });
16 |     const tests_run_step = b.addRunArtifact(tests);
17 |     tests_run_step.has_side_effects = true;
18 | 
19 |     const coverage = b.option(bool, "test-coverage", "Generate test coverage") orelse false;
20 |     if (coverage) {
21 |         const runner = [_][]const u8{
22 |             "kcov",
23 |             "--include-path",
24 |             ".",
25 |             "coverage", // output dir
26 |         };
27 | 
28 |         const dst = try tests_run_step.argv.addManyAt(b.allocator, 0, runner.len);
29 |         for (runner, 0..) 
|arg, idx| { 30 | dst[idx] = .{ .bytes = b.dupe(arg) }; 31 | } 32 | } 33 | 34 | const test_step = b.step("test", "Run unit tests"); 35 | test_step.dependOn(&tests_run_step.step); 36 | 37 | const parg = b.dependency("parg", .{ .target = target, .optimize = optimize }); 38 | 39 | const pthash = b.addExecutable(.{ 40 | .name = "zini-pthash", 41 | .root_source_file = b.path("tools/zini-pthash/main.zig"), 42 | .target = target, 43 | .optimize = optimize, 44 | }); 45 | pthash.root_module.addImport("zini", zini); 46 | pthash.root_module.addImport("parg", parg.module("parg")); 47 | b.installArtifact(pthash); 48 | 49 | const ribbon = b.addExecutable(.{ 50 | .name = "zini-ribbon", 51 | .root_source_file = b.path("tools/zini-ribbon/main.zig"), 52 | .target = target, 53 | .optimize = optimize, 54 | }); 55 | ribbon.root_module.addImport("zini", zini); 56 | ribbon.root_module.addImport("parg", parg.module("parg")); 57 | b.installArtifact(ribbon); 58 | 59 | const seqz = b.addExecutable(.{ 60 | .name = "zini-seqz", 61 | .root_source_file = b.path("tools/zini-seqz/main.zig"), 62 | .target = target, 63 | .optimize = optimize, 64 | }); 65 | seqz.root_module.addImport("zini", zini); 66 | seqz.root_module.addImport("parg", parg.module("parg")); 67 | b.installArtifact(seqz); 68 | } 69 | -------------------------------------------------------------------------------- /build.zig.zon: -------------------------------------------------------------------------------- 1 | .{ 2 | .name = .zini, 3 | .fingerprint = 0xadd0f09e47acd80c, 4 | // This is a [Semantic Version](https://semver.org/). 5 | // In a future version of Zig it will be used for package deduplication. 6 | .version = "0.0.0", 7 | 8 | // This field is optional. 9 | // This is currently advisory only; Zig does not yet do anything 10 | // with this value. 11 | //.minimum_zig_version = "0.11.0", 12 | 13 | // This field is optional. 14 | // Each dependency must either provide a `url` and `hash`, or a `path`. 15 | // `zig build --fetch` can be used to fetch all dependencies of a package, recursively. 16 | // Once all dependencies are fetched, `zig build` no longer requires 17 | // internet connectivity. 18 | .dependencies = .{ 19 | // See `zig fetch --save ` for a command-line interface for adding dependencies. 20 | .parg = .{ 21 | // When updating this field to a new URL, be sure to delete the corresponding 22 | // `hash`, otherwise you are communicating that you expect to find the old hash at 23 | // the new URL. 24 | .url = "https://github.com/judofyr/parg/archive/3e1d79ee543c56797f89bb7ced8ae02f115b9ff3.tar.gz", 25 | 26 | // This is computed from the file contents of the directory of files that is 27 | // obtained after fetching `url` and applying the inclusion rules given by 28 | // `paths`. 29 | // 30 | // This field is the source of truth; packages do not come from a `url`; they 31 | // come from a `hash`. `url` is just one of many possible mirrors for how to 32 | // obtain a package matching this `hash`. 33 | // 34 | // Uses the [multihash](https://multiformats.io/multihash/) format. 35 | .hash = "1220c33110d98d9bf2139ab2b702347a22849166455e132252c08749bd6cb09ffe03", 36 | 37 | // When this is provided, the package is found in a directory relative to the 38 | // build root. In this case the package's hash is irrelevant and therefore not 39 | // computed. This field and `url` are mutually exclusive. 40 | // .path = "foo", 41 | }, 42 | }, 43 | 44 | // Specifies the set of files and directories that are included in this package. 
45 |     // Only files and directories listed here are included in the `hash` that
46 |     // is computed for this package.
47 |     // Paths are relative to the build root. Use the empty string (`""`) to refer to
48 |     // the build root itself.
49 |     // A directory listed here means that all files within, recursively, are included.
50 |     .paths = .{
51 |         // This makes *all* files, recursively, included in this package. It is generally
52 |         // better to explicitly list the files and directories instead, to ensure that
53 |         // fetching from tarballs, file system paths, and version control all result
54 |         // in the same contents hash.
55 |         "",
56 |         // For example...
57 |         //"build.zig",
58 |         //"build.zig.zon",
59 |         //"src",
60 |         //"LICENSE",
61 |         //"README.md",
62 |     },
63 | }
--------------------------------------------------------------------------------
/src/CompactArray.zig:
--------------------------------------------------------------------------------
1 | //! CompactArray stores a list of n-bit integers packed tightly.
2 | const std = @import("std");
3 | const builtin = @import("builtin");
4 | 
5 | const utils = @import("./utils.zig");
6 | 
7 | const Self = @This();
8 | 
9 | const Int = u64;
10 | const IntLog2 = std.math.Log2Int(Int);
11 | const endian = builtin.cpu.arch.endian();
12 | 
13 | data: []const Int,
14 | width: IntLog2,
15 | 
16 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
17 |     allocator.free(self.data);
18 |     self.* = undefined;
19 | }
20 | 
21 | pub fn bits(self: *const Self) usize {
22 |     return utils.bitSizeOfSlice(self.data);
23 | }
24 | 
25 | fn getMask(self: *const Self) u64 {
26 |     return (@as(Int, 1) << self.width) - 1;
27 | }
28 | 
29 | /// Returns the value stored at a given index.
30 | pub fn get(self: *const Self, idx: usize) u64 {
31 |     const pos = idx * self.width;
32 |     const block = pos / @bitSizeOf(Int);
33 |     const shift: IntLog2 = @intCast(pos % @bitSizeOf(Int));
34 | 
35 |     if (@as(Int, shift) + self.width <= @bitSizeOf(Int)) {
36 |         return (self.data[block] >> shift) & self.getMask();
37 |     } else {
38 |         const res_shift = ~shift + 1; // =:= @bitSizeOf(Int) - shift;
39 |         return (self.data[block] >> shift) | (self.data[block + 1] << res_shift & self.getMask());
40 |     }
41 | }
42 | 
43 | /// Encodes an array into the smallest compact array possible.
44 | pub fn encode(allocator: std.mem.Allocator, data: []const u64) !Self {
45 |     if (data.len == 0) return Self{ .data = &[_]Int{}, .width = 1 };
46 | 
47 |     const width: IntLog2 = @intCast(std.math.log2_int(u64, std.mem.max(u64, data)) + 1);
48 |     var arr = try Mutable.init(allocator, width, data.len);
49 |     for (data, 0..) |val, idx| {
50 |         arr.setFromZero(idx, val);
51 |     }
52 |     return arr.finalize();
53 | }
54 | 
55 | /// Writes the array into an std.io.Writer. This can be read using `readFrom`.
56 | pub fn writeTo(self: *const Self, w: anytype) !void {
57 |     try w.writeInt(Int, self.width, endian);
58 |     try utils.writeSlice(w, self.data);
59 | }
60 | 
61 | /// Reads an array from a buffer. Note that this will not allocate, but will
62 | /// instead create a new CompactArray which points directly to the data in
63 | /// the buffer.
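///
/// Example (a minimal sketch mirroring the "write and read" test below; `buf`
/// must be u64-aligned and must outlive the returned array):
///
///     var fbs = std.io.fixedBufferStream(@as([]const u8, buf));
///     const arr = try CompactArray.readFrom(&fbs);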
64 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 65 | var r = stream.reader(); 66 | const width = try r.readInt(Int, endian); 67 | const data = try utils.readSlice(stream, Int); 68 | return Self{ 69 | .width = @intCast(width), 70 | .data = data, 71 | }; 72 | } 73 | 74 | pub const Mutable = struct { 75 | data: []Int, 76 | width: IntLog2, 77 | 78 | /// Creates a new array that can store `n` values of `width` bits each. 79 | pub fn init(allocator: std.mem.Allocator, width: IntLog2, n: usize) !Mutable { 80 | const m = std.math.divCeil(usize, width * n, @bitSizeOf(Int)) catch unreachable; 81 | 82 | const data = try allocator.alloc(Int, m); 83 | @memset(data, 0); 84 | 85 | return Mutable{ 86 | .data = data, 87 | .width = width, 88 | }; 89 | } 90 | 91 | pub fn deinit(self: *Mutable, allocator: std.mem.Allocator) void { 92 | allocator.free(self.data); 93 | self.* = undefined; 94 | } 95 | 96 | pub fn finalize(self: *Mutable) Self { 97 | const result = self.asImmutable(); 98 | self.* = undefined; 99 | return result; 100 | } 101 | 102 | pub fn asImmutable(self: Mutable) Self { 103 | return Self{ 104 | .data = self.data, 105 | .width = self.width, 106 | }; 107 | } 108 | 109 | pub fn get(self: Mutable, idx: usize) u64 { 110 | return self.asImmutable().get(idx); 111 | } 112 | 113 | /// Sets a value at a given index with the assumption that the existing value was already zero. 114 | pub fn setFromZero(self: Mutable, idx: usize, val: u64) void { 115 | const pos = idx * self.width; 116 | const block = pos / @bitSizeOf(Int); 117 | const shift: IntLog2 = @intCast(pos % @bitSizeOf(Int)); 118 | 119 | self.data[block] |= val << shift; 120 | 121 | if (shift > 0) { 122 | const res_shift = ~shift + 1; // =:= @bitSizeOf(Int) - shift; 123 | if (res_shift < self.width) { 124 | self.data[block + 1] |= val >> res_shift; 125 | } 126 | } 127 | } 128 | 129 | /// Sets a value at a given index to zero. 130 | pub fn setToZero(self: Mutable, idx: usize) void { 131 | const pos = idx * self.width; 132 | const block = pos / @bitSizeOf(Int); 133 | const shift: IntLog2 = @intCast(pos % @bitSizeOf(Int)); 134 | 135 | // This is easier to understand with an example: 136 | // block size=8 (this is actually 64 in our implementation) 137 | // width=5 138 | // shift=6 139 | // 140 | // Let "V" be a value bit and "P" a "padding bit" (other value). 141 | // 142 | // Block 1: VV PPPPPP 143 | // Block 2: PPPPP VVV 144 | 145 | // There's also the case where it _doesn't_ cross a block: 146 | // shift=2 147 | // Block 1: PP VVVVV PP 148 | 149 | // Here we need to make sure we don't zero out those upper paddings. 150 | const upper_mask = ~@as(Int, 0) << self.width << shift; 151 | const lower_mask = ((@as(Int, 1) << shift) - 1); 152 | 153 | // Clear out VV by AND-ing 00111111; 154 | self.data[block] &= lower_mask | upper_mask; 155 | 156 | if (shift > 0) { 157 | const res_shift = ~shift + 1; // =:= @bitSizeOf(Int) - shift; 158 | 159 | if (res_shift < self.width) { 160 | // res_shift in this example is 2 and thus width-res_shift = 3. 161 | // We then build the mask 11111000 by NOT-ing 00000111. 
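                // Concretely: with width=5 and shift=6 (so res_shift=2), the
                // three spilled value bits sit at the bottom of block 2, and
                // (1 << (5 - 2)) - 1 = 00000111 selects exactly those bits.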
162 | 
163 |                 self.data[block + 1] &= ~((@as(Int, 1) << (self.width - res_shift)) - 1);
164 |             }
165 |         }
166 |     }
167 | };
168 | 
169 | const testing = std.testing;
170 | 
171 | test "basic" {
172 |     const n = 100;
173 |     const width = 5;
174 |     const max_val = 30;
175 | 
176 |     var c = try Self.Mutable.init(testing.allocator, width, n);
177 |     defer c.deinit(testing.allocator);
178 | 
179 |     var i: usize = 0;
180 |     while (i < n) : (i += 1) {
181 |         const value = (i * i) % max_val;
182 |         c.setFromZero(i, value);
183 |     }
184 | 
185 |     i = 0;
186 |     while (i < n) : (i += 1) {
187 |         const value = (i * i) % max_val;
188 |         try testing.expectEqual(value, c.get(i));
189 |     }
190 | }
191 | 
192 | test "encode empty" {
193 |     var arr = try Self.encode(testing.allocator, &[_]u64{});
194 |     defer arr.deinit(testing.allocator);
195 | }
196 | 
197 | test "encode" {
198 |     const vals = [_]u64{ 5, 2, 9, 100, 0, 5, 10, 90, 9, 1, 65, 10 };
199 |     var arr = try Self.encode(testing.allocator, &vals);
200 |     defer arr.deinit(testing.allocator);
201 | 
202 |     // 100 needs 7 bits. There are 12 elements. These 84 bits fit in 2 u64s.
203 |     try testing.expectEqual(@as(usize, 2), arr.data.len);
204 | 
205 |     for (vals, 0..) |val, idx| {
206 |         try testing.expectEqual(val, arr.get(idx));
207 |     }
208 | }
209 | 
210 | test "encode #2" {
211 |     const vals = [_]u64{ 0, 0, 2, 0, 4, 0 };
212 |     var arr = try Self.encode(testing.allocator, &vals);
213 |     defer arr.deinit(testing.allocator);
214 | 
215 |     for (vals, 0..) |val, idx| {
216 |         try testing.expectEqual(val, arr.get(idx));
217 |     }
218 | }
219 | 
220 | test "encode #3" {
221 |     const vals = [_]u64{255} ** 64;
222 |     var arr = try Self.encode(testing.allocator, &vals);
223 |     defer arr.deinit(testing.allocator);
224 | 
225 |     for (vals, 0..) |val, idx| {
226 |         try testing.expectEqual(val, arr.get(idx));
227 |     }
228 | }
229 | 
230 | test "write and read" {
231 |     const vals = [_]u64{ 0, 0, 2, 0, 4, 0 };
232 |     var arr = try Self.encode(testing.allocator, &vals);
233 |     defer arr.deinit(testing.allocator);
234 | 
235 |     // ensure alignment
236 |     const buf = try testing.allocator.alignedAlloc(u8, std.mem.Alignment.of(u64), 100);
237 |     defer testing.allocator.free(buf);
238 | 
239 |     {
240 |         // Write
241 |         var fbs = std.io.fixedBufferStream(buf);
242 |         try arr.writeTo(fbs.writer());
243 |     }
244 | 
245 |     {
246 |         // Read
247 |         var fbs = std.io.fixedBufferStream(@as([]const u8, buf));
248 |         var arr2 = try Self.readFrom(&fbs);
249 | 
250 |         for (vals, 0..) |val, idx| {
251 |             try testing.expectEqual(val, arr2.get(idx));
252 |         }
253 |     }
254 | }
255 | 
--------------------------------------------------------------------------------
/src/DictArray.zig:
--------------------------------------------------------------------------------
1 | //! DictArray stores a list of integers by placing the unique items in a separate
2 | //! array and referring to indexes into that array.
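//!
//! Example (sketch): encoding `{ 7, 7, 9, 7 }` yields the dictionary `{ 7, 9 }`
//! and the index array `{ 0, 0, 1, 0 }`; `get(2)` then returns `dict[arr[2]] = 9`.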
3 | 4 | const std = @import("std"); 5 | const CompactArray = @import("./CompactArray.zig"); 6 | 7 | const Self = @This(); 8 | 9 | dict: CompactArray, 10 | arr: CompactArray, 11 | 12 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 13 | self.dict.deinit(allocator); 14 | self.arr.deinit(allocator); 15 | self.* = undefined; 16 | } 17 | 18 | pub fn bits(self: *const Self) usize { 19 | return self.dict.bits() + self.arr.bits(); 20 | } 21 | 22 | pub fn get(self: *const Self, idx: usize) u64 { 23 | return self.dict.get(self.arr.get(idx)); 24 | } 25 | 26 | pub fn encode(allocator: std.mem.Allocator, data: []const u64) !Self { 27 | var dict = std.ArrayList(u64).init(allocator); 28 | defer dict.deinit(); 29 | 30 | var arr = try std.ArrayList(u64).initCapacity(allocator, data.len); 31 | defer arr.deinit(); 32 | 33 | var mapping = std.hash_map.AutoHashMap(u64, usize).init(allocator); 34 | defer mapping.deinit(); 35 | 36 | for (data) |val| { 37 | const result = try mapping.getOrPut(val); 38 | if (!result.found_existing) { 39 | result.value_ptr.* = dict.items.len; 40 | try dict.append(val); 41 | } 42 | try arr.append(result.value_ptr.*); 43 | } 44 | 45 | return Self{ 46 | .dict = try CompactArray.encode(allocator, dict.items), 47 | .arr = try CompactArray.encode(allocator, arr.items), 48 | }; 49 | } 50 | 51 | pub fn writeTo(self: *const Self, w: anytype) !void { 52 | try self.dict.writeTo(w); 53 | try self.arr.writeTo(w); 54 | } 55 | 56 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 57 | const dict = try CompactArray.readFrom(stream); 58 | const arr = try CompactArray.readFrom(stream); 59 | return Self{ 60 | .dict = dict, 61 | .arr = arr, 62 | }; 63 | } 64 | -------------------------------------------------------------------------------- /src/EliasFano.zig: -------------------------------------------------------------------------------- 1 | //! EliasFano stores 64-bit _increasing_ numbers in a compact manner. 2 | 3 | const std = @import("std"); 4 | const DArray1 = @import("./darray.zig").DArray1; 5 | const CompactArray = @import("./CompactArray.zig"); 6 | const utils = @import("./utils.zig"); 7 | const DynamicBitSetUnmanaged = std.bit_set.DynamicBitSetUnmanaged; 8 | 9 | const Self = @This(); 10 | 11 | high_bits: DynamicBitSetUnmanaged, 12 | high_bits_select: DArray1, 13 | low_bits: CompactArray, 14 | 15 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 16 | self.high_bits.deinit(allocator); 17 | self.high_bits_select.deinit(allocator); 18 | self.low_bits.deinit(allocator); 19 | self.* = undefined; 20 | } 21 | 22 | pub fn encode(allocator: std.mem.Allocator, data: []const u64) !Self { 23 | const n = data.len; 24 | const u = data[data.len - 1]; 25 | 26 | const l = if (u > data.len) std.math.log2_int(u64, u / data.len) + 1 else 0; 27 | const l_mask = (@as(u64, 1) << l) - 1; 28 | const max_h = u >> l; 29 | 30 | // We need to store `2^h-1` zeroes and `n` ones. 31 | var high_bits = try DynamicBitSetUnmanaged.initEmpty(allocator, max_h + n); 32 | 33 | var low_bits = try CompactArray.Mutable.init(allocator, l, data.len); 34 | 35 | for (data, 0..) 
|num, idx| { 36 | if (l > 0) { 37 | low_bits.setFromZero(idx, num & l_mask); 38 | } 39 | high_bits.set((num >> l) + idx); 40 | } 41 | 42 | return Self{ 43 | .high_bits = high_bits, 44 | .high_bits_select = try DArray1.init(allocator, high_bits), 45 | .low_bits = low_bits.finalize(), 46 | }; 47 | } 48 | 49 | pub fn get(self: *const Self, idx: usize) u64 { 50 | const h_bits = self.high_bits_select.select(self.high_bits, idx) - idx; 51 | const l = self.low_bits.width; 52 | if (l == 0) return h_bits; 53 | 54 | const l_bits = self.low_bits.get(idx); 55 | return (h_bits << l) | l_bits; 56 | } 57 | 58 | pub fn bits(self: *const Self) u64 { 59 | // We're poking into the internals of DynamicBitSet here... 60 | const masks = self.high_bits.masks; 61 | const len = (masks - 1)[0]; 62 | return self.low_bits.bits() + self.high_bits_select.bits() + len * @bitSizeOf(usize); 63 | } 64 | 65 | pub fn bitsWithoutConstantAccess(self: *const Self) u64 { 66 | const masks = self.high_bits.masks; 67 | const len = (masks - 1)[0]; 68 | return self.low_bits.bits() + len * @bitSizeOf(usize); 69 | } 70 | 71 | pub fn writeTo(self: *const Self, w: anytype) !void { 72 | const masks = self.high_bits.masks; 73 | const len = (masks - 1)[0]; 74 | try utils.writeSlice(w, (masks - 1)[0..len]); 75 | try self.high_bits_select.writeTo(w); 76 | try self.low_bits.writeTo(w); 77 | } 78 | 79 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 80 | const mask_arr = try utils.readSlice(stream, usize); 81 | const high_bits = DynamicBitSetUnmanaged{ .masks = @constCast(mask_arr.ptr) + 1 }; 82 | const high_bits_select = try DArray1.readFrom(stream); 83 | const low_bits = try CompactArray.readFrom(stream); 84 | return Self{ 85 | .high_bits = high_bits, 86 | .high_bits_select = high_bits_select, 87 | .low_bits = low_bits, 88 | }; 89 | } 90 | 91 | const testing = std.testing; 92 | 93 | test "encode" { 94 | const seed = 0x0194f614c15227ba; 95 | var prng = std.Random.DefaultPrng.init(seed); 96 | const r = prng.random(); 97 | 98 | const n = 100000; 99 | 100 | var result = try std.ArrayList(u64).initCapacity(testing.allocator, n); 101 | defer result.deinit(); 102 | 103 | var i: usize = 0; 104 | var prev: u64 = 0; 105 | while (i < n) : (i += 1) { 106 | const num = prev + r.uintLessThan(u64, 50); 107 | result.appendAssumeCapacity(num); 108 | prev = num; 109 | } 110 | 111 | var ef = try encode(testing.allocator, result.items); 112 | defer ef.deinit(testing.allocator); 113 | 114 | // Check that it matches 115 | for (result.items, 0..) 
|num, idx| { 116 | try testing.expectEqual(num, ef.get(idx)); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/StringDict.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const builtin = @import("builtin"); 3 | 4 | const StringDict = @This(); 5 | 6 | const endian = builtin.cpu.arch.endian(); 7 | 8 | dict: []const u8, 9 | 10 | pub fn deinit(self: *StringDict, allocator: std.mem.Allocator) void { 11 | allocator.free(self.dict); 12 | self.* = undefined; 13 | } 14 | 15 | pub fn writeTo(self: *const StringDict, w: anytype) !void { 16 | try w.writeInt(u64, self.dict.len, endian); 17 | try w.writeAll(self.dict); 18 | } 19 | 20 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !StringDict { 21 | var r = stream.reader(); 22 | const len = try r.readInt(u64, endian); 23 | const dict = stream.buffer[stream.pos..][0..len]; 24 | stream.pos += len; 25 | return StringDict{ 26 | .dict = dict, 27 | }; 28 | } 29 | 30 | pub fn bits(self: *const StringDict) u64 { 31 | return self.dict.len * 8; 32 | } 33 | 34 | pub fn get(self: *const StringDict, idx: u64) []const u8 { 35 | const len = self.dict[idx]; 36 | return self.dict[idx + 1 ..][0..len]; 37 | } 38 | 39 | pub const Builder = struct { 40 | dict_values: std.ArrayList(u8), 41 | dict_positions: std.StringHashMap(usize), 42 | 43 | pub fn init(allocator: std.mem.Allocator) !Builder { 44 | return Builder{ 45 | .dict_values = std.ArrayList(u8).init(allocator), 46 | .dict_positions = std.StringHashMap(usize).init(allocator), 47 | }; 48 | } 49 | 50 | pub fn deinit(self: *Builder) void { 51 | self.dict_values.deinit(); 52 | self.dict_positions.deinit(); 53 | self.* = undefined; 54 | } 55 | 56 | pub fn intern(self: *Builder, key: []const u8) !u64 { 57 | const result = try self.dict_positions.getOrPut(key); 58 | if (!result.found_existing) { 59 | result.value_ptr.* = self.dict_values.items.len; 60 | try self.dict_values.append(@intCast(key.len)); 61 | for (key) |byte| { 62 | try self.dict_values.append(byte); 63 | } 64 | } 65 | return result.value_ptr.*; 66 | } 67 | 68 | pub fn build(self: *Builder) !StringDict { 69 | return StringDict{ 70 | .dict = try self.dict_values.toOwnedSlice(), 71 | }; 72 | } 73 | }; 74 | -------------------------------------------------------------------------------- /src/darray.zig: -------------------------------------------------------------------------------- 1 | //! Implements the "darray" data structure which provides constant-time 2 | //! select(i) operation for _dense_ bit sets. Roughly half of the items 3 | //! should be set for this to be practical. 4 | //! 5 | //! See "Practical Entropy-Compressed Rank/Select Dictionary" by Daisuke Okanohara and Kunihiko Sadakane. 6 | //! 7 | //! The code is heavily based on https://github.com/jermp/pthash/blob/master/include/encoders/darray.hpp. 
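//!
//! Layout sketch: the positions of the set bits are grouped into blocks of
//! 1024; each block records its first position, plus a 16-bit offset for every
//! 32nd position within it. Blocks whose positions span 2^16 bits or more fall
//! back to storing every position explicitly in an overflow array.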
8 | 9 | const std = @import("std"); 10 | const utils = @import("./utils.zig"); 11 | 12 | const BitSet = std.bit_set.DynamicBitSet; 13 | 14 | pub fn DArray(comptime val: bool) type { 15 | return struct { 16 | const Self = @This(); 17 | 18 | const block_size: usize = 1024; 19 | const subblock_size: usize = 32; 20 | const max_in_block_distance: usize = 1 << 16; 21 | 22 | const BlockPosition = packed struct { 23 | is_overflow: bool, 24 | pos: u63, 25 | }; 26 | 27 | block_inventory: []BlockPosition, 28 | subblock_inventory: []u16, 29 | overflow_positions: []u64, 30 | 31 | pub fn init(allocator: std.mem.Allocator, bit_set: std.bit_set.DynamicBitSetUnmanaged) !Self { 32 | var cur_block_positions = std.ArrayListUnmanaged(u63){}; 33 | defer cur_block_positions.deinit(allocator); 34 | 35 | var block_inventory = std.ArrayListUnmanaged(BlockPosition){}; 36 | defer block_inventory.deinit(allocator); 37 | 38 | var subblock_inventory = std.ArrayListUnmanaged(u16){}; 39 | defer subblock_inventory.deinit(allocator); 40 | 41 | var overflow_positions = std.ArrayListUnmanaged(u64){}; 42 | defer overflow_positions.deinit(allocator); 43 | 44 | try cur_block_positions.ensureTotalCapacity(allocator, block_size); 45 | 46 | var iter = bit_set.iterator(.{ .kind = if (val) .set else .unset }); 47 | while (iter.next()) |pos| { 48 | cur_block_positions.appendAssumeCapacity(@intCast(pos)); 49 | if (cur_block_positions.items.len == block_size) { 50 | try flushCurBlock(allocator, &cur_block_positions, &block_inventory, &subblock_inventory, &overflow_positions); 51 | } 52 | } 53 | 54 | if (cur_block_positions.items.len > 0) { 55 | try flushCurBlock(allocator, &cur_block_positions, &block_inventory, &subblock_inventory, &overflow_positions); 56 | } 57 | 58 | return Self{ 59 | .block_inventory = try block_inventory.toOwnedSlice(allocator), 60 | .subblock_inventory = try subblock_inventory.toOwnedSlice(allocator), 61 | .overflow_positions = try overflow_positions.toOwnedSlice(allocator), 62 | }; 63 | } 64 | 65 | // Reads a word, flipping all bits if we're in select0-mode. 66 | fn readWord(bit_set: std.bit_set.DynamicBitSetUnmanaged, idx: usize) u64 { 67 | var word = bit_set.masks[idx]; 68 | if (!val) { 69 | word = ~word; 70 | } 71 | return word; 72 | } 73 | 74 | fn flushCurBlock( 75 | allocator: std.mem.Allocator, 76 | cur_block_positions: *std.ArrayListUnmanaged(u63), 77 | block_inventory: *std.ArrayListUnmanaged(BlockPosition), 78 | subblock_inventory: *std.ArrayListUnmanaged(u16), 79 | overflow_positions: *std.ArrayListUnmanaged(u64), 80 | ) !void { 81 | const fst = cur_block_positions.items[0]; 82 | const lst = cur_block_positions.items[cur_block_positions.items.len - 1]; 83 | if (lst - fst < max_in_block_distance) { 84 | try block_inventory.append(allocator, BlockPosition{ .is_overflow = false, .pos = fst }); 85 | var i: usize = 0; 86 | while (i < cur_block_positions.items.len) : (i += subblock_size) { 87 | try subblock_inventory.append(allocator, @intCast(cur_block_positions.items[i] - fst)); 88 | } 89 | } else { 90 | const overflow_pos = overflow_positions.items.len; 91 | try block_inventory.append(allocator, BlockPosition{ .is_overflow = true, .pos = @intCast(overflow_pos) }); 92 | for (cur_block_positions.items) |pos| { 93 | try overflow_positions.append(allocator, pos); 94 | } 95 | var i: usize = 0; 96 | while (i < cur_block_positions.items.len) : (i += subblock_size) { 97 | // This value isn't used, but we need to fill up the subblock. 
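                    // (For overflow blocks, select() reads overflow_positions
                    // directly, so these entries only keep the subblock
                    // indexing aligned.)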
98 |                     try subblock_inventory.append(allocator, 0);
99 |                 }
100 |             }
101 |             cur_block_positions.clearRetainingCapacity();
102 |         }
103 | 
104 |         pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
105 |             allocator.free(self.block_inventory);
106 |             allocator.free(self.subblock_inventory);
107 |             allocator.free(self.overflow_positions);
108 |             self.* = undefined;
109 |         }
110 | 
111 |         /// Returns the position of the `idx`-th set bit in the bit set.
112 |         pub fn select(self: *const Self, bit_set: std.bit_set.DynamicBitSetUnmanaged, idx: usize) usize {
113 |             const block = idx / block_size;
114 |             const block_pos = self.block_inventory[block];
115 | 
116 |             if (block_pos.is_overflow) {
117 |                 return self.overflow_positions[block_pos.pos + (idx % block_size)];
118 |             }
119 | 
120 |             const subblock = idx / subblock_size;
121 |             const start_pos = block_pos.pos + self.subblock_inventory[subblock];
122 |             var remainder = idx % subblock_size;
123 |             if (remainder == 0) return start_pos;
124 | 
125 |             // Note: These assume the BitSet uses u64.
126 |             var word_idx = start_pos >> 6;
127 |             const word_shift: u6 = @intCast(start_pos & 63);
128 | 
129 |             var word = readWord(bit_set, word_idx);
130 |             word &= @as(u64, @bitCast(@as(i64, -1))) << word_shift;
131 | 
132 |             while (true) {
133 |                 const popcount = @popCount(word);
134 |                 if (remainder < popcount) break;
135 |                 remainder -= popcount;
136 |                 word_idx += 1;
137 |                 word = readWord(bit_set, word_idx);
138 |             }
139 | 
140 |             // TODO: this is probably not the best select_in_word algorithm
141 | 
142 |             var word_pos: usize = 0;
143 | 
144 |             while (true) {
145 |                 if (word & 1 == 1) {
146 |                     if (remainder == 0) break;
147 |                     remainder -= 1;
148 |                 }
149 |                 word_pos += 1;
150 |                 word >>= 1;
151 |             }
152 | 
153 |             return (word_idx << 6) + word_pos;
154 |         }
155 | 
156 |         pub fn bits(self: *const Self) u64 {
157 |             return utils.bitSizeOfSlice(self.block_inventory) +
158 |                 utils.bitSizeOfSlice(self.subblock_inventory) +
159 |                 utils.bitSizeOfSlice(self.overflow_positions);
160 |         }
161 | 
162 |         pub fn writeTo(self: *const Self, w: anytype) !void {
163 |             try utils.writeSlice(w, self.block_inventory);
164 |             try utils.writeSlice(w, self.subblock_inventory);
165 |             try utils.writeSlice(w, self.overflow_positions);
166 |         }
167 | 
168 |         pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self {
169 |             const block_inventory = try utils.readSlice(stream, BlockPosition);
170 |             const subblock_inventory = try utils.readSlice(stream, u16);
171 |             const overflow_positions = try utils.readSlice(stream, u64);
172 |             return Self{
173 |                 .block_inventory = @constCast(block_inventory),
174 |                 .subblock_inventory = @constCast(subblock_inventory),
175 |                 .overflow_positions = @constCast(overflow_positions),
176 |             };
177 |         }
178 |     };
179 | }
180 | 
181 | /// Provides select_1 support.
182 | pub const DArray1 = DArray(true);
183 | 
184 | /// Provides select_0 support.
185 | pub const DArray0 = DArray(false);
186 | 
187 | const testing = std.testing;
188 | 
189 | fn testBitSet(
190 |     bit_set: *std.DynamicBitSet,
191 |     positions: []usize,
192 | ) !void {
193 |     var darr1 = try DArray1.init(testing.allocator, bit_set.unmanaged);
194 |     defer darr1.deinit(testing.allocator);
195 | 
196 |     for (positions, 0..) |pos, idx| {
197 |         try testing.expectEqual(pos, darr1.select(bit_set.unmanaged, idx));
198 |     }
199 | 
200 |     // Now flip it and test select0(i):
201 |     bit_set.toggleAll();
202 | 
203 |     var darr0 = try DArray0.init(testing.allocator, bit_set.unmanaged);
204 |     defer darr0.deinit(testing.allocator);
205 | 
206 |     for (positions, 0..) |pos, idx| {
207 |         try testing.expectEqual(pos, darr0.select(bit_set.unmanaged, idx));
208 |     }
209 | }
210 | 
211 | test "dense" {
212 |     const seed = 0x0194f614c15227ba;
213 |     var prng = std.Random.DefaultPrng.init(seed);
214 |     const r = prng.random();
215 | 
216 |     var result = std.ArrayList(usize).init(testing.allocator);
217 |     defer result.deinit();
218 | 
219 |     const n = 10000;
220 | 
221 |     var bit_set = try std.DynamicBitSet.initEmpty(testing.allocator, n);
222 |     defer bit_set.deinit();
223 | 
224 |     var idx: usize = 0;
225 |     while (idx < n) : (idx += 1) {
226 |         if (r.boolean()) {
227 |             try result.append(idx);
228 |             bit_set.set(idx);
229 |         }
230 |     }
231 | 
232 |     try testBitSet(&bit_set, result.items);
233 | }
234 | 
235 | test "sparse" {
236 |     const seed = 0x0194f614c15227ba;
237 |     var prng = std.Random.DefaultPrng.init(seed);
238 |     const r = prng.random();
239 | 
240 |     var result = std.ArrayList(usize).init(testing.allocator);
241 |     defer result.deinit();
242 | 
243 |     const n = 100000;
244 | 
245 |     var bit_set = try std.DynamicBitSet.initEmpty(testing.allocator, n);
246 |     defer bit_set.deinit();
247 | 
248 |     var idx: usize = 0;
249 |     while (idx < n) : (idx += 1) {
250 |         if (r.uintLessThan(u64, 100) == 0) {
251 |             try result.append(idx);
252 |             bit_set.set(idx);
253 |         }
254 |     }
255 | 
256 |     try testBitSet(&bit_set, result.items);
257 | }
258 | 
--------------------------------------------------------------------------------
/src/main.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 | 
3 | pub const pthash = @import("./pthash.zig");
4 | pub const CompactArray = @import("./CompactArray.zig");
5 | pub const DictArray = @import("./DictArray.zig");
6 | pub const darray = @import("./darray.zig");
7 | pub const EliasFano = @import("./EliasFano.zig");
8 | pub const ribbon = @import("./ribbon.zig");
9 | pub const StringDict = @import("./StringDict.zig");
10 | 
11 | comptime {
12 |     std.testing.refAllDecls(@This());
13 | }
--------------------------------------------------------------------------------
/src/pthash.zig:
--------------------------------------------------------------------------------
1 | //! This module implements "PTHash: Revisiting FCH Minimal Perfect Hashing" by
2 | //! Giulio Ermanno Pibiri, Roberto Trani, arXiv:2104.10402, https://arxiv.org/abs/2104.10402.
3 | 
4 | const std = @import("std");
5 | const builtin = @import("builtin");
6 | const Wyhash = std.hash.Wyhash;
7 | 
8 | const CompactArray = @import("./CompactArray.zig");
9 | const EliasFano = @import("./EliasFano.zig");
10 | const utils = @import("./utils.zig");
11 | const FreeSlotEncoding = EliasFano;
12 | 
13 | const endian = builtin.cpu.arch.endian();
14 | 
15 | /// The bucketer takes a hash and places it into a bucket in an uneven fashion:
16 | /// Roughly 60% of the keys are mapped to 30% of the buckets. In addition,
17 | /// it's initialized with a `c` parameter which represents the expected number
18 | /// of bits per element required to encode the pivots created by PTHash.
19 | const Bucketer = struct {
20 |     n: usize,
21 |     m: usize,
22 |     p1: usize,
23 |     p2: usize,
24 | 
25 |     /// Creates a new bucketer for `n` items with a given `c` parameter.
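    ///
    /// Example (sketch): with n = 1000 and c = 7 this gives
    /// m = 7 * 1000 / 10 = 700 buckets, p1 = 600 (60% of n) and
    /// p2 = 210 (30% of m).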
26 | pub fn init(n: usize, c: usize) Bucketer { 27 | const m = c * n / (std.math.log2_int(usize, n) + 1); 28 | const p1: usize = @intFromFloat(0.6 * @as(f64, @floatFromInt(n))); 29 | const p2: usize = @intFromFloat(0.3 * @as(f64, @floatFromInt(m))); 30 | 31 | return Bucketer{ 32 | .n = n, 33 | .m = m, 34 | .p1 = p1, 35 | .p2 = p2, 36 | }; 37 | } 38 | 39 | /// Returns the bucket for a hash. 40 | pub fn getBucket(self: Bucketer, hash: u64) u64 { 41 | if (hash % self.n < self.p1) { 42 | return hash % self.p2; 43 | } else { 44 | return self.p2 + (hash % (self.m - self.p2)); 45 | } 46 | } 47 | 48 | pub fn writeTo(self: *const Bucketer, w: anytype) !void { 49 | try w.writeInt(u64, self.n, endian); 50 | try w.writeInt(u64, self.m, endian); 51 | try w.writeInt(u64, self.p1, endian); 52 | try w.writeInt(u64, self.p2, endian); 53 | } 54 | 55 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Bucketer { 56 | var r = stream.reader(); 57 | const n = try r.readInt(u64, endian); 58 | const m = try r.readInt(u64, endian); 59 | const p1 = try r.readInt(u64, endian); 60 | const p2 = try r.readInt(u64, endian); 61 | return Bucketer{ 62 | .n = n, 63 | .m = m, 64 | .p1 = p1, 65 | .p2 = p2, 66 | }; 67 | } 68 | }; 69 | 70 | /// Information about the hash + bucket for a key. We compute this once and re-use it. 71 | const HashedKey = struct { 72 | hash: u64, 73 | bucket: u64, 74 | 75 | fn lessThan(_: void, lhs: HashedKey, rhs: HashedKey) bool { 76 | if (lhs.bucket == rhs.bucket) return lhs.hash < rhs.hash; 77 | return lhs.bucket < rhs.bucket; 78 | } 79 | }; 80 | 81 | /// The bucket summary contains information about a single bucket for a slice of hashed keys. 82 | /// The slice should be sorted by bucket. 83 | const BucketSummary = struct { 84 | idx: usize, 85 | entry_start: usize, 86 | entry_end: usize, 87 | 88 | fn count(self: BucketSummary) usize { 89 | return self.entry_end - self.entry_start; 90 | } 91 | 92 | fn lessThan(_: void, a: BucketSummary, b: BucketSummary) bool { 93 | const a_count = a.count(); 94 | const b_count = b.count(); 95 | if (a_count == b_count) return a.idx < b.idx; 96 | return b_count < a_count; 97 | } 98 | }; 99 | 100 | pub const Params = struct { 101 | c: usize, 102 | alpha: f64 = 1, 103 | }; 104 | 105 | // Number of different seeds we try before we give up. 106 | const MAX_ATTEMPTS = 1000; 107 | 108 | /// A minimal perfect hash function for a given type and a hash function. 
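///
/// Example (a minimal sketch; see the "building" test below for a full run):
///
///     const Hash = AutoHashFn(u64, CompactArray);
///     var h = try Hash.build(allocator, &keys, .{ .c = 7, .alpha = 0.8 }, null);
///     defer h.deinit(allocator);
///     const idx = h.get(keys[0]); // a distinct index in 0..keys.len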
109 | pub fn HashFn( 110 | comptime Key: type, 111 | comptime hasher: fn (seed: u64, Key: Key) u64, 112 | comptime Encoding: type, 113 | ) type { 114 | return struct { 115 | const Self = @This(); 116 | 117 | n: usize, 118 | seed: u64, 119 | bucketer: Bucketer, 120 | free_slots: FreeSlotEncoding, 121 | pivots: Encoding, 122 | 123 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 124 | self.pivots.deinit(allocator); 125 | self.free_slots.deinit(allocator); 126 | self.* = undefined; 127 | } 128 | 129 | pub fn get(self: *const Self, key: Key) u64 { 130 | const hash = hasher(self.seed, key); 131 | const bucket = self.bucketer.getBucket(hash); 132 | const pivot = self.pivots.get(bucket); 133 | const bucket_hash = Wyhash.hash(self.seed, std.mem.asBytes(&pivot)); 134 | const full_hash = Wyhash.hash(bucket_hash, std.mem.asBytes(&hash)); 135 | const pos = full_hash % self.bucketer.n; 136 | if (pos < self.n) { 137 | return pos; 138 | } else { 139 | return self.free_slots.get(pos - self.n); 140 | } 141 | } 142 | 143 | pub fn bits(self: *const Self) usize { 144 | return self.pivots.bits() + self.free_slots.bits(); 145 | } 146 | 147 | pub fn build( 148 | allocator: std.mem.Allocator, 149 | keys: []const Key, 150 | params: Params, 151 | seed: ?u64, 152 | ) !Self { 153 | if (seed) |s| { 154 | return buildUsingSeed(allocator, keys, params, s); 155 | } else { 156 | return buildUsingRandomSeed(allocator, keys, params, MAX_ATTEMPTS); 157 | } 158 | } 159 | 160 | pub fn buildUsingRandomSeed( 161 | allocator: std.mem.Allocator, 162 | keys: []const Key, 163 | params: Params, 164 | max_attempts: usize, 165 | ) !Self { 166 | var seed: u64 = undefined; 167 | 168 | var attempts: usize = 0; 169 | while (attempts < max_attempts) : (attempts += 1) { 170 | try std.posix.getrandom(std.mem.asBytes(&seed)); 171 | 172 | return buildUsingSeed(allocator, keys, params, seed) catch |err| switch (err) { 173 | error.HashCollision => continue, 174 | else => err, 175 | }; 176 | } 177 | 178 | return error.HashCollision; 179 | } 180 | 181 | pub fn buildUsingSeed( 182 | allocator: std.mem.Allocator, 183 | keys: []const Key, 184 | params: Params, 185 | seed: u64, 186 | ) !Self { 187 | std.debug.assert(params.alpha <= 1); 188 | const n_prime: usize = @intFromFloat(@as(f64, @floatFromInt(keys.len)) / params.alpha); 189 | const bucketer = Bucketer.init(n_prime, params.c); 190 | 191 | // Step 1: Hash all the inputs and figure out which bucket they belong to. 192 | 193 | var entries = try allocator.alloc(HashedKey, keys.len); 194 | defer allocator.free(entries); 195 | 196 | for (keys, 0..) |key, idx| { 197 | const hash = hasher(seed, key); 198 | const bucket = bucketer.getBucket(hash); 199 | entries[idx] = HashedKey{ .hash = hash, .bucket = bucket }; 200 | } 201 | 202 | std.mem.sort(HashedKey, entries, {}, HashedKey.lessThan); 203 | 204 | // Step 2: Group the entries into buckets ordered by size. 
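            // (Buckets are sorted largest-first: the big buckets are placed
            // while the table is still mostly empty, when pivots are cheapest
            // to find.)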
205 | 206 | var bucket_summaries = try std.ArrayList(BucketSummary).initCapacity(allocator, bucketer.m); 207 | defer bucket_summaries.deinit(); 208 | 209 | var bucket_start: usize = 0; 210 | var bucket_idx: usize = 0; 211 | var i: usize = 1; 212 | while (i < entries.len + 1) : (i += 1) { 213 | const at_boundary = (i == entries.len) or (entries[i - 1].bucket != entries[i].bucket); 214 | if (at_boundary) { 215 | bucket_summaries.appendAssumeCapacity(BucketSummary{ 216 | .idx = entries[i - 1].bucket, 217 | .entry_start = bucket_start, 218 | .entry_end = i, 219 | }); 220 | bucket_idx += 1; 221 | bucket_start = i; 222 | } else { 223 | if (entries[i - 1].hash == entries[i].hash) return error.HashCollision; 224 | } 225 | } 226 | 227 | std.mem.sort(BucketSummary, bucket_summaries.items, {}, BucketSummary.lessThan); 228 | 229 | // Step 3: Determine pivots 230 | 231 | var taken = try std.bit_set.DynamicBitSet.initEmpty(allocator, bucketer.n); 232 | defer taken.deinit(); 233 | 234 | var attempted_taken = try std.bit_set.DynamicBitSet.initEmpty(allocator, bucketer.n); 235 | defer attempted_taken.deinit(); 236 | 237 | var pivots = try allocator.alloc(u64, bucketer.m); 238 | defer allocator.free(pivots); 239 | 240 | @memset(pivots, 0); 241 | 242 | for (bucket_summaries.items) |b| { 243 | var pivot: u64 = 0; 244 | find_pivot: while (true) : (pivot += 1) { 245 | // Reset attempted_taken 246 | attempted_taken.setRangeValue(.{ .start = 0, .end = attempted_taken.capacity() }, false); 247 | 248 | for (entries[b.entry_start..b.entry_end]) |entry| { 249 | const bucket_hash = Wyhash.hash(seed, std.mem.asBytes(&pivot)); 250 | const full_hash = Wyhash.hash(bucket_hash, std.mem.asBytes(&entry.hash)); 251 | const pos = full_hash % bucketer.n; 252 | 253 | const is_taken_earlier_bucket = taken.isSet(pos); 254 | const is_taken_same_bucket = attempted_taken.isSet(pos); 255 | 256 | if (is_taken_earlier_bucket or is_taken_same_bucket) { 257 | continue :find_pivot; 258 | } 259 | 260 | attempted_taken.set(pos); 261 | } 262 | 263 | pivots[b.idx] = pivot; 264 | 265 | taken.setUnion(attempted_taken); 266 | break; 267 | } 268 | } 269 | 270 | const encoded_pivots = try Encoding.encode(allocator, pivots); 271 | 272 | var free_slots = try allocator.alloc(u64, bucketer.n - keys.len); 273 | defer allocator.free(free_slots); 274 | 275 | var iter = taken.iterator(.{ .kind = .unset }); 276 | 277 | var prev_free_value: usize = 0; 278 | var free_idx: usize = 0; 279 | while (free_idx < free_slots.len) : (free_idx += 1) { 280 | if (taken.isSet(keys.len + free_idx)) { 281 | free_slots[free_idx] = iter.next().?; 282 | prev_free_value = free_slots[free_idx]; 283 | } else { 284 | // This value can be anything. We keep it incremental. 
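                    // (Keeping the sequence non-decreasing is what lets
                    // EliasFano encode it compactly below.)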
285 | free_slots[free_idx] = prev_free_value; 286 | } 287 | } 288 | 289 | const encoded_free_slots = try FreeSlotEncoding.encode(allocator, free_slots); 290 | 291 | return Self{ 292 | .bucketer = bucketer, 293 | .n = keys.len, 294 | .free_slots = encoded_free_slots, 295 | .seed = seed, 296 | .pivots = encoded_pivots, 297 | }; 298 | } 299 | 300 | pub fn writeTo(self: *const Self, w: anytype) !void { 301 | try w.writeInt(u64, self.n, endian); 302 | try w.writeInt(u64, self.seed, endian); 303 | try self.bucketer.writeTo(w); 304 | try self.free_slots.writeTo(w); 305 | try self.pivots.writeTo(w); 306 | } 307 | 308 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 309 | var r = stream.reader(); 310 | const n = try r.readInt(u64, endian); 311 | const seed = try r.readInt(u64, endian); 312 | const bucketer = try Bucketer.readFrom(stream); 313 | const free_slots = try FreeSlotEncoding.readFrom(stream); 314 | const pivots = try Encoding.readFrom(stream); 315 | return Self{ 316 | .n = n, 317 | .seed = seed, 318 | .bucketer = bucketer, 319 | .free_slots = free_slots, 320 | .pivots = pivots, 321 | }; 322 | } 323 | }; 324 | } 325 | 326 | pub fn AutoHashFn( 327 | comptime Key: type, 328 | comptime Encoding: type, 329 | ) type { 330 | return HashFn(Key, utils.autoHash(Key), Encoding); 331 | } 332 | 333 | pub fn BytesHashFn(comptime Encoding: type) type { 334 | return HashFn([]const u8, Wyhash.hash, Encoding); 335 | } 336 | 337 | const testing = std.testing; 338 | 339 | test "basic bucketing" { 340 | const b = Bucketer.init(100, 7); 341 | try testing.expectEqual(@as(u64, 0), b.getBucket(0)); 342 | } 343 | 344 | test "building" { 345 | var data: [256]u64 = undefined; 346 | 347 | var i: usize = 0; 348 | while (i < data.len) : (i += 1) { 349 | data[i] = i * i; 350 | } 351 | 352 | var h = try AutoHashFn(u64, CompactArray).buildUsingRandomSeed(testing.allocator, &data, .{ .c = 7, .alpha = 0.80 }, 10); 353 | defer h.deinit(testing.allocator); 354 | 355 | var seen = std.hash_map.AutoHashMap(u64, usize).init(testing.allocator); 356 | defer seen.deinit(); 357 | 358 | for (data, 0..) |val, idx| { 359 | const out = h.get(val); 360 | try testing.expect(out < data.len); 361 | 362 | if (try seen.fetchPut(out, idx)) |other_entry| { 363 | std.debug.print("collision between idx={} and {}\n", .{ other_entry.value, idx }); 364 | return error.TestCollision; 365 | } 366 | } 367 | } 368 | 369 | test "collision detection" { 370 | var data: [2]u64 = .{ 5, 5 }; 371 | var h_result = AutoHashFn(u64, CompactArray).buildUsingRandomSeed(testing.allocator, &data, .{ .c = 7 }, 10); 372 | if (h_result) |*h| h.deinit(testing.allocator) else |_| {} 373 | 374 | try testing.expectError(error.HashCollision, h_result); 375 | } 376 | -------------------------------------------------------------------------------- /src/ribbon.zig: -------------------------------------------------------------------------------- 1 | //! This file implements the ideas from "Fast Succinct Retrieval and Approximate Membership using Ribbon". 
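//!
//! Sketch of the idea: each key is hashed to a starting column `i` and a w-bit
//! coefficient row `c` (with the lowest bit forced to 1). Inserting all keys
//! builds a banded linear system over GF(2); back-substitution then yields a
//! table where XOR-ing the rows selected by `c`, starting at `i`, reproduces
//! each key's r-bit value.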
2 | 3 | const std = @import("std"); 4 | const builtin = @import("builtin"); 5 | const DynamicBitSetUnmanaged = std.bit_set.DynamicBitSetUnmanaged; 6 | const CompactArray = @import("./CompactArray.zig"); 7 | const utils = @import("./utils.zig"); 8 | 9 | const endian = builtin.cpu.arch.endian(); 10 | 11 | fn bitParity(num: u64) u64 { 12 | return @popCount(num) % 2; 13 | } 14 | 15 | const RibbonTable = struct { 16 | const Self = @This(); 17 | 18 | n: usize, 19 | data: CompactArray, 20 | 21 | pub fn init(n: usize, data: CompactArray) Self { 22 | return Self{ 23 | .n = n, 24 | .data = data, 25 | }; 26 | } 27 | 28 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 29 | self.data.deinit(allocator); 30 | self.* = undefined; 31 | } 32 | 33 | pub fn lookup(self: Self, i: u64, c: u64) u64 { 34 | std.debug.assert((c & 1) == 1); 35 | 36 | var i_ = i; 37 | var c_ = c; 38 | var result: u64 = 0; 39 | 40 | while (true) { 41 | result ^= self.data.get(i_); 42 | 43 | c_ >>= 1; 44 | i_ += 1; 45 | if (c_ == 0) break; 46 | 47 | const j: u6 = @intCast(@ctz(c_)); 48 | i_ += j; 49 | c_ >>= j; 50 | } 51 | return result; 52 | } 53 | 54 | pub fn bits(self: *const Self) u64 { 55 | return self.data.bits(); 56 | } 57 | 58 | pub fn writeTo(self: *const Self, w: anytype) !void { 59 | try w.writeInt(u64, self.n, endian); 60 | try self.data.writeTo(w); 61 | } 62 | 63 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 64 | var r = stream.reader(); 65 | const n = try r.readInt(u64, endian); 66 | const data = try CompactArray.readFrom(stream); 67 | return Self{ .n = n, .data = data }; 68 | } 69 | }; 70 | 71 | pub const RibbonBandingSystem = struct { 72 | const Self = @This(); 73 | 74 | const Array = CompactArray.Mutable; 75 | 76 | n: usize, 77 | c: Array, 78 | b: Array, 79 | 80 | pub fn init(allocator: std.mem.Allocator, n: usize, r: u6, w: u6) !Self { 81 | var c = try Array.init(allocator, w, n); 82 | errdefer c.deinit(allocator); 83 | 84 | var b = try Array.init(allocator, r, n); 85 | errdefer b.deinit(allocator); 86 | 87 | return Self{ .n = n, .c = c, .b = b }; 88 | } 89 | 90 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 91 | self.c.deinit(allocator); 92 | self.b.deinit(allocator); 93 | self.* = undefined; 94 | } 95 | 96 | pub fn getBandWidth(self: Self) u6 { 97 | return self.c.width; 98 | } 99 | 100 | pub fn getValueSize(self: Self) u6 { 101 | return self.b.width; 102 | } 103 | 104 | pub const InsertResult = union(enum) { 105 | success: usize, 106 | redunant: void, 107 | failure: void, 108 | }; 109 | 110 | pub fn insertRow(self: *Self, i: usize, c: u64, b: u64) InsertResult { 111 | std.debug.assert(b >> self.getValueSize() == 0); 112 | std.debug.assert(c >> self.getBandWidth() == 0); 113 | std.debug.assert((c & 1) == 1); 114 | 115 | var i_ = i; 116 | var c_ = c; 117 | var b_ = b; 118 | 119 | while (true) { 120 | if (self.c.get(i_) == 0) { 121 | self.c.setFromZero(i_, c_); 122 | self.b.setFromZero(i_, b_); 123 | return .{ .success = i_ }; 124 | } 125 | 126 | c_ = c_ ^ self.c.get(i_); 127 | b_ = b_ ^ self.b.get(i_); 128 | 129 | if (c_ == 0) { 130 | if (b_ == 0) { 131 | return .redunant; 132 | } else { 133 | return .failure; 134 | } 135 | } 136 | 137 | const j: u6 = @intCast(@ctz(c_)); 138 | c_ >>= j; 139 | i_ += j; 140 | } 141 | } 142 | 143 | pub fn clearRow(self: *Self, i: usize) void { 144 | self.c.setToZero(i); 145 | self.b.setToZero(i); 146 | } 147 | 148 | pub fn build(self: Self, allocator: std.mem.Allocator) !RibbonTable { 149 | const r = self.getValueSize(); 150 
| 151 | var data = try CompactArray.Mutable.init(allocator, r, self.n); 152 | errdefer data.deinit(allocator); 153 | 154 | var state = try allocator.alloc(u64, r); 155 | defer allocator.free(state); 156 | @memset(state, 0); 157 | 158 | // This logic is taken from https://github.com/lorenzhs/BuRR/blob/1c62832ad7d6eab5b337f386955868c3ce9a54ea/backsubst.hpp#L46 159 | // and I honestly don't quite understand how it works. 160 | 161 | var i = self.n; 162 | while (i > 0) { 163 | i -= 1; 164 | 165 | const c = self.c.get(i); 166 | const b = self.b.get(i); 167 | var resultRow: u64 = 0; 168 | 169 | var j: u6 = 0; 170 | while (j < r) : (j += 1) { 171 | var tmp = state[j] << 1; 172 | const bit = bitParity(tmp & c) ^ ((b >> j) & 1); 173 | tmp |= bit; 174 | state[j] = tmp; 175 | resultRow |= (bit << j); 176 | } 177 | 178 | data.setFromZero(i, resultRow); 179 | } 180 | 181 | return RibbonTable.init(self.n, data.finalize()); 182 | } 183 | }; 184 | 185 | const BumpedLayer = struct { 186 | bucket_size: usize, 187 | upper_threshold: usize, 188 | lower_threshold: usize, 189 | thresholds: CompactArray, 190 | table: RibbonTable, 191 | 192 | pub fn deinit(self: *BumpedLayer, allocator: std.mem.Allocator) void { 193 | self.table.deinit(allocator); 194 | self.thresholds.deinit(allocator); 195 | } 196 | 197 | pub fn lookup(self: BumpedLayer, i: u64, c: u64) ?u64 { 198 | if (self.isBumped(i)) { 199 | return null; 200 | } else { 201 | return self.table.lookup(i, c); 202 | } 203 | } 204 | 205 | fn isBumped(self: BumpedLayer, i: u64) bool { 206 | const bucket_idx = i / self.bucket_size; 207 | const bucket_offset = i % self.bucket_size; 208 | const threshold = self.thresholds.get(bucket_idx); 209 | const threshold_values = [4]usize{ 0, self.lower_threshold, self.upper_threshold, self.bucket_size }; 210 | return bucket_offset < threshold_values[threshold]; 211 | } 212 | 213 | pub fn bits(self: BumpedLayer) usize { 214 | return self.table.bits() + self.thresholds.bits(); 215 | } 216 | 217 | pub fn writeTo(self: *const BumpedLayer, w: anytype) !void { 218 | try w.writeInt(u64, self.bucket_size, endian); 219 | try w.writeInt(u64, self.upper_threshold, endian); 220 | try w.writeInt(u64, self.lower_threshold, endian); 221 | try self.thresholds.writeTo(w); 222 | try self.table.writeTo(w); 223 | } 224 | 225 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !BumpedLayer { 226 | var r = stream.reader(); 227 | const bucket_size = try r.readInt(u64, endian); 228 | const upper_threshold = try r.readInt(u64, endian); 229 | const lower_threshold = try r.readInt(u64, endian); 230 | const thresholds = try CompactArray.readFrom(stream); 231 | const table = try RibbonTable.readFrom(stream); 232 | 233 | return BumpedLayer{ 234 | .bucket_size = bucket_size, 235 | .upper_threshold = upper_threshold, 236 | .lower_threshold = lower_threshold, 237 | .thresholds = thresholds, 238 | .table = table, 239 | }; 240 | } 241 | }; 242 | 243 | const BumpedLayerBuilder = struct { 244 | const Self = @This(); 245 | 246 | const Input = struct { 247 | hash1: u64, 248 | hash2: u64, 249 | hash_result: HashResult, 250 | value: u64, 251 | }; 252 | 253 | m: usize, 254 | eps: f64, 255 | opts: BuildOptions, 256 | input: std.ArrayListUnmanaged(Input), 257 | 258 | fn tableSizeFromEps(n: usize, eps: f64, w: u6) usize { 259 | const target: usize = @intFromFloat(@as(f64, @floatFromInt(n)) * (eps + 1)); 260 | return @max(target, @as(usize, @intCast(w)) + 1); 261 | } 262 | 263 | pub fn init(allocator: std.mem.Allocator, n: usize, eps: f64, opts: 
BuildOptions) error{OutOfMemory}!Self { 264 | const input = try std.ArrayListUnmanaged(Input).initCapacity(allocator, n); 265 | 266 | return Self{ 267 | .m = tableSizeFromEps(n, eps, opts.w), 268 | .eps = eps, 269 | .opts = opts, 270 | .input = input, 271 | }; 272 | } 273 | 274 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 275 | self.input.deinit(allocator); 276 | self.* = undefined; 277 | } 278 | 279 | pub fn insert(self: *Self, hash1: u64, hash2: u64, value: u64) void { 280 | self.input.appendAssumeCapacity( 281 | Input{ 282 | .hash1 = hash1, 283 | .hash2 = hash2, 284 | .hash_result = splitHash(hash1, hash2, self.m, self.opts.w), 285 | .value = value, 286 | }, 287 | ); 288 | } 289 | 290 | pub fn build(self: *Self, allocator: std.mem.Allocator) error{ OutOfMemory, HashCollision }!BumpedLayer { 291 | const w64 = @as(u64, self.opts.w); 292 | const bucket_size = (w64 * w64) / (4 * std.math.log2_int_ceil(u64, w64)); 293 | const n = self.input.items.len; 294 | 295 | const lessThan = struct { 296 | fn lessThan(_: void, left: Input, right: Input) bool { 297 | return left.hash_result.i < right.hash_result.i; 298 | } 299 | }.lessThan; 300 | 301 | std.mem.sort(Input, self.input.items, {}, lessThan); 302 | 303 | var system = try RibbonBandingSystem.init(allocator, self.m, self.opts.r, self.opts.w); 304 | defer system.deinit(allocator); 305 | 306 | var inserted = try std.ArrayListUnmanaged(?usize).initCapacity(allocator, bucket_size); 307 | defer inserted.deinit(allocator); 308 | 309 | var thresholds = try CompactArray.Mutable.init(allocator, 2, std.math.divCeil(usize, self.m, bucket_size) catch unreachable); 310 | errdefer thresholds.deinit(allocator); 311 | 312 | const lower_threshold = bucket_size / 7; 313 | const upper_threshold = bucket_size / 4; 314 | std.debug.assert(lower_threshold < upper_threshold); 315 | std.debug.assert(upper_threshold < bucket_size); 316 | 317 | const threshold_values = [4]usize{ 0, lower_threshold, upper_threshold, bucket_size }; 318 | 319 | const inputs = self.input.items; 320 | 321 | var i: usize = 0; 322 | var bucket_start: usize = 0; 323 | var bucket_idx: usize = 0; 324 | var bump_count: usize = 0; 325 | 326 | while (i < n) { 327 | var j = i; 328 | 329 | // Find the end position of this bucket: 330 | while (j < n) { 331 | if (inputs[j].hash_result.i >= bucket_start + bucket_size) break; 332 | j += 1; 333 | } 334 | 335 | inserted.clearRetainingCapacity(); 336 | 337 | var bump_offset: usize = 0; 338 | 339 | // Now iterate backwards through the bucket and insert the rows: 340 | var k: usize = j; 341 | while (k > i) { 342 | k -= 1; 343 | const input = inputs[k]; 344 | switch (system.insertRow(input.hash_result.i, input.hash_result.c, input.value)) { 345 | .success => |idx| { 346 | try inserted.append(allocator, idx); 347 | }, 348 | .redundant => { 349 | try inserted.append(allocator, null); 350 | }, 351 | .failure => { 352 | bump_offset = input.hash_result.i - bucket_start + 1; 353 | k += 1; 354 | break; 355 | }, 356 | } 357 | } 358 | 359 | // Next determine the actual threshold to use: 360 | var threshold: usize = undefined; 361 | for (threshold_values, 0..) |threshold_value, idx| { 362 | if (threshold_value >= bump_offset) { 363 | threshold = idx; 364 | break; 365 | } 366 | } 367 | 368 | const threshold_value = threshold_values[threshold]; 369 | 370 | thresholds.setFromZero(bucket_idx, threshold); 371 | 372 | // And now undo the insertions whose offset falls below the threshold (those keys are bumped): 373 | while (k < j) : (k += 1) { 374 | const input = inputs[k]; 375 | if (input.hash_result.i - bucket_start >= threshold_value) break; 376 | if (inserted.pop().?) |idx| { 377 | system.clearRow(idx); 378 | } 379 | } 380 | 381 | bump_count += k - i; 382 | 383 | // Prepare for the next bucket: 384 | i = j; 385 | bucket_start += bucket_size; 386 | bucket_idx += 1; 387 | } 388 | 389 | var table = try system.build(allocator); 390 | errdefer table.deinit(allocator); 391 | 392 | // Prepare for the next layer 393 | 394 | var next_inputs = try std.ArrayListUnmanaged(Input).initCapacity(allocator, bump_count); 395 | errdefer next_inputs.deinit(allocator); 396 | 397 | var layer = BumpedLayer{ 398 | .table = table, 399 | .bucket_size = bucket_size, 400 | .upper_threshold = upper_threshold, 401 | .lower_threshold = lower_threshold, 402 | .thresholds = thresholds.finalize(), 403 | }; 404 | 405 | self.m = tableSizeFromEps(bump_count, self.eps, self.opts.w); 406 | 407 | for (inputs) |input| { 408 | if (layer.isBumped(input.hash_result.i)) { 409 | next_inputs.appendAssumeCapacity(Input{ 410 | .hash1 = input.hash1, 411 | .hash2 = input.hash2, 412 | .hash_result = splitHash(input.hash1, input.hash2, self.m, self.opts.w), 413 | .value = input.value, 414 | }); 415 | } 416 | } 417 | 418 | std.debug.assert(next_inputs.items.len == bump_count); 419 | 420 | self.input.deinit(allocator); 421 | self.input = next_inputs; 422 | 423 | return layer; 424 | } 425 | 426 | pub fn buildFallbackTable(self: *BumpedLayerBuilder, allocator: std.mem.Allocator) !RibbonTable { 427 | const n = self.input.items.len; 428 | const step = @max(n / 10, 1); 429 | var m: usize = @max(n, @as(usize, @intCast(self.opts.w)) + 1); 430 | 431 | var i: usize = 0; 432 | loop: while (i < 50) : (i += 1) { 433 | var system = try RibbonBandingSystem.init(allocator, m, self.opts.r, self.opts.w); 434 | defer system.deinit(allocator); 435 | 436 | for (self.input.items) |input| { 437 | const h = splitHash(input.hash1, input.hash2, m, self.opts.w); 438 | const insert_result = system.insertRow(h.i, h.c, input.value); 439 | switch (insert_result) { 440 | .failure => { 441 | m += step; 442 | continue :loop; 443 | }, 444 | else => {}, 445 | } 446 | } 447 | 448 | return try system.build(allocator); 449 | } 450 | 451 | return error.HashCollision; 452 | } 453 | }; 454 | 455 | const HashResult = struct { 456 | i: u64, 457 | c: u64, 458 | }; 459 | 460 | fn splitHash(hash1: u64, hash2: u64, n: usize, w: u6) HashResult { 461 | const i = hash1 % (n - w); 462 | const c_mask = ((@as(u64, 1) << w) - 1); 463 | const c = (hash2 & c_mask) | 1; 464 | return .{ .i = i, .c = c }; 465 | } 466 | 467 | pub const BuildOptions = struct { 468 | r: u6, 469 | w: u6, 470 | seed: u64 = 100, 471 | }; 472 | 473 | pub fn Ribbon( 474 | comptime Key: type, 475 | comptime hasher: fn (seed: u64, key: Key) u64, 476 | ) type { 477 | return struct { 478 | const Self = @This(); 479 | 480 | fn hashKey(seed: u64, key: Key, n: usize, w: u6) HashResult { 481 | return splitHash(hasher(seed, key), hasher(seed + 1, key), n, w); 482 | } 483 | 484 | w: u6, 485 | seed: u64, 486 | table: RibbonTable, 487 | 488 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 
489 | self.table.deinit(allocator); 490 | self.* = undefined; 491 | } 492 | 493 | pub fn lookup(self: *const Self, key: Key) u64 { 494 | const h = hashKey(self.seed, key, self.table.n, self.w); 495 | return self.table.lookup(h.i, h.c); 496 | } 497 | 498 | pub fn bits(self: *const Self) u64 { 499 | return self.table.bits(); 500 | } 501 | 502 | pub fn writeTo(self: *const Self, w: anytype) !void { 503 | try w.writeInt(u64, self.w, endian); 504 | try w.writeInt(u64, self.seed, endian); 505 | try self.table.writeTo(w); 506 | } 507 | 508 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 509 | var r = stream.reader(); 510 | const w = try r.readInt(u64, endian); 511 | const seed = try r.readInt(u64, endian); 512 | const table = try RibbonTable.readFrom(stream); 513 | return Self{ 514 | .w = @intCast(w), 515 | .seed = seed, 516 | .table = table, 517 | }; 518 | } 519 | 520 | /// IncrementalBuilder builds the Ribbon table incrementally: 521 | /// It uses a fixed `n` and tries to construct a table as it inserts entries. 522 | /// If it's not possible to build a table for a given entry it will fail. 523 | pub const IncrementalBuilder = struct { 524 | n: usize, 525 | seed: u64, 526 | system: RibbonBandingSystem, 527 | 528 | pub fn init(allocator: std.mem.Allocator, n: usize, opts: BuildOptions) error{OutOfMemory}!IncrementalBuilder { 529 | const system = try RibbonBandingSystem.init(allocator, n, opts.r, opts.w); 530 | 531 | return IncrementalBuilder{ 532 | .n = n, 533 | .seed = opts.seed, 534 | .system = system, 535 | }; 536 | } 537 | 538 | pub fn deinit(self: *IncrementalBuilder, allocator: std.mem.Allocator) void { 539 | self.system.deinit(allocator); 540 | self.* = undefined; 541 | } 542 | 543 | pub fn insert(self: *IncrementalBuilder, key: Key, value: u64) error{HashCollision}!void { 544 | const h = hashKey(self.seed, key, self.n, self.system.getBandWidth()); 545 | switch (self.system.insertRow(h.i, h.c, value)) { 546 | .failure => return error.HashCollision, 547 | else => {}, 548 | } 549 | } 550 | 551 | pub fn build(self: IncrementalBuilder, allocator: std.mem.Allocator) error{OutOfMemory}!Self { 552 | const table = try self.system.build(allocator); 553 | 554 | return Self{ 555 | .w = self.system.getBandWidth(), 556 | .seed = self.seed, 557 | .table = table, 558 | }; 559 | } 560 | }; 561 | 562 | pub const IterativeBuilder = struct { 563 | const Input = struct { 564 | hash1: u64, 565 | hash2: u64, 566 | value: u64, 567 | }; 568 | 569 | n: usize, 570 | seed: u64, 571 | input: std.ArrayListUnmanaged(Input), 572 | 573 | pub fn init(allocator: std.mem.Allocator, n: usize, seed: u64) error{OutOfMemory}!IterativeBuilder { 574 | const input = try std.ArrayListUnmanaged(Input).initCapacity(allocator, n); 575 | 576 | return IterativeBuilder{ 577 | .n = n, 578 | .seed = seed, 579 | .input = input, 580 | }; 581 | } 582 | 583 | pub fn deinit(self: *IterativeBuilder, allocator: std.mem.Allocator) void { 584 | self.input.deinit(allocator); 585 | self.* = undefined; 586 | } 587 | 588 | pub fn insert(self: *IterativeBuilder, key: Key, value: u64) void { 589 | self.input.appendAssumeCapacity( 590 | Input{ 591 | .hash1 = hasher(self.seed, key), 592 | .hash2 = hasher(self.seed + 1, key), 593 | .value = value, 594 | }, 595 | ); 596 | } 597 | 598 | pub fn insertWithAllocator(self: *IterativeBuilder, allocator: std.mem.Allocator, key: Key, value: u64) error{OutOfMemory}!void { 599 | try self.input.append( 600 | allocator, 601 | Input{ 602 | .hash1 = hasher(self.seed, key), 603 | .hash2 = hasher(self.seed 
+ 1, key), 604 | .value = value, 605 | }, 606 | ); 607 | } 608 | 609 | pub fn build(self: IterativeBuilder, allocator: std.mem.Allocator, opts: BuildOptions) error{ OutOfMemory, HashCollision }!Self { 610 | std.debug.assert(self.seed == opts.seed); 611 | 612 | const n = self.input.items.len; 613 | const step = @max(n / 10, 1); 614 | var m: usize = n; 615 | 616 | var i: usize = 0; 617 | loop: while (i < 50) : (i += 1) { 618 | var system = try RibbonBandingSystem.init(allocator, m, opts.r, opts.w); 619 | defer system.deinit(allocator); 620 | 621 | for (self.input.items) |input| { 622 | const h = splitHash(input.hash1, input.hash2, m, opts.w); 623 | const insert_result = system.insertRow(h.i, h.c, input.value); 624 | switch (insert_result) { 625 | .failure => { 626 | m += step; 627 | continue :loop; 628 | }, 629 | else => {}, 630 | } 631 | } 632 | 633 | const table = try system.build(allocator); 634 | 635 | return Self{ 636 | .w = opts.w, 637 | .seed = opts.seed, 638 | .table = table, 639 | }; 640 | } 641 | 642 | return error.HashCollision; 643 | } 644 | }; 645 | 646 | pub const Bumped = struct { 647 | const Layers = std.BoundedArray(BumpedLayer, 4); 648 | 649 | w: u6, 650 | seed: u64, 651 | layers: Layers, 652 | fallback_table: RibbonTable, 653 | 654 | pub fn deinit(self: *Bumped, allocator: std.mem.Allocator) void { 655 | for (self.layers.slice()) |*layer| { 656 | layer.deinit(allocator); 657 | } 658 | self.fallback_table.deinit(allocator); 659 | self.* = undefined; 660 | } 661 | 662 | pub fn lookup(self: *const Bumped, key: Key) u64 { 663 | const hash1 = hasher(self.seed, key); 664 | const hash2 = hasher(self.seed + 1, key); 665 | for (self.layers.slice()) |layer| { 666 | const h = splitHash(hash1, hash2, layer.table.n, self.w); 667 | if (layer.lookup(h.i, h.c)) |result| { 668 | return result; 669 | } 670 | } 671 | const h = splitHash(hash1, hash2, self.fallback_table.n, self.w); 672 | return self.fallback_table.lookup(h.i, h.c); 673 | } 674 | 675 | pub fn bits(self: Bumped) usize { 676 | var result = self.fallback_table.bits(); 677 | for (self.layers.slice()) |layer| { 678 | result += layer.bits(); 679 | } 680 | return result; 681 | } 682 | 683 | pub fn writeTo(self: *const Bumped, w: anytype) !void { 684 | try w.writeInt(u64, self.w, endian); 685 | try w.writeInt(u64, self.seed, endian); 686 | try w.writeInt(u64, self.layers.len, endian); 687 | for (self.layers.slice()) |layer| { 688 | try layer.writeTo(w); 689 | } 690 | try self.fallback_table.writeTo(w); 691 | } 692 | 693 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Bumped { 694 | var r = stream.reader(); 695 | const w = try r.readInt(u64, endian); 696 | const seed = try r.readInt(u64, endian); 697 | const layers_len = try r.readInt(u64, endian); 698 | var layers = Layers.init(0) catch unreachable; 699 | for (0..layers_len) |_| { 700 | layers.appendAssumeCapacity(try BumpedLayer.readFrom(stream)); 701 | } 702 | const fallback_table = try RibbonTable.readFrom(stream); 703 | return Bumped{ 704 | .w = @intCast(w), 705 | .seed = seed, 706 | .layers = layers, 707 | .fallback_table = fallback_table, 708 | }; 709 | } 710 | }; 711 | 712 | pub const BumpedBuilder = struct { 713 | layer_builder: BumpedLayerBuilder, 714 | 715 | pub fn init(allocator: std.mem.Allocator, n: usize, eps: f64, opts: BuildOptions) error{OutOfMemory}!BumpedBuilder { 716 | var layer_builder = try BumpedLayerBuilder.init(allocator, n, eps, opts); 717 | errdefer layer_builder.deinit(allocator); 718 | 719 | return BumpedBuilder{ .layer_builder = 
layer_builder }; 720 | } 721 | 722 | pub fn deinit(self: *BumpedBuilder, allocator: std.mem.Allocator) void { 723 | self.layer_builder.deinit(allocator); 724 | self.* = undefined; 725 | } 726 | 727 | pub fn insert(self: *BumpedBuilder, key: Key, value: u64) void { 728 | const hash1 = hasher(self.layer_builder.opts.seed, key); 729 | const hash2 = hasher(self.layer_builder.opts.seed + 1, key); 730 | self.layer_builder.insert(hash1, hash2, value); 731 | } 732 | 733 | pub fn build(self: *BumpedBuilder, allocator: std.mem.Allocator) error{ OutOfMemory, HashCollision }!Bumped { 734 | var layers = Bumped.Layers.init(0) catch unreachable; 735 | errdefer { 736 | for (layers.slice()) |*layer| { 737 | layer.deinit(allocator); 738 | } 739 | } 740 | 741 | while (layers.len < layers.capacity()) { 742 | if (layers.len > 1 and self.layer_builder.input.items.len < 2048) { 743 | // Only bother with another layer if we still have enough items. 744 | break; 745 | } 746 | 747 | var layer = try self.layer_builder.build(allocator); 748 | errdefer layer.deinit(allocator); 749 | 750 | layers.appendAssumeCapacity(layer); 751 | } 752 | 753 | var fallback_table = try self.layer_builder.buildFallbackTable(allocator); 754 | errdefer fallback_table.deinit(allocator); 755 | 756 | return Bumped{ 757 | .w = self.layer_builder.opts.w, 758 | .seed = self.layer_builder.opts.seed, 759 | .layers = layers, 760 | .fallback_table = fallback_table, 761 | }; 762 | } 763 | }; 764 | }; 765 | } 766 | 767 | pub fn RibbonAutoHash(comptime Key: type) type { 768 | return Ribbon(Key, utils.autoHash(Key)); 769 | } 770 | 771 | const testing = std.testing; 772 | const Wyhash = std.hash.Wyhash; 773 | const TestErrorSet = error{ OutOfMemory, HashCollision, TestExpectedEqual }; 774 | 775 | fn testRibbon(t: anytype) TestErrorSet!void { 776 | const valueSize = 8; 777 | t.setValueSize(valueSize); 778 | t.setBandWidth(32); 779 | t.setSeed(100); 780 | try t.init(); 781 | 782 | const seed = 0x0194f614c15227ba; 783 | 784 | { 785 | // Insert random data: 786 | var prng = std.Random.DefaultPrng.init(seed); 787 | const r = prng.random(); 788 | 789 | for (0..t.n) |idx| { 790 | const value = r.uintLessThan(u64, @as(u64, 1) << valueSize); 791 | try t.insert(idx, value); 792 | } 793 | } 794 | 795 | try t.build(); 796 | 797 | { 798 | // Look it up again: 799 | var prng = std.Random.DefaultPrng.init(seed); 800 | const r = prng.random(); 801 | 802 | for (0..t.n) |idx| { 803 | const value = r.uintLessThan(u64, @as(u64, 1) << valueSize); 804 | try testing.expectEqual(value, t.lookup(idx)); 805 | } 806 | } 807 | } 808 | 809 | const RibbonU64 = RibbonAutoHash(u64); 810 | 811 | fn RibbonSettings(comptime Self: type) type { 812 | return struct { 813 | fn setValueSize(self: *Self, r: u6) void { 814 | self.r = r; 815 | } 816 | 817 | fn setBandWidth(self: *Self, w: u6) void { 818 | self.w = w; 819 | } 820 | 821 | fn setSeed(self: *Self, seed: u64) void { 822 | self.seed = seed; 823 | } 824 | 825 | fn options(self: Self) BuildOptions { 826 | return .{ 827 | .r = self.r.?, 828 | .w = self.w.?, 829 | .seed = self.seed.?, 830 | }; 831 | } 832 | }; 833 | } 834 | 835 | const RibbonIncrementalTest = struct { 836 | const Self = @This(); 837 | 838 | allocator: std.mem.Allocator, 839 | n: usize, 840 | 841 | r: ?u6 = null, 842 | w: ?u6 = null, 843 | seed: ?u64 = null, 844 | builder: ?RibbonU64.IncrementalBuilder = null, 845 | table: ?RibbonU64 = null, 846 | 847 | usingnamespace RibbonSettings(Self); 848 | 849 | fn deinit(self: *Self) void { 850 | if (self.builder) |*b| 
b.deinit(self.allocator); 851 | if (self.table) |*t| t.deinit(self.allocator); 852 | } 853 | 854 | fn init(self: *Self) !void { 855 | self.builder = try RibbonU64.IncrementalBuilder.init(self.allocator, self.n * 2, self.options()); 856 | } 857 | 858 | fn insert(self: *Self, key: u64, value: u64) !void { 859 | try self.builder.?.insert(key, value); 860 | } 861 | 862 | fn build(self: *Self) !void { 863 | self.table = try self.builder.?.build(self.allocator); 864 | } 865 | 866 | fn lookup(self: *Self, key: u64) u64 { 867 | return self.table.?.lookup(key); 868 | } 869 | }; 870 | 871 | const RibbonIterativeTest = struct { 872 | const Self = @This(); 873 | 874 | allocator: std.mem.Allocator, 875 | n: usize, 876 | 877 | r: ?u6 = null, 878 | w: ?u6 = null, 879 | seed: ?u64 = null, 880 | 881 | builder: ?RibbonU64.IterativeBuilder = null, 882 | table: ?RibbonU64 = null, 883 | 884 | usingnamespace RibbonSettings(Self); 885 | 886 | fn deinit(self: *Self) void { 887 | if (self.builder) |*b| b.deinit(self.allocator); 888 | if (self.table) |*t| t.deinit(self.allocator); 889 | } 890 | 891 | fn init(self: *Self) !void { 892 | self.builder = try RibbonU64.IterativeBuilder.init(self.allocator, self.n, self.options().seed); 893 | } 894 | 895 | fn insert(self: *Self, key: u64, value: u64) !void { 896 | self.builder.?.insert(key, value); 897 | } 898 | 899 | fn build(self: *Self) !void { 900 | self.table = try self.builder.?.build(self.allocator, self.options()); 901 | } 902 | 903 | fn lookup(self: *Self, key: u64) u64 { 904 | return self.table.?.lookup(key); 905 | } 906 | }; 907 | 908 | const BumpedRibbonTest = struct { 909 | const Self = @This(); 910 | 911 | allocator: std.mem.Allocator, 912 | n: usize, 913 | 914 | r: ?u6 = null, 915 | w: ?u6 = null, 916 | seed: ?u64 = null, 917 | 918 | builder: ?RibbonU64.BumpedBuilder = null, 919 | table: ?RibbonU64.Bumped = null, 920 | 921 | usingnamespace RibbonSettings(Self); 922 | 923 | fn deinit(self: *Self) void { 924 | if (self.builder) |*b| b.deinit(self.allocator); 925 | if (self.table) |*t| t.deinit(self.allocator); 926 | } 927 | 928 | fn init(self: *Self) !void { 929 | self.builder = try RibbonU64.BumpedBuilder.init(self.allocator, self.n, 0, self.options()); 930 | } 931 | 932 | fn insert(self: *Self, key: u64, value: u64) !void { 933 | self.builder.?.insert(key, value); 934 | } 935 | 936 | fn build(self: *Self) !void { 937 | self.table = try self.builder.?.build(self.allocator); 938 | } 939 | 940 | fn lookup(self: *Self, key: u64) u64 { 941 | return self.table.?.lookup(key); 942 | } 943 | }; 944 | 945 | fn testRibbonIncremental(allocator: std.mem.Allocator) TestErrorSet!void { 946 | var t = RibbonIncrementalTest{ .allocator = allocator, .n = 100 }; 947 | defer t.deinit(); 948 | try testRibbon(&t); 949 | } 950 | 951 | fn testRibbonIterative(allocator: std.mem.Allocator) TestErrorSet!void { 952 | var t = RibbonIterativeTest{ .allocator = allocator, .n = 100 }; 953 | defer t.deinit(); 954 | try testRibbon(&t); 955 | } 956 | 957 | fn testBumpedRibbon(allocator: std.mem.Allocator) TestErrorSet!void { 958 | var t = BumpedRibbonTest{ .allocator = allocator, .n = 100 }; 959 | defer t.deinit(); 960 | try testRibbon(&t); 961 | } 962 | 963 | test "ribbon incremental" { 964 | try utils.testFailingAllocator(testRibbonIncremental); 965 | } 966 | 967 | test "ribbon iterative" { 968 | try utils.testFailingAllocator(testRibbonIterative); 969 | } 970 | 971 | test "bumped ribbon" { 972 | try utils.testFailingAllocator(testBumpedRibbon); 973 | } 974 | 
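975 | // A minimal usage sketch (illustrative only; the keys and values below are
976 | // made up): build a bumped ribbon over u64 keys and read the values back.
977 | test "bumped ribbon usage sketch" {
978 |     const allocator = testing.allocator;
979 |     var builder = try RibbonU64.BumpedBuilder.init(allocator, 3, 0.1, .{ .r = 8, .w = 32 });
980 |     defer builder.deinit(allocator);
981 |     builder.insert(1, 42);
982 |     builder.insert(2, 7);
983 |     builder.insert(3, 200);
984 |     var table = try builder.build(allocator);
985 |     defer table.deinit(allocator);
986 |     try testing.expectEqual(@as(u64, 42), table.lookup(1));
987 |     try testing.expectEqual(@as(u64, 7), table.lookup(2));
988 |     try testing.expectEqual(@as(u64, 200), table.lookup(3));
989 | }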
-------------------------------------------------------------------------------- /src/utils.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const builtin = @import("builtin"); 3 | const endian = builtin.cpu.arch.endian(); 4 | 5 | pub fn writeSlice(w: anytype, arr: anytype) !void { 6 | const T = @TypeOf(arr[0]); 7 | try w.writeInt(u64, arr.len, endian); 8 | const byte_len = arr.len * @sizeOf(T); 9 | if (byte_len == 0) return; 10 | try w.writeAll(@as([*]const u8, @ptrCast(&arr[0]))[0..byte_len]); 11 | // Make sure we're always at a 64-bit boundary. 12 | const padding = (@alignOf(u64) - (byte_len % @alignOf(u64))) % @alignOf(u64); 13 | try w.writeByteNTimes(0, padding); 14 | } 15 | 16 | pub fn readSlice(stream: *std.io.FixedBufferStream([]const u8), T: anytype) ![]const T { 17 | // Invariant: stream.pos should be 8-byte aligned before and after `readSlice` 18 | std.debug.assert(stream.pos % @alignOf(u64) == 0); 19 | defer std.debug.assert(stream.pos % @alignOf(u64) == 0); 20 | 21 | var r = stream.reader(); 22 | const len = try r.readInt(u64, endian); 23 | const byte_len = len * @sizeOf(T); 24 | if (byte_len == 0) return &[_]T{}; 25 | const data = stream.buffer[stream.pos..][0..byte_len]; 26 | stream.pos += byte_len; 27 | const padding = (@alignOf(u64) - (byte_len % @alignOf(u64))) % @alignOf(u64); 28 | stream.pos += padding; 29 | const cast_data: [*]const T = @ptrCast(@alignCast(&data[0])); 30 | return cast_data[0..len]; 31 | } 32 | 33 | pub fn bitSizeOfSlice(arr: anytype) u64 { 34 | return arr.len * @bitSizeOf(@TypeOf(arr[0])); 35 | } 36 | 37 | pub fn autoHash(comptime Key: type) fn (seed: u64, key: Key) u64 { 38 | return struct { 39 | fn hash(seed: u64, key: Key) u64 { 40 | if (comptime std.meta.hasUniqueRepresentation(Key)) { 41 | return std.hash.Wyhash.hash(seed, std.mem.asBytes(&key)); 42 | } else { 43 | var hasher = std.hash.Wyhash.init(seed); 44 | std.hash.autoHash(&hasher, key); 45 | return hasher.final(); 46 | } 47 | } 48 | }.hash; 49 | } 50 | 51 | pub fn testFailingAllocator(comptime t: fn (allocator: std.mem.Allocator) anyerror!void) !void { 52 | var idx: usize = 0; 53 | while (true) : (idx += 1) { 54 | var failing_alloc = std.testing.FailingAllocator.init(std.testing.allocator, .{ .fail_index = idx }); 55 | 56 | try (t(failing_alloc.allocator()) catch |err| switch (err) { 57 | error.OutOfMemory => continue, 58 | else => err, 59 | }); 60 | 61 | return; 62 | } 63 | } 64 | 65 | const testing = std.testing; 66 | 67 | test "readSlice / writeSlice must maintain 8-byte alignment" { 68 | var buf: [128]u8 = undefined; 69 | var write_stream = std.io.fixedBufferStream(buf[0..]); 70 | 71 | const writer = write_stream.writer(); 72 | 73 | try writeSlice(writer, [_]u8{ 1, 2, 3 }); 74 | try writeSlice(writer, [_]u64{2}); 75 | 76 | var read_stream = std.io.fixedBufferStream(@as([]const u8, buf[0..])); 77 | 78 | try testing.expectEqualSlices(u8, &.{ 1, 2, 3 }, try readSlice(&read_stream, u8)); 79 | try testing.expectEqualSlices(u64, &.{2}, try readSlice(&read_stream, u64)); 80 | } 81 | -------------------------------------------------------------------------------- /tools/zini-pthash/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const zini = @import("zini"); 3 | const parg = @import("parg"); 4 | 5 | const HashFn = zini.pthash.BytesHashFn(zini.DictArray); 6 | const StringDict = zini.StringDict; 7 | 8 | const usage = 9 | \\USAGE 10 | \\ {s} [build 
| lookup] 11 | \\ 12 | \\COMMAND: build 13 | \\  Builds hash function for plain text file. 14 | \\ 15 | \\  -i, --input 16 | \\  -o, --output 17 | \\  -c 18 | \\  -a, --alpha 19 | \\  -s, --seed 20 | \\  -d, --dict 21 | \\ 22 | \\COMMAND: lookup 23 | \\ 24 | \\  -i, --input 25 | \\  -k, --key 26 | \\  -b, --benchmark 27 | \\ 28 | ; 29 | 30 | fn fail(comptime msg: []const u8, args: anytype) noreturn { 31 | std.debug.print("error: ", .{}); 32 | std.debug.print(msg, args); 33 | std.debug.print("\n", .{}); 34 | std.posix.exit(1); 35 | } 36 | 37 | pub fn main() !void { 38 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 39 | defer { 40 | const check = gpa.deinit(); 41 | if (check == .leak) @panic("memory leaked"); 42 | } 43 | 44 | const allocator = gpa.allocator(); 45 | 46 | var p = try parg.parseProcess(allocator, .{}); 47 | defer p.deinit(); 48 | 49 | const program_name = p.nextValue() orelse @panic("no executable name"); 50 | 51 | while (p.next()) |token| { 52 | switch (token) { 53 | .flag => |flag| { 54 | fail("unknown flag: {s}", .{flag.name}); 55 | }, 56 | .arg => |arg| { 57 | if (std.mem.eql(u8, arg, "lookup")) { 58 | return lookup(allocator, &p); 59 | } else if (std.mem.eql(u8, arg, "build")) { 60 | return build(allocator, &p); 61 | } else { 62 | fail("unknown argument: {s}", .{arg}); 63 | } 64 | }, 65 | .unexpected_value => |val| fail("unknown argument: {s}", .{val}), 66 | } 67 | } 68 | 69 | std.debug.print(usage, .{program_name}); 70 | } 71 | 72 | fn printHashStats(hash: HashFn, dict: ?StringDict, arr: ?zini.DictArray) !void { 73 | const bits = hash.bits() + @bitSizeOf(HashFn); 74 | std.debug.print("  seed: {}\n", .{hash.seed}); 75 | std.debug.print("  bits: {}\n", .{bits}); 76 | std.debug.print("  bits/n: {d}\n", .{@as(f64, @floatFromInt(bits)) / @as(f64, @floatFromInt(hash.n))}); 77 | std.debug.print("\n", .{}); 78 | 79 | if (dict != null) { 80 | const dict_size = dict.?.bits() + @bitSizeOf(StringDict) + arr.?.bits() + @bitSizeOf(zini.DictArray); 81 | std.debug.print("File contains dictionary as well:\n", .{}); 82 | std.debug.print("  bits: {}\n", .{dict_size}); 83 | std.debug.print("  bits/n: {d}\n", .{@as(f64, @floatFromInt(dict_size)) / @as(f64, @floatFromInt(hash.n))}); 84 | std.debug.print("\n", .{}); 85 | 86 | const total_bits = bits + dict_size; 87 | 88 | std.debug.print("Combined:\n", .{}); 89 | std.debug.print("  bits: {}\n", .{total_bits}); 90 | std.debug.print("  bits/n: {d}\n", .{@as(f64, @floatFromInt(total_bits)) / @as(f64, @floatFromInt(hash.n))}); 91 | std.debug.print("\n", .{}); 92 | } 93 | } 94 | 95 | pub fn build(allocator: std.mem.Allocator, p: anytype) !void { 96 | var params = zini.pthash.Params{ .c = 7, .alpha = 0.95 }; 97 | var input: ?[]const u8 = null; 98 | var output: ?[]const u8 = null; 99 | var seed: ?u64 = null; 100 | var build_dict: bool = false; 101 | 102 | while (p.next()) |token| { 103 | switch (token) { 104 | .flag => |flag| { 105 | if (flag.isShort("i") or flag.isLong("input")) { 106 | const val = p.nextValue() orelse fail("-i/--input requires value", .{}); 107 | input = val; 108 | } else if (flag.isShort("o") or flag.isLong("output")) { 109 | const val = p.nextValue() orelse @panic("value required"); 110 | output = val; 111 | } else if (flag.isShort("s") or flag.isLong("seed")) { 112 | const val = p.nextValue() orelse @panic("value required"); 113 | seed = try std.fmt.parseInt(usize, val, 10); 114 | } else if (flag.isShort("c")) { 115 | const val = p.nextValue() orelse @panic("value required"); 116 | params.c = try std.fmt.parseInt(usize, val, 10); 117 | } else 
if (flag.isShort("a") or flag.isLong("alpha")) { 118 | const val = p.nextValue() orelse @panic("value required"); 119 | params.alpha = try std.fmt.parseFloat(f64, val); 120 | } else if (flag.isShort("d") or flag.isLong("dict")) { 121 | build_dict = true; 122 | } else { 123 | fail("uknown flag: {s}", .{flag.name}); 124 | } 125 | }, 126 | .arg => |arg| fail("uknown argument: {s}", .{arg}), 127 | .unexpected_value => |val| fail("uknown argument: {s}", .{val}), 128 | } 129 | } 130 | 131 | if (input == null) { 132 | fail("-i/--input is required", .{}); 133 | } 134 | 135 | std.debug.print("Reading {s}...\n", .{input.?}); 136 | var file = try std.fs.cwd().openFile(input.?, .{}); 137 | defer file.close(); 138 | 139 | const data = try file.reader().readAllAlloc(allocator, 10 * 1024 * 1024); 140 | defer allocator.free(data); 141 | 142 | var keys = std.ArrayList([]const u8).init(allocator); 143 | defer keys.deinit(); 144 | 145 | var iter = std.mem.tokenizeScalar(u8, data, '\n'); 146 | while (iter.next()) |line| { 147 | var split = std.mem.splitScalar(u8, line, ' '); 148 | try keys.append(split.next().?); 149 | } 150 | 151 | std.debug.print("\n", .{}); 152 | std.debug.print("Building hash function...\n", .{}); 153 | var hash = try HashFn.build(allocator, keys.items, params, seed); 154 | defer hash.deinit(allocator); 155 | 156 | var dict: ?StringDict = null; 157 | defer if (dict) |*d| d.deinit(allocator); 158 | 159 | var arr: ?zini.DictArray = null; 160 | defer if (arr) |*a| a.deinit(allocator); 161 | 162 | if (build_dict) { 163 | var dict_builder = try StringDict.Builder.init(allocator); 164 | defer dict_builder.deinit(); 165 | 166 | var arr_slice = try allocator.alloc(u64, hash.n); 167 | defer allocator.free(arr_slice); 168 | 169 | iter = std.mem.tokenizeScalar(u8, data, '\n'); 170 | while (iter.next()) |line| { 171 | var split = std.mem.splitScalar(u8, line, ' '); 172 | const key = split.next().?; 173 | const value = split.next().?; 174 | const key_idx = hash.get(key); 175 | const val_idx = try dict_builder.intern(value); 176 | arr_slice[key_idx] = val_idx; 177 | } 178 | 179 | dict = try dict_builder.build(); 180 | arr = try zini.DictArray.encode(allocator, arr_slice); 181 | } 182 | 183 | std.debug.print("\n", .{}); 184 | std.debug.print("Successfully built hash function:\n", .{}); 185 | try printHashStats(hash, dict, arr); 186 | 187 | if (output) |o| { 188 | std.debug.print("Writing to {s}\n", .{o}); 189 | const outfile = try std.fs.cwd().createFile(o, .{}); 190 | defer outfile.close(); 191 | 192 | try hash.writeTo(outfile.writer()); 193 | 194 | if (build_dict) { 195 | try dict.?.writeTo(outfile.writer()); 196 | try arr.?.writeTo(outfile.writer()); 197 | } 198 | } 199 | } 200 | 201 | pub fn lookup(allocator: std.mem.Allocator, p: anytype) !void { 202 | const stdout = std.io.getStdOut().writer(); 203 | 204 | var input: ?[]const u8 = null; 205 | var key: ?[]const u8 = null; 206 | var bench: bool = false; 207 | 208 | while (p.next()) |token| { 209 | switch (token) { 210 | .flag => |flag| { 211 | if (flag.isShort("i") or flag.isLong("input")) { 212 | const val = p.nextValue() orelse fail("-i/--input requires value", .{}); 213 | input = val; 214 | } else if (flag.isShort("k") or flag.isLong("key")) { 215 | const val = p.nextValue() orelse fail("-k/--key requires value", .{}); 216 | key = val; 217 | } else if (flag.isShort("b") or flag.isLong("bench")) { 218 | bench = true; 219 | } else { 220 | fail("unknown flag: {s}", .{flag.name}); 221 | } 222 | }, 223 | .arg => |arg| fail("unexpected argument: {s}", 
.{arg}), 224 | .unexpected_value => |val| fail("unexpected argument: {s}", .{val}), 225 | } 226 | } 227 | 228 | if (input == null) { 229 | fail("-i/--input is required", .{}); 230 | } 231 | 232 | std.debug.print("Reading {s}...\n", .{input.?}); 233 | const buf = try std.fs.cwd().readFileAlloc(allocator, input.?, 10 * 1024 * 1024); 234 | defer allocator.free(buf); 235 | 236 | var fbs = std.io.fixedBufferStream(@as([]const u8, buf)); 237 | const hash = try HashFn.readFrom(&fbs); 238 | var dict: ?StringDict = null; 239 | var arr: ?zini.DictArray = null; 240 | 241 | if (fbs.pos < fbs.buffer.len) { 242 | dict = try StringDict.readFrom(&fbs); 243 | arr = try zini.DictArray.readFrom(&fbs); 244 | } 245 | 246 | std.debug.print("\n", .{}); 247 | 248 | std.debug.print("Successfully loaded hash function:\n", .{}); 249 | try printHashStats(hash, dict, arr); 250 | 251 | if (key) |k| { 252 | std.debug.print("Looking up key={s}:\n", .{k}); 253 | const h = hash.get(k); 254 | try stdout.print("{}\n", .{h}); 255 | if (dict) |d| { 256 | try stdout.print("{s}\n", .{d.get(arr.?.get(h))}); 257 | } 258 | 259 | if (bench) { 260 | const n = 1000; 261 | std.debug.print("\nBenchmarking...\n", .{}); 262 | var timer = try std.time.Timer.start(); 263 | const start = timer.lap(); 264 | var i: usize = 0; 265 | // TODO: Is this actually a good way of benchmarking? 266 | while (i < n) : (i += 1) { 267 | std.mem.doNotOptimizeAway(hash.get(k)); 268 | } 269 | const end = timer.read(); 270 | const dur = end - start; 271 | std.debug.print("{} ns/read (avg of {} iterations)\n", .{ dur / n, n }); 272 | } 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /tools/zini-ribbon/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const builtin = @import("builtin"); 3 | const zini = @import("zini"); 4 | const parg = @import("parg"); 5 | 6 | const HashFn = zini.pthash.BytesHashFn(zini.DictArray); 7 | const HashRibbon = zini.ribbon.Ribbon([]const u8, std.hash.Wyhash.hash); 8 | const StringDict = zini.StringDict; 9 | const endian = builtin.cpu.arch.endian(); 10 | 11 | const usage = 12 | \\USAGE 13 | \\ {s} [build | lookup] 14 | \\ 15 | \\COMMAND: build 16 | \\ Builds Ribbon table for plain text file. 
17 | \\ 18 | \\  -i, --input 19 | \\  -o, --output 20 | \\  -w 21 | \\  -s, --seed 22 | \\ 23 | \\COMMAND: lookup 24 | \\ 25 | \\  -i, --input 26 | \\  -k, --key 27 | \\  -b, --benchmark 28 | \\ 29 | ; 30 | 31 | fn fail(comptime msg: []const u8, args: anytype) noreturn { 32 | std.debug.print("error: ", .{}); 33 | std.debug.print(msg, args); 34 | std.debug.print("\n", .{}); 35 | std.posix.exit(1); 36 | } 37 | 38 | pub fn main() !void { 39 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 40 | defer { 41 | const check = gpa.deinit(); 42 | if (check == .leak) @panic("memory leaked"); 43 | } 44 | 45 | const allocator = gpa.allocator(); 46 | 47 | var p = try parg.parseProcess(allocator, .{}); 48 | defer p.deinit(); 49 | 50 | const program_name = p.nextValue() orelse @panic("no executable name"); 51 | 52 | while (p.next()) |token| { 53 | switch (token) { 54 | .flag => |flag| { 55 | fail("unknown flag: {s}", .{flag.name}); 56 | }, 57 | .arg => |arg| { 58 | if (std.mem.eql(u8, arg, "lookup")) { 59 | return lookup(allocator, &p); 60 | } else if (std.mem.eql(u8, arg, "build")) { 61 | return build(allocator, &p); 62 | } else { 63 | fail("unknown argument: {s}", .{arg}); 64 | } 65 | }, 66 | .unexpected_value => |val| fail("unknown argument: {s}", .{val}), 67 | } 68 | } 69 | 70 | std.debug.print(usage, .{program_name}); 71 | } 72 | 73 | fn printStats(table: anytype, n: usize) !void { 74 | const bits = table.bits() + @bitSizeOf(@TypeOf(table)); 75 | std.debug.print("  seed: {}\n", .{table.seed}); 76 | std.debug.print("  bits: {}\n", .{bits}); 77 | std.debug.print("  bits/n: {d}\n", .{@as(f64, @floatFromInt(bits)) / @as(f64, @floatFromInt(n))}); 78 | } 79 | 80 | pub fn build(allocator: std.mem.Allocator, p: anytype) !void { 81 | var w: u6 = 32; 82 | var input: ?[]const u8 = null; 83 | var output: ?[]const u8 = null; 84 | var seed: ?u64 = null; 85 | var eps: f64 = 0; 86 | 87 | while (p.next()) |token| { 88 | switch (token) { 89 | .flag => |flag| { 90 | if (flag.isShort("i") or flag.isLong("input")) { 91 | const val = p.nextValue() orelse fail("-i/--input requires value", .{}); 92 | input = val; 93 | } else if (flag.isShort("o") or flag.isLong("output")) { 94 | const val = p.nextValue() orelse @panic("value required"); 95 | output = val; 96 | } else if (flag.isShort("s") or flag.isLong("seed")) { 97 | const val = p.nextValue() orelse @panic("value required"); 98 | seed = try std.fmt.parseInt(usize, val, 10); 99 | } else if (flag.isShort("w")) { 100 | const val = p.nextValue() orelse @panic("value required"); 101 | w = @intCast(try std.fmt.parseInt(usize, val, 10)); 102 | } else if (flag.isLong("eps")) { 103 | const val = p.nextValue() orelse @panic("value required"); 104 | eps = try std.fmt.parseFloat(f64, val); 105 | } else { 106 | fail("unknown flag: {s}", .{flag.name}); 107 | } 108 | }, 109 | .arg => |arg| fail("unknown argument: {s}", .{arg}), 110 | .unexpected_value => |val| fail("unknown argument: {s}", .{val}), 111 | } 112 | } 113 | 114 | if (input == null) { 115 | fail("-i/--input is required", .{}); 116 | } 117 | 118 | std.debug.print("Reading {s}...\n", .{input.?}); 119 | var file = try std.fs.cwd().openFile(input.?, .{}); 120 | defer file.close(); 121 | 122 | const data = try file.reader().readAllAlloc(allocator, 10 * 1024 * 1024); 123 | defer allocator.free(data); 124 | 125 | var keys = std.ArrayList([]const u8).init(allocator); 126 | defer keys.deinit(); 127 | 128 | if (seed == null) { 129 | var s: u64 = undefined; try std.posix.getrandom(std.mem.asBytes(&s)); seed = s; 130 | } 131 | 132 | var max_val: u64 = 0; 133 | var n: usize = 0; 
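// The input is scanned twice: the first pass below only counts the entries
// and finds the largest value, so that r (the number of value bits per key)
// can be sized exactly; the second pass afterwards inserts the key/value
// pairs into the builder.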
134 | 135 | var iter = std.mem.tokenizeScalar(u8, data, '\n'); 136 | while (iter.next()) |line| { 137 | var split = std.mem.splitScalar(u8, line, ','); 138 | _ = split.next().?; // the key 139 | const value = try std.fmt.parseInt(u64, split.next().?, 10); 140 | max_val = @max(max_val, value); 141 | n += 1; 142 | } 143 | 144 | const r: u6 = @intCast(std.math.log2_int_ceil(u64, max_val + 1)); 145 | 146 | std.debug.print("\n", .{}); 147 | std.debug.print("Building table for r={} value bits and eps={}...\n", .{ r, eps }); 148 | 149 | const opts = zini.ribbon.BuildOptions{ 150 | .r = r, 151 | .w = w, 152 | .seed = seed.?, 153 | }; 154 | 155 | var builder = try HashRibbon.BumpedBuilder.init(allocator, n, eps, opts); 156 | defer builder.deinit(allocator); 157 | 158 | iter = std.mem.tokenizeScalar(u8, data, '\n'); 159 | while (iter.next()) |line| { 160 | var split = std.mem.splitScalar(u8, line, ','); 161 | const key = split.next().?; // the key 162 | const value = try std.fmt.parseInt(u64, split.next().?, 10); 163 | builder.insert(key, value); 164 | } 165 | 166 | var table = try builder.build(allocator); 167 | defer table.deinit(allocator); 168 | 169 | std.debug.print("\n", .{}); 170 | std.debug.print("Successfully built table:\n", .{}); 171 | try printStats(table, n); 172 | 173 | if (output) |o| { 174 | std.debug.print("\n", .{}); 175 | std.debug.print("Writing to {s}\n", .{o}); 176 | const outfile = try std.fs.cwd().createFile(o, .{}); 177 | defer outfile.close(); 178 | 179 | try outfile.writer().writeInt(u64, n, endian); 180 | try table.writeTo(outfile.writer()); 181 | } 182 | } 183 | 184 | pub fn lookup(allocator: std.mem.Allocator, p: anytype) !void { 185 | const stdout = std.io.getStdOut().writer(); 186 | 187 | var input: ?[]const u8 = null; 188 | var key: ?[]const u8 = null; 189 | var bench: bool = false; 190 | 191 | while (p.next()) |token| { 192 | switch (token) { 193 | .flag => |flag| { 194 | if (flag.isShort("i") or flag.isLong("input")) { 195 | const val = p.nextValue() orelse fail("-i/--input requires value", .{}); 196 | input = val; 197 | } else if (flag.isShort("k") or flag.isLong("key")) { 198 | const val = p.nextValue() orelse fail("-k/--key requires value", .{}); 199 | key = val; 200 | } else if (flag.isShort("b") or flag.isLong("benchmark")) { 201 | bench = true; 202 | } else { 203 | fail("unknown flag: {s}", .{flag.name}); 204 | } 205 | }, 206 | .arg => |arg| fail("unexpected argument: {s}", .{arg}), 207 | .unexpected_value => |val| fail("unexpected argument: {s}", .{val}), 208 | } 209 | } 210 | 211 | if (input == null) { 212 | fail("-i/--input is required", .{}); 213 | } 214 | 215 | std.debug.print("Reading {s}...\n", .{input.?}); 216 | const buf = try std.fs.cwd().readFileAlloc(allocator, input.?, 10 * 1024 * 1024); 217 | defer allocator.free(buf); 218 | 219 | var fbs = std.io.fixedBufferStream(@as([]const u8, buf)); 220 | const n = try fbs.reader().readInt(u64, endian); 221 | var table = try HashRibbon.Bumped.readFrom(&fbs); 222 | std.debug.print("\n", .{}); 223 | 224 | std.debug.print("Successfully loaded hash function:\n", .{}); 225 | try printStats(table, n); 226 | std.debug.print("\n", .{}); 227 | 228 | if (key) |k| { 229 | std.debug.print("Looking up key={s}:\n", .{k}); 230 | const value = table.lookup(k); 231 | try stdout.print("{}\n", .{value}); 232 | 233 | if (bench) { 234 | const m = 1000; 235 | std.debug.print("\nBenchmarking...\n", .{}); 236 | var timer = try std.time.Timer.start(); 237 | const start = timer.lap(); 238 | var i: usize = 0; 239 | // TODO: Is this 
actually a good way of benchmarking? 240 | while (i < m) : (i += 1) { 241 | std.mem.doNotOptimizeAway(table.lookup(k)); 242 | } 243 | const end = timer.read(); 244 | const dur = end - start; 245 | std.debug.print("{} ns/read (avg of {} iterations)\n", .{ dur / m, m }); 246 | } 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /tools/zini-seqz/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const zini = @import("zini"); 3 | const parg = @import("parg"); 4 | 5 | const usage = 6 | \\USAGE 7 | \\  {s} 8 | \\ 9 | \\A simple tool which reads a list of numbers (u64) from a file, 10 | \\compresses them using Elias-Fano, and reports the number of 11 | \\bytes it would take. 12 | \\ 13 | ; 14 | 15 | fn fail(comptime msg: []const u8, args: anytype) noreturn { 16 | std.debug.print("error: ", .{}); 17 | std.debug.print(msg, args); 18 | std.debug.print("\n", .{}); 19 | std.posix.exit(1); 20 | } 21 | 22 | pub fn main() !void { 23 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 24 | defer { 25 | const check = gpa.deinit(); 26 | if (check == .leak) @panic("memory leaked"); 27 | } 28 | 29 | const allocator = gpa.allocator(); 30 | 31 | var p = try parg.parseProcess(allocator, .{}); 32 | defer p.deinit(); 33 | 34 | const program_name = p.nextValue() orelse @panic("no executable name"); 35 | 36 | var filename: ?[]const u8 = null; 37 | 38 | while (p.next()) |token| { 39 | switch (token) { 40 | .flag => |flag| { 41 | if (flag.isLong("help") or flag.isShort("h")) { 42 | std.debug.print(usage, .{program_name}); 43 | std.process.exit(0); 44 | } else { 45 | fail("unknown flag: {s}", .{flag.name}); 46 | } 47 | }, 48 | .arg => |arg| { 49 | if (filename == null) { 50 | filename = arg; 51 | } else { 52 | fail("unknown argument: {s}", .{arg}); 53 | } 54 | }, 55 | .unexpected_value => |val| fail("unknown argument: {s}", .{val}), 56 | } 57 | } 58 | 59 | const f = filename orelse fail("filename expected as argument", .{}); 60 | var file = try std.fs.cwd().openFile(f, .{}); 61 | defer file.close(); 62 | 63 | var counting_file = std.io.countingReader(file.reader()); 64 | 65 | var numbers = std.ArrayList(u64).init(allocator); 66 | defer numbers.deinit(); 67 | 68 | std.debug.print("Reading {s}\n", .{f}); 69 | 70 | var r = counting_file.reader(); 71 | while (true) { 72 | var buf: [32]u8 = undefined; 73 | const line = r.readUntilDelimiter(&buf, '\n') catch |err| switch (err) { 74 | error.EndOfStream => break, 75 | else => return err, 76 | }; 77 | const num = try std.fmt.parseInt(u64, line, 10); 78 | try numbers.append(num); 79 | } 80 | 81 | std.mem.sort(u64, numbers.items, {}, std.sort.asc(u64)); 82 | 83 | std.debug.print("Compressing {} numbers ({} bytes)...\n", .{ numbers.items.len, counting_file.bytes_read }); 84 | 85 | var encoded = try zini.EliasFano.encode(allocator, numbers.items); 86 | defer encoded.deinit(allocator); 87 | 88 | std.debug.print("The data would compress to: {} bytes\n", .{encoded.bitsWithoutConstantAccess() / 8}); 89 | } 90 | --------------------------------------------------------------------------------