├── .github
│   └── workflows
│       └── test.yml
├── .gitignore
├── LICENSE.txt
├── README.md
├── build.zig
├── build.zig.zon
├── src
│   ├── CompactArray.zig
│   ├── DictArray.zig
│   ├── EliasFano.zig
│   ├── StringDict.zig
│   ├── darray.zig
│   ├── main.zig
│   ├── pthash.zig
│   ├── ribbon.zig
│   └── utils.zig
└── tools
    ├── zini-pthash
    │   └── main.zig
    ├── zini-ribbon
    │   └── main.zig
    └── zini-seqz
        └── main.zig

/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: "Tests"
2 | 
3 | on:
4 |   push:
5 |   schedule:
6 |     - cron: "0 3 * * 5"
7 |   workflow_dispatch:
8 |   pull_request:
9 | 
10 | jobs:
11 |   test:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v3
15 |         with:
16 |           path: zini
17 | 
18 |       - uses: goto-bus-stop/setup-zig@v1
19 |         with:
20 |           version: master
21 | 
22 |       - name: Formatting
23 |         run: zig fmt --check src/*.zig
24 |         working-directory: zini
25 | 
26 |       - name: Tests
27 |         run: zig build test
28 |         working-directory: zini
29 | 
30 |       - name: Build executables
31 |         run: zig build
32 |         working-directory: zini
33 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .zig-cache
2 | zig-out
3 | /coverage
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | BSD Zero Clause License
2 | 
3 | Copyright (c) 2022 Magnus Holm
4 | 
5 | Permission to use, copy, modify, and/or distribute this software for any
6 | purpose with or without fee is hereby granted.
7 | 
8 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
9 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10 | AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
11 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
13 | OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14 | PERFORMANCE OF THIS SOFTWARE.
15 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Zini
2 | 
3 | Zini (Zig + Mini) is a [Zig](https://ziglang.org/) library providing some succinct data structures:
4 | 
5 | - `zini.pthash`, a [**minimal perfect hash function**](https://en.wikipedia.org/wiki/Perfect_hash_function) construction algorithm, using less than 4 bits per element.
6 | - `zini.ribbon`, a **retrieval data structure** (sometimes called a "static function") construction algorithm, having less than 1% overhead.
7 | - `zini.CompactArray` stores n-bit numbers tightly packed, leaving no bits unused.
8 |   If the largest value in an array is `m`, then you only need `n = floor(log2(m)) + 1` bits per element.
9 |   E.g. if the largest value is 270, you will get 7x compression using CompactArray over `[]u64` as it stores each element using only 9 bits (and 64 divided by 9 is roughly 7).
10 | - `zini.DictArray` finds all distinct elements in the array, stores each once in a CompactArray (the dictionary), and creates a new CompactArray containing indexes into the dictionary.
11 |   This gives excellent compression if there's a lot of repetition in the original array.
12 | - `zini.EliasFano` stores increasing 64-bit numbers in a compact manner.
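  For example (a minimal sketch based on `src/EliasFano.zig`; `allocator` stands for any `std.mem.Allocator`):

  ```zig
  const std = @import("std");
  const zini = @import("zini");

  // The input must be sorted in increasing order:
  var ef = try zini.EliasFano.encode(allocator, &[_]u64{ 2, 3, 5, 7, 11 });
  defer ef.deinit(allocator);

  std.debug.assert(ef.get(2) == 5); // random access by index
  ```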
13 | - `zini.darray` provides constant-time support for the `select1(i)` operation, which returns the position of the _i_-th set bit in a `std.DynamicBitSetUnmanaged`.
14 | 
15 | ## Overview
16 | 
17 | ### PTHash, minimal perfect hash function
18 | 
19 | `zini.pthash` contains an implementation of [PTHash][pthash], a [minimal perfect hash function](https://en.wikipedia.org/wiki/Perfect_hash_function) construction algorithm.
20 | Given a set of `n` elements, with the only requirement being that you can hash them, it generates a hash function which maps each element to a distinct number between `0` and `n - 1`.
21 | The generated hash function is extremely small, typically consuming less than **4 _bits_ per element**, regardless of the size of the input type.
22 | The algorithm provides multiple parameters to tune, making it possible to optimize for (small) size, (short) construction time, or (short) lookup time.
23 | 
24 | To give a practical example:
25 | In ~0.6 seconds Zini was able to create a hash function for /usr/share/dict/words containing 235886 words.
26 | The resulting hash function required in total 865682 bits in memory.
27 | This corresponds to 108.2 kB in total, or 3.67 bits per word.
28 | In comparison, the original file was 2.49 MB and compressing it with `gzip -9` only gets it down to 754 kB (which you can't use directly in memory without decompressing it).
29 | It should of course be noted that the two don't store equivalent data: the generated hash function can't tell you whether a word is present in the list at all.
30 | The comparison is mainly useful to get a feeling for the magnitudes.
31 | 
32 | ### Bumped Ribbon Retrieval, a retrieval data structure
33 | 
34 | `zini.ribbon` contains an implementation of [Bumped Ribbon Retrieval][burr] (_BuRR_), a retrieval data structure.
35 | Given `n` keys (with the only requirement being that you can hash them), each of which has an `r`-bit value, we build a data structure which returns the value for each of the `n` keys.
36 | However, the keys themselves are not stored (we only use the hash), so if you ask for the value of an _unknown_ key you will get a seemingly random answer; there's no way of knowing whether the key was present in the original dataset or not.
37 | 
38 | The theoretical minimum space needed to store the _values_ is `n * r` bits (we have `n` `r`-bit values, after all).
39 | We use the term "overhead" to refer to how much _extra_ data we need beyond that.
40 | Bumped Ribbon Retrieval often has **less than 1% overhead**.
41 | 
42 | ## Usage
43 | 
44 | Zini is intended to be used as a library, but also ships the command-line tools `zini-pthash` and `zini-ribbon`.
45 | As the documentation is a bit lacking, it might be useful to look through `tools/zini-{pthash,ribbon}/main.zig` to understand how it's used.
46 | 
47 | ```
48 | USAGE
49 | ./zig-out/bin/zini-pthash [build | lookup]
50 | 
51 | COMMAND: build
52 | Builds hash function for plain text file.
53 | 
54 | -i, --input
55 | -o, --output
56 | -c
57 | -a, --alpha
58 | -s, --seed
59 | 
60 | COMMAND: lookup
61 | 
62 | -i, --input
63 | -k, --key
64 | -b, --benchmark
65 | ```
66 | 
67 | And here's an example run of using `zini-pthash`.
68 | 
69 | ```
70 | # Build zini-pthash:
71 | $ zig build -Doptimize=ReleaseSafe
72 | 
73 | # Build a hash function:
74 | $ ./zig-out/bin/zini-pthash build -i /usr/share/dict/words -o words.pth
75 | Reading /usr/share/dict/words...
76 | 
77 | Building hash function...
78 | 
79 | Successfully built hash function:
80 | seed: 12323441790160983030
81 | bits: 865554
82 | bits/n: 3.6693741892269993
83 | 
84 | Writing to words.pth
85 | 
86 | # Look up an index in the hash function:
87 | $ ./zig-out/bin/zini-pthash lookup -i words.pth --key hello
88 | Reading words.pth...
89 | 
90 | Successfully loaded hash function:
91 | seed: 12323441790160983030
92 | bits: 865554
93 | bits/n: 3.6693741892269993
94 | 
95 | Looking up key=hello:
96 | 112576
97 | ```
98 | 
99 | ## Acknowledgments
100 | 
101 | Zini is merely an implementation of existing algorithms and techniques already described in the literature:
102 | 
103 | - The [PTHash][pthash] algorithm is described by Giulio Ermanno Pibiri and Roberto Trani in arXiv:2104.10402.
104 |   - They also implemented PTHash as a C++ library under the MIT license.
105 |     Zini uses no code directly from that repository, but it has been an invaluable resource for understanding how to implement PTHash in practice.
106 | - The [BuRR][burr] data structure is described by Peter C. Dillinger, Lorenz Hübschle-Schneider, Peter Sanders and Stefan Walzer in arXiv:2109.01892.
107 | 
108 | [pthash]: https://arxiv.org/abs/2104.10402
109 | [burr]: https://arxiv.org/abs/2109.01892
110 | 
111 | ## License
112 | 
113 | Zini is licensed under the [0BSD license](https://spdx.org/licenses/0BSD.html).
--------------------------------------------------------------------------------
/build.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 | 
3 | pub fn build(b: *std.Build) !void {
4 |     const optimize = b.standardOptimizeOption(.{});
5 |     const target = b.standardTargetOptions(.{});
6 | 
7 |     const zini = b.addModule("zini", .{
8 |         .root_source_file = b.path("src/main.zig"),
9 |     });
10 | 
11 |     const tests = b.addTest(.{
12 |         .root_source_file = b.path("src/main.zig"),
13 |         .target = target,
14 |         .optimize = optimize,
15 |     });
16 |     const tests_run_step = b.addRunArtifact(tests);
17 |     tests_run_step.has_side_effects = true;
18 | 
19 |     const coverage = b.option(bool, "test-coverage", "Generate test coverage") orelse false;
20 |     if (coverage) {
21 |         const runner = [_][]const u8{
22 |             "kcov",
23 |             "--include-path",
24 |             ".",
25 |             "coverage", // output dir
26 |         };
27 | 
28 |         const dst = try tests_run_step.argv.addManyAt(b.allocator, 0, runner.len);
29 |         for (runner, 0..) 
|arg, idx| { 30 | dst[idx] = .{ .bytes = b.dupe(arg) }; 31 | } 32 | } 33 | 34 | const test_step = b.step("test", "Run unit tests"); 35 | test_step.dependOn(&tests_run_step.step); 36 | 37 | const parg = b.dependency("parg", .{ .target = target, .optimize = optimize }); 38 | 39 | const pthash = b.addExecutable(.{ 40 | .name = "zini-pthash", 41 | .root_source_file = b.path("tools/zini-pthash/main.zig"), 42 | .target = target, 43 | .optimize = optimize, 44 | }); 45 | pthash.root_module.addImport("zini", zini); 46 | pthash.root_module.addImport("parg", parg.module("parg")); 47 | b.installArtifact(pthash); 48 | 49 | const ribbon = b.addExecutable(.{ 50 | .name = "zini-ribbon", 51 | .root_source_file = b.path("tools/zini-ribbon/main.zig"), 52 | .target = target, 53 | .optimize = optimize, 54 | }); 55 | ribbon.root_module.addImport("zini", zini); 56 | ribbon.root_module.addImport("parg", parg.module("parg")); 57 | b.installArtifact(ribbon); 58 | 59 | const seqz = b.addExecutable(.{ 60 | .name = "zini-seqz", 61 | .root_source_file = b.path("tools/zini-seqz/main.zig"), 62 | .target = target, 63 | .optimize = optimize, 64 | }); 65 | seqz.root_module.addImport("zini", zini); 66 | seqz.root_module.addImport("parg", parg.module("parg")); 67 | b.installArtifact(seqz); 68 | } 69 | -------------------------------------------------------------------------------- /build.zig.zon: -------------------------------------------------------------------------------- 1 | .{ 2 | .name = .zini, 3 | .fingerprint = 0xadd0f09e47acd80c, 4 | // This is a [Semantic Version](https://semver.org/). 5 | // In a future version of Zig it will be used for package deduplication. 6 | .version = "0.0.0", 7 | 8 | // This field is optional. 9 | // This is currently advisory only; Zig does not yet do anything 10 | // with this value. 11 | //.minimum_zig_version = "0.11.0", 12 | 13 | // This field is optional. 14 | // Each dependency must either provide a `url` and `hash`, or a `path`. 15 | // `zig build --fetch` can be used to fetch all dependencies of a package, recursively. 16 | // Once all dependencies are fetched, `zig build` no longer requires 17 | // internet connectivity. 18 | .dependencies = .{ 19 | // See `zig fetch --save ` for a command-line interface for adding dependencies. 20 | .parg = .{ 21 | // When updating this field to a new URL, be sure to delete the corresponding 22 | // `hash`, otherwise you are communicating that you expect to find the old hash at 23 | // the new URL. 24 | .url = "https://github.com/judofyr/parg/archive/3e1d79ee543c56797f89bb7ced8ae02f115b9ff3.tar.gz", 25 | 26 | // This is computed from the file contents of the directory of files that is 27 | // obtained after fetching `url` and applying the inclusion rules given by 28 | // `paths`. 29 | // 30 | // This field is the source of truth; packages do not come from a `url`; they 31 | // come from a `hash`. `url` is just one of many possible mirrors for how to 32 | // obtain a package matching this `hash`. 33 | // 34 | // Uses the [multihash](https://multiformats.io/multihash/) format. 35 | .hash = "1220c33110d98d9bf2139ab2b702347a22849166455e132252c08749bd6cb09ffe03", 36 | 37 | // When this is provided, the package is found in a directory relative to the 38 | // build root. In this case the package's hash is irrelevant and therefore not 39 | // computed. This field and `url` are mutually exclusive. 40 | // .path = "foo", 41 | }, 42 | }, 43 | 44 | // Specifies the set of files and directories that are included in this package. 
45 |     // Only files and directories listed here are included in the `hash` that
46 |     // is computed for this package.
47 |     // Paths are relative to the build root. Use the empty string (`""`) to refer to
48 |     // the build root itself.
49 |     // A directory listed here means that all files within, recursively, are included.
50 |     .paths = .{
51 |         // This makes *all* files, recursively, included in this package. It is generally
52 |         // better to explicitly list the files and directories instead, to ensure that
53 |         // fetching from tarballs, file system paths, and version control all result
54 |         // in the same contents hash.
55 |         "",
56 |         // For example...
57 |         //"build.zig",
58 |         //"build.zig.zon",
59 |         //"src",
60 |         //"LICENSE",
61 |         //"README.md",
62 |     },
63 | }
--------------------------------------------------------------------------------
/src/CompactArray.zig:
--------------------------------------------------------------------------------
1 | //! CompactArray stores a list of n-bit integers packed tightly.
2 | const std = @import("std");
3 | const builtin = @import("builtin");
4 | 
5 | const utils = @import("./utils.zig");
6 | 
7 | const Self = @This();
8 | 
9 | const Int = u64;
10 | const IntLog2 = std.math.Log2Int(Int);
11 | const endian = builtin.cpu.arch.endian();
12 | 
13 | data: []const Int,
14 | width: IntLog2,
15 | 
16 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
17 |     allocator.free(self.data);
18 |     self.* = undefined;
19 | }
20 | 
21 | pub fn bits(self: *const Self) usize {
22 |     return utils.bitSizeOfSlice(self.data);
23 | }
24 | 
25 | fn getMask(self: *const Self) u64 {
26 |     return (@as(Int, 1) << self.width) - 1;
27 | }
28 | 
29 | /// Returns the value stored at a given index.
30 | pub fn get(self: *const Self, idx: usize) u64 {
31 |     const pos = idx * self.width;
32 |     const block = pos / @bitSizeOf(Int);
33 |     const shift: IntLog2 = @intCast(pos % @bitSizeOf(Int));
34 | 
35 |     if (@as(Int, shift) + self.width <= @bitSizeOf(Int)) {
36 |         return (self.data[block] >> shift) & self.getMask();
37 |     } else {
38 |         const res_shift = ~shift + 1; // =:= @bitSizeOf(Int) - shift;
39 |         return (self.data[block] >> shift) | (self.data[block + 1] << res_shift & self.getMask());
40 |     }
41 | }
42 | 
43 | /// Encodes an array into the smallest compact array possible.
44 | pub fn encode(allocator: std.mem.Allocator, data: []const u64) !Self {
45 |     if (data.len == 0) return Self{ .data = &[_]Int{}, .width = 1 };
46 | 
47 |     const width: IntLog2 = @intCast(std.math.log2_int(u64, std.mem.max(u64, data)) + 1);
48 |     var arr = try Mutable.init(allocator, width, data.len);
49 |     for (data, 0..) |val, idx| {
50 |         arr.setFromZero(idx, val);
51 |     }
52 |     return arr.finalize();
53 | }
54 | 
55 | /// Writes the array into an std.io.Writer. This can be read using `readFrom`.
56 | pub fn writeTo(self: *const Self, w: anytype) !void {
57 |     try w.writeInt(Int, self.width, endian);
58 |     try utils.writeSlice(w, self.data);
59 | }
60 | 
61 | /// Reads an array from a buffer. Note that this will not allocate, but will
62 | /// instead create a new CompactArray which points directly to the data in
63 | /// the buffer.
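///
/// Example (a minimal sketch mirroring the "write and read" test below; `buf`
/// must be u64-aligned and must outlive the returned array):
///
///     var fbs = std.io.fixedBufferStream(@as([]const u8, buf));
///     const arr = try CompactArray.readFrom(&fbs);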
64 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 65 | var r = stream.reader(); 66 | const width = try r.readInt(Int, endian); 67 | const data = try utils.readSlice(stream, Int); 68 | return Self{ 69 | .width = @intCast(width), 70 | .data = data, 71 | }; 72 | } 73 | 74 | pub const Mutable = struct { 75 | data: []Int, 76 | width: IntLog2, 77 | 78 | /// Creates a new array that can store `n` values of `width` bits each. 79 | pub fn init(allocator: std.mem.Allocator, width: IntLog2, n: usize) !Mutable { 80 | const m = std.math.divCeil(usize, width * n, @bitSizeOf(Int)) catch unreachable; 81 | 82 | const data = try allocator.alloc(Int, m); 83 | @memset(data, 0); 84 | 85 | return Mutable{ 86 | .data = data, 87 | .width = width, 88 | }; 89 | } 90 | 91 | pub fn deinit(self: *Mutable, allocator: std.mem.Allocator) void { 92 | allocator.free(self.data); 93 | self.* = undefined; 94 | } 95 | 96 | pub fn finalize(self: *Mutable) Self { 97 | const result = self.asImmutable(); 98 | self.* = undefined; 99 | return result; 100 | } 101 | 102 | pub fn asImmutable(self: Mutable) Self { 103 | return Self{ 104 | .data = self.data, 105 | .width = self.width, 106 | }; 107 | } 108 | 109 | pub fn get(self: Mutable, idx: usize) u64 { 110 | return self.asImmutable().get(idx); 111 | } 112 | 113 | /// Sets a value at a given index with the assumption that the existing value was already zero. 114 | pub fn setFromZero(self: Mutable, idx: usize, val: u64) void { 115 | const pos = idx * self.width; 116 | const block = pos / @bitSizeOf(Int); 117 | const shift: IntLog2 = @intCast(pos % @bitSizeOf(Int)); 118 | 119 | self.data[block] |= val << shift; 120 | 121 | if (shift > 0) { 122 | const res_shift = ~shift + 1; // =:= @bitSizeOf(Int) - shift; 123 | if (res_shift < self.width) { 124 | self.data[block + 1] |= val >> res_shift; 125 | } 126 | } 127 | } 128 | 129 | /// Sets a value at a given index to zero. 130 | pub fn setToZero(self: Mutable, idx: usize) void { 131 | const pos = idx * self.width; 132 | const block = pos / @bitSizeOf(Int); 133 | const shift: IntLog2 = @intCast(pos % @bitSizeOf(Int)); 134 | 135 | // This is easier to understand with an example: 136 | // block size=8 (this is actually 64 in our implementation) 137 | // width=5 138 | // shift=6 139 | // 140 | // Let "V" be a value bit and "P" a "padding bit" (other value). 141 | // 142 | // Block 1: VV PPPPPP 143 | // Block 2: PPPPP VVV 144 | 145 | // There's also the case where it _doesn't_ cross a block: 146 | // shift=2 147 | // Block 1: PP VVVVV PP 148 | 149 | // Here we need to make sure we don't zero out those upper paddings. 150 | const upper_mask = ~@as(Int, 0) << self.width << shift; 151 | const lower_mask = ((@as(Int, 1) << shift) - 1); 152 | 153 | // Clear out VV by AND-ing 00111111; 154 | self.data[block] &= lower_mask | upper_mask; 155 | 156 | if (shift > 0) { 157 | const res_shift = ~shift + 1; // =:= @bitSizeOf(Int) - shift; 158 | 159 | if (res_shift < self.width) { 160 | // res_shift in this example is 2 and thus width-res_shift = 3. 161 | // We then build the mask 11111000 by NOT-ing 00000111. 
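                // Concretely: with width=5 and shift=6 (so res_shift=2), the
                // three spilled value bits sit at the bottom of block 2, and
                // (1 << (5 - 2)) - 1 = 00000111 selects exactly those bits.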
162 | 
163 |                 self.data[block + 1] &= ~((@as(Int, 1) << (self.width - res_shift)) - 1);
164 |             }
165 |         }
166 |     }
167 | };
168 | 
169 | const testing = std.testing;
170 | 
171 | test "basic" {
172 |     const n = 100;
173 |     const width = 5;
174 |     const max_val = 30;
175 | 
176 |     var c = try Self.Mutable.init(testing.allocator, width, n);
177 |     defer c.deinit(testing.allocator);
178 | 
179 |     var i: usize = 0;
180 |     while (i < n) : (i += 1) {
181 |         const value = (i * i) % max_val;
182 |         c.setFromZero(i, value);
183 |     }
184 | 
185 |     i = 0;
186 |     while (i < n) : (i += 1) {
187 |         const value = (i * i) % max_val;
188 |         try testing.expectEqual(value, c.get(i));
189 |     }
190 | }
191 | 
192 | test "encode empty" {
193 |     var arr = try Self.encode(testing.allocator, &[_]u64{});
194 |     defer arr.deinit(testing.allocator);
195 | }
196 | 
197 | test "encode" {
198 |     const vals = [_]u64{ 5, 2, 9, 100, 0, 5, 10, 90, 9, 1, 65, 10 };
199 |     var arr = try Self.encode(testing.allocator, &vals);
200 |     defer arr.deinit(testing.allocator);
201 | 
202 |     // 100 needs 7 bits. There are 12 elements. These 84 bits fit in 2 u64s.
203 |     try testing.expectEqual(@as(usize, 2), arr.data.len);
204 | 
205 |     for (vals, 0..) |val, idx| {
206 |         try testing.expectEqual(val, arr.get(idx));
207 |     }
208 | }
209 | 
210 | test "encode #2" {
211 |     const vals = [_]u64{ 0, 0, 2, 0, 4, 0 };
212 |     var arr = try Self.encode(testing.allocator, &vals);
213 |     defer arr.deinit(testing.allocator);
214 | 
215 |     for (vals, 0..) |val, idx| {
216 |         try testing.expectEqual(val, arr.get(idx));
217 |     }
218 | }
219 | 
220 | test "encode #3" {
221 |     const vals = [_]u64{255} ** 64;
222 |     var arr = try Self.encode(testing.allocator, &vals);
223 |     defer arr.deinit(testing.allocator);
224 | 
225 |     for (vals, 0..) |val, idx| {
226 |         try testing.expectEqual(val, arr.get(idx));
227 |     }
228 | }
229 | 
230 | test "write and read" {
231 |     const vals = [_]u64{ 0, 0, 2, 0, 4, 0 };
232 |     var arr = try Self.encode(testing.allocator, &vals);
233 |     defer arr.deinit(testing.allocator);
234 | 
235 |     // ensure alignment
236 |     const buf = try testing.allocator.alignedAlloc(u8, std.mem.Alignment.of(u64), 100);
237 |     defer testing.allocator.free(buf);
238 | 
239 |     {
240 |         // Write
241 |         var fbs = std.io.fixedBufferStream(buf);
242 |         try arr.writeTo(fbs.writer());
243 |     }
244 | 
245 |     {
246 |         // Read
247 |         var fbs = std.io.fixedBufferStream(@as([]const u8, buf));
248 |         var arr2 = try Self.readFrom(&fbs);
249 | 
250 |         for (vals, 0..) |val, idx| {
251 |             try testing.expectEqual(val, arr2.get(idx));
252 |         }
253 |     }
254 | }
255 | 
--------------------------------------------------------------------------------
/src/DictArray.zig:
--------------------------------------------------------------------------------
1 | //! DictArray stores a list of integers by placing the unique items in a separate
2 | //! array and referring to indexes into that array.
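//!
//! Example (sketch): encoding `{ 7, 7, 9, 7 }` yields the dictionary `{ 7, 9 }`
//! and the index array `{ 0, 0, 1, 0 }`; `get(2)` then returns `dict[arr[2]] = 9`.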
3 | 4 | const std = @import("std"); 5 | const CompactArray = @import("./CompactArray.zig"); 6 | 7 | const Self = @This(); 8 | 9 | dict: CompactArray, 10 | arr: CompactArray, 11 | 12 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 13 | self.dict.deinit(allocator); 14 | self.arr.deinit(allocator); 15 | self.* = undefined; 16 | } 17 | 18 | pub fn bits(self: *const Self) usize { 19 | return self.dict.bits() + self.arr.bits(); 20 | } 21 | 22 | pub fn get(self: *const Self, idx: usize) u64 { 23 | return self.dict.get(self.arr.get(idx)); 24 | } 25 | 26 | pub fn encode(allocator: std.mem.Allocator, data: []const u64) !Self { 27 | var dict = std.ArrayList(u64).init(allocator); 28 | defer dict.deinit(); 29 | 30 | var arr = try std.ArrayList(u64).initCapacity(allocator, data.len); 31 | defer arr.deinit(); 32 | 33 | var mapping = std.hash_map.AutoHashMap(u64, usize).init(allocator); 34 | defer mapping.deinit(); 35 | 36 | for (data) |val| { 37 | const result = try mapping.getOrPut(val); 38 | if (!result.found_existing) { 39 | result.value_ptr.* = dict.items.len; 40 | try dict.append(val); 41 | } 42 | try arr.append(result.value_ptr.*); 43 | } 44 | 45 | return Self{ 46 | .dict = try CompactArray.encode(allocator, dict.items), 47 | .arr = try CompactArray.encode(allocator, arr.items), 48 | }; 49 | } 50 | 51 | pub fn writeTo(self: *const Self, w: anytype) !void { 52 | try self.dict.writeTo(w); 53 | try self.arr.writeTo(w); 54 | } 55 | 56 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 57 | const dict = try CompactArray.readFrom(stream); 58 | const arr = try CompactArray.readFrom(stream); 59 | return Self{ 60 | .dict = dict, 61 | .arr = arr, 62 | }; 63 | } 64 | -------------------------------------------------------------------------------- /src/EliasFano.zig: -------------------------------------------------------------------------------- 1 | //! EliasFano stores 64-bit _increasing_ numbers in a compact manner. 2 | 3 | const std = @import("std"); 4 | const DArray1 = @import("./darray.zig").DArray1; 5 | const CompactArray = @import("./CompactArray.zig"); 6 | const utils = @import("./utils.zig"); 7 | const DynamicBitSetUnmanaged = std.bit_set.DynamicBitSetUnmanaged; 8 | 9 | const Self = @This(); 10 | 11 | high_bits: DynamicBitSetUnmanaged, 12 | high_bits_select: DArray1, 13 | low_bits: CompactArray, 14 | 15 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 16 | self.high_bits.deinit(allocator); 17 | self.high_bits_select.deinit(allocator); 18 | self.low_bits.deinit(allocator); 19 | self.* = undefined; 20 | } 21 | 22 | pub fn encode(allocator: std.mem.Allocator, data: []const u64) !Self { 23 | const n = data.len; 24 | const u = data[data.len - 1]; 25 | 26 | const l = if (u > data.len) std.math.log2_int(u64, u / data.len) + 1 else 0; 27 | const l_mask = (@as(u64, 1) << l) - 1; 28 | const max_h = u >> l; 29 | 30 | // We need to store `2^h-1` zeroes and `n` ones. 31 | var high_bits = try DynamicBitSetUnmanaged.initEmpty(allocator, max_h + n); 32 | 33 | var low_bits = try CompactArray.Mutable.init(allocator, l, data.len); 34 | 35 | for (data, 0..) 
|num, idx| { 36 | if (l > 0) { 37 | low_bits.setFromZero(idx, num & l_mask); 38 | } 39 | high_bits.set((num >> l) + idx); 40 | } 41 | 42 | return Self{ 43 | .high_bits = high_bits, 44 | .high_bits_select = try DArray1.init(allocator, high_bits), 45 | .low_bits = low_bits.finalize(), 46 | }; 47 | } 48 | 49 | pub fn get(self: *const Self, idx: usize) u64 { 50 | const h_bits = self.high_bits_select.select(self.high_bits, idx) - idx; 51 | const l = self.low_bits.width; 52 | if (l == 0) return h_bits; 53 | 54 | const l_bits = self.low_bits.get(idx); 55 | return (h_bits << l) | l_bits; 56 | } 57 | 58 | pub fn bits(self: *const Self) u64 { 59 | // We're poking into the internals of DynamicBitSet here... 60 | const masks = self.high_bits.masks; 61 | const len = (masks - 1)[0]; 62 | return self.low_bits.bits() + self.high_bits_select.bits() + len * @bitSizeOf(usize); 63 | } 64 | 65 | pub fn bitsWithoutConstantAccess(self: *const Self) u64 { 66 | const masks = self.high_bits.masks; 67 | const len = (masks - 1)[0]; 68 | return self.low_bits.bits() + len * @bitSizeOf(usize); 69 | } 70 | 71 | pub fn writeTo(self: *const Self, w: anytype) !void { 72 | const masks = self.high_bits.masks; 73 | const len = (masks - 1)[0]; 74 | try utils.writeSlice(w, (masks - 1)[0..len]); 75 | try self.high_bits_select.writeTo(w); 76 | try self.low_bits.writeTo(w); 77 | } 78 | 79 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 80 | const mask_arr = try utils.readSlice(stream, usize); 81 | const high_bits = DynamicBitSetUnmanaged{ .masks = @constCast(mask_arr.ptr) + 1 }; 82 | const high_bits_select = try DArray1.readFrom(stream); 83 | const low_bits = try CompactArray.readFrom(stream); 84 | return Self{ 85 | .high_bits = high_bits, 86 | .high_bits_select = high_bits_select, 87 | .low_bits = low_bits, 88 | }; 89 | } 90 | 91 | const testing = std.testing; 92 | 93 | test "encode" { 94 | const seed = 0x0194f614c15227ba; 95 | var prng = std.Random.DefaultPrng.init(seed); 96 | const r = prng.random(); 97 | 98 | const n = 100000; 99 | 100 | var result = try std.ArrayList(u64).initCapacity(testing.allocator, n); 101 | defer result.deinit(); 102 | 103 | var i: usize = 0; 104 | var prev: u64 = 0; 105 | while (i < n) : (i += 1) { 106 | const num = prev + r.uintLessThan(u64, 50); 107 | result.appendAssumeCapacity(num); 108 | prev = num; 109 | } 110 | 111 | var ef = try encode(testing.allocator, result.items); 112 | defer ef.deinit(testing.allocator); 113 | 114 | // Check that it matches 115 | for (result.items, 0..) 
|num, idx| { 116 | try testing.expectEqual(num, ef.get(idx)); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/StringDict.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const builtin = @import("builtin"); 3 | 4 | const StringDict = @This(); 5 | 6 | const endian = builtin.cpu.arch.endian(); 7 | 8 | dict: []const u8, 9 | 10 | pub fn deinit(self: *StringDict, allocator: std.mem.Allocator) void { 11 | allocator.free(self.dict); 12 | self.* = undefined; 13 | } 14 | 15 | pub fn writeTo(self: *const StringDict, w: anytype) !void { 16 | try w.writeInt(u64, self.dict.len, endian); 17 | try w.writeAll(self.dict); 18 | } 19 | 20 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !StringDict { 21 | var r = stream.reader(); 22 | const len = try r.readInt(u64, endian); 23 | const dict = stream.buffer[stream.pos..][0..len]; 24 | stream.pos += len; 25 | return StringDict{ 26 | .dict = dict, 27 | }; 28 | } 29 | 30 | pub fn bits(self: *const StringDict) u64 { 31 | return self.dict.len * 8; 32 | } 33 | 34 | pub fn get(self: *const StringDict, idx: u64) []const u8 { 35 | const len = self.dict[idx]; 36 | return self.dict[idx + 1 ..][0..len]; 37 | } 38 | 39 | pub const Builder = struct { 40 | dict_values: std.ArrayList(u8), 41 | dict_positions: std.StringHashMap(usize), 42 | 43 | pub fn init(allocator: std.mem.Allocator) !Builder { 44 | return Builder{ 45 | .dict_values = std.ArrayList(u8).init(allocator), 46 | .dict_positions = std.StringHashMap(usize).init(allocator), 47 | }; 48 | } 49 | 50 | pub fn deinit(self: *Builder) void { 51 | self.dict_values.deinit(); 52 | self.dict_positions.deinit(); 53 | self.* = undefined; 54 | } 55 | 56 | pub fn intern(self: *Builder, key: []const u8) !u64 { 57 | const result = try self.dict_positions.getOrPut(key); 58 | if (!result.found_existing) { 59 | result.value_ptr.* = self.dict_values.items.len; 60 | try self.dict_values.append(@intCast(key.len)); 61 | for (key) |byte| { 62 | try self.dict_values.append(byte); 63 | } 64 | } 65 | return result.value_ptr.*; 66 | } 67 | 68 | pub fn build(self: *Builder) !StringDict { 69 | return StringDict{ 70 | .dict = try self.dict_values.toOwnedSlice(), 71 | }; 72 | } 73 | }; 74 | -------------------------------------------------------------------------------- /src/darray.zig: -------------------------------------------------------------------------------- 1 | //! Implements the "darray" data structure which provides constant-time 2 | //! select(i) operation for _dense_ bit sets. Roughly half of the items 3 | //! should be set for this to be practical. 4 | //! 5 | //! See "Practical Entropy-Compressed Rank/Select Dictionary" by Daisuke Okanohara and Kunihiko Sadakane. 6 | //! 7 | //! The code is heavily based on https://github.com/jermp/pthash/blob/master/include/encoders/darray.hpp. 
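//!
//! Layout sketch: the positions of the set bits are grouped into blocks of
//! 1024; each block records its first position, plus a 16-bit offset for every
//! 32nd position within it. Blocks whose positions span 2^16 bits or more fall
//! back to storing every position explicitly in an overflow array.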
8 | 9 | const std = @import("std"); 10 | const utils = @import("./utils.zig"); 11 | 12 | const BitSet = std.bit_set.DynamicBitSet; 13 | 14 | pub fn DArray(comptime val: bool) type { 15 | return struct { 16 | const Self = @This(); 17 | 18 | const block_size: usize = 1024; 19 | const subblock_size: usize = 32; 20 | const max_in_block_distance: usize = 1 << 16; 21 | 22 | const BlockPosition = packed struct { 23 | is_overflow: bool, 24 | pos: u63, 25 | }; 26 | 27 | block_inventory: []BlockPosition, 28 | subblock_inventory: []u16, 29 | overflow_positions: []u64, 30 | 31 | pub fn init(allocator: std.mem.Allocator, bit_set: std.bit_set.DynamicBitSetUnmanaged) !Self { 32 | var cur_block_positions = std.ArrayListUnmanaged(u63){}; 33 | defer cur_block_positions.deinit(allocator); 34 | 35 | var block_inventory = std.ArrayListUnmanaged(BlockPosition){}; 36 | defer block_inventory.deinit(allocator); 37 | 38 | var subblock_inventory = std.ArrayListUnmanaged(u16){}; 39 | defer subblock_inventory.deinit(allocator); 40 | 41 | var overflow_positions = std.ArrayListUnmanaged(u64){}; 42 | defer overflow_positions.deinit(allocator); 43 | 44 | try cur_block_positions.ensureTotalCapacity(allocator, block_size); 45 | 46 | var iter = bit_set.iterator(.{ .kind = if (val) .set else .unset }); 47 | while (iter.next()) |pos| { 48 | cur_block_positions.appendAssumeCapacity(@intCast(pos)); 49 | if (cur_block_positions.items.len == block_size) { 50 | try flushCurBlock(allocator, &cur_block_positions, &block_inventory, &subblock_inventory, &overflow_positions); 51 | } 52 | } 53 | 54 | if (cur_block_positions.items.len > 0) { 55 | try flushCurBlock(allocator, &cur_block_positions, &block_inventory, &subblock_inventory, &overflow_positions); 56 | } 57 | 58 | return Self{ 59 | .block_inventory = try block_inventory.toOwnedSlice(allocator), 60 | .subblock_inventory = try subblock_inventory.toOwnedSlice(allocator), 61 | .overflow_positions = try overflow_positions.toOwnedSlice(allocator), 62 | }; 63 | } 64 | 65 | // Reads a word, flipping all bits if we're in select0-mode. 66 | fn readWord(bit_set: std.bit_set.DynamicBitSetUnmanaged, idx: usize) u64 { 67 | var word = bit_set.masks[idx]; 68 | if (!val) { 69 | word = ~word; 70 | } 71 | return word; 72 | } 73 | 74 | fn flushCurBlock( 75 | allocator: std.mem.Allocator, 76 | cur_block_positions: *std.ArrayListUnmanaged(u63), 77 | block_inventory: *std.ArrayListUnmanaged(BlockPosition), 78 | subblock_inventory: *std.ArrayListUnmanaged(u16), 79 | overflow_positions: *std.ArrayListUnmanaged(u64), 80 | ) !void { 81 | const fst = cur_block_positions.items[0]; 82 | const lst = cur_block_positions.items[cur_block_positions.items.len - 1]; 83 | if (lst - fst < max_in_block_distance) { 84 | try block_inventory.append(allocator, BlockPosition{ .is_overflow = false, .pos = fst }); 85 | var i: usize = 0; 86 | while (i < cur_block_positions.items.len) : (i += subblock_size) { 87 | try subblock_inventory.append(allocator, @intCast(cur_block_positions.items[i] - fst)); 88 | } 89 | } else { 90 | const overflow_pos = overflow_positions.items.len; 91 | try block_inventory.append(allocator, BlockPosition{ .is_overflow = true, .pos = @intCast(overflow_pos) }); 92 | for (cur_block_positions.items) |pos| { 93 | try overflow_positions.append(allocator, pos); 94 | } 95 | var i: usize = 0; 96 | while (i < cur_block_positions.items.len) : (i += subblock_size) { 97 | // This value isn't used, but we need to fill up the subblock. 
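                    // (For overflow blocks, select() reads overflow_positions
                    // directly, so these entries only keep the subblock
                    // indexing aligned.)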
98 |                     try subblock_inventory.append(allocator, 0);
99 |                 }
100 |             }
101 |             cur_block_positions.clearRetainingCapacity();
102 |         }
103 | 
104 |         pub fn deinit(self: *Self, allocator: std.mem.Allocator) void {
105 |             allocator.free(self.block_inventory);
106 |             allocator.free(self.subblock_inventory);
107 |             allocator.free(self.overflow_positions);
108 |             self.* = undefined;
109 |         }
110 | 
111 |         /// Returns the position of the `idx`-th set bit in the bit set.
112 |         pub fn select(self: *const Self, bit_set: std.bit_set.DynamicBitSetUnmanaged, idx: usize) usize {
113 |             const block = idx / block_size;
114 |             const block_pos = self.block_inventory[block];
115 | 
116 |             if (block_pos.is_overflow) {
117 |                 return self.overflow_positions[block_pos.pos + (idx % block_size)];
118 |             }
119 | 
120 |             const subblock = idx / subblock_size;
121 |             const start_pos = block_pos.pos + self.subblock_inventory[subblock];
122 |             var remainder = idx % subblock_size;
123 |             if (remainder == 0) return start_pos;
124 | 
125 |             // Note: These assume the BitSet uses u64.
126 |             var word_idx = start_pos >> 6;
127 |             const word_shift: u6 = @intCast(start_pos & 63);
128 | 
129 |             var word = readWord(bit_set, word_idx);
130 |             word &= @as(u64, @bitCast(@as(i64, -1))) << word_shift;
131 | 
132 |             while (true) {
133 |                 const popcount = @popCount(word);
134 |                 if (remainder < popcount) break;
135 |                 remainder -= popcount;
136 |                 word_idx += 1;
137 |                 word = readWord(bit_set, word_idx);
138 |             }
139 | 
140 |             // TODO: this is probably not the best select_in_word algorithm
141 | 
142 |             var word_pos: usize = 0;
143 | 
144 |             while (true) {
145 |                 if (word & 1 == 1) {
146 |                     if (remainder == 0) break;
147 |                     remainder -= 1;
148 |                 }
149 |                 word_pos += 1;
150 |                 word >>= 1;
151 |             }
152 | 
153 |             return (word_idx << 6) + word_pos;
154 |         }
155 | 
156 |         pub fn bits(self: *const Self) u64 {
157 |             return utils.bitSizeOfSlice(self.block_inventory) +
158 |                 utils.bitSizeOfSlice(self.subblock_inventory) +
159 |                 utils.bitSizeOfSlice(self.overflow_positions);
160 |         }
161 | 
162 |         pub fn writeTo(self: *const Self, w: anytype) !void {
163 |             try utils.writeSlice(w, self.block_inventory);
164 |             try utils.writeSlice(w, self.subblock_inventory);
165 |             try utils.writeSlice(w, self.overflow_positions);
166 |         }
167 | 
168 |         pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self {
169 |             const block_inventory = try utils.readSlice(stream, BlockPosition);
170 |             const subblock_inventory = try utils.readSlice(stream, u16);
171 |             const overflow_positions = try utils.readSlice(stream, u64);
172 |             return Self{
173 |                 .block_inventory = @constCast(block_inventory),
174 |                 .subblock_inventory = @constCast(subblock_inventory),
175 |                 .overflow_positions = @constCast(overflow_positions),
176 |             };
177 |         }
178 |     };
179 | }
180 | 
181 | /// Provides select_1 support.
182 | pub const DArray1 = DArray(true);
183 | 
184 | /// Provides select_0 support.
185 | pub const DArray0 = DArray(false);
186 | 
187 | const testing = std.testing;
188 | 
189 | fn testBitSet(
190 |     bit_set: *std.DynamicBitSet,
191 |     positions: []usize,
192 | ) !void {
193 |     var darr1 = try DArray1.init(testing.allocator, bit_set.unmanaged);
194 |     defer darr1.deinit(testing.allocator);
195 | 
196 |     for (positions, 0..) |pos, idx| {
197 |         try testing.expectEqual(pos, darr1.select(bit_set.unmanaged, idx));
198 |     }
199 | 
200 |     // Now flip it and test select0(i):
201 |     bit_set.toggleAll();
202 | 
203 |     var darr0 = try DArray0.init(testing.allocator, bit_set.unmanaged);
204 |     defer darr0.deinit(testing.allocator);
205 | 
206 |     for (positions, 0..) |pos, idx| {
207 |         try testing.expectEqual(pos, darr0.select(bit_set.unmanaged, idx));
208 |     }
209 | }
210 | 
211 | test "dense" {
212 |     const seed = 0x0194f614c15227ba;
213 |     var prng = std.Random.DefaultPrng.init(seed);
214 |     const r = prng.random();
215 | 
216 |     var result = std.ArrayList(usize).init(testing.allocator);
217 |     defer result.deinit();
218 | 
219 |     const n = 10000;
220 | 
221 |     var bit_set = try std.DynamicBitSet.initEmpty(testing.allocator, n);
222 |     defer bit_set.deinit();
223 | 
224 |     var idx: usize = 0;
225 |     while (idx < n) : (idx += 1) {
226 |         if (r.boolean()) {
227 |             try result.append(idx);
228 |             bit_set.set(idx);
229 |         }
230 |     }
231 | 
232 |     try testBitSet(&bit_set, result.items);
233 | }
234 | 
235 | test "sparse" {
236 |     const seed = 0x0194f614c15227ba;
237 |     var prng = std.Random.DefaultPrng.init(seed);
238 |     const r = prng.random();
239 | 
240 |     var result = std.ArrayList(usize).init(testing.allocator);
241 |     defer result.deinit();
242 | 
243 |     const n = 100000;
244 | 
245 |     var bit_set = try std.DynamicBitSet.initEmpty(testing.allocator, n);
246 |     defer bit_set.deinit();
247 | 
248 |     var idx: usize = 0;
249 |     while (idx < n) : (idx += 1) {
250 |         if (r.uintLessThan(u64, 100) == 0) {
251 |             try result.append(idx);
252 |             bit_set.set(idx);
253 |         }
254 |     }
255 | 
256 |     try testBitSet(&bit_set, result.items);
257 | }
258 | 
--------------------------------------------------------------------------------
/src/main.zig:
--------------------------------------------------------------------------------
1 | const std = @import("std");
2 | 
3 | pub const pthash = @import("./pthash.zig");
4 | pub const CompactArray = @import("./CompactArray.zig");
5 | pub const DictArray = @import("./DictArray.zig");
6 | pub const darray = @import("./darray.zig");
7 | pub const EliasFano = @import("./EliasFano.zig");
8 | pub const ribbon = @import("./ribbon.zig");
9 | pub const StringDict = @import("./StringDict.zig");
10 | 
11 | comptime {
12 |     std.testing.refAllDecls(@This());
13 | }
--------------------------------------------------------------------------------
/src/pthash.zig:
--------------------------------------------------------------------------------
1 | //! This module implements "PTHash: Revisiting FCH Minimal Perfect Hashing" by
2 | //! Giulio Ermanno Pibiri, Roberto Trani, arXiv:2104.10402, https://arxiv.org/abs/2104.10402.
3 | 
4 | const std = @import("std");
5 | const builtin = @import("builtin");
6 | const Wyhash = std.hash.Wyhash;
7 | 
8 | const CompactArray = @import("./CompactArray.zig");
9 | const EliasFano = @import("./EliasFano.zig");
10 | const utils = @import("./utils.zig");
11 | const FreeSlotEncoding = EliasFano;
12 | 
13 | const endian = builtin.cpu.arch.endian();
14 | 
15 | /// The bucketer takes a hash and places it into a bucket in an uneven fashion:
16 | /// Roughly 60% of the keys are mapped to 30% of the buckets. In addition,
17 | /// it's initialized with a `c` parameter which represents the expected number
18 | /// of bits per element required to encode the pivots created by PTHash.
19 | const Bucketer = struct {
20 |     n: usize,
21 |     m: usize,
22 |     p1: usize,
23 |     p2: usize,
24 | 
25 |     /// Creates a new bucketer for `n` items with a given `c` parameter.
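    ///
    /// Example (sketch): with n = 1000 and c = 7 this gives
    /// m = 7 * 1000 / 10 = 700 buckets, p1 = 600 (60% of n) and
    /// p2 = 210 (30% of m).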
26 | pub fn init(n: usize, c: usize) Bucketer { 27 | const m = c * n / (std.math.log2_int(usize, n) + 1); 28 | const p1: usize = @intFromFloat(0.6 * @as(f64, @floatFromInt(n))); 29 | const p2: usize = @intFromFloat(0.3 * @as(f64, @floatFromInt(m))); 30 | 31 | return Bucketer{ 32 | .n = n, 33 | .m = m, 34 | .p1 = p1, 35 | .p2 = p2, 36 | }; 37 | } 38 | 39 | /// Returns the bucket for a hash. 40 | pub fn getBucket(self: Bucketer, hash: u64) u64 { 41 | if (hash % self.n < self.p1) { 42 | return hash % self.p2; 43 | } else { 44 | return self.p2 + (hash % (self.m - self.p2)); 45 | } 46 | } 47 | 48 | pub fn writeTo(self: *const Bucketer, w: anytype) !void { 49 | try w.writeInt(u64, self.n, endian); 50 | try w.writeInt(u64, self.m, endian); 51 | try w.writeInt(u64, self.p1, endian); 52 | try w.writeInt(u64, self.p2, endian); 53 | } 54 | 55 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Bucketer { 56 | var r = stream.reader(); 57 | const n = try r.readInt(u64, endian); 58 | const m = try r.readInt(u64, endian); 59 | const p1 = try r.readInt(u64, endian); 60 | const p2 = try r.readInt(u64, endian); 61 | return Bucketer{ 62 | .n = n, 63 | .m = m, 64 | .p1 = p1, 65 | .p2 = p2, 66 | }; 67 | } 68 | }; 69 | 70 | /// Information about the hash + bucket for a key. We compute this once and re-use it. 71 | const HashedKey = struct { 72 | hash: u64, 73 | bucket: u64, 74 | 75 | fn lessThan(_: void, lhs: HashedKey, rhs: HashedKey) bool { 76 | if (lhs.bucket == rhs.bucket) return lhs.hash < rhs.hash; 77 | return lhs.bucket < rhs.bucket; 78 | } 79 | }; 80 | 81 | /// The bucket summary contains information about a single bucket for a slice of hashed keys. 82 | /// The slice should be sorted by bucket. 83 | const BucketSummary = struct { 84 | idx: usize, 85 | entry_start: usize, 86 | entry_end: usize, 87 | 88 | fn count(self: BucketSummary) usize { 89 | return self.entry_end - self.entry_start; 90 | } 91 | 92 | fn lessThan(_: void, a: BucketSummary, b: BucketSummary) bool { 93 | const a_count = a.count(); 94 | const b_count = b.count(); 95 | if (a_count == b_count) return a.idx < b.idx; 96 | return b_count < a_count; 97 | } 98 | }; 99 | 100 | pub const Params = struct { 101 | c: usize, 102 | alpha: f64 = 1, 103 | }; 104 | 105 | // Number of different seeds we try before we give up. 106 | const MAX_ATTEMPTS = 1000; 107 | 108 | /// A minimal perfect hash function for a given type and a hash function. 
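///
/// Example (a minimal sketch; see the "building" test below for a full run):
///
///     const Hash = AutoHashFn(u64, CompactArray);
///     var h = try Hash.build(allocator, &keys, .{ .c = 7, .alpha = 0.8 }, null);
///     defer h.deinit(allocator);
///     const idx = h.get(keys[0]); // a distinct index in 0..keys.len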
109 | pub fn HashFn( 110 | comptime Key: type, 111 | comptime hasher: fn (seed: u64, Key: Key) u64, 112 | comptime Encoding: type, 113 | ) type { 114 | return struct { 115 | const Self = @This(); 116 | 117 | n: usize, 118 | seed: u64, 119 | bucketer: Bucketer, 120 | free_slots: FreeSlotEncoding, 121 | pivots: Encoding, 122 | 123 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 124 | self.pivots.deinit(allocator); 125 | self.free_slots.deinit(allocator); 126 | self.* = undefined; 127 | } 128 | 129 | pub fn get(self: *const Self, key: Key) u64 { 130 | const hash = hasher(self.seed, key); 131 | const bucket = self.bucketer.getBucket(hash); 132 | const pivot = self.pivots.get(bucket); 133 | const bucket_hash = Wyhash.hash(self.seed, std.mem.asBytes(&pivot)); 134 | const full_hash = Wyhash.hash(bucket_hash, std.mem.asBytes(&hash)); 135 | const pos = full_hash % self.bucketer.n; 136 | if (pos < self.n) { 137 | return pos; 138 | } else { 139 | return self.free_slots.get(pos - self.n); 140 | } 141 | } 142 | 143 | pub fn bits(self: *const Self) usize { 144 | return self.pivots.bits() + self.free_slots.bits(); 145 | } 146 | 147 | pub fn build( 148 | allocator: std.mem.Allocator, 149 | keys: []const Key, 150 | params: Params, 151 | seed: ?u64, 152 | ) !Self { 153 | if (seed) |s| { 154 | return buildUsingSeed(allocator, keys, params, s); 155 | } else { 156 | return buildUsingRandomSeed(allocator, keys, params, MAX_ATTEMPTS); 157 | } 158 | } 159 | 160 | pub fn buildUsingRandomSeed( 161 | allocator: std.mem.Allocator, 162 | keys: []const Key, 163 | params: Params, 164 | max_attempts: usize, 165 | ) !Self { 166 | var seed: u64 = undefined; 167 | 168 | var attempts: usize = 0; 169 | while (attempts < max_attempts) : (attempts += 1) { 170 | try std.posix.getrandom(std.mem.asBytes(&seed)); 171 | 172 | return buildUsingSeed(allocator, keys, params, seed) catch |err| switch (err) { 173 | error.HashCollision => continue, 174 | else => err, 175 | }; 176 | } 177 | 178 | return error.HashCollision; 179 | } 180 | 181 | pub fn buildUsingSeed( 182 | allocator: std.mem.Allocator, 183 | keys: []const Key, 184 | params: Params, 185 | seed: u64, 186 | ) !Self { 187 | std.debug.assert(params.alpha <= 1); 188 | const n_prime: usize = @intFromFloat(@as(f64, @floatFromInt(keys.len)) / params.alpha); 189 | const bucketer = Bucketer.init(n_prime, params.c); 190 | 191 | // Step 1: Hash all the inputs and figure out which bucket they belong to. 192 | 193 | var entries = try allocator.alloc(HashedKey, keys.len); 194 | defer allocator.free(entries); 195 | 196 | for (keys, 0..) |key, idx| { 197 | const hash = hasher(seed, key); 198 | const bucket = bucketer.getBucket(hash); 199 | entries[idx] = HashedKey{ .hash = hash, .bucket = bucket }; 200 | } 201 | 202 | std.mem.sort(HashedKey, entries, {}, HashedKey.lessThan); 203 | 204 | // Step 2: Group the entries into buckets ordered by size. 
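            // (Buckets are sorted largest-first: the big buckets are placed
            // while the table is still mostly empty, when pivots are cheapest
            // to find.)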
205 | 206 | var bucket_summaries = try std.ArrayList(BucketSummary).initCapacity(allocator, bucketer.m); 207 | defer bucket_summaries.deinit(); 208 | 209 | var bucket_start: usize = 0; 210 | var bucket_idx: usize = 0; 211 | var i: usize = 1; 212 | while (i < entries.len + 1) : (i += 1) { 213 | const at_boundary = (i == entries.len) or (entries[i - 1].bucket != entries[i].bucket); 214 | if (at_boundary) { 215 | bucket_summaries.appendAssumeCapacity(BucketSummary{ 216 | .idx = entries[i - 1].bucket, 217 | .entry_start = bucket_start, 218 | .entry_end = i, 219 | }); 220 | bucket_idx += 1; 221 | bucket_start = i; 222 | } else { 223 | if (entries[i - 1].hash == entries[i].hash) return error.HashCollision; 224 | } 225 | } 226 | 227 | std.mem.sort(BucketSummary, bucket_summaries.items, {}, BucketSummary.lessThan); 228 | 229 | // Step 3: Determine pivots 230 | 231 | var taken = try std.bit_set.DynamicBitSet.initEmpty(allocator, bucketer.n); 232 | defer taken.deinit(); 233 | 234 | var attempted_taken = try std.bit_set.DynamicBitSet.initEmpty(allocator, bucketer.n); 235 | defer attempted_taken.deinit(); 236 | 237 | var pivots = try allocator.alloc(u64, bucketer.m); 238 | defer allocator.free(pivots); 239 | 240 | @memset(pivots, 0); 241 | 242 | for (bucket_summaries.items) |b| { 243 | var pivot: u64 = 0; 244 | find_pivot: while (true) : (pivot += 1) { 245 | // Reset attempted_taken 246 | attempted_taken.setRangeValue(.{ .start = 0, .end = attempted_taken.capacity() }, false); 247 | 248 | for (entries[b.entry_start..b.entry_end]) |entry| { 249 | const bucket_hash = Wyhash.hash(seed, std.mem.asBytes(&pivot)); 250 | const full_hash = Wyhash.hash(bucket_hash, std.mem.asBytes(&entry.hash)); 251 | const pos = full_hash % bucketer.n; 252 | 253 | const is_taken_earlier_bucket = taken.isSet(pos); 254 | const is_taken_same_bucket = attempted_taken.isSet(pos); 255 | 256 | if (is_taken_earlier_bucket or is_taken_same_bucket) { 257 | continue :find_pivot; 258 | } 259 | 260 | attempted_taken.set(pos); 261 | } 262 | 263 | pivots[b.idx] = pivot; 264 | 265 | taken.setUnion(attempted_taken); 266 | break; 267 | } 268 | } 269 | 270 | const encoded_pivots = try Encoding.encode(allocator, pivots); 271 | 272 | var free_slots = try allocator.alloc(u64, bucketer.n - keys.len); 273 | defer allocator.free(free_slots); 274 | 275 | var iter = taken.iterator(.{ .kind = .unset }); 276 | 277 | var prev_free_value: usize = 0; 278 | var free_idx: usize = 0; 279 | while (free_idx < free_slots.len) : (free_idx += 1) { 280 | if (taken.isSet(keys.len + free_idx)) { 281 | free_slots[free_idx] = iter.next().?; 282 | prev_free_value = free_slots[free_idx]; 283 | } else { 284 | // This value can be anything. We keep it incremental. 
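                    // (Keeping the sequence non-decreasing is what lets
                    // EliasFano encode it compactly below.)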
285 | free_slots[free_idx] = prev_free_value; 286 | } 287 | } 288 | 289 | const encoded_free_slots = try FreeSlotEncoding.encode(allocator, free_slots); 290 | 291 | return Self{ 292 | .bucketer = bucketer, 293 | .n = keys.len, 294 | .free_slots = encoded_free_slots, 295 | .seed = seed, 296 | .pivots = encoded_pivots, 297 | }; 298 | } 299 | 300 | pub fn writeTo(self: *const Self, w: anytype) !void { 301 | try w.writeInt(u64, self.n, endian); 302 | try w.writeInt(u64, self.seed, endian); 303 | try self.bucketer.writeTo(w); 304 | try self.free_slots.writeTo(w); 305 | try self.pivots.writeTo(w); 306 | } 307 | 308 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 309 | var r = stream.reader(); 310 | const n = try r.readInt(u64, endian); 311 | const seed = try r.readInt(u64, endian); 312 | const bucketer = try Bucketer.readFrom(stream); 313 | const free_slots = try FreeSlotEncoding.readFrom(stream); 314 | const pivots = try Encoding.readFrom(stream); 315 | return Self{ 316 | .n = n, 317 | .seed = seed, 318 | .bucketer = bucketer, 319 | .free_slots = free_slots, 320 | .pivots = pivots, 321 | }; 322 | } 323 | }; 324 | } 325 | 326 | pub fn AutoHashFn( 327 | comptime Key: type, 328 | comptime Encoding: type, 329 | ) type { 330 | return HashFn(Key, utils.autoHash(Key), Encoding); 331 | } 332 | 333 | pub fn BytesHashFn(comptime Encoding: type) type { 334 | return HashFn([]const u8, Wyhash.hash, Encoding); 335 | } 336 | 337 | const testing = std.testing; 338 | 339 | test "basic bucketing" { 340 | const b = Bucketer.init(100, 7); 341 | try testing.expectEqual(@as(u64, 0), b.getBucket(0)); 342 | } 343 | 344 | test "building" { 345 | var data: [256]u64 = undefined; 346 | 347 | var i: usize = 0; 348 | while (i < data.len) : (i += 1) { 349 | data[i] = i * i; 350 | } 351 | 352 | var h = try AutoHashFn(u64, CompactArray).buildUsingRandomSeed(testing.allocator, &data, .{ .c = 7, .alpha = 0.80 }, 10); 353 | defer h.deinit(testing.allocator); 354 | 355 | var seen = std.hash_map.AutoHashMap(u64, usize).init(testing.allocator); 356 | defer seen.deinit(); 357 | 358 | for (data, 0..) |val, idx| { 359 | const out = h.get(val); 360 | try testing.expect(out < data.len); 361 | 362 | if (try seen.fetchPut(out, idx)) |other_entry| { 363 | std.debug.print("collision between idx={} and {}\n", .{ other_entry.value, idx }); 364 | return error.TestCollision; 365 | } 366 | } 367 | } 368 | 369 | test "collision detection" { 370 | var data: [2]u64 = .{ 5, 5 }; 371 | var h_result = AutoHashFn(u64, CompactArray).buildUsingRandomSeed(testing.allocator, &data, .{ .c = 7 }, 10); 372 | if (h_result) |*h| h.deinit(testing.allocator) else |_| {} 373 | 374 | try testing.expectError(error.HashCollision, h_result); 375 | } 376 | -------------------------------------------------------------------------------- /src/ribbon.zig: -------------------------------------------------------------------------------- 1 | //! This file implements the ideas from "Fast Succinct Retrieval and Approximate Membership using Ribbon". 
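//!
//! Sketch of the idea: each key is hashed to a starting column `i` and a w-bit
//! coefficient row `c` (with the lowest bit forced to 1). Inserting all keys
//! builds a banded linear system over GF(2); back-substitution then yields a
//! table where XOR-ing the rows selected by `c`, starting at `i`, reproduces
//! each key's r-bit value.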
2 | 3 | const std = @import("std"); 4 | const builtin = @import("builtin"); 5 | const DynamicBitSetUnmanaged = std.bit_set.DynamicBitSetUnmanaged; 6 | const CompactArray = @import("./CompactArray.zig"); 7 | const utils = @import("./utils.zig"); 8 | 9 | const endian = builtin.cpu.arch.endian(); 10 | 11 | fn bitParity(num: u64) u64 { 12 | return @popCount(num) % 2; 13 | } 14 | 15 | const RibbonTable = struct { 16 | const Self = @This(); 17 | 18 | n: usize, 19 | data: CompactArray, 20 | 21 | pub fn init(n: usize, data: CompactArray) Self { 22 | return Self{ 23 | .n = n, 24 | .data = data, 25 | }; 26 | } 27 | 28 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 29 | self.data.deinit(allocator); 30 | self.* = undefined; 31 | } 32 | 33 | pub fn lookup(self: Self, i: u64, c: u64) u64 { 34 | std.debug.assert((c & 1) == 1); 35 | 36 | var i_ = i; 37 | var c_ = c; 38 | var result: u64 = 0; 39 | 40 | while (true) { 41 | result ^= self.data.get(i_); 42 | 43 | c_ >>= 1; 44 | i_ += 1; 45 | if (c_ == 0) break; 46 | 47 | const j: u6 = @intCast(@ctz(c_)); 48 | i_ += j; 49 | c_ >>= j; 50 | } 51 | return result; 52 | } 53 | 54 | pub fn bits(self: *const Self) u64 { 55 | return self.data.bits(); 56 | } 57 | 58 | pub fn writeTo(self: *const Self, w: anytype) !void { 59 | try w.writeInt(u64, self.n, endian); 60 | try self.data.writeTo(w); 61 | } 62 | 63 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 64 | var r = stream.reader(); 65 | const n = try r.readInt(u64, endian); 66 | const data = try CompactArray.readFrom(stream); 67 | return Self{ .n = n, .data = data }; 68 | } 69 | }; 70 | 71 | pub const RibbonBandingSystem = struct { 72 | const Self = @This(); 73 | 74 | const Array = CompactArray.Mutable; 75 | 76 | n: usize, 77 | c: Array, 78 | b: Array, 79 | 80 | pub fn init(allocator: std.mem.Allocator, n: usize, r: u6, w: u6) !Self { 81 | var c = try Array.init(allocator, w, n); 82 | errdefer c.deinit(allocator); 83 | 84 | var b = try Array.init(allocator, r, n); 85 | errdefer b.deinit(allocator); 86 | 87 | return Self{ .n = n, .c = c, .b = b }; 88 | } 89 | 90 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 91 | self.c.deinit(allocator); 92 | self.b.deinit(allocator); 93 | self.* = undefined; 94 | } 95 | 96 | pub fn getBandWidth(self: Self) u6 { 97 | return self.c.width; 98 | } 99 | 100 | pub fn getValueSize(self: Self) u6 { 101 | return self.b.width; 102 | } 103 | 104 | pub const InsertResult = union(enum) { 105 | success: usize, 106 | redunant: void, 107 | failure: void, 108 | }; 109 | 110 | pub fn insertRow(self: *Self, i: usize, c: u64, b: u64) InsertResult { 111 | std.debug.assert(b >> self.getValueSize() == 0); 112 | std.debug.assert(c >> self.getBandWidth() == 0); 113 | std.debug.assert((c & 1) == 1); 114 | 115 | var i_ = i; 116 | var c_ = c; 117 | var b_ = b; 118 | 119 | while (true) { 120 | if (self.c.get(i_) == 0) { 121 | self.c.setFromZero(i_, c_); 122 | self.b.setFromZero(i_, b_); 123 | return .{ .success = i_ }; 124 | } 125 | 126 | c_ = c_ ^ self.c.get(i_); 127 | b_ = b_ ^ self.b.get(i_); 128 | 129 | if (c_ == 0) { 130 | if (b_ == 0) { 131 | return .redunant; 132 | } else { 133 | return .failure; 134 | } 135 | } 136 | 137 | const j: u6 = @intCast(@ctz(c_)); 138 | c_ >>= j; 139 | i_ += j; 140 | } 141 | } 142 | 143 | pub fn clearRow(self: *Self, i: usize) void { 144 | self.c.setToZero(i); 145 | self.b.setToZero(i); 146 | } 147 | 148 | pub fn build(self: Self, allocator: std.mem.Allocator) !RibbonTable { 149 | const r = self.getValueSize(); 150 
| 151 | var data = try CompactArray.Mutable.init(allocator, r, self.n); 152 | errdefer data.deinit(allocator); 153 | 154 | var state = try allocator.alloc(u64, r); 155 | defer allocator.free(state); 156 | @memset(state, 0); 157 | 158 | // This logic is taken from https://github.com/lorenzhs/BuRR/blob/1c62832ad7d6eab5b337f386955868c3ce9a54ea/backsubst.hpp#L46 159 | // and I honestly don't quite understand how it works. 160 | 161 | var i = self.n; 162 | while (i > 0) { 163 | i -= 1; 164 | 165 | const c = self.c.get(i); 166 | const b = self.b.get(i); 167 | var resultRow: u64 = 0; 168 | 169 | var j: u6 = 0; 170 | while (j < r) : (j += 1) { 171 | var tmp = state[j] << 1; 172 | const bit = bitParity(tmp & c) ^ ((b >> j) & 1); 173 | tmp |= bit; 174 | state[j] = tmp; 175 | resultRow |= (bit << j); 176 | } 177 | 178 | data.setFromZero(i, resultRow); 179 | } 180 | 181 | return RibbonTable.init(self.n, data.finalize()); 182 | } 183 | }; 184 | 185 | const BumpedLayer = struct { 186 | bucket_size: usize, 187 | upper_threshold: usize, 188 | lower_threshold: usize, 189 | thresholds: CompactArray, 190 | table: RibbonTable, 191 | 192 | pub fn deinit(self: *BumpedLayer, allocator: std.mem.Allocator) void { 193 | self.table.deinit(allocator); 194 | self.thresholds.deinit(allocator); 195 | } 196 | 197 | pub fn lookup(self: BumpedLayer, i: u64, c: u64) ?u64 { 198 | if (self.isBumped(i)) { 199 | return null; 200 | } else { 201 | return self.table.lookup(i, c); 202 | } 203 | } 204 | 205 | fn isBumped(self: BumpedLayer, i: u64) bool { 206 | const bucket_idx = i / self.bucket_size; 207 | const bucket_offset = i % self.bucket_size; 208 | const threshold = self.thresholds.get(bucket_idx); 209 | const threshold_values = [4]usize{ 0, self.lower_threshold, self.upper_threshold, self.bucket_size }; 210 | return bucket_offset < threshold_values[threshold]; 211 | } 212 | 213 | pub fn bits(self: BumpedLayer) usize { 214 | return self.table.bits() + self.thresholds.bits(); 215 | } 216 | 217 | pub fn writeTo(self: *const BumpedLayer, w: anytype) !void { 218 | try w.writeInt(u64, self.bucket_size, endian); 219 | try w.writeInt(u64, self.upper_threshold, endian); 220 | try w.writeInt(u64, self.lower_threshold, endian); 221 | try self.thresholds.writeTo(w); 222 | try self.table.writeTo(w); 223 | } 224 | 225 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !BumpedLayer { 226 | var r = stream.reader(); 227 | const bucket_size = try r.readInt(u64, endian); 228 | const upper_threshold = try r.readInt(u64, endian); 229 | const lower_threshold = try r.readInt(u64, endian); 230 | const thresholds = try CompactArray.readFrom(stream); 231 | const table = try RibbonTable.readFrom(stream); 232 | 233 | return BumpedLayer{ 234 | .bucket_size = bucket_size, 235 | .upper_threshold = upper_threshold, 236 | .lower_threshold = lower_threshold, 237 | .thresholds = thresholds, 238 | .table = table, 239 | }; 240 | } 241 | }; 242 | 243 | const BumpedLayerBuilder = struct { 244 | const Self = @This(); 245 | 246 | const Input = struct { 247 | hash1: u64, 248 | hash2: u64, 249 | hash_result: HashResult, 250 | value: u64, 251 | }; 252 | 253 | m: usize, 254 | eps: f64, 255 | opts: BuildOptions, 256 | input: std.ArrayListUnmanaged(Input), 257 | 258 | fn tableSizeFromEps(n: usize, eps: f64, w: u6) usize { 259 | const target: usize = @intFromFloat(@as(f64, @floatFromInt(n)) * (eps + 1)); 260 | return @max(target, @as(usize, @intCast(w)) + 1); 261 | } 262 | 263 | pub fn init(allocator: std.mem.Allocator, n: usize, eps: f64, opts: 
BuildOptions) error{OutOfMemory}!Self { 264 | const input = try std.ArrayListUnmanaged(Input).initCapacity(allocator, n); 265 | 266 | return Self{ 267 | .m = tableSizeFromEps(n, eps, opts.w), 268 | .eps = eps, 269 | .opts = opts, 270 | .input = input, 271 | }; 272 | } 273 | 274 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 275 | self.input.deinit(allocator); 276 | self.* = undefined; 277 | } 278 | 279 | pub fn insert(self: *Self, hash1: u64, hash2: u64, value: u64) void { 280 | self.input.appendAssumeCapacity( 281 | Input{ 282 | .hash1 = hash1, 283 | .hash2 = hash2, 284 | .hash_result = splitHash(hash1, hash2, self.m, self.opts.w), 285 | .value = value, 286 | }, 287 | ); 288 | } 289 | 290 | pub fn build(self: *Self, allocator: std.mem.Allocator) error{ OutOfMemory, HashCollision }!BumpedLayer { 291 | const w64 = @as(u64, self.opts.w); 292 | const bucket_size = (w64 * w64) / (4 * std.math.log2_int_ceil(u64, w64)); 293 | const n = self.input.items.len; 294 | 295 | const lessThan = struct { 296 | fn lessThan(_: void, left: Input, right: Input) bool { 297 | return left.hash_result.i < right.hash_result.i; 298 | } 299 | }.lessThan; 300 | 301 | std.mem.sort(Input, self.input.items, {}, lessThan); 302 | 303 | var system = try RibbonBandingSystem.init(allocator, self.m, self.opts.r, self.opts.w); 304 | defer system.deinit(allocator); 305 | 306 | var inserted = try std.ArrayListUnmanaged(?usize).initCapacity(allocator, bucket_size); 307 | defer inserted.deinit(allocator); 308 | 309 | var thresholds = try CompactArray.Mutable.init(allocator, 2, std.math.divCeil(usize, self.m, bucket_size) catch unreachable); 310 | errdefer thresholds.deinit(allocator); 311 | 312 | const lower_threshold = bucket_size / 7; 313 | const upper_threshold = bucket_size / 4; 314 | std.debug.assert(lower_threshold < upper_threshold); 315 | std.debug.assert(upper_threshold < bucket_size); 316 | 317 | const threshold_values = [4]usize{ 0, lower_threshold, upper_threshold, bucket_size }; 318 | 319 | const inputs = self.input.items; 320 | 321 | var i: usize = 0; 322 | var bucket_start: usize = 0; 323 | var bucket_idx: usize = 0; 324 | var bump_count: usize = 0; 325 | 326 | while (i < n) { 327 | var j = i; 328 | 329 | // Find the end position of this bucket: 330 | while (j < n) { 331 | if (inputs[j].hash_result.i >= bucket_start + bucket_size) break; 332 | j += 1; 333 | } 334 | 335 | inserted.clearRetainingCapacity(); 336 | 337 | var bump_offset: usize = 0; 338 | 339 | // Now iterate backwards through the bucket and insert the rows: 340 | var k: usize = j; 341 | while (k > i) { 342 | k -= 1; 343 | const input = inputs[k]; 344 | switch (system.insertRow(input.hash_result.i, input.hash_result.c, input.value)) { 345 | .success => |idx| { 346 | try inserted.append(allocator, idx); 347 | }, 348 | .redundant => { 349 | try inserted.append(allocator, null); 350 | }, 351 | .failure => { 352 | bump_offset = input.hash_result.i - bucket_start + 1; 353 | k += 1; 354 | break; 355 | }, 356 | } 357 | } 358 | 359 | // Next determine the actual threshold to use: 360 | var threshold: usize = undefined; 361 | for (threshold_values, 0..) |threshold_value, idx| { 362 | if (threshold_value >= bump_offset) { 363 | threshold = idx; 364 | break; 365 | } 366 | } 367 | 368 | const threshold_value = threshold_values[threshold]; 369 | 370 | thresholds.setFromZero(bucket_idx, threshold); 371 | 372 | // And now undo the insertions whose offset falls below the threshold (those keys are bumped): 373 | while (k < j) : (k += 1) { 374 | const input = inputs[k]; 375 | if (input.hash_result.i - bucket_start >= threshold_value) break; 376 | if (inserted.pop().?) |idx| { 377 | system.clearRow(idx); 378 | } 379 | } 380 | 381 | bump_count += k - i; 382 | 383 | // Prepare for the next bucket: 384 | i = j; 385 | bucket_start += bucket_size; 386 | bucket_idx += 1; 387 | } 388 | 389 | var table = try system.build(allocator); 390 | errdefer table.deinit(allocator); 391 | 392 | // Prepare for the next layer 393 | 394 | var next_inputs = try std.ArrayListUnmanaged(Input).initCapacity(allocator, bump_count); 395 | errdefer next_inputs.deinit(allocator); 396 | 397 | var layer = BumpedLayer{ 398 | .table = table, 399 | .bucket_size = bucket_size, 400 | .upper_threshold = upper_threshold, 401 | .lower_threshold = lower_threshold, 402 | .thresholds = thresholds.finalize(), 403 | }; 404 | 405 | self.m = tableSizeFromEps(bump_count, self.eps, self.opts.w); 406 | 407 | for (inputs) |input| { 408 | if (layer.isBumped(input.hash_result.i)) { 409 | next_inputs.appendAssumeCapacity(Input{ 410 | .hash1 = input.hash1, 411 | .hash2 = input.hash2, 412 | .hash_result = splitHash(input.hash1, input.hash2, self.m, self.opts.w), 413 | .value = input.value, 414 | }); 415 | } 416 | } 417 | 418 | std.debug.assert(next_inputs.items.len == bump_count); 419 | 420 | self.input.deinit(allocator); 421 | self.input = next_inputs; 422 | 423 | return layer; 424 | } 425 | 426 | pub fn buildFallbackTable(self: *BumpedLayerBuilder, allocator: std.mem.Allocator) !RibbonTable { 427 | const n = self.input.items.len; 428 | const step = @max(n / 10, 1); 429 | var m: usize = @max(n, @as(usize, @intCast(self.opts.w)) + 1); 430 | 431 | var i: usize = 0; 432 | loop: while (i < 50) : (i += 1) { 433 | var system = try RibbonBandingSystem.init(allocator, m, self.opts.r, self.opts.w); 434 | defer system.deinit(allocator); 435 | 436 | for (self.input.items) |input| { 437 | const h = splitHash(input.hash1, input.hash2, m, self.opts.w); 438 | const insert_result = system.insertRow(h.i, h.c, input.value); 439 | switch (insert_result) { 440 | .failure => { 441 | m += step; 442 | continue :loop; 443 | }, 444 | else => {}, 445 | } 446 | } 447 | 448 | return try system.build(allocator); 449 | } 450 | 451 | return error.HashCollision; 452 | } 453 | }; 454 | 455 | const HashResult = struct { 456 | i: u64, 457 | c: u64, 458 | }; 459 | 460 | fn splitHash(hash1: u64, hash2: u64, n: usize, w: u6) HashResult { 461 | const i = hash1 % (n - w); 462 | const c_mask = ((@as(u64, 1) << w) - 1); 463 | const c = (hash2 & c_mask) | 1; 464 | return .{ .i = i, .c = c }; 465 | } 466 | 467 | pub const BuildOptions = struct { 468 | r: u6, 469 | w: u6, 470 | seed: u64 = 100, 471 | }; 472 | 473 | pub fn Ribbon( 474 | comptime Key: type, 475 | comptime hasher: fn (seed: u64, key: Key) u64, 476 | ) type { 477 | return struct { 478 | const Self = @This(); 479 | 480 | fn hashKey(seed: u64, key: Key, n: usize, w: u6) HashResult { 481 | return splitHash(hasher(seed, key), hasher(seed + 1, key), n, w); 482 | } 483 | 484 | w: u6, 485 | seed: u64, 486 | table: RibbonTable, 487 | 488 | pub fn deinit(self: *Self, allocator: std.mem.Allocator) void { 
489 | self.table.deinit(allocator); 490 | self.* = undefined; 491 | } 492 | 493 | pub fn lookup(self: *const Self, key: Key) u64 { 494 | const h = hashKey(self.seed, key, self.table.n, self.w); 495 | return self.table.lookup(h.i, h.c); 496 | } 497 | 498 | pub fn bits(self: *const Self) u64 { 499 | return self.table.bits(); 500 | } 501 | 502 | pub fn writeTo(self: *const Self, w: anytype) !void { 503 | try w.writeInt(u64, self.w, endian); 504 | try w.writeInt(u64, self.seed, endian); 505 | try self.table.writeTo(w); 506 | } 507 | 508 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Self { 509 | var r = stream.reader(); 510 | const w = try r.readInt(u64, endian); 511 | const seed = try r.readInt(u64, endian); 512 | const table = try RibbonTable.readFrom(stream); 513 | return Self{ 514 | .w = @intCast(w), 515 | .seed = seed, 516 | .table = table, 517 | }; 518 | } 519 | 520 | /// IncrementalBuilder builds the Ribbon table incrementally: 521 | /// It uses a fixed `n` and tries to construct a table as it inserts entries. 522 | /// If it's not possible to build a table for a given entry it will fail. 523 | pub const IncrementalBuilder = struct { 524 | n: usize, 525 | seed: u64, 526 | system: RibbonBandingSystem, 527 | 528 | pub fn init(allocator: std.mem.Allocator, n: usize, opts: BuildOptions) error{OutOfMemory}!IncrementalBuilder { 529 | const system = try RibbonBandingSystem.init(allocator, n, opts.r, opts.w); 530 | 531 | return IncrementalBuilder{ 532 | .n = n, 533 | .seed = opts.seed, 534 | .system = system, 535 | }; 536 | } 537 | 538 | pub fn deinit(self: *IncrementalBuilder, allocator: std.mem.Allocator) void { 539 | self.system.deinit(allocator); 540 | self.* = undefined; 541 | } 542 | 543 | pub fn insert(self: *IncrementalBuilder, key: Key, value: u64) error{HashCollision}!void { 544 | const h = hashKey(self.seed, key, self.n, self.system.getBandWidth()); 545 | switch (self.system.insertRow(h.i, h.c, value)) { 546 | .failure => return error.HashCollision, 547 | else => {}, 548 | } 549 | } 550 | 551 | pub fn build(self: IncrementalBuilder, allocator: std.mem.Allocator) error{OutOfMemory}!Self { 552 | const table = try self.system.build(allocator); 553 | 554 | return Self{ 555 | .w = self.system.getBandWidth(), 556 | .seed = self.seed, 557 | .table = table, 558 | }; 559 | } 560 | }; 561 | 562 | pub const IterativeBuilder = struct { 563 | const Input = struct { 564 | hash1: u64, 565 | hash2: u64, 566 | value: u64, 567 | }; 568 | 569 | n: usize, 570 | seed: u64, 571 | input: std.ArrayListUnmanaged(Input), 572 | 573 | pub fn init(allocator: std.mem.Allocator, n: usize, seed: u64) error{OutOfMemory}!IterativeBuilder { 574 | const input = try std.ArrayListUnmanaged(Input).initCapacity(allocator, n); 575 | 576 | return IterativeBuilder{ 577 | .n = n, 578 | .seed = seed, 579 | .input = input, 580 | }; 581 | } 582 | 583 | pub fn deinit(self: *IterativeBuilder, allocator: std.mem.Allocator) void { 584 | self.input.deinit(allocator); 585 | self.* = undefined; 586 | } 587 | 588 | pub fn insert(self: *IterativeBuilder, key: Key, value: u64) void { 589 | self.input.appendAssumeCapacity( 590 | Input{ 591 | .hash1 = hasher(self.seed, key), 592 | .hash2 = hasher(self.seed + 1, key), 593 | .value = value, 594 | }, 595 | ); 596 | } 597 | 598 | pub fn insertWithAllocator(self: *IterativeBuilder, allocator: std.mem.Allocator, key: Key, value: u64) error{OutOfMemory}!void { 599 | try self.input.append( 600 | allocator, 601 | Input{ 602 | .hash1 = hasher(self.seed, key), 603 | .hash2 = hasher(self.seed 
+ 1, key), 604 | .value = value, 605 | }, 606 | ); 607 | } 608 | 609 | pub fn build(self: IterativeBuilder, allocator: std.mem.Allocator, opts: BuildOptions) error{ OutOfMemory, HashCollision }!Self { 610 | std.debug.assert(self.seed == opts.seed); 611 | 612 | const n = self.input.items.len; 613 | const step = @max(n / 10, 1); 614 | var m: usize = n; 615 | 616 | var i: usize = 0; 617 | loop: while (i < 50) : (i += 1) { 618 | var system = try RibbonBandingSystem.init(allocator, m, opts.r, opts.w); 619 | defer system.deinit(allocator); 620 | 621 | for (self.input.items) |input| { 622 | const h = splitHash(input.hash1, input.hash2, m, opts.w); 623 | const insert_result = system.insertRow(h.i, h.c, input.value); 624 | switch (insert_result) { 625 | .failure => { 626 | m += step; 627 | continue :loop; 628 | }, 629 | else => {}, 630 | } 631 | } 632 | 633 | const table = try system.build(allocator); 634 | 635 | return Self{ 636 | .w = opts.w, 637 | .seed = opts.seed, 638 | .table = table, 639 | }; 640 | } 641 | 642 | return error.HashCollision; 643 | } 644 | }; 645 | 646 | pub const Bumped = struct { 647 | const Layers = std.BoundedArray(BumpedLayer, 4); 648 | 649 | w: u6, 650 | seed: u64, 651 | layers: Layers, 652 | fallback_table: RibbonTable, 653 | 654 | pub fn deinit(self: *Bumped, allocator: std.mem.Allocator) void { 655 | for (self.layers.slice()) |*layer| { 656 | layer.deinit(allocator); 657 | } 658 | self.fallback_table.deinit(allocator); 659 | self.* = undefined; 660 | } 661 | 662 | pub fn lookup(self: *const Bumped, key: Key) u64 { 663 | const hash1 = hasher(self.seed, key); 664 | const hash2 = hasher(self.seed + 1, key); 665 | for (self.layers.slice()) |layer| { 666 | const h = splitHash(hash1, hash2, layer.table.n, self.w); 667 | if (layer.lookup(h.i, h.c)) |result| { 668 | return result; 669 | } 670 | } 671 | const h = splitHash(hash1, hash2, self.fallback_table.n, self.w); 672 | return self.fallback_table.lookup(h.i, h.c); 673 | } 674 | 675 | pub fn bits(self: Bumped) usize { 676 | var result = self.fallback_table.bits(); 677 | for (self.layers.slice()) |layer| { 678 | result += layer.bits(); 679 | } 680 | return result; 681 | } 682 | 683 | pub fn writeTo(self: *const Bumped, w: anytype) !void { 684 | try w.writeInt(u64, self.w, endian); 685 | try w.writeInt(u64, self.seed, endian); 686 | try w.writeInt(u64, self.layers.len, endian); 687 | for (self.layers.slice()) |layer| { 688 | try layer.writeTo(w); 689 | } 690 | try self.fallback_table.writeTo(w); 691 | } 692 | 693 | pub fn readFrom(stream: *std.io.FixedBufferStream([]const u8)) !Bumped { 694 | var r = stream.reader(); 695 | const w = try r.readInt(u64, endian); 696 | const seed = try r.readInt(u64, endian); 697 | const layers_len = try r.readInt(u64, endian); 698 | var layers = Layers.init(0) catch unreachable; 699 | for (0..layers_len) |_| { 700 | layers.appendAssumeCapacity(try BumpedLayer.readFrom(stream)); 701 | } 702 | const fallback_table = try RibbonTable.readFrom(stream); 703 | return Bumped{ 704 | .w = @intCast(w), 705 | .seed = seed, 706 | .layers = layers, 707 | .fallback_table = fallback_table, 708 | }; 709 | } 710 | }; 711 | 712 | pub const BumpedBuilder = struct { 713 | layer_builder: BumpedLayerBuilder, 714 | 715 | pub fn init(allocator: std.mem.Allocator, n: usize, eps: f64, opts: BuildOptions) error{OutOfMemory}!BumpedBuilder { 716 | var layer_builder = try BumpedLayerBuilder.init(allocator, n, eps, opts); 717 | errdefer layer_builder.deinit(allocator); 718 | 719 | return BumpedBuilder{ .layer_builder = 
layer_builder }; 720 | } 721 | 722 | pub fn deinit(self: *BumpedBuilder, allocator: std.mem.Allocator) void { 723 | self.layer_builder.deinit(allocator); 724 | self.* = undefined; 725 | } 726 | 727 | pub fn insert(self: *BumpedBuilder, key: Key, value: u64) void { 728 | const hash1 = hasher(self.layer_builder.opts.seed, key); 729 | const hash2 = hasher(self.layer_builder.opts.seed + 1, key); 730 | self.layer_builder.insert(hash1, hash2, value); 731 | } 732 | 733 | pub fn build(self: *BumpedBuilder, allocator: std.mem.Allocator) error{ OutOfMemory, HashCollision }!Bumped { 734 | var layers = Bumped.Layers.init(0) catch unreachable; 735 | errdefer { 736 | for (layers.slice()) |*layer| { 737 | layer.deinit(allocator); 738 | } 739 | } 740 | 741 | while (layers.len < layers.capacity()) { 742 | if (layers.len > 1 and self.layer_builder.input.items.len < 2048) { 743 | // Only bother with another layer if we still have enough items. 744 | break; 745 | } 746 | 747 | var layer = try self.layer_builder.build(allocator); 748 | errdefer layer.deinit(allocator); 749 | 750 | layers.appendAssumeCapacity(layer); 751 | } 752 | 753 | var fallback_table = try self.layer_builder.buildFallbackTable(allocator); 754 | errdefer fallback_table.deinit(allocator); 755 | 756 | return Bumped{ 757 | .w = self.layer_builder.opts.w, 758 | .seed = self.layer_builder.opts.seed, 759 | .layers = layers, 760 | .fallback_table = fallback_table, 761 | }; 762 | } 763 | }; 764 | }; 765 | } 766 | 767 | pub fn RibbonAutoHash(comptime Key: type) type { 768 | return Ribbon(Key, utils.autoHash(Key)); 769 | } 770 | 771 | const testing = std.testing; 772 | const Wyhash = std.hash.Wyhash; 773 | const TestErrorSet = error{ OutOfMemory, HashCollision, TestExpectedEqual }; 774 | 775 | fn testRibbon(t: anytype) TestErrorSet!void { 776 | const valueSize = 8; 777 | t.setValueSize(valueSize); 778 | t.setBandWidth(32); 779 | t.setSeed(100); 780 | try t.init(); 781 | 782 | const seed = 0x0194f614c15227ba; 783 | 784 | { 785 | // Insert random data: 786 | var prng = std.Random.DefaultPrng.init(seed); 787 | const r = prng.random(); 788 | 789 | for (0..t.n) |idx| { 790 | const value = r.uintLessThan(u64, @as(u64, 1) << valueSize); 791 | try t.insert(idx, value); 792 | } 793 | } 794 | 795 | try t.build(); 796 | 797 | { 798 | // Look it up again: 799 | var prng = std.Random.DefaultPrng.init(seed); 800 | const r = prng.random(); 801 | 802 | for (0..t.n) |idx| { 803 | const value = r.uintLessThan(u64, @as(u64, 1) << valueSize); 804 | try testing.expectEqual(value, t.lookup(idx)); 805 | } 806 | } 807 | } 808 | 809 | const RibbonU64 = RibbonAutoHash(u64); 810 | 811 | fn RibbonSettings(comptime Self: type) type { 812 | return struct { 813 | fn setValueSize(self: *Self, r: u6) void { 814 | self.r = r; 815 | } 816 | 817 | fn setBandWidth(self: *Self, w: u6) void { 818 | self.w = w; 819 | } 820 | 821 | fn setSeed(self: *Self, seed: u64) void { 822 | self.seed = seed; 823 | } 824 | 825 | fn options(self: Self) BuildOptions { 826 | return .{ 827 | .r = self.r.?, 828 | .w = self.w.?, 829 | .seed = self.seed.?, 830 | }; 831 | } 832 | }; 833 | } 834 | 835 | const RibbonIncrementalTest = struct { 836 | const Self = @This(); 837 | 838 | allocator: std.mem.Allocator, 839 | n: usize, 840 | 841 | r: ?u6 = null, 842 | w: ?u6 = null, 843 | seed: ?u64 = null, 844 | builder: ?RibbonU64.IncrementalBuilder = null, 845 | table: ?RibbonU64 = null, 846 | 847 | usingnamespace RibbonSettings(Self); 848 | 849 | fn deinit(self: *Self) void { 850 | if (self.builder) |*b| 
b.deinit(self.allocator); 851 | if (self.table) |*t| t.deinit(self.allocator); 852 | } 853 | 854 | fn init(self: *Self) !void { 855 | self.builder = try RibbonU64.IncrementalBuilder.init(self.allocator, self.n * 2, self.options()); 856 | } 857 | 858 | fn insert(self: *Self, key: u64, value: u64) !void { 859 | try self.builder.?.insert(key, value); 860 | } 861 | 862 | fn build(self: *Self) !void { 863 | self.table = try self.builder.?.build(self.allocator); 864 | } 865 | 866 | fn lookup(self: *Self, key: u64) u64 { 867 | return self.table.?.lookup(key); 868 | } 869 | }; 870 | 871 | const RibbonIterativeTest = struct { 872 | const Self = @This(); 873 | 874 | allocator: std.mem.Allocator, 875 | n: usize, 876 | 877 | r: ?u6 = null, 878 | w: ?u6 = null, 879 | seed: ?u64 = null, 880 | 881 | builder: ?RibbonU64.IterativeBuilder = null, 882 | table: ?RibbonU64 = null, 883 | 884 | usingnamespace RibbonSettings(Self); 885 | 886 | fn deinit(self: *Self) void { 887 | if (self.builder) |*b| b.deinit(self.allocator); 888 | if (self.table) |*t| t.deinit(self.allocator); 889 | } 890 | 891 | fn init(self: *Self) !void { 892 | self.builder = try RibbonU64.IterativeBuilder.init(self.allocator, self.n, self.options().seed); 893 | } 894 | 895 | fn insert(self: *Self, key: u64, value: u64) !void { 896 | self.builder.?.insert(key, value); 897 | } 898 | 899 | fn build(self: *Self) !void { 900 | self.table = try self.builder.?.build(self.allocator, self.options()); 901 | } 902 | 903 | fn lookup(self: *Self, key: u64) u64 { 904 | return self.table.?.lookup(key); 905 | } 906 | }; 907 | 908 | const BumpedRibbonTest = struct { 909 | const Self = @This(); 910 | 911 | allocator: std.mem.Allocator, 912 | n: usize, 913 | 914 | r: ?u6 = null, 915 | w: ?u6 = null, 916 | seed: ?u64 = null, 917 | 918 | builder: ?RibbonU64.BumpedBuilder = null, 919 | table: ?RibbonU64.Bumped = null, 920 | 921 | usingnamespace RibbonSettings(Self); 922 | 923 | fn deinit(self: *Self) void { 924 | if (self.builder) |*b| b.deinit(self.allocator); 925 | if (self.table) |*t| t.deinit(self.allocator); 926 | } 927 | 928 | fn init(self: *Self) !void { 929 | self.builder = try RibbonU64.BumpedBuilder.init(self.allocator, self.n, 0, self.options()); 930 | } 931 | 932 | fn insert(self: *Self, key: u64, value: u64) !void { 933 | self.builder.?.insert(key, value); 934 | } 935 | 936 | fn build(self: *Self) !void { 937 | self.table = try self.builder.?.build(self.allocator); 938 | } 939 | 940 | fn lookup(self: *Self, key: u64) u64 { 941 | return self.table.?.lookup(key); 942 | } 943 | }; 944 | 945 | fn testRibbonIncremental(allocator: std.mem.Allocator) TestErrorSet!void { 946 | var t = RibbonIncrementalTest{ .allocator = allocator, .n = 100 }; 947 | defer t.deinit(); 948 | try testRibbon(&t); 949 | } 950 | 951 | fn testRibbonIterative(allocator: std.mem.Allocator) TestErrorSet!void { 952 | var t = RibbonIterativeTest{ .allocator = allocator, .n = 100 }; 953 | defer t.deinit(); 954 | try testRibbon(&t); 955 | } 956 | 957 | fn testBumpedRibbon(allocator: std.mem.Allocator) TestErrorSet!void { 958 | var t = BumpedRibbonTest{ .allocator = allocator, .n = 100 }; 959 | defer t.deinit(); 960 | try testRibbon(&t); 961 | } 962 | 963 | test "ribbon incremental" { 964 | try utils.testFailingAllocator(testRibbonIncremental); 965 | } 966 | 967 | test "ribbon iterative" { 968 | try utils.testFailingAllocator(testRibbonIterative); 969 | } 970 | 971 | test "bumped ribbon" { 972 | try utils.testFailingAllocator(testBumpedRibbon); 973 | } 974 | 
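975 | // A minimal usage sketch (illustrative only; the keys and values below are
976 | // made up): build a bumped ribbon over u64 keys and read the values back.
977 | test "bumped ribbon usage sketch" {
978 |     const allocator = testing.allocator;
979 |     var builder = try RibbonU64.BumpedBuilder.init(allocator, 3, 0.1, .{ .r = 8, .w = 32 });
980 |     defer builder.deinit(allocator);
981 |     builder.insert(1, 42);
982 |     builder.insert(2, 7);
983 |     builder.insert(3, 200);
984 |     var table = try builder.build(allocator);
985 |     defer table.deinit(allocator);
986 |     try testing.expectEqual(@as(u64, 42), table.lookup(1));
987 |     try testing.expectEqual(@as(u64, 7), table.lookup(2));
988 |     try testing.expectEqual(@as(u64, 200), table.lookup(3));
989 | }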
-------------------------------------------------------------------------------- /src/utils.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const builtin = @import("builtin"); 3 | const endian = builtin.cpu.arch.endian(); 4 | 5 | pub fn writeSlice(w: anytype, arr: anytype) !void { 6 | const T = @TypeOf(arr[0]); 7 | try w.writeInt(u64, arr.len, endian); 8 | const byte_len = arr.len * @sizeOf(T); 9 | if (byte_len == 0) return; 10 | try w.writeAll(@as([*]const u8, @ptrCast(&arr[0]))[0..byte_len]); 11 | // Make sure we're always at a 64-bit boundary. 12 | const padding = (@alignOf(u64) - (byte_len % @alignOf(u64))) % @alignOf(u64); 13 | try w.writeByteNTimes(0, padding); 14 | } 15 | 16 | pub fn readSlice(stream: *std.io.FixedBufferStream([]const u8), T: anytype) ![]const T { 17 | // Invariant: stream.pos should be 8-byte aligned before and after `readSlice` 18 | std.debug.assert(stream.pos % @alignOf(u64) == 0); 19 | defer std.debug.assert(stream.pos % @alignOf(u64) == 0); 20 | 21 | var r = stream.reader(); 22 | const len = try r.readInt(u64, endian); 23 | const byte_len = len * @sizeOf(T); 24 | if (byte_len == 0) return &[_]T{}; 25 | const data = stream.buffer[stream.pos..][0..byte_len]; 26 | stream.pos += byte_len; 27 | const padding = (@alignOf(u64) - (byte_len % @alignOf(u64))) % @alignOf(u64); 28 | stream.pos += padding; 29 | const cast_data: [*]const T = @ptrCast(@alignCast(&data[0])); 30 | return cast_data[0..len]; 31 | } 32 | 33 | pub fn bitSizeOfSlice(arr: anytype) u64 { 34 | return arr.len * @bitSizeOf(@TypeOf(arr[0])); 35 | } 36 | 37 | pub fn autoHash(comptime Key: type) fn (seed: u64, key: Key) u64 { 38 | return struct { 39 | fn hash(seed: u64, key: Key) u64 { 40 | if (comptime std.meta.hasUniqueRepresentation(Key)) { 41 | return std.hash.Wyhash.hash(seed, std.mem.asBytes(&key)); 42 | } else { 43 | var hasher = std.hash.Wyhash.init(seed); 44 | std.hash.autoHash(&hasher, key); 45 | return hasher.final(); 46 | } 47 | } 48 | }.hash; 49 | } 50 | 51 | pub fn testFailingAllocator(comptime t: fn (allocator: std.mem.Allocator) anyerror!void) !void { 52 | var idx: usize = 0; 53 | while (true) : (idx += 1) { 54 | var failing_alloc = std.testing.FailingAllocator.init(std.testing.allocator, .{ .fail_index = idx }); 55 | 56 | try (t(failing_alloc.allocator()) catch |err| switch (err) { 57 | error.OutOfMemory => continue, 58 | else => err, 59 | }); 60 | 61 | return; 62 | } 63 | } 64 | 65 | const testing = std.testing; 66 | 67 | test "readSlice / writeSlice must maintain 8-byte alignment" { 68 | var buf: [128]u8 = undefined; 69 | var write_stream = std.io.fixedBufferStream(buf[0..]); 70 | 71 | const writer = write_stream.writer(); 72 | 73 | try writeSlice(writer, [_]u8{ 1, 2, 3 }); 74 | try writeSlice(writer, [_]u64{2}); 75 | 76 | var read_stream = std.io.fixedBufferStream(@as([]const u8, buf[0..])); 77 | 78 | try testing.expectEqualSlices(u8, &.{ 1, 2, 3 }, try readSlice(&read_stream, u8)); 79 | try testing.expectEqualSlices(u64, &.{2}, try readSlice(&read_stream, u64)); 80 | } 81 | -------------------------------------------------------------------------------- /tools/zini-pthash/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const zini = @import("zini"); 3 | const parg = @import("parg"); 4 | 5 | const HashFn = zini.pthash.BytesHashFn(zini.DictArray); 6 | const StringDict = zini.StringDict; 7 | 8 | const usage = 9 | \\USAGE 10 | \\ {s} [build 
| lookup] 11 | \\ 12 | \\COMMAND: build 13 | \\  Builds hash function for plain text file. 14 | \\ 15 | \\  -i, --input 16 | \\  -o, --output 17 | \\  -c 18 | \\  -a, --alpha 19 | \\  -s, --seed 20 | \\  -d, --dict 21 | \\ 22 | \\COMMAND: lookup 23 | \\ 24 | \\  -i, --input 25 | \\  -k, --key 26 | \\  -b, --benchmark 27 | \\ 28 | ; 29 | 30 | fn fail(comptime msg: []const u8, args: anytype) noreturn { 31 | std.debug.print("error: ", .{}); 32 | std.debug.print(msg, args); 33 | std.debug.print("\n", .{}); 34 | std.posix.exit(1); 35 | } 36 | 37 | pub fn main() !void { 38 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 39 | defer { 40 | const check = gpa.deinit(); 41 | if (check == .leak) @panic("memory leaked"); 42 | } 43 | 44 | const allocator = gpa.allocator(); 45 | 46 | var p = try parg.parseProcess(allocator, .{}); 47 | defer p.deinit(); 48 | 49 | const program_name = p.nextValue() orelse @panic("no executable name"); 50 | 51 | while (p.next()) |token| { 52 | switch (token) { 53 | .flag => |flag| { 54 | fail("unknown flag: {s}", .{flag.name}); 55 | }, 56 | .arg => |arg| { 57 | if (std.mem.eql(u8, arg, "lookup")) { 58 | return lookup(allocator, &p); 59 | } else if (std.mem.eql(u8, arg, "build")) { 60 | return build(allocator, &p); 61 | } else { 62 | fail("unknown argument: {s}", .{arg}); 63 | } 64 | }, 65 | .unexpected_value => |val| fail("unknown argument: {s}", .{val}), 66 | } 67 | } 68 | 69 | std.debug.print(usage, .{program_name}); 70 | } 71 | 72 | fn printHashStats(hash: HashFn, dict: ?StringDict, arr: ?zini.DictArray) !void { 73 | const bits = hash.bits() + @bitSizeOf(HashFn); 74 | std.debug.print("  seed: {}\n", .{hash.seed}); 75 | std.debug.print("  bits: {}\n", .{bits}); 76 | std.debug.print("  bits/n: {d}\n", .{@as(f64, @floatFromInt(bits)) / @as(f64, @floatFromInt(hash.n))}); 77 | std.debug.print("\n", .{}); 78 | 79 | if (dict != null) { 80 | const dict_size = dict.?.bits() + @bitSizeOf(StringDict) + arr.?.bits() + @bitSizeOf(zini.DictArray); 81 | std.debug.print("File contains dictionary as well:\n", .{}); 82 | std.debug.print("  bits: {}\n", .{dict_size}); 83 | std.debug.print("  bits/n: {d}\n", .{@as(f64, @floatFromInt(dict_size)) / @as(f64, @floatFromInt(hash.n))}); 84 | std.debug.print("\n", .{}); 85 | 86 | const total_bits = bits + dict_size; 87 | 88 | std.debug.print("Combined:\n", .{}); 89 | std.debug.print("  bits: {}\n", .{total_bits}); 90 | std.debug.print("  bits/n: {d}\n", .{@as(f64, @floatFromInt(total_bits)) / @as(f64, @floatFromInt(hash.n))}); 91 | std.debug.print("\n", .{}); 92 | } 93 | } 94 | 95 | pub fn build(allocator: std.mem.Allocator, p: anytype) !void { 96 | var params = zini.pthash.Params{ .c = 7, .alpha = 0.95 }; 97 | var input: ?[]const u8 = null; 98 | var output: ?[]const u8 = null; 99 | var seed: ?u64 = null; 100 | var build_dict: bool = false; 101 | 102 | while (p.next()) |token| { 103 | switch (token) { 104 | .flag => |flag| { 105 | if (flag.isShort("i") or flag.isLong("input")) { 106 | const val = p.nextValue() orelse fail("-i/--input requires value", .{}); 107 | input = val; 108 | } else if (flag.isShort("o") or flag.isLong("output")) { 109 | const val = p.nextValue() orelse @panic("value required"); 110 | output = val; 111 | } else if (flag.isShort("s") or flag.isLong("seed")) { 112 | const val = p.nextValue() orelse @panic("value required"); 113 | seed = try std.fmt.parseInt(usize, val, 10); 114 | } else if (flag.isShort("c")) { 115 | const val = p.nextValue() orelse @panic("value required"); 116 | params.c = try std.fmt.parseInt(usize, val, 10); 117 | } else 
if (flag.isShort("a") or flag.isLong("alpha")) { 118 | const val = p.nextValue() orelse @panic("value required"); 119 | params.alpha = try std.fmt.parseFloat(f64, val); 120 | } else if (flag.isShort("d") or flag.isLong("dict")) { 121 | build_dict = true; 122 | } else { 123 | fail("uknown flag: {s}", .{flag.name}); 124 | } 125 | }, 126 | .arg => |arg| fail("uknown argument: {s}", .{arg}), 127 | .unexpected_value => |val| fail("uknown argument: {s}", .{val}), 128 | } 129 | } 130 | 131 | if (input == null) { 132 | fail("-i/--input is required", .{}); 133 | } 134 | 135 | std.debug.print("Reading {s}...\n", .{input.?}); 136 | var file = try std.fs.cwd().openFile(input.?, .{}); 137 | defer file.close(); 138 | 139 | const data = try file.reader().readAllAlloc(allocator, 10 * 1024 * 1024); 140 | defer allocator.free(data); 141 | 142 | var keys = std.ArrayList([]const u8).init(allocator); 143 | defer keys.deinit(); 144 | 145 | var iter = std.mem.tokenizeScalar(u8, data, '\n'); 146 | while (iter.next()) |line| { 147 | var split = std.mem.splitScalar(u8, line, ' '); 148 | try keys.append(split.next().?); 149 | } 150 | 151 | std.debug.print("\n", .{}); 152 | std.debug.print("Building hash function...\n", .{}); 153 | var hash = try HashFn.build(allocator, keys.items, params, seed); 154 | defer hash.deinit(allocator); 155 | 156 | var dict: ?StringDict = null; 157 | defer if (dict) |*d| d.deinit(allocator); 158 | 159 | var arr: ?zini.DictArray = null; 160 | defer if (arr) |*a| a.deinit(allocator); 161 | 162 | if (build_dict) { 163 | var dict_builder = try StringDict.Builder.init(allocator); 164 | defer dict_builder.deinit(); 165 | 166 | var arr_slice = try allocator.alloc(u64, hash.n); 167 | defer allocator.free(arr_slice); 168 | 169 | iter = std.mem.tokenizeScalar(u8, data, '\n'); 170 | while (iter.next()) |line| { 171 | var split = std.mem.splitScalar(u8, line, ' '); 172 | const key = split.next().?; 173 | const value = split.next().?; 174 | const key_idx = hash.get(key); 175 | const val_idx = try dict_builder.intern(value); 176 | arr_slice[key_idx] = val_idx; 177 | } 178 | 179 | dict = try dict_builder.build(); 180 | arr = try zini.DictArray.encode(allocator, arr_slice); 181 | } 182 | 183 | std.debug.print("\n", .{}); 184 | std.debug.print("Successfully built hash function:\n", .{}); 185 | try printHashStats(hash, dict, arr); 186 | 187 | if (output) |o| { 188 | std.debug.print("Writing to {s}\n", .{o}); 189 | const outfile = try std.fs.cwd().createFile(o, .{}); 190 | defer outfile.close(); 191 | 192 | try hash.writeTo(outfile.writer()); 193 | 194 | if (build_dict) { 195 | try dict.?.writeTo(outfile.writer()); 196 | try arr.?.writeTo(outfile.writer()); 197 | } 198 | } 199 | } 200 | 201 | pub fn lookup(allocator: std.mem.Allocator, p: anytype) !void { 202 | const stdout = std.io.getStdOut().writer(); 203 | 204 | var input: ?[]const u8 = null; 205 | var key: ?[]const u8 = null; 206 | var bench: bool = false; 207 | 208 | while (p.next()) |token| { 209 | switch (token) { 210 | .flag => |flag| { 211 | if (flag.isShort("i") or flag.isLong("input")) { 212 | const val = p.nextValue() orelse fail("-i/--input requires value", .{}); 213 | input = val; 214 | } else if (flag.isShort("k") or flag.isLong("key")) { 215 | const val = p.nextValue() orelse fail("-k/--key requires value", .{}); 216 | key = val; 217 | } else if (flag.isShort("b") or flag.isLong("bench")) { 218 | bench = true; 219 | } else { 220 | fail("unknown flag: {s}", .{flag.name}); 221 | } 222 | }, 223 | .arg => |arg| fail("unexpected argument: {s}", 
.{arg}), 224 | .unexpected_value => |val| fail("unexpected argument: {s}", .{val}), 225 | } 226 | } 227 | 228 | if (input == null) { 229 | fail("-i/--input is required", .{}); 230 | } 231 | 232 | std.debug.print("Reading {s}...\n", .{input.?}); 233 | const buf = try std.fs.cwd().readFileAlloc(allocator, input.?, 10 * 1024 * 1024); 234 | defer allocator.free(buf); 235 | 236 | var fbs = std.io.fixedBufferStream(@as([]const u8, buf)); 237 | const hash = try HashFn.readFrom(&fbs); 238 | var dict: ?StringDict = null; 239 | var arr: ?zini.DictArray = null; 240 | 241 | if (fbs.pos < fbs.buffer.len) { 242 | dict = try StringDict.readFrom(&fbs); 243 | arr = try zini.DictArray.readFrom(&fbs); 244 | } 245 | 246 | std.debug.print("\n", .{}); 247 | 248 | std.debug.print("Successfully loaded hash function:\n", .{}); 249 | try printHashStats(hash, dict, arr); 250 | 251 | if (key) |k| { 252 | std.debug.print("Looking up key={s}:\n", .{k}); 253 | const h = hash.get(k); 254 | try stdout.print("{}\n", .{h}); 255 | if (dict) |d| { 256 | try stdout.print("{s}\n", .{d.get(arr.?.get(h))}); 257 | } 258 | 259 | if (bench) { 260 | const n = 1000; 261 | std.debug.print("\nBenchmarking...\n", .{}); 262 | var timer = try std.time.Timer.start(); 263 | const start = timer.lap(); 264 | var i: usize = 0; 265 | // TODO: Is this actually a good way of benchmarking? 266 | while (i < n) : (i += 1) { 267 | std.mem.doNotOptimizeAway(hash.get(k)); 268 | } 269 | const end = timer.read(); 270 | const dur = end - start; 271 | std.debug.print("{} ns/read (avg of {} iterations)\n", .{ dur / n, n }); 272 | } 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /tools/zini-ribbon/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const builtin = @import("builtin"); 3 | const zini = @import("zini"); 4 | const parg = @import("parg"); 5 | 6 | const HashFn = zini.pthash.BytesHashFn(zini.DictArray); 7 | const HashRibbon = zini.ribbon.Ribbon([]const u8, std.hash.Wyhash.hash); 8 | const StringDict = zini.StringDict; 9 | const endian = builtin.cpu.arch.endian(); 10 | 11 | const usage = 12 | \\USAGE 13 | \\ {s} [build | lookup] 14 | \\ 15 | \\COMMAND: build 16 | \\ Builds Ribbon table for plain text file. 
17 | \\ 18 | \\  -i, --input 19 | \\  -o, --output 20 | \\  -w 21 | \\  -s, --seed 22 | \\ 23 | \\COMMAND: lookup 24 | \\ 25 | \\  -i, --input 26 | \\  -k, --key 27 | \\  -b, --benchmark 28 | \\ 29 | ; 30 | 31 | fn fail(comptime msg: []const u8, args: anytype) noreturn { 32 | std.debug.print("error: ", .{}); 33 | std.debug.print(msg, args); 34 | std.debug.print("\n", .{}); 35 | std.posix.exit(1); 36 | } 37 | 38 | pub fn main() !void { 39 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 40 | defer { 41 | const check = gpa.deinit(); 42 | if (check == .leak) @panic("memory leaked"); 43 | } 44 | 45 | const allocator = gpa.allocator(); 46 | 47 | var p = try parg.parseProcess(allocator, .{}); 48 | defer p.deinit(); 49 | 50 | const program_name = p.nextValue() orelse @panic("no executable name"); 51 | 52 | while (p.next()) |token| { 53 | switch (token) { 54 | .flag => |flag| { 55 | fail("unknown flag: {s}", .{flag.name}); 56 | }, 57 | .arg => |arg| { 58 | if (std.mem.eql(u8, arg, "lookup")) { 59 | return lookup(allocator, &p); 60 | } else if (std.mem.eql(u8, arg, "build")) { 61 | return build(allocator, &p); 62 | } else { 63 | fail("unknown argument: {s}", .{arg}); 64 | } 65 | }, 66 | .unexpected_value => |val| fail("unknown argument: {s}", .{val}), 67 | } 68 | } 69 | 70 | std.debug.print(usage, .{program_name}); 71 | } 72 | 73 | fn printStats(table: anytype, n: usize) !void { 74 | const bits = table.bits() + @bitSizeOf(@TypeOf(table)); 75 | std.debug.print("  seed: {}\n", .{table.seed}); 76 | std.debug.print("  bits: {}\n", .{bits}); 77 | std.debug.print("  bits/n: {d}\n", .{@as(f64, @floatFromInt(bits)) / @as(f64, @floatFromInt(n))}); 78 | } 79 | 80 | pub fn build(allocator: std.mem.Allocator, p: anytype) !void { 81 | var w: u6 = 32; 82 | var input: ?[]const u8 = null; 83 | var output: ?[]const u8 = null; 84 | var seed: ?u64 = null; 85 | var eps: f64 = 0; 86 | 87 | while (p.next()) |token| { 88 | switch (token) { 89 | .flag => |flag| { 90 | if (flag.isShort("i") or flag.isLong("input")) { 91 | const val = p.nextValue() orelse fail("-i/--input requires value", .{}); 92 | input = val; 93 | } else if (flag.isShort("o") or flag.isLong("output")) { 94 | const val = p.nextValue() orelse @panic("value required"); 95 | output = val; 96 | } else if (flag.isShort("s") or flag.isLong("seed")) { 97 | const val = p.nextValue() orelse @panic("value required"); 98 | seed = try std.fmt.parseInt(usize, val, 10); 99 | } else if (flag.isShort("w")) { 100 | const val = p.nextValue() orelse @panic("value required"); 101 | w = @intCast(try std.fmt.parseInt(usize, val, 10)); 102 | } else if (flag.isLong("eps")) { 103 | const val = p.nextValue() orelse @panic("value required"); 104 | eps = try std.fmt.parseFloat(f64, val); 105 | } else { 106 | fail("unknown flag: {s}", .{flag.name}); 107 | } 108 | }, 109 | .arg => |arg| fail("unknown argument: {s}", .{arg}), 110 | .unexpected_value => |val| fail("unknown argument: {s}", .{val}), 111 | } 112 | } 113 | 114 | if (input == null) { 115 | fail("-i/--input is required", .{}); 116 | } 117 | 118 | std.debug.print("Reading {s}...\n", .{input.?}); 119 | var file = try std.fs.cwd().openFile(input.?, .{}); 120 | defer file.close(); 121 | 122 | const data = try file.reader().readAllAlloc(allocator, 10 * 1024 * 1024); 123 | defer allocator.free(data); 124 | 125 | var keys = std.ArrayList([]const u8).init(allocator); 126 | defer keys.deinit(); 127 | 128 | if (seed == null) { 129 | var s: u64 = undefined; try std.posix.getrandom(std.mem.asBytes(&s)); seed = s; 130 | } 131 | 132 | var max_val: u64 = 0; 133 | var n: usize = 0; 
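// The input is scanned twice: the first pass below only counts the entries
// and finds the largest value, so that r (the number of value bits per key)
// can be sized exactly; the second pass afterwards inserts the key/value
// pairs into the builder.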
134 | 135 | var iter = std.mem.tokenizeScalar(u8, data, '\n'); 136 | while (iter.next()) |line| { 137 | var split = std.mem.splitScalar(u8, line, ','); 138 | _ = split.next().?; // the key 139 | const value = try std.fmt.parseInt(u64, split.next().?, 10); 140 | max_val = @max(max_val, value); 141 | n += 1; 142 | } 143 | 144 | const r: u6 = @intCast(std.math.log2_int_ceil(u64, max_val + 1)); 145 | 146 | std.debug.print("\n", .{}); 147 | std.debug.print("Building table for r={} value bits and eps={}...\n", .{ r, eps }); 148 | 149 | const opts = zini.ribbon.BuildOptions{ 150 | .r = r, 151 | .w = w, 152 | .seed = seed.?, 153 | }; 154 | 155 | var builder = try HashRibbon.BumpedBuilder.init(allocator, n, eps, opts); 156 | defer builder.deinit(allocator); 157 | 158 | iter = std.mem.tokenizeScalar(u8, data, '\n'); 159 | while (iter.next()) |line| { 160 | var split = std.mem.splitScalar(u8, line, ','); 161 | const key = split.next().?; // the key 162 | const value = try std.fmt.parseInt(u64, split.next().?, 10); 163 | builder.insert(key, value); 164 | } 165 | 166 | var table = try builder.build(allocator); 167 | defer table.deinit(allocator); 168 | 169 | std.debug.print("\n", .{}); 170 | std.debug.print("Successfully built table:\n", .{}); 171 | try printStats(table, n); 172 | 173 | if (output) |o| { 174 | std.debug.print("\n", .{}); 175 | std.debug.print("Writing to {s}\n", .{o}); 176 | const outfile = try std.fs.cwd().createFile(o, .{}); 177 | defer outfile.close(); 178 | 179 | try outfile.writer().writeInt(u64, n, endian); 180 | try table.writeTo(outfile.writer()); 181 | } 182 | } 183 | 184 | pub fn lookup(allocator: std.mem.Allocator, p: anytype) !void { 185 | const stdout = std.io.getStdOut().writer(); 186 | 187 | var input: ?[]const u8 = null; 188 | var key: ?[]const u8 = null; 189 | var bench: bool = false; 190 | 191 | while (p.next()) |token| { 192 | switch (token) { 193 | .flag => |flag| { 194 | if (flag.isShort("i") or flag.isLong("input")) { 195 | const val = p.nextValue() orelse fail("-i/--input requires value", .{}); 196 | input = val; 197 | } else if (flag.isShort("k") or flag.isLong("key")) { 198 | const val = p.nextValue() orelse fail("-k/--key requires value", .{}); 199 | key = val; 200 | } else if (flag.isShort("b") or flag.isLong("benchmark")) { 201 | bench = true; 202 | } else { 203 | fail("unknown flag: {s}", .{flag.name}); 204 | } 205 | }, 206 | .arg => |arg| fail("unexpected argument: {s}", .{arg}), 207 | .unexpected_value => |val| fail("unexpected argument: {s}", .{val}), 208 | } 209 | } 210 | 211 | if (input == null) { 212 | fail("-i/--input is required", .{}); 213 | } 214 | 215 | std.debug.print("Reading {s}...\n", .{input.?}); 216 | const buf = try std.fs.cwd().readFileAlloc(allocator, input.?, 10 * 1024 * 1024); 217 | defer allocator.free(buf); 218 | 219 | var fbs = std.io.fixedBufferStream(@as([]const u8, buf)); 220 | const n = try fbs.reader().readInt(u64, endian); 221 | var table = try HashRibbon.Bumped.readFrom(&fbs); 222 | std.debug.print("\n", .{}); 223 | 224 | std.debug.print("Successfully loaded hash function:\n", .{}); 225 | try printStats(table, n); 226 | std.debug.print("\n", .{}); 227 | 228 | if (key) |k| { 229 | std.debug.print("Looking up key={s}:\n", .{k}); 230 | const value = table.lookup(k); 231 | try stdout.print("{}\n", .{value}); 232 | 233 | if (bench) { 234 | const m = 1000; 235 | std.debug.print("\nBenchmarking...\n", .{}); 236 | var timer = try std.time.Timer.start(); 237 | const start = timer.lap(); 238 | var i: usize = 0; 239 | // TODO: Is this 
actually a good way of benchmarking? 240 | while (i < m) : (i += 1) { 241 | std.mem.doNotOptimizeAway(table.lookup(k)); 242 | } 243 | const end = timer.read(); 244 | const dur = end - start; 245 | std.debug.print("{} ns/read (avg of {} iterations)\n", .{ dur / m, m }); 246 | } 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /tools/zini-seqz/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const zini = @import("zini"); 3 | const parg = @import("parg"); 4 | 5 | const usage = 6 | \\USAGE 7 | \\  {s} 8 | \\ 9 | \\A simple tool which reads a list of numbers (u64) from a file, 10 | \\compresses them using Elias-Fano, and reports the number of 11 | \\bytes it would take. 12 | \\ 13 | ; 14 | 15 | fn fail(comptime msg: []const u8, args: anytype) noreturn { 16 | std.debug.print("error: ", .{}); 17 | std.debug.print(msg, args); 18 | std.debug.print("\n", .{}); 19 | std.posix.exit(1); 20 | } 21 | 22 | pub fn main() !void { 23 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 24 | defer { 25 | const check = gpa.deinit(); 26 | if (check == .leak) @panic("memory leaked"); 27 | } 28 | 29 | const allocator = gpa.allocator(); 30 | 31 | var p = try parg.parseProcess(allocator, .{}); 32 | defer p.deinit(); 33 | 34 | const program_name = p.nextValue() orelse @panic("no executable name"); 35 | 36 | var filename: ?[]const u8 = null; 37 | 38 | while (p.next()) |token| { 39 | switch (token) { 40 | .flag => |flag| { 41 | if (flag.isLong("help") or flag.isShort("h")) { 42 | std.debug.print(usage, .{program_name}); 43 | std.process.exit(0); 44 | } else { 45 | fail("unknown flag: {s}", .{flag.name}); 46 | } 47 | }, 48 | .arg => |arg| { 49 | if (filename == null) { 50 | filename = arg; 51 | } else { 52 | fail("unknown argument: {s}", .{arg}); 53 | } 54 | }, 55 | .unexpected_value => |val| fail("unknown argument: {s}", .{val}), 56 | } 57 | } 58 | 59 | const f = filename orelse fail("filename expected as argument", .{}); 60 | var file = try std.fs.cwd().openFile(f, .{}); 61 | defer file.close(); 62 | 63 | var counting_file = std.io.countingReader(file.reader()); 64 | 65 | var numbers = std.ArrayList(u64).init(allocator); 66 | defer numbers.deinit(); 67 | 68 | std.debug.print("Reading {s}\n", .{f}); 69 | 70 | var r = counting_file.reader(); 71 | while (true) { 72 | var buf: [32]u8 = undefined; 73 | const line = r.readUntilDelimiter(&buf, '\n') catch |err| switch (err) { 74 | error.EndOfStream => break, 75 | else => return err, 76 | }; 77 | const num = try std.fmt.parseInt(u64, line, 10); 78 | try numbers.append(num); 79 | } 80 | 81 | std.mem.sort(u64, numbers.items, {}, std.sort.asc(u64)); 82 | 83 | std.debug.print("Compressing {} numbers ({} bytes)...\n", .{ numbers.items.len, counting_file.bytes_read }); 84 | 85 | var encoded = try zini.EliasFano.encode(allocator, numbers.items); 86 | defer encoded.deinit(allocator); 87 | 88 | std.debug.print("The data would compress to: {} bytes\n", .{encoded.bitsWithoutConstantAccess() / 8}); 89 | } 90 | --------------------------------------------------------------------------------