├── .dockerignore ├── startup-gcp-mig.sh ├── Dockerfile ├── localbuild.sh ├── startup-aws-asg.sh ├── .gitignore ├── dockerfile-debian ├── LICENSE ├── k8s.yaml ├── .github └── workflows │ └── main.yml ├── src ├── scratch.zig ├── main.zig └── zgroup.zig └── README.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .zig-cache/ 2 | zig-out/ -------------------------------------------------------------------------------- /startup-gcp-mig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir /opt/zgroup/ && cd /opt/zgroup/ 3 | wget https://github.com/flowerinthenight/zgroup/releases/download/v0.3.2/zgroup-v0.3.2-x86_64-linux.tar.gz 4 | tar -xzvf zgroup-v0.3.2-x86_64-linux.tar.gz 5 | INTERNAL_IP=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip) 6 | ZGROUP_JOIN_PREFIX=0b9303ad-1beb-483f-abb5-bc58e0214531 ./zgroup group1 ${INTERNAL_IP}:8080 2>&1 | logger & 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM kassany/alpine-ziglang:0.13.0 2 | WORKDIR /tmp/ 3 | COPY src/ ./src/ 4 | COPY build* ./ 5 | RUN zig build -Doptimize=ReleaseFast --summary all 6 | # RUN zig build --summary all 7 | 8 | FROM debian:stable-slim 9 | RUN set -x && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y curl ca-certificates && rm -rf /var/lib/apt/lists/* 10 | WORKDIR /app/ 11 | COPY --from=0 /tmp/zig-out/bin/zgroup . 12 | ENTRYPOINT ["/app/zgroup"] 13 | CMD ["group1", "0.0.0.0:8080"] 14 | -------------------------------------------------------------------------------- /localbuild.sh: -------------------------------------------------------------------------------- 1 | # NOTE: This is specific to my local dev environment. 2 | # Usage: 3 | # 4 | # ./localbuild.sh {tag}, e.g. ./localbuild.sh v7 5 | # 6 | kubectl delete -f deployment.yaml 7 | docker build --rm -t zgroup . 
8 | docker tag zgroup asia.gcr.io/mobingi-main/zgroup:$1 9 | docker push asia.gcr.io/mobingi-main/zgroup:$1 10 | docker rmi $(docker images --filter "dangling=true" -q --no-trunc) -f 11 | sed -i -e 's/image\:\ asia.gcr.io\/mobingi\-main\/zgroup[\:@].*$/image\:\ asia.gcr.io\/mobingi\-main\/zgroup\:'$1'/g' deployment.yaml 12 | -------------------------------------------------------------------------------- /startup-aws-asg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir /opt/zgroup/ && cd /opt/zgroup/ 3 | wget https://github.com/flowerinthenight/zgroup/releases/download/v0.3.2/zgroup-v0.3.2-x86_64-linux.tar.gz 4 | tar -xzvf zgroup-v0.3.2-x86_64-linux.tar.gz 5 | METADATA_TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") 6 | INTERNAL_IP=$(curl -H "X-aws-ec2-metadata-token: $METADATA_TOKEN" http://169.254.169.254/latest/meta-data/local-ipv4) 7 | ZGROUP_JOIN_PREFIX=0b9303ad-1beb-483f-abb5-bc58e0214531 ./zgroup group1 ${INTERNAL_IP}:8080 2>&1 | logger & 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | zig-out/ 39 | 40 | # Debug files 41 | *.dSYM/ 42 | *.su 43 | *.idb 44 | *.pdb 45 | 46 | # Kernel Module Compile Results 47 | *.mod* 48 | *.cmd 49 | .tmp_versions/ 50 | modules.order 51 | Module.symvers 52 | Mkfile.old 53 | dkms.conf 54 | 55 | # Cache(s) 56 | .zig-cache/ 57 | 58 | # Others 59 | deployment.yaml -------------------------------------------------------------------------------- /dockerfile-debian: -------------------------------------------------------------------------------- 1 | FROM debian:bookworm 2 | RUN set -x && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y curl xz-utils ca-certificates && rm -rf /var/lib/apt/lists/* 3 | WORKDIR /tmp/ 4 | COPY src/ ./src/ 5 | COPY build* ./ 6 | RUN curl -O https://ziglang.org/download/0.13.0/zig-linux-x86_64-0.13.0.tar.xz && \ 7 | xz --decompress zig-linux-x86_64-0.13.0.tar.xz && tar -xf zig-linux-x86_64-0.13.0.tar && \ 8 | ./zig-linux-x86_64-0.13.0/zig build -Doptimize=ReleaseFast --summary all 9 | 10 | FROM debian:stable-slim 11 | RUN set -x && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y curl ca-certificates && rm -rf /var/lib/apt/lists/* 12 | WORKDIR /app/ 13 | COPY --from=0 /tmp/zig-out/bin/zgroup . 
14 | ENTRYPOINT ["/app/zgroup"] 15 | CMD ["group1", "0.0.0.0:8080"] 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 flowerinthenight 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /k8s.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: zgroup 7 | spec: 8 | selector: 9 | matchLabels: 10 | app: zgroup 11 | replicas: 1 12 | revisionHistoryLimit: 5 13 | template: 14 | metadata: 15 | labels: 16 | app: zgroup 17 | spec: 18 | containers: 19 | - name: zgroup 20 | image: quay.io/flowerinthenight/zgroup:v0.3.2 21 | command: ["/bin/sh"] 22 | args: ["-c", '/app/zgroup group1 ${K8S_MY_POD_IP}:8080'] 23 | resources: 24 | requests: 25 | cpu: 100m 26 | memory: 500Mi 27 | limits: 28 | cpu: 100m 29 | memory: 500Mi 30 | imagePullPolicy: Always 31 | env: 32 | - name: K8S_MY_POD_IP 33 | valueFrom: 34 | fieldRef: 35 | fieldPath: status.podIP 36 | - name: GET_HOSTS_FROM 37 | value: dns 38 | - name: ZGROUP_JOIN_PREFIX 39 | value: "c06a9044-856d-4583-8095-c57d37272b05" 40 | ports: 41 | - containerPort: 8080 42 | 43 | --- 44 | 45 | apiVersion: autoscaling/v1 46 | kind: HorizontalPodAutoscaler 47 | metadata: 48 | name: zgroup-hpa 49 | spec: 50 | scaleTargetRef: 51 | apiVersion: apps/v1 52 | kind: Deployment 53 | name: zgroup 54 | minReplicas: 3 55 | maxReplicas: 3 56 | targetCPUUtilizationPercentage: 40 57 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | tags: 7 | - '*' 8 | pull_request: 9 | branches: [ "main" ] 10 | 11 | jobs: 12 | codeberg: 13 | name: Codeberg 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Mirror to Codeberg 22 | uses: yesolutions/mirror-action@master 23 | with: 24 | REMOTE: "https://codeberg.org/flowerinthenight/zgroup.git" 25 | GIT_USERNAME: flowerinthenight 26 | GIT_PASSWORD: ${{ secrets.GIT_PASSWORD }} 27 | 28 | build: 29 | name: Build 30 | if: 
"!contains(github.event.commits[0].message, 'ci skip')" 31 | runs-on: ubuntu-latest 32 | steps: 33 | - name: Print GH context 34 | env: 35 | GITHUB_CONTEXT: ${{ toJson(github) }} 36 | run: | 37 | echo "$GITHUB_CONTEXT" 38 | 39 | - name: Checkout code 40 | uses: actions/checkout@v2 41 | 42 | - name: Setup Zig 43 | uses: mlugg/setup-zig@v1 44 | with: 45 | version: 0.13.0 46 | 47 | - name: Run tests 48 | run: zig build test 49 | 50 | - name: Release from tags 51 | if: startsWith(github.event.ref, 'refs/tags/v') 52 | env: 53 | GH_TOKEN: ${{ secrets.GH_TOKEN }} 54 | run: | 55 | zig build -Doptimize=ReleaseFast -Dtarget=x86_64-linux --summary all 56 | cp zig-out/bin/zgroup ./ 57 | tar czvf zgroup-${GITHUB_REF_NAME}-x86_64-linux.tar.gz zgroup 58 | gh release create ${GITHUB_REF_NAME} ./*.tar.gz --generate-notes 59 | -------------------------------------------------------------------------------- /src/scratch.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const builtin = std.builtin; 3 | const AtomicOrder = std.builtin.AtomicOrder; 4 | const AtomicRmwOp = std.builtin.AtomicRmwOp; 5 | const backoff = @import("zbackoff"); 6 | const zgroup = @import("zgroup.zig"); 7 | const dbg = std.debug.print; 8 | 9 | const pdata = struct { 10 | ev1: *std.Thread.ResetEvent, 11 | ev2: *std.Thread.ResetEvent, 12 | }; 13 | 14 | fn waiter(p: *pdata) void { 15 | for (0..2) |i| { 16 | dbg("{d} start wait1\n", .{i}); 17 | p.ev1.wait(); 18 | dbg("{d} end wait1, call reset\n", .{i}); 19 | p.ev1.reset(); 20 | 21 | dbg("{d} start wait2\n", .{i}); 22 | p.ev2.wait(); 23 | dbg("{d} end wait2, call reset\n", .{i}); 24 | p.ev2.reset(); 25 | } 26 | } 27 | 28 | fn testWaiter() !void { 29 | var ev1 = std.Thread.ResetEvent{}; 30 | var ev2 = std.Thread.ResetEvent{}; 31 | var data = pdata{ .ev1 = &ev1, .ev2 = &ev2 }; 32 | 33 | const t = try std.Thread.spawn(.{}, waiter, .{&data}); 34 | t.detach(); 35 | 36 | std.time.sleep(std.time.ns_per_s * 5); 37 | ev1.set(); 38 | ev2.set(); 39 | std.time.sleep(std.time.ns_per_s * 5); 40 | ev1.set(); 41 | ev2.set(); 42 | std.time.sleep(std.time.ns_per_s * 5); 43 | } 44 | 45 | test "backoff" { 46 | // Try referencing external dep in test block. 
47 | const bo = backoff.Backoff{}; 48 | dbg("val={any}\n", .{bo.initial}); 49 | } 50 | 51 | test "atomic" { 52 | var tm = try std.time.Timer.start(); 53 | var v: u64 = 0; 54 | @atomicStore(u64, &v, 1, AtomicOrder.seq_cst); 55 | _ = @atomicLoad(u64, &v, AtomicOrder.seq_cst); 56 | // print("load={d}\n", .{a}); 57 | _ = @atomicRmw(u64, &v, AtomicRmwOp.Add, 1e9, AtomicOrder.seq_cst); 58 | _ = @atomicLoad(u64, &v, AtomicOrder.seq_cst); 59 | // print("add={d}\n", .{b}); 60 | dbg("took {any}\n", .{std.fmt.fmtDuration(tm.read())}); 61 | } 62 | 63 | test "view" { 64 | const en = enum(u4) { 65 | change, 66 | do, 67 | start, 68 | }; 69 | 70 | const e: en = .start; 71 | dbg("size={d}\n", .{@sizeOf(@TypeOf(e))}); 72 | const ee: en = @enumFromInt(2); 73 | dbg("int={any}\n", .{ee}); 74 | 75 | const val = 17293822569102704642; // 2 76 | dbg("cmd={x}\n", .{(val & 0xf000000000000000) >> 60}); 77 | dbg("val={x}\n", .{val & 0x0fffffffffffffff}); 78 | dbg("{x}\n", .{0xffffffffffffffff & (0b11 << 62)}); 79 | } 80 | 81 | // test "httpget" { 82 | // var parent = std.heap.ArenaAllocator.init(std.testing.allocator); 83 | // defer parent.deinit(); 84 | // const arena = parent.allocator(); 85 | 86 | // var client = std.http.Client{ .allocator = arena }; 87 | // defer client.deinit(); 88 | 89 | // const endpoint = "https://keyvalue.immanuel.co/api/KeyVal/GetValue/seegmed7/chew"; 90 | // const uri = try std.Uri.parse(endpoint); 91 | 92 | // const server_header_buffer: []u8 = try arena.alloc(u8, 8 * 1024 * 4); 93 | // var req = try client.open(.GET, uri, std.http.Client.RequestOptions{ 94 | // .server_header_buffer = server_header_buffer, 95 | // }); 96 | 97 | // defer req.deinit(); 98 | 99 | // try req.send(); 100 | // try req.finish(); 101 | // try req.wait(); 102 | 103 | // const repstr = try req.reader().readAllAlloc(arena, std.math.maxInt(usize)); 104 | 105 | // dbg("reply={s}\n", .{repstr}); 106 | // } 107 | 108 | // test "httppost" { 109 | // var parent = std.heap.ArenaAllocator.init(std.testing.allocator); 110 | // defer parent.deinit(); 111 | // const arena = parent.allocator(); 112 | 113 | // var client = std.http.Client{ .allocator = arena }; 114 | // defer client.deinit(); 115 | 116 | // const endpoint = "https://keyvalue.immanuel.co/api/KeyVal/UpdateValue/seegmed7/chew/something"; 117 | // const uri = try std.Uri.parse(endpoint); 118 | 119 | // const server_header_buffer: []u8 = try arena.alloc(u8, 8 * 1024 * 4); 120 | // var req = try client.open(.POST, uri, std.http.Client.RequestOptions{ 121 | // .server_header_buffer = server_header_buffer, 122 | // .extra_headers = &[_]std.http.Header{.{ .name = "content-length", .value = "9" }}, 123 | // }); 124 | 125 | // defer req.deinit(); 126 | 127 | // try req.send(); 128 | // try req.finish(); 129 | // try req.wait(); 130 | 131 | // const repstr = try req.reader().readAllAlloc(arena, std.math.maxInt(usize)); 132 | 133 | // dbg("reply={s}\n", .{repstr}); 134 | // } 135 | 136 | // test "httpfetch" { 137 | // var parent = std.heap.ArenaAllocator.init(std.testing.allocator); 138 | // defer parent.deinit(); 139 | // const arena = parent.allocator(); 140 | 141 | // var client = std.http.Client{ .allocator = arena }; 142 | // defer client.deinit(); 143 | 144 | // // https://api.keyval.org/get/chew 145 | // // const endpoint = "https://keyvalue.immanuel.co/api/KeyVal/UpdateValue/seegmed7/chew/something"; 146 | // const endpoint = "https://api.keyval.org/set/chew/bloodboil"; 147 | // const uri = try std.Uri.parse(endpoint); 148 | 149 | // var response_body = 
std.ArrayList(u8).init(arena); 150 | 151 | // const response = try client.fetch(std.http.Client.FetchOptions{ 152 | // .method = std.http.Method.POST, 153 | // .location = .{ .uri = uri }, 154 | // // .extra_headers = &[_]std.http.Header{.{ .name = "Content-Length", .value = "9" }}, 155 | // .response_storage = .{ .dynamic = &response_body }, 156 | // }); 157 | 158 | // if (response.status != .ok) dbg("booooooo\n", .{}); 159 | 160 | // const parsed_body = try response_body.toOwnedSlice(); 161 | // dbg("RESPONSE: {s}\n", .{parsed_body}); 162 | // } 163 | 164 | test "returnblock" { 165 | { 166 | dbg("block entry\n", .{}); 167 | defer dbg("block exit\n", .{}); 168 | if (true) return; 169 | } 170 | 171 | dbg("should not be here\n", .{}); 172 | } 173 | 174 | test "shift" { 175 | // 0xFFFF.FFFF.FFFF.FFFF 176 | const on = 1 << 63; 177 | dbg("{X}\n", .{on}); 178 | const val = (on & 0x8000000000000000) >> 63; 179 | dbg("{d}\n", .{val}); 180 | const min = 21 << 31; 181 | dbg("{d}\n", .{(min & 0x7FFFFFFF80000000) >> 31}); 182 | } 183 | 184 | test "comp" { 185 | var empty = try std.fmt.allocPrint(std.testing.allocator, "", .{}); 186 | dbg("len_empty={d}\n", .{empty.len}); 187 | const str = try std.fmt.allocPrint(std.testing.allocator, "hello", .{}); 188 | defer std.testing.allocator.free(str); 189 | empty = str; 190 | dbg("len_empty={d}\n", .{empty.len}); 191 | } 192 | 193 | test "envmap" { 194 | const allocator = std.testing.allocator; 195 | var envmap = try std.process.getEnvMap(allocator); 196 | defer envmap.deinit(); 197 | 198 | var iter = envmap.iterator(); 199 | while (iter.next()) |v| { 200 | dbg("{s}={s}\n", .{ v.key_ptr.*, v.value_ptr.* }); 201 | } 202 | 203 | const path = envmap.getPtr("PATH"); 204 | if (path) |v| { 205 | dbg("PATH={s}\n", .{v.*}); 206 | } else { 207 | dbg("no PATH\n", .{}); 208 | } 209 | } 210 | 211 | test "fba" { 212 | const mem = try std.testing.allocator.alloc(u8, 8); 213 | defer std.testing.allocator.free(mem); 214 | var _fba = std.heap.FixedBufferAllocator.init(mem); 215 | var fba = _fba.allocator(); 216 | dbg("0x{X}\n", .{mem}); 217 | var m0 = try fba.alloc(u8, 1); 218 | m0[0] = 0xFF; 219 | dbg("0x{X}\n", .{mem}); 220 | var m1 = try fba.alloc(u8, 1); 221 | m1[0] = 0x7F; 222 | dbg("0x{X}\n", .{mem}); 223 | var m2 = try fba.alloc(u8, 1); 224 | m2[0] = 0x8F; 225 | dbg("0x{X}\n", .{mem}); 226 | var m3 = try fba.alloc(u8, 1); 227 | m3[0] = 0x9F; 228 | dbg("0x{X}\n", .{mem}); 229 | var m = try fba.alloc(u8, 4); 230 | m[0] = 0xBF; 231 | m[1] = 0xBF; 232 | m[2] = 0xBF; 233 | m[3] = 0xBF; 234 | dbg("0x{X}\n", .{mem}); 235 | 236 | fba.free(m0); 237 | fba.free(m1); 238 | fba.free(m3); 239 | dbg("0x{X}\n", .{mem}); 240 | 241 | var mx = try fba.alloc(u8, 2); 242 | mx[0] = 0x21; 243 | mx[1] = 0x21; 244 | dbg("0x{X}\n", .{mem}); 245 | } 246 | -------------------------------------------------------------------------------- /src/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const zgroup = @import("zgroup.zig"); 3 | const backoff = @import("zbackoff"); 4 | 5 | const log = std.log; 6 | 7 | // You can change zgroup's log-level to .info. 8 | pub const std_options = .{ 9 | .log_level = .info, 10 | .log_scope_levels = &[_]std.log.ScopeLevel{ 11 | .{ .scope = .zgroup, .level = .debug }, 12 | }, 13 | }; 14 | 15 | // To be passed to our callback(s). 
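// `prefix` comes from the $ZGROUP_JOIN_PREFIX environment variable and `group` is the
// group name (args[1]); together they form the K/V key used by the join-address
// callbacks below. `skip_callback` is set when a join address is passed on the
// command line, which turns onJoinAddr into a no-op.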
16 | const UserData = struct { 17 | prefix: []const u8, 18 | group: []const u8, 19 | skip_callback: bool = false, 20 | }; 21 | 22 | const Fleet = zgroup.Fleet(UserData); 23 | 24 | // A sample binary on how to use the zgroup library. 25 | // Expected cmdline args: 26 | // 27 | // [0] = bin 28 | // [1] = name 29 | // [2] = member ip:port 30 | // [3] = join ip:port (optional) 31 | // 32 | pub fn main() !void { 33 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 34 | var aa = std.heap.ArenaAllocator.init(gpa.allocator()); 35 | defer aa.deinit(); // destroy arena in one go 36 | const arena = aa.allocator(); 37 | 38 | // Collect process args to a map. 39 | var args = try std.process.argsWithAllocator(arena); 40 | var hm = std.AutoHashMap(usize, []const u8).init(arena); 41 | var i: usize = 0; 42 | while (args.next()) |val| : (i += 1) { 43 | const arg = try std.fmt.allocPrint(arena, "{s}", .{val}); 44 | try hm.put(i, arg); 45 | } 46 | 47 | if (hm.count() < 3) { 48 | log.err("invalid args", .{}); 49 | return; 50 | } 51 | 52 | var iter = hm.iterator(); 53 | while (iter.next()) |v| 54 | log.info("args[{d}]: {s}", .{ v.key_ptr.*, v.value_ptr.* }); 55 | 56 | // Required: so we can have our own unique URL in the free service. 57 | var envmap = try std.process.getEnvMap(arena); 58 | if (hm.count() == 3) { 59 | const jp = envmap.getPtr("ZGROUP_JOIN_PREFIX"); 60 | if (jp) |_| {} else { 61 | log.err("no $ZGROUP_JOIN_PREFIX envvar found", .{}); 62 | return; 63 | } 64 | } 65 | 66 | const name = hm.getEntry(1).?.value_ptr.*; 67 | 68 | var data = UserData{ 69 | .prefix = b: { 70 | const jp = envmap.getPtr("ZGROUP_JOIN_PREFIX"); 71 | if (jp) |v| break :b v.* else { 72 | break :b try std.fmt.allocPrint(arena, "", .{}); 73 | } 74 | }, 75 | .group = name, 76 | }; 77 | 78 | const callbacks = Fleet.Callbacks{ 79 | .data = &data, // arbitrary callback data 80 | 81 | // Callback function for the join address. 82 | .onJoinAddr = onJoinAddr, 83 | 84 | // So we won't overload the free service we are using. 85 | .on_join_every = 50, 86 | }; 87 | 88 | var member = hm.getEntry(2).?.value_ptr.*; 89 | var sep = std.mem.indexOf(u8, member, ":").?; 90 | 91 | var cfg = Fleet.Config{ 92 | .name = name, 93 | .ip = member[0..sep], 94 | .callbacks = callbacks, 95 | }; 96 | 97 | cfg.port = try std.fmt.parseUnsigned(u16, member[sep + 1 ..], 10); 98 | 99 | var fleet = try Fleet.init(gpa.allocator(), &cfg); 100 | try fleet.run(); // actual run, join later 101 | defer fleet.deinit(); 102 | 103 | i = 0; 104 | var joined = false; 105 | var bo = backoff.Backoff{}; 106 | while (true) : (i += 1) { 107 | if (joined) 108 | std.time.sleep(std.time.ns_per_s * 1) 109 | else 110 | std.time.sleep(if (i >= 100) std.time.ns_per_s else bo.pause()); 111 | 112 | if (i > 1 and i < 100 and !joined) { 113 | switch (hm.count()) { 114 | 3 => { 115 | // No join address in args. Try using a free discovery service. 
116 | var join_addr: []const u8 = ""; 117 | const ja = try getJoinAddress( 118 | arena, 119 | envmap.getPtr("ZGROUP_JOIN_PREFIX").?.*, 120 | name, 121 | ); 122 | 123 | if (ja.len > 0) join_addr = ja else continue; 124 | 125 | log.info("[{d}] join address found, addr={s}", .{ i, join_addr }); 126 | 127 | sep = std.mem.indexOf(u8, join_addr, ":").?; 128 | const join_port = try std.fmt.parseUnsigned( 129 | u16, 130 | join_addr[sep + 1 ..], 131 | 10, 132 | ); 133 | 134 | fleet.join( 135 | name, 136 | join_addr[0..sep], 137 | join_port, 138 | &joined, 139 | ) catch |err| 140 | log.err("joining thru {s}:{d} failed: {any}", .{ 141 | join_addr[0..sep], 142 | join_port, 143 | err, 144 | }); 145 | }, 146 | 4 => { 147 | // Join address is provided. Skip callback. 148 | data.skip_callback = true; 149 | 150 | const join = hm.getEntry(3).?.value_ptr.*; 151 | sep = std.mem.indexOf(u8, join, ":").?; 152 | const join_ip = join[0..sep]; 153 | if (join_ip.len == 0) { 154 | log.err("invalid join address", .{}); 155 | return; 156 | } 157 | 158 | const join_port = try std.fmt.parseUnsigned( 159 | u16, 160 | join[sep + 1 ..], 161 | 10, 162 | ); 163 | 164 | fleet.join( 165 | name, 166 | join_ip, 167 | join_port, 168 | &joined, 169 | ) catch |err| log.err("join failed: {any}", .{err}); 170 | }, 171 | else => {}, 172 | } 173 | } 174 | 175 | // Sample code on getting the current members in the group. 176 | if (i > 0 and @mod(i, 10) == 0) { 177 | const members = try fleet.getMembers(gpa.allocator()); 178 | defer members.deinit(); 179 | log.info("main: members={d}", .{members.items.len}); 180 | for (members.items) |v| gpa.allocator().free(v); 181 | } 182 | } 183 | } 184 | 185 | // The allocator here is the allocator passed to Fleet's init function. `addr`'s 186 | // format is "ip:port", e.g. "127.0.0.1:8080", and needs to be freed after use. 187 | fn onJoinAddr(allocator: std.mem.Allocator, data: ?*UserData, addr: []const u8) !void { 188 | defer allocator.free(addr); 189 | if (data.?.skip_callback) return; 190 | try setJoinAddress(allocator, data.?.prefix, data.?.group, addr); 191 | } 192 | 193 | // We are using curl here as std.http.Client seems to not play well with this endpoint. 194 | // The "seegmed7" in the url is our API key. 195 | fn setJoinAddress( 196 | allocator: std.mem.Allocator, 197 | prefix: []const u8, 198 | group: []const u8, 199 | addr: []const u8, 200 | ) !void { 201 | const enc = std.base64.Base64Encoder.init(std.base64.url_safe_alphabet_chars, '='); 202 | const buf = try allocator.alloc(u8, enc.calcSize(addr.len)); 203 | defer allocator.free(buf); 204 | const out = enc.encode(buf, addr); 205 | const url = try std.fmt.allocPrint( 206 | allocator, 207 | "https://keyvalue.immanuel.co/api/KeyVal/UpdateValue/seegmed7/{s}-{s}/{s}", 208 | .{ prefix, group, out }, 209 | ); 210 | 211 | defer allocator.free(url); 212 | 213 | log.info("callback: setJoinAddress: url={s}", .{url}); 214 | 215 | const result = try std.process.Child.run(.{ 216 | .allocator = allocator, 217 | .argv = &[_][]const u8{ 218 | "curl", 219 | "-X", 220 | "POST", 221 | "-H", 222 | "Content-Length: 1", // somehow, this works with this endpoint (required though) 223 | url, 224 | }, 225 | }); 226 | 227 | defer { 228 | allocator.free(result.stdout); 229 | allocator.free(result.stderr); 230 | } 231 | } 232 | 233 | // We are using curl here as std.http.Client seems to not play well with this endpoint. 234 | // The "seegmed7" in the url is our API key. We are passing an arena allocator here. 
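// Returns the base64-decoded join address in "ip:port" form. The returned slice is
// allocated from `allocator` (an arena in this sample), so it is released when the
// arena is deinitialized.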
235 | fn getJoinAddress(allocator: std.mem.Allocator, prefix: []const u8, group: []const u8) ![]u8 { 236 | const url = try std.fmt.allocPrint( 237 | allocator, 238 | "https://keyvalue.immanuel.co/api/KeyVal/GetValue/seegmed7/{s}-{s}", 239 | .{ prefix, group }, 240 | ); 241 | 242 | log.info("callback: getJoinAddress: url={s}", .{url}); 243 | 244 | const result = try std.process.Child.run(.{ 245 | .allocator = allocator, 246 | .argv = &[_][]const u8{ "curl", url }, 247 | }); 248 | 249 | const out = std.mem.trim(u8, result.stdout, "\""); 250 | const dec = std.base64.Base64Decoder.init(std.base64.url_safe_alphabet_chars, '='); 251 | const buf = try allocator.alloc(u8, try dec.calcSizeForSlice(out)); 252 | try dec.decode(buf, out); 253 | return buf; 254 | } 255 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > [!WARNING] 2 | > Still in alpha stage. APIs may change. 3 | 4 | --- 5 | 6 | [![main](https://github.com/flowerinthenight/zgroup/actions/workflows/main.yml/badge.svg)](https://github.com/flowerinthenight/zgroup/actions/workflows/main.yml) 7 | [![Docker Repository on Quay](https://quay.io/repository/flowerinthenight/zgroup/status "Docker Repository on Quay")](https://quay.io/repository/flowerinthenight/zgroup) 8 | 9 | (This repo is mirrored to [https://codeberg.org/flowerinthenight/zgroup](https://codeberg.org/flowerinthenight/zgroup)). 10 | 11 | ## Overview 12 | 13 | **zgroup** is a [Zig](https://ziglang.org/) library that can manage cluster membership and member failure detection. It uses a combination of [SWIM Protocol](https://www.cs.cornell.edu/projects/Quicksilver/public_pdfs/SWIM.pdf)'s gossip-style information dissemination and [Raft](https://raft.github.io/raft.pdf)'s leader election algorithm (minus the log management) to track cluster changes. 14 | 15 | ### On payload size 16 | 17 | One of zgroup's main goals is to be able to track clusters with sizes that can change dynamically over time (e.g. [Kubernetes Deployments](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/), [GCP Instance Groups](https://cloud.google.com/compute/docs/instance-groups), [AWS Autoscaling Groups](https://docs.aws.amazon.com/autoscaling/ec2/userguide/auto-scaling-groups.html), etc.) with minimal dependencies and network load. All of my previous related works so far depend on some external service (see [spindle](https://github.com/flowerinthenight/spindle), [hedge](https://github.com/flowerinthenight/hedge)), using traditional heartbeating, to achieve this. This heartbeating technique usually suffers from increasing payload sizes (proportional to cluster sizes) as clusters get bigger. But I wanted a system that doesn't suffer from that side effect. Enter [SWIM](https://www.cs.cornell.edu/projects/Quicksilver/public_pdfs/SWIM.pdf)'s infection-style information dissemination. It can use a constant payload size regardless of the cluster size. SWIM uses a combination of `PING`s, `INDIRECT-PING`s, and `ACK`s to detect member failures while piggybacking on these same messages to propagate membership updates (gossip protocol). Currently, zgroup only uses SWIM's direct probing protocol; it doesn't fully implement the Suspicion sub-protocol (yet). 18 | 19 | At the moment, zgroup uses a single, 64-byte payload for all its messages, including leader election (see below).
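For illustration only (the library's actual wire format is the packed `Message` struct in [src/zgroup.zig](./src/zgroup.zig)), here is a minimal sketch of the idea with hypothetical field names: the payload is one fixed-size packed struct, so its size does not depend on how many members are in the group.

```zig
const std = @import("std");

// Hypothetical sketch only; the real wire format is the packed `Message`
// struct in src/zgroup.zig. The point: one fixed-size packed struct is used
// for every message, so the payload never grows with the member count.
const Wire = packed struct {
    name: u64 = 0, // group name, truncated to 8 bytes
    cmd: u8 = 0, // command/opcode
    src_ip: u32 = 0, // sender IPv4
    src_port: u16 = 0, // sender port
    src_incarnation: u64 = 0, // sender incarnation number
};

comptime {
    // Compile-time guarantee that a message always fits the fixed budget.
    std.debug.assert(@sizeOf(Wire) <= 64);
}

test "constant payload size" {
    try std.testing.expect(@sizeOf(Wire) <= 64);
}
```

zgroup sends its real struct as raw bytes over UDP via `std.mem.asBytes` (see `udpListen()` in `src/zgroup.zig`), so every datagram stays the same size no matter how large the cluster gets.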
20 | 21 | ### On leader election 22 | 23 | I also wanted some sort of leader election capability without depending on an external lock service. At the moment, zgroup uses [Raft](https://raft.github.io/raft.pdf)'s leader election algorithm sub-protocol (without the log management) to achieve this. I should note that Raft's leader election algorithm depends on stable membership for it to work properly, so zgroup's leader election works on a best-effort basis only; split-brain can still happen while the cluster size is changing. Additional code guards are added to minimize split-brain in these scenarios, but it's not completely eliminated. In my use-case (and testing), gradual cluster size changes are mostly stable, while sudden changes with huge size deltas are not. For example, a big, sudden jump from three nodes (zgroup's minimum size) to, say, a hundred, due to autoscaling, would cause a split-brain. Once the target size is achieved, however, a single leader will always be elected. 24 | 25 | A note on Raft's random timeout range during leader election: zgroup's leader tracks ping latency averages and attempts to adjust the timeout range accordingly to accommodate cluster size changes over time. 26 | 27 | ### Join address 28 | 29 | For a node to join an existing cluster, it needs a join address. While zgroup exposes a `join()` function for this, it also provides a callback mechanism that supplies callers with the join address. This address can then be stored in an external store for the other nodes to use. Internally, zgroup uses the node with the highest IP(v4) address in the group. 30 | 31 | ## Sample binary 32 | 33 | A [sample](./src/main.zig) binary is provided to show a way to use the library. There are two ways to run the sample: 34 | 35 | * Specifying the join address manually 36 | * Using an external service to get the join address 37 | 38 | ### Local with join address 39 | 40 | ```sh 41 | # Build the sample binary: 42 | $ zig build --summary all 43 | 44 | # Run the 1st process. The expected args look like: 45 | # 46 | # ./zgroup groupname member_ip:port [join_ip:port] 47 | # 48 | 49 | # Run the first process (join to self). 50 | $ ./zig-out/bin/zgroup group1 0.0.0.0:8080 0.0.0.0:8080 51 | 52 | # Then you can run additional instances. 53 | # Join through the 1st process/node (different terminal): 54 | $ ./zig-out/bin/zgroup group1 0.0.0.0:8081 0.0.0.0:8080 55 | 56 | # Join through the 2nd process/node (different terminal): 57 | $ ./zig-out/bin/zgroup group1 0.0.0.0:8082 0.0.0.0:8081 58 | 59 | # Join through the 1st process/node (different terminal): 60 | $ ./zig-out/bin/zgroup group1 0.0.0.0:8083 0.0.0.0:8080 61 | 62 | # and so on... 63 | ``` 64 | 65 | ### Local with an external service 66 | 67 | If configured, the sample binary uses a free service, [https://keyvalue.immanuel.co/](https://keyvalue.immanuel.co/), as a store for the join address. 68 | 69 | ```sh 70 | # Build the sample binary: 71 | $ zig build --summary all 72 | 73 | # Generate UUID: 74 | $ uuidgen 75 | {output} 76 | 77 | # Run the 1st process.
The expected args look like: 78 | # 79 | # ./zgroup groupname member_ip:port 80 | # 81 | 82 | # Run the first process: 83 | $ ZGROUP_JOIN_PREFIX={output} ./zig-out/bin/zgroup group1 0.0.0.0:8080 84 | 85 | # Add a second node (different terminal): 86 | $ ZGROUP_JOIN_PREFIX={output} ./zig-out/bin/zgroup group1 0.0.0.0:8081 87 | 88 | # Add a third node (different terminal): 89 | $ ZGROUP_JOIN_PREFIX={output} ./zig-out/bin/zgroup group1 0.0.0.0:8082 90 | 91 | # Add a fourth node (different terminal): 92 | $ ZGROUP_JOIN_PREFIX={output} ./zig-out/bin/zgroup group1 0.0.0.0:8083 93 | 94 | # and so on... 95 | ``` 96 | 97 | ### Kubernetes (Deployment) 98 | 99 | A sample Kubernetes [deployment file](./k8s.yaml) is provided to try zgroup on [Kubernetes Deployments](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). Before deploying though, make sure to update the `ZGROUP_JOIN_PREFIX` environment variable, like so: 100 | 101 | ```sh 102 | # Generate UUID: 103 | $ uuidgen 104 | {output} 105 | 106 | # Update the 'value' part with your output. 107 | ... 108 | - name: ZGROUP_JOIN_PREFIX 109 | value: "{output}" 110 | ... 111 | 112 | # Deploy to Kubernetes: 113 | $ kubectl create -f k8s.yaml 114 | 115 | # You will notice some initial errors in the logs. 116 | # Wait for a while before the K/V store is updated. 117 | ``` 118 | 119 | ### GCP Managed Instance Group (MIG) 120 | 121 | A sample [startup script](./startup-gcp-mig.sh) is provided to try zgroup on a [GCP MIG](https://cloud.google.com/compute/docs/instance-groups#managed_instance_groups). Before deploying though, make sure to update the `ZGROUP_JOIN_PREFIX` value in the script, like so: 122 | 123 | ```sh 124 | # Generate UUID: 125 | $ uuidgen 126 | {output} 127 | 128 | # Update the 'value' part of ZGROUP_JOIN_PREFIX with your output. 129 | ... 130 | ZGROUP_JOIN_PREFIX={output} ./zgroup group1 ... 131 | 132 | # Create an instance template: 133 | $ gcloud compute instance-templates create zgroup-tmpl \ 134 | --machine-type e2-micro \ 135 | --metadata=startup-script=''"$(cat startup-gcp-mig.sh)"'' 136 | 137 | # Create a regional MIG: 138 | $ gcloud compute instance-groups managed create rmig \ 139 | --template zgroup-tmpl --size 3 --region {your-region} 140 | 141 | # You can view the logs through: 142 | $ tail -f /var/log/messages 143 | ``` 144 | 145 | ### AWS Autoscaling Group 146 | 147 | A sample [startup script](./startup-aws-asg.sh) is provided to try zgroup on an [AWS ASG](https://docs.aws.amazon.com/autoscaling/ec2/userguide/auto-scaling-groups.html). Before deploying though, make sure to update the `ZGROUP_JOIN_PREFIX` value in the script, like so: 148 | 149 | ```sh 150 | # Generate UUID: 151 | $ uuidgen 152 | {output} 153 | 154 | # Update the 'value' part of ZGROUP_JOIN_PREFIX with your output. 155 | ... 156 | ZGROUP_JOIN_PREFIX={output} ./zgroup group1 ... 157 | 158 | # Create a launch template. ImageId here is Amazon Linux, default VPC. 159 | # (Added newlines for readability. Might not run when copied as is.) 
160 | $ aws ec2 create-launch-template \ 161 | --launch-template-name zgroup-lt \ 162 | --version-description version1 \ 163 | --launch-template-data ' 164 | { 165 | "UserData":"'"$(cat startup-aws-asg.sh | base64 -w 0)"'", 166 | "ImageId":"ami-0f75d1a8c9141bd00", 167 | "InstanceType":"t2.micro" 168 | }' 169 | 170 | # Create the ASG: 171 | $ aws autoscaling create-auto-scaling-group \ 172 | --auto-scaling-group-name zgroup-asg \ 173 | --launch-template LaunchTemplateName=zgroup-lt,Version='1' \ 174 | --min-size 3 \ 175 | --max-size 3 \ 176 | --availability-zones {target-zone} 177 | 178 | # You can view the logs through: 179 | $ [sudo] journalctl -f 180 | ``` 181 | 182 | ## Getting the list of members 183 | 184 | To get the current members of the group, you can try something like: 185 | 186 | ```zig 187 | const members = try fleet.getMembers(gpa.allocator()); 188 | defer members.deinit(); 189 | 190 | for (members.items, 0..) |v, i| { 191 | defer gpa.allocator().free(v); 192 | log.info("member[{d}]: {s}", .{ i, v }); 193 | } 194 | ``` 195 | 196 | The tricky part of using zgroup is configuring the timeouts to optimize state dissemination and convergence. The current implementation was only tested within a local network. 197 | 198 | ## TODOs 199 | 200 | - [ ] - Provide callbacks for membership changes 201 | - [ ] - Provide an API to get the current leader 202 | - [ ] - Provide an interface for other processes (non-Zig users) 203 | - [ ] - Use multicast (if available) for the join address 204 | 205 | PR's are welcome. 206 | -------------------------------------------------------------------------------- /src/zgroup.zig: -------------------------------------------------------------------------------- 1 | //! zgroup is a library that can manage cluster membership and member failure detection. 2 | //! It is based on the SWIM Protocol and Raft's leader election algorithm sub-protocol 3 | //! (without the log management). 4 | //! 5 | //! References: 6 | //! 7 | //! https://www.cs.cornell.edu/projects/Quicksilver/public_pdfs/SWIM.pdf 8 | //! https://raft.github.io/raft.pdf 9 | //! 10 | const std = @import("std"); 11 | const backoff = @import("zbackoff"); 12 | 13 | const log = std.log.scoped(.zgroup); 14 | 15 | pub fn Fleet(UserData: type) type { 16 | return struct { 17 | const Self = @This(); 18 | 19 | allocator: std.mem.Allocator, 20 | 21 | // See Config comments for these fields. 22 | name: []const u8, 23 | ip: []const u8, 24 | port: u16, 25 | proto_time: u64, 26 | suspect_time: u64, 27 | ping_req_k: u32, 28 | elex_delay: u64, 29 | 30 | // Our per-member data. Key format is "ip:port", eg. "127.0.0.1:8080". 31 | members: std.StringHashMap(MemberData), 32 | members_mtx: std.Thread.Mutex = .{}, 33 | 34 | // Long-term references to all keys used in `members` and other intermediate 35 | // copies. Safer for access amidst all the addition and removals of items. 36 | refkeys: std.StringHashMap(void), 37 | refkeys_mtx: std.Thread.Mutex = .{}, 38 | 39 | // Intermediate member queue for round-robin pings and randomization. 40 | ping_queue: std.ArrayList([]const u8), 41 | 42 | // For requesting our indirect ping agent(s). 43 | ping_req_data: *RequestPing = undefined, // set in run() 44 | ping_req_0: std.Thread.ResetEvent = .{}, // request 45 | ping_req_1: std.Thread.ResetEvent = .{}, // response 46 | 47 | // Join address heartbeat timeout. 48 | join_addr_tm: std.time.Timer, 49 | 50 | callbacks: Callbacks, 51 | 52 | // Raft-inspired leader election. 
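// Rough flow, as implemented in leaderElectionTick() and the .heartbeat/.req4votes
// handlers in udpListen(): a follower that hasn't heard a leader heartbeat within its
// election timeout increments elex_term, votes for itself, and becomes a candidate
// that asks the other alive members for votes; a heartbeat carrying an equal or
// higher term resets the node back to follower and records the sender as the leader.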
53 | elex_mtx: std.Thread.Mutex = .{}, 54 | elex_join: bool = false, 55 | elex_join_tm: std.time.Timer, 56 | elex_term: u64 = 0, 57 | elex_state: ElectionState = .follower, 58 | votes: u32 = 0, 59 | voted_for: []const u8, 60 | elex_tm: std.time.Timer, 61 | candidate_tm: std.time.Timer, 62 | elex_tm_min: u64, // set via config 63 | elex_tm_max: u64, // set via config 64 | leader: []const u8, 65 | 66 | const ElectionState = enum(u8) { 67 | follower, 68 | candidate, 69 | leader, 70 | }; 71 | 72 | // SWIM protocol generic commands. 73 | const Command = enum(u8) { 74 | noop, 75 | ack, 76 | nack, 77 | join, 78 | ping, 79 | ping_req, 80 | heartbeat, 81 | req4votes, 82 | join2leader, 83 | }; 84 | 85 | // Infection-style dissemination (ISD) commands. 86 | const IsdCommand = enum(u8) { 87 | noop, 88 | infect, 89 | suspect, 90 | confirm_alive, 91 | confirm_faulty, 92 | }; 93 | 94 | // Possible member liveness states. 95 | const Liveness = enum(u8) { 96 | alive, 97 | suspected, 98 | faulty, 99 | }; 100 | 101 | const KeyInfo = struct { 102 | key: []const u8, 103 | liveness: Liveness, 104 | incarnation: u64 = 0, 105 | isd_cmd: IsdCommand = .noop, 106 | }; 107 | 108 | // Our generic UDP comms/protocol payload. 109 | const Message = packed struct { 110 | name: u64 = 0, 111 | 112 | // Section for ping, ping_req, ack, nack. 113 | cmd: Command = .noop, 114 | src_ip: u32 = 0, 115 | src_port: u16 = 0, 116 | src_state: Liveness = .alive, 117 | src_incarnation: u64 = 0, 118 | 119 | dst_cmd: IsdCommand = .noop, 120 | dst_ip: u32 = 0, 121 | dst_port: u16 = 0, 122 | dst_state: Liveness = .alive, 123 | dst_incarnation: u64 = 0, 124 | 125 | // Used for multiple subprotocols explained below: 126 | // 127 | // 1) For determining the highest node (for join) during SWIM pings. 128 | // Format: 129 | // |----- cmd ----| |- port (u16) -| |------- IP address (u32) ------| 130 | // 0000000000000011.1111111111111111.1111111111111111.1111111111111111 131 | // 132 | // 2) Term and node count during leader heartbeats. 133 | // Format: 134 | // |---- count ---| |----------------- term (u48) -------------------| 135 | // 1111111111111111.1111111111111111.1111111111111111.1111111111111111 136 | proto1: u64 = 0, 137 | 138 | // Used for multiple subprotocols explained below: 139 | // 140 | // 1) For informing the sender's member count during SWIM pings. 141 | // Format: the full 64 bits represents the value. 142 | // 143 | // 2) Min and max election timeouts during leader heartbeats, in ms. 144 | // Format: 145 | // 146 | // 1-MSB: 1 -> field is valid, 0 -> skip 147 | // 62-LSB: 31 bits each for min/max 148 | // 149 | // |x|---------- min (u31) ----------||---------- max (u31) ---------| 150 | // 1011111111111111.1111111111111111.1111111111111111.1111111111111111 151 | proto2: u64 = 0, 152 | }; 153 | 154 | // Per-member context data. 155 | const MemberData = struct { 156 | liveness: Liveness = .alive, 157 | age_suspected: std.time.Timer = undefined, 158 | age_faulty: std.time.Timer = undefined, 159 | incarnation: u64 = 0, 160 | targets: std.ArrayList([]const u8), 161 | }; 162 | 163 | const JoinCmd = enum(u8) { 164 | noop, 165 | heartbeat, 166 | invalidate, 167 | }; 168 | 169 | pub const Callbacks = struct { 170 | /// Optional context data; to be passed back to the callback function(s). 171 | data: ?*UserData, 172 | 173 | /// Optional callback for the join address. This is provided as an option to 174 | /// provide a join address for new nodes to join in. 
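/// Internally, the address reported through this callback is that of the node with
/// the highest IPv4 address currently in the group.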
175 | /// 176 | /// For example, you might want to setup a discovery service (e.g. K/V store) 177 | /// where you will store the join address from this callback. Other joining 178 | /// nodes can then use the store to query the join address. 179 | onJoinAddr: ?*const fn (std.mem.Allocator, ?*UserData, []const u8) anyerror!void, 180 | 181 | /// If > 0, `onJoinAddr` callback will be called every `proto_time * val`. For 182 | /// example, if your proto_time is 2s and this value is 10, `onJoinAddr` will 183 | /// be called every 20s. Default (0) means every `proto_time`; same as 1. 184 | on_join_every: u64 = 0, 185 | }; 186 | 187 | /// Config for init(). 188 | pub const Config = struct { 189 | /// We use the name as group identifier when groups are running over the 190 | /// same network. Max of 8 chars (u64 in payload). 191 | name: []const u8, 192 | 193 | /// Member IP address for UDP, eg. "0.0.0.0". Use init() to initialize. 194 | ip: []const u8, 195 | 196 | /// Member port number for UDP, eg. 8080. 197 | port: u16 = 8080, 198 | 199 | /// Our SWIM protocol timeout duration. 200 | proto_time: u64 = std.time.ns_per_ms * 500, 201 | 202 | /// Suspicion subprotocol timeout duration. 203 | suspect_time: u64 = std.time.ns_per_ms * 500, 204 | 205 | /// Number of members we will request to do indirect pings for us (agents). 206 | /// The only valid value at the moment is `1`. 207 | ping_req_k: u32 = 1, 208 | 209 | /// Delay between leader's liveness pings to all nodes. 210 | elex_delay: u64 = std.time.ns_per_ms * 100, 211 | 212 | /// See `Callbacks` struct for more information. 213 | callbacks: Callbacks, 214 | }; 215 | 216 | /// Create an instance of Self based on `config`. The `allocator` will be stored 217 | /// internally as the main internal allocator. Arena is not recommended as it's 218 | /// going to be used in the internal UDP server and the main loop which are 219 | /// expected to be long-running. Some areas will utilize an arena allocator 220 | /// based on the input allocator when it's appropriate. 221 | pub fn init(allocator: std.mem.Allocator, config: *const Config) !Self { 222 | const edf: f64 = @floatFromInt(config.elex_delay); 223 | const minf: f64 = edf / 0.05; 224 | const emin: u64 = @intFromFloat(minf); 225 | 226 | return Self{ 227 | .allocator = allocator, 228 | .name = if (config.name.len > 8) config.name[0..8] else config.name, 229 | .ip = config.ip, 230 | .port = config.port, 231 | .proto_time = config.proto_time, 232 | .suspect_time = config.suspect_time, 233 | .ping_req_k = config.ping_req_k, 234 | .elex_delay = config.elex_delay, 235 | .elex_tm_min = emin, 236 | .elex_tm_max = emin + std.time.ns_per_s, 237 | .members = std.StringHashMap(MemberData).init(allocator), 238 | .refkeys = std.StringHashMap(void).init(allocator), 239 | .ping_queue = std.ArrayList([]const u8).init(allocator), 240 | .join_addr_tm = try std.time.Timer.start(), 241 | .callbacks = config.callbacks, 242 | .leader = try std.fmt.allocPrint(allocator, "", .{}), 243 | .voted_for = try std.fmt.allocPrint(allocator, "", .{}), 244 | .elex_tm = try std.time.Timer.start(), 245 | .candidate_tm = try std.time.Timer.start(), 246 | .elex_join_tm = try std.time.Timer.start(), 247 | }; 248 | } 249 | 250 | /// Cleanup Self instance. At the moment, it is expected for this 251 | /// code to be long running until process is terminated. 252 | pub fn deinit(self: *Self) void { 253 | log.debug("deinit:", .{}); 254 | 255 | // TODO: See how to gracefuly exit threads. 
256 | 257 | self.members.deinit(); 258 | var it = self.refkeys.iterator(); 259 | while (it.next()) |v| self.allocator.free(v.key_ptr.*); 260 | self.refkeys.deinit(); 261 | self.ping_queue.deinit(); 262 | } 263 | 264 | /// Start group membership tracking. 265 | pub fn run(self: *Self) !void { 266 | log.debug("run: name={s}, address={s}:{d}", .{ 267 | self.name, 268 | self.ip, 269 | self.port, 270 | }); 271 | 272 | log.debug("*Message: size={d}, align={d}", .{ 273 | @sizeOf(Message), 274 | @alignOf(Message), 275 | }); 276 | 277 | log.debug("SWIM: prototime={any}, suspecttime={any}, k={d}", .{ 278 | std.fmt.fmtDuration(self.proto_time), 279 | std.fmt.fmtDuration(self.suspect_time), 280 | self.ping_req_k, 281 | }); 282 | 283 | log.debug("leader election timeout range: min={any}, max={any}", .{ 284 | std.fmt.fmtDuration(self.elex_tm_min), 285 | std.fmt.fmtDuration(self.elex_tm_max), 286 | }); 287 | 288 | const me = try self.getOwnKey(); 289 | defer self.allocator.free(me); 290 | _ = try self.ensureKeyRef(me); 291 | try self.upsertMember(me, .alive, 0, true); 292 | self.elex_tm.reset(); 293 | _ = try self.ensureKeyRef("0"); // dummy 294 | 295 | const server = try std.Thread.spawn(.{}, Self.udpListen, .{self}); 296 | server.detach(); 297 | const ticker = try std.Thread.spawn(.{}, Self.swimTick, .{self}); 298 | ticker.detach(); 299 | const ldr = try std.Thread.spawn(.{}, Self.leaderElectionTick, .{self}); 300 | ldr.detach(); 301 | 302 | // self.ping_req_data = try self.allocator.create(RequestPing); 303 | // self.ping_req_data.self = self; 304 | // const rp = try std.Thread.spawn(.{}, Self.requestPing, .{self.ping_req_data}); 305 | // rp.detach(); 306 | } 307 | 308 | /// Ask a node to join an existing group. `joined` will be set to true 309 | /// if joining is successful. We are joining the group through `dst_*`. 310 | pub fn join( 311 | self: *Self, 312 | name: []const u8, 313 | dst_ip: []const u8, 314 | dst_port: u16, 315 | joined: *bool, 316 | ) !void { 317 | var aa = std.heap.ArenaAllocator.init(self.allocator); 318 | defer aa.deinit(); // destroy arena in one go 319 | const arena = aa.allocator(); 320 | 321 | const buf = try arena.alloc(u8, @sizeOf(Message)); 322 | const msg: *Message = @ptrCast(@alignCast(buf)); 323 | 324 | try self.presetMessage(msg); 325 | 326 | msg.cmd = .join; 327 | try self.setMsgSrcToOwn(msg); 328 | 329 | try send(dst_ip, dst_port, buf, null); 330 | 331 | switch (msg.cmd) { 332 | .ack => { 333 | const nn = std.mem.readVarInt(u64, self.name, .little); 334 | if (nn == msg.name) { 335 | const key = try std.fmt.allocPrint(arena, "{s}:{d}", .{ 336 | dst_ip, 337 | dst_port, 338 | }); 339 | 340 | try self.upsertMember(key, .alive, 0, true); 341 | self.elex_join_tm.reset(); 342 | joined.* = true; 343 | 344 | log.info("joined via {s}:{any}, name={s}", .{ 345 | dst_ip, 346 | dst_port, 347 | name, 348 | }); 349 | } 350 | }, 351 | else => {}, 352 | } 353 | } 354 | 355 | /// Returns a list of active members from the group/cluster. Caller owns the returning 356 | /// list, as well as each items in the array, which are duplicated from the internal 357 | /// list to prevent crashes during access due to potential changes in the main list. 
358 | pub fn getMembers(self: *Self, allocator: std.mem.Allocator) !std.ArrayList([]const u8) { 359 | var tmp = std.ArrayList([]const u8).init(allocator); 360 | defer tmp.deinit(); 361 | 362 | { 363 | self.members_mtx.lock(); 364 | defer self.members_mtx.unlock(); 365 | var it = self.members.iterator(); 366 | while (it.next()) |v| { 367 | if (v.value_ptr.liveness == .faulty) continue; 368 | try tmp.append(v.key_ptr.*); 369 | } 370 | } 371 | 372 | var out = std.ArrayList([]const u8).init(allocator); 373 | 374 | if (tmp.items.len == 0) return out; 375 | 376 | for (tmp.items) |v| { 377 | const kdup = try allocator.dupe(u8, v); 378 | try out.append(kdup); 379 | } 380 | 381 | return out; 382 | } 383 | 384 | // Run internal UDP server for handling both SWIM- and Raft-related 385 | // protocols. Uses a single allocation of *Message all throughout. 386 | fn udpListen(self: *Self) !void { 387 | log.info("starting UDP server on :{d}...", .{self.port}); 388 | 389 | const name = std.mem.readVarInt(u64, self.name, .little); 390 | const buf = try self.allocator.alloc(u8, @sizeOf(Message)); 391 | defer self.allocator.free(buf); // release buffer 392 | 393 | // One allocation for the duration of this function. 394 | const msg: *Message = @ptrCast(@alignCast(buf)); 395 | 396 | const addr = try std.net.Address.resolveIp(self.ip, self.port); 397 | const sock = try std.posix.socket( 398 | std.posix.AF.INET, 399 | std.posix.SOCK.DGRAM, 400 | std.posix.IPPROTO.UDP, 401 | ); 402 | 403 | defer std.posix.close(sock); 404 | try setWriteTimeout(sock, 5_000_000); 405 | try std.posix.bind(sock, &addr.any, addr.getOsSockLen()); 406 | var src_addr: std.posix.sockaddr = undefined; 407 | var src_addrlen: std.posix.socklen_t = @sizeOf(std.posix.sockaddr); 408 | 409 | var i: usize = 0; 410 | while (true) : (i += 1) { 411 | const len = std.posix.recvfrom( 412 | sock, 413 | buf, 414 | 0, 415 | &src_addr, 416 | &src_addrlen, 417 | ) catch |err| { 418 | log.err("recvfrom failed: {any}", .{err}); 419 | std.time.sleep(std.time.ns_per_ms * 500); 420 | continue; 421 | }; 422 | 423 | var aa = std.heap.ArenaAllocator.init(self.allocator); 424 | defer aa.deinit(); // destroy arena in one go 425 | const arena = aa.allocator(); 426 | 427 | // Main protocol message handler. 428 | switch (msg.cmd) { 429 | .join => b: { 430 | if (msg.name == name) { 431 | const key = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 432 | try self.upsertMember(key, .alive, msg.src_incarnation, true); 433 | 434 | // Inform current leader (if any) of this new join. 435 | msg.dst_ip = msg.src_ip; 436 | msg.dst_port = msg.src_port; 437 | try self.setMsgSrcToOwn(msg); 438 | 439 | log.debug("{s} is joining, inform leader [{s}]", .{ key, self.leader }); 440 | 441 | self.informLeaderOfJoin(buf) catch |err| 442 | log.debug("informLeaderOfJoin failed: {any}", .{err}); 443 | 444 | // Always set src_* to own info. 
445 | try self.setMsgSrcToOwn(msg); 446 | 447 | msg.cmd = .ack; 448 | _ = std.posix.sendto( 449 | sock, 450 | std.mem.asBytes(msg), 451 | 0, 452 | &src_addr, 453 | src_addrlen, 454 | ) catch |err| log.err("sendto failed: {any}", .{err}); 455 | 456 | break :b; 457 | } 458 | 459 | msg.cmd = .nack; 460 | _ = std.posix.sendto( 461 | sock, 462 | std.mem.asBytes(msg), 463 | 0, 464 | &src_addr, 465 | src_addrlen, 466 | ) catch |err| log.err("sendto failed: {any}", .{err}); 467 | }, 468 | .ping => { 469 | // 470 | // Payload information: 471 | // 472 | // src_*: caller/requester 473 | // dst_*: ISD (piggyback) 474 | // 475 | msg.cmd = .nack; // default 476 | 477 | if (msg.name == name) { 478 | msg.cmd = .ack; 479 | const src = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 480 | try self.upsertMember(src, .alive, msg.src_incarnation, true); 481 | 482 | if (msg.dst_cmd == .infect) { 483 | const dst = try keyFromIpPort(arena, msg.dst_ip, msg.dst_port); 484 | try self.upsertMember( 485 | dst, 486 | msg.dst_state, 487 | msg.dst_incarnation, 488 | false, 489 | ); 490 | } 491 | 492 | const n = self.getCounts(); 493 | if ((n[0] + n[1]) < msg.proto2) { 494 | self.elex_tm.reset(); 495 | @atomicStore( 496 | bool, 497 | &self.elex_join, 498 | false, 499 | std.builtin.AtomicOrder.seq_cst, 500 | ); 501 | } else @atomicStore( 502 | bool, 503 | &self.elex_join, 504 | true, 505 | std.builtin.AtomicOrder.seq_cst, 506 | ); 507 | 508 | // Always set src_* to own info. 509 | try self.setMsgSrcToOwn(msg); 510 | 511 | // Use dst_* for ISD info. 512 | var excludes: [1][]const u8 = .{src}; 513 | try self.setMsgDst(arena, msg, &excludes); 514 | 515 | // Handle join address protocol. 516 | var ipm = msg.proto1 & 0x00000000FFFFFFFF; 517 | var portm = (msg.proto1 & 0x0000FFFF00000000) >> 32; 518 | const cmdm: JoinCmd = @enumFromInt((msg.proto1 & 519 | 0xFFFF000000000000) >> 48); 520 | 521 | if (cmdm == .heartbeat) b: { 522 | const al = try self.getHighestNode(); 523 | if ((al[0] + al[1]) <= (ipm + portm)) { 524 | _ = self.join_addr_tm.lap(); 525 | break :b; 526 | } 527 | 528 | const hb: u64 = @intFromEnum(JoinCmd.invalidate); 529 | ipm = al[0] & 0x00000000FFFFFFFF; 530 | portm = (al[1] << 32) & 0x0000FFFF00000000; 531 | msg.proto1 = (hb << 48) | ipm | portm; 532 | } 533 | } 534 | 535 | _ = std.posix.sendto( 536 | sock, 537 | std.mem.asBytes(msg), 538 | 0, 539 | &src_addr, 540 | src_addrlen, 541 | ) catch |err| log.err("sendto failed: {any}", .{err}); 542 | }, 543 | .ping_req => b: { 544 | // 545 | // Payload information: 546 | // 547 | // src_*: caller/requester (we are the agent) 548 | // dst_*: target of the ping-request 549 | // 550 | if (msg.name == name) { 551 | const src = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 552 | try self.upsertMember(src, msg.src_state, msg.src_incarnation, true); 553 | 554 | const dst = try keyFromIpPort(arena, msg.dst_ip, msg.dst_port); 555 | 556 | log.debug("({d}) ping-req: requested to ping {s}", .{ len, dst }); 557 | 558 | // Always set src_* to own info. 559 | try self.setMsgSrcToOwn(msg); 560 | 561 | // Use both dst_* and isd_* for ISD info. 562 | var excludes: [1][]const u8 = .{dst}; 563 | try self.setMsgDst(arena, msg, &excludes); 564 | 565 | // Handle leader protocol (egress). 566 | try self.setJoinProtoSend(msg); 567 | 568 | const ack = self.ping(dst) catch false; 569 | 570 | msg.cmd = .nack; // default 571 | 572 | if (ack) { 573 | // The src_* info here is the original ping target. 574 | // Copy its info to the dst_* section before overwriting. 
575 | msg.cmd = .ack; 576 | msg.dst_ip = msg.src_ip; 577 | msg.dst_port = msg.src_port; 578 | msg.dst_state = msg.src_state; 579 | msg.dst_incarnation = msg.src_incarnation; 580 | 581 | try self.upsertMember(dst, .alive, msg.src_incarnation, true); 582 | 583 | // Handle join address protocol (ingress). 584 | self.setJoinProtoRecv(msg); 585 | } 586 | 587 | // Always set src_* to own info. 588 | try self.setMsgSrcToOwn(msg); 589 | 590 | // Handle join address protocol (egress). 591 | try self.setJoinProtoSend(msg); 592 | 593 | _ = std.posix.sendto( 594 | sock, 595 | std.mem.asBytes(msg), 596 | 0, 597 | &src_addr, 598 | src_addrlen, 599 | ) catch |err| log.err("sendto failed: {any}", .{err}); 600 | 601 | break :b; 602 | } 603 | 604 | // Not in this group. 605 | self.presetMessage(msg) catch {}; 606 | msg.cmd = .nack; 607 | 608 | _ = std.posix.sendto( 609 | sock, 610 | std.mem.asBytes(msg), 611 | 0, 612 | &src_addr, 613 | src_addrlen, 614 | ) catch |err| log.err("sendto failed: {any}", .{err}); 615 | }, 616 | .heartbeat => { 617 | msg.cmd = .nack; 618 | const tc = self.getTermAndN(msg); 619 | if (tc[0] >= self.getTerm()) { 620 | msg.cmd = .ack; 621 | self.setTerm(tc[0]); 622 | self.setVotes(0); 623 | self.elex_tm.reset(); 624 | self.setState(.follower); 625 | 626 | const src = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 627 | const lkey = try self.ensureKeyRef(src); 628 | 629 | { 630 | self.elex_mtx.lock(); 631 | defer self.elex_mtx.unlock(); 632 | self.leader = lkey; 633 | self.voted_for = self.refkeys.getKeyPtr("0").?.*; 634 | } 635 | 636 | // Handle min/max timeouts from leader. 637 | b: { 638 | const on = (msg.proto2 & 0x8000000000000000) >> 63; 639 | if (on == 0) break :b; 640 | 641 | const lmin = ((msg.proto2 & 0x7FFFFFFF80000000) >> 31) * 1000; 642 | const lmax = ((msg.proto2 & 0x700000007FFFFFFF)) * 1000; 643 | 644 | @atomicStore( 645 | u64, 646 | &self.elex_tm_min, 647 | lmin, 648 | std.builtin.AtomicOrder.seq_cst, 649 | ); 650 | 651 | @atomicStore( 652 | u64, 653 | &self.elex_tm_max, 654 | lmax, 655 | std.builtin.AtomicOrder.seq_cst, 656 | ); 657 | } 658 | } 659 | 660 | _ = std.posix.sendto( 661 | sock, 662 | std.mem.asBytes(msg), 663 | 0, 664 | &src_addr, 665 | src_addrlen, 666 | ) catch |err| log.err("sendto failed: {any}", .{err}); 667 | }, 668 | .req4votes => { 669 | msg.cmd = .nack; 670 | var voted = false; 671 | 672 | { 673 | self.elex_mtx.lock(); 674 | defer self.elex_mtx.unlock(); 675 | if (self.voted_for.len > 1) voted = true; 676 | } 677 | 678 | const term = self.getTerm(); 679 | 680 | if (msg.proto1 >= term and !voted and self.getState() != .leader) { 681 | msg.cmd = .ack; 682 | self.setTerm(msg.proto1); 683 | 684 | const src = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 685 | const vkey = try self.ensureKeyRef(src); 686 | 687 | { 688 | self.elex_mtx.lock(); 689 | defer self.elex_mtx.unlock(); 690 | self.voted_for = vkey; 691 | log.debug("req4votes: voted_for={s}", .{self.voted_for}); 692 | } 693 | } 694 | 695 | _ = std.posix.sendto( 696 | sock, 697 | std.mem.asBytes(msg), 698 | 0, 699 | &src_addr, 700 | src_addrlen, 701 | ) catch |err| log.err("sendto failed: {any}", .{err}); 702 | }, 703 | .join2leader => b: { 704 | const state = self.getState(); 705 | if (state != .leader) break :b; 706 | const dst = try keyFromIpPort(arena, msg.dst_ip, msg.dst_port); 707 | const pdst = try self.ensureKeyRef(dst); 708 | log.debug("[{d}] received join2leader, add {s}", .{ i, pdst }); 709 | try self.upsertMember(pdst, .alive, 0, false); 710 | }, 711 | else => {}, 712 | } 
713 | 714 | self.presetMessage(msg) catch {}; 715 | } 716 | } 717 | 718 | // Drives the SWIM protocol forward. Runs on a separate thread. 719 | fn swimTick(self: *Self) !void { 720 | var i: usize = 0; 721 | while (true) : (i += 1) { 722 | var tm = try std.time.Timer.start(); 723 | var aa = std.heap.ArenaAllocator.init(self.allocator); 724 | defer aa.deinit(); // destroy arena in one go 725 | const arena = aa.allocator(); 726 | 727 | // const counts = self.getCounts(); 728 | // log.debug("[{d}] members: alive={d}, suspected={d}, faulty={d}, total={d}", .{ 729 | // i, 730 | // counts[0], 731 | // counts[1], 732 | // counts[2], 733 | // counts[3], 734 | // }); 735 | 736 | var key_ptr: ?[]const u8 = null; 737 | const pt = try self.getPingTarget(arena); 738 | if (pt) |v| key_ptr = v; // ensure non-null 739 | 740 | if (key_ptr) |ping_key| { 741 | // log.debug("[{d}] try pinging {s}", .{ i, ping_key }); 742 | 743 | switch (self.ping(ping_key) catch false) { 744 | false => { 745 | // Let's do indirect ping for this suspicious node. 746 | // var prtm = try std.time.Timer.start(); 747 | // defer log.debug("[{d}] ping-req took {any}", .{ 748 | // i, 749 | // std.fmt.fmtDuration(prtm.read()), 750 | // }); 751 | 752 | // var do_suspected = false; 753 | // var excludes: [1][]const u8 = .{ping_key}; 754 | // const agents = try self.getRandomMember( 755 | // arena, 756 | // &excludes, 757 | // self.ping_req_k, 758 | // ); 759 | 760 | // if (agents.items.len == 0) do_suspected = true else { 761 | // log.debug("[{d}] ping-req: agent(s)={d}", .{ i, agents.items.len }); 762 | 763 | // self.ping_req_data.src = agents.items[0]; 764 | // self.ping_req_data.dst = ping_key; 765 | 766 | // self.ping_req_0.set(); 767 | // self.ping_req_1.wait(); 768 | // if (!self.ping_req_data.ack) do_suspected = true; 769 | // self.ping_req_1.reset(); 770 | // } 771 | 772 | // if (do_suspected) b: { 773 | // const ki = self.getKeyInfo(ping_key); 774 | // if (ki) |_| {} else break :b; 775 | // try self.setMemberInfo( 776 | // ping_key, 777 | // .suspected, 778 | // ki.?.incarnation, 779 | // true, 780 | // ); 781 | // } 782 | 783 | b: { 784 | const ki = self.getKeyInfo(ping_key); 785 | if (ki) |_| {} else break :b; 786 | try self.setMemberInfo( 787 | ping_key, 788 | .suspected, 789 | ki.?.incarnation, 790 | true, 791 | ); 792 | } 793 | }, 794 | else => { 795 | // log.debug("[{d}] ack from {s}", .{ i, ping_key }); 796 | 797 | // TEST: start 798 | // if (i > 0 and i <= 100 and @mod(i, 20) == 0) { 799 | // log.debug("[{d}] --- trigger suspect for {s}", .{ i, ping_key }); 800 | // self.isd_mtx.lock(); 801 | // defer self.isd_mtx.unlock(); 802 | // try self.isd_queue.append(.{ 803 | // .key = ping_key, 804 | // .state = .suspected, 805 | // .incarnation = 0, 806 | // .isd_cmd = .suspect, 807 | // }); 808 | // } 809 | // TEST: end 810 | }, 811 | } 812 | } 813 | 814 | // Setup leader callback. Mainly for joining. 815 | var mod = self.callbacks.on_join_every; 816 | if (mod == 0) mod = 1; 817 | if (i > 0 and @mod(i, mod) == 0) b: { 818 | const al = self.getHighestNode() catch break :b; 819 | if (!al[2]) break :b; 820 | if (self.callbacks.onJoinAddr) |_| {} else break :b; 821 | const me = try std.fmt.allocPrint(self.allocator, "{s}:{d}", .{ 822 | self.ip, 823 | self.port, 824 | }); 825 | 826 | try self.callbacks.onJoinAddr.?( 827 | self.allocator, 828 | self.callbacks.data, 829 | me, 830 | ); 831 | } 832 | 833 | try self.removeFaultyMembers(); 834 | 835 | // Suspected to faulty. 
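// Suspected members are first collected while holding the members lock, then
// demoted outside of it: setMemberInfo takes members_mtx itself and the mutex
// is not reentrant, so demoting while still iterating would self-deadlock.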
836 | var s2f = std.ArrayList([]const u8).init(arena); 837 | 838 | { 839 | self.members_mtx.lock(); 840 | defer self.members_mtx.unlock(); 841 | var it = self.members.iterator(); 842 | while (it.next()) |v| { 843 | if (self.keyIsMe(v.key_ptr.*)) continue; 844 | if (v.value_ptr.liveness != .suspected) continue; 845 | if (v.value_ptr.age_suspected.read() < self.suspect_time) continue; 846 | try s2f.append(v.key_ptr.*); 847 | } 848 | } 849 | 850 | for (s2f.items) |v| try self.setMemberInfo(v, .faulty, null, false); 851 | 852 | // Pause before the next tick. 853 | const elapsed = tm.read(); 854 | if (elapsed < self.proto_time) { 855 | const left = self.proto_time - elapsed; 856 | // log.debug("[{d}] sleep for {any}", .{ i, std.fmt.fmtDuration(left) }); 857 | std.time.sleep(left); 858 | } 859 | } 860 | } 861 | 862 | // Drives the Raft-based leader election forward. Runs on a separate thread. 863 | fn leaderElectionTick(self: *Self) !void { 864 | const buf = try self.allocator.alloc(u8, @sizeOf(Message)); 865 | defer self.allocator.free(buf); // release buffer 866 | 867 | // One allocation for the duration of this function. 868 | const msg: *Message = @ptrCast(@alignCast(buf)); 869 | 870 | const seed = std.crypto.random.int(u64); 871 | var prng = std.rand.DefaultPrng.init(seed); 872 | const random = prng.random(); 873 | 874 | var ldr_last_sweep: bool = false; 875 | const min_og = self.getElexTimeoutMin(); 876 | const max_og = self.getElexTimeoutMax(); 877 | var lmin: u64 = self.getElexTimeoutMin(); 878 | var lmax: u64 = self.getElexTimeoutMax(); 879 | 880 | var i: usize = 0; 881 | while (true) : (i += 1) { 882 | const skip = false; 883 | const n = self.getCounts(); 884 | if ((n[0] + n[1]) < 3 or skip) { 885 | std.time.sleep(random.intRangeAtMost( 886 | u64, 887 | self.getElexTimeoutMin(), 888 | self.getElexTimeoutMax(), 889 | )); 890 | 891 | continue; 892 | } 893 | 894 | const allowed = @atomicLoad( 895 | bool, 896 | &self.elex_join, 897 | std.builtin.AtomicOrder.seq_cst, 898 | ); 899 | 900 | var aa = std.heap.ArenaAllocator.init(self.allocator); 901 | defer aa.deinit(); // destroy arena in one go 902 | const arena = aa.allocator(); 903 | 904 | self.presetMessage(msg) catch {}; 905 | 906 | switch (self.getState()) { 907 | .follower => { 908 | if (self.elex_join_tm.read() >= self.proto_time * (n[0] + n[1])) { 909 | @atomicStore( 910 | bool, 911 | &self.elex_join, 912 | true, 913 | std.builtin.AtomicOrder.seq_cst, 914 | ); 915 | } 916 | 917 | const rand = random.intRangeAtMost( 918 | u64, 919 | self.getElexTimeoutMin(), 920 | self.getElexTimeoutMax(), 921 | ); 922 | 923 | if (!allowed) { 924 | std.time.sleep(rand); 925 | continue; 926 | } 927 | 928 | if (self.elex_tm.read() <= self.getElexTimeoutMin()) { 929 | std.time.sleep(rand); 930 | continue; 931 | } 932 | 933 | _ = self.incTermAndGet(); 934 | _ = self.voteForSelf(); 935 | self.setState(.candidate); 936 | self.candidate_tm.reset(); 937 | }, 938 | .candidate => { 939 | var bl = std.ArrayList([]const u8).init(arena); 940 | defer bl.deinit(); 941 | 942 | { 943 | self.members_mtx.lock(); 944 | defer self.members_mtx.unlock(); 945 | var iter = self.members.iterator(); 946 | while (iter.next()) |v| { 947 | if (v.value_ptr.liveness != .alive) continue; 948 | if (self.keyIsMe(v.key_ptr.*)) continue; 949 | try bl.append(v.key_ptr.*); 950 | } 951 | } 952 | 953 | if (bl.items.len == 0) { 954 | std.time.sleep(random.intRangeAtMost( 955 | u64, 956 | self.getElexTimeoutMin(), 957 | self.getElexTimeoutMax(), 958 | )); 959 | 960 | continue; 961 | } 962 | 963 
| log.debug("[{d}:{d}] req4votes to {d} nodes", .{ 964 | i, 965 | self.getTerm(), 966 | bl.items.len, 967 | }); 968 | 969 | var to_leader = false; 970 | for (bl.items) |k| { 971 | if (self.getState() == .follower) break; 972 | 973 | msg.cmd = .req4votes; 974 | try self.setMsgSrcToOwn(msg); 975 | const sep = std.mem.indexOf(u8, k, ":") orelse continue; 976 | const ip = k[0..sep]; 977 | const port = std.fmt.parseUnsigned(u16, k[sep + 1 ..], 10) catch 978 | continue; 979 | 980 | msg.proto1 = self.getTerm(); 981 | send(ip, port, buf, null) catch continue; 982 | 983 | if (msg.cmd != .ack) continue; 984 | 985 | log.debug("[{d}:{d}] received vote from {s}", .{ 986 | i, 987 | self.getTerm(), 988 | k, 989 | }); 990 | 991 | const majority = ((n[0] + n[1]) / 2) + 1; 992 | const votes = self.incVotesAndGet(); 993 | if (votes >= majority) { 994 | log.debug("[{d}:{d}] got {d} votes, majority={d}, n={d}", .{ 995 | i, 996 | self.getTerm(), 997 | votes, 998 | majority, 999 | n[0] + n[1], 1000 | }); 1001 | 1002 | self.setState(.leader); 1003 | to_leader = true; 1004 | break; 1005 | } 1006 | } 1007 | 1008 | if (!to_leader) { 1009 | if (self.candidate_tm.read() > self.getElexTimeoutMin()) { 1010 | log.debug("[{d}:{d}] lost the election, back to follower", .{ 1011 | i, 1012 | self.getTerm(), 1013 | }); 1014 | 1015 | std.time.sleep(random.intRangeAtMost( 1016 | u64, 1017 | self.getElexTimeoutMin(), 1018 | self.getElexTimeoutMax(), 1019 | )); 1020 | 1021 | self.setState(.follower); 1022 | self.elex_tm.reset(); 1023 | self.setVotes(0); 1024 | self.voted_for = self.refkeys.getKeyPtr("0").?.*; 1025 | } else std.time.sleep(random.intRangeAtMost( 1026 | u64, 1027 | self.getElexTimeoutMin(), 1028 | self.getElexTimeoutMax(), 1029 | )); 1030 | } 1031 | }, 1032 | .leader => { 1033 | var tm = try std.time.Timer.start(); 1034 | var items_len: usize = 0; 1035 | var fails: usize = 0; 1036 | var deferlog = false; 1037 | defer { 1038 | if (fails > 0) std.time.sleep(self.elex_delay); 1039 | if (deferlog) { 1040 | if (@mod(i, 40) == 0) { 1041 | log.debug("[{d}:{d}] leader: hb to {d} nodes, took {any}", .{ 1042 | i, 1043 | self.getTerm(), 1044 | items_len, 1045 | std.fmt.fmtDuration(tm.read() - self.elex_delay), 1046 | }); 1047 | } 1048 | } 1049 | } 1050 | 1051 | var bl = std.ArrayList([]const u8).init(arena); 1052 | defer bl.deinit(); 1053 | 1054 | { 1055 | self.members_mtx.lock(); 1056 | defer self.members_mtx.unlock(); 1057 | var iter = self.members.iterator(); 1058 | while (iter.next()) |v| { 1059 | if (v.value_ptr.liveness != .alive) continue; 1060 | if (self.keyIsMe(v.key_ptr.*)) continue; 1061 | try bl.append(v.key_ptr.*); 1062 | } 1063 | } 1064 | 1065 | if (bl.items.len == 0) { 1066 | std.time.sleep(random.intRangeAtMost( 1067 | u64, 1068 | self.getElexTimeoutMin(), 1069 | self.getElexTimeoutMax(), 1070 | )); 1071 | 1072 | continue; 1073 | } 1074 | 1075 | items_len = bl.items.len; // for later log (see defer) 1076 | var latencies = std.ArrayList(u64).init(self.allocator); 1077 | defer latencies.deinit(); 1078 | 1079 | var ltm = try std.time.Timer.start(); 1080 | 1081 | if (ldr_last_sweep) 1082 | msg.proto2 = (1 << 63) | ((lmin / 1000) << 31) | (lmax / 1000); 1083 | 1084 | for (bl.items) |k| { 1085 | deferlog = true; 1086 | msg.cmd = .heartbeat; 1087 | try self.setMsgSrcToOwn(msg); 1088 | const sep = std.mem.indexOf(u8, k, ":") orelse continue; 1089 | const ip = k[0..sep]; 1090 | const port = std.fmt.parseUnsigned(u16, k[sep + 1 ..], 10) catch 1091 | continue; 1092 | 1093 | msg.proto1 = self.getTerm(); 1094 | 
self.setTermAndN(msg); 1095 | 1096 | ltm.reset(); 1097 | send(ip, port, buf, null) catch |err| { 1098 | log.err("[{d}] hb:send failed: {any}", .{ i, err }); 1099 | fails += 1; 1100 | continue; 1101 | }; 1102 | 1103 | try latencies.append(ltm.read()); 1104 | } 1105 | 1106 | if (fails == 0) { 1107 | var total: u64 = 0; 1108 | for (latencies.items) |v| total += v; 1109 | const avg = total / latencies.items.len; 1110 | const avgf: f64 = @floatFromInt(avg); 1111 | const minf: f64 = @floatFromInt(self.getElexTimeoutMin()); 1112 | const nminf = avgf / 0.05; 1113 | if (nminf > minf) { 1114 | lmin = @intFromFloat(nminf); 1115 | lmax = lmin + std.time.ns_per_s; 1116 | } else { 1117 | lmin = min_og; 1118 | lmax = max_og; 1119 | } 1120 | } 1121 | 1122 | ldr_last_sweep = if (fails == 0) true else false; 1123 | std.time.sleep(self.elex_delay); 1124 | }, 1125 | } 1126 | } 1127 | } 1128 | 1129 | // Round-robin for one sweep, then randomize before doing another sweep. 1130 | // We are passing in an arena allocator here. 1131 | fn getPingTarget(self: *Self, allocator: std.mem.Allocator) !?[]const u8 { 1132 | while (true) { 1133 | const pop = self.ping_queue.popOrNull(); 1134 | if (pop) |v| return v; 1135 | 1136 | b: { 1137 | var tl = std.ArrayList([]const u8).init(allocator); 1138 | 1139 | { 1140 | self.members_mtx.lock(); 1141 | defer self.members_mtx.unlock(); 1142 | var iter = self.members.iterator(); 1143 | while (iter.next()) |v| { 1144 | if (v.value_ptr.liveness == .faulty) continue; 1145 | if (self.keyIsMe(v.key_ptr.*)) continue; 1146 | try tl.append(v.key_ptr.*); 1147 | } 1148 | } 1149 | 1150 | switch (tl.items.len) { 1151 | 0 => return null, // probably just us 1152 | 1 => { 1153 | try self.ping_queue.append(tl.items[0]); 1154 | break :b; 1155 | }, 1156 | else => {}, 1157 | } 1158 | 1159 | const seed = std.crypto.random.int(u64); 1160 | var prng = std.rand.DefaultPrng.init(seed); 1161 | const random = prng.random(); 1162 | while (true) { 1163 | switch (tl.items.len) { 1164 | 0 => break, 1165 | 1 => { 1166 | try self.ping_queue.append(tl.items[0]); 1167 | break; 1168 | }, 1169 | else => {}, 1170 | } 1171 | 1172 | const rv = random.uintAtMost(u64, tl.items.len - 1); 1173 | try self.ping_queue.append(tl.items[rv]); 1174 | _ = tl.swapRemove(rv); 1175 | } 1176 | } 1177 | } 1178 | 1179 | unreachable; 1180 | } 1181 | 1182 | // Caller is responsible for releasing the returned memory. 1183 | // We are passing in an arena allocator here. 
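// Returns at most `max` members chosen at random (without replacement),
// skipping ourselves, faulty members, and any key listed in `excludes`; the
// result may be shorter than `max`, or empty, if few members qualify.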
1184 | fn getRandomMember( 1185 | self: *Self, 1186 | allocator: std.mem.Allocator, 1187 | excludes: [][]const u8, 1188 | max: usize, 1189 | ) !std.ArrayList([]const u8) { 1190 | var hm = std.AutoHashMap(u64, []const u8).init(allocator); 1191 | defer hm.deinit(); // noop since arena 1192 | 1193 | { 1194 | self.members_mtx.lock(); 1195 | defer self.members_mtx.unlock(); 1196 | var iter = self.members.iterator(); 1197 | while (iter.next()) |v| { 1198 | if (v.value_ptr.liveness == .faulty) continue; 1199 | if (self.keyIsMe(v.key_ptr.*)) continue; 1200 | var eql: usize = 0; 1201 | for (excludes) |x| { 1202 | if (std.mem.eql(u8, x, v.key_ptr.*)) eql += 1; 1203 | } 1204 | 1205 | if (eql > 0) continue; 1206 | try hm.put(hm.count(), v.key_ptr.*); 1207 | } 1208 | } 1209 | 1210 | var out = std.ArrayList([]const u8).init(allocator); 1211 | 1212 | var limit = max; 1213 | if (limit > hm.count()) limit = hm.count(); 1214 | if (hm.count() == 1 and limit > 0) { 1215 | const get = hm.get(0); 1216 | if (get) |v| try out.append(v); 1217 | return out; 1218 | } 1219 | 1220 | const seed = std.crypto.random.int(u64); 1221 | var prng = std.rand.DefaultPrng.init(seed); 1222 | const random = prng.random(); 1223 | for (0..limit) |_| { 1224 | if (hm.count() == 0) break; 1225 | while (true) { 1226 | if (hm.count() == 0) break; 1227 | const rv = random.uintAtMost(u64, hm.count() - 1); 1228 | const fr = hm.fetchRemove(rv); 1229 | if (fr) |v| try out.append(v.value); 1230 | break; 1231 | } 1232 | } 1233 | 1234 | return out; 1235 | } 1236 | 1237 | // Setup the dst_* section of the payload. 1238 | // We are passing in an arena allocator here. 1239 | fn setMsgDst( 1240 | self: *Self, 1241 | allocator: std.mem.Allocator, 1242 | msg: *Message, 1243 | excludes: [][]const u8, 1244 | ) !void { 1245 | b: { 1246 | const dst = try self.getRandomMember(allocator, excludes, 1); 1247 | if (dst.items.len == 0) break :b; 1248 | msg.dst_cmd = .infect; 1249 | const ki = self.getKeyInfo(dst.items[0]); 1250 | if (ki) |_| {} else break :b; 1251 | try setMsgSection(msg, .dst, ki.?); 1252 | } 1253 | } 1254 | 1255 | // Ping a peer for liveness. Expected format for `key` is "ip:port", 1256 | // eg. "127.0.0.1:8080". For pings, we use the src_* payload fields 1257 | // to identify us, the sender. 1258 | fn ping(self: *Self, key: []const u8) !bool { 1259 | var aa = std.heap.ArenaAllocator.init(self.allocator); 1260 | defer aa.deinit(); // destroy arena in one go 1261 | const arena = aa.allocator(); 1262 | 1263 | const sep = std.mem.indexOf(u8, key, ":") orelse return false; 1264 | const ip = key[0..sep]; 1265 | const port = try std.fmt.parseUnsigned(u16, key[sep + 1 ..], 10); 1266 | if (std.mem.eql(u8, ip, self.ip) and port == self.port) return true; 1267 | 1268 | const buf = try arena.alloc(u8, @sizeOf(Message)); 1269 | const msg: *Message = @ptrCast(@alignCast(buf)); 1270 | try self.presetMessage(msg); 1271 | 1272 | msg.cmd = .ping; 1273 | try self.setMsgSrcToOwn(msg); 1274 | 1275 | // Use dst_* for ISD info. 1276 | var excludes: [1][]const u8 = .{key}; 1277 | try self.setMsgDst(arena, msg, &excludes); 1278 | 1279 | // Handle join address protocol (egress). 1280 | try self.setJoinProtoSend(msg); 1281 | 1282 | // Propagate number of members. 1283 | const n = self.getCounts(); 1284 | msg.proto2 = n[0] + n[1]; 1285 | 1286 | try send(ip, port, buf, null); 1287 | 1288 | // Handle join address protocol (ingress). 
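// Layout of proto1 for the join-address protocol (see setJoinProtoSend):
// bits 63..48 carry the JoinCmd, bits 47..32 the port, and bits 31..0 the
// IPv4 address as a u32. Any reply other than .invalidate refreshes the
// join-address timer below.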
1289 | const cmdm: JoinCmd = @enumFromInt((msg.proto1 & 1290 | 0xF000000000000000) >> 48); 1291 | 1292 | if (cmdm != .invalidate) _ = self.join_addr_tm.lap(); 1293 | 1294 | return switch (msg.cmd) { 1295 | .ack => b: { 1296 | try self.upsertMember(key, .alive, msg.src_incarnation, true); 1297 | 1298 | // Consume dst_* as piggybacked ISD info. 1299 | if (msg.dst_cmd == .infect) { 1300 | const k = try keyFromIpPort(arena, msg.dst_ip, msg.dst_port); 1301 | try self.upsertMember(k, msg.dst_state, msg.dst_incarnation, false); 1302 | } 1303 | 1304 | break :b true; 1305 | }, 1306 | else => false, 1307 | }; 1308 | } 1309 | 1310 | const RequestPing = struct { 1311 | self: *Self, 1312 | src: []const u8, // agent 1313 | dst: []const u8, // target 1314 | ack: bool = false, 1315 | }; 1316 | 1317 | // NOTE: Not used at the moment. 1318 | // Our only agent for doing indirect pings for suspicious nodes. Long-running. 1319 | fn requestPing(args: *RequestPing) !void { 1320 | while (true) { 1321 | args.self.ping_req_0.wait(); 1322 | defer { 1323 | args.self.ping_req_0.reset(); 1324 | args.self.ping_req_1.set(); 1325 | } 1326 | 1327 | log.debug("[thread] try pinging {s} via {s}", .{ args.dst, args.src }); 1328 | 1329 | var aa = std.heap.ArenaAllocator.init(args.self.allocator); 1330 | defer aa.deinit(); // destroy arena in one go 1331 | const arena = aa.allocator(); 1332 | 1333 | const sep = std.mem.indexOf(u8, args.src, ":") orelse return; 1334 | const ip = args.src[0..sep]; 1335 | const port = try std.fmt.parseUnsigned(u16, args.src[sep + 1 ..], 10); 1336 | 1337 | const buf = try arena.alloc(u8, @sizeOf(Message)); 1338 | const msg: *Message = @ptrCast(@alignCast(buf)); 1339 | try args.self.presetMessage(msg); 1340 | msg.cmd = .ping_req; 1341 | 1342 | // Set src_* to our info, the sender. 1343 | try args.self.setMsgSrcToOwn(msg); 1344 | 1345 | // The dst_* section is the target of our ping. 1346 | try setMsgSection(msg, .dst, .{ 1347 | .key = args.dst, 1348 | .liveness = .suspected, // will not be used 1349 | .incarnation = 0, // will not be used 1350 | }); 1351 | 1352 | // Handle ISD info. 1353 | const isd = try args.self.getIsdInfo(arena, 1); 1354 | if (isd.items.len > 0) { 1355 | msg.isd_cmd = .infect; 1356 | try setMsgSection(msg, .isd, isd.items[0]); 1357 | } 1358 | 1359 | // Handle join address protocol (egress). 1360 | try args.self.setJoinProtoSend(msg); 1361 | 1362 | args.self.send(ip, port, buf, null) catch continue; 1363 | 1364 | // Handle join address protocol (ingress). 1365 | args.self.setJoinProtoRecv(msg); 1366 | 1367 | switch (msg.cmd) { 1368 | .ack => { 1369 | try args.self.upsertMember( 1370 | args.src, 1371 | msg.src_state, 1372 | msg.src_incarnation, 1373 | true, 1374 | ); 1375 | 1376 | try args.self.upsertMember( 1377 | args.dst, 1378 | msg.dst_state, 1379 | msg.dst_incarnation, 1380 | true, 1381 | ); 1382 | 1383 | // Consume isd_* as the main ISD info. 1384 | switch (msg.isd_cmd) { 1385 | .infect, 1386 | .confirm_alive, 1387 | => try args.self.handleIsd(arena, msg, false), 1388 | .suspect => try args.self.handleSuspicion(arena, msg), 1389 | .confirm_faulty => try args.self.handleConfirmFaulty(arena, msg), 1390 | else => {}, 1391 | } 1392 | 1393 | const ptr = &args.ack; 1394 | ptr.* = true; 1395 | }, 1396 | .nack => try args.self.upsertMember( 1397 | args.src, 1398 | msg.src_state, 1399 | msg.src_incarnation, 1400 | false, 1401 | ), 1402 | else => {}, 1403 | } 1404 | } 1405 | } 1406 | 1407 | // Handle the isd_* infection protocol of the message payload. 
1408 | // We are passing in an arena allocator here. 1409 | fn handleIsd(self: *Self, allocator: std.mem.Allocator, msg: *Message, force: bool) !void { 1410 | const key = try keyFromIpPort(allocator, msg.isd_ip, msg.isd_port); 1411 | try self.setMemberInfo(key, msg.isd_state, msg.isd_incarnation, force); 1412 | } 1413 | 1414 | // Handle the isd_* suspicion protocol of the message payload. 1415 | // We are passing in an arena allocator here. 1416 | fn handleSuspicion(self: *Self, allocator: std.mem.Allocator, msg: *Message) !void { 1417 | const key = try keyFromIpPort(allocator, msg.isd_ip, msg.isd_port); 1418 | if (self.keyIsMe(key)) b: { 1419 | try self.IncrementIncarnation(); 1420 | const pkey = self.getPersistentKeyFromKey(key); 1421 | if (pkey) |_| {} else break :b; 1422 | return; 1423 | } 1424 | 1425 | var suspected = std.ArrayList(KeyInfo).init(allocator); 1426 | 1427 | { 1428 | self.members_mtx.lock(); 1429 | defer self.members_mtx.unlock(); 1430 | const ptr = self.members.getPtr(key); 1431 | if (ptr) |_| {} else return; 1432 | 1433 | try suspected.append(.{ 1434 | .key = key, 1435 | .liveness = .suspected, 1436 | .isd_cmd = .confirm_alive, 1437 | .incarnation = ptr.?.incarnation, 1438 | }); 1439 | } 1440 | 1441 | if (suspected.items.len == 0) return; 1442 | 1443 | const pkey = self.getPersistentKeyFromKey(key); 1444 | if (pkey) |_| {} else return; 1445 | } 1446 | 1447 | // Handle the isd_* faulty protocol of the message payload. 1448 | // We are passing in an arena allocator here. 1449 | fn handleConfirmFaulty(self: *Self, allocator: std.mem.Allocator, msg: *Message) !void { 1450 | const key = try keyFromIpPort(allocator, msg.isd_ip, msg.isd_port); 1451 | if (!self.keyIsMe(key)) { 1452 | try self.setMemberInfo(key, .faulty, null, true); 1453 | return; 1454 | } 1455 | 1456 | const pkey = self.getPersistentKeyFromKey(key); 1457 | if (pkey) |_| {} else return; 1458 | } 1459 | 1460 | // NOTE: Not using locks; only atomic. 1461 | fn getIncarnation(self: *Self) !u64 { 1462 | const me = try self.getOwnKey(); 1463 | defer self.allocator.free(me); 1464 | const ptr = self.members.getPtr(me); 1465 | if (ptr) |v| return @atomicLoad( 1466 | u64, 1467 | &v.incarnation, 1468 | std.builtin.AtomicOrder.seq_cst, 1469 | ); 1470 | 1471 | unreachable; 1472 | } 1473 | 1474 | // NOTE: Not using locks; only atomic. 1475 | fn IncrementIncarnation(self: *Self) !void { 1476 | const me = try self.getOwnKey(); 1477 | defer self.allocator.free(me); 1478 | const ptr = self.members.getPtr(me); 1479 | if (ptr) |_| {} else return; 1480 | _ = @atomicRmw( 1481 | u64, 1482 | &ptr.?.incarnation, 1483 | std.builtin.AtomicRmwOp.Add, 1484 | 1, 1485 | std.builtin.AtomicOrder.seq_cst, 1486 | ); 1487 | } 1488 | 1489 | // Caller must free the returned memory. 1490 | fn getOwnKey(self: *Self) ![]const u8 { 1491 | return try std.fmt.allocPrint(self.allocator, "{s}:{d}", .{ self.ip, self.port }); 1492 | } 1493 | 1494 | // Expected format for `key` is ip:port, eg. 0.0.0.0:8080. 1495 | fn keyIsMe(self: *Self, key: []const u8) bool { 1496 | const sep = std.mem.indexOf(u8, key, ":") orelse return false; 1497 | const ip = key[0..sep]; 1498 | const port = std.fmt.parseUnsigned(u16, key[sep + 1 ..], 10) catch return false; 1499 | return if (std.mem.eql(u8, ip, self.ip) and port == self.port) true else false; 1500 | } 1501 | 1502 | // Use the key from `members` when adding items (key) to the isd_queue. 
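// Returns the map-owned copy of `key` (stable for as long as the entry
// exists), or null if `key` is not currently in `members`.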
1503 | fn getPersistentKeyFromKey(self: *Self, key: []const u8) ?[]const u8 { 1504 | self.members_mtx.lock(); 1505 | defer self.members_mtx.unlock(); 1506 | const ptr = self.members.getKeyPtr(key); 1507 | if (ptr) |v| return v.*; 1508 | return null; 1509 | } 1510 | 1511 | // [0] = # of alive members 1512 | // [1] = # of suspected members 1513 | // [2] = # of faulty members 1514 | // [3] = total number of members 1515 | fn getCounts(self: *Self) std.meta.Tuple(&.{ usize, usize, usize, usize }) { 1516 | var n: [3]usize = .{ 0, 0, 0 }; 1517 | self.members_mtx.lock(); 1518 | defer self.members_mtx.unlock(); 1519 | var it = self.members.iterator(); 1520 | while (it.next()) |v| { 1521 | switch (v.value_ptr.liveness) { 1522 | .alive => n[0] += 1, 1523 | .suspected => n[1] += 1, 1524 | .faulty => n[2] += 1, 1525 | } 1526 | } 1527 | 1528 | return .{ 1529 | n[0], 1530 | n[1], 1531 | n[2], 1532 | self.members.count(), 1533 | }; 1534 | } 1535 | 1536 | fn getKeyInfo(self: *Self, key: []const u8) ?KeyInfo { 1537 | self.members_mtx.lock(); 1538 | defer self.members_mtx.unlock(); 1539 | const ptr = self.members.getPtr(key); 1540 | if (ptr) |_| {} else return null; 1541 | return .{ 1542 | .key = key, 1543 | .liveness = ptr.?.liveness, 1544 | .incarnation = ptr.?.incarnation, 1545 | }; 1546 | } 1547 | 1548 | // We always assume the node with the largest ip(int)+port to be leader. 1549 | // [0] - leader's (highest) ip in int format 1550 | // [1] - leader's (highest) port number 1551 | // [2] - true if we are the leader 1552 | fn getHighestNode(self: *Self) !std.meta.Tuple(&.{ u32, u64, bool }) { 1553 | var ipl: u32 = 0; 1554 | var portl: u16 = 0; 1555 | var me = false; 1556 | self.members_mtx.lock(); 1557 | defer self.members_mtx.unlock(); 1558 | var it = self.members.iterator(); 1559 | while (it.next()) |v| { 1560 | if (v.value_ptr.liveness == .faulty) continue; 1561 | const sep = std.mem.indexOf(u8, v.key_ptr.*, ":") orelse continue; 1562 | const ip = v.key_ptr.*[0..sep]; 1563 | const port = try std.fmt.parseUnsigned(u16, v.key_ptr.*[sep + 1 ..], 10); 1564 | const addr = try std.net.Address.resolveIp(ip, port); 1565 | if ((addr.in.sa.addr + port) > (ipl + portl)) { 1566 | ipl = addr.in.sa.addr; 1567 | portl = port; 1568 | me = std.mem.eql(u8, ip, self.ip) and port == self.port; 1569 | } 1570 | } 1571 | 1572 | return .{ ipl, portl, me }; 1573 | } 1574 | 1575 | fn setJoinProtoSend(self: *Self, msg: *Message) !void { 1576 | const n = self.getCounts(); 1577 | const lim = n[0] + n[1]; 1578 | if (lim < 2) return; 1579 | const al = try self.getHighestNode(); 1580 | const hb: u64 = @intFromEnum(JoinCmd.heartbeat); 1581 | const ipl: u32 = al[0] & 0x00000000FFFFFFFF; 1582 | const portl: u64 = (al[1] << 32) & 0x0000FFFF00000000; 1583 | msg.proto1 = (hb << 48) | ipl | portl; 1584 | } 1585 | 1586 | fn setJoinProtoRecv(self: *Self, msg: *Message) void { 1587 | const cmdm: JoinCmd = @enumFromInt((msg.proto1 & 1588 | 0xFFFF000000000000) >> 48); 1589 | if (cmdm != .invalidate) _ = self.join_addr_tm.lap(); 1590 | } 1591 | 1592 | fn setTermAndN(self: *Self, msg: *Message) void { 1593 | const n = self.getCounts(); 1594 | const total = n[0] + n[1]; 1595 | const term = @atomicLoad(u64, &self.elex_term, std.builtin.AtomicOrder.seq_cst); 1596 | const mterm: u64 = term & 0x0000FFFFFFFFFFFF; 1597 | const mcount: u64 = (total << 48) & 0xFFFF000000000000; 1598 | msg.proto1 = mcount | mterm; 1599 | } 1600 | 1601 | // [0] - term 1602 | // [1] - count 1603 | fn getTermAndN(_: *Self, msg: *Message) std.meta.Tuple(&.{ u64, u64 }) { 1604 | 
const term = msg.proto1 & 0x0000FFFFFFFFFFFF; 1605 | const count = (msg.proto1 & 0xFFFF000000000000) >> 48; 1606 | return .{ term, count }; 1607 | } 1608 | 1609 | // Set default values for the message. 1610 | fn presetMessage(self: *Self, msg: *Message) !void { 1611 | msg.name = std.mem.readVarInt(u64, self.name, .little); 1612 | msg.cmd = .noop; 1613 | msg.src_state = .alive; 1614 | msg.dst_cmd = .noop; 1615 | msg.dst_state = .alive; 1616 | msg.proto1 = 0; 1617 | msg.proto2 = 0; 1618 | } 1619 | 1620 | fn setMsgSrcToOwn(self: *Self, msg: *Message) !void { 1621 | const me = try self.getOwnKey(); 1622 | defer self.allocator.free(me); 1623 | try setMsgSection(msg, .src, .{ 1624 | .key = me, 1625 | .liveness = .alive, 1626 | .incarnation = try self.getIncarnation(), 1627 | }); 1628 | } 1629 | 1630 | // Add a new member or update an existing member's info. This function 1631 | // duplicates the key using self.allocator when adding a new member, 1632 | // not when updating an existing one. 1633 | fn upsertMember( 1634 | self: *Self, 1635 | key: []const u8, 1636 | state: ?Liveness, 1637 | incarnation: ?u64, 1638 | force: bool, 1639 | ) !void { 1640 | const contains = b: { 1641 | self.members_mtx.lock(); 1642 | defer self.members_mtx.unlock(); 1643 | break :b self.members.contains(key); 1644 | }; 1645 | 1646 | if (contains) { 1647 | try self.setMemberInfo(key, state, incarnation, force); 1648 | return; 1649 | } 1650 | 1651 | const nkey = try self.allocator.dupe(u8, key); 1652 | 1653 | // Our copy of all member keys being allocated; to free later. 1654 | if (!self.refkeys.contains(nkey)) try self.refkeys.put(nkey, {}); 1655 | 1656 | { 1657 | self.members_mtx.lock(); 1658 | defer self.members_mtx.unlock(); 1659 | try self.members.put(nkey, .{ 1660 | .age_suspected = try std.time.Timer.start(), 1661 | .age_faulty = try std.time.Timer.start(), 1662 | .targets = std.ArrayList([]const u8).init(self.allocator), 1663 | }); 1664 | } 1665 | 1666 | try self.setMemberInfo(key, state, incarnation, true); 1667 | } 1668 | 1669 | // `key` should be in fmt: "ip:port", e.g. "127.0.0.1:8080". We 1670 | // duplicate `key` to our internal list to be able to free later. 
1671 | fn ensureKeyRef(self: *Self, key: []const u8) ![]const u8 { 1672 | self.refkeys_mtx.lock(); 1673 | defer self.refkeys_mtx.unlock(); 1674 | if (self.refkeys.contains(key)) return self.refkeys.getKey(key).?; 1675 | const dup = try self.allocator.dupe(u8, key); 1676 | try self.refkeys.put(dup, {}); 1677 | return dup; 1678 | } 1679 | 1680 | // Reference: SWIM:4.2 1681 | // Order of preference: 1682 | // 1683 | // {Alive:M, inc=i} overrides 1684 | // - {Suspect:M, inc=j}, i>j 1685 | // - {Alive:M, inc=j}, i>j 1686 | // 1687 | // {Suspect:M, inc=i} overrides 1688 | // - {Suspect:M, inc=j}, i>j 1689 | // - {Alive:M, inc=j}, i>=j 1690 | // 1691 | // {Faulty:M, inc=i} overrides 1692 | // - {Alive:M, inc=j}, any j 1693 | // - {Suspect:M, inc=j}, any j 1694 | // 1695 | fn setMemberInfo( 1696 | self: *Self, 1697 | key: []const u8, 1698 | state: ?Liveness, 1699 | incarnation: ?u64, 1700 | force: bool, 1701 | ) !void { 1702 | self.members_mtx.lock(); 1703 | defer self.members_mtx.unlock(); 1704 | const p = self.members.getPtr(key); 1705 | if (p) |_| {} else return; 1706 | 1707 | var apply = false; 1708 | var in_state: Liveness = .alive; 1709 | var in_inc: u64 = p.?.incarnation; 1710 | if (state) |s| in_state = s else return; 1711 | if (incarnation) |inc| in_inc = inc; 1712 | 1713 | if (in_state == .alive) { 1714 | if (p.?.liveness == .suspected and in_inc > p.?.incarnation) apply = true; 1715 | if (p.?.liveness == .alive and in_inc > p.?.incarnation) apply = true; 1716 | } 1717 | 1718 | if (in_state == .suspected) { 1719 | if (p.?.liveness == .suspected and in_inc > p.?.incarnation) apply = true; 1720 | if (p.?.liveness == .alive and in_inc >= p.?.incarnation) apply = true; 1721 | } 1722 | 1723 | if (in_state == .faulty) apply = true; 1724 | if (force) apply = true; 1725 | 1726 | if (!apply) return; 1727 | 1728 | if (p.?.liveness == .faulty and in_state == .alive) p.?.incarnation = 0; 1729 | 1730 | p.?.liveness = in_state; 1731 | p.?.incarnation = in_inc; 1732 | 1733 | if (p.?.liveness == .suspected and in_state != .suspected) p.?.age_suspected.reset(); 1734 | if (p.?.liveness == .faulty and in_state != .faulty) p.?.age_faulty.reset(); 1735 | } 1736 | 1737 | // const SuspectToFaulty = struct { 1738 | // self: *Self, 1739 | // key: []const u8, 1740 | // }; 1741 | 1742 | // // To be run as a separate thread. Keep it suspected 1743 | // // for a while before marking it as faulty. 1744 | // fn suspectToFaulty(args: *SuspectToFaulty) !void { 1745 | // // Pause for a bit before we set to faulty. 1746 | // std.time.sleep(args.self.suspected_time); 1747 | // try args.self.setMemberInfo(args.key, .faulty, null, false); 1748 | 1749 | // // Broadcast confirm_faulty to the group. 1750 | // args.self.isd_mtx.lock(); 1751 | // defer args.self.isd_mtx.unlock(); 1752 | // try args.self.isd_queue.append(.{ 1753 | // .key = args.key, 1754 | // .state = .faulty, 1755 | // .isd_cmd = .confirm_faulty, 1756 | // .incarnation = try args.self.getIncarnation(), // ok since atomic 1757 | // }); 1758 | // } 1759 | 1760 | // Attempt removing faulty members after some time. 
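// Worked example of the override rules in setMemberInfo above: with a local
// entry {.alive, inc=3}, an incoming {.suspected, inc=3} is applied (suspect
// overrides alive at an equal incarnation), an incoming {.alive, inc=3} is
// ignored (it needs inc > 3), and an incoming {.faulty} or any update with
// force=true is always applied.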
1761 | fn removeFaultyMembers(self: *Self) !void { 1762 | var rml = std.ArrayList([]const u8).init(self.allocator); 1763 | defer rml.deinit(); 1764 | 1765 | { 1766 | self.members_mtx.lock(); 1767 | defer self.members_mtx.unlock(); 1768 | var it = self.members.iterator(); 1769 | const limit = self.proto_time; // TODO: expose 1770 | while (it.next()) |v| { 1771 | if (v.value_ptr.liveness != .faulty) continue; 1772 | if (v.value_ptr.age_faulty.read() > limit) { 1773 | try rml.append(v.key_ptr.*); 1774 | } 1775 | } 1776 | } 1777 | 1778 | for (rml.items) |v| self.removeMember(v); 1779 | } 1780 | 1781 | // We don't free the key itself here; we will free through self.ref_keys. 1782 | fn removeMember(self: *Self, key: []const u8) void { 1783 | self.members_mtx.lock(); 1784 | defer self.members_mtx.unlock(); 1785 | const fr = self.members.fetchRemove(key); 1786 | if (fr) |v| v.value.targets.deinit(); 1787 | } 1788 | 1789 | const MsgSection = enum { 1790 | src, 1791 | dst, 1792 | // isd, 1793 | }; 1794 | 1795 | // Set a section of the message payload with ip, port, and state info. 1796 | fn setMsgSection(msg: *Message, section: MsgSection, info: KeyInfo) !void { 1797 | const sep = std.mem.indexOf(u8, info.key, ":") orelse return; 1798 | const ip = info.key[0..sep]; 1799 | const port = try std.fmt.parseUnsigned(u16, info.key[sep + 1 ..], 10); 1800 | const addr = try std.net.Address.resolveIp(ip, port); 1801 | 1802 | switch (section) { 1803 | .src => { 1804 | msg.src_ip = addr.in.sa.addr; 1805 | msg.src_port = port; 1806 | msg.src_state = info.liveness; 1807 | msg.src_incarnation = info.incarnation; 1808 | }, 1809 | .dst => { 1810 | msg.dst_ip = addr.in.sa.addr; 1811 | msg.dst_port = port; 1812 | msg.dst_state = info.liveness; 1813 | msg.dst_incarnation = info.incarnation; 1814 | }, 1815 | } 1816 | } 1817 | 1818 | fn getState(self: *Self) ElectionState { 1819 | self.elex_mtx.lock(); 1820 | defer self.elex_mtx.unlock(); 1821 | return self.elex_state; 1822 | } 1823 | 1824 | fn setState(self: *Self, state: ElectionState) void { 1825 | self.elex_mtx.lock(); 1826 | defer self.elex_mtx.unlock(); 1827 | self.elex_state = state; 1828 | } 1829 | 1830 | // Best-effort basis only. `msg` should already contain the new join info 1831 | // in the dst_* portion, as well as it's source info. 
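// If no leader is known yet (self.leader still holds the "0" placeholder),
// this is a no-op; otherwise the message is sent to the leader's ip:port and
// any send error is returned to the caller.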
1832 | fn informLeaderOfJoin(self: *Self, msg: []u8) !void { 1833 | const leader = b: { 1834 | self.elex_mtx.lock(); 1835 | defer self.elex_mtx.unlock(); 1836 | break :b self.leader; 1837 | }; 1838 | 1839 | if (leader.len < 2) return; 1840 | 1841 | const sep = std.mem.indexOf(u8, leader, ":") orelse return; 1842 | const ip = leader[0..sep]; 1843 | const port = try std.fmt.parseUnsigned(u16, leader[sep + 1 ..], 10); 1844 | 1845 | try send(ip, port, msg, null); 1846 | } 1847 | 1848 | fn getTerm(self: *Self) u64 { 1849 | return @atomicLoad( 1850 | u64, 1851 | &self.elex_term, 1852 | std.builtin.AtomicOrder.seq_cst, 1853 | ); 1854 | } 1855 | 1856 | fn setTerm(self: *Self, term: u64) void { 1857 | @atomicStore( 1858 | u64, 1859 | &self.elex_term, 1860 | term, 1861 | std.builtin.AtomicOrder.seq_cst, 1862 | ); 1863 | } 1864 | 1865 | fn incTermAndGet(self: *Self) u64 { 1866 | _ = @atomicRmw( 1867 | u64, 1868 | &self.elex_term, 1869 | std.builtin.AtomicRmwOp.Add, 1870 | 1, 1871 | std.builtin.AtomicOrder.seq_cst, 1872 | ); 1873 | 1874 | return self.getTerm(); 1875 | } 1876 | 1877 | fn getVotes(self: *Self) u32 { 1878 | return @atomicLoad( 1879 | u32, 1880 | &self.votes, 1881 | std.builtin.AtomicOrder.seq_cst, 1882 | ); 1883 | } 1884 | 1885 | fn setVotes(self: *Self, vote: u32) void { 1886 | @atomicStore( 1887 | u32, 1888 | &self.votes, 1889 | vote, 1890 | std.builtin.AtomicOrder.seq_cst, 1891 | ); 1892 | } 1893 | 1894 | fn voteForSelf(self: *Self) u32 { 1895 | _ = @atomicRmw( 1896 | u32, 1897 | &self.votes, 1898 | std.builtin.AtomicRmwOp.Add, 1899 | 1, 1900 | std.builtin.AtomicOrder.seq_cst, 1901 | ); 1902 | 1903 | return self.getVotes(); 1904 | } 1905 | 1906 | fn incVotesAndGet(self: *Self) u32 { 1907 | return self.voteForSelf(); 1908 | } 1909 | 1910 | fn getElexTimeoutMin(self: *Self) u64 { 1911 | return @atomicLoad( 1912 | u64, 1913 | &self.elex_tm_min, 1914 | std.builtin.AtomicOrder.seq_cst, 1915 | ); 1916 | } 1917 | 1918 | fn getElexTimeoutMax(self: *Self) u64 { 1919 | return @atomicLoad( 1920 | u64, 1921 | &self.elex_tm_max, 1922 | std.builtin.AtomicOrder.seq_cst, 1923 | ); 1924 | } 1925 | }; 1926 | } 1927 | 1928 | // Helper function for internal one-shot send/recv. The same message ptr is 1929 | // used for both request and response payloads. If `tm_us` is not null, 1930 | // default timeout will be used. 1931 | fn send(ip: []const u8, port: u16, msg: []u8, tm_us: ?u32) !void { 1932 | const addr = try std.net.Address.resolveIp(ip, port); 1933 | const sock = try std.posix.socket( 1934 | std.posix.AF.INET, 1935 | std.posix.SOCK.DGRAM | std.posix.SOCK.CLOEXEC, 1936 | 0, 1937 | ); 1938 | 1939 | var tm: u32 = 1_000_000; 1940 | if (tm_us) |v| tm = v; 1941 | 1942 | defer std.posix.close(sock); 1943 | try setReadTimeout(sock, tm); 1944 | try setWriteTimeout(sock, tm); 1945 | try std.posix.connect(sock, &addr.any, addr.getOsSockLen()); 1946 | _ = try std.posix.write(sock, msg); 1947 | _ = try std.posix.recv(sock, msg, 0); 1948 | } 1949 | 1950 | /// Converts an ip and port to a string with format ip:port, eg. "127.0.0.1:8080". 1951 | /// Caller is responsible for releasing the returned memory. 
1952 | fn keyFromIpPort(allocator: std.mem.Allocator, ip: u32, port: u16) ![]const u8 { 1953 | const ipb = std.mem.asBytes(&ip); 1954 | return try std.fmt.allocPrint(allocator, "{d}.{d}.{d}.{d}:{d}", .{ 1955 | ipb[0], 1956 | ipb[1], 1957 | ipb[2], 1958 | ipb[3], 1959 | port, 1960 | }); 1961 | } 1962 | 1963 | test "keyFromIpPort" { 1964 | const out = try keyFromIpPort(std.testing.allocator, 16777343, 8080); 1965 | defer std.testing.allocator.free(out); 1966 | try std.testing.expect(std.mem.eql(u8, out, "127.0.0.1:8080")); 1967 | } 1968 | 1969 | /// Set socket read timeout in microseconds. Linux only. 1970 | pub fn setReadTimeout(socket: std.posix.socket_t, read: ?u32) !void { 1971 | std.debug.assert(read == null or read.? != 0); 1972 | const micros = read orelse 0; 1973 | const opt = std.posix.timeval{ 1974 | .tv_sec = @intCast(@divTrunc(micros, std.time.us_per_s)), 1975 | .tv_usec = @intCast(@mod(micros, std.time.us_per_s)), 1976 | }; 1977 | 1978 | try std.posix.setsockopt( 1979 | socket, 1980 | std.posix.SOL.SOCKET, 1981 | std.posix.SO.RCVTIMEO, 1982 | std.mem.toBytes(opt)[0..], 1983 | ); 1984 | } 1985 | 1986 | /// Set socket write timeout in microseconds. Linux only. 1987 | pub fn setWriteTimeout(socket: std.posix.socket_t, write: ?u32) !void { 1988 | std.debug.assert(write == null or write.? != 0); 1989 | const micros = write orelse 0; 1990 | const opt = std.posix.timeval{ 1991 | .tv_sec = @intCast(@divTrunc(micros, std.time.us_per_s)), 1992 | .tv_usec = @intCast(@mod(micros, std.time.us_per_s)), 1993 | }; 1994 | 1995 | try std.posix.setsockopt( 1996 | socket, 1997 | std.posix.SOL.SOCKET, 1998 | std.posix.SO.SNDTIMEO, 1999 | std.mem.toBytes(opt)[0..], 2000 | ); 2001 | } 2002 | --------------------------------------------------------------------------------