├── .dockerignore ├── startup-gcp-mig.sh ├── Dockerfile ├── localbuild.sh ├── startup-aws-asg.sh ├── .gitignore ├── dockerfile-debian ├── LICENSE ├── k8s.yaml ├── .github └── workflows │ └── main.yml ├── src ├── scratch.zig ├── main.zig └── zgroup.zig └── README.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .zig-cache/ 2 | zig-out/ -------------------------------------------------------------------------------- /startup-gcp-mig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir /opt/zgroup/ && cd /opt/zgroup/ 3 | wget https://github.com/flowerinthenight/zgroup/releases/download/v0.3.2/zgroup-v0.3.2-x86_64-linux.tar.gz 4 | tar -xzvf zgroup-v0.3.2-x86_64-linux.tar.gz 5 | INTERNAL_IP=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip) 6 | ZGROUP_JOIN_PREFIX=0b9303ad-1beb-483f-abb5-bc58e0214531 ./zgroup group1 ${INTERNAL_IP}:8080 2>&1 | logger & 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM kassany/alpine-ziglang:0.13.0 2 | WORKDIR /tmp/ 3 | COPY src/ ./src/ 4 | COPY build* ./ 5 | RUN zig build -Doptimize=ReleaseFast --summary all 6 | # RUN zig build --summary all 7 | 8 | FROM debian:stable-slim 9 | RUN set -x && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y curl ca-certificates && rm -rf /var/lib/apt/lists/* 10 | WORKDIR /app/ 11 | COPY --from=0 /tmp/zig-out/bin/zgroup . 12 | ENTRYPOINT ["/app/zgroup"] 13 | CMD ["group1", "0.0.0.0:8080"] 14 | -------------------------------------------------------------------------------- /localbuild.sh: -------------------------------------------------------------------------------- 1 | # NOTE: This is specific to my local dev environment. 2 | # Usage: 3 | # 4 | # ./localbuild.sh {tag}, e.g. ./localbuild.sh v7 5 | # 6 | kubectl delete -f deployment.yaml 7 | docker build --rm -t zgroup . 
8 | docker tag zgroup asia.gcr.io/mobingi-main/zgroup:$1 9 | docker push asia.gcr.io/mobingi-main/zgroup:$1 10 | docker rmi $(docker images --filter "dangling=true" -q --no-trunc) -f 11 | sed -i -e 's/image\:\ asia.gcr.io\/mobingi\-main\/zgroup[\:@].*$/image\:\ asia.gcr.io\/mobingi\-main\/zgroup\:'$1'/g' deployment.yaml 12 | -------------------------------------------------------------------------------- /startup-aws-asg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir /opt/zgroup/ && cd /opt/zgroup/ 3 | wget https://github.com/flowerinthenight/zgroup/releases/download/v0.3.2/zgroup-v0.3.2-x86_64-linux.tar.gz 4 | tar -xzvf zgroup-v0.3.2-x86_64-linux.tar.gz 5 | METADATA_TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") 6 | INTERNAL_IP=$(curl -H "X-aws-ec2-metadata-token: $METADATA_TOKEN" http://169.254.169.254/latest/meta-data/local-ipv4) 7 | ZGROUP_JOIN_PREFIX=0b9303ad-1beb-483f-abb5-bc58e0214531 ./zgroup group1 ${INTERNAL_IP}:8080 2>&1 | logger & 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | zig-out/ 39 | 40 | # Debug files 41 | *.dSYM/ 42 | *.su 43 | *.idb 44 | *.pdb 45 | 46 | # Kernel Module Compile Results 47 | *.mod* 48 | *.cmd 49 | .tmp_versions/ 50 | modules.order 51 | Module.symvers 52 | Mkfile.old 53 | dkms.conf 54 | 55 | # Cache(s) 56 | .zig-cache/ 57 | 58 | # Others 59 | deployment.yaml -------------------------------------------------------------------------------- /dockerfile-debian: -------------------------------------------------------------------------------- 1 | FROM debian:bookworm 2 | RUN set -x && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y curl xz-utils ca-certificates && rm -rf /var/lib/apt/lists/* 3 | WORKDIR /tmp/ 4 | COPY src/ ./src/ 5 | COPY build* ./ 6 | RUN curl -O https://ziglang.org/download/0.13.0/zig-linux-x86_64-0.13.0.tar.xz && \ 7 | xz --decompress zig-linux-x86_64-0.13.0.tar.xz && tar -xf zig-linux-x86_64-0.13.0.tar && \ 8 | ./zig-linux-x86_64-0.13.0/zig build -Doptimize=ReleaseFast --summary all 9 | 10 | FROM debian:stable-slim 11 | RUN set -x && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y curl ca-certificates && rm -rf /var/lib/apt/lists/* 12 | WORKDIR /app/ 13 | COPY --from=0 /tmp/zig-out/bin/zgroup . 
14 | ENTRYPOINT ["/app/zgroup"] 15 | CMD ["group1", "0.0.0.0:8080"] 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 flowerinthenight 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /k8s.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | apiVersion: apps/v1 4 | kind: Deployment 5 | metadata: 6 | name: zgroup 7 | spec: 8 | selector: 9 | matchLabels: 10 | app: zgroup 11 | replicas: 1 12 | revisionHistoryLimit: 5 13 | template: 14 | metadata: 15 | labels: 16 | app: zgroup 17 | spec: 18 | containers: 19 | - name: zgroup 20 | image: quay.io/flowerinthenight/zgroup:v0.3.2 21 | command: ["/bin/sh"] 22 | args: ["-c", '/app/zgroup group1 ${K8S_MY_POD_IP}:8080'] 23 | resources: 24 | requests: 25 | cpu: 100m 26 | memory: 500Mi 27 | limits: 28 | cpu: 100m 29 | memory: 500Mi 30 | imagePullPolicy: Always 31 | env: 32 | - name: K8S_MY_POD_IP 33 | valueFrom: 34 | fieldRef: 35 | fieldPath: status.podIP 36 | - name: GET_HOSTS_FROM 37 | value: dns 38 | - name: ZGROUP_JOIN_PREFIX 39 | value: "c06a9044-856d-4583-8095-c57d37272b05" 40 | ports: 41 | - containerPort: 8080 42 | 43 | --- 44 | 45 | apiVersion: autoscaling/v1 46 | kind: HorizontalPodAutoscaler 47 | metadata: 48 | name: zgroup-hpa 49 | spec: 50 | scaleTargetRef: 51 | apiVersion: apps/v1 52 | kind: Deployment 53 | name: zgroup 54 | minReplicas: 3 55 | maxReplicas: 3 56 | targetCPUUtilizationPercentage: 40 57 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | tags: 7 | - '*' 8 | pull_request: 9 | branches: [ "main" ] 10 | 11 | jobs: 12 | codeberg: 13 | name: Codeberg 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Mirror to Codeberg 22 | uses: yesolutions/mirror-action@master 23 | with: 24 | REMOTE: "https://codeberg.org/flowerinthenight/zgroup.git" 25 | GIT_USERNAME: flowerinthenight 26 | GIT_PASSWORD: ${{ secrets.GIT_PASSWORD }} 27 | 28 | build: 29 | name: Build 30 | if: 
"!contains(github.event.commits[0].message, 'ci skip')" 31 | runs-on: ubuntu-latest 32 | steps: 33 | - name: Print GH context 34 | env: 35 | GITHUB_CONTEXT: ${{ toJson(github) }} 36 | run: | 37 | echo "$GITHUB_CONTEXT" 38 | 39 | - name: Checkout code 40 | uses: actions/checkout@v2 41 | 42 | - name: Setup Zig 43 | uses: mlugg/setup-zig@v1 44 | with: 45 | version: 0.13.0 46 | 47 | - name: Run tests 48 | run: zig build test 49 | 50 | - name: Release from tags 51 | if: startsWith(github.event.ref, 'refs/tags/v') 52 | env: 53 | GH_TOKEN: ${{ secrets.GH_TOKEN }} 54 | run: | 55 | zig build -Doptimize=ReleaseFast -Dtarget=x86_64-linux --summary all 56 | cp zig-out/bin/zgroup ./ 57 | tar czvf zgroup-${GITHUB_REF_NAME}-x86_64-linux.tar.gz zgroup 58 | gh release create ${GITHUB_REF_NAME} ./*.tar.gz --generate-notes 59 | -------------------------------------------------------------------------------- /src/scratch.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const builtin = std.builtin; 3 | const AtomicOrder = std.builtin.AtomicOrder; 4 | const AtomicRmwOp = std.builtin.AtomicRmwOp; 5 | const backoff = @import("zbackoff"); 6 | const zgroup = @import("zgroup.zig"); 7 | const dbg = std.debug.print; 8 | 9 | const pdata = struct { 10 | ev1: *std.Thread.ResetEvent, 11 | ev2: *std.Thread.ResetEvent, 12 | }; 13 | 14 | fn waiter(p: *pdata) void { 15 | for (0..2) |i| { 16 | dbg("{d} start wait1\n", .{i}); 17 | p.ev1.wait(); 18 | dbg("{d} end wait1, call reset\n", .{i}); 19 | p.ev1.reset(); 20 | 21 | dbg("{d} start wait2\n", .{i}); 22 | p.ev2.wait(); 23 | dbg("{d} end wait2, call reset\n", .{i}); 24 | p.ev2.reset(); 25 | } 26 | } 27 | 28 | fn testWaiter() !void { 29 | var ev1 = std.Thread.ResetEvent{}; 30 | var ev2 = std.Thread.ResetEvent{}; 31 | var data = pdata{ .ev1 = &ev1, .ev2 = &ev2 }; 32 | 33 | const t = try std.Thread.spawn(.{}, waiter, .{&data}); 34 | t.detach(); 35 | 36 | std.time.sleep(std.time.ns_per_s * 5); 37 | ev1.set(); 38 | ev2.set(); 39 | std.time.sleep(std.time.ns_per_s * 5); 40 | ev1.set(); 41 | ev2.set(); 42 | std.time.sleep(std.time.ns_per_s * 5); 43 | } 44 | 45 | test "backoff" { 46 | // Try referencing external dep in test block. 
47 | const bo = backoff.Backoff{}; 48 | dbg("val={any}\n", .{bo.initial}); 49 | } 50 | 51 | test "atomic" { 52 | var tm = try std.time.Timer.start(); 53 | var v: u64 = 0; 54 | @atomicStore(u64, &v, 1, AtomicOrder.seq_cst); 55 | _ = @atomicLoad(u64, &v, AtomicOrder.seq_cst); 56 | // print("load={d}\n", .{a}); 57 | _ = @atomicRmw(u64, &v, AtomicRmwOp.Add, 1e9, AtomicOrder.seq_cst); 58 | _ = @atomicLoad(u64, &v, AtomicOrder.seq_cst); 59 | // print("add={d}\n", .{b}); 60 | dbg("took {any}\n", .{std.fmt.fmtDuration(tm.read())}); 61 | } 62 | 63 | test "view" { 64 | const en = enum(u4) { 65 | change, 66 | do, 67 | start, 68 | }; 69 | 70 | const e: en = .start; 71 | dbg("size={d}\n", .{@sizeOf(@TypeOf(e))}); 72 | const ee: en = @enumFromInt(2); 73 | dbg("int={any}\n", .{ee}); 74 | 75 | const val = 17293822569102704642; // 2 76 | dbg("cmd={x}\n", .{(val & 0xf000000000000000) >> 60}); 77 | dbg("val={x}\n", .{val & 0x0fffffffffffffff}); 78 | dbg("{x}\n", .{0xffffffffffffffff & (0b11 << 62)}); 79 | } 80 | 81 | // test "httpget" { 82 | // var parent = std.heap.ArenaAllocator.init(std.testing.allocator); 83 | // defer parent.deinit(); 84 | // const arena = parent.allocator(); 85 | 86 | // var client = std.http.Client{ .allocator = arena }; 87 | // defer client.deinit(); 88 | 89 | // const endpoint = "https://keyvalue.immanuel.co/api/KeyVal/GetValue/seegmed7/chew"; 90 | // const uri = try std.Uri.parse(endpoint); 91 | 92 | // const server_header_buffer: []u8 = try arena.alloc(u8, 8 * 1024 * 4); 93 | // var req = try client.open(.GET, uri, std.http.Client.RequestOptions{ 94 | // .server_header_buffer = server_header_buffer, 95 | // }); 96 | 97 | // defer req.deinit(); 98 | 99 | // try req.send(); 100 | // try req.finish(); 101 | // try req.wait(); 102 | 103 | // const repstr = try req.reader().readAllAlloc(arena, std.math.maxInt(usize)); 104 | 105 | // dbg("reply={s}\n", .{repstr}); 106 | // } 107 | 108 | // test "httppost" { 109 | // var parent = std.heap.ArenaAllocator.init(std.testing.allocator); 110 | // defer parent.deinit(); 111 | // const arena = parent.allocator(); 112 | 113 | // var client = std.http.Client{ .allocator = arena }; 114 | // defer client.deinit(); 115 | 116 | // const endpoint = "https://keyvalue.immanuel.co/api/KeyVal/UpdateValue/seegmed7/chew/something"; 117 | // const uri = try std.Uri.parse(endpoint); 118 | 119 | // const server_header_buffer: []u8 = try arena.alloc(u8, 8 * 1024 * 4); 120 | // var req = try client.open(.POST, uri, std.http.Client.RequestOptions{ 121 | // .server_header_buffer = server_header_buffer, 122 | // .extra_headers = &[_]std.http.Header{.{ .name = "content-length", .value = "9" }}, 123 | // }); 124 | 125 | // defer req.deinit(); 126 | 127 | // try req.send(); 128 | // try req.finish(); 129 | // try req.wait(); 130 | 131 | // const repstr = try req.reader().readAllAlloc(arena, std.math.maxInt(usize)); 132 | 133 | // dbg("reply={s}\n", .{repstr}); 134 | // } 135 | 136 | // test "httpfetch" { 137 | // var parent = std.heap.ArenaAllocator.init(std.testing.allocator); 138 | // defer parent.deinit(); 139 | // const arena = parent.allocator(); 140 | 141 | // var client = std.http.Client{ .allocator = arena }; 142 | // defer client.deinit(); 143 | 144 | // // https://api.keyval.org/get/chew 145 | // // const endpoint = "https://keyvalue.immanuel.co/api/KeyVal/UpdateValue/seegmed7/chew/something"; 146 | // const endpoint = "https://api.keyval.org/set/chew/bloodboil"; 147 | // const uri = try std.Uri.parse(endpoint); 148 | 149 | // var response_body = 
std.ArrayList(u8).init(arena); 150 | 151 | // const response = try client.fetch(std.http.Client.FetchOptions{ 152 | // .method = std.http.Method.POST, 153 | // .location = .{ .uri = uri }, 154 | // // .extra_headers = &[_]std.http.Header{.{ .name = "Content-Length", .value = "9" }}, 155 | // .response_storage = .{ .dynamic = &response_body }, 156 | // }); 157 | 158 | // if (response.status != .ok) dbg("booooooo\n", .{}); 159 | 160 | // const parsed_body = try response_body.toOwnedSlice(); 161 | // dbg("RESPONSE: {s}\n", .{parsed_body}); 162 | // } 163 | 164 | test "returnblock" { 165 | { 166 | dbg("block entry\n", .{}); 167 | defer dbg("block exit\n", .{}); 168 | if (true) return; 169 | } 170 | 171 | dbg("should not be here\n", .{}); 172 | } 173 | 174 | test "shift" { 175 | // 0xFFFF.FFFF.FFFF.FFFF 176 | const on = 1 << 63; 177 | dbg("{X}\n", .{on}); 178 | const val = (on & 0x8000000000000000) >> 63; 179 | dbg("{d}\n", .{val}); 180 | const min = 21 << 31; 181 | dbg("{d}\n", .{(min & 0x7FFFFFFF80000000) >> 31}); 182 | } 183 | 184 | test "comp" { 185 | var empty = try std.fmt.allocPrint(std.testing.allocator, "", .{}); 186 | dbg("len_empty={d}\n", .{empty.len}); 187 | const str = try std.fmt.allocPrint(std.testing.allocator, "hello", .{}); 188 | defer std.testing.allocator.free(str); 189 | empty = str; 190 | dbg("len_empty={d}\n", .{empty.len}); 191 | } 192 | 193 | test "envmap" { 194 | const allocator = std.testing.allocator; 195 | var envmap = try std.process.getEnvMap(allocator); 196 | defer envmap.deinit(); 197 | 198 | var iter = envmap.iterator(); 199 | while (iter.next()) |v| { 200 | dbg("{s}={s}\n", .{ v.key_ptr.*, v.value_ptr.* }); 201 | } 202 | 203 | const path = envmap.getPtr("PATH"); 204 | if (path) |v| { 205 | dbg("PATH={s}\n", .{v.*}); 206 | } else { 207 | dbg("no PATH\n", .{}); 208 | } 209 | } 210 | 211 | test "fba" { 212 | const mem = try std.testing.allocator.alloc(u8, 8); 213 | defer std.testing.allocator.free(mem); 214 | var _fba = std.heap.FixedBufferAllocator.init(mem); 215 | var fba = _fba.allocator(); 216 | dbg("0x{X}\n", .{mem}); 217 | var m0 = try fba.alloc(u8, 1); 218 | m0[0] = 0xFF; 219 | dbg("0x{X}\n", .{mem}); 220 | var m1 = try fba.alloc(u8, 1); 221 | m1[0] = 0x7F; 222 | dbg("0x{X}\n", .{mem}); 223 | var m2 = try fba.alloc(u8, 1); 224 | m2[0] = 0x8F; 225 | dbg("0x{X}\n", .{mem}); 226 | var m3 = try fba.alloc(u8, 1); 227 | m3[0] = 0x9F; 228 | dbg("0x{X}\n", .{mem}); 229 | var m = try fba.alloc(u8, 4); 230 | m[0] = 0xBF; 231 | m[1] = 0xBF; 232 | m[2] = 0xBF; 233 | m[3] = 0xBF; 234 | dbg("0x{X}\n", .{mem}); 235 | 236 | fba.free(m0); 237 | fba.free(m1); 238 | fba.free(m3); 239 | dbg("0x{X}\n", .{mem}); 240 | 241 | var mx = try fba.alloc(u8, 2); 242 | mx[0] = 0x21; 243 | mx[1] = 0x21; 244 | dbg("0x{X}\n", .{mem}); 245 | } 246 | -------------------------------------------------------------------------------- /src/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const zgroup = @import("zgroup.zig"); 3 | const backoff = @import("zbackoff"); 4 | 5 | const log = std.log; 6 | 7 | // You can change zgroup's log-level to .info. 8 | pub const std_options = .{ 9 | .log_level = .info, 10 | .log_scope_levels = &[_]std.log.ScopeLevel{ 11 | .{ .scope = .zgroup, .level = .debug }, 12 | }, 13 | }; 14 | 15 | // To be passed to our callback(s). 
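// `prefix` comes from the $ZGROUP_JOIN_PREFIX environment variable and `group` is the
// group name (args[1]); together they form the K/V key used by the join-address
// callbacks below. `skip_callback` is set when a join address is passed on the
// command line, which turns onJoinAddr into a no-op.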
16 | const UserData = struct { 17 | prefix: []const u8, 18 | group: []const u8, 19 | skip_callback: bool = false, 20 | }; 21 | 22 | const Fleet = zgroup.Fleet(UserData); 23 | 24 | // A sample binary on how to use the zgroup library. 25 | // Expected cmdline args: 26 | // 27 | // [0] = bin 28 | // [1] = name 29 | // [2] = member ip:port 30 | // [3] = join ip:port (optional) 31 | // 32 | pub fn main() !void { 33 | var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 34 | var aa = std.heap.ArenaAllocator.init(gpa.allocator()); 35 | defer aa.deinit(); // destroy arena in one go 36 | const arena = aa.allocator(); 37 | 38 | // Collect process args to a map. 39 | var args = try std.process.argsWithAllocator(arena); 40 | var hm = std.AutoHashMap(usize, []const u8).init(arena); 41 | var i: usize = 0; 42 | while (args.next()) |val| : (i += 1) { 43 | const arg = try std.fmt.allocPrint(arena, "{s}", .{val}); 44 | try hm.put(i, arg); 45 | } 46 | 47 | if (hm.count() < 3) { 48 | log.err("invalid args", .{}); 49 | return; 50 | } 51 | 52 | var iter = hm.iterator(); 53 | while (iter.next()) |v| 54 | log.info("args[{d}]: {s}", .{ v.key_ptr.*, v.value_ptr.* }); 55 | 56 | // Required: so we can have our own unique URL in the free service. 57 | var envmap = try std.process.getEnvMap(arena); 58 | if (hm.count() == 3) { 59 | const jp = envmap.getPtr("ZGROUP_JOIN_PREFIX"); 60 | if (jp) |_| {} else { 61 | log.err("no $ZGROUP_JOIN_PREFIX envvar found", .{}); 62 | return; 63 | } 64 | } 65 | 66 | const name = hm.getEntry(1).?.value_ptr.*; 67 | 68 | var data = UserData{ 69 | .prefix = b: { 70 | const jp = envmap.getPtr("ZGROUP_JOIN_PREFIX"); 71 | if (jp) |v| break :b v.* else { 72 | break :b try std.fmt.allocPrint(arena, "", .{}); 73 | } 74 | }, 75 | .group = name, 76 | }; 77 | 78 | const callbacks = Fleet.Callbacks{ 79 | .data = &data, // arbitrary callback data 80 | 81 | // Callback function for the join address. 82 | .onJoinAddr = onJoinAddr, 83 | 84 | // So we won't overload the free service we are using. 85 | .on_join_every = 50, 86 | }; 87 | 88 | var member = hm.getEntry(2).?.value_ptr.*; 89 | var sep = std.mem.indexOf(u8, member, ":").?; 90 | 91 | var cfg = Fleet.Config{ 92 | .name = name, 93 | .ip = member[0..sep], 94 | .callbacks = callbacks, 95 | }; 96 | 97 | cfg.port = try std.fmt.parseUnsigned(u16, member[sep + 1 ..], 10); 98 | 99 | var fleet = try Fleet.init(gpa.allocator(), &cfg); 100 | try fleet.run(); // actual run, join later 101 | defer fleet.deinit(); 102 | 103 | i = 0; 104 | var joined = false; 105 | var bo = backoff.Backoff{}; 106 | while (true) : (i += 1) { 107 | if (joined) 108 | std.time.sleep(std.time.ns_per_s * 1) 109 | else 110 | std.time.sleep(if (i >= 100) std.time.ns_per_s else bo.pause()); 111 | 112 | if (i > 1 and i < 100 and !joined) { 113 | switch (hm.count()) { 114 | 3 => { 115 | // No join address in args. Try using a free discovery service. 
116 | var join_addr: []const u8 = ""; 117 | const ja = try getJoinAddress( 118 | arena, 119 | envmap.getPtr("ZGROUP_JOIN_PREFIX").?.*, 120 | name, 121 | ); 122 | 123 | if (ja.len > 0) join_addr = ja else continue; 124 | 125 | log.info("[{d}] join address found, addr={s}", .{ i, join_addr }); 126 | 127 | sep = std.mem.indexOf(u8, join_addr, ":").?; 128 | const join_port = try std.fmt.parseUnsigned( 129 | u16, 130 | join_addr[sep + 1 ..], 131 | 10, 132 | ); 133 | 134 | fleet.join( 135 | name, 136 | join_addr[0..sep], 137 | join_port, 138 | &joined, 139 | ) catch |err| 140 | log.err("joining thru {s}:{d} failed: {any}", .{ 141 | join_addr[0..sep], 142 | join_port, 143 | err, 144 | }); 145 | }, 146 | 4 => { 147 | // Join address is provided. Skip callback. 148 | data.skip_callback = true; 149 | 150 | const join = hm.getEntry(3).?.value_ptr.*; 151 | sep = std.mem.indexOf(u8, join, ":").?; 152 | const join_ip = join[0..sep]; 153 | if (join_ip.len == 0) { 154 | log.err("invalid join address", .{}); 155 | return; 156 | } 157 | 158 | const join_port = try std.fmt.parseUnsigned( 159 | u16, 160 | join[sep + 1 ..], 161 | 10, 162 | ); 163 | 164 | fleet.join( 165 | name, 166 | join_ip, 167 | join_port, 168 | &joined, 169 | ) catch |err| log.err("join failed: {any}", .{err}); 170 | }, 171 | else => {}, 172 | } 173 | } 174 | 175 | // Sample code on getting the current members in the group. 176 | if (i > 0 and @mod(i, 10) == 0) { 177 | const members = try fleet.getMembers(gpa.allocator()); 178 | defer members.deinit(); 179 | log.info("main: members={d}", .{members.items.len}); 180 | for (members.items) |v| gpa.allocator().free(v); 181 | } 182 | } 183 | } 184 | 185 | // The allocator here is the allocator passed to Fleet's init function. `addr`'s 186 | // format is "ip:port", e.g. "127.0.0.1:8080", and needs to be freed after use. 187 | fn onJoinAddr(allocator: std.mem.Allocator, data: ?*UserData, addr: []const u8) !void { 188 | defer allocator.free(addr); 189 | if (data.?.skip_callback) return; 190 | try setJoinAddress(allocator, data.?.prefix, data.?.group, addr); 191 | } 192 | 193 | // We are using curl here as std.http.Client seems to not play well with this endpoint. 194 | // The "seegmed7" in the url is our API key. 195 | fn setJoinAddress( 196 | allocator: std.mem.Allocator, 197 | prefix: []const u8, 198 | group: []const u8, 199 | addr: []const u8, 200 | ) !void { 201 | const enc = std.base64.Base64Encoder.init(std.base64.url_safe_alphabet_chars, '='); 202 | const buf = try allocator.alloc(u8, enc.calcSize(addr.len)); 203 | defer allocator.free(buf); 204 | const out = enc.encode(buf, addr); 205 | const url = try std.fmt.allocPrint( 206 | allocator, 207 | "https://keyvalue.immanuel.co/api/KeyVal/UpdateValue/seegmed7/{s}-{s}/{s}", 208 | .{ prefix, group, out }, 209 | ); 210 | 211 | defer allocator.free(url); 212 | 213 | log.info("callback: setJoinAddress: url={s}", .{url}); 214 | 215 | const result = try std.process.Child.run(.{ 216 | .allocator = allocator, 217 | .argv = &[_][]const u8{ 218 | "curl", 219 | "-X", 220 | "POST", 221 | "-H", 222 | "Content-Length: 1", // somehow, this works with this endpoint (required though) 223 | url, 224 | }, 225 | }); 226 | 227 | defer { 228 | allocator.free(result.stdout); 229 | allocator.free(result.stderr); 230 | } 231 | } 232 | 233 | // We are using curl here as std.http.Client seems to not play well with this endpoint. 234 | // The "seegmed7" in the url is our API key. We are passing an arena allocator here. 
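// Returns the base64-decoded join address in "ip:port" form. The returned slice is
// allocated from `allocator` (an arena in this sample), so it is released when the
// arena is deinitialized.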
235 | fn getJoinAddress(allocator: std.mem.Allocator, prefix: []const u8, group: []const u8) ![]u8 { 236 | const url = try std.fmt.allocPrint( 237 | allocator, 238 | "https://keyvalue.immanuel.co/api/KeyVal/GetValue/seegmed7/{s}-{s}", 239 | .{ prefix, group }, 240 | ); 241 | 242 | log.info("callback: getJoinAddress: url={s}", .{url}); 243 | 244 | const result = try std.process.Child.run(.{ 245 | .allocator = allocator, 246 | .argv = &[_][]const u8{ "curl", url }, 247 | }); 248 | 249 | const out = std.mem.trim(u8, result.stdout, "\""); 250 | const dec = std.base64.Base64Decoder.init(std.base64.url_safe_alphabet_chars, '='); 251 | const buf = try allocator.alloc(u8, try dec.calcSizeForSlice(out)); 252 | try dec.decode(buf, out); 253 | return buf; 254 | } 255 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > [!WARNING] 2 | > Still in alpha stage. APIs may change. 3 | 4 | --- 5 | 6 | [![main](https://github.com/flowerinthenight/zgroup/actions/workflows/main.yml/badge.svg)](https://github.com/flowerinthenight/zgroup/actions/workflows/main.yml) 7 | [![Docker Repository on Quay](https://quay.io/repository/flowerinthenight/zgroup/status "Docker Repository on Quay")](https://quay.io/repository/flowerinthenight/zgroup) 8 | 9 | (This repo is mirrored to [https://codeberg.org/flowerinthenight/zgroup](https://codeberg.org/flowerinthenight/zgroup)). 10 | 11 | ## Overview 12 | 13 | **zgroup** is a [Zig](https://ziglang.org/) library that can manage cluster membership and member failure detection. It uses a combination of [SWIM Protocol](https://www.cs.cornell.edu/projects/Quicksilver/public_pdfs/SWIM.pdf)'s gossip-style information dissemination and [Raft](https://raft.github.io/raft.pdf)'s leader election algorithm (minus the log management) to track cluster changes. 14 | 15 | ### On payload size 16 | 17 | One of zgroup's main goals is to be able to track clusters with sizes that can change dynamically over time (e.g. [Kubernetes Deployments](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/), [GCP Instance Groups](https://cloud.google.com/compute/docs/instance-groups), [AWS Autoscaling Groups](https://docs.aws.amazon.com/autoscaling/ec2/userguide/auto-scaling-groups.html), etc.) with minimal dependencies and network load. All of my previous related works so far depend on some external service (see [spindle](https://github.com/flowerinthenight/spindle), [hedge](https://github.com/flowerinthenight/hedge)), using traditional heartbeating, to achieve this. This heartbeating technique usually suffers from increasing payload sizes (proportional to cluster sizes) as clusters get bigger. But I wanted a system that doesn't suffer from that side effect. Enter [SWIM](https://www.cs.cornell.edu/projects/Quicksilver/public_pdfs/SWIM.pdf)'s infection-style information dissemination. It can use a constant payload size regardless of the cluster size. SWIM uses a combination of `PING`s, `INDIRECT-PING`s, and `ACK`s to detect member failures while piggybacking on these same messages to propagate membership updates (gossip protocol). Currently, zgroup only uses SWIM's direct probing protocol; it doesn't fully implement the Suspicion sub-protocol (yet). 18 | 19 | At the moment, zgroup uses a single, 64-byte payload for all its messages, including leader election (see below).
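For illustration only (the library's actual wire format is the packed `Message` struct in [src/zgroup.zig](./src/zgroup.zig)), here is a minimal sketch of the idea with hypothetical field names: the payload is one fixed-size packed struct, so its size does not depend on how many members are in the group.

```zig
const std = @import("std");

// Hypothetical sketch only; the real wire format is the packed `Message`
// struct in src/zgroup.zig. The point: one fixed-size packed struct is used
// for every message, so the payload never grows with the member count.
const Wire = packed struct {
    name: u64 = 0, // group name, truncated to 8 bytes
    cmd: u8 = 0, // command/opcode
    src_ip: u32 = 0, // sender IPv4
    src_port: u16 = 0, // sender port
    src_incarnation: u64 = 0, // sender incarnation number
};

comptime {
    // Compile-time guarantee that a message always fits the fixed budget.
    std.debug.assert(@sizeOf(Wire) <= 64);
}

test "constant payload size" {
    try std.testing.expect(@sizeOf(Wire) <= 64);
}
```

zgroup sends its real struct as raw bytes over UDP via `std.mem.asBytes` (see `udpListen()` in `src/zgroup.zig`), so every datagram stays the same size no matter how large the cluster gets.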
20 | 21 | ### On leader election 22 | 23 | I also wanted some sort of leader election capability without depending on an external lock service. At the moment, zgroup uses [Raft](https://raft.github.io/raft.pdf)'s leader election algorithm sub-protocol (without the log management) to achieve this. I should note that Raft's leader election algorithm depends on stable membership for it to work properly, so zgroup's leader election works on a best-effort basis only; split-brain can still happen while the cluster size is changing. Additional code guards are added to minimize split-brain in these scenarios, but it's not completely eliminated. In my use-case (and testing), gradual cluster size changes are mostly stable, while sudden changes with huge size deltas are not. For example, a big, sudden jump from three nodes (zgroup's minimum size) to, say, a hundred, due to autoscaling, would cause a split-brain. Once the target size is achieved, however, a single leader will always be elected. 24 | 25 | A note on Raft's random timeout range during leader election: zgroup's leader tracks ping latency averages and attempts to adjust the timeout range accordingly to accommodate cluster size changes over time. 26 | 27 | ### Join address 28 | 29 | For a node to join an existing cluster, it needs a join address. While zgroup exposes a `join()` function for this, it also provides a callback mechanism that supplies callers with the join address. This address can then be stored in an external store for the other nodes to use. Internally, zgroup uses the node with the highest IP(v4) address in the group. 30 | 31 | ## Sample binary 32 | 33 | A [sample](./src/main.zig) binary is provided to show a way to use the library. There are two ways to run the sample: 34 | 35 | * Specifying the join address manually 36 | * Using an external service to get the join address 37 | 38 | ### Local with join address 39 | 40 | ```sh 41 | # Build the sample binary: 42 | $ zig build --summary all 43 | 44 | # Run the 1st process. The expected args look like: 45 | # 46 | # ./zgroup groupname member_ip:port [join_ip:port] 47 | # 48 | 49 | # Run the first process (join to self). 50 | $ ./zig-out/bin/zgroup group1 0.0.0.0:8080 0.0.0.0:8080 51 | 52 | # Then you can run additional instances. 53 | # Join through the 1st process/node (different terminal): 54 | $ ./zig-out/bin/zgroup group1 0.0.0.0:8081 0.0.0.0:8080 55 | 56 | # Join through the 2nd process/node (different terminal): 57 | $ ./zig-out/bin/zgroup group1 0.0.0.0:8082 0.0.0.0:8081 58 | 59 | # Join through the 1st process/node (different terminal): 60 | $ ./zig-out/bin/zgroup group1 0.0.0.0:8083 0.0.0.0:8080 61 | 62 | # and so on... 63 | ``` 64 | 65 | ### Local with an external service 66 | 67 | If configured, the sample binary uses a free service, [https://keyvalue.immanuel.co/](https://keyvalue.immanuel.co/), as a store for the join address. 68 | 69 | ```sh 70 | # Build the sample binary: 71 | $ zig build --summary all 72 | 73 | # Generate UUID: 74 | $ uuidgen 75 | {output} 76 | 77 | # Run the 1st process.
The expected args look like: 78 | # 79 | # ./zgroup groupname member_ip:port 80 | # 81 | 82 | # Run the first process: 83 | $ ZGROUP_JOIN_PREFIX={output} ./zig-out/bin/zgroup group1 0.0.0.0:8080 84 | 85 | # Add a second node (different terminal): 86 | $ ZGROUP_JOIN_PREFIX={output} ./zig-out/bin/zgroup group1 0.0.0.0:8081 87 | 88 | # Add a third node (different terminal): 89 | $ ZGROUP_JOIN_PREFIX={output} ./zig-out/bin/zgroup group1 0.0.0.0:8082 90 | 91 | # Add a fourth node (different terminal): 92 | $ ZGROUP_JOIN_PREFIX={output} ./zig-out/bin/zgroup group1 0.0.0.0:8083 93 | 94 | # and so on... 95 | ``` 96 | 97 | ### Kubernetes (Deployment) 98 | 99 | A sample Kubernetes [deployment file](./k8s.yaml) is provided to try zgroup on [Kubernetes Deployments](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). Before deploying though, make sure to update the `ZGROUP_JOIN_PREFIX` environment variable, like so: 100 | 101 | ```sh 102 | # Generate UUID: 103 | $ uuidgen 104 | {output} 105 | 106 | # Update the 'value' part with your output. 107 | ... 108 | - name: ZGROUP_JOIN_PREFIX 109 | value: "{output}" 110 | ... 111 | 112 | # Deploy to Kubernetes: 113 | $ kubectl create -f k8s.yaml 114 | 115 | # You will notice some initial errors in the logs. 116 | # Wait for a while before the K/V store is updated. 117 | ``` 118 | 119 | ### GCP Managed Instance Group (MIG) 120 | 121 | A sample [startup script](./startup-gcp-mig.sh) is provided to try zgroup on a [GCP MIG](https://cloud.google.com/compute/docs/instance-groups#managed_instance_groups). Before deploying though, make sure to update the `ZGROUP_JOIN_PREFIX` value in the script, like so: 122 | 123 | ```sh 124 | # Generate UUID: 125 | $ uuidgen 126 | {output} 127 | 128 | # Update the 'value' part of ZGROUP_JOIN_PREFIX with your output. 129 | ... 130 | ZGROUP_JOIN_PREFIX={output} ./zgroup group1 ... 131 | 132 | # Create an instance template: 133 | $ gcloud compute instance-templates create zgroup-tmpl \ 134 | --machine-type e2-micro \ 135 | --metadata=startup-script=''"$(cat startup-gcp-mig.sh)"'' 136 | 137 | # Create a regional MIG: 138 | $ gcloud compute instance-groups managed create rmig \ 139 | --template zgroup-tmpl --size 3 --region {your-region} 140 | 141 | # You can view the logs through: 142 | $ tail -f /var/log/messages 143 | ``` 144 | 145 | ### AWS Autoscaling Group 146 | 147 | A sample [startup script](./startup-aws-asg.sh) is provided to try zgroup on an [AWS ASG](https://docs.aws.amazon.com/autoscaling/ec2/userguide/auto-scaling-groups.html). Before deploying though, make sure to update the `ZGROUP_JOIN_PREFIX` value in the script, like so: 148 | 149 | ```sh 150 | # Generate UUID: 151 | $ uuidgen 152 | {output} 153 | 154 | # Update the 'value' part of ZGROUP_JOIN_PREFIX with your output. 155 | ... 156 | ZGROUP_JOIN_PREFIX={output} ./zgroup group1 ... 157 | 158 | # Create a launch template. ImageId here is Amazon Linux, default VPC. 159 | # (Added newlines for readability. Might not run when copied as is.) 
160 | $ aws ec2 create-launch-template \ 161 | --launch-template-name zgroup-lt \ 162 | --version-description version1 \ 163 | --launch-template-data ' 164 | { 165 | "UserData":"'"$(cat startup-aws-asg.sh | base64 -w 0)"'", 166 | "ImageId":"ami-0f75d1a8c9141bd00", 167 | "InstanceType":"t2.micro" 168 | }' 169 | 170 | # Create the ASG: 171 | $ aws autoscaling create-auto-scaling-group \ 172 | --auto-scaling-group-name zgroup-asg \ 173 | --launch-template LaunchTemplateName=zgroup-lt,Version='1' \ 174 | --min-size 3 \ 175 | --max-size 3 \ 176 | --availability-zones {target-zone} 177 | 178 | # You can view the logs through: 179 | $ [sudo] journalctl -f 180 | ``` 181 | 182 | ## Getting the list of members 183 | 184 | To get the current members of the group, you can try something like: 185 | 186 | ```zig 187 | const members = try fleet.getMembers(gpa.allocator()); 188 | defer members.deinit(); 189 | 190 | for (members.items, 0..) |v, i| { 191 | defer gpa.allocator().free(v); 192 | log.info("member[{d}]: {s}", .{ i, v }); 193 | } 194 | ``` 195 | 196 | The tricky part of using zgroup is configuring the timeouts to optimize state dissemination and convergence. The current implementation was only tested within a local network. 197 | 198 | ## TODOs 199 | 200 | - [ ] - Provide callbacks for membership changes 201 | - [ ] - Provide an API to get the current leader 202 | - [ ] - Provide an interface for other processes (non-Zig users) 203 | - [ ] - Use multicast (if available) for the join address 204 | 205 | PR's are welcome. 206 | -------------------------------------------------------------------------------- /src/zgroup.zig: -------------------------------------------------------------------------------- 1 | //! zgroup is a library that can manage cluster membership and member failure detection. 2 | //! It is based on the SWIM Protocol and Raft's leader election algorithm sub-protocol 3 | //! (without the log management). 4 | //! 5 | //! References: 6 | //! 7 | //! https://www.cs.cornell.edu/projects/Quicksilver/public_pdfs/SWIM.pdf 8 | //! https://raft.github.io/raft.pdf 9 | //! 10 | const std = @import("std"); 11 | const backoff = @import("zbackoff"); 12 | 13 | const log = std.log.scoped(.zgroup); 14 | 15 | pub fn Fleet(UserData: type) type { 16 | return struct { 17 | const Self = @This(); 18 | 19 | allocator: std.mem.Allocator, 20 | 21 | // See Config comments for these fields. 22 | name: []const u8, 23 | ip: []const u8, 24 | port: u16, 25 | proto_time: u64, 26 | suspect_time: u64, 27 | ping_req_k: u32, 28 | elex_delay: u64, 29 | 30 | // Our per-member data. Key format is "ip:port", eg. "127.0.0.1:8080". 31 | members: std.StringHashMap(MemberData), 32 | members_mtx: std.Thread.Mutex = .{}, 33 | 34 | // Long-term references to all keys used in `members` and other intermediate 35 | // copies. Safer for access amidst all the addition and removals of items. 36 | refkeys: std.StringHashMap(void), 37 | refkeys_mtx: std.Thread.Mutex = .{}, 38 | 39 | // Intermediate member queue for round-robin pings and randomization. 40 | ping_queue: std.ArrayList([]const u8), 41 | 42 | // For requesting our indirect ping agent(s). 43 | ping_req_data: *RequestPing = undefined, // set in run() 44 | ping_req_0: std.Thread.ResetEvent = .{}, // request 45 | ping_req_1: std.Thread.ResetEvent = .{}, // response 46 | 47 | // Join address heartbeat timeout. 48 | join_addr_tm: std.time.Timer, 49 | 50 | callbacks: Callbacks, 51 | 52 | // Raft-inspired leader election. 
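// Rough flow, as implemented in leaderElectionTick() and the .heartbeat/.req4votes
// handlers in udpListen(): a follower that hasn't heard a leader heartbeat within its
// election timeout increments elex_term, votes for itself, and becomes a candidate
// that asks the other alive members for votes; a heartbeat carrying an equal or
// higher term resets the node back to follower and records the sender as the leader.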
53 | elex_mtx: std.Thread.Mutex = .{}, 54 | elex_join: bool = false, 55 | elex_join_tm: std.time.Timer, 56 | elex_term: u64 = 0, 57 | elex_state: ElectionState = .follower, 58 | votes: u32 = 0, 59 | voted_for: []const u8, 60 | elex_tm: std.time.Timer, 61 | candidate_tm: std.time.Timer, 62 | elex_tm_min: u64, // set via config 63 | elex_tm_max: u64, // set via config 64 | leader: []const u8, 65 | 66 | const ElectionState = enum(u8) { 67 | follower, 68 | candidate, 69 | leader, 70 | }; 71 | 72 | // SWIM protocol generic commands. 73 | const Command = enum(u8) { 74 | noop, 75 | ack, 76 | nack, 77 | join, 78 | ping, 79 | ping_req, 80 | heartbeat, 81 | req4votes, 82 | join2leader, 83 | }; 84 | 85 | // Infection-style dissemination (ISD) commands. 86 | const IsdCommand = enum(u8) { 87 | noop, 88 | infect, 89 | suspect, 90 | confirm_alive, 91 | confirm_faulty, 92 | }; 93 | 94 | // Possible member liveness states. 95 | const Liveness = enum(u8) { 96 | alive, 97 | suspected, 98 | faulty, 99 | }; 100 | 101 | const KeyInfo = struct { 102 | key: []const u8, 103 | liveness: Liveness, 104 | incarnation: u64 = 0, 105 | isd_cmd: IsdCommand = .noop, 106 | }; 107 | 108 | // Our generic UDP comms/protocol payload. 109 | const Message = packed struct { 110 | name: u64 = 0, 111 | 112 | // Section for ping, ping_req, ack, nack. 113 | cmd: Command = .noop, 114 | src_ip: u32 = 0, 115 | src_port: u16 = 0, 116 | src_state: Liveness = .alive, 117 | src_incarnation: u64 = 0, 118 | 119 | dst_cmd: IsdCommand = .noop, 120 | dst_ip: u32 = 0, 121 | dst_port: u16 = 0, 122 | dst_state: Liveness = .alive, 123 | dst_incarnation: u64 = 0, 124 | 125 | // Used for multiple subprotocols explained below: 126 | // 127 | // 1) For determining the highest node (for join) during SWIM pings. 128 | // Format: 129 | // |----- cmd ----| |- port (u16) -| |------- IP address (u32) ------| 130 | // 0000000000000011.1111111111111111.1111111111111111.1111111111111111 131 | // 132 | // 2) Term and node count during leader heartbeats. 133 | // Format: 134 | // |---- count ---| |----------------- term (u48) -------------------| 135 | // 1111111111111111.1111111111111111.1111111111111111.1111111111111111 136 | proto1: u64 = 0, 137 | 138 | // Used for multiple subprotocols explained below: 139 | // 140 | // 1) For informing the sender's member count during SWIM pings. 141 | // Format: the full 64 bits represents the value. 142 | // 143 | // 2) Min and max election timeouts during leader heartbeats, in ms. 144 | // Format: 145 | // 146 | // 1-MSB: 1 -> field is valid, 0 -> skip 147 | // 62-LSB: 31 bits each for min/max 148 | // 149 | // |x|---------- min (u31) ----------||---------- max (u31) ---------| 150 | // 1011111111111111.1111111111111111.1111111111111111.1111111111111111 151 | proto2: u64 = 0, 152 | }; 153 | 154 | // Per-member context data. 155 | const MemberData = struct { 156 | liveness: Liveness = .alive, 157 | age_suspected: std.time.Timer = undefined, 158 | age_faulty: std.time.Timer = undefined, 159 | incarnation: u64 = 0, 160 | targets: std.ArrayList([]const u8), 161 | }; 162 | 163 | const JoinCmd = enum(u8) { 164 | noop, 165 | heartbeat, 166 | invalidate, 167 | }; 168 | 169 | pub const Callbacks = struct { 170 | /// Optional context data; to be passed back to the callback function(s). 171 | data: ?*UserData, 172 | 173 | /// Optional callback for the join address. This is provided as an option to 174 | /// provide a join address for new nodes to join in. 
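/// Internally, the address reported through this callback is that of the node with
/// the highest IPv4 address currently in the group.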
175 | /// 176 | /// For example, you might want to setup a discovery service (e.g. K/V store) 177 | /// where you will store the join address from this callback. Other joining 178 | /// nodes can then use the store to query the join address. 179 | onJoinAddr: ?*const fn (std.mem.Allocator, ?*UserData, []const u8) anyerror!void, 180 | 181 | /// If > 0, `onJoinAddr` callback will be called every `proto_time * val`. For 182 | /// example, if your proto_time is 2s and this value is 10, `onJoinAddr` will 183 | /// be called every 20s. Default (0) means every `proto_time`; same as 1. 184 | on_join_every: u64 = 0, 185 | }; 186 | 187 | /// Config for init(). 188 | pub const Config = struct { 189 | /// We use the name as group identifier when groups are running over the 190 | /// same network. Max of 8 chars (u64 in payload). 191 | name: []const u8, 192 | 193 | /// Member IP address for UDP, eg. "0.0.0.0". Use init() to initialize. 194 | ip: []const u8, 195 | 196 | /// Member port number for UDP, eg. 8080. 197 | port: u16 = 8080, 198 | 199 | /// Our SWIM protocol timeout duration. 200 | proto_time: u64 = std.time.ns_per_ms * 500, 201 | 202 | /// Suspicion subprotocol timeout duration. 203 | suspect_time: u64 = std.time.ns_per_ms * 500, 204 | 205 | /// Number of members we will request to do indirect pings for us (agents). 206 | /// The only valid value at the moment is `1`. 207 | ping_req_k: u32 = 1, 208 | 209 | /// Delay between leader's liveness pings to all nodes. 210 | elex_delay: u64 = std.time.ns_per_ms * 100, 211 | 212 | /// See `Callbacks` struct for more information. 213 | callbacks: Callbacks, 214 | }; 215 | 216 | /// Create an instance of Self based on `config`. The `allocator` will be stored 217 | /// internally as the main internal allocator. Arena is not recommended as it's 218 | /// going to be used in the internal UDP server and the main loop which are 219 | /// expected to be long-running. Some areas will utilize an arena allocator 220 | /// based on the input allocator when it's appropriate. 221 | pub fn init(allocator: std.mem.Allocator, config: *const Config) !Self { 222 | const edf: f64 = @floatFromInt(config.elex_delay); 223 | const minf: f64 = edf / 0.05; 224 | const emin: u64 = @intFromFloat(minf); 225 | 226 | return Self{ 227 | .allocator = allocator, 228 | .name = if (config.name.len > 8) config.name[0..8] else config.name, 229 | .ip = config.ip, 230 | .port = config.port, 231 | .proto_time = config.proto_time, 232 | .suspect_time = config.suspect_time, 233 | .ping_req_k = config.ping_req_k, 234 | .elex_delay = config.elex_delay, 235 | .elex_tm_min = emin, 236 | .elex_tm_max = emin + std.time.ns_per_s, 237 | .members = std.StringHashMap(MemberData).init(allocator), 238 | .refkeys = std.StringHashMap(void).init(allocator), 239 | .ping_queue = std.ArrayList([]const u8).init(allocator), 240 | .join_addr_tm = try std.time.Timer.start(), 241 | .callbacks = config.callbacks, 242 | .leader = try std.fmt.allocPrint(allocator, "", .{}), 243 | .voted_for = try std.fmt.allocPrint(allocator, "", .{}), 244 | .elex_tm = try std.time.Timer.start(), 245 | .candidate_tm = try std.time.Timer.start(), 246 | .elex_join_tm = try std.time.Timer.start(), 247 | }; 248 | } 249 | 250 | /// Cleanup Self instance. At the moment, it is expected for this 251 | /// code to be long running until process is terminated. 252 | pub fn deinit(self: *Self) void { 253 | log.debug("deinit:", .{}); 254 | 255 | // TODO: See how to gracefuly exit threads. 
256 | 257 | self.members.deinit(); 258 | var it = self.refkeys.iterator(); 259 | while (it.next()) |v| self.allocator.free(v.key_ptr.*); 260 | self.refkeys.deinit(); 261 | self.ping_queue.deinit(); 262 | } 263 | 264 | /// Start group membership tracking. 265 | pub fn run(self: *Self) !void { 266 | log.debug("run: name={s}, address={s}:{d}", .{ 267 | self.name, 268 | self.ip, 269 | self.port, 270 | }); 271 | 272 | log.debug("*Message: size={d}, align={d}", .{ 273 | @sizeOf(Message), 274 | @alignOf(Message), 275 | }); 276 | 277 | log.debug("SWIM: prototime={any}, suspecttime={any}, k={d}", .{ 278 | std.fmt.fmtDuration(self.proto_time), 279 | std.fmt.fmtDuration(self.suspect_time), 280 | self.ping_req_k, 281 | }); 282 | 283 | log.debug("leader election timeout range: min={any}, max={any}", .{ 284 | std.fmt.fmtDuration(self.elex_tm_min), 285 | std.fmt.fmtDuration(self.elex_tm_max), 286 | }); 287 | 288 | const me = try self.getOwnKey(); 289 | defer self.allocator.free(me); 290 | _ = try self.ensureKeyRef(me); 291 | try self.upsertMember(me, .alive, 0, true); 292 | self.elex_tm.reset(); 293 | _ = try self.ensureKeyRef("0"); // dummy 294 | 295 | const server = try std.Thread.spawn(.{}, Self.udpListen, .{self}); 296 | server.detach(); 297 | const ticker = try std.Thread.spawn(.{}, Self.swimTick, .{self}); 298 | ticker.detach(); 299 | const ldr = try std.Thread.spawn(.{}, Self.leaderElectionTick, .{self}); 300 | ldr.detach(); 301 | 302 | // self.ping_req_data = try self.allocator.create(RequestPing); 303 | // self.ping_req_data.self = self; 304 | // const rp = try std.Thread.spawn(.{}, Self.requestPing, .{self.ping_req_data}); 305 | // rp.detach(); 306 | } 307 | 308 | /// Ask a node to join an existing group. `joined` will be set to true 309 | /// if joining is successful. We are joining the group through `dst_*`. 310 | pub fn join( 311 | self: *Self, 312 | name: []const u8, 313 | dst_ip: []const u8, 314 | dst_port: u16, 315 | joined: *bool, 316 | ) !void { 317 | var aa = std.heap.ArenaAllocator.init(self.allocator); 318 | defer aa.deinit(); // destroy arena in one go 319 | const arena = aa.allocator(); 320 | 321 | const buf = try arena.alloc(u8, @sizeOf(Message)); 322 | const msg: *Message = @ptrCast(@alignCast(buf)); 323 | 324 | try self.presetMessage(msg); 325 | 326 | msg.cmd = .join; 327 | try self.setMsgSrcToOwn(msg); 328 | 329 | try send(dst_ip, dst_port, buf, null); 330 | 331 | switch (msg.cmd) { 332 | .ack => { 333 | const nn = std.mem.readVarInt(u64, self.name, .little); 334 | if (nn == msg.name) { 335 | const key = try std.fmt.allocPrint(arena, "{s}:{d}", .{ 336 | dst_ip, 337 | dst_port, 338 | }); 339 | 340 | try self.upsertMember(key, .alive, 0, true); 341 | self.elex_join_tm.reset(); 342 | joined.* = true; 343 | 344 | log.info("joined via {s}:{any}, name={s}", .{ 345 | dst_ip, 346 | dst_port, 347 | name, 348 | }); 349 | } 350 | }, 351 | else => {}, 352 | } 353 | } 354 | 355 | /// Returns a list of active members from the group/cluster. Caller owns the returning 356 | /// list, as well as each items in the array, which are duplicated from the internal 357 | /// list to prevent crashes during access due to potential changes in the main list. 
358 | pub fn getMembers(self: *Self, allocator: std.mem.Allocator) !std.ArrayList([]const u8) { 359 | var tmp = std.ArrayList([]const u8).init(allocator); 360 | defer tmp.deinit(); 361 | 362 | { 363 | self.members_mtx.lock(); 364 | defer self.members_mtx.unlock(); 365 | var it = self.members.iterator(); 366 | while (it.next()) |v| { 367 | if (v.value_ptr.liveness == .faulty) continue; 368 | try tmp.append(v.key_ptr.*); 369 | } 370 | } 371 | 372 | var out = std.ArrayList([]const u8).init(allocator); 373 | 374 | if (tmp.items.len == 0) return out; 375 | 376 | for (tmp.items) |v| { 377 | const kdup = try allocator.dupe(u8, v); 378 | try out.append(kdup); 379 | } 380 | 381 | return out; 382 | } 383 | 384 | // Run internal UDP server for handling both SWIM- and Raft-related 385 | // protocols. Uses a single allocation of *Message all throughout. 386 | fn udpListen(self: *Self) !void { 387 | log.info("starting UDP server on :{d}...", .{self.port}); 388 | 389 | const name = std.mem.readVarInt(u64, self.name, .little); 390 | const buf = try self.allocator.alloc(u8, @sizeOf(Message)); 391 | defer self.allocator.free(buf); // release buffer 392 | 393 | // One allocation for the duration of this function. 394 | const msg: *Message = @ptrCast(@alignCast(buf)); 395 | 396 | const addr = try std.net.Address.resolveIp(self.ip, self.port); 397 | const sock = try std.posix.socket( 398 | std.posix.AF.INET, 399 | std.posix.SOCK.DGRAM, 400 | std.posix.IPPROTO.UDP, 401 | ); 402 | 403 | defer std.posix.close(sock); 404 | try setWriteTimeout(sock, 5_000_000); 405 | try std.posix.bind(sock, &addr.any, addr.getOsSockLen()); 406 | var src_addr: std.posix.sockaddr = undefined; 407 | var src_addrlen: std.posix.socklen_t = @sizeOf(std.posix.sockaddr); 408 | 409 | var i: usize = 0; 410 | while (true) : (i += 1) { 411 | const len = std.posix.recvfrom( 412 | sock, 413 | buf, 414 | 0, 415 | &src_addr, 416 | &src_addrlen, 417 | ) catch |err| { 418 | log.err("recvfrom failed: {any}", .{err}); 419 | std.time.sleep(std.time.ns_per_ms * 500); 420 | continue; 421 | }; 422 | 423 | var aa = std.heap.ArenaAllocator.init(self.allocator); 424 | defer aa.deinit(); // destroy arena in one go 425 | const arena = aa.allocator(); 426 | 427 | // Main protocol message handler. 428 | switch (msg.cmd) { 429 | .join => b: { 430 | if (msg.name == name) { 431 | const key = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 432 | try self.upsertMember(key, .alive, msg.src_incarnation, true); 433 | 434 | // Inform current leader (if any) of this new join. 435 | msg.dst_ip = msg.src_ip; 436 | msg.dst_port = msg.src_port; 437 | try self.setMsgSrcToOwn(msg); 438 | 439 | log.debug("{s} is joining, inform leader [{s}]", .{ key, self.leader }); 440 | 441 | self.informLeaderOfJoin(buf) catch |err| 442 | log.debug("informLeaderOfJoin failed: {any}", .{err}); 443 | 444 | // Always set src_* to own info. 
445 | try self.setMsgSrcToOwn(msg); 446 | 447 | msg.cmd = .ack; 448 | _ = std.posix.sendto( 449 | sock, 450 | std.mem.asBytes(msg), 451 | 0, 452 | &src_addr, 453 | src_addrlen, 454 | ) catch |err| log.err("sendto failed: {any}", .{err}); 455 | 456 | break :b; 457 | } 458 | 459 | msg.cmd = .nack; 460 | _ = std.posix.sendto( 461 | sock, 462 | std.mem.asBytes(msg), 463 | 0, 464 | &src_addr, 465 | src_addrlen, 466 | ) catch |err| log.err("sendto failed: {any}", .{err}); 467 | }, 468 | .ping => { 469 | // 470 | // Payload information: 471 | // 472 | // src_*: caller/requester 473 | // dst_*: ISD (piggyback) 474 | // 475 | msg.cmd = .nack; // default 476 | 477 | if (msg.name == name) { 478 | msg.cmd = .ack; 479 | const src = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 480 | try self.upsertMember(src, .alive, msg.src_incarnation, true); 481 | 482 | if (msg.dst_cmd == .infect) { 483 | const dst = try keyFromIpPort(arena, msg.dst_ip, msg.dst_port); 484 | try self.upsertMember( 485 | dst, 486 | msg.dst_state, 487 | msg.dst_incarnation, 488 | false, 489 | ); 490 | } 491 | 492 | const n = self.getCounts(); 493 | if ((n[0] + n[1]) < msg.proto2) { 494 | self.elex_tm.reset(); 495 | @atomicStore( 496 | bool, 497 | &self.elex_join, 498 | false, 499 | std.builtin.AtomicOrder.seq_cst, 500 | ); 501 | } else @atomicStore( 502 | bool, 503 | &self.elex_join, 504 | true, 505 | std.builtin.AtomicOrder.seq_cst, 506 | ); 507 | 508 | // Always set src_* to own info. 509 | try self.setMsgSrcToOwn(msg); 510 | 511 | // Use dst_* for ISD info. 512 | var excludes: [1][]const u8 = .{src}; 513 | try self.setMsgDst(arena, msg, &excludes); 514 | 515 | // Handle join address protocol. 516 | var ipm = msg.proto1 & 0x00000000FFFFFFFF; 517 | var portm = (msg.proto1 & 0x0000FFFF00000000) >> 32; 518 | const cmdm: JoinCmd = @enumFromInt((msg.proto1 & 519 | 0xFFFF000000000000) >> 48); 520 | 521 | if (cmdm == .heartbeat) b: { 522 | const al = try self.getHighestNode(); 523 | if ((al[0] + al[1]) <= (ipm + portm)) { 524 | _ = self.join_addr_tm.lap(); 525 | break :b; 526 | } 527 | 528 | const hb: u64 = @intFromEnum(JoinCmd.invalidate); 529 | ipm = al[0] & 0x00000000FFFFFFFF; 530 | portm = (al[1] << 32) & 0x0000FFFF00000000; 531 | msg.proto1 = (hb << 48) | ipm | portm; 532 | } 533 | } 534 | 535 | _ = std.posix.sendto( 536 | sock, 537 | std.mem.asBytes(msg), 538 | 0, 539 | &src_addr, 540 | src_addrlen, 541 | ) catch |err| log.err("sendto failed: {any}", .{err}); 542 | }, 543 | .ping_req => b: { 544 | // 545 | // Payload information: 546 | // 547 | // src_*: caller/requester (we are the agent) 548 | // dst_*: target of the ping-request 549 | // 550 | if (msg.name == name) { 551 | const src = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 552 | try self.upsertMember(src, msg.src_state, msg.src_incarnation, true); 553 | 554 | const dst = try keyFromIpPort(arena, msg.dst_ip, msg.dst_port); 555 | 556 | log.debug("({d}) ping-req: requested to ping {s}", .{ len, dst }); 557 | 558 | // Always set src_* to own info. 559 | try self.setMsgSrcToOwn(msg); 560 | 561 | // Use both dst_* and isd_* for ISD info. 562 | var excludes: [1][]const u8 = .{dst}; 563 | try self.setMsgDst(arena, msg, &excludes); 564 | 565 | // Handle leader protocol (egress). 566 | try self.setJoinProtoSend(msg); 567 | 568 | const ack = self.ping(dst) catch false; 569 | 570 | msg.cmd = .nack; // default 571 | 572 | if (ack) { 573 | // The src_* info here is the original ping target. 574 | // Copy its info to the dst_* section before overwriting. 
575 | msg.cmd = .ack; 576 | msg.dst_ip = msg.src_ip; 577 | msg.dst_port = msg.src_port; 578 | msg.dst_state = msg.src_state; 579 | msg.dst_incarnation = msg.src_incarnation; 580 | 581 | try self.upsertMember(dst, .alive, msg.src_incarnation, true); 582 | 583 | // Handle join address protocol (ingress). 584 | self.setJoinProtoRecv(msg); 585 | } 586 | 587 | // Always set src_* to own info. 588 | try self.setMsgSrcToOwn(msg); 589 | 590 | // Handle join address protocol (egress). 591 | try self.setJoinProtoSend(msg); 592 | 593 | _ = std.posix.sendto( 594 | sock, 595 | std.mem.asBytes(msg), 596 | 0, 597 | &src_addr, 598 | src_addrlen, 599 | ) catch |err| log.err("sendto failed: {any}", .{err}); 600 | 601 | break :b; 602 | } 603 | 604 | // Not in this group. 605 | self.presetMessage(msg) catch {}; 606 | msg.cmd = .nack; 607 | 608 | _ = std.posix.sendto( 609 | sock, 610 | std.mem.asBytes(msg), 611 | 0, 612 | &src_addr, 613 | src_addrlen, 614 | ) catch |err| log.err("sendto failed: {any}", .{err}); 615 | }, 616 | .heartbeat => { 617 | msg.cmd = .nack; 618 | const tc = self.getTermAndN(msg); 619 | if (tc[0] >= self.getTerm()) { 620 | msg.cmd = .ack; 621 | self.setTerm(tc[0]); 622 | self.setVotes(0); 623 | self.elex_tm.reset(); 624 | self.setState(.follower); 625 | 626 | const src = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 627 | const lkey = try self.ensureKeyRef(src); 628 | 629 | { 630 | self.elex_mtx.lock(); 631 | defer self.elex_mtx.unlock(); 632 | self.leader = lkey; 633 | self.voted_for = self.refkeys.getKeyPtr("0").?.*; 634 | } 635 | 636 | // Handle min/max timeouts from leader. 637 | b: { 638 | const on = (msg.proto2 & 0x8000000000000000) >> 63; 639 | if (on == 0) break :b; 640 | 641 | const lmin = ((msg.proto2 & 0x7FFFFFFF80000000) >> 31) * 1000; 642 | const lmax = ((msg.proto2 & 0x700000007FFFFFFF)) * 1000; 643 | 644 | @atomicStore( 645 | u64, 646 | &self.elex_tm_min, 647 | lmin, 648 | std.builtin.AtomicOrder.seq_cst, 649 | ); 650 | 651 | @atomicStore( 652 | u64, 653 | &self.elex_tm_max, 654 | lmax, 655 | std.builtin.AtomicOrder.seq_cst, 656 | ); 657 | } 658 | } 659 | 660 | _ = std.posix.sendto( 661 | sock, 662 | std.mem.asBytes(msg), 663 | 0, 664 | &src_addr, 665 | src_addrlen, 666 | ) catch |err| log.err("sendto failed: {any}", .{err}); 667 | }, 668 | .req4votes => { 669 | msg.cmd = .nack; 670 | var voted = false; 671 | 672 | { 673 | self.elex_mtx.lock(); 674 | defer self.elex_mtx.unlock(); 675 | if (self.voted_for.len > 1) voted = true; 676 | } 677 | 678 | const term = self.getTerm(); 679 | 680 | if (msg.proto1 >= term and !voted and self.getState() != .leader) { 681 | msg.cmd = .ack; 682 | self.setTerm(msg.proto1); 683 | 684 | const src = try keyFromIpPort(arena, msg.src_ip, msg.src_port); 685 | const vkey = try self.ensureKeyRef(src); 686 | 687 | { 688 | self.elex_mtx.lock(); 689 | defer self.elex_mtx.unlock(); 690 | self.voted_for = vkey; 691 | log.debug("req4votes: voted_for={s}", .{self.voted_for}); 692 | } 693 | } 694 | 695 | _ = std.posix.sendto( 696 | sock, 697 | std.mem.asBytes(msg), 698 | 0, 699 | &src_addr, 700 | src_addrlen, 701 | ) catch |err| log.err("sendto failed: {any}", .{err}); 702 | }, 703 | .join2leader => b: { 704 | const state = self.getState(); 705 | if (state != .leader) break :b; 706 | const dst = try keyFromIpPort(arena, msg.dst_ip, msg.dst_port); 707 | const pdst = try self.ensureKeyRef(dst); 708 | log.debug("[{d}] received join2leader, add {s}", .{ i, pdst }); 709 | try self.upsertMember(pdst, .alive, 0, false); 710 | }, 711 | else => {}, 712 | } 
713 | 714 | self.presetMessage(msg) catch {}; 715 | } 716 | } 717 | 718 | // Drives the SWIM protocol forward. Runs on a separate thread. 719 | fn swimTick(self: *Self) !void { 720 | var i: usize = 0; 721 | while (true) : (i += 1) { 722 | var tm = try std.time.Timer.start(); 723 | var aa = std.heap.ArenaAllocator.init(self.allocator); 724 | defer aa.deinit(); // destroy arena in one go 725 | const arena = aa.allocator(); 726 | 727 | // const counts = self.getCounts(); 728 | // log.debug("[{d}] members: alive={d}, suspected={d}, faulty={d}, total={d}", .{ 729 | // i, 730 | // counts[0], 731 | // counts[1], 732 | // counts[2], 733 | // counts[3], 734 | // }); 735 | 736 | var key_ptr: ?[]const u8 = null; 737 | const pt = try self.getPingTarget(arena); 738 | if (pt) |v| key_ptr = v; // ensure non-null 739 | 740 | if (key_ptr) |ping_key| { 741 | // log.debug("[{d}] try pinging {s}", .{ i, ping_key }); 742 | 743 | switch (self.ping(ping_key) catch false) { 744 | false => { 745 | // Let's do indirect ping for this suspicious node. 746 | // var prtm = try std.time.Timer.start(); 747 | // defer log.debug("[{d}] ping-req took {any}", .{ 748 | // i, 749 | // std.fmt.fmtDuration(prtm.read()), 750 | // }); 751 | 752 | // var do_suspected = false; 753 | // var excludes: [1][]const u8 = .{ping_key}; 754 | // const agents = try self.getRandomMember( 755 | // arena, 756 | // &excludes, 757 | // self.ping_req_k, 758 | // ); 759 | 760 | // if (agents.items.len == 0) do_suspected = true else { 761 | // log.debug("[{d}] ping-req: agent(s)={d}", .{ i, agents.items.len }); 762 | 763 | // self.ping_req_data.src = agents.items[0]; 764 | // self.ping_req_data.dst = ping_key; 765 | 766 | // self.ping_req_0.set(); 767 | // self.ping_req_1.wait(); 768 | // if (!self.ping_req_data.ack) do_suspected = true; 769 | // self.ping_req_1.reset(); 770 | // } 771 | 772 | // if (do_suspected) b: { 773 | // const ki = self.getKeyInfo(ping_key); 774 | // if (ki) |_| {} else break :b; 775 | // try self.setMemberInfo( 776 | // ping_key, 777 | // .suspected, 778 | // ki.?.incarnation, 779 | // true, 780 | // ); 781 | // } 782 | 783 | b: { 784 | const ki = self.getKeyInfo(ping_key); 785 | if (ki) |_| {} else break :b; 786 | try self.setMemberInfo( 787 | ping_key, 788 | .suspected, 789 | ki.?.incarnation, 790 | true, 791 | ); 792 | } 793 | }, 794 | else => { 795 | // log.debug("[{d}] ack from {s}", .{ i, ping_key }); 796 | 797 | // TEST: start 798 | // if (i > 0 and i <= 100 and @mod(i, 20) == 0) { 799 | // log.debug("[{d}] --- trigger suspect for {s}", .{ i, ping_key }); 800 | // self.isd_mtx.lock(); 801 | // defer self.isd_mtx.unlock(); 802 | // try self.isd_queue.append(.{ 803 | // .key = ping_key, 804 | // .state = .suspected, 805 | // .incarnation = 0, 806 | // .isd_cmd = .suspect, 807 | // }); 808 | // } 809 | // TEST: end 810 | }, 811 | } 812 | } 813 | 814 | // Setup leader callback. Mainly for joining. 815 | var mod = self.callbacks.on_join_every; 816 | if (mod == 0) mod = 1; 817 | if (i > 0 and @mod(i, mod) == 0) b: { 818 | const al = self.getHighestNode() catch break :b; 819 | if (!al[2]) break :b; 820 | if (self.callbacks.onJoinAddr) |_| {} else break :b; 821 | const me = try std.fmt.allocPrint(self.allocator, "{s}:{d}", .{ 822 | self.ip, 823 | self.port, 824 | }); 825 | 826 | try self.callbacks.onJoinAddr.?( 827 | self.allocator, 828 | self.callbacks.data, 829 | me, 830 | ); 831 | } 832 | 833 | try self.removeFaultyMembers(); 834 | 835 | // Suspected to faulty. 
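// Suspected members are first collected while holding the members lock, then
// demoted outside of it: setMemberInfo takes members_mtx itself and the mutex
// is not reentrant, so demoting while still iterating would self-deadlock.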
836 | var s2f = std.ArrayList([]const u8).init(arena); 837 | 838 | { 839 | self.members_mtx.lock(); 840 | defer self.members_mtx.unlock(); 841 | var it = self.members.iterator(); 842 | while (it.next()) |v| { 843 | if (self.keyIsMe(v.key_ptr.*)) continue; 844 | if (v.value_ptr.liveness != .suspected) continue; 845 | if (v.value_ptr.age_suspected.read() < self.suspect_time) continue; 846 | try s2f.append(v.key_ptr.*); 847 | } 848 | } 849 | 850 | for (s2f.items) |v| try self.setMemberInfo(v, .faulty, null, false); 851 | 852 | // Pause before the next tick. 853 | const elapsed = tm.read(); 854 | if (elapsed < self.proto_time) { 855 | const left = self.proto_time - elapsed; 856 | // log.debug("[{d}] sleep for {any}", .{ i, std.fmt.fmtDuration(left) }); 857 | std.time.sleep(left); 858 | } 859 | } 860 | } 861 | 862 | // Drives the Raft-based leader election forward. Runs on a separate thread. 863 | fn leaderElectionTick(self: *Self) !void { 864 | const buf = try self.allocator.alloc(u8, @sizeOf(Message)); 865 | defer self.allocator.free(buf); // release buffer 866 | 867 | // One allocation for the duration of this function. 868 | const msg: *Message = @ptrCast(@alignCast(buf)); 869 | 870 | const seed = std.crypto.random.int(u64); 871 | var prng = std.rand.DefaultPrng.init(seed); 872 | const random = prng.random(); 873 | 874 | var ldr_last_sweep: bool = false; 875 | const min_og = self.getElexTimeoutMin(); 876 | const max_og = self.getElexTimeoutMax(); 877 | var lmin: u64 = self.getElexTimeoutMin(); 878 | var lmax: u64 = self.getElexTimeoutMax(); 879 | 880 | var i: usize = 0; 881 | while (true) : (i += 1) { 882 | const skip = false; 883 | const n = self.getCounts(); 884 | if ((n[0] + n[1]) < 3 or skip) { 885 | std.time.sleep(random.intRangeAtMost( 886 | u64, 887 | self.getElexTimeoutMin(), 888 | self.getElexTimeoutMax(), 889 | )); 890 | 891 | continue; 892 | } 893 | 894 | const allowed = @atomicLoad( 895 | bool, 896 | &self.elex_join, 897 | std.builtin.AtomicOrder.seq_cst, 898 | ); 899 | 900 | var aa = std.heap.ArenaAllocator.init(self.allocator); 901 | defer aa.deinit(); // destroy arena in one go 902 | const arena = aa.allocator(); 903 | 904 | self.presetMessage(msg) catch {}; 905 | 906 | switch (self.getState()) { 907 | .follower => { 908 | if (self.elex_join_tm.read() >= self.proto_time * (n[0] + n[1])) { 909 | @atomicStore( 910 | bool, 911 | &self.elex_join, 912 | true, 913 | std.builtin.AtomicOrder.seq_cst, 914 | ); 915 | } 916 | 917 | const rand = random.intRangeAtMost( 918 | u64, 919 | self.getElexTimeoutMin(), 920 | self.getElexTimeoutMax(), 921 | ); 922 | 923 | if (!allowed) { 924 | std.time.sleep(rand); 925 | continue; 926 | } 927 | 928 | if (self.elex_tm.read() <= self.getElexTimeoutMin()) { 929 | std.time.sleep(rand); 930 | continue; 931 | } 932 | 933 | _ = self.incTermAndGet(); 934 | _ = self.voteForSelf(); 935 | self.setState(.candidate); 936 | self.candidate_tm.reset(); 937 | }, 938 | .candidate => { 939 | var bl = std.ArrayList([]const u8).init(arena); 940 | defer bl.deinit(); 941 | 942 | { 943 | self.members_mtx.lock(); 944 | defer self.members_mtx.unlock(); 945 | var iter = self.members.iterator(); 946 | while (iter.next()) |v| { 947 | if (v.value_ptr.liveness != .alive) continue; 948 | if (self.keyIsMe(v.key_ptr.*)) continue; 949 | try bl.append(v.key_ptr.*); 950 | } 951 | } 952 | 953 | if (bl.items.len == 0) { 954 | std.time.sleep(random.intRangeAtMost( 955 | u64, 956 | self.getElexTimeoutMin(), 957 | self.getElexTimeoutMax(), 958 | )); 959 | 960 | continue; 961 | } 962 | 963 
| log.debug("[{d}:{d}] req4votes to {d} nodes", .{ 964 | i, 965 | self.getTerm(), 966 | bl.items.len, 967 | }); 968 | 969 | var to_leader = false; 970 | for (bl.items) |k| { 971 | if (self.getState() == .follower) break; 972 | 973 | msg.cmd = .req4votes; 974 | try self.setMsgSrcToOwn(msg); 975 | const sep = std.mem.indexOf(u8, k, ":") orelse continue; 976 | const ip = k[0..sep]; 977 | const port = std.fmt.parseUnsigned(u16, k[sep + 1 ..], 10) catch 978 | continue; 979 | 980 | msg.proto1 = self.getTerm(); 981 | send(ip, port, buf, null) catch continue; 982 | 983 | if (msg.cmd != .ack) continue; 984 | 985 | log.debug("[{d}:{d}] received vote from {s}", .{ 986 | i, 987 | self.getTerm(), 988 | k, 989 | }); 990 | 991 | const majority = ((n[0] + n[1]) / 2) + 1; 992 | const votes = self.incVotesAndGet(); 993 | if (votes >= majority) { 994 | log.debug("[{d}:{d}] got {d} votes, majority={d}, n={d}", .{ 995 | i, 996 | self.getTerm(), 997 | votes, 998 | majority, 999 | n[0] + n[1], 1000 | }); 1001 | 1002 | self.setState(.leader); 1003 | to_leader = true; 1004 | break; 1005 | } 1006 | } 1007 | 1008 | if (!to_leader) { 1009 | if (self.candidate_tm.read() > self.getElexTimeoutMin()) { 1010 | log.debug("[{d}:{d}] lost the election, back to follower", .{ 1011 | i, 1012 | self.getTerm(), 1013 | }); 1014 | 1015 | std.time.sleep(random.intRangeAtMost( 1016 | u64, 1017 | self.getElexTimeoutMin(), 1018 | self.getElexTimeoutMax(), 1019 | )); 1020 | 1021 | self.setState(.follower); 1022 | self.elex_tm.reset(); 1023 | self.setVotes(0); 1024 | self.voted_for = self.refkeys.getKeyPtr("0").?.*; 1025 | } else std.time.sleep(random.intRangeAtMost( 1026 | u64, 1027 | self.getElexTimeoutMin(), 1028 | self.getElexTimeoutMax(), 1029 | )); 1030 | } 1031 | }, 1032 | .leader => { 1033 | var tm = try std.time.Timer.start(); 1034 | var items_len: usize = 0; 1035 | var fails: usize = 0; 1036 | var deferlog = false; 1037 | defer { 1038 | if (fails > 0) std.time.sleep(self.elex_delay); 1039 | if (deferlog) { 1040 | if (@mod(i, 40) == 0) { 1041 | log.debug("[{d}:{d}] leader: hb to {d} nodes, took {any}", .{ 1042 | i, 1043 | self.getTerm(), 1044 | items_len, 1045 | std.fmt.fmtDuration(tm.read() - self.elex_delay), 1046 | }); 1047 | } 1048 | } 1049 | } 1050 | 1051 | var bl = std.ArrayList([]const u8).init(arena); 1052 | defer bl.deinit(); 1053 | 1054 | { 1055 | self.members_mtx.lock(); 1056 | defer self.members_mtx.unlock(); 1057 | var iter = self.members.iterator(); 1058 | while (iter.next()) |v| { 1059 | if (v.value_ptr.liveness != .alive) continue; 1060 | if (self.keyIsMe(v.key_ptr.*)) continue; 1061 | try bl.append(v.key_ptr.*); 1062 | } 1063 | } 1064 | 1065 | if (bl.items.len == 0) { 1066 | std.time.sleep(random.intRangeAtMost( 1067 | u64, 1068 | self.getElexTimeoutMin(), 1069 | self.getElexTimeoutMax(), 1070 | )); 1071 | 1072 | continue; 1073 | } 1074 | 1075 | items_len = bl.items.len; // for later log (see defer) 1076 | var latencies = std.ArrayList(u64).init(self.allocator); 1077 | defer latencies.deinit(); 1078 | 1079 | var ltm = try std.time.Timer.start(); 1080 | 1081 | if (ldr_last_sweep) 1082 | msg.proto2 = (1 << 63) | ((lmin / 1000) << 31) | (lmax / 1000); 1083 | 1084 | for (bl.items) |k| { 1085 | deferlog = true; 1086 | msg.cmd = .heartbeat; 1087 | try self.setMsgSrcToOwn(msg); 1088 | const sep = std.mem.indexOf(u8, k, ":") orelse continue; 1089 | const ip = k[0..sep]; 1090 | const port = std.fmt.parseUnsigned(u16, k[sep + 1 ..], 10) catch 1091 | continue; 1092 | 1093 | msg.proto1 = self.getTerm(); 1094 | 
self.setTermAndN(msg); 1095 | 1096 | ltm.reset(); 1097 | send(ip, port, buf, null) catch |err| { 1098 | log.err("[{d}] hb:send failed: {any}", .{ i, err }); 1099 | fails += 1; 1100 | continue; 1101 | }; 1102 | 1103 | try latencies.append(ltm.read()); 1104 | } 1105 | 1106 | if (fails == 0) { 1107 | var total: u64 = 0; 1108 | for (latencies.items) |v| total += v; 1109 | const avg = total / latencies.items.len; 1110 | const avgf: f64 = @floatFromInt(avg); 1111 | const minf: f64 = @floatFromInt(self.getElexTimeoutMin()); 1112 | const nminf = avgf / 0.05; 1113 | if (nminf > minf) { 1114 | lmin = @intFromFloat(nminf); 1115 | lmax = lmin + std.time.ns_per_s; 1116 | } else { 1117 | lmin = min_og; 1118 | lmax = max_og; 1119 | } 1120 | } 1121 | 1122 | ldr_last_sweep = if (fails == 0) true else false; 1123 | std.time.sleep(self.elex_delay); 1124 | }, 1125 | } 1126 | } 1127 | } 1128 | 1129 | // Round-robin for one sweep, then randomize before doing another sweep. 1130 | // We are passing in an arena allocator here. 1131 | fn getPingTarget(self: *Self, allocator: std.mem.Allocator) !?[]const u8 { 1132 | while (true) { 1133 | const pop = self.ping_queue.popOrNull(); 1134 | if (pop) |v| return v; 1135 | 1136 | b: { 1137 | var tl = std.ArrayList([]const u8).init(allocator); 1138 | 1139 | { 1140 | self.members_mtx.lock(); 1141 | defer self.members_mtx.unlock(); 1142 | var iter = self.members.iterator(); 1143 | while (iter.next()) |v| { 1144 | if (v.value_ptr.liveness == .faulty) continue; 1145 | if (self.keyIsMe(v.key_ptr.*)) continue; 1146 | try tl.append(v.key_ptr.*); 1147 | } 1148 | } 1149 | 1150 | switch (tl.items.len) { 1151 | 0 => return null, // probably just us 1152 | 1 => { 1153 | try self.ping_queue.append(tl.items[0]); 1154 | break :b; 1155 | }, 1156 | else => {}, 1157 | } 1158 | 1159 | const seed = std.crypto.random.int(u64); 1160 | var prng = std.rand.DefaultPrng.init(seed); 1161 | const random = prng.random(); 1162 | while (true) { 1163 | switch (tl.items.len) { 1164 | 0 => break, 1165 | 1 => { 1166 | try self.ping_queue.append(tl.items[0]); 1167 | break; 1168 | }, 1169 | else => {}, 1170 | } 1171 | 1172 | const rv = random.uintAtMost(u64, tl.items.len - 1); 1173 | try self.ping_queue.append(tl.items[rv]); 1174 | _ = tl.swapRemove(rv); 1175 | } 1176 | } 1177 | } 1178 | 1179 | unreachable; 1180 | } 1181 | 1182 | // Caller is responsible for releasing the returned memory. 1183 | // We are passing in an arena allocator here. 
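// Returns at most `max` members chosen at random (without replacement),
// skipping ourselves, faulty members, and any key listed in `excludes`; the
// result may be shorter than `max`, or empty, if few members qualify.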
1184 | fn getRandomMember( 1185 | self: *Self, 1186 | allocator: std.mem.Allocator, 1187 | excludes: [][]const u8, 1188 | max: usize, 1189 | ) !std.ArrayList([]const u8) { 1190 | var hm = std.AutoHashMap(u64, []const u8).init(allocator); 1191 | defer hm.deinit(); // noop since arena 1192 | 1193 | { 1194 | self.members_mtx.lock(); 1195 | defer self.members_mtx.unlock(); 1196 | var iter = self.members.iterator(); 1197 | while (iter.next()) |v| { 1198 | if (v.value_ptr.liveness == .faulty) continue; 1199 | if (self.keyIsMe(v.key_ptr.*)) continue; 1200 | var eql: usize = 0; 1201 | for (excludes) |x| { 1202 | if (std.mem.eql(u8, x, v.key_ptr.*)) eql += 1; 1203 | } 1204 | 1205 | if (eql > 0) continue; 1206 | try hm.put(hm.count(), v.key_ptr.*); 1207 | } 1208 | } 1209 | 1210 | var out = std.ArrayList([]const u8).init(allocator); 1211 | 1212 | var limit = max; 1213 | if (limit > hm.count()) limit = hm.count(); 1214 | if (hm.count() == 1 and limit > 0) { 1215 | const get = hm.get(0); 1216 | if (get) |v| try out.append(v); 1217 | return out; 1218 | } 1219 | 1220 | const seed = std.crypto.random.int(u64); 1221 | var prng = std.rand.DefaultPrng.init(seed); 1222 | const random = prng.random(); 1223 | for (0..limit) |_| { 1224 | if (hm.count() == 0) break; 1225 | while (true) { 1226 | if (hm.count() == 0) break; 1227 | const rv = random.uintAtMost(u64, hm.count() - 1); 1228 | const fr = hm.fetchRemove(rv); 1229 | if (fr) |v| try out.append(v.value); 1230 | break; 1231 | } 1232 | } 1233 | 1234 | return out; 1235 | } 1236 | 1237 | // Setup the dst_* section of the payload. 1238 | // We are passing in an arena allocator here. 1239 | fn setMsgDst( 1240 | self: *Self, 1241 | allocator: std.mem.Allocator, 1242 | msg: *Message, 1243 | excludes: [][]const u8, 1244 | ) !void { 1245 | b: { 1246 | const dst = try self.getRandomMember(allocator, excludes, 1); 1247 | if (dst.items.len == 0) break :b; 1248 | msg.dst_cmd = .infect; 1249 | const ki = self.getKeyInfo(dst.items[0]); 1250 | if (ki) |_| {} else break :b; 1251 | try setMsgSection(msg, .dst, ki.?); 1252 | } 1253 | } 1254 | 1255 | // Ping a peer for liveness. Expected format for `key` is "ip:port", 1256 | // eg. "127.0.0.1:8080". For pings, we use the src_* payload fields 1257 | // to identify us, the sender. 1258 | fn ping(self: *Self, key: []const u8) !bool { 1259 | var aa = std.heap.ArenaAllocator.init(self.allocator); 1260 | defer aa.deinit(); // destroy arena in one go 1261 | const arena = aa.allocator(); 1262 | 1263 | const sep = std.mem.indexOf(u8, key, ":") orelse return false; 1264 | const ip = key[0..sep]; 1265 | const port = try std.fmt.parseUnsigned(u16, key[sep + 1 ..], 10); 1266 | if (std.mem.eql(u8, ip, self.ip) and port == self.port) return true; 1267 | 1268 | const buf = try arena.alloc(u8, @sizeOf(Message)); 1269 | const msg: *Message = @ptrCast(@alignCast(buf)); 1270 | try self.presetMessage(msg); 1271 | 1272 | msg.cmd = .ping; 1273 | try self.setMsgSrcToOwn(msg); 1274 | 1275 | // Use dst_* for ISD info. 1276 | var excludes: [1][]const u8 = .{key}; 1277 | try self.setMsgDst(arena, msg, &excludes); 1278 | 1279 | // Handle join address protocol (egress). 1280 | try self.setJoinProtoSend(msg); 1281 | 1282 | // Propagate number of members. 1283 | const n = self.getCounts(); 1284 | msg.proto2 = n[0] + n[1]; 1285 | 1286 | try send(ip, port, buf, null); 1287 | 1288 | // Handle join address protocol (ingress). 
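// Layout of proto1 for the join-address protocol (see setJoinProtoSend):
// bits 63..48 carry the JoinCmd, bits 47..32 the port, and bits 31..0 the
// IPv4 address as a u32. Any reply other than .invalidate refreshes the
// join-address timer below.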
1289 | const cmdm: JoinCmd = @enumFromInt((msg.proto1 & 1290 | 0xF000000000000000) >> 48); 1291 | 1292 | if (cmdm != .invalidate) _ = self.join_addr_tm.lap(); 1293 | 1294 | return switch (msg.cmd) { 1295 | .ack => b: { 1296 | try self.upsertMember(key, .alive, msg.src_incarnation, true); 1297 | 1298 | // Consume dst_* as piggybacked ISD info. 1299 | if (msg.dst_cmd == .infect) { 1300 | const k = try keyFromIpPort(arena, msg.dst_ip, msg.dst_port); 1301 | try self.upsertMember(k, msg.dst_state, msg.dst_incarnation, false); 1302 | } 1303 | 1304 | break :b true; 1305 | }, 1306 | else => false, 1307 | }; 1308 | } 1309 | 1310 | const RequestPing = struct { 1311 | self: *Self, 1312 | src: []const u8, // agent 1313 | dst: []const u8, // target 1314 | ack: bool = false, 1315 | }; 1316 | 1317 | // NOTE: Not used at the moment. 1318 | // Our only agent for doing indirect pings for suspicious nodes. Long-running. 1319 | fn requestPing(args: *RequestPing) !void { 1320 | while (true) { 1321 | args.self.ping_req_0.wait(); 1322 | defer { 1323 | args.self.ping_req_0.reset(); 1324 | args.self.ping_req_1.set(); 1325 | } 1326 | 1327 | log.debug("[thread] try pinging {s} via {s}", .{ args.dst, args.src }); 1328 | 1329 | var aa = std.heap.ArenaAllocator.init(args.self.allocator); 1330 | defer aa.deinit(); // destroy arena in one go 1331 | const arena = aa.allocator(); 1332 | 1333 | const sep = std.mem.indexOf(u8, args.src, ":") orelse return; 1334 | const ip = args.src[0..sep]; 1335 | const port = try std.fmt.parseUnsigned(u16, args.src[sep + 1 ..], 10); 1336 | 1337 | const buf = try arena.alloc(u8, @sizeOf(Message)); 1338 | const msg: *Message = @ptrCast(@alignCast(buf)); 1339 | try args.self.presetMessage(msg); 1340 | msg.cmd = .ping_req; 1341 | 1342 | // Set src_* to our info, the sender. 1343 | try args.self.setMsgSrcToOwn(msg); 1344 | 1345 | // The dst_* section is the target of our ping. 1346 | try setMsgSection(msg, .dst, .{ 1347 | .key = args.dst, 1348 | .liveness = .suspected, // will not be used 1349 | .incarnation = 0, // will not be used 1350 | }); 1351 | 1352 | // Handle ISD info. 1353 | const isd = try args.self.getIsdInfo(arena, 1); 1354 | if (isd.items.len > 0) { 1355 | msg.isd_cmd = .infect; 1356 | try setMsgSection(msg, .isd, isd.items[0]); 1357 | } 1358 | 1359 | // Handle join address protocol (egress). 1360 | try args.self.setJoinProtoSend(msg); 1361 | 1362 | args.self.send(ip, port, buf, null) catch continue; 1363 | 1364 | // Handle join address protocol (ingress). 1365 | args.self.setJoinProtoRecv(msg); 1366 | 1367 | switch (msg.cmd) { 1368 | .ack => { 1369 | try args.self.upsertMember( 1370 | args.src, 1371 | msg.src_state, 1372 | msg.src_incarnation, 1373 | true, 1374 | ); 1375 | 1376 | try args.self.upsertMember( 1377 | args.dst, 1378 | msg.dst_state, 1379 | msg.dst_incarnation, 1380 | true, 1381 | ); 1382 | 1383 | // Consume isd_* as the main ISD info. 1384 | switch (msg.isd_cmd) { 1385 | .infect, 1386 | .confirm_alive, 1387 | => try args.self.handleIsd(arena, msg, false), 1388 | .suspect => try args.self.handleSuspicion(arena, msg), 1389 | .confirm_faulty => try args.self.handleConfirmFaulty(arena, msg), 1390 | else => {}, 1391 | } 1392 | 1393 | const ptr = &args.ack; 1394 | ptr.* = true; 1395 | }, 1396 | .nack => try args.self.upsertMember( 1397 | args.src, 1398 | msg.src_state, 1399 | msg.src_incarnation, 1400 | false, 1401 | ), 1402 | else => {}, 1403 | } 1404 | } 1405 | } 1406 | 1407 | // Handle the isd_* infection protocol of the message payload. 
1408 | // We are passing in an arena allocator here. 1409 | fn handleIsd(self: *Self, allocator: std.mem.Allocator, msg: *Message, force: bool) !void { 1410 | const key = try keyFromIpPort(allocator, msg.isd_ip, msg.isd_port); 1411 | try self.setMemberInfo(key, msg.isd_state, msg.isd_incarnation, force); 1412 | } 1413 | 1414 | // Handle the isd_* suspicion protocol of the message payload. 1415 | // We are passing in an arena allocator here. 1416 | fn handleSuspicion(self: *Self, allocator: std.mem.Allocator, msg: *Message) !void { 1417 | const key = try keyFromIpPort(allocator, msg.isd_ip, msg.isd_port); 1418 | if (self.keyIsMe(key)) b: { 1419 | try self.IncrementIncarnation(); 1420 | const pkey = self.getPersistentKeyFromKey(key); 1421 | if (pkey) |_| {} else break :b; 1422 | return; 1423 | } 1424 | 1425 | var suspected = std.ArrayList(KeyInfo).init(allocator); 1426 | 1427 | { 1428 | self.members_mtx.lock(); 1429 | defer self.members_mtx.unlock(); 1430 | const ptr = self.members.getPtr(key); 1431 | if (ptr) |_| {} else return; 1432 | 1433 | try suspected.append(.{ 1434 | .key = key, 1435 | .liveness = .suspected, 1436 | .isd_cmd = .confirm_alive, 1437 | .incarnation = ptr.?.incarnation, 1438 | }); 1439 | } 1440 | 1441 | if (suspected.items.len == 0) return; 1442 | 1443 | const pkey = self.getPersistentKeyFromKey(key); 1444 | if (pkey) |_| {} else return; 1445 | } 1446 | 1447 | // Handle the isd_* faulty protocol of the message payload. 1448 | // We are passing in an arena allocator here. 1449 | fn handleConfirmFaulty(self: *Self, allocator: std.mem.Allocator, msg: *Message) !void { 1450 | const key = try keyFromIpPort(allocator, msg.isd_ip, msg.isd_port); 1451 | if (!self.keyIsMe(key)) { 1452 | try self.setMemberInfo(key, .faulty, null, true); 1453 | return; 1454 | } 1455 | 1456 | const pkey = self.getPersistentKeyFromKey(key); 1457 | if (pkey) |_| {} else return; 1458 | } 1459 | 1460 | // NOTE: Not using locks; only atomic. 1461 | fn getIncarnation(self: *Self) !u64 { 1462 | const me = try self.getOwnKey(); 1463 | defer self.allocator.free(me); 1464 | const ptr = self.members.getPtr(me); 1465 | if (ptr) |v| return @atomicLoad( 1466 | u64, 1467 | &v.incarnation, 1468 | std.builtin.AtomicOrder.seq_cst, 1469 | ); 1470 | 1471 | unreachable; 1472 | } 1473 | 1474 | // NOTE: Not using locks; only atomic. 1475 | fn IncrementIncarnation(self: *Self) !void { 1476 | const me = try self.getOwnKey(); 1477 | defer self.allocator.free(me); 1478 | const ptr = self.members.getPtr(me); 1479 | if (ptr) |_| {} else return; 1480 | _ = @atomicRmw( 1481 | u64, 1482 | &ptr.?.incarnation, 1483 | std.builtin.AtomicRmwOp.Add, 1484 | 1, 1485 | std.builtin.AtomicOrder.seq_cst, 1486 | ); 1487 | } 1488 | 1489 | // Caller must free the returned memory. 1490 | fn getOwnKey(self: *Self) ![]const u8 { 1491 | return try std.fmt.allocPrint(self.allocator, "{s}:{d}", .{ self.ip, self.port }); 1492 | } 1493 | 1494 | // Expected format for `key` is ip:port, eg. 0.0.0.0:8080. 1495 | fn keyIsMe(self: *Self, key: []const u8) bool { 1496 | const sep = std.mem.indexOf(u8, key, ":") orelse return false; 1497 | const ip = key[0..sep]; 1498 | const port = std.fmt.parseUnsigned(u16, key[sep + 1 ..], 10) catch return false; 1499 | return if (std.mem.eql(u8, ip, self.ip) and port == self.port) true else false; 1500 | } 1501 | 1502 | // Use the key from `members` when adding items (key) to the isd_queue. 
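// Returns the map-owned copy of `key` (stable for as long as the entry
// exists), or null if `key` is not currently in `members`.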
1503 | fn getPersistentKeyFromKey(self: *Self, key: []const u8) ?[]const u8 { 1504 | self.members_mtx.lock(); 1505 | defer self.members_mtx.unlock(); 1506 | const ptr = self.members.getKeyPtr(key); 1507 | if (ptr) |v| return v.*; 1508 | return null; 1509 | } 1510 | 1511 | // [0] = # of alive members 1512 | // [1] = # of suspected members 1513 | // [2] = # of faulty members 1514 | // [3] = total number of members 1515 | fn getCounts(self: *Self) std.meta.Tuple(&.{ usize, usize, usize, usize }) { 1516 | var n: [3]usize = .{ 0, 0, 0 }; 1517 | self.members_mtx.lock(); 1518 | defer self.members_mtx.unlock(); 1519 | var it = self.members.iterator(); 1520 | while (it.next()) |v| { 1521 | switch (v.value_ptr.liveness) { 1522 | .alive => n[0] += 1, 1523 | .suspected => n[1] += 1, 1524 | .faulty => n[2] += 1, 1525 | } 1526 | } 1527 | 1528 | return .{ 1529 | n[0], 1530 | n[1], 1531 | n[2], 1532 | self.members.count(), 1533 | }; 1534 | } 1535 | 1536 | fn getKeyInfo(self: *Self, key: []const u8) ?KeyInfo { 1537 | self.members_mtx.lock(); 1538 | defer self.members_mtx.unlock(); 1539 | const ptr = self.members.getPtr(key); 1540 | if (ptr) |_| {} else return null; 1541 | return .{ 1542 | .key = key, 1543 | .liveness = ptr.?.liveness, 1544 | .incarnation = ptr.?.incarnation, 1545 | }; 1546 | } 1547 | 1548 | // We always assume the node with the largest ip(int)+port to be leader. 1549 | // [0] - leader's (highest) ip in int format 1550 | // [1] - leader's (highest) port number 1551 | // [2] - true if we are the leader 1552 | fn getHighestNode(self: *Self) !std.meta.Tuple(&.{ u32, u64, bool }) { 1553 | var ipl: u32 = 0; 1554 | var portl: u16 = 0; 1555 | var me = false; 1556 | self.members_mtx.lock(); 1557 | defer self.members_mtx.unlock(); 1558 | var it = self.members.iterator(); 1559 | while (it.next()) |v| { 1560 | if (v.value_ptr.liveness == .faulty) continue; 1561 | const sep = std.mem.indexOf(u8, v.key_ptr.*, ":") orelse continue; 1562 | const ip = v.key_ptr.*[0..sep]; 1563 | const port = try std.fmt.parseUnsigned(u16, v.key_ptr.*[sep + 1 ..], 10); 1564 | const addr = try std.net.Address.resolveIp(ip, port); 1565 | if ((addr.in.sa.addr + port) > (ipl + portl)) { 1566 | ipl = addr.in.sa.addr; 1567 | portl = port; 1568 | me = std.mem.eql(u8, ip, self.ip) and port == self.port; 1569 | } 1570 | } 1571 | 1572 | return .{ ipl, portl, me }; 1573 | } 1574 | 1575 | fn setJoinProtoSend(self: *Self, msg: *Message) !void { 1576 | const n = self.getCounts(); 1577 | const lim = n[0] + n[1]; 1578 | if (lim < 2) return; 1579 | const al = try self.getHighestNode(); 1580 | const hb: u64 = @intFromEnum(JoinCmd.heartbeat); 1581 | const ipl: u32 = al[0] & 0x00000000FFFFFFFF; 1582 | const portl: u64 = (al[1] << 32) & 0x0000FFFF00000000; 1583 | msg.proto1 = (hb << 48) | ipl | portl; 1584 | } 1585 | 1586 | fn setJoinProtoRecv(self: *Self, msg: *Message) void { 1587 | const cmdm: JoinCmd = @enumFromInt((msg.proto1 & 1588 | 0xFFFF000000000000) >> 48); 1589 | if (cmdm != .invalidate) _ = self.join_addr_tm.lap(); 1590 | } 1591 | 1592 | fn setTermAndN(self: *Self, msg: *Message) void { 1593 | const n = self.getCounts(); 1594 | const total = n[0] + n[1]; 1595 | const term = @atomicLoad(u64, &self.elex_term, std.builtin.AtomicOrder.seq_cst); 1596 | const mterm: u64 = term & 0x0000FFFFFFFFFFFF; 1597 | const mcount: u64 = (total << 48) & 0xFFFF000000000000; 1598 | msg.proto1 = mcount | mterm; 1599 | } 1600 | 1601 | // [0] - term 1602 | // [1] - count 1603 | fn getTermAndN(_: *Self, msg: *Message) std.meta.Tuple(&.{ u64, u64 }) { 1604 | 
const term = msg.proto1 & 0x0000FFFFFFFFFFFF; 1605 | const count = (msg.proto1 & 0xFFFF000000000000) >> 48; 1606 | return .{ term, count }; 1607 | } 1608 | 1609 | // Set default values for the message. 1610 | fn presetMessage(self: *Self, msg: *Message) !void { 1611 | msg.name = std.mem.readVarInt(u64, self.name, .little); 1612 | msg.cmd = .noop; 1613 | msg.src_state = .alive; 1614 | msg.dst_cmd = .noop; 1615 | msg.dst_state = .alive; 1616 | msg.proto1 = 0; 1617 | msg.proto2 = 0; 1618 | } 1619 | 1620 | fn setMsgSrcToOwn(self: *Self, msg: *Message) !void { 1621 | const me = try self.getOwnKey(); 1622 | defer self.allocator.free(me); 1623 | try setMsgSection(msg, .src, .{ 1624 | .key = me, 1625 | .liveness = .alive, 1626 | .incarnation = try self.getIncarnation(), 1627 | }); 1628 | } 1629 | 1630 | // Add a new member or update an existing member's info. This function 1631 | // duplicates the key using self.allocator when adding a new member, 1632 | // not when updating an existing one. 1633 | fn upsertMember( 1634 | self: *Self, 1635 | key: []const u8, 1636 | state: ?Liveness, 1637 | incarnation: ?u64, 1638 | force: bool, 1639 | ) !void { 1640 | const contains = b: { 1641 | self.members_mtx.lock(); 1642 | defer self.members_mtx.unlock(); 1643 | break :b self.members.contains(key); 1644 | }; 1645 | 1646 | if (contains) { 1647 | try self.setMemberInfo(key, state, incarnation, force); 1648 | return; 1649 | } 1650 | 1651 | const nkey = try self.allocator.dupe(u8, key); 1652 | 1653 | // Our copy of all member keys being allocated; to free later. 1654 | if (!self.refkeys.contains(nkey)) try self.refkeys.put(nkey, {}); 1655 | 1656 | { 1657 | self.members_mtx.lock(); 1658 | defer self.members_mtx.unlock(); 1659 | try self.members.put(nkey, .{ 1660 | .age_suspected = try std.time.Timer.start(), 1661 | .age_faulty = try std.time.Timer.start(), 1662 | .targets = std.ArrayList([]const u8).init(self.allocator), 1663 | }); 1664 | } 1665 | 1666 | try self.setMemberInfo(key, state, incarnation, true); 1667 | } 1668 | 1669 | // `key` should be in fmt: "ip:port", e.g. "127.0.0.1:8080". We 1670 | // duplicate `key` to our internal list to be able to free later. 
1671 | fn ensureKeyRef(self: *Self, key: []const u8) ![]const u8 { 1672 | self.refkeys_mtx.lock(); 1673 | defer self.refkeys_mtx.unlock(); 1674 | if (self.refkeys.contains(key)) return self.refkeys.getKey(key).?; 1675 | const dup = try self.allocator.dupe(u8, key); 1676 | try self.refkeys.put(dup, {}); 1677 | return dup; 1678 | } 1679 | 1680 | // Reference: SWIM:4.2 1681 | // Order of preference: 1682 | // 1683 | // {Alive:M, inc=i} overrides 1684 | // - {Suspect:M, inc=j}, i>j 1685 | // - {Alive:M, inc=j}, i>j 1686 | // 1687 | // {Suspect:M, inc=i} overrides 1688 | // - {Suspect:M, inc=j}, i>j 1689 | // - {Alive:M, inc=j}, i>=j 1690 | // 1691 | // {Faulty:M, inc=i} overrides 1692 | // - {Alive:M, inc=j}, any j 1693 | // - {Suspect:M, inc=j}, any j 1694 | // 1695 | fn setMemberInfo( 1696 | self: *Self, 1697 | key: []const u8, 1698 | state: ?Liveness, 1699 | incarnation: ?u64, 1700 | force: bool, 1701 | ) !void { 1702 | self.members_mtx.lock(); 1703 | defer self.members_mtx.unlock(); 1704 | const p = self.members.getPtr(key); 1705 | if (p) |_| {} else return; 1706 | 1707 | var apply = false; 1708 | var in_state: Liveness = .alive; 1709 | var in_inc: u64 = p.?.incarnation; 1710 | if (state) |s| in_state = s else return; 1711 | if (incarnation) |inc| in_inc = inc; 1712 | 1713 | if (in_state == .alive) { 1714 | if (p.?.liveness == .suspected and in_inc > p.?.incarnation) apply = true; 1715 | if (p.?.liveness == .alive and in_inc > p.?.incarnation) apply = true; 1716 | } 1717 | 1718 | if (in_state == .suspected) { 1719 | if (p.?.liveness == .suspected and in_inc > p.?.incarnation) apply = true; 1720 | if (p.?.liveness == .alive and in_inc >= p.?.incarnation) apply = true; 1721 | } 1722 | 1723 | if (in_state == .faulty) apply = true; 1724 | if (force) apply = true; 1725 | 1726 | if (!apply) return; 1727 | 1728 | if (p.?.liveness == .faulty and in_state == .alive) p.?.incarnation = 0; 1729 | 1730 | p.?.liveness = in_state; 1731 | p.?.incarnation = in_inc; 1732 | 1733 | if (p.?.liveness == .suspected and in_state != .suspected) p.?.age_suspected.reset(); 1734 | if (p.?.liveness == .faulty and in_state != .faulty) p.?.age_faulty.reset(); 1735 | } 1736 | 1737 | // const SuspectToFaulty = struct { 1738 | // self: *Self, 1739 | // key: []const u8, 1740 | // }; 1741 | 1742 | // // To be run as a separate thread. Keep it suspected 1743 | // // for a while before marking it as faulty. 1744 | // fn suspectToFaulty(args: *SuspectToFaulty) !void { 1745 | // // Pause for a bit before we set to faulty. 1746 | // std.time.sleep(args.self.suspected_time); 1747 | // try args.self.setMemberInfo(args.key, .faulty, null, false); 1748 | 1749 | // // Broadcast confirm_faulty to the group. 1750 | // args.self.isd_mtx.lock(); 1751 | // defer args.self.isd_mtx.unlock(); 1752 | // try args.self.isd_queue.append(.{ 1753 | // .key = args.key, 1754 | // .state = .faulty, 1755 | // .isd_cmd = .confirm_faulty, 1756 | // .incarnation = try args.self.getIncarnation(), // ok since atomic 1757 | // }); 1758 | // } 1759 | 1760 | // Attempt removing faulty members after some time. 
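// Worked example of the override rules in setMemberInfo above: with a local
// entry {.alive, inc=3}, an incoming {.suspected, inc=3} is applied (suspect
// overrides alive at an equal incarnation), an incoming {.alive, inc=3} is
// ignored (it needs inc > 3), and an incoming {.faulty} or any update with
// force=true is always applied.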
1761 | fn removeFaultyMembers(self: *Self) !void { 1762 | var rml = std.ArrayList([]const u8).init(self.allocator); 1763 | defer rml.deinit(); 1764 | 1765 | { 1766 | self.members_mtx.lock(); 1767 | defer self.members_mtx.unlock(); 1768 | var it = self.members.iterator(); 1769 | const limit = self.proto_time; // TODO: expose 1770 | while (it.next()) |v| { 1771 | if (v.value_ptr.liveness != .faulty) continue; 1772 | if (v.value_ptr.age_faulty.read() > limit) { 1773 | try rml.append(v.key_ptr.*); 1774 | } 1775 | } 1776 | } 1777 | 1778 | for (rml.items) |v| self.removeMember(v); 1779 | } 1780 | 1781 | // We don't free the key itself here; we will free through self.ref_keys. 1782 | fn removeMember(self: *Self, key: []const u8) void { 1783 | self.members_mtx.lock(); 1784 | defer self.members_mtx.unlock(); 1785 | const fr = self.members.fetchRemove(key); 1786 | if (fr) |v| v.value.targets.deinit(); 1787 | } 1788 | 1789 | const MsgSection = enum { 1790 | src, 1791 | dst, 1792 | // isd, 1793 | }; 1794 | 1795 | // Set a section of the message payload with ip, port, and state info. 1796 | fn setMsgSection(msg: *Message, section: MsgSection, info: KeyInfo) !void { 1797 | const sep = std.mem.indexOf(u8, info.key, ":") orelse return; 1798 | const ip = info.key[0..sep]; 1799 | const port = try std.fmt.parseUnsigned(u16, info.key[sep + 1 ..], 10); 1800 | const addr = try std.net.Address.resolveIp(ip, port); 1801 | 1802 | switch (section) { 1803 | .src => { 1804 | msg.src_ip = addr.in.sa.addr; 1805 | msg.src_port = port; 1806 | msg.src_state = info.liveness; 1807 | msg.src_incarnation = info.incarnation; 1808 | }, 1809 | .dst => { 1810 | msg.dst_ip = addr.in.sa.addr; 1811 | msg.dst_port = port; 1812 | msg.dst_state = info.liveness; 1813 | msg.dst_incarnation = info.incarnation; 1814 | }, 1815 | } 1816 | } 1817 | 1818 | fn getState(self: *Self) ElectionState { 1819 | self.elex_mtx.lock(); 1820 | defer self.elex_mtx.unlock(); 1821 | return self.elex_state; 1822 | } 1823 | 1824 | fn setState(self: *Self, state: ElectionState) void { 1825 | self.elex_mtx.lock(); 1826 | defer self.elex_mtx.unlock(); 1827 | self.elex_state = state; 1828 | } 1829 | 1830 | // Best-effort basis only. `msg` should already contain the new join info 1831 | // in the dst_* portion, as well as it's source info. 
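// If no leader is known yet (self.leader still holds the "0" placeholder),
// this is a no-op; otherwise the message is sent to the leader's ip:port and
// any send error is returned to the caller.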
1832 | fn informLeaderOfJoin(self: *Self, msg: []u8) !void { 1833 | const leader = b: { 1834 | self.elex_mtx.lock(); 1835 | defer self.elex_mtx.unlock(); 1836 | break :b self.leader; 1837 | }; 1838 | 1839 | if (leader.len < 2) return; 1840 | 1841 | const sep = std.mem.indexOf(u8, leader, ":") orelse return; 1842 | const ip = leader[0..sep]; 1843 | const port = try std.fmt.parseUnsigned(u16, leader[sep + 1 ..], 10); 1844 | 1845 | try send(ip, port, msg, null); 1846 | } 1847 | 1848 | fn getTerm(self: *Self) u64 { 1849 | return @atomicLoad( 1850 | u64, 1851 | &self.elex_term, 1852 | std.builtin.AtomicOrder.seq_cst, 1853 | ); 1854 | } 1855 | 1856 | fn setTerm(self: *Self, term: u64) void { 1857 | @atomicStore( 1858 | u64, 1859 | &self.elex_term, 1860 | term, 1861 | std.builtin.AtomicOrder.seq_cst, 1862 | ); 1863 | } 1864 | 1865 | fn incTermAndGet(self: *Self) u64 { 1866 | _ = @atomicRmw( 1867 | u64, 1868 | &self.elex_term, 1869 | std.builtin.AtomicRmwOp.Add, 1870 | 1, 1871 | std.builtin.AtomicOrder.seq_cst, 1872 | ); 1873 | 1874 | return self.getTerm(); 1875 | } 1876 | 1877 | fn getVotes(self: *Self) u32 { 1878 | return @atomicLoad( 1879 | u32, 1880 | &self.votes, 1881 | std.builtin.AtomicOrder.seq_cst, 1882 | ); 1883 | } 1884 | 1885 | fn setVotes(self: *Self, vote: u32) void { 1886 | @atomicStore( 1887 | u32, 1888 | &self.votes, 1889 | vote, 1890 | std.builtin.AtomicOrder.seq_cst, 1891 | ); 1892 | } 1893 | 1894 | fn voteForSelf(self: *Self) u32 { 1895 | _ = @atomicRmw( 1896 | u32, 1897 | &self.votes, 1898 | std.builtin.AtomicRmwOp.Add, 1899 | 1, 1900 | std.builtin.AtomicOrder.seq_cst, 1901 | ); 1902 | 1903 | return self.getVotes(); 1904 | } 1905 | 1906 | fn incVotesAndGet(self: *Self) u32 { 1907 | return self.voteForSelf(); 1908 | } 1909 | 1910 | fn getElexTimeoutMin(self: *Self) u64 { 1911 | return @atomicLoad( 1912 | u64, 1913 | &self.elex_tm_min, 1914 | std.builtin.AtomicOrder.seq_cst, 1915 | ); 1916 | } 1917 | 1918 | fn getElexTimeoutMax(self: *Self) u64 { 1919 | return @atomicLoad( 1920 | u64, 1921 | &self.elex_tm_max, 1922 | std.builtin.AtomicOrder.seq_cst, 1923 | ); 1924 | } 1925 | }; 1926 | } 1927 | 1928 | // Helper function for internal one-shot send/recv. The same message ptr is 1929 | // used for both request and response payloads. If `tm_us` is not null, 1930 | // default timeout will be used. 1931 | fn send(ip: []const u8, port: u16, msg: []u8, tm_us: ?u32) !void { 1932 | const addr = try std.net.Address.resolveIp(ip, port); 1933 | const sock = try std.posix.socket( 1934 | std.posix.AF.INET, 1935 | std.posix.SOCK.DGRAM | std.posix.SOCK.CLOEXEC, 1936 | 0, 1937 | ); 1938 | 1939 | var tm: u32 = 1_000_000; 1940 | if (tm_us) |v| tm = v; 1941 | 1942 | defer std.posix.close(sock); 1943 | try setReadTimeout(sock, tm); 1944 | try setWriteTimeout(sock, tm); 1945 | try std.posix.connect(sock, &addr.any, addr.getOsSockLen()); 1946 | _ = try std.posix.write(sock, msg); 1947 | _ = try std.posix.recv(sock, msg, 0); 1948 | } 1949 | 1950 | /// Converts an ip and port to a string with format ip:port, eg. "127.0.0.1:8080". 1951 | /// Caller is responsible for releasing the returned memory. 
1952 | fn keyFromIpPort(allocator: std.mem.Allocator, ip: u32, port: u16) ![]const u8 { 1953 | const ipb = std.mem.asBytes(&ip); 1954 | return try std.fmt.allocPrint(allocator, "{d}.{d}.{d}.{d}:{d}", .{ 1955 | ipb[0], 1956 | ipb[1], 1957 | ipb[2], 1958 | ipb[3], 1959 | port, 1960 | }); 1961 | } 1962 | 1963 | test "keyFromIpPort" { 1964 | const out = try keyFromIpPort(std.testing.allocator, 16777343, 8080); 1965 | defer std.testing.allocator.free(out); 1966 | try std.testing.expect(std.mem.eql(u8, out, "127.0.0.1:8080")); 1967 | } 1968 | 1969 | /// Set socket read timeout in microseconds. Linux only. 1970 | pub fn setReadTimeout(socket: std.posix.socket_t, read: ?u32) !void { 1971 | std.debug.assert(read == null or read.? != 0); 1972 | const micros = read orelse 0; 1973 | const opt = std.posix.timeval{ 1974 | .tv_sec = @intCast(@divTrunc(micros, std.time.us_per_s)), 1975 | .tv_usec = @intCast(@mod(micros, std.time.us_per_s)), 1976 | }; 1977 | 1978 | try std.posix.setsockopt( 1979 | socket, 1980 | std.posix.SOL.SOCKET, 1981 | std.posix.SO.RCVTIMEO, 1982 | std.mem.toBytes(opt)[0..], 1983 | ); 1984 | } 1985 | 1986 | /// Set socket write timeout in microseconds. Linux only. 1987 | pub fn setWriteTimeout(socket: std.posix.socket_t, write: ?u32) !void { 1988 | std.debug.assert(write == null or write.? != 0); 1989 | const micros = write orelse 0; 1990 | const opt = std.posix.timeval{ 1991 | .tv_sec = @intCast(@divTrunc(micros, std.time.us_per_s)), 1992 | .tv_usec = @intCast(@mod(micros, std.time.us_per_s)), 1993 | }; 1994 | 1995 | try std.posix.setsockopt( 1996 | socket, 1997 | std.posix.SOL.SOCKET, 1998 | std.posix.SO.SNDTIMEO, 1999 | std.mem.toBytes(opt)[0..], 2000 | ); 2001 | } 2002 | --------------------------------------------------------------------------------