├── Dockerfile ├── LICENSE ├── Makefile ├── README ├── example ├── client.c ├── proto.h ├── run-example.py ├── server.c ├── timeout.c ├── timeout.h └── wtfexpect.py ├── include ├── raft.h └── util.h ├── src ├── raft.c └── util.c └── tests ├── blockade.yml ├── docker-entrypoint.sh ├── requirements.txt └── test.py /Dockerfile: -------------------------------------------------------------------------------- 1 | # vim:set ft=dockerfile: 2 | FROM debian:jessie 3 | 4 | RUN apt-get update && apt-get install -y \ 5 | git \ 6 | make \ 7 | gcc \ 8 | gdb \ 9 | python3 \ 10 | libjansson-dev \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | WORKDIR /raft 14 | 15 | ENV REBUILD 3 16 | 17 | #RUN git clone --depth 1 https://github.com/kvap/raft.git /raft 18 | ADD . /raft 19 | RUN cd /raft && make all 20 | 21 | EXPOSE 6000 22 | EXPOSE 6000/udp 23 | ENTRYPOINT ["/raft/tests/docker-entrypoint.sh"] 24 | #CMD ["bash"] 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-2016, Constantin S. Pan 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #override CC := clang 2 | override CFLAGS += -fpic -Wall -Wfatal-errors -O0 -g -pedantic -std=c99 -D_POSIX_C_SOURCE=200809L -D_BSD_SOURCE 3 | override CPPFLAGS += -I. -Iinclude #-DDEBUG 4 | override SERVER_LDFLAGS += -Llib -lraft -ljansson 5 | 6 | AR = ar 7 | ARFLAGS = -cru 8 | 9 | .PHONY: all clean bindir objdir libdir 10 | 11 | lib/libraft.a: obj/raft.o obj/util.o | libdir objdir 12 | $(AR) $(ARFLAGS) lib/libraft.a obj/raft.o obj/util.o 13 | 14 | all: lib/libraft.a bin/server bin/client 15 | @echo Done. 16 | 17 | bin/server: obj/server.o lib/libraft.a | bindir objdir 18 | $(CC) -o bin/server $(CFLAGS) $(CPPFLAGS) \ 19 | obj/server.o $(SERVER_LDFLAGS) 20 | 21 | bin/client: obj/client.o obj/timeout.o | bindir objdir 22 | $(CC) $(CFLAGS) $(CPPFLAGS) -o $@ $^ 23 | 24 | obj/%.o: src/%.c | objdir 25 | $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $< 26 | 27 | obj/%.o: example/%.c | objdir 28 | $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $< 29 | 30 | bindir: 31 | mkdir -p bin 32 | 33 | objdir: 34 | mkdir -p obj 35 | 36 | libdir: 37 | mkdir -p lib 38 | 39 | clean: 40 | rm -rfv bin obj lib 41 | 42 | check: 43 | find /tmp -name 'raft.*.log' -delete || true 44 | docker build -t raft . 
45 | cd tests && ./test.py 46 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | libraft 2 | ======= 3 | 4 | Raft protocol implementation in C. 5 | 6 | Features 7 | -------- 8 | 9 | + Leader Election 10 | + Log Replication 11 | + Log Compaction 12 | 13 | Usage 14 | ----- 15 | 16 | 1. Include raft.h and link with -lraft 17 | 18 | 2. Initialize: 19 | a. Populate a ``raft_config_t`` with parameters (host, port, timeouts, 20 | log length, etc.) 21 | 22 | b. Set the applier and snapshooter callbacks. The applier should 23 | perform state modification, and the snapshooter should dump the state. 24 | See raft.h for their signatures. 25 | 26 | c. Call raft_init(cfg). 27 | 28 | d. Use raft_peer_up(...) to configure the peers, including the current 29 | one. 30 | 31 | e. Create and bind the socket with raft_create_udp_socket(...). 32 | 33 | 3. Serve raft: 34 | a. Call raft_tick(...) frequently to perform the logic of timeouts. 35 | 36 | b. Try to extract a message with raft_recv_message(...) when you 37 | believe there is one in the socket. 38 | 39 | c. Call raft_handle_message(...) when a message has been successfully 40 | extracted from the socket. 41 | 42 | 4. Serve clients: 43 | a. If this server is a leader (call raft_get_leader(...) to find out), 44 | then it should accept pending connections and process client queries. 45 | 46 | b. Use raft_emit(...) to update the state through raft. Then wait until 47 | raft_applied(...) before returning the result to the client. 48 | 49 | c. You may use raft_get_leader(...) to redirect clients to the leader, 50 | though this is not necessary. 51 | 52 | Please read raft.h and example/ for more details. 53 | 54 | Testing 55 | ------- 56 | 57 | 1. Install docker and blockade. 58 | 2. You may want to gain superuser privileges at this point. 59 | 3. 
$ make check 60 | 61 | TODO 62 | ---- 63 | 64 | + Membership Changes 65 | -------------------------------------------------------------------------------- /example/client.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "timeout.h" 20 | #include "proto.h" 21 | 22 | #ifndef HOST_NAME_MAX 23 | #define HOST_NAME_MAX 64 24 | #endif 25 | #define MAX_SERVERS 64 26 | 27 | #undef shout 28 | #define shout(...) \ 29 | do { \ 30 | fprintf(stderr, "CLIENT: "); \ 31 | fprintf(stderr, __VA_ARGS__); \ 32 | fflush(stderr); \ 33 | } while (0) 34 | 35 | #define debug(...) 36 | 37 | typedef struct HostPort { 38 | bool up; 39 | char host[HOST_NAME_MAX + 1]; 40 | int port; 41 | } HostPort; 42 | 43 | static int leader = -1; 44 | static int leadersock = -1; 45 | int servernum; 46 | HostPort servers[MAX_SERVERS]; 47 | 48 | static void select_next_server(void) { 49 | int orig_leader = leader; 50 | for (int i = 0; i < MAX_SERVERS; i++) { 51 | int idx = (orig_leader + i + 1) % MAX_SERVERS; 52 | HostPort *hp = servers + idx; 53 | if (hp->up) { 54 | leader = idx; 55 | shout("switched from server %d to server %d\n", orig_leader, leader); 56 | return; 57 | } 58 | } 59 | shout("all servers are down\n"); 60 | } 61 | 62 | static bool poll_until_writable(int sock, timeout_t *timeout) { 63 | struct pollfd pfd = {sock, POLLOUT, 0}; 64 | int r = poll(&pfd, 1, timeout_remaining_ms(timeout)); 65 | if (r != 1) return false; 66 | return (pfd.revents & POLLOUT) != 0; 67 | } 68 | 69 | static bool poll_until_readable(int sock, timeout_t *timeout) { 70 | struct pollfd pfd = {sock, POLLIN, 0}; 71 | int remain = timeout_remaining_ms(timeout); 72 | int r = poll(&pfd, 1, remain); 73 | if (r != 1) return false; 74 | return (pfd.revents & POLLIN) 
!= 0; 75 | } 76 | 77 | static bool timed_write(int sock, void *data, size_t len, timeout_t *timeout) { 78 | int sent = 0; 79 | 80 | while (sent < len) { 81 | int newbytes; 82 | if (timeout_happened(timeout)) { 83 | debug("write timed out\n"); 84 | return false; 85 | } 86 | 87 | newbytes = write(sock, (char *)data + sent, len - sent); 88 | if (newbytes > 0) { 89 | sent += newbytes; 90 | } else if (newbytes == 0) { 91 | return false; 92 | } else { 93 | if ((errno == EAGAIN) || (errno == EWOULDBLOCK) || (errno == EINTR)) { 94 | if (!poll_until_writable(sock, timeout)) { 95 | return false; 96 | } 97 | } else { 98 | debug("failed to write: error %d: %s\n", errno, strerror(errno)); 99 | return false; 100 | } 101 | } 102 | } 103 | 104 | return true; 105 | } 106 | 107 | static bool timed_read(int sock, void *data, size_t len, timeout_t *timeout) { 108 | int recved = 0; 109 | 110 | while (recved < len) { 111 | int newbytes; 112 | if (timeout_happened(timeout)) { 113 | debug("read timed out\n"); 114 | return false; 115 | } 116 | 117 | newbytes = read(sock, (char *)data + recved, len - recved); 118 | if (newbytes > 0) { 119 | recved += newbytes; 120 | } else if (newbytes == 0) { 121 | return false; 122 | } else { 123 | if ((errno == EAGAIN) || (errno == EWOULDBLOCK) || (errno == EINTR)) { 124 | if (!poll_until_readable(sock, timeout)) { 125 | return false; 126 | } 127 | } else { 128 | debug("failed to read: error %d: %s\n", errno, strerror(errno)); 129 | return false; 130 | } 131 | } 132 | } 133 | 134 | return true; 135 | } 136 | 137 | static void wait_ms(int ms) { 138 | struct timespec ts = {ms / 1000, (ms % 1000) * 1000000}; 139 | struct timespec rem; 140 | while (nanosleep(&ts, &rem) == -1) { 141 | if (errno != EINTR) break; 142 | ts = rem; 143 | } 144 | } 145 | 146 | static void disconnect_leader(void) { 147 | if (leadersock >= 0) { 148 | close(leadersock); 149 | } 150 | leadersock = -1; 151 | wait_ms(100); 152 | } 153 | 154 | static bool connect_leader(timeout_t 
*timeout) { 155 | struct addrinfo *addrs = NULL; 156 | struct addrinfo hint; 157 | char portstr[6]; 158 | struct addrinfo *a; 159 | int rc; 160 | int sd; 161 | 162 | HostPort *leaderhp; 163 | 164 | shout("leadersock is %d\n", leadersock); 165 | if (leadersock < 0) select_next_server(); 166 | if (leader < 0) { 167 | shout("will NOT try server %d\n", leader); 168 | return false; 169 | } 170 | 171 | shout("will try server %d\n", leader); 172 | leaderhp = servers + leader; 173 | 174 | memset(&hint, 0, sizeof(hint)); 175 | hint.ai_socktype = SOCK_STREAM; 176 | hint.ai_family = AF_INET; 177 | snprintf(portstr, 6, "%d", leaderhp->port); 178 | hint.ai_protocol = getprotobyname("tcp")->p_proto; 179 | 180 | if ((rc = getaddrinfo(leaderhp->host, portstr, &hint, &addrs))) { 181 | shout( 182 | "failed to resolve address '%s:%d': %s\n", 183 | leaderhp->host, leaderhp->port, 184 | gai_strerror(rc) 185 | ); 186 | disconnect_leader(); 187 | return false; 188 | } 189 | 190 | shout("trying [%d] %s:%d\n", leader, leaderhp->host, leaderhp->port); 191 | for (a = addrs; a != NULL; a = a->ai_next) { 192 | int one = 1; 193 | 194 | sd = socket(a->ai_family, SOCK_STREAM, 0); 195 | if (sd == -1) { 196 | shout("failed to create a socket: %s\n", strerror(errno)); 197 | continue; 198 | } 199 | fcntl(sd, F_SETFL, O_NONBLOCK); 200 | setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)); 201 | 202 | if (connect(sd, a->ai_addr, a->ai_addrlen) == -1) { 203 | if (errno == EINPROGRESS) { 204 | TIMEOUT_LOOP_START(timeout); { 205 | if (poll_until_writable(sd, timeout)) { 206 | int err; 207 | socklen_t optlen = sizeof(err); 208 | getsockopt(sd, SOL_SOCKET, SO_ERROR, &err, &optlen); 209 | if (err == 0) goto success; 210 | } 211 | } TIMEOUT_LOOP_END(timeout); 212 | shout("connect timed out\n"); 213 | goto failure; 214 | } 215 | else 216 | { 217 | shout("failed to connect to an address: %s\n", strerror(errno)); 218 | close(sd); 219 | continue; 220 | } 221 | } 222 | 223 | goto success; 224 | } 225 | 
failure: 226 | freeaddrinfo(addrs); 227 | disconnect_leader(); 228 | shout("could not connect\n"); 229 | return false; 230 | success: 231 | freeaddrinfo(addrs); 232 | leadersock = sd; 233 | shout("connection to %d succeeded\n", leader); 234 | return true; 235 | } 236 | 237 | static int get_connection(timeout_t *timeout) { 238 | if (leadersock < 0) { 239 | if (connect_leader(timeout)) return leadersock; 240 | debug("update: connect_leader() failed\n"); 241 | } 242 | return leadersock; 243 | } 244 | 245 | static bool try_query(Message *msg, Message *answer, timeout_t *timeout) { 246 | int s = get_connection(timeout); 247 | if (s < 0) return false; 248 | 249 | if (timeout_happened(timeout)) { 250 | debug("try_query: get_connection() timed out\n"); 251 | return false; 252 | } 253 | 254 | if (!timed_write(s, msg, sizeof(Message), timeout)) { 255 | debug("try_query: failed to send the query to the leader\n"); 256 | return false; 257 | } 258 | 259 | if (!timed_read(s, answer, sizeof(Message), timeout)) { 260 | debug("try_query: failed to recv the answer from the leader\n"); 261 | return false; 262 | } 263 | 264 | return true; 265 | } 266 | 267 | static bool query(Message *msg, Message *answer, int timeout_ms) { 268 | timeout_t timeout; 269 | if (timeout_ms < 0) { 270 | while (true) { 271 | timeout_start(&timeout, 100); 272 | 273 | if (try_query(msg, answer, &timeout)) { 274 | return true; 275 | } else { 276 | disconnect_leader(); 277 | } 278 | } 279 | } else { 280 | timeout_start(&timeout, timeout_ms); 281 | 282 | TIMEOUT_LOOP_START(&timeout); { 283 | if (try_query(msg, answer, &timeout)) { 284 | return true; 285 | } else { 286 | disconnect_leader(); 287 | } 288 | } TIMEOUT_LOOP_END(&timeout); 289 | } 290 | 291 | shout("query failed after %d ms\n", timeout_elapsed_ms(&timeout)); 292 | return false; 293 | } 294 | 295 | static void msg_fill(Message *msg, char meaning, char *key, char *value) { 296 | msg->meaning = meaning; 297 | strncpy(msg->key.data, key, 
sizeof(msg->key)); 298 | strncpy(msg->value.data, value, sizeof(msg->value)); 299 | } 300 | 301 | static char *get(char *key, int timeout_ms) { 302 | Message msg, answer; 303 | 304 | msg_fill(&msg, MEAN_GET, key, ""); 305 | 306 | if (query(&msg, &answer, timeout_ms)) { 307 | if (answer.meaning == MEAN_OK) { 308 | return strndup(answer.value.data, sizeof(answer.value)); 309 | } else { 310 | assert(answer.meaning == MEAN_FAIL); 311 | } 312 | } 313 | return NULL; 314 | } 315 | 316 | static bool set(char *key, char *value, int timeout_ms) { 317 | Message msg, answer; 318 | 319 | msg_fill(&msg, MEAN_SET, key, value); 320 | 321 | if (query(&msg, &answer, timeout_ms)) { 322 | if (answer.meaning == MEAN_OK) { 323 | return true; 324 | } else { 325 | assert(answer.meaning == MEAN_FAIL); 326 | } 327 | } 328 | return false; 329 | } 330 | 331 | static void usage(char *prog) { 332 | printf( 333 | "Usage: %s -k KEY -r ID:HOST:PORT [-r ID:HOST:PORT ...]\n", 334 | prog 335 | ); 336 | } 337 | 338 | char *key; 339 | bool setup(int argc, char **argv) { 340 | servernum = 0; 341 | int opt; 342 | while ((opt = getopt(argc, argv, "hk:r:")) != -1) { 343 | int id; 344 | char *host; 345 | char *str; 346 | int port; 347 | 348 | switch (opt) { 349 | case 'k': 350 | key = optarg; 351 | break; 352 | case 'r': 353 | str = strtok(optarg, ":"); 354 | if (!str) return false; 355 | id = atoi(str); 356 | 357 | host = strtok(NULL, ":"); 358 | 359 | str = strtok(NULL, ":"); 360 | if (!str) return false; 361 | port = atoi(str); 362 | 363 | HostPort *hp = servers + id; 364 | if (hp->up) return false; 365 | hp->up = true; 366 | strncpy(hp->host, host, sizeof(hp->host)); 367 | hp->port = port; 368 | servernum++; 369 | break; 370 | case 'h': 371 | usage(argv[0]); 372 | exit(EXIT_SUCCESS); 373 | default: 374 | return false; 375 | } 376 | } 377 | return true; 378 | } 379 | 380 | int main(int argc, char **argv) { 381 | signal(SIGPIPE, SIG_IGN); 382 | 383 | if (!setup(argc, argv)) { 384 | usage(argv[0]); 385 | 
return EXIT_FAILURE; 386 | } 387 | 388 | srand(getpid()); 389 | 390 | while (true) { 391 | 392 | //wait_ms(1000); 393 | char value[20]; 394 | snprintf(value, sizeof(value), "%d", rand()); 395 | shout("set(%s, %s)\n", key, value); 396 | if (set(key, value, 1000)) { 397 | char *reply = get(key, 1000); 398 | if (reply) { 399 | shout("%s = %s\n", key, reply); 400 | free(reply); 401 | } 402 | } else { 403 | shout("set() failed\n"); 404 | } 405 | } 406 | 407 | return EXIT_SUCCESS; 408 | } 409 | -------------------------------------------------------------------------------- /example/proto.h: -------------------------------------------------------------------------------- 1 | #ifndef PROTO_H 2 | #define PROTO_H 3 | 4 | typedef struct Key { 5 | char data[64]; 6 | } Key; 7 | 8 | typedef struct Value { 9 | char data[64]; 10 | } Value; 11 | 12 | #define MEAN_FAIL '!' 13 | #define MEAN_OK '.' 14 | #define MEAN_GET '?' 15 | #define MEAN_SET '=' 16 | 17 | typedef struct Message { 18 | char meaning; 19 | Key key; 20 | Value value; 21 | } Message; 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /example/run-example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import time 4 | import wtfexpect 5 | 6 | GREEN = '\033[92m' 7 | RED = '\033[91m' 8 | NOCOLOR = '\033[0m' 9 | 10 | try: 11 | with wtfexpect.WtfExpect() as we: 12 | servers = 'alpha bravo conan'.split() 13 | clients = 'xenon yeast zebra'.split() 14 | serverids = {name: i for i, name in enumerate(servers)} 15 | 16 | cfg = [] 17 | baseport = 6000 18 | for i in range(len(servers)): 19 | cfg.append('-r') 20 | cfg.append("%d:%s:%d" % (i, "127.0.0.1", baseport + i)) 21 | 22 | for i, s in enumerate(servers): 23 | we.spawn(s, 'bin/server', 24 | '-i', str(i), 25 | *cfg, 26 | ) 27 | 28 | for c in clients: 29 | time.sleep(0.333) 30 | we.spawn(c, 'bin/client', '-k', c, *cfg) 31 | 32 | start = time.time() 33 | while 
we.alive(): 34 | if time.time() - start > 5 and we.alive('alpha'): 35 | we.kill('alpha') 36 | timeout = 0.5 37 | name, line = we.readline(timeout) 38 | if name is None: continue 39 | 40 | if name in servers: 41 | src = "%d(%s)" % (serverids[name], name) 42 | else: 43 | src = name 44 | 45 | if line is None: 46 | code = we.getcode(name) 47 | if code == 0: 48 | color = GREEN 49 | else: 50 | color = RED 51 | print("%s%s finished with code %d%s" % (color, src, code, NOCOLOR)) 52 | else: 53 | print("[%s] %s" % (src, line)) 54 | except KeyboardInterrupt: 55 | print("killed") 56 | -------------------------------------------------------------------------------- /example/server.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | 19 | #include "raft.h" 20 | #include "util.h" 21 | #include "proto.h" 22 | 23 | #undef shout 24 | #define shout(...) 
\ 25 | do { \ 26 | fprintf(stderr, "SERVER: "); \ 27 | fprintf(stderr, __VA_ARGS__); \ 28 | fflush(stderr); \ 29 | } while (0) 30 | 31 | #define LISTEN_QUEUE_SIZE 10 32 | #define MAX_CLIENTS 512 33 | 34 | /* Client state machine: 35 | * 36 | * ┏━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ 37 | * ──>┃ Dead ┃<──┨ Sick ┃ 38 | * ┗━━━━┯━━━━┛ ┗━━━━┯━━━━━━━━━━━━━┯━━━━━━━━━━━━━┯━━━━┛ 39 | * │conn ^ fail ^ fail ^ fail 40 | * └───────>┏━━━━┷━━━━┓fin┏━━━━┷━━━━┓fin┏━━━━┷━━━━┓fin 41 | * ┃ Sending ┠──>┃ Waiting ┠──>┃ Recving ┠─┐ 42 | * ┌─>┗━━━━━━━━━┛ ┗━━━━━━━━━┛ ┗━━━━━━━━━┛ │ 43 | * └──────────────────────────────────────────┘ 44 | */ 45 | 46 | typedef enum ClientState { 47 | CLIENT_SICK = -1, 48 | CLIENT_DEAD = 0, 49 | CLIENT_SENDING, 50 | CLIENT_WAITING, 51 | CLIENT_RECVING 52 | } ClientState; 53 | 54 | typedef struct Client { 55 | ClientState state; 56 | int socket; 57 | size_t cursor; 58 | Message msg; /* the message that is currently being sent or received */ 59 | int expect; 60 | } Client; 61 | 62 | typedef struct Server { 63 | char *host; 64 | int port; 65 | 66 | int listener; 67 | int raftsock; 68 | int id; 69 | 70 | int clientnum; 71 | Client clients[MAX_CLIENTS]; 72 | } Server; 73 | 74 | static bool continue_recv(int socket, void *dst, size_t len, size_t *done) { 75 | while (*done < len) { 76 | ssize_t recved = recv(socket, ((char *)dst) + *done, len - *done, MSG_DONTWAIT); 77 | if (recved == 0) return false; 78 | if (recved < 0) { 79 | switch (errno) { 80 | case EAGAIN: 81 | #if EAGAIN != EWOULDBLOCK 82 | case EWOULDBLOCK: 83 | #endif 84 | return true; /* try again later */ 85 | case EINTR: 86 | continue; /* try again now */ 87 | default: 88 | return false; 89 | } 90 | } 91 | *done += recved; 92 | assert(*done <= len); 93 | } 94 | return true; 95 | } 96 | 97 | static bool continue_send(int socket, void *src, size_t len, size_t *done) { 98 | while (*done < len) { 99 | ssize_t sent = send(socket, ((char *)src) + *done, len - *done, MSG_DONTWAIT); 100 | if (sent == 
0) return false; 101 | if (sent < 0) { 102 | switch (errno) { 103 | case EAGAIN: 104 | #if EAGAIN != EWOULDBLOCK 105 | case EWOULDBLOCK: 106 | #endif 107 | return true; /* try again later */ 108 | case EINTR: 109 | continue; /* try again now */ 110 | default: 111 | return false; 112 | } 113 | } 114 | *done += sent; 115 | assert(*done <= len); 116 | } 117 | return true; 118 | } 119 | 120 | void client_recv(Client *client) { 121 | assert(client->state == CLIENT_SENDING); 122 | 123 | if (client->cursor < sizeof(client->msg)) { 124 | if (!continue_recv(client->socket, &client->msg, sizeof(client->msg), &client->cursor)) { 125 | goto failure; 126 | } 127 | } 128 | 129 | return; 130 | failure: 131 | client->state = CLIENT_SICK; 132 | } 133 | 134 | void client_send(Client *client) { 135 | assert(client->state == CLIENT_RECVING); 136 | 137 | if (client->cursor < sizeof(client->msg)) { 138 | if (!continue_send(client->socket, &client->msg, sizeof(client->msg), &client->cursor)) { 139 | goto failure; 140 | } 141 | } 142 | 143 | return; 144 | failure: 145 | client->state = CLIENT_SICK; 146 | } 147 | 148 | static void applier(void *state, raft_update_t update, raft_bool_t snapshot) { 149 | json_error_t error; 150 | 151 | json_t *patch = json_loadb(update.data, update.len, 0, &error); 152 | if (!patch) { 153 | shout( 154 | "error parsing json at position %d: %s\n", 155 | error.column, 156 | error.text 157 | ); 158 | } 159 | 160 | if (snapshot) { 161 | json_object_clear(state); 162 | } 163 | 164 | if (json_object_update(state, patch)) { 165 | shout("error updating state\n"); 166 | } 167 | 168 | json_decref(patch); 169 | 170 | char *encoded = json_dumps(state, JSON_INDENT(4) | JSON_SORT_KEYS); 171 | if (encoded) { 172 | debug( 173 | "applied %s: the new state is %s\n", 174 | snapshot ? "a snapshot" : "an update", 175 | encoded 176 | ); 177 | } else { 178 | shout( 179 | "applied %s, but the new state could not be encoded\n", 180 | snapshot ? 
"a snapshot" : "an update" 181 | ); 182 | } 183 | free(encoded); 184 | } 185 | 186 | static raft_update_t snapshooter(void *state) { 187 | raft_update_t shot; 188 | shot.data = json_dumps(state, JSON_SORT_KEYS); 189 | shot.len = strlen(shot.data); 190 | if (shot.data) { 191 | debug("snapshot taken: %.*s\n", shot.len, shot.data); 192 | } else { 193 | shout("failed to take a snapshot\n"); 194 | } 195 | 196 | return shot; 197 | } 198 | 199 | static int stop = 0; 200 | static void die(int sig) 201 | { 202 | stop = 1; 203 | } 204 | 205 | static void usage(char *prog) { 206 | printf( 207 | "Usage: %s -i ID -r ID:HOST:PORT [-r ID:HOST:PORT ...] [-l LOGFILE]\n" 208 | " -l : Run as a daemon and write output to LOGFILE.\n", 209 | prog 210 | ); 211 | } 212 | 213 | json_t *state; 214 | Server server; 215 | raft_t raft; 216 | 217 | static bool add_client(int sock) { 218 | if (server.clientnum >= MAX_CLIENTS) { 219 | shout("client limit hit\n"); 220 | return false; 221 | } 222 | 223 | for (int i = 0; i < MAX_CLIENTS; i++) { 224 | Client *c = server.clients + i; 225 | if (c->state != CLIENT_DEAD) continue; 226 | 227 | c->socket = sock; 228 | c->state = CLIENT_SENDING; 229 | c->cursor = 0; 230 | c->expect = -1; 231 | server.clientnum++; 232 | return true; 233 | } 234 | 235 | assert(false); // should not happen 236 | return false; 237 | } 238 | 239 | static bool remove_client(Client *c) { 240 | assert(c->socket >= 0); 241 | c->state = CLIENT_DEAD; 242 | 243 | server.clientnum--; 244 | close(c->socket); 245 | return true; 246 | } 247 | 248 | /* Returns the created socket, or -1 if failed. 
*/ 249 | static int create_listening_socket(const char *host, int port) { 250 | int optval; 251 | struct addrinfo *addrs = NULL; 252 | struct addrinfo hint; 253 | struct addrinfo *a; 254 | char portstr[6]; 255 | int rc; 256 | 257 | memset(&hint, 0, sizeof(hint)); 258 | hint.ai_socktype = SOCK_STREAM; 259 | hint.ai_family = AF_INET; 260 | snprintf(portstr, 6, "%d", port); 261 | hint.ai_protocol = getprotobyname("tcp")->p_proto; 262 | 263 | if ((rc = getaddrinfo(host, portstr, &hint, &addrs))) { 264 | shout("failed to resolve address '%s:%d': %s\n", 265 | host, port, gai_strerror(rc)); 266 | return -1; 267 | } 268 | 269 | for (a = addrs; a != NULL; a = a->ai_next) { 270 | int s = socket(AF_INET, SOCK_STREAM, 0); 271 | if (s == -1) { 272 | shout("cannot create the listening socket: %s\n", strerror(errno)); 273 | continue; 274 | } 275 | 276 | optval = 1; 277 | setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (char const*)&optval, sizeof(optval)); 278 | setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char const*)&optval, sizeof(optval)); 279 | 280 | shout("binding tcp %s:%d\n", host, port); 281 | if (bind(s, a->ai_addr, a->ai_addrlen) < 0) { 282 | shout("cannot bind the listening socket: %s\n", strerror(errno)); 283 | close(s); 284 | continue; 285 | } 286 | 287 | if (listen(s, LISTEN_QUEUE_SIZE) == -1) 288 | { 289 | shout("failed to listen the socket: %s\n", strerror(errno)); 290 | close(s); 291 | continue; 292 | } 293 | return s; 294 | } 295 | shout("failed to find proper protocol\n"); 296 | return -1; 297 | } 298 | 299 | 300 | static bool start_server(void) { 301 | server.listener = -1; 302 | server.raftsock = -1; 303 | server.clientnum = 0; 304 | 305 | server.listener = create_listening_socket(server.host, server.port); 306 | if (server.listener == -1) { 307 | return false; 308 | } 309 | 310 | for (int i = 0; i < MAX_CLIENTS; i++) { 311 | server.clients[i].state = CLIENT_DEAD; 312 | } 313 | 314 | return true; 315 | } 316 | 317 | static bool accept_client(void) { 318 | debug("a new 
connection is queued\n"); 319 | 320 | int fd = accept(server.listener, NULL, NULL); 321 | if (fd == -1) { 322 | shout("failed to accept a connection: %s\n", strerror(errno)); 323 | return false; 324 | } 325 | debug("a new connection fd=%d accepted\n", fd); 326 | 327 | if (!raft_is_leader(raft)) { 328 | debug("not a leader, disconnecting the accepted connection fd=%d\n", fd); 329 | close(fd); 330 | return false; 331 | } 332 | 333 | return add_client(fd); 334 | } 335 | 336 | static void on_message_from(Client *c) { 337 | int index; 338 | 339 | assert(c->state == CLIENT_SENDING); 340 | assert(c->cursor == sizeof(c->msg)); 341 | assert(c->expect == -1); 342 | 343 | c->cursor = 0; 344 | 345 | if (c->msg.meaning == MEAN_SET) { 346 | char buf[sizeof(c->msg) + 10]; 347 | snprintf(buf, sizeof(buf), "{\"%s\": \"%s\"}", c->msg.key.data, c->msg.value.data); 348 | debug("emit update: %s\n", buf); 349 | raft_update_t update = {strlen(buf), buf, NULL}; 350 | index = raft_emit(raft, update); /* raft will copy the data */ 351 | if (index < 0) { 352 | shout("failed to emit a raft update\n"); 353 | c->state = CLIENT_SICK; 354 | } else { 355 | debug("client is waiting for %d\n", index); 356 | c->expect = index; 357 | c->state = CLIENT_WAITING; 358 | } 359 | } else if (c->msg.meaning == MEAN_GET) { 360 | json_t *jval = json_object_get(state, c->msg.key.data); 361 | if (jval == NULL) { 362 | c->msg.meaning = MEAN_FAIL; 363 | } else { 364 | c->msg.meaning = MEAN_OK; 365 | strncpy(c->msg.value.data, json_string_value(jval), sizeof(c->msg.value)); 366 | } 367 | c->state = CLIENT_RECVING; 368 | } else { 369 | shout("unknown meaning %d of the client's message\n", c->msg.meaning); 370 | c->state = CLIENT_SICK; 371 | } 372 | } 373 | 374 | static void on_message_to(Client *c) { 375 | assert(c->state == CLIENT_RECVING); 376 | assert(c->cursor == sizeof(c->msg)); 377 | c->cursor = 0; 378 | c->state = CLIENT_SENDING; 379 | } 380 | 381 | static void attend(Client *c) { 382 | assert(c->state != 
CLIENT_DEAD); 383 | assert(c->state != CLIENT_SICK); 384 | assert(c->state != CLIENT_WAITING); 385 | 386 | switch (c->state) { 387 | case CLIENT_SENDING: 388 | client_recv(c); 389 | if (c->state == CLIENT_SICK) return; 390 | if (c->cursor < sizeof(Message)) return; 391 | on_message_from(c); 392 | break; 393 | case CLIENT_RECVING: 394 | client_send(c); 395 | if (c->state == CLIENT_SICK) return; 396 | if (c->cursor < sizeof(Message)) return; 397 | on_message_to(c); 398 | break; 399 | default: 400 | assert(false); // should not happen 401 | } 402 | } 403 | 404 | static void notify(void) { 405 | for (int i = 0; i < MAX_CLIENTS; i++) { 406 | Client *c = server.clients + i; 407 | if (c->state != CLIENT_WAITING) continue; 408 | assert(c->expect >= 0); 409 | if (!raft_applied(raft, server.id, c->expect)) continue; 410 | 411 | c->msg.meaning = MEAN_OK; 412 | c->cursor = 0; 413 | c->state = CLIENT_RECVING; 414 | c->expect = -1; 415 | } 416 | } 417 | 418 | static void drop_bads(void) { 419 | for (int i = 0; i < MAX_CLIENTS; i++) { 420 | Client *c = server.clients + i; 421 | if (c->state == CLIENT_DEAD) continue; 422 | if ((c->state == CLIENT_SICK) || !raft_is_leader(raft)) { 423 | remove_client(c); 424 | } 425 | } 426 | } 427 | 428 | static void add_to_fdset(int fd, fd_set *fdset, int *maxfd) { 429 | assert(fd >= 0); 430 | FD_SET(fd, fdset); 431 | if (fd > *maxfd) *maxfd = fd; 432 | } 433 | 434 | static bool tick(int timeout_ms) { 435 | drop_bads(); 436 | 437 | bool raft_ready = false; 438 | 439 | fd_set readfds; 440 | fd_set writefds; 441 | 442 | FD_ZERO(&readfds); 443 | FD_ZERO(&writefds); 444 | 445 | int maxfd = 0; 446 | add_to_fdset(server.listener, &readfds, &maxfd); 447 | add_to_fdset(server.raftsock, &readfds, &maxfd); 448 | for (int i = 0; i < MAX_CLIENTS; i++) { 449 | Client *c = server.clients + i; 450 | switch (c->state) { 451 | case CLIENT_SENDING: 452 | add_to_fdset(c->socket, &readfds, &maxfd); 453 | break; 454 | case CLIENT_RECVING: 455 | 
add_to_fdset(c->socket, &writefds, &maxfd); 456 | break; 457 | default: 458 | continue; 459 | } 460 | } 461 | 462 | int numready = 0; 463 | mstimer_t timer; 464 | mstimer_reset(&timer); 465 | while (timeout_ms > 0) { 466 | struct timeval timeout = ms2tv(timeout_ms); 467 | numready = select(maxfd + 1, &readfds, &writefds, NULL, &timeout); 468 | timeout_ms -= mstimer_reset(&timer); 469 | if (numready >= 0) break; 470 | if (errno == EINTR) { 471 | continue; 472 | } 473 | shout("failed to select: %s\n", strerror(errno)); 474 | return false; 475 | } 476 | 477 | if (FD_ISSET(server.listener, &readfds)) { 478 | numready--; 479 | accept_client(); 480 | } 481 | 482 | if (FD_ISSET(server.raftsock, &readfds)) { 483 | numready--; 484 | raft_ready = true; 485 | } 486 | 487 | Client *c = server.clients; 488 | while (numready > 0) { 489 | assert(c - server.clients < MAX_CLIENTS); 490 | switch (c->state) { 491 | case CLIENT_SENDING: 492 | assert(c->socket >= 0); 493 | if (FD_ISSET(c->socket, &readfds)) { 494 | attend(c); 495 | numready--; 496 | } 497 | break; 498 | case CLIENT_RECVING: 499 | assert(c->socket >= 0); 500 | if (FD_ISSET(c->socket, &writefds)) { 501 | attend(c); 502 | numready--; 503 | } 504 | break; 505 | default: 506 | break; 507 | } 508 | c++; 509 | } 510 | 511 | return raft_ready; 512 | } 513 | 514 | int main(int argc, char **argv) { 515 | char *logfilename = NULL; 516 | bool daemonize = false; 517 | 518 | int id; 519 | char *host; 520 | char *str; 521 | int port; 522 | int opt; 523 | 524 | server.id = NOBODY; 525 | server.host = NULL; 526 | 527 | state = json_object(); 528 | 529 | raft_config_t rc; 530 | rc.peernum_max = 64; 531 | rc.heartbeat_ms = 50; 532 | rc.election_ms_min = 300; 533 | rc.election_ms_max = 600; 534 | rc.log_len = 10; 535 | rc.chunk_len = 400; 536 | rc.msg_len_max = 500; 537 | rc.userdata = state; 538 | rc.applier = applier; 539 | rc.snapshooter = snapshooter; 540 | raft = raft_init(&rc); 541 | 542 | int peernum = 0; 543 | while ((opt = 
getopt(argc, argv, "hi:r:l:")) != -1) { 544 | switch (opt) { 545 | case 'i': 546 | server.id = atoi(optarg); 547 | break; 548 | case 'r': 549 | if (server.id == NOBODY) { 550 | usage(argv[0]); 551 | return EXIT_FAILURE; 552 | } 553 | 554 | str = strtok(optarg, ":"); 555 | if (str) { 556 | id = atoi(str); 557 | } else { 558 | usage(argv[0]); 559 | return EXIT_FAILURE; 560 | } 561 | 562 | host = strtok(NULL, ":"); 563 | 564 | str = strtok(NULL, ":"); 565 | if (str) { 566 | port = atoi(str); 567 | } else { 568 | usage(argv[0]); 569 | return EXIT_FAILURE; 570 | } 571 | 572 | if (!raft_peer_up(raft, id, host, port, id == server.id)) { 573 | usage(argv[0]); 574 | return EXIT_FAILURE; 575 | } 576 | if (id == server.id) { 577 | server.host = host; 578 | server.port = port; 579 | } 580 | peernum++; 581 | break; 582 | case 'l': 583 | logfilename = optarg; 584 | daemonize = true; 585 | break; 586 | case 'h': 587 | usage(argv[0]); 588 | return EXIT_SUCCESS; 589 | default: 590 | usage(argv[0]); 591 | return EXIT_FAILURE; 592 | } 593 | } 594 | if (!server.host) { 595 | usage(argv[0]); 596 | return EXIT_FAILURE; 597 | } 598 | 599 | if (logfilename) { 600 | if (!freopen(logfilename, "a", stdout)) { 601 | // nowhere to report this failure 602 | return EXIT_FAILURE; 603 | } 604 | if (!freopen(logfilename, "a", stderr)) { 605 | // nowhere to report this failure 606 | return EXIT_FAILURE; 607 | } 608 | } 609 | 610 | if (daemonize) { 611 | if (daemon(true, true) == -1) { 612 | shout("could not daemonize: %s\n", strerror(errno)); 613 | return EXIT_FAILURE; 614 | } 615 | } 616 | 617 | if (!start_server()) { 618 | shout("couldn't start the server\n"); 619 | return EXIT_FAILURE; 620 | } 621 | 622 | signal(SIGTERM, die); 623 | signal(SIGQUIT, die); 624 | signal(SIGINT, die); 625 | sigset_t sset; 626 | sigfillset(&sset); 627 | sigprocmask(SIG_UNBLOCK, &sset, NULL); 628 | 629 | server.raftsock = raft_create_udp_socket(raft); 630 | if (server.raftsock == -1) { 631 | shout("couldn't start 
raft\n"); 632 | return EXIT_FAILURE; 633 | } 634 | 635 | mstimer_t t; 636 | mstimer_reset(&t); 637 | while (!stop) 638 | { 639 | raft_msg_t m = NULL; 640 | 641 | int ms = mstimer_reset(&t); 642 | raft_tick(raft, ms); 643 | 644 | if (tick(rc.heartbeat_ms)) 645 | { 646 | m = raft_recv_message(raft); 647 | assert(m != NULL); 648 | raft_handle_message(raft, m); 649 | notify(); 650 | } 651 | } 652 | 653 | return EXIT_SUCCESS; 654 | } 655 | -------------------------------------------------------------------------------- /example/timeout.c: -------------------------------------------------------------------------------- 1 | #include "timeout.h" 2 | 3 | #include 4 | 5 | void timeout_start(timeout_t *t, int msec) { 6 | t->msec_limit = msec; 7 | gettimeofday(&t->start, NULL); 8 | } 9 | 10 | static long msec(struct timeval *tv) { 11 | return tv->tv_sec * 1000 + tv->tv_usec / 1000; 12 | } 13 | 14 | bool timeout_nowait(timeout_t *t) { 15 | return t->msec_limit == 0; 16 | } 17 | 18 | bool timeout_indefinite(timeout_t *t) { 19 | return t->msec_limit < 0; 20 | } 21 | 22 | bool timeout_happened(timeout_t *t) { 23 | if (timeout_nowait(t)) return false; 24 | if (timeout_indefinite(t)) return false; 25 | 26 | return timeout_elapsed_ms(t) > t->msec_limit; 27 | } 28 | 29 | int timeout_elapsed_ms(timeout_t *t) { 30 | struct timeval now, diff; 31 | gettimeofday(&now, NULL); 32 | timersub(&now, &t->start, &diff); 33 | return msec(&diff); 34 | } 35 | 36 | int timeout_remaining_ms(timeout_t *t) { 37 | int remaining_ms; 38 | if (timeout_nowait(t)) return 0; 39 | if (timeout_indefinite(t)) return -1; 40 | 41 | remaining_ms = t->msec_limit - timeout_elapsed_ms(t); 42 | if (remaining_ms > 0) { 43 | return remaining_ms; 44 | } else { 45 | return 0; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /example/timeout.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMEOUT_H 2 | #define TIMEOUT_H 3 | 4 | 
import select
import subprocess
import time


class Proc():
    """A child process whose stdout and stderr are merged into one pipe.

    The object exposes fileno(), so it can be passed to select.select()
    directly.
    """

    def __init__(self, name, *argv):
        self.p = subprocess.Popen(
            argv, bufsize=0,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        self.name = name
        # Bytes received so far that do not yet form a complete line.
        self.accum = bytes()

    def fileno(self):
        """Return the descriptor of the child's output pipe."""
        return self.p.stdout.fileno()

    def readlines(self):
        """Read up to 1024 bytes and return the complete lines received.

        On end of output the pipe is closed and the returned list ends with
        None; an unterminated last line, if any, precedes that None.
        """
        newbytes = self.p.stdout.read(1024)
        if newbytes:
            self.accum += newbytes
            # Everything before the last newline is complete; the tail is
            # kept until the rest of the line arrives.
            *newlines, self.accum = self.accum.split(b'\n')
            return [raw.decode() for raw in newlines]

        self.p.stdout.close()
        if self.accum:
            return [self.accum.decode(), None]
        return [None]

    def eof(self):
        """Return True once the output pipe has been closed."""
        return self.p.stdout.closed

    def kill(self):
        """Kill the child process."""
        self.p.kill()

    def wait(self):
        """Wait for the child to exit and return its exit code."""
        return self.p.wait()


class WtfExpect():
    """Run several named child processes and read their output line by line.

    Lines are delivered as (name, line) tuples; a line of None means that
    the process called `name` has finished. Usable as a context manager,
    in which case all remaining processes are killed on exit.
    """

    def __init__(self):
        self.procs = {}     # name -> Proc, only the processes not closed yet
        self.retcodes = {}  # name -> exit code of a closed process
        self.lines = []     # (name, line) tuples read but not yet consumed

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.finish()
        return False

    def run(self, argv):
        """Run `argv` to completion; return (exit code, combined output)."""
        p = subprocess.run(
            argv,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        return p.returncode, p.stdout

    def spawn(self, name, *argv):
        """Start `argv` as a background process registered under `name`."""
        assert(name not in self.procs)
        self.procs[name] = Proc(name, *argv)
        return True

    def kill(self, name):
        """Kill the process registered under `name`."""
        assert(name in self.procs)
        self.procs[name].kill()

    def close(self, name):
        """Wait for process `name`, remember its exit code and forget it."""
        assert(name in self.procs)
        self.retcodes[name] = self.procs[name].wait()
        del self.procs[name]

    def readline(self, timeout=None):
        """Return the next (name, line) tuple.

        Returns (None, None) if nothing arrives within `timeout` seconds
        (None means wait indefinitely). A process that reaches the end of
        its output is killed and closed, and (name, None) is queued for it.
        """
        if self.lines:
            return self.lines.pop(0)

        active = [p for p in self.procs.values() if not p.eof()]
        ready, _, _ = select.select(active, [], [], timeout)
        for proc in ready:
            for line in proc.readlines():
                self.lines.append((proc.name, line))
                if line is None:
                    self.kill(proc.name)
                    self.close(proc.name)
        if self.lines:
            return self.lines.pop(0)

        return None, None

    def expect(self, patterns, timeout=None):
        """Wait until some process prints one of the lines expected from it.

        `patterns` maps a process name to a collection of expected lines.
        Returns (name, line) on a match and (name, None) when a process
        finishes first. Returns (None, None) when `timeout` seconds elapse,
        when a single read times out, or when no process is alive.
        """
        # A monotonic clock keeps the deadline immune to wall clock jumps.
        started = time.monotonic()
        while self.alive():
            if timeout is not None:
                elapsed = time.monotonic() - started
                if elapsed > timeout:
                    return None, None
                timeleft = timeout - elapsed
            else:
                timeleft = None

            name, line = self.readline(timeleft)
            if line is None:
                return name, None
            if name not in patterns:
                continue
            if line in patterns[name]:
                return name, line

        # Nothing left to read from. Return a tuple like every other path,
        # so that callers can always unpack the result.
        return None, None

    def capture(self, *names):
        """Collect the whole output of the processes called `names`.

        Blocks until each of them finishes. Returns a dict mapping every
        name to {'retcode': exit code, 'output': list of lines}. Output of
        other processes read in the meantime is discarded.
        """
        results = {}
        nameslist = list(names)
        for name in names:
            assert(name in self.procs)
            results[name] = {
                'retcode': None,
                'output': [],
            }
        while nameslist:
            if not self.lines and not self.procs:
                # Nothing queued and nobody left to produce output:
                # waiting any longer would block forever.
                break
            aname, line = self.readline()
            if aname not in nameslist:
                continue
            if line is None:
                results[aname]['retcode'] = self.getcode(aname)
                nameslist.remove(aname)
            else:
                results[aname]['output'].append(line)
        return results

    def getcode(self, name):
        """Return the exit code of the closed process `name` and forget it.

        Returns None if no exit code is stored under that name.
        """
        return self.retcodes.pop(name, None)

    def alive(self, name=None):
        """Return True if process `name` (or any process at all, when
        `name` is None) has not been closed yet."""
        if name is not None:
            return name in self.procs
        return len(self.procs) > 0

    def finish(self):
        """Kill all remaining processes and reset the state."""
        for proc in self.procs.values():
            proc.kill()
            # Reap the killed child so it does not linger as a zombie.
            proc.wait()
        self.procs = {}
        self.retcodes = {}
        self.lines = []
30 | */ 31 | typedef raft_update_t (*raft_snapshooter_t)(void *userdata); 32 | 33 | /* --- Configuration --- */ 34 | 35 | typedef struct raft_config_t { 36 | int peernum_max; 37 | 38 | int heartbeat_ms; 39 | int election_ms_min; 40 | int election_ms_max; 41 | 42 | int log_len; 43 | 44 | int chunk_len; 45 | int msg_len_max; 46 | 47 | void *userdata; /* this will get passed to applier() and snapshooter() */ 48 | raft_applier_t applier; 49 | raft_snapshooter_t snapshooter; 50 | } raft_config_t; 51 | 52 | /* 53 | * Initialize a raft instance. Returns NULL on failure. 54 | */ 55 | raft_t raft_init(raft_config_t *config); 56 | 57 | /* 58 | * Add a peer named 'id'. 'self' should be true, if that peer is this instance. 59 | * Only one peer should have 'self' == true. 60 | */ 61 | raft_bool_t raft_peer_up(raft_t r, int id, char *host, int port, raft_bool_t self); 62 | 63 | /* 64 | * Returns the number of entries applied by the current peer. 65 | */ 66 | int raft_progress(raft_t r); 67 | 68 | /* 69 | * Remove a previously added peer named 'id'. 70 | */ 71 | void raft_peer_down(raft_t r, int id); 72 | 73 | /* --- Log Actions --- */ 74 | 75 | /* 76 | * Emit an 'update'. Returns the log index if emitted successfully, or -1 77 | * otherwise. 78 | */ 79 | int raft_emit(raft_t r, raft_update_t update); 80 | 81 | /* 82 | * Checks whether an entry at 'index' has been applied by the peer named 'id'. 83 | */ 84 | raft_bool_t raft_applied(raft_t t, int id, int index); 85 | 86 | /* --- Control --- */ 87 | 88 | /* 89 | * Note that the UDP socket and raft messages are exposed to the user. This gives 90 | * the user the opportunity to incorporate the socket with other sockets in 91 | * select() or poll(). Thus, the messages will be processed as soon as they 92 | * come, not as soon as we call raft_tick(). 93 | */ 94 | 95 | /* 96 | * Perform various raft logic tied to time. Call this function once in a while 97 | * and pass the elapsed 'msec' from the previous call. 
This function will only 98 | * trigger time-related events, and will not receive and process messages (see 99 | * the note above). 100 | */ 101 | void raft_tick(raft_t r, int msec); 102 | 103 | /* 104 | * Receive a raft message. Returns NULL if no message available. 105 | */ 106 | raft_msg_t raft_recv_message(raft_t r); 107 | 108 | /* 109 | * Process the message. 110 | */ 111 | void raft_handle_message(raft_t r, raft_msg_t m); 112 | 113 | /* 114 | * Create the raft socket. 115 | */ 116 | int raft_create_udp_socket(raft_t r); 117 | 118 | /* 119 | * Returns true if this peer thinks it is the leader. 120 | */ 121 | raft_bool_t raft_is_leader(raft_t r); 122 | 123 | /* 124 | * Returns the id of the current leader, or NOBODY if no leader. 125 | */ 126 | int raft_get_leader(raft_t r); 127 | 128 | #endif 129 | -------------------------------------------------------------------------------- /include/util.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_H 2 | #define UTIL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | // Check if x in [min, max] 11 | int inrange(int min, int x, int max); 12 | 13 | static inline int min(int a, int b) { 14 | return a < b ? a : b; 15 | } 16 | 17 | static inline int max(int a, int b) { 18 | return a > b ? a : b; 19 | } 20 | 21 | // Return a random int in [min, max] 22 | static inline int rand_between(int min, int max) { 23 | return rand() % (max - min + 1) + min; 24 | } 25 | 26 | // ------ timing ------ 27 | 28 | // Millisecond timer 29 | typedef struct mstimer_t { 30 | struct timeval tv; 31 | } mstimer_t; 32 | 33 | // Restart the timer and return the accumulated milliseconds 34 | int mstimer_reset(mstimer_t *t); 35 | struct timeval ms2tv(int ms); 36 | 37 | // ------ logging ------ 38 | 39 | #ifdef DEBUG 40 | #define DEBUG_ENABLED 1 41 | #else 42 | #define DEBUG_ENABLED 0 43 | #endif 44 | 45 | #define debug(...) 
\ 46 | do { \ 47 | if (DEBUG_ENABLED) {\ 48 | fprintf(stderr, __VA_ARGS__); \ 49 | fflush(stderr); \ 50 | }\ 51 | } while (0) 52 | 53 | #define shout(...) \ 54 | do { \ 55 | fprintf(stderr, "RAFT: "); \ 56 | fprintf(stderr, __VA_ARGS__); \ 57 | fflush(stderr); \ 58 | } while (0) 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/raft.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "raft.h" 13 | #include "util.h" 14 | 15 | #define bool raft_bool_t 16 | #define true 1 17 | #define false 0 18 | 19 | #define DEFAULT_LISTENHOST "0.0.0.0" 20 | #define DEFAULT_LISTENPORT 6543 21 | 22 | typedef enum roles { 23 | FOLLOWER, 24 | CANDIDATE, 25 | LEADER 26 | } role_t; 27 | 28 | #define UDP_SAFE_SIZE 508 29 | 30 | // The raft log consists of these structures. 31 | typedef struct raft_entry_t { 32 | int term; 33 | bool snapshot; 34 | raft_update_t update; 35 | int bytes; 36 | } raft_entry_t; 37 | 38 | typedef struct raft_log_t { 39 | int first; 40 | int size; // number of entries past first 41 | int acked; // number of entries fully replicated to the majority of peers 42 | int applied; // number of entries applied to the state machine 43 | raft_entry_t *entries; // wraps around 44 | raft_entry_t newentry; // partially received entry 45 | } raft_log_t; 46 | 47 | // This structure is used to track how many entries each peer has replicated. 48 | typedef struct raft_progress_t { 49 | int entries; // number of entries fully sent/acked 50 | int bytes; // number of bytes of the currently being sent entry sent/acked 51 | } raft_progress_t; 52 | 53 | // Peer descriptor. Each raft peer has an array of raft_peer_t to track all 54 | // other peers. 
55 | typedef struct raft_peer_t { 56 | bool up; 57 | 58 | int seqno; // the rpc sequence number 59 | raft_progress_t acked; // the number of entries:bytes acked by this peer 60 | int applied; // the number of entries applied by this peer 61 | 62 | char *host; 63 | int port; 64 | struct sockaddr_in addr; 65 | 66 | int silent_ms; // how long was this peer silent 67 | } raft_peer_t; 68 | 69 | // Current peer's private state. 70 | typedef struct raft_data_t { 71 | int term; // current term (latest term we have seen) 72 | int vote; // who received our vote in current term 73 | role_t role; 74 | int me; // my id 75 | int votes; // how many votes are for me (if candidate) 76 | int leader; // the id of the leader 77 | raft_log_t log; 78 | 79 | int sock; 80 | 81 | int peernum; 82 | raft_peer_t *peers; 83 | 84 | int timer; 85 | 86 | raft_config_t config; 87 | } raft_data_t; 88 | 89 | // Convenience macros for log. 90 | #define RAFT_LOG(RAFT, INDEX) ((RAFT)->log.entries[(INDEX) % (RAFT)->config.log_len]) 91 | #define RAFT_LOG_FIRST_INDEX(RAFT) ((RAFT)->log.first) 92 | #define RAFT_LOG_LAST_INDEX(RAFT) ((RAFT)->log.first + (RAFT)->log.size - 1) 93 | #define RAFT_LOG_FULL(RAFT) ((RAFT)->log.size == (RAFT)->config.log_len) 94 | 95 | #define RAFT_MSG_UPDATE 0 // append entry 96 | #define RAFT_MSG_DONE 1 // entry appended 97 | #define RAFT_MSG_CLAIM 2 // vote for me 98 | #define RAFT_MSG_VOTE 3 // my vote 99 | 100 | /* 101 | * Raft message "class" hierarchy: 102 | * 103 | * raft_msg_data_t <-- raft_msg_update_t 104 | * <-- raft_msg_done_t 105 | * <-- raft_msg_claim_t 106 | * <-- raft_msg_vote_t 107 | * 108 | * 'update' is sent by a leader to all other peers 109 | * 'done' is sent in reply to 'update' 110 | * 'claim' is sent by a candidate to all other peers 111 | * 'vote' is sent in reply to 'claim' 112 | */ 113 | 114 | typedef struct raft_msg_data_t { 115 | int msgtype; 116 | int curterm; 117 | int from; 118 | int seqno; 119 | } raft_msg_data_t; 120 | 121 | typedef struct 
raft_msg_update_t { 122 | raft_msg_data_t msg; 123 | bool snapshot; // true if this message contains a snapshot 124 | int previndex; // the index of the preceding log entry 125 | int prevterm; // the term of the preceding log entry 126 | 127 | bool empty; // the message is just a heartbeat if empty 128 | 129 | int entryterm; 130 | int totallen; // the length of the whole update 131 | 132 | int acked; // the leader's acked number 133 | 134 | int offset; // the offset of this chunk inside the whole update 135 | int len; // the length of the chunk 136 | char data[1]; 137 | } raft_msg_update_t; 138 | 139 | typedef struct raft_msg_done_t { 140 | raft_msg_data_t msg; 141 | int entryterm; // the term of the appended entry 142 | raft_progress_t progress; // the progress after appending 143 | int applied; 144 | bool success; 145 | // the message is considered acked when the last chunk appends successfully 146 | } raft_msg_done_t; 147 | 148 | typedef struct raft_msg_claim_t { 149 | raft_msg_data_t msg; 150 | int index; // the index of my last completely received entry 151 | int lastterm; // the term of my last entry 152 | } raft_msg_claim_t; 153 | 154 | typedef struct raft_msg_vote_t { 155 | raft_msg_data_t msg; 156 | bool granted; 157 | } raft_msg_vote_t; 158 | 159 | typedef union { 160 | raft_msg_update_t u; 161 | raft_msg_done_t d; 162 | raft_msg_claim_t c; 163 | raft_msg_vote_t v; 164 | } raft_msg_any_t; 165 | 166 | // Return true if the given config is sane. 
167 | static bool raft_config_is_ok(raft_config_t *config) { 168 | bool ok = true; 169 | 170 | if (config->peernum_max < 3) { 171 | shout("please ensure peernum_max >= 3\n"); 172 | ok = false; 173 | } 174 | 175 | if (config->heartbeat_ms >= config->election_ms_min) { 176 | shout("please ensure heartbeat_ms < election_ms_min (substantially)\n"); 177 | ok = false; 178 | } 179 | 180 | if (config->election_ms_min >= config->election_ms_max) { 181 | shout("please ensure election_ms_min < election_ms_max\n"); 182 | ok = false; 183 | } 184 | 185 | if (sizeof(raft_msg_update_t) + config->chunk_len - 1 > UDP_SAFE_SIZE) { 186 | shout( 187 | "please ensure chunk_len <= %lu, %d is too much for UDP\n", 188 | UDP_SAFE_SIZE - sizeof(raft_msg_update_t) + 1, 189 | config->chunk_len 190 | ); 191 | ok = false; 192 | } 193 | 194 | if (config->msg_len_max < sizeof(raft_msg_any_t)) { 195 | shout("please ensure msg_len_max >= %lu\n", sizeof(raft_msg_any_t)); 196 | ok = false; 197 | } 198 | 199 | return ok; 200 | } 201 | 202 | // === Constructors === 203 | 204 | static void reset_progress(raft_progress_t *p) { 205 | p->entries = 0; 206 | p->bytes = 0; 207 | } 208 | 209 | static void raft_peer_init(raft_peer_t *p) { 210 | p->up = false; 211 | p->seqno = 0; 212 | reset_progress(&p->acked); 213 | p->applied = 0; 214 | 215 | p->host = DEFAULT_LISTENHOST; 216 | p->port = DEFAULT_LISTENPORT; 217 | p->silent_ms = 0; 218 | } 219 | 220 | static void raft_entry_init(raft_entry_t *e) { 221 | e->term = 0; 222 | e->snapshot = false; 223 | e->update.len = 0; 224 | e->update.data = NULL; 225 | e->update.userdata = NULL; 226 | e->bytes = 0; 227 | } 228 | 229 | static bool raft_log_init(raft_t raft) { 230 | raft_log_t *l = &raft->log; 231 | int i; 232 | l->first = 0; 233 | l->size = 0; 234 | l->acked = 0; 235 | l->applied = 0; 236 | l->entries = malloc(raft->config.log_len * sizeof(raft_entry_t)); 237 | if (!l->entries) { 238 | shout("failed to allocate memory for raft log\n"); 239 | return false; 240 | } 
241 | for (i = 0; i < raft->config.log_len; i++) { 242 | raft_entry_init(l->entries + i); 243 | } 244 | raft_entry_init(&l->newentry); 245 | return true; 246 | } 247 | 248 | static bool raft_peers_init(raft_t raft) { 249 | int i; 250 | raft->peers = malloc(raft->config.peernum_max * sizeof(raft_peer_t)); 251 | if (!raft->peers) { 252 | shout("failed to allocate memory for raft peers\n"); 253 | return false; 254 | } 255 | for (i = 0; i < raft->config.peernum_max; i++) { 256 | raft_peer_init(raft->peers + i); 257 | } 258 | return true; 259 | } 260 | 261 | // Initialize a raft instance. Returns NULL on failure. 262 | raft_t raft_init(raft_config_t *config) { 263 | raft_t raft = NULL; 264 | 265 | if (!raft_config_is_ok(config)) { 266 | goto cleanup; 267 | } 268 | 269 | raft = malloc(sizeof(raft_data_t)); 270 | if (!raft) { 271 | shout("failed to allocate memory for raft instance\n"); 272 | goto cleanup; 273 | } 274 | raft->log.entries = NULL; 275 | raft->peers = NULL; 276 | 277 | memcpy(&raft->config, config, sizeof(raft_config_t)); 278 | raft->sock = -1; 279 | raft->term = 0; 280 | raft->vote = NOBODY; 281 | raft->role = FOLLOWER; 282 | raft->votes = 0; 283 | raft->me = NOBODY; 284 | raft->leader = NOBODY; 285 | raft->peernum = 0; 286 | 287 | if (!raft_log_init(raft)) goto cleanup; 288 | if (!raft_peers_init(raft)) goto cleanup; 289 | 290 | return raft; 291 | 292 | cleanup: 293 | if (raft) { 294 | free(raft->peers); 295 | free(raft->log.entries); 296 | free(raft); 297 | } 298 | return NULL; 299 | } 300 | 301 | // --- 302 | 303 | // Reset heartbeat or election timer depending on current role of the peer. 304 | static void raft_reset_timer(raft_t r) { 305 | if (r->role == LEADER) { 306 | r->timer = r->config.heartbeat_ms; 307 | } else { 308 | r->timer = rand_between( 309 | r->config.election_ms_min, 310 | r->config.election_ms_max 311 | ); 312 | } 313 | } 314 | 315 | // Add a peer named 'id'. 'self' should be true, if that peer is this instance. 
316 | // Only one peer should have 'self' == true. 317 | bool raft_peer_up(raft_t r, int id, char *host, int port, bool self) { 318 | raft_peer_t *p = r->peers + id; 319 | struct addrinfo hint; 320 | struct addrinfo *a = NULL; 321 | char portstr[6]; 322 | 323 | if (r->peernum >= r->config.peernum_max) { 324 | shout("too many peers\n"); 325 | return false; 326 | } 327 | 328 | raft_peer_init(p); 329 | p->up = true; 330 | p->host = host; 331 | p->port = port; 332 | 333 | memset(&hint, 0, sizeof(hint)); 334 | hint.ai_socktype = SOCK_DGRAM; 335 | hint.ai_family = AF_INET; 336 | hint.ai_protocol = getprotobyname("udp")->p_proto; 337 | 338 | snprintf(portstr, 6, "%d", port); 339 | 340 | if (getaddrinfo(host, portstr, &hint, &a)) 341 | { 342 | shout( 343 | "cannot convert the host string '%s'" 344 | " to a valid address: %s\n", host, gai_strerror(errno)); 345 | return false; 346 | } 347 | 348 | assert(a != NULL && a->ai_addrlen <= sizeof(p->addr)); 349 | memcpy(&p->addr, a->ai_addr, a->ai_addrlen); 350 | 351 | if (self) { 352 | if (r->me != NOBODY) { 353 | shout("cannot set 'self' peer multiple times\n"); 354 | return false; 355 | } 356 | r->me = id; 357 | srand(id); 358 | raft_reset_timer(r); 359 | } 360 | r->peernum++; 361 | return true; 362 | } 363 | 364 | 365 | // Remove a previously added peer named 'id'. 366 | void raft_peer_down(raft_t r, int id) { 367 | raft_peer_t *p = r->peers + id; 368 | 369 | p->up = false; 370 | if (r->me == id) { 371 | r->me = NOBODY; 372 | } 373 | 374 | r->peernum--; 375 | } 376 | 377 | // Apply all unapplied entries that have been replicated on a majority of peers. 378 | // Return the number of entries applied by this call. 
379 | static int raft_apply(raft_t raft) { 380 | int applied_now = 0; 381 | raft_log_t *l = &raft->log; 382 | while ((l->applied < l->acked) && (l->applied <= RAFT_LOG_LAST_INDEX(raft))) { 383 | raft_entry_t *e = &RAFT_LOG(raft, l->applied); 384 | assert(e->update.len == e->bytes); 385 | raft->config.applier(raft->config.userdata, e->update, false); 386 | raft->log.applied++; 387 | applied_now++; 388 | } 389 | return applied_now; 390 | } 391 | 392 | // === Convenience functions for socket tuning === 393 | 394 | static void socket_set_recv_timeout(int sock, int ms) { 395 | struct timeval tv; 396 | tv.tv_sec = ms / 1000; 397 | tv.tv_usec = ((ms % 1000) * 1000); 398 | if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1) { 399 | shout("failed to set socket recv timeout: %s\n", strerror(errno)); 400 | } 401 | } 402 | 403 | static void socket_set_reuseaddr(int sock) { 404 | int optval = 1; 405 | if (setsockopt( 406 | sock, SOL_SOCKET, SO_REUSEADDR, 407 | (char const*)&optval, sizeof(optval) 408 | ) == -1) { 409 | shout("failed to set socket to reuseaddr: %s\n", strerror(errno)); 410 | } 411 | } 412 | 413 | // --- 414 | 415 | int raft_create_udp_socket(raft_t r) { 416 | assert(r->me != NOBODY); 417 | raft_peer_t *me = r->peers + r->me; 418 | struct addrinfo hint; 419 | struct addrinfo *addrs = NULL; 420 | struct addrinfo *a; 421 | char portstr[6]; 422 | int rc; 423 | memset(&hint, 0, sizeof(hint)); 424 | hint.ai_socktype = SOCK_DGRAM; 425 | hint.ai_family = AF_INET; 426 | hint.ai_protocol = getprotobyname("udp")->p_proto; 427 | 428 | snprintf(portstr, 6, "%d", me->port); 429 | 430 | if ((rc = getaddrinfo(me->host, portstr, &hint, &addrs)) != 0) 431 | { 432 | shout( 433 | "cannot convert the host string '%s'" 434 | " to a valid address: %s\n", me->host, gai_strerror(rc)); 435 | return -1; 436 | } 437 | 438 | for (a = addrs; a != NULL; a = a->ai_next) 439 | { 440 | int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 441 | if (sock < 0) { 442 | 
shout("cannot create socket: %s\n", strerror(errno)); 443 | continue; 444 | } 445 | socket_set_reuseaddr(sock); 446 | socket_set_recv_timeout(sock, r->config.heartbeat_ms); 447 | 448 | debug("binding udp %s:%d\n", me->host, me->port); 449 | if (bind(sock, a->ai_addr, a->ai_addrlen) < 0) { 450 | shout("cannot bind the socket: %s\n", strerror(errno)); 451 | close(sock); 452 | continue; 453 | } 454 | r->sock = sock; 455 | assert(a->ai_addrlen <= sizeof(me->addr)); 456 | memcpy(&me->addr, a->ai_addr, a->ai_addrlen); 457 | return sock; 458 | } 459 | 460 | shout("cannot resolve the host string '%s' to a valid address\n", 461 | me->host 462 | ); 463 | return -1; 464 | } 465 | 466 | // Check that the given raft message 'm' should be 'mlen' bytes long according 467 | // to its type. 468 | static bool msg_size_is(raft_msg_t m, int mlen) { 469 | switch (m->msgtype) { 470 | case RAFT_MSG_UPDATE: 471 | return mlen == sizeof(raft_msg_update_t) + ((raft_msg_update_t *)m)->len - 1; 472 | case RAFT_MSG_DONE: 473 | return mlen == sizeof(raft_msg_done_t); 474 | case RAFT_MSG_CLAIM: 475 | return mlen == sizeof(raft_msg_claim_t); 476 | case RAFT_MSG_VOTE: 477 | return mlen == sizeof(raft_msg_vote_t); 478 | } 479 | return false; 480 | } 481 | 482 | // Send message 'm' of length 'mlen' to peer 'dst'. 
483 | static void raft_send(raft_t r, int dst, void *m, int mlen) { 484 | assert(r->peers[dst].up); 485 | assert(mlen <= r->config.msg_len_max); 486 | assert(msg_size_is((raft_msg_t)m, mlen)); 487 | assert(((raft_msg_t)m)->msgtype >= 0); 488 | assert(((raft_msg_t)m)->msgtype < 4); 489 | assert(dst >= 0); 490 | assert(dst < r->config.peernum_max); 491 | assert(dst != r->me); 492 | assert(((raft_msg_t)m)->from == r->me); 493 | 494 | raft_peer_t *peer = r->peers + dst; 495 | 496 | int sent = sendto( 497 | r->sock, m, mlen, 0, 498 | (struct sockaddr*)&peer->addr, sizeof(peer->addr) 499 | ); 500 | if (sent == -1) { 501 | shout( 502 | "failed to send a msg to [%d]: %s\n", 503 | dst, strerror(errno) 504 | ); 505 | } 506 | } 507 | 508 | // A heartbeat by the leader. Sends pending entries to each follower, or 509 | // heartbeats to those followers which are up to date. 510 | // 'dst' specifies the destination peer, or NOBODY if a broadcast is needed. 511 | static void raft_beat(raft_t r, int dst) { 512 | if (dst == NOBODY) { 513 | // send a beat/update to everybody 514 | int i; 515 | for (i = 0; i < r->config.peernum_max; i++) { 516 | if (!r->peers[i].up) continue; 517 | if (i == r->me) continue; 518 | raft_beat(r, i); 519 | } 520 | return; 521 | } 522 | 523 | assert(r->role == LEADER); 524 | assert(r->leader == r->me); 525 | 526 | raft_peer_t *p = r->peers + dst; 527 | 528 | raft_msg_update_t *m = malloc(sizeof(raft_msg_update_t) + r->config.chunk_len - 1); 529 | 530 | m->msg.msgtype = RAFT_MSG_UPDATE; 531 | m->msg.curterm = r->term; 532 | m->msg.from = r->me; 533 | 534 | if (p->acked.entries <= RAFT_LOG_LAST_INDEX(r)) { 535 | int sendindex; 536 | 537 | debug("%d has acked %d:%d\n", dst, p->acked.entries, p->acked.bytes); 538 | 539 | if (p->acked.entries < RAFT_LOG_FIRST_INDEX(r)) { 540 | // The peer has woken up from anabiosis. Send the first 541 | // log entry (which is usually a snapshot). 
542 | debug("sending the snapshot to %d\n", dst); 543 | sendindex = RAFT_LOG_FIRST_INDEX(r); 544 | assert(RAFT_LOG(r, sendindex).snapshot); 545 | } else { 546 | // The peer is a bit behind. Send an update. 547 | debug("sending update %d snapshot to %d\n", p->acked.entries, dst); 548 | sendindex = p->acked.entries; 549 | } 550 | m->snapshot = RAFT_LOG(r, sendindex).snapshot; 551 | debug("will send index %d to %d\n", sendindex, dst); 552 | 553 | m->previndex = sendindex - 1; 554 | raft_entry_t *e = &RAFT_LOG(r, sendindex); 555 | 556 | if (m->previndex >= 0) { 557 | m->prevterm = RAFT_LOG(r, m->previndex).term; 558 | } else { 559 | m->prevterm = -1; 560 | } 561 | m->entryterm = e->term; 562 | m->totallen = e->update.len; 563 | m->empty = false; 564 | m->offset = p->acked.bytes; 565 | m->len = min(r->config.chunk_len, m->totallen - m->offset); 566 | assert(m->len > 0); 567 | memcpy(m->data, e->update.data + m->offset, m->len); 568 | } else { 569 | // The peer is up to date. Send an empty heartbeat. 570 | debug("sending empty heartbeat to %d\n", dst); 571 | m->empty = true; 572 | m->len = 0; 573 | } 574 | m->acked = r->log.acked; 575 | 576 | p->seqno++; 577 | m->msg.seqno = p->seqno; 578 | if (!m->empty) { 579 | debug( 580 | "sending seqno=%d to %d: offset=%d size=%d total=%d term=%d snapshot=%s\n", 581 | m->msg.seqno, dst, m->offset, m->len, m->totallen, m->entryterm, m->snapshot ? "true" : "false" 582 | ); 583 | } else { 584 | debug("sending seqno=%d to %d: heartbeat\n", m->msg.seqno, dst); 585 | } 586 | 587 | raft_send(r, dst, m, sizeof(raft_msg_update_t) + m->len - 1); 588 | free(m); 589 | } 590 | 591 | // Reset the byte-progress of the followers. This will make the leader send 592 | // the next entry from the beginning. 
593 | static void raft_reset_bytes_acked(raft_t r) { 594 | int i; 595 | for (i = 0; i < r->config.peernum_max; i++) { 596 | r->peers[i].acked.bytes = 0; 597 | } 598 | } 599 | 600 | // The timer of 'silence' is used to track the situation of leader being 601 | // isolated. When there is no answer from a majority for a long enough time, 602 | // the leader will step down. 603 | static void raft_reset_silent_time(raft_t r, int id) { 604 | int i; 605 | for (i = 0; i < r->config.peernum_max; i++) { 606 | if ((i == id) || (id == NOBODY)) { 607 | r->peers[i].silent_ms = 0; 608 | } 609 | } 610 | } 611 | static int raft_increase_silent_time(raft_t r, int ms) { 612 | int recent_peers = 1; // count myself as recent 613 | int i; 614 | for (i = 0; i < r->config.peernum_max; i++) { 615 | if (!r->peers[i].up) continue; 616 | if (i == r->me) continue; 617 | 618 | r->peers[i].silent_ms += ms; 619 | if (r->peers[i].silent_ms < r->config.election_ms_max) { 620 | recent_peers++; 621 | } 622 | } 623 | 624 | return recent_peers; 625 | } 626 | 627 | // Return true if we got the support of a majority and became the leader. 628 | static bool raft_become_leader(raft_t r) { 629 | if (r->votes * 2 > r->peernum) { 630 | // got the support of a majority 631 | r->role = LEADER; 632 | r->leader = r->me; 633 | raft_reset_bytes_acked(r); 634 | raft_reset_silent_time(r, NOBODY); 635 | raft_reset_timer(r); 636 | shout("became the leader\n"); 637 | return true; 638 | } 639 | return false; 640 | } 641 | 642 | // Send our claim for leadership to all other peers. 
643 | static void raft_claim(raft_t r) { 644 | assert(r->role == CANDIDATE); 645 | assert(r->leader == NOBODY); 646 | 647 | r->votes = 1; // vote for self 648 | if (raft_become_leader(r)) { 649 | // no need to send any messages, since we are alone 650 | return; 651 | } 652 | 653 | raft_msg_claim_t m; 654 | 655 | m.msg.msgtype = RAFT_MSG_CLAIM; 656 | m.msg.curterm = r->term; 657 | m.msg.from = r->me; 658 | 659 | m.index = RAFT_LOG_LAST_INDEX(r); 660 | if (m.index >= 0) { 661 | m.lastterm = RAFT_LOG(r, m.index).term; 662 | } else { 663 | m.lastterm = -1; 664 | } 665 | 666 | int i; 667 | for (i = 0; i < r->config.peernum_max; i++) { 668 | if (!r->peers[i].up) continue; 669 | if (i == r->me) continue; 670 | raft_peer_t *s = r->peers + i; 671 | s->seqno++; 672 | m.msg.seqno = s->seqno; 673 | 674 | raft_send(r, i, &m, sizeof(m)); 675 | } 676 | } 677 | 678 | // Pick each peer's 'acked' and check if it is also acked on the majority. 679 | // The max of these will give us the global progress. 680 | static void raft_refresh_acked(raft_t r) { 681 | // TODO: count 'acked' inside the entry itself to remove the nested loop here 682 | int i, j; 683 | for (i = 0; i < r->config.peernum_max; i++) { 684 | raft_peer_t *p = r->peers + i; 685 | if (i == r->me) continue; 686 | if (!p->up) continue; 687 | 688 | int newacked = p->acked.entries; 689 | if (newacked <= r->log.acked) continue; 690 | 691 | int replication = 1; // count self as yes 692 | for (j = 0; j < r->config.peernum_max; j++) { 693 | if (j == r->me) continue; 694 | 695 | raft_peer_t *pp = r->peers + j; 696 | if (pp->acked.entries >= newacked) { 697 | replication++; 698 | } 699 | } 700 | 701 | assert(replication <= r->peernum); 702 | 703 | if (replication * 2 > r->peernum) { 704 | debug("===== GLOBAL PROGRESS: %d\n", newacked); 705 | r->log.acked = newacked; 706 | } 707 | } 708 | 709 | // Try to apply all entries which have been replicated on a majority. 
710 | int applied = raft_apply(r); 711 | if (applied) { 712 | debug("applied %d updates\n", applied); 713 | } 714 | } 715 | 716 | void raft_tick(raft_t r, int msec) { 717 | r->timer -= msec; 718 | if (r->timer < 0) { 719 | switch (r->role) { 720 | case FOLLOWER: 721 | shout( 722 | "lost the leader," 723 | " claiming leadership\n" 724 | ); 725 | r->leader = NOBODY; 726 | r->role = CANDIDATE; 727 | r->term++; 728 | raft_claim(r); 729 | break; 730 | case CANDIDATE: 731 | shout( 732 | "the vote failed," 733 | " claiming leadership\n" 734 | ); 735 | r->term++; 736 | raft_claim(r); 737 | break; 738 | case LEADER: 739 | raft_beat(r, NOBODY); 740 | break; 741 | } 742 | raft_reset_timer(r); 743 | } 744 | raft_refresh_acked(r); 745 | 746 | int recent_peers = raft_increase_silent_time(r, msec); 747 | if ((r->role == LEADER) && (recent_peers * 2 <= r->peernum)) { 748 | shout("lost quorum, demoting\n"); 749 | r->leader = NOBODY; 750 | r->role = FOLLOWER; 751 | } 752 | } 753 | 754 | static int raft_compact(raft_t raft) { 755 | raft_log_t *l = &raft->log; 756 | int i; 757 | int compacted = 0; 758 | for (i = l->first; i < l->applied; i++) { 759 | raft_entry_t *e = &RAFT_LOG(raft, i); 760 | 761 | e->snapshot = false; 762 | free(e->update.data); 763 | e->update.len = 0; 764 | e->update.data = NULL; 765 | 766 | compacted++; 767 | } 768 | if (compacted) { 769 | l->first += compacted - 1; 770 | l->size -= compacted - 1; 771 | raft_entry_t *e = &RAFT_LOG(raft, RAFT_LOG_FIRST_INDEX(raft)); 772 | e->update = raft->config.snapshooter(raft->config.userdata); 773 | e->bytes = e->update.len; 774 | e->snapshot = true; 775 | assert(l->first == l->applied - 1); 776 | 777 | // reset bytes progress of peers that were receiving the compacted entries 778 | for (i = 0; i < raft->config.peernum_max; i++) { 779 | raft_peer_t *p = raft->peers + i; 780 | if (!p->up) continue; 781 | if (i == raft->me) continue; 782 | if (p->acked.entries + 1 <= l->first) 783 | p->acked.bytes = 0; 784 | } 785 | } 786 | 
return compacted; 787 | } 788 | 789 | // Emit an 'update'. Returns the log index if emitted successfully, or -1 790 | // otherwise. 791 | int raft_emit(raft_t r, raft_update_t update) { 792 | assert(r->leader == r->me); 793 | assert(r->role == LEADER); 794 | 795 | // Compact the log if it is full. 796 | if (RAFT_LOG_FULL(r)) { 797 | int compacted = raft_compact(r); 798 | if (compacted > 1) { 799 | debug("compacted %d entries\n", compacted); 800 | } else { 801 | shout( 802 | "cannot emit new entries, the log is" 803 | " full and cannot be compacted\n" 804 | ); 805 | return -1; 806 | } 807 | } 808 | 809 | // Append the entry to the log. 810 | int newindex = RAFT_LOG_LAST_INDEX(r) + 1; 811 | raft_entry_t *e = &RAFT_LOG(r, newindex); 812 | e->term = r->term; 813 | assert(e->update.len == 0); 814 | assert(e->update.data == NULL); 815 | e->update.len = update.len; 816 | e->bytes = update.len; 817 | e->update.data = malloc(update.len); 818 | memcpy(e->update.data, update.data, update.len); 819 | r->log.size++; 820 | 821 | // Replicate. 822 | raft_beat(r, NOBODY); 823 | raft_reset_timer(r); 824 | return newindex; 825 | } 826 | 827 | // Checks whether an entry at 'index' has been applied by the peer named 'id'. 828 | bool raft_applied(raft_t r, int id, int index) { 829 | if (r->me == id) { 830 | return r->log.applied > index; 831 | } else { 832 | raft_peer_t *p = r->peers + id; 833 | if (!p->up) return false; 834 | return p->applied > index; 835 | } 836 | } 837 | 838 | // Restore the state from a snapshot. 
839 | static bool raft_restore(raft_t r, int previndex, raft_entry_t *e) { 840 | int i; 841 | assert(e->bytes == e->update.len); 842 | assert(e->snapshot); 843 | 844 | // Clear the log 845 | for (i = RAFT_LOG_FIRST_INDEX(r); i <= RAFT_LOG_LAST_INDEX(r); i++) { 846 | raft_entry_t *victim = &RAFT_LOG(r, i); 847 | free(victim->update.data); 848 | victim->update.len = 0; 849 | victim->update.data = NULL; 850 | } 851 | 852 | // The new log will have only one entry - the snapshot. 853 | int index = previndex + 1; 854 | r->log.first = index; 855 | r->log.size = 1; 856 | RAFT_LOG(r, index) = *e; 857 | raft_entry_init(e); 858 | 859 | r->config.applier(r->config.userdata, RAFT_LOG(r, index).update, true /*snapshot*/); 860 | r->log.applied = index + 1; 861 | return true; 862 | } 863 | 864 | // Check whether it is possible to insert an entry next to the 'previndex' 865 | // having 'prevterm'. 866 | static bool raft_appendable(raft_t r, int previndex, int prevterm) { 867 | int low, high; 868 | 869 | low = RAFT_LOG_FIRST_INDEX(r); 870 | if (low == 0) low = -1; // allow appending at the start 871 | high = RAFT_LOG_LAST_INDEX(r); 872 | 873 | if (!inrange(low, previndex, high)) 874 | { 875 | debug( 876 | "previndex %d is outside log range %d-%d\n", 877 | previndex, low, high 878 | ); 879 | return false; 880 | } 881 | 882 | if (previndex != -1) { 883 | raft_entry_t *pe = &RAFT_LOG(r, previndex); 884 | if (pe->term != prevterm) { 885 | debug("log term %d != prevterm %d\n", pe->term, prevterm); 886 | return false; 887 | } 888 | } 889 | 890 | return true; 891 | } 892 | 893 | // Append entry 'e' after 'previndex' having 'prevterm'. Return false if not 894 | // possible. 
895 | static bool raft_append(raft_t r, int previndex, int prevterm, raft_entry_t *e) { 896 | assert(e->bytes == e->update.len); 897 | assert(!e->snapshot); 898 | 899 | raft_log_t *l = &r->log; 900 | 901 | debug( 902 | "log_append(%p, previndex=%d, prevterm=%d," 903 | " term=%d)\n", 904 | (void *)l, previndex, prevterm, 905 | e->term 906 | ); 907 | 908 | if (!raft_appendable(r, previndex, prevterm)) return false; 909 | 910 | if (previndex == RAFT_LOG_LAST_INDEX(r)) { 911 | debug("previndex == last\n"); 912 | // appending to the end 913 | // check if the log can accomodate 914 | if (l->size == r->config.log_len) { 915 | debug("log is full\n"); 916 | int compacted = raft_compact(r); 917 | if (compacted) { 918 | debug("compacted %d entries\n", compacted); 919 | } else { 920 | return false; 921 | } 922 | } 923 | } 924 | 925 | int index = previndex + 1; 926 | raft_entry_t *slot = &RAFT_LOG(r, index); 927 | 928 | if (index <= RAFT_LOG_LAST_INDEX(r)) { 929 | // replacing an existing entry 930 | if (slot->term != e->term) { 931 | // entry conflict, remove the entry and all that follow 932 | l->size = index - l->first; 933 | } 934 | assert(slot->update.data); 935 | free(slot->update.data); 936 | } 937 | 938 | if (index > RAFT_LOG_LAST_INDEX(r)) { 939 | // increase log size if actually appended 940 | l->size++; 941 | } 942 | *slot = *e; 943 | raft_entry_init(e); 944 | 945 | return true; 946 | } 947 | 948 | // === Incoming message handlers === 949 | 950 | // This is called when an update (or chunk of it) is recved from the leader. 951 | // An update does not get applied and replicated until all of its chunks have 952 | // been recved. We reply with a 'done' message. 
953 | static void raft_handle_update(raft_t r, raft_msg_update_t *m) { 954 | int sender = m->msg.from; 955 | 956 | raft_msg_done_t reply; 957 | reply.msg.msgtype = RAFT_MSG_DONE; 958 | reply.msg.curterm = r->term; 959 | reply.msg.from = r->me; 960 | reply.msg.seqno = m->msg.seqno; 961 | 962 | raft_entry_t *e = &r->log.newentry; 963 | raft_update_t *u = &e->update; 964 | 965 | // The entry should either be an empty heartbeat, or be appendable, or be a snapshot. 966 | if (!m->empty && !m->snapshot && !raft_appendable(r, m->previndex, m->prevterm)) goto finish; 967 | 968 | if (RAFT_LOG_LAST_INDEX(r) >= 0) { 969 | reply.entryterm = RAFT_LOG(r, RAFT_LOG_LAST_INDEX(r)).term; 970 | } else { 971 | reply.entryterm = -1; 972 | } 973 | reply.success = false; 974 | 975 | // the message is too old 976 | if (m->msg.curterm < r->term) { 977 | debug("refuse old message %d < %d\n", m->msg.curterm, r->term); 978 | goto finish; 979 | } 980 | 981 | if (sender != r->leader) { 982 | shout("changing leader to %d\n", sender); 983 | r->leader = sender; 984 | } 985 | 986 | r->peers[sender].silent_ms = 0; 987 | raft_reset_timer(r); 988 | 989 | // Update the global progress sent by the leader. 990 | if (m->acked > r->log.acked) { 991 | r->log.acked = min( 992 | r->log.first + r->log.size, 993 | m->acked 994 | ); 995 | raft_peer_t *p = r->peers + sender; 996 | p->acked.entries = r->log.acked; 997 | p->acked.bytes = 0; 998 | } 999 | 1000 | if (!m->empty) { 1001 | debug( 1002 | "got a chunk seqno=%d from %d: offset=%d size=%d total=%d term=%d snapshot=%s\n", 1003 | m->msg.seqno, sender, m->offset, m->len, m->totallen, m->entryterm, m->snapshot ? 
"true" : "false" 1004 | ); 1005 | 1006 | if ((m->offset > 0) && (e->term != m->entryterm)) { 1007 | shout("a chunk of another version of the entry received, resetting progress to avoid corruption\n"); 1008 | e->term = m->entryterm; 1009 | e->bytes = 0; 1010 | goto finish; 1011 | } 1012 | 1013 | if (m->offset > e->bytes) { 1014 | shout("unexpectedly large offset %d for a chunk, ignoring to avoid gaps\n", m->offset); 1015 | goto finish; 1016 | } 1017 | 1018 | u->len = m->totallen; 1019 | u->data = realloc(u->data, m->totallen); 1020 | 1021 | memcpy(u->data + m->offset, m->data, m->len); 1022 | e->term = m->entryterm; 1023 | e->bytes = m->offset + m->len; 1024 | assert(e->bytes <= u->len); 1025 | 1026 | e->snapshot = m->snapshot; 1027 | 1028 | if (e->bytes == u->len) { 1029 | // The entry has been fully received. 1030 | if (m->snapshot) { 1031 | if (!raft_restore(r, m->previndex, e)) { 1032 | shout("restore from snapshot failed\n"); 1033 | goto finish; 1034 | } 1035 | } else { 1036 | if (!raft_append(r, m->previndex, m->prevterm, e)) { 1037 | debug("log_append failed\n"); 1038 | goto finish; 1039 | } 1040 | } 1041 | } 1042 | } else { 1043 | // just a heartbeat 1044 | e->bytes = 0; 1045 | } 1046 | 1047 | if (RAFT_LOG_LAST_INDEX(r) >= 0) { 1048 | reply.entryterm = RAFT_LOG(r, RAFT_LOG_LAST_INDEX(r)).term; 1049 | } else { 1050 | reply.entryterm = -1; 1051 | } 1052 | reply.applied = r->log.applied; 1053 | 1054 | reply.success = true; 1055 | finish: 1056 | reply.progress.entries = RAFT_LOG_LAST_INDEX(r) + 1; 1057 | reply.progress.bytes = e->bytes; 1058 | 1059 | debug( 1060 | "replying with %s to %d, our progress is %d:%d\n", 1061 | reply.success ? "ok" : "reject", 1062 | sender, 1063 | reply.progress.entries, 1064 | reply.progress.bytes 1065 | ); 1066 | raft_send(r, sender, &reply, sizeof(reply)); 1067 | } 1068 | 1069 | // This is called when an 'ack' for an update (or chunk of it) is recved from a 1070 | // follower. 
We reply with the next update (or chunk of it), if it is needed. 1071 | static void raft_handle_done(raft_t r, raft_msg_done_t *m) { 1072 | // Ignore the message if it is old or unexpected. 1073 | 1074 | if (r->role != LEADER) { 1075 | return; 1076 | } 1077 | 1078 | int sender = m->msg.from; 1079 | if (sender == r->me) { 1080 | return; 1081 | } 1082 | 1083 | raft_peer_t *peer = r->peers + sender; 1084 | if (m->msg.seqno != peer->seqno) { 1085 | debug("[from %d] ============= mseqno(%d) != sseqno(%d)\n", sender, m->msg.seqno, peer->seqno); 1086 | return; 1087 | } 1088 | peer->seqno++; 1089 | if (m->msg.curterm < r->term) { 1090 | debug("[from %d] ============= msgterm(%d) != term(%d)\n", sender, m->msg.curterm, r->term); 1091 | return; 1092 | } 1093 | 1094 | // The message is expected and actual. 1095 | 1096 | peer->applied = m->applied; 1097 | 1098 | if (m->success) { 1099 | debug("[from %d] ============= done (%d, %d)\n", sender, m->progress.entries, m->progress.bytes); 1100 | peer->acked = m->progress; 1101 | peer->silent_ms = 0; 1102 | } else { 1103 | debug("[from %d] ============= refused\n", sender); 1104 | if (peer->acked.entries > 0) { 1105 | peer->acked.entries--; 1106 | peer->acked.bytes = 0; 1107 | } 1108 | } 1109 | 1110 | if (peer->acked.entries <= RAFT_LOG_LAST_INDEX(r)) { 1111 | // send the next entry 1112 | raft_beat(r, sender); 1113 | } 1114 | } 1115 | 1116 | // Switch to a more recent term. 1117 | static void raft_set_term(raft_t r, int term) { 1118 | assert(term > r->term); 1119 | r->term = term; 1120 | r->vote = NOBODY; 1121 | r->votes = 0; 1122 | } 1123 | 1124 | // This is called when a leadership claim is recved from a candidate. 1125 | // We reply with a 'vote' message. 
1126 | static void raft_handle_claim(raft_t r, raft_msg_claim_t *m) { 1127 | int candidate = m->msg.from; 1128 | 1129 | if (m->msg.curterm >= r->term) { 1130 | if (r->role != FOLLOWER) { 1131 | shout("there is another candidate, demoting myself\n"); 1132 | } 1133 | if (m->msg.curterm > r->term) { 1134 | raft_set_term(r, m->msg.curterm); 1135 | } 1136 | r->role = FOLLOWER; 1137 | } 1138 | 1139 | raft_msg_vote_t reply; 1140 | reply.msg.msgtype = RAFT_MSG_VOTE; 1141 | reply.msg.curterm = r->term; 1142 | reply.msg.from = r->me; 1143 | reply.msg.seqno = m->msg.seqno; 1144 | 1145 | reply.granted = false; 1146 | 1147 | if (m->msg.curterm < r->term) goto finish; 1148 | 1149 | // check if the candidate's log is up to date 1150 | if (m->index < RAFT_LOG_LAST_INDEX(r)) goto finish; 1151 | if (m->index == RAFT_LOG_LAST_INDEX(r)) { 1152 | if ((m->index >= 0) && (RAFT_LOG(r, m->index).term != m->lastterm)) { 1153 | goto finish; 1154 | } 1155 | } 1156 | 1157 | // Grant the vote if we haven't voted in the current term, or if we 1158 | // have voted for the same candidate. 1159 | if ((r->vote == NOBODY) || (r->vote == candidate)) { 1160 | r->vote = candidate; 1161 | raft_reset_timer(r); 1162 | reply.granted = true; 1163 | } 1164 | finish: 1165 | shout("voting %s %d\n", reply.granted ? "for" : "against", candidate); 1166 | raft_send(r, candidate, &reply, sizeof(reply)); 1167 | } 1168 | 1169 | // A vote has been recved. Count the valid votes and become the leader if a 1170 | // majority of peers have voted 'yes'. 1171 | static void raft_handle_vote(raft_t r, raft_msg_vote_t *m) { 1172 | int sender = m->msg.from; 1173 | raft_peer_t *peer = r->peers + sender; 1174 | if (m->msg.seqno != peer->seqno) return; 1175 | peer->seqno++; 1176 | if (m->msg.curterm < r->term) return; 1177 | 1178 | if (r->role != CANDIDATE) return; 1179 | 1180 | if (m->granted) { 1181 | r->votes++; 1182 | } 1183 | 1184 | raft_become_leader(r); 1185 | } 1186 | 1187 | // A message has been recved. 
Determine its type, do some checks, and process 1188 | // it further in a specific handler for the type. 1189 | void raft_handle_message(raft_t r, raft_msg_t m) { 1190 | if (m->curterm > r->term) { 1191 | if (r->role != FOLLOWER) { 1192 | shout("I have an old term, demoting myself\n"); 1193 | } 1194 | raft_set_term(r, m->curterm); 1195 | r->role = FOLLOWER; 1196 | } 1197 | 1198 | assert(m->msgtype >= 0); 1199 | assert(m->msgtype < 4); 1200 | switch (m->msgtype) { 1201 | case RAFT_MSG_UPDATE: 1202 | raft_handle_update(r, (raft_msg_update_t *)m); 1203 | break; 1204 | case RAFT_MSG_DONE: 1205 | raft_handle_done(r, (raft_msg_done_t *)m); 1206 | break; 1207 | case RAFT_MSG_CLAIM: 1208 | raft_handle_claim(r, (raft_msg_claim_t *)m); 1209 | break; 1210 | case RAFT_MSG_VOTE: 1211 | raft_handle_vote(r, (raft_msg_vote_t *)m); 1212 | break; 1213 | default: 1214 | shout("unknown message type\n"); 1215 | } 1216 | } 1217 | 1218 | static char buf[UDP_SAFE_SIZE]; 1219 | 1220 | // Try to recv a message from the socket. Return NULL if no valid messages 1221 | // available. 
1222 | raft_msg_t raft_recv_message(raft_t r) { 1223 | struct sockaddr_in addr; 1224 | unsigned int addrlen = sizeof(addr); 1225 | 1226 | //try to receive some data 1227 | raft_msg_t m = (raft_msg_t)buf; 1228 | int recved = recvfrom( 1229 | r->sock, buf, sizeof(buf), 0, 1230 | (struct sockaddr*)&addr, &addrlen 1231 | ); 1232 | 1233 | if (recved <= 0) { 1234 | if ( 1235 | (errno == EAGAIN) || 1236 | (errno == EWOULDBLOCK) || 1237 | (errno == EINTR) 1238 | ) { 1239 | return NULL; 1240 | } else { 1241 | shout("failed to recv: %s\n", strerror(errno)); 1242 | return NULL; 1243 | } 1244 | } 1245 | 1246 | if (!msg_size_is(m, recved)) { 1247 | shout( 1248 | "a corrupt msg recved from %s:%d\n", 1249 | inet_ntoa(addr.sin_addr), 1250 | ntohs(addr.sin_port) 1251 | ); 1252 | return NULL; 1253 | } 1254 | 1255 | if ((m->from < 0) || (m->from >= r->config.peernum_max)) { 1256 | shout( 1257 | "the 'from' is out of range (%d)\n", 1258 | m->from 1259 | ); 1260 | return NULL; 1261 | } 1262 | 1263 | if (m->from == r->me) { 1264 | shout("the message is from myself O_o\n"); 1265 | return NULL; 1266 | } 1267 | 1268 | raft_peer_t *peer = r->peers + m->from; 1269 | if (memcmp(&peer->addr.sin_addr, &addr.sin_addr, sizeof(struct in_addr))) { 1270 | shout( 1271 | "the message is from a wrong address %s = %d" 1272 | " (expected from %s = %d)\n", 1273 | inet_ntoa(peer->addr.sin_addr), 1274 | peer->addr.sin_addr.s_addr, 1275 | inet_ntoa(addr.sin_addr), 1276 | addr.sin_addr.s_addr 1277 | ); 1278 | } 1279 | 1280 | if (peer->addr.sin_port != addr.sin_port) { 1281 | shout( 1282 | "the message is from a wrong port %d" 1283 | " (expected from %d)\n", 1284 | ntohs(peer->addr.sin_port), 1285 | ntohs(addr.sin_port) 1286 | ); 1287 | } 1288 | 1289 | return m; 1290 | } 1291 | 1292 | // Returns true if this peer thinks it is the leader. 1293 | bool raft_is_leader(raft_t r) { 1294 | return r->role == LEADER; 1295 | } 1296 | 1297 | // Returns the id of the current leader, or NOBODY if no leader. 
1298 | int raft_get_leader(raft_t r) { 1299 | return r->leader; 1300 | } 1301 | 1302 | // Returns the number of entried applied by the current peer. 1303 | int raft_progress(raft_t r) { 1304 | return r->log.applied; 1305 | } 1306 | -------------------------------------------------------------------------------- /src/util.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "util.h" 7 | 8 | int inrange(int min, int x, int max) { 9 | assert(min <= max); 10 | return (min <= x) && (x <= max); 11 | } 12 | 13 | #define MAX_ELAPSED 30 14 | int mstimer_reset(mstimer_t *t) { 15 | int ms; 16 | struct timeval newtime; 17 | gettimeofday(&newtime, NULL); 18 | 19 | ms = 20 | (newtime.tv_sec - t->tv.tv_sec) * 1000 + 21 | (newtime.tv_usec - t->tv.tv_usec) / 1000; 22 | 23 | t->tv = newtime; 24 | 25 | if (ms > MAX_ELAPSED) { 26 | return MAX_ELAPSED; 27 | } 28 | return ms; 29 | } 30 | 31 | struct timeval ms2tv(int ms) { 32 | struct timeval result; 33 | result.tv_sec = ms / 1000; 34 | result.tv_usec = ((ms % 1000) * 1000); 35 | return result; 36 | } 37 | -------------------------------------------------------------------------------- /tests/blockade.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | containers: 4 | client1: 5 | container_name: client1 6 | image: raft 7 | environment: 8 | ROLE: 'client' 9 | CLIENTKEY: 'hello' 10 | LOG: '/tmp/raft.client1.log' 11 | volumes: 12 | "/tmp": "/tmp" 13 | 14 | client2: 15 | container_name: client2 16 | image: raft 17 | environment: 18 | ROLE: 'client' 19 | CLIENTKEY: 'world' 20 | LOG: '/tmp/raft.client2.log' 21 | volumes: 22 | "/tmp": "/tmp" 23 | 24 | server1: 25 | container_name: server1 26 | image: raft 27 | environment: 28 | ROLE: 'server' 29 | SERVERID: 10 30 | LOG: '/tmp/raft.server1.log' 31 | volumes: 32 | "/tmp": "/tmp" 33 | 34 | server2: 35 | container_name: server2 36 
| image: raft 37 | environment: 38 | ROLE: 'server' 39 | SERVERID: 20 40 | LOG: '/tmp/raft.server2.log' 41 | volumes: 42 | "/tmp": "/tmp" 43 | 44 | server3: 45 | container_name: server3 46 | image: raft 47 | environment: 48 | ROLE: 'server' 49 | SERVERID: 30 50 | LOG: '/tmp/raft.server3.log' 51 | volumes: 52 | "/tmp": "/tmp" 53 | 54 | network: 55 | driver: udn 56 | flaky: 10% 57 | slow: 20ms 50ms distribution normal 58 | -------------------------------------------------------------------------------- /tests/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | CFG="-r 10:server1:6000 -r 20:server2:6000 -r 30:server3:6000" 5 | if [ "$ROLE" = 'client' ]; then 6 | sleep 15 7 | exec bin/client -k $CLIENTKEY $CFG 2>> $LOG 8 | elif [ "$ROLE" = 'server' ]; then 9 | sleep 10 10 | exec bin/server -i $SERVERID $CFG 2>> $LOG 11 | else 12 | echo "please specify either 'client' or 'server' in \$ROLE" 13 | exit 1 14 | fi 15 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/kelvich/blockade.git@002-allow-cores 2 | -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import unittest 4 | import time 5 | import subprocess 6 | import sys 7 | import os 8 | 9 | logfiles = [] 10 | with open(os.path.join(os.path.dirname(sys.argv[0]), "blockade.yml")) as f: 11 | for line in f: 12 | tup = [x.strip() for x in line.split(':', 1)] 13 | if len(tup) < 2: 14 | continue 15 | k, v = tup 16 | if k == 'LOG': 17 | filename = v.strip("'") 18 | print('will also log to %s' % filename) 19 | logfiles.append(filename) 20 | 21 | def clear_logs(): 22 | for filename in logfiles: 23 | try: 24 | 
os.truncate(filename, 0) 25 | except: 26 | pass 27 | 28 | def log_to_everybody(line): 29 | for filename in logfiles: 30 | with open(filename, 'a') as f: 31 | print(line, file=f) 32 | print(line) 33 | 34 | def blockade_partition(name): 35 | log_to_everybody("=== partition %s away" % name) 36 | subprocess.check_call(['blockade', 'status']) 37 | subprocess.check_call(['blockade', 'partition', name]) 38 | subprocess.check_call(['blockade', 'status']) 39 | log_to_everybody("=== %s partitioned away" % name) 40 | 41 | def blockade_join(): 42 | log_to_everybody("=== join the network") 43 | subprocess.check_call(['blockade', 'status']) 44 | subprocess.check_call(['blockade', 'join']) 45 | subprocess.check_call(['blockade', 'status']) 46 | log_to_everybody("=== the network joined") 47 | 48 | def blockade_up(): 49 | log_to_everybody("=== set up blockade") 50 | subprocess.check_call(['blockade', 'up']) 51 | log_to_everybody('=== blockade is up, waiting a bit') 52 | time.sleep(20) 53 | log_to_everybody('=== blockade is up, the waiting finished') 54 | 55 | def blockade_stop(): 56 | log_to_everybody("=== stop blockade") 57 | subprocess.check_call(['blockade', 'stop', '--all']) 58 | log_to_everybody("=== blockade stopped") 59 | 60 | def blockade_destroy(): 61 | log_to_everybody("=== destroy blockade") 62 | subprocess.check_call(['blockade', 'destroy']) 63 | log_to_everybody("=== blockade destroyed") 64 | 65 | def blockade_flaky(*names): 66 | log_to_everybody("=== make network flaky for %s" % ', '.join(names)) 67 | subprocess.check_call(['blockade', 'status']) 68 | subprocess.check_call(['blockade', 'flaky'] + list(names)) 69 | subprocess.check_call(['blockade', 'status']) 70 | log_to_everybody("=== network made flaky for %s" % ', '.join(names)) 71 | 72 | def blockade_slow(*names): 73 | log_to_everybody("=== make network slow for %s" % ', '.join(names)) 74 | subprocess.check_call(['blockade', 'status']) 75 | subprocess.check_call(['blockade', 'slow'] + list(names)) 76 | 
subprocess.check_call(['blockade', 'status']) 77 | log_to_everybody("=== network made slow for %s" % ', '.join(names)) 78 | 79 | def blockade_fast(): 80 | log_to_everybody("=== make network fast") 81 | subprocess.check_call(['blockade', 'status']) 82 | subprocess.check_call(['blockade', 'fast', '--all']) 83 | subprocess.check_call(['blockade', 'status']) 84 | log_to_everybody("=== network made fast") 85 | 86 | def log_is_ok(filename): 87 | if 'client' in filename: 88 | longest = 0 89 | fail_streak = 0 90 | with open(filename) as f: 91 | for line in f: 92 | if 'query failed' in line: 93 | fail_streak += 1 94 | if fail_streak > longest: 95 | longest = fail_streak 96 | elif ' = ' in line: 97 | fail_streak = 0 98 | print("%s longest fail streak = %d" % (filename, longest)) 99 | return longest < 10 100 | elif 'server' in filename: 101 | return True 102 | else: 103 | return False 104 | 105 | class PartitionTest(unittest.TestCase): 106 | def setUp(self): 107 | clear_logs() 108 | 109 | def tearDown(self): 110 | for filename in logfiles: 111 | self.assertTrue(log_is_ok(filename)) 112 | 113 | def test_partition(self): 114 | log_to_everybody("=== test_partition") 115 | blockade_up() 116 | try: 117 | for serverid in range(1,4): 118 | name = "server%d" % serverid 119 | blockade_partition(name) 120 | time.sleep(20) 121 | blockade_join() 122 | time.sleep(10) 123 | finally: 124 | blockade_destroy() 125 | 126 | def test_slow(self): 127 | log_to_everybody("=== test_slow") 128 | blockade_up() 129 | try: 130 | blockade_slow('--all') 131 | time.sleep(20) 132 | blockade_fast() 133 | time.sleep(10) 134 | finally: 135 | blockade_destroy() 136 | 137 | def test_flaky(self): 138 | log_to_everybody("=== test_flaky") 139 | blockade_up() 140 | try: 141 | blockade_flaky('--all') 142 | time.sleep(20) 143 | blockade_fast() 144 | time.sleep(10) 145 | finally: 146 | blockade_destroy() 147 | 148 | if __name__ == '__main__': 149 | unittest.main() 150 | 151 | # vim: ai ts=4 sts=4 et sw=4 152 | 
--------------------------------------------------------------------------------