├── .gitignore
├── tcp.h
├── epoll.h
├── .gitmodules
├── iou.h
├── CONTRIBUTING.md
├── devmem.h
├── LICENSE
├── proto_dbg.h
├── cpu_stat.h
├── test
│   └── ksft.py
├── Makefile
├── bipartite_match.h
├── worker.h
├── server.h
├── tcp.c
├── README.rst
├── CODE_OF_CONDUCT.md
├── cpu_stat.c
├── server.c
├── proto.h
├── bipartite_match.c
├── proto.c
├── worker.c
├── epoll.c
├── iou.c
├── server_session.c
├── devmem.c
└── client.c

/.gitignore:
--------------------------------------------------------------------------------
1 | *.d
2 | *.o
3 | bipartite_match
4 | client
5 | cpu_stat
6 | server
7 |
--------------------------------------------------------------------------------
/tcp.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | /**
5 |  * DOC: Random collection of TCP helpers.
6 |  */
7 |
8 | struct tcp_info;
9 |
10 | void print_tcp_info(struct tcp_info *ti);
11 |
--------------------------------------------------------------------------------
/epoll.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef EPOLL_H
5 | #define EPOLL_H 1
6 |
7 | #include "worker.h"
8 |
9 | void worker_epoll_init(struct worker_state *state);
10 |
11 | #endif /* EPOLL_H */
12 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "ccan"]
2 | 	path = ccan
3 | 	url = https://github.com/rustyrussell/ccan.git
4 | [submodule "ynl-c"]
5 | 	path = ynl-c
6 | 	url = https://github.com/linux-netdev/ynl-c.git
7 | [submodule "liburing"]
8 | 	path = liburing
9 | 	url = https://github.com/axboe/liburing.git
10 |
--------------------------------------------------------------------------------
/iou.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef IOU_H
5 | #define IOU_H 1
6 |
7 | #include "worker.h"
8 |
9 | void worker_iou_init(struct worker_state *state);
10 |
11 | int iou_zerocopy_rx_setup(struct session_state_iou *iou, int fd,
12 | 			  int num_queues);
13 | int iou_zerocopy_rx_teardown(struct session_state_iou *iou);
14 |
15 | #endif /* IOU_H */
16 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to kperf
2 |
3 | We actively welcome your pull requests.
4 |
5 | 1. Fork the repo and create your branch from `main`.
6 | 2. If you've added code that should be tested, add tests.
7 | 3. If you've changed APIs, update the documentation.
8 | 4. Make sure your code lints.
9 | 5. If you haven't already, complete the Contributor License Agreement ("CLA").
10 |
11 | ## Contributor License Agreement ("CLA")
12 | In order to accept your pull request, we need you to submit a CLA. You only need
13 | to do this once to work on any of Meta's open source projects.
14 |
15 | Complete your CLA here: <https://code.facebook.com/cla>
16 |
17 | ## Issues
18 | We use GitHub issues to track public bugs. Please ensure your description is
19 | clear and includes sufficient instructions to reproduce the issue.
20 |
21 | ## Coding Style
22 | `kperf` uses the Linux kernel's coding style.
23 |
24 | ## License
25 | By contributing to `kperf`, you agree that your contributions will be licensed
26 | under the LICENSE file in the root directory of this source tree.
27 |
--------------------------------------------------------------------------------
/devmem.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef DEVMEM_H
5 | #define DEVMEM_H 1
6 |
7 | #include <net/if.h>
8 |
9 | int reserve_queues(int fd, int num_queues, char out_ifname[IFNAMSIZ],
10 | 		   int *out_ifindex, int *out_queue_id, int *out_rss_context);
11 | void unreserve_queues(char *ifname, int rss_context);
12 |
13 | int devmem_setup(struct session_state_devmem *devmem, int fd,
14 | 		 size_t dmabuf_size, int num_queues,
15 | 		 enum memory_provider_type provider, struct pci_dev *dev);
16 | int devmem_teardown(struct session_state_devmem *devmem);
17 | void devmem_teardown_tx(struct session_state_devmem *devmem);
18 | int devmem_release_tokens(int fd, struct connection_devmem *conn);
19 | ssize_t devmem_recv(int fd, struct connection_devmem *conn,
20 | 		    unsigned char *rxbuf, size_t chunk, struct memory_buffer *mem,
21 | 		    int rep, __u64 tot_recv, bool validate);
22 | int devmem_sendmsg(int fd, int dmabuf_id, size_t off, size_t n);
23 | void devmem_teardown_conn(struct connection_devmem *devmem);
24 | int devmem_prepare_connect(int fd, struct sockaddr_in6 *src, struct session_state_devmem *devmem);
25 | int devmem_setup_tx(struct session_state_devmem *devmem, enum memory_provider_type provider,
26 | 		    int dmabuf_tx_size_mb, struct pci_dev *dev, struct sockaddr_in6 *addr);
27 | int devmem_bind_socket(struct session_state_devmem *devmem, int fd);
28 |
29 | #endif /* DEVMEM_H */
30 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Redistribution and use in source and binary forms, with or without
2 | modification, are permitted provided that the following conditions
3 | are met:
4 | 1. Redistributions of source code must retain the above copyright
5 |    notice, this list of conditions and the following disclaimer.
6 | 2. Redistributions in binary form must reproduce the above copyright
7 |    notice, this list of conditions and the following disclaimer in the
8 |    documentation and/or other materials provided with the distribution.
9 | 3. Neither the name of the University nor the names of its contributors
10 |    may be used to endorse or promote products derived from this software
11 |    without specific prior written permission.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 | ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 | SUCH DAMAGE.
24 |
--------------------------------------------------------------------------------
/proto_dbg.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef PROTO_DBG_H
5 | #define PROTO_DBG_H 1
6 |
7 | #include <stdarg.h>
8 | #include <stdio.h>
9 | #include <string.h>
10 | #include "proto.h"
11 |
12 | extern int verbose;
13 |
14 | #define __kpm_cmd_dbg(pfx, msg, hdr)					\
15 | ({									\
16 | 	struct kpm_header *_hdr = (hdr);				\
17 | 									\
18 | 	if (verbose >= 4)						\
19 | 		fprintf(stderr, "D%s %s%s%s T%d (seq:%d, len:%d)\n",	\
20 | 			pfx, __FILE__,					\
21 | 			strlen(msg) ? " " : "", msg,			\
22 | 			_hdr->type, _hdr->id, _hdr->len);		\
23 | })
24 |
25 | #define __kpm_cmd_dbg_start(name, hdr) __kpm_cmd_dbg("|>", name, hdr)
26 | #define __kpm_cmd_dbg_end(name, hdr) __kpm_cmd_dbg("|<", name, hdr)
27 |
28 | #define kpm_cmd_dbg_start(hdr) __kpm_cmd_dbg_start("", hdr)
29 | #define kpm_cmd_dbg_end(hdr) __kpm_cmd_dbg_end("", hdr)
30 |
31 | static inline void ____kpm_trace(int level, const char *fn, const char *pfx,
32 | 				 const char *fmt, ...)
33 | {
34 | 	const char *letters = "!EWIDT ";
35 | 	va_list ap;
36 |
37 | 	if (verbose < level)
38 | 		return;
39 | 	if (level > 6)
40 | 		level = 6;
41 |
42 | 	fprintf(stderr, "%c%s %s: ", letters[level], pfx, fn);
43 | 	va_start(ap, fmt);
44 | 	vfprintf(stderr, fmt, ap);
45 | 	va_end(ap);
46 | 	fprintf(stderr, "\n");
47 | }
48 |
49 | #define __kpm_info(pfx, msg...) ____kpm_trace(3, __FILE__, pfx, msg)
50 | #define kpm_info(msg...) __kpm_info(" ", msg)
51 |
52 | #define __kpm_dbg(pfx, msg...) ____kpm_trace(4, __FILE__, pfx, msg)
53 | #define kpm_dbg(msg...) __kpm_dbg(" ", msg)
54 |
55 | #define __kpm_trace(pfx, msg...) ____kpm_trace(5, __FILE__, pfx, msg)
56 | #define kpm_trace(msg...) __kpm_trace(" ", msg)
57 |
58 | #endif
59 |
--------------------------------------------------------------------------------
/cpu_stat.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Jakub Kicinski */
3 | /* Copyright Meta Platforms, Inc. and affiliates */
4 |
5 | /**
6 |  * DOC: CPU utilization stats
7 |  *
8 |  * Linux CPU use stats read / parsed from procfs.
9 |  *
10 |  * Linux counts CPU use starting at boot, in jiffies, so we provide helpers
11 |  * to measure CPU use over a period of time and convert it to a percentage.
12 |  *
13 |  * All functions take ncpus as returned by get_nprocs_conf(); pass 0 if you
14 |  * don't have the get_nprocs_conf() value cached.
15 |  *
16 |  * If a function returns a pointer to an array, that array is allocated
17 |  * on the heap and has to be explicitly freed. Arrays are sized to ncpus
18 |  * (or get_nprocs_conf()).
19 |  *
20 |  * Example:
21 |  *	struct cpu_stat *s1, *s2, *diffpct;
22 |  *
23 |  *	s1 = cpu_stat_snapshot(0);
24 |  *	sleep(1);
25 |  *	s2 = cpu_stat_snapshot(0);
26 |  *
27 |  *	// Calculate CPU use between when s1 and s2 were taken.
28 |  *	cpu_stat_sub(s2, s1, 0);
29 |  *	diffpct = cpu_stat_to_pct00(s2, 0);
30 |  *
31 |  *	// Print percentage of time spent in user context (for CPU i).
32 |  *	printf("usr:%2llu.%02llu\n",
33 |  *	       diffpct[i].user / 100, diffpct[i].user % 100);
34 |  */
35 | struct cpu_stat {
36 | 	unsigned int cpu_id;	/* CPU id, not a stat */
37 | 	unsigned long long int user;	/* sum of user and nice */
38 | 	unsigned long long int system;
39 | 	unsigned long long int idle;
40 | 	unsigned long long int iowait;
41 | 	unsigned long long int irq;
42 | 	unsigned long long int sirq;
43 | };
44 |
45 | struct cpu_stat *cpu_stat_snapshot(int ncpus);
46 | /* convert stats to fractional format, fields multiplied by 10,000 */
47 | struct cpu_stat *cpu_stat_to_pct00(struct cpu_stat *src, int ncpus);
48 | void cpu_stat_sub(struct cpu_stat *dst, struct cpu_stat *op, int ncpus);
49 |
--------------------------------------------------------------------------------
/test/ksft.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # SPDX-License-Identifier: GPL-2.0
3 | # Copyright Meta Platforms, Inc. and affiliates
4 |
5 | """
6 | Script to run kperf.
7 | This needs to be copied into kernel selftest directory to run.
8 | It depends on kernel networking selftest infra and libraries.
9 | """
10 |
11 | import time
12 | import psutil
13 | from lib.py import ksft_run, ksft_exit
14 | from lib.py import NetDrvEpEnv
15 | from lib.py import bkg, cmd
16 |
17 |
18 | def kperf(cfg):
19 |     """ Run a bunch of kperf configs. Checking is expected to be manual. """
20 |     kpdr = "/home/kicinski/devel/kperf/"
21 |     s1 = bkg(kpdr + "server --no-daemon")
22 |     s2 = bkg(kpdr + "server --no-daemon --pid-file /tmp/kperf-remote.pid",
23 |              host=cfg.remote)
24 |
25 |     time.sleep(0.3)
26 |
27 |     fd_cnt = psutil.Process(s1.proc.pid).num_fds()
28 |     print("Server fd count at the start:", fd_cnt)
29 |
30 |     print(">>> Base run")
31 |     run = cmd(kpdr + f"client --src {cfg.addr} --dst {cfg.remote_addr} -t 10",
32 |               fail=False)
33 |     if run.stderr:
34 |         print("STDERR:", run.stderr)
35 |     print(run.stdout)
36 |
37 |     print(">>> pin-off 1")
38 |     run = cmd(kpdr + f"client --cpu-max 2 --src {cfg.addr} --dst {cfg.remote_addr} --pin-off 1 -t 10",
39 |               fail=False)
40 |     if run.stderr:
41 |         print("STDERR:", run.stderr)
42 |     print(run.stdout)
43 |
44 |     end_fd_cnt = psutil.Process(s1.proc.pid).num_fds()
45 |     print("Server fd count at the end:", end_fd_cnt)
46 |     if end_fd_cnt != fd_cnt:
47 |         print(f"ERROR!!! (was {fd_cnt} at init)")
48 |         print(cmd("lsof -p " + str(s1.proc.pid)).stdout)
49 |
50 |     s1.process(terminate=True, fail=False)
51 |     s2.process(terminate=True, fail=False)
52 |
53 |     print(s1.stderr, s1.stdout)
54 |     print(s2.stderr, s2.stdout)
55 |
56 |
57 |
58 | def main() -> None:
59 |     """ Ksft boiler plate main """
60 |
61 |     with NetDrvEpEnv(__file__) as cfg:
62 |         ksft_run([kperf],
63 |                  args=(cfg, ))
64 |     ksft_exit()
65 |
66 |
67 | if __name__ == "__main__":
68 |     main()
69 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 | # Copyright Meta Platforms, Inc.
and affiliates
3 |
4 | CCAN_PATH := ./ccan
5 | YNL_PATH := ./ynl-c
6 | LIBURING_PATH := ./liburing
7 |
8 | CC=gcc
9 | CFLAGS=-std=gnu99 -I$(CCAN_PATH) -O2 -W -Wall -Wextra -Wno-unused-parameter -Wshadow -DDEBUG -g
10 | CFLAGS += -I$(YNL_PATH)/include/
11 | CFLAGS += -I$(LIBURING_PATH)/src/include/
12 |
13 | ifeq ("$(DEBUG)","1")
14 | CFLAGS += -g -fsanitize=address -fsanitize=leak -static-libasan
15 | endif
16 |
17 | LIBS=-lm -L$(CCAN_PATH) -pthread -lccan
18 | LIBS += -L$(YNL_PATH) -lynl
19 | LIBS += $(LIBURING_PATH)/src/liburing.a
20 |
21 | ifdef USE_CUDA
22 | CFLAGS += -I/usr/local/cuda/include/ -DUSE_CUDA
23 | endif
24 |
25 | include $(wildcard *.d)
26 |
27 | all: server client units
28 | units: bipartite_match cpu_stat
29 |
30 | ifdef USE_CUDA
31 | server: LIBS += -lcuda -lcudart -L/usr/local/cuda/lib64
32 | endif
33 |
34 | server: $(CCAN_PATH)/libccan.a $(YNL_PATH)/libynl.a $(LIBURING_PATH)/src/liburing.a server.o server_session.o proto.o epoll.o iou.o worker.o devmem.o cpu_stat.o tcp.o
35 | 	$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
36 |
37 | client: $(CCAN_PATH)/libccan.a client.o proto.o bipartite_match.o
38 | 	$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
39 |
40 | $(CCAN_PATH)/libccan.a:
41 | 	make -C $(CCAN_PATH)/
42 | 	ar rcs $(CCAN_PATH)/libccan.a $(CCAN_PATH)/ccan/*/*.o
43 |
44 | $(YNL_PATH)/libynl.a:
45 | 	make -C $(YNL_PATH)
46 |
47 | $(LIBURING_PATH)/src/liburing.a:
48 | 	@cd $(LIBURING_PATH) && ./configure --cc=$(CC)
49 | 	make -C $(LIBURING_PATH)
50 |
51 | clean:
52 | 	rm -rf *.o *.d *~ bipartite_match cpu_stat
53 |
54 | distclean:
55 | 	rm -rf *.o *.d *~ bipartite_match cpu_stat server client $(CCAN_PATH)/libccan.a
56 | 	make clean -C $(LIBURING_PATH)
57 |
58 | bipartite_match: $(CCAN_PATH)/libccan.a
59 | 	$(CC) $(CFLAGS) -DKPERF_UNITS bipartite_match.c -o bipartite_match $(CCAN_PATH)/libccan.a
60 |
61 | cpu_stat: $(CCAN_PATH)/libccan.a
62 | 	$(CC) $(CFLAGS) -DKPERF_UNITS cpu_stat.c -o cpu_stat $(CCAN_PATH)/libccan.a
63 |
64 | %.o: %.c
65 | 	$(COMPILE.c) -MMD -o $@ $<
66 |
67 | .PHONY: all clean units ccan distclean
68 | .DEFAULT_GOAL=all
69 |
--------------------------------------------------------------------------------
/bipartite_match.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Jakub Kicinski */
3 | /* Copyright Meta Platforms, Inc. and affiliates */
4 |
5 | #ifndef BIPARTITE_MATCH
6 | #define BIPARTITE_MATCH
7 |
8 | #include <stdbool.h>
9 |
10 | struct bim_state;
11 |
12 | /**
13 |  * DOC: Bipartite Match
14 |  *
15 |  * Find a matching in a bipartite graph.
16 |  *
17 |  * Number of nodes does not need to be known upfront. Duplicate edges
18 |  * are ignored. Designed for incremental growth of the graph; use
19 |  * bim_match_size() to check the number of pairings with the current edge set.
20 |  *
21 |  * Example:
22 |  *	struct bim_state *bim;
23 |  *	struct bim_edge m;
24 |  *
25 |  *	bim = bim_init();
26 |  *	while ...
27 |  *		// Add edge to the graph
28 |  *		bim_add_edge(bim, left_id, right_id, priv);
29 |  *
30 |  *	// Dump matches
31 |  *	bim_for_each_match(bim, &m)
32 |  *		printf("Match %d - %d, %p\n", m.left_id, m.right_id, m.cookie);
33 |  *	bim_destroy(bim);
34 |  */
35 |
36 | /* Graph init / destroy */
37 | struct bim_state *bim_init(void);
38 | void bim_destroy(struct bim_state *bim);
39 |
40 | /* Optional, size the state to avoid reallocation, pass 0s to compact */
41 | void bim_resize(struct bim_state *bim,
42 | 		unsigned int max_left, unsigned int max_right);
43 |
44 | /* Populating edges */
45 | bool bim_add_edge(struct bim_state *bim,
46 | 		  unsigned int left_id, unsigned int right_id, void *cookie);
47 | unsigned int bim_match_size(struct bim_state *bim);
48 |
49 | /* Walk pairings and edges */
50 | struct bim_edge {
51 | 	unsigned int left_id;
52 | 	unsigned int right_id;
53 | 	void *cookie;
54 | 	bool is_match;
55 | 	/* Walker's state, don't overwrite */
56 | 	unsigned long long _walker;
57 | };
58 |
59 | void bim_walk_init(struct bim_edge *edge);
60 | bool bim_edge_walk_next(struct bim_state *bim, struct bim_edge *edge);
61 | bool bim_match_walk_next(struct bim_state *bim, struct bim_edge *match);
62 |
63 | #define bim_for_each_match(bim, match) \
64 | 	for (bim_walk_init(match); bim_match_walk_next(bim, match); )
65 |
66 | #define bim_for_each_edge(bim, match) \
67 | 	for (bim_walk_init(match); bim_edge_walk_next(bim, match); )
68 |
69 | #endif
70 |
--------------------------------------------------------------------------------
/worker.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef WORKER_H
5 | #define WORKER_H 1
6 |
7 | #include <linux/tcp.h>
8 |
9 | #include <ccan/list/list.h>
10 | #include <ccan/time/time.h>
11 |
12 | #include "proto.h"
13 | #include "server.h"
14 |
15 | /* Main worker state AKA self */
16 | struct worker_state {
17 | 	int main_sock;
18 | 	struct worker_opts opts;
19 | 	int epollfd;
20 | 	unsigned int id;
21 | 	int quit;
22 | 	int ended;
23 | 	struct kpm_test *test;
24 | 	struct cpu_stat *cpu_start;
25 | 	struct timemono test_start;
26 | 	struct timemono prev_loop;
27 | 	unsigned int test_len_msec;
28 | 	struct list_head connections;
29 | 	const struct io_ops *ops;
30 | 	void *io_state;
31 | };
32 |
33 | struct worker_connection {
34 | 	unsigned int id;
35 | 	int fd;
36 | 	unsigned int read_size;
37 | 	unsigned int write_size;
38 | 	__u64 to_send;
39 | 	__u64 to_send_comp;
40 | 	__u64 to_recv;
41 | 	__u64 tot_sent;
42 | 	__u64 tot_recv;
43 | 	unsigned char *rxbuf;
44 |
45 | 	/* zero copy receive */
46 | 	size_t rsize;
47 | 	void *raddr;
48 | 	void *addr;
49 |
50 | 	struct connection_devmem devmem;
51 | 	struct kpm_test_spec *spec;
52 | 	struct tcp_info init_info;
53 | 	union {
54 | 		struct {
55 | 			unsigned int reqs;
56 | 			unsigned int hist[33];
57 | 			unsigned int log_len;
58 | 			unsigned int log_len_max;
59 | 			unsigned int *log;
60 | 		} rr;
61 | 	};
62 | 	struct list_node connections;
63 | };
64 |
65 | struct io_ops {
66 | 	void (*prep)(struct worker_state *state);
67 | 	void (*wait)(struct worker_state *state, int msec);
68 | 	void (*conn_add)(struct worker_state *state, struct worker_connection *conn);
69 | 	void (*conn_close)(struct worker_state *state, struct worker_connection *conn);
70 | 	void (*exit)(struct worker_state *state);
71 | };
72 |
73 | void worker_handle_proto(struct worker_state *self, struct kpm_header *hdr);
74 | void worker_kill_conn(struct worker_state *self, struct worker_connection *conn);
75 | void
worker_send_finished(struct worker_state *self, struct worker_connection *conn);
76 | void worker_recv_finished(struct worker_state *self, struct worker_connection *conn);
77 |
78 | #endif /* WORKER_H */
79 |
--------------------------------------------------------------------------------
/server.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef SERVER_H
5 | #define SERVER_H 1
6 |
7 | #include <stdbool.h>
8 | #include <net/if.h>
9 | #include <netinet/in.h>
10 |
11 | #include <sys/types.h>
12 | #include <linux/types.h>
13 |
14 | #include <ccan/list/list.h>
15 |
16 | #include "proto.h"
17 |
18 | #ifdef USE_CUDA
19 | #include <cuda.h>
20 | #endif
21 |
22 | #define PATTERN_PERIOD 255
23 |
24 | struct server_session {
25 | 	int cfd;
26 | 	pid_t pid;
27 | 	struct list_node sessions;
28 | };
29 |
30 | #ifndef MSG_SOCK_DEVMEM
31 | #define MSG_SOCK_DEVMEM 0x2000000
32 | #define SO_DEVMEM_LINEAR 78
33 | #define SO_DEVMEM_DMABUF 79
34 | #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF
35 | #define SO_DEVMEM_DONTNEED 80
36 | #endif
37 |
38 | struct dmabuf_cmsg {
39 | 	__u64 frag_offset;
40 | 	__u32 frag_size;
41 | 	__u32 frag_token;
42 | 	__u32 dmabuf_id;
43 | 	__u32 flags;
44 | };
45 |
46 | struct dmabuf_token {
47 | 	__u32 token_start;
48 | 	__u32 token_count;
49 | };
50 |
51 | #ifdef USE_CUDA
52 | struct memory_buffer_cuda {
53 | 	CUcontext ctx;
54 | };
55 | #endif
56 |
57 | struct memory_buffer {
58 | 	char *buf_mem;
59 | 	size_t size;
60 | 	int fd;
61 | 	int devfd;
62 | 	int memfd;
63 | 	int dmabuf_id;
64 | 	void *priv;
65 | #ifdef USE_CUDA
66 | 	struct memory_buffer_cuda cuda;
67 | #endif
68 | 	enum memory_provider_type provider;
69 | };
70 |
71 | struct memory_provider {
72 | 	int (*dev_init)(struct pci_dev *dev);
73 | 	struct memory_buffer *(*alloc)(size_t size);
74 | 	void (*free)(struct memory_buffer *mem);
75 | 	void (*memcpy_to_device)(struct memory_buffer *dst, size_t off,
76 | 				 void *src, int n);
77 | 	void (*memcpy_from_device)(void *dst, struct memory_buffer *src,
78 | 				   size_t off, int n);
79 | };
80 |
81 | struct connection_devmem {
82 | 	struct dmabuf_token rxtok[128];
83 | 	int rxtok_len;
84 | 	/* ncdevmem uses 80k, allocate 64k for recvmsg tokens */
85 | 	char ctrl_data[64 * 1024];
86 | };
87 |
88 | struct session_state_devmem {
89 | 	struct ynl_sock *ys;
90 | 	char ifname[IFNAMSIZ];
91 |
92 | 	/* RX */
93 | 	struct memory_buffer *mem;
94 | 	int rss_context;
95 |
96 | 	/* TX */
97 | 	struct memory_buffer *tx_mem;
98 | 	struct pci_dev tx_dev;
99 | 	__u32 dmabuf_tx_size_mb;
100 | 	enum memory_provider_type tx_provider;
101 | 	struct sockaddr_in6 addr;
102 | };
103 |
104 | struct worker_state_devmem {
105 | 	struct memory_buffer *mem;
106 | 	int dmabuf_id;
107 | };
108 |
109 | struct session_state_iou {
110 | 	unsigned int rx_size_mb;
111 | 	char ifname[IFNAMSIZ];
112 | 	int ifindex;
113 | 	int rss_context;
114 | 	int queue_id;
115 | };
116 |
117 | struct worker_state_iou {
118 | 	unsigned int rx_size_mb;
119 | 	int ifindex;
120 | 	int queue_id;
121 | };
122 |
123 | struct worker_opts {
124 | 	enum kpm_rx_mode rx_mode;
125 | 	enum kpm_tx_mode tx_mode;
126 | 	bool validate;
127 | 	bool use_iou;
128 | 	struct worker_state_devmem devmem;
129 | 	struct worker_state_iou iou;
130 | 	int fd;
131 | };
132 |
133 | struct server_session *
134 | server_session_spawn(int fd, struct sockaddr_in6 *addr, socklen_t *addrlen);
135 |
136 | void *worker_main(void *args);
137 |
138 | #endif /* SERVER_H */
139 |
--------------------------------------------------------------------------------
/tcp.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #include <stdio.h>
5 | #include <linux/tcp.h>
6 |
7 | #include "tcp.h"
8 |
9 | void print_tcp_info(struct tcp_info *ti)
10 | {
11 | 	printf("TCP stats\n"
12 | 	       "  %u %u %u %u %u %u %u %u %u %u\n"
13 | 	       "  %u %u %u %u %u %u %u %u %u\n"
14 | 	       "Times:   %u %u %u %u\n"
15 | 	       "Metrics: %u %u %u %u %u %u %u %u\n"
16 | 	       "rcv_rtt| %u %u %u\n"
17 | 	       "pacing_| %llu %llu %llu %llu\n"
18 | 	       "segs_ou| %u %u %u %u %u %u\n"
19 | 	       "de-ry_r| %llu %llu %llu %llu\n"
20 | 	       "de-ered| %u %u\n"
21 | 	       "bytes_s| %llu %llu\n"
22 | 	       "dsack_d| %u %u %u %u\n",
23 | 	       ti->tcpi_state,
24 | 	       ti->tcpi_ca_state,
25 | 	       ti->tcpi_retransmits,
26 | 	       ti->tcpi_probes,
27 | 	       ti->tcpi_backoff,
28 | 	       ti->tcpi_options,
29 | 	       ti->tcpi_snd_wscale,
30 | 	       ti->tcpi_rcv_wscale,
31 | 	       ti->tcpi_delivery_rate_app_limited,
32 | 	       ti->tcpi_fastopen_client_fail,
33 |
34 | 	       ti->tcpi_rto,
35 | 	       ti->tcpi_ato,
36 | 	       ti->tcpi_snd_mss,
37 | 	       ti->tcpi_rcv_mss,
38 |
39 | 	       ti->tcpi_unacked,
40 | 	       ti->tcpi_sacked,
41 | 	       ti->tcpi_lost,
42 | 	       ti->tcpi_retrans,
43 | 	       ti->tcpi_fackets,
44 |
45 | 	       /* Times. */
46 | 	       ti->tcpi_last_data_sent,
47 | 	       ti->tcpi_last_ack_sent,
48 | 	       ti->tcpi_last_data_recv,
49 | 	       ti->tcpi_last_ack_recv,
50 |
51 | 	       /* Metrics. */
52 | 	       ti->tcpi_pmtu,
53 | 	       ti->tcpi_rcv_ssthresh,
54 | 	       ti->tcpi_rtt,
55 | 	       ti->tcpi_rttvar,
56 | 	       ti->tcpi_snd_ssthresh,
57 | 	       ti->tcpi_snd_cwnd,
58 | 	       ti->tcpi_advmss,
59 | 	       ti->tcpi_reordering,
60 |
61 | 	       ti->tcpi_rcv_rtt,
62 | 	       ti->tcpi_rcv_space,
63 |
64 | 	       ti->tcpi_total_retrans,
65 |
66 | 	       ti->tcpi_pacing_rate,
67 | 	       ti->tcpi_max_pacing_rate,
68 | 	       ti->tcpi_bytes_acked,    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
69 | 	       ti->tcpi_bytes_received, /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
70 | 	       ti->tcpi_segs_out,	/* RFC4898 tcpEStatsPerfSegsOut */
71 | 	       ti->tcpi_segs_in,	/* RFC4898 tcpEStatsPerfSegsIn */
72 |
73 | 	       ti->tcpi_notsent_bytes,
74 | 	       ti->tcpi_min_rtt,
75 | 	       ti->tcpi_data_segs_in,	/* RFC4898 tcpEStatsDataSegsIn */
76 | 	       ti->tcpi_data_segs_out,	/* RFC4898 tcpEStatsDataSegsOut */
77 |
78 | 	       ti->tcpi_delivery_rate,
79 |
80 | 	       ti->tcpi_busy_time,	/* Time (usec) busy sending data */
81 | 	       ti->tcpi_rwnd_limited,	/* Time (usec) limited by receive window */
82 | 	       ti->tcpi_sndbuf_limited,	/* Time (usec) limited by send buffer */
83 |
84 | 	       ti->tcpi_delivered,
85 | 	       ti->tcpi_delivered_ce,
86 |
87 | 	       ti->tcpi_bytes_sent,	/* RFC4898 tcpEStatsPerfHCDataOctetsOut */
88 | 	       ti->tcpi_bytes_retrans,	/* RFC4898 tcpEStatsPerfOctetsRetrans */
89 | 	       ti->tcpi_dsack_dups,	/* RFC4898 tcpEStatsStackDSACKDups */
90 | 	       ti->tcpi_reord_seen,	/* reordering events seen */
91 |
92 | 	       ti->tcpi_rcv_ooopack,	/* Out-of-order packets received */
93 |
94 | 	       ti->tcpi_snd_wnd		/* peer's advertised receive window
95 | 					 * after scaling (bytes) */
96 | 	       );
97 | }
98 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. SPDX-License-Identifier: BSD-3-Clause
2 |
3 | kperf
4 | =====
5 |
6 | kperf is an iperf/netperf replacement with more fine-grained worker
7 | control. Modern NICs have multiple Rx queues, and while iperf / netperf
8 | can bind to a CPU, they are not aware of which CPU is serving the Rx queue
9 | selected by the NIC for the flow. If the NIC does not support flow steering
10 | this is a problem.
kperf asks the kernel which CPU is used for Rx and can
11 | bind itself appropriately (same core, Rx core + N, etc.). For parallel runs
12 | it can also make sure that the flows are not colliding (being served by
13 | the same CPU).
14 |
15 | Other strengths include:
16 | - RPC-like traffic (unlike iperf);
17 | - kTLS support (just data, no control records);
18 | - more stats (TCP, latency, CPU use).
19 |
20 | That said, kperf is more of a hackable library than a ready-to-use Swiss
21 | army knife. An example client application is provided, but the number
22 | of configurations is so high that it seems impossible to write a comprehensive
23 | client controlled solely by command line options.
24 |
25 | High level design
26 | -----------------
27 |
28 | The Client does not generate any traffic; it only orchestrates load between
29 | Servers.
30 |
31 | When a Client connects to a Server, the Server spawns a Session, which is what
32 | the Client controls on the server side. There can be multiple concurrent
33 | Sessions within one Server; there are no limitations. Note that a Session
34 | is between the Client and one Server, but it can contain connections to many
35 | other Sessions. Each Session is a separate process.
36 |
37 | A Session can establish Connections with other Sessions.
38 |
39 | A Session can spawn Workers, which is what drives the IO.
40 |
41 | Connections are established within Sessions, not Workers, because Connections
42 | are usually assigned to Workers only once it's known which CPU a given
43 | connection lands on.
44 |
45 | Currently only Process Workers are supported (each worker is a separate
46 | process); adding threads should not be a problem but has not been needed so far::
47 |
48 |                                 .--------.
49 |                            .----| Client |----.
50 |                            |    '--------'    |
51 |                            |                  |
52 |      ----------------------|-------      -----|----------------------
53 |                            v      |      |    v
54 |       .--------.     .---------.  |      |  .---------.     .--------.
55 |       | Server |-----| Session |  |      |  | Session |-----| Server |
56 |       '--------'     '---------'  |      |  '---------'     '--------'
57 |                           |       |      |       |
58 |                           v       |      |       v
59 |                      .---------.  |      |  .---------.
60 |                      | Worker  |  |      |  | Worker  |
61 |                      '---------'  |      |  '---------'
62 |     Host A           .---------.  |      |  .---------.           Host B
63 |                      | Worker  |  |      |  | Worker  |
64 |                      '---------'  |      |  '---------'
65 |                      .---------.  |      |  .---------.
66 |                      | Worker  |  |      |  | Worker  |
67 |                      '---------'  |      |  '---------'
68 |                                   |      |
69 |
70 | Contributing
71 | ------------
72 |
73 | Please refer to relevant details in the `license`_, `code of conduct`_,
74 | and `contributing guide`_.
75 |
76 | .. _license: LICENSE
77 | .. _code of conduct: CODE_OF_CONDUCT.md
78 | .. _contributing guide: CONTRIBUTING.md
79 |
80 | Per Meta's policies, contributors are required to submit a CLA.
81 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to make participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 |
55 | This Code of Conduct also applies outside the project spaces when there is a
56 | reasonable belief that an individual's behavior may have a negative impact on
57 | the project or its community.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported by contacting the project team at <opensource-conduct@fb.com>. All
63 | complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 |
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 |
72 | ## Attribution
73 |
74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76 |
77 | [homepage]: https://www.contributor-covenant.org
78 |
79 | For answers to common questions about this code of conduct, see
80 | https://www.contributor-covenant.org/faq
--------------------------------------------------------------------------------
/cpu_stat.c:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Jakub Kicinski */
3 | /* Copyright Meta Platforms, Inc. and affiliates */
4 |
5 | #include <stdio.h>
6 | #include <stdlib.h>
7 | #include <unistd.h>
8 | #include <sys/sysinfo.h>
9 |
10 | #include "cpu_stat.h"
11 |
12 | /* Skip the rest of the line */
13 | static void cpu_stat_nl(FILE *fp)
14 | {
15 | 	int c;	/* int, not char - getc() returns int and we compare to EOF */
16 |
17 | 	do {
18 | 		c = getc(fp);
19 | 	} while (c != '\n' && c != EOF);
20 | }
21 |
22 | struct cpu_stat *cpu_stat_snapshot(int ncpus)
23 | {
24 | 	struct cpu_stat *stats;
25 | 	FILE *fp;
26 | 	int i;
27 |
28 | 	if (!ncpus)
29 | 		ncpus = get_nprocs_conf();
30 | 	if (ncpus < 1)
31 | 		return NULL;
32 |
33 | 	stats = calloc(ncpus, sizeof(*stats));
34 | 	if (!stats)
35 | 		return NULL;
36 |
37 | 	fp = fopen("/proc/stat", "r");
38 | 	if (!fp)
39 | 		goto err_free;
40 |
41 | 	/* skip first line */
42 | 	cpu_stat_nl(fp);
43 |
44 | 	for (i = 0; i < ncpus; i++) {
45 | 		unsigned long long int nice;
46 |
47 | 		fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu %llu",
48 | 		       &stats[i].cpu_id,
49 | 		       &stats[i].user, &nice,
50 | 		       &stats[i].system,
51 | 		       &stats[i].idle,
52 | 		       &stats[i].iowait,
53 | 		       &stats[i].irq,
54 | 		       &stats[i].sirq);
55 | 		stats[i].user += nice;
56 | 		cpu_stat_nl(fp);
57 | 	}
58 |
59 | 	fclose(fp);
60 | 	return stats;
61 |
62 | err_free:
63 | 	free(stats);
64 | 	return NULL;
65 | }
66 |
67 | /* dst -= op; */
68 | void cpu_stat_sub(struct cpu_stat *dst, struct cpu_stat *op, int ncpus)
69 | {
70 | 	int i;
71 |
72 | 	if (!ncpus)
73 | 		ncpus = get_nprocs_conf();
74 |
75 | 	for (i = 0; i < ncpus; i++) {
76 | 		dst[i].user -= op[i].user;
77 | 		dst[i].system -= op[i].system;
78 | 		dst[i].idle -= op[i].idle;
79 | 		dst[i].iowait -= op[i].iowait;
80 | 		dst[i].irq -= op[i].irq;
81 | 		dst[i].sirq -= op[i].sirq;
82 | 	}
83 | }
84 |
85 | struct cpu_stat *cpu_stat_to_pct00(struct cpu_stat *src, int ncpus)
86 | {
87 | 	struct cpu_stat *pct;
88 | 	int i;
89 |
90 | 	if (!ncpus)
91 | 		ncpus = get_nprocs_conf();
92 | 	if (ncpus < 1)
93 | 		return NULL;
94 |
95 | 	pct = calloc(ncpus, sizeof(*pct));
96 | 	if (!pct)
97 | 		return NULL;
98 |
99 | 	for (i = 0; i < ncpus; i++) {
100 | 		unsigned long long int total;
101 |
102 | 		total = src[i].user + src[i].system + src[i].idle +
103 | 			src[i].iowait + src[i].irq + src[i].sirq;
104 |
105 | 		pct[i].cpu_id = src[i].cpu_id;
106 | 		pct[i].user = src[i].user * 10000 / total;
107 | 		pct[i].system = src[i].system * 10000 / total;
108 | 		pct[i].idle = src[i].idle * 10000 / total;
109 | 		pct[i].iowait = src[i].iowait * 10000 / total;
110 | 		pct[i].irq = src[i].irq * 10000 / total;
111 | 		pct[i].sirq = src[i].sirq * 10000 / total;
112 | 	}
113 |
114 | 	return pct;
115 | }
116 |
117 | #ifdef KPERF_UNITS
118 | int main()
119 | {
120 | 	struct cpu_stat *stats1, *stats2;
121 | 	struct cpu_stat *totpct, *diffpct;
122 | 	int i;
123 |
124 | 	stats1 = cpu_stat_snapshot(0);
125 | 	sleep(1);
126 | 	stats2 = cpu_stat_snapshot(0);
127 | 	totpct = cpu_stat_to_pct00(stats2, 0);
128 |
129 | 	cpu_stat_sub(stats2, stats1, 0);
130 | 	diffpct = cpu_stat_to_pct00(stats2, 0);
131 |
132 | 	for (i = 0;
i < get_nprocs_conf(); i++) {
133 | 		printf("%u/%u: usr:%2llu sys:%2llu idl:%2llu\n",
134 | 		       i, stats1[i].cpu_id,
135 | 		       stats1[i].user,
136 | 		       stats1[i].system,
137 | 		       stats1[i].idle);
138 | 		printf("%u/%u: usr:%2llu.%02llu sys:%2llu.%02llu idl:%2llu.%02llu\n",
139 | 		       i, totpct[i].cpu_id,
140 | 		       totpct[i].user / 100, totpct[i].user % 100,
141 | 		       totpct[i].system / 100, totpct[i].system % 100,
142 | 		       totpct[i].idle / 100, totpct[i].idle % 100);
143 | 	}
144 | 	free(totpct);
145 | 	free(diffpct);
146 | 	free(stats1);
147 | 	free(stats2);
148 |
149 | 	return 0;
150 | }
151 | #endif
152 |
--------------------------------------------------------------------------------
/server.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #include <arpa/inet.h>
5 | #include <errno.h>
6 | #include <fcntl.h>
7 | #include <netdb.h>
8 | #include <signal.h>
9 | #include <stdio.h>
10 | #include <stdlib.h>
11 | #include <string.h>
12 | #include <sys/select.h>
13 | #include <sys/wait.h>
14 | #include <unistd.h>
15 |
16 | #include <ccan/daemonize/daemonize.h>
17 | #include <ccan/err/err.h>
18 | #include <ccan/list/list.h>
19 | #include <ccan/net/net.h>
20 | #include <ccan/opt/opt.h>
21 |
22 | #include "server.h"
23 | #include "proto_dbg.h"
24 |
25 | int verbose = 3;
26 |
27 | static struct {
28 | 	char *addr;
29 | 	char *service;
30 | 	char *pid_file;
31 | 	bool kill;
32 | 	bool server;
33 | } opt = {
34 | 	.server = true,
35 | 	.service = "18323",
36 | 	.pid_file = "/tmp/kperf.pid",
37 | };
38 |
39 | static const struct opt_table opts[] = {
40 | 	OPT_WITH_ARG("--addr|-a <arg>", opt_set_charp, opt_show_charp,
41 | 		     &opt.addr, "Bind to specific control address"),
42 | 	OPT_WITH_ARG("--port|-p <arg>", opt_set_charp, opt_show_charp,
43 | 		     &opt.service, "Set control port/service to listen on"),
44 | 	OPT_WITHOUT_ARG("--no-daemon", opt_set_invbool, &opt.server,
45 | 			"Don't start a daemon"),
46 | 	OPT_WITH_ARG("--pid-file <arg>", opt_set_charp, opt_show_charp,
47 | 		     &opt.pid_file, "Set daemon identity / pid file"),
48 | 	OPT_WITHOUT_ARG("--kill", opt_set_bool, &opt.kill, "Stop the daemon"),
49 | 	OPT_WITHOUT_ARG("--verbose|-v", opt_inc_intval, &verbose,
50 | 			"Verbose mode (can be specified more than once)"),
51 | 	OPT_WITHOUT_ARG("--usage|--help|-h", opt_usage_and_exit,
52 | 			"kperf server", "Show this help message"),
53 | 	OPT_ENDTABLE
54 | };
55 |
56 | static volatile sig_atomic_t chld;
57 |
58 | static void chld_sig_handler(int sig)
59 | {
60 | 	chld = 1;
61 | }
62 |
63 | static struct list_head sessions = LIST_HEAD_INIT(sessions);
64 |
65 | static void server_session_add(struct server_session *ses)
66 | {
67 | 	list_add(&sessions, &ses->sessions);
68 | }
69 |
70 | static void server_session_del(pid_t pid)
71 | {
72 | 	struct server_session *ses = NULL;
73 |
74 | 	list_for_each(&sessions, ses, sessions) {
75 | 		if (ses->pid == pid)
76 | 			break;
77 | 	}
78 | 	if (!ses || ses->pid != pid)
79 | 		return;
80 |
81 | 	list_del(&ses->sessions);
82 | 	free(ses);
83 | }
84 |
85 | static void server_reap_sessions(void)
86 | {
87 | 	if (!chld)
88 | 		return;
89 |
90 | 	while (true) {
91 | 		int status;
92 | 		pid_t pid;
93 |
94 | 		chld = 0;
95 | 		pid = waitpid(-1, &status, WNOHANG);
96 | 		if (pid < 1)
97 | 			break;
98 | 		server_session_del(pid);
99 | 	}
100 | }
101 |
102 | static void kill_old_daemon(void)
103 | {
104 | 	char buf[10];
105 | 	ssize_t n;
106 | 	pid_t pid;
107 | 	int fd;
108 |
109 | 	fd = open(opt.pid_file, O_RDONLY);
110 | 	if (fd < 0) {
111 | 		if (errno == ENOENT)
112 | 			return;
113 | 		err(2, "Failed to open PID file");
114 | 	}
115 |
116 | 	n = read(fd, buf, sizeof(buf));
117 | 	if (n < 0)
118 | 		err(2, "Failed to read PID file");
119 | 	if (!n || n == sizeof(buf))
120 | 		errx(2, "Bad pid
file len - %zd", n); 121 | buf[n] = 0; 122 | close(fd); 123 | 124 | pid = atoi(buf); 125 | 126 | if (kill(pid, SIGKILL)) 127 | if (errno != ESRCH) 128 | err(2, "Can't kill the old daemon"); 129 | 130 | if (unlink(opt.pid_file)) 131 | err(2, "Failed to remove pid file"); 132 | } 133 | 134 | static void server_daemonize(void) 135 | { 136 | char buf[10]; 137 | ssize_t n; 138 | int fd; 139 | 140 | fd = open(opt.pid_file, O_WRONLY | O_CREAT | O_EXCL, 00660); 141 | if (fd < 0) 142 | err(3, "Failed to create PID file"); 143 | 144 | if (!daemonize()) 145 | err(1, "can't daemonize"); 146 | 147 | n = snprintf(buf, sizeof(buf), "%d", getpid()); 148 | if (!n || n == sizeof(buf)) 149 | errx(3, "Bad pid file len - %zd", n); 150 | 151 | if (write(fd, buf, n) != n) 152 | err(3, "Short write to pid file"); 153 | close(fd); 154 | } 155 | 156 | /* same as net_server_lookup but accepts the node argument */ 157 | static struct addrinfo *net_server_lookup_node(const char *node, 158 | const char *service, 159 | int family, 160 | int socktype) 161 | { 162 | struct addrinfo *res, hints; 163 | 164 | memset(&hints, 0, sizeof(hints)); 165 | hints.ai_family = family; 166 | hints.ai_socktype = socktype; 167 | hints.ai_flags = AI_PASSIVE; 168 | hints.ai_protocol = 0; 169 | 170 | if (getaddrinfo(node, service, &hints, &res) != 0) 171 | return NULL; 172 | 173 | return res; 174 | } 175 | 176 | static void log_address(const char *format, struct sockaddr_in6 *sin6) 177 | { 178 | struct sockaddr_in *sin = (void *)sin6; 179 | char buf[256]; 180 | 181 | if (sin6->sin6_family == AF_INET6) 182 | inet_ntop(AF_INET6, &sin6->sin6_addr, buf, sizeof(buf)); 183 | else 184 | inet_ntop(AF_INET, &sin->sin_addr, buf, sizeof(buf)); 185 | 186 | kpm_info(format, buf); 187 | } 188 | 189 | static void print_listener(int *fds, int num_fds) 190 | { 191 | struct sockaddr_in6 sin6; 192 | socklen_t sa_len; 193 | int ret; 194 | int i; 195 | 196 | for (i = 0; i < num_fds; i++) { 197 | sa_len = sizeof(sin6); 198 | ret = getsockname(fds[i], (struct sockaddr *)&sin6, &sa_len); 199 | if (ret != 0) 200 | err(1, "Failed to look up address for fd %d", fds[i]); 201 | log_address("Bound to %s", &sin6); 202 | } 203 | } 204 | 205 | int main(int argc, char *argv[]) 206 | { 207 | int fds[2], i, num_fds, max_fd; 208 | struct addrinfo *addr; 209 | 210 | opt_register_table(opts, NULL); 211 | if (!opt_parse(&argc, argv, opt_log_stderr)) 212 | exit(1); 213 | 214 | err_set_progname(argv[0]); 215 | 216 | if (opt.server || opt.kill) 217 | kill_old_daemon(); 218 | if (opt.kill) 219 | return 0; 220 | 221 | if (opt.server) 222 | server_daemonize(); 223 | 224 | addr = net_server_lookup_node(opt.addr, opt.service, AF_UNSPEC, SOCK_STREAM); 225 | if (!addr) 226 | errx(1, "Failed to look up service to bind to"); 227 | 228 | num_fds = net_bind(addr, fds); 229 | freeaddrinfo(addr); 230 | if (num_fds < 1) 231 | err(1, "Failed to listen"); 232 | if (opt.addr) 233 | print_listener(fds, num_fds); 234 | 235 | max_fd = num_fds == 1 || fds[0] > fds[1] ? 
fds[0] : fds[1];
236 |
237 | 	signal(SIGCHLD, chld_sig_handler);
238 |
239 | 	while (true) {
240 | 		struct sockaddr_in6 sockaddr;
241 | 		struct server_session *ses;
242 | 		struct timeval tv;
243 | 		socklen_t addrlen;
244 | 		int cfd, fd, ret;
245 | 		fd_set rfds;
246 |
247 | 		FD_ZERO(&rfds);
248 | 		for (i = 0; i < num_fds; i++)
249 | 			FD_SET(fds[i], &rfds);
250 |
251 | 		tv.tv_sec = 1;
252 | 		tv.tv_usec = 0;
253 |
254 | 		ret = select(max_fd + 1, &rfds, NULL, NULL, &tv);
255 | 		if (ret < 0) {
256 | 			if (errno == EINTR && chld)
257 | 				goto reap_child;
258 | 			err(2, "Failed to select");
259 | 		} else if (!ret) {
260 | 			continue;
261 | 		}
262 |
263 | 		if (FD_ISSET(fds[0], &rfds))
264 | 			fd = fds[0];
265 | 		else if (num_fds > 1 && FD_ISSET(fds[1], &rfds))
266 | 			fd = fds[1];
267 | 		else
268 | 			errx(3, "Failed to find fd");
269 |
270 | 		addrlen = sizeof(sockaddr);
271 | 		cfd = accept(fd, (void *)&sockaddr, &addrlen);
272 | 		if (cfd < 0) {
273 | 			warn("Failed to accept");
274 | 			continue;
275 | 		}
276 |
277 | 		if (opt.addr)
278 | 			log_address("Accepted %s", &sockaddr);
279 | 		ses = server_session_spawn(cfd, &sockaddr, &addrlen);
280 | 		if (ses)
281 | 			server_session_add(ses);
282 | reap_child:
283 | 		server_reap_sessions();
284 | 	}
285 |
286 | 	return 0;
287 | }
288 |
--------------------------------------------------------------------------------
/proto.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef PROTO_H
5 | #define PROTO_H 1
6 |
7 | #include <stdbool.h>
8 | #include <stddef.h>
9 | #include <sys/socket.h>
10 | #include <netinet/in.h>
11 | #include <linux/types.h>
12 | #include <linux/tls.h>
13 |
14 | #define KPERF_MAX_CPUS 1024
15 |
16 | enum kpm_msg_type {
17 | 	KPM_MSG_TYPE_ERROR = 1,
18 | 	KPM_MSG_TYPE_ECHO,
19 | 	KPM_MSG_TYPE_HELLO,
20 | 	KPM_MSG_TYPE_SPAWN_WORKER,
21 | 	KPM_MSG_TYPE_PIN_WORKER,
22 | 	KPM_MSG_TYPE_OPEN_TCP_ACCEPTOR,
23 | 	KPM_MSG_TYPE_CONNECT,
24 | 	KPM_MSG_TYPE_DISCONNECT,
25 | 	KPM_MSG_TYPE_CONNECTION_ID,
26 | 	KPM_MSG_TYPE_TLS,
27 | 	KPM_MSG_TYPE_MAX_PACING,
28 | 	KPM_MSG_TYPE_TCP_CC,
29 | 	KPM_MSG_TYPE_MODE,
30 | 	KPM_MSG_TYPE_TEST,
31 | 	KPM_MSG_TYPE_TEST_RESULT,
32 | 	KPM_MSG_TYPE_END_TEST,
33 |
34 | 	KPM_MSG_WORKER_ID,
35 | 	KPM_MSG_WORKER_KILL,
36 | 	KPM_MSG_WORKER_TEST,
37 | 	KPM_MSG_WORKER_END_TEST,
38 | 	KPM_MSG_WORKER_TEST_RESULT,
39 |
40 | 	__KPM_MSG_TOTAL,
41 |
42 | 	KPM_MSG_REPLY = 0x8000
43 | };
44 |
45 | struct kpm_header {
46 | 	__u16 type;
47 | 	__u16 id;
48 | 	__u32 len;
49 | };
50 |
51 | struct kpm_empty {
52 | 	struct kpm_header hdr;
53 | };
54 |
55 | struct kpm_hello {
56 | 	struct kpm_header hdr;
57 | 	__u32 version;
58 | 	__u32 n_cpus;
59 | };
60 |
61 | struct __kpm_generic_u16 {
62 | 	struct kpm_header hdr;
63 | 	__u16 val;
64 | 	__u16 pad;
65 | };
66 |
67 | struct kpm_tcp_acceptor_reply {
68 | 	struct kpm_header hdr;
69 | 	socklen_t len;
70 | 	struct sockaddr_in6 addr;
71 | };
72 |
73 | struct __kpm_generic_u32 {
74 | 	struct kpm_header hdr;
75 | 	__u32 val;
76 | };
77 |
78 | struct kpm_reply_error {
79 | 	struct kpm_header hdr;
80 | 	__u16 type;
81 | 	__u16 error;
82 | };
83 |
84 | struct kpm_pin_worker {
85 | 	struct kpm_header hdr;
86 | 	__u32 worker_id;
87 | 	__u32 cpu;
88 | };
89 |
90 | struct kpm_connect {
91 | 	struct kpm_header hdr;
92 | 	socklen_t len;
93 | 	struct sockaddr_in6 addr;
94 | 	__u32 mss;
95 | };
96 |
97 | struct kpm_connect_reply {
98 | 	struct kpm_header hdr;
99 | 	struct {
100 | 		__u32 id;
101 | 		__u32 cpu;
102 | 		__u16 port;
103 | 	} local, remote;
104 | };
105 |
106 | struct kpm_connection_id {
107 | 	struct kpm_header hdr;
108 | 	__u32 id;
109
| __u32 cpu; 110 | }; 111 | 112 | struct kpm_max_pacing { 113 | struct kpm_header hdr; 114 | __u32 id; 115 | __u32 max_pacing; 116 | }; 117 | 118 | #define KPM_CC_NAME_LEN 16 119 | 120 | struct kpm_tcp_cc { 121 | struct kpm_header hdr; 122 | __u32 id; 123 | char cc_name[KPM_CC_NAME_LEN]; 124 | }; 125 | 126 | enum kpm_rx_mode { 127 | KPM_RX_MODE_SOCKET, 128 | KPM_RX_MODE_SOCKET_TRUNC, 129 | KPM_RX_MODE_SOCKET_ZEROCOPY, 130 | KPM_RX_MODE_DEVMEM, 131 | }; 132 | 133 | enum kpm_tx_mode { 134 | KPM_TX_MODE_SOCKET, 135 | KPM_TX_MODE_SOCKET_ZEROCOPY, 136 | KPM_TX_MODE_DEVMEM, 137 | }; 138 | 139 | enum memory_provider_type { 140 | MEMORY_PROVIDER_HOST, 141 | MEMORY_PROVIDER_CUDA, 142 | }; 143 | 144 | #define DEVICE_DOMAIN_ANY 0xffff 145 | #define DEVICE_BUS_ANY 0xff 146 | #define DEVICE_DEVICE_ANY 0xff 147 | 148 | struct pci_dev { 149 | __u16 domain; 150 | __u8 bus; 151 | __u8 device; 152 | }; 153 | 154 | struct kpm_mode { 155 | struct kpm_header hdr; 156 | enum kpm_rx_mode rx_mode; 157 | enum kpm_tx_mode tx_mode; 158 | 159 | /* devmem info */ 160 | enum memory_provider_type rx_provider; 161 | enum memory_provider_type tx_provider; 162 | struct pci_dev dev; 163 | __u32 dmabuf_rx_size_mb; 164 | __u32 dmabuf_tx_size_mb; 165 | __u32 num_rx_queues; 166 | struct sockaddr_in6 addr; 167 | 168 | __u8 validate; 169 | __u8 iou; 170 | __u32 iou_rx_size_mb; 171 | }; 172 | 173 | enum kpm_tls_mask { 174 | KPM_TLS_ULP = 1, 175 | KPM_TLS_TX = 2, 176 | KPM_TLS_RX = 4, 177 | KPM_TLS_NOPAD = 8, 178 | }; 179 | 180 | struct kpm_tls { 181 | struct kpm_header hdr; 182 | __u32 connection_id; 183 | __u32 dir_mask; 184 | socklen_t len; 185 | union { 186 | struct tls12_crypto_info_aes_gcm_128 aes128; 187 | } info; 188 | }; 189 | 190 | struct kpm_end_test { 191 | struct kpm_header hdr; 192 | __u32 id; 193 | }; 194 | 195 | enum kpm_test_type { 196 | KPM_TEST_TYPE_STREAM = 1, 197 | KPM_TEST_TYPE_RR, 198 | }; 199 | 200 | #define KPM_DFL_OP_CHUNK (1 << 16) 201 | #define KPM_MAX_OP_CHUNK (1 << 27) 202 | 203 | struct kpm_test { 204 | struct kpm_header hdr; 205 | __u8 active; 206 | __u8 pad; 207 | __u16 time_sec; 208 | __u32 n_conns; 209 | __u32 test_id; 210 | struct kpm_test_spec { 211 | __u32 connection_id; 212 | __u32 worker_id; 213 | enum kpm_test_type type; 214 | __u32 read_size; 215 | __u32 write_size; 216 | union kpm_test_arg { 217 | struct { 218 | __u32 req_size; 219 | __u32 resp_size; 220 | __u8 timings; 221 | } rr; 222 | } arg; 223 | } specs[0]; 224 | }; 225 | 226 | struct kpm_test_results { 227 | struct kpm_header hdr; 228 | __u32 time_usec; 229 | __u32 n_conns; 230 | __u32 test_id; 231 | struct kpm_cpu_load { 232 | __u16 id; 233 | __u16 user; /* sum of user and nice */ 234 | __u16 system; 235 | __u16 idle; 236 | __u16 iowait; 237 | __u16 irq; 238 | __u16 sirq; 239 | } cpu_load[KPERF_MAX_CPUS]; 240 | struct kpm_test_result { 241 | __u32 connection_id; 242 | __u32 worker_id; 243 | enum kpm_test_type type; 244 | __u64 rx_bytes; 245 | __u64 tx_bytes; 246 | 247 | __u32 reqs; 248 | 249 | __u32 retrans; 250 | __u32 reord_seen; 251 | __u32 rtt; 252 | __u32 rttvar; 253 | __u32 delivered_ce; 254 | __u32 snd_wnd; 255 | __u32 snd_cwnd; 256 | 257 | __u32 lat_hist[33]; 258 | __u32 p25; 259 | __u32 p50; 260 | __u32 p90; 261 | __u32 p99; 262 | __u32 p999; 263 | __u32 p9999; 264 | } res[0]; 265 | }; 266 | 267 | #define kpm_good_req(msg, msg_type) \ 268 | ({ \ 269 | struct kpm_header *_hdr = &(msg)->hdr; \ 270 | int _ret; \ 271 | \ 272 | _ret = _hdr->type == (msg_type) && \ 273 | _hdr->len == sizeof(*msg); \ 274 | _ret; \ 275 | }) 276 | 
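/* Example: a synchronous request/reply round trip as a client of this
 * protocol might perform it. Illustrative sketch only -- the
 * KPM_MSG_TYPE_END_TEST request and the __kpm_generic_u32 reply layout
 * are stand-ins (each request type defines what reply, if any, it
 * expects), and error handling is abbreviated. kpm_send_u32() stamps a
 * fresh sequence number and returns it (or -1); kpm_receive() returns a
 * malloc()ed message that the caller must free.
 *
 *	struct __kpm_generic_u32 *reply;
 *	int seq;
 *
 *	seq = kpm_send_u32(fd, KPM_MSG_TYPE_END_TEST, test_id);
 *	if (seq < 0)
 *		return -1;
 *	reply = kpm_receive(fd);
 *	if (!reply || !kpm_good_reply(reply, KPM_MSG_TYPE_END_TEST, seq)) {
 *		free(reply);
 *		return -1;
 *	}
 *	// use reply->val
 *	free(reply);
 */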
277 | #define kpm_good_reply(msg, msg_type, seq) \ 278 | ({ \ 279 | struct kpm_header *_hdr = &(msg)->hdr; \ 280 | int _ret; \ 281 | \ 282 | _ret = _hdr->type == ((msg_type) | KPM_MSG_REPLY) && \ 283 | _hdr->id == (seq) && \ 284 | _hdr->len == sizeof(*msg); \ 285 | _ret; \ 286 | }) 287 | 288 | void *kpm_msg_dup(struct kpm_header *hdr); 289 | 290 | void *kpm_receive(int fd); 291 | 292 | int kpm_send(int fd, struct kpm_header *msg, size_t size, 293 | enum kpm_msg_type type); 294 | int kpm_send_empty(int fd, enum kpm_msg_type type); 295 | int kpm_send_u32(int fd, enum kpm_msg_type type, __u32 arg); 296 | 297 | int kpm_send_conn_id(int fd, __u32 id, __u32 cpu); 298 | int kpm_send_connect(int fd, struct sockaddr_in6 *addr, socklen_t len, 299 | __u32 mss); 300 | int kpm_send_tls(int fd, __u32 conn_id, __u32 dir_mask, 301 | void *info, socklen_t len); 302 | int kpm_send_max_pacing(int fd, __u32 id, __u32 max_pace); 303 | int kpm_send_tcp_cc(int fd, __u32 id, char *cc_name); 304 | int kpm_send_mode(int fd, struct kpm_mode *mode); 305 | int kpm_send_pin_worker(int fd, __u32 id, __u32 cpu); 306 | 307 | void kpm_reply_error(int fd, struct kpm_header *hdr, __u16 error); 308 | 309 | int kpm_reply_empty(int fd, struct kpm_header *hdr); 310 | int kpm_reply_u16(int fd, struct kpm_header *hdr, __u16 arg); 311 | int kpm_reply_u32(int fd, struct kpm_header *hdr, __u32 arg); 312 | 313 | int kpm_reply_acceptor(int fd, struct kpm_header *hdr, 314 | struct sockaddr_in6 *addr, socklen_t len); 315 | int kpm_reply_connect(int fd, struct kpm_header *hdr, 316 | __u32 local_id, __u32 local_cpu, __u16 local_port, 317 | __u32 remote_id, __u32 remote_cpu, __u16 remote_port); 318 | 319 | int kpm_xchg_hello(int fd, unsigned int *ncpus); 320 | 321 | int kpm_req_tcp_sock(int fd, struct sockaddr_in6 *addr, socklen_t *len); 322 | int kpm_req_end_test(int fd, __u32 test_id); 323 | int kpm_req_tls(int fd, __u32 conn_id, __u32 dir_mask, 324 | void *info, socklen_t len); 325 | int kpm_req_pacing(int fd, __u32 conn_id, __u32 max_pace); 326 | int kpm_req_tcp_cc(int fd, __u32 conn_id, char *cc_name); 327 | int kpm_req_mode(int fd, struct kpm_mode *mode); 328 | int kpm_req_disconnect(int fd, __u32 connection_id); 329 | 330 | #endif /* PROTO_H */ 331 | -------------------------------------------------------------------------------- /bipartite_match.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Jakub Kicinski */ 3 | /* Copyright Meta Platforms, Inc. 
and affiliates */
4 |
5 | #include <stdbool.h>
6 | #include <stdlib.h>
7 | #include <string.h>
8 |
9 | #include <ccan/array_size/array_size.h>
10 | #include <ccan/compiler/compiler.h>
11 | #include <ccan/minmax/minmax.h>
12 | #include <ccan/tal/tal.h>
13 |
14 | #include "bipartite_match.h"
15 |
16 | #define INIT_STATE_SIZE 8
17 |
18 | struct bim_state {
19 | 	/* Sizing allocated memory */
20 | 	unsigned int max_left;
21 | 	unsigned int max_right;
22 | 	/* Max vertex indexes seen */
23 | 	unsigned int max_used_left;
24 | 	unsigned int max_used_right;
25 |
26 | 	/* Arrays indexed by left_id */
27 | 	unsigned int **left_neigh;
28 | 	void ***left_neigh_cookie;
29 | 	unsigned int *n_left_neigh;
30 | 	int *left_match;
31 |
32 | 	/* Arrays indexed by right_id */
33 | 	int *right_match;
34 |
35 | 	/* Recursion state */
36 | 	struct {
37 | 		bool *left_on_path;
38 | 	} aug;
39 |
40 | 	/* Cached count of pairings */
41 | 	unsigned int n_matches;
42 | };
43 |
44 | struct bim_state *bim_init(void)
45 | {
46 | 	struct bim_state *bim;
47 | 	unsigned int i;
48 |
49 | 	bim = talz(NULL, struct bim_state);
50 |
51 | 	bim->max_left = INIT_STATE_SIZE;
52 | 	bim->max_right = INIT_STATE_SIZE;
53 |
54 | 	bim->left_neigh = tal_arr(bim, unsigned int *, bim->max_left);
55 | 	for (i = 0; i < bim->max_left; i++)
56 | 		bim->left_neigh[i] = tal_arr(bim, unsigned int, bim->max_right);
57 |
58 | 	bim->left_neigh_cookie = tal_arr(bim, void **, bim->max_left);
59 | 	for (i = 0; i < bim->max_left; i++)
60 | 		bim->left_neigh_cookie[i] = tal_arr(bim, void *,
61 | 						    bim->max_right);
62 | 	bim->n_left_neigh = tal_arrz(bim, unsigned int, bim->max_left);
63 |
64 | 	bim->left_match = tal_arr(bim, int, bim->max_left);
65 | 	memset(bim->left_match, 0xff, bim->max_left * sizeof(int));
66 | 	bim->right_match = tal_arr(bim, int, bim->max_right);
67 | 	memset(bim->right_match, 0xff, bim->max_right * sizeof(int));
68 |
69 | 	bim->aug.left_on_path = tal_arr(bim, bool, bim->max_left);
70 |
71 | 	return bim;
72 | }
73 |
74 | void bim_destroy(struct bim_state *bim)
75 | {
76 | 	tal_free(bim);
77 | }
78 |
79 | static void bim_realloc(struct bim_state *bim,
80 | 			unsigned int max_left, unsigned int max_right)
81 | {
82 | 	unsigned int i;
83 |
84 | 	tal_resize(&bim->left_neigh, max_left);
85 | 	for (i = 0; i < bim->max_left; i++)
86 | 		tal_resize(&bim->left_neigh[i], max_right);
87 | 	for (; i < max_left; i++)
88 | 		bim->left_neigh[i] = tal_arr(bim, unsigned int, max_right);
89 |
90 | 	tal_resize(&bim->left_neigh_cookie, max_left);
91 | 	for (i = 0; i < bim->max_left; i++)
92 | 		tal_resize(&bim->left_neigh_cookie[i], max_right);
93 | 	for (; i < max_left; i++)
94 | 		bim->left_neigh_cookie[i] = tal_arr(bim, void *, max_right);
95 |
96 | 	tal_resizez(&bim->n_left_neigh, max_left);
97 |
98 | 	tal_resize(&bim->left_match, max_left);
99 | 	if (max_left > bim->max_left)
100 | 		memset(&bim->left_match[bim->max_left], 0xff,
101 | 		       (max_left - bim->max_left) * sizeof(int));
102 |
103 | 	tal_resize(&bim->right_match, max_right);
104 | 	if (max_right > bim->max_right)
105 | 		memset(&bim->right_match[bim->max_right], 0xff,
106 | 		       (max_right - bim->max_right) * sizeof(int));
107 |
108 | 	tal_resize(&bim->aug.left_on_path, max_left);
109 |
110 | 	bim->max_left = max_left;
111 | 	bim->max_right = max_right;
112 | }
113 |
114 | /* Resize the state, can be used both to grow and shrink.
115 |  * Pass 0, 0 to trim overallocation.
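 *
 * Example (illustrative only -- n_flows and n_cpus are hypothetical
 * sizes known to the caller):
 *
 *	bim_resize(bim, n_flows, n_cpus);
 *	// ... many bim_add_edge() calls without reallocation ...
 *	bim_resize(bim, 0, 0);	// trim overallocation when done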
116 |  */
117 | void bim_resize(struct bim_state *bim,
118 | 		unsigned int max_left, unsigned int max_right)
119 | {
120 | 	max_left = max(bim->max_used_left + 1, max_left);
121 | 	max_right = max(bim->max_used_right + 1, max_right);
122 |
123 | 	if (bim->max_left != max_left || bim->max_right != max_right)
124 | 		bim_realloc(bim, max_left, max_right);
125 | }
126 |
127 | static void bim_size_check(struct bim_state *bim,
128 | 			   unsigned int left_id, unsigned int right_id)
129 | {
130 | 	bim->max_used_left = max(bim->max_used_left, left_id);
131 | 	bim->max_used_right = max(bim->max_used_right, right_id);
132 |
133 | 	if (bim->max_used_left >= bim->max_left ||
134 | 	    bim->max_used_right >= bim->max_right)
135 | 		bim_realloc(bim, max(bim->max_used_left * 2, bim->max_left),
136 | 			    max(bim->max_used_right * 2, bim->max_right));
137 | }
138 |
139 | /* == Algo == */
140 | /* Straightforward implementation of Kuhn's max bipartite matching algorithm */
141 |
142 | static void bim_reset_aug_state(struct bim_state *bim)
143 | {
144 | 	memset(bim->aug.left_on_path, 0,
145 | 	       sizeof(bool) * (bim->max_used_left + 1));
146 | }
147 |
148 | static void bim_add_match(struct bim_state *bim,
149 | 			  unsigned int left_id, unsigned int right_id)
150 | {
151 | 	bim->left_match[left_id] = right_id;
152 | 	bim->right_match[right_id] = left_id;
153 | }
154 |
155 | static bool bim_try_aug(struct bim_state *bim, unsigned int left_id)
156 | {
157 | 	unsigned int i;
158 |
159 | 	if (bim->aug.left_on_path[left_id])
160 | 		return false;
161 | 	bim->aug.left_on_path[left_id] = true;
162 |
163 | 	for (i = 0; i < bim->n_left_neigh[left_id]; i++) {
164 | 		unsigned int right_id = bim->left_neigh[left_id][i];
165 |
166 | 		if (bim->right_match[right_id] == -1 ||
167 | 		    bim_try_aug(bim, bim->right_match[right_id])) {
168 | 			bim_add_match(bim, left_id, right_id);
169 | 			return true;
170 | 		}
171 | 	}
172 |
173 | 	return false;
174 | }
175 |
176 | /* Returns false if edge is a duplicate */
177 | bool bim_add_edge(struct bim_state *bim,
178 | 		  unsigned int left_id, unsigned int right_id, void *cookie)
179 | {
180 | 	unsigned int i, lv;
181 |
182 | 	bim_size_check(bim, left_id, right_id);
183 |
184 | 	/* Add edge */
185 | 	for (i = 0; i < bim->n_left_neigh[left_id]; i++)
186 | 		/* Duplicate edge add, ignore */
187 | 		if (bim->left_neigh[left_id][i] == right_id)
188 | 			return false;
189 | 	i = bim->n_left_neigh[left_id]++;
190 | 	bim->left_neigh[left_id][i] = right_id;
191 | 	bim->left_neigh_cookie[left_id][i] = cookie;
192 |
193 | 	/* Fast path good edge */
194 | 	if (bim->left_match[left_id] == -1 &&
195 | 	    bim->right_match[right_id] == -1) {
196 | 		bim_add_match(bim, left_id, right_id);
197 | 		bim->n_matches++;
198 | 		return true;
199 | 	}
200 |
201 | 	/* Slow path, re-match */
202 | 	for (lv = 0; lv < bim->max_used_left + 1; lv++) {
203 | 		if (bim->left_match[lv] != -1)
204 | 			continue;
205 | 		bim_reset_aug_state(bim);
206 | 		if (bim_try_aug(bim, lv)) {
207 | 			bim->n_matches++;
208 | 			break;
209 | 		}
210 | 	}
211 |
212 | 	return true;
213 | }
214 |
215 | /* == Accessors == */
216 |
217 | unsigned int bim_match_size(struct bim_state *bim)
218 | {
219 | 	return bim->n_matches;
220 | }
221 |
222 | void bim_walk_init(struct bim_edge *match)
223 | {
224 | 	memset(match, 0, sizeof(*match));
225 | }
226 |
227 | bool bim_edge_walk_next(struct bim_state *bim, struct bim_edge *match)
228 | {
229 | 	unsigned int left_id, i;
230 |
231 | 	i = match->_walker << 32 >> 32;	/* low 32 bits: next neighbor index */
232 | 	left_id = match->_walker >> 32;	/* high 32 bits: left vertex */
233 | 	for (; left_id < bim->max_used_left + 1; left_id++) {
234 | 		if (i < bim->n_left_neigh[left_id])
235 |
goto found; 236 | i = 0; 237 | } 238 | 239 | return false; 240 | 241 | found: 242 | match->_walker = ((unsigned long long)left_id << 32) | (i + 1); 243 | match->left_id = left_id; 244 | match->right_id = bim->left_neigh[left_id][i]; 245 | match->is_match = bim->left_match[left_id] == (int)match->right_id; 246 | match->cookie = bim->left_neigh_cookie[left_id][i]; 247 | return true; 248 | } 249 | 250 | bool bim_match_walk_next(struct bim_state *bim, struct bim_edge *match) 251 | { 252 | unsigned int left_id, i; 253 | 254 | for (left_id = match->_walker; 255 | left_id < bim->max_used_left + 1; left_id++) 256 | if (bim->left_match[left_id] != -1) 257 | goto found; 258 | return false; 259 | 260 | found: 261 | match->is_match = true; 262 | match->_walker = left_id + 1; 263 | match->left_id = left_id; 264 | match->right_id = bim->left_match[left_id]; 265 | match->cookie = NULL; 266 | for (i = 0; i < bim->n_left_neigh[left_id]; i++) 267 | if (bim->left_neigh[left_id][i] == match->right_id) { 268 | match->cookie = bim->left_neigh_cookie[left_id][i]; 269 | break; 270 | } 271 | return true; 272 | } 273 | 274 | /* == Test / example == */ 275 | 276 | #ifdef KPERF_UNITS 277 | #include 278 | 279 | static UNNEEDED void bim_dump(struct bim_state *bim) 280 | { 281 | unsigned int i, j; 282 | 283 | printf("============\n"); 284 | printf("max_l %d max_r %d used_l %d used_r %d matches %d\n", 285 | bim->max_left, bim->max_right, 286 | bim->max_used_left, bim->max_used_right, bim->n_matches); 287 | 288 | for (i = 0; i <= bim->max_used_left; i++) 289 | if (bim->left_match[i] != -1) 290 | printf(" %d -> %d\n", i, bim->left_match[i]); 291 | 292 | for (i = 0; i <= bim->max_used_right; i++) 293 | if (bim->right_match[i] != -1) 294 | printf(" %d <- %d\n", i, bim->right_match[i]); 295 | 296 | for (i = 0; i <= bim->max_used_left; i++) { 297 | if (!bim->n_left_neigh[i]) 298 | continue; 299 | 300 | printf(" =%d=", i); 301 | for (j = 0; j < bim->n_left_neigh[i]; j++) 302 | printf(" %d", bim->left_neigh[i][j]); 303 | printf("\n"); 304 | } 305 | } 306 | 307 | int main() 308 | { 309 | static const int edges[][2] = {{1, 2}, {1, 2}, {2, 2}, {2, 3}, 310 | {0, 3}, {2, 0}, {170, 18}}; 311 | struct bim_state *bim; 312 | struct bim_edge m; 313 | unsigned int i; 314 | 315 | bim = bim_init(); 316 | printf("Init match: %d\n", bim_match_size(bim)); 317 | 318 | for (i = 0; i < ARRAY_SIZE(edges); i++) { 319 | bim_add_edge(bim, edges[i][0], edges[i][1], 320 | (void *)(unsigned long)i); 321 | printf("Added edge %d - %d, match: %d\n", 322 | edges[i][0], edges[i][1], bim_match_size(bim)); 323 | } 324 | bim_for_each_match(bim, &m) 325 | printf("Match %d - %d, %p\n", m.left_id, m.right_id, m.cookie); 326 | 327 | bim_destroy(bim); 328 | } 329 | #endif 330 | -------------------------------------------------------------------------------- /proto.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. 
and affiliates */
3 | 
4 | #include 
5 | #include 
6 | #include 
7 | #include 
8 | #include 
9 | #include 
10 | #include 
11 | #include 
12 | 
13 | #include 
14 | 
15 | #include "proto.h"
16 | 
17 | static const unsigned int proto_ver =
18 | 	__KPM_MSG_TOTAL << 24 |
19 | 	sizeof(struct kpm_test) << 16 |
20 | 	sizeof(struct kpm_test_results);
21 | 
22 | void *kpm_msg_dup(struct kpm_header *hdr)
23 | {
24 | 	char *msg;
25 | 
26 | 	msg = malloc(hdr->len);
27 | 	memcpy(msg, hdr, hdr->len);
28 | 	return msg;
29 | }
30 | 
31 | void *kpm_receive(int fd)
32 | {
33 | 	struct kpm_header hdr;
34 | 	ssize_t off, n;
35 | 	char *msg;
36 | 
37 | 	n = recv(fd, &hdr, sizeof(hdr), MSG_PEEK | MSG_WAITALL);
38 | 	if (n < (int)sizeof(hdr)) {
39 | 		if (n)
40 | 			warn("Failed to receive header (%zd)", n);
41 | 		return NULL;
42 | 	}
43 | 	if (hdr.len < sizeof(hdr)) {
44 | 		warnx("Invalid header length (%d)", hdr.len);
45 | 		return NULL;
46 | 	}
47 | 
48 | 	msg = malloc(hdr.len);
49 | 	if (!msg)
50 | 		return NULL;
51 | 
52 | 	off = 0;
53 | 	while (hdr.len) {
54 | 		n = recv(fd, msg + off, hdr.len, 0);
55 | 		if (n > hdr.len) {
56 | 			warnx("Oversized recv");
57 | 		} else if (n <= 0) {
58 | 			warnx("Short recv");
59 | 		} else {
60 | 			off += n;
61 | 			hdr.len -= n;
62 | 			continue;
63 | 		}
64 | 
65 | 		free(msg);
66 | 		return NULL;
67 | 	}
68 | 
69 | 	return msg;
70 | }
71 | 
72 | static int __kpm_send(int fd, struct kpm_header *msg, size_t size, int id,
73 | 		      enum kpm_msg_type type)
74 | {
75 | 	ssize_t n, off;
76 | 
77 | 	msg->type = type;
78 | 	msg->id = id;
79 | 	msg->len = size;
80 | 
81 | 	off = 0;
82 | 	while (size) {
83 | 		n = send(fd, (char *)msg + off, size, 0);
84 | 		if (n <= 0) {
85 | 			warnx("Short send");
86 | 			return -1;
87 | 		}
88 | 		off += n;
89 | 		size -= n;
90 | 	}
91 | 	return id;
92 | }
93 | 
94 | int kpm_send(int fd, struct kpm_header *msg, size_t size,
95 | 	     enum kpm_msg_type type)
96 | {
97 | 	static short int id_gen;
98 | 
99 | 	return __kpm_send(fd, msg, size, ++id_gen, type);
100 | }
101 | 
102 | int kpm_send_empty(int fd, enum kpm_msg_type type)
103 | {
104 | 	struct kpm_header hdr;
105 | 
106 | 	return kpm_send(fd, &hdr, sizeof(hdr), type);
107 | }
108 | 
109 | int kpm_send_u32(int fd, enum kpm_msg_type type, __u32 arg)
110 | {
111 | 	struct __kpm_generic_u32 msg;
112 | 
113 | 	msg.val = arg;
114 | 
115 | 	return kpm_send(fd, &msg.hdr, sizeof(msg), type);
116 | }
117 | 
118 | int kpm_send_conn_id(int fd, __u32 id, __u32 cpu)
119 | {
120 | 	struct kpm_connection_id msg;
121 | 
122 | 	msg.id = id;
123 | 	msg.cpu = cpu;
124 | 
125 | 	return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_CONNECTION_ID);
126 | }
127 | 
128 | int kpm_send_connect(int fd, struct sockaddr_in6 *addr, socklen_t len,
129 | 		     __u32 mss)
130 | {
131 | 	struct kpm_connect msg;
132 | 
133 | 	if (len > sizeof(msg.addr)) {
134 | 		warnx("Oversized connect arg");
135 | 		return -1;
136 | 	}
137 | 
138 | 	msg.len = len;
139 | 	memcpy(&msg.addr, addr, len);
140 | 	msg.mss = mss;
141 | 
142 | 	return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_CONNECT);
143 | }
144 | 
145 | int
146 | kpm_send_tls(int fd, __u32 conn_id, __u32 dir_mask, void *info, socklen_t len)
147 | {
148 | 	struct kpm_tls msg;
149 | 
150 | 	if (len > sizeof(msg.info)) {
151 | 		warnx("Oversized TLS arg");
152 | 		return -1;
153 | 	}
154 | 
155 | 	msg.connection_id = conn_id;
156 | 	msg.dir_mask = dir_mask;
157 | 	msg.len = len;
158 | 	memcpy(&msg.info, info, len);
159 | 
160 | 	return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_TLS);
161 | }
162 | 
163 | int kpm_send_max_pacing(int fd, __u32 id, __u32 pace)
164 | {
165 | 	struct kpm_max_pacing msg;
166 | 
167 | 	msg.id = id;
168 | 	msg.max_pacing =
pace; 169 | 170 | return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_MAX_PACING); 171 | } 172 | 173 | int kpm_send_tcp_cc(int fd, __u32 id, char *cc_name) 174 | { 175 | struct kpm_tcp_cc msg = {}; 176 | 177 | msg.id = id; 178 | strncpy(msg.cc_name, cc_name, sizeof(msg.cc_name) - 1); 179 | 180 | return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_TCP_CC); 181 | } 182 | 183 | int kpm_send_mode(int fd, struct kpm_mode *mode) 184 | { 185 | return kpm_send(fd, &mode->hdr, sizeof(*mode), KPM_MSG_TYPE_MODE); 186 | } 187 | 188 | int kpm_send_pin_worker(int fd, __u32 id, __u32 cpu) 189 | { 190 | struct kpm_pin_worker msg; 191 | 192 | msg.worker_id = id; 193 | msg.cpu = cpu; 194 | 195 | return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_PIN_WORKER); 196 | } 197 | 198 | static int kpm_reply(int fd, struct kpm_header *msg, size_t size, 199 | struct kpm_header *req) 200 | { 201 | return __kpm_send(fd, msg, size, req->id, req->type | KPM_MSG_REPLY); 202 | } 203 | 204 | void kpm_reply_error(int fd, struct kpm_header *hdr, __u16 error) 205 | { 206 | struct kpm_reply_error msg; 207 | 208 | msg.type = hdr->type; 209 | msg.error = error; 210 | 211 | __kpm_send(fd, &msg.hdr, sizeof(msg), hdr->id, KPM_MSG_TYPE_ERROR); 212 | } 213 | 214 | int kpm_reply_empty(int fd, struct kpm_header *hdr) 215 | { 216 | struct kpm_header msg; 217 | 218 | return kpm_reply(fd, &msg, sizeof(msg), hdr); 219 | } 220 | 221 | int kpm_reply_u16(int fd, struct kpm_header *hdr, __u16 arg) 222 | { 223 | struct __kpm_generic_u16 msg; 224 | 225 | msg.val = arg; 226 | memset(&msg.pad, 0, sizeof(msg.pad)); 227 | 228 | return kpm_reply(fd, &msg.hdr, sizeof(msg), hdr); 229 | } 230 | 231 | int kpm_reply_u32(int fd, struct kpm_header *hdr, __u32 arg) 232 | { 233 | struct __kpm_generic_u32 msg; 234 | 235 | msg.val = arg; 236 | 237 | return kpm_reply(fd, &msg.hdr, sizeof(msg), hdr); 238 | } 239 | 240 | int kpm_reply_acceptor(int fd, struct kpm_header *hdr, 241 | struct sockaddr_in6 *addr, socklen_t len) 242 | { 243 | struct kpm_tcp_acceptor_reply msg; 244 | 245 | memcpy(&msg.addr, addr, len); 246 | msg.len = len; 247 | 248 | return kpm_reply(fd, &msg.hdr, sizeof(msg), hdr); 249 | } 250 | 251 | int kpm_reply_connect(int fd, struct kpm_header *hdr, 252 | __u32 local_id, __u32 local_cpu, __u16 local_port, 253 | __u32 remote_id, __u32 remote_cpu, __u16 remote_port) 254 | { 255 | struct kpm_connect_reply msg = {}; 256 | 257 | msg.local.id = local_id; 258 | msg.local.cpu = local_cpu; 259 | msg.local.port = local_port; 260 | msg.remote.id = remote_id; 261 | msg.remote.cpu = remote_cpu; 262 | msg.remote.port = remote_port; 263 | 264 | return kpm_reply(fd, &msg.hdr, sizeof(msg), hdr); 265 | } 266 | 267 | int kpm_xchg_hello(int fd, unsigned int *ncpus) 268 | { 269 | struct kpm_hello hello; 270 | struct kpm_hello *rcv; 271 | 272 | hello.version = proto_ver; 273 | hello.n_cpus = get_nprocs(); 274 | 275 | if (kpm_send(fd, &hello.hdr, sizeof(hello), KPM_MSG_TYPE_HELLO) < 0) { 276 | warnx("Failed to send hello"); 277 | return -1; 278 | } 279 | 280 | rcv = kpm_receive(fd); 281 | if (!rcv) 282 | return -1; 283 | 284 | if (!kpm_good_req(rcv, KPM_MSG_TYPE_HELLO)) { 285 | warnx("Bad hello msg"); 286 | goto err_free; 287 | } 288 | if (rcv->version != proto_ver) { 289 | warnx("Bad PROTO version"); 290 | goto err_free; 291 | } 292 | 293 | if (ncpus) 294 | *ncpus = rcv->n_cpus; 295 | free(rcv); 296 | 297 | return 0; 298 | 299 | err_free: 300 | free(rcv); 301 | return -1; 302 | } 303 | 304 | int kpm_req_tcp_sock(int fd, struct sockaddr_in6 *addr, socklen_t 
*len) 305 | { 306 | struct kpm_tcp_acceptor_reply *repl; 307 | struct kpm_header hdr; 308 | int id; 309 | 310 | id = kpm_send(fd, &hdr, sizeof(hdr), KPM_MSG_TYPE_OPEN_TCP_ACCEPTOR); 311 | if (id < 0) { 312 | warnx("Failed to request TCP sock"); 313 | return id; 314 | } 315 | 316 | repl = kpm_receive(fd); 317 | if (!repl) { 318 | warnx("Failed to request TCP sock - no response"); 319 | return -1; 320 | } 321 | 322 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_OPEN_TCP_ACCEPTOR, id)) { 323 | warnx("Failed to request TCP sock - unexpected reply"); 324 | free(repl); 325 | return -1; 326 | } 327 | 328 | if (*len < repl->len) { 329 | warnx("Failed to request TCP sock - req space small"); 330 | free(repl); 331 | return -1; 332 | } 333 | 334 | memcpy(addr, &repl->addr, repl->len); 335 | *len = repl->len; 336 | free(repl); 337 | return 0; 338 | } 339 | 340 | int kpm_req_end_test(int fd, __u32 test_id) 341 | { 342 | struct kpm_empty *repl; 343 | int id; 344 | 345 | id = kpm_send_u32(fd, KPM_MSG_TYPE_END_TEST, test_id); 346 | if (id < 0) { 347 | warnx("Failed to end test"); 348 | return id; 349 | } 350 | 351 | repl = kpm_receive(fd); 352 | if (!repl) { 353 | warnx("Failed to end test - no response"); 354 | return -1; 355 | } 356 | 357 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_END_TEST, id)) { 358 | warnx("Failed to end test - bad reply"); 359 | free(repl); 360 | return -1; 361 | } 362 | 363 | free(repl); 364 | return 0; 365 | } 366 | 367 | int 368 | kpm_req_tls(int fd, __u32 conn_id, __u32 dir_mask, void *info, socklen_t len) 369 | { 370 | struct kpm_empty *repl; 371 | int id; 372 | 373 | id = kpm_send_tls(fd, conn_id, dir_mask, info, len); 374 | if (id < 0) { 375 | warnx("Failed to start TLS"); 376 | return id; 377 | } 378 | 379 | repl = kpm_receive(fd); 380 | if (!repl) { 381 | warnx("Failed to start TLS - no response"); 382 | return -1; 383 | } 384 | 385 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_TLS, id)) { 386 | warnx("Failed to start TLS - bad reply"); 387 | free(repl); 388 | return -1; 389 | } 390 | 391 | free(repl); 392 | return 0; 393 | } 394 | 395 | int 396 | kpm_req_pacing(int fd, __u32 conn_id, __u32 max_pace) 397 | { 398 | struct kpm_empty *repl; 399 | int id; 400 | 401 | id = kpm_send_max_pacing(fd, conn_id, max_pace); 402 | if (id < 0) { 403 | warnx("Failed to request pacing"); 404 | return id; 405 | } 406 | 407 | repl = kpm_receive(fd); 408 | if (!repl) { 409 | warnx("Failed to request pacing - no response"); 410 | return -1; 411 | } 412 | 413 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_MAX_PACING, id)) { 414 | warnx("Failed to request pacing - bad reply"); 415 | free(repl); 416 | return -1; 417 | } 418 | 419 | free(repl); 420 | return 0; 421 | } 422 | 423 | int 424 | kpm_req_tcp_cc(int fd, __u32 conn_id, char *cc_name) 425 | { 426 | struct kpm_empty *repl; 427 | int id; 428 | 429 | id = kpm_send_tcp_cc(fd, conn_id, cc_name); 430 | if (id < 0) { 431 | warnx("Failed to request TCP cong control"); 432 | return id; 433 | } 434 | 435 | repl = kpm_receive(fd); 436 | if (!repl) { 437 | warnx("Failed to request TCP cong control - no response"); 438 | return -1; 439 | } 440 | 441 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_TCP_CC, id)) { 442 | warnx("Failed to request TCP cong control - bad reply"); 443 | free(repl); 444 | return -1; 445 | } 446 | 447 | free(repl); 448 | return 0; 449 | } 450 | 451 | int 452 | kpm_req_mode(int fd, struct kpm_mode *mode) 453 | { 454 | struct kpm_empty *repl; 455 | int id; 456 | 457 | id = kpm_send_mode(fd, mode); 458 | if (id < 0) { 459 | warnx("Failed to request mode"); 
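		/* id carries the negative error code from kpm_send_mode() */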
460 | return id; 461 | } 462 | 463 | repl = kpm_receive(fd); 464 | if (!repl) { 465 | warnx("Failed to request mode - no response"); 466 | return -1; 467 | } 468 | 469 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_MODE, id)) { 470 | warnx("Failed to request mode - bad reply"); 471 | free(repl); 472 | return -1; 473 | } 474 | 475 | free(repl); 476 | return 0; 477 | } 478 | 479 | int kpm_req_disconnect(int fd, __u32 connection_id) 480 | { 481 | struct kpm_empty *repl; 482 | int id; 483 | 484 | id = kpm_send_u32(fd, KPM_MSG_TYPE_DISCONNECT, connection_id); 485 | if (id < 0) { 486 | warnx("Failed to end connection"); 487 | return id; 488 | } 489 | 490 | repl = kpm_receive(fd); 491 | if (!repl) { 492 | warnx("Failed to end connection - no response"); 493 | return -1; 494 | } 495 | 496 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_DISCONNECT, id)) { 497 | warnx("Failed to end connection - bad reply"); 498 | free(repl); 499 | return -1; 500 | } 501 | 502 | free(repl); 503 | return 0; 504 | } 505 | -------------------------------------------------------------------------------- /worker.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. and affiliates */ 3 | 4 | #define _GNU_SOURCE 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "worker.h" 16 | #include "cpu_stat.h" 17 | #include "tcp.h" 18 | #include "proto_dbg.h" 19 | #include "server.h" 20 | #include "tcp.h" 21 | #include "iou.h" 22 | #include "epoll.h" 23 | 24 | unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1]; 25 | 26 | void 27 | worker_kill_conn(struct worker_state *self, struct worker_connection *conn) 28 | { 29 | self->ops->conn_close(self, conn); 30 | close(conn->fd); 31 | list_del(&conn->connections); 32 | free(conn->rxbuf); 33 | free(conn->rr.log); 34 | free(conn); 35 | } 36 | 37 | static int 38 | worker_pstat_cmp(unsigned int const *a, unsigned int const *b, void *unused) 39 | { 40 | return (long long int)*a - (long long int)*b; 41 | } 42 | 43 | static void 44 | worker_report_pstats(struct worker_state *self, struct worker_connection *conn, 45 | struct kpm_test_result *data) 46 | { 47 | if (conn->spec->arg.rr.timings < 2) 48 | return; 49 | 50 | asort(conn->rr.log, conn->rr.log_len, worker_pstat_cmp, NULL); 51 | data->p25 = conn->rr.log[conn->rr.log_len / 4]; 52 | data->p50 = conn->rr.log[conn->rr.log_len / 2]; 53 | data->p90 = conn->rr.log[(__u64)conn->rr.log_len * 90 / 100]; 54 | data->p99 = conn->rr.log[(__u64)conn->rr.log_len * 99 / 100]; 55 | data->p999 = conn->rr.log[(__u64)conn->rr.log_len * 999 / 1000]; 56 | data->p9999 = conn->rr.log[(__u64)conn->rr.log_len * 9999 / 10000]; 57 | } 58 | 59 | /* == Worker command handling == */ 60 | 61 | static void worker_report_test(struct worker_state *self) 62 | { 63 | struct worker_connection *conn; 64 | struct cpu_stat *cpu, *cpu_pct; 65 | struct kpm_test_results *res; 66 | unsigned int ncpus, i; 67 | struct timerel t; 68 | size_t sz; 69 | 70 | kpm_dbg("Reporting results"); 71 | 72 | sz = sizeof(*res) + sizeof(res->res[0]) * self->test->n_conns; 73 | res = malloc(sz); 74 | memset(res, 0, sz); 75 | 76 | t = timemono_since(self->test_start); 77 | res->time_usec = time_to_usec(t); 78 | res->n_conns = self->test->n_conns; 79 | res->test_id = self->test->test_id; 80 | 81 | ncpus = get_nprocs(); 82 | cpu = cpu_stat_snapshot(ncpus); 83 | cpu_stat_sub(cpu, self->cpu_start, ncpus); 84 | cpu_pct = 
cpu_stat_to_pct00(cpu, ncpus); 85 | free(cpu); 86 | for (i = 0; i < ncpus; i++) { 87 | res->cpu_load[i].id = cpu_pct[i].cpu_id; 88 | res->cpu_load[i].user = cpu_pct[i].user; 89 | res->cpu_load[i].system = cpu_pct[i].system; 90 | res->cpu_load[i].idle = cpu_pct[i].idle; 91 | res->cpu_load[i].iowait = cpu_pct[i].iowait; 92 | res->cpu_load[i].irq = cpu_pct[i].irq; 93 | res->cpu_load[i].sirq = cpu_pct[i].sirq; 94 | } 95 | free(cpu_pct); 96 | 97 | i = 0; 98 | list_for_each(&self->connections, conn, connections) { 99 | struct kpm_test_result *data; 100 | struct tcp_info info; 101 | socklen_t info_len; 102 | 103 | do { 104 | if (i == res->n_conns) { 105 | warnx("Missing connections!"); 106 | goto skip_results; 107 | } 108 | data = &res->res[i]; 109 | data->worker_id = self->id; 110 | data->connection_id = self->test->specs[i].connection_id; 111 | i++; 112 | /* Expect the connections to be in order */ 113 | } while (conn->id != data->connection_id); 114 | 115 | data->type = conn->spec->type; 116 | 117 | info_len = sizeof(conn->init_info); 118 | if (getsockopt(conn->fd, IPPROTO_TCP, TCP_INFO, 119 | (void *)&info, &info_len) < 0) { 120 | warn("Can't get TCP info"); 121 | goto skip_results; 122 | } 123 | 124 | data->rx_bytes = conn->tot_recv; 125 | data->tx_bytes = conn->tot_sent; 126 | 127 | if (conn->spec->type == KPM_TEST_TYPE_RR) 128 | data->reqs = conn->rr.reqs; 129 | 130 | data->retrans = info.tcpi_total_retrans - 131 | conn->init_info.tcpi_total_retrans; 132 | data->reord_seen = info.tcpi_reord_seen - 133 | conn->init_info.tcpi_reord_seen; 134 | data->rtt = info.tcpi_rtt; 135 | data->rttvar = info.tcpi_rttvar; 136 | data->delivered_ce = info.tcpi_delivered_ce - 137 | conn->init_info.tcpi_delivered_ce; 138 | data->snd_wnd = info.tcpi_snd_wnd; 139 | data->snd_cwnd = info.tcpi_snd_cwnd; 140 | 141 | if (verbose > 2) 142 | print_tcp_info(&info); 143 | 144 | memcpy(data->lat_hist, conn->rr.hist, sizeof(data->lat_hist)); 145 | worker_report_pstats(self, conn, data); 146 | 147 | /* Shut down sending to let the connection drain */ 148 | conn->to_send = 0; 149 | } 150 | skip_results: 151 | 152 | free(self->test); 153 | self->test = NULL; 154 | 155 | kpm_send(self->main_sock, &res->hdr, sz, KPM_MSG_WORKER_TEST_RESULT); 156 | free(res); 157 | } 158 | 159 | #define KPM_HNDL(type, name) \ 160 | { KPM_MSG_WORKER_ ## type, \ 161 | worker_msg_ ## name, \ 162 | sizeof(struct kpm_##name), \ 163 | stringify(name) } 164 | 165 | #define KPM_HNDL_GEN(type, name, gtype) \ 166 | { KPM_MSG_WORKER_ ## type, \ 167 | worker_msg_ ## name, \ 168 | sizeof(struct __kpm_generic_##gtype), \ 169 | stringify(name) } 170 | 171 | static void 172 | worker_msg_id(struct worker_state *self, struct kpm_header *hdr) 173 | { 174 | struct __kpm_generic_u32 *id = (void *)hdr; 175 | 176 | self->id = id->val; 177 | } 178 | 179 | static void 180 | worker_msg_test(struct worker_state *self, struct kpm_header *hdr) 181 | { 182 | struct kpm_test *req = (void *)hdr; 183 | unsigned int i; 184 | 185 | if (self->test) { 186 | warn("Already running a test"); 187 | self->quit = 1; 188 | return; 189 | } 190 | 191 | kpm_dbg("start test %s", req->active ? 
"act" : "psv"); 192 | 193 | self->test = malloc(hdr->len); 194 | memcpy(self->test, req, hdr->len); 195 | 196 | for (i = 0; i < req->n_conns; i++) { 197 | struct worker_connection *conn; 198 | socklen_t info_len; 199 | __u64 len; 200 | 201 | conn = malloc(sizeof(*conn)); 202 | memset(conn, 0, sizeof(*conn)); 203 | conn->spec = &self->test->specs[i]; 204 | conn->id = req->specs[i].connection_id; 205 | conn->fd = fdpass_recv(self->main_sock); 206 | 207 | info_len = sizeof(conn->init_info); 208 | if (getsockopt(conn->fd, IPPROTO_TCP, TCP_INFO, 209 | (void *)&conn->init_info, &info_len) < 0) { 210 | warn("Can't get TCP info"); 211 | self->quit = 1; 212 | } 213 | 214 | if (conn->spec->arg.rr.timings > 1) { 215 | /* Assume we can't do a round trip < 1us on avg */ 216 | conn->rr.log_len_max = 217 | self->test->time_sec * 1000 * 1000; 218 | conn->rr.log = calloc(conn->rr.log_len_max, 219 | sizeof(conn->rr.log[0])); 220 | } 221 | 222 | list_add(&self->connections, &conn->connections); 223 | 224 | conn->read_size = conn->spec->read_size; 225 | conn->write_size = conn->spec->write_size; 226 | 227 | conn->rxbuf = malloc(conn->read_size); 228 | if (!conn->rxbuf) { 229 | warnx("No memory"); 230 | self->quit = 1; 231 | return; 232 | } 233 | 234 | if (!conn->read_size || conn->read_size > KPM_MAX_OP_CHUNK || 235 | !conn->write_size || conn->write_size > KPM_MAX_OP_CHUNK) { 236 | warnx("wrong size io op read:%u write:%u", 237 | conn->read_size, conn->write_size); 238 | self->quit = 1; 239 | return; 240 | } 241 | 242 | switch (conn->spec->type) { 243 | case KPM_TEST_TYPE_STREAM: 244 | len = ~0ULL; 245 | break; 246 | case KPM_TEST_TYPE_RR: 247 | len = conn->spec->arg.rr.req_size; 248 | break; 249 | default: 250 | warnx("Unknown test type"); 251 | return; 252 | } 253 | 254 | if (req->active) 255 | conn->to_send = len; 256 | else 257 | conn->to_recv = len; 258 | 259 | self->ops->conn_add(self, conn); 260 | } 261 | 262 | self->cpu_start = cpu_stat_snapshot(0); 263 | self->test_start = time_mono(); 264 | memset(&self->prev_loop, 0, sizeof(self->prev_loop)); 265 | if (self->test->active) 266 | self->test_len_msec = req->time_sec * 1000; 267 | } 268 | 269 | static void 270 | worker_msg_end_test(struct worker_state *self, struct kpm_header *hdr) 271 | { 272 | struct worker_connection *conn, *next; 273 | 274 | if (self->test) 275 | worker_report_test(self); 276 | 277 | free(self->cpu_start); 278 | self->cpu_start = NULL; 279 | list_for_each_safe(&self->connections, conn, next, connections) 280 | worker_kill_conn(self, conn); 281 | self->ended = 1; 282 | } 283 | 284 | static const struct { 285 | enum kpm_msg_type type; 286 | void (*cb)(struct worker_state *self, struct kpm_header *hdr); 287 | size_t req_size; 288 | const char *name; 289 | } msg_handlers[] = { 290 | KPM_HNDL_GEN(ID, id, u32), 291 | KPM_HNDL(TEST, test), 292 | KPM_HNDL(END_TEST, end_test), 293 | }; 294 | 295 | void worker_handle_proto(struct worker_state *self, struct kpm_header *hdr) 296 | { 297 | int i; 298 | 299 | kpm_cmd_dbg_start(hdr); 300 | 301 | for (i = 0; i < (int)ARRAY_SIZE(msg_handlers); i++) { 302 | if (msg_handlers[i].type != hdr->type) 303 | continue; 304 | 305 | if (hdr->len < msg_handlers[i].req_size) { 306 | warn("Invalid request for %s", msg_handlers[i].name); 307 | self->quit = 1; 308 | break; 309 | } 310 | 311 | msg_handlers[i].cb(self, hdr); 312 | break; 313 | } 314 | if (i == (int)ARRAY_SIZE(msg_handlers)) { 315 | warnx("Unknown message type: %d", hdr->type); 316 | self->quit = 1; 317 | } 318 | 319 | kpm_cmd_dbg_end(hdr); 320 | } 
321 | 
322 | /* == Worker I/O handling == */
323 | 
324 | static void
325 | worker_record_rr_time(struct worker_state *self, struct worker_connection *conn)
326 | {
327 | 	struct timerel delta;
328 | 	unsigned int nsec128;
329 | 	struct timemono now;
330 | 	int hist_idx;
331 | 
332 | 	if (!conn->spec->arg.rr.timings)
333 | 		return;
334 | 
335 | 	now = time_mono();
336 | 	if (!self->prev_loop.ts.tv_sec)
337 | 		goto out_update;
338 | 
339 | 	delta = timemono_between(now, self->prev_loop);
340 | 	nsec128 = delta.ts.tv_nsec / 128;
341 | 	if (delta.ts.tv_sec)
342 | 		nsec128 = ~0U;
343 | 
344 | 	if (conn->spec->arg.rr.timings > 1 &&
345 | 	    conn->rr.log_len < conn->rr.log_len_max)
346 | 		conn->rr.log[conn->rr.log_len++] = nsec128;
347 | 
348 | 	hist_idx = 0;
349 | 	while (nsec128) {
350 | 		nsec128 >>= 1;
351 | 		hist_idx++;
352 | 	}
353 | 	conn->rr.hist[hist_idx]++;
354 | 
355 | out_update:
356 | 	self->prev_loop = now;
357 | }
358 | 
359 | void
360 | worker_send_finished(struct worker_state *self, struct worker_connection *conn)
361 | {
362 | 	worker_record_rr_time(self, conn);
363 | 
364 | 	if (conn->spec->type != KPM_TEST_TYPE_RR)
365 | 		warnx("Done sending for non-RR test");
366 | 	else
367 | 		conn->rr.reqs++;
368 | 
369 | 	if (self->test->active)
370 | 		conn->to_recv = conn->spec->arg.rr.resp_size;
371 | 	else
372 | 		conn->to_recv = conn->spec->arg.rr.req_size;
373 | }
374 | 
375 | void
376 | worker_recv_finished(struct worker_state *self, struct worker_connection *conn)
377 | {
378 | 	if (!self->test)
379 | 		return;
380 | 
381 | 	if (conn->spec->type != KPM_TEST_TYPE_RR)
382 | 		warnx("Done receiving for non-RR test");
383 | 
384 | 	if (self->test->active)
385 | 		conn->to_send = conn->spec->arg.rr.req_size;
386 | 	else
387 | 		conn->to_send = conn->spec->arg.rr.resp_size;
388 | }
389 | 
390 | /* == Main loop == */
391 | void *worker_main(void *args)
392 | {
393 | 	struct worker_opts *opts = args;
394 | 	struct worker_state self = {
395 | 		.main_sock = opts->fd,
396 | 		.opts = *opts,
397 | 	};
398 | 
399 | 
400 | 	free(opts);
401 | 	if (self.opts.use_iou)
402 | 		worker_iou_init(&self);
403 | 	else
404 | 		worker_epoll_init(&self);
405 | 	list_head_init(&self.connections);
406 | 
407 | 	self.ops->prep(&self);
408 | 
409 | 	while (!self.quit) {
410 | 		int msec = -1;
411 | 
412 | 		/* If we initiated the test, check whether it is time to end it */
413 | 		if (self.test && self.test->active) {
414 | 			struct timerel t;
415 | 
416 | 			t = timemono_since(self.test_start);
417 | 			msec = self.test_len_msec - time_to_msec(t);
418 | 			if (msec < 0)
419 | 				worker_report_test(&self);
420 | 		}
421 | 
422 | 		self.ops->wait(&self, msec);
423 | 	}
424 | 
425 | 	self.ops->exit(&self);
426 | 	close(self.main_sock);
427 | 	kpm_dbg("exiting!");
428 | 	return NULL;
429 | }
430 | 
--------------------------------------------------------------------------------
/epoll.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 | 
4 | #include "epoll.h"
5 | 
6 | #include 
7 | #include 
8 | #include 
9 | #include 
10 | #include 
11 | 
12 | #include 
13 | #include 
14 | #include 
15 | 
16 | #include "worker.h"
17 | #include "devmem.h"
18 | #include "proto_dbg.h"
19 | 
20 | extern unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1];
21 | 
22 | #define ALIGN_UP(v, align) (((v) + (align) - 1) & ~((align) - 1))
23 | #define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
24 | 
25 | /* Each thread should reserve a big enough vma to avoid
26 |  * spinlock collisions in ptl locks.
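 * (A split page-table lock covers a PMD-sized, i.e. huge-page-sized, range
 * of the address space, so huge-page-aligned per-thread mappings do not
 * share a lock.)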
27 | * This size is 2MB on x86_64, and is exported in /proc/meminfo. 28 | */ 29 | static unsigned long default_huge_page_size(void) 30 | { 31 | FILE *f = fopen("/proc/meminfo", "r"); 32 | unsigned long hps = 0; 33 | size_t linelen = 0; 34 | char *line = NULL; 35 | 36 | if (!f) { 37 | warnx("Failed to detect default huge page size; using 2 MB as fallback"); 38 | return 2 * 1024 * 1024; 39 | } 40 | while (getline(&line, &linelen, f) > 0) { 41 | if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { 42 | hps <<= 10; 43 | break; 44 | } 45 | } 46 | free(line); 47 | fclose(f); 48 | return hps; 49 | } 50 | 51 | static struct worker_connection * 52 | ep_find_connection_by_fd(struct worker_state *self, int fd) 53 | { 54 | struct worker_connection *conn; 55 | 56 | list_for_each(&self->connections, conn, connections) { 57 | if (conn->fd == fd) 58 | return conn; 59 | } 60 | return NULL; 61 | } 62 | 63 | static void 64 | ep_conn_close(struct worker_state *self, struct worker_connection *conn) 65 | { 66 | struct epoll_event ev = {}; 67 | 68 | ev.data.fd = conn->fd; 69 | if (epoll_ctl(self->epollfd, EPOLL_CTL_DEL, conn->fd, &ev) < 0) 70 | warn("Failed to del poll out"); 71 | if (self->opts.rx_mode == KPM_RX_MODE_DEVMEM) 72 | (void)devmem_release_tokens(conn->fd, &conn->devmem); 73 | else if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 74 | munmap(conn->raddr, conn->rsize); 75 | } 76 | 77 | static void 78 | ep_conn_add(struct worker_state *self, struct worker_connection *conn) 79 | { 80 | struct epoll_event ev = {}; 81 | int zc; 82 | 83 | zc = self->opts.tx_mode == KPM_TX_MODE_SOCKET_ZEROCOPY || self->opts.tx_mode == KPM_TX_MODE_DEVMEM; 84 | if (setsockopt(conn->fd, SOL_SOCKET, SO_ZEROCOPY, &zc, sizeof(zc))) { 85 | warnx("Failed to set SO_ZEROCOPY"); 86 | self->quit = 1; 87 | return; 88 | } 89 | 90 | if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) { 91 | size_t map_align; 92 | 93 | map_align = default_huge_page_size(); 94 | conn->raddr = mmap(NULL, 95 | conn->read_size + map_align, 96 | PROT_READ, 97 | MAP_SHARED, 98 | conn->fd, 99 | 0); 100 | if (conn->raddr == MAP_FAILED) { 101 | warnx("Failed to mmap TCP_ZEROCOPY_RECEIVE"); 102 | self->quit = 1; 103 | return; 104 | } 105 | conn->addr = ALIGN_PTR_UP(conn->raddr, map_align); 106 | conn->rsize = conn->read_size + map_align; 107 | } 108 | 109 | ev.events = EPOLLIN | EPOLLOUT; 110 | ev.data.fd = conn->fd; 111 | if (epoll_ctl(self->epollfd, EPOLL_CTL_ADD, conn->fd, &ev) < 0) 112 | warn("Failed to modify poll out"); 113 | } 114 | 115 | static void ep_handle_main_sock(struct worker_state *self) 116 | { 117 | struct kpm_header *hdr; 118 | 119 | hdr = kpm_receive(self->main_sock); 120 | if (!hdr) { 121 | __kpm_dbg("<<", "ctrl recv failed"); 122 | self->quit = 1; 123 | return; 124 | } 125 | 126 | worker_handle_proto(self, hdr); 127 | 128 | free(hdr); 129 | } 130 | 131 | static void 132 | ep_send_arm(struct worker_state *self, struct worker_connection *conn, 133 | unsigned int events) 134 | { 135 | struct epoll_event ev = {}; 136 | 137 | if (events & EPOLLOUT) 138 | return; 139 | 140 | ev.events = EPOLLIN | EPOLLOUT; 141 | ev.data.fd = conn->fd; 142 | if (epoll_ctl(self->epollfd, EPOLL_CTL_MOD, conn->fd, &ev) < 0) 143 | warn("Failed to modify poll out"); 144 | } 145 | 146 | static void 147 | ep_send_disarm(struct worker_state *self, struct worker_connection *conn, 148 | unsigned int events) 149 | { 150 | struct epoll_event ev = {}; 151 | 152 | if (!(events & EPOLLOUT)) 153 | return; 154 | 155 | ev.events = EPOLLIN; 156 | ev.data.fd = conn->fd; 157 | 
if (epoll_ctl(self->epollfd, EPOLL_CTL_MOD, conn->fd, &ev) < 0)
158 | 		warn("Failed to modify poll out");
159 | }
160 | 
161 | static void
162 | ep_handle_completions(struct worker_state *self, struct worker_connection *conn,
163 | 		      unsigned int events)
164 | {
165 | 	struct sock_extended_err *serr;
166 | 	struct msghdr msg = {};
167 | 	char control[64] = {};
168 | 	struct cmsghdr *cm;
169 | 	int ret, n;
170 | 
171 | 	msg.msg_control = control;
172 | 	msg.msg_controllen = sizeof(control);
173 | 
174 | 	ret = recvmsg(conn->fd, &msg, MSG_ERRQUEUE);
175 | 	if (ret < 0) {
176 | 		if (errno == EAGAIN)
177 | 			return;
178 | 		warn("failed to clean completions");
179 | 		goto kill_conn;
180 | 	}
181 | 
182 | 	if (msg.msg_flags & MSG_CTRUNC) {
183 | 		warnx("failed to clean completions: truncated cmsg");
184 | 		goto kill_conn;
185 | 	}
186 | 
187 | 	cm = CMSG_FIRSTHDR(&msg);
188 | 	if (!cm) {
189 | 		warnx("failed to clean completions: no cmsg");
190 | 		goto kill_conn;
191 | 	}
192 | 
193 | 	if (cm->cmsg_level != SOL_IP && cm->cmsg_level != SOL_IPV6) {
194 | 		warnx("failed to clean completions: wrong level %d",
195 | 		      cm->cmsg_level);
196 | 		goto kill_conn;
197 | 	}
198 | 
199 | 	if (cm->cmsg_type != IP_RECVERR && cm->cmsg_type != IPV6_RECVERR) {
200 | 		warnx("failed to clean completions: wrong type %d",
201 | 		      cm->cmsg_type);
202 | 		goto kill_conn;
203 | 	}
204 | 
205 | 	serr = (void *)CMSG_DATA(cm);
206 | 	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
207 | 		warnx("failed to clean completions: wrong origin %d",
208 | 		      serr->ee_origin);
209 | 		goto kill_conn;
210 | 	}
211 | 	if (serr->ee_errno) {
212 | 		warnx("failed to clean completions: error %d",
213 | 		      serr->ee_errno);
214 | 		goto kill_conn;
215 | 	}
216 | 	n = serr->ee_data - serr->ee_info + 1;
217 | 	conn->to_send_comp -= n;
218 | 	kpm_dbg("send complete (%d..%d) %d\n",
219 | 		serr->ee_info, serr->ee_data, conn->to_send_comp);
220 | 
221 | 	return;
222 | 
223 | kill_conn:
224 | 	worker_kill_conn(self, conn);
225 | }
226 | 
227 | static void
228 | ep_handle_send(struct worker_state *self, struct worker_connection *conn,
229 | 	       unsigned int events)
230 | {
231 | 	unsigned int rep = max_t(int, 10, conn->to_send / conn->write_size + 1);
232 | 	bool msg_zerocopy = self->opts.tx_mode == KPM_TX_MODE_SOCKET_ZEROCOPY || self->opts.tx_mode == KPM_TX_MODE_DEVMEM;
233 | 	int flags = msg_zerocopy ?
MSG_ZEROCOPY : 0; 234 | 235 | while (rep--) { 236 | size_t chunk; 237 | void *src; 238 | ssize_t n; 239 | 240 | chunk = min_t(size_t, conn->write_size, conn->to_send); 241 | 242 | if (self->opts.tx_mode == KPM_TX_MODE_DEVMEM) { 243 | n = devmem_sendmsg(conn->fd, self->opts.devmem.dmabuf_id, 244 | conn->tot_sent % PATTERN_PERIOD, chunk); 245 | } else { 246 | src = &patbuf[conn->tot_sent % PATTERN_PERIOD]; 247 | n = send(conn->fd, src, chunk, MSG_DONTWAIT | flags); 248 | } 249 | if (n == 0) { 250 | warnx("zero send chunk:%zd to_send:%lld to_recv:%lld", 251 | chunk, conn->to_send, conn->to_recv); 252 | worker_kill_conn(self, conn); 253 | return; 254 | } 255 | if (n < 0) { 256 | if (errno == EAGAIN || errno == EWOULDBLOCK) { 257 | kpm_dbg("send full (0 sent)"); 258 | ep_send_arm(self, conn, events); 259 | return; 260 | } 261 | warn("Send failed"); 262 | worker_kill_conn(self, conn); 263 | return; 264 | } 265 | 266 | conn->to_send -= n; 267 | conn->tot_sent += n; 268 | if (msg_zerocopy) { 269 | conn->to_send_comp += 1; 270 | kpm_dbg("queued send completion, total %d", 271 | conn->to_send_comp); 272 | } 273 | 274 | if (!conn->to_send && !conn->to_send_comp) { 275 | ep_send_disarm(self, conn, events); 276 | worker_send_finished(self, conn); 277 | break; 278 | } 279 | 280 | if (n != (ssize_t)chunk) { 281 | kpm_dbg("send full (partial)"); 282 | ep_send_arm(self, conn, events); 283 | return; 284 | } 285 | } 286 | } 287 | 288 | static ssize_t 289 | ep_handle_zerocopy_recv(struct worker_state *self, struct worker_connection *conn, 290 | size_t chunk, int rep) 291 | { 292 | void *src = &patbuf[conn->tot_recv % PATTERN_PERIOD]; 293 | struct tcp_zerocopy_receive zc; 294 | socklen_t len = sizeof(zc); 295 | ssize_t n = 0; 296 | int res; 297 | 298 | memset(&zc, 0, len); 299 | zc.address = (__u64)((unsigned long)conn->addr); 300 | zc.length = chunk; 301 | zc.copybuf_address = (__u64)((unsigned long)conn->rxbuf); 302 | zc.copybuf_len = chunk; 303 | res = getsockopt(conn->fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, 304 | &zc, &len); 305 | if (res < 0) 306 | return res; 307 | if (zc.err) 308 | return zc.err; 309 | 310 | if (zc.length) { 311 | if (self->opts.validate && memcmp(conn->addr, src, zc.length)) 312 | warnx("Data corruption %d %d %u %lld %lld %d", 313 | *(char *)conn->addr, *(char *)src, zc.length, 314 | conn->tot_recv % PATTERN_PERIOD, 315 | conn->tot_recv, rep); 316 | madvise(conn->addr, zc.length, MADV_DONTNEED); 317 | src = &patbuf[(conn->tot_recv + zc.length) % PATTERN_PERIOD]; 318 | n += zc.length; 319 | } 320 | 321 | if (zc.copybuf_len) { 322 | if (self->opts.validate && memcmp(conn->rxbuf, src, zc.copybuf_len)) 323 | warnx("Data corruption %d %d %d %lld %lld %d", 324 | *conn->rxbuf, *(char *)src, zc.copybuf_len, 325 | (conn->tot_recv + n) % PATTERN_PERIOD, 326 | (conn->tot_recv + n), rep); 327 | n += zc.copybuf_len; 328 | } 329 | 330 | /* Sometimes getsockopt returns 0 for both length and copybuf_len, try 331 | * again */ 332 | return n == 0 ? -EAGAIN : n; 333 | } 334 | 335 | static ssize_t 336 | ep_handle_regular_recv(struct worker_state *self, struct worker_connection *conn, 337 | size_t chunk, int rep) 338 | { 339 | bool msg_trunc = self->opts.rx_mode == KPM_RX_MODE_SOCKET_TRUNC; 340 | void *src = &patbuf[conn->tot_recv % PATTERN_PERIOD]; 341 | int flags = msg_trunc ? 
MSG_TRUNC : 0; 342 | ssize_t n; 343 | 344 | n = recv(conn->fd, conn->rxbuf, chunk, MSG_DONTWAIT | flags); 345 | 346 | if (n <= 0 || msg_trunc) 347 | return n; 348 | 349 | if (self->opts.validate && memcmp(conn->rxbuf, src, n)) 350 | warnx("Data corruption %d %d %ld %lld %lld %d", 351 | *conn->rxbuf, *(char *)src, n, 352 | conn->tot_recv % PATTERN_PERIOD, 353 | conn->tot_recv, rep); 354 | 355 | return n; 356 | } 357 | 358 | static void 359 | ep_handle_recv(struct worker_state *self, struct worker_connection *conn) 360 | { 361 | unsigned int rep = 10; 362 | 363 | while (rep--) { 364 | size_t chunk; 365 | ssize_t n; 366 | 367 | chunk = min_t(size_t, conn->read_size, conn->to_recv); 368 | if (self->opts.rx_mode == KPM_RX_MODE_DEVMEM) 369 | n = devmem_recv(conn->fd, &conn->devmem, 370 | conn->rxbuf, chunk, self->opts.devmem.mem, 371 | rep, conn->tot_recv, self->opts.validate); 372 | else if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 373 | n = ep_handle_zerocopy_recv(self, conn, chunk, rep); 374 | else 375 | n = ep_handle_regular_recv(self, conn, chunk, rep); 376 | if (n == 0) { 377 | warnx("zero recv"); 378 | worker_kill_conn(self, conn); 379 | break; 380 | } 381 | if (n < 0) { 382 | if (errno == EAGAIN || errno == EWOULDBLOCK) 383 | break; 384 | if (n == -EAGAIN) 385 | break; 386 | warn("Recv failed"); 387 | worker_kill_conn(self, conn); 388 | break; 389 | } 390 | 391 | conn->to_recv -= n; 392 | conn->tot_recv += n; 393 | 394 | if (!conn->to_recv) { 395 | worker_recv_finished(self, conn); 396 | if (conn->to_send) { 397 | ep_handle_send(self, conn, 0); 398 | break; 399 | } 400 | } 401 | 402 | if (n != conn->read_size) 403 | break; 404 | } 405 | 406 | } 407 | 408 | static void 409 | ep_handle_conn(struct worker_state *self, int fd, unsigned int events) 410 | { 411 | static int warnd_unexpected_pi; 412 | struct worker_connection *conn; 413 | 414 | conn = ep_find_connection_by_fd(self, fd); 415 | 416 | if (events & EPOLLOUT) { 417 | if (conn->to_send) 418 | ep_handle_send(self, conn, events); 419 | else if (!conn->to_send_comp) 420 | ep_send_disarm(self, conn, events); 421 | } 422 | if (events & EPOLLIN) { 423 | if (conn->to_recv) { 424 | ep_handle_recv(self, conn); 425 | } else if (!warnd_unexpected_pi) { 426 | warnx("Unexpected POLLIN %x", events); 427 | warnd_unexpected_pi = 1; 428 | } 429 | } 430 | if (events & EPOLLERR) 431 | ep_handle_completions(self, conn, events); 432 | 433 | if (!(events & (EPOLLOUT | EPOLLIN | EPOLLERR))) 434 | warnx("Connection has nothing to do %x", events); 435 | } 436 | 437 | static void ep_prep(struct worker_state *self) 438 | { 439 | int fd = self->main_sock; 440 | struct epoll_event ev; 441 | 442 | self->epollfd = epoll_create1(0); 443 | if (self->epollfd < 0) 444 | err(5, "Failed to create epoll"); 445 | 446 | ev.events = EPOLLIN; 447 | ev.data.fd = fd; 448 | if (epoll_ctl(self->epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) 449 | err(6, "Failed to init epoll"); 450 | } 451 | 452 | static void ep_wait(struct worker_state *self, int msec) 453 | { 454 | struct epoll_event events[32]; 455 | int i, nfds; 456 | 457 | nfds = epoll_wait(self->epollfd, events, ARRAY_SIZE(events), 458 | msec); 459 | if (nfds < 0) 460 | err(7, "Failed to epoll"); 461 | 462 | for (i = 0; i < nfds; i++) { 463 | struct epoll_event *e = &events[i]; 464 | 465 | if (e->data.fd == self->main_sock) 466 | ep_handle_main_sock(self); 467 | else 468 | ep_handle_conn(self, e->data.fd, 469 | e->events); 470 | } 471 | } 472 | 473 | static void ep_exit(struct worker_state *self) 474 | { 475 | } 476 | 477 
| static const struct io_ops epoll_io_ops = { 478 | .prep = ep_prep, 479 | .wait = ep_wait, 480 | .conn_add = ep_conn_add, 481 | .conn_close = ep_conn_close, 482 | .exit = ep_exit, 483 | }; 484 | 485 | void worker_epoll_init(struct worker_state *self) 486 | { 487 | self->ops = &epoll_io_ops; 488 | } 489 | -------------------------------------------------------------------------------- /iou.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. and affiliates */ 3 | 4 | #include "iou.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | #include "proto.h" 17 | #include "proto_dbg.h" 18 | #include "devmem.h" 19 | #include "worker.h" 20 | 21 | extern unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1]; 22 | 23 | #define ALIGN_UP(v, align) (((v) + (align) - 1) & ~((align) - 1)) 24 | 25 | static long page_size; 26 | 27 | struct iou_state { 28 | struct io_uring ring; 29 | void *area_ptr; 30 | size_t area_size; 31 | __u64 area_token; 32 | void *rq_ptr; 33 | struct io_uring_zcrx_rq rq; 34 | size_t rq_size; 35 | unsigned rq_mask; 36 | __u32 zcrx_id; 37 | }; 38 | 39 | struct iou_kpm_msg_state { 40 | struct kpm_header hdr; 41 | void *msg; 42 | ssize_t off; 43 | }; 44 | 45 | enum iou_req_type { 46 | IOU_REQ_TYPE_PROTO_HDR = 1, 47 | IOU_REQ_TYPE_PROTO_PLD = 2, 48 | IOU_REQ_TYPE_SEND = 3, 49 | IOU_REQ_TYPE_RECV = 4, 50 | IOU_REQ_TYPE_RECVZC = 5, 51 | IOU_REQ_TYPE_CANCEL = 6, 52 | IOU_REQ_TYPE_SENDZC = 7, 53 | }; 54 | 55 | static void * 56 | tag(void *ptr, enum iou_req_type x) 57 | { 58 | x &= 0xf; 59 | return (void *)(((uintptr_t)ptr) | x); 60 | } 61 | 62 | static void * 63 | untag(uintptr_t ptr) 64 | { 65 | return (void *)(ptr & ~0xf); 66 | } 67 | 68 | static enum iou_req_type 69 | get_tag(uintptr_t ptr) 70 | { 71 | return (int)(ptr & 0xf); 72 | } 73 | 74 | static struct iou_state *get_iou_state(struct worker_state *state) 75 | { 76 | return state->io_state; 77 | } 78 | 79 | static struct io_uring *get_ring(struct worker_state *state) 80 | { 81 | return &get_iou_state(state)->ring; 82 | } 83 | 84 | static void iou_conn_add_send(struct io_uring *ring, struct worker_connection *conn) 85 | { 86 | struct io_uring_sqe *sqe; 87 | size_t chunk; 88 | void *src; 89 | 90 | chunk = min_t(size_t, conn->write_size, conn->to_send); 91 | src = &patbuf[conn->tot_sent % PATTERN_PERIOD]; 92 | 93 | sqe = io_uring_get_sqe(ring); 94 | io_uring_prep_send(sqe, conn->fd, src, chunk, 0); 95 | io_uring_sqe_set_data(sqe, tag(conn, IOU_REQ_TYPE_SEND)); 96 | } 97 | 98 | static void iou_conn_add_sendzc(struct io_uring *ring, struct worker_connection *conn) 99 | { 100 | struct io_uring_sqe *sqe; 101 | size_t chunk; 102 | void *src; 103 | 104 | chunk = min_t(size_t, conn->write_size, conn->to_send); 105 | src = &patbuf[conn->tot_sent % PATTERN_PERIOD]; 106 | 107 | sqe = io_uring_get_sqe(ring); 108 | io_uring_prep_send_zc_fixed(sqe, conn->fd, src, chunk, 0, 0, 0); 109 | io_uring_sqe_set_data(sqe, tag(conn, IOU_REQ_TYPE_SENDZC)); 110 | } 111 | 112 | static void iou_handle_send(struct worker_state *self, struct io_uring_cqe *cqe) 113 | { 114 | struct worker_connection *conn; 115 | ssize_t n; 116 | 117 | if (self->ended) 118 | return; 119 | 120 | conn = untag(cqe->user_data); 121 | n = cqe->res; 122 | if (n <= 0) { 123 | warnx("Send failed"); 124 | worker_kill_conn(self, conn); 125 | return; 126 | } 127 | 128 | conn->to_send -= n; 129 | 
conn->tot_sent += n; 130 | 131 | if (!conn->to_send) 132 | worker_send_finished(self, conn); 133 | else 134 | iou_conn_add_send(get_ring(self), conn); 135 | } 136 | 137 | static void iou_handle_sendzc(struct worker_state *self, struct io_uring_cqe *cqe) 138 | { 139 | struct worker_connection *conn; 140 | ssize_t n; 141 | 142 | if (self->ended) 143 | return; 144 | 145 | conn = untag(cqe->user_data); 146 | if (cqe->flags & IORING_CQE_F_NOTIF) { 147 | if (cqe->flags & IORING_CQE_F_MORE) { 148 | warnx("Notification completion has F_MORE set"); 149 | worker_kill_conn(self, conn); 150 | } 151 | return; 152 | } 153 | 154 | n = cqe->res; 155 | if (n <= 0) { 156 | warnx("Send failed"); 157 | worker_kill_conn(self, conn); 158 | return; 159 | } 160 | 161 | conn->to_send -= n; 162 | conn->tot_sent += n; 163 | 164 | if (!conn->to_send) 165 | worker_send_finished(self, conn); 166 | else 167 | iou_conn_add_sendzc(get_ring(self), conn); 168 | } 169 | 170 | static void iou_conn_add_recv(struct io_uring *ring, struct worker_connection *conn) 171 | { 172 | struct io_uring_sqe *sqe; 173 | 174 | sqe = io_uring_get_sqe(ring); 175 | io_uring_prep_recv(sqe, conn->fd, conn->rxbuf, conn->read_size, 0); 176 | io_uring_sqe_set_data(sqe, tag(conn, IOU_REQ_TYPE_RECV)); 177 | } 178 | 179 | static void iou_conn_add_recvzc(struct io_uring *ring, struct worker_connection *conn, __u32 id) 180 | { 181 | struct io_uring_sqe *sqe; 182 | 183 | sqe = io_uring_get_sqe(ring); 184 | io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, conn->fd, NULL, 0, 0); 185 | sqe->ioprio |= IORING_RECV_MULTISHOT; 186 | sqe->zcrx_ifq_idx = id; 187 | io_uring_sqe_set_data(sqe, tag(conn, IOU_REQ_TYPE_RECVZC)); 188 | } 189 | 190 | static void iou_handle_recv(struct worker_state *self, struct io_uring_cqe *cqe) 191 | { 192 | struct io_uring *ring = get_ring(self); 193 | struct worker_connection *conn; 194 | ssize_t n; 195 | void *src; 196 | 197 | if (self->ended) 198 | return; 199 | 200 | conn = untag(cqe->user_data); 201 | n = cqe->res; 202 | if (n <= 0) { 203 | warnx("Recv failed: %ld, to_recv: %llu", n, conn->to_recv); 204 | worker_kill_conn(self, conn); 205 | return; 206 | } 207 | 208 | src = &patbuf[conn->tot_recv % PATTERN_PERIOD]; 209 | if (self->opts.validate && memcmp(conn->rxbuf, src, n)) 210 | warnx("Data corruption %d %d %ld %lld %lld", 211 | *conn->rxbuf, *(char *)src, n, 212 | conn->tot_recv % PATTERN_PERIOD, 213 | conn->tot_recv); 214 | 215 | conn->to_recv -= n; 216 | conn->tot_recv += n; 217 | 218 | if (!conn->to_recv) { 219 | worker_recv_finished(self, conn); 220 | if (conn->to_send) 221 | iou_conn_add_send(ring, conn); 222 | } 223 | 224 | iou_conn_add_recv(ring, conn); 225 | } 226 | 227 | static void iou_handle_recvzc(struct worker_state *self, struct io_uring_cqe *cqe) 228 | { 229 | struct iou_state *state = get_iou_state(self); 230 | struct io_uring *ring = get_ring(self); 231 | struct io_uring_zcrx_rq *rq_ring; 232 | struct io_uring_zcrx_cqe* rcqe; 233 | struct worker_connection *conn; 234 | struct io_uring_zcrx_rqe *rqe; 235 | unsigned char *data; 236 | __u64 mask; 237 | ssize_t n; 238 | void *src; 239 | 240 | if (self->ended) 241 | return; 242 | 243 | conn = untag(cqe->user_data); 244 | n = cqe->res; 245 | if (!(cqe->flags & IORING_CQE_F_MORE)) { 246 | if (conn->to_recv) 247 | warn("Recvzc ended early"); 248 | if (n != 0) 249 | warn("Recvzc final completion invalid res: %ld", n); 250 | worker_kill_conn(self, conn); 251 | return; 252 | } 253 | 254 | if (n <= 0) { 255 | warnx("Recv failed: %ld, to_recv: %llu", n, conn->to_recv); 256 | 
worker_kill_conn(self, conn); 257 | return; 258 | } 259 | 260 | rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); 261 | mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1; 262 | data = (unsigned char *)state->area_ptr + (rcqe->off & mask); 263 | 264 | src = &patbuf[conn->tot_recv % PATTERN_PERIOD]; 265 | if (self->opts.validate && memcmp(data, src, n)) 266 | warnx("Data corruption %d %d %ld %lld %lld", 267 | *data, *(char *)src, n, 268 | conn->tot_recv % PATTERN_PERIOD, 269 | conn->tot_recv); 270 | 271 | conn->to_recv -= n; 272 | conn->tot_recv += n; 273 | 274 | if (!conn->to_recv) { 275 | worker_recv_finished(self, conn); 276 | if (conn->to_send) 277 | iou_conn_add_send(ring, conn); 278 | } 279 | 280 | rq_ring = &state->rq; 281 | rqe = &rq_ring->rqes[rq_ring->rq_tail & state->rq_mask]; 282 | rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | state->area_token; 283 | rqe->len = cqe->res; 284 | io_uring_smp_store_release(rq_ring->ktail, ++rq_ring->rq_tail); 285 | } 286 | 287 | static size_t get_rq_ring_size(unsigned int entries) 288 | { 289 | size_t size; 290 | 291 | size = entries * sizeof(struct io_uring_zcrx_rqe); 292 | /* add space for the header (head/tail/etc.) */ 293 | size += page_size; 294 | 295 | return ALIGN_UP(size, page_size); 296 | } 297 | 298 | static int iou_register_zerocopy_rx(struct worker_state *self) 299 | { 300 | struct iou_state *state = get_iou_state(self); 301 | unsigned int ring_entries; 302 | size_t area_size; 303 | size_t ring_size; 304 | void *area_ptr; 305 | void *ring_ptr; 306 | int ret; 307 | 308 | area_size = self->opts.iou.rx_size_mb * 1024 * 1024; 309 | /* arbitrary ring size chosen based on rx_size_mb */ 310 | ring_entries = (area_size / (page_size * 2)); 311 | ring_size = get_rq_ring_size(ring_entries); 312 | 313 | area_ptr = mmap(NULL, 314 | area_size + ring_size, 315 | PROT_READ | PROT_WRITE, 316 | MAP_ANONYMOUS | MAP_PRIVATE, 317 | -1, 318 | 0 319 | ); 320 | if (area_ptr == MAP_FAILED) { 321 | warn("Failed to mmap zero copy receive memory"); 322 | return -1; 323 | } 324 | struct io_uring_zcrx_area_reg area_reg = { 325 | .addr = (__u64)(unsigned long)area_ptr, 326 | .len = area_size, 327 | .flags = 0, 328 | }; 329 | 330 | ring_ptr = (char *)area_ptr + area_size; 331 | struct io_uring_region_desc region_reg = { 332 | .user_addr = (__u64)(unsigned long)ring_ptr, 333 | .size = ring_size, 334 | .flags = IORING_MEM_REGION_TYPE_USER, 335 | }; 336 | 337 | struct io_uring_zcrx_ifq_reg reg = { 338 | .if_idx = self->opts.iou.ifindex, 339 | .if_rxq = self->opts.iou.queue_id, 340 | .rq_entries = ring_entries, 341 | .area_ptr = (__u64)(unsigned long)&area_reg, 342 | .region_ptr = (__u64)(unsigned long)®ion_reg, 343 | }; 344 | 345 | ret = io_uring_register_ifq(&state->ring, ®); 346 | if (ret) { 347 | warn("io_uring_register_ifq failed: %d", ret); 348 | munmap(area_ptr, area_size + ring_size); 349 | return ret; 350 | } 351 | 352 | state->rq.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head); 353 | state->rq.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail); 354 | state->rq.rqes = (struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes); 355 | state->rq.rq_tail = 0; 356 | state->rq.ring_entries = reg.rq_entries; 357 | 358 | state->area_token = area_reg.rq_area_token; 359 | state->rq_mask = reg.rq_entries - 1; 360 | state->zcrx_id = reg.zcrx_id; 361 | 362 | state->area_ptr = area_ptr; 363 | state->rq_ptr = ring_ptr; 364 | state->area_size = area_size; 365 | state->rq_size = ring_size; 366 | 367 | return 0; 368 | } 369 | 370 | static int 
iou_register_zerocopy_tx(struct worker_state *self) 371 | { 372 | struct iou_state *state = get_iou_state(self); 373 | struct iovec iov; 374 | 375 | iov.iov_base = patbuf; 376 | iov.iov_len = KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1; 377 | 378 | return io_uring_register_buffers(&state->ring, &iov, 1); 379 | } 380 | 381 | static void iou_prep(struct worker_state *self) 382 | { 383 | struct iou_kpm_msg_state *msg; 384 | struct io_uring_params p = {}; 385 | struct io_uring_sqe *sqe; 386 | struct iou_state *state; 387 | int ret; 388 | 389 | state = malloc(sizeof(*state)); 390 | if (!state) 391 | err(4, "Failed to malloc iou_state"); 392 | memset(state, 0, sizeof(*state)); 393 | self->io_state = state; 394 | 395 | p.flags |= IORING_SETUP_COOP_TASKRUN; 396 | p.flags |= IORING_SETUP_CQSIZE; 397 | p.flags |= IORING_SETUP_DEFER_TASKRUN; 398 | p.flags |= IORING_SETUP_SINGLE_ISSUER; 399 | p.flags |= IORING_SETUP_SUBMIT_ALL; 400 | if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 401 | p.flags |= IORING_SETUP_CQE32; 402 | p.cq_entries = 512; 403 | 404 | ret = io_uring_queue_init_params(64, &state->ring, &p); 405 | if (ret) 406 | err(5, "Failed to create io_uring"); 407 | 408 | msg = malloc(sizeof(*msg)); 409 | if (!msg) { 410 | free(state); 411 | err(6, "Failed to malloc iou_kpm_msg_state"); 412 | } 413 | 414 | if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 415 | if (iou_register_zerocopy_rx(self)) 416 | err(7, "Failed to register zero copy rx"); 417 | 418 | if (self->opts.tx_mode == KPM_TX_MODE_SOCKET_ZEROCOPY) 419 | if (iou_register_zerocopy_tx(self)) 420 | err(8, "Failed to register zero copy tx"); 421 | 422 | sqe = io_uring_get_sqe(&state->ring); 423 | io_uring_prep_recv(sqe, self->main_sock, &msg->hdr, sizeof(msg->hdr), MSG_PEEK | MSG_WAITALL); 424 | io_uring_sqe_set_data(sqe, tag(msg, IOU_REQ_TYPE_PROTO_HDR)); 425 | } 426 | 427 | static void iou_handle_proto_hdr(struct worker_state *self, struct io_uring_cqe *cqe) 428 | { 429 | struct io_uring *ring = get_ring(self); 430 | struct iou_kpm_msg_state *msg; 431 | struct io_uring_sqe *sqe; 432 | ssize_t n = cqe->res; 433 | 434 | msg = untag(cqe->user_data); 435 | if (n < (int)sizeof(msg->hdr)) { 436 | if (n) 437 | warn("Failed to receive header (%zd)", n); 438 | goto err; 439 | } 440 | if (msg->hdr.len < sizeof(msg->hdr)) { 441 | warnx("Invalid header length (%d)", msg->hdr.len); 442 | goto err; 443 | } 444 | 445 | msg->msg = malloc(msg->hdr.len); 446 | if (!msg->msg) { 447 | warnx("Failed to malloc msg"); 448 | goto err; 449 | } 450 | 451 | msg->off = 0; 452 | sqe = io_uring_get_sqe(ring); 453 | io_uring_prep_recv(sqe, self->main_sock, msg->msg + msg->off, msg->hdr.len, 0); 454 | io_uring_sqe_set_data(sqe, tag(msg, IOU_REQ_TYPE_PROTO_PLD)); 455 | 456 | return; 457 | 458 | err: 459 | __kpm_dbg("<<", "ctrl recv failed"); 460 | self->quit = 1; 461 | free(msg); 462 | return; 463 | } 464 | 465 | static void iou_handle_proto_pld(struct worker_state *self, struct io_uring_cqe *cqe) 466 | { 467 | struct io_uring *ring = get_ring(self); 468 | struct iou_kpm_msg_state *msg; 469 | struct io_uring_sqe *sqe; 470 | ssize_t n = cqe->res; 471 | 472 | msg = untag(cqe->user_data); 473 | if (n > msg->hdr.len) { 474 | warnx("Oversized recv"); 475 | goto err; 476 | } else if (n <= 0) { 477 | warnx("Short recv"); 478 | goto err; 479 | } 480 | 481 | msg->off += n; 482 | msg->hdr.len -= n; 483 | 484 | if (msg->hdr.len) { 485 | sqe = io_uring_get_sqe(ring); 486 | io_uring_prep_recv(sqe, self->main_sock, msg->msg + msg->off, msg->hdr.len, 0); 487 | 
io_uring_sqe_set_data(sqe, tag(msg, IOU_REQ_TYPE_PROTO_PLD)); 488 | return; 489 | } 490 | 491 | worker_handle_proto(self, msg->msg); 492 | 493 | free(msg->msg); 494 | memset(msg, 0, sizeof(*msg)); 495 | 496 | sqe = io_uring_get_sqe(ring); 497 | io_uring_prep_recv(sqe, self->main_sock, &msg->hdr, sizeof(msg->hdr), MSG_PEEK | MSG_WAITALL); 498 | io_uring_sqe_set_data(sqe, tag(msg, IOU_REQ_TYPE_PROTO_HDR)); 499 | 500 | return; 501 | err: 502 | __kpm_dbg("<<", "ctrl recv failed"); 503 | self->quit = 1; 504 | free(msg->msg); 505 | free(msg); 506 | return; 507 | } 508 | 509 | static void iou_wait(struct worker_state *self, int msec) 510 | { 511 | struct io_uring *ring = get_ring(self); 512 | struct __kernel_timespec timeout; 513 | struct io_uring_cqe *cqe; 514 | unsigned int count = 0; 515 | unsigned int head; 516 | 517 | timeout.tv_sec = msec / 1000; 518 | timeout.tv_nsec = (msec % 1000) * 1000000; 519 | 520 | io_uring_submit_and_wait_timeout(ring, &cqe, 1, &timeout, NULL); 521 | 522 | io_uring_for_each_cqe(ring, head, cqe) { 523 | switch (get_tag(cqe->user_data)) { 524 | case IOU_REQ_TYPE_PROTO_HDR: 525 | iou_handle_proto_hdr(self, cqe); 526 | break; 527 | case IOU_REQ_TYPE_PROTO_PLD: 528 | iou_handle_proto_pld(self, cqe); 529 | break; 530 | case IOU_REQ_TYPE_SEND: 531 | iou_handle_send(self, cqe); 532 | break; 533 | case IOU_REQ_TYPE_SENDZC: 534 | iou_handle_sendzc(self, cqe); 535 | break; 536 | case IOU_REQ_TYPE_RECV: 537 | iou_handle_recv(self, cqe); 538 | break; 539 | case IOU_REQ_TYPE_RECVZC: 540 | iou_handle_recvzc(self, cqe); 541 | break; 542 | case IOU_REQ_TYPE_CANCEL: 543 | break; 544 | default: 545 | err(1, "Unknown io_uring request type: %d, res: %d", get_tag(cqe->user_data), cqe->res); 546 | } 547 | 548 | count++; 549 | } 550 | io_uring_cq_advance(ring, count); 551 | } 552 | 553 | static void iou_conn_add(struct worker_state *state, struct worker_connection *conn) 554 | { 555 | struct io_uring *ring = get_ring(state); 556 | 557 | if (conn->to_send) { 558 | if (state->opts.tx_mode == KPM_TX_MODE_SOCKET_ZEROCOPY) 559 | iou_conn_add_sendzc(ring, conn); 560 | else 561 | iou_conn_add_send(ring, conn); 562 | } 563 | 564 | if (state->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 565 | iou_conn_add_recvzc(ring, conn, get_iou_state(state)->zcrx_id); 566 | else 567 | iou_conn_add_recv(ring, conn); 568 | } 569 | 570 | static void iou_conn_close(struct worker_state *state, struct worker_connection *conn) 571 | { 572 | struct io_uring *ring = get_ring(state); 573 | struct io_uring_sqe *sqe; 574 | 575 | sqe = io_uring_get_sqe(ring); 576 | io_uring_prep_cancel_fd(sqe, conn->fd, 0); 577 | io_uring_sqe_set_data(sqe, tag(NULL, IOU_REQ_TYPE_CANCEL)); 578 | /* Cancellation is sync. 
A completion is always generated by the time 579 | * submit returns */ 580 | io_uring_submit(ring); 581 | } 582 | 583 | static void iou_exit(struct worker_state *self) 584 | { 585 | struct iou_state *state = get_iou_state(self); 586 | struct io_uring *ring = get_ring(self); 587 | if (state->area_ptr) 588 | munmap(state->area_ptr, state->area_size + state->rq_size); 589 | io_uring_queue_exit(ring); 590 | free(self->io_state); 591 | } 592 | 593 | static const struct io_ops iou_io_ops = { 594 | .prep = iou_prep, 595 | .wait = iou_wait, 596 | .conn_add = iou_conn_add, 597 | .conn_close = iou_conn_close, 598 | .exit = iou_exit, 599 | }; 600 | 601 | void worker_iou_init(struct worker_state *self) 602 | { 603 | self->ops = &iou_io_ops; 604 | page_size = sysconf(_SC_PAGESIZE); 605 | } 606 | 607 | int iou_zerocopy_rx_setup(struct session_state_iou *iou, int fd, 608 | int num_queues) 609 | { 610 | return reserve_queues(fd, num_queues, iou->ifname, &iou->ifindex, 611 | &iou->queue_id, &iou->rss_context); 612 | } 613 | 614 | int iou_zerocopy_rx_teardown(struct session_state_iou *iou) 615 | { 616 | unreserve_queues(iou->ifname, iou->rss_context); 617 | return 0; 618 | } 619 | -------------------------------------------------------------------------------- /server_session.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. and affiliates */ 3 | 4 | #define _GNU_SOURCE 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "proto.h" 28 | #include "proto_dbg.h" 29 | #include "server.h" 30 | #include "devmem.h" 31 | #include "iou.h" 32 | 33 | extern unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1]; 34 | 35 | struct session_state { 36 | int main_sock; 37 | int epollfd; 38 | int quit; 39 | int tcp_sock; 40 | enum kpm_rx_mode rx_mode; 41 | enum kpm_tx_mode tx_mode; 42 | unsigned int connection_ids; 43 | unsigned int worker_ids; 44 | unsigned int test_ids; 45 | struct list_head connections; 46 | struct list_head workers; 47 | struct list_head tests; 48 | struct session_state_devmem devmem; 49 | struct session_state_iou iou_state; 50 | bool validate; 51 | bool iou; 52 | }; 53 | 54 | struct connection { 55 | unsigned int id; 56 | int fd; 57 | int cpu; 58 | int worker_fd; 59 | unsigned int tls_mask; 60 | struct list_node connections; 61 | }; 62 | 63 | struct worker { 64 | unsigned int id; 65 | int fd; 66 | pid_t pid; 67 | int busy; 68 | struct list_node workers; 69 | }; 70 | 71 | struct test { 72 | unsigned int id; 73 | int active; 74 | unsigned int min_worker_id; 75 | unsigned int worker_range; 76 | unsigned int workers_total; 77 | unsigned int workers_done; 78 | struct kpm_test *req, **fwd; 79 | struct kpm_test_results **results; 80 | struct list_node tests; 81 | }; 82 | 83 | static struct connection * 84 | session_find_connection_by_id(struct session_state *self, unsigned int id) 85 | { 86 | struct connection *conn; 87 | 88 | list_for_each(&self->connections, conn, connections) { 89 | if (conn->id == id) 90 | return conn; 91 | } 92 | return NULL; 93 | } 94 | 95 | static struct worker * 96 | session_find_worker_by_id(struct session_state *self, unsigned int id) 97 | { 98 | struct worker *wrk; 99 | 100 | list_for_each(&self->workers, wrk, workers) { 101 | if 
(wrk->id == id) 102 | return wrk; 103 | } 104 | return NULL; 105 | } 106 | 107 | static struct test * 108 | session_find_test_by_id(struct session_state *self, unsigned int id) 109 | { 110 | struct test *test; 111 | 112 | list_for_each(&self->tests, test, tests) { 113 | if (test->id == id) 114 | return test; 115 | } 116 | return NULL; 117 | } 118 | 119 | static void session_new_conn(struct session_state *self, int fd) 120 | { 121 | struct connection *conn; 122 | socklen_t len; 123 | 124 | conn = malloc(sizeof(*conn)); 125 | if (!conn) 126 | goto err_close; 127 | memset(conn, 0, sizeof(*conn)); 128 | 129 | conn->id = ++self->connection_ids; 130 | conn->fd = fd; 131 | 132 | len = sizeof(conn->cpu); 133 | if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &conn->cpu, &len) < 0) { 134 | warn("Failed to read CPU for socket"); 135 | goto err_free; 136 | } 137 | 138 | if (kpm_send_conn_id(fd, conn->id, conn->cpu) < 0) 139 | goto err_free; 140 | 141 | list_add(&self->connections, &conn->connections); 142 | return; 143 | 144 | err_free: 145 | free(conn); 146 | err_close: 147 | close(fd); 148 | return; 149 | } 150 | 151 | static void 152 | server_msg_tcp_acceptor(struct session_state *self, struct kpm_header *req) 153 | { 154 | struct epoll_event ev = {}; 155 | struct sockaddr_in6 addr; 156 | socklen_t len; 157 | int ret; 158 | 159 | if (self->tcp_sock) { 160 | kpm_reply_error(self->main_sock, req, EBUSY); 161 | return; 162 | } 163 | 164 | len = sizeof(addr); 165 | if (getsockname(self->main_sock, (void *)&addr, &len)) { 166 | warn("Failed to get sock type for main sock"); 167 | self->quit = 1; 168 | return; 169 | } 170 | addr.sin6_port = 0; 171 | 172 | self->tcp_sock = socket(addr.sin6_family, SOCK_STREAM, 0); 173 | if (self->tcp_sock < 0) { 174 | warn("Failed to open socket"); 175 | self->quit = 1; 176 | return; 177 | } 178 | 179 | ret = bind(self->tcp_sock, (void *)&addr, sizeof(addr)); 180 | if (ret < 0) { 181 | warn("Failed to bind socket"); 182 | self->quit = 1; 183 | return; 184 | } 185 | 186 | ret = listen(self->tcp_sock, 10); 187 | if (ret < 0) { 188 | warn("Failed to listen on socket"); 189 | self->quit = 1; 190 | return; 191 | } 192 | 193 | len = sizeof(addr); 194 | if (getsockname(self->tcp_sock, (void *)&addr, &len)) { 195 | warn("Failed to get sock type for main sock"); 196 | self->quit = 1; 197 | return; 198 | } 199 | 200 | ev.events = EPOLLIN | EPOLLET; 201 | ev.data.fd = self->tcp_sock; 202 | if (epoll_ctl(self->epollfd, EPOLL_CTL_ADD, self->tcp_sock, &ev) < 0) { 203 | warn("Failed to add tcp sock to epoll"); 204 | self->quit = 1; 205 | return; 206 | } 207 | 208 | if (kpm_reply_acceptor(self->main_sock, req, &addr, len) < 1) { 209 | warn("Failed reply in %s", __func__); 210 | self->quit = 1; 211 | return; 212 | } 213 | } 214 | 215 | static void 216 | server_msg_connect(struct session_state *self, struct kpm_header *hdr) 217 | { 218 | unsigned short local_port, remote_port; 219 | struct kpm_connection_id *id; 220 | struct sockaddr_in6 addr; 221 | struct kpm_connect *req; 222 | struct connection *conn; 223 | socklen_t len; 224 | int ret, cfd; 225 | 226 | if (hdr->len < sizeof(struct kpm_connect)) { 227 | warn("Invalid request in %s", __func__); 228 | self->quit = 1; 229 | return; 230 | } 231 | req = (void *)hdr; 232 | 233 | conn = malloc(sizeof(*conn)); 234 | if (!conn) { 235 | self->quit = 1; 236 | return; 237 | } 238 | memset(conn, 0, sizeof(*conn)); 239 | 240 | cfd = socket(req->addr.sin6_family, SOCK_STREAM, 0); 241 | if (cfd < 0) { 242 | warn("Failed to open socket"); 243 | goto 
err_free; 244 | } 245 | 246 | if (req->mss && 247 | setsockopt(cfd, IPPROTO_TCP, TCP_MAXSEG, 248 | (void *)&req->mss, sizeof(req->mss))) { 249 | warn("Setting mss failed"); 250 | goto err_close; 251 | } 252 | 253 | if (self->tx_mode == KPM_TX_MODE_DEVMEM && 254 | devmem_bind_socket(&self->devmem, cfd) < 0) 255 | goto err_close; 256 | 257 | ret = connect(cfd, (void *)&req->addr, req->len); 258 | if (ret < 0) { 259 | warn("Failed to connect"); 260 | goto err_close; 261 | } 262 | 263 | id = kpm_receive(cfd); 264 | if (!id) { 265 | warnx("No connection ID"); 266 | goto err_close; 267 | } 268 | 269 | if (id->hdr.type != KPM_MSG_TYPE_CONNECTION_ID || 270 | id->hdr.len != sizeof(*id)) { 271 | warnx("Invalid connection ID %d %d", id->hdr.type, id->hdr.len); 272 | goto err_free_id; 273 | } 274 | 275 | conn->id = ++self->connection_ids; 276 | conn->fd = cfd; 277 | 278 | len = sizeof(conn->cpu); 279 | if (getsockopt(cfd, SOL_SOCKET, SO_INCOMING_CPU, &conn->cpu, &len) < 0) { 280 | warn("Failed to read CPU for socket"); 281 | goto err_free_id; 282 | } 283 | 284 | len = sizeof(addr); 285 | if (getsockname(cfd, &addr, &len)) { 286 | warn("Failed to read address of socket"); 287 | goto err_free_id; 288 | } 289 | local_port = ntohs(addr.sin6_port); 290 | 291 | len = sizeof(addr); 292 | if (getpeername(cfd, &addr, &len)) { 293 | warn("Failed to read address of socket"); 294 | goto err_free_id; 295 | } 296 | remote_port = ntohs(addr.sin6_port); 297 | 298 | if (kpm_reply_connect(self->main_sock, hdr, 299 | conn->id, conn->cpu, local_port, 300 | id->id, id->cpu, remote_port) < 1) { 301 | warn("Failed to reply"); 302 | goto err_free_id; 303 | } 304 | 305 | list_add(&self->connections, &conn->connections); 306 | free(id); 307 | 308 | return; 309 | 310 | err_free_id: 311 | free(id); 312 | err_close: 313 | close(cfd); 314 | err_free: 315 | free(conn); 316 | self->quit = 1; 317 | return; 318 | } 319 | 320 | static void 321 | server_msg_disconnect(struct session_state *self, struct kpm_header *hdr) 322 | { 323 | struct __kpm_generic_u32 *req; 324 | struct connection *conn; 325 | 326 | if (hdr->len < sizeof(*req)) { 327 | warn("Invalid request in %s", __func__); 328 | goto err_quit; 329 | } 330 | req = (void *)hdr; 331 | 332 | conn = session_find_connection_by_id(self, req->val); 333 | if (!conn) { 334 | warnx("connection not found"); 335 | kpm_reply_error(self->main_sock, hdr, ENOENT); 336 | goto err_quit; 337 | } 338 | 339 | kpm_trace("close %d", conn->fd); 340 | close(conn->fd); 341 | list_del(&conn->connections); 342 | free(conn); 343 | 344 | if (kpm_reply_empty(self->main_sock, hdr) < 1) { 345 | warnx("Reply failed"); 346 | goto err_quit; 347 | } 348 | 349 | return; 350 | 351 | err_quit: 352 | self->quit = 1; 353 | } 354 | 355 | static void 356 | server_msg_tls(struct session_state *self, struct kpm_header *hdr) 357 | { 358 | struct connection *conn; 359 | struct kpm_tls *req; 360 | int one = 1; 361 | 362 | if (hdr->len < sizeof(*req)) { 363 | warn("Invalid request in %s", __func__); 364 | goto err_quit; 365 | } 366 | req = (void *)hdr; 367 | 368 | if (req->dir_mask & ~(KPM_TLS_ULP | KPM_TLS_TX | KPM_TLS_RX | 369 | KPM_TLS_NOPAD)) { 370 | warnx("unknown TLS flag"); 371 | kpm_reply_error(self->main_sock, hdr, EINVAL); 372 | goto err_quit; 373 | } 374 | 375 | conn = session_find_connection_by_id(self, req->connection_id); 376 | if (!conn) { 377 | warnx("connection not found"); 378 | kpm_reply_error(self->main_sock, hdr, ENOENT); 379 | goto err_quit; 380 | } 381 | 382 | if (conn->tls_mask & req->dir_mask) { 383 | 
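/* Each TLS direction bit (ULP/TX/RX/NOPAD) may only be armed once per
 * connection; a repeated bit in dir_mask is rejected with EBUSY. */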
warnx("TLS already set");
384 | kpm_reply_error(self->main_sock, hdr, EBUSY);
385 | goto err_quit;
386 | }
387 | 
388 | if (!((conn->tls_mask | req->dir_mask) & KPM_TLS_ULP)) {
389 | warnx("TLS ULP not requested");
390 | kpm_reply_error(self->main_sock, hdr, EINVAL);
391 | goto err_quit;
392 | }
393 | 
394 | if ((req->dir_mask & KPM_TLS_ULP) &&
395 | setsockopt(conn->fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"))) {
396 | warn("TLS ULP setup failed");
397 | goto err_repl_errno;
398 | }
399 | 
400 | if ((req->dir_mask & KPM_TLS_TX) &&
401 | setsockopt(conn->fd, SOL_TLS, TLS_TX,
402 | (void *)&req->info, req->len)) {
403 | warn("TLS Tx setup failed");
404 | goto err_repl_errno;
405 | }
406 | 
407 | if ((req->dir_mask & KPM_TLS_RX) &&
408 | setsockopt(conn->fd, SOL_TLS, TLS_RX,
409 | (void *)&req->info, req->len)) {
410 | warn("TLS Rx setup failed");
411 | goto err_repl_errno;
412 | }
413 | 
414 | if ((req->dir_mask & KPM_TLS_NOPAD) &&
415 | setsockopt(conn->fd, SOL_TLS, TLS_RX_EXPECT_NO_PAD,
416 | (void *)&one, sizeof(one))) {
417 | warn("TLS nopad setup failed");
418 | goto err_repl_errno;
419 | }
420 | 
421 | conn->tls_mask |= req->dir_mask;
422 | 
423 | if (kpm_reply_empty(self->main_sock, hdr) < 1) {
424 | warnx("Reply failed");
425 | goto err_quit;
426 | }
427 | 
428 | return;
429 | 
430 | err_repl_errno:
431 | kpm_reply_error(self->main_sock, hdr, errno);
432 | err_quit:
433 | self->quit = 1;
434 | }
435 | 
436 | static void
437 | server_msg_max_pacing(struct session_state *self, struct kpm_header *hdr)
438 | {
439 | struct kpm_max_pacing *req;
440 | struct connection *conn;
441 | 
442 | if (hdr->len < sizeof(*req)) {
443 | warn("Invalid request in %s", __func__);
444 | goto err_quit;
445 | }
446 | req = (void *)hdr;
447 | 
448 | conn = session_find_connection_by_id(self, req->id);
449 | if (!conn) {
450 | warnx("connection not found");
451 | kpm_reply_error(self->main_sock, hdr, ENOENT);
452 | goto err_quit;
453 | }
454 | 
455 | if (setsockopt(conn->fd, SOL_SOCKET, SO_MAX_PACING_RATE,
456 | &req->max_pacing, sizeof(req->max_pacing))) {
457 | warn("setting pacing rate failed");
458 | goto err_repl_errno;
459 | }
460 | 
461 | if (kpm_reply_empty(self->main_sock, hdr) < 1) {
462 | warnx("Reply failed");
463 | goto err_quit;
464 | }
465 | 
466 | return;
467 | 
468 | err_repl_errno:
469 | kpm_reply_error(self->main_sock, hdr, errno);
470 | err_quit:
471 | self->quit = 1;
472 | }
473 | 
474 | static void
475 | server_msg_tcp_cc(struct session_state *self, struct kpm_header *hdr)
476 | {
477 | struct connection *conn;
478 | struct kpm_tcp_cc *req;
479 | 
480 | if (hdr->len < sizeof(*req)) {
481 | warn("Invalid request in %s", __func__);
482 | goto err_quit;
483 | }
484 | req = (void *)hdr;
485 | 
486 | conn = session_find_connection_by_id(self, req->id);
487 | if (!conn) {
488 | warnx("connection not found");
489 | kpm_reply_error(self->main_sock, hdr, ENOENT);
490 | goto err_quit;
491 | }
492 | 
493 | if (setsockopt(conn->fd, IPPROTO_TCP, TCP_CONGESTION, &req->cc_name,
494 | strnlen(req->cc_name, sizeof(req->cc_name)))) {
495 | warn("setting TCP cong control failed");
496 | goto err_repl_errno;
497 | }
498 | 
499 | if (kpm_reply_empty(self->main_sock, hdr) < 1) {
500 | warnx("Reply failed");
501 | goto err_quit;
502 | }
503 | 
504 | return;
505 | 
506 | err_repl_errno:
507 | kpm_reply_error(self->main_sock, hdr, errno);
508 | err_quit:
509 | self->quit = 1;
510 | }
511 | 
512 | static void
513 | server_msg_mode(struct session_state *self, struct kpm_header *hdr)
514 | {
515 | struct kpm_mode *req;
516 | int ret;
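/* The role decides the setup: a session with a listening tcp_sock is the
 * receiver and configures RX (devmem or io_uring zero-copy receive); a
 * session without one is the sender and configures devmem TX. */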
517 | 518 | if (hdr->len < sizeof(*req)) { 519 | warn("Invalid request in %s", __func__); 520 | goto err_quit; 521 | } 522 | req = (void *)hdr; 523 | 524 | if (self->tcp_sock && req->rx_mode == KPM_RX_MODE_DEVMEM) { 525 | ret = devmem_setup(&self->devmem, self->tcp_sock, req->dmabuf_rx_size_mb, 526 | req->num_rx_queues, req->rx_provider, 527 | &req->dev); 528 | if (ret < 0) { 529 | warnx("Failed to setup devmem"); 530 | self->quit = 1; 531 | return; 532 | } 533 | } 534 | if (self->tcp_sock && req->iou && req->rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) { 535 | ret = iou_zerocopy_rx_setup(&self->iou_state, self->tcp_sock, req->num_rx_queues); 536 | if (ret < 0) { 537 | warnx("Failed to setup io_uring zero copy receive"); 538 | self->quit = 1; 539 | return; 540 | } 541 | } 542 | 543 | self->rx_mode = req->rx_mode; 544 | self->tx_mode = req->tx_mode; 545 | self->validate = req->validate; 546 | self->iou = req->iou; 547 | self->iou_state.rx_size_mb = req->iou_rx_size_mb; 548 | 549 | if (!self->tcp_sock && (req->tx_mode == KPM_TX_MODE_DEVMEM)) { 550 | ret = devmem_setup_tx(&self->devmem, req->tx_provider, req->dmabuf_tx_size_mb, 551 | &req->dev, &req->addr); 552 | if (ret < 0) { 553 | warnx("Failed to setup devmem tx"); 554 | self->quit = 1; 555 | return; 556 | } 557 | } 558 | 559 | if (kpm_reply_empty(self->main_sock, hdr) < 1) { 560 | warnx("Reply failed"); 561 | goto err_quit; 562 | } 563 | 564 | return; 565 | 566 | err_quit: 567 | self->quit = 1; 568 | } 569 | 570 | static void 571 | server_msg_spawn_worker(struct session_state *self, struct kpm_header *hdr) 572 | { 573 | struct worker_opts *opts = NULL; 574 | struct worker *wrk = NULL; 575 | struct epoll_event ev = {}; 576 | int p[2], dmabuf_id; 577 | pthread_attr_t attr; 578 | pthread_t thread; 579 | 580 | wrk = malloc(sizeof(*wrk)); 581 | if (!wrk) { 582 | self->quit = 1; 583 | return; 584 | } 585 | memset(wrk, 0, sizeof(*wrk)); 586 | 587 | if (socketpair(AF_LOCAL, SOCK_STREAM, 0, p) < 0) { 588 | warnx("Failed to create socket pair"); 589 | goto err_free; 590 | } 591 | 592 | if (pthread_attr_init(&attr)) { 593 | warnx("Failed to init pthread attr"); 594 | goto err_free; 595 | } 596 | if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) { 597 | warnx("Failed to set pthread attr"); 598 | goto err_free_attr; 599 | } 600 | dmabuf_id = self->devmem.tx_mem ? 
self->devmem.tx_mem->dmabuf_id : -1; 601 | opts = malloc(sizeof(*opts)); 602 | if (!opts) 603 | goto err_free_attr; 604 | memset(opts, 0, sizeof(*opts)); 605 | opts->fd = p[1]; 606 | opts->rx_mode = self->rx_mode; 607 | opts->tx_mode = self->tx_mode; 608 | opts->validate = self->validate; 609 | opts->use_iou = self->iou; 610 | opts->devmem.mem = self->devmem.mem; 611 | opts->devmem.dmabuf_id = dmabuf_id; 612 | opts->iou.rx_size_mb = self->iou_state.rx_size_mb; 613 | opts->iou.ifindex = self->iou_state.ifindex; 614 | opts->iou.queue_id = self->iou_state.queue_id; 615 | if (pthread_create(&thread, &attr, worker_main, opts) != 0) { 616 | warnx("Failed to create worker thread"); 617 | free(opts); 618 | goto err_free_attr; 619 | } 620 | 621 | self->iou_state.queue_id++; 622 | wrk->id = ++self->worker_ids; 623 | wrk->fd = p[0]; 624 | 625 | ev.events = EPOLLIN | EPOLLET; 626 | ev.data.fd = wrk->fd; 627 | if (epoll_ctl(self->epollfd, EPOLL_CTL_ADD, wrk->fd, &ev) < 0) { 628 | warnx("Failed to add worker sock to epoll"); 629 | goto err_worker_kill; 630 | } 631 | 632 | kpm_send_u32(wrk->fd, KPM_MSG_WORKER_ID, wrk->id); 633 | 634 | if (kpm_reply_u32(self->main_sock, hdr, wrk->id) < 1) 635 | goto err_worker_kill; 636 | 637 | list_add(&self->workers, &wrk->workers); 638 | pthread_attr_destroy(&attr); 639 | 640 | return; 641 | 642 | err_worker_kill: 643 | kpm_send_empty(wrk->fd, KPM_MSG_WORKER_KILL); 644 | err_free_attr: 645 | pthread_attr_destroy(&attr); 646 | err_free: 647 | free(wrk); 648 | self->quit = 1; 649 | } 650 | 651 | static void 652 | server_msg_pin_worker(struct session_state *self, struct kpm_header *hdr) 653 | { 654 | struct kpm_pin_worker *req; 655 | struct worker *wrk; 656 | cpu_set_t set; 657 | 658 | if (hdr->len < sizeof(struct kpm_pin_worker)) { 659 | warn("Invalid request in %s", __func__); 660 | self->quit = 1; 661 | return; 662 | } 663 | req = (void *)hdr; 664 | 665 | wrk = session_find_worker_by_id(self, req->worker_id); 666 | if (!wrk) { 667 | kpm_reply_error(self->main_sock, hdr, ENOENT); 668 | return; 669 | } 670 | 671 | CPU_ZERO(&set); 672 | if (req->cpu == (unsigned int)-1) { 673 | int i, n; 674 | 675 | n = sysconf(_SC_NPROCESSORS_CONF); 676 | if (n < 0) { 677 | warn("Failed to get CPU count"); 678 | kpm_reply_error(self->main_sock, hdr, errno); 679 | return; 680 | } 681 | 682 | for (i = 0; i < n; i++) 683 | CPU_SET(i, &set); 684 | } else { 685 | CPU_SET(req->cpu, &set); 686 | } 687 | 688 | if (sched_setaffinity(wrk->pid, sizeof(set), &set) < 0) { 689 | warn("Failed to pin worker to CPU"); 690 | kpm_reply_error(self->main_sock, hdr, errno); 691 | return; 692 | } 693 | 694 | if (kpm_reply_empty(self->main_sock, hdr) < 1) { 695 | self->quit = 1; 696 | return; 697 | } 698 | } 699 | 700 | static void 701 | server_msg_test(struct session_state *self, struct kpm_header *hdr) 702 | { 703 | unsigned int i, j, min_wrk, max_wrk; 704 | struct kpm_test *req, **fwd; 705 | unsigned int n_conns; 706 | struct test *test; 707 | 708 | if (hdr->len < sizeof(struct kpm_test)) { 709 | bad_req: 710 | warnx("Invalid request in %s: %d < %zd", 711 | __func__, hdr->len, sizeof(*req)); 712 | self->quit = 1; 713 | return; 714 | } 715 | req = (void *)hdr; 716 | 717 | n_conns = hdr->len - sizeof(struct kpm_test); 718 | if (n_conns % sizeof(struct kpm_test_spec)) 719 | goto bad_req; 720 | 721 | n_conns /= sizeof(struct kpm_test_spec); 722 | if (req->test_id || !req->time_sec || n_conns != req->n_conns) 723 | goto bad_req; 724 | 725 | test = malloc(sizeof(*test)); 726 | memset(test, 0, sizeof(*test)); 727 | 
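/* Fan the flat spec list out into one kpm_test message per worker:
 * specs are bucketed by worker_id relative to min_wrk, each non-empty
 * bucket is sent over that worker's AF_LOCAL socketpair, and the
 * connection fds follow via fdpass_send(). E.g. specs on workers
 * {4, 6} give min_wrk = 4 and worker_range = 3; fwd[1] stays empty
 * (n_conns == 0) and is skipped when forwarding. */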
728 | test->id = ++self->test_ids;
729 | test->active = req->active;
730 | 
731 | min_wrk = -1;
732 | max_wrk = 0;
733 | for (i = 0; i < n_conns; i++) {
734 | min_wrk = min(min_wrk, req->specs[i].worker_id);
735 | max_wrk = max(max_wrk, req->specs[i].worker_id);
736 | }
737 | test->worker_range = max_wrk - min_wrk + 1;
738 | 
739 | fwd = calloc(test->worker_range, sizeof(void *));
740 | for (i = 0; i < test->worker_range; i++)
741 | fwd[i] = calloc(1, hdr->len);
742 | test->results = calloc(test->worker_range, sizeof(*test->results));
743 | 
744 | for (i = 0; i < n_conns; i++) {
745 | struct kpm_test_spec *t = &req->specs[i];
746 | struct connection *conn;
747 | struct worker *wrk;
748 | struct kpm_test *msg;
749 | 
750 | wrk = session_find_worker_by_id(self, t->worker_id);
751 | conn = session_find_connection_by_id(self, t->connection_id);
752 | if (!wrk || !conn) {
753 | warnx("worker or connection not found");
754 | kpm_reply_error(self->main_sock, hdr, ENOENT);
755 | goto err_free;
756 | }
757 | if (wrk->busy) {
758 | warnx("worker is busy");
759 | kpm_reply_error(self->main_sock, hdr, EBUSY);
760 | goto err_free;
761 | }
762 | 
763 | msg = fwd[t->worker_id - min_wrk];
764 | memcpy(&msg->specs[msg->n_conns++], t, sizeof(*t));
765 | }
766 | 
767 | for (i = 0; i < test->worker_range; i++) {
768 | struct connection *conn;
769 | struct worker *wrk;
770 | struct kpm_test *msg;
771 | 
772 | msg = fwd[i];
773 | if (!msg->n_conns)
774 | continue;
775 | msg->active = req->active;
776 | msg->time_sec = req->time_sec;
777 | msg->test_id = test->id;
778 | 
779 | test->workers_total++;
780 | wrk = session_find_worker_by_id(self, msg->specs[0].worker_id);
781 | wrk->busy = 1;
782 | 
783 | kpm_send(wrk->fd, &msg->hdr,
784 | sizeof(*msg) + sizeof(msg->specs[0]) * msg->n_conns,
785 | KPM_MSG_WORKER_TEST);
786 | for (j = 0; j < msg->n_conns; j++) {
787 | conn = session_find_connection_by_id(self, msg->specs[j].connection_id);
788 | fdpass_send(wrk->fd, conn->fd);
789 | }
790 | }
791 | 
792 | test->req = kpm_msg_dup(hdr);
793 | test->fwd = fwd;
794 | test->min_worker_id = min_wrk;
795 | list_add(&self->tests, &test->tests);
796 | kpm_reply_u32(self->main_sock, hdr, test->id);
797 | 
798 | return;
799 | 
800 | err_free:
801 | free(fwd);
802 | self->quit = 1;
803 | return;
804 | }
805 | 
806 | static void
807 | server_msg_end_test(struct session_state *self, struct kpm_header *hdr)
808 | {
809 | struct kpm_end_test *req;
810 | struct test *test;
811 | unsigned int i;
812 | 
813 | if (hdr->len < sizeof(*req)) {
814 | warn("Invalid request in %s", __func__);
815 | self->quit = 1;
816 | return;
817 | }
818 | req = (void *)hdr;
819 | 
820 | test = session_find_test_by_id(self, req->id);
821 | if (!test) {
822 | warnx("Failed to find test");
823 | kpm_reply_error(self->main_sock, hdr, ENOENT);
824 | return;
825 | }
826 | 
827 | if (test->active && test->workers_total != test->workers_done) {
828 | warnx("Early test termination not supported");
829 | kpm_reply_error(self->main_sock, hdr, EBUSY);
830 | return;
831 | }
832 | 
833 | for (i = 0; i < test->worker_range; i++) {
834 | struct worker *wrk;
835 | struct kpm_test *msg;
836 | 
837 | msg = test->fwd[i];
838 | if (!msg->n_conns) {
839 | warnx("no conns on %d", i);
840 | continue;
841 | }
842 | 
843 | kpm_trace("searching for worker %d", msg->specs[0].worker_id);
844 | wrk = session_find_worker_by_id(self, msg->specs[0].worker_id);
845 | wrk->busy = 0;
846 | 
847 | kpm_trace("Sending end test to worker");
848 | kpm_send_u32(wrk->fd, KPM_MSG_WORKER_END_TEST, req->id);
849 | }
850 | 
851 | if (kpm_reply_empty(self->main_sock, hdr) < 1) {
852 | self->quit = 1;
853 | return;
854 | }
855 | }
856 | 
857 | static void session_handle_main_sock(struct session_state *self)
858 | {
859 | struct kpm_header *hdr;
860 | 
861 | hdr = kpm_receive(self->main_sock);
862 | if (!hdr) {
863 | __kpm_dbg("<<", "ctrl recv failed");
864 | self->quit = 1;
865 | return;
866 | }
867 | kpm_cmd_dbg_start(hdr);
868 | 
869 | switch (hdr->type) {
870 | case KPM_MSG_TYPE_OPEN_TCP_ACCEPTOR:
871 | server_msg_tcp_acceptor(self, hdr);
872 | break;
873 | case KPM_MSG_TYPE_CONNECT:
874 | server_msg_connect(self, hdr);
875 | break;
876 | case KPM_MSG_TYPE_DISCONNECT:
877 | server_msg_disconnect(self, hdr);
878 | break;
879 | case KPM_MSG_TYPE_TLS:
880 | server_msg_tls(self, hdr);
881 | break;
882 | case KPM_MSG_TYPE_MAX_PACING:
883 | server_msg_max_pacing(self, hdr);
884 | break;
885 | case KPM_MSG_TYPE_TCP_CC:
886 | server_msg_tcp_cc(self, hdr);
887 | break;
888 | case KPM_MSG_TYPE_MODE:
889 | server_msg_mode(self, hdr);
890 | break;
891 | case KPM_MSG_TYPE_SPAWN_WORKER:
892 | server_msg_spawn_worker(self, hdr);
893 | break;
894 | case KPM_MSG_TYPE_PIN_WORKER:
895 | server_msg_pin_worker(self, hdr);
896 | break;
897 | case KPM_MSG_TYPE_TEST:
898 | server_msg_test(self, hdr);
899 | break;
900 | case KPM_MSG_TYPE_END_TEST:
901 | server_msg_end_test(self, hdr);
902 | break;
903 | default:
904 | warnx("Unknown message type: %d", hdr->type);
905 | self->quit = 1;
906 | break;
907 | }
908 | 
909 | kpm_cmd_dbg_end(hdr);
910 | free(hdr);
911 | }
912 | 
913 | static void
914 | session_results_assemble(struct session_state *self, struct test *test)
915 | {
916 | struct kpm_test_results *reply;
917 | unsigned int i, j;
918 | size_t sz;
919 | 
920 | if (!test->results[0]) {
921 | warnx("First result slot empty!");
922 | return;
923 | }
924 | 
925 | sz = sizeof(*reply) + test->req->n_conns * sizeof(reply->res[0]);
926 | reply = calloc(1, sz);
927 | memcpy(reply, test->results[0], sizeof(*reply));
928 | 
929 | for (i = 0; i < test->req->n_conns; i++) {
930 | struct kpm_test_result *res = NULL;
931 | struct kpm_test_results *rmsg;
932 | __u32 worker_id, conn_id;
933 | 
934 | worker_id = test->req->specs[i].worker_id;
935 | conn_id = test->req->specs[i].connection_id;
936 | rmsg = test->results[worker_id - test->min_worker_id];
937 | if (!rmsg) {
938 | warnx("No results for worker %d", worker_id);
939 | goto out;
940 | }
941 | for (j = 0; j < rmsg->n_conns; j++) {
942 | if (rmsg->res[j].connection_id == conn_id) {
943 | res = &rmsg->res[j];
944 | break;
945 | }
946 | }
947 | if (!res) {
948 | warnx("No results for connection %d", conn_id);
949 | goto out;
950 | }
951 | 
952 | memcpy(&reply->res[i], res, sizeof(*res));
953 | }
954 | 
955 | kpm_dbg("Results sent");
956 | kpm_send(self->main_sock, &reply->hdr, sz, KPM_MSG_TYPE_TEST_RESULT);
957 | 
958 | out:
959 | free(reply);
960 | }
961 | 
962 | static void
963 | session_wmsg_test(struct session_state *self, struct kpm_header *hdr)
964 | {
965 | struct kpm_test_results *msg = (void *)hdr;
966 | __u32 worker_id = msg->res[0].worker_id;
967 | struct test *test;
968 | 
969 | test = session_find_test_by_id(self, msg->test_id);
970 | if (!test) {
971 | warnx("Failed to find test for result");
972 | return;
973 | }
974 | test->workers_done++;
975 | if (test->results[worker_id - test->min_worker_id])
976 | warnx("Results already reported for worker %d", worker_id);
977 | test->results[worker_id - test->min_worker_id] = kpm_msg_dup(&msg->hdr);
978 | kpm_dbg("Results received %d/%d",
979 | test->workers_done, 
test->workers_total); 980 | 981 | if (test->workers_done == test->workers_total) 982 | session_results_assemble(self, test); 983 | } 984 | 985 | static void session_handle_worker(struct session_state *self, int fd) 986 | { 987 | struct kpm_header *hdr; 988 | 989 | hdr = kpm_receive(fd); 990 | if (!hdr) { 991 | warnx("worker recv empty"); 992 | self->quit = 1; 993 | return; 994 | } 995 | __kpm_cmd_dbg_start("worker", hdr); 996 | 997 | switch (hdr->type) { 998 | case KPM_MSG_WORKER_TEST_RESULT: 999 | session_wmsg_test(self, hdr); 1000 | break; 1001 | default: 1002 | warnx("Unknown worker message type: %d", hdr->type); 1003 | self->quit = 1; 1004 | break; 1005 | } 1006 | 1007 | __kpm_cmd_dbg_end("worker", hdr); 1008 | free(hdr); 1009 | } 1010 | 1011 | static void session_handle_accept_sock(struct session_state *self) 1012 | { 1013 | struct sockaddr_in6 sockaddr; 1014 | socklen_t addrlen; 1015 | int cfd; 1016 | 1017 | __kpm_trace(">>", "accept"); 1018 | 1019 | addrlen = sizeof(sockaddr); 1020 | cfd = accept(self->tcp_sock, (void *)&sockaddr, &addrlen); 1021 | if (cfd < 0) 1022 | warn("Failed to accept conn"); 1023 | else 1024 | session_new_conn(self, cfd); 1025 | } 1026 | 1027 | static void server_session_loop(int fd) 1028 | { 1029 | struct session_state self = { .main_sock = fd, }; 1030 | struct epoll_event ev = {}, events[32]; 1031 | struct connection *conn, *next; 1032 | unsigned char j; 1033 | int i; 1034 | 1035 | /* Initialize the data buffer we send/receive, it must match on both 1036 | * ends, this is how we catch data corruption (ekhm kTLS..). 1037 | * 1038 | * We need to do this before initializing TX buffers with the pattern 1039 | * (e.g., devmem). 1040 | */ 1041 | for (i = 0, j = 0; i < (int)ARRAY_SIZE(patbuf); i++, j++) { 1042 | j = j ?: 1; 1043 | patbuf[i] = j; 1044 | } 1045 | 1046 | list_head_init(&self.connections); 1047 | list_head_init(&self.workers); 1048 | list_head_init(&self.tests); 1049 | 1050 | self.epollfd = epoll_create1(0); 1051 | if (self.epollfd < 0) 1052 | err(1, "Failed to create epoll"); 1053 | 1054 | ev.events = EPOLLIN; 1055 | ev.data.fd = fd; 1056 | if (epoll_ctl(self.epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) 1057 | err(2, "Failed to init epoll"); 1058 | 1059 | while (!self.quit) { 1060 | int nfds; 1061 | 1062 | nfds = epoll_wait(self.epollfd, events, ARRAY_SIZE(events), -1); 1063 | if (nfds < 0) 1064 | err(3, "Failed to epoll"); 1065 | 1066 | for (i = 0; i < nfds; i++) { 1067 | struct epoll_event *e = &events[i]; 1068 | 1069 | if (e->data.fd == self.main_sock) 1070 | session_handle_main_sock(&self); 1071 | else if (e->data.fd == self.tcp_sock) 1072 | session_handle_accept_sock(&self); 1073 | else 1074 | session_handle_worker(&self, e->data.fd); 1075 | } 1076 | } 1077 | 1078 | kpm_dbg("exiting!"); 1079 | 1080 | list_for_each_safe(&self.connections, conn, next, connections) { 1081 | close(conn->fd); 1082 | list_del(&conn->connections); 1083 | free(conn); 1084 | } 1085 | if (self.tcp_sock && self.rx_mode == KPM_RX_MODE_DEVMEM) 1086 | devmem_teardown(&self.devmem); 1087 | if (!self.tcp_sock && self.tx_mode == KPM_TX_MODE_DEVMEM) 1088 | devmem_teardown_tx(&self.devmem); 1089 | if (self.tcp_sock && self.iou && self.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 1090 | iou_zerocopy_rx_teardown(&self.iou_state); 1091 | } 1092 | 1093 | static NORETURN void server_session(int fd) 1094 | { 1095 | if (!kpm_xchg_hello(fd, NULL)) 1096 | server_session_loop(fd); 1097 | close(fd); 1098 | exit(0); 1099 | } 1100 | 1101 | struct server_session * 1102 | server_session_spawn(int fd, struct 
sockaddr_in6 *addr, socklen_t *addrlen) 1103 | { 1104 | struct server_session *ses; 1105 | 1106 | if (get_nprocs() > KPERF_MAX_CPUS) { 1107 | warnx("Too many CPUs in the system: %d, proto has max of %d", 1108 | get_nprocs(), KPERF_MAX_CPUS); 1109 | return NULL; 1110 | } 1111 | 1112 | ses = malloc(sizeof(*ses)); 1113 | if (!ses) { 1114 | close(fd); 1115 | return NULL; 1116 | } 1117 | memset(ses, 0, sizeof(*ses)); 1118 | 1119 | ses->pid = fork(); 1120 | if (ses->pid) 1121 | return ses; 1122 | 1123 | free(ses); 1124 | server_session(fd); 1125 | } 1126 | -------------------------------------------------------------------------------- /devmem.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. and affiliates */ 3 | 4 | #define _GNU_SOURCE 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | #include "server.h" 31 | #include "proto_dbg.h" 32 | 33 | #ifdef USE_CUDA 34 | #include 35 | #include 36 | 37 | #ifdef CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE 38 | #define CUDA_FLAGS CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE 39 | #else 40 | #define CUDA_FLAGS 0 41 | #endif 42 | #endif 43 | 44 | extern unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1]; 45 | 46 | static int steering_rule_loc = -1; 47 | 48 | static int ethtool(const char *ifname, void *data) 49 | { 50 | struct ifreq ifr = {}; 51 | int ret; 52 | 53 | strcat(ifr.ifr_ifrn.ifrn_name, ifname); 54 | ifr.ifr_ifru.ifru_data = data; 55 | 56 | int fd = socket(AF_UNIX, SOCK_DGRAM, 0); 57 | if (fd < 0) 58 | return fd; 59 | 60 | ret = ioctl(fd, SIOCETHTOOL, &ifr); 61 | close(fd); 62 | return ret; 63 | } 64 | 65 | static void reset_flow_steering(const char *ifname) 66 | { 67 | struct ethtool_rxnfc del; 68 | 69 | if (steering_rule_loc < 0) 70 | return; 71 | 72 | del.cmd = ETHTOOL_SRXCLSRLDEL; 73 | del.fs.location = steering_rule_loc; 74 | 75 | ethtool(ifname, &del); 76 | 77 | steering_rule_loc = -1; 78 | } 79 | 80 | static int find_free_rule_loc(const char *ifname, int rule_cnt) 81 | { 82 | struct ethtool_rxnfc cnt = {}; 83 | struct ethtool_rxnfc *rules; 84 | int free_loc = 0; 85 | 86 | cnt.cmd = ETHTOOL_GRXCLSRLCNT; 87 | if (ethtool(ifname, &cnt) < 0) 88 | return -1; 89 | 90 | rules = calloc(1, sizeof(*rules) + (cnt.rule_cnt * sizeof(__u32))); 91 | if (!rules) 92 | return -1; 93 | 94 | rules->cmd = ETHTOOL_GRXCLSRLALL; 95 | rules->rule_cnt = cnt.rule_cnt; 96 | if (ethtool(ifname, rules) < 0) 97 | goto free_rules; 98 | 99 | while (true) { 100 | bool used = false; 101 | for (__u32 i = 0; i < rules->rule_cnt; i++) 102 | if ((unsigned int)free_loc == rules->rule_locs[i]) { 103 | used = true; 104 | break; 105 | } 106 | if (!used) 107 | break; 108 | free_loc++; 109 | } 110 | 111 | free(rules); 112 | return free_loc; 113 | 114 | free_rules: 115 | free(rules); 116 | return -1; 117 | } 118 | 119 | static int add_steering_rule(struct sockaddr_in6 *server_sin, 120 | const char *ifname, int rss_context) 121 | { 122 | struct ethtool_rxnfc add = {}; 123 | struct ethtool_rxnfc cnt = {}; 124 | int ret; 125 | 126 | add.cmd = ETHTOOL_SRXCLSRLINS; 127 | add.rss_context = rss_context; 128 | 129 | if (IN6_IS_ADDR_V4MAPPED(&server_sin->sin6_addr)) { 130 | add.fs.flow_type = TCP_V4_FLOW; 131 | 
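/* For a v4-mapped destination the IPv4 address sits in the last 32 bits
 * of sin6_addr (s6_addr32[3]); the rule matches dst IP and dst port
 * exactly, everything else is wildcarded. */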
memcpy(&add.fs.h_u.tcp_ip4_spec.ip4dst, 132 | &server_sin->sin6_addr.s6_addr32[3], 4); 133 | memcpy(&add.fs.h_u.tcp_ip4_spec.pdst, 134 | &server_sin->sin6_port, 2); 135 | 136 | add.fs.m_u.tcp_ip4_spec.ip4dst = 0xffffffff; 137 | add.fs.m_u.tcp_ip4_spec.pdst = 0xffff; 138 | } else { 139 | add.fs.flow_type = TCP_V6_FLOW; 140 | memcpy(add.fs.h_u.tcp_ip6_spec.ip6dst, &server_sin->sin6_addr, 141 | 16); 142 | memcpy(&add.fs.h_u.tcp_ip6_spec.pdst, &server_sin->sin6_port, 143 | 2); 144 | 145 | add.fs.m_u.tcp_ip6_spec.ip6dst[0] = 0xffffffff; 146 | add.fs.m_u.tcp_ip6_spec.ip6dst[1] = 0xffffffff; 147 | add.fs.m_u.tcp_ip6_spec.ip6dst[2] = 0xffffffff; 148 | add.fs.m_u.tcp_ip6_spec.ip6dst[3] = 0xffffffff; 149 | add.fs.m_u.tcp_ip6_spec.pdst = 0xffff; 150 | } 151 | 152 | add.fs.flow_type |= FLOW_RSS; 153 | 154 | cnt.cmd = ETHTOOL_GRXCLSRLCNT; 155 | ret = ethtool(ifname, &cnt); 156 | if (ret) 157 | return ret; 158 | 159 | if (cnt.data & RX_CLS_LOC_SPECIAL) 160 | add.fs.location = RX_CLS_LOC_ANY; 161 | else if (cnt.rule_cnt) { 162 | ret = find_free_rule_loc(ifname, cnt.rule_cnt); 163 | if (ret < 0) { 164 | warnx("Failed to find free steering rule loc"); 165 | return -1; 166 | } 167 | add.fs.location = ret; 168 | } 169 | 170 | ret = ethtool(ifname, &add); 171 | if (ret) 172 | return ret; 173 | 174 | steering_rule_loc = add.fs.location; 175 | 176 | return 0; 177 | } 178 | 179 | static int rss_context_delete(char *ifname, int rss_context) 180 | { 181 | struct ethtool_rxfh set = {}; 182 | 183 | set.cmd = ETHTOOL_SRSSH; 184 | set.rss_context = rss_context; 185 | set.indir_size = 0; 186 | 187 | if (ethtool(ifname, &set) < 0) { 188 | warn("ethtool failed to delete RSS context %u", rss_context); 189 | return -1; 190 | } 191 | 192 | return 0; 193 | } 194 | 195 | static int rss_context_equal(char *ifname, int start_queue, int num_queues, 196 | struct sockaddr_in6 *addr) 197 | { 198 | struct ethtool_rxfh get = {}; 199 | struct ethtool_rxfh *set; 200 | __u32 indir_bytes; 201 | int rss_context; 202 | int queue; 203 | int ret; 204 | 205 | get.cmd = ETHTOOL_GRSSH; 206 | if (ethtool(ifname, &get) < 0) { 207 | warn("ethtool failed to get RSS context"); 208 | return -1; 209 | } 210 | 211 | indir_bytes = get.indir_size * sizeof(get.rss_config[0]); 212 | 213 | set = calloc(1, sizeof(*set) + indir_bytes); 214 | if (!set) { 215 | warn("failed to allocate memory"); 216 | return -1; 217 | } 218 | 219 | set->cmd = ETHTOOL_SRSSH; 220 | set->rss_context = ETH_RXFH_CONTEXT_ALLOC; 221 | set->indir_size = get.indir_size; 222 | 223 | queue = start_queue; 224 | for (__u32 i = 0; i < get.indir_size; i++) { 225 | set->rss_config[i] = queue++; 226 | if (queue >= start_queue + num_queues) 227 | queue = start_queue; 228 | } 229 | 230 | if (ethtool(ifname, set) < 0) { 231 | warn("ethtool failed to create RSS context"); 232 | ret = -1; 233 | goto free_set; 234 | } 235 | 236 | rss_context = set->rss_context; 237 | 238 | if (add_steering_rule(addr, ifname, rss_context) < 0) { 239 | warn("Failed to add rule to RSS context"); 240 | ret = -1; 241 | goto delete_context; 242 | } 243 | 244 | free(set); 245 | 246 | return rss_context; 247 | 248 | delete_context: 249 | rss_context_delete(ifname, rss_context); 250 | 251 | free_set: 252 | free(set); 253 | 254 | return ret; 255 | } 256 | 257 | static int rss_equal(const char *ifname, int max_queue) 258 | { 259 | struct ethtool_rxfh_indir get = {}; 260 | struct ethtool_rxfh_indir *set; 261 | int queue = 0; 262 | int ret; 263 | 264 | get.cmd = ETHTOOL_GRXFHINDIR; 265 | if (ethtool(ifname, &get) < 0) 266 | 
return -1; 267 | 268 | set = malloc(sizeof(*set) + get.size * sizeof(__u32)); 269 | if (!set) 270 | return -1; 271 | 272 | for (__u32 i = 0; i < get.size; i++) { 273 | set->ring_index[i] = queue++; 274 | if (queue >= max_queue) 275 | queue = 0; 276 | } 277 | 278 | set->cmd = ETHTOOL_SRXFHINDIR; 279 | set->size = get.size; 280 | ret = ethtool(ifname, set); 281 | 282 | free(set); 283 | return ret; 284 | } 285 | 286 | static int rxq_num(int ifindex) 287 | { 288 | struct ethtool_channels_get_req *req; 289 | struct ethtool_channels_get_rsp *rsp; 290 | struct ynl_error yerr; 291 | struct ynl_sock *ys; 292 | int num = -1; 293 | 294 | ys = ynl_sock_create(&ynl_ethtool_family, &yerr); 295 | if (!ys) { 296 | warnx("Failed to setup YNL socket: %s", yerr.msg); 297 | return -1; 298 | } 299 | 300 | req = ethtool_channels_get_req_alloc(); 301 | ethtool_channels_get_req_set_header_dev_index(req, ifindex); 302 | rsp = ethtool_channels_get(ys, req); 303 | if (rsp) 304 | num = rsp->rx_count + rsp->combined_count; 305 | else 306 | warnx("ethtool_channels_get: %s", ys->err.msg); 307 | ethtool_channels_get_req_free(req); 308 | ethtool_channels_get_rsp_free(rsp); 309 | ynl_sock_destroy(ys); 310 | 311 | return num; 312 | } 313 | 314 | static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, 315 | struct netdev_queue_id *queues, 316 | unsigned int n_queue_index, struct ynl_sock *ys) 317 | { 318 | struct netdev_bind_rx_req *req; 319 | struct netdev_bind_rx_rsp *rsp; 320 | int ret = -1; 321 | 322 | req = netdev_bind_rx_req_alloc(); 323 | if (!req) 324 | return -1; 325 | 326 | netdev_bind_rx_req_set_ifindex(req, ifindex); 327 | netdev_bind_rx_req_set_fd(req, dmabuf_fd); 328 | __netdev_bind_rx_req_set_queues(req, queues, n_queue_index); 329 | 330 | rsp = netdev_bind_rx(ys, req); 331 | if (!rsp) { 332 | warnx("netdev_bind_rx: %s", ys->err.msg); 333 | goto out; 334 | } 335 | 336 | if (!rsp->_present.id) { 337 | warnx("id not present"); 338 | goto out; 339 | } 340 | 341 | ret = rsp->id; 342 | 343 | out: 344 | if (req) 345 | netdev_bind_rx_req_free(req); 346 | if (rsp) 347 | netdev_bind_rx_rsp_free(rsp); 348 | 349 | return ret; 350 | } 351 | 352 | static int bind_tx_queue(unsigned int ifindex, unsigned int dmabuf_fd, 353 | struct ynl_sock *ys) 354 | { 355 | struct netdev_bind_tx_req *req = NULL; 356 | struct netdev_bind_tx_rsp *rsp = NULL; 357 | int ret; 358 | 359 | req = netdev_bind_tx_req_alloc(); 360 | if (!req) { 361 | warnx("netdev_bind_tx_req_alloc() failed"); 362 | return -1; 363 | } 364 | netdev_bind_tx_req_set_ifindex(req, ifindex); 365 | netdev_bind_tx_req_set_fd(req, dmabuf_fd); 366 | 367 | rsp = netdev_bind_tx(ys, req); 368 | if (!rsp) { 369 | warnx("netdev_bind_tx"); 370 | ret = -1; 371 | goto free_req; 372 | } 373 | 374 | if (!rsp->_present.id) { 375 | warnx("id not present"); 376 | ret = -1; 377 | goto free_rsp; 378 | } 379 | 380 | ret = rsp->id; 381 | netdev_bind_tx_req_free(req); 382 | netdev_bind_tx_rsp_free(rsp); 383 | 384 | return ret; 385 | 386 | free_rsp: 387 | netdev_bind_tx_rsp_free(rsp); 388 | free_req: 389 | netdev_bind_tx_req_free(req); 390 | return ret; 391 | } 392 | 393 | #define UDMABUF_LIMIT_PATH "/sys/module/udmabuf/parameters/size_limit_mb" 394 | 395 | static int udmabuf_check_size(size_t size_mb) 396 | { 397 | size_t limit_mb = 0; 398 | int ret = 0; 399 | FILE *f; 400 | 401 | f = fopen(UDMABUF_LIMIT_PATH, "r"); 402 | if (f) { 403 | fscanf(f, "%lu", &limit_mb); 404 | if (size_mb > limit_mb) { 405 | warnx( 406 | "udmabuf size limit is too small (%lu > %lu), update %s", 407 | 
size_mb, limit_mb, UDMABUF_LIMIT_PATH); 408 | ret = -EINVAL; 409 | } 410 | fclose(f); 411 | } 412 | 413 | return ret; 414 | } 415 | 416 | static struct memory_buffer *udmabuf_alloc(size_t size) 417 | { 418 | struct udmabuf_create create; 419 | struct memory_buffer *mem; 420 | int ret; 421 | 422 | mem = calloc(1, sizeof(*mem)); 423 | if (!mem) 424 | return NULL; 425 | 426 | ret = udmabuf_check_size(size / 1024 / 1024); 427 | if (ret < 0) { 428 | warnx("Failed: udmabuf_check_size(), ret=%d", ret); 429 | goto free_mem; 430 | } 431 | 432 | mem->devfd = open("/dev/udmabuf", O_RDWR); 433 | if (mem->devfd < 0) { 434 | warn("Failed to open /dev/udmabuf"); 435 | goto free_mem; 436 | } 437 | 438 | mem->memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); 439 | if (mem->memfd < 0) { 440 | warn("memfd_create() failed"); 441 | goto close_devfd; 442 | } 443 | 444 | ret = fcntl(mem->memfd, F_ADD_SEALS, F_SEAL_SHRINK); 445 | if (ret < 0) { 446 | warn("fcntl() failed"); 447 | goto close_memfd; 448 | } 449 | 450 | ret = ftruncate(mem->memfd, size); 451 | if (ret < 0) { 452 | warn("ftruncate() failed"); 453 | goto close_memfd; 454 | } 455 | 456 | memset(&create, 0, sizeof(create)); 457 | 458 | create.memfd = mem->memfd; 459 | create.offset = 0; 460 | create.size = size; 461 | 462 | mem->fd = ioctl(mem->devfd, UDMABUF_CREATE, &create); 463 | if (mem->fd < 0) { 464 | warn("ioctl(mem->devfd) failed"); 465 | goto close_memfd; 466 | } 467 | 468 | mem->size = size; 469 | mem->provider = MEMORY_PROVIDER_HOST; 470 | mem->buf_mem = mmap(NULL, mem->size, PROT_READ | PROT_WRITE, 471 | MAP_SHARED, mem->fd, 0); 472 | 473 | if (mem->buf_mem == MAP_FAILED) { 474 | ret = -errno; 475 | goto close_dmabuf_fd; 476 | } 477 | 478 | return mem; 479 | 480 | close_dmabuf_fd: 481 | close(mem->fd); 482 | close_memfd: 483 | close(mem->memfd); 484 | close_devfd: 485 | close(mem->devfd); 486 | free_mem: 487 | free(mem); 488 | return NULL; 489 | } 490 | 491 | static void udmabuf_free(struct memory_buffer *mem) 492 | { 493 | if (mem->buf_mem) { 494 | close(mem->fd); 495 | close(mem->memfd); 496 | close(mem->devfd); 497 | munmap(mem->buf_mem, mem->size); 498 | } 499 | free(mem); 500 | } 501 | 502 | static void inet_to_inet6(struct sockaddr *addr, struct sockaddr_in6 *out) 503 | { 504 | out->sin6_addr.s6_addr32[3] = ((struct sockaddr_in *)addr)->sin_addr.s_addr; 505 | out->sin6_addr.s6_addr32[0] = 0; 506 | out->sin6_addr.s6_addr32[1] = 0; 507 | out->sin6_addr.s6_addr16[4] = 0; 508 | out->sin6_addr.s6_addr16[5] = 0xffff; 509 | out->sin6_family = AF_INET6; 510 | } 511 | 512 | static int find_iface(struct sockaddr_in6 *addr, char ifname[IFNAMSIZ]) 513 | { 514 | struct ifaddrs *ifaddr, *ifa; 515 | struct sockaddr_in6 tmp; 516 | 517 | if (getifaddrs(&ifaddr) < 0) 518 | return -errno; 519 | 520 | for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { 521 | if (!ifa->ifa_addr) 522 | continue; 523 | 524 | if (ifa->ifa_addr->sa_family == AF_INET) 525 | inet_to_inet6(ifa->ifa_addr, &tmp); 526 | else if (ifa->ifa_addr->sa_family == AF_INET6) 527 | memcpy(&tmp, ifa->ifa_addr, sizeof(tmp)); 528 | else 529 | continue; 530 | 531 | if (!memcmp(&tmp.sin6_addr, &addr->sin6_addr, 532 | sizeof(tmp.sin6_addr))) { 533 | strncpy(ifname, ifa->ifa_name, IFNAMSIZ - 1); 534 | freeifaddrs(ifaddr); 535 | return if_nametoindex(ifname); 536 | } 537 | } 538 | 539 | freeifaddrs(ifaddr); 540 | return -ENODEV; 541 | } 542 | 543 | void udmabuf_memcpy_to_device(struct memory_buffer *dst, size_t off, 544 | void *src, int n) 545 | { 546 | struct dma_buf_sync sync = {}; 547 | 
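/* CPU access to a dma-buf is bracketed with DMA_BUF_IOCTL_SYNC
 * begin/end calls so the write is coherent from the device's view. */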
548 | sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE;
549 | ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync);
550 | 
551 | memcpy(dst->buf_mem + off, src, n);
552 | 
553 | sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
554 | ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync);
555 | }
556 | 
557 | static struct memory_provider udmabuf_memory_provider = {
558 | .alloc = udmabuf_alloc,
559 | .free = udmabuf_free,
560 | .memcpy_to_device = udmabuf_memcpy_to_device,
561 | };
562 | 
563 | static struct memory_provider *rxmp;
564 | static struct memory_provider *txmp;
565 | 
566 | #ifdef USE_CUDA
567 | 
568 | /* Length of str: 'XXXX:XX:XX' */
569 | #define MAX_BUS_ID_LEN 11
570 | 
571 | static int cuda_find_device(__u16 domain, __u8 bus, __u8 device)
572 | {
573 | char bus_id[MAX_BUS_ID_LEN];
574 | int devnum;
575 | int ret;
576 | 
577 | ret = snprintf(bus_id, MAX_BUS_ID_LEN, "%hx:%hhx:%hhx", domain, bus, device);
578 | if (ret < 0)
579 | return -EINVAL;
580 | 
581 | ret = cudaDeviceGetByPCIBusId(&devnum, bus_id);
582 | if (ret != cudaSuccess) {
583 | warnx("No CUDA device found %s", bus_id);
584 | return -EINVAL;
585 | }
586 | 
587 | return devnum;
588 | }
589 | 
590 | static int cuda_dev_init(struct pci_dev *dev)
591 | {
592 | struct cudaDeviceProp deviceProp;
593 | CUdevice cuda_dev;
594 | int devnum;
595 | int ret;
596 | int ok;
597 | 
598 | ret = cuInit(0);
599 | if (ret != CUDA_SUCCESS)
600 | return -1;
601 | 
602 | /* If the user did not specify a device, select any device */
603 | if (dev->domain == DEVICE_DOMAIN_ANY && dev->bus == DEVICE_BUS_ANY && dev->device == DEVICE_DEVICE_ANY) {
604 | devnum = 0;
605 | } else {
606 | devnum = cuda_find_device(dev->domain, dev->bus, dev->device);
607 | if (devnum < 0)
608 | return -1;
609 | }
610 | 
611 | ret = cuDeviceGet(&cuda_dev, devnum);
612 | if (ret != CUDA_SUCCESS)
613 | return -1;
614 | 
615 | ok = 0;
616 | ret = cuDeviceGetAttribute(&ok, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED,
617 | cuda_dev);
618 | if (ret != CUDA_SUCCESS || !ok) {
619 | if (!ok)
620 | warnx("CUDA device does not support dmabuf");
621 | return -1;
622 | }
623 | 
624 | ret = cudaSetDevice(devnum);
625 | if (ret != cudaSuccess) {
626 | warnx("cudaSetDevice() failed with error %d", ret);
627 | return -1;
628 | }
629 | 
630 | if (verbose >= 4) {
631 | cudaGetDeviceProperties(&deviceProp, devnum);
632 | fprintf(stderr, "cuda: tid %d selecting device %d (%s)\n", getpid(), devnum, deviceProp.name);
633 | }
634 | return 0;
635 | }
636 | 
637 | static struct memory_buffer *cuda_alloc(size_t size)
638 | {
639 | struct memory_buffer *mem;
640 | size_t page_size;
641 | int ret;
642 | 
643 | page_size = sysconf(_SC_PAGESIZE);
644 | if (size % page_size) {
645 | warnx("cuda memory size not aligned, size 0x%lx", size);
646 | return NULL;
647 | }
648 | 
649 | mem = calloc(1, sizeof(*mem));
650 | if (!mem)
651 | return NULL;
652 | mem->size = size;
653 | mem->provider = MEMORY_PROVIDER_CUDA;
654 | 
655 | ret = cudaMalloc((void *)&mem->buf_mem, size);
656 | if (ret != cudaSuccess)
657 | goto free_mem;
658 | 
659 | ret = cuMemGetHandleForAddressRange((void *)&mem->fd,
660 | ((CUdeviceptr)mem->buf_mem), size,
661 | CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
662 | CUDA_FLAGS);
663 | if (ret != CUDA_SUCCESS)
664 | goto free_cuda;
665 | 
666 | return mem;
667 | 
668 | free_cuda:
669 | if (cudaFree(mem->buf_mem) != cudaSuccess)
670 | warnx("cudaFree() failed");
671 | free_mem:
672 | free(mem);
673 | 
674 | return NULL;
675 | }
676 | 
677 | static void cuda_free(struct memory_buffer *mem)
678 | {
679 | if (mem->fd)
680 | close(mem->fd);
681 | 
682 | if
(mem->buf_mem) 683 | cudaFree(mem->buf_mem); 684 | 685 | free(mem); 686 | } 687 | 688 | void cuda_memcpy_to_device(struct memory_buffer *dst, size_t off, 689 | void *src, int n) 690 | { 691 | int ret; 692 | 693 | ret = cudaMemcpy((void *)(dst->buf_mem + off), src, n, 694 | cudaMemcpyHostToDevice); 695 | if (ret != cudaSuccess) 696 | warnx("cudaMemcpy() failed"); 697 | } 698 | 699 | static struct memory_provider cuda_memory_provider = { 700 | .dev_init = cuda_dev_init, 701 | .alloc = cuda_alloc, 702 | .free = cuda_free, 703 | .memcpy_to_device = cuda_memcpy_to_device, 704 | }; 705 | #endif 706 | 707 | static struct memory_provider *get_memory_provider(enum memory_provider_type provider) 708 | { 709 | switch (provider) { 710 | case MEMORY_PROVIDER_HOST: 711 | return &udmabuf_memory_provider; 712 | #ifdef USE_CUDA 713 | case MEMORY_PROVIDER_CUDA: 714 | return &cuda_memory_provider; 715 | #endif 716 | default: 717 | warn("invalid provider: %d", provider); 718 | return NULL; 719 | } 720 | } 721 | 722 | int reserve_queues(int fd, int num_queues, char out_ifname[IFNAMSIZ], 723 | int *out_ifindex, int *out_queue_id, int *out_rss_context) 724 | { 725 | struct sockaddr_in6 addr; 726 | char ifname[IFNAMSIZ]; 727 | int max_kernel_queue; 728 | socklen_t optlen; 729 | int rss_context; 730 | int ifindex; 731 | int ret = 0; 732 | int rxqn; 733 | 734 | if (num_queues <= 0) { 735 | warnx("Invalid number of RX queues: %u", num_queues); 736 | return -1; 737 | } 738 | 739 | optlen = sizeof(addr); 740 | if (getsockname(fd, (struct sockaddr *)&addr, &optlen) < 0) { 741 | warn("Failed to query socket address"); 742 | return -1; 743 | } 744 | 745 | if (addr.sin6_family == AF_INET) 746 | inet_to_inet6((void *)&addr, &addr); 747 | 748 | ifindex = find_iface(&addr, ifname); 749 | if (ifindex < 0) { 750 | warnx("Failed to resolve ifindex: %s", strerror(-ifindex)); 751 | return -1; 752 | } 753 | 754 | rxqn = rxq_num(ifindex); 755 | if (rxqn < 2) { 756 | warnx("Invalid number of queues: %d", rxqn); 757 | return -1; 758 | } 759 | 760 | if (num_queues >= rxqn - 1) { 761 | warnx("Invalid number of RX queues (%u) requested (max: %u)", 762 | num_queues, rxqn - 1); 763 | return -1; 764 | } 765 | 766 | max_kernel_queue = rxqn - num_queues; 767 | 768 | reset_flow_steering(ifname); 769 | if (rss_equal(ifname, max_kernel_queue)) { 770 | warnx("Failed to setup RSS"); 771 | return -1; 772 | } 773 | 774 | rss_context = rss_context_equal(ifname, max_kernel_queue, 775 | num_queues, &addr); 776 | if (rss_context < 0) { 777 | warnx("Failed to setup RSS context"); 778 | ret = -1; 779 | goto undo_rss; 780 | } 781 | 782 | memcpy(out_ifname, ifname, IFNAMSIZ); 783 | *out_ifindex = ifindex; 784 | *out_queue_id = max_kernel_queue; 785 | *out_rss_context = rss_context; 786 | 787 | return ret; 788 | 789 | undo_rss: 790 | rss_equal(ifname, rxqn); 791 | 792 | return ret; 793 | } 794 | 795 | void unreserve_queues(char *ifname, int rss_context) 796 | { 797 | int ifindex; 798 | int rxqn; 799 | 800 | reset_flow_steering(ifname); 801 | rss_context_delete(ifname, rss_context); 802 | ifindex = if_nametoindex(ifname); 803 | if (ifindex > 0) { 804 | rxqn = rxq_num(ifindex); 805 | if (rxqn > 0) 806 | rss_equal(ifname, rxqn); 807 | } 808 | } 809 | 810 | /* Setup Devmem RX */ 811 | int devmem_setup(struct session_state_devmem *devmem, int fd, 812 | size_t dmabuf_rx_size_mb, int num_queues, 813 | enum memory_provider_type provider, 814 | struct pci_dev *dev) 815 | { 816 | struct netdev_queue_id *queues; 817 | struct ynl_error yerr; 818 | int max_kernel_queue; 
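/* devmem RX setup sequence: carve num_queues RX queues out of the default
 * RSS set (reserve_queues), back them with a dmabuf allocated from the
 * chosen provider, then bind the dmabuf to those queues over the netdev
 * netlink family (bind_rx_queue). */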
819 | int ifindex;
820 | int ret;
821 | 
822 | ret = reserve_queues(fd, num_queues, devmem->ifname, &ifindex,
823 | &max_kernel_queue, &devmem->rss_context);
824 | if (ret)
825 | return ret;
826 | 
827 | rxmp = get_memory_provider(provider);
828 | if (!rxmp) {
829 | ret = -1;
830 | goto undo_queues;
831 | }
832 | 
833 | devmem->ys = ynl_sock_create(&ynl_netdev_family, &yerr);
834 | if (!devmem->ys) {
835 | warnx("Failed to setup YNL socket: %s", yerr.msg);
836 | ret = -1;
837 | goto undo_queues;
838 | }
839 | if (rxmp->dev_init && rxmp->dev_init(dev) < 0) {
840 | ret = -1;
841 | goto sock_destroy;
842 | }
843 | 
844 | devmem->mem = rxmp->alloc(dmabuf_rx_size_mb * 1024 * 1024);
845 | if (!devmem->mem) {
846 | warnx("Failed to allocate memory");
847 | ret = -1;
848 | goto sock_destroy;
849 | }
850 | 
851 | queues = calloc(num_queues, sizeof(*queues));
852 | if (!queues) {
853 | warn("Failed to allocate memory for queues");
854 | ret = -1;
855 | goto free_memory;
856 | }
857 | 
858 | for (int i = 0; i < num_queues; i++) {
859 | queues[i]._present.type = 1;
860 | queues[i]._present.id = 1;
861 | queues[i].type = NETDEV_QUEUE_TYPE_RX;
862 | queues[i].id = max_kernel_queue + i;
863 | }
864 | 
865 | devmem->mem->dmabuf_id = bind_rx_queue(ifindex, devmem->mem->fd, queues,
866 | num_queues, devmem->ys);
867 | if (devmem->mem->dmabuf_id < 0) {
868 | warnx("Failed to bind RX queue");
869 | ret = -1;
870 | goto free_queues;
871 | }
872 | 
873 | return 0;
874 | 
875 | free_queues:
876 | free(queues);
877 | free_memory:
878 | rxmp->free(devmem->mem);
879 | sock_destroy:
880 | ynl_sock_destroy(devmem->ys);
881 | devmem->ys = NULL;
882 | undo_queues:
883 | unreserve_queues(devmem->ifname, devmem->rss_context);
884 | 
885 | return ret;
886 | }
887 | 
888 | int devmem_teardown(struct session_state_devmem *devmem)
889 | {
890 | unreserve_queues(devmem->ifname, devmem->rss_context);
891 | if (devmem->ys)
892 | ynl_sock_destroy(devmem->ys);
893 | if (rxmp)
894 | rxmp->free(devmem->mem);
895 | return 0;
896 | }
897 | 
898 | int devmem_release_tokens(int fd, struct connection_devmem *conn)
899 | {
900 | int ret;
901 | 
902 | if (!conn->rxtok_len)
903 | return 0;
904 | 
905 | ret = setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &conn->rxtok[0],
906 | sizeof(struct dmabuf_token) * conn->rxtok_len);
907 | 
908 | if (ret >= 0 && ret != conn->rxtok_len)
909 | warnx("requested to release %d tokens, got %d", conn->rxtok_len,
910 | ret);
911 | 
912 | conn->rxtok_len = 0;
913 | 
914 | return ret;
915 | }
916 | 
917 | static int devmem_validate_host(struct memory_buffer *mem, __u64 offset,
918 | __u32 pat_start, __u32 size)
919 | {
920 | struct dma_buf_sync sync = {};
921 | void *pat = NULL;
922 | int ret = 0;
923 | 
924 | sync.flags = DMA_BUF_SYNC_START;
925 | ioctl(mem->fd, DMA_BUF_IOCTL_SYNC, &sync);
926 | 
927 | pat = &patbuf[pat_start];
928 | ret = memcmp(pat, mem->buf_mem + offset, size);
929 | 
930 | sync.flags = DMA_BUF_SYNC_END;
931 | ioctl(mem->fd, DMA_BUF_IOCTL_SYNC, &sync);
932 | 
933 | if (ret) {
934 | warnx("Data corruption %d %d %d %d",
935 | *(char *)mem->buf_mem, *(char *)pat, size, pat_start);
936 | return -1;
937 | }
938 | 
939 | return 0;
940 | }
941 | 
942 | static int devmem_validate_cuda(unsigned char *rxbuf, struct memory_buffer *mem,
943 | __u64 offset, __u32 pat_start, __u32 size)
944 | {
945 | #ifdef USE_CUDA
946 | void *pat = NULL;
947 | int ret = 0;
948 | 
949 | ret = cudaMemcpy(rxbuf, (void *)(mem->buf_mem + offset), size,
950 | cudaMemcpyDeviceToHost);
951 | if (ret != cudaSuccess) {
952 | warnx("cudaMemcpyDeviceToHost failed rc=%d", ret);
953 | return -1;
954 | }
955 | 
956 | pat = &patbuf[pat_start];
957 | ret = memcmp(pat, rxbuf, size);
958 | if (ret) {
959 | warnx("Data corruption %d %d %d %d",
960 | *(char *)rxbuf, *(char *)pat, size, pat_start);
961 | return -1;
962 | }
963 | #endif
964 | 
965 | return 0;
966 | }
967 | 
968 | static int devmem_validate_recv(unsigned char *rxbuf, struct memory_buffer *mem,
969 | struct cmsghdr *cm, int rep, __u64 *tot_recv)
970 | {
971 | struct dmabuf_cmsg *dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
972 | size_t start = 0;
973 | int ret = 0;
974 | 
975 | start = *tot_recv % PATTERN_PERIOD;
976 | if (start + dmabuf_cmsg->frag_size > ARRAY_SIZE(patbuf)) {
977 | warnx("dmabuf fragment size too big rep=%d", rep);
978 | return -1;
979 | }
980 | 
981 | switch (mem->provider) {
982 | case MEMORY_PROVIDER_HOST:
983 | ret = devmem_validate_host(mem, dmabuf_cmsg->frag_offset, start,
984 | dmabuf_cmsg->frag_size);
985 | break;
986 | case MEMORY_PROVIDER_CUDA:
987 | ret = devmem_validate_cuda(rxbuf, mem, dmabuf_cmsg->frag_offset,
988 | start, dmabuf_cmsg->frag_size);
989 | break;
990 | }
991 | if (ret) {
992 | warnx("devmem recv validation failed rep=%d rc=%d", rep, ret);
993 | return -1;
994 | }
995 | 
996 | *tot_recv += dmabuf_cmsg->frag_size;
997 | return ret;
998 | }
999 | 
1000 | static int devmem_handle_token(int fd, struct connection_devmem *conn,
1001 | struct cmsghdr *cm)
1002 | {
1003 | struct dmabuf_cmsg *dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
1004 | struct dmabuf_token *token;
1005 | 
1006 | if (cm->cmsg_type == SO_DEVMEM_LINEAR) {
1007 | warnx("received linear chunk, flow steering error?");
1008 | return -EFAULT;
1009 | }
1010 | 
1011 | if (conn->rxtok_len == ARRAY_SIZE(conn->rxtok)) {
1012 | int ret;
1013 | 
1014 | ret = devmem_release_tokens(fd, conn);
1015 | if (ret < 0)
1016 | return ret;
1017 | }
1018 | 
1019 | token = &conn->rxtok[conn->rxtok_len++];
1020 | token->token_start = dmabuf_cmsg->frag_token;
1021 | token->token_count = 1;
1022 | 
1023 | return 0;
1024 | }
1025 | 
1026 | ssize_t devmem_recv(int fd, struct connection_devmem *conn,
1027 | unsigned char *rxbuf, size_t chunk,
1028 | struct memory_buffer *mem, int rep, __u64 tot_recv,
1029 | bool validate)
1030 | {
1031 | struct msghdr msg = {};
1032 | struct iovec iov = {
1033 | .iov_base = NULL,
1034 | .iov_len = chunk,
1035 | };
1036 | struct cmsghdr *cm;
1037 | int tokens = 0;
1038 | ssize_t n;
1039 | int ret;
1040 | 
1041 | msg.msg_iov = &iov;
1042 | msg.msg_iovlen = 1;
1043 | msg.msg_control = conn->ctrl_data;
1044 | msg.msg_controllen = sizeof(conn->ctrl_data);
1045 | n = recvmsg(fd, &msg, MSG_DONTWAIT | MSG_SOCK_DEVMEM);
1046 | if (n < 0)
1047 | return n;
1048 | 
1049 | for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
1050 | if (cm->cmsg_level != SOL_SOCKET ||
1051 | (cm->cmsg_type != SO_DEVMEM_DMABUF &&
1052 | cm->cmsg_type != SO_DEVMEM_LINEAR))
1053 | continue;
1054 | 
1055 | ret = devmem_handle_token(fd, conn, cm);
1056 | if (ret < 0)
1057 | return ret;
1058 | 
1059 | if (validate) {
1060 | ret = devmem_validate_recv(rxbuf, mem, cm, rep,
1061 | &tot_recv);
1062 | if (ret < 0)
1063 | return ret;
1064 | }
1065 | 
1066 | tokens++;
1067 | }
1068 | 
1069 | if (!tokens) {
1070 | warnx("devmem recvmsg returned no tokens");
1071 | errno = EFAULT;
1072 | return -1;
1073 | }
1074 | 
1075 | return n;
1076 | }
1077 | 
1078 | int devmem_sendmsg(int fd, int dmabuf_id, size_t off, size_t n)
1079 | {
1080 | char ctrl_data[CMSG_SPACE(sizeof(int))];
1081 | struct msghdr msg = { 0 };
1082 | struct
1078 | int devmem_sendmsg(int fd, int dmabuf_id, size_t off, size_t n) 1079 | { 1080 | char ctrl_data[CMSG_SPACE(sizeof(int))]; 1081 | struct msghdr msg = { 0 }; 1082 | struct cmsghdr *cmsg; 1083 | struct iovec iov; 1084 | 1085 | iov.iov_base = (void *)off; 1086 | iov.iov_len = n; 1087 | 1088 | msg.msg_iov = &iov; 1089 | msg.msg_iovlen = 1; 1090 | 1091 | msg.msg_control = ctrl_data; 1092 | msg.msg_controllen = sizeof(ctrl_data); 1093 | 1094 | cmsg = CMSG_FIRSTHDR(&msg); 1095 | cmsg->cmsg_level = SOL_SOCKET; 1096 | cmsg->cmsg_type = SCM_DEVMEM_DMABUF; 1097 | cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 1098 | *((int *)CMSG_DATA(cmsg)) = dmabuf_id; 1099 | 1100 | return sendmsg(fd, &msg, MSG_ZEROCOPY); 1101 | } 1102 | 1103 | int devmem_bind_socket(struct session_state_devmem *devmem, int fd) 1104 | { 1105 | char ifname[IFNAMSIZ] = {}; 1106 | int ifindex; 1107 | 1108 | ifindex = find_iface(&devmem->addr, ifname); 1109 | if (ifindex < 0) { 1110 | warnx("Failed to resolve ifindex: %s", strerror(-ifindex)); 1111 | return -1; 1112 | } 1113 | 1114 | if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, IFNAMSIZ)) { 1115 | warn("failed to bind device to socket"); 1116 | return -1; 1117 | } 1118 | 1119 | return 0; 1120 | } 1121 | 1122 | int devmem_setup_tx(struct session_state_devmem *devmem, enum memory_provider_type provider, 1123 | int dmabuf_tx_size_mb, struct pci_dev *dev, struct sockaddr_in6 *addr) 1124 | { 1125 | char ifname[IFNAMSIZ] = {}; 1126 | struct ynl_error yerr; 1127 | int ifindex; 1128 | int ret; 1129 | 1130 | devmem->tx_provider = provider; 1131 | devmem->dmabuf_tx_size_mb = dmabuf_tx_size_mb; 1132 | memcpy(&devmem->tx_dev, dev, sizeof(devmem->tx_dev)); 1133 | memcpy(&devmem->addr, addr, sizeof(devmem->addr)); 1134 | 1135 | txmp = get_memory_provider(devmem->tx_provider); 1136 | if (!txmp) 1137 | return -1; 1138 | 1139 | if (txmp->dev_init && txmp->dev_init(&devmem->tx_dev) < 0) 1140 | return -1; 1141 | 1142 | devmem->tx_mem = txmp->alloc(devmem->dmabuf_tx_size_mb * 1024 * 1024); 1143 | if (!devmem->tx_mem) { 1144 | warnx("Failed to allocate devmem tx buffer"); 1145 | return -1; 1146 | } 1147 | 1148 | txmp->memcpy_to_device(devmem->tx_mem, 0, patbuf, sizeof(patbuf)); 1149 | 1150 | ifindex = find_iface(&devmem->addr, ifname); 1151 | if (ifindex < 0) { 1152 | warnx("Failed to resolve ifindex: %s", strerror(-ifindex)); 1153 | return -1; 1154 | } 1155 | 1156 | devmem->ys = ynl_sock_create(&ynl_netdev_family, &yerr); 1157 | if (!devmem->ys) { 1158 | warnx("Failed to setup YNL socket: %s", yerr.msg); 1159 | return -1; 1160 | } 1161 | 1162 | devmem->tx_mem->dmabuf_id = bind_tx_queue(ifindex, devmem->tx_mem->fd, devmem->ys); 1163 | if (devmem->tx_mem->dmabuf_id < 0) { 1164 | warnx("Failed to bind TX queue dmabuf: %d", devmem->tx_mem->dmabuf_id); 1165 | ret = -1; 1166 | goto sock_destroy; 1167 | } 1168 | 1169 | 1170 | return 0; 1171 | 1172 | sock_destroy: 1173 | ynl_sock_destroy(devmem->ys); 1174 | devmem->ys = NULL; 1175 | return ret; 1176 | } 1177 | 1178 | void devmem_teardown_tx(struct session_state_devmem *devmem) 1179 | { 1180 | if (txmp && devmem->tx_mem) { 1181 | txmp->free(devmem->tx_mem); 1182 | devmem->tx_mem = NULL; 1183 | } 1184 | 1185 | if (devmem->ys) { 1186 | ynl_sock_destroy(devmem->ys); 1187 | devmem->ys = NULL; 1188 | } 1189 | } 1190 | -------------------------------------------------------------------------------- /client.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. 
and affiliates */ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "bipartite_match.h" 21 | #include "proto.h" 22 | #include "proto_dbg.h" 23 | 24 | int verbose = 3; 25 | 26 | static struct { 27 | bool msg_trunc; 28 | bool devmem_rx; 29 | enum memory_provider_type devmem_rx_memory; 30 | enum memory_provider_type devmem_tx_memory; 31 | struct pci_dev devmem_dst_dev; 32 | struct pci_dev devmem_src_dev; 33 | bool devmem_tx; 34 | bool msg_zerocopy; 35 | bool tls; 36 | bool tls_rx; 37 | bool tls_tx; 38 | bool tls_nopad; 39 | bool output_csv; 40 | bool output_hdr; 41 | bool xpin; 42 | unsigned int tls_ver; 43 | char *src; 44 | char *dst; 45 | char *src_svc; 46 | char *dst_svc; 47 | unsigned int time_stats; 48 | unsigned int req_size; 49 | unsigned int resp_size; 50 | unsigned int read_size; 51 | unsigned int write_size; 52 | unsigned int pin_off; 53 | unsigned int time; 54 | unsigned int cpu_min; 55 | unsigned int cpu_max; 56 | int cpu_src_wrk; 57 | int cpu_dst_wrk; 58 | unsigned int mss; 59 | unsigned int n_conns; 60 | unsigned int max_pace; 61 | char *tcp_cong_ctrl; 62 | unsigned int dmabuf_rx_size_mb; 63 | unsigned int dmabuf_tx_size_mb; 64 | unsigned int num_rx_queues; 65 | bool validate; 66 | bool iou_src; 67 | bool iou_dst; 68 | bool zerocopy_rx; 69 | unsigned int iou_rx_size_mb; 70 | } opt = { 71 | .tls_ver = TLS_1_3_VERSION, 72 | .src = "localhost", 73 | .dst = "localhost", 74 | .src_svc = "18323", 75 | .dst_svc = "18323", 76 | .req_size = ~0U, 77 | .read_size = KPM_DFL_OP_CHUNK, 78 | .write_size = KPM_DFL_OP_CHUNK, 79 | .time = 5, 80 | .cpu_min = 0, 81 | .cpu_max = 255, 82 | .cpu_src_wrk = -1, 83 | .cpu_dst_wrk = -1, 84 | .n_conns = 1, 85 | /* 128M is enough to drive one queue at 200G */ 86 | .dmabuf_rx_size_mb = 128, 87 | .dmabuf_tx_size_mb = 128, 88 | .num_rx_queues = 1, 89 | .devmem_rx_memory = MEMORY_PROVIDER_HOST, 90 | .devmem_dst_dev = { 91 | .domain = DEVICE_DOMAIN_ANY, 92 | .bus = DEVICE_BUS_ANY, 93 | .device = DEVICE_DEVICE_ANY 94 | }, 95 | .devmem_tx_memory = MEMORY_PROVIDER_HOST, 96 | .devmem_src_dev = { 97 | .domain = DEVICE_DOMAIN_ANY, 98 | .bus = DEVICE_BUS_ANY, 99 | .device = DEVICE_DEVICE_ANY 100 | }, 101 | .iou_src = false, 102 | .iou_dst = false, 103 | .zerocopy_rx = false, 104 | .iou_rx_size_mb = 64, 105 | }; 106 | 107 | #define dbg(fmt...) 
while (0) { warnx(fmt); } 108 | 109 | static void opt_show_uinthex(char buf[OPT_SHOW_LEN], const unsigned int *ui) 110 | { 111 | sprintf(buf, "0x%x", *ui); 112 | } 113 | 114 | static char *arg_bad(const char *fmt, const char *arg) 115 | { 116 | char *str; 117 | 118 | str = malloc(strlen(fmt) + strlen(arg)); 119 | if (!str) 120 | return strerror(errno); 121 | 122 | sprintf(str, fmt, arg); 123 | 124 | return str; 125 | } 126 | 127 | static char * 128 | opt_set_memory_provider(const char *arg, enum memory_provider_type *provider) 129 | { 130 | char *ret; 131 | 132 | if (!strcmp(arg, "cuda")) { 133 | #ifndef USE_CUDA 134 | return arg_bad("memory provider %s requires kperf compiled with CUDA", arg); 135 | #endif 136 | } 137 | 138 | ret = NULL; 139 | if (!strcmp(arg, "host")) { 140 | *provider = MEMORY_PROVIDER_HOST; 141 | } else if (!strcmp(arg, "cuda")) { 142 | *provider = MEMORY_PROVIDER_CUDA; 143 | } else { 144 | ret = arg_bad("'%s' is not a valid memory provider", arg); 145 | } 146 | 147 | return ret; 148 | 149 | } 150 | 151 | static char * 152 | opt_set_dev(const char *arg, struct pci_dev *dev) 153 | { 154 | if (!strcmp(arg, "any")) { 155 | dev->domain = DEVICE_DOMAIN_ANY; 156 | dev->bus = DEVICE_BUS_ANY; 157 | dev->device = DEVICE_DEVICE_ANY; 158 | return NULL; 159 | } 160 | 161 | if (sscanf(arg, "%hx:%hhx:%hhx", &dev->domain, &dev->bus, &dev->device) == 3) 162 | return NULL; 163 | 164 | return arg_bad("'%s' invalid PCI ID format. Expected format: domain:bus:device", arg); 165 | } 166 | 167 | static void 168 | opt_show_memory_provider(char buf[OPT_SHOW_LEN], const enum memory_provider_type *p) 169 | { 170 | switch (*p) { 171 | case MEMORY_PROVIDER_HOST: 172 | strncpy(buf, "host", OPT_SHOW_LEN); 173 | break; 174 | case MEMORY_PROVIDER_CUDA: 175 | strncpy(buf, "cuda", OPT_SHOW_LEN); 176 | break; 177 | default: 178 | /* inval */ 179 | strncpy(buf, "invalid", OPT_SHOW_LEN); 180 | break; 181 | } 182 | } 183 | 184 | static void 185 | opt_show_dev(char buf[OPT_SHOW_LEN], const struct pci_dev *dev) 186 | { 187 | if (dev->domain == DEVICE_DOMAIN_ANY && 188 | dev->bus == DEVICE_BUS_ANY && 189 | dev->device == DEVICE_DEVICE_ANY) 190 | strncpy(buf, "any", OPT_SHOW_LEN); 191 | else 192 | snprintf(buf, OPT_SHOW_LEN, "%hx:%hhx:%hhx", 193 | dev->domain, dev->bus, dev->device); 194 | } 195 | 196 | static const struct opt_table opts[] = { 197 | OPT_WITH_ARG("--src ", opt_set_charp, opt_show_charp, 198 | &opt.src, "Source server address"), 199 | OPT_WITH_ARG("--dst ", opt_set_charp, opt_show_charp, 200 | &opt.dst, "Destination server address"), 201 | OPT_WITH_ARG("--src-svc ", opt_set_charp, opt_show_charp, 202 | &opt.src_svc, "Source server port"), 203 | OPT_WITH_ARG("--dst-svc ", opt_set_charp, opt_show_charp, 204 | &opt.dst_svc, "Destination server port"), 205 | OPT_WITH_ARG("--req-size|-s ", opt_set_uintval, opt_show_uintval, 206 | &opt.req_size, "Request size"), 207 | OPT_WITH_ARG("--resp-size ", opt_set_uintval, opt_show_uintval, 208 | &opt.resp_size, "Response size"), 209 | OPT_WITH_ARG("--read-size ", opt_set_uintval, opt_show_uintval, 210 | &opt.read_size, "Buffer size for read/recv syscall"), 211 | OPT_WITH_ARG("--write-size ", opt_set_uintval, opt_show_uintval, 212 | &opt.write_size, "Buffer size for write/send syscall"), 213 | OPT_WITH_ARG("--pin-off ", opt_set_uintval, opt_show_uintval, 214 | &opt.pin_off, "CPU pin offset"), 215 | OPT_WITH_ARG("--cpu-min ", opt_set_uintval, opt_show_uintval, 216 | &opt.cpu_min, "min CPU number for connection"), 217 | OPT_WITH_ARG("--cpu-max ", 
opt_set_uintval, opt_show_uintval, 218 | &opt.cpu_max, "max CPU number for connection"), 219 | OPT_WITH_ARG("--cpu-src-wrk ", opt_set_intval, opt_show_intval, 220 | &opt.cpu_src_wrk, "CPU to pin source workers to"), 221 | OPT_WITH_ARG("--cpu-dst-wrk ", opt_set_intval, opt_show_intval, 222 | &opt.cpu_dst_wrk, "CPU to pin destination workers to"), 223 | OPT_WITHOUT_ARG("--cross-pin", opt_set_bool, &opt.xpin, "Cross-pin"), 224 | OPT_WITH_ARG("--time|-t ", opt_set_uintval, opt_show_uintval, 225 | &opt.time, "Test length"), 226 | OPT_WITH_ARG("--time-stats|-T ", opt_set_uintval, opt_show_uintval, 227 | &opt.time_stats, 228 | "Time stats - (0) none, (1) hist, (2) hist+pstats"), 229 | OPT_WITH_ARG("--mss|-M ", opt_set_uintval, opt_show_uintval, 230 | &opt.mss, "MSS for TCP"), 231 | OPT_WITH_ARG("--max-pace ", opt_set_uintval, opt_show_uintval, 232 | &opt.max_pace, "Max sending/pacing rate"), 233 | OPT_WITHOUT_ARG("--tls", opt_set_bool, &opt.tls, 234 | "Enable TLS in both directions"), 235 | OPT_WITH_ARG("--tls-ver ", opt_set_uintval, opt_show_uinthex, 236 | &opt.tls_ver, "Version of TLS as per kernel defines"), 237 | OPT_WITHOUT_ARG("--tls-rx", opt_set_bool, &opt.tls_rx, 238 | "Enable TLS for Rx"), 239 | OPT_WITHOUT_ARG("--tls-tx", opt_set_bool, &opt.tls_tx, 240 | "Enable TLS for Tx"), 241 | OPT_WITHOUT_ARG("--tls-nopad", opt_set_bool, &opt.tls_nopad, 242 | "Enable TLS no padding optimization for Rx"), 243 | OPT_WITH_ARG("--num-connections|-n ", 244 | opt_set_uintval, opt_show_uintval, 245 | &opt.n_conns, "Number of connections"), 246 | OPT_WITH_ARG("--tcp-cc ", opt_set_charp, opt_show_charp, 247 | &opt.tcp_cong_ctrl, "Set TCP congestion control"), 248 | OPT_WITHOUT_ARG("--out-csv", opt_set_bool, &opt.output_csv, 249 | "Print output in terse CSV format"), 250 | OPT_WITHOUT_ARG("--out-hdr", opt_set_bool, &opt.output_hdr, 251 | "Include column name header in the CSV output"), 252 | OPT_WITHOUT_ARG("--verbose|-v", opt_inc_intval, &verbose, 253 | "Verbose mode (can be specified more than once)"), 254 | OPT_WITHOUT_ARG("--quiet|-q", opt_dec_intval, &verbose, 255 | "Quiet mode (can be specified more than once)"), 256 | OPT_WITHOUT_ARG("--usage|--help|-h", opt_usage_and_exit, 257 | "kperf client", "Show this help message"), 258 | OPT_WITHOUT_ARG("--msg-trunc", opt_set_bool, &opt.msg_trunc, "Use MSG_TRUNC on receive"), 259 | OPT_WITHOUT_ARG("--msg-zerocopy", opt_set_bool, &opt.msg_zerocopy, "Use MSG_ZEROCOPY on transmit"), 260 | OPT_EARLY_WITHOUT_ARG("--devmem-rx", opt_set_bool, &opt.devmem_rx, "Use TCP Devmem on receive"), 261 | OPT_WITH_ARG("--devmem-rx-memory {cuda,host}", opt_set_memory_provider, 262 | opt_show_memory_provider, &opt.devmem_rx_memory, 263 | "Select the memory provider for TCP Devmem RX"), 264 | OPT_WITH_ARG("--dmabuf-rx-size-mb ", opt_set_uintval, opt_show_uintval, 265 | &opt.dmabuf_rx_size_mb, "Size of RX dmabuf for TCP Devmem mode"), 266 | OPT_WITH_ARG("--dmabuf-tx-size-mb ", opt_set_uintval, opt_show_uintval, 267 | &opt.dmabuf_tx_size_mb, "Size of TX dmabuf for TCP Devmem mode"), 268 | OPT_WITHOUT_ARG("--devmem-tx", opt_set_bool, &opt.devmem_tx, "Use TCP Devmem on transmit"), 269 | OPT_WITH_ARG("--devmem-tx-memory {cuda,host}", opt_set_memory_provider, 270 | opt_show_memory_provider, &opt.devmem_tx_memory, 271 | "Select the memory provider for TCP Devmem TX"), 272 | OPT_WITH_ARG("--num-rx-queues ", opt_set_uintval, opt_show_uintval, 273 | &opt.num_rx_queues, "Number of RX queues for TCP Devmem mode"), 274 | OPT_WITH_ARG("--validate ", opt_set_bool_arg, NULL, &opt.validate, 275 | 
"Validate payload. Default is no when using --devmem-rx; otherwise, default is yes"), 276 | OPT_WITH_ARG("--devmem-dst-dev ", opt_set_dev, opt_show_dev, 277 | &opt.devmem_dst_dev, "Select the destination device for the TCP Devmem memory provider"), 278 | OPT_WITH_ARG("--devmem-src-dev ", opt_set_dev, opt_show_dev, 279 | &opt.devmem_src_dev, "Select the source device for the TCP Devmem memory provider"), 280 | OPT_WITHOUT_ARG("--iou-src", opt_set_bool, &opt.iou_src, 281 | "Use io_uring on source server"), 282 | OPT_WITHOUT_ARG("--iou-dst", opt_set_bool, &opt.iou_dst, 283 | "Use io_uring on destination server"), 284 | OPT_EARLY_WITHOUT_ARG("--zerocopy-rx", opt_set_bool, &opt.zerocopy_rx, 285 | "Use zero copy on receive"), 286 | OPT_WITH_ARG("--iou-rx-size-mb ", opt_set_uintval, opt_show_uintval, 287 | &opt.iou_rx_size_mb, "Size of RX memory reserved by io_uring"), 288 | OPT_ENDTABLE 289 | }; 290 | 291 | static struct kpm_connect_reply * 292 | spawn_conn(int src, int dst, struct sockaddr_in6 *addr, socklen_t len) 293 | { 294 | struct kpm_connect_reply **replies; 295 | struct kpm_connect_reply *conns; 296 | struct kpm_connect_reply *id; 297 | struct bim_state *bim; 298 | struct bim_edge m; 299 | unsigned int i; 300 | int *seq; 301 | 302 | if (!opt.n_conns) 303 | return NULL; 304 | conns = calloc(opt.n_conns, sizeof(*conns)); 305 | if (!conns) 306 | return NULL; 307 | replies = calloc(opt.n_conns, sizeof(*replies)); 308 | if (!replies) 309 | goto err_free_conns; 310 | seq = calloc(opt.n_conns, sizeof(int)); 311 | if (!seq) 312 | goto err_free_replies; 313 | bim = bim_init(); 314 | if (!bim) 315 | goto err_free_seq; 316 | 317 | again: 318 | for (i = 0; i < opt.n_conns; i++) { 319 | seq[i] = kpm_send_connect(src, addr, len, opt.mss); 320 | if (seq[i] < 0) 321 | err(7, "Failed to connect"); 322 | } 323 | for (i = 0; i < opt.n_conns; i++) { 324 | id = kpm_receive(src); 325 | if (!id) 326 | errx(7, "No connection ID"); 327 | 328 | if (!kpm_good_reply(id, KPM_MSG_TYPE_CONNECT, seq[i])) 329 | errx(7, "Invalid connection ID %d %d", 330 | id->hdr.type, id->hdr.len); 331 | 332 | replies[i] = id; 333 | } 334 | 335 | for (i = 0; i < opt.n_conns; i++) { 336 | bool good, bim_unique; 337 | 338 | id = replies[i]; 339 | 340 | good = clamp(id->local.cpu, opt.cpu_min, opt.cpu_max) == 341 | id->local.cpu && 342 | clamp(id->remote.cpu, opt.cpu_min, opt.cpu_max) == 343 | id->remote.cpu; 344 | bim_unique = good && 345 | bim_add_edge(bim, id->local.cpu, id->remote.cpu, id); 346 | 347 | kpm_dbg("Connection established %d:cpu %d | %d:cpu %d - %s", 348 | id->local.id, id->local.cpu, 349 | id->remote.id, id->remote.cpu, 350 | good && bim_unique ? "good" : 351 | (good ? 
"duplicate" : "out of range")); 352 | 353 | if (!bim_unique) { 354 | bool fail = kpm_req_disconnect(src, id->local.id) < 0 || 355 | kpm_req_disconnect(dst, id->remote.id) < 0; 356 | free(id); 357 | if (fail) { 358 | warnx("Disconnect failed"); 359 | i = opt.n_conns - i - 1; 360 | goto err_drain; 361 | } 362 | } 363 | } 364 | 365 | if (bim_match_size(bim) < opt.n_conns) 366 | goto again; 367 | 368 | i = 0; 369 | bim_for_each_edge(bim, &m) { 370 | id = m.cookie; 371 | 372 | if (m.is_match && i < opt.n_conns) { 373 | kpm_info("Connected %d:cpu %d | %d:cpu %d", 374 | id->local.id, id->local.cpu, 375 | id->remote.id, id->remote.cpu); 376 | memcpy(&conns[i], id, sizeof(*id)); 377 | i++; 378 | } else { 379 | kpm_req_disconnect(src, id->local.id); 380 | kpm_req_disconnect(dst, id->remote.id); 381 | } 382 | free(id); 383 | } 384 | 385 | for (i = 0; i < opt.n_conns; i++) { 386 | if (opt.max_pace) { 387 | if (kpm_req_pacing(src, conns[i].local.id, opt.max_pace) || 388 | kpm_req_pacing(dst, conns[i].remote.id, opt.max_pace)) 389 | err(8, "Failed to set pacing rate"); 390 | } 391 | 392 | if (opt.tcp_cong_ctrl) { 393 | if (kpm_req_tcp_cc(src, conns[i].local.id, opt.tcp_cong_ctrl) || 394 | kpm_req_tcp_cc(dst, conns[i].remote.id, opt.tcp_cong_ctrl)) 395 | err(8, "Failed to set TCP cong control"); 396 | } 397 | } 398 | 399 | free(seq); 400 | free(replies); 401 | return conns; 402 | 403 | err_drain: 404 | bim_for_each_edge(bim, &m) { 405 | id = m.cookie; 406 | kpm_req_disconnect(src, id->local.id); 407 | kpm_req_disconnect(dst, id->remote.id); 408 | free(id); 409 | } 410 | bim_destroy(bim); 411 | err_free_seq: 412 | free(seq); 413 | err_free_replies: 414 | free(replies); 415 | err_free_conns: 416 | free(conns); 417 | return NULL; 418 | } 419 | 420 | static int spawn_worker(int fd, int cpu, __u32 *wid) 421 | { 422 | struct __kpm_generic_u32 *id; 423 | struct kpm_empty *ack; 424 | int seq; 425 | 426 | seq = kpm_send_empty(fd, KPM_MSG_TYPE_SPAWN_WORKER); 427 | if (seq < 0) { 428 | warn("Failed to spawn"); 429 | return 1; 430 | } 431 | 432 | id = kpm_receive(fd); 433 | if (!id) { 434 | warnx("No ack for spawn"); 435 | return 1; 436 | } 437 | 438 | if (!kpm_good_reply(id, KPM_MSG_TYPE_SPAWN_WORKER, seq)) { 439 | warnx("Invalid spawn ack %d %d", id->hdr.type, id->hdr.len); 440 | free(id); 441 | return 1; 442 | } 443 | 444 | *wid = id->val; 445 | free(id); 446 | 447 | seq = kpm_send_pin_worker(fd, *wid, cpu); 448 | if (seq < 0) { 449 | warn("Failed to pin"); 450 | return 1; 451 | } 452 | 453 | ack = kpm_receive(fd); 454 | if (!ack) { 455 | warnx("No ack for pin"); 456 | return 1; 457 | } 458 | 459 | if (!kpm_good_reply(ack, KPM_MSG_TYPE_PIN_WORKER, seq)) { 460 | warnx("Invalid ack for pin %d %d", ack->hdr.type, ack->hdr.len); 461 | free(ack); 462 | return 1; 463 | } 464 | free(ack); 465 | 466 | return 0; 467 | } 468 | 469 | static void 470 | show_cpu_stat(const char *pfx, struct kpm_test_results *result, unsigned int id) 471 | { 472 | struct kpm_cpu_load *cpu = &result->cpu_load[id]; 473 | 474 | if (cpu->id != id) { 475 | warnx("Sparse CPU IDs %d != %d!", cpu->id, id); 476 | return; 477 | } 478 | 479 | warnx(" %sCPU%3d: usr:%5.2f%% sys:%5.2f%% idle:%5.2f%% iow:%5.2f%% irq:%5.2f%% sirq:%5.2f%%", 480 | pfx, id, cpu->user / 100.0, cpu->system / 100.0, 481 | cpu->idle / 100.0, cpu->iowait / 100.0, cpu->irq / 100.0, 482 | cpu->sirq / 100.0); 483 | } 484 | 485 | static void 486 | dump_result(struct kpm_test_results *result, const char *dir, 487 | struct kpm_connect_reply *conns, bool local) 488 | { 489 | unsigned int 
end = 0, i, r; 490 | int start = -1; 491 | 492 | warnx("== %s", dir); 493 | for (r = 0; r < opt.n_conns; r++) 494 | warnx(" Tx%7.3lf Gbps (%llu bytes in %u usec)", 495 | (double)result->res[r].tx_bytes * 8 / 496 | result->time_usec / 497 | 1000, 498 | result->res[r].tx_bytes, 499 | result->time_usec); 500 | for (r = 0; r < opt.n_conns; r++) 501 | warnx(" Rx%7.3lf Gbps (%llu bytes in %u usec)", 502 | (double)result->res[r].rx_bytes * 8 / 503 | result->time_usec / 504 | 1000, 505 | result->res[r].rx_bytes, 506 | result->time_usec); 507 | warnx(" TCP retrans reord rtt rttvar d_ce snd_wnd cwnd"); 508 | for (r = 0; r < opt.n_conns; r++) 509 | warnx(" %7u %5u %3u %6u %4u %7u %4u", 510 | result->res[r].retrans, result->res[r].reord_seen, 511 | result->res[r].rtt, 512 | result->res[r].rttvar, result->res[r].delivered_ce, 513 | result->res[r].snd_wnd, result->res[r].snd_cwnd); 514 | 515 | for (r = 0; r < opt.n_conns; r++) { 516 | int flow_cpu; 517 | 518 | flow_cpu = local ? conns[r].local.cpu : conns[r].remote.cpu; 519 | show_cpu_stat(opt.pin_off ? "net " : "", result, flow_cpu); 520 | if (opt.pin_off) 521 | show_cpu_stat("app ", result, flow_cpu + opt.pin_off); 522 | } 523 | 524 | /* The rest is RR-only */ 525 | if (opt.req_size == ~0U) 526 | return; 527 | 528 | for (r = 0; r < opt.n_conns; r++) 529 | warnx("%.1lf RPS", 530 | (double)result->res[r].reqs / 531 | result->time_usec * 1000000); 532 | 533 | if (opt.time_stats < 1) 534 | return; 535 | 536 | for (r = 0; r < opt.n_conns; r++) { 537 | for (i = 0; i < ARRAY_SIZE(result->res[r].lat_hist); i++) { 538 | if (!result->res[r].lat_hist[i]) 539 | continue; 540 | if (start < 0) 541 | start = i; 542 | end = i + 1; 543 | } 544 | for (i = start; i < end; i++) { 545 | unsigned int val; 546 | const char *unit; 547 | 548 | if (i < 3) { 549 | val = 128 << i; 550 | unit = "ns"; 551 | } else if (i < 13) { 552 | val = (1ULL << (i + 7)) / 1000; 553 | unit = "us"; 554 | } else { 555 | val = (1ULL << (i + 7)) / (1000 * 1000); 556 | unit = "ms"; 557 | } 558 | warnx(" [%3d%s] %d", 559 | val, unit, result->res[r].lat_hist[i]); 560 | } 561 | } 562 | 563 | if (opt.time_stats < 2) 564 | return; 565 | 566 | for (r = 0; r < opt.n_conns; r++) 567 | warnx("p25:%uus p50:%uus p90:%uus p99:%uus p999:%uus p9999:%uus", 568 | result->res[r].p25 * 128 / 1000, 569 | result->res[r].p50 * 128 / 1000, 570 | result->res[r].p90 * 128 / 1000, 571 | result->res[r].p99 * 128 / 1000, 572 | result->res[r].p999 * 128 / 1000, 573 | result->res[r].p9999 * 128 / 1000); 574 | } 575 | 576 | static void 577 | dump_result_machine(struct kpm_test_results *result, const char *dir, 578 | struct kpm_connect_reply *conns, bool local) 579 | { 580 | struct kpm_test_result res = {}; 581 | struct kpm_cpu_load *cpu; 582 | unsigned int r; 583 | int flow_cpu; 584 | __u64 bytes; 585 | int i; 586 | 587 | for (r = 0; r < opt.n_conns; r++) { 588 | #define S(f) res.f += result->res[r].f; 589 | S(rx_bytes); 590 | S(tx_bytes); 591 | S(reqs); 592 | S(retrans); 593 | S(reord_seen); 594 | S(rtt); 595 | S(rttvar); 596 | S(delivered_ce); 597 | S(snd_wnd); 598 | S(snd_cwnd); 599 | 600 | if (opt.time_stats < 2) 601 | continue; 602 | S(p25); 603 | S(p50); 604 | S(p90); 605 | S(p99); 606 | S(p999); 607 | S(p9999); 608 | #undef S 609 | } 610 | res.rtt /= opt.n_conns; 611 | res.rttvar /= opt.n_conns; 612 | res.snd_wnd /= opt.n_conns; 613 | res.snd_cwnd /= opt.n_conns; 614 | res.p25 /= opt.n_conns; 615 | res.p50 /= opt.n_conns; 616 | res.p90 /= opt.n_conns; 617 | res.p99 /= opt.n_conns; 618 | res.p999 /= opt.n_conns; 619 | 
res.p9999 /= opt.n_conns; 620 | r = 0; 621 | 622 | /* Headers once on the first line */ 623 | if (local && opt.output_hdr) { 624 | for (i = 0; i < 2; i++) { 625 | printf("tcp,,,,,,,"); 626 | if (opt.time_stats >= 2) 627 | printf("latency,(us),,,,,"); 628 | if (opt.n_conns < 2) { 629 | printf("net,,,,"); 630 | if (opt.pin_off) 631 | printf("app,,,,"); 632 | } 633 | printf("data%c", i ? '\n' : ','); 634 | } 635 | for (i = 0; i < 2; i++) { 636 | printf("retrans,reord,ce,rtt,rttvar,swnd,cwnd,"); 637 | if (opt.time_stats >= 2) 638 | printf("p25,p50,p90,p99,p999,p9999,"); 639 | if (opt.n_conns < 2) { 640 | printf("usr,sys,idle,sirq,"); 641 | if (opt.pin_off) 642 | printf("usr,sys,idle,sirq,"); 643 | } 644 | printf(i ? "rx\n" : "tx,"); 645 | } 646 | } 647 | 648 | printf("%u,%u,%u,%u,%u,%u,%u,", 649 | res.retrans, res.reord_seen, res.delivered_ce, 650 | res.rtt, res.rttvar, res.snd_wnd, res.snd_cwnd); 651 | 652 | if (opt.time_stats >= 2) 653 | printf("%u,%u,%u,%u,%u,%u,", 654 | res.p25 * 128 / 1000, res.p50 * 128 / 1000, 655 | res.p90 * 128 / 1000, res.p99 * 128 / 1000, 656 | res.p999 * 128 / 1000, res.p9999 * 128 / 1000); 657 | 658 | /* Dunno how to report CPU use, yet */ 659 | if (opt.n_conns < 2) { 660 | flow_cpu = local ? conns[r].local.cpu : conns[r].remote.cpu; 661 | cpu = &result->cpu_load[flow_cpu]; 662 | printf("%.4f,%.4f,%.4f,%.4f,", 663 | cpu->user / 10000.0, cpu->system / 10000.0, 664 | cpu->idle / 10000.0, cpu->sirq / 10000.0); 665 | 666 | if (opt.pin_off) { 667 | cpu = &result->cpu_load[flow_cpu + opt.pin_off]; 668 | printf("%.4f,%.4f,%.4f,%.4f,", 669 | cpu->user / 10000.0, cpu->system / 10000.0, 670 | cpu->idle / 10000.0, cpu->sirq / 10000.0); 671 | } 672 | } 673 | 674 | bytes = local ? res.tx_bytes : res.rx_bytes; 675 | printf("%.3lf", (double)bytes * 8 / result->time_usec / 1000); 676 | printf(local ? 
"," : "\n"); 677 | } 678 | 679 | /* copied from devmem.c */ 680 | static void inet_to_inet6(struct sockaddr *addr, struct sockaddr_in6 *out) 681 | { 682 | out->sin6_addr.s6_addr32[3] = ((struct sockaddr_in6 *)addr)->sin6_addr.s6_addr32[0]; 683 | out->sin6_addr.s6_addr32[0] = 0; 684 | out->sin6_addr.s6_addr32[1] = 0; 685 | out->sin6_addr.s6_addr16[4] = 0; 686 | out->sin6_addr.s6_addr16[5] = 0xffff; 687 | out->sin6_family = AF_INET6; 688 | } 689 | 690 | int inet_sockaddr(const char *str, struct sockaddr_in6 *out) 691 | { 692 | struct sockaddr_in *sa4; 693 | struct sockaddr_in6 tmp; 694 | 695 | out->sin6_family = AF_INET6; 696 | if (inet_pton(AF_INET6, str, &(out->sin6_addr)) == 1) { 697 | out->sin6_family = AF_INET6; 698 | return 0; 699 | } 700 | 701 | sa4 = (struct sockaddr_in *)&tmp; 702 | if (inet_pton(AF_INET, str, &(sa4->sin_addr)) == 1) { 703 | sa4->sin_family = AF_INET; 704 | inet_to_inet6((void *)sa4, out); 705 | return 0; 706 | } 707 | 708 | return -1; 709 | } 710 | 711 | int main(int argc, char *argv[]) 712 | { 713 | enum kpm_rx_mode rx_mode = KPM_RX_MODE_SOCKET; 714 | enum kpm_tx_mode tx_mode = KPM_TX_MODE_SOCKET; 715 | unsigned int src_ncpus, dst_ncpus; 716 | struct __kpm_generic_u32 *ack_id; 717 | __u32 *src_wrk_cpu, *dst_wrk_cpu; 718 | struct kpm_connect_reply *conns; 719 | struct kpm_test_results *result; 720 | __u32 *src_wrk_id, *dst_wrk_id; 721 | struct sockaddr_in6 conn_addr; 722 | __u32 src_tst_id, dst_tst_id; 723 | struct sockaddr_in6 src_addr; 724 | struct addrinfo *addr; 725 | struct kpm_test *test; 726 | unsigned int i; 727 | socklen_t len; 728 | int src, dst; 729 | size_t sz; 730 | int seq; 731 | 732 | opt_register_table(opts, NULL); 733 | 734 | /* Use early parse to set default for --validate based on --devmem-rx */ 735 | if (!opt_early_parse(argc, argv, opt_log_stderr)) 736 | exit(1); 737 | opt.validate = !opt.devmem_rx; 738 | 739 | if (!opt_parse(&argc, argv, opt_log_stderr)) 740 | exit(1); 741 | 742 | err_set_progname(argv[0]); 743 | 744 | if (opt.read_size > KPM_MAX_OP_CHUNK || 745 | opt.write_size > KPM_MAX_OP_CHUNK) 746 | errx(1, "Max read/write size is %d", KPM_MAX_OP_CHUNK); 747 | if (opt.tcp_cong_ctrl && 748 | strnlen(opt.tcp_cong_ctrl, KPM_CC_NAME_LEN) == KPM_CC_NAME_LEN) 749 | errx(1, "TCP CC name is too long"); 750 | if (opt.xpin) { 751 | if (opt.cpu_src_wrk != -1 || opt.cpu_dst_wrk != -1) 752 | errx(1, "Cross-pin can't use explicit pin"); 753 | if (opt.pin_off) 754 | errx(1, "Cross-pin can't use pin off"); 755 | if (opt.n_conns != 2) 756 | errx(1, "Cross-pin only works with 2 connections"); 757 | } 758 | 759 | if (inet_sockaddr(opt.src, &src_addr) < 0) 760 | errx(1, "failed to get sockaddr from %s\n", opt.src); 761 | 762 | /* io_uring doesn't support devmem yet */ 763 | if (opt.devmem_rx && opt.iou_dst) 764 | errx(1, "io_uring does not support --devmem-rx yet"); 765 | if (opt.devmem_tx && opt.iou_src) 766 | errx(1, "io_uring does not support --devmem-tx yet"); 767 | 768 | if (opt.msg_trunc && opt.validate) 769 | errx(1, "--msg-trunc and --validate yes are mutually exclusive"); 770 | 771 | if (opt.msg_trunc && (opt.devmem_rx || opt.zerocopy_rx)) 772 | errx(1, "--msg-trunc and (--devmem-rx or --zerocopy-rx) are mutually exclusive"); 773 | 774 | if (opt.msg_trunc) 775 | rx_mode = KPM_RX_MODE_SOCKET_TRUNC; 776 | else if (opt.zerocopy_rx) 777 | rx_mode = KPM_RX_MODE_SOCKET_ZEROCOPY; 778 | else if (opt.devmem_rx) 779 | rx_mode = KPM_RX_MODE_DEVMEM; 780 | 781 | if (opt.msg_zerocopy && opt.devmem_tx) 782 | errx(1, "--msg-zerocopy and --devmem-tx are mutually 
exclusive"); 783 | 784 | if (opt.msg_zerocopy) 785 | tx_mode = KPM_TX_MODE_SOCKET_ZEROCOPY; 786 | else if (opt.devmem_tx) 787 | tx_mode = KPM_TX_MODE_DEVMEM; 788 | 789 | addr = net_client_lookup(opt.src, opt.src_svc, AF_UNSPEC, SOCK_STREAM); 790 | if (!addr) 791 | errx(1, "Failed to look up service to connect to"); 792 | 793 | /* Src */ 794 | src = net_connect(addr); 795 | freeaddrinfo(addr); 796 | if (src < 1) 797 | err(1, "Failed to connect"); 798 | 799 | addr = net_client_lookup(opt.dst, opt.dst_svc, AF_UNSPEC, SOCK_STREAM); 800 | if (!addr) 801 | errx(1, "Failed to look up service to connect to"); 802 | 803 | if (kpm_xchg_hello(src, &src_ncpus)) 804 | errx(2, "Bad hello"); 805 | 806 | /* Dst */ 807 | dst = net_connect(addr); 808 | freeaddrinfo(addr); 809 | if (dst < 1) 810 | err(1, "Failed to connect"); 811 | 812 | if (kpm_xchg_hello(dst, &dst_ncpus)) 813 | errx(2, "Bad hello"); 814 | 815 | src_wrk_id = calloc(opt.n_conns, sizeof(*src_wrk_id)); 816 | dst_wrk_id = calloc(opt.n_conns, sizeof(*dst_wrk_id)); 817 | src_wrk_cpu = calloc(opt.n_conns, sizeof(*src_wrk_cpu)); 818 | dst_wrk_cpu = calloc(opt.n_conns, sizeof(*dst_wrk_cpu)); 819 | 820 | /* Main */ 821 | len = sizeof(conn_addr); 822 | if (kpm_req_tcp_sock(dst, &conn_addr, &len) < 0) { 823 | warnx("Failed create TCP acceptor"); 824 | goto out; 825 | } 826 | 827 | struct kpm_mode dst_mode = { 828 | .rx_mode = rx_mode, 829 | .tx_mode = tx_mode, 830 | .rx_provider = opt.devmem_rx_memory, 831 | .tx_provider = opt.devmem_tx_memory, 832 | .dev = opt.devmem_dst_dev, 833 | .dmabuf_rx_size_mb = opt.dmabuf_rx_size_mb, 834 | .dmabuf_tx_size_mb = opt.dmabuf_tx_size_mb, 835 | .num_rx_queues = opt.num_rx_queues, 836 | .validate = opt.validate, 837 | .iou = opt.iou_dst, 838 | .iou_rx_size_mb = opt.iou_rx_size_mb, 839 | }; 840 | if (kpm_req_mode(dst, &dst_mode) < 0) { 841 | warnx("Failed setup destination mode"); 842 | goto out; 843 | } 844 | 845 | struct kpm_mode src_mode = { 846 | .rx_mode = rx_mode, 847 | .tx_mode = tx_mode, 848 | .rx_provider = opt.devmem_rx_memory, 849 | .tx_provider = opt.devmem_tx_memory, 850 | .dev = opt.devmem_src_dev, 851 | .dmabuf_rx_size_mb = opt.dmabuf_rx_size_mb, 852 | .dmabuf_tx_size_mb = opt.dmabuf_tx_size_mb, 853 | .num_rx_queues = opt.num_rx_queues, 854 | .addr = src_addr, 855 | .validate = opt.validate, 856 | .iou = opt.iou_src, 857 | .iou_rx_size_mb = opt.iou_rx_size_mb, 858 | }; 859 | if (kpm_req_mode(src, &src_mode) < 0) { 860 | warnx("Failed setup source mode"); 861 | goto out; 862 | } 863 | 864 | conns = spawn_conn(src, dst, &conn_addr, len); 865 | if (!conns) 866 | goto out; 867 | 868 | if (opt.tls || opt.tls_rx || opt.tls_tx) { 869 | struct tls12_crypto_info_aes_gcm_128 aes128 = {}; 870 | unsigned int rx, src_mask, dst_mask; 871 | 872 | aes128.info.version = opt.tls_ver; 873 | aes128.info.cipher_type = TLS_CIPHER_AES_GCM_128; 874 | 875 | rx = KPM_TLS_RX; 876 | if (opt.tls_nopad) 877 | rx |= KPM_TLS_NOPAD; 878 | if (opt.tls) { 879 | src_mask = dst_mask = KPM_TLS_TX | rx; 880 | } else if (opt.tls_rx) { 881 | src_mask = rx; 882 | dst_mask = KPM_TLS_TX; 883 | } else { 884 | src_mask = KPM_TLS_TX; 885 | dst_mask = rx; 886 | } 887 | 888 | for (i = 0; i < opt.n_conns; i++) { 889 | if (kpm_req_tls(src, conns[i].local.id, 890 | KPM_TLS_ULP | src_mask, 891 | &aes128, sizeof(aes128)) || 892 | kpm_req_tls(dst, conns[i].remote.id, 893 | KPM_TLS_ULP | dst_mask, 894 | &aes128, sizeof(aes128))) { 895 | warnx("TLS setup failed"); 896 | goto out_id; 897 | } 898 | } 899 | } 900 | 901 | for (i = 0; i < opt.n_conns; i++) { 902 | 
904 | if (opt.xpin) 905 | src_wrk_cpu[i] = conns[!i].local.cpu; 906 | else if (opt.cpu_src_wrk != -1) 907 | src_wrk_cpu[i] = opt.cpu_src_wrk; 908 | else 909 | src_wrk_cpu[i] = id->local.cpu + opt.pin_off; 910 | 911 | if (opt.xpin) 912 | dst_wrk_cpu[i] = conns[!i].remote.cpu; 913 | else if (opt.cpu_dst_wrk != -1) 914 | dst_wrk_cpu[i] = opt.cpu_dst_wrk; 915 | else 916 | dst_wrk_cpu[i] = id->remote.cpu + opt.pin_off; 917 | 918 | if (spawn_worker(src, src_wrk_cpu[i], &src_wrk_id[i]) || 919 | spawn_worker(dst, dst_wrk_cpu[i], &dst_wrk_id[i])) 920 | goto out_id; 921 | } 922 | 923 | sz = sizeof(*test) + opt.n_conns * sizeof(test->specs[0]); 924 | test = malloc(sz); 925 | memset(test, 0, sz); 926 | 927 | test->n_conns = opt.n_conns; 928 | test->time_sec = opt.time; 929 | for (i = 0; i < opt.n_conns; i++) { 930 | test->specs[i].connection_id = conns[i].remote.id; 931 | test->specs[i].worker_id = dst_wrk_id[i]; 932 | test->specs[i].read_size = opt.read_size; 933 | test->specs[i].write_size = opt.write_size; 934 | if (opt.req_size == ~0U) { 935 | test->specs[i].type = KPM_TEST_TYPE_STREAM; 936 | } else { 937 | test->specs[i].type = KPM_TEST_TYPE_RR; 938 | test->specs[i].arg.rr.req_size = opt.req_size; 939 | test->specs[i].arg.rr.resp_size = opt.resp_size ?: opt.req_size; 940 | test->specs[i].arg.rr.timings = opt.time_stats; 941 | } 942 | } 943 | 944 | seq = kpm_send(dst, &test->hdr, sz, KPM_MSG_TYPE_TEST); 945 | 946 | ack_id = kpm_receive(dst); 947 | if (!kpm_good_reply(ack_id, KPM_MSG_TYPE_TEST, seq)) { 948 | warnx("Invalid ack for test %d %d", 949 | ack_id->hdr.type, ack_id->hdr.len); 950 | goto out_id; 951 | } 952 | dst_tst_id = ack_id->val; 953 | dbg("Test id dst %d", dst_tst_id); 954 | free(ack_id); 955 | 956 | test->active = 1; 957 | for (i = 0; i < opt.n_conns; i++) { 958 | test->specs[i].connection_id = conns[i].local.id; 959 | test->specs[i].worker_id = src_wrk_id[i]; 960 | } 961 | 962 | seq = kpm_send(src, &test->hdr, sz, KPM_MSG_TYPE_TEST); 963 | free(test); 964 | 965 | ack_id = kpm_receive(src); 966 | if (!kpm_good_reply(ack_id, KPM_MSG_TYPE_TEST, seq)) { 967 | warnx("Invalid ack for test %d %d", 968 | ack_id->hdr.type, ack_id->hdr.len); 969 | goto out_id; 970 | } 971 | src_tst_id = ack_id->val; 972 | dbg("Test id src %d", src_tst_id); 973 | free(ack_id); 974 | 975 | /* Source worker is done */ 976 | result = kpm_receive(src); 977 | if (!result) { 978 | warnx("No result"); 979 | goto out_id; 980 | } 981 | sz = sizeof(*result) + opt.n_conns * sizeof(result->res[0]); 982 | if (result->hdr.type != KPM_MSG_TYPE_TEST_RESULT || 983 | result->hdr.len < sz) 984 | warnx("Invalid result %d %d", 985 | result->hdr.type, result->hdr.len); 986 | else if (opt.output_csv) 987 | dump_result_machine(result, "Source", conns, true); 988 | else 989 | dump_result(result, "Source", conns, true); 990 | free(result); 991 | 992 | /* Stop the test on both ends */ 993 | if (kpm_req_end_test(src, src_tst_id) || 994 | kpm_req_end_test(dst, dst_tst_id)) 995 | warnx("Failed to stop test"); 996 | 997 | /* Destination worker is done */ 998 | result = kpm_receive(dst); 999 | if (!result) { 1000 | warnx("No result"); 1001 | goto out_id; 1002 | } 1003 | if (result->hdr.type != KPM_MSG_TYPE_TEST_RESULT || 1004 | result->hdr.len < sizeof(*result) + sizeof(result->res[0])) 1005 | warnx("Invalid result %d %d", 1006 | result->hdr.type, result->hdr.len); 1007 | else if (opt.output_csv) 1008 | dump_result_machine(result, "Target", conns, false); 1009 | else 1010 | dump_result(result, 
"Target", conns, false); 1011 | free(result); 1012 | 1013 | out_id: 1014 | free(conns); 1015 | out: 1016 | close(src); 1017 | close(dst); 1018 | 1019 | free(src_wrk_id); 1020 | free(dst_wrk_id); 1021 | free(src_wrk_cpu); 1022 | free(dst_wrk_cpu); 1023 | 1024 | return 0; 1025 | } 1026 | --------------------------------------------------------------------------------