├── .gitignore
├── tcp.h
├── epoll.h
├── .gitmodules
├── iou.h
├── CONTRIBUTING.md
├── devmem.h
├── LICENSE
├── proto_dbg.h
├── cpu_stat.h
├── test
│   └── ksft.py
├── Makefile
├── bipartite_match.h
├── worker.h
├── server.h
├── tcp.c
├── README.rst
├── CODE_OF_CONDUCT.md
├── cpu_stat.c
├── server.c
├── proto.h
├── bipartite_match.c
├── proto.c
├── worker.c
├── epoll.c
├── iou.c
├── server_session.c
├── devmem.c
└── client.c

/.gitignore:
--------------------------------------------------------------------------------
1 | *.d
2 | *.o
3 | bipartite_match
4 | client
5 | cpu_stat
6 | server
7 |
--------------------------------------------------------------------------------
/tcp.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | /**
5 |  * DOC: Random collection of TCP helpers.
6 |  */
7 |
8 | struct tcp_info;
9 |
10 | void print_tcp_info(struct tcp_info *ti);
11 |
--------------------------------------------------------------------------------
/epoll.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef EPOLL_H
5 | #define EPOLL_H 1
6 |
7 | #include "worker.h"
8 |
9 | void worker_epoll_init(struct worker_state *state);
10 |
11 | #endif /* EPOLL_H */
12 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "ccan"]
2 | 	path = ccan
3 | 	url = https://github.com/rustyrussell/ccan.git
4 | [submodule "ynl-c"]
5 | 	path = ynl-c
6 | 	url = https://github.com/linux-netdev/ynl-c.git
7 | [submodule "liburing"]
8 | 	path = liburing
9 | 	url = https://github.com/axboe/liburing.git
10 |
--------------------------------------------------------------------------------
/iou.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef IOU_H
5 | #define IOU_H 1
6 |
7 | #include "worker.h"
8 |
9 | void worker_iou_init(struct worker_state *state);
10 |
11 | int iou_zerocopy_rx_setup(struct session_state_iou *iou, int fd,
12 | 			  int num_queues);
13 | int iou_zerocopy_rx_teardown(struct session_state_iou *iou);
14 |
15 | #endif /* IOU_H */
16 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to kperf
2 |
3 | We actively welcome your pull requests.
4 |
5 | 1. Fork the repo and create your branch from `main`.
6 | 2. If you've added code that should be tested, add tests.
7 | 3. If you've changed APIs, update the documentation.
8 | 4. Make sure your code lints.
9 | 5. If you haven't already, complete the Contributor License Agreement ("CLA").
10 |
11 | ## Contributor License Agreement ("CLA")
12 | In order to accept your pull request, we need you to submit a CLA. You only need
13 | to do this once to work on any of Meta's open source projects.
14 |
15 | Complete your CLA here: <https://code.facebook.com/cla>
16 |
17 | ## Issues
18 | We use GitHub issues to track public bugs. Please ensure your description is
19 | clear and includes sufficient instructions to reproduce the issue.
20 |
21 | ## Coding Style
22 | `kperf` uses the Linux kernel's coding style.
23 |
24 | ## License
25 | By contributing to `kperf`, you agree that your contributions will be licensed
26 | under the LICENSE file in the root directory of this source tree.
27 |
--------------------------------------------------------------------------------
/devmem.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef DEVMEM_H
5 | #define DEVMEM_H 1
6 |
7 | #include <net/if.h>
8 |
9 | int reserve_queues(int fd, int num_queues, char out_ifname[IFNAMSIZ],
10 | 		   int *out_ifindex, int *out_queue_id, int *out_rss_context);
11 | void unreserve_queues(char *ifname, int rss_context);
12 |
13 | int devmem_setup(struct session_state_devmem *devmem, int fd,
14 | 		 size_t dmabuf_size, int num_queues,
15 | 		 enum memory_provider_type provider, struct pci_dev *dev);
16 | int devmem_teardown(struct session_state_devmem *devmem);
17 | void devmem_teardown_tx(struct session_state_devmem *devmem);
18 | int devmem_release_tokens(int fd, struct connection_devmem *conn);
19 | ssize_t devmem_recv(int fd, struct connection_devmem *conn,
20 | 		    unsigned char *rxbuf, size_t chunk, struct memory_buffer *mem,
21 | 		    int rep, __u64 tot_recv, bool validate);
22 | int devmem_sendmsg(int fd, int dmabuf_id, size_t off, size_t n);
23 | void devmem_teardown_conn(struct connection_devmem *devmem);
24 | int devmem_prepare_connect(int fd, struct sockaddr_in6 *src, struct session_state_devmem *devmem);
25 | int devmem_setup_tx(struct session_state_devmem *devmem, enum memory_provider_type provider,
26 | 		    int dmabuf_tx_size_mb, struct pci_dev *dev, struct sockaddr_in6 *addr);
27 | int devmem_bind_socket(struct session_state_devmem *devmem, int fd);
28 |
29 | #endif /* DEVMEM_H */
30 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Redistribution and use in source and binary forms, with or without
2 | modification, are permitted provided that the following conditions
3 | are met:
4 | 1. Redistributions of source code must retain the above copyright
5 |    notice, this list of conditions and the following disclaimer.
6 | 2. Redistributions in binary form must reproduce the above copyright
7 |    notice, this list of conditions and the following disclaimer in the
8 |    documentation and/or other materials provided with the distribution.
9 | 3. Neither the name of the University nor the names of its contributors
10 |    may be used to endorse or promote products derived from this software
11 |    without specific prior written permission.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 | ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 | SUCH DAMAGE.
24 |
--------------------------------------------------------------------------------
/proto_dbg.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef PROTO_DBG_H
5 | #define PROTO_DBG_H 1
6 |
7 | #include <stdarg.h>
8 | #include <stdio.h>
9 | #include <string.h>
10 | #include "proto.h"
11 |
12 | extern int verbose;
13 |
14 | #define __kpm_cmd_dbg(pfx, msg, hdr)					\
15 | ({									\
16 | 	struct kpm_header *_hdr = (hdr);				\
17 | 									\
18 | 	if (verbose >= 4)						\
19 | 		fprintf(stderr, "D%s %s%s%s T%d (seq:%d, len:%d)\n",	\
20 | 			pfx, __FILE__,					\
21 | 			strlen(msg) ? " " : "", msg,			\
22 | 			_hdr->type, _hdr->id, _hdr->len);		\
23 | })
24 |
25 | #define __kpm_cmd_dbg_start(name, hdr) __kpm_cmd_dbg("|>", name, hdr)
26 | #define __kpm_cmd_dbg_end(name, hdr) __kpm_cmd_dbg("|<", name, hdr)
27 |
28 | #define kpm_cmd_dbg_start(hdr) __kpm_cmd_dbg_start("", hdr)
29 | #define kpm_cmd_dbg_end(hdr) __kpm_cmd_dbg_end("", hdr)
30 |
31 | static inline void ____kpm_trace(int level, const char *fn, const char *pfx,
32 | 				 const char *fmt, ...)
33 | {
34 | 	const char *letters = "!EWIDT ";
35 | 	va_list ap;
36 |
37 | 	if (verbose < level)
38 | 		return;
39 | 	if (level > 6)
40 | 		level = 6;
41 |
42 | 	fprintf(stderr, "%c%s %s: ", letters[level], pfx, fn);
43 | 	va_start(ap, fmt);
44 | 	vfprintf(stderr, fmt, ap);
45 | 	va_end(ap);
46 | 	fprintf(stderr, "\n");
47 | }
48 |
49 | #define __kpm_info(pfx, msg...) ____kpm_trace(3, __FILE__, pfx, msg)
50 | #define kpm_info(msg...) __kpm_info(" ", msg)
51 |
52 | #define __kpm_dbg(pfx, msg...) ____kpm_trace(4, __FILE__, pfx, msg)
53 | #define kpm_dbg(msg...) __kpm_dbg(" ", msg)
54 |
55 | #define __kpm_trace(pfx, msg...) ____kpm_trace(5, __FILE__, pfx, msg)
56 | #define kpm_trace(msg...) __kpm_trace(" ", msg)
57 |
58 | #endif
59 |
--------------------------------------------------------------------------------
/cpu_stat.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Jakub Kicinski */
3 | /* Copyright Meta Platforms, Inc. and affiliates */
4 |
5 | /**
6 |  * DOC: CPU utilization stats
7 |  *
8 |  * Linux CPU use stats read / parsed from procfs.
9 |  *
10 |  * Linux counts CPU use starting at boot, in jiffies, so we provide helpers
11 |  * to measure CPU use over a period of time and convert it to a percentage.
12 |  *
13 |  * All functions take ncpus as returned by get_nprocs_conf(); pass 0 if you
14 |  * don't have the get_nprocs_conf() value cached.
15 |  *
16 |  * If a function returns a pointer to an array, that array is allocated
17 |  * on the heap and has to be explicitly freed. Arrays are sized to ncpus
18 |  * (or get_nprocs_conf()).
19 |  *
20 |  * Example:
21 |  *	struct cpu_stat *s1, *s2, *diffpct;
22 |  *
23 |  *	s1 = cpu_stat_snapshot(0);
24 |  *	sleep(1);
25 |  *	s2 = cpu_stat_snapshot(0);
26 |  *
27 |  *	// Calculate CPU use between when s1 and s2 were taken.
28 |  *	cpu_stat_sub(s2, s1, 0);
29 |  *	diffpct = cpu_stat_to_pct00(s2, 0);
30 |  *
31 |  *	// Print percentage of time spent in user context (for CPU i).
32 |  *	printf("usr:%2llu.%02llu\n",
33 |  *	       diffpct[i].user / 100, diffpct[i].user % 100);
34 |  */
35 | struct cpu_stat {
36 | 	unsigned int cpu_id;	/* CPU id, not a stat */
37 | 	unsigned long long int user;	/* sum of user and nice */
38 | 	unsigned long long int system;
39 | 	unsigned long long int idle;
40 | 	unsigned long long int iowait;
41 | 	unsigned long long int irq;
42 | 	unsigned long long int sirq;
43 | };
44 |
45 | struct cpu_stat *cpu_stat_snapshot(int ncpus);
46 | /* convert stats to fractional format, fields multiplied by 10,000 */
47 | struct cpu_stat *cpu_stat_to_pct00(struct cpu_stat *src, int ncpus);
48 | void cpu_stat_sub(struct cpu_stat *dst, struct cpu_stat *op, int ncpus);
49 |
--------------------------------------------------------------------------------
/test/ksft.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # SPDX-License-Identifier: GPL-2.0
3 | # Copyright Meta Platforms, Inc. and affiliates
4 |
5 | """
6 | Script to run kperf.
7 | This needs to be copied into kernel selftest directory to run.
8 | It depends on kernel networking selftest infra and libraries.
9 | """
10 |
11 | import time
12 | import psutil
13 | from lib.py import ksft_run, ksft_exit
14 | from lib.py import NetDrvEpEnv
15 | from lib.py import bkg, cmd
16 |
17 |
18 | def kperf(cfg):
19 |     """ Run a bunch of kperf configs. Checking is expected to be manual. """
20 |     kpdr = "/home/kicinski/devel/kperf/"
21 |     s1 = bkg(kpdr + "server --no-daemon")
22 |     s2 = bkg(kpdr + "server --no-daemon --pid-file /tmp/kperf-remote.pid",
23 |              host=cfg.remote)
24 |
25 |     time.sleep(0.3)
26 |
27 |     fd_cnt = psutil.Process(s1.proc.pid).num_fds()
28 |     print("Server fd count at the start:", fd_cnt)
29 |
30 |     print(">>> Base run")
31 |     run = cmd(kpdr + f"client --src {cfg.addr} --dst {cfg.remote_addr} -t 10",
32 |               fail=False)
33 |     if run.stderr:
34 |         print("STDERR:", run.stderr)
35 |     print(run.stdout)
36 |
37 |     print(">>> pin-off 1")
38 |     run = cmd(kpdr + f"client --cpu-max 2 --src {cfg.addr} --dst {cfg.remote_addr} --pin-off 1 -t 10",
39 |               fail=False)
40 |     if run.stderr:
41 |         print("STDERR:", run.stderr)
42 |     print(run.stdout)
43 |
44 |     end_fd_cnt = psutil.Process(s1.proc.pid).num_fds()
45 |     print("Server fd count at the end:", end_fd_cnt)
46 |     if end_fd_cnt != fd_cnt:
47 |         print(f"ERROR!!! (was {fd_cnt} at init)")
48 |         print(cmd("lsof -p " + str(s1.proc.pid)).stdout)
49 |
50 |     s1.process(terminate=True, fail=False)
51 |     s2.process(terminate=True, fail=False)
52 |
53 |     print(s1.stderr, s1.stdout)
54 |     print(s2.stderr, s2.stdout)
55 |
56 |
57 |
58 | def main() -> None:
59 |     """ Ksft boiler plate main """
60 |
61 |     with NetDrvEpEnv(__file__) as cfg:
62 |         ksft_run([kperf],
63 |                  args=(cfg, ))
64 |     ksft_exit()
65 |
66 |
67 | if __name__ == "__main__":
68 |     main()
69 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: BSD-3-Clause
2 | # Copyright Meta Platforms, Inc.
and affiliates
3 |
4 | CCAN_PATH := ./ccan
5 | YNL_PATH := ./ynl-c
6 | LIBURING_PATH := ./liburing
7 |
8 | CC=gcc
9 | CFLAGS=-std=gnu99 -I$(CCAN_PATH) -O2 -W -Wall -Wextra -Wno-unused-parameter -Wshadow -DDEBUG -g
10 | CFLAGS += -I$(YNL_PATH)/include/
11 | CFLAGS += -I$(LIBURING_PATH)/src/include/
12 |
13 | ifeq ("$(DEBUG)","1")
14 | CFLAGS += -g -fsanitize=address -fsanitize=leak -static-libasan
15 | endif
16 |
17 | LIBS=-lm -L$(CCAN_PATH) -pthread -lccan
18 | LIBS += -L$(YNL_PATH) -lynl
19 | LIBS += $(LIBURING_PATH)/src/liburing.a
20 |
21 | ifdef USE_CUDA
22 | CFLAGS += -I/usr/local/cuda/include/ -DUSE_CUDA
23 | endif
24 |
25 | include $(wildcard *.d)
26 |
27 | all: server client units
28 | units: bipartite_match cpu_stat
29 |
30 | ifdef USE_CUDA
31 | server: LIBS += -lcuda -lcudart -L/usr/local/cuda/lib64
32 | endif
33 |
34 | server: $(CCAN_PATH)/libccan.a $(YNL_PATH)/libynl.a $(LIBURING_PATH)/src/liburing.a server.o server_session.o proto.o epoll.o iou.o worker.o devmem.o cpu_stat.o tcp.o
35 | 	$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
36 |
37 | client: $(CCAN_PATH)/libccan.a client.o proto.o bipartite_match.o
38 | 	$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
39 |
40 | $(CCAN_PATH)/libccan.a:
41 | 	make -C $(CCAN_PATH)/
42 | 	ar rcs $(CCAN_PATH)/libccan.a $(CCAN_PATH)/ccan/*/*.o
43 |
44 | $(YNL_PATH)/libynl.a:
45 | 	make -C $(YNL_PATH)
46 |
47 | $(LIBURING_PATH)/src/liburing.a:
48 | 	@cd $(LIBURING_PATH) && ./configure --cc=$(CC)
49 | 	make -C $(LIBURING_PATH)
50 |
51 | clean:
52 | 	rm -rf *.o *.d *~ bipartite_match cpu_stat
53 |
54 | distclean:
55 | 	rm -rf *.o *.d *~ bipartite_match cpu_stat server client $(CCAN_PATH)/libccan.a
56 | 	make clean -C $(LIBURING_PATH)
57 |
58 | bipartite_match: $(CCAN_PATH)/libccan.a
59 | 	$(CC) $(CFLAGS) -DKPERF_UNITS bipartite_match.c -o bipartite_match $(CCAN_PATH)/libccan.a
60 |
61 | cpu_stat: $(CCAN_PATH)/libccan.a
62 | 	$(CC) $(CFLAGS) -DKPERF_UNITS cpu_stat.c -o cpu_stat $(CCAN_PATH)/libccan.a
63 |
64 | %.o: %.c
65 | 	$(COMPILE.c) -MMD -o $@ $<
66 |
67 | .PHONY: all clean units ccan distclean
68 | .DEFAULT_GOAL=all
69 |
--------------------------------------------------------------------------------
/bipartite_match.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Jakub Kicinski */
3 | /* Copyright Meta Platforms, Inc. and affiliates */
4 |
5 | #ifndef BIPARTITE_MATCH
6 | #define BIPARTITE_MATCH
7 |
8 | #include <stdbool.h>
9 |
10 | struct bim_state;
11 |
12 | /**
13 |  * DOC: Bipartite Match
14 |  *
15 |  * Find a matching in a bipartite graph.
16 |  *
17 |  * Number of nodes does not need to be known upfront. Duplicate edges
18 |  * are ignored. Designed for incremental growth of the graph; use
19 |  * bim_match_size() to check the number of pairings with the current edge set.
20 |  *
21 |  * Example:
22 |  *	struct bim_state *bim;
23 |  *	struct bim_edge m;
24 |  *
25 |  *	bim = bim_init();
26 |  *	while ...
27 |  *		// Add edge to the graph
28 |  *		bim_add_edge(bim, left_id, right_id, priv);
29 |  *
30 |  *	// Dump matches
31 |  *	bim_for_each_match(bim, &m)
32 |  *		printf("Match %d - %d, %p\n", m.left_id, m.right_id, m.cookie);
33 |  *	bim_destroy(bim);
34 |  */
35 |
36 | /* Graph init / destroy */
37 | struct bim_state *bim_init(void);
38 | void bim_destroy(struct bim_state *bim);
39 |
40 | /* Optional, size the state to avoid reallocation, pass 0s to compact */
41 | void bim_resize(struct bim_state *bim,
42 | 		unsigned int max_left, unsigned int max_right);
43 |
44 | /* Populating edges */
45 | bool bim_add_edge(struct bim_state *bim,
46 | 		  unsigned int left_id, unsigned int right_id, void *cookie);
47 | unsigned int bim_match_size(struct bim_state *bim);
48 |
49 | /* Walk pairings and edges */
50 | struct bim_edge {
51 | 	unsigned int left_id;
52 | 	unsigned int right_id;
53 | 	void *cookie;
54 | 	bool is_match;
55 | 	/* Walker's state, don't overwrite */
56 | 	unsigned long long _walker;
57 | };
58 |
59 | void bim_walk_init(struct bim_edge *edge);
60 | bool bim_edge_walk_next(struct bim_state *bim, struct bim_edge *edge);
61 | bool bim_match_walk_next(struct bim_state *bim, struct bim_edge *match);
62 |
63 | #define bim_for_each_match(bim, match) \
64 | 	for (bim_walk_init(match); bim_match_walk_next(bim, match); )
65 |
66 | #define bim_for_each_edge(bim, match) \
67 | 	for (bim_walk_init(match); bim_edge_walk_next(bim, match); )
68 |
69 | #endif
70 |
--------------------------------------------------------------------------------
/worker.h:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef WORKER_H
5 | #define WORKER_H 1
6 |
7 | #include <linux/tcp.h>
8 |
9 | #include <ccan/list/list.h>
10 | #include <ccan/time/time.h>
11 |
12 | #include "proto.h"
13 | #include "server.h"
14 |
15 | /* Main worker state AKA self */
16 | struct worker_state {
17 | 	int main_sock;
18 | 	struct worker_opts opts;
19 | 	int epollfd;
20 | 	unsigned int id;
21 | 	int quit;
22 | 	int ended;
23 | 	struct kpm_test *test;
24 | 	struct cpu_stat *cpu_start;
25 | 	struct timemono test_start;
26 | 	struct timemono prev_loop;
27 | 	unsigned int test_len_msec;
28 | 	struct list_head connections;
29 | 	const struct io_ops *ops;
30 | 	void *io_state;
31 | };
32 |
33 | struct worker_connection {
34 | 	unsigned int id;
35 | 	int fd;
36 | 	unsigned int read_size;
37 | 	unsigned int write_size;
38 | 	__u64 to_send;
39 | 	__u64 to_send_comp;
40 | 	__u64 to_recv;
41 | 	__u64 tot_sent;
42 | 	__u64 tot_recv;
43 | 	unsigned char *rxbuf;
44 |
45 | 	/* zero copy receive */
46 | 	size_t rsize;
47 | 	void *raddr;
48 | 	void *addr;
49 |
50 | 	struct connection_devmem devmem;
51 | 	struct kpm_test_spec *spec;
52 | 	struct tcp_info init_info;
53 | 	union {
54 | 		struct {
55 | 			unsigned int reqs;
56 | 			unsigned int hist[33];
57 | 			unsigned int log_len;
58 | 			unsigned int log_len_max;
59 | 			unsigned int *log;
60 | 		} rr;
61 | 	};
62 | 	struct list_node connections;
63 | };
64 |
65 | struct io_ops {
66 | 	void (*prep)(struct worker_state *state);
67 | 	void (*wait)(struct worker_state *state, int msec);
68 | 	void (*conn_add)(struct worker_state *state, struct worker_connection *conn);
69 | 	void (*conn_close)(struct worker_state *state, struct worker_connection *conn);
70 | 	void (*exit)(struct worker_state *state);
71 | };
72 |
73 | void worker_handle_proto(struct worker_state *self, struct kpm_header *hdr);
74 | void worker_kill_conn(struct worker_state *self, struct worker_connection *conn);
75 | void
worker_send_finished(struct worker_state *self, struct worker_connection *conn);
76 | void worker_recv_finished(struct worker_state *self, struct worker_connection *conn);
77 |
78 | #endif /* WORKER_H */
79 |
--------------------------------------------------------------------------------
/server.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef SERVER_H
5 | #define SERVER_H 1
6 |
7 | #include <stdbool.h>
8 | #include <net/if.h>
9 | #include <netinet/in.h>
10 |
11 | #include <sys/types.h>
12 | #include <linux/types.h>
13 |
14 | #include <ccan/list/list.h>
15 |
16 | #include "proto.h"
17 |
18 | #ifdef USE_CUDA
19 | #include <cuda.h>
20 | #endif
21 |
22 | #define PATTERN_PERIOD 255
23 |
24 | struct server_session {
25 | 	int cfd;
26 | 	pid_t pid;
27 | 	struct list_node sessions;
28 | };
29 |
30 | #ifndef MSG_SOCK_DEVMEM
31 | #define MSG_SOCK_DEVMEM 0x2000000
32 | #define SO_DEVMEM_LINEAR 78
33 | #define SO_DEVMEM_DMABUF 79
34 | #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF
35 | #define SO_DEVMEM_DONTNEED 80
36 | #endif
37 |
38 | struct dmabuf_cmsg {
39 | 	__u64 frag_offset;
40 | 	__u32 frag_size;
41 | 	__u32 frag_token;
42 | 	__u32 dmabuf_id;
43 | 	__u32 flags;
44 | };
45 |
46 | struct dmabuf_token {
47 | 	__u32 token_start;
48 | 	__u32 token_count;
49 | };
50 |
51 | #ifdef USE_CUDA
52 | struct memory_buffer_cuda {
53 | 	CUcontext ctx;
54 | };
55 | #endif
56 |
57 | struct memory_buffer {
58 | 	char *buf_mem;
59 | 	size_t size;
60 | 	int fd;
61 | 	int devfd;
62 | 	int memfd;
63 | 	int dmabuf_id;
64 | 	void *priv;
65 | #ifdef USE_CUDA
66 | 	struct memory_buffer_cuda cuda;
67 | #endif
68 | 	enum memory_provider_type provider;
69 | };
70 |
71 | struct memory_provider {
72 | 	int (*dev_init)(struct pci_dev *dev);
73 | 	struct memory_buffer *(*alloc)(size_t size);
74 | 	void (*free)(struct memory_buffer *mem);
75 | 	void (*memcpy_to_device)(struct memory_buffer *dst, size_t off,
76 | 				 void *src, int n);
77 | 	void (*memcpy_from_device)(void *dst, struct memory_buffer *src,
78 | 				   size_t off, int n);
79 | };
80 |
81 | struct connection_devmem {
82 | 	struct dmabuf_token rxtok[128];
83 | 	int rxtok_len;
84 | 	/* ncdevmem uses 80k, allocate 64k for recvmsg tokens */
85 | 	char ctrl_data[64 * 1024];
86 | };
87 |
88 | struct session_state_devmem {
89 | 	struct ynl_sock *ys;
90 | 	char ifname[IFNAMSIZ];
91 |
92 | 	/* RX */
93 | 	struct memory_buffer *mem;
94 | 	int rss_context;
95 |
96 | 	/* TX */
97 | 	struct memory_buffer *tx_mem;
98 | 	struct pci_dev tx_dev;
99 | 	__u32 dmabuf_tx_size_mb;
100 | 	enum memory_provider_type tx_provider;
101 | 	struct sockaddr_in6 addr;
102 | };
103 |
104 | struct worker_state_devmem {
105 | 	struct memory_buffer *mem;
106 | 	int dmabuf_id;
107 | };
108 |
109 | struct session_state_iou {
110 | 	unsigned int rx_size_mb;
111 | 	char ifname[IFNAMSIZ];
112 | 	int ifindex;
113 | 	int rss_context;
114 | 	int queue_id;
115 | };
116 |
117 | struct worker_state_iou {
118 | 	unsigned int rx_size_mb;
119 | 	int ifindex;
120 | 	int queue_id;
121 | };
122 |
123 | struct worker_opts {
124 | 	enum kpm_rx_mode rx_mode;
125 | 	enum kpm_tx_mode tx_mode;
126 | 	bool validate;
127 | 	bool use_iou;
128 | 	struct worker_state_devmem devmem;
129 | 	struct worker_state_iou iou;
130 | 	int fd;
131 | };
132 |
133 | struct server_session *
134 | server_session_spawn(int fd, struct sockaddr_in6 *addr, socklen_t *addrlen);
135 |
136 | void *worker_main(void *args);
137 |
138 | #endif /* SERVER_H */
139 |
--------------------------------------------------------------------------------
/tcp.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #include <stdio.h>
5 | #include <linux/tcp.h>
6 |
7 | #include "tcp.h"
8 |
9 | void print_tcp_info(struct tcp_info *ti)
10 | {
11 | 	printf("TCP stats\n"
12 | 	       "  %u %u %u %u %u %u %u %u %u %u\n"
13 | 	       "  %u %u %u %u %u %u %u %u %u\n"
14 | 	       "Times:   %u %u %u %u\n"
15 | 	       "Metrics: %u %u %u %u %u %u %u %u\n"
16 | 	       "rcv_rtt| %u %u %u\n"
17 | 	       "pacing_| %llu %llu %llu %llu\n"
18 | 	       "segs_ou| %u %u %u %u %u %u\n"
19 | 	       "de-ry_r| %llu %llu %llu %llu\n"
20 | 	       "de-ered| %u %u\n"
21 | 	       "bytes_s| %llu %llu\n"
22 | 	       "dsack_d| %u %u %u %u\n",
23 | 	       ti->tcpi_state,
24 | 	       ti->tcpi_ca_state,
25 | 	       ti->tcpi_retransmits,
26 | 	       ti->tcpi_probes,
27 | 	       ti->tcpi_backoff,
28 | 	       ti->tcpi_options,
29 | 	       ti->tcpi_snd_wscale,
30 | 	       ti->tcpi_rcv_wscale,
31 | 	       ti->tcpi_delivery_rate_app_limited,
32 | 	       ti->tcpi_fastopen_client_fail,
33 |
34 | 	       ti->tcpi_rto,
35 | 	       ti->tcpi_ato,
36 | 	       ti->tcpi_snd_mss,
37 | 	       ti->tcpi_rcv_mss,
38 |
39 | 	       ti->tcpi_unacked,
40 | 	       ti->tcpi_sacked,
41 | 	       ti->tcpi_lost,
42 | 	       ti->tcpi_retrans,
43 | 	       ti->tcpi_fackets,
44 |
45 | 	       /* Times. */
46 | 	       ti->tcpi_last_data_sent,
47 | 	       ti->tcpi_last_ack_sent,
48 | 	       ti->tcpi_last_data_recv,
49 | 	       ti->tcpi_last_ack_recv,
50 |
51 | 	       /* Metrics. */
52 | 	       ti->tcpi_pmtu,
53 | 	       ti->tcpi_rcv_ssthresh,
54 | 	       ti->tcpi_rtt,
55 | 	       ti->tcpi_rttvar,
56 | 	       ti->tcpi_snd_ssthresh,
57 | 	       ti->tcpi_snd_cwnd,
58 | 	       ti->tcpi_advmss,
59 | 	       ti->tcpi_reordering,
60 |
61 | 	       ti->tcpi_rcv_rtt,
62 | 	       ti->tcpi_rcv_space,
63 |
64 | 	       ti->tcpi_total_retrans,
65 |
66 | 	       ti->tcpi_pacing_rate,
67 | 	       ti->tcpi_max_pacing_rate,
68 | 	       ti->tcpi_bytes_acked,    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
69 | 	       ti->tcpi_bytes_received, /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
70 | 	       ti->tcpi_segs_out,	/* RFC4898 tcpEStatsPerfSegsOut */
71 | 	       ti->tcpi_segs_in,	/* RFC4898 tcpEStatsPerfSegsIn */
72 |
73 | 	       ti->tcpi_notsent_bytes,
74 | 	       ti->tcpi_min_rtt,
75 | 	       ti->tcpi_data_segs_in,	/* RFC4898 tcpEStatsDataSegsIn */
76 | 	       ti->tcpi_data_segs_out,	/* RFC4898 tcpEStatsDataSegsOut */
77 |
78 | 	       ti->tcpi_delivery_rate,
79 |
80 | 	       ti->tcpi_busy_time,	/* Time (usec) busy sending data */
81 | 	       ti->tcpi_rwnd_limited,	/* Time (usec) limited by receive window */
82 | 	       ti->tcpi_sndbuf_limited,	/* Time (usec) limited by send buffer */
83 |
84 | 	       ti->tcpi_delivered,
85 | 	       ti->tcpi_delivered_ce,
86 |
87 | 	       ti->tcpi_bytes_sent,	/* RFC4898 tcpEStatsPerfHCDataOctetsOut */
88 | 	       ti->tcpi_bytes_retrans,	/* RFC4898 tcpEStatsPerfOctetsRetrans */
89 | 	       ti->tcpi_dsack_dups,	/* RFC4898 tcpEStatsStackDSACKDups */
90 | 	       ti->tcpi_reord_seen,	/* reordering events seen */
91 |
92 | 	       ti->tcpi_rcv_ooopack,	/* Out-of-order packets received */
93 |
94 | 	       ti->tcpi_snd_wnd		/* peer's advertised receive window
95 | 					 * after scaling (bytes) */
96 | 	       );
97 | }
98 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. SPDX-License-Identifier: BSD-3-Clause
2 |
3 | kperf
4 | =====
5 |
6 | kperf is an iperf/netperf replacement with more fine-grained worker
7 | control. Modern NICs have multiple Rx queues, and while iperf / netperf
8 | can bind to a CPU, they are not aware of which CPU is serving the Rx queue
9 | selected by the NIC for the flow. If the NIC does not support flow steering
10 | this is a problem.
kperf asks the kernel which CPU is used for Rx and can
11 | bind itself appropriately (same core, Rx core + N, etc.). For parallel runs
12 | it can also make sure that the flows are not colliding (being served by
13 | the same CPU).
14 |
15 | Other strengths include:
16 | - RPC-like traffic (unlike iperf);
17 | - kTLS support (just data, no control records);
18 | - more stats (TCP, latency, CPU use).
19 |
20 | That said, kperf is more of a hackable library than a ready-to-use Swiss
21 | army knife. An example client application is provided, but the number
22 | of configurations is so high that it seems impossible to write a comprehensive
23 | client controlled solely by command line options.
24 |
25 | High level design
26 | -----------------
27 |
28 | The Client does not generate any traffic; it only orchestrates load between
29 | Servers.
30 |
31 | When a Client connects to a Server, the Server spawns a Session, which is what
32 | the Client controls on the server side. There can be multiple concurrent
33 | Sessions within one Server; there are no limitations. Note that a Session
34 | is between the Client and one Server, but it can contain connections to many
35 | other Sessions. Each Session is a separate process.
36 |
37 | A Session can establish Connections with other Sessions.
38 |
39 | A Session can spawn Workers, which is what drives the IO.
40 |
41 | Connections are established within Sessions, not Workers, because Connections
42 | are usually assigned to Workers only once it's known which CPU a given
43 | connection lands on.
44 |
45 | Currently only Process Workers are supported (each worker is a separate
46 | process); adding threads should not be a problem but has not been needed so far::
47 |
48 |                                 .--------.
49 |                            .----| Client |----.
50 |                            |    '--------'    |
51 |                            |                  |
52 |      ----------------------|-------      -----|----------------------
53 |                            v      |      |    v
54 |       .--------.     .---------.  |      |  .---------.     .--------.
55 |       | Server |-----| Session |  |      |  | Session |-----| Server |
56 |       '--------'     '---------'  |      |  '---------'     '--------'
57 |                           |       |      |       |
58 |                           v       |      |       v
59 |                      .---------.  |      |  .---------.
60 |                      | Worker  |  |      |  | Worker  |
61 |                      '---------'  |      |  '---------'
62 |     Host A           .---------.  |      |  .---------.           Host B
63 |                      | Worker  |  |      |  | Worker  |
64 |                      '---------'  |      |  '---------'
65 |                      .---------.  |      |  .---------.
66 |                      | Worker  |  |      |  | Worker  |
67 |                      '---------'  |      |  '---------'
68 |                                   |      |
69 |
70 | Contributing
71 | ------------
72 |
73 | Please refer to relevant details in the `license`_, `code of conduct`_,
74 | and `contributing guide`_.
75 |
76 | .. _license: LICENSE
77 | .. _code of conduct: CODE_OF_CONDUCT.md
78 | .. _contributing guide: CONTRIBUTING.md
79 |
80 | Per Meta's policies, contributors are required to submit a CLA.
81 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to make participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 |
55 | This Code of Conduct also applies outside the project spaces when there is a
56 | reasonable belief that an individual's behavior may have a negative impact on
57 | the project or its community.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported by contacting the project team at <opensource-conduct@fb.com>. All
63 | complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 |
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 |
72 | ## Attribution
73 |
74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76 |
77 | [homepage]: https://www.contributor-covenant.org
78 |
79 | For answers to common questions about this code of conduct, see
80 | https://www.contributor-covenant.org/faq
--------------------------------------------------------------------------------
/cpu_stat.c:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Jakub Kicinski */
3 | /* Copyright Meta Platforms, Inc. and affiliates */
4 |
5 | #include <stdio.h>
6 | #include <stdlib.h>
7 | #include <unistd.h>
8 | #include <sys/sysinfo.h>
9 |
10 | #include "cpu_stat.h"
11 |
12 | /* Skip the rest of the line */
13 | static void cpu_stat_nl(FILE *fp)
14 | {
15 | 	int c;	/* int, not char - getc() returns int and we compare to EOF */
16 |
17 | 	do {
18 | 		c = getc(fp);
19 | 	} while (c != '\n' && c != EOF);
20 | }
21 |
22 | struct cpu_stat *cpu_stat_snapshot(int ncpus)
23 | {
24 | 	struct cpu_stat *stats;
25 | 	FILE *fp;
26 | 	int i;
27 |
28 | 	if (!ncpus)
29 | 		ncpus = get_nprocs_conf();
30 | 	if (ncpus < 1)
31 | 		return NULL;
32 |
33 | 	stats = calloc(ncpus, sizeof(*stats));
34 | 	if (!stats)
35 | 		return NULL;
36 |
37 | 	fp = fopen("/proc/stat", "r");
38 | 	if (!fp)
39 | 		goto err_free;
40 |
41 | 	/* skip first line */
42 | 	cpu_stat_nl(fp);
43 |
44 | 	for (i = 0; i < ncpus; i++) {
45 | 		unsigned long long int nice;
46 |
47 | 		fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu %llu",
48 | 		       &stats[i].cpu_id,
49 | 		       &stats[i].user, &nice,
50 | 		       &stats[i].system,
51 | 		       &stats[i].idle,
52 | 		       &stats[i].iowait,
53 | 		       &stats[i].irq,
54 | 		       &stats[i].sirq);
55 | 		stats[i].user += nice;
56 | 		cpu_stat_nl(fp);
57 | 	}
58 |
59 | 	fclose(fp);
60 | 	return stats;
61 |
62 | err_free:
63 | 	free(stats);
64 | 	return NULL;
65 | }
66 |
67 | /* dst -= op; */
68 | void cpu_stat_sub(struct cpu_stat *dst, struct cpu_stat *op, int ncpus)
69 | {
70 | 	int i;
71 |
72 | 	if (!ncpus)
73 | 		ncpus = get_nprocs_conf();
74 |
75 | 	for (i = 0; i < ncpus; i++) {
76 | 		dst[i].user -= op[i].user;
77 | 		dst[i].system -= op[i].system;
78 | 		dst[i].idle -= op[i].idle;
79 | 		dst[i].iowait -= op[i].iowait;
80 | 		dst[i].irq -= op[i].irq;
81 | 		dst[i].sirq -= op[i].sirq;
82 | 	}
83 | }
84 |
85 | struct cpu_stat *cpu_stat_to_pct00(struct cpu_stat *src, int ncpus)
86 | {
87 | 	struct cpu_stat *pct;
88 | 	int i;
89 |
90 | 	if (!ncpus)
91 | 		ncpus = get_nprocs_conf();
92 | 	if (ncpus < 1)
93 | 		return NULL;
94 |
95 | 	pct = calloc(ncpus, sizeof(*pct));
96 | 	if (!pct)
97 | 		return NULL;
98 |
99 | 	for (i = 0; i < ncpus; i++) {
100 | 		unsigned long long int total;
101 |
102 | 		total = src[i].user + src[i].system + src[i].idle +
103 | 			src[i].iowait + src[i].irq + src[i].sirq;
104 |
105 | 		pct[i].cpu_id = src[i].cpu_id;
106 | 		pct[i].user = src[i].user * 10000 / total;
107 | 		pct[i].system = src[i].system * 10000 / total;
108 | 		pct[i].idle = src[i].idle * 10000 / total;
109 | 		pct[i].iowait = src[i].iowait * 10000 / total;
110 | 		pct[i].irq = src[i].irq * 10000 / total;
111 | 		pct[i].sirq = src[i].sirq * 10000 / total;
112 | 	}
113 |
114 | 	return pct;
115 | }
116 |
117 | #ifdef KPERF_UNITS
118 | int main()
119 | {
120 | 	struct cpu_stat *stats1, *stats2;
121 | 	struct cpu_stat *totpct, *diffpct;
122 | 	int i;
123 |
124 | 	stats1 = cpu_stat_snapshot(0);
125 | 	sleep(1);
126 | 	stats2 = cpu_stat_snapshot(0);
127 | 	totpct = cpu_stat_to_pct00(stats2, 0);
128 |
129 | 	cpu_stat_sub(stats2, stats1, 0);
130 | 	diffpct = cpu_stat_to_pct00(stats2, 0);
131 |
132 | 	for (i = 0;
i < get_nprocs_conf(); i++) {
133 | 		printf("%u/%u: usr:%2llu sys:%2llu idl:%2llu\n",
134 | 		       i, stats1[i].cpu_id,
135 | 		       stats1[i].user,
136 | 		       stats1[i].system,
137 | 		       stats1[i].idle);
138 | 		printf("%u/%u: usr:%2llu.%02llu sys:%2llu.%02llu idl:%2llu.%02llu\n",
139 | 		       i, totpct[i].cpu_id,
140 | 		       totpct[i].user / 100, totpct[i].user % 100,
141 | 		       totpct[i].system / 100, totpct[i].system % 100,
142 | 		       totpct[i].idle / 100, totpct[i].idle % 100);
143 | 	}
144 | 	free(totpct);
145 | 	free(diffpct);
146 | 	free(stats1);
147 | 	free(stats2);
148 |
149 | 	return 0;
150 | }
151 | #endif
152 |
--------------------------------------------------------------------------------
/server.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #include <arpa/inet.h>
5 | #include <errno.h>
6 | #include <fcntl.h>
7 | #include <netdb.h>
8 | #include <signal.h>
9 | #include <stdio.h>
10 | #include <stdlib.h>
11 | #include <string.h>
12 | #include <sys/select.h>
13 | #include <sys/wait.h>
14 | #include <unistd.h>
15 |
16 | #include <ccan/daemonize/daemonize.h>
17 | #include <ccan/err/err.h>
18 | #include <ccan/list/list.h>
19 | #include <ccan/net/net.h>
20 | #include <ccan/opt/opt.h>
21 |
22 | #include "server.h"
23 | #include "proto_dbg.h"
24 |
25 | int verbose = 3;
26 |
27 | static struct {
28 | 	char *addr;
29 | 	char *service;
30 | 	char *pid_file;
31 | 	bool kill;
32 | 	bool server;
33 | } opt = {
34 | 	.server = true,
35 | 	.service = "18323",
36 | 	.pid_file = "/tmp/kperf.pid",
37 | };
38 |
39 | static const struct opt_table opts[] = {
40 | 	OPT_WITH_ARG("--addr|-a <arg>", opt_set_charp, opt_show_charp,
41 | 		     &opt.addr, "Bind to specific control address"),
42 | 	OPT_WITH_ARG("--port|-p <arg>", opt_set_charp, opt_show_charp,
43 | 		     &opt.service, "Set control port/service to listen on"),
44 | 	OPT_WITHOUT_ARG("--no-daemon", opt_set_invbool, &opt.server,
45 | 			"Don't start a daemon"),
46 | 	OPT_WITH_ARG("--pid-file <arg>", opt_set_charp, opt_show_charp,
47 | 		     &opt.pid_file, "Set daemon identity / pid file"),
48 | 	OPT_WITHOUT_ARG("--kill", opt_set_bool, &opt.kill, "Stop the daemon"),
49 | 	OPT_WITHOUT_ARG("--verbose|-v", opt_inc_intval, &verbose,
50 | 			"Verbose mode (can be specified more than once)"),
51 | 	OPT_WITHOUT_ARG("--usage|--help|-h", opt_usage_and_exit,
52 | 			"kperf server", "Show this help message"),
53 | 	OPT_ENDTABLE
54 | };
55 |
56 | static volatile sig_atomic_t chld;
57 |
58 | static void chld_sig_handler(int sig)
59 | {
60 | 	chld = 1;
61 | }
62 |
63 | static struct list_head sessions = LIST_HEAD_INIT(sessions);
64 |
65 | static void server_session_add(struct server_session *ses)
66 | {
67 | 	list_add(&sessions, &ses->sessions);
68 | }
69 |
70 | static void server_session_del(pid_t pid)
71 | {
72 | 	struct server_session *ses = NULL;
73 |
74 | 	list_for_each(&sessions, ses, sessions) {
75 | 		if (ses->pid == pid)
76 | 			break;
77 | 	}
78 | 	if (!ses || ses->pid != pid)
79 | 		return;
80 |
81 | 	list_del(&ses->sessions);
82 | 	free(ses);
83 | }
84 |
85 | static void server_reap_sessions(void)
86 | {
87 | 	if (!chld)
88 | 		return;
89 |
90 | 	while (true) {
91 | 		int status;
92 | 		pid_t pid;
93 |
94 | 		chld = 0;
95 | 		pid = waitpid(-1, &status, WNOHANG);
96 | 		if (pid < 1)
97 | 			break;
98 | 		server_session_del(pid);
99 | 	}
100 | }
101 |
102 | static void kill_old_daemon(void)
103 | {
104 | 	char buf[10];
105 | 	ssize_t n;
106 | 	pid_t pid;
107 | 	int fd;
108 |
109 | 	fd = open(opt.pid_file, O_RDONLY);
110 | 	if (fd < 0) {
111 | 		if (errno == ENOENT)
112 | 			return;
113 | 		err(2, "Failed to open PID file");
114 | 	}
115 |
116 | 	n = read(fd, buf, sizeof(buf));
117 | 	if (n < 0)
118 | 		err(2, "Failed to read PID file");
119 | 	if (!n || n == sizeof(buf))
120 | 		errx(2, "Bad pid
file len - %zd", n); 121 | buf[n] = 0; 122 | close(fd); 123 | 124 | pid = atoi(buf); 125 | 126 | if (kill(pid, SIGKILL)) 127 | if (errno != ESRCH) 128 | err(2, "Can't kill the old daemon"); 129 | 130 | if (unlink(opt.pid_file)) 131 | err(2, "Failed to remove pid file"); 132 | } 133 | 134 | static void server_daemonize(void) 135 | { 136 | char buf[10]; 137 | ssize_t n; 138 | int fd; 139 | 140 | fd = open(opt.pid_file, O_WRONLY | O_CREAT | O_EXCL, 00660); 141 | if (fd < 0) 142 | err(3, "Failed to create PID file"); 143 | 144 | if (!daemonize()) 145 | err(1, "can't daemonize"); 146 | 147 | n = snprintf(buf, sizeof(buf), "%d", getpid()); 148 | if (!n || n == sizeof(buf)) 149 | errx(3, "Bad pid file len - %zd", n); 150 | 151 | if (write(fd, buf, n) != n) 152 | err(3, "Short write to pid file"); 153 | close(fd); 154 | } 155 | 156 | /* same as net_server_lookup but accepts the node argument */ 157 | static struct addrinfo *net_server_lookup_node(const char *node, 158 | const char *service, 159 | int family, 160 | int socktype) 161 | { 162 | struct addrinfo *res, hints; 163 | 164 | memset(&hints, 0, sizeof(hints)); 165 | hints.ai_family = family; 166 | hints.ai_socktype = socktype; 167 | hints.ai_flags = AI_PASSIVE; 168 | hints.ai_protocol = 0; 169 | 170 | if (getaddrinfo(node, service, &hints, &res) != 0) 171 | return NULL; 172 | 173 | return res; 174 | } 175 | 176 | static void log_address(const char *format, struct sockaddr_in6 *sin6) 177 | { 178 | struct sockaddr_in *sin = (void *)sin6; 179 | char buf[256]; 180 | 181 | if (sin6->sin6_family == AF_INET6) 182 | inet_ntop(AF_INET6, &sin6->sin6_addr, buf, sizeof(buf)); 183 | else 184 | inet_ntop(AF_INET, &sin->sin_addr, buf, sizeof(buf)); 185 | 186 | kpm_info(format, buf); 187 | } 188 | 189 | static void print_listener(int *fds, int num_fds) 190 | { 191 | struct sockaddr_in6 sin6; 192 | socklen_t sa_len; 193 | int ret; 194 | int i; 195 | 196 | for (i = 0; i < num_fds; i++) { 197 | sa_len = sizeof(sin6); 198 | ret = getsockname(fds[i], (struct sockaddr *)&sin6, &sa_len); 199 | if (ret != 0) 200 | err(1, "Failed to look up address for fd %d", fds[i]); 201 | log_address("Bound to %s", &sin6); 202 | } 203 | } 204 | 205 | int main(int argc, char *argv[]) 206 | { 207 | int fds[2], i, num_fds, max_fd; 208 | struct addrinfo *addr; 209 | 210 | opt_register_table(opts, NULL); 211 | if (!opt_parse(&argc, argv, opt_log_stderr)) 212 | exit(1); 213 | 214 | err_set_progname(argv[0]); 215 | 216 | if (opt.server || opt.kill) 217 | kill_old_daemon(); 218 | if (opt.kill) 219 | return 0; 220 | 221 | if (opt.server) 222 | server_daemonize(); 223 | 224 | addr = net_server_lookup_node(opt.addr, opt.service, AF_UNSPEC, SOCK_STREAM); 225 | if (!addr) 226 | errx(1, "Failed to look up service to bind to"); 227 | 228 | num_fds = net_bind(addr, fds); 229 | freeaddrinfo(addr); 230 | if (num_fds < 1) 231 | err(1, "Failed to listen"); 232 | if (opt.addr) 233 | print_listener(fds, num_fds); 234 | 235 | max_fd = num_fds == 1 || fds[0] > fds[1] ? 
fds[0] : fds[1];
236 |
237 | 	signal(SIGCHLD, chld_sig_handler);
238 |
239 | 	while (true) {
240 | 		struct sockaddr_in6 sockaddr;
241 | 		struct server_session *ses;
242 | 		struct timeval tv;
243 | 		socklen_t addrlen;
244 | 		int cfd, fd, ret;
245 | 		fd_set rfds;
246 |
247 | 		FD_ZERO(&rfds);
248 | 		for (i = 0; i < num_fds; i++)
249 | 			FD_SET(fds[i], &rfds);
250 |
251 | 		tv.tv_sec = 1;
252 | 		tv.tv_usec = 0;
253 |
254 | 		ret = select(max_fd + 1, &rfds, NULL, NULL, &tv);
255 | 		if (ret < 0) {
256 | 			if (errno == EINTR && chld)
257 | 				goto reap_child;
258 | 			err(2, "Failed to select");
259 | 		} else if (!ret) {
260 | 			continue;
261 | 		}
262 |
263 | 		if (FD_ISSET(fds[0], &rfds))
264 | 			fd = fds[0];
265 | 		else if (num_fds > 1 && FD_ISSET(fds[1], &rfds))
266 | 			fd = fds[1];
267 | 		else
268 | 			errx(3, "Failed to find fd");
269 |
270 | 		addrlen = sizeof(sockaddr);
271 | 		cfd = accept(fd, (void *)&sockaddr, &addrlen);
272 | 		if (cfd < 0) {
273 | 			warn("Failed to accept");
274 | 			continue;
275 | 		}
276 |
277 | 		if (opt.addr)
278 | 			log_address("Accepted %s", &sockaddr);
279 | 		ses = server_session_spawn(cfd, &sockaddr, &addrlen);
280 | 		if (ses)
281 | 			server_session_add(ses);
282 | reap_child:
283 | 		server_reap_sessions();
284 | 	}
285 |
286 | 	return 0;
287 | }
288 |
--------------------------------------------------------------------------------
/proto.h:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: BSD-3-Clause */
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 |
4 | #ifndef PROTO_H
5 | #define PROTO_H 1
6 |
7 | #include <stdbool.h>
8 | #include <stddef.h>
9 | #include <sys/socket.h>
10 | #include <netinet/in.h>
11 | #include <linux/types.h>
12 | #include <linux/tls.h>
13 |
14 | #define KPERF_MAX_CPUS 1024
15 |
16 | enum kpm_msg_type {
17 | 	KPM_MSG_TYPE_ERROR = 1,
18 | 	KPM_MSG_TYPE_ECHO,
19 | 	KPM_MSG_TYPE_HELLO,
20 | 	KPM_MSG_TYPE_SPAWN_WORKER,
21 | 	KPM_MSG_TYPE_PIN_WORKER,
22 | 	KPM_MSG_TYPE_OPEN_TCP_ACCEPTOR,
23 | 	KPM_MSG_TYPE_CONNECT,
24 | 	KPM_MSG_TYPE_DISCONNECT,
25 | 	KPM_MSG_TYPE_CONNECTION_ID,
26 | 	KPM_MSG_TYPE_TLS,
27 | 	KPM_MSG_TYPE_MAX_PACING,
28 | 	KPM_MSG_TYPE_TCP_CC,
29 | 	KPM_MSG_TYPE_MODE,
30 | 	KPM_MSG_TYPE_TEST,
31 | 	KPM_MSG_TYPE_TEST_RESULT,
32 | 	KPM_MSG_TYPE_END_TEST,
33 |
34 | 	KPM_MSG_WORKER_ID,
35 | 	KPM_MSG_WORKER_KILL,
36 | 	KPM_MSG_WORKER_TEST,
37 | 	KPM_MSG_WORKER_END_TEST,
38 | 	KPM_MSG_WORKER_TEST_RESULT,
39 |
40 | 	__KPM_MSG_TOTAL,
41 |
42 | 	KPM_MSG_REPLY = 0x8000
43 | };
44 |
45 | struct kpm_header {
46 | 	__u16 type;
47 | 	__u16 id;
48 | 	__u32 len;
49 | };
50 |
51 | struct kpm_empty {
52 | 	struct kpm_header hdr;
53 | };
54 |
55 | struct kpm_hello {
56 | 	struct kpm_header hdr;
57 | 	__u32 version;
58 | 	__u32 n_cpus;
59 | };
60 |
61 | struct __kpm_generic_u16 {
62 | 	struct kpm_header hdr;
63 | 	__u16 val;
64 | 	__u16 pad;
65 | };
66 |
67 | struct kpm_tcp_acceptor_reply {
68 | 	struct kpm_header hdr;
69 | 	socklen_t len;
70 | 	struct sockaddr_in6 addr;
71 | };
72 |
73 | struct __kpm_generic_u32 {
74 | 	struct kpm_header hdr;
75 | 	__u32 val;
76 | };
77 |
78 | struct kpm_reply_error {
79 | 	struct kpm_header hdr;
80 | 	__u16 type;
81 | 	__u16 error;
82 | };
83 |
84 | struct kpm_pin_worker {
85 | 	struct kpm_header hdr;
86 | 	__u32 worker_id;
87 | 	__u32 cpu;
88 | };
89 |
90 | struct kpm_connect {
91 | 	struct kpm_header hdr;
92 | 	socklen_t len;
93 | 	struct sockaddr_in6 addr;
94 | 	__u32 mss;
95 | };
96 |
97 | struct kpm_connect_reply {
98 | 	struct kpm_header hdr;
99 | 	struct {
100 | 		__u32 id;
101 | 		__u32 cpu;
102 | 		__u16 port;
103 | 	} local, remote;
104 | };
105 |
106 | struct kpm_connection_id {
107 | 	struct kpm_header hdr;
108 | 	__u32 id;
109
| __u32 cpu; 110 | }; 111 | 112 | struct kpm_max_pacing { 113 | struct kpm_header hdr; 114 | __u32 id; 115 | __u32 max_pacing; 116 | }; 117 | 118 | #define KPM_CC_NAME_LEN 16 119 | 120 | struct kpm_tcp_cc { 121 | struct kpm_header hdr; 122 | __u32 id; 123 | char cc_name[KPM_CC_NAME_LEN]; 124 | }; 125 | 126 | enum kpm_rx_mode { 127 | KPM_RX_MODE_SOCKET, 128 | KPM_RX_MODE_SOCKET_TRUNC, 129 | KPM_RX_MODE_SOCKET_ZEROCOPY, 130 | KPM_RX_MODE_DEVMEM, 131 | }; 132 | 133 | enum kpm_tx_mode { 134 | KPM_TX_MODE_SOCKET, 135 | KPM_TX_MODE_SOCKET_ZEROCOPY, 136 | KPM_TX_MODE_DEVMEM, 137 | }; 138 | 139 | enum memory_provider_type { 140 | MEMORY_PROVIDER_HOST, 141 | MEMORY_PROVIDER_CUDA, 142 | }; 143 | 144 | #define DEVICE_DOMAIN_ANY 0xffff 145 | #define DEVICE_BUS_ANY 0xff 146 | #define DEVICE_DEVICE_ANY 0xff 147 | 148 | struct pci_dev { 149 | __u16 domain; 150 | __u8 bus; 151 | __u8 device; 152 | }; 153 | 154 | struct kpm_mode { 155 | struct kpm_header hdr; 156 | enum kpm_rx_mode rx_mode; 157 | enum kpm_tx_mode tx_mode; 158 | 159 | /* devmem info */ 160 | enum memory_provider_type rx_provider; 161 | enum memory_provider_type tx_provider; 162 | struct pci_dev dev; 163 | __u32 dmabuf_rx_size_mb; 164 | __u32 dmabuf_tx_size_mb; 165 | __u32 num_rx_queues; 166 | struct sockaddr_in6 addr; 167 | 168 | __u8 validate; 169 | __u8 iou; 170 | __u32 iou_rx_size_mb; 171 | }; 172 | 173 | enum kpm_tls_mask { 174 | KPM_TLS_ULP = 1, 175 | KPM_TLS_TX = 2, 176 | KPM_TLS_RX = 4, 177 | KPM_TLS_NOPAD = 8, 178 | }; 179 | 180 | struct kpm_tls { 181 | struct kpm_header hdr; 182 | __u32 connection_id; 183 | __u32 dir_mask; 184 | socklen_t len; 185 | union { 186 | struct tls12_crypto_info_aes_gcm_128 aes128; 187 | } info; 188 | }; 189 | 190 | struct kpm_end_test { 191 | struct kpm_header hdr; 192 | __u32 id; 193 | }; 194 | 195 | enum kpm_test_type { 196 | KPM_TEST_TYPE_STREAM = 1, 197 | KPM_TEST_TYPE_RR, 198 | }; 199 | 200 | #define KPM_DFL_OP_CHUNK (1 << 16) 201 | #define KPM_MAX_OP_CHUNK (1 << 27) 202 | 203 | struct kpm_test { 204 | struct kpm_header hdr; 205 | __u8 active; 206 | __u8 pad; 207 | __u16 time_sec; 208 | __u32 n_conns; 209 | __u32 test_id; 210 | struct kpm_test_spec { 211 | __u32 connection_id; 212 | __u32 worker_id; 213 | enum kpm_test_type type; 214 | __u32 read_size; 215 | __u32 write_size; 216 | union kpm_test_arg { 217 | struct { 218 | __u32 req_size; 219 | __u32 resp_size; 220 | __u8 timings; 221 | } rr; 222 | } arg; 223 | } specs[0]; 224 | }; 225 | 226 | struct kpm_test_results { 227 | struct kpm_header hdr; 228 | __u32 time_usec; 229 | __u32 n_conns; 230 | __u32 test_id; 231 | struct kpm_cpu_load { 232 | __u16 id; 233 | __u16 user; /* sum of user and nice */ 234 | __u16 system; 235 | __u16 idle; 236 | __u16 iowait; 237 | __u16 irq; 238 | __u16 sirq; 239 | } cpu_load[KPERF_MAX_CPUS]; 240 | struct kpm_test_result { 241 | __u32 connection_id; 242 | __u32 worker_id; 243 | enum kpm_test_type type; 244 | __u64 rx_bytes; 245 | __u64 tx_bytes; 246 | 247 | __u32 reqs; 248 | 249 | __u32 retrans; 250 | __u32 reord_seen; 251 | __u32 rtt; 252 | __u32 rttvar; 253 | __u32 delivered_ce; 254 | __u32 snd_wnd; 255 | __u32 snd_cwnd; 256 | 257 | __u32 lat_hist[33]; 258 | __u32 p25; 259 | __u32 p50; 260 | __u32 p90; 261 | __u32 p99; 262 | __u32 p999; 263 | __u32 p9999; 264 | } res[0]; 265 | }; 266 | 267 | #define kpm_good_req(msg, msg_type) \ 268 | ({ \ 269 | struct kpm_header *_hdr = &(msg)->hdr; \ 270 | int _ret; \ 271 | \ 272 | _ret = _hdr->type == (msg_type) && \ 273 | _hdr->len == sizeof(*msg); \ 274 | _ret; \ 275 | }) 276 | 
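/* Example: a synchronous request/reply round trip as a client of this
 * protocol might perform it. Illustrative sketch only -- the
 * KPM_MSG_TYPE_END_TEST request and the __kpm_generic_u32 reply layout
 * are stand-ins (each request type defines what reply, if any, it
 * expects), and error handling is abbreviated. kpm_send_u32() stamps a
 * fresh sequence number and returns it (or -1); kpm_receive() returns a
 * malloc()ed message that the caller must free.
 *
 *	struct __kpm_generic_u32 *reply;
 *	int seq;
 *
 *	seq = kpm_send_u32(fd, KPM_MSG_TYPE_END_TEST, test_id);
 *	if (seq < 0)
 *		return -1;
 *	reply = kpm_receive(fd);
 *	if (!reply || !kpm_good_reply(reply, KPM_MSG_TYPE_END_TEST, seq)) {
 *		free(reply);
 *		return -1;
 *	}
 *	// use reply->val
 *	free(reply);
 */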
277 | #define kpm_good_reply(msg, msg_type, seq) \ 278 | ({ \ 279 | struct kpm_header *_hdr = &(msg)->hdr; \ 280 | int _ret; \ 281 | \ 282 | _ret = _hdr->type == ((msg_type) | KPM_MSG_REPLY) && \ 283 | _hdr->id == (seq) && \ 284 | _hdr->len == sizeof(*msg); \ 285 | _ret; \ 286 | }) 287 | 288 | void *kpm_msg_dup(struct kpm_header *hdr); 289 | 290 | void *kpm_receive(int fd); 291 | 292 | int kpm_send(int fd, struct kpm_header *msg, size_t size, 293 | enum kpm_msg_type type); 294 | int kpm_send_empty(int fd, enum kpm_msg_type type); 295 | int kpm_send_u32(int fd, enum kpm_msg_type type, __u32 arg); 296 | 297 | int kpm_send_conn_id(int fd, __u32 id, __u32 cpu); 298 | int kpm_send_connect(int fd, struct sockaddr_in6 *addr, socklen_t len, 299 | __u32 mss); 300 | int kpm_send_tls(int fd, __u32 conn_id, __u32 dir_mask, 301 | void *info, socklen_t len); 302 | int kpm_send_max_pacing(int fd, __u32 id, __u32 max_pace); 303 | int kpm_send_tcp_cc(int fd, __u32 id, char *cc_name); 304 | int kpm_send_mode(int fd, struct kpm_mode *mode); 305 | int kpm_send_pin_worker(int fd, __u32 id, __u32 cpu); 306 | 307 | void kpm_reply_error(int fd, struct kpm_header *hdr, __u16 error); 308 | 309 | int kpm_reply_empty(int fd, struct kpm_header *hdr); 310 | int kpm_reply_u16(int fd, struct kpm_header *hdr, __u16 arg); 311 | int kpm_reply_u32(int fd, struct kpm_header *hdr, __u32 arg); 312 | 313 | int kpm_reply_acceptor(int fd, struct kpm_header *hdr, 314 | struct sockaddr_in6 *addr, socklen_t len); 315 | int kpm_reply_connect(int fd, struct kpm_header *hdr, 316 | __u32 local_id, __u32 local_cpu, __u16 local_port, 317 | __u32 remote_id, __u32 remote_cpu, __u16 remote_port); 318 | 319 | int kpm_xchg_hello(int fd, unsigned int *ncpus); 320 | 321 | int kpm_req_tcp_sock(int fd, struct sockaddr_in6 *addr, socklen_t *len); 322 | int kpm_req_end_test(int fd, __u32 test_id); 323 | int kpm_req_tls(int fd, __u32 conn_id, __u32 dir_mask, 324 | void *info, socklen_t len); 325 | int kpm_req_pacing(int fd, __u32 conn_id, __u32 max_pace); 326 | int kpm_req_tcp_cc(int fd, __u32 conn_id, char *cc_name); 327 | int kpm_req_mode(int fd, struct kpm_mode *mode); 328 | int kpm_req_disconnect(int fd, __u32 connection_id); 329 | 330 | #endif /* PROTO_H */ 331 | -------------------------------------------------------------------------------- /bipartite_match.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Jakub Kicinski */ 3 | /* Copyright Meta Platforms, Inc. 
and affiliates */
4 |
5 | #include <stdbool.h>
6 | #include <stdlib.h>
7 | #include <string.h>
8 |
9 | #include <ccan/array_size/array_size.h>
10 | #include <ccan/compiler/compiler.h>
11 | #include <ccan/minmax/minmax.h>
12 | #include <ccan/tal/tal.h>
13 |
14 | #include "bipartite_match.h"
15 |
16 | #define INIT_STATE_SIZE 8
17 |
18 | struct bim_state {
19 | 	/* Sizing allocated memory */
20 | 	unsigned int max_left;
21 | 	unsigned int max_right;
22 | 	/* Max vertex indexes seen */
23 | 	unsigned int max_used_left;
24 | 	unsigned int max_used_right;
25 |
26 | 	/* Arrays indexed by left_id */
27 | 	unsigned int **left_neigh;
28 | 	void ***left_neigh_cookie;
29 | 	unsigned int *n_left_neigh;
30 | 	int *left_match;
31 |
32 | 	/* Arrays indexed by right_id */
33 | 	int *right_match;
34 |
35 | 	/* Recursion state */
36 | 	struct {
37 | 		bool *left_on_path;
38 | 	} aug;
39 |
40 | 	/* Cached count of pairings */
41 | 	unsigned int n_matches;
42 | };
43 |
44 | struct bim_state *bim_init(void)
45 | {
46 | 	struct bim_state *bim;
47 | 	unsigned int i;
48 |
49 | 	bim = talz(NULL, struct bim_state);
50 |
51 | 	bim->max_left = INIT_STATE_SIZE;
52 | 	bim->max_right = INIT_STATE_SIZE;
53 |
54 | 	bim->left_neigh = tal_arr(bim, unsigned int *, bim->max_left);
55 | 	for (i = 0; i < bim->max_left; i++)
56 | 		bim->left_neigh[i] = tal_arr(bim, unsigned int, bim->max_right);
57 |
58 | 	bim->left_neigh_cookie = tal_arr(bim, void **, bim->max_left);
59 | 	for (i = 0; i < bim->max_left; i++)
60 | 		bim->left_neigh_cookie[i] = tal_arr(bim, void *,
61 | 						    bim->max_right);
62 | 	bim->n_left_neigh = tal_arrz(bim, unsigned int, bim->max_left);
63 |
64 | 	bim->left_match = tal_arr(bim, int, bim->max_left);
65 | 	memset(bim->left_match, 0xff, bim->max_left * sizeof(int));
66 | 	bim->right_match = tal_arr(bim, int, bim->max_right);
67 | 	memset(bim->right_match, 0xff, bim->max_right * sizeof(int));
68 |
69 | 	bim->aug.left_on_path = tal_arr(bim, bool, bim->max_left);
70 |
71 | 	return bim;
72 | }
73 |
74 | void bim_destroy(struct bim_state *bim)
75 | {
76 | 	tal_free(bim);
77 | }
78 |
79 | static void bim_realloc(struct bim_state *bim,
80 | 			unsigned int max_left, unsigned int max_right)
81 | {
82 | 	unsigned int i;
83 |
84 | 	tal_resize(&bim->left_neigh, max_left);
85 | 	for (i = 0; i < bim->max_left; i++)
86 | 		tal_resize(&bim->left_neigh[i], max_right);
87 | 	for (; i < max_left; i++)
88 | 		bim->left_neigh[i] = tal_arr(bim, unsigned int, max_right);
89 |
90 | 	tal_resize(&bim->left_neigh_cookie, max_left);
91 | 	for (i = 0; i < bim->max_left; i++)
92 | 		tal_resize(&bim->left_neigh_cookie[i], max_right);
93 | 	for (; i < max_left; i++)
94 | 		bim->left_neigh_cookie[i] = tal_arr(bim, void *, max_right);
95 |
96 | 	tal_resizez(&bim->n_left_neigh, max_left);
97 |
98 | 	tal_resize(&bim->left_match, max_left);
99 | 	if (max_left > bim->max_left)
100 | 		memset(&bim->left_match[bim->max_left], 0xff,
101 | 		       (max_left - bim->max_left) * sizeof(int));
102 |
103 | 	tal_resize(&bim->right_match, max_right);
104 | 	if (max_right > bim->max_right)
105 | 		memset(&bim->right_match[bim->max_right], 0xff,
106 | 		       (max_right - bim->max_right) * sizeof(int));
107 |
108 | 	tal_resize(&bim->aug.left_on_path, max_left);
109 |
110 | 	bim->max_left = max_left;
111 | 	bim->max_right = max_right;
112 | }
113 |
114 | /* Resize the state, can be used both to grow and shrink.
115 |  * Pass 0, 0 to trim overallocation.
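 *
 * Example (illustrative only -- n_flows and n_cpus are hypothetical
 * sizes known to the caller):
 *
 *	bim_resize(bim, n_flows, n_cpus);
 *	// ... many bim_add_edge() calls without reallocation ...
 *	bim_resize(bim, 0, 0);	// trim overallocation when done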
116 |  */
117 | void bim_resize(struct bim_state *bim,
118 | 		unsigned int max_left, unsigned int max_right)
119 | {
120 | 	max_left = max(bim->max_used_left + 1, max_left);
121 | 	max_right = max(bim->max_used_right + 1, max_right);
122 |
123 | 	if (bim->max_left != max_left || bim->max_right != max_right)
124 | 		bim_realloc(bim, max_left, max_right);
125 | }
126 |
127 | static void bim_size_check(struct bim_state *bim,
128 | 			   unsigned int left_id, unsigned int right_id)
129 | {
130 | 	bim->max_used_left = max(bim->max_used_left, left_id);
131 | 	bim->max_used_right = max(bim->max_used_right, right_id);
132 |
133 | 	if (bim->max_used_left >= bim->max_left ||
134 | 	    bim->max_used_right >= bim->max_right)
135 | 		bim_realloc(bim, max(bim->max_used_left * 2, bim->max_left),
136 | 			    max(bim->max_used_right * 2, bim->max_right));
137 | }
138 |
139 | /* == Algo == */
140 | /* Straightforward implementation of Kuhn's max bipartite matching algorithm */
141 |
142 | static void bim_reset_aug_state(struct bim_state *bim)
143 | {
144 | 	memset(bim->aug.left_on_path, 0,
145 | 	       sizeof(bool) * (bim->max_used_left + 1));
146 | }
147 |
148 | static void bim_add_match(struct bim_state *bim,
149 | 			  unsigned int left_id, unsigned int right_id)
150 | {
151 | 	bim->left_match[left_id] = right_id;
152 | 	bim->right_match[right_id] = left_id;
153 | }
154 |
155 | static bool bim_try_aug(struct bim_state *bim, unsigned int left_id)
156 | {
157 | 	unsigned int i;
158 |
159 | 	if (bim->aug.left_on_path[left_id])
160 | 		return false;
161 | 	bim->aug.left_on_path[left_id] = true;
162 |
163 | 	for (i = 0; i < bim->n_left_neigh[left_id]; i++) {
164 | 		unsigned int right_id = bim->left_neigh[left_id][i];
165 |
166 | 		if (bim->right_match[right_id] == -1 ||
167 | 		    bim_try_aug(bim, bim->right_match[right_id])) {
168 | 			bim_add_match(bim, left_id, right_id);
169 | 			return true;
170 | 		}
171 | 	}
172 |
173 | 	return false;
174 | }
175 |
176 | /* Returns false if edge is a duplicate */
177 | bool bim_add_edge(struct bim_state *bim,
178 | 		  unsigned int left_id, unsigned int right_id, void *cookie)
179 | {
180 | 	unsigned int i, lv;
181 |
182 | 	bim_size_check(bim, left_id, right_id);
183 |
184 | 	/* Add edge */
185 | 	for (i = 0; i < bim->n_left_neigh[left_id]; i++)
186 | 		/* Duplicate edge add, ignore */
187 | 		if (bim->left_neigh[left_id][i] == right_id)
188 | 			return false;
189 | 	i = bim->n_left_neigh[left_id]++;
190 | 	bim->left_neigh[left_id][i] = right_id;
191 | 	bim->left_neigh_cookie[left_id][i] = cookie;
192 |
193 | 	/* Fast path good edge */
194 | 	if (bim->left_match[left_id] == -1 &&
195 | 	    bim->right_match[right_id] == -1) {
196 | 		bim_add_match(bim, left_id, right_id);
197 | 		bim->n_matches++;
198 | 		return true;
199 | 	}
200 |
201 | 	/* Slow path, re-match */
202 | 	for (lv = 0; lv < bim->max_used_left + 1; lv++) {
203 | 		if (bim->left_match[lv] != -1)
204 | 			continue;
205 | 		bim_reset_aug_state(bim);
206 | 		if (bim_try_aug(bim, lv)) {
207 | 			bim->n_matches++;
208 | 			break;
209 | 		}
210 | 	}
211 |
212 | 	return true;
213 | }
214 |
215 | /* == Accessors == */
216 |
217 | unsigned int bim_match_size(struct bim_state *bim)
218 | {
219 | 	return bim->n_matches;
220 | }
221 |
222 | void bim_walk_init(struct bim_edge *match)
223 | {
224 | 	memset(match, 0, sizeof(*match));
225 | }
226 |
227 | bool bim_edge_walk_next(struct bim_state *bim, struct bim_edge *match)
228 | {
229 | 	unsigned int left_id, i;
230 |
231 | 	i = match->_walker << 32 >> 32;	/* low 32 bits: next neighbor index */
232 | 	left_id = match->_walker >> 32;	/* high 32 bits: left vertex */
233 | 	for (; left_id < bim->max_used_left + 1; left_id++) {
234 | 		if (i < bim->n_left_neigh[left_id])
235 |
goto found; 236 | i = 0; 237 | } 238 | 239 | return false; 240 | 241 | found: 242 | match->_walker = ((unsigned long long)left_id << 32) | (i + 1); 243 | match->left_id = left_id; 244 | match->right_id = bim->left_neigh[left_id][i]; 245 | match->is_match = bim->left_match[left_id] == (int)match->right_id; 246 | match->cookie = bim->left_neigh_cookie[left_id][i]; 247 | return true; 248 | } 249 | 250 | bool bim_match_walk_next(struct bim_state *bim, struct bim_edge *match) 251 | { 252 | unsigned int left_id, i; 253 | 254 | for (left_id = match->_walker; 255 | left_id < bim->max_used_left + 1; left_id++) 256 | if (bim->left_match[left_id] != -1) 257 | goto found; 258 | return false; 259 | 260 | found: 261 | match->is_match = true; 262 | match->_walker = left_id + 1; 263 | match->left_id = left_id; 264 | match->right_id = bim->left_match[left_id]; 265 | match->cookie = NULL; 266 | for (i = 0; i < bim->n_left_neigh[left_id]; i++) 267 | if (bim->left_neigh[left_id][i] == match->right_id) { 268 | match->cookie = bim->left_neigh_cookie[left_id][i]; 269 | break; 270 | } 271 | return true; 272 | } 273 | 274 | /* == Test / example == */ 275 | 276 | #ifdef KPERF_UNITS 277 | #include 278 | 279 | static UNNEEDED void bim_dump(struct bim_state *bim) 280 | { 281 | unsigned int i, j; 282 | 283 | printf("============\n"); 284 | printf("max_l %d max_r %d used_l %d used_r %d matches %d\n", 285 | bim->max_left, bim->max_right, 286 | bim->max_used_left, bim->max_used_right, bim->n_matches); 287 | 288 | for (i = 0; i <= bim->max_used_left; i++) 289 | if (bim->left_match[i] != -1) 290 | printf(" %d -> %d\n", i, bim->left_match[i]); 291 | 292 | for (i = 0; i <= bim->max_used_right; i++) 293 | if (bim->right_match[i] != -1) 294 | printf(" %d <- %d\n", i, bim->right_match[i]); 295 | 296 | for (i = 0; i <= bim->max_used_left; i++) { 297 | if (!bim->n_left_neigh[i]) 298 | continue; 299 | 300 | printf(" =%d=", i); 301 | for (j = 0; j < bim->n_left_neigh[i]; j++) 302 | printf(" %d", bim->left_neigh[i][j]); 303 | printf("\n"); 304 | } 305 | } 306 | 307 | int main() 308 | { 309 | static const int edges[][2] = {{1, 2}, {1, 2}, {2, 2}, {2, 3}, 310 | {0, 3}, {2, 0}, {170, 18}}; 311 | struct bim_state *bim; 312 | struct bim_edge m; 313 | unsigned int i; 314 | 315 | bim = bim_init(); 316 | printf("Init match: %d\n", bim_match_size(bim)); 317 | 318 | for (i = 0; i < ARRAY_SIZE(edges); i++) { 319 | bim_add_edge(bim, edges[i][0], edges[i][1], 320 | (void *)(unsigned long)i); 321 | printf("Added edge %d - %d, match: %d\n", 322 | edges[i][0], edges[i][1], bim_match_size(bim)); 323 | } 324 | bim_for_each_match(bim, &m) 325 | printf("Match %d - %d, %p\n", m.left_id, m.right_id, m.cookie); 326 | 327 | bim_destroy(bim); 328 | } 329 | #endif 330 | -------------------------------------------------------------------------------- /proto.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. 
and affiliates */
3 | 
4 | #include 
5 | #include 
6 | #include 
7 | #include 
8 | #include 
9 | #include 
10 | #include 
11 | #include 
12 | 
13 | #include 
14 | 
15 | #include "proto.h"
16 | 
17 | static const unsigned int proto_ver =
18 | 	__KPM_MSG_TOTAL << 24 |
19 | 	sizeof(struct kpm_test) << 16 |
20 | 	sizeof(struct kpm_test_results);
21 | 
22 | void *kpm_msg_dup(struct kpm_header *hdr)
23 | {
24 | 	char *msg;
25 | 
26 | 	msg = malloc(hdr->len);
27 | 	memcpy(msg, hdr, hdr->len);
28 | 	return msg;
29 | }
30 | 
31 | void *kpm_receive(int fd)
32 | {
33 | 	struct kpm_header hdr;
34 | 	ssize_t off, n;
35 | 	char *msg;
36 | 
37 | 	n = recv(fd, &hdr, sizeof(hdr), MSG_PEEK | MSG_WAITALL);
38 | 	if (n < (int)sizeof(hdr)) {
39 | 		if (n)
40 | 			warn("Failed to receive header (%zd)", n);
41 | 		return NULL;
42 | 	}
43 | 	if (hdr.len < sizeof(hdr)) {
44 | 		warnx("Invalid header length (%d)", hdr.len);
45 | 		return NULL;
46 | 	}
47 | 
48 | 	msg = malloc(hdr.len);
49 | 	if (!msg)
50 | 		return NULL;
51 | 
52 | 	off = 0;
53 | 	while (hdr.len) {
54 | 		n = recv(fd, msg + off, hdr.len, 0);
55 | 		if (n > hdr.len) {
56 | 			warnx("Oversized recv");
57 | 		} else if (n <= 0) {
58 | 			warnx("Short recv");
59 | 		} else {
60 | 			off += n;
61 | 			hdr.len -= n;
62 | 			continue;
63 | 		}
64 | 
65 | 		free(msg);
66 | 		return NULL;
67 | 	}
68 | 
69 | 	return msg;
70 | }
71 | 
72 | static int __kpm_send(int fd, struct kpm_header *msg, size_t size, int id,
73 | 		      enum kpm_msg_type type)
74 | {
75 | 	ssize_t n, off;
76 | 
77 | 	msg->type = type;
78 | 	msg->id = id;
79 | 	msg->len = size;
80 | 
81 | 	off = 0;
82 | 	while (size) {
83 | 		n = send(fd, (char *)msg + off, size, 0);
84 | 		if (n <= 0) {
85 | 			warnx("Short send");
86 | 			return -1;
87 | 		}
88 | 		off += n;
89 | 		size -= n;
90 | 	}
91 | 	return id;
92 | }
93 | 
94 | int kpm_send(int fd, struct kpm_header *msg, size_t size,
95 | 	     enum kpm_msg_type type)
96 | {
97 | 	static short int id_gen;
98 | 
99 | 	return __kpm_send(fd, msg, size, ++id_gen, type);
100 | }
101 | 
102 | int kpm_send_empty(int fd, enum kpm_msg_type type)
103 | {
104 | 	struct kpm_header hdr;
105 | 
106 | 	return kpm_send(fd, &hdr, sizeof(hdr), type);
107 | }
108 | 
109 | int kpm_send_u32(int fd, enum kpm_msg_type type, __u32 arg)
110 | {
111 | 	struct __kpm_generic_u32 msg;
112 | 
113 | 	msg.val = arg;
114 | 
115 | 	return kpm_send(fd, &msg.hdr, sizeof(msg), type);
116 | }
117 | 
118 | int kpm_send_conn_id(int fd, __u32 id, __u32 cpu)
119 | {
120 | 	struct kpm_connection_id msg;
121 | 
122 | 	msg.id = id;
123 | 	msg.cpu = cpu;
124 | 
125 | 	return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_CONNECTION_ID);
126 | }
127 | 
128 | int kpm_send_connect(int fd, struct sockaddr_in6 *addr, socklen_t len,
129 | 		     __u32 mss)
130 | {
131 | 	struct kpm_connect msg;
132 | 
133 | 	if (len > sizeof(msg.addr)) {
134 | 		warnx("Oversized connect arg");
135 | 		return -1;
136 | 	}
137 | 
138 | 	msg.len = len;
139 | 	memcpy(&msg.addr, addr, len);
140 | 	msg.mss = mss;
141 | 
142 | 	return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_CONNECT);
143 | }
144 | 
145 | int
146 | kpm_send_tls(int fd, __u32 conn_id, __u32 dir_mask, void *info, socklen_t len)
147 | {
148 | 	struct kpm_tls msg;
149 | 
150 | 	if (len > sizeof(msg.info)) {
151 | 		warnx("Oversized TLS arg");
152 | 		return -1;
153 | 	}
154 | 
155 | 	msg.connection_id = conn_id;
156 | 	msg.dir_mask = dir_mask;
157 | 	msg.len = len;
158 | 	memcpy(&msg.info, info, len);
159 | 
160 | 	return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_TLS);
161 | }
162 | 
163 | int kpm_send_max_pacing(int fd, __u32 id, __u32 pace)
164 | {
165 | 	struct kpm_max_pacing msg;
166 | 
167 | 	msg.id = id;
168 | 	msg.max_pacing =
pace; 169 | 170 | return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_MAX_PACING); 171 | } 172 | 173 | int kpm_send_tcp_cc(int fd, __u32 id, char *cc_name) 174 | { 175 | struct kpm_tcp_cc msg = {}; 176 | 177 | msg.id = id; 178 | strncpy(msg.cc_name, cc_name, sizeof(msg.cc_name) - 1); 179 | 180 | return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_TCP_CC); 181 | } 182 | 183 | int kpm_send_mode(int fd, struct kpm_mode *mode) 184 | { 185 | return kpm_send(fd, &mode->hdr, sizeof(*mode), KPM_MSG_TYPE_MODE); 186 | } 187 | 188 | int kpm_send_pin_worker(int fd, __u32 id, __u32 cpu) 189 | { 190 | struct kpm_pin_worker msg; 191 | 192 | msg.worker_id = id; 193 | msg.cpu = cpu; 194 | 195 | return kpm_send(fd, &msg.hdr, sizeof(msg), KPM_MSG_TYPE_PIN_WORKER); 196 | } 197 | 198 | static int kpm_reply(int fd, struct kpm_header *msg, size_t size, 199 | struct kpm_header *req) 200 | { 201 | return __kpm_send(fd, msg, size, req->id, req->type | KPM_MSG_REPLY); 202 | } 203 | 204 | void kpm_reply_error(int fd, struct kpm_header *hdr, __u16 error) 205 | { 206 | struct kpm_reply_error msg; 207 | 208 | msg.type = hdr->type; 209 | msg.error = error; 210 | 211 | __kpm_send(fd, &msg.hdr, sizeof(msg), hdr->id, KPM_MSG_TYPE_ERROR); 212 | } 213 | 214 | int kpm_reply_empty(int fd, struct kpm_header *hdr) 215 | { 216 | struct kpm_header msg; 217 | 218 | return kpm_reply(fd, &msg, sizeof(msg), hdr); 219 | } 220 | 221 | int kpm_reply_u16(int fd, struct kpm_header *hdr, __u16 arg) 222 | { 223 | struct __kpm_generic_u16 msg; 224 | 225 | msg.val = arg; 226 | memset(&msg.pad, 0, sizeof(msg.pad)); 227 | 228 | return kpm_reply(fd, &msg.hdr, sizeof(msg), hdr); 229 | } 230 | 231 | int kpm_reply_u32(int fd, struct kpm_header *hdr, __u32 arg) 232 | { 233 | struct __kpm_generic_u32 msg; 234 | 235 | msg.val = arg; 236 | 237 | return kpm_reply(fd, &msg.hdr, sizeof(msg), hdr); 238 | } 239 | 240 | int kpm_reply_acceptor(int fd, struct kpm_header *hdr, 241 | struct sockaddr_in6 *addr, socklen_t len) 242 | { 243 | struct kpm_tcp_acceptor_reply msg; 244 | 245 | memcpy(&msg.addr, addr, len); 246 | msg.len = len; 247 | 248 | return kpm_reply(fd, &msg.hdr, sizeof(msg), hdr); 249 | } 250 | 251 | int kpm_reply_connect(int fd, struct kpm_header *hdr, 252 | __u32 local_id, __u32 local_cpu, __u16 local_port, 253 | __u32 remote_id, __u32 remote_cpu, __u16 remote_port) 254 | { 255 | struct kpm_connect_reply msg = {}; 256 | 257 | msg.local.id = local_id; 258 | msg.local.cpu = local_cpu; 259 | msg.local.port = local_port; 260 | msg.remote.id = remote_id; 261 | msg.remote.cpu = remote_cpu; 262 | msg.remote.port = remote_port; 263 | 264 | return kpm_reply(fd, &msg.hdr, sizeof(msg), hdr); 265 | } 266 | 267 | int kpm_xchg_hello(int fd, unsigned int *ncpus) 268 | { 269 | struct kpm_hello hello; 270 | struct kpm_hello *rcv; 271 | 272 | hello.version = proto_ver; 273 | hello.n_cpus = get_nprocs(); 274 | 275 | if (kpm_send(fd, &hello.hdr, sizeof(hello), KPM_MSG_TYPE_HELLO) < 0) { 276 | warnx("Failed to send hello"); 277 | return -1; 278 | } 279 | 280 | rcv = kpm_receive(fd); 281 | if (!rcv) 282 | return -1; 283 | 284 | if (!kpm_good_req(rcv, KPM_MSG_TYPE_HELLO)) { 285 | warnx("Bad hello msg"); 286 | goto err_free; 287 | } 288 | if (rcv->version != proto_ver) { 289 | warnx("Bad PROTO version"); 290 | goto err_free; 291 | } 292 | 293 | if (ncpus) 294 | *ncpus = rcv->n_cpus; 295 | free(rcv); 296 | 297 | return 0; 298 | 299 | err_free: 300 | free(rcv); 301 | return -1; 302 | } 303 | 304 | int kpm_req_tcp_sock(int fd, struct sockaddr_in6 *addr, socklen_t 
*len) 305 | { 306 | struct kpm_tcp_acceptor_reply *repl; 307 | struct kpm_header hdr; 308 | int id; 309 | 310 | id = kpm_send(fd, &hdr, sizeof(hdr), KPM_MSG_TYPE_OPEN_TCP_ACCEPTOR); 311 | if (id < 0) { 312 | warnx("Failed to request TCP sock"); 313 | return id; 314 | } 315 | 316 | repl = kpm_receive(fd); 317 | if (!repl) { 318 | warnx("Failed to request TCP sock - no response"); 319 | return -1; 320 | } 321 | 322 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_OPEN_TCP_ACCEPTOR, id)) { 323 | warnx("Failed to request TCP sock - unexpected reply"); 324 | free(repl); 325 | return -1; 326 | } 327 | 328 | if (*len < repl->len) { 329 | warnx("Failed to request TCP sock - req space small"); 330 | free(repl); 331 | return -1; 332 | } 333 | 334 | memcpy(addr, &repl->addr, repl->len); 335 | *len = repl->len; 336 | free(repl); 337 | return 0; 338 | } 339 | 340 | int kpm_req_end_test(int fd, __u32 test_id) 341 | { 342 | struct kpm_empty *repl; 343 | int id; 344 | 345 | id = kpm_send_u32(fd, KPM_MSG_TYPE_END_TEST, test_id); 346 | if (id < 0) { 347 | warnx("Failed to end test"); 348 | return id; 349 | } 350 | 351 | repl = kpm_receive(fd); 352 | if (!repl) { 353 | warnx("Failed to end test - no response"); 354 | return -1; 355 | } 356 | 357 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_END_TEST, id)) { 358 | warnx("Failed to end test - bad reply"); 359 | free(repl); 360 | return -1; 361 | } 362 | 363 | free(repl); 364 | return 0; 365 | } 366 | 367 | int 368 | kpm_req_tls(int fd, __u32 conn_id, __u32 dir_mask, void *info, socklen_t len) 369 | { 370 | struct kpm_empty *repl; 371 | int id; 372 | 373 | id = kpm_send_tls(fd, conn_id, dir_mask, info, len); 374 | if (id < 0) { 375 | warnx("Failed to start TLS"); 376 | return id; 377 | } 378 | 379 | repl = kpm_receive(fd); 380 | if (!repl) { 381 | warnx("Failed to start TLS - no response"); 382 | return -1; 383 | } 384 | 385 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_TLS, id)) { 386 | warnx("Failed to start TLS - bad reply"); 387 | free(repl); 388 | return -1; 389 | } 390 | 391 | free(repl); 392 | return 0; 393 | } 394 | 395 | int 396 | kpm_req_pacing(int fd, __u32 conn_id, __u32 max_pace) 397 | { 398 | struct kpm_empty *repl; 399 | int id; 400 | 401 | id = kpm_send_max_pacing(fd, conn_id, max_pace); 402 | if (id < 0) { 403 | warnx("Failed to request pacing"); 404 | return id; 405 | } 406 | 407 | repl = kpm_receive(fd); 408 | if (!repl) { 409 | warnx("Failed to request pacing - no response"); 410 | return -1; 411 | } 412 | 413 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_MAX_PACING, id)) { 414 | warnx("Failed to request pacing - bad reply"); 415 | free(repl); 416 | return -1; 417 | } 418 | 419 | free(repl); 420 | return 0; 421 | } 422 | 423 | int 424 | kpm_req_tcp_cc(int fd, __u32 conn_id, char *cc_name) 425 | { 426 | struct kpm_empty *repl; 427 | int id; 428 | 429 | id = kpm_send_tcp_cc(fd, conn_id, cc_name); 430 | if (id < 0) { 431 | warnx("Failed to request TCP cong control"); 432 | return id; 433 | } 434 | 435 | repl = kpm_receive(fd); 436 | if (!repl) { 437 | warnx("Failed to request TCP cong control - no response"); 438 | return -1; 439 | } 440 | 441 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_TCP_CC, id)) { 442 | warnx("Failed to request TCP cong control - bad reply"); 443 | free(repl); 444 | return -1; 445 | } 446 | 447 | free(repl); 448 | return 0; 449 | } 450 | 451 | int 452 | kpm_req_mode(int fd, struct kpm_mode *mode) 453 | { 454 | struct kpm_empty *repl; 455 | int id; 456 | 457 | id = kpm_send_mode(fd, mode); 458 | if (id < 0) { 459 | warnx("Failed to request mode"); 
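		/* id carries the negative error code from kpm_send_mode() */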
460 | return id; 461 | } 462 | 463 | repl = kpm_receive(fd); 464 | if (!repl) { 465 | warnx("Failed to request mode - no response"); 466 | return -1; 467 | } 468 | 469 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_MODE, id)) { 470 | warnx("Failed to request mode - bad reply"); 471 | free(repl); 472 | return -1; 473 | } 474 | 475 | free(repl); 476 | return 0; 477 | } 478 | 479 | int kpm_req_disconnect(int fd, __u32 connection_id) 480 | { 481 | struct kpm_empty *repl; 482 | int id; 483 | 484 | id = kpm_send_u32(fd, KPM_MSG_TYPE_DISCONNECT, connection_id); 485 | if (id < 0) { 486 | warnx("Failed to end connection"); 487 | return id; 488 | } 489 | 490 | repl = kpm_receive(fd); 491 | if (!repl) { 492 | warnx("Failed to end connection - no response"); 493 | return -1; 494 | } 495 | 496 | if (!kpm_good_reply(repl, KPM_MSG_TYPE_DISCONNECT, id)) { 497 | warnx("Failed to end connection - bad reply"); 498 | free(repl); 499 | return -1; 500 | } 501 | 502 | free(repl); 503 | return 0; 504 | } 505 | -------------------------------------------------------------------------------- /worker.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. and affiliates */ 3 | 4 | #define _GNU_SOURCE 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "worker.h" 16 | #include "cpu_stat.h" 17 | #include "tcp.h" 18 | #include "proto_dbg.h" 19 | #include "server.h" 20 | #include "tcp.h" 21 | #include "iou.h" 22 | #include "epoll.h" 23 | 24 | unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1]; 25 | 26 | void 27 | worker_kill_conn(struct worker_state *self, struct worker_connection *conn) 28 | { 29 | self->ops->conn_close(self, conn); 30 | close(conn->fd); 31 | list_del(&conn->connections); 32 | free(conn->rxbuf); 33 | free(conn->rr.log); 34 | free(conn); 35 | } 36 | 37 | static int 38 | worker_pstat_cmp(unsigned int const *a, unsigned int const *b, void *unused) 39 | { 40 | return (long long int)*a - (long long int)*b; 41 | } 42 | 43 | static void 44 | worker_report_pstats(struct worker_state *self, struct worker_connection *conn, 45 | struct kpm_test_result *data) 46 | { 47 | if (conn->spec->arg.rr.timings < 2) 48 | return; 49 | 50 | asort(conn->rr.log, conn->rr.log_len, worker_pstat_cmp, NULL); 51 | data->p25 = conn->rr.log[conn->rr.log_len / 4]; 52 | data->p50 = conn->rr.log[conn->rr.log_len / 2]; 53 | data->p90 = conn->rr.log[(__u64)conn->rr.log_len * 90 / 100]; 54 | data->p99 = conn->rr.log[(__u64)conn->rr.log_len * 99 / 100]; 55 | data->p999 = conn->rr.log[(__u64)conn->rr.log_len * 999 / 1000]; 56 | data->p9999 = conn->rr.log[(__u64)conn->rr.log_len * 9999 / 10000]; 57 | } 58 | 59 | /* == Worker command handling == */ 60 | 61 | static void worker_report_test(struct worker_state *self) 62 | { 63 | struct worker_connection *conn; 64 | struct cpu_stat *cpu, *cpu_pct; 65 | struct kpm_test_results *res; 66 | unsigned int ncpus, i; 67 | struct timerel t; 68 | size_t sz; 69 | 70 | kpm_dbg("Reporting results"); 71 | 72 | sz = sizeof(*res) + sizeof(res->res[0]) * self->test->n_conns; 73 | res = malloc(sz); 74 | memset(res, 0, sz); 75 | 76 | t = timemono_since(self->test_start); 77 | res->time_usec = time_to_usec(t); 78 | res->n_conns = self->test->n_conns; 79 | res->test_id = self->test->test_id; 80 | 81 | ncpus = get_nprocs(); 82 | cpu = cpu_stat_snapshot(ncpus); 83 | cpu_stat_sub(cpu, self->cpu_start, ncpus); 84 | cpu_pct = 
cpu_stat_to_pct00(cpu, ncpus); 85 | free(cpu); 86 | for (i = 0; i < ncpus; i++) { 87 | res->cpu_load[i].id = cpu_pct[i].cpu_id; 88 | res->cpu_load[i].user = cpu_pct[i].user; 89 | res->cpu_load[i].system = cpu_pct[i].system; 90 | res->cpu_load[i].idle = cpu_pct[i].idle; 91 | res->cpu_load[i].iowait = cpu_pct[i].iowait; 92 | res->cpu_load[i].irq = cpu_pct[i].irq; 93 | res->cpu_load[i].sirq = cpu_pct[i].sirq; 94 | } 95 | free(cpu_pct); 96 | 97 | i = 0; 98 | list_for_each(&self->connections, conn, connections) { 99 | struct kpm_test_result *data; 100 | struct tcp_info info; 101 | socklen_t info_len; 102 | 103 | do { 104 | if (i == res->n_conns) { 105 | warnx("Missing connections!"); 106 | goto skip_results; 107 | } 108 | data = &res->res[i]; 109 | data->worker_id = self->id; 110 | data->connection_id = self->test->specs[i].connection_id; 111 | i++; 112 | /* Expect the connections to be in order */ 113 | } while (conn->id != data->connection_id); 114 | 115 | data->type = conn->spec->type; 116 | 117 | info_len = sizeof(conn->init_info); 118 | if (getsockopt(conn->fd, IPPROTO_TCP, TCP_INFO, 119 | (void *)&info, &info_len) < 0) { 120 | warn("Can't get TCP info"); 121 | goto skip_results; 122 | } 123 | 124 | data->rx_bytes = conn->tot_recv; 125 | data->tx_bytes = conn->tot_sent; 126 | 127 | if (conn->spec->type == KPM_TEST_TYPE_RR) 128 | data->reqs = conn->rr.reqs; 129 | 130 | data->retrans = info.tcpi_total_retrans - 131 | conn->init_info.tcpi_total_retrans; 132 | data->reord_seen = info.tcpi_reord_seen - 133 | conn->init_info.tcpi_reord_seen; 134 | data->rtt = info.tcpi_rtt; 135 | data->rttvar = info.tcpi_rttvar; 136 | data->delivered_ce = info.tcpi_delivered_ce - 137 | conn->init_info.tcpi_delivered_ce; 138 | data->snd_wnd = info.tcpi_snd_wnd; 139 | data->snd_cwnd = info.tcpi_snd_cwnd; 140 | 141 | if (verbose > 2) 142 | print_tcp_info(&info); 143 | 144 | memcpy(data->lat_hist, conn->rr.hist, sizeof(data->lat_hist)); 145 | worker_report_pstats(self, conn, data); 146 | 147 | /* Shut down sending to let the connection drain */ 148 | conn->to_send = 0; 149 | } 150 | skip_results: 151 | 152 | free(self->test); 153 | self->test = NULL; 154 | 155 | kpm_send(self->main_sock, &res->hdr, sz, KPM_MSG_WORKER_TEST_RESULT); 156 | free(res); 157 | } 158 | 159 | #define KPM_HNDL(type, name) \ 160 | { KPM_MSG_WORKER_ ## type, \ 161 | worker_msg_ ## name, \ 162 | sizeof(struct kpm_##name), \ 163 | stringify(name) } 164 | 165 | #define KPM_HNDL_GEN(type, name, gtype) \ 166 | { KPM_MSG_WORKER_ ## type, \ 167 | worker_msg_ ## name, \ 168 | sizeof(struct __kpm_generic_##gtype), \ 169 | stringify(name) } 170 | 171 | static void 172 | worker_msg_id(struct worker_state *self, struct kpm_header *hdr) 173 | { 174 | struct __kpm_generic_u32 *id = (void *)hdr; 175 | 176 | self->id = id->val; 177 | } 178 | 179 | static void 180 | worker_msg_test(struct worker_state *self, struct kpm_header *hdr) 181 | { 182 | struct kpm_test *req = (void *)hdr; 183 | unsigned int i; 184 | 185 | if (self->test) { 186 | warn("Already running a test"); 187 | self->quit = 1; 188 | return; 189 | } 190 | 191 | kpm_dbg("start test %s", req->active ? 
"act" : "psv"); 192 | 193 | self->test = malloc(hdr->len); 194 | memcpy(self->test, req, hdr->len); 195 | 196 | for (i = 0; i < req->n_conns; i++) { 197 | struct worker_connection *conn; 198 | socklen_t info_len; 199 | __u64 len; 200 | 201 | conn = malloc(sizeof(*conn)); 202 | memset(conn, 0, sizeof(*conn)); 203 | conn->spec = &self->test->specs[i]; 204 | conn->id = req->specs[i].connection_id; 205 | conn->fd = fdpass_recv(self->main_sock); 206 | 207 | info_len = sizeof(conn->init_info); 208 | if (getsockopt(conn->fd, IPPROTO_TCP, TCP_INFO, 209 | (void *)&conn->init_info, &info_len) < 0) { 210 | warn("Can't get TCP info"); 211 | self->quit = 1; 212 | } 213 | 214 | if (conn->spec->arg.rr.timings > 1) { 215 | /* Assume we can't do a round trip < 1us on avg */ 216 | conn->rr.log_len_max = 217 | self->test->time_sec * 1000 * 1000; 218 | conn->rr.log = calloc(conn->rr.log_len_max, 219 | sizeof(conn->rr.log[0])); 220 | } 221 | 222 | list_add(&self->connections, &conn->connections); 223 | 224 | conn->read_size = conn->spec->read_size; 225 | conn->write_size = conn->spec->write_size; 226 | 227 | conn->rxbuf = malloc(conn->read_size); 228 | if (!conn->rxbuf) { 229 | warnx("No memory"); 230 | self->quit = 1; 231 | return; 232 | } 233 | 234 | if (!conn->read_size || conn->read_size > KPM_MAX_OP_CHUNK || 235 | !conn->write_size || conn->write_size > KPM_MAX_OP_CHUNK) { 236 | warnx("wrong size io op read:%u write:%u", 237 | conn->read_size, conn->write_size); 238 | self->quit = 1; 239 | return; 240 | } 241 | 242 | switch (conn->spec->type) { 243 | case KPM_TEST_TYPE_STREAM: 244 | len = ~0ULL; 245 | break; 246 | case KPM_TEST_TYPE_RR: 247 | len = conn->spec->arg.rr.req_size; 248 | break; 249 | default: 250 | warnx("Unknown test type"); 251 | return; 252 | } 253 | 254 | if (req->active) 255 | conn->to_send = len; 256 | else 257 | conn->to_recv = len; 258 | 259 | self->ops->conn_add(self, conn); 260 | } 261 | 262 | self->cpu_start = cpu_stat_snapshot(0); 263 | self->test_start = time_mono(); 264 | memset(&self->prev_loop, 0, sizeof(self->prev_loop)); 265 | if (self->test->active) 266 | self->test_len_msec = req->time_sec * 1000; 267 | } 268 | 269 | static void 270 | worker_msg_end_test(struct worker_state *self, struct kpm_header *hdr) 271 | { 272 | struct worker_connection *conn, *next; 273 | 274 | if (self->test) 275 | worker_report_test(self); 276 | 277 | free(self->cpu_start); 278 | self->cpu_start = NULL; 279 | list_for_each_safe(&self->connections, conn, next, connections) 280 | worker_kill_conn(self, conn); 281 | self->ended = 1; 282 | } 283 | 284 | static const struct { 285 | enum kpm_msg_type type; 286 | void (*cb)(struct worker_state *self, struct kpm_header *hdr); 287 | size_t req_size; 288 | const char *name; 289 | } msg_handlers[] = { 290 | KPM_HNDL_GEN(ID, id, u32), 291 | KPM_HNDL(TEST, test), 292 | KPM_HNDL(END_TEST, end_test), 293 | }; 294 | 295 | void worker_handle_proto(struct worker_state *self, struct kpm_header *hdr) 296 | { 297 | int i; 298 | 299 | kpm_cmd_dbg_start(hdr); 300 | 301 | for (i = 0; i < (int)ARRAY_SIZE(msg_handlers); i++) { 302 | if (msg_handlers[i].type != hdr->type) 303 | continue; 304 | 305 | if (hdr->len < msg_handlers[i].req_size) { 306 | warn("Invalid request for %s", msg_handlers[i].name); 307 | self->quit = 1; 308 | break; 309 | } 310 | 311 | msg_handlers[i].cb(self, hdr); 312 | break; 313 | } 314 | if (i == (int)ARRAY_SIZE(msg_handlers)) { 315 | warnx("Unknown message type: %d", hdr->type); 316 | self->quit = 1; 317 | } 318 | 319 | kpm_cmd_dbg_end(hdr); 320 | } 
321 | 
322 | /* == Worker I/O handling == */
323 | 
324 | static void
325 | worker_record_rr_time(struct worker_state *self, struct worker_connection *conn)
326 | {
327 | 	struct timerel delta;
328 | 	unsigned int nsec128;
329 | 	struct timemono now;
330 | 	int hist_idx;
331 | 
332 | 	if (!conn->spec->arg.rr.timings)
333 | 		return;
334 | 
335 | 	now = time_mono();
336 | 	if (!self->prev_loop.ts.tv_sec)
337 | 		goto out_update;
338 | 
339 | 	delta = timemono_between(now, self->prev_loop);
340 | 	nsec128 = delta.ts.tv_nsec / 128;
341 | 	if (delta.ts.tv_sec)
342 | 		nsec128 = ~0U;
343 | 
344 | 	if (conn->spec->arg.rr.timings > 1 &&
345 | 	    conn->rr.log_len < conn->rr.log_len_max)
346 | 		conn->rr.log[conn->rr.log_len++] = nsec128;
347 | 
348 | 	hist_idx = 0;
349 | 	while (nsec128) {
350 | 		nsec128 >>= 1;
351 | 		hist_idx++;
352 | 	}
353 | 	conn->rr.hist[hist_idx]++;
354 | 
355 | out_update:
356 | 	self->prev_loop = now;
357 | }
358 | 
359 | void
360 | worker_send_finished(struct worker_state *self, struct worker_connection *conn)
361 | {
362 | 	worker_record_rr_time(self, conn);
363 | 
364 | 	if (conn->spec->type != KPM_TEST_TYPE_RR)
365 | 		warnx("Done sending for non-RR test");
366 | 	else
367 | 		conn->rr.reqs++;
368 | 
369 | 	if (self->test->active)
370 | 		conn->to_recv = conn->spec->arg.rr.resp_size;
371 | 	else
372 | 		conn->to_recv = conn->spec->arg.rr.req_size;
373 | }
374 | 
375 | void
376 | worker_recv_finished(struct worker_state *self, struct worker_connection *conn)
377 | {
378 | 	if (!self->test)
379 | 		return;
380 | 
381 | 	if (conn->spec->type != KPM_TEST_TYPE_RR)
382 | 		warnx("Done receiving for non-RR test");
383 | 
384 | 	if (self->test->active)
385 | 		conn->to_send = conn->spec->arg.rr.req_size;
386 | 	else
387 | 		conn->to_send = conn->spec->arg.rr.resp_size;
388 | }
389 | 
390 | /* == Main loop == */
391 | void *worker_main(void *args)
392 | {
393 | 	struct worker_opts *opts = args;
394 | 	struct worker_state self = {
395 | 		.main_sock = opts->fd,
396 | 		.opts = *opts,
397 | 	};
398 | 
399 | 
400 | 	free(opts);
401 | 	if (self.opts.use_iou)
402 | 		worker_iou_init(&self);
403 | 	else
404 | 		worker_epoll_init(&self);
405 | 	list_head_init(&self.connections);
406 | 
407 | 	self.ops->prep(&self);
408 | 
409 | 	while (!self.quit) {
410 | 		int msec = -1;
411 | 
412 | 		/* If we initiated the test, check whether it is time to end it */
413 | 		if (self.test && self.test->active) {
414 | 			struct timerel t;
415 | 
416 | 			t = timemono_since(self.test_start);
417 | 			msec = self.test_len_msec - time_to_msec(t);
418 | 			if (msec < 0)
419 | 				worker_report_test(&self);
420 | 		}
421 | 
422 | 		self.ops->wait(&self, msec);
423 | 	}
424 | 
425 | 	self.ops->exit(&self);
426 | 	close(self.main_sock);
427 | 	kpm_dbg("exiting!");
428 | 	return NULL;
429 | }
430 | 
--------------------------------------------------------------------------------
/epoll.c:
--------------------------------------------------------------------------------
1 | // SPDX-License-Identifier: BSD-3-Clause
2 | /* Copyright Meta Platforms, Inc. and affiliates */
3 | 
4 | #include "epoll.h"
5 | 
6 | #include 
7 | #include 
8 | #include 
9 | #include 
10 | #include 
11 | 
12 | #include 
13 | #include 
14 | #include 
15 | 
16 | #include "worker.h"
17 | #include "devmem.h"
18 | #include "proto_dbg.h"
19 | 
20 | extern unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1];
21 | 
22 | #define ALIGN_UP(v, align) (((v) + (align) - 1) & ~((align) - 1))
23 | #define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
24 | 
25 | /* Each thread should reserve a big enough vma to avoid
26 |  * spinlock collisions in ptl locks.
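 * (A split page-table lock covers a PMD-sized, i.e. huge-page-sized, range
 * of the address space, so huge-page-aligned per-thread mappings do not
 * share a lock.)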
27 | * This size is 2MB on x86_64, and is exported in /proc/meminfo. 28 | */ 29 | static unsigned long default_huge_page_size(void) 30 | { 31 | FILE *f = fopen("/proc/meminfo", "r"); 32 | unsigned long hps = 0; 33 | size_t linelen = 0; 34 | char *line = NULL; 35 | 36 | if (!f) { 37 | warnx("Failed to detect default huge page size; using 2 MB as fallback"); 38 | return 2 * 1024 * 1024; 39 | } 40 | while (getline(&line, &linelen, f) > 0) { 41 | if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { 42 | hps <<= 10; 43 | break; 44 | } 45 | } 46 | free(line); 47 | fclose(f); 48 | return hps; 49 | } 50 | 51 | static struct worker_connection * 52 | ep_find_connection_by_fd(struct worker_state *self, int fd) 53 | { 54 | struct worker_connection *conn; 55 | 56 | list_for_each(&self->connections, conn, connections) { 57 | if (conn->fd == fd) 58 | return conn; 59 | } 60 | return NULL; 61 | } 62 | 63 | static void 64 | ep_conn_close(struct worker_state *self, struct worker_connection *conn) 65 | { 66 | struct epoll_event ev = {}; 67 | 68 | ev.data.fd = conn->fd; 69 | if (epoll_ctl(self->epollfd, EPOLL_CTL_DEL, conn->fd, &ev) < 0) 70 | warn("Failed to del poll out"); 71 | if (self->opts.rx_mode == KPM_RX_MODE_DEVMEM) 72 | (void)devmem_release_tokens(conn->fd, &conn->devmem); 73 | else if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 74 | munmap(conn->raddr, conn->rsize); 75 | } 76 | 77 | static void 78 | ep_conn_add(struct worker_state *self, struct worker_connection *conn) 79 | { 80 | struct epoll_event ev = {}; 81 | int zc; 82 | 83 | zc = self->opts.tx_mode == KPM_TX_MODE_SOCKET_ZEROCOPY || self->opts.tx_mode == KPM_TX_MODE_DEVMEM; 84 | if (setsockopt(conn->fd, SOL_SOCKET, SO_ZEROCOPY, &zc, sizeof(zc))) { 85 | warnx("Failed to set SO_ZEROCOPY"); 86 | self->quit = 1; 87 | return; 88 | } 89 | 90 | if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) { 91 | size_t map_align; 92 | 93 | map_align = default_huge_page_size(); 94 | conn->raddr = mmap(NULL, 95 | conn->read_size + map_align, 96 | PROT_READ, 97 | MAP_SHARED, 98 | conn->fd, 99 | 0); 100 | if (conn->raddr == MAP_FAILED) { 101 | warnx("Failed to mmap TCP_ZEROCOPY_RECEIVE"); 102 | self->quit = 1; 103 | return; 104 | } 105 | conn->addr = ALIGN_PTR_UP(conn->raddr, map_align); 106 | conn->rsize = conn->read_size + map_align; 107 | } 108 | 109 | ev.events = EPOLLIN | EPOLLOUT; 110 | ev.data.fd = conn->fd; 111 | if (epoll_ctl(self->epollfd, EPOLL_CTL_ADD, conn->fd, &ev) < 0) 112 | warn("Failed to modify poll out"); 113 | } 114 | 115 | static void ep_handle_main_sock(struct worker_state *self) 116 | { 117 | struct kpm_header *hdr; 118 | 119 | hdr = kpm_receive(self->main_sock); 120 | if (!hdr) { 121 | __kpm_dbg("<<", "ctrl recv failed"); 122 | self->quit = 1; 123 | return; 124 | } 125 | 126 | worker_handle_proto(self, hdr); 127 | 128 | free(hdr); 129 | } 130 | 131 | static void 132 | ep_send_arm(struct worker_state *self, struct worker_connection *conn, 133 | unsigned int events) 134 | { 135 | struct epoll_event ev = {}; 136 | 137 | if (events & EPOLLOUT) 138 | return; 139 | 140 | ev.events = EPOLLIN | EPOLLOUT; 141 | ev.data.fd = conn->fd; 142 | if (epoll_ctl(self->epollfd, EPOLL_CTL_MOD, conn->fd, &ev) < 0) 143 | warn("Failed to modify poll out"); 144 | } 145 | 146 | static void 147 | ep_send_disarm(struct worker_state *self, struct worker_connection *conn, 148 | unsigned int events) 149 | { 150 | struct epoll_event ev = {}; 151 | 152 | if (!(events & EPOLLOUT)) 153 | return; 154 | 155 | ev.events = EPOLLIN; 156 | ev.data.fd = conn->fd; 157 | 
if (epoll_ctl(self->epollfd, EPOLL_CTL_MOD, conn->fd, &ev) < 0)
158 | 		warn("Failed to modify poll out");
159 | }
160 | 
161 | static void
162 | ep_handle_completions(struct worker_state *self, struct worker_connection *conn,
163 | 		      unsigned int events)
164 | {
165 | 	struct sock_extended_err *serr;
166 | 	struct msghdr msg = {};
167 | 	char control[64] = {};
168 | 	struct cmsghdr *cm;
169 | 	int ret, n;
170 | 
171 | 	msg.msg_control = control;
172 | 	msg.msg_controllen = sizeof(control);
173 | 
174 | 	ret = recvmsg(conn->fd, &msg, MSG_ERRQUEUE);
175 | 	if (ret < 0) {
176 | 		if (errno == EAGAIN)
177 | 			return;
178 | 		warn("failed to clean completions");
179 | 		goto kill_conn;
180 | 	}
181 | 
182 | 	if (msg.msg_flags & MSG_CTRUNC) {
183 | 		warnx("failed to clean completions: truncated cmsg");
184 | 		goto kill_conn;
185 | 	}
186 | 
187 | 	cm = CMSG_FIRSTHDR(&msg);
188 | 	if (!cm) {
189 | 		warnx("failed to clean completions: no cmsg");
190 | 		goto kill_conn;
191 | 	}
192 | 
193 | 	if (cm->cmsg_level != SOL_IP && cm->cmsg_level != SOL_IPV6) {
194 | 		warnx("failed to clean completions: wrong level %d",
195 | 		      cm->cmsg_level);
196 | 		goto kill_conn;
197 | 	}
198 | 
199 | 	if (cm->cmsg_type != IP_RECVERR && cm->cmsg_type != IPV6_RECVERR) {
200 | 		warnx("failed to clean completions: wrong type %d",
201 | 		      cm->cmsg_type);
202 | 		goto kill_conn;
203 | 	}
204 | 
205 | 	serr = (void *)CMSG_DATA(cm);
206 | 	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
207 | 		warnx("failed to clean completions: wrong origin %d",
208 | 		      serr->ee_origin);
209 | 		goto kill_conn;
210 | 	}
211 | 	if (serr->ee_errno) {
212 | 		warnx("failed to clean completions: error %d",
213 | 		      serr->ee_errno);
214 | 		goto kill_conn;
215 | 	}
216 | 	n = serr->ee_data - serr->ee_info + 1;
217 | 	conn->to_send_comp -= n;
218 | 	kpm_dbg("send complete (%d..%d) %d\n",
219 | 		serr->ee_info, serr->ee_data, conn->to_send_comp);
220 | 
221 | 	return;
222 | 
223 | kill_conn:
224 | 	worker_kill_conn(self, conn);
225 | }
226 | 
227 | static void
228 | ep_handle_send(struct worker_state *self, struct worker_connection *conn,
229 | 	       unsigned int events)
230 | {
231 | 	unsigned int rep = max_t(int, 10, conn->to_send / conn->write_size + 1);
232 | 	bool msg_zerocopy = self->opts.tx_mode == KPM_TX_MODE_SOCKET_ZEROCOPY || self->opts.tx_mode == KPM_TX_MODE_DEVMEM;
233 | 	int flags = msg_zerocopy ?
MSG_ZEROCOPY : 0; 234 | 235 | while (rep--) { 236 | size_t chunk; 237 | void *src; 238 | ssize_t n; 239 | 240 | chunk = min_t(size_t, conn->write_size, conn->to_send); 241 | 242 | if (self->opts.tx_mode == KPM_TX_MODE_DEVMEM) { 243 | n = devmem_sendmsg(conn->fd, self->opts.devmem.dmabuf_id, 244 | conn->tot_sent % PATTERN_PERIOD, chunk); 245 | } else { 246 | src = &patbuf[conn->tot_sent % PATTERN_PERIOD]; 247 | n = send(conn->fd, src, chunk, MSG_DONTWAIT | flags); 248 | } 249 | if (n == 0) { 250 | warnx("zero send chunk:%zd to_send:%lld to_recv:%lld", 251 | chunk, conn->to_send, conn->to_recv); 252 | worker_kill_conn(self, conn); 253 | return; 254 | } 255 | if (n < 0) { 256 | if (errno == EAGAIN || errno == EWOULDBLOCK) { 257 | kpm_dbg("send full (0 sent)"); 258 | ep_send_arm(self, conn, events); 259 | return; 260 | } 261 | warn("Send failed"); 262 | worker_kill_conn(self, conn); 263 | return; 264 | } 265 | 266 | conn->to_send -= n; 267 | conn->tot_sent += n; 268 | if (msg_zerocopy) { 269 | conn->to_send_comp += 1; 270 | kpm_dbg("queued send completion, total %d", 271 | conn->to_send_comp); 272 | } 273 | 274 | if (!conn->to_send && !conn->to_send_comp) { 275 | ep_send_disarm(self, conn, events); 276 | worker_send_finished(self, conn); 277 | break; 278 | } 279 | 280 | if (n != (ssize_t)chunk) { 281 | kpm_dbg("send full (partial)"); 282 | ep_send_arm(self, conn, events); 283 | return; 284 | } 285 | } 286 | } 287 | 288 | static ssize_t 289 | ep_handle_zerocopy_recv(struct worker_state *self, struct worker_connection *conn, 290 | size_t chunk, int rep) 291 | { 292 | void *src = &patbuf[conn->tot_recv % PATTERN_PERIOD]; 293 | struct tcp_zerocopy_receive zc; 294 | socklen_t len = sizeof(zc); 295 | ssize_t n = 0; 296 | int res; 297 | 298 | memset(&zc, 0, len); 299 | zc.address = (__u64)((unsigned long)conn->addr); 300 | zc.length = chunk; 301 | zc.copybuf_address = (__u64)((unsigned long)conn->rxbuf); 302 | zc.copybuf_len = chunk; 303 | res = getsockopt(conn->fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, 304 | &zc, &len); 305 | if (res < 0) 306 | return res; 307 | if (zc.err) 308 | return zc.err; 309 | 310 | if (zc.length) { 311 | if (self->opts.validate && memcmp(conn->addr, src, zc.length)) 312 | warnx("Data corruption %d %d %u %lld %lld %d", 313 | *(char *)conn->addr, *(char *)src, zc.length, 314 | conn->tot_recv % PATTERN_PERIOD, 315 | conn->tot_recv, rep); 316 | madvise(conn->addr, zc.length, MADV_DONTNEED); 317 | src = &patbuf[(conn->tot_recv + zc.length) % PATTERN_PERIOD]; 318 | n += zc.length; 319 | } 320 | 321 | if (zc.copybuf_len) { 322 | if (self->opts.validate && memcmp(conn->rxbuf, src, zc.copybuf_len)) 323 | warnx("Data corruption %d %d %d %lld %lld %d", 324 | *conn->rxbuf, *(char *)src, zc.copybuf_len, 325 | (conn->tot_recv + n) % PATTERN_PERIOD, 326 | (conn->tot_recv + n), rep); 327 | n += zc.copybuf_len; 328 | } 329 | 330 | /* Sometimes getsockopt returns 0 for both length and copybuf_len, try 331 | * again */ 332 | return n == 0 ? -EAGAIN : n; 333 | } 334 | 335 | static ssize_t 336 | ep_handle_regular_recv(struct worker_state *self, struct worker_connection *conn, 337 | size_t chunk, int rep) 338 | { 339 | bool msg_trunc = self->opts.rx_mode == KPM_RX_MODE_SOCKET_TRUNC; 340 | void *src = &patbuf[conn->tot_recv % PATTERN_PERIOD]; 341 | int flags = msg_trunc ? 
MSG_TRUNC : 0; 342 | ssize_t n; 343 | 344 | n = recv(conn->fd, conn->rxbuf, chunk, MSG_DONTWAIT | flags); 345 | 346 | if (n <= 0 || msg_trunc) 347 | return n; 348 | 349 | if (self->opts.validate && memcmp(conn->rxbuf, src, n)) 350 | warnx("Data corruption %d %d %ld %lld %lld %d", 351 | *conn->rxbuf, *(char *)src, n, 352 | conn->tot_recv % PATTERN_PERIOD, 353 | conn->tot_recv, rep); 354 | 355 | return n; 356 | } 357 | 358 | static void 359 | ep_handle_recv(struct worker_state *self, struct worker_connection *conn) 360 | { 361 | unsigned int rep = 10; 362 | 363 | while (rep--) { 364 | size_t chunk; 365 | ssize_t n; 366 | 367 | chunk = min_t(size_t, conn->read_size, conn->to_recv); 368 | if (self->opts.rx_mode == KPM_RX_MODE_DEVMEM) 369 | n = devmem_recv(conn->fd, &conn->devmem, 370 | conn->rxbuf, chunk, self->opts.devmem.mem, 371 | rep, conn->tot_recv, self->opts.validate); 372 | else if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 373 | n = ep_handle_zerocopy_recv(self, conn, chunk, rep); 374 | else 375 | n = ep_handle_regular_recv(self, conn, chunk, rep); 376 | if (n == 0) { 377 | warnx("zero recv"); 378 | worker_kill_conn(self, conn); 379 | break; 380 | } 381 | if (n < 0) { 382 | if (errno == EAGAIN || errno == EWOULDBLOCK) 383 | break; 384 | if (n == -EAGAIN) 385 | break; 386 | warn("Recv failed"); 387 | worker_kill_conn(self, conn); 388 | break; 389 | } 390 | 391 | conn->to_recv -= n; 392 | conn->tot_recv += n; 393 | 394 | if (!conn->to_recv) { 395 | worker_recv_finished(self, conn); 396 | if (conn->to_send) { 397 | ep_handle_send(self, conn, 0); 398 | break; 399 | } 400 | } 401 | 402 | if (n != conn->read_size) 403 | break; 404 | } 405 | 406 | } 407 | 408 | static void 409 | ep_handle_conn(struct worker_state *self, int fd, unsigned int events) 410 | { 411 | static int warnd_unexpected_pi; 412 | struct worker_connection *conn; 413 | 414 | conn = ep_find_connection_by_fd(self, fd); 415 | 416 | if (events & EPOLLOUT) { 417 | if (conn->to_send) 418 | ep_handle_send(self, conn, events); 419 | else if (!conn->to_send_comp) 420 | ep_send_disarm(self, conn, events); 421 | } 422 | if (events & EPOLLIN) { 423 | if (conn->to_recv) { 424 | ep_handle_recv(self, conn); 425 | } else if (!warnd_unexpected_pi) { 426 | warnx("Unexpected POLLIN %x", events); 427 | warnd_unexpected_pi = 1; 428 | } 429 | } 430 | if (events & EPOLLERR) 431 | ep_handle_completions(self, conn, events); 432 | 433 | if (!(events & (EPOLLOUT | EPOLLIN | EPOLLERR))) 434 | warnx("Connection has nothing to do %x", events); 435 | } 436 | 437 | static void ep_prep(struct worker_state *self) 438 | { 439 | int fd = self->main_sock; 440 | struct epoll_event ev; 441 | 442 | self->epollfd = epoll_create1(0); 443 | if (self->epollfd < 0) 444 | err(5, "Failed to create epoll"); 445 | 446 | ev.events = EPOLLIN; 447 | ev.data.fd = fd; 448 | if (epoll_ctl(self->epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) 449 | err(6, "Failed to init epoll"); 450 | } 451 | 452 | static void ep_wait(struct worker_state *self, int msec) 453 | { 454 | struct epoll_event events[32]; 455 | int i, nfds; 456 | 457 | nfds = epoll_wait(self->epollfd, events, ARRAY_SIZE(events), 458 | msec); 459 | if (nfds < 0) 460 | err(7, "Failed to epoll"); 461 | 462 | for (i = 0; i < nfds; i++) { 463 | struct epoll_event *e = &events[i]; 464 | 465 | if (e->data.fd == self->main_sock) 466 | ep_handle_main_sock(self); 467 | else 468 | ep_handle_conn(self, e->data.fd, 469 | e->events); 470 | } 471 | } 472 | 473 | static void ep_exit(struct worker_state *self) 474 | { 475 | } 476 | 477 
| static const struct io_ops epoll_io_ops = { 478 | .prep = ep_prep, 479 | .wait = ep_wait, 480 | .conn_add = ep_conn_add, 481 | .conn_close = ep_conn_close, 482 | .exit = ep_exit, 483 | }; 484 | 485 | void worker_epoll_init(struct worker_state *self) 486 | { 487 | self->ops = &epoll_io_ops; 488 | } 489 | -------------------------------------------------------------------------------- /iou.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. and affiliates */ 3 | 4 | #include "iou.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | #include "proto.h" 17 | #include "proto_dbg.h" 18 | #include "devmem.h" 19 | #include "worker.h" 20 | 21 | extern unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1]; 22 | 23 | #define ALIGN_UP(v, align) (((v) + (align) - 1) & ~((align) - 1)) 24 | 25 | static long page_size; 26 | 27 | struct iou_state { 28 | struct io_uring ring; 29 | void *area_ptr; 30 | size_t area_size; 31 | __u64 area_token; 32 | void *rq_ptr; 33 | struct io_uring_zcrx_rq rq; 34 | size_t rq_size; 35 | unsigned rq_mask; 36 | __u32 zcrx_id; 37 | }; 38 | 39 | struct iou_kpm_msg_state { 40 | struct kpm_header hdr; 41 | void *msg; 42 | ssize_t off; 43 | }; 44 | 45 | enum iou_req_type { 46 | IOU_REQ_TYPE_PROTO_HDR = 1, 47 | IOU_REQ_TYPE_PROTO_PLD = 2, 48 | IOU_REQ_TYPE_SEND = 3, 49 | IOU_REQ_TYPE_RECV = 4, 50 | IOU_REQ_TYPE_RECVZC = 5, 51 | IOU_REQ_TYPE_CANCEL = 6, 52 | IOU_REQ_TYPE_SENDZC = 7, 53 | }; 54 | 55 | static void * 56 | tag(void *ptr, enum iou_req_type x) 57 | { 58 | x &= 0xf; 59 | return (void *)(((uintptr_t)ptr) | x); 60 | } 61 | 62 | static void * 63 | untag(uintptr_t ptr) 64 | { 65 | return (void *)(ptr & ~0xf); 66 | } 67 | 68 | static enum iou_req_type 69 | get_tag(uintptr_t ptr) 70 | { 71 | return (int)(ptr & 0xf); 72 | } 73 | 74 | static struct iou_state *get_iou_state(struct worker_state *state) 75 | { 76 | return state->io_state; 77 | } 78 | 79 | static struct io_uring *get_ring(struct worker_state *state) 80 | { 81 | return &get_iou_state(state)->ring; 82 | } 83 | 84 | static void iou_conn_add_send(struct io_uring *ring, struct worker_connection *conn) 85 | { 86 | struct io_uring_sqe *sqe; 87 | size_t chunk; 88 | void *src; 89 | 90 | chunk = min_t(size_t, conn->write_size, conn->to_send); 91 | src = &patbuf[conn->tot_sent % PATTERN_PERIOD]; 92 | 93 | sqe = io_uring_get_sqe(ring); 94 | io_uring_prep_send(sqe, conn->fd, src, chunk, 0); 95 | io_uring_sqe_set_data(sqe, tag(conn, IOU_REQ_TYPE_SEND)); 96 | } 97 | 98 | static void iou_conn_add_sendzc(struct io_uring *ring, struct worker_connection *conn) 99 | { 100 | struct io_uring_sqe *sqe; 101 | size_t chunk; 102 | void *src; 103 | 104 | chunk = min_t(size_t, conn->write_size, conn->to_send); 105 | src = &patbuf[conn->tot_sent % PATTERN_PERIOD]; 106 | 107 | sqe = io_uring_get_sqe(ring); 108 | io_uring_prep_send_zc_fixed(sqe, conn->fd, src, chunk, 0, 0, 0); 109 | io_uring_sqe_set_data(sqe, tag(conn, IOU_REQ_TYPE_SENDZC)); 110 | } 111 | 112 | static void iou_handle_send(struct worker_state *self, struct io_uring_cqe *cqe) 113 | { 114 | struct worker_connection *conn; 115 | ssize_t n; 116 | 117 | if (self->ended) 118 | return; 119 | 120 | conn = untag(cqe->user_data); 121 | n = cqe->res; 122 | if (n <= 0) { 123 | warnx("Send failed"); 124 | worker_kill_conn(self, conn); 125 | return; 126 | } 127 | 128 | conn->to_send -= n; 129 | 
conn->tot_sent += n; 130 | 131 | if (!conn->to_send) 132 | worker_send_finished(self, conn); 133 | else 134 | iou_conn_add_send(get_ring(self), conn); 135 | } 136 | 137 | static void iou_handle_sendzc(struct worker_state *self, struct io_uring_cqe *cqe) 138 | { 139 | struct worker_connection *conn; 140 | ssize_t n; 141 | 142 | if (self->ended) 143 | return; 144 | 145 | conn = untag(cqe->user_data); 146 | if (cqe->flags & IORING_CQE_F_NOTIF) { 147 | if (cqe->flags & IORING_CQE_F_MORE) { 148 | warnx("Notification completion has F_MORE set"); 149 | worker_kill_conn(self, conn); 150 | } 151 | return; 152 | } 153 | 154 | n = cqe->res; 155 | if (n <= 0) { 156 | warnx("Send failed"); 157 | worker_kill_conn(self, conn); 158 | return; 159 | } 160 | 161 | conn->to_send -= n; 162 | conn->tot_sent += n; 163 | 164 | if (!conn->to_send) 165 | worker_send_finished(self, conn); 166 | else 167 | iou_conn_add_sendzc(get_ring(self), conn); 168 | } 169 | 170 | static void iou_conn_add_recv(struct io_uring *ring, struct worker_connection *conn) 171 | { 172 | struct io_uring_sqe *sqe; 173 | 174 | sqe = io_uring_get_sqe(ring); 175 | io_uring_prep_recv(sqe, conn->fd, conn->rxbuf, conn->read_size, 0); 176 | io_uring_sqe_set_data(sqe, tag(conn, IOU_REQ_TYPE_RECV)); 177 | } 178 | 179 | static void iou_conn_add_recvzc(struct io_uring *ring, struct worker_connection *conn, __u32 id) 180 | { 181 | struct io_uring_sqe *sqe; 182 | 183 | sqe = io_uring_get_sqe(ring); 184 | io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, conn->fd, NULL, 0, 0); 185 | sqe->ioprio |= IORING_RECV_MULTISHOT; 186 | sqe->zcrx_ifq_idx = id; 187 | io_uring_sqe_set_data(sqe, tag(conn, IOU_REQ_TYPE_RECVZC)); 188 | } 189 | 190 | static void iou_handle_recv(struct worker_state *self, struct io_uring_cqe *cqe) 191 | { 192 | struct io_uring *ring = get_ring(self); 193 | struct worker_connection *conn; 194 | ssize_t n; 195 | void *src; 196 | 197 | if (self->ended) 198 | return; 199 | 200 | conn = untag(cqe->user_data); 201 | n = cqe->res; 202 | if (n <= 0) { 203 | warnx("Recv failed: %ld, to_recv: %llu", n, conn->to_recv); 204 | worker_kill_conn(self, conn); 205 | return; 206 | } 207 | 208 | src = &patbuf[conn->tot_recv % PATTERN_PERIOD]; 209 | if (self->opts.validate && memcmp(conn->rxbuf, src, n)) 210 | warnx("Data corruption %d %d %ld %lld %lld", 211 | *conn->rxbuf, *(char *)src, n, 212 | conn->tot_recv % PATTERN_PERIOD, 213 | conn->tot_recv); 214 | 215 | conn->to_recv -= n; 216 | conn->tot_recv += n; 217 | 218 | if (!conn->to_recv) { 219 | worker_recv_finished(self, conn); 220 | if (conn->to_send) 221 | iou_conn_add_send(ring, conn); 222 | } 223 | 224 | iou_conn_add_recv(ring, conn); 225 | } 226 | 227 | static void iou_handle_recvzc(struct worker_state *self, struct io_uring_cqe *cqe) 228 | { 229 | struct iou_state *state = get_iou_state(self); 230 | struct io_uring *ring = get_ring(self); 231 | struct io_uring_zcrx_rq *rq_ring; 232 | struct io_uring_zcrx_cqe* rcqe; 233 | struct worker_connection *conn; 234 | struct io_uring_zcrx_rqe *rqe; 235 | unsigned char *data; 236 | __u64 mask; 237 | ssize_t n; 238 | void *src; 239 | 240 | if (self->ended) 241 | return; 242 | 243 | conn = untag(cqe->user_data); 244 | n = cqe->res; 245 | if (!(cqe->flags & IORING_CQE_F_MORE)) { 246 | if (conn->to_recv) 247 | warn("Recvzc ended early"); 248 | if (n != 0) 249 | warn("Recvzc final completion invalid res: %ld", n); 250 | worker_kill_conn(self, conn); 251 | return; 252 | } 253 | 254 | if (n <= 0) { 255 | warnx("Recv failed: %ld, to_recv: %llu", n, conn->to_recv); 256 | 
worker_kill_conn(self, conn); 257 | return; 258 | } 259 | 260 | rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); 261 | mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1; 262 | data = (unsigned char *)state->area_ptr + (rcqe->off & mask); 263 | 264 | src = &patbuf[conn->tot_recv % PATTERN_PERIOD]; 265 | if (self->opts.validate && memcmp(data, src, n)) 266 | warnx("Data corruption %d %d %ld %lld %lld", 267 | *data, *(char *)src, n, 268 | conn->tot_recv % PATTERN_PERIOD, 269 | conn->tot_recv); 270 | 271 | conn->to_recv -= n; 272 | conn->tot_recv += n; 273 | 274 | if (!conn->to_recv) { 275 | worker_recv_finished(self, conn); 276 | if (conn->to_send) 277 | iou_conn_add_send(ring, conn); 278 | } 279 | 280 | rq_ring = &state->rq; 281 | rqe = &rq_ring->rqes[rq_ring->rq_tail & state->rq_mask]; 282 | rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | state->area_token; 283 | rqe->len = cqe->res; 284 | io_uring_smp_store_release(rq_ring->ktail, ++rq_ring->rq_tail); 285 | } 286 | 287 | static size_t get_rq_ring_size(unsigned int entries) 288 | { 289 | size_t size; 290 | 291 | size = entries * sizeof(struct io_uring_zcrx_rqe); 292 | /* add space for the header (head/tail/etc.) */ 293 | size += page_size; 294 | 295 | return ALIGN_UP(size, page_size); 296 | } 297 | 298 | static int iou_register_zerocopy_rx(struct worker_state *self) 299 | { 300 | struct iou_state *state = get_iou_state(self); 301 | unsigned int ring_entries; 302 | size_t area_size; 303 | size_t ring_size; 304 | void *area_ptr; 305 | void *ring_ptr; 306 | int ret; 307 | 308 | area_size = self->opts.iou.rx_size_mb * 1024 * 1024; 309 | /* arbitrary ring size chosen based on rx_size_mb */ 310 | ring_entries = (area_size / (page_size * 2)); 311 | ring_size = get_rq_ring_size(ring_entries); 312 | 313 | area_ptr = mmap(NULL, 314 | area_size + ring_size, 315 | PROT_READ | PROT_WRITE, 316 | MAP_ANONYMOUS | MAP_PRIVATE, 317 | -1, 318 | 0 319 | ); 320 | if (area_ptr == MAP_FAILED) { 321 | warn("Failed to mmap zero copy receive memory"); 322 | return -1; 323 | } 324 | struct io_uring_zcrx_area_reg area_reg = { 325 | .addr = (__u64)(unsigned long)area_ptr, 326 | .len = area_size, 327 | .flags = 0, 328 | }; 329 | 330 | ring_ptr = (char *)area_ptr + area_size; 331 | struct io_uring_region_desc region_reg = { 332 | .user_addr = (__u64)(unsigned long)ring_ptr, 333 | .size = ring_size, 334 | .flags = IORING_MEM_REGION_TYPE_USER, 335 | }; 336 | 337 | struct io_uring_zcrx_ifq_reg reg = { 338 | .if_idx = self->opts.iou.ifindex, 339 | .if_rxq = self->opts.iou.queue_id, 340 | .rq_entries = ring_entries, 341 | .area_ptr = (__u64)(unsigned long)&area_reg, 342 | .region_ptr = (__u64)(unsigned long)®ion_reg, 343 | }; 344 | 345 | ret = io_uring_register_ifq(&state->ring, ®); 346 | if (ret) { 347 | warn("io_uring_register_ifq failed: %d", ret); 348 | munmap(area_ptr, area_size + ring_size); 349 | return ret; 350 | } 351 | 352 | state->rq.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head); 353 | state->rq.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail); 354 | state->rq.rqes = (struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes); 355 | state->rq.rq_tail = 0; 356 | state->rq.ring_entries = reg.rq_entries; 357 | 358 | state->area_token = area_reg.rq_area_token; 359 | state->rq_mask = reg.rq_entries - 1; 360 | state->zcrx_id = reg.zcrx_id; 361 | 362 | state->area_ptr = area_ptr; 363 | state->rq_ptr = ring_ptr; 364 | state->area_size = area_size; 365 | state->rq_size = ring_size; 366 | 367 | return 0; 368 | } 369 | 370 | static int 
iou_register_zerocopy_tx(struct worker_state *self) 371 | { 372 | struct iou_state *state = get_iou_state(self); 373 | struct iovec iov; 374 | 375 | iov.iov_base = patbuf; 376 | iov.iov_len = KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1; 377 | 378 | return io_uring_register_buffers(&state->ring, &iov, 1); 379 | } 380 | 381 | static void iou_prep(struct worker_state *self) 382 | { 383 | struct iou_kpm_msg_state *msg; 384 | struct io_uring_params p = {}; 385 | struct io_uring_sqe *sqe; 386 | struct iou_state *state; 387 | int ret; 388 | 389 | state = malloc(sizeof(*state)); 390 | if (!state) 391 | err(4, "Failed to malloc iou_state"); 392 | memset(state, 0, sizeof(*state)); 393 | self->io_state = state; 394 | 395 | p.flags |= IORING_SETUP_COOP_TASKRUN; 396 | p.flags |= IORING_SETUP_CQSIZE; 397 | p.flags |= IORING_SETUP_DEFER_TASKRUN; 398 | p.flags |= IORING_SETUP_SINGLE_ISSUER; 399 | p.flags |= IORING_SETUP_SUBMIT_ALL; 400 | if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 401 | p.flags |= IORING_SETUP_CQE32; 402 | p.cq_entries = 512; 403 | 404 | ret = io_uring_queue_init_params(64, &state->ring, &p); 405 | if (ret) 406 | err(5, "Failed to create io_uring"); 407 | 408 | msg = malloc(sizeof(*msg)); 409 | if (!msg) { 410 | free(state); 411 | err(6, "Failed to malloc iou_kpm_msg_state"); 412 | } 413 | 414 | if (self->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 415 | if (iou_register_zerocopy_rx(self)) 416 | err(7, "Failed to register zero copy rx"); 417 | 418 | if (self->opts.tx_mode == KPM_TX_MODE_SOCKET_ZEROCOPY) 419 | if (iou_register_zerocopy_tx(self)) 420 | err(8, "Failed to register zero copy tx"); 421 | 422 | sqe = io_uring_get_sqe(&state->ring); 423 | io_uring_prep_recv(sqe, self->main_sock, &msg->hdr, sizeof(msg->hdr), MSG_PEEK | MSG_WAITALL); 424 | io_uring_sqe_set_data(sqe, tag(msg, IOU_REQ_TYPE_PROTO_HDR)); 425 | } 426 | 427 | static void iou_handle_proto_hdr(struct worker_state *self, struct io_uring_cqe *cqe) 428 | { 429 | struct io_uring *ring = get_ring(self); 430 | struct iou_kpm_msg_state *msg; 431 | struct io_uring_sqe *sqe; 432 | ssize_t n = cqe->res; 433 | 434 | msg = untag(cqe->user_data); 435 | if (n < (int)sizeof(msg->hdr)) { 436 | if (n) 437 | warn("Failed to receive header (%zd)", n); 438 | goto err; 439 | } 440 | if (msg->hdr.len < sizeof(msg->hdr)) { 441 | warnx("Invalid header length (%d)", msg->hdr.len); 442 | goto err; 443 | } 444 | 445 | msg->msg = malloc(msg->hdr.len); 446 | if (!msg->msg) { 447 | warnx("Failed to malloc msg"); 448 | goto err; 449 | } 450 | 451 | msg->off = 0; 452 | sqe = io_uring_get_sqe(ring); 453 | io_uring_prep_recv(sqe, self->main_sock, msg->msg + msg->off, msg->hdr.len, 0); 454 | io_uring_sqe_set_data(sqe, tag(msg, IOU_REQ_TYPE_PROTO_PLD)); 455 | 456 | return; 457 | 458 | err: 459 | __kpm_dbg("<<", "ctrl recv failed"); 460 | self->quit = 1; 461 | free(msg); 462 | return; 463 | } 464 | 465 | static void iou_handle_proto_pld(struct worker_state *self, struct io_uring_cqe *cqe) 466 | { 467 | struct io_uring *ring = get_ring(self); 468 | struct iou_kpm_msg_state *msg; 469 | struct io_uring_sqe *sqe; 470 | ssize_t n = cqe->res; 471 | 472 | msg = untag(cqe->user_data); 473 | if (n > msg->hdr.len) { 474 | warnx("Oversized recv"); 475 | goto err; 476 | } else if (n <= 0) { 477 | warnx("Short recv"); 478 | goto err; 479 | } 480 | 481 | msg->off += n; 482 | msg->hdr.len -= n; 483 | 484 | if (msg->hdr.len) { 485 | sqe = io_uring_get_sqe(ring); 486 | io_uring_prep_recv(sqe, self->main_sock, msg->msg + msg->off, msg->hdr.len, 0); 487 | 
io_uring_sqe_set_data(sqe, tag(msg, IOU_REQ_TYPE_PROTO_PLD)); 488 | return; 489 | } 490 | 491 | worker_handle_proto(self, msg->msg); 492 | 493 | free(msg->msg); 494 | memset(msg, 0, sizeof(*msg)); 495 | 496 | sqe = io_uring_get_sqe(ring); 497 | io_uring_prep_recv(sqe, self->main_sock, &msg->hdr, sizeof(msg->hdr), MSG_PEEK | MSG_WAITALL); 498 | io_uring_sqe_set_data(sqe, tag(msg, IOU_REQ_TYPE_PROTO_HDR)); 499 | 500 | return; 501 | err: 502 | __kpm_dbg("<<", "ctrl recv failed"); 503 | self->quit = 1; 504 | free(msg->msg); 505 | free(msg); 506 | return; 507 | } 508 | 509 | static void iou_wait(struct worker_state *self, int msec) 510 | { 511 | struct io_uring *ring = get_ring(self); 512 | struct __kernel_timespec timeout; 513 | struct io_uring_cqe *cqe; 514 | unsigned int count = 0; 515 | unsigned int head; 516 | 517 | timeout.tv_sec = msec / 1000; 518 | timeout.tv_nsec = (msec % 1000) * 1000000; 519 | 520 | io_uring_submit_and_wait_timeout(ring, &cqe, 1, &timeout, NULL); 521 | 522 | io_uring_for_each_cqe(ring, head, cqe) { 523 | switch (get_tag(cqe->user_data)) { 524 | case IOU_REQ_TYPE_PROTO_HDR: 525 | iou_handle_proto_hdr(self, cqe); 526 | break; 527 | case IOU_REQ_TYPE_PROTO_PLD: 528 | iou_handle_proto_pld(self, cqe); 529 | break; 530 | case IOU_REQ_TYPE_SEND: 531 | iou_handle_send(self, cqe); 532 | break; 533 | case IOU_REQ_TYPE_SENDZC: 534 | iou_handle_sendzc(self, cqe); 535 | break; 536 | case IOU_REQ_TYPE_RECV: 537 | iou_handle_recv(self, cqe); 538 | break; 539 | case IOU_REQ_TYPE_RECVZC: 540 | iou_handle_recvzc(self, cqe); 541 | break; 542 | case IOU_REQ_TYPE_CANCEL: 543 | break; 544 | default: 545 | err(1, "Unknown io_uring request type: %d, res: %d", get_tag(cqe->user_data), cqe->res); 546 | } 547 | 548 | count++; 549 | } 550 | io_uring_cq_advance(ring, count); 551 | } 552 | 553 | static void iou_conn_add(struct worker_state *state, struct worker_connection *conn) 554 | { 555 | struct io_uring *ring = get_ring(state); 556 | 557 | if (conn->to_send) { 558 | if (state->opts.tx_mode == KPM_TX_MODE_SOCKET_ZEROCOPY) 559 | iou_conn_add_sendzc(ring, conn); 560 | else 561 | iou_conn_add_send(ring, conn); 562 | } 563 | 564 | if (state->opts.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 565 | iou_conn_add_recvzc(ring, conn, get_iou_state(state)->zcrx_id); 566 | else 567 | iou_conn_add_recv(ring, conn); 568 | } 569 | 570 | static void iou_conn_close(struct worker_state *state, struct worker_connection *conn) 571 | { 572 | struct io_uring *ring = get_ring(state); 573 | struct io_uring_sqe *sqe; 574 | 575 | sqe = io_uring_get_sqe(ring); 576 | io_uring_prep_cancel_fd(sqe, conn->fd, 0); 577 | io_uring_sqe_set_data(sqe, tag(NULL, IOU_REQ_TYPE_CANCEL)); 578 | /* Cancellation is sync. 
A completion is always generated by the time 579 | * submit returns */ 580 | io_uring_submit(ring); 581 | } 582 | 583 | static void iou_exit(struct worker_state *self) 584 | { 585 | struct iou_state *state = get_iou_state(self); 586 | struct io_uring *ring = get_ring(self); 587 | if (state->area_ptr) 588 | munmap(state->area_ptr, state->area_size + state->rq_size); 589 | io_uring_queue_exit(ring); 590 | free(self->io_state); 591 | } 592 | 593 | static const struct io_ops iou_io_ops = { 594 | .prep = iou_prep, 595 | .wait = iou_wait, 596 | .conn_add = iou_conn_add, 597 | .conn_close = iou_conn_close, 598 | .exit = iou_exit, 599 | }; 600 | 601 | void worker_iou_init(struct worker_state *self) 602 | { 603 | self->ops = &iou_io_ops; 604 | page_size = sysconf(_SC_PAGESIZE); 605 | } 606 | 607 | int iou_zerocopy_rx_setup(struct session_state_iou *iou, int fd, 608 | int num_queues) 609 | { 610 | return reserve_queues(fd, num_queues, iou->ifname, &iou->ifindex, 611 | &iou->queue_id, &iou->rss_context); 612 | } 613 | 614 | int iou_zerocopy_rx_teardown(struct session_state_iou *iou) 615 | { 616 | unreserve_queues(iou->ifname, iou->rss_context); 617 | return 0; 618 | } 619 | -------------------------------------------------------------------------------- /server_session.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. and affiliates */ 3 | 4 | #define _GNU_SOURCE 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "proto.h" 28 | #include "proto_dbg.h" 29 | #include "server.h" 30 | #include "devmem.h" 31 | #include "iou.h" 32 | 33 | extern unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1]; 34 | 35 | struct session_state { 36 | int main_sock; 37 | int epollfd; 38 | int quit; 39 | int tcp_sock; 40 | enum kpm_rx_mode rx_mode; 41 | enum kpm_tx_mode tx_mode; 42 | unsigned int connection_ids; 43 | unsigned int worker_ids; 44 | unsigned int test_ids; 45 | struct list_head connections; 46 | struct list_head workers; 47 | struct list_head tests; 48 | struct session_state_devmem devmem; 49 | struct session_state_iou iou_state; 50 | bool validate; 51 | bool iou; 52 | }; 53 | 54 | struct connection { 55 | unsigned int id; 56 | int fd; 57 | int cpu; 58 | int worker_fd; 59 | unsigned int tls_mask; 60 | struct list_node connections; 61 | }; 62 | 63 | struct worker { 64 | unsigned int id; 65 | int fd; 66 | pid_t pid; 67 | int busy; 68 | struct list_node workers; 69 | }; 70 | 71 | struct test { 72 | unsigned int id; 73 | int active; 74 | unsigned int min_worker_id; 75 | unsigned int worker_range; 76 | unsigned int workers_total; 77 | unsigned int workers_done; 78 | struct kpm_test *req, **fwd; 79 | struct kpm_test_results **results; 80 | struct list_node tests; 81 | }; 82 | 83 | static struct connection * 84 | session_find_connection_by_id(struct session_state *self, unsigned int id) 85 | { 86 | struct connection *conn; 87 | 88 | list_for_each(&self->connections, conn, connections) { 89 | if (conn->id == id) 90 | return conn; 91 | } 92 | return NULL; 93 | } 94 | 95 | static struct worker * 96 | session_find_worker_by_id(struct session_state *self, unsigned int id) 97 | { 98 | struct worker *wrk; 99 | 100 | list_for_each(&self->workers, wrk, workers) { 101 | if 
(wrk->id == id) 102 | return wrk; 103 | } 104 | return NULL; 105 | } 106 | 107 | static struct test * 108 | session_find_test_by_id(struct session_state *self, unsigned int id) 109 | { 110 | struct test *test; 111 | 112 | list_for_each(&self->tests, test, tests) { 113 | if (test->id == id) 114 | return test; 115 | } 116 | return NULL; 117 | } 118 | 119 | static void session_new_conn(struct session_state *self, int fd) 120 | { 121 | struct connection *conn; 122 | socklen_t len; 123 | 124 | conn = malloc(sizeof(*conn)); 125 | if (!conn) 126 | goto err_close; 127 | memset(conn, 0, sizeof(*conn)); 128 | 129 | conn->id = ++self->connection_ids; 130 | conn->fd = fd; 131 | 132 | len = sizeof(conn->cpu); 133 | if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &conn->cpu, &len) < 0) { 134 | warn("Failed to read CPU for socket"); 135 | goto err_free; 136 | } 137 | 138 | if (kpm_send_conn_id(fd, conn->id, conn->cpu) < 0) 139 | goto err_free; 140 | 141 | list_add(&self->connections, &conn->connections); 142 | return; 143 | 144 | err_free: 145 | free(conn); 146 | err_close: 147 | close(fd); 148 | return; 149 | } 150 | 151 | static void 152 | server_msg_tcp_acceptor(struct session_state *self, struct kpm_header *req) 153 | { 154 | struct epoll_event ev = {}; 155 | struct sockaddr_in6 addr; 156 | socklen_t len; 157 | int ret; 158 | 159 | if (self->tcp_sock) { 160 | kpm_reply_error(self->main_sock, req, EBUSY); 161 | return; 162 | } 163 | 164 | len = sizeof(addr); 165 | if (getsockname(self->main_sock, (void *)&addr, &len)) { 166 | warn("Failed to get sock type for main sock"); 167 | self->quit = 1; 168 | return; 169 | } 170 | addr.sin6_port = 0; 171 | 172 | self->tcp_sock = socket(addr.sin6_family, SOCK_STREAM, 0); 173 | if (self->tcp_sock < 0) { 174 | warn("Failed to open socket"); 175 | self->quit = 1; 176 | return; 177 | } 178 | 179 | ret = bind(self->tcp_sock, (void *)&addr, sizeof(addr)); 180 | if (ret < 0) { 181 | warn("Failed to bind socket"); 182 | self->quit = 1; 183 | return; 184 | } 185 | 186 | ret = listen(self->tcp_sock, 10); 187 | if (ret < 0) { 188 | warn("Failed to listen on socket"); 189 | self->quit = 1; 190 | return; 191 | } 192 | 193 | len = sizeof(addr); 194 | if (getsockname(self->tcp_sock, (void *)&addr, &len)) { 195 | warn("Failed to get sock type for main sock"); 196 | self->quit = 1; 197 | return; 198 | } 199 | 200 | ev.events = EPOLLIN | EPOLLET; 201 | ev.data.fd = self->tcp_sock; 202 | if (epoll_ctl(self->epollfd, EPOLL_CTL_ADD, self->tcp_sock, &ev) < 0) { 203 | warn("Failed to add tcp sock to epoll"); 204 | self->quit = 1; 205 | return; 206 | } 207 | 208 | if (kpm_reply_acceptor(self->main_sock, req, &addr, len) < 1) { 209 | warn("Failed reply in %s", __func__); 210 | self->quit = 1; 211 | return; 212 | } 213 | } 214 | 215 | static void 216 | server_msg_connect(struct session_state *self, struct kpm_header *hdr) 217 | { 218 | unsigned short local_port, remote_port; 219 | struct kpm_connection_id *id; 220 | struct sockaddr_in6 addr; 221 | struct kpm_connect *req; 222 | struct connection *conn; 223 | socklen_t len; 224 | int ret, cfd; 225 | 226 | if (hdr->len < sizeof(struct kpm_connect)) { 227 | warn("Invalid request in %s", __func__); 228 | self->quit = 1; 229 | return; 230 | } 231 | req = (void *)hdr; 232 | 233 | conn = malloc(sizeof(*conn)); 234 | if (!conn) { 235 | self->quit = 1; 236 | return; 237 | } 238 | memset(conn, 0, sizeof(*conn)); 239 | 240 | cfd = socket(req->addr.sin6_family, SOCK_STREAM, 0); 241 | if (cfd < 0) { 242 | warn("Failed to open socket"); 243 | goto 
err_free; 244 | } 245 | 246 | if (req->mss && 247 | setsockopt(cfd, IPPROTO_TCP, TCP_MAXSEG, 248 | (void *)&req->mss, sizeof(req->mss))) { 249 | warn("Setting mss failed"); 250 | goto err_close; 251 | } 252 | 253 | if (self->tx_mode == KPM_TX_MODE_DEVMEM && 254 | devmem_bind_socket(&self->devmem, cfd) < 0) 255 | goto err_close; 256 | 257 | ret = connect(cfd, (void *)&req->addr, req->len); 258 | if (ret < 0) { 259 | warn("Failed to connect"); 260 | goto err_close; 261 | } 262 | 263 | id = kpm_receive(cfd); 264 | if (!id) { 265 | warnx("No connection ID"); 266 | goto err_close; 267 | } 268 | 269 | if (id->hdr.type != KPM_MSG_TYPE_CONNECTION_ID || 270 | id->hdr.len != sizeof(*id)) { 271 | warnx("Invalid connection ID %d %d", id->hdr.type, id->hdr.len); 272 | goto err_free_id; 273 | } 274 | 275 | conn->id = ++self->connection_ids; 276 | conn->fd = cfd; 277 | 278 | len = sizeof(conn->cpu); 279 | if (getsockopt(cfd, SOL_SOCKET, SO_INCOMING_CPU, &conn->cpu, &len) < 0) { 280 | warn("Failed to read CPU for socket"); 281 | goto err_free_id; 282 | } 283 | 284 | len = sizeof(addr); 285 | if (getsockname(cfd, &addr, &len)) { 286 | warn("Failed to read address of socket"); 287 | goto err_free_id; 288 | } 289 | local_port = ntohs(addr.sin6_port); 290 | 291 | len = sizeof(addr); 292 | if (getpeername(cfd, &addr, &len)) { 293 | warn("Failed to read address of socket"); 294 | goto err_free_id; 295 | } 296 | remote_port = ntohs(addr.sin6_port); 297 | 298 | if (kpm_reply_connect(self->main_sock, hdr, 299 | conn->id, conn->cpu, local_port, 300 | id->id, id->cpu, remote_port) < 1) { 301 | warn("Failed to reply"); 302 | goto err_free_id; 303 | } 304 | 305 | list_add(&self->connections, &conn->connections); 306 | free(id); 307 | 308 | return; 309 | 310 | err_free_id: 311 | free(id); 312 | err_close: 313 | close(cfd); 314 | err_free: 315 | free(conn); 316 | self->quit = 1; 317 | return; 318 | } 319 | 320 | static void 321 | server_msg_disconnect(struct session_state *self, struct kpm_header *hdr) 322 | { 323 | struct __kpm_generic_u32 *req; 324 | struct connection *conn; 325 | 326 | if (hdr->len < sizeof(*req)) { 327 | warn("Invalid request in %s", __func__); 328 | goto err_quit; 329 | } 330 | req = (void *)hdr; 331 | 332 | conn = session_find_connection_by_id(self, req->val); 333 | if (!conn) { 334 | warnx("connection not found"); 335 | kpm_reply_error(self->main_sock, hdr, ENOENT); 336 | goto err_quit; 337 | } 338 | 339 | kpm_trace("close %d", conn->fd); 340 | close(conn->fd); 341 | list_del(&conn->connections); 342 | free(conn); 343 | 344 | if (kpm_reply_empty(self->main_sock, hdr) < 1) { 345 | warnx("Reply failed"); 346 | goto err_quit; 347 | } 348 | 349 | return; 350 | 351 | err_quit: 352 | self->quit = 1; 353 | } 354 | 355 | static void 356 | server_msg_tls(struct session_state *self, struct kpm_header *hdr) 357 | { 358 | struct connection *conn; 359 | struct kpm_tls *req; 360 | int one = 1; 361 | 362 | if (hdr->len < sizeof(*req)) { 363 | warn("Invalid request in %s", __func__); 364 | goto err_quit; 365 | } 366 | req = (void *)hdr; 367 | 368 | if (req->dir_mask & ~(KPM_TLS_ULP | KPM_TLS_TX | KPM_TLS_RX | 369 | KPM_TLS_NOPAD)) { 370 | warnx("unknown TLS flag"); 371 | kpm_reply_error(self->main_sock, hdr, EINVAL); 372 | goto err_quit; 373 | } 374 | 375 | conn = session_find_connection_by_id(self, req->connection_id); 376 | if (!conn) { 377 | warnx("connection not found"); 378 | kpm_reply_error(self->main_sock, hdr, ENOENT); 379 | goto err_quit; 380 | } 381 | 382 | if (conn->tls_mask & req->dir_mask) { 383 | 
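/* Each TLS direction bit (ULP/TX/RX/NOPAD) may only be armed once per
 * connection; a repeated bit in dir_mask is rejected with EBUSY. */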
warnx("TLS already set");
384 | kpm_reply_error(self->main_sock, hdr, EBUSY);
385 | goto err_quit;
386 | }
387 | 
388 | if (!((conn->tls_mask | req->dir_mask) & KPM_TLS_ULP)) {
389 | warnx("TLS ULP not requested");
390 | kpm_reply_error(self->main_sock, hdr, EINVAL);
391 | goto err_quit;
392 | }
393 | 
394 | if ((req->dir_mask & KPM_TLS_ULP) &&
395 | setsockopt(conn->fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"))) {
396 | warn("TLS ULP setup failed");
397 | goto err_repl_errno;
398 | }
399 | 
400 | if ((req->dir_mask & KPM_TLS_TX) &&
401 | setsockopt(conn->fd, SOL_TLS, TLS_TX,
402 | (void *)&req->info, req->len)) {
403 | warn("TLS Tx setup failed");
404 | goto err_repl_errno;
405 | }
406 | 
407 | if ((req->dir_mask & KPM_TLS_RX) &&
408 | setsockopt(conn->fd, SOL_TLS, TLS_RX,
409 | (void *)&req->info, req->len)) {
410 | warn("TLS Rx setup failed");
411 | goto err_repl_errno;
412 | }
413 | 
414 | if ((req->dir_mask & KPM_TLS_NOPAD) &&
415 | setsockopt(conn->fd, SOL_TLS, TLS_RX_EXPECT_NO_PAD,
416 | (void *)&one, sizeof(one))) {
417 | warn("TLS nopad setup failed");
418 | goto err_repl_errno;
419 | }
420 | 
421 | conn->tls_mask |= req->dir_mask;
422 | 
423 | if (kpm_reply_empty(self->main_sock, hdr) < 1) {
424 | warnx("Reply failed");
425 | goto err_quit;
426 | }
427 | 
428 | return;
429 | 
430 | err_repl_errno:
431 | kpm_reply_error(self->main_sock, hdr, errno);
432 | err_quit:
433 | self->quit = 1;
434 | }
435 | 
436 | static void
437 | server_msg_max_pacing(struct session_state *self, struct kpm_header *hdr)
438 | {
439 | struct kpm_max_pacing *req;
440 | struct connection *conn;
441 | 
442 | if (hdr->len < sizeof(*req)) {
443 | warn("Invalid request in %s", __func__);
444 | goto err_quit;
445 | }
446 | req = (void *)hdr;
447 | 
448 | conn = session_find_connection_by_id(self, req->id);
449 | if (!conn) {
450 | warnx("connection not found");
451 | kpm_reply_error(self->main_sock, hdr, ENOENT);
452 | goto err_quit;
453 | }
454 | 
455 | if (setsockopt(conn->fd, SOL_SOCKET, SO_MAX_PACING_RATE,
456 | &req->max_pacing, sizeof(req->max_pacing))) {
457 | warn("setting pacing rate failed");
458 | goto err_repl_errno;
459 | }
460 | 
461 | if (kpm_reply_empty(self->main_sock, hdr) < 1) {
462 | warnx("Reply failed");
463 | goto err_quit;
464 | }
465 | 
466 | return;
467 | 
468 | err_repl_errno:
469 | kpm_reply_error(self->main_sock, hdr, errno);
470 | err_quit:
471 | self->quit = 1;
472 | }
473 | 
474 | static void
475 | server_msg_tcp_cc(struct session_state *self, struct kpm_header *hdr)
476 | {
477 | struct connection *conn;
478 | struct kpm_tcp_cc *req;
479 | 
480 | if (hdr->len < sizeof(*req)) {
481 | warn("Invalid request in %s", __func__);
482 | goto err_quit;
483 | }
484 | req = (void *)hdr;
485 | 
486 | conn = session_find_connection_by_id(self, req->id);
487 | if (!conn) {
488 | warnx("connection not found");
489 | kpm_reply_error(self->main_sock, hdr, ENOENT);
490 | goto err_quit;
491 | }
492 | 
493 | if (setsockopt(conn->fd, IPPROTO_TCP, TCP_CONGESTION, &req->cc_name,
494 | strnlen(req->cc_name, sizeof(req->cc_name)))) {
495 | warn("setting TCP cong control failed");
496 | goto err_repl_errno;
497 | }
498 | 
499 | if (kpm_reply_empty(self->main_sock, hdr) < 1) {
500 | warnx("Reply failed");
501 | goto err_quit;
502 | }
503 | 
504 | return;
505 | 
506 | err_repl_errno:
507 | kpm_reply_error(self->main_sock, hdr, errno);
508 | err_quit:
509 | self->quit = 1;
510 | }
511 | 
512 | static void
513 | server_msg_mode(struct session_state *self, struct kpm_header *hdr)
514 | {
515 | struct kpm_mode *req;
516 | int ret;
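/* The role decides the setup: a session with a listening tcp_sock is the
 * receiver and configures RX (devmem or io_uring zero-copy receive); a
 * session without one is the sender and configures devmem TX. */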
517 | 518 | if (hdr->len < sizeof(*req)) { 519 | warn("Invalid request in %s", __func__); 520 | goto err_quit; 521 | } 522 | req = (void *)hdr; 523 | 524 | if (self->tcp_sock && req->rx_mode == KPM_RX_MODE_DEVMEM) { 525 | ret = devmem_setup(&self->devmem, self->tcp_sock, req->dmabuf_rx_size_mb, 526 | req->num_rx_queues, req->rx_provider, 527 | &req->dev); 528 | if (ret < 0) { 529 | warnx("Failed to setup devmem"); 530 | self->quit = 1; 531 | return; 532 | } 533 | } 534 | if (self->tcp_sock && req->iou && req->rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) { 535 | ret = iou_zerocopy_rx_setup(&self->iou_state, self->tcp_sock, req->num_rx_queues); 536 | if (ret < 0) { 537 | warnx("Failed to setup io_uring zero copy receive"); 538 | self->quit = 1; 539 | return; 540 | } 541 | } 542 | 543 | self->rx_mode = req->rx_mode; 544 | self->tx_mode = req->tx_mode; 545 | self->validate = req->validate; 546 | self->iou = req->iou; 547 | self->iou_state.rx_size_mb = req->iou_rx_size_mb; 548 | 549 | if (!self->tcp_sock && (req->tx_mode == KPM_TX_MODE_DEVMEM)) { 550 | ret = devmem_setup_tx(&self->devmem, req->tx_provider, req->dmabuf_tx_size_mb, 551 | &req->dev, &req->addr); 552 | if (ret < 0) { 553 | warnx("Failed to setup devmem tx"); 554 | self->quit = 1; 555 | return; 556 | } 557 | } 558 | 559 | if (kpm_reply_empty(self->main_sock, hdr) < 1) { 560 | warnx("Reply failed"); 561 | goto err_quit; 562 | } 563 | 564 | return; 565 | 566 | err_quit: 567 | self->quit = 1; 568 | } 569 | 570 | static void 571 | server_msg_spawn_worker(struct session_state *self, struct kpm_header *hdr) 572 | { 573 | struct worker_opts *opts = NULL; 574 | struct worker *wrk = NULL; 575 | struct epoll_event ev = {}; 576 | int p[2], dmabuf_id; 577 | pthread_attr_t attr; 578 | pthread_t thread; 579 | 580 | wrk = malloc(sizeof(*wrk)); 581 | if (!wrk) { 582 | self->quit = 1; 583 | return; 584 | } 585 | memset(wrk, 0, sizeof(*wrk)); 586 | 587 | if (socketpair(AF_LOCAL, SOCK_STREAM, 0, p) < 0) { 588 | warnx("Failed to create socket pair"); 589 | goto err_free; 590 | } 591 | 592 | if (pthread_attr_init(&attr)) { 593 | warnx("Failed to init pthread attr"); 594 | goto err_free; 595 | } 596 | if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) { 597 | warnx("Failed to set pthread attr"); 598 | goto err_free_attr; 599 | } 600 | dmabuf_id = self->devmem.tx_mem ? 
self->devmem.tx_mem->dmabuf_id : -1; 601 | opts = malloc(sizeof(*opts)); 602 | if (!opts) 603 | goto err_free_attr; 604 | memset(opts, 0, sizeof(*opts)); 605 | opts->fd = p[1]; 606 | opts->rx_mode = self->rx_mode; 607 | opts->tx_mode = self->tx_mode; 608 | opts->validate = self->validate; 609 | opts->use_iou = self->iou; 610 | opts->devmem.mem = self->devmem.mem; 611 | opts->devmem.dmabuf_id = dmabuf_id; 612 | opts->iou.rx_size_mb = self->iou_state.rx_size_mb; 613 | opts->iou.ifindex = self->iou_state.ifindex; 614 | opts->iou.queue_id = self->iou_state.queue_id; 615 | if (pthread_create(&thread, &attr, worker_main, opts) != 0) { 616 | warnx("Failed to create worker thread"); 617 | free(opts); 618 | goto err_free_attr; 619 | } 620 | 621 | self->iou_state.queue_id++; 622 | wrk->id = ++self->worker_ids; 623 | wrk->fd = p[0]; 624 | 625 | ev.events = EPOLLIN | EPOLLET; 626 | ev.data.fd = wrk->fd; 627 | if (epoll_ctl(self->epollfd, EPOLL_CTL_ADD, wrk->fd, &ev) < 0) { 628 | warnx("Failed to add worker sock to epoll"); 629 | goto err_worker_kill; 630 | } 631 | 632 | kpm_send_u32(wrk->fd, KPM_MSG_WORKER_ID, wrk->id); 633 | 634 | if (kpm_reply_u32(self->main_sock, hdr, wrk->id) < 1) 635 | goto err_worker_kill; 636 | 637 | list_add(&self->workers, &wrk->workers); 638 | pthread_attr_destroy(&attr); 639 | 640 | return; 641 | 642 | err_worker_kill: 643 | kpm_send_empty(wrk->fd, KPM_MSG_WORKER_KILL); 644 | err_free_attr: 645 | pthread_attr_destroy(&attr); 646 | err_free: 647 | free(wrk); 648 | self->quit = 1; 649 | } 650 | 651 | static void 652 | server_msg_pin_worker(struct session_state *self, struct kpm_header *hdr) 653 | { 654 | struct kpm_pin_worker *req; 655 | struct worker *wrk; 656 | cpu_set_t set; 657 | 658 | if (hdr->len < sizeof(struct kpm_pin_worker)) { 659 | warn("Invalid request in %s", __func__); 660 | self->quit = 1; 661 | return; 662 | } 663 | req = (void *)hdr; 664 | 665 | wrk = session_find_worker_by_id(self, req->worker_id); 666 | if (!wrk) { 667 | kpm_reply_error(self->main_sock, hdr, ENOENT); 668 | return; 669 | } 670 | 671 | CPU_ZERO(&set); 672 | if (req->cpu == (unsigned int)-1) { 673 | int i, n; 674 | 675 | n = sysconf(_SC_NPROCESSORS_CONF); 676 | if (n < 0) { 677 | warn("Failed to get CPU count"); 678 | kpm_reply_error(self->main_sock, hdr, errno); 679 | return; 680 | } 681 | 682 | for (i = 0; i < n; i++) 683 | CPU_SET(i, &set); 684 | } else { 685 | CPU_SET(req->cpu, &set); 686 | } 687 | 688 | if (sched_setaffinity(wrk->pid, sizeof(set), &set) < 0) { 689 | warn("Failed to pin worker to CPU"); 690 | kpm_reply_error(self->main_sock, hdr, errno); 691 | return; 692 | } 693 | 694 | if (kpm_reply_empty(self->main_sock, hdr) < 1) { 695 | self->quit = 1; 696 | return; 697 | } 698 | } 699 | 700 | static void 701 | server_msg_test(struct session_state *self, struct kpm_header *hdr) 702 | { 703 | unsigned int i, j, min_wrk, max_wrk; 704 | struct kpm_test *req, **fwd; 705 | unsigned int n_conns; 706 | struct test *test; 707 | 708 | if (hdr->len < sizeof(struct kpm_test)) { 709 | bad_req: 710 | warnx("Invalid request in %s: %d < %zd", 711 | __func__, hdr->len, sizeof(*req)); 712 | self->quit = 1; 713 | return; 714 | } 715 | req = (void *)hdr; 716 | 717 | n_conns = hdr->len - sizeof(struct kpm_test); 718 | if (n_conns % sizeof(struct kpm_test_spec)) 719 | goto bad_req; 720 | 721 | n_conns /= sizeof(struct kpm_test_spec); 722 | if (req->test_id || !req->time_sec || n_conns != req->n_conns) 723 | goto bad_req; 724 | 725 | test = malloc(sizeof(*test)); 726 | memset(test, 0, sizeof(*test)); 727 | 
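/* Fan the flat spec list out into one kpm_test message per worker:
 * specs are bucketed by worker_id relative to min_wrk, each non-empty
 * bucket is sent over that worker's AF_LOCAL socketpair, and the
 * connection fds follow via fdpass_send(). E.g. specs on workers
 * {4, 6} give min_wrk = 4 and worker_range = 3; fwd[1] stays empty
 * (n_conns == 0) and is skipped when forwarding. */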
728 | test->id = ++self->test_ids;
729 | test->active = req->active;
730 | 
731 | min_wrk = -1;
732 | max_wrk = 0;
733 | for (i = 0; i < n_conns; i++) {
734 | min_wrk = min(min_wrk, req->specs[i].worker_id);
735 | max_wrk = max(max_wrk, req->specs[i].worker_id);
736 | }
737 | test->worker_range = max_wrk - min_wrk + 1;
738 | 
739 | fwd = calloc(test->worker_range, sizeof(void *));
740 | for (i = 0; i < test->worker_range; i++)
741 | fwd[i] = calloc(1, hdr->len);
742 | test->results = calloc(test->worker_range, sizeof(*test->results));
743 | 
744 | for (i = 0; i < n_conns; i++) {
745 | struct kpm_test_spec *t = &req->specs[i];
746 | struct connection *conn;
747 | struct worker *wrk;
748 | struct kpm_test *msg;
749 | 
750 | wrk = session_find_worker_by_id(self, t->worker_id);
751 | conn = session_find_connection_by_id(self, t->connection_id);
752 | if (!wrk || !conn) {
753 | warnx("worker or connection not found");
754 | kpm_reply_error(self->main_sock, hdr, ENOENT);
755 | goto err_free;
756 | }
757 | if (wrk->busy) {
758 | warnx("worker is busy");
759 | kpm_reply_error(self->main_sock, hdr, EBUSY);
760 | goto err_free;
761 | }
762 | 
763 | msg = fwd[t->worker_id - min_wrk];
764 | memcpy(&msg->specs[msg->n_conns++], t, sizeof(*t));
765 | }
766 | 
767 | for (i = 0; i < test->worker_range; i++) {
768 | struct connection *conn;
769 | struct worker *wrk;
770 | struct kpm_test *msg;
771 | 
772 | msg = fwd[i];
773 | if (!msg->n_conns)
774 | continue;
775 | msg->active = req->active;
776 | msg->time_sec = req->time_sec;
777 | msg->test_id = test->id;
778 | 
779 | test->workers_total++;
780 | wrk = session_find_worker_by_id(self, msg->specs[0].worker_id);
781 | wrk->busy = 1;
782 | 
783 | kpm_send(wrk->fd, &msg->hdr,
784 | sizeof(*msg) + sizeof(msg->specs[0]) * msg->n_conns,
785 | KPM_MSG_WORKER_TEST);
786 | for (j = 0; j < msg->n_conns; j++) {
787 | conn = session_find_connection_by_id(self, msg->specs[j].connection_id);
788 | fdpass_send(wrk->fd, conn->fd);
789 | }
790 | }
791 | 
792 | test->req = kpm_msg_dup(hdr);
793 | test->fwd = fwd;
794 | test->min_worker_id = min_wrk;
795 | list_add(&self->tests, &test->tests);
796 | kpm_reply_u32(self->main_sock, hdr, test->id);
797 | 
798 | return;
799 | 
800 | err_free:
801 | free(fwd);
802 | self->quit = 1;
803 | return;
804 | }
805 | 
806 | static void
807 | server_msg_end_test(struct session_state *self, struct kpm_header *hdr)
808 | {
809 | struct kpm_end_test *req;
810 | struct test *test;
811 | unsigned int i;
812 | 
813 | if (hdr->len < sizeof(*req)) {
814 | warn("Invalid request in %s", __func__);
815 | self->quit = 1;
816 | return;
817 | }
818 | req = (void *)hdr;
819 | 
820 | test = session_find_test_by_id(self, req->id);
821 | if (!test) {
822 | warnx("Failed to find test");
823 | kpm_reply_error(self->main_sock, hdr, ENOENT);
824 | return;
825 | }
826 | 
827 | if (test->active && test->workers_total != test->workers_done) {
828 | warnx("Early test termination not supported");
829 | kpm_reply_error(self->main_sock, hdr, EBUSY);
830 | return;
831 | }
832 | 
833 | for (i = 0; i < test->worker_range; i++) {
834 | struct worker *wrk;
835 | struct kpm_test *msg;
836 | 
837 | msg = test->fwd[i];
838 | if (!msg->n_conns) {
839 | warnx("no conns on %d", i);
840 | continue;
841 | }
842 | 
843 | kpm_trace("searching for worker %d", msg->specs[0].worker_id);
844 | wrk = session_find_worker_by_id(self, msg->specs[0].worker_id);
845 | wrk->busy = 0;
846 | 
847 | kpm_trace("Sending end test to worker");
848 | kpm_send_u32(wrk->fd, KPM_MSG_WORKER_END_TEST, req->id);
849 | }
850 | 
851 | if (kpm_reply_empty(self->main_sock, hdr) < 1) {
852 | self->quit = 1;
853 | return;
854 | }
855 | }
856 | 
857 | static void session_handle_main_sock(struct session_state *self)
858 | {
859 | struct kpm_header *hdr;
860 | 
861 | hdr = kpm_receive(self->main_sock);
862 | if (!hdr) {
863 | __kpm_dbg("<<", "ctrl recv failed");
864 | self->quit = 1;
865 | return;
866 | }
867 | kpm_cmd_dbg_start(hdr);
868 | 
869 | switch (hdr->type) {
870 | case KPM_MSG_TYPE_OPEN_TCP_ACCEPTOR:
871 | server_msg_tcp_acceptor(self, hdr);
872 | break;
873 | case KPM_MSG_TYPE_CONNECT:
874 | server_msg_connect(self, hdr);
875 | break;
876 | case KPM_MSG_TYPE_DISCONNECT:
877 | server_msg_disconnect(self, hdr);
878 | break;
879 | case KPM_MSG_TYPE_TLS:
880 | server_msg_tls(self, hdr);
881 | break;
882 | case KPM_MSG_TYPE_MAX_PACING:
883 | server_msg_max_pacing(self, hdr);
884 | break;
885 | case KPM_MSG_TYPE_TCP_CC:
886 | server_msg_tcp_cc(self, hdr);
887 | break;
888 | case KPM_MSG_TYPE_MODE:
889 | server_msg_mode(self, hdr);
890 | break;
891 | case KPM_MSG_TYPE_SPAWN_WORKER:
892 | server_msg_spawn_worker(self, hdr);
893 | break;
894 | case KPM_MSG_TYPE_PIN_WORKER:
895 | server_msg_pin_worker(self, hdr);
896 | break;
897 | case KPM_MSG_TYPE_TEST:
898 | server_msg_test(self, hdr);
899 | break;
900 | case KPM_MSG_TYPE_END_TEST:
901 | server_msg_end_test(self, hdr);
902 | break;
903 | default:
904 | warnx("Unknown message type: %d", hdr->type);
905 | self->quit = 1;
906 | break;
907 | }
908 | 
909 | kpm_cmd_dbg_end(hdr);
910 | free(hdr);
911 | }
912 | 
913 | static void
914 | session_results_assemble(struct session_state *self, struct test *test)
915 | {
916 | struct kpm_test_results *reply;
917 | unsigned int i, j;
918 | size_t sz;
919 | 
920 | if (!test->results[0]) {
921 | warnx("First result slot empty!");
922 | return;
923 | }
924 | 
925 | sz = sizeof(*reply) + test->req->n_conns * sizeof(reply->res[0]);
926 | reply = calloc(1, sz);
927 | memcpy(reply, test->results[0], sizeof(*reply));
928 | 
929 | for (i = 0; i < test->req->n_conns; i++) {
930 | struct kpm_test_result *res = NULL;
931 | struct kpm_test_results *rmsg;
932 | __u32 worker_id, conn_id;
933 | 
934 | worker_id = test->req->specs[i].worker_id;
935 | conn_id = test->req->specs[i].connection_id;
936 | rmsg = test->results[worker_id - test->min_worker_id];
937 | if (!rmsg) {
938 | warnx("No results for worker %d", worker_id);
939 | goto out;
940 | }
941 | for (j = 0; j < rmsg->n_conns; j++) {
942 | if (rmsg->res[j].connection_id == conn_id) {
943 | res = &rmsg->res[j];
944 | break;
945 | }
946 | }
947 | if (!res) {
948 | warnx("No results for connection %d", conn_id);
949 | goto out;
950 | }
951 | 
952 | memcpy(&reply->res[i], res, sizeof(*res));
953 | }
954 | 
955 | kpm_dbg("Results sent");
956 | kpm_send(self->main_sock, &reply->hdr, sz, KPM_MSG_TYPE_TEST_RESULT);
957 | 
958 | out:
959 | free(reply);
960 | }
961 | 
962 | static void
963 | session_wmsg_test(struct session_state *self, struct kpm_header *hdr)
964 | {
965 | struct kpm_test_results *msg = (void *)hdr;
966 | __u32 worker_id = msg->res[0].worker_id;
967 | struct test *test;
968 | 
969 | test = session_find_test_by_id(self, msg->test_id);
970 | if (!test) {
971 | warnx("Failed to find test for result");
972 | return;
973 | }
974 | test->workers_done++;
975 | if (test->results[worker_id - test->min_worker_id])
976 | warnx("Results already reported for worker %d", worker_id);
977 | test->results[worker_id - test->min_worker_id] = kpm_msg_dup(&msg->hdr);
978 | kpm_dbg("Results received %d/%d",
979 | test->workers_done, 
test->workers_total); 980 | 981 | if (test->workers_done == test->workers_total) 982 | session_results_assemble(self, test); 983 | } 984 | 985 | static void session_handle_worker(struct session_state *self, int fd) 986 | { 987 | struct kpm_header *hdr; 988 | 989 | hdr = kpm_receive(fd); 990 | if (!hdr) { 991 | warnx("worker recv empty"); 992 | self->quit = 1; 993 | return; 994 | } 995 | __kpm_cmd_dbg_start("worker", hdr); 996 | 997 | switch (hdr->type) { 998 | case KPM_MSG_WORKER_TEST_RESULT: 999 | session_wmsg_test(self, hdr); 1000 | break; 1001 | default: 1002 | warnx("Unknown worker message type: %d", hdr->type); 1003 | self->quit = 1; 1004 | break; 1005 | } 1006 | 1007 | __kpm_cmd_dbg_end("worker", hdr); 1008 | free(hdr); 1009 | } 1010 | 1011 | static void session_handle_accept_sock(struct session_state *self) 1012 | { 1013 | struct sockaddr_in6 sockaddr; 1014 | socklen_t addrlen; 1015 | int cfd; 1016 | 1017 | __kpm_trace(">>", "accept"); 1018 | 1019 | addrlen = sizeof(sockaddr); 1020 | cfd = accept(self->tcp_sock, (void *)&sockaddr, &addrlen); 1021 | if (cfd < 0) 1022 | warn("Failed to accept conn"); 1023 | else 1024 | session_new_conn(self, cfd); 1025 | } 1026 | 1027 | static void server_session_loop(int fd) 1028 | { 1029 | struct session_state self = { .main_sock = fd, }; 1030 | struct epoll_event ev = {}, events[32]; 1031 | struct connection *conn, *next; 1032 | unsigned char j; 1033 | int i; 1034 | 1035 | /* Initialize the data buffer we send/receive, it must match on both 1036 | * ends, this is how we catch data corruption (ekhm kTLS..). 1037 | * 1038 | * We need to do this before initializing TX buffers with the pattern 1039 | * (e.g., devmem). 1040 | */ 1041 | for (i = 0, j = 0; i < (int)ARRAY_SIZE(patbuf); i++, j++) { 1042 | j = j ?: 1; 1043 | patbuf[i] = j; 1044 | } 1045 | 1046 | list_head_init(&self.connections); 1047 | list_head_init(&self.workers); 1048 | list_head_init(&self.tests); 1049 | 1050 | self.epollfd = epoll_create1(0); 1051 | if (self.epollfd < 0) 1052 | err(1, "Failed to create epoll"); 1053 | 1054 | ev.events = EPOLLIN; 1055 | ev.data.fd = fd; 1056 | if (epoll_ctl(self.epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) 1057 | err(2, "Failed to init epoll"); 1058 | 1059 | while (!self.quit) { 1060 | int nfds; 1061 | 1062 | nfds = epoll_wait(self.epollfd, events, ARRAY_SIZE(events), -1); 1063 | if (nfds < 0) 1064 | err(3, "Failed to epoll"); 1065 | 1066 | for (i = 0; i < nfds; i++) { 1067 | struct epoll_event *e = &events[i]; 1068 | 1069 | if (e->data.fd == self.main_sock) 1070 | session_handle_main_sock(&self); 1071 | else if (e->data.fd == self.tcp_sock) 1072 | session_handle_accept_sock(&self); 1073 | else 1074 | session_handle_worker(&self, e->data.fd); 1075 | } 1076 | } 1077 | 1078 | kpm_dbg("exiting!"); 1079 | 1080 | list_for_each_safe(&self.connections, conn, next, connections) { 1081 | close(conn->fd); 1082 | list_del(&conn->connections); 1083 | free(conn); 1084 | } 1085 | if (self.tcp_sock && self.rx_mode == KPM_RX_MODE_DEVMEM) 1086 | devmem_teardown(&self.devmem); 1087 | if (!self.tcp_sock && self.tx_mode == KPM_TX_MODE_DEVMEM) 1088 | devmem_teardown_tx(&self.devmem); 1089 | if (self.tcp_sock && self.iou && self.rx_mode == KPM_RX_MODE_SOCKET_ZEROCOPY) 1090 | iou_zerocopy_rx_teardown(&self.iou_state); 1091 | } 1092 | 1093 | static NORETURN void server_session(int fd) 1094 | { 1095 | if (!kpm_xchg_hello(fd, NULL)) 1096 | server_session_loop(fd); 1097 | close(fd); 1098 | exit(0); 1099 | } 1100 | 1101 | struct server_session * 1102 | server_session_spawn(int fd, struct 
sockaddr_in6 *addr, socklen_t *addrlen) 1103 | { 1104 | struct server_session *ses; 1105 | 1106 | if (get_nprocs() > KPERF_MAX_CPUS) { 1107 | warnx("Too many CPUs in the system: %d, proto has max of %d", 1108 | get_nprocs(), KPERF_MAX_CPUS); 1109 | return NULL; 1110 | } 1111 | 1112 | ses = malloc(sizeof(*ses)); 1113 | if (!ses) { 1114 | close(fd); 1115 | return NULL; 1116 | } 1117 | memset(ses, 0, sizeof(*ses)); 1118 | 1119 | ses->pid = fork(); 1120 | if (ses->pid) 1121 | return ses; 1122 | 1123 | free(ses); 1124 | server_session(fd); 1125 | } 1126 | -------------------------------------------------------------------------------- /devmem.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. and affiliates */ 3 | 4 | #define _GNU_SOURCE 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | #include "server.h" 31 | #include "proto_dbg.h" 32 | 33 | #ifdef USE_CUDA 34 | #include 35 | #include 36 | 37 | #ifdef CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE 38 | #define CUDA_FLAGS CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE 39 | #else 40 | #define CUDA_FLAGS 0 41 | #endif 42 | #endif 43 | 44 | extern unsigned char patbuf[KPM_MAX_OP_CHUNK + PATTERN_PERIOD + 1]; 45 | 46 | static int steering_rule_loc = -1; 47 | 48 | static int ethtool(const char *ifname, void *data) 49 | { 50 | struct ifreq ifr = {}; 51 | int ret; 52 | 53 | strcat(ifr.ifr_ifrn.ifrn_name, ifname); 54 | ifr.ifr_ifru.ifru_data = data; 55 | 56 | int fd = socket(AF_UNIX, SOCK_DGRAM, 0); 57 | if (fd < 0) 58 | return fd; 59 | 60 | ret = ioctl(fd, SIOCETHTOOL, &ifr); 61 | close(fd); 62 | return ret; 63 | } 64 | 65 | static void reset_flow_steering(const char *ifname) 66 | { 67 | struct ethtool_rxnfc del; 68 | 69 | if (steering_rule_loc < 0) 70 | return; 71 | 72 | del.cmd = ETHTOOL_SRXCLSRLDEL; 73 | del.fs.location = steering_rule_loc; 74 | 75 | ethtool(ifname, &del); 76 | 77 | steering_rule_loc = -1; 78 | } 79 | 80 | static int find_free_rule_loc(const char *ifname, int rule_cnt) 81 | { 82 | struct ethtool_rxnfc cnt = {}; 83 | struct ethtool_rxnfc *rules; 84 | int free_loc = 0; 85 | 86 | cnt.cmd = ETHTOOL_GRXCLSRLCNT; 87 | if (ethtool(ifname, &cnt) < 0) 88 | return -1; 89 | 90 | rules = calloc(1, sizeof(*rules) + (cnt.rule_cnt * sizeof(__u32))); 91 | if (!rules) 92 | return -1; 93 | 94 | rules->cmd = ETHTOOL_GRXCLSRLALL; 95 | rules->rule_cnt = cnt.rule_cnt; 96 | if (ethtool(ifname, rules) < 0) 97 | goto free_rules; 98 | 99 | while (true) { 100 | bool used = false; 101 | for (__u32 i = 0; i < rules->rule_cnt; i++) 102 | if ((unsigned int)free_loc == rules->rule_locs[i]) { 103 | used = true; 104 | break; 105 | } 106 | if (!used) 107 | break; 108 | free_loc++; 109 | } 110 | 111 | free(rules); 112 | return free_loc; 113 | 114 | free_rules: 115 | free(rules); 116 | return -1; 117 | } 118 | 119 | static int add_steering_rule(struct sockaddr_in6 *server_sin, 120 | const char *ifname, int rss_context) 121 | { 122 | struct ethtool_rxnfc add = {}; 123 | struct ethtool_rxnfc cnt = {}; 124 | int ret; 125 | 126 | add.cmd = ETHTOOL_SRXCLSRLINS; 127 | add.rss_context = rss_context; 128 | 129 | if (IN6_IS_ADDR_V4MAPPED(&server_sin->sin6_addr)) { 130 | add.fs.flow_type = TCP_V4_FLOW; 131 | 
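/* For a v4-mapped destination the IPv4 address sits in the last 32 bits
 * of sin6_addr (s6_addr32[3]); the rule matches dst IP and dst port
 * exactly, everything else is wildcarded. */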
memcpy(&add.fs.h_u.tcp_ip4_spec.ip4dst, 132 | &server_sin->sin6_addr.s6_addr32[3], 4); 133 | memcpy(&add.fs.h_u.tcp_ip4_spec.pdst, 134 | &server_sin->sin6_port, 2); 135 | 136 | add.fs.m_u.tcp_ip4_spec.ip4dst = 0xffffffff; 137 | add.fs.m_u.tcp_ip4_spec.pdst = 0xffff; 138 | } else { 139 | add.fs.flow_type = TCP_V6_FLOW; 140 | memcpy(add.fs.h_u.tcp_ip6_spec.ip6dst, &server_sin->sin6_addr, 141 | 16); 142 | memcpy(&add.fs.h_u.tcp_ip6_spec.pdst, &server_sin->sin6_port, 143 | 2); 144 | 145 | add.fs.m_u.tcp_ip6_spec.ip6dst[0] = 0xffffffff; 146 | add.fs.m_u.tcp_ip6_spec.ip6dst[1] = 0xffffffff; 147 | add.fs.m_u.tcp_ip6_spec.ip6dst[2] = 0xffffffff; 148 | add.fs.m_u.tcp_ip6_spec.ip6dst[3] = 0xffffffff; 149 | add.fs.m_u.tcp_ip6_spec.pdst = 0xffff; 150 | } 151 | 152 | add.fs.flow_type |= FLOW_RSS; 153 | 154 | cnt.cmd = ETHTOOL_GRXCLSRLCNT; 155 | ret = ethtool(ifname, &cnt); 156 | if (ret) 157 | return ret; 158 | 159 | if (cnt.data & RX_CLS_LOC_SPECIAL) 160 | add.fs.location = RX_CLS_LOC_ANY; 161 | else if (cnt.rule_cnt) { 162 | ret = find_free_rule_loc(ifname, cnt.rule_cnt); 163 | if (ret < 0) { 164 | warnx("Failed to find free steering rule loc"); 165 | return -1; 166 | } 167 | add.fs.location = ret; 168 | } 169 | 170 | ret = ethtool(ifname, &add); 171 | if (ret) 172 | return ret; 173 | 174 | steering_rule_loc = add.fs.location; 175 | 176 | return 0; 177 | } 178 | 179 | static int rss_context_delete(char *ifname, int rss_context) 180 | { 181 | struct ethtool_rxfh set = {}; 182 | 183 | set.cmd = ETHTOOL_SRSSH; 184 | set.rss_context = rss_context; 185 | set.indir_size = 0; 186 | 187 | if (ethtool(ifname, &set) < 0) { 188 | warn("ethtool failed to delete RSS context %u", rss_context); 189 | return -1; 190 | } 191 | 192 | return 0; 193 | } 194 | 195 | static int rss_context_equal(char *ifname, int start_queue, int num_queues, 196 | struct sockaddr_in6 *addr) 197 | { 198 | struct ethtool_rxfh get = {}; 199 | struct ethtool_rxfh *set; 200 | __u32 indir_bytes; 201 | int rss_context; 202 | int queue; 203 | int ret; 204 | 205 | get.cmd = ETHTOOL_GRSSH; 206 | if (ethtool(ifname, &get) < 0) { 207 | warn("ethtool failed to get RSS context"); 208 | return -1; 209 | } 210 | 211 | indir_bytes = get.indir_size * sizeof(get.rss_config[0]); 212 | 213 | set = calloc(1, sizeof(*set) + indir_bytes); 214 | if (!set) { 215 | warn("failed to allocate memory"); 216 | return -1; 217 | } 218 | 219 | set->cmd = ETHTOOL_SRSSH; 220 | set->rss_context = ETH_RXFH_CONTEXT_ALLOC; 221 | set->indir_size = get.indir_size; 222 | 223 | queue = start_queue; 224 | for (__u32 i = 0; i < get.indir_size; i++) { 225 | set->rss_config[i] = queue++; 226 | if (queue >= start_queue + num_queues) 227 | queue = start_queue; 228 | } 229 | 230 | if (ethtool(ifname, set) < 0) { 231 | warn("ethtool failed to create RSS context"); 232 | ret = -1; 233 | goto free_set; 234 | } 235 | 236 | rss_context = set->rss_context; 237 | 238 | if (add_steering_rule(addr, ifname, rss_context) < 0) { 239 | warn("Failed to add rule to RSS context"); 240 | ret = -1; 241 | goto delete_context; 242 | } 243 | 244 | free(set); 245 | 246 | return rss_context; 247 | 248 | delete_context: 249 | rss_context_delete(ifname, rss_context); 250 | 251 | free_set: 252 | free(set); 253 | 254 | return ret; 255 | } 256 | 257 | static int rss_equal(const char *ifname, int max_queue) 258 | { 259 | struct ethtool_rxfh_indir get = {}; 260 | struct ethtool_rxfh_indir *set; 261 | int queue = 0; 262 | int ret; 263 | 264 | get.cmd = ETHTOOL_GRXFHINDIR; 265 | if (ethtool(ifname, &get) < 0) 266 | 
return -1; 267 | 268 | set = malloc(sizeof(*set) + get.size * sizeof(__u32)); 269 | if (!set) 270 | return -1; 271 | 272 | for (__u32 i = 0; i < get.size; i++) { 273 | set->ring_index[i] = queue++; 274 | if (queue >= max_queue) 275 | queue = 0; 276 | } 277 | 278 | set->cmd = ETHTOOL_SRXFHINDIR; 279 | set->size = get.size; 280 | ret = ethtool(ifname, set); 281 | 282 | free(set); 283 | return ret; 284 | } 285 | 286 | static int rxq_num(int ifindex) 287 | { 288 | struct ethtool_channels_get_req *req; 289 | struct ethtool_channels_get_rsp *rsp; 290 | struct ynl_error yerr; 291 | struct ynl_sock *ys; 292 | int num = -1; 293 | 294 | ys = ynl_sock_create(&ynl_ethtool_family, &yerr); 295 | if (!ys) { 296 | warnx("Failed to setup YNL socket: %s", yerr.msg); 297 | return -1; 298 | } 299 | 300 | req = ethtool_channels_get_req_alloc(); 301 | ethtool_channels_get_req_set_header_dev_index(req, ifindex); 302 | rsp = ethtool_channels_get(ys, req); 303 | if (rsp) 304 | num = rsp->rx_count + rsp->combined_count; 305 | else 306 | warnx("ethtool_channels_get: %s", ys->err.msg); 307 | ethtool_channels_get_req_free(req); 308 | ethtool_channels_get_rsp_free(rsp); 309 | ynl_sock_destroy(ys); 310 | 311 | return num; 312 | } 313 | 314 | static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, 315 | struct netdev_queue_id *queues, 316 | unsigned int n_queue_index, struct ynl_sock *ys) 317 | { 318 | struct netdev_bind_rx_req *req; 319 | struct netdev_bind_rx_rsp *rsp; 320 | int ret = -1; 321 | 322 | req = netdev_bind_rx_req_alloc(); 323 | if (!req) 324 | return -1; 325 | 326 | netdev_bind_rx_req_set_ifindex(req, ifindex); 327 | netdev_bind_rx_req_set_fd(req, dmabuf_fd); 328 | __netdev_bind_rx_req_set_queues(req, queues, n_queue_index); 329 | 330 | rsp = netdev_bind_rx(ys, req); 331 | if (!rsp) { 332 | warnx("netdev_bind_rx: %s", ys->err.msg); 333 | goto out; 334 | } 335 | 336 | if (!rsp->_present.id) { 337 | warnx("id not present"); 338 | goto out; 339 | } 340 | 341 | ret = rsp->id; 342 | 343 | out: 344 | if (req) 345 | netdev_bind_rx_req_free(req); 346 | if (rsp) 347 | netdev_bind_rx_rsp_free(rsp); 348 | 349 | return ret; 350 | } 351 | 352 | static int bind_tx_queue(unsigned int ifindex, unsigned int dmabuf_fd, 353 | struct ynl_sock *ys) 354 | { 355 | struct netdev_bind_tx_req *req = NULL; 356 | struct netdev_bind_tx_rsp *rsp = NULL; 357 | int ret; 358 | 359 | req = netdev_bind_tx_req_alloc(); 360 | if (!req) { 361 | warnx("netdev_bind_tx_req_alloc() failed"); 362 | return -1; 363 | } 364 | netdev_bind_tx_req_set_ifindex(req, ifindex); 365 | netdev_bind_tx_req_set_fd(req, dmabuf_fd); 366 | 367 | rsp = netdev_bind_tx(ys, req); 368 | if (!rsp) { 369 | warnx("netdev_bind_tx"); 370 | ret = -1; 371 | goto free_req; 372 | } 373 | 374 | if (!rsp->_present.id) { 375 | warnx("id not present"); 376 | ret = -1; 377 | goto free_rsp; 378 | } 379 | 380 | ret = rsp->id; 381 | netdev_bind_tx_req_free(req); 382 | netdev_bind_tx_rsp_free(rsp); 383 | 384 | return ret; 385 | 386 | free_rsp: 387 | netdev_bind_tx_rsp_free(rsp); 388 | free_req: 389 | netdev_bind_tx_req_free(req); 390 | return ret; 391 | } 392 | 393 | #define UDMABUF_LIMIT_PATH "/sys/module/udmabuf/parameters/size_limit_mb" 394 | 395 | static int udmabuf_check_size(size_t size_mb) 396 | { 397 | size_t limit_mb = 0; 398 | int ret = 0; 399 | FILE *f; 400 | 401 | f = fopen(UDMABUF_LIMIT_PATH, "r"); 402 | if (f) { 403 | fscanf(f, "%lu", &limit_mb); 404 | if (size_mb > limit_mb) { 405 | warnx( 406 | "udmabuf size limit is too small (%lu > %lu), update %s", 407 | 
size_mb, limit_mb, UDMABUF_LIMIT_PATH); 408 | ret = -EINVAL; 409 | } 410 | fclose(f); 411 | } 412 | 413 | return ret; 414 | } 415 | 416 | static struct memory_buffer *udmabuf_alloc(size_t size) 417 | { 418 | struct udmabuf_create create; 419 | struct memory_buffer *mem; 420 | int ret; 421 | 422 | mem = calloc(1, sizeof(*mem)); 423 | if (!mem) 424 | return NULL; 425 | 426 | ret = udmabuf_check_size(size / 1024 / 1024); 427 | if (ret < 0) { 428 | warnx("Failed: udmabuf_check_size(), ret=%d", ret); 429 | goto free_mem; 430 | } 431 | 432 | mem->devfd = open("/dev/udmabuf", O_RDWR); 433 | if (mem->devfd < 0) { 434 | warn("Failed to open /dev/udmabuf"); 435 | goto free_mem; 436 | } 437 | 438 | mem->memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); 439 | if (mem->memfd < 0) { 440 | warn("memfd_create() failed"); 441 | goto close_devfd; 442 | } 443 | 444 | ret = fcntl(mem->memfd, F_ADD_SEALS, F_SEAL_SHRINK); 445 | if (ret < 0) { 446 | warn("fcntl() failed"); 447 | goto close_memfd; 448 | } 449 | 450 | ret = ftruncate(mem->memfd, size); 451 | if (ret < 0) { 452 | warn("ftruncate() failed"); 453 | goto close_memfd; 454 | } 455 | 456 | memset(&create, 0, sizeof(create)); 457 | 458 | create.memfd = mem->memfd; 459 | create.offset = 0; 460 | create.size = size; 461 | 462 | mem->fd = ioctl(mem->devfd, UDMABUF_CREATE, &create); 463 | if (mem->fd < 0) { 464 | warn("ioctl(mem->devfd) failed"); 465 | goto close_memfd; 466 | } 467 | 468 | mem->size = size; 469 | mem->provider = MEMORY_PROVIDER_HOST; 470 | mem->buf_mem = mmap(NULL, mem->size, PROT_READ | PROT_WRITE, 471 | MAP_SHARED, mem->fd, 0); 472 | 473 | if (mem->buf_mem == MAP_FAILED) { 474 | ret = -errno; 475 | goto close_dmabuf_fd; 476 | } 477 | 478 | return mem; 479 | 480 | close_dmabuf_fd: 481 | close(mem->fd); 482 | close_memfd: 483 | close(mem->memfd); 484 | close_devfd: 485 | close(mem->devfd); 486 | free_mem: 487 | free(mem); 488 | return NULL; 489 | } 490 | 491 | static void udmabuf_free(struct memory_buffer *mem) 492 | { 493 | if (mem->buf_mem) { 494 | close(mem->fd); 495 | close(mem->memfd); 496 | close(mem->devfd); 497 | munmap(mem->buf_mem, mem->size); 498 | } 499 | free(mem); 500 | } 501 | 502 | static void inet_to_inet6(struct sockaddr *addr, struct sockaddr_in6 *out) 503 | { 504 | out->sin6_addr.s6_addr32[3] = ((struct sockaddr_in *)addr)->sin_addr.s_addr; 505 | out->sin6_addr.s6_addr32[0] = 0; 506 | out->sin6_addr.s6_addr32[1] = 0; 507 | out->sin6_addr.s6_addr16[4] = 0; 508 | out->sin6_addr.s6_addr16[5] = 0xffff; 509 | out->sin6_family = AF_INET6; 510 | } 511 | 512 | static int find_iface(struct sockaddr_in6 *addr, char ifname[IFNAMSIZ]) 513 | { 514 | struct ifaddrs *ifaddr, *ifa; 515 | struct sockaddr_in6 tmp; 516 | 517 | if (getifaddrs(&ifaddr) < 0) 518 | return -errno; 519 | 520 | for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { 521 | if (!ifa->ifa_addr) 522 | continue; 523 | 524 | if (ifa->ifa_addr->sa_family == AF_INET) 525 | inet_to_inet6(ifa->ifa_addr, &tmp); 526 | else if (ifa->ifa_addr->sa_family == AF_INET6) 527 | memcpy(&tmp, ifa->ifa_addr, sizeof(tmp)); 528 | else 529 | continue; 530 | 531 | if (!memcmp(&tmp.sin6_addr, &addr->sin6_addr, 532 | sizeof(tmp.sin6_addr))) { 533 | strncpy(ifname, ifa->ifa_name, IFNAMSIZ - 1); 534 | freeifaddrs(ifaddr); 535 | return if_nametoindex(ifname); 536 | } 537 | } 538 | 539 | freeifaddrs(ifaddr); 540 | return -ENODEV; 541 | } 542 | 543 | void udmabuf_memcpy_to_device(struct memory_buffer *dst, size_t off, 544 | void *src, int n) 545 | { 546 | struct dma_buf_sync sync = {}; 547 | 
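/* CPU access to a dma-buf is bracketed with DMA_BUF_IOCTL_SYNC
 * begin/end calls so the write is coherent from the device's view. */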
548 | sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE;
549 | ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync);
550 | 
551 | memcpy(dst->buf_mem + off, src, n);
552 | 
553 | sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
554 | ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync);
555 | }
556 | 
557 | static struct memory_provider udmabuf_memory_provider = {
558 | .alloc = udmabuf_alloc,
559 | .free = udmabuf_free,
560 | .memcpy_to_device = udmabuf_memcpy_to_device,
561 | };
562 | 
563 | static struct memory_provider *rxmp;
564 | static struct memory_provider *txmp;
565 | 
566 | #ifdef USE_CUDA
567 | 
568 | /* Length of str: 'XXXX:XX:XX' */
569 | #define MAX_BUS_ID_LEN 11
570 | 
571 | static int cuda_find_device(__u16 domain, __u8 bus, __u8 device)
572 | {
573 | char bus_id[MAX_BUS_ID_LEN];
574 | int devnum;
575 | int ret;
576 | 
577 | ret = snprintf(bus_id, MAX_BUS_ID_LEN, "%hx:%hhx:%hhx", domain, bus, device);
578 | if (ret < 0)
579 | return -EINVAL;
580 | 
581 | ret = cudaDeviceGetByPCIBusId(&devnum, bus_id);
582 | if (ret != cudaSuccess) {
583 | warnx("No CUDA device found %s", bus_id);
584 | return -EINVAL;
585 | }
586 | 
587 | return devnum;
588 | }
589 | 
590 | static int cuda_dev_init(struct pci_dev *dev)
591 | {
592 | struct cudaDeviceProp deviceProp;
593 | CUdevice cuda_dev;
594 | int devnum;
595 | int ret;
596 | int ok;
597 | 
598 | ret = cuInit(0);
599 | if (ret != CUDA_SUCCESS)
600 | return -1;
601 | 
602 | /* If the user did not specify a device, select any device */
603 | if (dev->domain == DEVICE_DOMAIN_ANY && dev->bus == DEVICE_BUS_ANY && dev->device == DEVICE_DEVICE_ANY) {
604 | devnum = 0;
605 | } else {
606 | devnum = cuda_find_device(dev->domain, dev->bus, dev->device);
607 | if (devnum < 0)
608 | return -1;
609 | }
610 | 
611 | ret = cuDeviceGet(&cuda_dev, devnum);
612 | if (ret != CUDA_SUCCESS)
613 | return -1;
614 | 
615 | ok = 0;
616 | ret = cuDeviceGetAttribute(&ok, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED,
617 | cuda_dev);
618 | if (ret != CUDA_SUCCESS || !ok) {
619 | if (!ok)
620 | warnx("CUDA device does not support dmabuf");
621 | return -1;
622 | }
623 | 
624 | ret = cudaSetDevice(devnum);
625 | if (ret != cudaSuccess) {
626 | warnx("cudaSetDevice() failed with error %d", ret);
627 | return -1;
628 | }
629 | 
630 | if (verbose >= 4) {
631 | cudaGetDeviceProperties(&deviceProp, devnum);
632 | fprintf(stderr, "cuda: tid %d selecting device %d (%s)\n", getpid(), devnum, deviceProp.name);
633 | }
634 | return 0;
635 | }
636 | 
637 | static struct memory_buffer *cuda_alloc(size_t size)
638 | {
639 | struct memory_buffer *mem;
640 | size_t page_size;
641 | int ret;
642 | 
643 | page_size = sysconf(_SC_PAGESIZE);
644 | if (size % page_size) {
645 | warnx("cuda memory size not aligned, size 0x%lx", size);
646 | return NULL;
647 | }
648 | 
649 | mem = calloc(1, sizeof(*mem));
650 | if (!mem)
651 | return NULL;
652 | mem->size = size;
653 | mem->provider = MEMORY_PROVIDER_CUDA;
654 | 
655 | ret = cudaMalloc((void *)&mem->buf_mem, size);
656 | if (ret != cudaSuccess)
657 | goto free_mem;
658 | 
659 | ret = cuMemGetHandleForAddressRange((void *)&mem->fd,
660 | ((CUdeviceptr)mem->buf_mem), size,
661 | CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
662 | CUDA_FLAGS);
663 | if (ret != CUDA_SUCCESS)
664 | goto free_cuda;
665 | 
666 | return mem;
667 | 
668 | free_cuda:
669 | if (cudaFree(mem->buf_mem) != cudaSuccess)
670 | warnx("cudaFree() failed");
671 | free_mem:
672 | free(mem);
673 | 
674 | return NULL;
675 | }
676 | 
677 | static void cuda_free(struct memory_buffer *mem)
678 | {
679 | if (mem->fd)
680 | close(mem->fd);
681 | 
682 | if
(mem->buf_mem) 683 | cudaFree(mem->buf_mem); 684 | 685 | free(mem); 686 | } 687 | 688 | void cuda_memcpy_to_device(struct memory_buffer *dst, size_t off, 689 | void *src, int n) 690 | { 691 | int ret; 692 | 693 | ret = cudaMemcpy((void *)(dst->buf_mem + off), src, n, 694 | cudaMemcpyHostToDevice); 695 | if (ret != cudaSuccess) 696 | warnx("cudaMemcpy() failed"); 697 | } 698 | 699 | static struct memory_provider cuda_memory_provider = { 700 | .dev_init = cuda_dev_init, 701 | .alloc = cuda_alloc, 702 | .free = cuda_free, 703 | .memcpy_to_device = cuda_memcpy_to_device, 704 | }; 705 | #endif 706 | 707 | static struct memory_provider *get_memory_provider(enum memory_provider_type provider) 708 | { 709 | switch (provider) { 710 | case MEMORY_PROVIDER_HOST: 711 | return &udmabuf_memory_provider; 712 | #ifdef USE_CUDA 713 | case MEMORY_PROVIDER_CUDA: 714 | return &cuda_memory_provider; 715 | #endif 716 | default: 717 | warn("invalid provider: %d", provider); 718 | return NULL; 719 | } 720 | } 721 | 722 | int reserve_queues(int fd, int num_queues, char out_ifname[IFNAMSIZ], 723 | int *out_ifindex, int *out_queue_id, int *out_rss_context) 724 | { 725 | struct sockaddr_in6 addr; 726 | char ifname[IFNAMSIZ]; 727 | int max_kernel_queue; 728 | socklen_t optlen; 729 | int rss_context; 730 | int ifindex; 731 | int ret = 0; 732 | int rxqn; 733 | 734 | if (num_queues <= 0) { 735 | warnx("Invalid number of RX queues: %u", num_queues); 736 | return -1; 737 | } 738 | 739 | optlen = sizeof(addr); 740 | if (getsockname(fd, (struct sockaddr *)&addr, &optlen) < 0) { 741 | warn("Failed to query socket address"); 742 | return -1; 743 | } 744 | 745 | if (addr.sin6_family == AF_INET) 746 | inet_to_inet6((void *)&addr, &addr); 747 | 748 | ifindex = find_iface(&addr, ifname); 749 | if (ifindex < 0) { 750 | warnx("Failed to resolve ifindex: %s", strerror(-ifindex)); 751 | return -1; 752 | } 753 | 754 | rxqn = rxq_num(ifindex); 755 | if (rxqn < 2) { 756 | warnx("Invalid number of queues: %d", rxqn); 757 | return -1; 758 | } 759 | 760 | if (num_queues >= rxqn - 1) { 761 | warnx("Invalid number of RX queues (%u) requested (max: %u)", 762 | num_queues, rxqn - 1); 763 | return -1; 764 | } 765 | 766 | max_kernel_queue = rxqn - num_queues; 767 | 768 | reset_flow_steering(ifname); 769 | if (rss_equal(ifname, max_kernel_queue)) { 770 | warnx("Failed to setup RSS"); 771 | return -1; 772 | } 773 | 774 | rss_context = rss_context_equal(ifname, max_kernel_queue, 775 | num_queues, &addr); 776 | if (rss_context < 0) { 777 | warnx("Failed to setup RSS context"); 778 | ret = -1; 779 | goto undo_rss; 780 | } 781 | 782 | memcpy(out_ifname, ifname, IFNAMSIZ); 783 | *out_ifindex = ifindex; 784 | *out_queue_id = max_kernel_queue; 785 | *out_rss_context = rss_context; 786 | 787 | return ret; 788 | 789 | undo_rss: 790 | rss_equal(ifname, rxqn); 791 | 792 | return ret; 793 | } 794 | 795 | void unreserve_queues(char *ifname, int rss_context) 796 | { 797 | int ifindex; 798 | int rxqn; 799 | 800 | reset_flow_steering(ifname); 801 | rss_context_delete(ifname, rss_context); 802 | ifindex = if_nametoindex(ifname); 803 | if (ifindex > 0) { 804 | rxqn = rxq_num(ifindex); 805 | if (rxqn > 0) 806 | rss_equal(ifname, rxqn); 807 | } 808 | } 809 | 810 | /* Setup Devmem RX */ 811 | int devmem_setup(struct session_state_devmem *devmem, int fd, 812 | size_t dmabuf_rx_size_mb, int num_queues, 813 | enum memory_provider_type provider, 814 | struct pci_dev *dev) 815 | { 816 | struct netdev_queue_id *queues; 817 | struct ynl_error yerr; 818 | int max_kernel_queue; 
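/* devmem RX setup sequence: carve num_queues RX queues out of the default
 * RSS set (reserve_queues), back them with a dmabuf allocated from the
 * chosen provider, then bind the dmabuf to those queues over the netdev
 * netlink family (bind_rx_queue). */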
819 | int ifindex;
820 | int ret;
821 | 
822 | ret = reserve_queues(fd, num_queues, devmem->ifname, &ifindex,
823 | &max_kernel_queue, &devmem->rss_context);
824 | if (ret)
825 | return ret;
826 | 
827 | rxmp = get_memory_provider(provider);
828 | if (!rxmp) {
829 | ret = -1;
830 | goto undo_queues;
831 | }
832 | 
833 | devmem->ys = ynl_sock_create(&ynl_netdev_family, &yerr);
834 | if (!devmem->ys) {
835 | warnx("Failed to setup YNL socket: %s", yerr.msg);
836 | ret = -1;
837 | goto undo_queues;
838 | }
839 | if (rxmp->dev_init && rxmp->dev_init(dev) < 0) {
840 | ret = -1;
841 | goto sock_destroy;
842 | }
843 | 
844 | devmem->mem = rxmp->alloc(dmabuf_rx_size_mb * 1024 * 1024);
845 | if (!devmem->mem) {
846 | warnx("Failed to allocate memory");
847 | ret = -1;
848 | goto sock_destroy;
849 | }
850 | 
851 | queues = calloc(num_queues, sizeof(*queues));
852 | if (!queues) {
853 | warn("Failed to allocate memory for queues");
854 | ret = -1;
855 | goto free_memory;
856 | }
857 | 
858 | for (int i = 0; i < num_queues; i++) {
859 | queues[i]._present.type = 1;
860 | queues[i]._present.id = 1;
861 | queues[i].type = NETDEV_QUEUE_TYPE_RX;
862 | queues[i].id = max_kernel_queue + i;
863 | }
864 | 
865 | devmem->mem->dmabuf_id = bind_rx_queue(ifindex, devmem->mem->fd, queues,
866 | num_queues, devmem->ys);
867 | if (devmem->mem->dmabuf_id < 0) {
868 | warnx("Failed to bind RX queue");
869 | ret = -1;
870 | goto free_queues;
871 | }
872 | 
873 | return 0;
874 | 
875 | free_queues:
876 | free(queues);
877 | free_memory:
878 | rxmp->free(devmem->mem);
879 | sock_destroy:
880 | ynl_sock_destroy(devmem->ys);
881 | devmem->ys = NULL;
882 | undo_queues:
883 | unreserve_queues(devmem->ifname, devmem->rss_context);
884 | 
885 | return ret;
886 | }
887 | 
888 | int devmem_teardown(struct session_state_devmem *devmem)
889 | {
890 | unreserve_queues(devmem->ifname, devmem->rss_context);
891 | if (devmem->ys)
892 | ynl_sock_destroy(devmem->ys);
893 | if (rxmp)
894 | rxmp->free(devmem->mem);
895 | return 0;
896 | }
897 | 
898 | int devmem_release_tokens(int fd, struct connection_devmem *conn)
899 | {
900 | int ret;
901 | 
902 | if (!conn->rxtok_len)
903 | return 0;
904 | 
905 | ret = setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &conn->rxtok[0],
906 | sizeof(struct dmabuf_token) * conn->rxtok_len);
907 | 
908 | if (ret >= 0 && ret != conn->rxtok_len)
909 | warnx("requested to release %d tokens, got %d", conn->rxtok_len,
910 | ret);
911 | 
912 | conn->rxtok_len = 0;
913 | 
914 | return ret;
915 | }
916 | 
917 | static int devmem_validate_host(struct memory_buffer *mem, __u64 offset,
918 | __u32 pat_start, __u32 size)
919 | {
920 | struct dma_buf_sync sync = {};
921 | void *pat = NULL;
922 | int ret = 0;
923 | 
924 | sync.flags = DMA_BUF_SYNC_START;
925 | ioctl(mem->fd, DMA_BUF_IOCTL_SYNC, &sync);
926 | 
927 | pat = &patbuf[pat_start];
928 | ret = memcmp(pat, mem->buf_mem + offset, size);
929 | 
930 | sync.flags = DMA_BUF_SYNC_END;
931 | ioctl(mem->fd, DMA_BUF_IOCTL_SYNC, &sync);
932 | 
933 | if (ret) {
934 | warnx("Data corruption %d %d %d %d",
935 | *(char *)mem->buf_mem, *(char *)pat, size, pat_start);
936 | return -1;
937 | }
938 | 
939 | return 0;
940 | }
941 | 
942 | static int devmem_validate_cuda(unsigned char *rxbuf, struct memory_buffer *mem,
943 | __u64 offset, __u32 pat_start, __u32 size)
944 | {
945 | #ifdef USE_CUDA
946 | void *pat = NULL;
947 | int ret = 0;
948 | 
949 | ret = cudaMemcpy(rxbuf, (void *)(mem->buf_mem + offset), size,
950 | cudaMemcpyDeviceToHost);
951 | if (ret != cudaSuccess) {
952 | warnx("cudaMemcpyDeviceToHost failed rc=%d", ret);
953 | return -1;
954 | }
955 | 
956 | pat = &patbuf[pat_start];
957 | ret = memcmp(pat, rxbuf, size);
958 | if (ret) {
959 | warnx("Data corruption %d %d %d %d",
960 | *(char *)rxbuf, *(char *)pat, size, pat_start);
961 | return -1;
962 | }
963 | #endif
964 | 
965 | return 0;
966 | }
967 | 
968 | static int devmem_validate_recv(unsigned char *rxbuf, struct memory_buffer *mem,
969 | struct cmsghdr *cm, int rep, __u64 *tot_recv)
970 | {
971 | struct dmabuf_cmsg *dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
972 | size_t start = 0;
973 | int ret = 0;
974 | 
975 | start = *tot_recv % PATTERN_PERIOD;
976 | if (start + dmabuf_cmsg->frag_size > ARRAY_SIZE(patbuf)) {
977 | warnx("dmabuf fragment size too big rep=%d", rep);
978 | return -1;
979 | }
980 | 
981 | switch (mem->provider) {
982 | case MEMORY_PROVIDER_HOST:
983 | ret = devmem_validate_host(mem, dmabuf_cmsg->frag_offset, start,
984 | dmabuf_cmsg->frag_size);
985 | break;
986 | case MEMORY_PROVIDER_CUDA:
987 | ret = devmem_validate_cuda(rxbuf, mem, dmabuf_cmsg->frag_offset,
988 | start, dmabuf_cmsg->frag_size);
989 | break;
990 | }
991 | if (ret) {
992 | warnx("devmem recv validation failed rep=%d rc=%d", rep, ret);
993 | return -1;
994 | }
995 | 
996 | *tot_recv += dmabuf_cmsg->frag_size;
997 | return ret;
998 | }
999 | 
1000 | static int devmem_handle_token(int fd, struct connection_devmem *conn,
1001 | struct cmsghdr *cm)
1002 | {
1003 | struct dmabuf_cmsg *dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
1004 | struct dmabuf_token *token;
1005 | 
1006 | if (cm->cmsg_type == SO_DEVMEM_LINEAR) {
1007 | warnx("received linear chunk, flow steering error?");
1008 | return -EFAULT;
1009 | }
1010 | 
1011 | if (conn->rxtok_len == ARRAY_SIZE(conn->rxtok)) {
1012 | int ret;
1013 | 
1014 | ret = devmem_release_tokens(fd, conn);
1015 | if (ret < 0)
1016 | return ret;
1017 | }
1018 | 
1019 | token = &conn->rxtok[conn->rxtok_len++];
1020 | token->token_start = dmabuf_cmsg->frag_token;
1021 | token->token_count = 1;
1022 | 
1023 | return 0;
1024 | }
1025 | 
1026 | ssize_t devmem_recv(int fd, struct connection_devmem *conn,
1027 | unsigned char *rxbuf, size_t chunk,
1028 | struct memory_buffer *mem, int rep, __u64 tot_recv,
1029 | bool validate)
1030 | {
1031 | struct msghdr msg = {};
1032 | struct iovec iov = {
1033 | .iov_base = NULL,
1034 | .iov_len = chunk,
1035 | };
1036 | struct cmsghdr *cm;
1037 | int tokens = 0;
1038 | ssize_t n;
1039 | int ret;
1040 | 
1041 | msg.msg_iov = &iov;
1042 | msg.msg_iovlen = 1;
1043 | msg.msg_control = conn->ctrl_data;
1044 | msg.msg_controllen = sizeof(conn->ctrl_data);
1045 | n = recvmsg(fd, &msg, MSG_DONTWAIT | MSG_SOCK_DEVMEM);
1046 | if (n < 0)
1047 | return n;
1048 | 
1049 | for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
1050 | if (cm->cmsg_level != SOL_SOCKET ||
1051 | (cm->cmsg_type != SO_DEVMEM_DMABUF &&
1052 | cm->cmsg_type != SO_DEVMEM_LINEAR))
1053 | continue;
1054 | 
1055 | ret = devmem_handle_token(fd, conn, cm);
1056 | if (ret < 0)
1057 | return ret;
1058 | 
1059 | if (validate) {
1060 | ret = devmem_validate_recv(rxbuf, mem, cm, rep,
1061 | &tot_recv);
1062 | if (ret < 0)
1063 | return ret;
1064 | }
1065 | 
1066 | tokens++;
1067 | }
1068 | 
1069 | if (!tokens) {
1070 | warnx("devmem recvmsg returned no tokens");
1071 | errno = EFAULT;
1072 | return -1;
1073 | }
1074 | 
1075 | return n;
1076 | }
1077 | 
1078 | int devmem_sendmsg(int fd, int dmabuf_id, size_t off, size_t n)
1079 | {
1080 | char ctrl_data[CMSG_SPACE(sizeof(int))];
1081 | struct msghdr msg = { 0 };
1082 | struct
1078 | int devmem_sendmsg(int fd, int dmabuf_id, size_t off, size_t n) 1079 | { 1080 | char ctrl_data[CMSG_SPACE(sizeof(int))]; 1081 | struct msghdr msg = { 0 }; 1082 | struct cmsghdr *cmsg; 1083 | struct iovec iov; 1084 | 1085 | iov.iov_base = (void *)off; 1086 | iov.iov_len = n; 1087 | 1088 | msg.msg_iov = &iov; 1089 | msg.msg_iovlen = 1; 1090 | 1091 | msg.msg_control = ctrl_data; 1092 | msg.msg_controllen = sizeof(ctrl_data); 1093 | 1094 | cmsg = CMSG_FIRSTHDR(&msg); 1095 | cmsg->cmsg_level = SOL_SOCKET; 1096 | cmsg->cmsg_type = SCM_DEVMEM_DMABUF; 1097 | cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 1098 | *((int *)CMSG_DATA(cmsg)) = dmabuf_id; 1099 | 1100 | return sendmsg(fd, &msg, MSG_ZEROCOPY); 1101 | } 1102 | 1103 | int devmem_bind_socket(struct session_state_devmem *devmem, int fd) 1104 | { 1105 | char ifname[IFNAMSIZ] = {}; 1106 | int ifindex; 1107 | 1108 | ifindex = find_iface(&devmem->addr, ifname); 1109 | if (ifindex < 0) { 1110 | warnx("Failed to resolve ifindex: %s", strerror(-ifindex)); 1111 | return -1; 1112 | } 1113 | 1114 | if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, IFNAMSIZ)) { 1115 | warn("failed to bind device to socket"); 1116 | return -1; 1117 | } 1118 | 1119 | return 0; 1120 | } 1121 | 1122 | int devmem_setup_tx(struct session_state_devmem *devmem, enum memory_provider_type provider, 1123 | int dmabuf_tx_size_mb, struct pci_dev *dev, struct sockaddr_in6 *addr) 1124 | { 1125 | char ifname[IFNAMSIZ] = {}; 1126 | struct ynl_error yerr; 1127 | int ifindex; 1128 | int ret; 1129 | 1130 | devmem->tx_provider = provider; 1131 | devmem->dmabuf_tx_size_mb = dmabuf_tx_size_mb; 1132 | memcpy(&devmem->tx_dev, dev, sizeof(devmem->tx_dev)); 1133 | memcpy(&devmem->addr, addr, sizeof(devmem->addr)); 1134 | 1135 | txmp = get_memory_provider(devmem->tx_provider); 1136 | if (!txmp) 1137 | return -1; 1138 | 1139 | if (txmp->dev_init && txmp->dev_init(&devmem->tx_dev) < 0) 1140 | return -1; 1141 | 1142 | devmem->tx_mem = txmp->alloc(devmem->dmabuf_tx_size_mb * 1024 * 1024); 1143 | if (!devmem->tx_mem) { 1144 | warnx("Failed to allocate devmem tx buffer"); 1145 | return -1; 1146 | } 1147 | 1148 | txmp->memcpy_to_device(devmem->tx_mem, 0, patbuf, sizeof(patbuf)); 1149 | 1150 | ifindex = find_iface(&devmem->addr, ifname); 1151 | if (ifindex < 0) { 1152 | warnx("Failed to resolve ifindex: %s", strerror(-ifindex)); 1153 | return -1; 1154 | } 1155 | 1156 | devmem->ys = ynl_sock_create(&ynl_netdev_family, &yerr); 1157 | if (!devmem->ys) { 1158 | warnx("Failed to setup YNL socket: %s", yerr.msg); 1159 | return -1; 1160 | } 1161 | 1162 | devmem->tx_mem->dmabuf_id = bind_tx_queue(ifindex, devmem->tx_mem->fd, devmem->ys); 1163 | if (devmem->tx_mem->dmabuf_id < 0) { 1164 | warnx("Failed to bind TX queue dmabuf: %d", devmem->tx_mem->dmabuf_id); 1165 | ret = -1; 1166 | goto sock_destroy; 1167 | } 1168 | 1169 | 1170 | return 0; 1171 | 1172 | sock_destroy: 1173 | ynl_sock_destroy(devmem->ys); 1174 | devmem->ys = NULL; 1175 | return ret; 1176 | } 1177 | 1178 | void devmem_teardown_tx(struct session_state_devmem *devmem) 1179 | { 1180 | if (txmp && devmem->tx_mem) { 1181 | txmp->free(devmem->tx_mem); 1182 | devmem->tx_mem = NULL; 1183 | } 1184 | 1185 | if (devmem->ys) { 1186 | ynl_sock_destroy(devmem->ys); 1187 | devmem->ys = NULL; 1188 | } 1189 | } 1190 | -------------------------------------------------------------------------------- /client.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause 2 | /* Copyright Meta Platforms, Inc. 
and affiliates */ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "bipartite_match.h" 21 | #include "proto.h" 22 | #include "proto_dbg.h" 23 | 24 | int verbose = 3; 25 | 26 | static struct { 27 | bool msg_trunc; 28 | bool devmem_rx; 29 | enum memory_provider_type devmem_rx_memory; 30 | enum memory_provider_type devmem_tx_memory; 31 | struct pci_dev devmem_dst_dev; 32 | struct pci_dev devmem_src_dev; 33 | bool devmem_tx; 34 | bool msg_zerocopy; 35 | bool tls; 36 | bool tls_rx; 37 | bool tls_tx; 38 | bool tls_nopad; 39 | bool output_csv; 40 | bool output_hdr; 41 | bool xpin; 42 | unsigned int tls_ver; 43 | char *src; 44 | char *dst; 45 | char *src_svc; 46 | char *dst_svc; 47 | unsigned int time_stats; 48 | unsigned int req_size; 49 | unsigned int resp_size; 50 | unsigned int read_size; 51 | unsigned int write_size; 52 | unsigned int pin_off; 53 | unsigned int time; 54 | unsigned int cpu_min; 55 | unsigned int cpu_max; 56 | int cpu_src_wrk; 57 | int cpu_dst_wrk; 58 | unsigned int mss; 59 | unsigned int n_conns; 60 | unsigned int max_pace; 61 | char *tcp_cong_ctrl; 62 | unsigned int dmabuf_rx_size_mb; 63 | unsigned int dmabuf_tx_size_mb; 64 | unsigned int num_rx_queues; 65 | bool validate; 66 | bool iou_src; 67 | bool iou_dst; 68 | bool zerocopy_rx; 69 | unsigned int iou_rx_size_mb; 70 | } opt = { 71 | .tls_ver = TLS_1_3_VERSION, 72 | .src = "localhost", 73 | .dst = "localhost", 74 | .src_svc = "18323", 75 | .dst_svc = "18323", 76 | .req_size = ~0U, 77 | .read_size = KPM_DFL_OP_CHUNK, 78 | .write_size = KPM_DFL_OP_CHUNK, 79 | .time = 5, 80 | .cpu_min = 0, 81 | .cpu_max = 255, 82 | .cpu_src_wrk = -1, 83 | .cpu_dst_wrk = -1, 84 | .n_conns = 1, 85 | /* 128M is enough to drive one queue at 200G */ 86 | .dmabuf_rx_size_mb = 128, 87 | .dmabuf_tx_size_mb = 128, 88 | .num_rx_queues = 1, 89 | .devmem_rx_memory = MEMORY_PROVIDER_HOST, 90 | .devmem_dst_dev = { 91 | .domain = DEVICE_DOMAIN_ANY, 92 | .bus = DEVICE_BUS_ANY, 93 | .device = DEVICE_DEVICE_ANY 94 | }, 95 | .devmem_tx_memory = MEMORY_PROVIDER_HOST, 96 | .devmem_src_dev = { 97 | .domain = DEVICE_DOMAIN_ANY, 98 | .bus = DEVICE_BUS_ANY, 99 | .device = DEVICE_DEVICE_ANY 100 | }, 101 | .iou_src = false, 102 | .iou_dst = false, 103 | .zerocopy_rx = false, 104 | .iou_rx_size_mb = 64, 105 | }; 106 | 107 | #define dbg(fmt...) 
while (0) { warnx(fmt); } 108 | 109 | static void opt_show_uinthex(char buf[OPT_SHOW_LEN], const unsigned int *ui) 110 | { 111 | sprintf(buf, "0x%x", *ui); 112 | } 113 | 114 | static char *arg_bad(const char *fmt, const char *arg) 115 | { 116 | char *str; 117 | 118 | str = malloc(strlen(fmt) + strlen(arg)); 119 | if (!str) 120 | return strerror(errno); 121 | 122 | sprintf(str, fmt, arg); 123 | 124 | return str; 125 | } 126 | 127 | static char * 128 | opt_set_memory_provider(const char *arg, enum memory_provider_type *provider) 129 | { 130 | char *ret; 131 | 132 | if (!strcmp(arg, "cuda")) { 133 | #ifndef USE_CUDA 134 | return arg_bad("memory provider %s requires kperf compiled with CUDA", arg); 135 | #endif 136 | } 137 | 138 | ret = NULL; 139 | if (!strcmp(arg, "host")) { 140 | *provider = MEMORY_PROVIDER_HOST; 141 | } else if (!strcmp(arg, "cuda")) { 142 | *provider = MEMORY_PROVIDER_CUDA; 143 | } else { 144 | ret = arg_bad("'%s' is not a valid memory provider", arg); 145 | } 146 | 147 | return ret; 148 | 149 | } 150 | 151 | static char * 152 | opt_set_dev(const char *arg, struct pci_dev *dev) 153 | { 154 | if (!strcmp(arg, "any")) { 155 | dev->domain = DEVICE_DOMAIN_ANY; 156 | dev->bus = DEVICE_BUS_ANY; 157 | dev->device = DEVICE_DEVICE_ANY; 158 | return NULL; 159 | } 160 | 161 | if (sscanf(arg, "%hx:%hhx:%hhx", &dev->domain, &dev->bus, &dev->device) == 3) 162 | return NULL; 163 | 164 | return arg_bad("'%s' invalid PCI ID format. Expected format: domain:bus:device", arg); 165 | } 166 | 167 | static void 168 | opt_show_memory_provider(char buf[OPT_SHOW_LEN], const enum memory_provider_type *p) 169 | { 170 | switch (*p) { 171 | case MEMORY_PROVIDER_HOST: 172 | strncpy(buf, "host", OPT_SHOW_LEN); 173 | break; 174 | case MEMORY_PROVIDER_CUDA: 175 | strncpy(buf, "cuda", OPT_SHOW_LEN); 176 | break; 177 | default: 178 | /* inval */ 179 | strncpy(buf, "invalid", OPT_SHOW_LEN); 180 | break; 181 | } 182 | } 183 | 184 | static void 185 | opt_show_dev(char buf[OPT_SHOW_LEN], const struct pci_dev *dev) 186 | { 187 | if (dev->domain == DEVICE_DOMAIN_ANY && 188 | dev->bus == DEVICE_BUS_ANY && 189 | dev->device == DEVICE_DEVICE_ANY) 190 | strncpy(buf, "any", OPT_SHOW_LEN); 191 | else 192 | snprintf(buf, OPT_SHOW_LEN, "%hx:%hhx:%hhx", 193 | dev->domain, dev->bus, dev->device); 194 | } 195 | 196 | static const struct opt_table opts[] = { 197 | OPT_WITH_ARG("--src ", opt_set_charp, opt_show_charp, 198 | &opt.src, "Source server address"), 199 | OPT_WITH_ARG("--dst ", opt_set_charp, opt_show_charp, 200 | &opt.dst, "Destination server address"), 201 | OPT_WITH_ARG("--src-svc ", opt_set_charp, opt_show_charp, 202 | &opt.src_svc, "Source server port"), 203 | OPT_WITH_ARG("--dst-svc ", opt_set_charp, opt_show_charp, 204 | &opt.dst_svc, "Destination server port"), 205 | OPT_WITH_ARG("--req-size|-s ", opt_set_uintval, opt_show_uintval, 206 | &opt.req_size, "Request size"), 207 | OPT_WITH_ARG("--resp-size ", opt_set_uintval, opt_show_uintval, 208 | &opt.resp_size, "Response size"), 209 | OPT_WITH_ARG("--read-size ", opt_set_uintval, opt_show_uintval, 210 | &opt.read_size, "Buffer size for read/recv syscall"), 211 | OPT_WITH_ARG("--write-size ", opt_set_uintval, opt_show_uintval, 212 | &opt.write_size, "Buffer size for write/send syscall"), 213 | OPT_WITH_ARG("--pin-off ", opt_set_uintval, opt_show_uintval, 214 | &opt.pin_off, "CPU pin offset"), 215 | OPT_WITH_ARG("--cpu-min ", opt_set_uintval, opt_show_uintval, 216 | &opt.cpu_min, "min CPU number for connection"), 217 | OPT_WITH_ARG("--cpu-max ", 
opt_set_uintval, opt_show_uintval, 218 | &opt.cpu_max, "max CPU number for connection"), 219 | OPT_WITH_ARG("--cpu-src-wrk ", opt_set_intval, opt_show_intval, 220 | &opt.cpu_src_wrk, "CPU to pin source workers to"), 221 | OPT_WITH_ARG("--cpu-dst-wrk ", opt_set_intval, opt_show_intval, 222 | &opt.cpu_dst_wrk, "CPU to pin destination workers to"), 223 | OPT_WITHOUT_ARG("--cross-pin", opt_set_bool, &opt.xpin, "Cross-pin"), 224 | OPT_WITH_ARG("--time|-t ", opt_set_uintval, opt_show_uintval, 225 | &opt.time, "Test length"), 226 | OPT_WITH_ARG("--time-stats|-T ", opt_set_uintval, opt_show_uintval, 227 | &opt.time_stats, 228 | "Time stats - (0) none, (1) hist, (2) hist+pstats"), 229 | OPT_WITH_ARG("--mss|-M ", opt_set_uintval, opt_show_uintval, 230 | &opt.mss, "MSS for TCP"), 231 | OPT_WITH_ARG("--max-pace ", opt_set_uintval, opt_show_uintval, 232 | &opt.max_pace, "Max sending/pacing rate"), 233 | OPT_WITHOUT_ARG("--tls", opt_set_bool, &opt.tls, 234 | "Enable TLS in both directions"), 235 | OPT_WITH_ARG("--tls-ver ", opt_set_uintval, opt_show_uinthex, 236 | &opt.tls_ver, "Version of TLS as per kernel defines"), 237 | OPT_WITHOUT_ARG("--tls-rx", opt_set_bool, &opt.tls_rx, 238 | "Enable TLS for Rx"), 239 | OPT_WITHOUT_ARG("--tls-tx", opt_set_bool, &opt.tls_tx, 240 | "Enable TLS for Tx"), 241 | OPT_WITHOUT_ARG("--tls-nopad", opt_set_bool, &opt.tls_nopad, 242 | "Enable TLS no padding optimization for Rx"), 243 | OPT_WITH_ARG("--num-connections|-n ", 244 | opt_set_uintval, opt_show_uintval, 245 | &opt.n_conns, "Number of connections"), 246 | OPT_WITH_ARG("--tcp-cc ", opt_set_charp, opt_show_charp, 247 | &opt.tcp_cong_ctrl, "Set TCP congestion control"), 248 | OPT_WITHOUT_ARG("--out-csv", opt_set_bool, &opt.output_csv, 249 | "Print output in terse CSV format"), 250 | OPT_WITHOUT_ARG("--out-hdr", opt_set_bool, &opt.output_hdr, 251 | "Include column name header in the CSV output"), 252 | OPT_WITHOUT_ARG("--verbose|-v", opt_inc_intval, &verbose, 253 | "Verbose mode (can be specified more than once)"), 254 | OPT_WITHOUT_ARG("--quiet|-q", opt_dec_intval, &verbose, 255 | "Quiet mode (can be specified more than once)"), 256 | OPT_WITHOUT_ARG("--usage|--help|-h", opt_usage_and_exit, 257 | "kperf client", "Show this help message"), 258 | OPT_WITHOUT_ARG("--msg-trunc", opt_set_bool, &opt.msg_trunc, "Use MSG_TRUNC on receive"), 259 | OPT_WITHOUT_ARG("--msg-zerocopy", opt_set_bool, &opt.msg_zerocopy, "Use MSG_ZEROCOPY on transmit"), 260 | OPT_EARLY_WITHOUT_ARG("--devmem-rx", opt_set_bool, &opt.devmem_rx, "Use TCP Devmem on receive"), 261 | OPT_WITH_ARG("--devmem-rx-memory {cuda,host}", opt_set_memory_provider, 262 | opt_show_memory_provider, &opt.devmem_rx_memory, 263 | "Select the memory provider for TCP Devmem RX"), 264 | OPT_WITH_ARG("--dmabuf-rx-size-mb ", opt_set_uintval, opt_show_uintval, 265 | &opt.dmabuf_rx_size_mb, "Size of RX dmabuf for TCP Devmem mode"), 266 | OPT_WITH_ARG("--dmabuf-tx-size-mb ", opt_set_uintval, opt_show_uintval, 267 | &opt.dmabuf_tx_size_mb, "Size of TX dmabuf for TCP Devmem mode"), 268 | OPT_WITHOUT_ARG("--devmem-tx", opt_set_bool, &opt.devmem_tx, "Use TCP Devmem on transmit"), 269 | OPT_WITH_ARG("--devmem-tx-memory {cuda,host}", opt_set_memory_provider, 270 | opt_show_memory_provider, &opt.devmem_tx_memory, 271 | "Select the memory provider for TCP Devmem TX"), 272 | OPT_WITH_ARG("--num-rx-queues ", opt_set_uintval, opt_show_uintval, 273 | &opt.num_rx_queues, "Number of RX queues for TCP Devmem mode"), 274 | OPT_WITH_ARG("--validate ", opt_set_bool_arg, NULL, &opt.validate, 275 | 
"Validate payload. Default is no when using --devmem-rx; otherwise, default is yes"), 276 | OPT_WITH_ARG("--devmem-dst-dev ", opt_set_dev, opt_show_dev, 277 | &opt.devmem_dst_dev, "Select the destination device for the TCP Devmem memory provider"), 278 | OPT_WITH_ARG("--devmem-src-dev ", opt_set_dev, opt_show_dev, 279 | &opt.devmem_src_dev, "Select the source device for the TCP Devmem memory provider"), 280 | OPT_WITHOUT_ARG("--iou-src", opt_set_bool, &opt.iou_src, 281 | "Use io_uring on source server"), 282 | OPT_WITHOUT_ARG("--iou-dst", opt_set_bool, &opt.iou_dst, 283 | "Use io_uring on destination server"), 284 | OPT_EARLY_WITHOUT_ARG("--zerocopy-rx", opt_set_bool, &opt.zerocopy_rx, 285 | "Use zero copy on receive"), 286 | OPT_WITH_ARG("--iou-rx-size-mb ", opt_set_uintval, opt_show_uintval, 287 | &opt.iou_rx_size_mb, "Size of RX memory reserved by io_uring"), 288 | OPT_ENDTABLE 289 | }; 290 | 291 | static struct kpm_connect_reply * 292 | spawn_conn(int src, int dst, struct sockaddr_in6 *addr, socklen_t len) 293 | { 294 | struct kpm_connect_reply **replies; 295 | struct kpm_connect_reply *conns; 296 | struct kpm_connect_reply *id; 297 | struct bim_state *bim; 298 | struct bim_edge m; 299 | unsigned int i; 300 | int *seq; 301 | 302 | if (!opt.n_conns) 303 | return NULL; 304 | conns = calloc(opt.n_conns, sizeof(*conns)); 305 | if (!conns) 306 | return NULL; 307 | replies = calloc(opt.n_conns, sizeof(*replies)); 308 | if (!replies) 309 | goto err_free_conns; 310 | seq = calloc(opt.n_conns, sizeof(int)); 311 | if (!seq) 312 | goto err_free_replies; 313 | bim = bim_init(); 314 | if (!bim) 315 | goto err_free_seq; 316 | 317 | again: 318 | for (i = 0; i < opt.n_conns; i++) { 319 | seq[i] = kpm_send_connect(src, addr, len, opt.mss); 320 | if (seq[i] < 0) 321 | err(7, "Failed to connect"); 322 | } 323 | for (i = 0; i < opt.n_conns; i++) { 324 | id = kpm_receive(src); 325 | if (!id) 326 | errx(7, "No connection ID"); 327 | 328 | if (!kpm_good_reply(id, KPM_MSG_TYPE_CONNECT, seq[i])) 329 | errx(7, "Invalid connection ID %d %d", 330 | id->hdr.type, id->hdr.len); 331 | 332 | replies[i] = id; 333 | } 334 | 335 | for (i = 0; i < opt.n_conns; i++) { 336 | bool good, bim_unique; 337 | 338 | id = replies[i]; 339 | 340 | good = clamp(id->local.cpu, opt.cpu_min, opt.cpu_max) == 341 | id->local.cpu && 342 | clamp(id->remote.cpu, opt.cpu_min, opt.cpu_max) == 343 | id->remote.cpu; 344 | bim_unique = good && 345 | bim_add_edge(bim, id->local.cpu, id->remote.cpu, id); 346 | 347 | kpm_dbg("Connection established %d:cpu %d | %d:cpu %d - %s", 348 | id->local.id, id->local.cpu, 349 | id->remote.id, id->remote.cpu, 350 | good && bim_unique ? "good" : 351 | (good ? 
"duplicate" : "out of range")); 352 | 353 | if (!bim_unique) { 354 | bool fail = kpm_req_disconnect(src, id->local.id) < 0 || 355 | kpm_req_disconnect(dst, id->remote.id) < 0; 356 | free(id); 357 | if (fail) { 358 | warnx("Disconnect failed"); 359 | i = opt.n_conns - i - 1; 360 | goto err_drain; 361 | } 362 | } 363 | } 364 | 365 | if (bim_match_size(bim) < opt.n_conns) 366 | goto again; 367 | 368 | i = 0; 369 | bim_for_each_edge(bim, &m) { 370 | id = m.cookie; 371 | 372 | if (m.is_match && i < opt.n_conns) { 373 | kpm_info("Connected %d:cpu %d | %d:cpu %d", 374 | id->local.id, id->local.cpu, 375 | id->remote.id, id->remote.cpu); 376 | memcpy(&conns[i], id, sizeof(*id)); 377 | i++; 378 | } else { 379 | kpm_req_disconnect(src, id->local.id); 380 | kpm_req_disconnect(dst, id->remote.id); 381 | } 382 | free(id); 383 | } 384 | 385 | for (i = 0; i < opt.n_conns; i++) { 386 | if (opt.max_pace) { 387 | if (kpm_req_pacing(src, conns[i].local.id, opt.max_pace) || 388 | kpm_req_pacing(dst, conns[i].remote.id, opt.max_pace)) 389 | err(8, "Failed to set pacing rate"); 390 | } 391 | 392 | if (opt.tcp_cong_ctrl) { 393 | if (kpm_req_tcp_cc(src, conns[i].local.id, opt.tcp_cong_ctrl) || 394 | kpm_req_tcp_cc(dst, conns[i].remote.id, opt.tcp_cong_ctrl)) 395 | err(8, "Failed to set TCP cong control"); 396 | } 397 | } 398 | 399 | free(seq); 400 | free(replies); 401 | return conns; 402 | 403 | err_drain: 404 | bim_for_each_edge(bim, &m) { 405 | id = m.cookie; 406 | kpm_req_disconnect(src, id->local.id); 407 | kpm_req_disconnect(dst, id->remote.id); 408 | free(id); 409 | } 410 | bim_destroy(bim); 411 | err_free_seq: 412 | free(seq); 413 | err_free_replies: 414 | free(replies); 415 | err_free_conns: 416 | free(conns); 417 | return NULL; 418 | } 419 | 420 | static int spawn_worker(int fd, int cpu, __u32 *wid) 421 | { 422 | struct __kpm_generic_u32 *id; 423 | struct kpm_empty *ack; 424 | int seq; 425 | 426 | seq = kpm_send_empty(fd, KPM_MSG_TYPE_SPAWN_WORKER); 427 | if (seq < 0) { 428 | warn("Failed to spawn"); 429 | return 1; 430 | } 431 | 432 | id = kpm_receive(fd); 433 | if (!id) { 434 | warnx("No ack for spawn"); 435 | return 1; 436 | } 437 | 438 | if (!kpm_good_reply(id, KPM_MSG_TYPE_SPAWN_WORKER, seq)) { 439 | warnx("Invalid spawn ack %d %d", id->hdr.type, id->hdr.len); 440 | free(id); 441 | return 1; 442 | } 443 | 444 | *wid = id->val; 445 | free(id); 446 | 447 | seq = kpm_send_pin_worker(fd, *wid, cpu); 448 | if (seq < 0) { 449 | warn("Failed to pin"); 450 | return 1; 451 | } 452 | 453 | ack = kpm_receive(fd); 454 | if (!ack) { 455 | warnx("No ack for pin"); 456 | return 1; 457 | } 458 | 459 | if (!kpm_good_reply(ack, KPM_MSG_TYPE_PIN_WORKER, seq)) { 460 | warnx("Invalid ack for pin %d %d", ack->hdr.type, ack->hdr.len); 461 | free(ack); 462 | return 1; 463 | } 464 | free(ack); 465 | 466 | return 0; 467 | } 468 | 469 | static void 470 | show_cpu_stat(const char *pfx, struct kpm_test_results *result, unsigned int id) 471 | { 472 | struct kpm_cpu_load *cpu = &result->cpu_load[id]; 473 | 474 | if (cpu->id != id) { 475 | warnx("Sparse CPU IDs %d != %d!", cpu->id, id); 476 | return; 477 | } 478 | 479 | warnx(" %sCPU%3d: usr:%5.2f%% sys:%5.2f%% idle:%5.2f%% iow:%5.2f%% irq:%5.2f%% sirq:%5.2f%%", 480 | pfx, id, cpu->user / 100.0, cpu->system / 100.0, 481 | cpu->idle / 100.0, cpu->iowait / 100.0, cpu->irq / 100.0, 482 | cpu->sirq / 100.0); 483 | } 484 | 485 | static void 486 | dump_result(struct kpm_test_results *result, const char *dir, 487 | struct kpm_connect_reply *conns, bool local) 488 | { 489 | unsigned int 
end = 0, i, r; 490 | int start = -1; 491 | 492 | warnx("== %s", dir); 493 | for (r = 0; r < opt.n_conns; r++) 494 | warnx(" Tx%7.3lf Gbps (%llu bytes in %u usec)", 495 | (double)result->res[r].tx_bytes * 8 / 496 | result->time_usec / 497 | 1000, 498 | result->res[r].tx_bytes, 499 | result->time_usec); 500 | for (r = 0; r < opt.n_conns; r++) 501 | warnx(" Rx%7.3lf Gbps (%llu bytes in %u usec)", 502 | (double)result->res[r].rx_bytes * 8 / 503 | result->time_usec / 504 | 1000, 505 | result->res[r].rx_bytes, 506 | result->time_usec); 507 | warnx(" TCP retrans reord rtt rttvar d_ce snd_wnd cwnd"); 508 | for (r = 0; r < opt.n_conns; r++) 509 | warnx(" %7u %5u %3u %6u %4u %7u %4u", 510 | result->res[r].retrans, result->res[r].reord_seen, 511 | result->res[r].rtt, 512 | result->res[r].rttvar, result->res[r].delivered_ce, 513 | result->res[r].snd_wnd, result->res[r].snd_cwnd); 514 | 515 | for (r = 0; r < opt.n_conns; r++) { 516 | int flow_cpu; 517 | 518 | flow_cpu = local ? conns[r].local.cpu : conns[r].remote.cpu; 519 | show_cpu_stat(opt.pin_off ? "net " : "", result, flow_cpu); 520 | if (opt.pin_off) 521 | show_cpu_stat("app ", result, flow_cpu + opt.pin_off); 522 | } 523 | 524 | /* The rest is RR-only */ 525 | if (opt.req_size == ~0U) 526 | return; 527 | 528 | for (r = 0; r < opt.n_conns; r++) 529 | warnx("%.1lf RPS", 530 | (double)result->res[r].reqs / 531 | result->time_usec * 1000000); 532 | 533 | if (opt.time_stats < 1) 534 | return; 535 | 536 | for (r = 0; r < opt.n_conns; r++) { 537 | for (i = 0; i < ARRAY_SIZE(result->res[r].lat_hist); i++) { 538 | if (!result->res[r].lat_hist[i]) 539 | continue; 540 | if (start < 0) 541 | start = i; 542 | end = i + 1; 543 | } 544 | for (i = start; i < end; i++) { 545 | unsigned int val; 546 | const char *unit; 547 | 548 | if (i < 3) { 549 | val = 128 << i; 550 | unit = "ns"; 551 | } else if (i < 13) { 552 | val = (1ULL << (i + 7)) / 1000; 553 | unit = "us"; 554 | } else { 555 | val = (1ULL << (i + 7)) / (1000 * 1000); 556 | unit = "ms"; 557 | } 558 | warnx(" [%3d%s] %d", 559 | val, unit, result->res[r].lat_hist[i]); 560 | } 561 | } 562 | 563 | if (opt.time_stats < 2) 564 | return; 565 | 566 | for (r = 0; r < opt.n_conns; r++) 567 | warnx("p25:%uus p50:%uus p90:%uus p99:%uus p999:%uus p9999:%uus", 568 | result->res[r].p25 * 128 / 1000, 569 | result->res[r].p50 * 128 / 1000, 570 | result->res[r].p90 * 128 / 1000, 571 | result->res[r].p99 * 128 / 1000, 572 | result->res[r].p999 * 128 / 1000, 573 | result->res[r].p9999 * 128 / 1000); 574 | } 575 | 576 | static void 577 | dump_result_machine(struct kpm_test_results *result, const char *dir, 578 | struct kpm_connect_reply *conns, bool local) 579 | { 580 | struct kpm_test_result res = {}; 581 | struct kpm_cpu_load *cpu; 582 | unsigned int r; 583 | int flow_cpu; 584 | __u64 bytes; 585 | int i; 586 | 587 | for (r = 0; r < opt.n_conns; r++) { 588 | #define S(f) res.f += result->res[r].f; 589 | S(rx_bytes); 590 | S(tx_bytes); 591 | S(reqs); 592 | S(retrans); 593 | S(reord_seen); 594 | S(rtt); 595 | S(rttvar); 596 | S(delivered_ce); 597 | S(snd_wnd); 598 | S(snd_cwnd); 599 | 600 | if (opt.time_stats < 2) 601 | continue; 602 | S(p25); 603 | S(p50); 604 | S(p90); 605 | S(p99); 606 | S(p999); 607 | S(p9999); 608 | #undef S 609 | } 610 | res.rtt /= opt.n_conns; 611 | res.rttvar /= opt.n_conns; 612 | res.snd_wnd /= opt.n_conns; 613 | res.snd_cwnd /= opt.n_conns; 614 | res.p25 /= opt.n_conns; 615 | res.p50 /= opt.n_conns; 616 | res.p90 /= opt.n_conns; 617 | res.p99 /= opt.n_conns; 618 | res.p999 /= opt.n_conns; 619 | 
res.p9999 /= opt.n_conns; 620 | r = 0; 621 | 622 | /* Headers once on the first line */ 623 | if (local && opt.output_hdr) { 624 | for (i = 0; i < 2; i++) { 625 | printf("tcp,,,,,,,"); 626 | if (opt.time_stats >= 2) 627 | printf("latency,(us),,,,,"); 628 | if (opt.n_conns < 2) { 629 | printf("net,,,,"); 630 | if (opt.pin_off) 631 | printf("app,,,,"); 632 | } 633 | printf("data%c", i ? '\n' : ','); 634 | } 635 | for (i = 0; i < 2; i++) { 636 | printf("retrans,reord,ce,rtt,rttvar,swnd,cwnd,"); 637 | if (opt.time_stats >= 2) 638 | printf("p25,p50,p90,p99,p999,p9999,"); 639 | if (opt.n_conns < 2) { 640 | printf("usr,sys,idle,sirq,"); 641 | if (opt.pin_off) 642 | printf("usr,sys,idle,sirq,"); 643 | } 644 | printf(i ? "rx\n" : "tx,"); 645 | } 646 | } 647 | 648 | printf("%u,%u,%u,%u,%u,%u,%u,", 649 | res.retrans, res.reord_seen, res.delivered_ce, 650 | res.rtt, res.rttvar, res.snd_wnd, res.snd_cwnd); 651 | 652 | if (opt.time_stats >= 2) 653 | printf("%u,%u,%u,%u,%u,%u,", 654 | res.p25 * 128 / 1000, res.p50 * 128 / 1000, 655 | res.p90 * 128 / 1000, res.p99 * 128 / 1000, 656 | res.p999 * 128 / 1000, res.p9999 * 128 / 1000); 657 | 658 | /* Dunno how to report CPU use, yet */ 659 | if (opt.n_conns < 2) { 660 | flow_cpu = local ? conns[r].local.cpu : conns[r].remote.cpu; 661 | cpu = &result->cpu_load[flow_cpu]; 662 | printf("%.4f,%.4f,%.4f,%.4f,", 663 | cpu->user / 10000.0, cpu->system / 10000.0, 664 | cpu->idle / 10000.0, cpu->sirq / 10000.0); 665 | 666 | if (opt.pin_off) { 667 | cpu = &result->cpu_load[flow_cpu + opt.pin_off]; 668 | printf("%.4f,%.4f,%.4f,%.4f,", 669 | cpu->user / 10000.0, cpu->system / 10000.0, 670 | cpu->idle / 10000.0, cpu->sirq / 10000.0); 671 | } 672 | } 673 | 674 | bytes = local ? res.tx_bytes : res.rx_bytes; 675 | printf("%.3lf", (double)bytes * 8 / result->time_usec / 1000); 676 | printf(local ? 
"," : "\n"); 677 | } 678 | 679 | /* copied from devmem.c */ 680 | static void inet_to_inet6(struct sockaddr *addr, struct sockaddr_in6 *out) 681 | { 682 | out->sin6_addr.s6_addr32[3] = ((struct sockaddr_in6 *)addr)->sin6_addr.s6_addr32[0]; 683 | out->sin6_addr.s6_addr32[0] = 0; 684 | out->sin6_addr.s6_addr32[1] = 0; 685 | out->sin6_addr.s6_addr16[4] = 0; 686 | out->sin6_addr.s6_addr16[5] = 0xffff; 687 | out->sin6_family = AF_INET6; 688 | } 689 | 690 | int inet_sockaddr(const char *str, struct sockaddr_in6 *out) 691 | { 692 | struct sockaddr_in *sa4; 693 | struct sockaddr_in6 tmp; 694 | 695 | out->sin6_family = AF_INET6; 696 | if (inet_pton(AF_INET6, str, &(out->sin6_addr)) == 1) { 697 | out->sin6_family = AF_INET6; 698 | return 0; 699 | } 700 | 701 | sa4 = (struct sockaddr_in *)&tmp; 702 | if (inet_pton(AF_INET, str, &(sa4->sin_addr)) == 1) { 703 | sa4->sin_family = AF_INET; 704 | inet_to_inet6((void *)sa4, out); 705 | return 0; 706 | } 707 | 708 | return -1; 709 | } 710 | 711 | int main(int argc, char *argv[]) 712 | { 713 | enum kpm_rx_mode rx_mode = KPM_RX_MODE_SOCKET; 714 | enum kpm_tx_mode tx_mode = KPM_TX_MODE_SOCKET; 715 | unsigned int src_ncpus, dst_ncpus; 716 | struct __kpm_generic_u32 *ack_id; 717 | __u32 *src_wrk_cpu, *dst_wrk_cpu; 718 | struct kpm_connect_reply *conns; 719 | struct kpm_test_results *result; 720 | __u32 *src_wrk_id, *dst_wrk_id; 721 | struct sockaddr_in6 conn_addr; 722 | __u32 src_tst_id, dst_tst_id; 723 | struct sockaddr_in6 src_addr; 724 | struct addrinfo *addr; 725 | struct kpm_test *test; 726 | unsigned int i; 727 | socklen_t len; 728 | int src, dst; 729 | size_t sz; 730 | int seq; 731 | 732 | opt_register_table(opts, NULL); 733 | 734 | /* Use early parse to set default for --validate based on --devmem-rx */ 735 | if (!opt_early_parse(argc, argv, opt_log_stderr)) 736 | exit(1); 737 | opt.validate = !opt.devmem_rx; 738 | 739 | if (!opt_parse(&argc, argv, opt_log_stderr)) 740 | exit(1); 741 | 742 | err_set_progname(argv[0]); 743 | 744 | if (opt.read_size > KPM_MAX_OP_CHUNK || 745 | opt.write_size > KPM_MAX_OP_CHUNK) 746 | errx(1, "Max read/write size is %d", KPM_MAX_OP_CHUNK); 747 | if (opt.tcp_cong_ctrl && 748 | strnlen(opt.tcp_cong_ctrl, KPM_CC_NAME_LEN) == KPM_CC_NAME_LEN) 749 | errx(1, "TCP CC name is too long"); 750 | if (opt.xpin) { 751 | if (opt.cpu_src_wrk != -1 || opt.cpu_dst_wrk != -1) 752 | errx(1, "Cross-pin can't use explicit pin"); 753 | if (opt.pin_off) 754 | errx(1, "Cross-pin can't use pin off"); 755 | if (opt.n_conns != 2) 756 | errx(1, "Cross-pin only works with 2 connections"); 757 | } 758 | 759 | if (inet_sockaddr(opt.src, &src_addr) < 0) 760 | errx(1, "failed to get sockaddr from %s\n", opt.src); 761 | 762 | /* io_uring doesn't support devmem yet */ 763 | if (opt.devmem_rx && opt.iou_dst) 764 | errx(1, "io_uring does not support --devmem-rx yet"); 765 | if (opt.devmem_tx && opt.iou_src) 766 | errx(1, "io_uring does not support --devmem-tx yet"); 767 | 768 | if (opt.msg_trunc && opt.validate) 769 | errx(1, "--msg-trunc and --validate yes are mutually exclusive"); 770 | 771 | if (opt.msg_trunc && (opt.devmem_rx || opt.zerocopy_rx)) 772 | errx(1, "--msg-trunc and (--devmem-rx or --zerocopy-rx) are mutually exclusive"); 773 | 774 | if (opt.msg_trunc) 775 | rx_mode = KPM_RX_MODE_SOCKET_TRUNC; 776 | else if (opt.zerocopy_rx) 777 | rx_mode = KPM_RX_MODE_SOCKET_ZEROCOPY; 778 | else if (opt.devmem_rx) 779 | rx_mode = KPM_RX_MODE_DEVMEM; 780 | 781 | if (opt.msg_zerocopy && opt.devmem_tx) 782 | errx(1, "--msg-zerocopy and --devmem-tx are mutually 
exclusive"); 783 | 784 | if (opt.msg_zerocopy) 785 | tx_mode = KPM_TX_MODE_SOCKET_ZEROCOPY; 786 | else if (opt.devmem_tx) 787 | tx_mode = KPM_TX_MODE_DEVMEM; 788 | 789 | addr = net_client_lookup(opt.src, opt.src_svc, AF_UNSPEC, SOCK_STREAM); 790 | if (!addr) 791 | errx(1, "Failed to look up service to connect to"); 792 | 793 | /* Src */ 794 | src = net_connect(addr); 795 | freeaddrinfo(addr); 796 | if (src < 1) 797 | err(1, "Failed to connect"); 798 | 799 | addr = net_client_lookup(opt.dst, opt.dst_svc, AF_UNSPEC, SOCK_STREAM); 800 | if (!addr) 801 | errx(1, "Failed to look up service to connect to"); 802 | 803 | if (kpm_xchg_hello(src, &src_ncpus)) 804 | errx(2, "Bad hello"); 805 | 806 | /* Dst */ 807 | dst = net_connect(addr); 808 | freeaddrinfo(addr); 809 | if (dst < 1) 810 | err(1, "Failed to connect"); 811 | 812 | if (kpm_xchg_hello(dst, &dst_ncpus)) 813 | errx(2, "Bad hello"); 814 | 815 | src_wrk_id = calloc(opt.n_conns, sizeof(*src_wrk_id)); 816 | dst_wrk_id = calloc(opt.n_conns, sizeof(*dst_wrk_id)); 817 | src_wrk_cpu = calloc(opt.n_conns, sizeof(*src_wrk_cpu)); 818 | dst_wrk_cpu = calloc(opt.n_conns, sizeof(*dst_wrk_cpu)); 819 | 820 | /* Main */ 821 | len = sizeof(conn_addr); 822 | if (kpm_req_tcp_sock(dst, &conn_addr, &len) < 0) { 823 | warnx("Failed create TCP acceptor"); 824 | goto out; 825 | } 826 | 827 | struct kpm_mode dst_mode = { 828 | .rx_mode = rx_mode, 829 | .tx_mode = tx_mode, 830 | .rx_provider = opt.devmem_rx_memory, 831 | .tx_provider = opt.devmem_tx_memory, 832 | .dev = opt.devmem_dst_dev, 833 | .dmabuf_rx_size_mb = opt.dmabuf_rx_size_mb, 834 | .dmabuf_tx_size_mb = opt.dmabuf_tx_size_mb, 835 | .num_rx_queues = opt.num_rx_queues, 836 | .validate = opt.validate, 837 | .iou = opt.iou_dst, 838 | .iou_rx_size_mb = opt.iou_rx_size_mb, 839 | }; 840 | if (kpm_req_mode(dst, &dst_mode) < 0) { 841 | warnx("Failed setup destination mode"); 842 | goto out; 843 | } 844 | 845 | struct kpm_mode src_mode = { 846 | .rx_mode = rx_mode, 847 | .tx_mode = tx_mode, 848 | .rx_provider = opt.devmem_rx_memory, 849 | .tx_provider = opt.devmem_tx_memory, 850 | .dev = opt.devmem_src_dev, 851 | .dmabuf_rx_size_mb = opt.dmabuf_rx_size_mb, 852 | .dmabuf_tx_size_mb = opt.dmabuf_tx_size_mb, 853 | .num_rx_queues = opt.num_rx_queues, 854 | .addr = src_addr, 855 | .validate = opt.validate, 856 | .iou = opt.iou_src, 857 | .iou_rx_size_mb = opt.iou_rx_size_mb, 858 | }; 859 | if (kpm_req_mode(src, &src_mode) < 0) { 860 | warnx("Failed setup source mode"); 861 | goto out; 862 | } 863 | 864 | conns = spawn_conn(src, dst, &conn_addr, len); 865 | if (!conns) 866 | goto out; 867 | 868 | if (opt.tls || opt.tls_rx || opt.tls_tx) { 869 | struct tls12_crypto_info_aes_gcm_128 aes128 = {}; 870 | unsigned int rx, src_mask, dst_mask; 871 | 872 | aes128.info.version = opt.tls_ver; 873 | aes128.info.cipher_type = TLS_CIPHER_AES_GCM_128; 874 | 875 | rx = KPM_TLS_RX; 876 | if (opt.tls_nopad) 877 | rx |= KPM_TLS_NOPAD; 878 | if (opt.tls) { 879 | src_mask = dst_mask = KPM_TLS_TX | rx; 880 | } else if (opt.tls_rx) { 881 | src_mask = rx; 882 | dst_mask = KPM_TLS_TX; 883 | } else { 884 | src_mask = KPM_TLS_TX; 885 | dst_mask = rx; 886 | } 887 | 888 | for (i = 0; i < opt.n_conns; i++) { 889 | if (kpm_req_tls(src, conns[i].local.id, 890 | KPM_TLS_ULP | src_mask, 891 | &aes128, sizeof(aes128)) || 892 | kpm_req_tls(dst, conns[i].remote.id, 893 | KPM_TLS_ULP | dst_mask, 894 | &aes128, sizeof(aes128))) { 895 | warnx("TLS setup failed"); 896 | goto out_id; 897 | } 898 | } 899 | } 900 | 901 | for (i = 0; i < opt.n_conns; i++) { 902 | 
904 | if (opt.xpin) 905 | src_wrk_cpu[i] = conns[!i].local.cpu; 906 | else if (opt.cpu_src_wrk != -1) 907 | src_wrk_cpu[i] = opt.cpu_src_wrk; 908 | else 909 | src_wrk_cpu[i] = id->local.cpu + opt.pin_off; 910 | 911 | if (opt.xpin) 912 | dst_wrk_cpu[i] = conns[!i].remote.cpu; 913 | else if (opt.cpu_dst_wrk != -1) 914 | dst_wrk_cpu[i] = opt.cpu_dst_wrk; 915 | else 916 | dst_wrk_cpu[i] = id->remote.cpu + opt.pin_off; 917 | 918 | if (spawn_worker(src, src_wrk_cpu[i], &src_wrk_id[i]) || 919 | spawn_worker(dst, dst_wrk_cpu[i], &dst_wrk_id[i])) 920 | goto out_id; 921 | } 922 | 923 | sz = sizeof(*test) + opt.n_conns * sizeof(test->specs[0]); 924 | test = malloc(sz); 925 | memset(test, 0, sz); 926 | 927 | test->n_conns = opt.n_conns; 928 | test->time_sec = opt.time; 929 | for (i = 0; i < opt.n_conns; i++) { 930 | test->specs[i].connection_id = conns[i].remote.id; 931 | test->specs[i].worker_id = dst_wrk_id[i]; 932 | test->specs[i].read_size = opt.read_size; 933 | test->specs[i].write_size = opt.write_size; 934 | if (opt.req_size == ~0U) { 935 | test->specs[i].type = KPM_TEST_TYPE_STREAM; 936 | } else { 937 | test->specs[i].type = KPM_TEST_TYPE_RR; 938 | test->specs[i].arg.rr.req_size = opt.req_size; 939 | test->specs[i].arg.rr.resp_size = opt.resp_size ?: opt.req_size; 940 | test->specs[i].arg.rr.timings = opt.time_stats; 941 | } 942 | } 943 | 944 | seq = kpm_send(dst, &test->hdr, sz, KPM_MSG_TYPE_TEST); 945 | 946 | ack_id = kpm_receive(dst); 947 | if (!kpm_good_reply(ack_id, KPM_MSG_TYPE_TEST, seq)) { 948 | warnx("Invalid ack for test %d %d", 949 | ack_id->hdr.type, ack_id->hdr.len); 950 | goto out_id; 951 | } 952 | dst_tst_id = ack_id->val; 953 | dbg("Test id dst %d", dst_tst_id); 954 | free(ack_id); 955 | 956 | test->active = 1; 957 | for (i = 0; i < opt.n_conns; i++) { 958 | test->specs[i].connection_id = conns[i].local.id; 959 | test->specs[i].worker_id = src_wrk_id[i]; 960 | } 961 | 962 | seq = kpm_send(src, &test->hdr, sz, KPM_MSG_TYPE_TEST); 963 | free(test); 964 | 965 | ack_id = kpm_receive(src); 966 | if (!kpm_good_reply(ack_id, KPM_MSG_TYPE_TEST, seq)) { 967 | warnx("Invalid ack for test %d %d", 968 | ack_id->hdr.type, ack_id->hdr.len); 969 | goto out_id; 970 | } 971 | src_tst_id = ack_id->val; 972 | dbg("Test id src %d", src_tst_id); 973 | free(ack_id); 974 | 975 | /* Source worker is done */ 976 | result = kpm_receive(src); 977 | if (!result) { 978 | warnx("No result"); 979 | goto out_id; 980 | } 981 | sz = sizeof(*result) + opt.n_conns * sizeof(result->res[0]); 982 | if (result->hdr.type != KPM_MSG_TYPE_TEST_RESULT || 983 | result->hdr.len < sz) 984 | warnx("Invalid result %d %d", 985 | result->hdr.type, result->hdr.len); 986 | else if (opt.output_csv) 987 | dump_result_machine(result, "Source", conns, true); 988 | else 989 | dump_result(result, "Source", conns, true); 990 | free(result); 991 | 992 | /* Stop the test on both ends */ 993 | if (kpm_req_end_test(src, src_tst_id) || 994 | kpm_req_end_test(dst, dst_tst_id)) 995 | warnx("Failed to stop test"); 996 | 997 | /* Destination worker is done */ 998 | result = kpm_receive(dst); 999 | if (!result) { 1000 | warnx("No result"); 1001 | goto out_id; 1002 | } 1003 | if (result->hdr.type != KPM_MSG_TYPE_TEST_RESULT || 1004 | result->hdr.len < sizeof(*result) + sizeof(result->res[0])) 1005 | warnx("Invalid result %d %d", 1006 | result->hdr.type, result->hdr.len); 1007 | else if (opt.output_csv) 1008 | dump_result_machine(result, "Target", conns, false); 1009 | else 1010 | dump_result(result, 
"Target", conns, false); 1011 | free(result); 1012 | 1013 | out_id: 1014 | free(conns); 1015 | out: 1016 | close(src); 1017 | close(dst); 1018 | 1019 | free(src_wrk_id); 1020 | free(dst_wrk_id); 1021 | free(src_wrk_cpu); 1022 | free(dst_wrk_cpu); 1023 | 1024 | return 0; 1025 | } 1026 | --------------------------------------------------------------------------------