├── libs ├── libsnowrdma.a └── libsnowrdma.so ├── src ├── common.h ├── common.c ├── Makefile ├── rdma_helpers.h ├── rdma.h ├── rdma_helpers.c └── rdma.c ├── samples ├── Makefile ├── server.c └── client.c ├── .gitignore ├── README.md ├── LICENSE └── include └── rdma.h /libs/libsnowrdma.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangyibo/SnowRDMA/HEAD/libs/libsnowrdma.a -------------------------------------------------------------------------------- /libs/libsnowrdma.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangyibo/SnowRDMA/HEAD/libs/libsnowrdma.so -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_H_ 2 | 3 | #include 4 | #include 5 | 6 | char *strdup(const char *s); 7 | pid_t gettid(void); 8 | int get_current_cpu(void); 9 | 10 | #endif // !__COMMON_H_ -------------------------------------------------------------------------------- /samples/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean 2 | 3 | TARGETS := client server 4 | 5 | CC := gcc 6 | 7 | RELEASE := -DNDEBUG 8 | LDFLAGS := -Wl,-Bdynamic -lrdmacm -libverbs -lpthread 9 | CFLAGS += -Wall -g -I../include $(RELEASE) 10 | 11 | all: clean $(TARGETS) 12 | 13 | %.o: %.c %.h 14 | $(CC) $(CFLAGS) -c $< -o $@ 15 | 16 | client: client.o 17 | $(CC) -o $@ $^ $(CFLAGS) -L../libs -Wl,-Bstatic -lsnowrdma -Wl,-rpath=../libs $(LDFLAGS) 18 | 19 | server: server.o 20 | $(CC) -o $@ $^ $(CFLAGS) -L../libs -Wl,-Bstatic -lsnowrdma -Wl,-rpath=../libs $(LDFLAGS) 21 | 22 | clean: 23 | rm -rf $(TARGETS) *.o 24 | 25 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | -------------------------------------------------------------------------------- /src/common.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "common.h" 8 | 9 | char *strdup(const char *s) 10 | { 11 | char *dst = malloc(strlen(s) + 1); 12 | if (!dst) 13 | return NULL; 14 | 15 | strcpy(dst, s); 16 | return dst; 17 | } 18 | 19 | pid_t gettid(void) 20 | { 21 | return syscall(SYS_gettid); 22 | } 23 | 24 | int get_current_cpu(void) 25 | { 26 | cpu_set_t cpuset; 27 | int num_cpus = sysconf(_SC_NPROCESSORS_ONLN); 28 | int cpu_core = -1; 29 | 30 | /* Get the affinity mask of the current process */ 31 | sched_getaffinity(0, sizeof(cpuset), &cpuset); 32 | 33 | /* Find the first CPU in the affinity mask */ 34 | for (int i = 0; i < num_cpus; i++) { 35 | if (CPU_ISSET(i, &cpuset)) { 36 | cpu_core = i; 37 | break; 38 | } 39 | } 40 | return cpu_core; 41 | } 42 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean 2 | 3 | CC := gcc 4 | AR := ar 5 | 6 | RELEASE 
:= -DNDEBUG 7 | LDFLAGS := -Wl,-Bdynamic -lrdmacm -libverbs -lpthread 8 | CFLAGS += -Wall -g -I. $(RELEASE) 9 | 10 | RDMA_OBJS = common.o rdma_helpers.o rdma.o 11 | RDMA_SRC := common.c rdma_helpers.c rdma.c 12 | HEADER_SRC := common.h rdma_helpers.h rdma.h 13 | RELEASE_HEADER := rdma.h 14 | 15 | STATIC_LIB = libsnowrdma.a 16 | DYNAMIC_LIB = libsnowrdma.so 17 | 18 | all: clean $(STATIC_LIB) $(DYNAMIC_LIB) 19 | @if [ ! -d "../libs" ]; then mkdir libs; fi 20 | @if [ ! -d "../include" ]; then mkdir include; fi 21 | cp $(RELEASE_HEADER) ../include/ 22 | 23 | %.o: %.c %.h 24 | $(CC) $(CFLAGS) -c $< -o $@ 25 | 26 | $(STATIC_LIB): $(RDMA_OBJS) $(HEADER_SRC) 27 | $(AR) -cvq $@ $(RDMA_OBJS) 28 | mv $(STATIC_LIB) ../libs/ 29 | 30 | $(DYNAMIC_LIB): $(RDMA_SRC) $(HEADER_SRC) 31 | $(CC) -fPIC -shared $(CFLAGS) $(RDMA_SRC) -o $@ $(LDFLAGS) 32 | mv $(DYNAMIC_LIB) ../libs/ 33 | 34 | clean: 35 | rm -rf *.o $(RDMA_OBJS) $(STATIC_LIB) $(DYNAMIC_LIB) 36 | 37 | install: 38 | cp ../libs/* /usr/local/lib 39 | cp ../include/* /usr/local/include 40 | -------------------------------------------------------------------------------- /samples/server.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "rdma.h" 3 | 4 | char *serverip; 5 | int port = 8888; 6 | 7 | static void usage(const char *argv0) 8 | { 9 | fprintf(stderr, "usage: %s \n", argv0); 10 | exit(1); 11 | } 12 | 13 | void connRecvSuccess(RdmaConn *conn, void *data, size_t data_len) 14 | { 15 | printf("RDMA: recv data from peer: %s\n", (char *)data); 16 | rdmaConnSend(conn, data, data_len); 17 | } 18 | 19 | void connConnectSuccess(RdmaConn *conn) 20 | { 21 | printf("RDMA: one connection (%s:%d) connect success to server\n", conn->ip, conn->port); 22 | } 23 | 24 | void connDisconnectSuccess(RdmaConn *conn) 25 | { 26 | printf("RDMA: one connection (%s:%d) disconnect success\n", conn->ip, conn->port); 27 | } 28 | 29 | void serverAcceptSuccess(RdmaConn *conn) 30 | { 31 | 
printf("RDMA: Accepted a new connection (%s:%d). \n" 32 | "Let's register recv callback here\n", 33 | conn->ip, conn->port); 34 | rdmaConnSetRecvCallback(conn, connRecvSuccess); 35 | rdmaConnSetConnectedCallback(conn, connConnectSuccess); 36 | rdmaConnSetDisconnectCallback(conn, connDisconnectSuccess); 37 | } 38 | 39 | int main(int argc, char *argv[]) 40 | { 41 | RdmaListener *server; 42 | RdmaServerOptions opt = {0}; 43 | int ret; 44 | 45 | if (argc != 3) 46 | usage(argv[0]); 47 | 48 | serverip = argv[1]; 49 | port = atoi(argv[2]); 50 | 51 | opt.rdma_recv_depth = 512; 52 | opt.rdma_enable_phys_addr_access = true; 53 | opt.accept_callback = serverAcceptSuccess; 54 | opt.rdma_io_affinity_cpuid = 1; 55 | ret = rdmaServer(&server, serverip, port, &opt); 56 | if (ret != RDMA_OK) 57 | { 58 | rdmaErr("create rdma server failed"); 59 | goto err; 60 | } 61 | // rdmaServerSetAcceptCallback(server, serverAcceptSuccess); 62 | 63 | rdmaServerStart(server); 64 | 65 | rdmaServerStop(server); 66 | rdmaServerRelease(server); 67 | 68 | return 0; 69 | 70 | err: 71 | return -1; 72 | } 73 | -------------------------------------------------------------------------------- /samples/client.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "rdma.h" 6 | 7 | char *serverip; 8 | int port = 8888; 9 | 10 | char *hello_msg = "hahahaahahaha!"; 11 | char *local_msg_buf; 12 | struct ibv_mr *data_mr; 13 | 14 | static void usage(const char *argv0) 15 | { 16 | fprintf(stderr, "usage: %s \n", argv0); 17 | exit(1); 18 | } 19 | 20 | static uint64_t time_get_ns(void) 21 | { 22 | struct timespec ts; 23 | 24 | clock_gettime(CLOCK_MONOTONIC, &ts); 25 | return ts.tv_sec * 1000000000ull + ts.tv_nsec; 26 | } 27 | 28 | /* params for test */ 29 | static RdmaConn *conn; 30 | static int max_cnt = 1000000; 31 | static int cur_cnt = 0; 32 | static unsigned long remote_addr = 0xff1ba1000; 33 | pthread_cond_t g_cond; 34 | 
pthread_mutex_t g_mutex; 35 | uint64_t start_time, end_time; 36 | static int nr_outstanding_reqs = 32; 37 | 38 | void clientRecvSuccess(RdmaConn *conn, void *data, size_t data_len) 39 | { 40 | printf("RDMA: recv data from peer (%s:%d): %s\n", conn->ip, conn->port, (char *)data); 41 | } 42 | 43 | void clientWriteSuccess(RdmaConn *conn, size_t data_len) 44 | { 45 | printf("RDMA WRITE to peer (%s:%d) success\n", conn->ip, conn->port); 46 | } 47 | 48 | void clientReadSuccess(RdmaConn *conn, size_t data_len) 49 | { 50 | // printf("RDMA READ from peer (%s:%d): %s\n", conn->ip, conn->port, local_msg_buf); 51 | 52 | // if (local_msg_buf && data_mr) 53 | // { 54 | // printf("RDMA conn de-register data mr\n"); 55 | // rdmaConnDeregMem(conn, data_mr); 56 | // local_msg_buf = NULL; 57 | // } 58 | 59 | cur_cnt++; 60 | if (cur_cnt < max_cnt) { 61 | rdmaPAReadSignaled(conn, (unsigned long)local_msg_buf, data_mr->lkey, remote_addr, strlen(hello_msg)); 62 | } 63 | else if (cur_cnt == max_cnt) 64 | { 65 | end_time = time_get_ns(); 66 | printf("====> %lld reqs per second ====\n", max_cnt * 1000000000ll / (end_time - start_time)); 67 | 68 | pthread_mutex_lock(&g_mutex); 69 | pthread_cond_signal(&g_cond); 70 | pthread_mutex_unlock(&g_mutex); 71 | } 72 | 73 | } 74 | 75 | void clientConnectSuccess(RdmaConn *conn) 76 | { 77 | printf("RDMA: one connection (%s:%d) connect success to server\n", conn->ip, conn->port); 78 | } 79 | 80 | void clientDisconnectSuccess(RdmaConn *conn) 81 | { 82 | printf("RDMA: one connection (%s:%d) disconnect success\n", conn->ip, conn->port); 83 | } 84 | 85 | int main(int argc, char *argv[]) 86 | { 87 | RdmaConnOptions opt = {0}; 88 | int ret = RDMA_ERR; 89 | int i; 90 | 91 | if (argc != 3) 92 | usage(argv[0]); 93 | 94 | serverip = argv[1]; 95 | port = atoi(argv[2]); 96 | 97 | opt.rdma_recv_depth = 32; 98 | opt.recv_callback = clientRecvSuccess; 99 | opt.write_callback = clientWriteSuccess; 100 | opt.read_callback = clientReadSuccess; 101 | opt.connected_callback 
= clientConnectSuccess; 102 | opt.disconnect_callback = clientDisconnectSuccess; 103 | opt.rdma_io_affinity_cpuid = 2; 104 | conn = rdmaConn(&opt); 105 | if (!conn) 106 | { 107 | rdmaErr("create rdma connection failed"); 108 | goto end; 109 | } 110 | // rdmaConnSetRecvCallback(conn, clientRecvSuccess); 111 | 112 | ret = rdmaConnect(conn, serverip, port); 113 | if (ret != RDMA_OK) 114 | { 115 | rdmaErr("rdma connect failed"); 116 | goto end; 117 | } 118 | 119 | ret = rdmaConnSend(conn, hello_msg, strlen(hello_msg)); 120 | printf("rdmaConnSend success %d bytes\n", ret); 121 | if (ret != strlen(hello_msg)) 122 | { 123 | rdmaErr("rdma send msg failed"); 124 | goto end; 125 | } 126 | 127 | rdmaConnWrite(conn, hello_msg, strlen(hello_msg)); 128 | 129 | data_mr = rdmaConnRegMem(conn, strlen(hello_msg) + 1); 130 | if (!data_mr) 131 | { 132 | rdmaErr("rdma register memory failed"); 133 | goto end; 134 | } 135 | local_msg_buf = (char *)data_mr->addr; 136 | // rdmaConnRead(conn, local_msg_buf, data_mr->lkey, conn->tx_addr, conn->tx_key, strlen(hello_msg)); 137 | 138 | /* test PA MR */ 139 | #define TEST_MSG "Umich RBPF!" 
140 | // rdmaConnRead(conn, local_msg_buf, data_mr->lkey, (void *)remote_addr, conn->tx_pa_rkey, strlen(hello_msg)); 141 | memcpy((void *)local_msg_buf, TEST_MSG, strlen(TEST_MSG)); 142 | rdmaPAWriteSignaled(conn, (unsigned long)local_msg_buf, data_mr->lkey, remote_addr, strlen(hello_msg)); 143 | 144 | pthread_cond_init(&g_cond, NULL); 145 | pthread_mutex_init(&g_mutex, NULL); 146 | 147 | start_time = time_get_ns(); 148 | for (i = 0; i <= nr_outstanding_reqs; i++) 149 | { 150 | rdmaPAReadSignaled(conn, (unsigned long)local_msg_buf, data_mr->lkey, remote_addr, strlen(hello_msg)); 151 | } 152 | 153 | pthread_mutex_lock(&g_mutex); 154 | pthread_cond_wait(&g_cond, &g_mutex); 155 | pthread_mutex_unlock(&g_mutex); 156 | 157 | ret = RDMA_OK; 158 | 159 | end: 160 | if (local_msg_buf && data_mr) 161 | { 162 | sleep(1); 163 | printf("RDMA conn de-register data mr\n"); 164 | rdmaConnDeregMem(conn, data_mr); 165 | local_msg_buf = NULL; 166 | } 167 | if (conn) 168 | { 169 | rdmaConnClose(conn); 170 | } 171 | rdmaRuntimeStop(); 172 | 173 | pthread_mutex_destroy(&g_mutex); 174 | pthread_cond_destroy(&g_cond); 175 | 176 | return ret; 177 | } 178 | -------------------------------------------------------------------------------- /src/rdma_helpers.h: -------------------------------------------------------------------------------- 1 | #ifndef __RDMA_HELPERS_H_ 2 | #define __RDMA_HELPERS_H_ 3 | 4 | #include 5 | #include 6 | 7 | struct ibv_mr *rdma_exp_reg_phys_mem_range(struct ibv_pd *pd, void *buf, size_t size); 8 | 9 | struct ibv_mr *rdma_exp_reg_phys_mem_full(struct ibv_pd *pd); 10 | 11 | struct ibv_mr *rdma_reg_mem_readonly(struct ibv_pd *pd, void *buf, size_t size); 12 | 13 | struct ibv_mr *rdma_reg_mem_writeonly(struct ibv_pd *pd, void *buf, size_t size); 14 | /* with Write and Read */ 15 | struct ibv_mr *rdma_reg_mem(struct ibv_pd *pd, void *buf, size_t size); 16 | 17 | struct ibv_mr *rdma_reg_mem_atomic(struct ibv_pd *pd, void *buf, size_t size); 18 | 19 | struct ibv_mr 
*rdma_reg_mem_for_mw(struct ibv_pd *pd, void *buf, size_t size); 20 | 21 | void rdma_dereg_mem(struct ibv_mr *mr); 22 | 23 | int rdma_poll_send_comp(struct ibv_qp *qp, struct ibv_wc *wc, int num); 24 | 25 | int rdma_poll_recv_comp(struct ibv_qp *qp, struct ibv_wc *wc, int num); 26 | 27 | int rdma_post_srq_recv(struct ibv_srq *srq, uint64_t wr_id, 28 | uint64_t local_addr, uint32_t lkey, 29 | uint32_t length); 30 | 31 | int rdma_post_recv(struct ibv_qp *qp, uint64_t wr_id, 32 | uint64_t local_addr, uint32_t lkey, 33 | uint32_t length); 34 | 35 | int rdma_two_sided_send(struct ibv_qp *qp, enum ibv_wr_opcode opcode, 36 | const uint32_t max_inline_data, unsigned int send_flags, 37 | uint64_t wr_id, uint32_t imm_data, uint64_t local_addr, 38 | uint32_t lkey, uint32_t length); 39 | 40 | int rdma_one_sided_send(struct ibv_qp *qp, enum ibv_wr_opcode opcode, 41 | const uint32_t max_inline_data, unsigned int send_flags, 42 | uint64_t wr_id, uint32_t imm_data, 43 | uint64_t local_addr, uint32_t lkey, 44 | uint64_t remote_addr, uint32_t rkey, uint32_t length); 45 | 46 | int rdma_send_signaled(struct ibv_qp *qp, uint64_t wr_id, 47 | uint64_t local_addr, uint32_t length, 48 | uint32_t lkey, const uint32_t max_inline_data); 49 | 50 | int rdma_send(struct ibv_qp *qp, uint64_t wr_id, 51 | uint64_t local_addr, uint32_t length, 52 | uint32_t lkey, const uint32_t max_inline_data); 53 | 54 | int rdma_send_with_imm_signaled(struct ibv_qp *qp, uint64_t wr_id, uint32_t imm_data, 55 | uint64_t local_addr, uint32_t length, 56 | uint32_t lkey, const uint32_t max_inline_data); 57 | 58 | int rdma_send_with_imm(struct ibv_qp *qp, uint64_t wr_id, uint32_t imm_data, 59 | uint64_t local_addr, uint32_t length, 60 | uint32_t lkey, const uint32_t max_inline_data); 61 | 62 | int rdma_write_signaled(struct ibv_qp *qp, uint64_t wr_id, 63 | uint64_t local_addr, uint32_t lkey, 64 | uint64_t remote_addr, uint32_t rkey, 65 | uint32_t length, const uint32_t max_inline_data); 66 | 67 | int rdma_write(struct 
ibv_qp *qp, uint64_t wr_id, 68 | uint64_t local_addr, uint32_t lkey, 69 | uint64_t remote_addr, uint32_t rkey, 70 | uint32_t length, const uint32_t max_inline_data); 71 | 72 | /** TODO: Need to rethink this interface design */ 73 | int rdma_write_send_signaled(struct ibv_qp *qp, uint64_t wr_id, 74 | uint64_t local_addr, uint32_t lkey, 75 | uint64_t remote_addr, uint32_t rkey, uint32_t length, 76 | uint32_t payload, const uint32_t max_inline_data); 77 | 78 | /** TODO: Need to rethink this interface design */ 79 | int rdma_write_write_signaled(struct ibv_qp *qp, uint64_t wr_id, 80 | uint64_t local_addr, uint32_t lkey, 81 | uint64_t remote_addr, uint32_t rkey, uint32_t length, 82 | uint32_t payload, const uint32_t max_inline_data); 83 | 84 | int rdma_send_cas_signaled(struct ibv_qp *qp, uint64_t wr_id, 85 | uint64_t local_addr, uint32_t lkey, 86 | uint64_t remote_addr, uint32_t rkey, 87 | uint64_t expected, uint64_t swap); 88 | 89 | int rdma_write_with_imm_signaled(struct ibv_qp *qp, uint64_t wr_id, 90 | uint32_t imm_data, uint64_t local_addr, uint32_t lkey, 91 | uint64_t remote_addr, uint32_t rkey, uint32_t length, 92 | const uint32_t max_inline_data); 93 | 94 | int rdma_write_with_imm(struct ibv_qp *qp, uint64_t wr_id, 95 | uint32_t imm_data, uint64_t local_addr, uint32_t lkey, 96 | uint64_t remote_addr, uint32_t rkey, uint32_t length, 97 | const uint32_t max_inline_data); 98 | 99 | int rdma_read_signaled(struct ibv_qp *qp, uint64_t wr_id, 100 | uint64_t local_addr, uint32_t lkey, 101 | uint64_t remote_addr, uint32_t rkey, 102 | uint32_t length, const uint32_t max_inline_data); 103 | 104 | int rdma_read(struct ibv_qp *qp, uint64_t wr_id, 105 | uint64_t local_addr, uint32_t lkey, 106 | uint64_t remote_addr, uint32_t rkey, 107 | uint32_t length, const uint32_t max_inline_data); 108 | 109 | #endif // !__RDMA_HELPERS_H_ -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # SnowRDMA -- An ultral-fast and easy-to-use RDMA library 2 | 3 | *Hope that you'd be glad to add a star if you think this repo is helpful!* 4 | 5 | ## Overview 6 | 7 | A high-performance and easy-to-use RDMA library, called SnowRDMA. 8 | I call it "SnowRDMA" because I completed its main development work during 9 | a snowstorm in Ann Arbor, MI. My wife Eva just suggested "SnowRDMA" to 10 | commemorate the first snow in 2024 we experienced together. 11 | SnowRDMA provides user-friendly RDMA programming abstractions while 12 | preserving ultra-fast networking IO by integrating with advanced RDMA 13 | hardware features. 14 | 15 | 16 | ## SnowRDMA Features 17 | 18 | **The features supported by SnowRDMA include:** 19 | 20 | - Callback based asynchronous programming model. 21 | - Support event and polling driven RDMA completion model. 22 | - Support Remote Directly Physical Memory Access (PA-MR) feature. 23 | - Note that this feature needs to be enabled in the MLNX_OFED driver 24 | at boot time. Please refer to this google doc [PA MR in RDMA 25 | ](https://docs.google.com/document/d/12bsFDSS3jV7WQ7OdfP2SEaooYVrPnhxDR8b_hpwQDgc/edit?usp=sharing). 26 | - Single-thread IO model. 27 | - Support CPU affinity setting. 28 | - Support to adjust the outstanding RDMA Read/Atomic handled by RDMA NIC. 29 | - Use RDMA *max_rd_atomic* feature at a QP(Queue Pair) level, which 30 | allows us to adjust the number of outstanding RDMA Read or Atomic operations 31 | handled by RDMA NIC. 32 | - Note that we set QP's *max_rd_atomic* as the RNIC's max_qp_rd_atom by default. 33 | By this, the throughput of RDMA Read is improved from ~0.9M requests per second (RPS) 34 | to ~4.9M RPS in a testbed with Mellanox CX4 NIC. 35 | 36 | **Features that will be supported as next plans:** 37 | 38 | - [ ] Multi-thread RDMA IO model. 39 | - [ ] Adaptive event/polling switching. 
40 | - [ ] Support connection-level RDMA QoS feature. 41 | - [ ] Support enhanced atomic operations including: 42 | - Masked Compare and Swap 43 | - Masked Fetch and Add 44 | - [ ] Support XRC--- [eXtended Reliable Connected Transport Service for InfiniBand](https://docs.nvidia.com/networking/display/mlnxofedv497100lts/advanced+transport) 45 | - Significantly reduce QPs number and the associated memory resources required when 46 | establishing all-to-all process connectivity in large clusters. 47 | - [ ] Support Dynamically Connected Transport (DCT) 48 | - DCT connections only stay connected when they are active. 49 | - Smaller memory footprint, less overhead to set connections, higher 50 | on-chip cache utilization. 51 | - Note that DCT is supported only in mlx5 driver. 52 | - [ ] Support resource domain for higher data-path performance. 53 | - [ ] Support User-Mode Memory Registration (UMR) for efficiently 54 | scattering data through appropriate memory keys on the remote side. 55 | - ... 56 | 57 | 58 | 59 | 60 | ## SnowRDMA Usage 61 | 62 | SnowRDMA provides easy-to-use RDMA programming interfaces for control 63 | and data plane operations while preserving high performance networking, 64 | so that a developer without RDMA experience can easily take advantage 65 | of performance benefits by RDMA--- e.g., ultra-low latency at a 66 | sub-microsecond level, high throughput(25Gbps~800Gbps), and near-zero 67 | CPU utilization. 
This provides three types of interfaces: 68 | 69 | (1) server side control interfaces: 70 | ```c 71 | /* init RDMA server */ 72 | int rdmaServer(RdmaListener **listener, char *ip, int port); 73 | 74 | /* start RDMA server runtime */ 75 | int rdmaServerStart(RdmaListener *listener); 76 | 77 | /* stop RDMA server runtime */ 78 | int rdmaServerStop(RdmaListener *listener); 79 | 80 | /* release global RDMA server context */ 81 | void rdmaServerRelease(RdmaListener *listener); 82 | 83 | /* set AcceptCallback for RDMA server */ 84 | typedef void (*RdmaAcceptCallbackFunc)(RdmaConn *conn); 85 | int rdmaServerSetAcceptCallback(RdmaListener *listener, RdmaAcceptCallbackFunc func); 86 | 87 | ``` 88 | 89 | (2) client side control interfaces: 90 | ```c 91 | /* create a new rdma connection as endpoint */ 92 | RdmaConn *rdmaConn(void); 93 | 94 | /* try to connect remote RDMA server with serving IP and port */ 95 | int rdmaConnect(RdmaConn *conn, char *serverip, int port); 96 | 97 | /* try to close given a client rdma connection */ 98 | void rdmaConnClose(RdmaConn *conn); 99 | 100 | /* explicitly stop background rdma runtime */ 101 | void rdmaRuntimeStop(void); 102 | 103 | /* set RecvCallback for each RDMA connection */ 104 | typedef void (*RdmaRecvCallbackFunc)(RdmaConn *conn, void *data, size_t data_len); 105 | int rdmaConnSetRecvCallback(RdmaConn *conn, RdmaRecvCallbackFunc func); 106 | ``` 107 | 108 | (3) data plane interfaces: RDMA signaled by default 109 | ```c 110 | /* use RDMA WRITE WITH IMM as main send primitive */ 111 | size_t rdmaConnSend(RdmaConn *conn, void *data, size_t data_len); 112 | 113 | size_t rdmaConnSendWithImm(RdmaConn *conn, uint32_t imm_data, 114 | const void *data, size_t data_len); 115 | 116 | /* use RDMA WRITE to send data. 
Assume that data is a pre-registered RDMA MR */ 117 | size_t rdmaConnWrite(RdmaConn *conn, const void *data, size_t data_len); 118 | int rdmaConnWriteWithImm(RdmaConn *conn, uint32_t imm_data, 119 | const void *data, size_t data_len); 120 | 121 | /* use RDMA READ to recv data. Assume that data buffer is a 122 | pre-registered RDMA MR */ 123 | int rdmaConnRead(RdmaConn *conn, void *data_buf, size_t buf_len); 124 | 125 | /* RDMA blocking interfaces that require RDMA_BLOCKING mode. 126 | * Assume that remote addr is RDMA-registered before use. 127 | */ 128 | int rdmaSyncWriteSignaled(RdmaConn *conn, uint64_t local_addr, 129 | uint32_t lkey, uint64_t remote_addr, 130 | uint32_t rkey, uint32_t length); 131 | int rdmaSyncReadSignaled(RdmaConn *conn, uint64_t local_addr, 132 | uint32_t lkey, uint64_t remote_addr, 133 | uint32_t rkey, uint32_t length); 134 | 135 | /* RDMA physical memory access interfaces. */ 136 | int rdmaPAWriteSignaled(RdmaConn *conn, uint64_t local_addr, 137 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 138 | int rdmaPAReadSignaled(RdmaConn *conn, uint64_t local_addr, 139 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 140 | 141 | /* RDMA blocking interfaces require RDMA_BLOCKING mode */ 142 | int rdmaPASyncWriteSignaled(RdmaConn *conn, uint64_t local_addr, 143 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 144 | int rdmaPASyncReadSignaled(RdmaConn *conn, uint64_t local_addr, 145 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 146 | ``` 147 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/rdma.h: -------------------------------------------------------------------------------- 1 | #ifndef __RDMA_H_ 2 | #define __RDMA_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | typedef struct RdmaConn RdmaConn; 11 | 12 | /* callback funcs for RDMA connection level in Async communication (Non-Blocking) */ 13 | typedef void (*RdmaRecvCallbackFunc)(RdmaConn *conn, void *data, size_t data_len); 14 | typedef void (*RdmaWriteCallbackFunc)(RdmaConn *conn, size_t byte_len); 15 | typedef void (*RdmaReadCallbackFunc)(RdmaConn *conn, size_t byte_len); 16 | 17 | typedef void (*RdmaConnectedCallbackFunc)(RdmaConn *conn); 18 | typedef void (*RdmaDisconnectCallbackFunc)(RdmaConn *conn); 19 | typedef void (*RdmaAcceptCallbackFunc)(RdmaConn *conn); 20 | 21 | typedef enum RdmaCmdType 22 | { 23 | REG_LOCAL_ADDR, /* register local addr */ 24 | REG_PHYS_ADDR, /* register physical mem */ 25 | CONN_GOODBYE, /* disconnect cmd between server and client */ 26 | } RdmaCmdType; 27 | 28 | typedef struct RdmaCmd 29 | { 30 | uint8_t 
magic; 31 | uint8_t version; 32 | uint8_t cmd_opcode; 33 | uint8_t rsvd[13]; 34 | uint64_t addr; 35 | uint32_t length; 36 | uint32_t key; 37 | } RdmaCmd; 38 | 39 | #define RDMA_MAX_SGE 1024 40 | #define RDMA_DEFAULT_RX_LEN (1024 * 1024) 41 | #define RDMA_CMD_MAGIC 'R' 42 | 43 | /* Error codes */ 44 | #define RDMA_OK 0 45 | #define RDMA_ERR -1 46 | 47 | extern int rdmaListenBacklog; 48 | extern int rdmaTimeoutms; 49 | extern int rdmaPollEventTimeoutms; 50 | extern int rdmaMaxInlineData; /* max inline data enabled by RNIC, 51 | 0 indicate no inline optimization. */ 52 | extern int rdmaRetryCount; 53 | extern int rdmaRnrRetryCount; 54 | extern int rdmaMaxConcurrentWorkRequests; /* used to handle a batch of CQEs */ 55 | extern int rdmaRecvDepth; 56 | /* Number of outstanding RDMA Reads or Atomic reqs used by RDMA NIC. 57 | * By default, set QP's max_rd_atomic as the RNIC's max_qp_rd_atom. 58 | * Note that max_rd_atomic is a crucial QP attribute for performance, 59 | * it is the number of RDMA Reads & atomic operations outstanding at 60 | * any time that can be handled by a RC QP as an initiator. 61 | */ 62 | extern int rdmaMaxOutstandingRdAtomic; 63 | 64 | extern int rdmaQp2CqMode; /* One QP to One CQ by default */ 65 | extern int rdmaCommMode; /* Blocking by default */ 66 | extern bool rdmaEnablePhysAddrAccess; /* disable by default */ 67 | extern int rdmaIoAffinityCpuId; /* -1 by default */ 68 | 69 | typedef enum 70 | { 71 | ONE_TO_ONE = 0, /* One QP is mapped to one CQ */ 72 | MANY_TO_ONE, /* Many QP is mapped to one CQ */ 73 | } RdmaQpCqMapping; 74 | 75 | /* RDMA communication mode: SYNC (Blocking), ASYNC (Non-Blocking) */ 76 | typedef enum 77 | { 78 | RDMA_BLOCKING = 0, 79 | RDMA_NON_BLOCKING, 80 | } RdmaCommMode; 81 | 82 | typedef struct RdmaOptions 83 | { 84 | 85 | /* set the number of backlog for RDMA Listener when rdma_listen(). 
86 | * Default: 128 87 | */ 88 | int rdma_listen_backlog; 89 | 90 | /** set timeout (ms) value for rdma_resolve_addr() and rdma_resolve_route(). 91 | * Default: 1000 ms 92 | */ 93 | int rdma_timeoutms; 94 | 95 | /** set the timeout (ms) value for poll() when polling RDMA cm event channel 96 | * and completion event channel. 97 | * Default: 10 ms 98 | */ 99 | int rdma_poll_event_timeoutms; 100 | 101 | /** the max inline data size enabled by RNIC. 0 indicate no inline optimization. 102 | * Default: 0 103 | */ 104 | int rdma_max_inline_data; 105 | 106 | /** set the retry times for rdma_connect() and rdma_listen(). 107 | * Default: 7 108 | */ 109 | int rdma_retry_count; 110 | 111 | /** set the maximum number of times that a send operation from the remote peer 112 | * should be retried on a connection after receiving a receiver not ready (RNR) 113 | * error. RNR errors are generated when a send request arrives before a buffer 114 | * has been posted to receive the incoming data. Applies only to RDMA_PS_TCP. 115 | * Default: 7 116 | */ 117 | int rdma_rnr_retry_count; 118 | 119 | /** set the maximum number of concurrent work requests for one ibv_poll_cq() 120 | * invocation. This can be used to handle a batch of CQEs for a better 121 | * throughput. 122 | * Default: 128 + 2048 * 2 123 | */ 124 | int rdma_max_concurrent_work_requests; 125 | 126 | /** set the recv depth of RDMA recv buffers for ibv_post_recv() in two-sided 127 | * messaging verbs. 128 | * Default: 1024 129 | */ 130 | int rdma_recv_depth; 131 | 132 | /** set the mode of RDMA QP instances to CQ instance mapping relationship. 133 | * Default: MANY_TO_ONE 134 | */ 135 | RdmaQpCqMapping rdma_qp2cq_mode; 136 | 137 | /** set the mode of RDMA communication: SYNC (Blocking), ASYNC (Non-Blocking) 138 | * Default: RDMA_NON_BLOCKING 139 | */ 140 | RdmaCommMode rdma_comm_mode; 141 | 142 | /** set whehther enable Remote Direct Physical Memory Access (RDPMA). 
143 | * Default: false 144 | */ 145 | bool rdma_enable_phys_addr_access; 146 | 147 | /** set the number of outstanding RDMA Read and Atomic operations handled by RDMA NIC. 148 | * Default: RNIC's max_qp_rd_atom 149 | */ 150 | int rdma_max_outstanding_rd_atomic; 151 | 152 | /** set pinned cpu id (better CPU affinity) for RDMA IO thread. 153 | * Default: pin current CPU core if no pinned CPU id is specified 154 | */ 155 | int rdma_io_affinity_cpuid; 156 | 157 | RdmaRecvCallbackFunc recv_callback; 158 | RdmaWriteCallbackFunc write_callback; /* for RDMA_WRITE */ 159 | RdmaReadCallbackFunc read_callback; /* for RDMA_READ */ 160 | RdmaConnectedCallbackFunc connected_callback; 161 | RdmaDisconnectCallbackFunc disconnect_callback; 162 | RdmaAcceptCallbackFunc accept_callback; 163 | } RdmaOptions; 164 | 165 | typedef RdmaOptions RdmaServerOptions; 166 | typedef RdmaOptions RdmaConnOptions; 167 | 168 | typedef enum 169 | { 170 | RDMA_CONN_STATE_NONE = 0, 171 | RDMA_CONN_STATE_CONNECTING, 172 | RDMA_CONN_STATE_ACCEPTING, 173 | RDMA_CONN_STATE_CONNECTED, 174 | RDMA_CONN_STATE_MR_READY, 175 | RDMA_CONN_STATE_CLOSED, 176 | RDMA_CONN_STATE_ERROR, 177 | } RdmaConnState; 178 | 179 | typedef enum 180 | { 181 | ACCEPTED_CONN = 0, /* server side accept */ 182 | CONNECTED_CONN, /* client side connect */ 183 | } ConnectionType; 184 | 185 | /* used to describe the type of Work Request Context for different RDMA opcodes */ 186 | typedef enum 187 | { 188 | RECV_CONTEXT, 189 | WRITE_CONTEXT, 190 | SEND_CONTEXT, 191 | } RdmaReqCtxType; 192 | 193 | typedef struct RdmaWrCtx 194 | { 195 | RdmaReqCtxType type; 196 | void *rdma_conn; /* RdmaConn in this case */ 197 | void *private_data; /* For example, RdmaCmd context in IBV_WC_RECV */ 198 | } RdmaWrCtx; 199 | 200 | struct RdmaConn 201 | { 202 | struct rdma_cm_id *cm_id; 203 | int last_errno; 204 | RdmaConnState state; 205 | ConnectionType type; 206 | 207 | char *ip; 208 | int port; 209 | struct ibv_pd *pd; 210 | struct rdma_event_channel 
*cm_channel; 211 | struct ibv_comp_channel *comp_channel; 212 | struct ibv_cq *cq; 213 | uint32_t max_inline_data; 214 | 215 | /* TX */ 216 | char *tx_addr; /* remote side */ 217 | uint32_t tx_length; 218 | uint32_t tx_offset; 219 | uint32_t tx_key; 220 | char *send_buf; /* local side */ 221 | uint32_t send_length; 222 | uint32_t send_offset; 223 | uint32_t send_ops; 224 | struct ibv_mr *send_mr; 225 | 226 | /* RX */ 227 | uint32_t rx_offset; 228 | char *recv_buf; 229 | unsigned int recv_length; 230 | unsigned int recv_offset; 231 | struct ibv_mr *recv_mr; 232 | 233 | /* Physical memory TX mr over RDMA. 234 | * Note that when register full physical memory, 235 | * the phys addr is NULL and the MR length is 0. */ 236 | uint32_t tx_pa_rkey; /* remote key */ 237 | char *tx_pa_addr; /* remote physical memory */ 238 | unsigned int tx_pa_length; 239 | unsigned int tx_pa_offset; 240 | 241 | /* CMD 0 ~ RDMA_MAX_SGE for recv buffer 242 | * RDMA_MAX_SGE ~ 2 * RDMA_MAX_SGE - 1 for send buffer 243 | */ 244 | RdmaCmd *cmd_buf; 245 | struct ibv_mr *cmd_mr; 246 | RdmaWrCtx *rx_ctx; 247 | RdmaWrCtx *tx_ctx; 248 | 249 | pthread_cond_t status_cond; 250 | pthread_mutex_t status_mutex; 251 | 252 | RdmaConnOptions options; 253 | 254 | /* callbacks for control and data plane */ 255 | RdmaRecvCallbackFunc recv_callback; 256 | RdmaWriteCallbackFunc write_callback; 257 | RdmaReadCallbackFunc read_callback; 258 | RdmaConnectedCallbackFunc connected_callback; 259 | RdmaDisconnectCallbackFunc disconnect_callback; 260 | }; 261 | 262 | typedef struct RdmaListener 263 | { 264 | struct rdma_cm_id *cm_id; 265 | struct rdma_event_channel *cm_channel; 266 | RdmaServerOptions options; 267 | 268 | /* callbacks for server-side control plane */ 269 | RdmaAcceptCallbackFunc accept_callback; 270 | } RdmaListener; 271 | 272 | /* common RDMA interfaces/handlers */ 273 | 274 | /* RDMA server side interfaces */ 275 | int rdmaServer(RdmaListener **listener, const char *ip, 276 | const int port, const 
RdmaServerOptions *opt); 277 | int rdmaServerStart(RdmaListener *listener); 278 | int rdmaServerStop(RdmaListener *listener); 279 | void rdmaServerRelease(RdmaListener *listener); 280 | int rdmaServerSetAcceptCallback(RdmaListener *listener, RdmaAcceptCallbackFunc func); 281 | 282 | /* RDMA client side interfaces */ 283 | RdmaConn *rdmaConn(const RdmaServerOptions *opt); 284 | int rdmaConnect(RdmaConn *conn, char *serverip, int port); 285 | void rdmaConnClose(RdmaConn *conn); 286 | int rdmaConnSetRecvCallback(RdmaConn *conn, RdmaRecvCallbackFunc func); 287 | int rdmaConnSetWriteCallback(RdmaConn *conn, RdmaWriteCallbackFunc func); 288 | int rdmaConnSetReadCallback(RdmaConn *conn, RdmaReadCallbackFunc func); 289 | int rdmaConnSetConnectedCallback(RdmaConn *conn, RdmaConnectedCallbackFunc func); 290 | int rdmaConnSetDisconnectCallback(RdmaConn *conn, RdmaDisconnectCallbackFunc func); 291 | 292 | void rdmaRuntimeStop(void); 293 | 294 | /* MR related management interfaces */ 295 | struct ibv_mr *rdmaConnRegMem(RdmaConn *conn, size_t size); 296 | void rdmaConnDeregMem(RdmaConn *conn, struct ibv_mr *mr); 297 | 298 | /* data plane interfaces. Signaled by default. */ 299 | size_t rdmaConnSend(RdmaConn *conn, void *data, size_t data_len); 300 | /* size_t rdmaConnSendWithImm(RdmaConn *conn, uint32_t imm_data, const void *data, size_t data_len); */ 301 | size_t rdmaConnWrite(RdmaConn *conn, const void *data, size_t data_len); 302 | int rdmaConnWriteWithImm(RdmaConn *conn, uint32_t imm_data, 303 | const void *data, size_t data_len); 304 | 305 | /* This uses RDMA READ verb. Assume that local buf and remote buf are all registered by RDMA. 306 | * Current connection needs to know and exchange remote buf and rkey (REMOTE_RDMA_READ allowed) 307 | * before invocation. 308 | */ 309 | int rdmaConnRead(RdmaConn *conn, void *local_buf, uint32_t lkey, 310 | void *remote_buf, uint32_t rkey, size_t length); 311 | 312 | /* RDMA blocking interfaces that require RDMA_BLOCKING mode. 
313 | * Assume that remote addr is RDMA-registered before use. 314 | */ 315 | int rdmaSyncWriteSignaled(RdmaConn *conn, uint64_t local_addr, 316 | uint32_t lkey, uint64_t remote_addr, 317 | uint32_t rkey, uint32_t length); 318 | int rdmaSyncReadSignaled(RdmaConn *conn, uint64_t local_addr, 319 | uint32_t lkey, uint64_t remote_addr, 320 | uint32_t rkey, uint32_t length); 321 | 322 | /* RDMA physical memory access interfaces. */ 323 | int rdmaPAWriteSignaled(RdmaConn *conn, uint64_t local_addr, 324 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 325 | int rdmaPAReadSignaled(RdmaConn *conn, uint64_t local_addr, 326 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 327 | 328 | /* RDMA blocking interfaces require RDMA_BLOCKING mode */ 329 | int rdmaPASyncWriteSignaled(RdmaConn *conn, uint64_t local_addr, 330 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 331 | int rdmaPASyncReadSignaled(RdmaConn *conn, uint64_t local_addr, 332 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 333 | 334 | /* RDMA tracing and debug helpers */ 335 | 336 | #ifdef NDEBUG 337 | #define rdmaDebug(fmt, ...) 338 | #else 339 | #define rdmaDebug(fmt, ...) \ 340 | fprintf(stdout, "[DEBUG] %s:%d " fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__) 341 | #endif 342 | 343 | #define clean_errno() (errno == 0 ? "None" : strerror(errno)) 344 | 345 | #define rdmaInfo(fmt, ...) \ 346 | fprintf(stdout, "[INFO] %s:%d " fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__) 347 | #define rdmaWarn(fmt, ...) \ 348 | fprintf(stdout, "[WARN] (%s:%d: errno: %s) " fmt "\n", __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) 349 | #define rdmaErr(fmt, ...) 
\ 350 | fprintf(stderr, "[ERROR] (%s:%d: errno: %s) " fmt "\n", __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) 351 | 352 | #endif // !__RDMA_H_ -------------------------------------------------------------------------------- /include/rdma.h: -------------------------------------------------------------------------------- 1 | #ifndef __RDMA_H_ 2 | #define __RDMA_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | typedef struct RdmaConn RdmaConn; 11 | 12 | /* callback funcs for RDMA connection level in Async communication (Non-Blocking) */ 13 | typedef void (*RdmaRecvCallbackFunc)(RdmaConn *conn, void *data, size_t data_len); 14 | typedef void (*RdmaWriteCallbackFunc)(RdmaConn *conn, size_t byte_len); 15 | typedef void (*RdmaReadCallbackFunc)(RdmaConn *conn, size_t byte_len); 16 | 17 | typedef void (*RdmaConnectedCallbackFunc)(RdmaConn *conn); 18 | typedef void (*RdmaDisconnectCallbackFunc)(RdmaConn *conn); 19 | typedef void (*RdmaAcceptCallbackFunc)(RdmaConn *conn); 20 | 21 | typedef enum RdmaCmdType 22 | { 23 | REG_LOCAL_ADDR, /* register local addr */ 24 | REG_PHYS_ADDR, /* register physical mem */ 25 | CONN_GOODBYE, /* disconnect cmd between server and client */ 26 | } RdmaCmdType; 27 | 28 | typedef struct RdmaCmd 29 | { 30 | uint8_t magic; 31 | uint8_t version; 32 | uint8_t cmd_opcode; 33 | uint8_t rsvd[13]; 34 | uint64_t addr; 35 | uint32_t length; 36 | uint32_t key; 37 | } RdmaCmd; 38 | 39 | #define RDMA_MAX_SGE 1024 40 | #define RDMA_DEFAULT_RX_LEN (1024 * 1024) 41 | #define RDMA_CMD_MAGIC 'R' 42 | 43 | /* Error codes */ 44 | #define RDMA_OK 0 45 | #define RDMA_ERR -1 46 | 47 | extern int rdmaListenBacklog; 48 | extern int rdmaTimeoutms; 49 | extern int rdmaPollEventTimeoutms; 50 | extern int rdmaMaxInlineData; /* max inline data enabled by RNIC, 51 | 0 indicate no inline optimization. 
*/ 52 | extern int rdmaRetryCount; 53 | extern int rdmaRnrRetryCount; 54 | extern int rdmaMaxConcurrentWorkRequests; /* used to handle a batch of CQEs */ 55 | extern int rdmaRecvDepth; 56 | /* Number of outstanding RDMA Reads or Atomic reqs used by RDMA NIC. 57 | * By default, set QP's max_rd_atomic as the RNIC's max_qp_rd_atom. 58 | * Note that max_rd_atomic is a crucial QP attribute for performance, 59 | * it is the number of RDMA Reads & atomic operations outstanding at 60 | * any time that can be handled by a RC QP as an initiator. 61 | */ 62 | extern int rdmaMaxOutstandingRdAtomic; 63 | 64 | extern int rdmaQp2CqMode; /* One QP to One CQ by default */ 65 | extern int rdmaCommMode; /* Blocking by default */ 66 | extern bool rdmaEnablePhysAddrAccess; /* disable by default */ 67 | extern int rdmaIoAffinityCpuId; /* -1 by default */ 68 | 69 | typedef enum 70 | { 71 | ONE_TO_ONE = 0, /* One QP is mapped to one CQ */ 72 | MANY_TO_ONE, /* Many QP is mapped to one CQ */ 73 | } RdmaQpCqMapping; 74 | 75 | /* RDMA communication mode: SYNC (Blocking), ASYNC (Non-Blocking) */ 76 | typedef enum 77 | { 78 | RDMA_BLOCKING = 0, 79 | RDMA_NON_BLOCKING, 80 | } RdmaCommMode; 81 | 82 | typedef struct RdmaOptions 83 | { 84 | 85 | /* set the number of backlog for RDMA Listener when rdma_listen(). 86 | * Default: 128 87 | */ 88 | int rdma_listen_backlog; 89 | 90 | /** set timeout (ms) value for rdma_resolve_addr() and rdma_resolve_route(). 91 | * Default: 1000 ms 92 | */ 93 | int rdma_timeoutms; 94 | 95 | /** set the timeout (ms) value for poll() when polling RDMA cm event channel 96 | * and completion event channel. 97 | * Default: 10 ms 98 | */ 99 | int rdma_poll_event_timeoutms; 100 | 101 | /** the max inline data size enabled by RNIC. 0 indicate no inline optimization. 102 | * Default: 0 103 | */ 104 | int rdma_max_inline_data; 105 | 106 | /** set the retry times for rdma_connect() and rdma_listen(). 
107 | * Default: 7 108 | */ 109 | int rdma_retry_count; 110 | 111 | /** set the maximum number of times that a send operation from the remote peer 112 | * should be retried on a connection after receiving a receiver not ready (RNR) 113 | * error. RNR errors are generated when a send request arrives before a buffer 114 | * has been posted to receive the incoming data. Applies only to RDMA_PS_TCP. 115 | * Default: 7 116 | */ 117 | int rdma_rnr_retry_count; 118 | 119 | /** set the maximum number of concurrent work requests for one ibv_poll_cq() 120 | * invocation. This can be used to handle a batch of CQEs for a better 121 | * throughput. 122 | * Default: 128 + 2048 * 2 123 | */ 124 | int rdma_max_concurrent_work_requests; 125 | 126 | /** set the recv depth of RDMA recv buffers for ibv_post_recv() in two-sided 127 | * messaging verbs. 128 | * Default: 1024 129 | */ 130 | int rdma_recv_depth; 131 | 132 | /** set the mode of RDMA QP instances to CQ instance mapping relationship. 133 | * Default: MANY_TO_ONE 134 | */ 135 | RdmaQpCqMapping rdma_qp2cq_mode; 136 | 137 | /** set the mode of RDMA communication: SYNC (Blocking), ASYNC (Non-Blocking) 138 | * Default: RDMA_NON_BLOCKING 139 | */ 140 | RdmaCommMode rdma_comm_mode; 141 | 142 | /** set whehther enable Remote Direct Physical Memory Access (RDPMA). 143 | * Default: false 144 | */ 145 | bool rdma_enable_phys_addr_access; 146 | 147 | /** set the number of outstanding RDMA Read and Atomic operations handled by RDMA NIC. 148 | * Default: RNIC's max_qp_rd_atom 149 | */ 150 | int rdma_max_outstanding_rd_atomic; 151 | 152 | /** set pinned cpu id (better CPU affinity) for RDMA IO thread. 
153 | * Default: pin current CPU core if no pinned CPU id is specified 154 | */ 155 | int rdma_io_affinity_cpuid; 156 | 157 | RdmaRecvCallbackFunc recv_callback; 158 | RdmaWriteCallbackFunc write_callback; /* for RDMA_WRITE */ 159 | RdmaReadCallbackFunc read_callback; /* for RDMA_READ */ 160 | RdmaConnectedCallbackFunc connected_callback; 161 | RdmaDisconnectCallbackFunc disconnect_callback; 162 | RdmaAcceptCallbackFunc accept_callback; 163 | } RdmaOptions; 164 | 165 | typedef RdmaOptions RdmaServerOptions; 166 | typedef RdmaOptions RdmaConnOptions; 167 | 168 | typedef enum 169 | { 170 | RDMA_CONN_STATE_NONE = 0, 171 | RDMA_CONN_STATE_CONNECTING, 172 | RDMA_CONN_STATE_ACCEPTING, 173 | RDMA_CONN_STATE_CONNECTED, 174 | RDMA_CONN_STATE_MR_READY, 175 | RDMA_CONN_STATE_CLOSED, 176 | RDMA_CONN_STATE_ERROR, 177 | } RdmaConnState; 178 | 179 | typedef enum 180 | { 181 | ACCEPTED_CONN = 0, /* server side accept */ 182 | CONNECTED_CONN, /* client side connect */ 183 | } ConnectionType; 184 | 185 | /* used to describe the type of Work Request Context for different RDMA opcodes */ 186 | typedef enum 187 | { 188 | RECV_CONTEXT, 189 | WRITE_CONTEXT, 190 | SEND_CONTEXT, 191 | } RdmaReqCtxType; 192 | 193 | typedef struct RdmaWrCtx 194 | { 195 | RdmaReqCtxType type; 196 | void *rdma_conn; /* RdmaConn in this case */ 197 | void *private_data; /* For example, RdmaCmd context in IBV_WC_RECV */ 198 | } RdmaWrCtx; 199 | 200 | struct RdmaConn 201 | { 202 | struct rdma_cm_id *cm_id; 203 | int last_errno; 204 | RdmaConnState state; 205 | ConnectionType type; 206 | 207 | char *ip; 208 | int port; 209 | struct ibv_pd *pd; 210 | struct rdma_event_channel *cm_channel; 211 | struct ibv_comp_channel *comp_channel; 212 | struct ibv_cq *cq; 213 | uint32_t max_inline_data; 214 | 215 | /* TX */ 216 | char *tx_addr; /* remote side */ 217 | uint32_t tx_length; 218 | uint32_t tx_offset; 219 | uint32_t tx_key; 220 | char *send_buf; /* local side */ 221 | uint32_t send_length; 222 | uint32_t send_offset; 
223 | uint32_t send_ops; 224 | struct ibv_mr *send_mr; 225 | 226 | /* RX */ 227 | uint32_t rx_offset; 228 | char *recv_buf; 229 | unsigned int recv_length; 230 | unsigned int recv_offset; 231 | struct ibv_mr *recv_mr; 232 | 233 | /* Physical memory TX mr over RDMA. 234 | * Note that when register full physical memory, 235 | * the phys addr is NULL and the MR length is 0. */ 236 | uint32_t tx_pa_rkey; /* remote key */ 237 | char *tx_pa_addr; /* remote physical memory */ 238 | unsigned int tx_pa_length; 239 | unsigned int tx_pa_offset; 240 | 241 | /* CMD 0 ~ RDMA_MAX_SGE for recv buffer 242 | * RDMA_MAX_SGE ~ 2 * RDMA_MAX_SGE - 1 for send buffer 243 | */ 244 | RdmaCmd *cmd_buf; 245 | struct ibv_mr *cmd_mr; 246 | RdmaWrCtx *rx_ctx; 247 | RdmaWrCtx *tx_ctx; 248 | 249 | pthread_cond_t status_cond; 250 | pthread_mutex_t status_mutex; 251 | 252 | RdmaConnOptions options; 253 | 254 | /* callbacks for control and data plane */ 255 | RdmaRecvCallbackFunc recv_callback; 256 | RdmaWriteCallbackFunc write_callback; 257 | RdmaReadCallbackFunc read_callback; 258 | RdmaConnectedCallbackFunc connected_callback; 259 | RdmaDisconnectCallbackFunc disconnect_callback; 260 | }; 261 | 262 | typedef struct RdmaListener 263 | { 264 | struct rdma_cm_id *cm_id; 265 | struct rdma_event_channel *cm_channel; 266 | RdmaServerOptions options; 267 | 268 | /* callbacks for server-side control plane */ 269 | RdmaAcceptCallbackFunc accept_callback; 270 | } RdmaListener; 271 | 272 | /* common RDMA interfaces/handlers */ 273 | 274 | /* RDMA server side interfaces */ 275 | int rdmaServer(RdmaListener **listener, const char *ip, 276 | const int port, const RdmaServerOptions *opt); 277 | int rdmaServerStart(RdmaListener *listener); 278 | int rdmaServerStop(RdmaListener *listener); 279 | void rdmaServerRelease(RdmaListener *listener); 280 | int rdmaServerSetAcceptCallback(RdmaListener *listener, RdmaAcceptCallbackFunc func); 281 | 282 | /* RDMA client side interfaces */ 283 | RdmaConn *rdmaConn(const 
RdmaServerOptions *opt); 284 | int rdmaConnect(RdmaConn *conn, char *serverip, int port); 285 | void rdmaConnClose(RdmaConn *conn); 286 | int rdmaConnSetRecvCallback(RdmaConn *conn, RdmaRecvCallbackFunc func); 287 | int rdmaConnSetWriteCallback(RdmaConn *conn, RdmaWriteCallbackFunc func); 288 | int rdmaConnSetReadCallback(RdmaConn *conn, RdmaReadCallbackFunc func); 289 | int rdmaConnSetConnectedCallback(RdmaConn *conn, RdmaConnectedCallbackFunc func); 290 | int rdmaConnSetDisconnectCallback(RdmaConn *conn, RdmaDisconnectCallbackFunc func); 291 | 292 | void rdmaRuntimeStop(void); 293 | 294 | /* MR related management interfaces */ 295 | struct ibv_mr *rdmaConnRegMem(RdmaConn *conn, size_t size); 296 | void rdmaConnDeregMem(RdmaConn *conn, struct ibv_mr *mr); 297 | 298 | /* data plane interfaces. Signaled by default. */ 299 | size_t rdmaConnSend(RdmaConn *conn, void *data, size_t data_len); 300 | /* size_t rdmaConnSendWithImm(RdmaConn *conn, uint32_t imm_data, const void *data, size_t data_len); */ 301 | size_t rdmaConnWrite(RdmaConn *conn, const void *data, size_t data_len); 302 | int rdmaConnWriteWithImm(RdmaConn *conn, uint32_t imm_data, 303 | const void *data, size_t data_len); 304 | 305 | /* This uses RDMA READ verb. Assume that local buf and remote buf are all registered by RDMA. 306 | * Current connection needs to know and exchange remote buf and rkey (REMOTE_RDMA_READ allowed) 307 | * before invocation. 308 | */ 309 | int rdmaConnRead(RdmaConn *conn, void *local_buf, uint32_t lkey, 310 | void *remote_buf, uint32_t rkey, size_t length); 311 | 312 | /* RDMA blocking interfaces that require RDMA_BLOCKING mode. 313 | * Assume that remote addr is RDMA-registered before use. 
314 | */ 315 | int rdmaSyncWriteSignaled(RdmaConn *conn, uint64_t local_addr, 316 | uint32_t lkey, uint64_t remote_addr, 317 | uint32_t rkey, uint32_t length); 318 | int rdmaSyncReadSignaled(RdmaConn *conn, uint64_t local_addr, 319 | uint32_t lkey, uint64_t remote_addr, 320 | uint32_t rkey, uint32_t length); 321 | 322 | /* RDMA physical memory access interfaces. */ 323 | int rdmaPAWriteSignaled(RdmaConn *conn, uint64_t local_addr, 324 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 325 | int rdmaPAReadSignaled(RdmaConn *conn, uint64_t local_addr, 326 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 327 | 328 | /* RDMA blocking interfaces require RDMA_BLOCKING mode */ 329 | int rdmaPASyncWriteSignaled(RdmaConn *conn, uint64_t local_addr, 330 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 331 | int rdmaPASyncReadSignaled(RdmaConn *conn, uint64_t local_addr, 332 | uint32_t lkey, uint64_t remote_addr, uint32_t length); 333 | 334 | /* RDMA tracing and debug helpers */ 335 | 336 | #ifdef NDEBUG 337 | #define rdmaDebug(fmt, ...) 338 | #else 339 | #define rdmaDebug(fmt, ...) \ 340 | fprintf(stdout, "[DEBUG] %s:%d " fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__) 341 | #endif 342 | 343 | #define clean_errno() (errno == 0 ? "None" : strerror(errno)) 344 | 345 | #define rdmaInfo(fmt, ...) \ 346 | fprintf(stdout, "[INFO] %s:%d " fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__) 347 | #define rdmaWarn(fmt, ...) \ 348 | fprintf(stdout, "[WARN] (%s:%d: errno: %s) " fmt "\n", __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) 349 | #define rdmaErr(fmt, ...) 
\ 350 | fprintf(stderr, "[ERROR] (%s:%d: errno: %s) " fmt "\n", __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) 351 | 352 | #endif // !__RDMA_H_ -------------------------------------------------------------------------------- /src/rdma_helpers.c: -------------------------------------------------------------------------------- 1 | #include "rdma_helpers.h" 2 | 3 | struct ibv_mr *rdma_exp_reg_phys_mem_range(struct ibv_pd *pd, void *buf, size_t size) 4 | { 5 | struct ibv_exp_reg_mr_in in = {0}; 6 | in.pd = pd; 7 | in.addr = buf; 8 | in.length = size; 9 | in.exp_access = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | 10 | IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_LOCAL_WRITE | 11 | IBV_EXP_ACCESS_PHYSICAL_ADDR; 12 | return ibv_exp_reg_mr(&in); 13 | } 14 | 15 | struct ibv_mr *rdma_exp_reg_phys_mem_full(struct ibv_pd *pd) 16 | { 17 | struct ibv_exp_reg_mr_in in = {0}; 18 | in.pd = pd; 19 | in.addr = NULL; 20 | in.length = 0; 21 | in.exp_access = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | 22 | IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_LOCAL_WRITE | 23 | IBV_EXP_ACCESS_PHYSICAL_ADDR; 24 | return ibv_exp_reg_mr(&in); 25 | } 26 | 27 | struct ibv_mr *rdma_reg_mem_readonly(struct ibv_pd *pd, void *buf, size_t size) 28 | { 29 | return ibv_reg_mr(pd, buf, size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ); 30 | } 31 | 32 | struct ibv_mr *rdma_reg_mem_writeonly(struct ibv_pd *pd, void *buf, size_t size) 33 | { 34 | return ibv_reg_mr(pd, buf, size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); 35 | } 36 | 37 | struct ibv_mr *rdma_reg_mem(struct ibv_pd *pd, void *buf, size_t size) 38 | { 39 | return ibv_reg_mr(pd, buf, size, 40 | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | 41 | IBV_ACCESS_REMOTE_WRITE); 42 | } 43 | 44 | struct ibv_mr *rdma_reg_mem_atomic(struct ibv_pd *pd, void *buf, size_t size) 45 | { 46 | return ibv_reg_mr(pd, buf, size, 47 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 48 | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC); 49 | } 50 
| 51 | struct ibv_mr *rdma_reg_mem_for_mw(struct ibv_pd *pd, void *buf, size_t size) 52 | { 53 | return ibv_reg_mr(pd, buf, size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_MW_BIND); 54 | } 55 | 56 | void rdma_dereg_mem(struct ibv_mr *mr) 57 | { 58 | ibv_dereg_mr(mr); 59 | } 60 | 61 | int rdma_poll_send_comp(struct ibv_qp *qp, struct ibv_wc *wc, int num) 62 | { 63 | return ibv_poll_cq(qp->send_cq, num, wc); 64 | } 65 | 66 | int rdma_poll_recv_comp(struct ibv_qp *qp, struct ibv_wc *wc, int num) 67 | { 68 | return ibv_poll_cq(qp->recv_cq, num, wc); 69 | } 70 | 71 | int rdma_post_srq_recv(struct ibv_srq *srq, uint64_t wr_id, 72 | uint64_t local_addr, uint32_t lkey, 73 | uint32_t length) 74 | { 75 | struct ibv_sge sge; 76 | struct ibv_recv_wr wr, *bad; 77 | 78 | sge.addr = local_addr; 79 | sge.length = length; 80 | sge.lkey = lkey; 81 | 82 | wr.wr_id = wr_id; 83 | wr.next = NULL; 84 | wr.sg_list = &sge; 85 | wr.num_sge = 1; 86 | 87 | return ibv_post_srq_recv(srq, &wr, &bad); 88 | } 89 | 90 | int rdma_post_recv(struct ibv_qp *qp, uint64_t wr_id, 91 | uint64_t local_addr, uint32_t lkey, 92 | uint32_t length) 93 | { 94 | struct ibv_sge sge; 95 | struct ibv_recv_wr wr, *bad; 96 | 97 | sge.addr = local_addr; 98 | sge.length = length; 99 | sge.lkey = lkey; 100 | 101 | wr.wr_id = wr_id; 102 | wr.next = NULL; 103 | wr.sg_list = &sge; 104 | wr.num_sge = 1; 105 | 106 | return ibv_post_recv(qp, &wr, &bad); 107 | } 108 | 109 | int rdma_two_sided_send(struct ibv_qp *qp, enum ibv_wr_opcode opcode, 110 | const uint32_t max_inline_data, unsigned int send_flags, 111 | uint64_t wr_id, uint32_t imm_data, uint64_t local_addr, 112 | uint32_t lkey, uint32_t length) 113 | { 114 | 115 | struct ibv_sge sge; 116 | struct ibv_send_wr wr, *bad; 117 | 118 | if (length != 0 && length <= max_inline_data) 119 | { 120 | send_flags |= IBV_SEND_INLINE; 121 | } 122 | 123 | sge.addr = local_addr; 124 | sge.length = length; 125 | sge.lkey = lkey; 126 | 127 | wr.wr_id = wr_id; 128 | wr.next = NULL; 129 | 
wr.sg_list = &sge; 130 | wr.num_sge = 1; 131 | wr.opcode = opcode; 132 | 133 | wr.send_flags = send_flags; 134 | wr.imm_data = imm_data; 135 | 136 | return ibv_post_send(qp, &wr, &bad); 137 | } 138 | 139 | int rdma_one_sided_send(struct ibv_qp *qp, enum ibv_wr_opcode opcode, 140 | const uint32_t max_inline_data, unsigned int send_flags, 141 | uint64_t wr_id, uint32_t imm_data, 142 | uint64_t local_addr, uint32_t lkey, 143 | uint64_t remote_addr, uint32_t rkey, uint32_t length) 144 | { 145 | struct ibv_sge sge; 146 | struct ibv_send_wr wr, *bad; 147 | 148 | if (length != 0 && length <= max_inline_data) 149 | { 150 | send_flags |= IBV_SEND_INLINE; 151 | } 152 | 153 | sge.addr = local_addr; 154 | sge.length = length; 155 | sge.lkey = lkey; 156 | 157 | wr.wr_id = wr_id; 158 | wr.next = NULL; 159 | wr.sg_list = &sge; 160 | wr.num_sge = 1; 161 | wr.opcode = opcode; 162 | 163 | wr.send_flags = send_flags; 164 | wr.imm_data = imm_data; 165 | 166 | wr.wr.rdma.remote_addr = remote_addr; 167 | wr.wr.rdma.rkey = rkey; 168 | 169 | return ibv_post_send(qp, &wr, &bad); 170 | } 171 | 172 | int rdma_send_signaled(struct ibv_qp *qp, uint64_t wr_id, 173 | uint64_t local_addr, uint32_t length, 174 | uint32_t lkey, const uint32_t max_inline_data) 175 | { 176 | return rdma_two_sided_send(qp, IBV_WR_SEND, max_inline_data, 177 | IBV_SEND_SIGNALED, wr_id, 0, 178 | local_addr, lkey, length); 179 | } 180 | 181 | int rdma_send(struct ibv_qp *qp, uint64_t wr_id, 182 | uint64_t local_addr, uint32_t length, 183 | uint32_t lkey, const uint32_t max_inline_data) 184 | { 185 | return rdma_two_sided_send(qp, IBV_WR_SEND, max_inline_data, 186 | 0, wr_id, 0, local_addr, lkey, length); 187 | } 188 | 189 | int rdma_send_with_imm_signaled(struct ibv_qp *qp, uint64_t wr_id, uint32_t imm_data, 190 | uint64_t local_addr, uint32_t length, 191 | uint32_t lkey, const uint32_t max_inline_data) 192 | { 193 | return rdma_two_sided_send(qp, IBV_WR_SEND_WITH_IMM, 194 | max_inline_data, IBV_SEND_SIGNALED, 195 | 
wr_id, 0, local_addr, lkey, length); 196 | } 197 | 198 | int rdma_send_with_imm(struct ibv_qp *qp, uint64_t wr_id, uint32_t imm_data, 199 | uint64_t local_addr, uint32_t length, 200 | uint32_t lkey, const uint32_t max_inline_data) 201 | { 202 | return rdma_two_sided_send(qp, IBV_WR_SEND_WITH_IMM, 203 | max_inline_data, 0, wr_id, 0, 204 | local_addr, lkey, length); 205 | } 206 | 207 | int rdma_write_signaled(struct ibv_qp *qp, uint64_t wr_id, 208 | uint64_t local_addr, uint32_t lkey, 209 | uint64_t remote_addr, uint32_t rkey, 210 | uint32_t length, const uint32_t max_inline_data) 211 | { 212 | return rdma_one_sided_send(qp, IBV_WR_RDMA_WRITE, max_inline_data, 213 | IBV_SEND_SIGNALED, wr_id, 0, local_addr, 214 | lkey, remote_addr, rkey, length); 215 | } 216 | 217 | int rdma_write(struct ibv_qp *qp, uint64_t wr_id, 218 | uint64_t local_addr, uint32_t lkey, 219 | uint64_t remote_addr, uint32_t rkey, 220 | uint32_t length, const uint32_t max_inline_data) 221 | { 222 | return rdma_one_sided_send(qp, IBV_WR_RDMA_WRITE, max_inline_data, 223 | 0, wr_id, 0, local_addr, lkey, 224 | remote_addr, rkey, length); 225 | } 226 | 227 | /** TODO: Need to rethink this interface design */ 228 | int rdma_write_send_signaled(struct ibv_qp *qp, uint64_t wr_id, 229 | uint64_t local_addr, uint32_t lkey, 230 | uint64_t remote_addr, uint32_t rkey, uint32_t length, 231 | uint32_t payload, const uint32_t max_inline_data) 232 | { 233 | struct ibv_sge sge[2]; 234 | struct ibv_send_wr wr[2], *bad; 235 | 236 | sge[0].addr = local_addr; 237 | sge[0].length = length; 238 | sge[0].lkey = lkey; 239 | 240 | wr[0].wr_id = wr_id; 241 | wr[0].next = &wr[1]; 242 | wr[0].sg_list = &sge[0]; 243 | wr[0].num_sge = 1; 244 | wr[0].opcode = IBV_WR_RDMA_WRITE; 245 | wr[0].send_flags = (length <= max_inline_data ? 
IBV_SEND_INLINE : 0); 246 | 247 | wr[0].wr.rdma.remote_addr = remote_addr; 248 | wr[0].wr.rdma.rkey = rkey; 249 | 250 | sge[1].addr = local_addr; 251 | sge[1].length = payload; 252 | sge[1].lkey = lkey; 253 | 254 | wr[1].wr_id = wr_id; 255 | wr[1].next = NULL; 256 | wr[1].sg_list = &sge[1]; 257 | wr[1].num_sge = 1; 258 | wr[1].opcode = IBV_WR_SEND; 259 | wr[1].send_flags = IBV_SEND_SIGNALED | 260 | (payload <= max_inline_data ? IBV_SEND_INLINE : 0); 261 | 262 | return ibv_post_send(qp, wr, &bad); 263 | } 264 | 265 | /** TODO: Need to rethink this interface design */ 266 | int rdma_write_write_signaled(struct ibv_qp *qp, uint64_t wr_id, 267 | uint64_t local_addr, uint32_t lkey, 268 | uint64_t remote_addr, uint32_t rkey, uint32_t length, 269 | uint32_t payload, const uint32_t max_inline_data) 270 | { 271 | struct ibv_sge sge[2]; 272 | struct ibv_send_wr wr[2], *bad; 273 | 274 | sge[0].addr = local_addr; 275 | sge[0].length = length; 276 | sge[0].lkey = lkey; 277 | 278 | wr[0].wr_id = wr_id; 279 | wr[0].next = &wr[1]; 280 | wr[0].sg_list = &sge[0]; 281 | wr[0].num_sge = 1; 282 | wr[0].opcode = IBV_WR_RDMA_WRITE; 283 | wr[0].send_flags = (length <= max_inline_data ? IBV_SEND_INLINE : 0); 284 | 285 | wr[0].wr.rdma.remote_addr = remote_addr; 286 | wr[0].wr.rdma.rkey = rkey; 287 | 288 | sge[1].addr = local_addr; 289 | sge[1].length = payload; 290 | sge[1].lkey = lkey; 291 | 292 | wr[1].wr_id = wr_id; 293 | wr[1].next = NULL; 294 | wr[1].sg_list = &sge[1]; 295 | wr[1].num_sge = 1; 296 | wr[1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; 297 | wr[1].send_flags = IBV_SEND_SIGNALED | 298 | (payload <= max_inline_data ? 
IBV_SEND_INLINE : 0); 299 | 300 | wr[1].wr.rdma.remote_addr = remote_addr; 301 | wr[1].wr.rdma.rkey = rkey; 302 | 303 | return ibv_post_send(qp, wr, &bad); 304 | } 305 | 306 | int rdma_send_cas_signaled(struct ibv_qp *qp, uint64_t wr_id, 307 | uint64_t local_addr, uint32_t lkey, 308 | uint64_t remote_addr, uint32_t rkey, 309 | uint64_t expected, uint64_t swap) 310 | { 311 | struct ibv_sge sge; 312 | struct ibv_send_wr wr, *bad; 313 | 314 | sge.addr = local_addr; 315 | sge.length = sizeof(uint64_t); 316 | sge.lkey = lkey; 317 | 318 | wr.wr_id = wr_id; 319 | wr.next = NULL; 320 | wr.sg_list = &sge; 321 | wr.num_sge = 1; 322 | wr.opcode = IBV_WR_ATOMIC_CMP_AND_SWP; 323 | wr.send_flags = IBV_SEND_SIGNALED; 324 | 325 | wr.wr.atomic.remote_addr = remote_addr; 326 | wr.wr.atomic.rkey = rkey; 327 | wr.wr.atomic.compare_add = expected; /* expected value in remote address */ 328 | wr.wr.atomic.swap = swap; /* the value that remote address will be assigned to */ 329 | 330 | return ibv_post_send(qp, &wr, &bad); 331 | } 332 | 333 | int rdma_write_with_imm_signaled(struct ibv_qp *qp, uint64_t wr_id, 334 | uint32_t imm_data, uint64_t local_addr, uint32_t lkey, 335 | uint64_t remote_addr, uint32_t rkey, uint32_t length, 336 | const uint32_t max_inline_data) 337 | { 338 | return rdma_one_sided_send(qp, IBV_WR_RDMA_WRITE_WITH_IMM, max_inline_data, 339 | IBV_SEND_SIGNALED, wr_id, imm_data, local_addr, 340 | lkey, remote_addr, rkey, length); 341 | } 342 | 343 | int rdma_write_with_imm(struct ibv_qp *qp, uint64_t wr_id, 344 | uint32_t imm_data, uint64_t local_addr, uint32_t lkey, 345 | uint64_t remote_addr, uint32_t rkey, uint32_t length, 346 | const uint32_t max_inline_data) 347 | { 348 | return rdma_one_sided_send(qp, IBV_WR_RDMA_WRITE_WITH_IMM, max_inline_data, 349 | 0, wr_id, imm_data, local_addr, 350 | lkey, remote_addr, rkey, length); 351 | } 352 | 353 | int rdma_read_signaled(struct ibv_qp *qp, uint64_t wr_id, 354 | uint64_t local_addr, uint32_t lkey, 355 | uint64_t remote_addr, 
uint32_t rkey, 356 | uint32_t length, const uint32_t max_inline_data) 357 | { 358 | return rdma_one_sided_send(qp, IBV_WR_RDMA_READ, max_inline_data, 359 | IBV_SEND_SIGNALED, wr_id, 0, local_addr, 360 | lkey, remote_addr, rkey, length); 361 | } 362 | 363 | int rdma_read(struct ibv_qp *qp, uint64_t wr_id, 364 | uint64_t local_addr, uint32_t lkey, 365 | uint64_t remote_addr, uint32_t rkey, 366 | uint32_t length, const uint32_t max_inline_data) 367 | { 368 | return rdma_one_sided_send(qp, IBV_WR_RDMA_READ, max_inline_data, 369 | 0, wr_id, 0, local_addr, 370 | lkey, remote_addr, rkey, length); 371 | } 372 | -------------------------------------------------------------------------------- /src/rdma.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "common.h" 16 | #include "rdma.h" 17 | #include "rdma_helpers.h" 18 | 19 | #define ANET_ERR_LEN 256 20 | #define NET_IP_STR_LEN 46 /* INET6_ADDRSTRLEN is 46, but we need to be sure */ 21 | 22 | #if __GNUC__ >= 3 23 | #define likely(x) __builtin_expect(!!(x), 1) 24 | #define unlikely(x) __builtin_expect(!!(x), 0) 25 | #else 26 | #define likely(x) (x) 27 | #define unlikely(x) (x) 28 | #endif 29 | 30 | #define MIN(a, b) (a) < (b) ? a : b 31 | 32 | /* RDMA Flags environment variables */ 33 | int rdmaListenBacklog = 128; 34 | int rdmaTimeoutms = 1000; 35 | int rdmaPollEventTimeoutms = 10; 36 | int rdmaMaxInlineData = 0; /* max inline data enabled by RNIC, 37 | 0 indicate no inline optimization. 
*/ 38 | int rdmaRetryCount = 7; 39 | int rdmaRnrRetryCount = 7; 40 | int rdmaMaxConcurrentWorkRequests = 128 + 2048 * 2; /* used to handle a batch of CQEs */ 41 | int rdmaRecvDepth = RDMA_MAX_SGE; 42 | int rdmaMaxOutstandingRdAtomic = 0; 43 | 44 | int rdmaQp2CqMode = ONE_TO_ONE; 45 | int rdmaCommMode = RDMA_NON_BLOCKING; 46 | bool rdmaEnablePhysAddrAccess = false; 47 | int rdmaIoAffinityCpuId = -1; 48 | 49 | /* a global RDMA context structure. 50 | * Note that all members in this context are globally shared. 51 | * Each RDMA ib device (i.e., each port) has unique ibv_context. 52 | * So we assume that all RDMA QPs over the same ibv_context share 53 | * the globally unique PD, CQ, and completion channel instance. 54 | * A dedicated worker thread will be created to process CQ for 55 | * each IB context. 56 | */ 57 | struct rdma_context 58 | { 59 | struct ibv_context *ctx; 60 | struct ibv_pd *pd; 61 | struct ibv_cq *cq; 62 | struct ibv_comp_channel *comp_channel; 63 | 64 | /* RDMA-registered physical memory */ 65 | struct ibv_mr *phys_mr; 66 | 67 | pthread_t cq_poller_thread; 68 | }; 69 | 70 | /* global RDMA-related variables */ 71 | static struct rdma_context *g_ctx; 72 | static atomic_bool g_should_stop = ATOMIC_VAR_INIT(false); 73 | static atomic_bool c_should_stop = ATOMIC_VAR_INIT(false); 74 | 75 | static struct rdma_event_channel *g_cm_channel; 76 | static pthread_t g_ev_poller_thread; /* used to process rdma event channel */ 77 | static atomic_bool g_ev_poller_should_stop = ATOMIC_VAR_INIT(false); /* false by default*/ 78 | 79 | static void *rdmaPollCQ(void *); 80 | static void *rdmaCompChannelStart(void *ptr); 81 | static int rdmaContextInit(struct ibv_context *verbs); 82 | static void rdmaContextRelease(void); 83 | static int rdmaConnCreate(struct rdma_cm_id *id, RdmaConn *conn); 84 | static void rdmaConnRelease(RdmaConn *conn); 85 | static void *rdmaCmChannelStart(void *ptr); /* run in a thread */ 86 | 87 | /* common RDMA helpers and handlers */ 88 | static 
int rdmaPollEvents(struct rdma_event_channel *event_channel, void *poll_ctx); 89 | static int rdmaOnConnectRequest(struct rdma_cm_event *ev); 90 | static int rdmaOnAddrResolved(struct rdma_cm_event *ev); 91 | static int rdmaOnRouteResolved(struct rdma_cm_event *ev); 92 | static int rdmaOnConnected(struct rdma_cm_event *ev, void *poll_ctx); 93 | static int rdmaOnDisconnected(struct rdma_cm_event *ev); 94 | static int rdmaOnRejected(struct rdma_cm_event *ev); 95 | 96 | static int connRdmaSyncRxMr(RdmaConn *conn, struct rdma_cm_id *cm_id); 97 | static int connRdmaSyncPhysRxMr(RdmaConn *conn, struct rdma_cm_id *cm_id); 98 | static int connRdmaSayBye(RdmaConn *conn, struct rdma_cm_id *cm_id); 99 | 100 | // static int rdmaRegSendbuf(RdmaConn *conn, unsigned int length); 101 | static int rdmaSendCommand(RdmaConn *conn, struct rdma_cm_id *id, RdmaCmd *cmd, void *tx_ctx); 102 | // static int rdmaConnRegisterRx(RdmaConn *conn, struct rdma_cm_id *id); 103 | 104 | static int rdmaConnHandleRecv(RdmaConn *conn, struct rdma_cm_id *cm_id, 105 | RdmaCmd *cmd, RdmaWrCtx *wr_ctx, uint32_t byte_len); 106 | static int rdmaConnHandleSend(RdmaConn *conn, RdmaCmd *cmd); 107 | 108 | static int rdma_get_max_qp_rd_atom(struct ibv_context *ctx) 109 | { 110 | int max_rd_atom = 0; 111 | struct ibv_device_attr attr; 112 | if (!ibv_query_device(ctx, &attr)) 113 | { 114 | max_rd_atom = attr.max_qp_rd_atom; 115 | } 116 | rdmaDebug("max_qp_rd_atom: %d", max_rd_atom); 117 | return max_rd_atom; 118 | } 119 | 120 | static inline void rdma_set_outstanding_rd_atomic(int max_rd_atomic) 121 | { 122 | if (rdmaMaxOutstandingRdAtomic == 0 || rdmaMaxOutstandingRdAtomic > max_rd_atomic) 123 | rdmaMaxOutstandingRdAtomic = max_rd_atomic; 124 | } 125 | 126 | static inline void rdmaSetDefaultOptions(RdmaOptions *dst_opt) 127 | { 128 | if (dst_opt) 129 | { 130 | dst_opt->rdma_listen_backlog = 128; 131 | dst_opt->rdma_timeoutms = 1000; 132 | dst_opt->rdma_poll_event_timeoutms = 10; 133 | dst_opt->rdma_max_inline_data 
= 0; 134 | dst_opt->rdma_retry_count = 7; 135 | dst_opt->rdma_rnr_retry_count = 7; 136 | dst_opt->rdma_max_concurrent_work_requests = 128 + 2048 * 2; 137 | dst_opt->rdma_recv_depth = RDMA_MAX_SGE; 138 | dst_opt->rdma_qp2cq_mode = MANY_TO_ONE; 139 | dst_opt->rdma_comm_mode = RDMA_NON_BLOCKING; 140 | dst_opt->rdma_enable_phys_addr_access = false; 141 | dst_opt->rdma_max_outstanding_rd_atomic = 0; 142 | } 143 | } 144 | 145 | static inline void rdmaSetGlobalEnv(const RdmaOptions *opt) 146 | { 147 | if (!opt) 148 | return; 149 | 150 | if (opt->rdma_poll_event_timeoutms > 0) 151 | { 152 | rdmaPollEventTimeoutms = opt->rdma_poll_event_timeoutms; 153 | } 154 | 155 | if (opt->rdma_max_inline_data > 0) 156 | { 157 | rdmaMaxInlineData = opt->rdma_max_inline_data; 158 | } 159 | 160 | if (opt->rdma_retry_count > 0) 161 | { 162 | rdmaRetryCount = opt->rdma_retry_count; 163 | } 164 | 165 | if (opt->rdma_rnr_retry_count) 166 | { 167 | rdmaRnrRetryCount = opt->rdma_rnr_retry_count; 168 | } 169 | 170 | if (opt->rdma_max_concurrent_work_requests > 0) 171 | { 172 | rdmaMaxConcurrentWorkRequests = opt->rdma_max_concurrent_work_requests; 173 | } 174 | 175 | if (opt->rdma_qp2cq_mode >= 0) 176 | { 177 | rdmaQp2CqMode = (opt->rdma_comm_mode == RDMA_BLOCKING) ? 
ONE_TO_ONE : opt->rdma_qp2cq_mode; 178 | } 179 | 180 | if (opt->rdma_comm_mode >= 0) 181 | { 182 | rdmaCommMode = opt->rdma_comm_mode; 183 | } 184 | 185 | if (opt->rdma_recv_depth > 0) 186 | { 187 | rdmaRecvDepth = opt->rdma_recv_depth; 188 | } 189 | 190 | if (opt->rdma_max_outstanding_rd_atomic > 0) 191 | { 192 | rdmaMaxOutstandingRdAtomic = opt->rdma_max_outstanding_rd_atomic; 193 | } 194 | 195 | if (opt->rdma_io_affinity_cpuid > 0) 196 | { 197 | rdmaIoAffinityCpuId = opt->rdma_io_affinity_cpuid; 198 | } 199 | 200 | rdmaEnablePhysAddrAccess = opt->rdma_enable_phys_addr_access; 201 | } 202 | 203 | static inline void rdmaConnSetEnv(RdmaConn *conn, const RdmaOptions *opt) 204 | { 205 | if (!opt) 206 | return; 207 | 208 | if (opt->rdma_comm_mode == RDMA_BLOCKING) 209 | { 210 | conn->options.rdma_qp2cq_mode = ONE_TO_ONE; 211 | } 212 | if (opt->rdma_recv_depth > 0) 213 | { 214 | conn->options.rdma_recv_depth = opt->rdma_recv_depth; 215 | } 216 | if (opt->rdma_timeoutms > 0) 217 | { 218 | conn->options.rdma_timeoutms = opt->rdma_timeoutms; 219 | } 220 | if (opt->recv_callback) 221 | { 222 | rdmaConnSetRecvCallback(conn, opt->recv_callback); 223 | } 224 | if (opt->write_callback) 225 | { 226 | rdmaConnSetWriteCallback(conn, opt->write_callback); 227 | } 228 | if (opt->read_callback) 229 | { 230 | rdmaConnSetReadCallback(conn, opt->read_callback); 231 | } 232 | if (opt->connected_callback) 233 | { 234 | rdmaConnSetConnectedCallback(conn, opt->connected_callback); 235 | } 236 | if (opt->disconnect_callback) 237 | { 238 | rdmaConnSetDisconnectCallback(conn, opt->disconnect_callback); 239 | } 240 | } 241 | 242 | /* To make RDMA apps forkable, buffer which is registered as RDMA 243 | * memory region should be aligned to page size. And the length 244 | * also need to be aligned to page size. 
245 | */ 246 | static void *page_aligned_alloc(size_t size) 247 | { 248 | void *tmp; 249 | size_t aligned_size, page_size = sysconf(_SC_PAGESIZE); 250 | 251 | aligned_size = (size + page_size - 1) & (~(page_size - 1)); 252 | if (posix_memalign(&tmp, page_size, aligned_size)) 253 | { 254 | rdmaErr("posix_memalign failed"); 255 | return NULL; 256 | } 257 | 258 | memset(tmp, 0x00, aligned_size); 259 | 260 | return tmp; 261 | } 262 | 263 | static int rdmaPostRecv(RdmaConn *ctx, struct rdma_cm_id *cm_id, RdmaCmd *cmd, void *rx_ctx) 264 | { 265 | struct ibv_sge sge; 266 | size_t length = sizeof(RdmaCmd); 267 | struct ibv_recv_wr recv_wr, *bad_wr; 268 | int ret; 269 | 270 | sge.addr = (uint64_t)cmd; 271 | sge.length = length; 272 | sge.lkey = ctx->cmd_mr->lkey; 273 | 274 | recv_wr.wr_id = (uint64_t)rx_ctx; 275 | recv_wr.sg_list = &sge; 276 | recv_wr.num_sge = 1; 277 | recv_wr.next = NULL; 278 | 279 | ret = ibv_post_recv(cm_id->qp, &recv_wr, &bad_wr); 280 | if (ret && (ret != EAGAIN)) 281 | { 282 | rdmaWarn("RDMA: post recv failed: %d", ret); 283 | return RDMA_ERR; 284 | } 285 | 286 | return RDMA_OK; 287 | } 288 | 289 | static void rdmaDestroyIoBuf(RdmaConn *conn) 290 | { 291 | if (conn->recv_mr) 292 | { 293 | ibv_dereg_mr(conn->recv_mr); 294 | conn->recv_mr = NULL; 295 | } 296 | 297 | if (conn->recv_buf) 298 | { 299 | free(conn->recv_buf); 300 | conn->recv_buf = NULL; 301 | } 302 | 303 | if (conn->send_mr) 304 | { 305 | ibv_dereg_mr(conn->send_mr); 306 | conn->send_mr = NULL; 307 | } 308 | 309 | if (conn->send_buf) 310 | { 311 | free(conn->send_buf); 312 | conn->send_buf = NULL; 313 | } 314 | 315 | if (conn->cmd_mr) 316 | { 317 | ibv_dereg_mr(conn->cmd_mr); 318 | conn->cmd_mr = NULL; 319 | } 320 | 321 | if (conn->cmd_buf) 322 | { 323 | free(conn->cmd_buf); 324 | conn->cmd_buf = NULL; 325 | } 326 | 327 | if (conn->rx_ctx) 328 | { 329 | free(conn->rx_ctx); 330 | conn->rx_ctx = NULL; 331 | } 332 | 333 | if (conn->tx_ctx) 334 | { 335 | free(conn->tx_ctx); 336 | conn->tx_ctx = 
NULL; 337 | } 338 | } 339 | 340 | /* register RDMA MRs for two-sided messaging and one-sided memory */ 341 | static int rdmaSetupIoBuf(RdmaConn *conn, struct rdma_cm_id *cm_id) 342 | { 343 | int access = IBV_ACCESS_LOCAL_WRITE; 344 | size_t length = sizeof(RdmaCmd) * rdmaRecvDepth * 2; 345 | RdmaCmd *cmd; 346 | RdmaWrCtx *rx_ctx; 347 | int i; 348 | 349 | /* setup RDMA cmd buf for two-sided messaging */ 350 | conn->cmd_buf = page_aligned_alloc(length); 351 | conn->cmd_mr = ibv_reg_mr(conn->pd, conn->cmd_buf, length, access); 352 | if (!conn->cmd_mr) 353 | { 354 | rdmaWarn("RDMA: reg mr for CMD error %d (%s)", errno, strerror(errno)); 355 | goto err; 356 | } 357 | 358 | /* setup RDMAQ work request contexts for two-sided messaging */ 359 | length = sizeof(RdmaWrCtx) * rdmaRecvDepth; 360 | conn->rx_ctx = page_aligned_alloc(length); 361 | conn->tx_ctx = page_aligned_alloc(length); 362 | 363 | for (i = 0; i < rdmaRecvDepth; i++) 364 | { 365 | cmd = conn->cmd_buf + i; 366 | rx_ctx = conn->rx_ctx + i; 367 | 368 | rx_ctx->type = RECV_CONTEXT; 369 | rx_ctx->rdma_conn = (void *)conn; 370 | rx_ctx->private_data = (void *)cmd; 371 | 372 | if (rdmaPostRecv(conn, cm_id, cmd, (void *)rx_ctx) == RDMA_ERR) 373 | { 374 | rdmaWarn("RDMA: post recv failed"); 375 | goto err; 376 | } 377 | } 378 | 379 | /* setup RDMA data buf for one-sided verbs */ 380 | length = RDMA_DEFAULT_RX_LEN; 381 | conn->recv_buf = page_aligned_alloc(length); 382 | conn->recv_length = length; 383 | conn->recv_mr = rdma_reg_mem(conn->pd, conn->recv_buf, conn->recv_length); 384 | if (!conn->recv_mr) 385 | { 386 | rdmaWarn("RDMA: reg mr for RDMA recv buf error %d (%s)", errno, strerror(errno)); 387 | goto err; 388 | } 389 | 390 | return RDMA_OK; 391 | 392 | err: 393 | rdmaDestroyIoBuf(conn); 394 | return RDMA_ERR; 395 | } 396 | 397 | static int rdmaAdjustSendbuf(RdmaConn *conn, unsigned int length) 398 | { 399 | if (length == conn->send_length) 400 | return RDMA_OK; 401 | 402 | /* try to free old send MR & buffer */ 
403 | if (conn->send_length) 404 | { 405 | ibv_dereg_mr(conn->send_mr); 406 | free(conn->send_buf); 407 | conn->send_length = 0; 408 | } 409 | 410 | /* setup new send MR & buffer */ 411 | conn->send_length = length; 412 | conn->send_buf = page_aligned_alloc(conn->send_length); 413 | conn->send_mr = rdma_reg_mem(conn->pd, conn->send_buf, conn->send_length); 414 | if (!conn->send_mr) 415 | { 416 | rdmaErr("RDMA: reg send mr failed %d(%s)", errno, strerror(errno)); 417 | free(conn->send_buf); 418 | conn->send_buf = NULL; 419 | conn->send_length = 0; 420 | return RDMA_ERR; 421 | } 422 | 423 | return RDMA_OK; 424 | } 425 | 426 | static int rdmaConnHandleRecvImm(RdmaConn *conn, struct rdma_cm_id *cm_id, 427 | RdmaCmd *cmd, RdmaWrCtx *wr_ctx, 428 | struct ibv_wc *wc, uint32_t byte_len) 429 | { 430 | uint32_t rx_offset = ntohl(wc->imm_data); 431 | char *rx_buffer = conn->recv_buf + rx_offset; 432 | 433 | if (unlikely(rx_offset + byte_len > conn->recv_length)) 434 | { 435 | rdmaErr("RDMA: recv buffer overflow. Please adjust RDMA MR"); 436 | return RDMA_ERR; 437 | } 438 | conn->rx_offset += byte_len; 439 | 440 | if (conn->recv_callback) 441 | { 442 | conn->recv_callback(conn, rx_buffer, byte_len); 443 | } 444 | 445 | return rdmaPostRecv(conn, cm_id, cmd, wr_ctx); 446 | } 447 | 448 | /* rdma common helpers */ 449 | int rdmaContextInit(struct ibv_context *verbs) 450 | { 451 | if (g_ctx) 452 | { 453 | if (g_ctx->ctx != verbs) 454 | { 455 | rdmaWarn("cannot handle events in more than one IB context"); 456 | } 457 | 458 | return RDMA_OK; 459 | } 460 | 461 | /* The ibv_fork_init() func initializes libibverbs'data structures to handle 462 | * fork() func calls correctly and avoid data corruption. 
463 | */ 464 | if (ibv_fork_init()) 465 | { 466 | rdmaWarn("RDMA: FATAL error, ibv_fork_init failed"); 467 | } 468 | 469 | g_ctx = (struct rdma_context *)malloc(sizeof(struct rdma_context)); 470 | assert(g_ctx); 471 | 472 | g_ctx->ctx = verbs; 473 | rdmaMaxOutstandingRdAtomic = rdma_get_max_qp_rd_atom(g_ctx->ctx); 474 | 475 | g_ctx->pd = ibv_alloc_pd(g_ctx->ctx); 476 | if (!g_ctx->pd) 477 | { 478 | rdmaErr("RDMA: ibv alloc pd failed"); 479 | goto err; 480 | } 481 | 482 | /* register RDMA-enabled physical memory when enabled */ 483 | if (rdmaEnablePhysAddrAccess) 484 | { 485 | g_ctx->phys_mr = rdma_exp_reg_phys_mem_full(g_ctx->pd); 486 | if (!g_ctx->phys_mr) 487 | { 488 | rdmaErr("RDMA: ibv exp reg mr error"); 489 | goto err; 490 | } 491 | } 492 | 493 | g_ctx->comp_channel = ibv_create_comp_channel(g_ctx->ctx); 494 | if (!g_ctx->comp_channel) 495 | { 496 | rdmaErr("RDMA: ibv create comp channel failed"); 497 | goto err; 498 | } 499 | 500 | g_ctx->cq = ibv_create_cq(g_ctx->ctx, RDMA_MAX_SGE * 2, NULL, g_ctx->comp_channel, 0); 501 | if (!g_ctx->cq) 502 | { 503 | rdmaErr("RDMA: ibv create cq failed"); 504 | goto err; 505 | } 506 | ibv_req_notify_cq(g_ctx->cq, 0); 507 | 508 | pthread_create(&g_ctx->cq_poller_thread, NULL, rdmaCompChannelStart, (void *)g_ctx); 509 | 510 | return RDMA_OK; 511 | err: 512 | rdmaContextRelease(); 513 | return RDMA_ERR; 514 | } 515 | 516 | static void rdmaContextRelease(void) 517 | { 518 | if (!g_ctx) 519 | return; 520 | 521 | if (g_ctx->cq) 522 | { 523 | ibv_destroy_cq(g_ctx->cq); 524 | } 525 | 526 | if (g_ctx->comp_channel) 527 | { 528 | ibv_destroy_comp_channel(g_ctx->comp_channel); 529 | } 530 | 531 | if (g_ctx->pd) 532 | { 533 | ibv_dealloc_pd(g_ctx->pd); 534 | } 535 | 536 | /* dealloc other rdma resources like mw here */ 537 | } 538 | 539 | void rdmaRuntimeStop() 540 | { 541 | /* wait for poller thread */ 542 | atomic_store(&g_should_stop, true); 543 | atomic_store(&c_should_stop, true); 544 | pthread_join(g_ev_poller_thread, NULL); 545 
| 546 | if (g_ctx) 547 | { 548 | pthread_join(g_ctx->cq_poller_thread, NULL); 549 | } 550 | 551 | rdmaContextRelease(); 552 | if (g_cm_channel) 553 | rdma_destroy_event_channel(g_cm_channel); 554 | } 555 | 556 | struct ibv_mr *rdmaConnRegMem(RdmaConn *conn, size_t size) 557 | { 558 | void *buf; 559 | 560 | buf = malloc(size); 561 | if (!buf) 562 | return NULL; 563 | memset(buf, 0, size); 564 | 565 | return rdma_reg_mem(conn->pd, buf, size); 566 | } 567 | 568 | void rdmaConnDeregMem(RdmaConn *conn, struct ibv_mr *mr) 569 | { 570 | void *buf = mr->addr; 571 | rdma_dereg_mem(mr); 572 | free(buf); 573 | buf = NULL; 574 | } 575 | 576 | int rdmaConnHandleRecv(RdmaConn *conn, struct rdma_cm_id *cm_id, 577 | RdmaCmd *cmd, RdmaWrCtx *wr_ctx, uint32_t byte_len) 578 | { 579 | if (unlikely(byte_len != sizeof(RdmaCmd))) 580 | { 581 | rdmaErr("RDMA: FATAL error, recv corrupted cmd"); 582 | return RDMA_ERR; 583 | } 584 | 585 | switch (cmd->cmd_opcode) 586 | { 587 | case REG_LOCAL_ADDR: 588 | conn->tx_addr = (char *)cmd->addr; 589 | conn->tx_length = ntohl(cmd->length); 590 | conn->tx_key = ntohl(cmd->key); 591 | conn->tx_offset = 0; 592 | rdmaAdjustSendbuf(conn, conn->tx_length); 593 | 594 | /* notify the waiting side once connected */ 595 | conn->state = RDMA_CONN_STATE_MR_READY; 596 | if (conn->type == CONNECTED_CONN) 597 | { 598 | pthread_mutex_lock(&conn->status_mutex); 599 | pthread_cond_broadcast(&conn->status_cond); /* signal waiting threads */ 600 | pthread_mutex_unlock(&conn->status_mutex); 601 | } 602 | break; 603 | 604 | case REG_PHYS_ADDR: 605 | conn->tx_pa_addr = (char *)cmd->addr; 606 | conn->tx_pa_length = ntohl(cmd->length); 607 | conn->tx_pa_rkey = ntohl(cmd->key); 608 | conn->tx_pa_offset = 0; 609 | break; 610 | 611 | case CONN_GOODBYE: 612 | rdmaInfo("RDMA: disconnect with host %s:%d", conn->ip, conn->port); 613 | // rdma_disconnect(cm_id); 614 | 615 | break; 616 | 617 | default: 618 | rdmaErr("RDMA: FATAL error, unknown RDMA cmd"); 619 | return RDMA_ERR; 620 | 
} 621 | 622 | return rdmaPostRecv(conn, cm_id, cmd, wr_ctx); 623 | } 624 | 625 | int rdmaConnHandleSend(RdmaConn *conn, RdmaCmd *cmd) 626 | { 627 | /* mark this RDMA cmd has already sent */ 628 | cmd->magic = 0; 629 | 630 | switch (cmd->cmd_opcode) 631 | { 632 | case CONN_GOODBYE: 633 | /* start disconnect once the CONN_GOODBYE msg arrives at peer host. */ 634 | rdma_disconnect(conn->cm_id); 635 | break; 636 | 637 | default: 638 | break; 639 | } 640 | 641 | return RDMA_OK; 642 | } 643 | 644 | void *rdmaPollCQ(void *ctx_ptr) 645 | { 646 | struct rdma_cm_id *id; 647 | struct rdma_context *ctx = (struct rdma_context *)ctx_ptr; 648 | struct ibv_cq *ev_cq = NULL; 649 | void *ev_ctx = NULL; 650 | struct ibv_wc wc[rdmaMaxConcurrentWorkRequests]; 651 | RdmaCmd *cmd; 652 | RdmaConn *conn; 653 | RdmaWrCtx *wr_ctx; 654 | int num_ev, i; 655 | assert(ctx_ptr); 656 | 657 | /* wait for the completion event */ 658 | if (ibv_get_cq_event(ctx->comp_channel, &ev_cq, &ev_ctx) < 0) 659 | { 660 | if (errno != EAGAIN) 661 | { 662 | rdmaWarn("RDMA: get CQ event error %s", strerror(errno)); 663 | return NULL; 664 | } 665 | } 666 | 667 | /* ack the event */ 668 | ibv_ack_cq_events(ev_cq, 1); 669 | 670 | /* request notification upon the next completion event */ 671 | if (ibv_req_notify_cq(ev_cq, 0)) 672 | { 673 | rdmaWarn("RDMA: notify CQ error %s", strerror(errno)); 674 | return NULL; 675 | } 676 | 677 | /* empty the CQ by polling all of the cvompletions from the CQ (if any exist) */ 678 | pollcq: 679 | num_ev = ibv_poll_cq(ctx->cq, rdmaMaxConcurrentWorkRequests, wc); 680 | if (num_ev < 0) 681 | { 682 | rdmaWarn("RDMA: poll recv CQ error %s", strerror(errno)); 683 | return NULL; 684 | } 685 | else if (num_ev == 0) 686 | { 687 | goto out; 688 | } 689 | 690 | for (i = 0; i < num_ev; i++) 691 | { 692 | if (wc[i].status != IBV_WC_SUCCESS) 693 | { 694 | rdmaDebug("(Ignored) RDMA: CQ handle error status: %s[0x%x], opcode : 0x%x", 695 | ibv_wc_status_str(wc[i].status), wc[i].status, wc[i].opcode); 
696 | // goto out; 697 | continue; 698 | } 699 | 700 | switch (wc[i].opcode) 701 | { 702 | case IBV_WC_RECV: 703 | wr_ctx = (RdmaWrCtx *)wc[i].wr_id; 704 | conn = (RdmaConn *)wr_ctx->rdma_conn; 705 | cmd = (RdmaCmd *)wr_ctx->private_data; 706 | id = conn->cm_id; 707 | 708 | if (rdmaConnHandleRecv(conn, id, cmd, wr_ctx, wc[i].byte_len) == RDMA_ERR) 709 | { 710 | rdmaErr("RDMA: rdma connection handle Recv error"); 711 | goto out; 712 | } 713 | break; 714 | 715 | case IBV_WC_RECV_RDMA_WITH_IMM: 716 | wr_ctx = (RdmaWrCtx *)wc[i].wr_id; 717 | conn = (RdmaConn *)wr_ctx->rdma_conn; 718 | cmd = (RdmaCmd *)wr_ctx->private_data; 719 | if (rdmaConnHandleRecvImm(conn, conn->cm_id, cmd, wr_ctx, &wc[i], wc[i].byte_len) == RDMA_ERR) 720 | { 721 | rdmaErr("RDMA: rdma connection handle Recv Imm error"); 722 | conn->state = RDMA_CONN_STATE_ERROR; 723 | goto out; 724 | } 725 | 726 | break; 727 | 728 | case IBV_WC_RDMA_WRITE: 729 | conn = (RdmaConn *)wc[i].wr_id; 730 | if (conn && conn->write_callback) 731 | { 732 | conn->write_callback(conn, wc[i].byte_len); 733 | } 734 | 735 | break; 736 | 737 | case IBV_WC_RDMA_READ: 738 | conn = (RdmaConn *)wc[i].wr_id; 739 | if (conn && conn->read_callback) 740 | { 741 | conn->read_callback(conn, wc[i].byte_len); 742 | } 743 | 744 | break; 745 | 746 | case IBV_WC_SEND: 747 | wr_ctx = (RdmaWrCtx *)wc[i].wr_id; 748 | conn = (RdmaConn *)wr_ctx->rdma_conn; 749 | cmd = (RdmaCmd *)wr_ctx->private_data; 750 | 751 | if (rdmaConnHandleSend(conn, cmd) == RDMA_ERR) 752 | { 753 | goto out; 754 | } 755 | 756 | break; 757 | 758 | default: 759 | rdmaWarn("RDMA: unexpected opcode 0x[%x]", wc[i].opcode); 760 | break; 761 | } 762 | } 763 | 764 | goto pollcq; 765 | out: 766 | return NULL; 767 | } 768 | 769 | void *rdmaCompChannelStart(void *ctx_ptr) 770 | { 771 | struct rdma_context *ctx = (struct rdma_context *)ctx_ptr; 772 | assert(ctx); 773 | int flags = fcntl(ctx->comp_channel->fd, F_GETFL); 774 | int ret = fcntl(ctx->comp_channel->fd, F_SETFL, flags | 
O_NONBLOCK); 775 | int error_flags = POLLERR | POLLHUP | POLLNVAL; 776 | struct pollfd pfd = { 777 | .fd = ctx->comp_channel->fd, 778 | .events = POLLIN, 779 | .revents = 0}; 780 | int num_events = 0; 781 | 782 | if (ret != 0) 783 | { 784 | rdmaErr("RDMA: fcntl rdma completion channel fd failed status: %s", strerror(errno)); 785 | return NULL; 786 | } 787 | 788 | if (ret != 0) 789 | { 790 | rdmaErr("RDMA: fcntl rdma completion channel fd failed status: %s", strerror(errno)); 791 | return NULL; 792 | } 793 | 794 | /* set CPU affinity */ 795 | rdmaDebug("current CPU ID: %d", get_current_cpu()); 796 | if (rdmaIoAffinityCpuId == -1) 797 | { 798 | rdmaIoAffinityCpuId = get_current_cpu(); 799 | } 800 | rdmaDebug("CPU affinity core: %d", rdmaIoAffinityCpuId); 801 | 802 | cpu_set_t mask; 803 | CPU_ZERO(&mask); 804 | CPU_SET(rdmaIoAffinityCpuId, &mask); 805 | ret = pthread_setaffinity_np(pthread_self() ,sizeof(mask),&mask); 806 | if (ret != 0) { 807 | rdmaWarn("failed to set affinity CPU %d for RDMA IO thread", rdmaIoAffinityCpuId); 808 | } 809 | 810 | while (!atomic_load(&c_should_stop)) 811 | { 812 | num_events = poll(&pfd, 1, rdmaPollEventTimeoutms); 813 | 814 | if (num_events == -1) 815 | { 816 | rdmaErr("RDMA: poll rdma completion channel faild (%s)", strerror(errno)); 817 | break; 818 | } 819 | else if (num_events == 0) 820 | { 821 | // rdmaDebug("RDMA: rdma completion channel timeout reached. 
No events"); 822 | continue; 823 | } 824 | 825 | if ((pfd.revents & error_flags) != 0) 826 | { 827 | rdmaErr("RDMA: rdma cm event channel poll err"); 828 | break; 829 | } 830 | 831 | if (!(pfd.revents & POLLIN)) 832 | continue; 833 | 834 | rdmaPollCQ(ctx); 835 | } 836 | 837 | rdmaDebug("rdma poll CQ thread exit!"); 838 | 839 | return NULL; 840 | } 841 | 842 | int rdmaConnCreate(struct rdma_cm_id *id, RdmaConn *conn) 843 | { 844 | struct ibv_qp_init_attr init_attr; 845 | int max_rd_atom; 846 | int ret; 847 | 848 | ret = rdmaContextInit(id->verbs); 849 | if (ret != RDMA_OK) 850 | { 851 | rdmaErr("RDMA: failed to init RDMA context"); 852 | goto reject; 853 | } 854 | 855 | /* setup context for this connection */ 856 | conn->cm_id = id; 857 | conn->pd = g_ctx->pd; 858 | conn->cq = g_ctx->cq; 859 | conn->max_inline_data = rdmaMaxInlineData; 860 | 861 | /* setup RDMA QP */ 862 | memset(&init_attr, 0, sizeof(init_attr)); 863 | init_attr.cap.max_send_wr = RDMA_MAX_SGE; 864 | init_attr.cap.max_recv_wr = RDMA_MAX_SGE; 865 | init_attr.cap.max_send_sge = 1; 866 | init_attr.cap.max_recv_sge = 1; 867 | init_attr.cap.max_inline_data = rdmaMaxInlineData; 868 | init_attr.qp_type = IBV_QPT_RC; 869 | init_attr.send_cq = g_ctx->cq; 870 | init_attr.recv_cq = g_ctx->cq; 871 | 872 | ret = rdma_create_qp(id, g_ctx->pd, &init_attr); 873 | if (ret) 874 | { 875 | rdmaWarn("RDMA: create qp failed %d (%s)", errno, strerror(errno)); 876 | goto reject; 877 | } 878 | 879 | /* set max outstanding Reads and Atomics by modifying QP */ 880 | max_rd_atom = rdma_get_max_qp_rd_atom(g_ctx->ctx); 881 | rdma_set_outstanding_rd_atomic(max_rd_atom); 882 | 883 | if (rdmaSetupIoBuf(conn, id) == RDMA_ERR) 884 | { 885 | rdmaWarn("RDMA: setup RDMA IO Buf failed"); 886 | goto reject; 887 | } 888 | 889 | return RDMA_OK; 890 | 891 | reject: 892 | return RDMA_ERR; 893 | } 894 | 895 | void rdmaConnRelease(RdmaConn *conn) 896 | { 897 | if (!conn || !conn->cm_id) 898 | return; 899 | 900 | rdma_destroy_qp(conn->cm_id); 
901 | rdmaDestroyIoBuf(conn); 902 | 903 | if (conn->cm_id) 904 | rdma_destroy_id(conn->cm_id); 905 | conn->cm_id = NULL; 906 | 907 | if (conn->ip) 908 | free(conn->ip); 909 | 910 | free(conn); 911 | conn = NULL; 912 | } 913 | 914 | void *rdmaCmChannelStart(void *ptr) 915 | { 916 | int flags = fcntl(g_cm_channel->fd, F_GETFL); 917 | int ret = fcntl(g_cm_channel->fd, F_SETFL, flags | O_NONBLOCK); 918 | assert(ret == 0); 919 | int error_flags = POLLERR | POLLHUP | POLLNVAL; 920 | struct pollfd pfd = { 921 | .fd = g_cm_channel->fd, 922 | .events = POLLIN, 923 | .revents = 0}; 924 | int num_events = 0; 925 | 926 | if (ret != 0) 927 | { 928 | rdmaErr("RDMA: fcntl rdma cm event channel fd failed status: %s", strerror(errno)); 929 | return NULL; 930 | } 931 | 932 | while (!atomic_load(&g_should_stop)) 933 | { 934 | num_events = poll(&pfd, 1, rdmaPollEventTimeoutms); 935 | 936 | if (num_events == -1) 937 | { 938 | rdmaErr("RDMA: poll rdma cm event channel faild (%s)", strerror(errno)); 939 | break; 940 | } 941 | else if (num_events == 0) 942 | { 943 | // rdmaDebug("RDMA: rdma cm event channel timeout reached. 
No events"); 944 | continue; 945 | } 946 | 947 | if ((pfd.revents & error_flags) != 0) 948 | { 949 | rdmaErr("RDMA: rdma cm event channel poll err"); 950 | break; 951 | } 952 | 953 | if (!(pfd.revents & POLLIN)) 954 | continue; 955 | 956 | ret = rdmaPollEvents(g_cm_channel, NULL); 957 | if (ret != 0) 958 | { 959 | rdmaErr("RDMA: poll CM events failed (%s)", strerror(errno)); 960 | break; 961 | } 962 | } 963 | 964 | rdmaDebug("rdma poll CM event thread exit!"); 965 | 966 | return NULL; 967 | } 968 | 969 | /* public RDMA interfaces */ 970 | 971 | /* RDMA server side */ 972 | 973 | int rdmaServer(RdmaListener **listener, const char *ip, 974 | const int port, const RdmaServerOptions *opt) 975 | { 976 | struct addrinfo hints, *addrinfo; 977 | struct sockaddr_storage sock_addr; 978 | struct rdma_cm_id *listen_cmid = NULL; 979 | struct rdma_event_channel *listen_channel = NULL; 980 | char _port[6]; /* strlen("65536") */ 981 | int ret = RDMA_OK, af_type = AF_INET, afonly = 1; 982 | assert(*listener); 983 | assert(ip); 984 | 985 | *listener = (RdmaListener *)malloc(sizeof(RdmaListener)); 986 | if (*listener == NULL) 987 | { 988 | rdmaErr("RDMA: failed to alloc RdmaListener %d (%s)", errno, strerror(errno)); 989 | goto err; 990 | } 991 | memset(*listener, 0, sizeof(**listener)); 992 | 993 | /* parse IP addr info */ 994 | sprintf(_port, "%d", port); 995 | af_type = strchr(ip, ':') ? 
AF_INET6 : AF_INET;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = af_type;
    hints.ai_flags = AI_PASSIVE;
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(ip, _port, &hints, &addrinfo);
    if (ret || !addrinfo)
    {
        rdmaErr("RDMA: failed to get addr info for %s:%d", ip, port);
        ret = RDMA_ERR;
        goto err;
    }

    /* create listen event channel */
    listen_channel = rdma_create_event_channel();
    if (!listen_channel)
    {
        rdmaErr("RDMA: create event channel failed");
        ret = RDMA_ERR;
        goto err;
    }

    /* setup Rdma Server Options */
    rdmaSetDefaultOptions(&(*listener)->options);
    if (opt)
    {
        if (opt->rdma_comm_mode == RDMA_BLOCKING)
        {
            /* blocking mode forces the one-QP-per-CQ mapping */
            (*listener)->options.rdma_qp2cq_mode = ONE_TO_ONE;
            rdmaQp2CqMode = ONE_TO_ONE;
        }
        if (opt->accept_callback)
        {
            rdmaServerSetAcceptCallback(*listener, opt->accept_callback);
        }
        rdmaSetGlobalEnv(opt);
    }

    /* copy the resolved address and force the requested port */
    memset(&sock_addr, 0, sizeof(sock_addr));
    if (addrinfo->ai_family == AF_INET6)
    {
        memcpy(&sock_addr, addrinfo->ai_addr, sizeof(struct sockaddr_in6));
        ((struct sockaddr_in6 *)&sock_addr)->sin6_family = AF_INET6;
        ((struct sockaddr_in6 *)&sock_addr)->sin6_port = htons(port);
    }
    else
    {
        memcpy(&sock_addr, addrinfo->ai_addr, sizeof(struct sockaddr_in));
        ((struct sockaddr_in *)&sock_addr)->sin_family = AF_INET;
        ((struct sockaddr_in *)&sock_addr)->sin_port = htons(port);
    }

    /* create listen rdma cm id */
    ret = rdma_create_id(listen_channel, &listen_cmid, NULL, RDMA_PS_TCP);
    if (ret)
    {
        rdmaErr("RDMA: create listen cm id error %d", errno);
        goto err;
    }

    rdma_set_option(listen_cmid, RDMA_OPTION_ID, RDMA_OPTION_ID_AFONLY,
                    &afonly, sizeof(afonly));

    ret = rdma_bind_addr(listen_cmid, (struct sockaddr *)&sock_addr);
    if (ret)
    {
        rdmaErr("RDMA: bind addr error for %s:%d", ip, port);
        goto err;
    }

    ret = rdma_listen(listen_cmid, (*listener)->options.rdma_listen_backlog);
    if (ret)
    {
        rdmaErr("RDMA: listen addr error %d", errno);
        goto err;
    }

    (*listener)->cm_id = listen_cmid;
    (*listener)->cm_channel = listen_channel;
    ret = RDMA_OK;
    goto end;

err:
    if (listen_cmid)
        rdma_destroy_id(listen_cmid);
    if (listen_channel)
        rdma_destroy_event_channel(listen_channel);
    /* fix: the original leaked the RdmaListener allocation on every error
     * path; free(NULL) is a no-op so this also covers the alloc failure */
    free(*listener);
    *listener = NULL;
    ret = RDMA_ERR;

end:
    if (addrinfo)
        freeaddrinfo(addrinfo);

    return ret;
}

/* Fetch and dispatch one CM event from `event_channel`.
 * `poll_ctx` is forwarded to the ESTABLISHED handler (the listener on the
 * server side, NULL on the client side).
 * Returns the handler's status, or RDMA_ERR when no event was available. */
int rdmaPollEvents(struct rdma_event_channel *event_channel, void *poll_ctx)
{
    int ret = RDMA_OK;
    struct rdma_cm_event *ev, event_copy;
    enum rdma_cm_event_type ev_type;

    ret = rdma_get_cm_event(event_channel, &ev);
    if (ret)
    {
        if (errno != EAGAIN)
        {
            rdmaWarn("RDMA: rdma event channel get cm event failed, %s", strerror(errno));
        }
        return RDMA_ERR;
    }

    /* Note that failing to acknowledge events will result in rdma_destroy_id() blocking.
*/ 1110 | memcpy(&event_copy, ev, sizeof(*ev)); 1111 | rdma_ack_cm_event(ev); 1112 | 1113 | ev_type = event_copy.event; 1114 | switch (ev_type) 1115 | { 1116 | case RDMA_CM_EVENT_CONNECT_REQUEST: 1117 | ret = rdmaOnConnectRequest(&event_copy); 1118 | break; 1119 | case RDMA_CM_EVENT_ADDR_RESOLVED: 1120 | ret = rdmaOnAddrResolved(&event_copy); 1121 | break; 1122 | case RDMA_CM_EVENT_ROUTE_RESOLVED: 1123 | ret = rdmaOnRouteResolved(&event_copy); 1124 | break; 1125 | case RDMA_CM_EVENT_ESTABLISHED: 1126 | ret = rdmaOnConnected(&event_copy, poll_ctx); 1127 | break; 1128 | case RDMA_CM_EVENT_UNREACHABLE: 1129 | case RDMA_CM_EVENT_ADDR_ERROR: 1130 | case RDMA_CM_EVENT_ROUTE_ERROR: 1131 | case RDMA_CM_EVENT_CONNECT_ERROR: 1132 | case RDMA_CM_EVENT_ADDR_CHANGE: 1133 | case RDMA_CM_EVENT_DISCONNECTED: 1134 | case RDMA_CM_EVENT_TIMEWAIT_EXIT: 1135 | ret = rdmaOnDisconnected(&event_copy); 1136 | break; 1137 | case RDMA_CM_EVENT_REJECTED: 1138 | ret = rdmaOnRejected(&event_copy); 1139 | break; 1140 | default: 1141 | rdmaWarn("RDMA: rdma cm event channel get unknown event %d (%s)", 1142 | ev_type, rdma_event_str(ev_type)); 1143 | } 1144 | 1145 | return ret; 1146 | } 1147 | 1148 | int rdmaOnConnectRequest(struct rdma_cm_event *ev) 1149 | { 1150 | int ret = RDMA_OK; 1151 | struct rdma_cm_id *cmid = ev->id; 1152 | struct sockaddr_storage caddr; 1153 | RdmaConn *conn = NULL; /* used to store accepted RDMA connection */ 1154 | struct rdma_conn_param conn_param = { 1155 | .responder_resources = 1, 1156 | .initiator_depth = 1, 1157 | .retry_count = rdmaRetryCount, 1158 | .rnr_retry_count = rdmaRnrRetryCount, 1159 | }; 1160 | char cip[NET_IP_STR_LEN]; 1161 | int cport = 0; 1162 | 1163 | rdmaDebug("rdmaOnConnectRequest recv a new connection"); 1164 | memcpy(&caddr, &cmid->route.addr.dst_addr, sizeof(caddr)); 1165 | if (caddr.ss_family == AF_INET) 1166 | { 1167 | struct sockaddr_in *s = (struct sockaddr_in *)&caddr; 1168 | inet_ntop(AF_INET, (void *)&(s->sin_addr), cip, sizeof(cip)); 
1169 | cport = ntohs(s->sin_port); 1170 | } 1171 | else 1172 | { 1173 | struct sockaddr_in6 *s = (struct sockaddr_in6 *)&caddr; 1174 | inet_ntop(AF_INET6, (void *)&(s->sin6_addr), cip, cport); 1175 | cport = ntohs(s->sin6_port); 1176 | } 1177 | 1178 | conn = malloc(sizeof(RdmaConn)); 1179 | if (!conn) 1180 | { 1181 | goto err; 1182 | } 1183 | memset(conn, 0, sizeof(*conn)); 1184 | 1185 | conn->ip = strdup(cip); 1186 | conn->port = cport; 1187 | conn->state = RDMA_CONN_STATE_ACCEPTING; 1188 | conn->type = ACCEPTED_CONN; 1189 | conn->send_length = 0; 1190 | conn->recv_length = 0; 1191 | conn->tx_length = 0; 1192 | pthread_cond_init(&conn->status_cond, NULL); 1193 | pthread_mutex_init(&conn->status_mutex, NULL); 1194 | 1195 | cmid->context = conn; 1196 | if (rdmaConnCreate(cmid, conn) != RDMA_OK) 1197 | { 1198 | rdmaErr("RDMA: failed to create accepted RDMA Connection (%s:%d)", cip, cport); 1199 | goto err; 1200 | } 1201 | 1202 | conn_param.initiator_depth = rdmaMaxOutstandingRdAtomic; 1203 | conn_param.responder_resources = rdmaMaxOutstandingRdAtomic; 1204 | ret = rdma_accept(cmid, &conn_param); 1205 | if (ret) 1206 | { 1207 | rdmaErr("RDMA: accept failed %d (%s)", errno, strerror(errno)); 1208 | goto err; 1209 | } 1210 | 1211 | return RDMA_OK; 1212 | 1213 | err: 1214 | /* free rdma related resource */ 1215 | rdmaConnRelease(conn); 1216 | 1217 | /* reject connect request if hitting error*/ 1218 | rdma_reject(cmid, NULL, 0); 1219 | 1220 | return RDMA_ERR; 1221 | } 1222 | 1223 | int rdmaOnAddrResolved(struct rdma_cm_event *ev) 1224 | { 1225 | struct rdma_cm_id *id = ev->id; 1226 | RdmaConn *conn = id->context; 1227 | 1228 | /* resolve route at most 1000ms */ 1229 | if (rdma_resolve_route(id, conn->options.rdma_timeoutms) != 0) 1230 | { 1231 | rdmaErr("RDMA: resolve route failed with timeout %d ms", conn->options.rdma_timeoutms); 1232 | rdmaOnRejected(ev); 1233 | return RDMA_ERR; 1234 | } 1235 | 1236 | return RDMA_OK; 1237 | } 1238 | 1239 | int 
rdmaOnRouteResolved(struct rdma_cm_event *ev) 1240 | { 1241 | struct rdma_cm_id *id = ev->id; 1242 | struct rdma_conn_param conn_param = {0}; 1243 | RdmaConn *conn = id->context; 1244 | 1245 | if (rdmaConnCreate(id, conn) != RDMA_OK) 1246 | { 1247 | rdmaErr("RDMA: failed to create RDMA Connection Resource"); 1248 | return RDMA_ERR; 1249 | } 1250 | 1251 | /* rdma connect with param */ 1252 | conn_param.responder_resources = rdmaMaxOutstandingRdAtomic; 1253 | conn_param.initiator_depth = rdmaMaxOutstandingRdAtomic; 1254 | conn_param.retry_count = rdmaRetryCount; 1255 | conn_param.rnr_retry_count = rdmaRnrRetryCount; 1256 | if (rdma_connect(id, &conn_param)) 1257 | { 1258 | rdmaErr("RDMA: rdma_connect() failed in error (%s)", strerror(errno)); 1259 | rdmaOnRejected(ev); 1260 | return RDMA_ERR; 1261 | } 1262 | 1263 | return RDMA_OK; 1264 | } 1265 | 1266 | static RdmaCmd *rdmaAllocCmdBuf(RdmaConn *conn, RdmaWrCtx **tx_ctx) 1267 | { 1268 | RdmaCmd *_cmd; 1269 | RdmaWrCtx *_tx_ctx; 1270 | int i; 1271 | 1272 | /* find an unused cmd buffer */ 1273 | for (i = rdmaRecvDepth; i < 2 * rdmaRecvDepth; i++) 1274 | { 1275 | _cmd = conn->cmd_buf + i; 1276 | if (!_cmd->magic) 1277 | { 1278 | break; 1279 | } 1280 | } 1281 | 1282 | assert(i < 2 * rdmaRecvDepth); 1283 | 1284 | /* find corresponding RdmaWrCtx */ 1285 | _tx_ctx = conn->tx_ctx + i - rdmaRecvDepth; 1286 | *tx_ctx = _tx_ctx; 1287 | 1288 | return _cmd; 1289 | } 1290 | 1291 | int rdmaSendCommand(RdmaConn *conn, struct rdma_cm_id *id, RdmaCmd *cmd, void *tx_ctx) 1292 | { 1293 | int ret; 1294 | 1295 | ret = rdma_send_signaled(id->qp, (uint64_t)tx_ctx, (uint64_t)cmd, 1296 | sizeof(RdmaCmd), conn->cmd_mr->lkey, conn->max_inline_data); 1297 | if (ret) 1298 | { 1299 | rdmaWarn("RDMA: post send RDMA cmd failed %d", ret); 1300 | conn->state = RDMA_CONN_STATE_ERROR; 1301 | return RDMA_ERR; 1302 | } 1303 | 1304 | return RDMA_OK; 1305 | } 1306 | 1307 | /* sync RDMA recv MR to remote via two sided messaging */ 1308 | int 
connRdmaSyncRxMr(RdmaConn *conn, struct rdma_cm_id *cm_id) 1309 | { 1310 | RdmaCmd *cmd; 1311 | RdmaWrCtx *tx_ctx; 1312 | 1313 | cmd = rdmaAllocCmdBuf(conn, &tx_ctx); 1314 | 1315 | cmd->addr = (uint64_t)conn->recv_buf; 1316 | cmd->length = htonl(conn->recv_length); 1317 | cmd->key = htonl(conn->recv_mr->rkey); 1318 | cmd->cmd_opcode = REG_LOCAL_ADDR; 1319 | cmd->magic = RDMA_CMD_MAGIC; 1320 | 1321 | tx_ctx->type = SEND_CONTEXT; 1322 | tx_ctx->rdma_conn = (void *)conn; 1323 | tx_ctx->private_data = (void *)cmd; 1324 | 1325 | conn->rx_offset = 0; 1326 | conn->recv_offset = 0; 1327 | 1328 | return rdmaSendCommand(conn, cm_id, cmd, tx_ctx); 1329 | } 1330 | 1331 | int connRdmaSyncPhysRxMr(RdmaConn *conn, struct rdma_cm_id *cm_id) 1332 | { 1333 | RdmaCmd *cmd; 1334 | RdmaWrCtx *tx_ctx; 1335 | 1336 | if (!rdmaEnablePhysAddrAccess) 1337 | return RDMA_OK; 1338 | 1339 | if (rdmaEnablePhysAddrAccess && (!g_ctx || !g_ctx->phys_mr)) 1340 | { 1341 | rdmaDebug("You should enable Physical Memory Access over RDMA before use. 
\n" 1342 | "Note that you need to enable RDMA Physical Address Memory Region" 1343 | " (pa-mr) in MLNX_OFED driver."); 1344 | return RDMA_ERR; 1345 | } 1346 | 1347 | cmd = rdmaAllocCmdBuf(conn, &tx_ctx); 1348 | 1349 | cmd->addr = (uint64_t)g_ctx->phys_mr->addr; 1350 | cmd->length = htonl(g_ctx->phys_mr->length); 1351 | cmd->key = htonl(g_ctx->phys_mr->rkey); 1352 | cmd->cmd_opcode = REG_PHYS_ADDR; 1353 | cmd->magic = RDMA_CMD_MAGIC; 1354 | 1355 | tx_ctx->type = SEND_CONTEXT; 1356 | tx_ctx->rdma_conn = (void *)conn; 1357 | tx_ctx->private_data = (void *)cmd; 1358 | 1359 | return rdmaSendCommand(conn, cm_id, cmd, tx_ctx); 1360 | } 1361 | 1362 | int connRdmaSayBye(RdmaConn *conn, struct rdma_cm_id *cm_id) 1363 | { 1364 | RdmaCmd *cmd; 1365 | RdmaWrCtx *tx_ctx; 1366 | 1367 | cmd = rdmaAllocCmdBuf(conn, &tx_ctx); 1368 | 1369 | cmd->cmd_opcode = CONN_GOODBYE; 1370 | cmd->magic = RDMA_CMD_MAGIC; 1371 | 1372 | tx_ctx->type = SEND_CONTEXT; 1373 | tx_ctx->rdma_conn = (void *)conn; 1374 | tx_ctx->private_data = (void *)cmd; 1375 | 1376 | return rdmaSendCommand(conn, cm_id, cmd, tx_ctx); 1377 | } 1378 | 1379 | int rdmaOnConnected(struct rdma_cm_event *ev, void *poll_ctx) 1380 | { 1381 | struct rdma_cm_id *id = ev->id; 1382 | RdmaConn *conn = id->context; 1383 | 1384 | connRdmaSyncPhysRxMr(conn, id); 1385 | connRdmaSyncRxMr(conn, id); 1386 | conn->state = RDMA_CONN_STATE_CONNECTED; 1387 | 1388 | if (conn->type == ACCEPTED_CONN && poll_ctx) 1389 | { 1390 | RdmaListener *listener = poll_ctx; 1391 | if (listener->accept_callback) 1392 | { 1393 | listener->accept_callback(conn); 1394 | } 1395 | } 1396 | else if (conn->type == CONNECTED_CONN) 1397 | { 1398 | /* ConnectCallback for client side */ 1399 | if (conn->connected_callback) 1400 | { 1401 | conn->connected_callback(conn); 1402 | } 1403 | } 1404 | 1405 | return RDMA_OK; 1406 | } 1407 | 1408 | int rdmaOnDisconnected(struct rdma_cm_event *ev) 1409 | { 1410 | struct rdma_cm_id *id = ev->id; 1411 | RdmaConn *conn = id->context; 
1412 | 1413 | conn->state = RDMA_CONN_STATE_CLOSED; 1414 | /* call Disconnect Callback before release */ 1415 | if (conn->disconnect_callback) 1416 | { 1417 | conn->disconnect_callback(conn); 1418 | } 1419 | pthread_mutex_lock(&conn->status_mutex); 1420 | pthread_cond_broadcast(&conn->status_cond); /* signal waiting threads */ 1421 | pthread_mutex_unlock(&conn->status_mutex); 1422 | rdmaConnRelease(conn); 1423 | 1424 | return RDMA_OK; 1425 | } 1426 | 1427 | int rdmaOnRejected(struct rdma_cm_event *ev) 1428 | { 1429 | struct rdma_cm_id *id = ev->id; 1430 | RdmaConn *conn = id->context; 1431 | 1432 | conn->state = RDMA_CONN_STATE_ERROR; 1433 | 1434 | return RDMA_OK; 1435 | } 1436 | 1437 | /* note that rdmaServerStart can also run in a thread in an async manner. */ 1438 | int rdmaServerStart(RdmaListener *listener) 1439 | { 1440 | int flags = fcntl(listener->cm_channel->fd, F_GETFL); 1441 | int ret = fcntl(listener->cm_channel->fd, F_SETFL, flags | O_NONBLOCK); 1442 | assert(ret == 0); 1443 | int error_flags = POLLERR | POLLHUP | POLLNVAL; 1444 | struct pollfd pfd = { 1445 | .fd = listener->cm_channel->fd, 1446 | .events = POLLIN, 1447 | .revents = 0}; 1448 | int num_events = 0; 1449 | 1450 | if (ret != 0) 1451 | { 1452 | rdmaErr("RDMA: fcntl rdma cm event channel fd failed status: %s", strerror(errno)); 1453 | return RDMA_ERR; 1454 | } 1455 | 1456 | while (!atomic_load(&g_should_stop)) 1457 | { 1458 | num_events = poll(&pfd, 1, rdmaPollEventTimeoutms); 1459 | 1460 | if (num_events == -1) 1461 | { 1462 | rdmaErr("RDMA: poll rdma cm event channel faild (%s)", strerror(errno)); 1463 | goto err; 1464 | } 1465 | else if (num_events == 0) 1466 | { 1467 | // rdmaDebug("RDMA: rdma cm event channel timeout reached. 
No events"); 1468 | continue; 1469 | } 1470 | 1471 | if ((pfd.revents & error_flags) != 0) 1472 | { 1473 | rdmaErr("RDMA: rdma cm event channel poll err"); 1474 | goto err; 1475 | } 1476 | 1477 | if (!(pfd.revents & POLLIN)) 1478 | continue; 1479 | 1480 | ret = rdmaPollEvents(listener->cm_channel, listener); 1481 | if (ret != 0) 1482 | { 1483 | rdmaErr("RDMA: poll CM events failed (%s)", strerror(errno)); 1484 | goto err; 1485 | } 1486 | } 1487 | 1488 | ret = RDMA_OK; 1489 | goto end; 1490 | err: 1491 | ret = RDMA_ERR; 1492 | 1493 | end: 1494 | return ret; 1495 | } 1496 | 1497 | int rdmaServerStop(RdmaListener *listener) 1498 | { 1499 | int ret = RDMA_OK; 1500 | 1501 | atomic_store(&g_should_stop, true); 1502 | rdmaContextRelease(); 1503 | 1504 | return ret; 1505 | } 1506 | 1507 | void rdmaServerRelease(RdmaListener *listener) 1508 | { 1509 | if (!listener) 1510 | return; 1511 | 1512 | if (listener->cm_id) 1513 | { 1514 | rdma_destroy_id(listener->cm_id); 1515 | } 1516 | 1517 | if (listener->cm_channel) 1518 | { 1519 | rdma_destroy_event_channel(listener->cm_channel); 1520 | } 1521 | 1522 | free(listener); 1523 | } 1524 | 1525 | int rdmaServerSetAcceptCallback(RdmaListener *listener, RdmaAcceptCallbackFunc func) 1526 | { 1527 | if (listener->accept_callback == func) 1528 | return RDMA_OK; 1529 | 1530 | listener->accept_callback = func; 1531 | 1532 | return RDMA_OK; 1533 | } 1534 | 1535 | /** rdma client side */ 1536 | 1537 | RdmaConn *rdmaConn(const RdmaServerOptions *opt) 1538 | { 1539 | RdmaConn *conn; 1540 | 1541 | if (!g_cm_channel) 1542 | { 1543 | g_cm_channel = rdma_create_event_channel(); 1544 | if (!g_cm_channel) 1545 | { 1546 | rdmaErr("RDMA: create cm event channel failed %d(%s)", errno, strerror(errno)); 1547 | return NULL; 1548 | } 1549 | 1550 | atomic_store(&g_ev_poller_should_stop, false); 1551 | pthread_create(&g_ev_poller_thread, NULL, rdmaCmChannelStart, NULL); 1552 | } 1553 | 1554 | conn = malloc(sizeof(RdmaConn)); 1555 | if (!conn) 1556 | { 1557 
| rdmaErr("RDMA: malloc RdmaConn failed %s", strerror(errno)); 1558 | return NULL; 1559 | } 1560 | memset(conn, 0, sizeof(*conn)); 1561 | 1562 | /* setup Rdma Server Options */ 1563 | rdmaSetDefaultOptions(&conn->options); 1564 | if (opt) 1565 | { 1566 | rdmaConnSetEnv(conn, opt); 1567 | rdmaSetGlobalEnv(opt); 1568 | } 1569 | 1570 | conn->state = RDMA_CONN_STATE_NONE; 1571 | conn->type = CONNECTED_CONN; 1572 | conn->cm_channel = g_cm_channel; 1573 | conn->send_length = 0; 1574 | conn->recv_length = 0; 1575 | conn->tx_length = 0; 1576 | pthread_cond_init(&conn->status_cond, NULL); 1577 | pthread_mutex_init(&conn->status_mutex, NULL); 1578 | 1579 | if (rdma_create_id(conn->cm_channel, &(conn->cm_id), (void *)conn, RDMA_PS_TCP)) 1580 | { 1581 | rdmaErr("RDMA: create cm id failed %d(%s", errno, strerror(errno)); 1582 | goto err; 1583 | } 1584 | 1585 | return conn; 1586 | 1587 | err: 1588 | if (conn) 1589 | free(conn); 1590 | 1591 | return NULL; 1592 | } 1593 | 1594 | int rdmaConnect(RdmaConn *conn, char *serverip, int port) 1595 | { 1596 | struct addrinfo hints, *servinfo = NULL; 1597 | struct sockaddr_storage saddr; 1598 | char _port[6]; /* strlen("65535") */ 1599 | int ret = RDMA_ERR; 1600 | 1601 | snprintf(_port, 6, "%d", port); 1602 | memset(&hints, 0, sizeof(hints)); 1603 | hints.ai_family = AF_INET; 1604 | hints.ai_socktype = SOCK_STREAM; 1605 | 1606 | if (getaddrinfo(serverip, _port, &hints, &servinfo)) 1607 | { 1608 | hints.ai_family = AF_INET6; 1609 | if (getaddrinfo(serverip, _port, &hints, &servinfo)) 1610 | { 1611 | rdmaWarn("RDMA: bad server addr info %d(%s)", errno, strerror(errno)); 1612 | goto out; 1613 | } 1614 | } 1615 | 1616 | if (servinfo->ai_family == PF_INET) 1617 | { 1618 | memcpy(&saddr, servinfo->ai_addr, sizeof(struct sockaddr_in)); 1619 | ((struct sockaddr_in *)&saddr)->sin_port = htons(port); 1620 | } 1621 | else if (servinfo->ai_family == PF_INET6) 1622 | { 1623 | memcpy(&saddr, servinfo->ai_addr, sizeof(struct sockaddr_in6)); 1624 | 
((struct sockaddr_in6 *)&saddr)->sin6_port = htons(port); 1625 | } 1626 | else 1627 | { 1628 | rdmaWarn("RDMA: Unsupported family"); 1629 | goto out; 1630 | } 1631 | 1632 | /* resolve addr at most 1000ms */ 1633 | conn->ip = strdup(serverip); 1634 | conn->port = port; 1635 | conn->state = RDMA_CONN_STATE_CONNECTING; 1636 | if (rdma_resolve_addr(conn->cm_id, NULL, 1637 | (struct sockaddr *)&saddr, rdmaTimeoutms)) 1638 | { 1639 | rdmaWarn("RDMA: cannot resolve addr %s:%d (error: %s)", 1640 | serverip, port, strerror(errno)); 1641 | goto out; 1642 | } 1643 | 1644 | /* wait for connected state */ 1645 | pthread_mutex_lock(&conn->status_mutex); 1646 | pthread_cond_wait(&conn->status_cond, &conn->status_mutex); 1647 | pthread_mutex_unlock(&conn->status_mutex); 1648 | 1649 | ret = RDMA_OK; 1650 | out: 1651 | if (servinfo) 1652 | { 1653 | freeaddrinfo(servinfo); 1654 | } 1655 | 1656 | return ret; 1657 | } 1658 | 1659 | void rdmaConnClose(RdmaConn *conn) 1660 | { 1661 | struct rdma_cm_id *cm_id = conn->cm_id; 1662 | 1663 | if (!cm_id) 1664 | return; 1665 | 1666 | // rdma_disconnect(cm_id); 1667 | connRdmaSayBye(conn, cm_id); 1668 | pthread_mutex_lock(&conn->status_mutex); 1669 | pthread_cond_wait(&conn->status_cond, &conn->status_mutex); 1670 | pthread_mutex_unlock(&conn->status_mutex); 1671 | } 1672 | 1673 | int rdmaConnSetRecvCallback(RdmaConn *conn, RdmaRecvCallbackFunc func) 1674 | { 1675 | if (func == conn->recv_callback) 1676 | return RDMA_OK; 1677 | 1678 | conn->recv_callback = func; 1679 | 1680 | return RDMA_OK; 1681 | } 1682 | 1683 | int rdmaConnSetWriteCallback(RdmaConn *conn, RdmaWriteCallbackFunc func) 1684 | { 1685 | if (func == conn->write_callback) 1686 | return RDMA_OK; 1687 | 1688 | conn->write_callback = func; 1689 | 1690 | return RDMA_OK; 1691 | } 1692 | 1693 | int rdmaConnSetReadCallback(RdmaConn *conn, RdmaReadCallbackFunc func) 1694 | { 1695 | if (func == conn->read_callback) 1696 | return RDMA_OK; 1697 | 1698 | conn->read_callback = func; 1699 | 1700 
| return RDMA_OK; 1701 | } 1702 | 1703 | int rdmaConnSetConnectedCallback(RdmaConn *conn, RdmaConnectedCallbackFunc func) 1704 | { 1705 | if (func == conn->connected_callback) 1706 | return RDMA_OK; 1707 | 1708 | conn->connected_callback = func; 1709 | 1710 | return RDMA_OK; 1711 | } 1712 | 1713 | int rdmaConnSetDisconnectCallback(RdmaConn *conn, RdmaDisconnectCallbackFunc func) 1714 | { 1715 | if (func == conn->disconnect_callback) 1716 | return RDMA_OK; 1717 | 1718 | conn->disconnect_callback = func; 1719 | 1720 | return RDMA_OK; 1721 | } 1722 | 1723 | /* data plane interfaces. Signaled by default. */ 1724 | size_t rdmaConnSend(RdmaConn *conn, void *data, size_t data_len) 1725 | { 1726 | struct rdma_cm_id *cm_id = conn->cm_id; 1727 | uint32_t off = conn->tx_offset; 1728 | char *addr = conn->send_buf + off; 1729 | char *remote_addr = conn->tx_addr + conn->tx_offset; 1730 | int ret; 1731 | uint32_t tosend; 1732 | 1733 | if (conn->state == RDMA_CONN_STATE_ERROR || conn->state == RDMA_CONN_STATE_CLOSED) 1734 | { 1735 | return RDMA_ERR; 1736 | } 1737 | 1738 | assert(conn->tx_offset <= conn->tx_length); 1739 | tosend = MIN(conn->tx_length - conn->tx_offset, data_len); 1740 | if (!tosend) 1741 | { 1742 | rdmaWarn("no idle tx memory for RDMA WRITE"); 1743 | return 0; 1744 | } 1745 | 1746 | memcpy(addr, data, data_len); 1747 | 1748 | if (!conn->write_callback && (++conn->send_ops % (RDMA_MAX_SGE / 2)) != 0) 1749 | { 1750 | ret = rdma_write_with_imm(cm_id->qp, (uint64_t)conn, htonl(conn->tx_offset), 1751 | (uint64_t)addr, conn->send_mr->lkey, 1752 | (uint64_t)remote_addr, conn->tx_key, 1753 | data_len, rdmaMaxInlineData); 1754 | } 1755 | else 1756 | { 1757 | ret = rdma_write_with_imm_signaled(cm_id->qp, (uint64_t)conn, htonl(conn->tx_offset), 1758 | (uint64_t)addr, conn->send_mr->lkey, 1759 | (uint64_t)remote_addr, conn->tx_key, 1760 | data_len, rdmaMaxInlineData); 1761 | } 1762 | if (ret) 1763 | { 1764 | rdmaErr("RDMA: post send failed : %s", strerror(errno)); 1765 | 
return RDMA_ERR; 1766 | } 1767 | 1768 | conn->tx_offset += data_len; 1769 | 1770 | return data_len; 1771 | } 1772 | 1773 | // size_t rdmaConnSendWithImm(RdmaConn *conn, uint32_t imm_data, 1774 | // const void *data, size_t data_len) 1775 | // { 1776 | // return RDMA_OK; 1777 | // } 1778 | 1779 | /* RDMA Write with data copy. Here we assume that input data buf 1780 | * is not RDMA-registered MR. 1781 | */ 1782 | size_t rdmaConnWrite(RdmaConn *conn, const void *data, size_t data_len) 1783 | { 1784 | struct rdma_cm_id *cm_id = conn->cm_id; 1785 | uint32_t off = conn->tx_offset; 1786 | char *addr = conn->send_buf + off; 1787 | char *remote_addr = conn->tx_addr + conn->tx_offset; 1788 | int ret; 1789 | uint32_t tosend; 1790 | 1791 | if (conn->state == RDMA_CONN_STATE_ERROR || conn->state == RDMA_CONN_STATE_CLOSED) 1792 | { 1793 | return RDMA_ERR; 1794 | } 1795 | 1796 | assert(conn->tx_offset <= conn->tx_length); 1797 | tosend = MIN(conn->tx_length - conn->tx_offset, data_len); 1798 | if (!tosend) 1799 | { 1800 | return 0; 1801 | } 1802 | 1803 | memcpy(addr, data, data_len); 1804 | 1805 | if (!conn->write_callback && (++conn->send_ops % (RDMA_MAX_SGE / 2)) != 0) 1806 | { 1807 | ret = rdma_write(cm_id->qp, (uint64_t)conn, 1808 | (uint64_t)addr, conn->send_mr->lkey, 1809 | (uint64_t)remote_addr, conn->tx_key, 1810 | data_len, rdmaMaxInlineData); 1811 | } 1812 | else 1813 | { 1814 | ret = rdma_write_signaled(cm_id->qp, (uint64_t)conn, 1815 | (uint64_t)addr, conn->send_mr->lkey, 1816 | (uint64_t)remote_addr, conn->tx_key, 1817 | data_len, rdmaMaxInlineData); 1818 | } 1819 | if (ret) 1820 | { 1821 | rdmaErr("RDMA: post send failed for RDMA WRITE : %s", strerror(errno)); 1822 | return RDMA_ERR; 1823 | } 1824 | 1825 | conn->tx_offset += data_len; 1826 | 1827 | return data_len; 1828 | } 1829 | 1830 | int rdmaConnWriteWithImm(RdmaConn *conn, uint32_t imm_data, 1831 | const void *data, size_t data_len) 1832 | { 1833 | return RDMA_OK; 1834 | } 1835 | 1836 | int 
rdmaConnRead(RdmaConn *conn, void *local_buf, uint32_t lkey,
             void *remote_buf, uint32_t rkey, size_t length)
{
    struct rdma_cm_id *cm_id = conn->cm_id;
    int ret;

    if (conn->state == RDMA_CONN_STATE_ERROR || conn->state == RDMA_CONN_STATE_CLOSED)
    {
        return RDMA_ERR;
    }

    ret = rdma_read_signaled(cm_id->qp, (uint64_t)conn, (uint64_t)local_buf, lkey,
                             (uint64_t)remote_buf, rkey, length, conn->max_inline_data);
    if (ret)
    {
        rdmaErr("RDMA: post send failed for RDMA READ : %s", strerror(errno));
        return RDMA_ERR;
    }

    return length;
}

/* Busy-poll conn->cq until one completion arrives; RDMA_OK when the reaped
 * completion succeeded, RDMA_ERR on poll error or failed wc.status.
 * fix: the original sync helpers looped only on == 0, silently treating a
 * negative ibv_poll_cq() return or a failed completion as success.
 * NOTE(review): the CQ is shared, so the completion reaped here may belong
 * to another outstanding WR — confirm the intended usage model. */
static int rdmaSpinWaitOneCompletion(RdmaConn *conn)
{
    struct ibv_wc wc = {0};
    int nc;

    while ((nc = ibv_poll_cq(conn->cq, 1, &wc)) == 0)
        ;

    if (nc < 0 || wc.status != IBV_WC_SUCCESS)
        return RDMA_ERR;

    return RDMA_OK;
}

/* One-sided RDMA WRITE that spins until its completion is reaped.
 * Returns `length` on success, RDMA_ERR on failure. */
int rdmaSyncWriteSignaled(RdmaConn *conn, uint64_t local_addr,
                          uint32_t lkey, uint64_t remote_addr,
                          uint32_t rkey, uint32_t length)
{
    struct rdma_cm_id *id = conn->cm_id;
    int ret;

    if (conn->state == RDMA_CONN_STATE_ERROR || conn->state == RDMA_CONN_STATE_CLOSED)
    {
        return RDMA_ERR;
    }

    ret = rdma_write_signaled(id->qp, (uint64_t)conn, local_addr,
                              lkey, remote_addr, rkey,
                              length, conn->max_inline_data);
    if (ret)
    {
        rdmaErr("RDMA: post send failed for RDMA WRITE : %s", strerror(errno));
        return RDMA_ERR;
    }

    if (rdmaSpinWaitOneCompletion(conn) != RDMA_OK)
        return RDMA_ERR;

    return length;
}

/* One-sided RDMA READ that spins until its completion is reaped.
 * Returns `length` on success, RDMA_ERR on failure. */
int rdmaSyncReadSignaled(RdmaConn *conn, uint64_t local_addr,
                         uint32_t lkey, uint64_t remote_addr,
                         uint32_t rkey, uint32_t length)
{
    struct rdma_cm_id *id = conn->cm_id;
    int ret;

    if (conn->state == RDMA_CONN_STATE_ERROR || conn->state == RDMA_CONN_STATE_CLOSED)
    {
        return RDMA_ERR;
    }

    ret = rdma_read_signaled(id->qp, (uint64_t)conn, local_addr, lkey,
                             remote_addr, rkey,
                             length, conn->max_inline_data);
    if (ret)
    {
        rdmaErr("RDMA: post send failed for RDMA READ : %s", strerror(errno));
        return RDMA_ERR;
    }

    if (rdmaSpinWaitOneCompletion(conn) != RDMA_OK)
        return RDMA_ERR;

    return length;
}

/* Physical-address RDMA WRITE (pa-mr) using the connection's tx_pa_rkey.
 * Asynchronous: the completion is handled by the CQ poller. */
int rdmaPAWriteSignaled(RdmaConn *conn, uint64_t local_addr,
                        uint32_t lkey, uint64_t remote_addr, uint32_t length)
{
    struct rdma_cm_id *id = conn->cm_id;
    int ret;

    if (conn->state == RDMA_CONN_STATE_ERROR || conn->state == RDMA_CONN_STATE_CLOSED)
    {
        return RDMA_ERR;
    }

    ret = rdma_write_signaled(id->qp, (uint64_t)conn, local_addr, lkey,
                              remote_addr, conn->tx_pa_rkey,
                              length, conn->max_inline_data);
    if (ret)
    {
        rdmaErr("RDMA: post send failed for RDMA Write : %s", strerror(errno));
        return RDMA_ERR;
    }

    return length;
}

/* Physical-address RDMA READ (pa-mr) using the connection's tx_pa_rkey.
 * Asynchronous: the completion is handled by the CQ poller. */
int rdmaPAReadSignaled(RdmaConn *conn, uint64_t local_addr,
                       uint32_t lkey, uint64_t remote_addr, uint32_t length)
{
    struct rdma_cm_id *cm_id = conn->cm_id;
    int ret;

    if (conn->state == RDMA_CONN_STATE_ERROR || conn->state == RDMA_CONN_STATE_CLOSED)
    {
        return RDMA_ERR;
    }

    ret = rdma_read_signaled(cm_id->qp, (uint64_t)conn, local_addr, lkey,
                             remote_addr, conn->tx_pa_rkey, length, conn->max_inline_data);
    if (ret)
    {
        rdmaErr("RDMA: post send failed for RDMA READ : %s", strerror(errno));
        return RDMA_ERR;
    }

    return length;
}

/* Synchronous physical-address RDMA WRITE: spins for its completion. */
int rdmaPASyncWriteSignaled(RdmaConn *conn, uint64_t local_addr,
                            uint32_t lkey, uint64_t remote_addr, uint32_t length)
{
    struct rdma_cm_id *id = conn->cm_id;
    int ret;

    if (conn->state == RDMA_CONN_STATE_ERROR || conn->state == RDMA_CONN_STATE_CLOSED)
    {
        return RDMA_ERR;
    }

    ret = rdma_write_signaled(id->qp, (uint64_t)conn, local_addr, lkey,
                              remote_addr, conn->tx_pa_rkey,
                              length, conn->max_inline_data);
    if (ret)
    {
        rdmaErr("RDMA: post send failed for RDMA Write : %s", strerror(errno));
        return RDMA_ERR;
    }

    if (rdmaSpinWaitOneCompletion(conn) != RDMA_OK)
        return RDMA_ERR;

    return length;
}

/* Synchronous physical-address RDMA READ: spins for its completion. */
int rdmaPASyncReadSignaled(RdmaConn *conn, uint64_t local_addr,
                           uint32_t lkey, uint64_t remote_addr, uint32_t length)
{
    struct rdma_cm_id *id = conn->cm_id;
    int ret;

    if (conn->state == RDMA_CONN_STATE_ERROR || conn->state == RDMA_CONN_STATE_CLOSED)
    {
        return RDMA_ERR;
    }

    ret = rdma_read_signaled(id->qp, (uint64_t)conn, local_addr, lkey,
                             remote_addr, conn->tx_pa_rkey,
                             length, conn->max_inline_data);
    if (ret)
    {
        rdmaErr("RDMA: post send failed for RDMA READ : %s", strerror(errno));
        return RDMA_ERR;
    }

    if (rdmaSpinWaitOneCompletion(conn) != RDMA_OK)
        return RDMA_ERR;

    return length;
}