├── Makefile ├── README.md ├── dcping.c └── my_ibv_helper.h /Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS += -g 3 | #CFLAGS += -O2 -Wall -W -Werror 4 | LDFLAGS += -libverbs -lrdmacm -lmlx5 5 | TARGETS = dcping 6 | 7 | all: 8 | $(CC) $(CFLAGS) -o $(TARGETS) dcping.c $(LDFLAGS) 9 | 10 | clean: 11 | rm -f $(TARGETS) 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DC RTT ping test 2 | 3 | ## Description 4 | dcping is example code for testing network Round Trip Time (RTT). 5 | It shows how to use the Mellanox DC QP to implement RDMA Write operations, and the verbs extended CQ to get timestamps for each transaction. 6 | 7 | The server (passive side) application creates a DCT QP, allowing RDMA WRITEs. 8 | The client (active side) creates a DCI QP. Once addressing is resolved by the client, it issues 2 sequential RDMA WRITEs and requests completion events. It then takes the difference between the 2 hardware timestamps to calculate the full RTT. It does that in a loop to get multiple results and calculate the min/avg/max RTTs. 9 | 10 | This example code demonstrates the following APIs: 11 | 1. [mlx5dv_create_qp()](https://github.com/linux-rdma/rdma-core/blob/master/providers/mlx5/man/mlx5dv_create_qp.3.md) for DCT & DCI 12 | 2. RDMA_CM external QP for DC address resolution: 13 | 2a. mlx5dv_reserved_qpn_alloc() 14 | 2b. ibv_query_ece()/ibv_set_ece() 15 | 2c. rdma_set_local_ece()/rdma_get_remote_ece() 16 | 3. ibv_qp_ex WR sends on the Mellanox DCI QP ([man ibv_wr_post](https://github.com/linux-rdma/rdma-core/blob/master/libibverbs/man/ibv_wr_post.3.md)): 17 | 3a. ibv_wr_start() 18 | 3b. ibv_wr_rdma_write() 19 | 3c. mlx5dv_wr_set_dc_addr() 20 | 3d. ibv_wr_set_sge() 21 | 3e. ibv_wr_complete() 22 | 4. 
ibv_cq_ex ([man ibv_create_cq_ex](https://github.com/linux-rdma/rdma-core/blob/master/libibverbs/man/ibv_create_cq_ex.3)): 23 | 4a. ibv_start_poll() 24 | 4b. ibv_wc_read_completion_ts() 25 | 4c. ibv_end_poll() 26 | 27 | 28 | ## Build: 29 | ```sh 30 | $ make clean 31 | $ make 32 | ``` 33 | 34 | ## Run Server: 35 | ```sh 36 | $ ./dcping -s -a 192.192.20.13 -d 37 | created cm_id 0x158aef0 38 | rdma_bind_addr successful on address: <192.192.20.13:7174> 39 | rdma_listen 40 | created pd 0x158a5c0 41 | created channel 0x158ab30 42 | created cq 0x158e3b0 43 | created srq 0x158e5d8 44 | created qp 0x1592018 (qpn=4700) 45 | hw_clocks_kHz = 78125 46 | allocated & registered buffers... 47 | server ready, waiting for client connection requests... 48 | waiting for client events ... 49 | got cm event: RDMA_CM_EVENT_CONNECT_REQUEST(4) status=0, cm_id 0x1592560 50 | accepting client connection request from <192.192.20.13:57929> (cm_id 0x1592560) 51 | waiting for client events ... 52 | got cm event: RDMA_CM_EVENT_ESTABLISHED(9) status=0, cm_id 0x1592560 53 | client connection established (cm_id 0x1592560) 54 | waiting for client events ... 55 | got cm event: RDMA_CM_EVENT_DISCONNECTED(10) status=0, cm_id 0x1592560 56 | client connection disconnected (cm_id 0x1592560) 57 | waiting for client events ... 58 | ^C 59 | ``` 60 | 61 | ## Run Client: 62 | ```sh 63 | $ ./dcping -c -a 192.192.20.13 -C 100 -D 100 -d 64 | created cm_id 0x719ef0 65 | got cm event: RDMA_CM_EVENT_ADDR_RESOLVED(0) status=0, cm_id 0x719ef0 66 | got cm event: RDMA_CM_EVENT_ROUTE_RESOLVED(2) status=0, cm_id 0x719ef0 67 | rdma_resolve_addr/rdma_resolve_route successful to server: <192.192.20.13:57929> 68 | created pd 0x71c170 69 | created channel 0x719560 70 | created cq 0x71d520 71 | created qp 0x71d748 (qpn=365) 72 | hw_clocks_kHz = 78125 73 | allocated & registered buffers... 74 | rdma_connecting... 
75 | got cm event: RDMA_CM_EVENT_CONNECT_RESPONSE(5) status=0, cm_id 0x719ef0 76 | got server param's: dctn=4700, buf=0x1592330, size=64, rkey=950316 77 | created ah (0x71c260) 78 | rdma_connect successful 79 | connected to server, starting DC RTT test 80 | [total = 100] rtt = 0.012 / 0.070 / 0.281 usec 81 | done DC RTT test 82 | dcping_free_buffers called on cb 0x7193c0 83 | dcping_free_qp/srq/cq/pd called on cb 0x7193c0 84 | destroy cm_id 0x719ef0 85 | ``` 86 | -------------------------------------------------------------------------------- /dcping.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 31 | */ 32 | #define _GNU_SOURCE 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include "my_ibv_helper.h" 48 | 49 | static int debug = 0; 50 | static int debug_fast_path = 0; 51 | #define DEBUG_LOG if (debug) printf 52 | #define DEBUG_LOG_FAST_PATH if (debug_fast_path) printf 53 | 54 | /* 55 | * dcping "RTT" loop: 56 | * server listens for incoming connection requests 57 | * client connects to server 58 | * server accepts and replies with RDMA buffer: addr/rkey/len 59 | * client receives remote addr/rkey/len 60 | * client loop: 61 | * posts rdma read/write "ping start" sz=1, and cqe will hold start_ts 62 | * posts rdma read/write "ping end" sz=SIZE, and cqe will hold end_ts 63 | * polls cq for 2 cqes, then RTT = (cqe[1]->ts - cqe[0]->ts) 64 | * wait for next latency polling loop 65 | * 66 | */ 67 | 68 | struct dcping_rdma_info { 69 | __be64 addr; 70 | __be32 rkey; 71 | __be32 size; 72 | __be32 dctn; 73 | }; 74 | 75 | /* 76 | * Default max buffer size for IO... 
77 | */ 78 | #define PING_BUFSIZE 1024 79 | #define PING_SQ_DEPTH 64 80 | #define DC_KEY 0xffeeddcc 81 | 82 | /* Default string for print data and 83 | * minimum buffer size 84 | */ 85 | #define _stringify( _x ) # _x 86 | #define stringify( _x ) _stringify( _x ) 87 | 88 | #define PING_MSG_FMT "dcping-%d: " 89 | #define PING_MIN_BUFSIZE sizeof(stringify(INT_MAX)) + sizeof(PING_MSG_FMT) 90 | 91 | #define MAX(a,b) ((a)>(b)?(a):(b)) 92 | #define MAX_INET_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN) 93 | 94 | #define USEC_PER_SEC 1000000L 95 | 96 | /* 97 | * Control block struct. 98 | */ 99 | struct dcping_cb { 100 | int is_server; 101 | uint32_t count; /* ping count */ 102 | uint32_t size; /* ping data size */ 103 | uint32_t delay_usec; 104 | 105 | /* verbs stuff */ 106 | struct ibv_comp_channel *channel; 107 | struct ibv_cq_ex *cq; 108 | struct ibv_pd *pd; 109 | struct ibv_srq *srq; /* server only (for DCT) */ 110 | struct ibv_qp *qp; /* DCI (client) or DCT (server) */ 111 | struct ibv_qp_ex *qpex; /* client only */ 112 | struct mlx5dv_qp_ex *mqpex; /* client only */ 113 | struct ibv_ah *ah; /* client only */ 114 | enum ibv_mtu mtu; 115 | uint8_t is_global:1; 116 | uint8_t is_reserved_qpn_supp:1; 117 | uint8_t is_ece_supp:1; 118 | uint8_t sgid_index; 119 | uint32_t reserved_qpn; 120 | struct ibv_ece ece; 121 | 122 | /* CM stuff */ 123 | struct rdma_event_channel *cm_channel; 124 | struct rdma_cm_id *cm_id; /* connection on client side,*/ 125 | /* listener on service side. 
*/ 126 | 127 | uint64_t hw_clocks_kHz; 128 | char *local_buf_addr; 129 | struct ibv_mr *local_buf_mr; 130 | struct dcping_rdma_info remote_buf_info; 131 | 132 | struct sockaddr_storage sin; 133 | struct sockaddr_storage ssource; 134 | __be16 port; /* dst port in NBO */ 135 | }; 136 | 137 | struct rdma_event_channel *create_first_event_channel(void) 138 | { 139 | struct rdma_event_channel *channel; 140 | 141 | channel = rdma_create_event_channel(); 142 | if (!channel) { 143 | if (errno == ENODEV) 144 | fprintf(stderr, "No RDMA devices were detected\n"); 145 | else 146 | perror("failed to create RDMA CM event channel"); 147 | } 148 | return channel; 149 | } 150 | 151 | static void dcping_init_conn_param(struct dcping_cb *cb, 152 | struct rdma_cm_id *cm_id, 153 | struct rdma_conn_param *conn_param) 154 | { 155 | uint32_t qp_num = 0; 156 | 157 | if (cb->is_reserved_qpn_supp) { 158 | int ret = my_mlx5dv_reserved_qpn_alloc(cb->cm_id->verbs, &qp_num); 159 | if (ret) { 160 | cb->is_reserved_qpn_supp = 0; 161 | DEBUG_LOG("reserved_qpn...NOT SUPPORTED\n"); 162 | } 163 | cb->reserved_qpn = qp_num; 164 | } 165 | 166 | if (qp_num == 0) { 167 | // Fall back to some software base qp_num allocation 168 | 169 | if (cb->is_server) { 170 | // fake a unique qp_num based on peer's IP addr + UDP port as we're 171 | // using the same DCT as an external QPN from all RDMA_CM connection 172 | qp_num = (((struct sockaddr_in *)rdma_get_peer_addr(cm_id))->sin_addr.s_addr) << 16; 173 | qp_num |= be16toh(rdma_get_dst_port(cm_id)); 174 | } else { 175 | qp_num = cb->qp->qp_num; 176 | } 177 | } 178 | 179 | memset(conn_param, 0, sizeof(*conn_param)); 180 | conn_param->responder_resources = 1; 181 | conn_param->initiator_depth = 1; 182 | conn_param->retry_count = 7; 183 | conn_param->rnr_retry_count = 7; 184 | conn_param->qp_num = qp_num; 185 | 186 | conn_param->private_data = &cb->remote_buf_info; // server's reports it's RDMA buffer details 187 | conn_param->private_data_len = 
sizeof(cb->remote_buf_info); 188 | } 189 | 190 | static int dcping_setup_buffers(struct dcping_cb *cb) 191 | { 192 | int ret; 193 | 194 | cb->local_buf_addr = malloc(cb->size); 195 | if (!cb->local_buf_addr) { 196 | fprintf(stderr, "local_buf_addr malloc failed\n"); 197 | ret = -ENOMEM; 198 | goto err1; 199 | } 200 | 201 | cb->local_buf_mr = ibv_reg_mr(cb->pd, cb->local_buf_addr, cb->size, 202 | IBV_ACCESS_LOCAL_WRITE | 203 | IBV_ACCESS_REMOTE_WRITE); 204 | if (!cb->local_buf_mr) { 205 | fprintf(stderr, "local_buf_addr reg_mr failed\n"); 206 | ret = errno; 207 | goto err2; 208 | } 209 | 210 | DEBUG_LOG("allocated & registered buffers...\n"); 211 | return 0; 212 | 213 | err2: 214 | free(cb->local_buf_addr); 215 | err1: 216 | return ret; 217 | } 218 | 219 | static void dcping_free_buffers(struct dcping_cb *cb) 220 | { 221 | DEBUG_LOG("dcping_free_buffers called on cb %p\n", cb); 222 | ibv_dereg_mr(cb->local_buf_mr); 223 | free(cb->local_buf_addr); 224 | } 225 | 226 | static void dcping_ece_get_locally_set_remote(struct dcping_cb *cb, struct rdma_cm_id *cm_id) 227 | { 228 | int ret = 0; 229 | if (!cb->is_ece_supp) 230 | return; 231 | 232 | DEBUG_LOG("update ECE from QP to CM..."); 233 | 234 | ret = ibv_query_ece(cb->qp, &cb->ece); 235 | if (ret) { 236 | cb->is_ece_supp = 0; 237 | DEBUG_LOG("NOT SUPPORTED\n"); 238 | return; 239 | } 240 | DEBUG_LOG("(%#x, %#x)\n", cb->ece.vendor_id, cb->ece.options); 241 | ret = rdma_set_local_ece(cm_id, &cb->ece); 242 | if (ret) { 243 | perror("rdma_set_local_ece"); 244 | return; 245 | } 246 | } 247 | 248 | static void dcping_ece_get_remote_set_locally(struct dcping_cb *cb, struct rdma_cm_id *cm_id) 249 | { 250 | int ret = 0; 251 | struct ibv_ece ece; 252 | if (!cb->is_ece_supp) 253 | return; 254 | 255 | DEBUG_LOG("update ECE from CM responce to QP..."); 256 | 257 | ret = rdma_get_remote_ece(cm_id, &ece); 258 | if (ret) { 259 | cb->is_ece_supp = 0; 260 | DEBUG_LOG("NOT SUPPORTED\n"); 261 | return; 262 | } 263 | DEBUG_LOG("(%#x, 
%#x)\n", ece.vendor_id, ece.options); 264 | ret = ibv_set_ece(cb->qp, &ece); /* store the result: ret is 0 here (a non-zero rdma_get_remote_ece() status returned above), so without the assignment the check below could never fire */ 265 | if (ret) { 266 | perror("ibv_set_ece"); 267 | return; 268 | } 269 | } 270 | 271 | static int dcping_create_qp(struct dcping_cb *cb) 272 | { 273 | struct ibv_qp_init_attr_ex attr_ex; 274 | struct mlx5dv_qp_init_attr attr_dv; 275 | int ret = 0; 276 | 277 | /* create DC QP */ 278 | memset(&attr_ex, 0, sizeof(attr_ex)); 279 | memset(&attr_dv, 0, sizeof(attr_dv)); 280 | 281 | attr_ex.qp_type = IBV_QPT_DRIVER; 282 | attr_ex.send_cq = ibv_cq_ex_to_cq(cb->cq); 283 | attr_ex.recv_cq = ibv_cq_ex_to_cq(cb->cq); 284 | 285 | attr_ex.comp_mask |= IBV_QP_INIT_ATTR_PD; 286 | attr_ex.pd = cb->pd; 287 | attr_ex.srq = cb->srq; /* will be NULL for client (DCI) */ 288 | 289 | if (cb->is_server) { 290 | /* create DCT */ 291 | attr_dv.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_DC; 292 | attr_dv.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCT; 293 | attr_dv.dc_init_attr.dct_access_key = DC_KEY; 294 | 295 | cb->qp = mlx5dv_create_qp(cb->cm_id->verbs, &attr_ex, &attr_dv); 296 | } 297 | else { 298 | /* create DCI */ 299 | attr_dv.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_DC; 300 | attr_dv.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI; 301 | 302 | attr_ex.cap.max_send_wr = PING_SQ_DEPTH; 303 | attr_ex.cap.max_send_sge = 1; 304 | 305 | attr_ex.comp_mask |= IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; 306 | attr_ex.send_ops_flags = IBV_QP_EX_WITH_RDMA_WRITE; 307 | 308 | attr_dv.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; 309 | attr_dv.create_flags |= MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE; /*driver doesnt support scatter2cqe data-path on DCI yet*/ 310 | 311 | cb->qp = mlx5dv_create_qp(cb->cm_id->verbs, &attr_ex, &attr_dv); 312 | } 313 | 314 | if (!cb->qp) { 315 | perror("mlx5dv_create_qp(DC)"); 316 | ret = errno; 317 | return ret; 318 | } 319 | if (!cb->is_server) { 320 | cb->qpex = ibv_qp_to_qp_ex(cb->qp); 321 | if (!cb->qpex) { 322 | perror("ibv_qp_to_qp_ex(DC)"); 323 | return errno ? errno : EINVAL; /* stop here: a NULL qpex must not be passed to mlx5dv_qp_ex_from_ibv_qp_ex() below, and the caller needs a non-zero status */ 324 | } 325 | cb->mqpex = 
mlx5dv_qp_ex_from_ibv_qp_ex(cb->qpex); 326 | if (!cb->mqpex) { 327 | perror("mlx5dv_qp_ex_from_ibv_qp_ex(DC)"); 328 | ret = errno; 329 | } 330 | return ret; 331 | } 332 | 333 | return ret; 334 | } 335 | 336 | static int dcping_modify_qp(struct dcping_cb *cb) 337 | { 338 | int attr_mask = 0; 339 | int ret = 0; 340 | 341 | /* modify QP to INIT */ 342 | { 343 | attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT; 344 | 345 | struct ibv_qp_attr attr = { 346 | .qp_state = IBV_QPS_INIT, 347 | .pkey_index = 0, 348 | .port_num = cb->cm_id->port_num, 349 | }; 350 | 351 | if (cb->is_server) { 352 | attr_mask |= IBV_QP_ACCESS_FLAGS; 353 | attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE; 354 | } 355 | 356 | 357 | if (ibv_modify_qp(cb->qp, &attr, attr_mask)) { 358 | perror("failed to modify QP to IBV_QPS_INIT"); 359 | ret = errno; 360 | return ret; 361 | } 362 | } 363 | 364 | /* modify QP to RTR */ 365 | { 366 | attr_mask = IBV_QP_STATE | IBV_QP_PATH_MTU | IBV_QP_AV; 367 | 368 | struct ibv_qp_attr attr = { 369 | .qp_state = IBV_QPS_RTR, 370 | .path_mtu = cb->mtu, 371 | .min_rnr_timer = 0x10, 372 | .rq_psn = 0, 373 | .ah_attr = { 374 | .is_global = cb->is_global, 375 | .sl = 0, 376 | .src_path_bits = 0, 377 | .port_num = cb->cm_id->port_num, 378 | .grh.hop_limit = 1, 379 | .grh.sgid_index = cb->sgid_index, 380 | .grh.traffic_class = 0, 381 | 382 | } 383 | }; 384 | 385 | if (cb->is_server) { 386 | attr_mask |= IBV_QP_MIN_RNR_TIMER; 387 | } 388 | 389 | if (ibv_modify_qp(cb->qp, &attr, attr_mask)) { 390 | perror("failed to modify QP to IBV_QPS_RTR"); 391 | ret = errno; 392 | return ret; 393 | } 394 | } 395 | 396 | if (!cb->is_server) { 397 | /* modify QP to RTS */ 398 | attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | 399 | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | 400 | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC; 401 | // Optional: IB_QP_MIN_RNR_TIMER 402 | 403 | struct ibv_qp_attr attr = { 404 | .qp_state = IBV_QPS_RTS, 405 | .timeout = 0x10, 406 | .retry_cnt = 7, 407 | .rnr_retry = 7, 
408 | .sq_psn = 0, 409 | .max_rd_atomic = 1, 410 | }; 411 | 412 | if (ibv_modify_qp(cb->qp, &attr, attr_mask)) { 413 | perror("failed to modify QP to IBV_QPS_RTS"); 414 | ret = errno; 415 | return ret; 416 | } 417 | } 418 | 419 | return ret; 420 | } 421 | 422 | static void dcping_free_qp(struct dcping_cb *cb) 423 | { /* Destroys the QP, SRQ, CQ, completion channel and PD held by cb. Every handle is NULL-checked because dcping_run_server() jumps here when buffer setup fails, before the CQ and channel have been created (assumes unset handles in cb are NULL, as the existing qp/srq checks already do). */ 424 | DEBUG_LOG("dcping_free_qp/srq/cq/pd called on cb %p\n", cb); 425 | if (cb->qp) ibv_destroy_qp(cb->qp); 426 | if (cb->srq) ibv_destroy_srq(cb->srq); 427 | if (cb->cq) ibv_destroy_cq(ibv_cq_ex_to_cq(cb->cq)); 428 | if (cb->channel) ibv_destroy_comp_channel(cb->channel); 429 | if (cb->pd) ibv_dealloc_pd(cb->pd); 430 | } 431 | 432 | static int dcping_setup_qp(struct dcping_cb *cb, struct rdma_cm_id *srv_req_cm_id) 433 | { 434 | int ret; 435 | struct ibv_cq_init_attr_ex cq_attr_ex; 436 | struct ibv_device_attr_ex device_attr_ex = {}; 437 | 438 | cb->channel = ibv_create_comp_channel(cb->cm_id->verbs); 439 | if (!cb->channel) { 440 | fprintf(stderr, "ibv_create_comp_channel failed\n"); 441 | ret = errno; 442 | goto err1; 443 | } 444 | DEBUG_LOG("created channel %p\n", cb->channel); 445 | 446 | memset(&cq_attr_ex, 0, sizeof(cq_attr_ex)); 447 | cq_attr_ex.cqe = PING_SQ_DEPTH * 2; 448 | cq_attr_ex.cq_context = cb; 449 | cq_attr_ex.channel = cb->channel; 450 | cq_attr_ex.comp_vector = 0; 451 | cq_attr_ex.wc_flags = IBV_WC_EX_WITH_COMPLETION_TIMESTAMP; 452 | 453 | cb->cq = ibv_create_cq_ex(cb->cm_id->verbs, &cq_attr_ex); 454 | if (!cb->cq) { 455 | fprintf(stderr, "ibv_create_cq failed\n"); 456 | ret = errno; 457 | goto err2; 458 | } 459 | DEBUG_LOG("created cq %p\n", cb->cq); 460 | 461 | if (cb->is_server) 462 | { 463 | struct ibv_srq_init_attr srq_attr; 464 | memset(&srq_attr, 0, sizeof(srq_attr)); 465 | srq_attr.attr.max_wr = 2; 466 | srq_attr.attr.max_sge = 1; 467 | cb->srq = ibv_create_srq(cb->pd, &srq_attr); 468 | if (!cb->srq) { 469 | fprintf(stderr, "ibv_create_srq failed\n"); 470 | ret = errno; 471 | goto err3; 472 | } 473 | 474 | DEBUG_LOG("created srq %p\n", cb->srq); 475 | } 476 | 477 | 
ret = dcping_create_qp(cb); 478 | if (ret) { 479 | goto err4; 480 | } 481 | 482 | if (cb->is_server) { 483 | dcping_ece_get_remote_set_locally(cb, srv_req_cm_id); 484 | } 485 | 486 | ret = dcping_modify_qp(cb); 487 | if (ret) { 488 | goto err5; 489 | } 490 | DEBUG_LOG("created qp %p (qpn=%d)\n", cb->qp, (cb->qp ? cb->qp->qp_num : (uint32_t)-1)); 491 | 492 | ret = ibv_query_device_ex(cb->cm_id->verbs, NULL, &device_attr_ex); 493 | if (ret) { 494 | fprintf(stderr, "ibv_query_device_ex failed\n"); 495 | ret = errno; 496 | goto err3; 497 | } 498 | if (!device_attr_ex.hca_core_clock) { 499 | fprintf(stderr, "hca_core_clock = 0\n"); 500 | ret = errno; 501 | goto err3; 502 | } 503 | cb->hw_clocks_kHz = device_attr_ex.hca_core_clock; 504 | DEBUG_LOG("hw_clocks_kHz = %ld\n", cb->hw_clocks_kHz); 505 | 506 | return 0; 507 | 508 | err5: 509 | ibv_destroy_qp(cb->qp); 510 | err4: 511 | if (cb->srq) 512 | ibv_destroy_srq(cb->srq); 513 | err3: 514 | ibv_destroy_cq(ibv_cq_ex_to_cq(cb->cq)); 515 | err2: 516 | ibv_destroy_comp_channel(cb->channel); 517 | err1: 518 | ibv_dealloc_pd(cb->pd); 519 | return ret; 520 | } 521 | 522 | static int dcping_handle_cm_event(struct dcping_cb *cb, enum rdma_cm_event_type *cm_event, struct rdma_cm_id **cm_id) 523 | { 524 | int ret; 525 | struct rdma_cm_event *event; 526 | 527 | *cm_id = NULL; 528 | *cm_event = -1; 529 | 530 | ret = rdma_get_cm_event(cb->cm_channel, &event); 531 | if (ret) { 532 | perror("rdma_get_cm_event"); 533 | exit(ret); 534 | } 535 | DEBUG_LOG("got cm event: %s(%d) status=%d, cm_id %p\n", rdma_event_str(event->event), event->event, event->status, event->id); 536 | 537 | *cm_id = event->id; 538 | *cm_event = event->event; 539 | 540 | switch (event->event) { 541 | 542 | case RDMA_CM_EVENT_ADDR_RESOLVED: 543 | case RDMA_CM_EVENT_ADDR_ERROR: 544 | case RDMA_CM_EVENT_ROUTE_RESOLVED: 545 | case RDMA_CM_EVENT_ROUTE_ERROR: 546 | case RDMA_CM_EVENT_CONNECT_REQUEST: 547 | case RDMA_CM_EVENT_CONNECT_ERROR: 548 | case 
RDMA_CM_EVENT_UNREACHABLE: 549 | case RDMA_CM_EVENT_REJECTED: 550 | case RDMA_CM_EVENT_ESTABLISHED: 551 | case RDMA_CM_EVENT_DISCONNECTED: 552 | break; 553 | 554 | case RDMA_CM_EVENT_CONNECT_RESPONSE: 555 | if (event->param.conn.private_data_len >= sizeof(struct dcping_rdma_info)) { 556 | struct rdma_conn_param *conn_param = &event->param.conn; 557 | struct dcping_rdma_info *remote_buf_info = (struct dcping_rdma_info *)conn_param->private_data; 558 | 559 | cb->remote_buf_info.addr = be64toh(remote_buf_info->addr); 560 | cb->remote_buf_info.size = be32toh(remote_buf_info->size); 561 | cb->remote_buf_info.rkey = be32toh(remote_buf_info->rkey); 562 | cb->remote_buf_info.dctn = be32toh(remote_buf_info->dctn); 563 | 564 | DEBUG_LOG("got server param's: dctn=%d, buf=%llu, size=%d, rkey=%d\n", cb->remote_buf_info.dctn, cb->remote_buf_info.addr, cb->remote_buf_info.size, cb->remote_buf_info.rkey); 565 | } 566 | break; 567 | 568 | case RDMA_CM_EVENT_DEVICE_REMOVAL: 569 | fprintf(stderr, "cma detected device removal!!!!\n"); 570 | ret = -1; 571 | break; 572 | 573 | default: 574 | fprintf(stderr, "unhandled event: %s, ignoring\n", 575 | rdma_event_str(event->event)); 576 | ret = -1; 577 | break; 578 | } 579 | rdma_ack_cm_event(event); 580 | return ret; 581 | } 582 | 583 | static int dcping_bind_server(struct dcping_cb *cb) 584 | { 585 | int ret; 586 | char str[MAX_INET_ADDRSTRLEN]; 587 | struct ibv_port_attr port_attr; 588 | 589 | if (cb->sin.ss_family == AF_INET) { 590 | ((struct sockaddr_in *) &cb->sin)->sin_port = cb->port; 591 | inet_ntop(AF_INET, &(((struct sockaddr_in *)&cb->sin)->sin_addr), str, sizeof(str)); 592 | } 593 | else { 594 | ((struct sockaddr_in6 *) &cb->sin)->sin6_port = cb->port; 595 | inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)&cb->sin)->sin6_addr), str, sizeof(str)); 596 | } 597 | 598 | ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &cb->sin); 599 | if (ret) { 600 | perror("rdma_bind_addr"); 601 | return ret; 602 | } 603 | if (cb->cm_id->verbs == 
NULL) { 604 | DEBUG_LOG("Failed to bind to an RDMA device, exiting... <%s, %d>\n", str, be16toh(cb->port)); 605 | exit(1); 606 | } 607 | 608 | if (ibv_query_port(cb->cm_id->verbs, cb->cm_id->port_num, &port_attr)) { 609 | perror("ibv_query_port"); 610 | exit(1); 611 | } 612 | cb->mtu = port_attr.active_mtu; 613 | if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { 614 | cb->is_global = 1; 615 | cb->sgid_index = my_ibv_find_sgid_type(cb->cm_id->verbs, cb->cm_id->port_num, MY_IBV_GID_TYPE_ROCE_V2, cb->sin.ss_family); 616 | } 617 | 618 | DEBUG_LOG("rdma_bind_addr successful on address: <%s:%d>\n", str, be16toh(cb->port)); 619 | 620 | DEBUG_LOG("rdma_listen\n"); 621 | ret = rdma_listen(cb->cm_id, 3); 622 | if (ret) { 623 | perror("rdma_listen"); 624 | return ret; 625 | } 626 | 627 | cb->pd = ibv_alloc_pd(cb->cm_id->verbs); 628 | if (!cb->pd) { 629 | fprintf(stderr, "ibv_alloc_pd failed\n"); 630 | return errno; 631 | } 632 | DEBUG_LOG("created pd %p\n", cb->pd); 633 | 634 | return 0; 635 | } 636 | 637 | static void free_cb(struct dcping_cb *cb) 638 | { 639 | free(cb); 640 | } 641 | 642 | static int dcping_run_server(struct dcping_cb *cb) 643 | { 644 | int ret; 645 | char str[MAX_INET_ADDRSTRLEN]; 646 | 647 | ret = dcping_bind_server(cb); 648 | if (ret) 649 | return ret; 650 | 651 | ret = dcping_setup_buffers(cb); 652 | if (ret) { 653 | fprintf(stderr, "setup_buffers failed: %d\n", ret); 654 | goto err1; 655 | } 656 | 657 | printf("server ready, waiting for client connection requests...\n"); 658 | 659 | // main loop: 660 | // wait for CONN REQ 661 | // accept with dctn and MKey 662 | while (1) 663 | { 664 | struct rdma_cm_id *req_cm_id; 665 | enum rdma_cm_event_type cm_event; 666 | 667 | DEBUG_LOG("waiting for client events ...\n"); 668 | ret = dcping_handle_cm_event(cb, &cm_event, &req_cm_id); 669 | switch (cm_event) { 670 | 671 | case RDMA_CM_EVENT_CONNECT_REQUEST: 672 | if (cb->sin.ss_family == AF_INET) { 673 | inet_ntop(AF_INET, &(((struct sockaddr_in 
*)rdma_get_peer_addr(req_cm_id))->sin_addr), str, sizeof(str)); 674 | } 675 | else { 676 | inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)rdma_get_peer_addr(req_cm_id))->sin6_addr), str, sizeof(str)); 677 | } 678 | 679 | DEBUG_LOG("accepting client connection request from <%s:%d> (cm_id %p)\n", str, be16toh(rdma_get_dst_port(req_cm_id)), req_cm_id); 680 | 681 | if (!cb->qp) { 682 | ret = dcping_setup_qp(cb, req_cm_id); 683 | if (ret) { 684 | fprintf(stderr, "setup_qp failed: %d\n", ret); 685 | return ret; 686 | } 687 | 688 | if (cb->is_server) { 689 | cb->remote_buf_info.addr = htobe64((uint64_t) (unsigned long) cb->local_buf_addr); 690 | cb->remote_buf_info.size = htobe32(cb->size); 691 | cb->remote_buf_info.rkey = htobe32(cb->local_buf_mr->rkey); 692 | cb->remote_buf_info.dctn = htobe32(cb->qp->qp_num); 693 | } 694 | } 695 | 696 | dcping_ece_get_locally_set_remote(cb, req_cm_id); 697 | 698 | struct rdma_conn_param conn_param; 699 | dcping_init_conn_param(cb, req_cm_id, &conn_param); 700 | ret = rdma_accept(req_cm_id, &conn_param); 701 | if (ret) { 702 | perror("rdma_accept"); 703 | goto err2; 704 | } 705 | break; 706 | 707 | case RDMA_CM_EVENT_ESTABLISHED: 708 | printf("client connection established (cm_id %p)\n", req_cm_id); 709 | break; 710 | 711 | case RDMA_CM_EVENT_DISCONNECTED: 712 | rdma_disconnect(req_cm_id); 713 | rdma_destroy_id(req_cm_id); 714 | if (cb->is_reserved_qpn_supp) { 715 | my_mlx5dv_reserved_qpn_dealloc(cb->cm_id->verbs, cb->reserved_qpn); 716 | cb->reserved_qpn = 0; 717 | } 718 | if (cb->is_server) printf("client connection disconnected (cm_id %p)\n", req_cm_id); 719 | break; 720 | 721 | default: 722 | fprintf(stderr, "server unexpected event: %s (%d)\n", rdma_event_str(cm_event), cm_event); 723 | exit(1); 724 | break; 725 | } 726 | } 727 | 728 | 729 | ret = 0; 730 | err2: 731 | dcping_free_buffers(cb); 732 | err1: 733 | dcping_free_qp(cb); 734 | 735 | return ret; 736 | } 737 | 738 | static int dcping_client_dc_send_wr(struct dcping_cb *cb, 
uint64_t wr_id) 739 | { 740 | /* 1st small RDMA Write for DCI connect, this will create cqe->ts_start */ 741 | ibv_wr_start(cb->qpex); 742 | cb->qpex->wr_id = wr_id; 743 | cb->qpex->wr_flags = IBV_SEND_SIGNALED; 744 | ibv_wr_rdma_write(cb->qpex, cb->remote_buf_info.rkey, cb->remote_buf_info.addr); 745 | mlx5dv_wr_set_dc_addr(cb->mqpex, cb->ah, cb->remote_buf_info.dctn, DC_KEY); 746 | ibv_wr_set_sge(cb->qpex, cb->local_buf_mr->lkey, 747 | (uintptr_t)cb->local_buf_addr, 1); 748 | 749 | /* 2nd SIZE x RDMA Write, this will create cqe->ts_end */ 750 | cb->qpex->wr_flags = IBV_SEND_SIGNALED | IBV_SEND_FENCE; 751 | ibv_wr_rdma_write(cb->qpex, cb->remote_buf_info.rkey, cb->remote_buf_info.addr); 752 | mlx5dv_wr_set_dc_addr(cb->mqpex, cb->ah, cb->remote_buf_info.dctn, DC_KEY); 753 | ibv_wr_set_sge(cb->qpex, cb->local_buf_mr->lkey, 754 | (uintptr_t)cb->local_buf_addr, 755 | (uint32_t)cb->size); 756 | 757 | /* ring DB */ 758 | return ibv_wr_complete(cb->qpex); 759 | } 760 | 761 | static int dcping_client_wait_cq_event(struct dcping_cb *cb) 762 | { 763 | int ret; 764 | void *ev_ctx; 765 | struct ibv_cq *ev_cq; 766 | 767 | ret = ibv_req_notify_cq(ibv_cq_ex_to_cq(cb->cq), 0); 768 | if (ret) { 769 | perror("ibv_req_notify_cq"); 770 | ret = errno; 771 | return ret; 772 | } 773 | DEBUG_LOG_FAST_PATH("waiting for cq event...\n"); 774 | if (ibv_get_cq_event(cb->channel, &ev_cq, &ev_ctx)) { 775 | perror("ibv_get_cq_event"); 776 | ret = errno; 777 | return ret; 778 | } 779 | ibv_ack_cq_events(ibv_cq_ex_to_cq(cb->cq), 1); 780 | DEBUG_LOG_FAST_PATH("got someting.. 
checking\n"); 781 | 782 | if ((ev_ctx != cb) || (ev_cq != ibv_cq_ex_to_cq(cb->cq))) { 783 | fprintf(stderr, "ibv_get_cq_event return with wrong cq_ctx (%p) or wrong ibv_cq_ctx (%p)\n", ev_ctx, ev_cq); 784 | ret = errno; 785 | return ret; 786 | } 787 | return ret; 788 | } 789 | 790 | static int dcping_client_process_cqe(struct dcping_cb *cb, uint64_t wr_id, uint64_t *ts_out) 791 | { 792 | *ts_out = 0; 793 | 794 | if (cb->cq->status != IBV_WC_SUCCESS) { 795 | fprintf(stderr, "CQ failed with status '%s' (%d) for wr_id %d\n", 796 | ibv_wc_status_str(cb->cq->status), 797 | cb->cq->status, (int)cb->cq->wr_id); 798 | return -1; 799 | } 800 | 801 | if (cb->cq->wr_id != wr_id) { 802 | fprintf(stderr, "CQ failed wr_id compare '%s' (%d) for cqe->wr_id(%ld) vs wr_id(%ld)\n", 803 | ibv_wc_status_str(cb->cq->status), cb->cq->status, 804 | cb->cq->wr_id, wr_id); 805 | return -1; 806 | } 807 | 808 | *ts_out = ibv_wc_read_completion_ts(cb->cq); 809 | return 0; 810 | } 811 | 812 | static int dcping_client_get_cqe_tiemstmp(struct dcping_cb *cb, uint64_t wr_id, uint64_t *ts_hw_start, uint64_t *ts_hw_end) 813 | { 814 | /* we expect 2 cqe matching wr_id's to input */ 815 | 816 | int ret, step=0; 817 | uint64_t ts_hw; 818 | struct ibv_poll_cq_attr cq_attr = {}; 819 | 820 | *ts_hw_start = *ts_hw_end = 0; 821 | 822 | do { 823 | ret = ibv_start_poll(cb->cq, &cq_attr); 824 | if (ret) { 825 | if (ret == ENOENT) { 826 | ret = dcping_client_wait_cq_event(cb); 827 | if (ret) { 828 | return ret; 829 | } 830 | /* check cq again, return to main loop */ 831 | continue; 832 | } 833 | perror("ibv_start_poll"); 834 | ret = errno; 835 | return ret; 836 | } 837 | 838 | ret = dcping_client_process_cqe(cb, wr_id, &ts_hw); 839 | ibv_end_poll(cb->cq); 840 | 841 | DEBUG_LOG_FAST_PATH("processing cqe (step %d) ts_hw = %lu\n", step, ts_hw); 842 | 843 | if (ret) 844 | return ret; 845 | 846 | if (step == 0) 847 | *ts_hw_start = ts_hw; 848 | else 849 | *ts_hw_end = ts_hw; 850 | 851 | step++; 852 | 853 | } while 
(step < 2); 854 | 855 | return 0; 856 | } 857 | 858 | static int dcping_test_client(struct dcping_cb *cb) 859 | { 860 | int ret = 0; 861 | uint32_t ping; 862 | uint64_t ts_hw_start, ts_hw_end; 863 | uint64_t rtt_nsec, rtt_hw; 864 | 865 | uint64_t rtt_nsec_min = ULLONG_MAX; 866 | uint64_t rtt_nsec_max = 0; 867 | uint64_t rtt_nsec_total = 0; 868 | 869 | printf("connected to server, starting DC RTT test\n"); 870 | 871 | for (ping = 0; !cb->count || ping < cb->count; ping++) { 872 | /* initiate RDMA Write x2 ops to create tiemstamp CQE's */ 873 | DEBUG_LOG_FAST_PATH("before post send \n"); 874 | ret = dcping_client_dc_send_wr(cb, ping); 875 | if (ret) { 876 | DEBUG_LOG("dc send error :(\n"); 877 | } 878 | 879 | /* wait for CQE's with timestamp */ 880 | DEBUG_LOG_FAST_PATH("before cqe check\n"); 881 | ret = dcping_client_get_cqe_tiemstmp(cb, ping, &ts_hw_start, &ts_hw_end); 882 | if (ret) { 883 | DEBUG_LOG("cqe processing failed :(\n"); 884 | return ret; 885 | } 886 | 887 | /* clac RTT */ 888 | rtt_hw = ts_hw_end - ts_hw_start; 889 | rtt_nsec = rtt_hw * USEC_PER_SEC / cb->hw_clocks_kHz; 890 | printf("\r[iter =%4d] rtt = %ld.%3.3ld usec", ping, rtt_nsec/1000, rtt_nsec%1000); fflush(stdout); 891 | 892 | rtt_nsec_total += rtt_nsec; 893 | if (rtt_nsec_min > rtt_nsec) rtt_nsec_min = rtt_nsec; 894 | if (rtt_nsec_max < rtt_nsec) rtt_nsec_max = rtt_nsec; 895 | 896 | usleep(cb->delay_usec); 897 | } 898 | 899 | printf("\r[total = %d] rtt = %ld.%3.3ld / %ld.%3.3ld / %ld.%3.3ld usec \n", ping, 900 | (rtt_nsec_min)/1000, (rtt_nsec_min)%1000, 901 | (rtt_nsec_total/ping)/1000, (rtt_nsec_total/ping)%1000, 902 | (rtt_nsec_max)/1000, (rtt_nsec_max)%1000); 903 | printf("done DC RTT test\n"); 904 | 905 | return 0; 906 | } 907 | 908 | static int dcping_connect_client(struct dcping_cb *cb) 909 | { 910 | int ret; 911 | int qp_attr_mask; 912 | struct ibv_qp_attr qp_attr; 913 | struct rdma_cm_id *cm_id; 914 | enum rdma_cm_event_type cm_event; 915 | struct rdma_conn_param conn_param; 916 | 917 | 
dcping_ece_get_locally_set_remote(cb, cb->cm_id); 918 | 919 | DEBUG_LOG("rdma_connecting...\n"); 920 | dcping_init_conn_param(cb, cb->cm_id, &conn_param); 921 | ret = rdma_connect(cb->cm_id, &conn_param); 922 | if (ret) { 923 | perror("rdma_connect"); 924 | return ret; 925 | } 926 | 927 | ret = dcping_handle_cm_event(cb, &cm_event, &cm_id); 928 | if (ret || cm_event != RDMA_CM_EVENT_CONNECT_RESPONSE) { 929 | perror("rdma_connect wrong responce"); 930 | return -1; 931 | } 932 | 933 | dcping_ece_get_remote_set_locally(cb, cb->cm_id); 934 | 935 | DEBUG_LOG("modify QP...\n"); 936 | qp_attr.qp_state = IBV_QPS_RTR; 937 | ret = rdma_init_qp_attr(cb->cm_id, &qp_attr, &qp_attr_mask); 938 | if (ret) { 939 | perror("rdma_init_qp_attr"); 940 | return ret; 941 | } 942 | 943 | cb->ah = ibv_create_ah(cb->pd, &qp_attr.ah_attr); 944 | if (!cb->ah) { 945 | perror("ibv_create_ah"); 946 | return -1; 947 | } 948 | DEBUG_LOG("created ah (%p)\n", cb->ah); 949 | 950 | ret = rdma_establish(cb->cm_id); 951 | if (ret) { 952 | perror("rdma_establish"); 953 | return ret; 954 | } 955 | 956 | DEBUG_LOG("rdma_connect successful\n"); 957 | return 0; 958 | } 959 | 960 | static int dcping_bind_client(struct dcping_cb *cb) 961 | { 962 | int ret; 963 | char str[MAX_INET_ADDRSTRLEN]; 964 | struct ibv_port_attr port_attr; 965 | struct rdma_cm_id *cm_id; 966 | enum rdma_cm_event_type cm_event; 967 | 968 | if (cb->sin.ss_family == AF_INET) { 969 | ((struct sockaddr_in *) &cb->sin)->sin_port = cb->port; 970 | inet_ntop(AF_INET, &(((struct sockaddr_in *)&cb->sin)->sin_addr), str, sizeof(str)); 971 | } 972 | else { 973 | ((struct sockaddr_in6 *) &cb->sin)->sin6_port = cb->port; 974 | inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)&cb->sin)->sin6_addr), str, sizeof(str)); 975 | } 976 | 977 | if (cb->ssource.ss_family) 978 | ret = rdma_resolve_addr(cb->cm_id, (struct sockaddr *) &cb->ssource, 979 | (struct sockaddr *) &cb->sin, 2000); 980 | else 981 | ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) 
&cb->sin, 2000); 982 | 983 | if (ret) { 984 | perror("rdma_resolve_addr"); 985 | return ret; 986 | } 987 | 988 | ret = dcping_handle_cm_event(cb, &cm_event, &cm_id); 989 | if (cm_event != RDMA_CM_EVENT_ADDR_RESOLVED) { 990 | return -1; 991 | } 992 | 993 | ret = rdma_resolve_route(cb->cm_id, 2000); 994 | if (ret) { 995 | perror("rdma_resolve_route"); 996 | } 997 | 998 | ret = dcping_handle_cm_event(cb, &cm_event, &cm_id); 999 | if (cm_event != RDMA_CM_EVENT_ROUTE_RESOLVED) { 1000 | return -1; 1001 | } 1002 | 1003 | if (ibv_query_port(cb->cm_id->verbs, cb->cm_id->port_num, &port_attr)) { 1004 | perror("ibv_query_port"); 1005 | exit(1); 1006 | } 1007 | cb->mtu = port_attr.active_mtu; 1008 | if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { 1009 | cb->is_global = 1; 1010 | cb->sgid_index = my_ibv_find_sgid_type(cb->cm_id->verbs, cb->cm_id->port_num, MY_IBV_GID_TYPE_ROCE_V2, cb->sin.ss_family); 1011 | } 1012 | 1013 | DEBUG_LOG("rdma_resolve_addr/rdma_resolve_route successful to server: <%s:%d>\n", str, be16toh(rdma_get_src_port(cb->cm_id))); 1014 | 1015 | cb->pd = ibv_alloc_pd(cb->cm_id->verbs); 1016 | if (!cb->pd) { 1017 | fprintf(stderr, "ibv_alloc_pd failed\n"); 1018 | return errno; 1019 | } 1020 | DEBUG_LOG("created pd %p\n", cb->pd); 1021 | 1022 | return 0; 1023 | } 1024 | 1025 | static int dcping_run_client(struct dcping_cb *cb) 1026 | { 1027 | int ret; 1028 | 1029 | ret = dcping_bind_client(cb); 1030 | if (ret) 1031 | return ret; 1032 | 1033 | ret = dcping_setup_qp(cb, NULL); 1034 | if (ret) { 1035 | fprintf(stderr, "setup_qp failed: %d\n", ret); 1036 | return ret; 1037 | } 1038 | 1039 | ret = dcping_setup_buffers(cb); 1040 | if (ret) { 1041 | fprintf(stderr, "rping_setup_buffers failed: %d\n", ret); 1042 | goto err1; 1043 | } 1044 | 1045 | ret = dcping_connect_client(cb); 1046 | if (ret) { 1047 | fprintf(stderr, "connect error %d\n", ret); 1048 | goto err2; 1049 | } 1050 | 1051 | ret = dcping_test_client(cb); 1052 | if (ret) { 1053 | fprintf(stderr, "rping 
/*
 * Resolve @dst (a host name or a numeric IPv4/IPv6 address) and copy the
 * first IPv4 or IPv6 socket address of the result list into @addr.
 *
 * @addr must be large enough for a struct sockaddr_in6, since that many
 * bytes are copied for IPv6 results.
 *
 * Returns 0 on success, the getaddrinfo() error code when resolution fails,
 * or -1 when the result list holds no IPv4/IPv6 address.
 */
static int get_addr(char *dst, struct sockaddr *addr)
{
	struct addrinfo *res = NULL;
	struct addrinfo *ai;
	int ret;

	ret = getaddrinfo(dst, NULL, NULL, &res);
	if (ret) {
		fprintf(stderr, "getaddrinfo failed (%s) - invalid hostname or IP address\n", gai_strerror(ret));
		return ret;
	}

	/* walk the whole list rather than trusting the first entry only */
	ret = -1;
	for (ai = res; ai; ai = ai->ai_next) {
		if (ai->ai_family == PF_INET) {
			memcpy(addr, ai->ai_addr, sizeof(struct sockaddr_in));
			ret = 0;
			break;
		}
		if (ai->ai_family == PF_INET6) {
			memcpy(addr, ai->ai_addr, sizeof(struct sockaddr_in6));
			ret = 0;
			break;
		}
	}

	freeaddrinfo(res);
	return ret;
}
To bind to any address with IPv6 use -a ::0\n"); 1102 | printf("\t-a addr\t\taddress\n"); 1103 | printf("\t-p port\t\tserver port\n"); 1104 | printf("\t-I\t\tSource address to bind to for client.\n"); 1105 | printf("\t-S size \tping data size (default: 64B)\n"); 1106 | printf("\t-C count\tping count times (default: 1)\n"); 1107 | printf("\t-D delay\tinter-rtt delay [milli-sec] (default: 1 sec)\n"); 1108 | printf("\t-d\t\tdebug printfs\n"); 1109 | } 1110 | 1111 | int main(int argc, char *argv[]) 1112 | { 1113 | struct dcping_cb *cb; 1114 | int op; 1115 | int ret = 0; 1116 | 1117 | cb = malloc(sizeof(*cb)); 1118 | if (!cb) 1119 | return -ENOMEM; 1120 | 1121 | memset(cb, 0, sizeof(*cb)); 1122 | cb->is_server = -1; 1123 | cb->count = 1; 1124 | cb->size = 64; 1125 | cb->sin.ss_family = PF_INET; 1126 | cb->port = htobe16(7174); 1127 | cb->is_reserved_qpn_supp = 1; 1128 | cb->is_ece_supp = 1; 1129 | 1130 | opterr = 0; 1131 | while ((op = getopt(argc, argv, "a:I:p:C:S:D:t:scvd")) != -1) { 1132 | switch (op) { 1133 | case 'a': 1134 | ret = get_addr(optarg, (struct sockaddr *) &cb->sin); 1135 | break; 1136 | case 'I': 1137 | ret = get_addr(optarg, (struct sockaddr *) &cb->ssource); 1138 | break; 1139 | case 'p': 1140 | cb->port = htobe16(atoi(optarg)); 1141 | DEBUG_LOG("port %d\n", (int) atoi(optarg)); 1142 | break; 1143 | case 's': 1144 | cb->is_server = 1; 1145 | DEBUG_LOG("server\n"); 1146 | break; 1147 | case 'c': 1148 | cb->is_server = 0; 1149 | DEBUG_LOG("client\n"); 1150 | break; 1151 | case 'S': 1152 | cb->size = atoi(optarg); 1153 | if ((cb->size < PING_MIN_BUFSIZE) || 1154 | (cb->size > (PING_BUFSIZE - 1))) { 1155 | fprintf(stderr, "Invalid size %d " 1156 | "(valid range is %zd to %d)\n", 1157 | cb->size, PING_MIN_BUFSIZE, PING_BUFSIZE); 1158 | ret = EINVAL; 1159 | } else 1160 | DEBUG_LOG("size %d\n", (int) atoi(optarg)); 1161 | break; 1162 | case 'C': 1163 | cb->count = atoi(optarg); 1164 | DEBUG_LOG("count %d\n", (int) cb->count); 1165 | break; 1166 | case 'D': 
1167 | cb->delay_usec = atoi(optarg) * 1000; 1168 | DEBUG_LOG("delay %d [msec]\n", cb->delay_usec / 1000); 1169 | break; 1170 | case 'd': 1171 | debug++; 1172 | break; 1173 | default: 1174 | usage("rping", op); 1175 | ret = EINVAL; 1176 | goto out; 1177 | } 1178 | } 1179 | if (ret) 1180 | goto out; 1181 | 1182 | if (cb->is_server == -1) { 1183 | usage("dcping", 0); 1184 | ret = EINVAL; 1185 | goto out; 1186 | } 1187 | 1188 | cb->cm_channel = create_first_event_channel(); 1189 | if (!cb->cm_channel) { 1190 | ret = errno; 1191 | goto out; 1192 | } 1193 | 1194 | ret = rdma_create_id(cb->cm_channel, &cb->cm_id, cb, RDMA_PS_TCP); 1195 | if (ret) { 1196 | perror("rdma_create_id"); 1197 | goto out2; 1198 | } 1199 | DEBUG_LOG("created cm_id %p\n", cb->cm_id); 1200 | 1201 | if (cb->is_server) { 1202 | ret = dcping_run_server(cb); 1203 | } else { 1204 | ret = dcping_run_client(cb); 1205 | } 1206 | 1207 | DEBUG_LOG("destroy cm_id %p\n", cb->cm_id); 1208 | rdma_destroy_id(cb->cm_id); 1209 | out2: 1210 | rdma_destroy_event_channel(cb->cm_channel); 1211 | out: 1212 | free_cb(cb); 1213 | return ret; 1214 | } 1215 | -------------------------------------------------------------------------------- /my_ibv_helper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. 
You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 
/*
 * Optional mlx5dv entry points.
 *
 * The mlx5dv_reserved_qpn_*() symbols are declared weak, so when the linked
 * provider library does not export them the function pointers below are
 * initialised to NULL and the wrappers report EOPNOTSUPP instead of failing
 * at link time.
 */
#pragma weak mlx5dv_reserved_qpn_alloc
extern int mlx5dv_reserved_qpn_alloc(struct ibv_context *ctx, uint32_t *qpn);
int (*_my_mlx5dv_reserved_qpn_alloc)(struct ibv_context *ctx, uint32_t *qpn) = mlx5dv_reserved_qpn_alloc;

/* Forward to mlx5dv_reserved_qpn_alloc() if it was resolved, else return EOPNOTSUPP. */
int my_mlx5dv_reserved_qpn_alloc(struct ibv_context *ctx, uint32_t *qpn)
{
	if (!_my_mlx5dv_reserved_qpn_alloc)
		return EOPNOTSUPP;

	return _my_mlx5dv_reserved_qpn_alloc(ctx, qpn);
}

#pragma weak mlx5dv_reserved_qpn_dealloc
extern int mlx5dv_reserved_qpn_dealloc(struct ibv_context *ctx, uint32_t qpn);
int (*_my_mlx5dv_reserved_qpn_dealloc)(struct ibv_context *ctx, uint32_t qpn) = mlx5dv_reserved_qpn_dealloc;

/* Forward to mlx5dv_reserved_qpn_dealloc() if it was resolved, else return EOPNOTSUPP. */
int my_mlx5dv_reserved_qpn_dealloc(struct ibv_context *ctx, uint32_t qpn)
{
	if (!_my_mlx5dv_reserved_qpn_dealloc)
		return EOPNOTSUPP;

	return _my_mlx5dv_reserved_qpn_dealloc(ctx, qpn);
}

/* GID flavours told apart by the sysfs "gid_attrs/types" entries. */
enum my_ibv_gid_type {
	MY_IBV_GID_TYPE_IB_ROCE_V1,
	MY_IBV_GID_TYPE_ROCE_V2,
};
98 | */ 99 | return -1; 100 | } 101 | 102 | return len; 103 | } 104 | 105 | 106 | /* GID types as appear in sysfs, no change is expected as of ABI 107 | * compatibility. 108 | */ 109 | #define V1_TYPE "IB/RoCE v1" 110 | #define V2_TYPE "RoCE v2" 111 | static int my_ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, 112 | unsigned int index, enum my_ibv_gid_type *type) 113 | { 114 | char name[32]; 115 | char buff[11]; 116 | 117 | snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num, 118 | index); 119 | 120 | /* Reset errno so that we can rely on its value upon any error flow in 121 | * ibv_read_sysfs_file. 122 | */ 123 | errno = 0; 124 | if (my_ibv_read_sysfs_file(context->device->ibdev_path, name, buff, 125 | sizeof(buff)) <= 0) { 126 | char *dir_path; 127 | DIR *dir; 128 | 129 | if (errno == EINVAL) { 130 | /* In IB, this file doesn't exist and the kernel sets 131 | * errno to -EINVAL. 132 | */ 133 | *type = MY_IBV_GID_TYPE_IB_ROCE_V1; 134 | return 0; 135 | } 136 | if (asprintf(&dir_path, "%s/%s/%d/%s/", 137 | context->device->ibdev_path, "ports", port_num, 138 | "gid_attrs") < 0) 139 | return -1; 140 | dir = opendir(dir_path); 141 | free(dir_path); 142 | if (!dir) { 143 | if (errno == ENOENT) 144 | /* Assuming that if gid_attrs doesn't exist, 145 | * we have an old kernel and all GIDs are 146 | * IB/RoCE v1 147 | */ 148 | *type = MY_IBV_GID_TYPE_IB_ROCE_V1; 149 | else 150 | return -1; 151 | } else { 152 | closedir(dir); 153 | errno = EFAULT; 154 | return -1; 155 | } 156 | } else { 157 | if (!strcmp(buff, V1_TYPE)) { 158 | *type = MY_IBV_GID_TYPE_IB_ROCE_V1; 159 | } else if (!strcmp(buff, V2_TYPE)) { 160 | *type = MY_IBV_GID_TYPE_ROCE_V2; 161 | } else { 162 | errno = ENOTSUP; 163 | return -1; 164 | } 165 | } 166 | 167 | return 0; 168 | } 169 | 170 | void my_ibv_find_gid_family(union ibv_gid *gid, int *gid_family) 171 | { 172 | if (gid->raw[0] == 0 && gid->raw[1] == 0) 173 | *gid_family = AF_INET; 174 | else 175 | *gid_family = 
AF_INET6; 176 | } 177 | 178 | int my_ibv_find_sgid_type(struct ibv_context *context, uint8_t port_num, 179 | enum my_ibv_gid_type gid_type, int gid_family) 180 | { 181 | enum my_ibv_gid_type sgid_type = 0; 182 | union ibv_gid sgid; 183 | int sgid_family = -1; 184 | int idx = 0; 185 | 186 | do { 187 | 188 | if (ibv_query_gid(context, port_num, idx, &sgid)) { 189 | errno = EFAULT; 190 | return -1; 191 | } 192 | if (my_ibv_query_gid_type(context, port_num, idx, &sgid_type)) { 193 | errno = EFAULT; 194 | return -1; 195 | } 196 | 197 | my_ibv_find_gid_family(&sgid, &sgid_family); 198 | 199 | if (gid_type == sgid_type && gid_family == sgid_family) { 200 | return idx; 201 | } 202 | 203 | idx++; 204 | } while (gid_type != sgid_type || gid_family != sgid_family); 205 | 206 | return idx; 207 | } 208 | --------------------------------------------------------------------------------