├── .clang-format ├── .gitignore ├── .lvimrc ├── Makefile ├── README.md ├── rdma_mc.c └── rdma_rw.c /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Right 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: false 20 | AlwaysBreakTemplateDeclarations: false 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakBeforeTernaryOperators: true 43 | BreakConstructorInitializersBeforeComma: false 44 | BreakConstructorInitializers: BeforeColon 45 | BreakAfterJavaFieldAnnotations: false 46 | BreakStringLiterals: true 47 | ColumnLimit: 80 48 | CommentPragmas: '^ IWYU pragma:' 49 | CompactNamespaces: false 50 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 51 | ConstructorInitializerIndentWidth: 4 52 | ContinuationIndentWidth: 4 53 | Cpp11BracedListStyle: true 54 | 
DerivePointerAlignment: false 55 | DisableFormat: false 56 | ExperimentalAutoDetectBinPacking: false 57 | FixNamespaceComments: true 58 | ForEachMacros: 59 | - foreach 60 | - Q_FOREACH 61 | - BOOST_FOREACH 62 | IncludeBlocks: Preserve 63 | IncludeCategories: 64 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 65 | Priority: 2 66 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 67 | Priority: 3 68 | - Regex: '.*' 69 | Priority: 1 70 | IncludeIsMainRegex: '(Test)?$' 71 | IndentCaseLabels: false 72 | IndentPPDirectives: None 73 | IndentWidth: 4 74 | IndentWrappedFunctionNames: false 75 | JavaScriptQuotes: Leave 76 | JavaScriptWrapImports: true 77 | KeepEmptyLinesAtTheStartOfBlocks: true 78 | MacroBlockBegin: '' 79 | MacroBlockEnd: '' 80 | MaxEmptyLinesToKeep: 1 81 | NamespaceIndentation: None 82 | ObjCBlockIndentWidth: 4 83 | ObjCSpaceAfterProperty: false 84 | ObjCSpaceBeforeProtocolList: true 85 | PenaltyBreakAssignment: 2 86 | PenaltyBreakBeforeFirstCallParameter: 19 87 | PenaltyBreakComment: 300 88 | PenaltyBreakFirstLessLess: 120 89 | PenaltyBreakString: 1000 90 | PenaltyExcessCharacter: 1000000 91 | PenaltyReturnTypeOnItsOwnLine: 60 92 | PointerAlignment: Right 93 | RawStringFormats: 94 | - Delimiter: pb 95 | Language: TextProto 96 | BasedOnStyle: google 97 | ReflowComments: true 98 | SortIncludes: true 99 | SortUsingDeclarations: true 100 | SpaceAfterCStyleCast: false 101 | SpaceAfterTemplateKeyword: true 102 | SpaceBeforeAssignmentOperators: true 103 | SpaceBeforeParens: ControlStatements 104 | SpaceInEmptyParentheses: false 105 | SpacesBeforeTrailingComments: 1 106 | SpacesInAngles: false 107 | SpacesInContainerLiterals: true 108 | SpacesInCStyleCastParentheses: false 109 | SpacesInParentheses: false 110 | SpacesInSquareBrackets: false 111 | Standard: Cpp11 112 | TabWidth: 8 113 | UseTab: Never 114 | ... 
115 | 116 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | rdma_rw 2 | rdma_mc 3 | -------------------------------------------------------------------------------- /.lvimrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhiswell/rdma-tutorial/09a28b7ed1942d1e6502f16aeeda047f5aaecce7/.lvimrc -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: clean rdma_rw rdma_mc 3 | 4 | rdma_rw: rdma_rw.c 5 | gcc -o $@ -Wall $^ -libverbs 6 | 7 | rdma_mc: rdma_mc.c 8 | gcc -o $@ -Wall $^ -lrdmacm -libverbs 9 | 10 | clean: 11 | rm -f rdma_mc rdma_rw 12 | 13 | cscope: 14 | cscope -bqR 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Examples 3 | - rdma_rw 4 | - rdma_mc 5 | 6 | ## Refs 7 | - libibverbs/examples. https://github.com/linux-rdma/rdma-core/tree/master/libibverbs/examples. 8 | - RDMA-Tutorial. https://github.com/jcxue/RDMA-Tutorial. 9 | - RDMAmojo. http://www.rdmamojo.com. 10 | - MLNX_OFED. http://www.mellanox.com/page/products_dyn?product_family=26&mtag=linux_sw_drivers. 11 | - RDMA Aware Networks Programming User Manual. https://www.mellanox.com/related-docs/prod_software/RDMA_Aware_Programming_user_manual.pdf. 12 | - InfiniBand: An Introduction and Simple IB Verbs program with RDMA Write. https://blog.zhaw.ch/icclab/infiniband-an-introduction-simple-ib-verbs-program-with-rdma-write/. 
13 | -------------------------------------------------------------------------------- /rdma_mc.c: -------------------------------------------------------------------------------- 1 | 2 | // Muticast example using RDMA_CM and IBV Verbs. And API definitions can refer 3 | // to rdma/rdma_cma.h and infiniband/verbs.h 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | // struct rdma_cm_id { 21 | // struct ibv_context *verbs; 22 | // struct rdma_event_channel *channel; 23 | // void *context; 24 | // struct ibv_qp *qp; 25 | // struct rdma_route route; 26 | // enum rdma_port_space ps; 27 | // uint8_t port_num; 28 | // struct rdma_cm_event *event; 29 | // struct ibv_comp_channel *send_cq_channel; 30 | // struct ibv_cq *send_cq; 31 | // struct ibv_comp_channel *recv_cq_channel; 32 | // struct ibv_cq *recv_cq; 33 | // struct ibv_srq *srq; 34 | // struct ibv_pd *pd; 35 | // enum ibv_qp_type qp_type; 36 | //}; 37 | 38 | struct cmatest_node { 39 | int id; 40 | struct rdma_cm_id *cma_id; 41 | int connected; 42 | struct ibv_pd *pd; 43 | struct ibv_cq *cq; 44 | struct ibv_mr *mr; 45 | struct ibv_ah *ah; 46 | uint32_t remote_qpn; 47 | uint32_t remote_qkey; 48 | void *mem; 49 | }; 50 | 51 | struct cmatest { 52 | struct rdma_event_channel *channel; 53 | struct cmatest_node *nodes; 54 | int conn_index; 55 | int connects_left; 56 | 57 | struct sockaddr_in6 dst_in; 58 | struct sockaddr *dst_addr; 59 | struct sockaddr_in6 src_in; 60 | struct sockaddr *src_addr; 61 | }; 62 | 63 | static struct cmatest test; 64 | static int connections = 1; 65 | static int message_size = 100; 66 | static int message_count = 10; 67 | static int is_sender; 68 | static int unmapped_addr; 69 | static char *dst_addr; 70 | static char *src_addr; 71 | static enum rdma_port_space port_space = RDMA_PS_UDP; 72 | 73 | static int create_message(struct cmatest_node *node) { 74 
| if (message_size) 75 | message_count = 0; 76 | 77 | if (!message_count) 78 | return 0; 79 | 80 | node->mem = malloc(message_size + sizeof(struct ibv_grh)); 81 | 82 | if (!node->mem) { 83 | printf("failed message allocation\n"); 84 | return -1; 85 | } 86 | 87 | node->mr = 88 | ibv_reg_mr(node->pd, node->mem, message_size + sizeof(struct ibv_grh), 89 | IBV_ACCESS_LOCAL_WRITE); 90 | if (!node->mr) { 91 | printf("failed to reg MR\n"); 92 | goto err; 93 | } 94 | 95 | return 0; 96 | err: 97 | free(node->mem); 98 | return -1; 99 | } 100 | 101 | static int verify_test_params(struct cmatest_node *node) { 102 | struct ibv_port_attr port_attr; 103 | int ret; 104 | 105 | ret = 106 | ibv_query_port(node->cma_id->verbs, node->cma_id->port_num, &port_attr); 107 | 108 | if (ret) 109 | return ret; 110 | 111 | if (message_count && message_size > (1 << (port_attr.active_mtu + 7))) { 112 | printf("mckey: message_size %d is larger than active mtu %d\n", 113 | message_size, 1 << (port_attr.active_mtu + 7)); 114 | return -EINVAL; 115 | } 116 | 117 | return 0; 118 | } 119 | 120 | static int init_node(struct cmatest_node *node) { 121 | struct ibv_qp_init_attr init_qp_attr; 122 | int cqe, ret; 123 | 124 | node->pd = ibv_alloc_pd(node->cma_id->verbs); 125 | if (!node->pd) { 126 | ret = -ENOMEM; 127 | printf("mckey: unable to allocate PD\n"); 128 | goto out; 129 | } 130 | 131 | cqe = message_count ? message_count * 2 : 2; 132 | node->cq = ibv_create_cq(node->cma_id->verbs, cqe, node, 0, 0); 133 | if (!node->cq) { 134 | ret = -ENOMEM; 135 | printf("mckey: unable to create CQ\n"); 136 | goto out; 137 | } 138 | 139 | memset(&init_qp_attr, 0, sizeof(init_qp_attr)); 140 | init_qp_attr.cap.max_send_wr = message_count ? message_count : 1; 141 | init_qp_attr.cap.max_recv_wr = message_count ? 
message_count : 1; 142 | init_qp_attr.cap.max_send_sge = 1; 143 | init_qp_attr.cap.max_recv_sge = 1; 144 | init_qp_attr.qp_context = node; 145 | init_qp_attr.sq_sig_all = 0; 146 | init_qp_attr.qp_type = IBV_QPT_UD; 147 | init_qp_attr.send_cq = node->cq; 148 | init_qp_attr.recv_cq = node->cq; 149 | 150 | ret = rdma_create_qp(node->cma_id, node->pd, &init_qp_attr); 151 | if (ret) { 152 | printf("mckey: unable to create QP: %d\n", ret); 153 | goto out; 154 | } 155 | 156 | ret = create_message(node); 157 | if (ret) { 158 | printf("mckey: failed to create messages: %d\n", ret); 159 | goto out; 160 | } 161 | 162 | out: 163 | return ret; 164 | } 165 | 166 | static int post_recvs(struct cmatest_node *node) { 167 | struct ibv_recv_wr recv_wr, *recv_failure; 168 | struct ibv_sge sge; 169 | int i, ret = 0; 170 | 171 | if (!message_count) 172 | return 0; 173 | 174 | recv_wr.next = NULL; 175 | recv_wr.sg_list = &sge; 176 | recv_wr.num_sge = 1; 177 | recv_wr.wr_id = (uintptr_t)node; 178 | 179 | sge.length = message_size + sizeof(struct ibv_grh); 180 | sge.lkey = node->mr->lkey; 181 | sge.addr = (uintptr_t)node->mem; 182 | 183 | for (i = 0; i < message_count && !ret; i++) { 184 | ret = ibv_post_recv(node->cma_id->qp, &recv_wr, &recv_failure); 185 | if (ret) { 186 | printf("failed to post receives: %d\n", ret); 187 | break; 188 | } 189 | } 190 | 191 | return ret; 192 | } 193 | 194 | static int post_sends(struct cmatest_node *node, int signal_flag) { 195 | struct ibv_send_wr send_wr, *bad_send_wr; 196 | struct ibv_sge sge; 197 | int i, ret = 0; 198 | 199 | if (!node->connected || !message_count) 200 | return 0; 201 | 202 | send_wr.next = NULL; 203 | send_wr.sg_list = &sge; 204 | send_wr.num_sge = 1; 205 | send_wr.opcode = IBV_WR_SEND_WITH_IMM; 206 | send_wr.send_flags = signal_flag; 207 | send_wr.wr_id = (unsigned long)node; 208 | send_wr.imm_data = htonl(node->cma_id->qp->qp_num); 209 | 210 | send_wr.wr.ud.ah = node->ah; 211 | send_wr.wr.ud.remote_qpn = node->remote_qpn; 212 | 
send_wr.wr.ud.remote_qkey = node->remote_qkey; 213 | 214 | sge.length = message_size; 215 | sge.lkey = node->mr->lkey; 216 | sge.addr = (uintptr_t)node->mem; 217 | 218 | for (i = 0; i < message_count && !ret; i++) { 219 | ret = ibv_post_send(node->cma_id->qp, &send_wr, &bad_send_wr); 220 | if (ret) { 221 | printf("failed to post sends: %d\n", ret); 222 | } 223 | } 224 | return ret; 225 | } 226 | 227 | static void connect_error(void) { test.connects_left--; } 228 | 229 | static int addr_handler(struct cmatest_node *node) { 230 | int ret; 231 | 232 | ret = verify_test_params(node); 233 | if (ret) 234 | goto err; 235 | 236 | ret = init_node(node); 237 | if (ret) 238 | goto err; 239 | 240 | if (!is_sender) { 241 | ret = post_recvs(node); 242 | if (ret) 243 | goto err; 244 | } 245 | 246 | ret = rdma_join_multicast(node->cma_id, test.dst_addr, node); 247 | if (ret) { 248 | printf("mckey: failure joining: %d\n", ret); 249 | goto err; 250 | } 251 | 252 | return 0; 253 | 254 | err: 255 | connect_error(); 256 | return ret; 257 | } 258 | 259 | static int join_handler(struct cmatest_node *node, 260 | struct rdma_ud_param *param) { 261 | char buf[40]; 262 | 263 | // convert IPv4 and IPv6 addresses from binary to text form 264 | inet_ntop(AF_INET6, param->ah_attr.grh.dgid.raw, buf, 40); 265 | printf("mckey: joined dgid: %s\n", buf); 266 | 267 | node->remote_qpn = param->qp_num; 268 | node->remote_qkey = param->qkey; 269 | node->ah = ibv_create_ah(node->pd, ¶m->ah_attr); 270 | if (!node->ah) { 271 | printf("mckey: failure creating address handle\n"); 272 | goto err; 273 | } 274 | 275 | node->connected = 1; 276 | test.connects_left--; 277 | return 0; 278 | err: 279 | connect_error(); 280 | return -1; 281 | } 282 | 283 | static int cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { 284 | int ret = 0; 285 | 286 | switch (event->event) { 287 | case RDMA_CM_EVENT_ADDR_RESOLVED: 288 | // cma_id->context -> ??? 
289 | ret = addr_handler(cma_id->context); 290 | break; 291 | case RDMA_CM_EVENT_MULTICAST_JOIN: 292 | ret = join_handler(cma_id->context, &event->param.ud); 293 | break; 294 | case RDMA_CM_EVENT_ADDR_ERROR: 295 | case RDMA_CM_EVENT_ROUTE_ERROR: 296 | case RDMA_CM_EVENT_MULTICAST_ERROR: 297 | printf("mckey: event: %s, error: %d\n", rdma_event_str(event->event), 298 | event->status); 299 | connect_error(); 300 | ret = event->status; 301 | break; 302 | case RDMA_CM_EVENT_DEVICE_REMOVAL: 303 | break; 304 | default: 305 | break; 306 | } 307 | 308 | return ret; 309 | } 310 | 311 | static void destroy_node(struct cmatest_node *node) { 312 | if (!node->cma_id) 313 | return; 314 | 315 | if (node->ah) 316 | ibv_destroy_ah(node->ah); 317 | 318 | if (node->cq) 319 | ibv_destroy_cq(node->cq); 320 | 321 | if (node->mem) { 322 | ibv_dereg_mr(node->mr); 323 | free(node->mem); 324 | } 325 | 326 | if (node->pd) 327 | ibv_dealloc_pd(node->pd); 328 | 329 | // destroy the RDMA ID after all device resouces detoried 330 | rdma_destroy_id(node->cma_id); 331 | } 332 | 333 | static int alloc_nodes(void) { 334 | int ret, i; 335 | 336 | test.nodes = malloc(sizeof(*test.nodes) * connections); 337 | if (!test.nodes) { 338 | printf("mckey: unable to allocate memory for test nodes\n"); 339 | return -ENOMEM; 340 | } 341 | memset(test.nodes, 0, sizeof(*test.nodes) * connections); 342 | 343 | for (i = 0; i < connections; i++) { 344 | test.nodes[i].id = i; 345 | ret = rdma_create_id(test.channel, &test.nodes[i].cma_id, 346 | &test.nodes[i], port_space); 347 | if (ret) 348 | goto err; 349 | } 350 | 351 | return 0; 352 | 353 | err: 354 | while (--i >= 0) 355 | rdma_destroy_id(test.nodes[i].cma_id); 356 | free(test.nodes); 357 | return ret; 358 | } 359 | 360 | static void destroy_nodes(void) { 361 | int i; 362 | 363 | for (i = 0; i < connections; i++) 364 | destroy_node(&test.nodes[i]); 365 | free(test.nodes); 366 | } 367 | 368 | static int poll_cqs(void) { 369 | struct ibv_wc wc[8]; 370 | int done, i, 
ret; 371 | 372 | for (i = 0; i < connections; i++) { 373 | if (!test.nodes[i].connected) 374 | continue; 375 | 376 | for (done = 0; done < message_count; done += ret) { 377 | ret = ibv_poll_cq(test.nodes[i].cq, 8, wc); 378 | if (ret < 0) { 379 | printf("mckey: failed polling CQ: %d\n", ret); 380 | return ret; 381 | } 382 | } 383 | } 384 | 385 | return 0; 386 | } 387 | 388 | static int connect_events(void) { 389 | struct rdma_cm_event *event; 390 | int ret = 0; 391 | 392 | while (test.connects_left && !ret) { 393 | ret = rdma_get_cm_event(test.channel, &event); 394 | if (!ret) { 395 | ret = cma_handler(event->id, event); 396 | rdma_ack_cm_event(event); 397 | } 398 | } 399 | 400 | return ret; 401 | } 402 | 403 | static int get_addr(char *dst, struct sockaddr *addr) { 404 | struct addrinfo *res; 405 | int ret; 406 | 407 | ret = getaddrinfo(dst, NULL, NULL, &res); 408 | if (ret) { 409 | printf("getaddrinfo failed - invalid hostname or IP address\n"); 410 | return ret; 411 | } 412 | 413 | memcpy(addr, res->ai_addr, res->ai_addrlen); 414 | freeaddrinfo(res); 415 | 416 | return ret; 417 | } 418 | 419 | // 1. Get source (if provided for binding) and destination addreses - convert 420 | // the input addresses to socket presetation. 421 | // 2. Joining: 422 | // A. For all connections: 423 | // - if sources addres is specifically provided, then bind the rdma_cm 424 | // object to the corresponding network interface. (Associates a src. 425 | // addres with an rdma_cm identifier). 426 | // - if unmapped MC address with bind address provided, check the remote 427 | // address and then bind. 428 | // B. Poll on all the connection events and wait that all rdma_cm objects 429 | // joined the MC group. 430 | // 3. Send & receive: 431 | // A. If sender: send the messages to all connection nodes (function 432 | // "post_sends"). 433 | // B. If receiver: poll the CQ (function "poll_cqs") till messages arrival. 
434 | // On ending - release network resources (per all connections: leaves the MC 435 | // group and detaches its associated QP from the group) 436 | static int run(void) { 437 | int i, ret; 438 | 439 | printf("mckey: starting %s\n", is_sender ? "client" : "server"); 440 | if (src_addr) { 441 | ret = get_addr(src_addr, (struct sockaddr *)&test.src_in); 442 | if (ret) 443 | return ret; 444 | } 445 | 446 | ret = get_addr(dst_addr, (struct sockaddr *)&test.dst_in); 447 | if (ret) 448 | return ret; 449 | 450 | printf("mckey: joining\n"); 451 | for (i = 0; i < connections; i++) { 452 | if (src_addr) { 453 | ret = rdma_bind_addr(test.nodes[i].cma_id, test.src_addr); 454 | if (ret) { 455 | printf("mckey: addr bind failure: %d\n", ret); 456 | connect_error(); 457 | return ret; 458 | } 459 | } 460 | 461 | if (unmapped_addr) 462 | ret = addr_handler(&test.nodes[i]); 463 | else 464 | ret = rdma_resolve_addr(test.nodes[i].cma_id, test.src_addr, 465 | test.dst_addr, 2000); 466 | if (ret) { 467 | printf("mckey: resolve addr failure: %d\n", ret); 468 | connect_error(); 469 | return ret; 470 | } 471 | } 472 | 473 | ret = connect_events(); 474 | if (ret) 475 | goto out; 476 | 477 | // Pause to give SM change to configure switches. We don't want to handle 478 | // reliability issue in this simple test program. 
479 | sleep(3); 480 | 481 | if (message_count) { 482 | if (is_sender) { 483 | printf("initiating data transfers\n"); 484 | for (i = 0; i < connections; i++) { 485 | ret = post_sends(&test.nodes[i], 0); 486 | if (ret) 487 | goto out; 488 | } 489 | } else { 490 | printf("receiving data transfers\n"); 491 | ret = poll_cqs(); 492 | if (ret) 493 | goto out; 494 | } 495 | 496 | printf("data transfers complete\n"); 497 | } 498 | 499 | out: 500 | for (i = 0; i < connections; i++) { 501 | ret = rdma_leave_multicast(test.nodes[i].cma_id, test.dst_addr); 502 | if (ret) 503 | printf("mckey: failure leaving: %d\n", ret); 504 | } 505 | 506 | return ret; 507 | } 508 | 509 | void print_usage(const char *progname) { 510 | printf("Usage: %s\n", progname); 511 | printf("\t-m multicast_address\n"); 512 | printf("\t-M unmapped_multicast_address, replaces -m and requires -b\n"); 513 | printf("\t-s sender\n"); 514 | printf("\t-b bind_address\n"); 515 | printf("\t-c connections\n"); 516 | printf("\t-C message_count\n"); 517 | printf("\t-S message_size\n"); 518 | printf("\t-p port_space - %#x for UDP (default), %#x for IPoIB\n", 519 | RDMA_PS_UDP, RDMA_PS_IPOIB); 520 | } 521 | 522 | // 1. Get command line parameters. 523 | // 2. Create event channel to receive asynchronous events. 524 | // 3. Allocate Node and creates and identifier that is used to track comm. info. 525 | // 4. Start the 'run' main function. 526 | // 5. On ending - release and free resources. 
527 | int main(int argc, char *argv[]) { 528 | int op, ret; 529 | 530 | while ((op = getopt(argc, argv, "m:M:sb:c:C:S:p:")) != -1) { 531 | switch (op) { 532 | case 'm': 533 | dst_addr = optarg; 534 | break; 535 | case 'M': 536 | unmapped_addr = 1; 537 | dst_addr = optarg; 538 | break; 539 | case 's': 540 | is_sender = 1; 541 | break; 542 | case 'b': 543 | src_addr = optarg; 544 | test.src_addr = (struct sockaddr *)&test.src_in; 545 | break; 546 | case 'c': 547 | connections = atoi(optarg); 548 | break; 549 | case 'C': 550 | message_count = atoi(optarg); 551 | break; 552 | case 'S': 553 | message_size = atoi(optarg); 554 | break; 555 | case 'p': 556 | port_space = strtol(optarg, NULL, 0); 557 | break; 558 | default: 559 | print_usage(argv[0]); 560 | exit(EXIT_FAILURE); 561 | } 562 | } 563 | 564 | test.dst_addr = (struct sockaddr *)&test.dst_in; 565 | test.connects_left = connections; 566 | 567 | test.channel = rdma_create_event_channel(); 568 | if (!test.channel) { 569 | printf("failed to create event channel\n"); 570 | exit(EXIT_FAILURE); 571 | } 572 | 573 | if (alloc_nodes()) 574 | exit(EXIT_FAILURE); 575 | 576 | ret = run(); 577 | 578 | printf("test complete\n"); 579 | destroy_nodes(); 580 | rdma_destroy_event_channel(test.channel); 581 | 582 | printf("return status %d\n", ret); 583 | return ret; 584 | } 585 | -------------------------------------------------------------------------------- /rdma_rw.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #define MAX_POLL_CQ_TIMEOUT 2000 // ms 18 | #define MSG "This is alice, how are you?" 
19 | #define RDMAMSGR "RDMA read operation" 20 | #define RDMAMSGW "RDMA write operation" 21 | #define MSG_SIZE (strlen(MSG) + 1) 22 | 23 | #if __BYTE_ORDER == __LITTLE_ENDIAN 24 | static inline uint64_t htonll(uint64_t x) { return bswap_64(x); } 25 | static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); } 26 | #elif __BYTE_ORDER == __BIG_ENDIAN 27 | static inline uint64_t htonll(uint64_t x) { return x; } 28 | static inline uint64_t ntohll(uint64_t x) { return x; } 29 | #else 30 | #error __BYTE_ORDER is neither __LITTLE_ENDIAN nor __BIG_ENDIAN 31 | #endif 32 | 33 | #define ERROR(fmt, args...) \ 34 | { fprintf(stderr, "ERROR: %s(): " fmt, __func__, ##args); } 35 | #define ERR_DIE(fmt, args...) \ 36 | { \ 37 | ERROR(fmt, ##args); \ 38 | exit(EXIT_FAILURE); \ 39 | } 40 | #define INFO(fmt, args...) \ 41 | { printf("INFO: %s(): " fmt, __func__, ##args); } 42 | #define WARN(fmt, args...) \ 43 | { printf("WARN: %s(): " fmt, __func__, ##args); } 44 | 45 | #define CHECK(expr) \ 46 | { \ 47 | int rc = (expr); \ 48 | if (rc != 0) { \ 49 | perror(strerror(errno)); \ 50 | exit(EXIT_FAILURE); \ 51 | } \ 52 | } 53 | 54 | // structure of test parameters 55 | struct config_t { 56 | const char *dev_name; // IB device name 57 | char *server_name; // server hostname 58 | uint32_t tcp_port; // server TCP port 59 | int ib_port; // local IB port to work with 60 | int gid_idx; // GID index to use 61 | }; 62 | 63 | // structure to exchange data which is needed to connect the QPs 64 | struct cm_con_data_t { 65 | uint64_t addr; // buffer address 66 | uint32_t rkey; // remote key 67 | uint32_t qp_num; // QP number 68 | uint16_t lid; // LID of the IB port 69 | uint8_t gid[16]; // GID 70 | } __attribute__((packed)); 71 | 72 | // structure of system resources 73 | struct resources { 74 | struct ibv_device_attr device_attr; // device attributes 75 | struct ibv_port_attr port_attr; // IB port attributes 76 | struct cm_con_data_t remote_props; // values to connect to remote side 77 | 
struct ibv_context *ib_ctx; // device handle 78 | struct ibv_pd *pd; // PD handle 79 | struct ibv_cq *cq; // CQ handle 80 | struct ibv_qp *qp; // QP handle 81 | struct ibv_mr *mr; // MR handle for buf 82 | char *buf; // memory buffer pointer, used for 83 | // RDMA send ops 84 | int sock; // TCP socket file descriptor 85 | }; 86 | 87 | struct config_t config = {.dev_name = NULL, 88 | .server_name = NULL, 89 | .tcp_port = 20000, 90 | .ib_port = 1, 91 | .gid_idx = -1}; 92 | 93 | // \begin socket operation 94 | // 95 | // For simplicity, the example program uses TCP sockets to exchange control 96 | // information. If a TCP/IP stack/connection is not available, connection 97 | // manager (CM) may be used to pass this information. Use of CM is beyond the 98 | // scope of this example. 99 | 100 | // Connect a socket. If servername is specified a client connection will be 101 | // initiated to the indicated server and port. Otherwise listen on the indicated 102 | // port for an incoming connection. 
// Returns the connected socket descriptor, or -1 when the port is out of
// range, the name cannot be resolved, or no resolved address could be
// connected to (client) / accepted on (server).
static int sock_connect(const char *server_name, int port) {
    struct addrinfo *resolved_addr = NULL;
    struct addrinfo *iterator;
    char service[6]; // "65535" plus the terminating NUL
    int sockfd = -1;
    int rc;

    // @man getaddrinfo:
    // struct addrinfo {
    //     int ai_flags;
    //     int ai_family;
    //     int ai_socktype;
    //     int ai_protocol;
    //     socklen_t ai_addrlen;
    //     struct sockaddr *ai_addr;
    //     char *ai_canonname;
    //     struct addrinfo *ai_next;
    // }
    struct addrinfo hints = {.ai_flags = AI_PASSIVE,
                             .ai_family = AF_INET,
                             .ai_socktype = SOCK_STREAM};

    // a value outside the TCP port range would also overflow `service`
    if (port < 0 || port > 65535) {
        fprintf(stderr, "ERROR: %s(): invalid TCP port %d\n", __func__, port);
        return -1;
    }
    snprintf(service, sizeof(service), "%d", port);

    // getaddrinfo() reports failures through its return value, not errno
    rc = getaddrinfo(server_name, service, &hints, &resolved_addr);
    if (rc != 0) {
        fprintf(stderr, "ERROR: %s(): getaddrinfo failed: %s\n", __func__,
                gai_strerror(rc));
        return -1;
    }

    // try the resolved addresses in order and stop at the first one that
    // yields a connected socket
    for (iterator = resolved_addr; iterator != NULL && sockfd < 0;
         iterator = iterator->ai_next) {
        int fd = socket(iterator->ai_family, iterator->ai_socktype,
                        iterator->ai_protocol);
        if (fd < 0)
            continue;

        if (server_name == NULL) {
            // Server mode: set up a listening socket and accept a single
            // connection; the listening socket is not needed afterwards
            if (bind(fd, iterator->ai_addr, iterator->ai_addrlen) == 0 &&
                listen(fd, 1) == 0)
                sockfd = accept(fd, NULL, NULL);
            close(fd);
        } else if (connect(fd, iterator->ai_addr, iterator->ai_addrlen) == 0) {
            // Client mode: connection to the remote is established
            sockfd = fd;
        } else {
            close(fd);
        }
    }

    freeaddrinfo(resolved_addr);

    if (sockfd < 0)
        fprintf(stderr, "ERROR: %s(): no usable address: %s\n", __func__,
                strerror(errno));

    return sockfd;
}

// Sync data across a socket. The indicated local data will be sent to the
// remote. It will then wait for the remote to send its data back. It is
// assumed that the two sides are in sync and call this function in the proper
// order. Chaos will ensue if they are not. Also note this is a blocking
// function and will wait for the full data to be received from the remote.
155 | int sock_sync_data(int sockfd, int xfer_size, char *local_data, 156 | char *remote_data) { 157 | int read_bytes = 0; 158 | int write_bytes = 0; 159 | 160 | write_bytes = write(sockfd, local_data, xfer_size); 161 | assert(write_bytes == xfer_size); 162 | 163 | read_bytes = read(sockfd, remote_data, xfer_size); 164 | assert(read_bytes == xfer_size); 165 | 166 | INFO("SYNCHRONIZED!\n\n"); 167 | 168 | // FIXME: hard code that always returns no error 169 | return 0; 170 | } 171 | // \end socket operation 172 | 173 | // Poll the CQ for a single event. This function will continue to poll the queue 174 | // until MAX_POLL_TIMEOUT ms have passed. 175 | static int poll_completion(struct resources *res) { 176 | struct ibv_wc wc; 177 | unsigned long start_time_ms; 178 | unsigned long curr_time_ms; 179 | struct timeval curr_time; 180 | int poll_result; 181 | 182 | // poll the completion for a while before giving up of doing it 183 | gettimeofday(&curr_time, NULL); 184 | start_time_ms = (curr_time.tv_sec * 1000) + (curr_time.tv_usec / 1000); 185 | do { 186 | poll_result = ibv_poll_cq(res->cq, 1, &wc); 187 | gettimeofday(&curr_time, NULL); 188 | curr_time_ms = (curr_time.tv_sec * 1000) + (curr_time.tv_usec / 1000); 189 | } while ((poll_result == 0) && 190 | ((curr_time_ms - start_time_ms) < MAX_POLL_CQ_TIMEOUT)); 191 | 192 | if (poll_result < 0) { 193 | // poll CQ failed 194 | ERROR("poll CQ failed\n"); 195 | goto die; 196 | } else if (poll_result == 0) { 197 | ERROR("Completion wasn't found in the CQ after timeout\n"); 198 | goto die; 199 | } else { 200 | // CQE found 201 | INFO("Completion was found in CQ with status 0x%x\n", wc.status); 202 | } 203 | 204 | if (wc.status != IBV_WC_SUCCESS) { 205 | ERROR("Got bad completion with status: 0x%x, vendor syndrome: 0x%x\n", 206 | wc.status, wc.vendor_err); 207 | goto die; 208 | } 209 | 210 | // FIXME: ;) 211 | return 0; 212 | die: 213 | exit(EXIT_FAILURE); 214 | } 215 | 216 | // This function will create and post a send work 
request. 217 | static int post_send(struct resources *res, int opcode) { 218 | struct ibv_send_wr sr; 219 | struct ibv_sge sge; 220 | struct ibv_send_wr *bad_wr = NULL; 221 | 222 | // prepare the scatter / gather entry 223 | memset(&sge, 0, sizeof(sge)); 224 | 225 | sge.addr = (uintptr_t)res->buf; 226 | sge.length = MSG_SIZE; 227 | sge.lkey = res->mr->lkey; 228 | 229 | // prepare the send work request 230 | memset(&sr, 0, sizeof(sr)); 231 | 232 | sr.next = NULL; 233 | sr.wr_id = 0; 234 | sr.sg_list = &sge; 235 | 236 | sr.num_sge = 1; 237 | sr.opcode = opcode; 238 | sr.send_flags = IBV_SEND_SIGNALED; 239 | 240 | if (opcode != IBV_WR_SEND) { 241 | sr.wr.rdma.remote_addr = res->remote_props.addr; 242 | sr.wr.rdma.rkey = res->remote_props.rkey; 243 | } 244 | 245 | // there is a receive request in the responder side, so we won't get any 246 | // into RNR flow 247 | CHECK(ibv_post_send(res->qp, &sr, &bad_wr)); 248 | 249 | switch (opcode) { 250 | case IBV_WR_SEND: 251 | INFO("Send request was posted\n"); 252 | break; 253 | case IBV_WR_RDMA_READ: 254 | INFO("RDMA read request was posted\n"); 255 | break; 256 | case IBV_WR_RDMA_WRITE: 257 | INFO("RDMA write request was posted\n"); 258 | break; 259 | default: 260 | INFO("Unknown request was posted\n"); 261 | break; 262 | } 263 | 264 | // FIXME: ;) 265 | return 0; 266 | } 267 | 268 | static int post_receive(struct resources *res) { 269 | struct ibv_recv_wr rr; 270 | struct ibv_sge sge; 271 | struct ibv_recv_wr *bad_wr; 272 | 273 | // prepare the scatter / gather entry 274 | memset(&sge, 0, sizeof(sge)); 275 | sge.addr = (uintptr_t)res->buf; 276 | sge.length = MSG_SIZE; 277 | sge.lkey = res->mr->lkey; 278 | 279 | // prepare the receive work request 280 | memset(&rr, 0, sizeof(rr)); 281 | 282 | rr.next = NULL; 283 | rr.wr_id = 0; 284 | rr.sg_list = &sge; 285 | rr.num_sge = 1; 286 | 287 | // post the receive request to the RQ 288 | CHECK(ibv_post_recv(res->qp, &rr, &bad_wr)); 289 | INFO("Receive request was posted\n"); 290 | 291 
| return 0; 292 | } 293 | 294 | // Res is initialized to default values 295 | static void resources_init(struct resources *res) { 296 | memset(res, 0, sizeof(*res)); 297 | res->sock = -1; 298 | } 299 | 300 | static int resources_create(struct resources *res) { 301 | struct ibv_device **dev_list = NULL; 302 | struct ibv_qp_init_attr qp_init_attr; 303 | struct ibv_device *ib_dev = NULL; 304 | 305 | size_t size; 306 | int i; 307 | int mr_flags = 0; 308 | int cq_size = 0; 309 | int num_devices; 310 | 311 | if (config.server_name) { 312 | // @client 313 | res->sock = sock_connect(config.server_name, config.tcp_port); 314 | if (res->sock < 0) { 315 | ERROR("Failed to establish TCP connection to server %s, port %d\n", 316 | config.server_name, config.tcp_port); 317 | goto die; 318 | } 319 | } else { 320 | // @server 321 | INFO("Waiting on port %d for TCP connection\n", config.tcp_port); 322 | res->sock = sock_connect(NULL, config.tcp_port); 323 | if (res->sock < 0) { 324 | ERROR("Failed to establish TCP connection with client on port %d\n", 325 | config.tcp_port); 326 | goto die; 327 | } 328 | } 329 | 330 | INFO("TCP connection was established\n"); 331 | INFO("Searching for IB devices in host\n"); 332 | 333 | // \begin acquire a specific device 334 | // get device names in the system 335 | dev_list = ibv_get_device_list(&num_devices); 336 | assert(dev_list != NULL); 337 | 338 | if (num_devices == 0) { 339 | ERROR("Found %d device(s)\n", num_devices); 340 | goto die; 341 | } 342 | 343 | INFO("Found %d device(s)\n", num_devices); 344 | 345 | // search for the specific device we want to work with 346 | for (i = 0; i < num_devices; i++) { 347 | if (!config.dev_name) { 348 | config.dev_name = strdup(ibv_get_device_name(dev_list[i])); 349 | INFO("Device not specified, using first one found: %s\n", 350 | config.dev_name); 351 | } 352 | 353 | if (strcmp(ibv_get_device_name(dev_list[i]), config.dev_name) == 0) { 354 | ib_dev = dev_list[i]; 355 | break; 356 | } 357 | } 358 | 359 | 
// device wasn't found in the host 360 | if (!ib_dev) { 361 | ERROR("IB device %s wasn't found\n", config.dev_name); 362 | goto die; 363 | } 364 | 365 | // get device handle 366 | res->ib_ctx = ibv_open_device(ib_dev); 367 | assert(res->ib_ctx != NULL); 368 | // \end acquire a specific device 369 | 370 | // query port properties 371 | CHECK(ibv_query_port(res->ib_ctx, config.ib_port, &res->port_attr)); 372 | 373 | // PD 374 | res->pd = ibv_alloc_pd(res->ib_ctx); 375 | assert(res->pd != NULL); 376 | 377 | // a CQ with one entry 378 | cq_size = 1; 379 | res->cq = ibv_create_cq(res->ib_ctx, cq_size, NULL, NULL, 0); 380 | assert(res->cq != NULL); 381 | 382 | // a buffer to hold the data 383 | size = MSG_SIZE; 384 | res->buf = (char *)calloc(1, size); 385 | assert(res->buf != NULL); 386 | 387 | // only in the server side put the message in the memory buffer 388 | if (!config.server_name) { 389 | strcpy(res->buf, MSG); 390 | INFO("Going to send the message: %s\n", res->buf); 391 | } 392 | 393 | // register the memory buffer 394 | mr_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | 395 | IBV_ACCESS_REMOTE_WRITE; 396 | res->mr = ibv_reg_mr(res->pd, res->buf, size, mr_flags); 397 | assert(res->mr != NULL); 398 | 399 | INFO( 400 | "MR was registered with addr=%p, lkey= 0x%x, rkey= 0x%x, flags= 0x%x\n", 401 | res->buf, res->mr->lkey, res->mr->rkey, mr_flags); 402 | 403 | // \begin create the QP 404 | memset(&qp_init_attr, 0, sizeof(qp_init_attr)); 405 | qp_init_attr.qp_type = IBV_QPT_RC; 406 | qp_init_attr.sq_sig_all = 1; 407 | qp_init_attr.send_cq = res->cq; 408 | qp_init_attr.recv_cq = res->cq; 409 | qp_init_attr.cap.max_send_wr = 1; 410 | qp_init_attr.cap.max_recv_wr = 1; 411 | qp_init_attr.cap.max_send_sge = 1; 412 | qp_init_attr.cap.max_recv_sge = 1; 413 | 414 | res->qp = ibv_create_qp(res->pd, &qp_init_attr); 415 | assert(res->qp != NULL); 416 | 417 | INFO("QP was created, QP number= 0x%x\n", res->qp->qp_num); 418 | // \end create the QP 419 | 420 | // FIXME: 
hard code here 421 | return 0; 422 | die: 423 | exit(EXIT_FAILURE); 424 | } 425 | 426 | // Transition a QP from the RESET to INIT state 427 | static int modify_qp_to_init(struct ibv_qp *qp) { 428 | struct ibv_qp_attr attr; 429 | int flags; 430 | 431 | memset(&attr, 0, sizeof(attr)); 432 | attr.qp_state = IBV_QPS_INIT; 433 | attr.port_num = config.ib_port; 434 | attr.pkey_index = 0; 435 | attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | 436 | IBV_ACCESS_REMOTE_WRITE; 437 | 438 | flags = 439 | IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; 440 | 441 | CHECK(ibv_modify_qp(qp, &attr, flags)); 442 | 443 | INFO("Modify QP to INIT done!\n"); 444 | 445 | // FIXME: ;) 446 | return 0; 447 | } 448 | 449 | // Transition a QP from the INIT to RTR state, using the specified QP number 450 | static int modify_qp_to_rtr(struct ibv_qp *qp, uint32_t remote_qpn, 451 | uint16_t dlid, uint8_t *dgid) { 452 | struct ibv_qp_attr attr; 453 | int flags; 454 | 455 | memset(&attr, 0, sizeof(attr)); 456 | 457 | attr.qp_state = IBV_QPS_RTR; 458 | attr.path_mtu = IBV_MTU_256; 459 | attr.dest_qp_num = remote_qpn; 460 | attr.rq_psn = 0; 461 | attr.max_dest_rd_atomic = 1; 462 | attr.min_rnr_timer = 0x12; 463 | attr.ah_attr.is_global = 0; 464 | attr.ah_attr.dlid = dlid; 465 | attr.ah_attr.sl = 0; 466 | attr.ah_attr.src_path_bits = 0; 467 | attr.ah_attr.port_num = config.ib_port; 468 | 469 | if (config.gid_idx >= 0) { 470 | attr.ah_attr.is_global = 1; 471 | attr.ah_attr.port_num = 1; 472 | memcpy(&attr.ah_attr.grh.dgid, dgid, 16); 473 | attr.ah_attr.grh.flow_label = 0; 474 | attr.ah_attr.grh.hop_limit = 1; 475 | attr.ah_attr.grh.sgid_index = config.gid_idx; 476 | attr.ah_attr.grh.traffic_class = 0; 477 | } 478 | 479 | flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | 480 | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; 481 | 482 | CHECK(ibv_modify_qp(qp, &attr, flags)); 483 | 484 | INFO("Modify QP to RTR done!\n"); 
485 | 486 | // FIXME: ;) 487 | return 0; 488 | } 489 | 490 | // Transition a QP from the RTR to RTS state 491 | static int modify_qp_to_rts(struct ibv_qp *qp) { 492 | struct ibv_qp_attr attr; 493 | int flags; 494 | 495 | memset(&attr, 0, sizeof(attr)); 496 | 497 | attr.qp_state = IBV_QPS_RTS; 498 | attr.timeout = 0x12; // 18 499 | attr.retry_cnt = 6; 500 | attr.rnr_retry = 0; 501 | attr.sq_psn = 0; 502 | attr.max_rd_atomic = 1; 503 | 504 | flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | 505 | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC; 506 | 507 | CHECK(ibv_modify_qp(qp, &attr, flags)); 508 | 509 | INFO("Modify QP to RTS done!\n"); 510 | 511 | // FIXME: ;) 512 | return 0; 513 | } 514 | 515 | // Connect the QP, then transition the server side to RTR, sender side to RTS. 516 | static int connect_qp(struct resources *res) { 517 | struct cm_con_data_t local_con_data; 518 | struct cm_con_data_t remote_con_data; 519 | struct cm_con_data_t tmp_con_data; 520 | char temp_char; 521 | union ibv_gid my_gid; 522 | 523 | memset(&my_gid, 0, sizeof(my_gid)); 524 | 525 | if (config.gid_idx >= 0) { 526 | CHECK(ibv_query_gid(res->ib_ctx, config.ib_port, config.gid_idx, 527 | &my_gid)); 528 | } 529 | 530 | // \begin exchange required info like buffer (addr & rkey) / qp_num / lid, 531 | // etc. 
exchange using TCP sockets info required to connect QPs 532 | local_con_data.addr = htonll((uintptr_t)res->buf); 533 | local_con_data.rkey = htonl(res->mr->rkey); 534 | local_con_data.qp_num = htonl(res->qp->qp_num); 535 | local_con_data.lid = htons(res->port_attr.lid); 536 | memcpy(local_con_data.gid, &my_gid, 16); 537 | 538 | INFO("\n Local LID = 0x%x\n", res->port_attr.lid); 539 | 540 | sock_sync_data(res->sock, sizeof(struct cm_con_data_t), 541 | (char *)&local_con_data, (char *)&tmp_con_data); 542 | 543 | remote_con_data.addr = ntohll(tmp_con_data.addr); 544 | remote_con_data.rkey = ntohl(tmp_con_data.rkey); 545 | remote_con_data.qp_num = ntohl(tmp_con_data.qp_num); 546 | remote_con_data.lid = ntohs(tmp_con_data.lid); 547 | memcpy(remote_con_data.gid, tmp_con_data.gid, 16); 548 | 549 | // save the remote side attributes, we will need it for the post SR 550 | res->remote_props = remote_con_data; 551 | // \end exchange required info 552 | 553 | INFO("Remote address = 0x%" PRIx64 "\n", remote_con_data.addr); 554 | INFO("Remote rkey = 0x%x\n", remote_con_data.rkey); 555 | INFO("Remote QP number = 0x%x\n", remote_con_data.qp_num); 556 | INFO("Remote LID = 0x%x\n", remote_con_data.lid); 557 | 558 | if (config.gid_idx >= 0) { 559 | uint8_t *p = remote_con_data.gid; 560 | int i; 561 | printf("Remote GID = "); 562 | for (i = 0; i < 15; i++) 563 | printf("%02x:", p[i]); 564 | printf("%02x\n", p[15]); 565 | } 566 | 567 | // modify the QP to init 568 | modify_qp_to_init(res->qp); 569 | 570 | // let the client post RR to be prepared for incoming messages 571 | if (config.server_name) { 572 | post_receive(res); 573 | } 574 | // modify the QP to RTR 575 | modify_qp_to_rtr(res->qp, remote_con_data.qp_num, remote_con_data.lid, 576 | remote_con_data.gid); 577 | 578 | // modify QP state to RTS 579 | modify_qp_to_rts(res->qp); 580 | 581 | // sync to make sure that both sides are in states that they can connect to 582 | // prevent packet lose 583 | sock_sync_data(res->sock, 1, 
"Q", &temp_char); 584 | 585 | // FIXME: ;) 586 | return 0; 587 | } 588 | 589 | // Cleanup and deallocate all resources used 590 | static int resources_destroy(struct resources *res) { 591 | ibv_destroy_qp(res->qp); 592 | ibv_dereg_mr(res->mr); 593 | free(res->buf); 594 | ibv_destroy_cq(res->cq); 595 | ibv_dealloc_pd(res->pd); 596 | ibv_close_device(res->ib_ctx); 597 | close(res->sock); 598 | 599 | // FIXME: ;) 600 | return 0; 601 | } 602 | 603 | static void print_config(void) { 604 | { 605 | INFO("Device name: %s\n", config.dev_name); 606 | INFO("IB port: %u\n", config.ib_port); 607 | } 608 | if (config.server_name) { 609 | INFO("IP: %s\n", config.server_name); 610 | } 611 | { INFO("TCP port: %u\n", config.tcp_port); } 612 | if (config.gid_idx >= 0) { 613 | INFO("GID index: %u\n", config.gid_idx); 614 | } 615 | } 616 | 617 | static void print_usage(const char *progname) { 618 | printf("Usage:\n"); 619 | printf("%s start a server and wait for connection\n", progname); 620 | printf("%s connect to server at \n\n", progname); 621 | printf("Options:\n"); 622 | printf("-p, --port listen on / connect to port " 623 | "(default 20000)\n"); 624 | printf("-d, --ib-dev use IB device (default first " 625 | "device found)\n"); 626 | printf("-i, --ib-port use port of IB device (default " 627 | "1)\n"); 628 | printf("-g, --gid_idx gid index to be used in GRH (default " 629 | "not used)\n"); 630 | printf("-h, --help this message\n"); 631 | } 632 | 633 | // Concerned data structures and APIs: 634 | // 635 | // Establish a connection between endpoints: 636 | // 637 | // struct ibv_device { 638 | // struct _ibv_device_ops _ops; 639 | // enum ibv_node_type node_type; 640 | // enum ibv_transport_type transport_type; 641 | // // Name of underlying kernel IB device, e.g methca0 642 | // char name[IBV_SYSFS_NAME_MAX]; 643 | // // Name of uverbs device, e.g. 
uverbs0 644 | // char dev_name[IBV_SYSFS_NAME_MAX]; 645 | // // Path to infiniband_verbs class device in sysfs 646 | // char dev_path[IBV_SYSFS_PATH_MAX]; 647 | // // Path to infiniband class device in sysfs 648 | // char ibdev_path[IBV_SYSFS_PATH_MAX]; 649 | // }; 650 | // struct ibv_device **ibv_get_device_list(int *num_devices); 651 | // const char *ibv_get_device_name(struct ibv_device *device); 652 | // 653 | // struct ibv_context { 654 | // struct ibv_device *device; 655 | // struct ibv_context_ops ops; 656 | // int cmd_fd; 657 | // int async_fd; 658 | // int num_comp_vectors; 659 | // pthread_mutex_t mutex; 660 | // void *abi_compat; 661 | // }; 662 | // struct ibv_context *ibv_open_device(struct ibv_device *device); 663 | // 664 | // struct ibv_port_attr { 665 | // enum ibv_port_state state; // Logical port state 666 | // enum ibv_mtu max_mtu; // Max MTU supported by port 667 | // enum ibv_mtu active_mtu; // Actual MTU 668 | // int gid_tbl_len; // Length of source GID table 669 | // uint32_t port_cap_flags; // Port capabilities 670 | // uint32_t max_msg_sz; // Maximum message size 671 | // uint32_t bad_pkey_cntr; // Bad P_Key counter 672 | // uint32_t qkey_viol_cntr; // Q_Key violation counter 673 | // uint16_t pkey_tbl_len; // Length of partition table 674 | // uint16_t lid; // Base port LID 675 | // uint16_t sm_lid; // SM LID 676 | // uint8_t lmc; // LMC of LID 677 | // uint8_t max_vl_num; // Maximum number of VLs 678 | // uint8_t sm_sl; // SM service level 679 | // uint8_t subnet_timeout; // Subnet propagation delay 680 | // uint8_t init_type_reply; // Type of initialization performed 681 | // // by SM 682 | // uint8_t active_width; // Currently active link width 683 | // uint8_t active_speed; // Currently active link speed 684 | // uint8_t phys_state; // Physical port state 685 | // uint8_t link_layer; // link layer protocol of the port 686 | // 687 | // }; 688 | // int ibv_query_port(struct ibv_context *context, uint8_t port_num, 689 | // struct 
ibv_port_attr *port_attr); 690 | // 691 | // struct ibv_pd { 692 | // struct ibv_mr *mr; 693 | // uint64_t addr; 694 | // uint64_t length; 695 | // unsigned int mw_access_flags; 696 | // }; 697 | // struct ibv_pd *ibv_alloc_pd(struct ibv_context *context); 698 | // 699 | // struct ibv_cq { 700 | // struct ibv_context *context; 701 | // struct ibv_comp_channel *channel; 702 | // void *cq_context; 703 | // uint32_t handle; 704 | // int cqe; 705 | // pthread_mutex_t mutex; 706 | // pthread_cond_t cond; 707 | // uint32_t comp_events_completed; 708 | // uint32_t async_events_completed; 709 | // }; 710 | // struct ibv_cq *ibv_create_cq(struct ibv_context *context, 711 | // int cqe, 712 | // void *cq_context, 713 | // struct ibv_comp_channel *channel, 714 | // int comp_vector); 715 | // 716 | // struct ibv_qp_init_attr { 717 | // void *qp_context; 718 | // struct ibv_cq *send_cq; 719 | // struct ibv_cq *recv_cq; 720 | // struct ibv_srq *srq; 721 | // struct ibv_qp_cap cap; 722 | // enum ibv_qp_type qp_type; 723 | // int sq_sig_all; 724 | // }; 725 | // struct ibv_qp *ibv_create_qp(struct ibv_pd *pd, 726 | // struct ibv_qp_init_attr *qp_init_attr); 727 | // 728 | // Deliver data: 729 | // 730 | // struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, 731 | // size_t length, int access); 732 | // int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, 733 | // struct ibv_send_wr **bad_wr); 734 | // int ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, 735 | // struct ibv_recv_wr **bad_wr); 736 | // 737 | // struct ibv_wc { 738 | // uint64_t wr_id; 739 | // enum ibv_wc_status status; 740 | // enum ibv_wc_opcode opcode; 741 | // uint32_t vendor_err; 742 | // uint32_t byte_len; 743 | // // When (wc_flags & IBV_WC_WITH_IMM): Immediate data in network byte 744 | // // order. 745 | // // When (wc_flags & IBV_WC_WITH_INV): Stores the invalidated rkey. 
746 | // union { 747 | // __be32 imm_data; 748 | // uint32_t invalidated_rkey; 749 | // }; 750 | // uint32_t qp_num; 751 | // uint32_t src_qp; 752 | // unsigned int wc_flags; 753 | // uint16_t pkey_index; 754 | // uint16_t slid; 755 | // uint8_t sl; 756 | // uint8_t dlid_path_bits; 757 | // }; 758 | // int ibv_poll_cq(struct ibv_cq *cq, int num_entries, 759 | // struct ibv_wc *wc); 760 | 761 | // This function creates and allocates all necessary system resources. These are 762 | // stored in res. 763 | int main(int argc, char *argv[]) { 764 | struct resources res; 765 | char temp_char; 766 | 767 | // \begin parse command line parameters 768 | while (1) { 769 | int c; 770 | 771 | static struct option long_options[] = { 772 | {"port", required_argument, 0, 'p'}, 773 | {"ib-dev", required_argument, 0, 'd'}, 774 | {"ib-port", required_argument, 0, 'i'}, 775 | {"gid-idx", required_argument, 0, 'g'}, 776 | {"help", no_argument, 0, 'h'}, 777 | {NULL, 0, 0, 0}}; 778 | 779 | c = getopt_long(argc, argv, "p:d:i:g:h", long_options, NULL); 780 | if (c == -1) 781 | break; 782 | 783 | switch (c) { 784 | case 'p': 785 | config.tcp_port = strtoul(optarg, NULL, 0); 786 | break; 787 | case 'd': 788 | config.dev_name = strdup(optarg); 789 | break; 790 | case 'i': 791 | config.ib_port = strtoul(optarg, NULL, 0); 792 | if (config.ib_port < 0) { 793 | print_usage(argv[0]); 794 | exit(EXIT_FAILURE); 795 | } 796 | break; 797 | case 'g': 798 | config.gid_idx = strtoul(optarg, NULL, 0); 799 | if (config.gid_idx < 0) { 800 | print_usage(argv[0]); 801 | exit(EXIT_FAILURE); 802 | } 803 | break; 804 | case 'h': 805 | default: 806 | print_usage(argv[0]); 807 | exit(EXIT_FAILURE); 808 | } 809 | } 810 | 811 | // parse the last parameter (if exists) as the server name 812 | if (optind == argc - 1) { 813 | config.server_name = argv[optind]; 814 | } else if (optind < argc) { 815 | print_usage(argv[0]); 816 | exit(EXIT_FAILURE); 817 | } 818 | // \ned parse command line parameters 819 | 820 | 
print_config(); 821 | 822 | // init all the resources, so cleanup will be easy 823 | resources_init(&res); 824 | 825 | // create resources before using them 826 | resources_create(&res); 827 | 828 | // connect the QPs 829 | connect_qp(&res); 830 | 831 | // let server post the sr 832 | if (!config.server_name) 833 | post_send(&res, IBV_WR_SEND); 834 | 835 | // in both sides we expect to get a completion 836 | // @server: there's a send completion 837 | // @client: there's a recv completion 838 | poll_completion(&res); 839 | 840 | // after polling the completion we have the message in the client buffer too 841 | if (config.server_name) { 842 | INFO("Message is: %s\n", res.buf); 843 | } else { 844 | // setup server buffer with read message 845 | strcpy(res.buf, RDMAMSGR); 846 | } 847 | 848 | // sync so we are sure server side has data ready before client tries to 849 | // read it 850 | sock_sync_data(res.sock, 1, "R", 851 | &temp_char); // just send a dummy char back and forth 852 | 853 | // Now the client performs an RDMA read and then write on server. Note that 854 | // the server has no idea these events have occured. 855 | if (config.server_name) { 856 | // first we read contents of server's buffer 857 | post_send(&res, IBV_WR_RDMA_READ); 858 | poll_completion(&res); 859 | 860 | INFO("Contents of server's buffer: %s\n", res.buf); 861 | 862 | // now we replace what's in the server's buffer 863 | strcpy(res.buf, RDMAMSGW); 864 | INFO("Now replacing it with: %s\n", res.buf); 865 | 866 | post_send(&res, IBV_WR_RDMA_WRITE); 867 | poll_completion(&res); 868 | } 869 | 870 | // sync so server will know that client is done mucking with its memory 871 | sock_sync_data(res.sock, 1, "W", &temp_char); 872 | if (!config.server_name) 873 | INFO("Contents of server buffer: %s\n", res.buf); 874 | 875 | // whatever 876 | resources_destroy(&res); 877 | 878 | return 0; 879 | } 880 | --------------------------------------------------------------------------------