├── Makefile ├── README.md ├── arp_announce_conf.sh ├── client.c ├── gpu_direct_rdma_access.c ├── gpu_direct_rdma_access.h ├── gpu_mem_util.c ├── gpu_mem_util.h ├── ibv_helper.h ├── khash.h ├── map_pci_nic_gpu.sh ├── server.c ├── utils.c └── utils.h
/Makefile:
--------------------------------------------------------------------------------
IDIR = .
CC = gcc
ODIR = obj

# USE_CUDA=1 builds the client against the CUDA toolkit (GPU memory path).
ifeq ($(USE_CUDA),1)
CUDAFLAGS = -I/usr/local/cuda-10.1/targets/x86_64-linux/include
CUDAFLAGS += -I/usr/local/cuda/include
PRE_CFLAGS1 = -I$(IDIR) $(CUDAFLAGS) -g -DHAVE_CUDA
# NOTE: -Wall in LIBS is a compiler warning flag, harmless on the link line;
# kept for compatibility with existing invocations.
LIBS = -Wall -lrdmacm -libverbs -lmlx5 -lcuda
else
PRE_CFLAGS1 = -I$(IDIR) -g
LIBS = -Wall -lrdmacm -libverbs -lmlx5
endif

# PRINT_LAT=1 enables the latency-measurement code paths.
ifeq ($(PRINT_LAT),1)
CFLAGS = $(PRE_CFLAGS1) -DPRINT_LATENCY
else
CFLAGS = $(PRE_CFLAGS1)
endif

OEXE_CLT = client
OEXE_SRV = server

DEPS = gpu_direct_rdma_access.h
DEPS += ibv_helper.h
DEPS += khash.h
DEPS += gpu_mem_util.h
DEPS += utils.h

OBJS = gpu_direct_rdma_access.o
OBJS += gpu_mem_util.o
OBJS += utils.o

# Order-only prerequisite on the output directory: objects must not be
# rebuilt when the directory mtime changes, but the directory must exist
# before any compile runs (fixes a race under `make -j`, which previously
# relied on `make_odir` being listed first in `all`).
$(ODIR)/%.o: %.c $(DEPS) | $(ODIR)/
	$(CC) -c -o $@ $< $(CFLAGS)

all : make_odir $(OEXE_CLT) $(OEXE_SRV)

# Kept for backward compatibility with callers running `make make_odir`.
make_odir: $(ODIR)/

$(OEXE_SRV) : $(patsubst %,$(ODIR)/%,$(OBJS)) $(ODIR)/server.o
	$(CC) -o $@ $^ $(CFLAGS) $(LIBS)

$(OEXE_CLT) : $(patsubst %,$(ODIR)/%,$(OBJS)) $(ODIR)/client.o
	$(CC) -o $@ $^ $(CFLAGS) $(LIBS)

$(ODIR)/:
	mkdir -p $@

# all/make_odir are command names, not files: declare them phony so a stray
# file named "all" cannot mask the build.
.PHONY: all make_odir clean

clean :
	rm -f $(OEXE_CLT) $(OEXE_SRV) $(ODIR)/*.o *~ core.* $(IDIR)/*~

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GPU Direct RDMA Access example code
This package shows how to use the Mellanox DC QP to implement RDMA Read and Write operations directly to a remote GPU
memory. It assumes the client application will run on a GPU-enabled machine, like the NVIDIA DGX2. The server application, acting as a file storage simulation, will be running on a few other Linux machines. All machines should have a Mellanox ConnectX-5 NIC (or newer) in order for the DC QP to work properly.

In the test code, the client application allocates memory on the defined GPU (flag '-u') or on system RAM (default). Then it sends a TCP request to the server application for an RDMA Write to the client's allocated buffer. Once the server application completes the RDMA Write operation it sends back a TCP 'done' message to the client. The client can loop for multiple such requests (flag '-n'). The RDMA message size can be configured (flag '-s' bytes)

For optimized data transfer, the client requires the GPU device selection based on PCI "B:D.F" format. It is recommended to choose a GPU which shares the same PCI bridge as the Mellanox ConnectX NIC.

## Content:

gpu_direct_rdma_access.h, gpu_direct_rdma_access.c - Handles RDMA Read and Write ops from Server to GPU memory by request from the Client. The APIs use DC type QP connections for RDMA operations. The request to the server comes by socket.

gpu_mem_util.h, gpu_mem_util.c - GPU/CPU memory allocation

server.c, client.c - client and server main programs implementing GPU's Read/Write.

map_pci_nic_gpu.sh, arp_announce_conf.sh - helper scripts

Makefile - makefile to build client and server executable files

## Installation Guide:

**1. MLNX_OFED**

Download MLNX_OFED-4.6-1.0.1.0 (or newer) from Mellanox web site: http://www.mellanox.com/page/products_dyn?product_family=26
Install with upstream libs
```sh
$ sudo ./mlnxofedinstall --force-fw-update --upstream-libs --dpdk
```
**2. 
CUDA libs**

Download CUDA Toolkit 10.1 (or newer) from the Nvidia web site
```sh
$ wget https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run
```
Install on a DGX server (GPU enabled servers)
```sh
$ sudo sh cuda_10.1.105_418.39_linux.run
```
**3. GPU Direct**

Follow the download, build and install guide on https://github.com/Mellanox/nv_peer_memory

**4. Multi-Homed network**

Configure system ARP handling for a multi-homed network with RoCE traffic (on DGX2 server)
```sh
$ git clone https://github.com/Mellanox/gpu_direct_rdma_access.git
$ ./write_to_gpu/arp_announce_conf.sh
```
**5. Check RDMA connectivity between all cluster nodes**

## Build Example Code:

```sh
$ git clone git@github.com:Mellanox/gpu_direct_rdma_access.git
$ cd gpu_direct_rdma_access
```
On the client machines
```sh
$ make USE_CUDA=1
```
On the server machines
```sh
$ make
```

## Run Server:
```sh
$ ./server -a 172.172.1.34 -n 10000 -D 1 -s 10000000 -p 18001 &
```

## Run Client:

We want to find the GPUs which share the same PCI bridge as the ConnectX Mellanox NIC
```sh
$ ./map_pci_nic_gpu.sh
172.172.1.112 (mlx5_12) is near 0000:b7:00.0 3D controller: NVIDIA Corporation Device 1db8 (rev a1)
172.172.1.112 (mlx5_12) is near 0000:b9:00.0 3D controller: NVIDIA Corporation Device 1db8 (rev a1)
172.172.1.113 (mlx5_14) is near 0000:bc:00.0 3D controller: NVIDIA Corporation Device 1db8 (rev a1)
172.172.1.113 (mlx5_14) is near 0000:be:00.0 3D controller: NVIDIA Corporation Device 1db8 (rev a1)
172.172.1.114 (mlx5_16) is near 0000:e0:00.0 3D controller: NVIDIA Corporation Device 1db8 (rev a1)
172.172.1.114 (mlx5_16) is near 0000:e2:00.0 3D controller: NVIDIA Corporation Device 1db8 (rev a1)
```

Run client application with matching IP address and BDF from the script output (-a and -u parameters) 87 | ```sh 88 | $ ./client -t 0 -a 172.172.1.112 172.172.1.34 -u b7:00.0 -n 10000 -D 0 -s 10000000 -p 18001 & 89 | 90 | ``` 91 | -------------------------------------------------------------------------------- /arp_announce_conf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2019 Mellanox Technologies. All rights reserved. 4 | # 5 | # This Software is licensed under one of the following licenses: 6 | # 7 | # 1) under the terms of the "Common Public License 1.0" a copy of which is 8 | # available from the Open Source Initiative, see 9 | # http://www.opensource.org/licenses/cpl.php. 10 | # 11 | # 2) under the terms of the "The BSD License" a copy of which is 12 | # available from the Open Source Initiative, see 13 | # http://www.opensource.org/licenses/bsd-license.php. 14 | # 15 | # 3) under the terms of the "GNU General Public License (GPL) Version 2" a 16 | # copy of which is available from the Open Source Initiative, see 17 | # http://www.opensource.org/licenses/gpl-license.php. 18 | # 19 | # Licensee has the right to choose one of the above licenses. 20 | # 21 | # Redistributions of source code must retain the above copyright 22 | # notice and one of the license notices. 23 | # 24 | # Redistributions in binary form must reproduce both the above copyright 25 | # notice, one of the license notices in the documentation 26 | # and/or other materials provided with the distribution. 
27 | # 28 | # Author: Michael Berezin 29 | # 30 | 31 | if [[ debug == "$1" ]]; then 32 | INSTRUMENTING=yes # any non-null will do 33 | shift 34 | fi 35 | echodbg () { 36 | [[ "$INSTRUMENTING" ]] && builtin echo $@ 37 | } 38 | 39 | DEVS=$1 40 | if [ -z "$DEVS" ] ; then 41 | DEVS=$(ls /sys/class/infiniband/) 42 | fi 43 | 44 | for dev in $DEVS ; do 45 | #echo -e "dev=$dev" 46 | for port in $(ls /sys/class/infiniband/$dev/ports/) ; do 47 | #echo -e " port=$port" 48 | ll=$(cat /sys/class/infiniband/$dev/ports/$port/link_layer); 49 | #echo -e " ll=$ll" 50 | if [ $ll = "Ethernet" ] ; then 51 | ndev=$(cat /sys/class/infiniband/$dev/ports/$port/gid_attrs/ndevs/0) 52 | link_st=$(ip -f inet link show $ndev | grep "state UP") 53 | if [ -n "$link_st" ] ; then 54 | echo "device $dev port $port ==> $ndev (Up) : ARP announce/ignore config" 55 | sysctl -w net.ipv4.conf.$ndev.arp_announce=1 56 | sysctl -w net.ipv4.conf.$ndev.arp_ignore=2 57 | else 58 | echo "device $dev port $port ==> $ndev (Down) : no config" 59 | fi 60 | fi 61 | done #port 62 | done #dev 63 | 64 | -------------------------------------------------------------------------------- /client.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 
*
 * - Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials
 * provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define _GNU_SOURCE
/* NOTE(review): the twelve system #include directives below lost their
 * <header> names in the extraction that produced this file (angle-bracket
 * content was stripped). They must be restored from the upstream source
 * before this file will compile -- do not guess them here. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "utils.h"
#include "gpu_mem_util.h"
#include "gpu_direct_rdma_access.h"

/* Debug switches; the definitions live in gpu_direct_rdma_access.c and are
 * set from the -D command-line bitmask in parse_command_line(). */
extern int debug;
extern int debug_fast_path;

#define DEBUG_LOG if (debug) printf
#define DEBUG_LOG_FAST_PATH if (debug_fast_path) printf
#define FDEBUG_LOG if (debug) fprintf
#define FDEBUG_LOG_FAST_PATH if (debug_fast_path) fprintf

/* Confirmation string the server sends back after each RDMA operation. */
#define ACK_MSG "rdma_task completed"

/* Command-line options collected by parse_command_line(). */
struct user_params {

    uint32_t task;              /* task attr flags: bit 0 = 0 WRITE / 1 READ (-t) */
    int port;                   /* TCP port, default 18515 (-p) */
    unsigned long size;         /* message size in bytes, default 4096 (-s) */
    int iters;                  /* number of exchanges, default 1000 (-n) */
    int use_cuda;               /* nonzero when -u was given */
    char *bdf;                  /* PCI "B:D.F" of the CUDA device (-u), heap-allocated */
    char *servername;           /* server host, heap-allocated from trailing argv */
    struct sockaddr hostaddr;   /* local NIC address resolved from -a via get_addr() */
};

/****************************************************************************************
 * Open socket connection on the client side, try to connect to the server by the given
If success, return the connected socket file descriptor ID 76 | * Return value: socket fd - success, -1 - error 77 | ****************************************************************************************/ 78 | static int open_client_socket(const char *servername, 79 | int port) 80 | { 81 | struct addrinfo *res, 82 | *t; 83 | struct addrinfo hints = { 84 | .ai_family = AF_UNSPEC, 85 | .ai_socktype = SOCK_STREAM 86 | }; 87 | char *service; 88 | int ret_val; 89 | int sockfd; 90 | 91 | if (asprintf(&service, "%d", port) < 0) 92 | return -1; 93 | 94 | ret_val = getaddrinfo(servername, service, &hints, &res); 95 | 96 | if (ret_val < 0) { 97 | fprintf(stderr, "FAILURE: %s for %s:%d\n", gai_strerror(ret_val), servername, port); 98 | free(service); 99 | return -1; 100 | } 101 | 102 | for (t = res; t; t = t->ai_next) { 103 | sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); 104 | if (sockfd >= 0) { 105 | if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) 106 | break; 107 | close(sockfd); 108 | sockfd = -1; 109 | } 110 | } 111 | 112 | freeaddrinfo(res); 113 | free(service); 114 | 115 | if (sockfd < 0) { 116 | fprintf(stderr, "FAILURE: Couldn't connect to %s:%d\n", servername, port); 117 | return -1; 118 | } 119 | 120 | return sockfd; 121 | } 122 | 123 | enum payload_t { RDMA_BUF_DESC, TASK_ATTRS }; 124 | 125 | struct payload_attr { 126 | enum payload_t data_t; 127 | char *payload_str; 128 | }; 129 | 130 | /************************************************************************************ 131 | * Simple package protocol which packs payload string into allocated memory. 
132 | * Protocol consist of: 133 | * uint8_t payload_t - type of the payload data 134 | * uint16_t payload_size - strlen of the payload_str 135 | * char * payload_str - payload to pack 136 | * 137 | * returns: an integer equal to the size of the copied into package data in bytes 138 | * _________________________________________________________________________________ 139 | * 140 | * PACKAGE = {|type|size|---------payload----------|} 141 | * 1b 2b (size * sizeof(char))b 142 | * 143 | ***********************************************************************************/ 144 | int pack_payload_data(void *package, size_t package_size, struct payload_attr *attr) 145 | { 146 | uint8_t data_t = attr->data_t; 147 | uint16_t payload_size = strlen(attr->payload_str) + 1; 148 | size_t req_size = sizeof(data_t) + sizeof(payload_size) + payload_size * sizeof(char) ; 149 | if (req_size > package_size) { 150 | fprintf(stderr, "package size (%lu) is less than required (%lu) for sending payload with attributes\n", 151 | package_size, req_size); 152 | return 0; 153 | } 154 | memcpy(package, &data_t, sizeof(data_t)); 155 | memcpy(package + sizeof(data_t), &payload_size, sizeof(payload_size)); 156 | memcpy(package + sizeof(data_t) + sizeof(payload_size), attr->payload_str, payload_size * sizeof(char)); 157 | 158 | return req_size; 159 | } 160 | 161 | //==================================================================================== 162 | /* t*/ 163 | #define RDMA_TASK_ATTR_DESC_STRING_LENGTH (sizeof "12345678") 164 | /************************************************************************************* 165 | * Get a rdma_task_attr_flags description string representation 166 | * 167 | * The Client application should pass this description string to the 168 | * Server which will issue the RDMA Read/Write operation 169 | * 170 | * desc_str is input and output holding the rdma_task_attr_flags information 171 | * desc_length is input size in bytes of desc_str 172 | * 173 | * returns: an 
integer equal to the size of the char data copied into desc_str
 ************************************************************************************/
int rdma_task_attr_flags_get_desc_str(uint32_t flags, char *desc_str, size_t desc_length)
{
    if (desc_length < RDMA_TASK_ATTR_DESC_STRING_LENGTH) {
        fprintf(stderr, "desc string size (%lu) is less than required (%lu) for sending rdma_task_attr_flags data\n",
                desc_length, RDMA_TASK_ATTR_DESC_STRING_LENGTH);
        return 0;
    }

    /* 8 zero-padded hex digits, e.g. task=1 -> "00000001" */
    sprintf(desc_str, "%08x", flags);

    return strlen(desc_str) + 1; /*including the terminating null character*/
}

/* Print command-line help.
 * NOTE(review): the <...> argument placeholders in these strings were
 * stripped by the extraction that produced this file; restore them from
 * the upstream source. Runtime strings are left byte-identical here. */
static void usage(const char *argv0)
{
    printf("Usage:\n");
    printf(" %s connect to server at \n", argv0);
    printf("\n");
    printf("Options:\n");
    printf(" -t, --task_flags= rdma task attrs bitmask: bit 0 - rdma operation type: 0 - \"WRITE\"(default),\n"
           " 1 - \"READ\"\n");
    printf(" -a, --addr= ip address of the local host net device (mandatory)\n");
    printf(" -p, --port= listen on/connect to port (default 18515)\n");
    printf(" -s, --size= size of message to exchange (default 4096)\n");
    printf(" -n, --iters= number of exchanges (default 1000)\n");
    printf(" -u, --use-cuda= use CUDA pacage (work with GPU memoty),\n"
           " BDF corresponding to CUDA device, for example, \"3e:02.0\"\n");
    printf(" -D, --debug-mask= debug bitmask: bit 0 - debug print enable,\n"
           " bit 1 - fast path debug print enable\n");
}

/* Parse argv into *usr_par.
 * Returns 0 on success, 1 on error. On error some heap fields (bdf,
 * servername) may already be allocated -- the caller frees them. */
static int parse_command_line(int argc, char *argv[], struct user_params *usr_par)
{
    memset(usr_par, 0, sizeof *usr_par);
    /*Set defaults*/
    usr_par->port = 18515;
    usr_par->size = 4096;
    usr_par->iters = 1000;
    usr_par->task = 0;

    while (1) {
        int c;

        /* NOTE(review): usage() advertises "--task_flags" but the long
         * option registered here is "task-flags". */
        static struct option long_options[] = {
            { .name = "task-flags", .has_arg = 1, .val = 't' },
            { .name = "addr", .has_arg = 1, .val = 'a' },
            { .name = "port", .has_arg = 1, .val = 'p' },
            { .name = "size", .has_arg = 1, .val = 's' },
            { .name = "iters", .has_arg = 1, .val = 'n' },
            { .name = "use-cuda", .has_arg = 1, .val = 'u' },
            { .name = "debug-mask", .has_arg = 1, .val = 'D' },
            { 0 }
        };

        c = getopt_long(argc, argv, "t:a:p:s:n:u:D:",
                        long_options, NULL);
        if (c == -1)
            break;

        switch (c) {

        case 't':
            usr_par->task = (strtol(optarg, NULL, 0) >> 0) & 1; /*bit 0*/
            break;

        case 'a':
            /* Resolve the local NIC address into hostaddr (utils.h helper). */
            get_addr(optarg, (struct sockaddr *) &usr_par->hostaddr);
            break;

        case 'p':
            usr_par->port = strtol(optarg, NULL, 0);
            if (usr_par->port < 0 || usr_par->port > 65535) {
                usage(argv[0]);
                return 1;
            }
            break;

        case 's':
            usr_par->size = strtol(optarg, NULL, 0);
            break;

        case 'n':
            usr_par->iters = strtol(optarg, NULL, 0);
            break;

        case 'u':
            usr_par->use_cuda = 1;
            usr_par->bdf = calloc(1, strlen(optarg)+1);
            if (!usr_par->bdf){
                fprintf(stderr, "FAILURE: BDF mem alloc failure (errno=%d '%m')", errno);
                return 1;
            }
            strcpy(usr_par->bdf, optarg);
            break;

        case 'D':
            debug = (strtol(optarg, NULL, 0) >> 0) & 1; /*bit 0*/
            debug_fast_path = (strtol(optarg, NULL, 0) >> 1) & 1; /*bit 1*/
            break;

        default:
            usage(argv[0]);
            return 1;
        }
    }

    /* Exactly one trailing positional argument: the server name. */
    if (optind == argc) {
        fprintf(stderr, "FAILURE: Server name is missing in the commant line.\n");
        usage(argv[0]);
        return 1;
    } else if (optind == argc - 1) {
        //usr_par->servername = strdupa(argv[optind]);
        usr_par->servername = calloc(1, strlen(argv[optind])+1);
        if (!usr_par->servername){
            fprintf(stderr, "FAILURE: servername mem alloc failure (errno=%d '%m')", errno);
            return 1;
        }
        strcpy(usr_par->servername, argv[optind]);
    }
    else if (optind < argc) {
        usage(argv[0]);
        return 1;
    }

    return 0;
}

/* Client entry point: connect to the server by TCP, register a CPU or GPU
 * buffer for RDMA, then loop sending the buffer/task descriptors and
 * waiting for the server's ACK after each RDMA operation.
 * Cleanup uses deliberately falling-through goto labels: each label frees
 * the resources acquired up to the corresponding point of failure. */
int main(int argc, char *argv[])
{
    struct rdma_device *rdma_dev;
    struct timeval start;
    int cnt;
    struct user_params usr_par;
    int ret_val = 0;
    int sockfd;

    srand48(getpid() * time(NULL));

    ret_val = parse_command_line(argc, argv, &usr_par);
    if (ret_val) {
        ret_val = 1;
        /* We don't exit here, because when parse_command_line failed, probably
           some of memory allocations were completed, so we need to free them */
        goto clean_usr_par;
    }

    if (!usr_par.hostaddr.sa_family) {
        fprintf(stderr, "FAILURE: host ip address is missing in the command line.");
        usage(argv[0]);
        ret_val = 1;
        goto clean_usr_par;
    }

    printf("Connecting to remote server \"%s:%d\"\n", usr_par.servername, usr_par.port);
    sockfd = open_client_socket(usr_par.servername, usr_par.port);
    free(usr_par.servername);

    if (sockfd < 0) {
        ret_val = 1;
        goto clean_usr_par;
    }

    printf("Opening rdma device\n");
    rdma_dev = rdma_open_device_client(&usr_par.hostaddr);

    if (!rdma_dev) {
        ret_val = 1;
        goto clean_socket;
    }

    /* CPU or GPU memory buffer allocation */
    void *buff;
    buff = work_buffer_alloc(usr_par.size, usr_par.use_cuda, usr_par.bdf);
    if (!buff) {
        ret_val = 1;
        goto clean_device;
    }

    /* We don't need bdf any more, so we can free this. */
    if (usr_par.bdf) {
        free(usr_par.bdf);
        usr_par.bdf = NULL;
    }

    /* RDMA buffer registration */
    struct rdma_buffer *rdma_buff;

    rdma_buff = rdma_buffer_reg(rdma_dev, buff, usr_par.size);
    if (!rdma_buff) {
        ret_val = 1;
        goto clean_mem_buff;
    }

    char desc_str[256], task_opt_str[16];

    int ret_desc_str_size = rdma_buffer_get_desc_str(rdma_buff, desc_str, sizeof(desc_str));
    int ret_task_opt_str_size = rdma_task_attr_flags_get_desc_str(usr_par.task, task_opt_str, sizeof(task_opt_str));

    if (!ret_desc_str_size || !ret_task_opt_str_size) {
        ret_val = 1;
        goto clean_rdma_buff;
    }

    /* Package memory allocation */
    /* NOTE(review): malloc result is not checked for NULL before memset. */
    const int package_size = (ret_desc_str_size + ret_task_opt_str_size) * sizeof(char) + 2 * sizeof(uint16_t) + 2 * sizeof(uint8_t);
    void *package = malloc(package_size);
    memset(package, 0, package_size);

    /* Packing RDMA buff desc str */
    struct payload_attr pl_attr = { .data_t = RDMA_BUF_DESC, .payload_str = desc_str };
    int buff_package_size = pack_payload_data(package, package_size, &pl_attr);
    if (!buff_package_size) {
        ret_val = 1;
        goto clean_package_data;
    }

    /* Packing RDMA task attrs desc str */
    /* NOTE(review): the second pack is bounded by the full package_size,
     * not the remaining (package_size - buff_package_size) bytes -- the
     * internal size check cannot catch an overflow of the tail; verify
     * against upstream. */
    pl_attr.data_t = TASK_ATTRS;
    pl_attr.payload_str = task_opt_str;
    buff_package_size += pack_payload_data(package + buff_package_size, package_size, &pl_attr);
    if (!buff_package_size) {
        ret_val = 1;
        goto clean_package_data;
    }

    printf("Starting data transfer (%d iters)\n", usr_par.iters);
    if (gettimeofday(&start, NULL)) {
        fprintf(stderr, "FAILURE: gettimeofday (errno=%d '%m')", errno);
        ret_val = 1;
        goto clean_package_data;
    }

    /****************************************************************************************************
     * The main loop where client and server send and receive "iters" number of messages
     */
    for (cnt = 0; cnt < usr_par.iters; cnt++) {

        char ackmsg[sizeof ACK_MSG];
        int ret_size;

        // Sending RDMA data (address and rkey) by socket as a trigger to start RDMA read/write operation
        DEBUG_LOG_FAST_PATH("Send message N %d: buffer desc \"%s\" of size %d with task opt \"%s\" of size %d\n", cnt, desc_str, strlen(desc_str), task_opt_str, strlen(task_opt_str));
        ret_size = write(sockfd, package, buff_package_size);
        if (ret_size != buff_package_size) {
            fprintf(stderr, "FAILURE: Couldn't send RDMA data for iteration, write data size %d (errno=%d '%m')\n", ret_size, errno);
            ret_val = 1;
            goto clean_package_data;
        }

        // Waiting for confirmation message from the socket that rdma_read/write from the server has been completed
        ret_size = recv(sockfd, ackmsg, sizeof ackmsg, MSG_WAITALL);
        if (ret_size != sizeof ackmsg) {
            fprintf(stderr, "FAILURE: Couldn't read \"%s\" message, recv data size %d (errno=%d '%m')\n", ACK_MSG, ret_size, errno);
            ret_val = 1;
            goto clean_package_data;
        }

        // Printing received data for debug purpose
        DEBUG_LOG_FAST_PATH("Received ack N %d: \"%s\"\n", cnt, ackmsg);
        if (!usr_par.use_cuda) {
            DEBUG_LOG_FAST_PATH("Written data \"%s\"\n", (char*)buff);
        }
    }
    /****************************************************************************************************/

    ret_val = print_run_time(start, usr_par.size, usr_par.iters);
    if (ret_val) {
        goto clean_package_data;
    }

/* Cleanup chain: labels fall through on purpose, releasing resources in
 * reverse order of acquisition. */
clean_package_data:
    free(package);

clean_rdma_buff:
    rdma_buffer_dereg(rdma_buff);

clean_mem_buff:
    work_buffer_free(buff, usr_par.use_cuda);

clean_device:
    rdma_close_device(rdma_dev);

clean_socket:
    close(sockfd);

clean_usr_par:
    if (usr_par.bdf) {
free(usr_par.bdf); 463 | } 464 | 465 | return ret_val; 466 | } 467 | -------------------------------------------------------------------------------- /gpu_direct_rdma_access.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 
31 | */ 32 | 33 | #if HAVE_CONFIG_H 34 | # include 35 | #endif /* HAVE_CONFIG_H */ 36 | 37 | #define _GNU_SOURCE 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #include 52 | #include 53 | 54 | #include "khash.h" 55 | #include "ibv_helper.h" 56 | #include "gpu_direct_rdma_access.h" 57 | 58 | int debug = 0; 59 | int debug_fast_path = 0; 60 | 61 | #define DEBUG_LOG if (debug) printf 62 | #define DEBUG_LOG_FAST_PATH if (debug_fast_path) printf 63 | #define FDEBUG_LOG if (debug) fprintf 64 | #define FDEBUG_LOG_FAST_PATH if (debug_fast_path) fprintf 65 | 66 | #define CQ_DEPTH 640 67 | #define SEND_Q_DEPTH 640 68 | #define DC_KEY 0xffeeddcc /*this is defined for both sides: client and server*/ 69 | #define COMP_ARRAY_SIZE 16 70 | #define TC_PRIO 3 71 | 72 | #define WR_ID_FLUSH_MARKER UINT64_MAX 73 | 74 | #define mmin(a, b) a < b ? a : b 75 | 76 | KHASH_TYPE(kh_ib_ah, struct ibv_ah_attr, struct ibv_ah*); 77 | 78 | enum wr_id_flags { 79 | WR_ID_FLAGS_ACTIVE = 1 << 0 80 | }; 81 | 82 | struct wr_id_reported { 83 | uint64_t wr_id; 84 | uint16_t num_wrs; 85 | uint16_t flags; /* enum wr_id_flags */ 86 | }; 87 | 88 | #ifdef PRINT_LATENCY 89 | struct wr_latency { 90 | uint64_t wr_start_ts; 91 | uint64_t wr_complete_ts; 92 | uint64_t completion_ts; 93 | uint64_t read_comp_ts; 94 | }; 95 | #endif /*PRINT_LATENCY*/ 96 | 97 | struct rdma_device { 98 | 99 | struct rdma_event_channel *cm_channel; 100 | struct rdma_cm_id *cm_id; 101 | 102 | struct ibv_context *context; 103 | struct ibv_pd *pd; 104 | #ifdef PRINT_LATENCY 105 | struct ibv_cq_ex *cq; 106 | #else 107 | struct ibv_cq *cq; 108 | #endif 109 | struct ibv_srq *srq; /* for DCT (client) only, for DCI (server) this is NULL */ 110 | struct ibv_qp *qp; 111 | struct ibv_qp_ex *qpex; /* DCI (server) only */ 112 | struct mlx5dv_qp_ex *mqpex; /* DCI (server) only */ 113 | 114 | /* Address handler (port info) 
relateed fields */ 115 | int ib_port; 116 | int is_global; 117 | int gidx; 118 | union ibv_gid gid; 119 | uint16_t lid; 120 | enum ibv_mtu mtu; 121 | 122 | struct wr_id_reported app_wr_id[SEND_Q_DEPTH]; 123 | int app_wr_id_idx; 124 | int qp_available_wr; 125 | int rdma_buff_cnt; 126 | 127 | /* AH hash */ 128 | khash_t(kh_ib_ah) ah_hash; 129 | #ifdef PRINT_LATENCY 130 | uint64_t hca_core_clock_kHz; 131 | struct wr_latency latency[SEND_Q_DEPTH]; 132 | uint64_t measure_index; 133 | uint64_t wr_complete_latency_sum; /*from wr_start_ts*/ 134 | uint64_t completion_latency_sum; /*from wr_start_ts*/ 135 | uint64_t read_comp_latency_sum; /*from completion_ts*/ 136 | uint64_t min_wr_complete_latency; 137 | uint64_t min_completion_latency; 138 | uint64_t min_read_comp_latency; 139 | uint64_t max_wr_complete_latency; 140 | uint64_t max_completion_latency; 141 | uint64_t max_read_comp_latency; 142 | #endif /*PRINT_LATENCY*/ 143 | }; 144 | 145 | struct rdma_buffer { 146 | /* Buffer Related fields */ 147 | void *buf_addr; //uint64_t addr; 148 | size_t buf_size; //uint32_t size; 149 | /* MR Related fields */ 150 | struct ibv_mr *mr; 151 | uint32_t rkey; 152 | /* Linked rdma_device */ 153 | struct rdma_device *rdma_dev; 154 | }; 155 | 156 | struct rdma_exec_params { 157 | struct rdma_device *device; 158 | uint64_t wr_id; 159 | unsigned long rem_buf_rkey; 160 | unsigned long long rem_buf_addr; 161 | uint32_t rem_buf_size; 162 | struct ibv_ah *ah; 163 | unsigned long rem_dctn; /*QP number from DCT (client)*/ 164 | uint32_t local_buf_mr_lkey; 165 | void *local_buf_addr; 166 | struct iovec *local_buf_iovec; 167 | int local_buf_iovcnt; 168 | uint32_t flags; /*enum rdma_task_attr_flags*/ 169 | }; 170 | 171 | static inline 172 | int is_server(struct rdma_device *device) 173 | { 174 | return device->srq == NULL; 175 | } 176 | 177 | /* use both gid + lid data for key generarion (lid - ib based, gid - RoCE) */ 178 | static inline 179 | khint32_t kh_ib_ah_hash_func(struct ibv_ah_attr attr) 
{
    return kh_int64_hash_func(attr.grh.dgid.global.subnet_prefix ^
                              attr.grh.dgid.global.interface_id ^
                              attr.dlid);
}

static inline
int kh_ib_ah_hash_equal(struct ibv_ah_attr a, struct ibv_ah_attr b)
{
    /* NOTE(review): bytewise comparison of whole structs also compares
     * padding bytes; assumes attrs are fully zero-initialized -- verify. */
    return !memcmp(&a, &b, sizeof(a));
}

/* Instantiate the AH-cache hash map: ibv_ah_attr -> ibv_ah*. */
KHASH_IMPL(kh_ib_ah, struct ibv_ah_attr, struct ibv_ah*, 1,
           kh_ib_ah_hash_func, kh_ib_ah_hash_equal)


//============================================================================================
/* Open the verbs device whose name matches ib_dev_name.
 * Returns the ibv_context on success, NULL when the device is not found
 * or ibv_open_device() fails. The device list is always freed. */
static struct ibv_context *open_ib_device_by_name(const char *ib_dev_name)
{
    struct ibv_device **dev_list;
    struct ibv_device *ib_dev;
    struct ibv_context *context = NULL;

    /****************************************************************************************************
     * In the next block we are checking if given IB device name matches one of devices in the list.
     * The result of this block is ib_dev - initialized pointer to the relevant struct ibv_device
     ****************************************************************************************************/
    dev_list = ibv_get_device_list(NULL);
    if (!dev_list) {
        perror("Failed to get IB devices list");
        return NULL;
    }

    DEBUG_LOG ("Given device name \"%s\"\n", ib_dev_name);
    int i;
    for (i = 0; dev_list[i]; ++i) {
        char *dev_name_from_list = (char*)ibv_get_device_name(dev_list[i]);
        DEBUG_LOG ("Device %d name \"%s\"\n", i, dev_name_from_list);
        if (!strcmp(dev_name_from_list, ib_dev_name)) /*if found*/
            break;
    }
    /* dev_list[i] is NULL here when the name was not found (list is
     * NULL-terminated), which the next check reports. */
    ib_dev = dev_list[i];
    if (!ib_dev) {
        fprintf(stderr, "IB device %s not found\n", ib_dev_name);
        goto clean_device_list;
    }
    /****************************************************************************************************/

    DEBUG_LOG ("ibv_open_device(ib_dev = %p)\n", ib_dev);
    context = 
ibv_open_device(ib_dev); 230 | if (!context) { 231 | fprintf(stderr, "Couldn't get context for %s\n", ib_dev_name); 232 | goto clean_device_list; 233 | } 234 | DEBUG_LOG("created ib context %p\n", context); 235 | /* We are now done with device list, we can free it */ 236 | 237 | clean_device_list: 238 | ibv_free_device_list(dev_list); /*dev_list is not NULL*/ 239 | 240 | return context; 241 | } 242 | 243 | //============================================================================================ 244 | static struct ibv_context *open_ib_device_by_addr(struct rdma_device *rdma_dev, struct sockaddr *addr) 245 | { 246 | int ret; 247 | uint16_t sin_port; 248 | char str[INET_ADDRSTRLEN]; 249 | 250 | rdma_dev->cm_channel = rdma_create_event_channel(); 251 | if (!rdma_dev->cm_channel) { 252 | DEBUG_LOG("rdma_create_event_channel() failure"); 253 | return NULL; 254 | } 255 | 256 | ret = rdma_create_id(rdma_dev->cm_channel, &rdma_dev->cm_id, rdma_dev, RDMA_PS_UDP); 257 | if (ret) { 258 | DEBUG_LOG("rdma_create_id() failure"); 259 | goto out1; 260 | } 261 | 262 | ret = rdma_bind_addr(rdma_dev->cm_id, addr); 263 | if (ret) { 264 | DEBUG_LOG("rdma_bind_addr() failure"); 265 | goto out2; 266 | } 267 | 268 | if (addr->sa_family == AF_INET) { 269 | sin_port = ((struct sockaddr_in *)addr)->sin_port; 270 | inet_ntop(AF_INET, &(((struct sockaddr_in *)addr)->sin_addr), str, INET_ADDRSTRLEN); 271 | } 272 | else { 273 | sin_port = ((struct sockaddr_in6 *)addr)->sin6_port; 274 | inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)addr)->sin6_addr), str, INET_ADDRSTRLEN); 275 | } 276 | 277 | if (rdma_dev->cm_id->verbs == NULL) { 278 | DEBUG_LOG("Failed to bind to an RDMA device, exiting... 
<%s, %d>\n", str, ntohs(sin_port)); 279 | goto out2; 280 | } 281 | 282 | rdma_dev->ib_port = rdma_dev->cm_id->port_num; 283 | rdma_dev->gidx = -1; 284 | 285 | DEBUG_LOG("bound to RDMA device name:%s, port:%d, based on '%s'\n", 286 | rdma_dev->cm_id->verbs->device->name, rdma_dev->cm_id->port_num, str); 287 | 288 | return rdma_dev->cm_id->verbs; 289 | 290 | out2: 291 | rdma_destroy_id(rdma_dev->cm_id); 292 | out1: 293 | rdma_destroy_event_channel(rdma_dev->cm_channel); 294 | return NULL; 295 | 296 | } 297 | 298 | static void close_ib_device(struct rdma_device *rdma_dev) 299 | { 300 | int ret; 301 | 302 | if (rdma_dev->cm_channel) { 303 | 304 | /* if we are using RDMA_CM then we just referance the cma's ibv_context */ 305 | rdma_dev->context = NULL; 306 | 307 | if (rdma_dev->cm_id) { 308 | DEBUG_LOG("rdma_destroy_id(%p)\n", rdma_dev->cm_id); 309 | ret = rdma_destroy_id(rdma_dev->cm_id); 310 | if (ret) { 311 | fprintf(stderr, "failure in rdma_destroy_id(), error %d\n", ret); 312 | } 313 | } 314 | 315 | DEBUG_LOG("rdma_destroy_event_channel(%p)\n", rdma_dev->cm_id); 316 | rdma_destroy_event_channel(rdma_dev->cm_channel); 317 | } 318 | 319 | if (rdma_dev->context) { 320 | DEBUG_LOG("ibv_close_device(%p)\n", rdma_dev->context); 321 | ret = ibv_close_device(rdma_dev->context); 322 | if (ret) { 323 | fprintf(stderr, "failure in ibv_close_device(), error %d\n", ret); 324 | } 325 | } 326 | } 327 | 328 | /*********************************************************************************** 329 | * Fill portinfo structure, get lid and gid from portinfo 330 | * Return value: 0 - success, 1 - error 331 | ****************************************************************************************/ 332 | static int rdma_set_lid_gid_from_port_info(struct rdma_device *rdma_dev) 333 | { 334 | struct ibv_port_attr portinfo; 335 | int ret_val; 336 | 337 | ret_val = ibv_query_port(rdma_dev->context, rdma_dev->ib_port, &portinfo); 338 | if (ret_val) { 339 | fprintf(stderr, "Couldn't get port 
info\n"); 340 | return 1; 341 | } 342 | 343 | rdma_dev->mtu = portinfo.active_mtu; 344 | rdma_dev->lid = portinfo.lid; 345 | if ((portinfo.link_layer != IBV_LINK_LAYER_ETHERNET) && (!portinfo.lid)) { 346 | fprintf(stderr, "Couldn't get local LID\n"); 347 | return 1; 348 | } 349 | 350 | if (rdma_dev->cm_id && portinfo.link_layer == IBV_LINK_LAYER_ETHERNET) { 351 | rdma_dev->gidx = ibv_find_sgid_type(rdma_dev->context, rdma_dev->ib_port, 352 | IBV_GID_TYPE_ROCE_V2, rdma_dev->cm_id->route.addr.src_addr.sa_family); 353 | } 354 | 355 | if (rdma_dev->gidx < 0) { 356 | if (portinfo.link_layer == IBV_LINK_LAYER_ETHERNET) { 357 | fprintf(stderr, "Wrong GID index (%d) for ETHERNET port\n", rdma_dev->gidx); 358 | return 1; 359 | } else { 360 | memset(&(rdma_dev->gid), 0, sizeof rdma_dev->gid); 361 | } 362 | } else /* rdma_dev->gidx >= 0*/ { 363 | ret_val = ibv_query_gid(rdma_dev->context, rdma_dev->ib_port, rdma_dev->gidx, &(rdma_dev->gid)); 364 | if (ret_val) { 365 | fprintf(stderr, "can't read GID of index %d, error code %d\n", rdma_dev->gidx, ret_val); 366 | return 1; 367 | } 368 | DEBUG_LOG ("my gid idx: %d, value:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x\n", rdma_dev->gidx, 369 | rdma_dev->gid.raw[0], rdma_dev->gid.raw[1], rdma_dev->gid.raw[2], rdma_dev->gid.raw[3], 370 | rdma_dev->gid.raw[4], rdma_dev->gid.raw[5], rdma_dev->gid.raw[6], rdma_dev->gid.raw[7], 371 | rdma_dev->gid.raw[8], rdma_dev->gid.raw[9], rdma_dev->gid.raw[10], rdma_dev->gid.raw[11], 372 | rdma_dev->gid.raw[12], rdma_dev->gid.raw[13], rdma_dev->gid.raw[14], rdma_dev->gid.raw[15] ); 373 | } 374 | rdma_dev->is_global = (rdma_dev->gid.global.interface_id != 0); 375 | 376 | DEBUG_LOG ("link_layer:%s, lid:%d, is_global:%d, MTU:%d Bytes\n", 377 | (portinfo.link_layer == IBV_LINK_LAYER_ETHERNET ? 
"ETH" : "IB"), 378 | rdma_dev->lid, rdma_dev->is_global, (256<<(rdma_dev->mtu - 1))); 379 | 380 | return 0; 381 | } 382 | 383 | /**************************************************************************************** 384 | * Modify target QP state to RTR (on the client side) 385 | * Return value: 0 - success, 1 - error 386 | ****************************************************************************************/ 387 | static int modify_target_qp_to_rtr(struct rdma_device *rdma_dev) 388 | { 389 | struct ibv_qp_attr qp_attr; 390 | enum ibv_qp_attr_mask attr_mask; 391 | 392 | memset(&qp_attr, 0, sizeof qp_attr); 393 | qp_attr.qp_state = IBV_QPS_RTR; 394 | qp_attr.path_mtu = rdma_dev->mtu; 395 | qp_attr.min_rnr_timer = 16; 396 | qp_attr.ah_attr.port_num = rdma_dev->ib_port; 397 | 398 | if (rdma_dev->gid.global.interface_id) { 399 | qp_attr.ah_attr.is_global = 1; 400 | qp_attr.ah_attr.grh.hop_limit = 1; 401 | qp_attr.ah_attr.grh.sgid_index = rdma_dev->gidx; 402 | qp_attr.ah_attr.grh.traffic_class = TC_PRIO << 5; // <<3 for dscp2prio, <<2 for ECN bits 403 | } 404 | attr_mask = IBV_QP_STATE | 405 | IBV_QP_AV | 406 | IBV_QP_PATH_MTU | 407 | IBV_QP_MIN_RNR_TIMER; // for DCT 408 | 409 | DEBUG_LOG("ibv_modify_qp(qp = %p, qp_attr.qp_state = %d, attr_mask = 0x%x)\n", 410 | rdma_dev->qp, qp_attr.qp_state, attr_mask); 411 | if (ibv_modify_qp(rdma_dev->qp, &qp_attr, attr_mask)) { 412 | fprintf(stderr, "Failed to modify QP to RTR\n"); 413 | return 1; 414 | } 415 | DEBUG_LOG ("ibv_modify_qp to state %d completed: qp_num = 0x%x\n", qp_attr.qp_state, rdma_dev->qp->qp_num); 416 | 417 | return 0; 418 | } 419 | 420 | /**************************************************************************************** 421 | * Modify source QP state to RTR and then to RTS (on the server side) 422 | * Return value: 0 - success, 1 - error 423 | ****************************************************************************************/ 424 | static int modify_source_qp_to_rtr_and_rts(struct rdma_device 
*rdma_dev) 425 | { 426 | struct ibv_qp_attr qp_attr; 427 | enum ibv_qp_attr_mask attr_mask; 428 | 429 | memset(&qp_attr, 0, sizeof qp_attr); 430 | 431 | /* - - - - - - - Modify QP to RTR - - - - - - - */ 432 | qp_attr.qp_state = IBV_QPS_RTR; 433 | qp_attr.path_mtu = rdma_dev->mtu; 434 | qp_attr.ah_attr.port_num = rdma_dev->ib_port; 435 | 436 | if (rdma_dev->gid.global.interface_id) { 437 | qp_attr.ah_attr.is_global = 1; 438 | qp_attr.ah_attr.grh.hop_limit = 1; 439 | qp_attr.ah_attr.grh.sgid_index = rdma_dev->gidx; 440 | qp_attr.ah_attr.grh.traffic_class = TC_PRIO << 5; // <<3 for dscp2prio, <<2 for ECN bits 441 | } 442 | attr_mask = IBV_QP_STATE | 443 | IBV_QP_AV | 444 | IBV_QP_PATH_MTU ; 445 | 446 | DEBUG_LOG("ibv_modify_qp(qp = %p, qp_attr.qp_state = %d, attr_mask = 0x%x)\n", 447 | rdma_dev->qp, qp_attr.qp_state, attr_mask); 448 | if (ibv_modify_qp(rdma_dev->qp, &qp_attr, attr_mask)) { 449 | fprintf(stderr, "Failed to modify QP to RTR\n"); 450 | return 1; 451 | } 452 | DEBUG_LOG ("ibv_modify_qp to state %d completed: qp_num = 0x%lx\n", qp_attr.qp_state, rdma_dev->qp->qp_num); 453 | 454 | /* - - - - - - - Modify QP to RTS - - - - - - - */ 455 | qp_attr.qp_state = IBV_QPS_RTS; 456 | qp_attr.timeout = 16; 457 | qp_attr.retry_cnt = 7; 458 | qp_attr.rnr_retry = 7; 459 | //qp_attr.sq_psn = 0; 460 | qp_attr.max_rd_atomic = 1; 461 | attr_mask = IBV_QP_STATE | 462 | IBV_QP_TIMEOUT | 463 | IBV_QP_RETRY_CNT | 464 | IBV_QP_RNR_RETRY | 465 | IBV_QP_SQ_PSN | 466 | IBV_QP_MAX_QP_RD_ATOMIC ; 467 | DEBUG_LOG("ibv_modify_qp(qp = %p, qp_attr.qp_state = %d, attr_mask = 0x%x)\n", 468 | rdma_dev->qp, qp_attr.qp_state, attr_mask); 469 | if (ibv_modify_qp(rdma_dev->qp, &qp_attr, attr_mask)) { 470 | fprintf(stderr, "Failed to modify QP to RTS\n"); 471 | return 1; 472 | } 473 | DEBUG_LOG ("ibv_modify_qp to state %d completed: qp_num = 0x%lx\n", qp_attr.qp_state, rdma_dev->qp->qp_num); 474 | 475 | return 0; 476 | } 477 | 478 | static int destroy_qp(struct ibv_qp *qp) 479 | { 480 | int ret; 
481 | if (qp) { 482 | DEBUG_LOG("ibv_destroy_qp(%p)\n", qp); 483 | ret = ibv_destroy_qp(qp); 484 | if (ret) { 485 | fprintf(stderr, "Couldn't destroy QP: error %d\n", ret); 486 | } 487 | } 488 | return ret; 489 | } 490 | 491 | static int modify_source_qp_rst2rts(struct rdma_device *rdma_dev) 492 | { 493 | int ret_val; 494 | /* - - - - - - - - - - Modify QP to INIT - - - - - - - - - - - - - */ 495 | struct ibv_qp_attr qp_attr = { 496 | .qp_state = IBV_QPS_INIT, 497 | .pkey_index = 0, 498 | .port_num = rdma_dev->ib_port, 499 | .qp_access_flags = IBV_ACCESS_LOCAL_WRITE 500 | }; 501 | enum ibv_qp_attr_mask attr_mask = IBV_QP_STATE | 502 | IBV_QP_PKEY_INDEX | 503 | IBV_QP_PORT | 504 | 0 /*IBV_QP_ACCESS_FLAGS*/; /*we must zero this bit for DCI QP*/ 505 | DEBUG_LOG("ibv_modify_qp(qp = %p, qp_attr.qp_state = %d, attr_mask = 0x%x)\n", 506 | rdma_dev->qp, qp_attr.qp_state, attr_mask); 507 | ret_val = ibv_modify_qp(rdma_dev->qp, &qp_attr, attr_mask); 508 | if (ret_val) { 509 | fprintf(stderr, "Failed to modify QP to INIT, error %d\n", ret_val); 510 | return 1; 511 | } 512 | DEBUG_LOG("ibv_modify_qp to state %d completed: qp_num = 0x%lx\n", qp_attr.qp_state, rdma_dev->qp->qp_num); 513 | 514 | /* - - - - - - - - - - - - - Modify QP to RTS - - - - - - - - - - - - */ 515 | ret_val = modify_source_qp_to_rtr_and_rts(rdma_dev); 516 | if (ret_val) { 517 | return 1; 518 | } 519 | 520 | rdma_dev->qpex->wr_flags = IBV_SEND_SIGNALED; 521 | 522 | return 0; 523 | } 524 | 525 | //============================================================================================ 526 | struct rdma_device *rdma_open_device_client(struct sockaddr *addr) 527 | { 528 | struct rdma_device *rdma_dev; 529 | int ret_val; 530 | 531 | rdma_dev = calloc(1, sizeof *rdma_dev); 532 | if (!rdma_dev) { 533 | fprintf(stderr, "rdma_device memory allocation failed\n"); 534 | return NULL; 535 | } 536 | 537 | /**************************************************************************************************** 538 | * In 
the next function we let rdma_cm find a IB device that matches the IP address of a the local netdev, 539 | * if yes, we return a pointer to that ib context 540 | * The result of this function is ib_dev - initialized pointer to the relevant struct ibv_device 541 | ****************************************************************************************************/ 542 | rdma_dev->context = open_ib_device_by_addr(rdma_dev, addr); 543 | if (!rdma_dev->context){ 544 | goto clean_rdma_dev; 545 | } 546 | 547 | ret_val = rdma_set_lid_gid_from_port_info(rdma_dev); 548 | if (ret_val) { 549 | goto clean_device; 550 | } 551 | 552 | /****************************************************************************************************/ 553 | 554 | DEBUG_LOG ("ibv_alloc_pd(ibv_context = %p)\n", rdma_dev->context); 555 | rdma_dev->pd = ibv_alloc_pd(rdma_dev->context); 556 | if (!rdma_dev->pd) { 557 | fprintf(stderr, "Couldn't allocate PD\n"); 558 | goto clean_device; 559 | } 560 | DEBUG_LOG("created pd %p\n", rdma_dev->pd); 561 | 562 | /* ********************************** Create CQ ********************************** */ 563 | #ifdef PRINT_LATENCY 564 | struct ibv_cq_init_attr_ex cq_attr_ex; 565 | 566 | memset(&cq_attr_ex, 0, sizeof(cq_attr_ex)); 567 | cq_attr_ex.cqe = CQ_DEPTH; 568 | cq_attr_ex.cq_context = rdma_dev; 569 | cq_attr_ex.channel = NULL; 570 | cq_attr_ex.comp_vector = 0; 571 | cq_attr_ex.wc_flags = IBV_WC_EX_WITH_COMPLETION_TIMESTAMP; 572 | 573 | DEBUG_LOG ("ibv_create_cq_ex(rdma_dev->context = %p, &cq_attr_ex)\n", rdma_dev->context); 574 | rdma_dev->cq = ibv_create_cq_ex(rdma_dev->context, &cq_attr_ex); 575 | #else /*PRINT_LATENCY*/ 576 | DEBUG_LOG ("ibv_create_cq(%p, %d, NULL, NULL, 0)\n", rdma_dev->context, CQ_DEPTH); 577 | rdma_dev->cq = ibv_create_cq(rdma_dev->context, CQ_DEPTH, NULL, NULL /*comp. 
events channel*/, 0); 578 | #endif /*PRINT_LATENCY*/ 579 | if (!rdma_dev->cq) { 580 | fprintf(stderr, "Couldn't create CQ\n"); 581 | goto clean_pd; 582 | } 583 | DEBUG_LOG("created cq %p\n", rdma_dev->cq); 584 | 585 | /* ********************************** Create SRQ ********************************** */ 586 | struct ibv_srq_init_attr srq_attr; 587 | memset(&srq_attr, 0, sizeof(srq_attr)); 588 | srq_attr.attr.max_wr = 2; 589 | srq_attr.attr.max_sge = 1; 590 | DEBUG_LOG ("ibv_create_srq(%p, %d, NULL, NULL, 0)\n", rdma_dev->context, CQ_DEPTH); 591 | rdma_dev->srq = ibv_create_srq(rdma_dev->pd, &srq_attr); 592 | if (!rdma_dev->srq) { 593 | fprintf(stderr, "ibv_create_srq failed\n"); 594 | goto clean_cq; 595 | } 596 | DEBUG_LOG("created srq %p\n", rdma_dev->srq); 597 | 598 | /* ********************************** Create QP ********************************** */ 599 | struct ibv_qp_init_attr_ex attr_ex; 600 | struct mlx5dv_qp_init_attr attr_dv; 601 | 602 | memset(&attr_ex, 0, sizeof(attr_ex)); 603 | memset(&attr_dv, 0, sizeof(attr_dv)); 604 | 605 | attr_ex.qp_type = IBV_QPT_DRIVER; 606 | #ifdef PRINT_LATENCY 607 | attr_ex.send_cq = ibv_cq_ex_to_cq(rdma_dev->cq); 608 | attr_ex.recv_cq = ibv_cq_ex_to_cq(rdma_dev->cq); 609 | #else /*PRINT_LATENCY*/ 610 | attr_ex.send_cq = rdma_dev->cq; 611 | attr_ex.recv_cq = rdma_dev->cq; 612 | #endif /*PRINT_LATENCY*/ 613 | 614 | attr_ex.comp_mask |= IBV_QP_INIT_ATTR_PD; 615 | attr_ex.pd = rdma_dev->pd; 616 | attr_ex.srq = rdma_dev->srq; /* Should use SRQ for client only (DCT) */ 617 | 618 | /* create DCT */ 619 | attr_dv.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_DC; 620 | attr_dv.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCT; 621 | attr_dv.dc_init_attr.dct_access_key = DC_KEY; 622 | 623 | DEBUG_LOG ("mlx5dv_create_qp(%p)\n", rdma_dev->context); 624 | rdma_dev->qp = mlx5dv_create_qp(rdma_dev->context, &attr_ex, &attr_dv); 625 | 626 | if (!rdma_dev->qp) { 627 | fprintf(stderr, "Couldn't create QP\n"); 628 | goto clean_srq; 629 | } 630 | DEBUG_LOG 
("mlx5dv_create_qp %p completed: qp_num = 0x%lx\n", rdma_dev->qp, rdma_dev->qp->qp_num); 631 | 632 | /* - - - - - - - Modify QP to INIT - - - - - - - */ 633 | struct ibv_qp_attr qp_attr = { 634 | .qp_state = IBV_QPS_INIT, 635 | .pkey_index = 0, 636 | .port_num = rdma_dev->ib_port, 637 | .qp_access_flags = IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE 638 | }; 639 | enum ibv_qp_attr_mask attr_mask = IBV_QP_STATE | 640 | IBV_QP_PKEY_INDEX | 641 | IBV_QP_PORT | 642 | IBV_QP_ACCESS_FLAGS; 643 | DEBUG_LOG ("ibv_modify_qp(qp = %p, qp_attr.qp_state = %d, attr_mask = 0x%x)\n", 644 | rdma_dev->qp, qp_attr.qp_state, attr_mask); 645 | ret_val = ibv_modify_qp(rdma_dev->qp, &qp_attr, attr_mask); 646 | if (ret_val) { 647 | fprintf(stderr, "Failed to modify QP to INIT, error %d\n", ret_val); 648 | goto clean_qp; 649 | } 650 | DEBUG_LOG ("ibv_modify_qp to state %d completed: qp_num = 0x%lx\n", qp_attr.qp_state, rdma_dev->qp->qp_num); 651 | 652 | ret_val = modify_target_qp_to_rtr(rdma_dev); 653 | if (ret_val) { 654 | goto clean_qp; 655 | } 656 | 657 | DEBUG_LOG("init AH cache\n"); 658 | kh_init_inplace(kh_ib_ah, &rdma_dev->ah_hash); 659 | 660 | #ifdef PRINT_LATENCY 661 | struct ibv_device_attr_ex device_attr_ex = {}; 662 | //struct ibv_query_device_ex_input query_device_ex_input = { 663 | // .comp_masc = ... 
664 | //} 665 | 666 | ret_val = ibv_query_device_ex(rdma_dev->context, /*struct ibv_query_device_ex_input*/NULL, &device_attr_ex); 667 | if (ret_val) { 668 | fprintf(stderr, "ibv_query_device_ex failed\n"); 669 | goto clean_qp; 670 | } 671 | if (!device_attr_ex.hca_core_clock) { 672 | fprintf(stderr, "hca_core_clock = 0\n"); 673 | goto clean_qp; 674 | } 675 | 676 | rdma_dev->hca_core_clock_kHz = device_attr_ex.hca_core_clock; 677 | DEBUG_LOG("hca_core_clock = %d kHz\n", rdma_dev->hca_core_clock_kHz); 678 | #endif /*PRINT_LATENCY*/ 679 | 680 | return rdma_dev; 681 | 682 | clean_qp: 683 | destroy_qp(rdma_dev->qp); 684 | 685 | clean_srq: 686 | if (rdma_dev->srq) { 687 | ibv_destroy_srq(rdma_dev->srq); 688 | } 689 | 690 | clean_cq: 691 | if (rdma_dev->cq) { 692 | #ifdef PRINT_LATENCY 693 | ibv_destroy_cq(ibv_cq_ex_to_cq(rdma_dev->cq)); 694 | #else /*PRINT_LATENCY*/ 695 | ibv_destroy_cq(rdma_dev->cq); 696 | #endif /*PRINT_LATENCY*/ 697 | } 698 | 699 | clean_pd: 700 | if (rdma_dev->pd) { 701 | ibv_dealloc_pd(rdma_dev->pd); 702 | } 703 | 704 | clean_device: 705 | close_ib_device(rdma_dev); 706 | 707 | clean_rdma_dev: 708 | free(rdma_dev); 709 | 710 | return NULL; 711 | } 712 | 713 | //============================================================================================ 714 | struct rdma_device *rdma_open_device_server(struct sockaddr *addr) 715 | { 716 | struct rdma_device *rdma_dev; 717 | int ret_val; 718 | 719 | rdma_dev = calloc(1, sizeof *rdma_dev); 720 | if (!rdma_dev) { 721 | fprintf(stderr, "rdma_device memory allocation failed\n"); 722 | return NULL; 723 | } 724 | 725 | /**************************************************************************************************** 726 | * In the next function we let rdma_cm find a IB device that matches the IP address of a the local netdev, 727 | * if yes, we return a pointer to that ib context 728 | * The result of this function is ib_dev - initialized pointer to the relevant struct ibv_device 729 | 
****************************************************************************************************/ 730 | rdma_dev->context = open_ib_device_by_addr(rdma_dev, addr); 731 | if (!rdma_dev->context){ 732 | goto clean_rdma_dev; 733 | } 734 | 735 | ret_val = rdma_set_lid_gid_from_port_info(rdma_dev); 736 | if (ret_val) { 737 | goto clean_device; 738 | } 739 | 740 | /****************************************************************************************************/ 741 | 742 | DEBUG_LOG ("ibv_alloc_pd(ibv_context = %p)\n", rdma_dev->context); 743 | rdma_dev->pd = ibv_alloc_pd(rdma_dev->context); 744 | if (!rdma_dev->pd) { 745 | fprintf(stderr, "Couldn't allocate PD\n"); 746 | goto clean_device; 747 | } 748 | DEBUG_LOG("created pd %p\n", rdma_dev->pd); 749 | 750 | /* We don't create completion events channel (ibv_create_comp_channel), we prefer working in polling mode */ 751 | 752 | /* ********************************** Create CQ ********************************** */ 753 | #ifdef PRINT_LATENCY 754 | struct ibv_cq_init_attr_ex cq_attr_ex; 755 | 756 | memset(&cq_attr_ex, 0, sizeof(cq_attr_ex)); 757 | cq_attr_ex.cqe = CQ_DEPTH; 758 | cq_attr_ex.cq_context = rdma_dev; 759 | cq_attr_ex.channel = NULL; 760 | cq_attr_ex.comp_vector = 0; 761 | cq_attr_ex.wc_flags = IBV_WC_EX_WITH_COMPLETION_TIMESTAMP; 762 | 763 | DEBUG_LOG ("ibv_create_cq_ex(rdma_dev->context = %p, &cq_attr_ex)\n", rdma_dev->context); 764 | rdma_dev->cq = ibv_create_cq_ex(rdma_dev->context, &cq_attr_ex); 765 | #else /*PRINT_LATENCY*/ 766 | DEBUG_LOG ("ibv_create_cq(%p, %d, NULL, NULL, 0)\n", rdma_dev->context, CQ_DEPTH); 767 | rdma_dev->cq = ibv_create_cq(rdma_dev->context, CQ_DEPTH, NULL, NULL /*comp. 
events channel*/, 0); 768 | #endif /*PRINT_LATENCY*/ 769 | if (!rdma_dev->cq) { 770 | fprintf(stderr, "Couldn't create CQ\n"); 771 | goto clean_pd; 772 | } 773 | DEBUG_LOG("created cq %p\n", rdma_dev->cq); 774 | 775 | /* We don't create SRQ for DCI (server) side */ 776 | 777 | /* ********************************** Create QP ********************************** */ 778 | struct ibv_qp_init_attr_ex attr_ex; 779 | struct mlx5dv_qp_init_attr attr_dv; 780 | 781 | memset(&attr_ex, 0, sizeof(attr_ex)); 782 | memset(&attr_dv, 0, sizeof(attr_dv)); 783 | 784 | attr_ex.qp_type = IBV_QPT_DRIVER; 785 | #ifdef PRINT_LATENCY 786 | attr_ex.send_cq = ibv_cq_ex_to_cq(rdma_dev->cq); 787 | attr_ex.recv_cq = ibv_cq_ex_to_cq(rdma_dev->cq); 788 | #else /*PRINT_LATENCY*/ 789 | attr_ex.send_cq = rdma_dev->cq; 790 | attr_ex.recv_cq = rdma_dev->cq; 791 | #endif /*PRINT_LATENCY*/ 792 | 793 | attr_ex.comp_mask |= IBV_QP_INIT_ATTR_PD; 794 | attr_ex.pd = rdma_dev->pd; 795 | 796 | /* create DCI */ 797 | attr_dv.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_DC; 798 | attr_dv.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI; 799 | 800 | attr_ex.cap.max_send_wr = SEND_Q_DEPTH; 801 | attr_ex.cap.max_send_sge = MAX_SEND_SGE; 802 | rdma_dev->qp_available_wr = SEND_Q_DEPTH; 803 | 804 | attr_ex.comp_mask |= IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; 805 | attr_ex.send_ops_flags = IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_READ; 806 | 807 | attr_dv.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS; 808 | attr_dv.create_flags |= MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE; /*driver doesnt support scatter2cqe data-path on DCI yet*/ 809 | 810 | DEBUG_LOG ("mlx5dv_create_qp(%p)\n", rdma_dev->context); 811 | rdma_dev->qp = mlx5dv_create_qp(rdma_dev->context, &attr_ex, &attr_dv); 812 | DEBUG_LOG ("mlx5dv_create_qp %p completed: qp_num = 0x%lx\n", rdma_dev->qp, rdma_dev->qp->qp_num); 813 | 814 | if (!rdma_dev->qp) { 815 | fprintf(stderr, "Couldn't create QP\n"); 816 | goto clean_cq; 817 | } 818 | rdma_dev->qpex = 
ibv_qp_to_qp_ex(rdma_dev->qp); 819 | if (!rdma_dev->qpex) { 820 | fprintf(stderr, "Couldn't create QPEX\n"); 821 | goto clean_qp; 822 | } 823 | rdma_dev->mqpex = mlx5dv_qp_ex_from_ibv_qp_ex(rdma_dev->qpex); 824 | if (!rdma_dev->mqpex) { 825 | fprintf(stderr, "Couldn't create MQPEX\n"); 826 | goto clean_qp; 827 | } 828 | ret_val = modify_source_qp_rst2rts(rdma_dev); 829 | if (ret_val) { 830 | goto clean_qp; 831 | } 832 | 833 | DEBUG_LOG("init AH cache\n"); 834 | kh_init_inplace(kh_ib_ah, &rdma_dev->ah_hash); 835 | 836 | #ifdef PRINT_LATENCY 837 | struct ibv_device_attr_ex device_attr_ex = {}; 838 | //struct ibv_query_device_ex_input query_device_ex_input = { 839 | // .comp_masc = ... 840 | //} 841 | 842 | ret_val = ibv_query_device_ex(rdma_dev->context, /*struct ibv_query_device_ex_input*/NULL, &device_attr_ex); 843 | if (ret_val) { 844 | fprintf(stderr, "ibv_query_device_ex failed\n"); 845 | goto clean_qp; 846 | } 847 | if (!device_attr_ex.hca_core_clock) { 848 | fprintf(stderr, "hca_core_clock = 0\n"); 849 | goto clean_qp; 850 | } 851 | 852 | rdma_dev->hca_core_clock_kHz = device_attr_ex.hca_core_clock; 853 | DEBUG_LOG("hca_core_clock = %d kHz\n", rdma_dev->hca_core_clock_kHz); 854 | 855 | rdma_dev->min_wr_complete_latency = 0x8FFFFFFFFFFFFFFF; 856 | rdma_dev->min_completion_latency = 0x8FFFFFFFFFFFFFFF; 857 | rdma_dev->min_read_comp_latency = 0x8FFFFFFFFFFFFFFF; 858 | #endif /*PRINT_LATENCY*/ 859 | 860 | return rdma_dev; 861 | 862 | clean_qp: 863 | destroy_qp(rdma_dev->qp); 864 | 865 | clean_cq: 866 | if (rdma_dev->cq) { 867 | #ifdef PRINT_LATENCY 868 | ibv_destroy_cq(ibv_cq_ex_to_cq(rdma_dev->cq)); 869 | #else /*PRINT_LATENCY*/ 870 | ibv_destroy_cq(rdma_dev->cq); 871 | #endif /*PRINT_LATENCY*/ 872 | } 873 | 874 | clean_pd: 875 | if (rdma_dev->pd) { 876 | ibv_dealloc_pd(rdma_dev->pd); 877 | } 878 | 879 | clean_device: 880 | close_ib_device(rdma_dev); 881 | 882 | clean_rdma_dev: 883 | free(rdma_dev); 884 | 885 | return NULL; 886 | } 887 | 888 | 
//===========================================================================================
/* Post one logical RDMA Read/Write task on the server's DCI QP.
 * A task that carries an iovec list is split into ceil(iovcnt / MAX_SEND_SGE)
 * work requests; only the last WR of the chain is signaled. Bookkeeping for
 * the completion handler is kept in device->app_wr_id[].
 * Returns 0 on success, non-zero on failure (queue full / verbs error). */
static
int rdma_exec_task(struct rdma_exec_params *exec_params)
{
	int    ret_val;
	/* Number of WRs this task needs: one per MAX_SEND_SGE chunk of the iovec
	 * list, or a single WR for the contiguous (iovcnt == 0) case. */
	int    required_wr = (exec_params->local_buf_iovcnt) ? (exec_params->local_buf_iovcnt + MAX_SEND_SGE - 1) / MAX_SEND_SGE : 1;
	if (required_wr > exec_params->device->qp_available_wr) {
		fprintf(stderr, "Required WR number %d is greater than available in QP WRs %d\n",
			required_wr, exec_params->device->qp_available_wr);
		return 1;
	}
	/* Select the verb once: the server reads from the client's buffer when the
	 * client is sending data, writes into it when the client is receiving. */
	void (*ibv_wr_rdma_rw_post)(struct ibv_qp_ex *qp, uint32_t rkey, uint64_t remote_addr) = (exec_params->flags & RDMA_TASK_ATTR_RDMA_READ)
			? ibv_wr_rdma_read   // client wants to send data to the server
			: ibv_wr_rdma_write; // client wants to receive data from the server

	/* RDMA Read/Write for DCI connect, this will create cqe->ts_start */
	DEBUG_LOG_FAST_PATH("RDMA Read/Write: ibv_wr_start: qpex = %p\n", exec_params->device->qpex);
	ibv_wr_start(exec_params->device->qpex);
#ifdef PRINT_LATENCY
	struct ibv_values_ex ts_values = {
		.comp_mask = IBV_VALUES_MASK_RAW_CLOCK,
		.raw_clock = {} /*struct timespec*/
	};

	ret_val = ibv_query_rt_values_ex(exec_params->device->context, &ts_values);
	if (ret_val) {
		fprintf(stderr, "ibv_query_rt_values_ex failed after ibv_wr_start call\n");
		return 1;
	}
#endif /*PRINT_LATENCY*/

	// The following code should be atomic operation
	// NOTE(review): not actually atomic - unsafe if tasks are posted from
	// multiple threads concurrently; appears to assume a single poster thread.
	int wr_id_idx = exec_params->device->app_wr_id_idx++;
	if (exec_params->device->app_wr_id_idx >= SEND_Q_DEPTH) {
		exec_params->device->app_wr_id_idx = 0;
	}
	// end of atomic operation

#ifdef PRINT_LATENCY
	exec_params->device->latency[wr_id_idx].wr_start_ts = ts_values.raw_clock.tv_nsec; /*the value in hca clocks*/
#endif /*PRINT_LATENCY*/

	// update internal wr_id DB: reserve the credits and remember how many WRs
	// belong to this task so the completion handler can release them together.
	exec_params->device->qp_available_wr -= required_wr;
	exec_params->device->app_wr_id[wr_id_idx].num_wrs = required_wr;
	exec_params->device->app_wr_id[wr_id_idx].wr_id   = exec_params->wr_id;
	exec_params->device->app_wr_id[wr_id_idx].flags   = WR_ID_FLAGS_ACTIVE;

	exec_params->device->qpex->wr_id = (uint64_t)wr_id_idx;

	if (exec_params->local_buf_iovcnt) {
		/* Scatter/gather path: chop the iovec list into MAX_SEND_SGE-sized WRs,
		 * advancing the remote address by the bytes covered by each chunk. */
		int i, start_i = 0;
		struct ibv_sge sg_list[MAX_SEND_SGE];
		uint64_t curr_rem_addr = (uint64_t)exec_params->rem_buf_addr;
		int num_sges_to_send = exec_params->local_buf_iovcnt;

		while (num_sges_to_send > 0) {
			int curr_iovcnt = mmin(MAX_SEND_SGE, num_sges_to_send);
			/* Signal only the last WR of the chain */
			exec_params->device->qpex->wr_flags = num_sges_to_send > MAX_SEND_SGE ? 0 : IBV_SEND_SIGNALED;

			DEBUG_LOG_FAST_PATH("RDMA Read/Write: ibv_wr_rdma_%s: wr_id=0x%llx, qpex=%p, rkey=0x%lx, remote_buf=0x%llx\n",
					    exec_params->flags & RDMA_TASK_ATTR_RDMA_READ ? "read" : "write",
					    (long long unsigned int)exec_params->wr_id, exec_params->device->qpex, exec_params->rem_buf_rkey, (long long unsigned int)curr_rem_addr);
			ibv_wr_rdma_rw_post(exec_params->device->qpex, exec_params->rem_buf_rkey, curr_rem_addr);

			for (i = 0; i < curr_iovcnt; i++) {
				sg_list[i].addr   = (uint64_t)exec_params->local_buf_iovec[start_i + i].iov_base;
				sg_list[i].length = (uint32_t)exec_params->local_buf_iovec[start_i + i].iov_len;
				sg_list[i].lkey   = exec_params->local_buf_mr_lkey;
				curr_rem_addr += sg_list[i].length;
			}

			DEBUG_LOG_FAST_PATH("RDMA Read/Write: ibv_wr_set_sge_list(qpex=%p, num_sge=%lu, sg_list=%p), start_i=%d, num_sges_to_send=%d, sg[0].length=%u\n",
					    exec_params->device->qpex, (size_t)curr_iovcnt, (void*)sg_list, start_i, num_sges_to_send, sg_list[0].length);
			ibv_wr_set_sge_list(exec_params->device->qpex, (size_t)curr_iovcnt, sg_list);
			num_sges_to_send -= curr_iovcnt;
			start_i += curr_iovcnt;


			DEBUG_LOG_FAST_PATH("RDMA Read/Write: mlx5dv_wr_set_dc_addr: mqpex=%p, ah=%p, rem_dctn=0x%06lx\n",
					    exec_params->device->mqpex, exec_params->ah, exec_params->rem_dctn);
			mlx5dv_wr_set_dc_addr(exec_params->device->mqpex, exec_params->ah, exec_params->rem_dctn, DC_KEY);
		}
	} else {
		/* Contiguous path: a single signaled WR covering rem_buf_size bytes */
		exec_params->device->qpex->wr_flags = IBV_SEND_SIGNALED;

		DEBUG_LOG_FAST_PATH("RDMA Read/Write: ibv_wr_rdma_%s: wr_id=0x%llx, qpex=%p, rkey=0x%lx, remote_buf=0x%llx\n",
				    exec_params->flags & RDMA_TASK_ATTR_RDMA_READ ? "read" : "write",
				    (long long unsigned int)exec_params->wr_id, exec_params->device->qpex, exec_params->rem_buf_rkey, (unsigned long long)exec_params->rem_buf_addr);

		ibv_wr_rdma_rw_post(exec_params->device->qpex, exec_params->rem_buf_rkey, exec_params->rem_buf_addr);

		DEBUG_LOG_FAST_PATH("RDMA Read/Write: ibv_wr_set_sge: qpex=%p, lkey=0x%x, local_buf=0x%llx, size=%u\n",
				    exec_params->device->qpex, exec_params->local_buf_mr_lkey,
				    (unsigned long long)exec_params->local_buf_addr, exec_params->rem_buf_size);
		ibv_wr_set_sge(exec_params->device->qpex, exec_params->local_buf_mr_lkey, (uintptr_t)exec_params->local_buf_addr, exec_params->rem_buf_size);

		DEBUG_LOG_FAST_PATH("RDMA Read/Write: mlx5dv_wr_set_dc_addr: mqpex=%p, ah=%p, rem_dctn=0x%06lx\n",
				    exec_params->device->mqpex, exec_params->ah, exec_params->rem_dctn);
		mlx5dv_wr_set_dc_addr(exec_params->device->mqpex, exec_params->ah, exec_params->rem_dctn, DC_KEY);
	}

	/* ring DB */
	DEBUG_LOG_FAST_PATH("ibv_wr_complete: qpex=%p, required_wr=%d\n", exec_params->device->qpex, required_wr);
	ret_val = ibv_wr_complete(exec_params->device->qpex);
	if (ret_val) {
		DEBUG_LOG_FAST_PATH("FAILURE: ibv_wr_complete (error=%d\n", ret_val);
		return ret_val;
	}
#ifdef PRINT_LATENCY
	ret_val = ibv_query_rt_values_ex(exec_params->device->context, &ts_values);
	if (ret_val) {
1000 | fprintf(stderr, "ibv_query_rt_values_ex failed after ibv_wr_start call\n"); 1001 | return 1; 1002 | } 1003 | exec_params->device->latency[wr_id_idx].wr_complete_ts = ts_values.raw_clock.tv_nsec; /*the value in hca clocks*/ 1004 | #endif /*PRINT_LATENCY*/ 1005 | return ret_val; 1006 | } 1007 | //=========================================================================================== 1008 | 1009 | int rdma_reset_device(struct rdma_device *device) 1010 | { 1011 | if (!is_server(device)) { 1012 | fprintf(stderr, "Method \"rdma_reset_device()\" could be executed only by server side!\n"); 1013 | return EOPNOTSUPP; 1014 | } 1015 | struct ibv_qp_attr qp_attr; 1016 | enum ibv_qp_attr_mask attr_mask; 1017 | memset(&qp_attr, 0, sizeof qp_attr); 1018 | 1019 | /* - - - - - - - Modify QP to ERR - - - - - - - */ 1020 | qp_attr.qp_state = IBV_QPS_ERR; 1021 | attr_mask = IBV_QP_STATE; 1022 | DEBUG_LOG("ibv_modify_qp(qp = %p, qp_attr.qp_state = %d, attr_mask = 0x%x)\n", 1023 | device->qp, qp_attr.qp_state, attr_mask); 1024 | if (ibv_modify_qp(device->qp, &qp_attr, attr_mask)) { 1025 | fprintf(stderr, "Failed to modify QP to ERR\n"); 1026 | return 1; 1027 | } 1028 | 1029 | /* - - - - - - - FLUSH WORK COMPLETIONS - - - - - - - */ 1030 | struct rdma_exec_params exec_params; 1031 | memset(&exec_params, 0, sizeof exec_params); 1032 | khiter_t ah_itr = 0; 1033 | for (ah_itr = kh_begin(&device->ah_hash); ah_itr != kh_end(&device->ah_hash); ++ah_itr) { 1034 | if (kh_exist(&device->ah_hash, ah_itr) && kh_value(&device->ah_hash, ah_itr) != NULL) { 1035 | exec_params.ah = kh_value(&device->ah_hash, ah_itr); 1036 | } 1037 | } 1038 | if (exec_params.ah) { 1039 | exec_params.wr_id = WR_ID_FLUSH_MARKER; 1040 | exec_params.device = device; 1041 | 1042 | DEBUG_LOG_FAST_PATH("Posting FLUSH MARKER on queue\n"); 1043 | rdma_exec_task(&exec_params); 1044 | 1045 | DEBUG_LOG_FAST_PATH("Flushing Work Completions\n"); 1046 | struct rdma_completion_event rdma_comp_ev[COMP_ARRAY_SIZE]; 1047 | int 
flushed = 0; 1048 | do { 1049 | int i, reported_ev = 0; 1050 | reported_ev = rdma_poll_completions(device, &rdma_comp_ev[reported_ev], COMP_ARRAY_SIZE); 1051 | for (i = 0; !flushed && i < reported_ev; i++) { 1052 | flushed = rdma_comp_ev[i].wr_id == WR_ID_FLUSH_MARKER; 1053 | } 1054 | } while (!flushed); 1055 | DEBUG_LOG_FAST_PATH("Finished Work Completions flushing\n"); 1056 | } 1057 | 1058 | /* - - - - - - - RESET RDMA_DEVICE MEMBERS - - - - - - - */ 1059 | memset(device->app_wr_id, 0, sizeof(device->app_wr_id)); 1060 | device->app_wr_id_idx = 0; 1061 | device->qp_available_wr = SEND_Q_DEPTH; 1062 | /* - - - - - - - Modify QP to RESET - - - - - - - */ 1063 | qp_attr.qp_state = IBV_QPS_RESET; 1064 | attr_mask = IBV_QP_STATE; 1065 | DEBUG_LOG("ibv_modify_qp(qp = %p, qp_attr.qp_state = %d, attr_mask = 0x%x)\n", 1066 | device->qp, qp_attr.qp_state, attr_mask); 1067 | if (ibv_modify_qp(device->qp, &qp_attr, attr_mask)) { 1068 | fprintf(stderr, "Failed to modify QP to RESET\n"); 1069 | return 1; 1070 | } 1071 | DEBUG_LOG ("ibv_modify_qp to state %d completed.\n", qp_attr.qp_state, device->qp->qp_num); 1072 | 1073 | /* - - - - - - - Modify QP to RTS (RESET->INIT->RTR->RTS) - - - - - - - */ 1074 | return modify_source_qp_rst2rts(device); 1075 | } 1076 | 1077 | //============================================================================================ 1078 | void rdma_close_device(struct rdma_device *rdma_dev) 1079 | { 1080 | int ret_val; 1081 | struct ibv_ah *ah; 1082 | 1083 | if (rdma_dev->rdma_buff_cnt > 0) { 1084 | fprintf(stderr, "The number of attached RDMA buffers is not zero (%d). 
Can't close device.\n", 1085 | rdma_dev->rdma_buff_cnt); 1086 | return; 1087 | } 1088 | #ifdef PRINT_LATENCY 1089 | if (rdma_dev->measure_index) { 1090 | DEBUG_LOG("PRINT_LATENCY: %6lu wr-s, wr_sent latency: min %8lu, max %8lu, avg %8lu (nSec)\n", 1091 | rdma_dev->measure_index, 1092 | rdma_dev->min_wr_complete_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1093 | rdma_dev->max_wr_complete_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1094 | rdma_dev->wr_complete_latency_sum / rdma_dev->measure_index * 1000000 / rdma_dev->hca_core_clock_kHz); 1095 | 1096 | DEBUG_LOG("PRINT_LATENCY: completion latency : min %8lu, max %8lu, avg %8lu (nSec)\n", 1097 | rdma_dev->min_completion_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1098 | rdma_dev->max_completion_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1099 | rdma_dev->completion_latency_sum / rdma_dev->measure_index * 1000000 / rdma_dev->hca_core_clock_kHz); 1100 | 1101 | DEBUG_LOG("PRINT_LATENCY: read_comp latency : min %8lu, max %8lu, avg %8lu (nSec)\n", 1102 | rdma_dev->min_read_comp_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1103 | rdma_dev->max_read_comp_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1104 | rdma_dev->read_comp_latency_sum / rdma_dev->measure_index * 1000000 / rdma_dev->hca_core_clock_kHz); 1105 | 1106 | fflush(stdout); 1107 | } 1108 | #endif /*PRINT_LATENCY*/ 1109 | ret_val = destroy_qp(rdma_dev->qp); 1110 | if (ret_val) { 1111 | return; 1112 | } 1113 | 1114 | if (rdma_dev->srq) { 1115 | DEBUG_LOG("ibv_destroy_srq(%p)\n", rdma_dev->srq); 1116 | ret_val = ibv_destroy_srq(rdma_dev->srq); 1117 | if (ret_val) { 1118 | fprintf(stderr, "Couldn't destroy SRQ\n"); 1119 | return; 1120 | } 1121 | } 1122 | 1123 | DEBUG_LOG("ibv_destroy_cq(%p)\n", rdma_dev->cq); 1124 | #ifdef PRINT_LATENCY 1125 | ibv_destroy_cq(ibv_cq_ex_to_cq(rdma_dev->cq)); 1126 | #else /*PRINT_LATENCY*/ 1127 | ibv_destroy_cq(rdma_dev->cq); 1128 | #endif /*PRINT_LATENCY*/ 1129 | if (ret_val) { 1130 | fprintf(stderr, 
"Couldn't destroy CQ, error %d\n", ret_val); 1131 | return; 1132 | } 1133 | 1134 | DEBUG_LOG("destroy ibv_ah's\n"); 1135 | kh_foreach_value(&rdma_dev->ah_hash, ah, ibv_destroy_ah(ah)); 1136 | 1137 | DEBUG_LOG("ibv_dealloc_pd(%p)\n", rdma_dev->pd); 1138 | ret_val = ibv_dealloc_pd(rdma_dev->pd); 1139 | if (ret_val) { 1140 | fprintf(stderr, "Couldn't deallocate PD, error %d\n", ret_val); 1141 | return; 1142 | } 1143 | 1144 | DEBUG_LOG("destroy AH cache\n"); 1145 | kh_destroy_inplace(kh_ib_ah, &rdma_dev->ah_hash); 1146 | 1147 | close_ib_device(rdma_dev); 1148 | 1149 | free(rdma_dev); 1150 | 1151 | return; 1152 | } 1153 | 1154 | //============================================================================================ 1155 | struct rdma_buffer *rdma_buffer_reg(struct rdma_device *rdma_dev, void *addr, size_t length) 1156 | { 1157 | struct rdma_buffer *rdma_buff; 1158 | int ret_val; 1159 | 1160 | rdma_buff = calloc(1, sizeof *rdma_buff); 1161 | if (!rdma_buff) { 1162 | fprintf(stderr, "rdma_buff memory allocation failed\n"); 1163 | return NULL; 1164 | } 1165 | 1166 | enum ibv_access_flags access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE; 1167 | /*In the case of local buffer we can use IBV_ACCESS_LOCAL_WRITE only flag*/ 1168 | DEBUG_LOG("ibv_reg_mr(pd %p, buf %p, size = %lu, access_flags = 0x%08x\n", 1169 | rdma_dev->pd, addr, length, access_flags); 1170 | rdma_buff->mr = ibv_reg_mr(rdma_dev->pd, addr, length, access_flags); 1171 | if (!rdma_buff->mr) { 1172 | fprintf(stderr, "Couldn't register GPU MR\n"); 1173 | goto clean_rdma_buff; 1174 | } 1175 | DEBUG_LOG("ibv_reg_mr completed: buf %p, size = %lu, rkey = 0x%08x\n", 1176 | addr, length, rdma_buff->mr->rkey); 1177 | 1178 | rdma_buff->buf_addr = addr; 1179 | rdma_buff->buf_size = length; 1180 | rdma_buff->rkey = rdma_buff->mr->rkey; /*not used for local buffer case*/ 1181 | rdma_buff->rdma_dev = rdma_dev; 1182 | rdma_dev->rdma_buff_cnt++; 1183 | 1184 | return rdma_buff; 1185 | 
1186 | clean_rdma_buff: 1187 | /* We don't decrement device rdma_buff_cnt because we still did not increment it, 1188 | we just free the allocated for rdma_buff memory. */ 1189 | free(rdma_buff); 1190 | 1191 | return NULL; 1192 | } 1193 | 1194 | //============================================================================================ 1195 | void rdma_buffer_dereg(struct rdma_buffer *rdma_buff) 1196 | { 1197 | int ret_val; 1198 | 1199 | DEBUG_LOG("ibv_dereg_mr(%p)\n", rdma_buff->mr); 1200 | if (rdma_buff->mr) { 1201 | ret_val = ibv_dereg_mr(rdma_buff->mr); 1202 | if (ret_val) { 1203 | fprintf(stderr, "Couldn't deregister MR, error %d\n", ret_val); 1204 | return; 1205 | } 1206 | } 1207 | rdma_buff->rdma_dev->rdma_buff_cnt--; 1208 | DEBUG_LOG("The buffer detached from rdma_device (%p). Number of attached to device buffers is %d.\n", 1209 | rdma_buff->rdma_dev, rdma_buff->rdma_dev->rdma_buff_cnt); 1210 | 1211 | free(rdma_buff); 1212 | } 1213 | 1214 | //============================================================================================ 1215 | static void wire_gid_to_gid(const char *wgid, union ibv_gid *gid) 1216 | { 1217 | char tmp[9]; 1218 | uint32_t v32; 1219 | uint32_t *raw = (uint32_t *)gid->raw; 1220 | int i; 1221 | 1222 | for (tmp[8] = 0, i = 0; i < 4; ++i) { 1223 | memcpy(tmp, wgid + i * 8, 8); 1224 | sscanf(tmp, "%x", &v32); 1225 | raw[i] = ntohl(v32); 1226 | } 1227 | } 1228 | 1229 | static void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]) 1230 | { 1231 | int i; 1232 | uint32_t *raw = (uint32_t *)gid->raw; 1233 | 1234 | for (i = 0; i < 4; ++i) 1235 | sprintf(&wgid[i * 8], "%08x", htonl(raw[i])); 1236 | } 1237 | 1238 | //=============================================================================================== 1239 | /* addr size rkey lid dctn g gid */ 1240 | #define BUFF_DESC_STRING_LENGTH (sizeof "0102030405060708:01020304:01020304:0102:010203:1:0102030405060708090a0b0c0d0e0f10") 1241 | 1242 | int rdma_buffer_get_desc_str(struct 
rdma_buffer *rdma_buff, char *desc_str, size_t desc_length) 1243 | { 1244 | if (desc_length < BUFF_DESC_STRING_LENGTH) { 1245 | fprintf(stderr, "desc string size (%lu) is less than required (%lu) for sending rdma_buffer attributes\n", 1246 | desc_length, BUFF_DESC_STRING_LENGTH); 1247 | return 0; 1248 | } 1249 | /* addr size rkey lid dctn g 1250 | "0102030405060708:01020304:01020304:0102:010203:1:" */ 1251 | sprintf(desc_str, "%016llx:%08lx:%08x:%04x:%06x:%d:", 1252 | (unsigned long long)rdma_buff->buf_addr, 1253 | (unsigned long)rdma_buff->buf_size, 1254 | rdma_buff->rkey, 1255 | rdma_buff->rdma_dev->lid, 1256 | rdma_buff->rdma_dev->qp->qp_num /* dctn */, 1257 | rdma_buff->rdma_dev->is_global & 0x1); 1258 | 1259 | gid_to_wire_gid(&rdma_buff->rdma_dev->gid, desc_str + sizeof "0102030405060708:01020304:01020304:0102:010203:1"); 1260 | 1261 | return strlen(desc_str) + 1; /*including the terminating null character*/ 1262 | } 1263 | 1264 | static int rdma_create_ah_cached(struct rdma_device *rdma_dev, 1265 | struct ibv_ah_attr *ah_attr, 1266 | struct ibv_ah **p_ah) 1267 | { 1268 | int ret = -1; 1269 | khiter_t iter; 1270 | 1271 | /* looking for existing AH with same attributes */ 1272 | iter = kh_get(kh_ib_ah, &rdma_dev->ah_hash, *ah_attr); 1273 | if (iter == kh_end(&rdma_dev->ah_hash)) { 1274 | 1275 | /* new AH */ 1276 | DEBUG_LOG("ibv_create_ah(dlid=%d port=%d is_global=%d, tc=%d)\n", 1277 | ah_attr->dlid, ah_attr->port_num, ah_attr->is_global, (ah_attr->grh.traffic_class >> 5)); 1278 | *p_ah = ibv_create_ah(rdma_dev->pd, ah_attr); 1279 | 1280 | if (*p_ah == NULL) { 1281 | perror("ibv_create_ah"); 1282 | goto out; 1283 | } 1284 | 1285 | /* store AH in hash */ 1286 | iter = kh_put(kh_ib_ah, &rdma_dev->ah_hash, *ah_attr, &ret); 1287 | 1288 | /* failed to store - rollback */ 1289 | if (iter == kh_end(&rdma_dev->ah_hash)) { 1290 | perror("rdma_create_ah_cached failed storing"); 1291 | ibv_destroy_ah(*p_ah); 1292 | goto out; 1293 | } 1294 | 1295 | 
kh_value(&rdma_dev->ah_hash, iter) = *p_ah; 1296 | ret = 0; 1297 | } else { 1298 | /* found existing AH */ 1299 | *p_ah = kh_value(&rdma_dev->ah_hash, iter); 1300 | ret = 0; 1301 | } 1302 | 1303 | out: 1304 | return ret; 1305 | } 1306 | 1307 | //============================================================================================ 1308 | static int buff_size_validation(struct rdma_task_attr *attr, unsigned long rem_buf_size) 1309 | { 1310 | size_t total_len = 0; 1311 | int i; 1312 | 1313 | for (i = 0; i < attr->local_buf_iovcnt; i++) { 1314 | if ((attr->local_buf_iovec[i].iov_base < attr->local_buf_rdma->buf_addr) || 1315 | (attr->local_buf_iovec[i].iov_base + attr->local_buf_iovec[i].iov_len > 1316 | attr->local_buf_rdma->buf_addr + attr->local_buf_rdma->buf_size)) { 1317 | 1318 | fprintf(stderr, "sge buffer %d (%p, %p) exceeds the local buffer bounary (%p, %p)\n", i, 1319 | attr->local_buf_iovec[i].iov_base, attr->local_buf_iovec[i].iov_base + attr->local_buf_iovec[i].iov_len, 1320 | attr->local_buf_rdma->buf_addr, attr->local_buf_rdma->buf_addr + attr->local_buf_rdma->buf_size); 1321 | return 1; 1322 | } 1323 | total_len += attr->local_buf_iovec[i].iov_len; 1324 | if (total_len > rem_buf_size) { 1325 | fprintf(stderr, "The sum of sge buffers lengths (%lu) exceeded the remote buffer size %lu on iteration %d\n", 1326 | total_len, rem_buf_size, i); 1327 | return 1; 1328 | } 1329 | } 1330 | if ((attr->local_buf_iovcnt) && (total_len != rem_buf_size)) { 1331 | fprintf(stderr, "WARN: The sum of sge buffers lengths (%lu) differs from the remote buffer size %lu\n", 1332 | total_len, rem_buf_size); 1333 | } 1334 | if ((!attr->local_buf_iovcnt) && (rem_buf_size > attr->local_buf_rdma->buf_size)) { 1335 | fprintf(stderr, "WARN: When not using sge list, the requested buffer size %lu is greater than allocated local size %lu\n", 1336 | rem_buf_size, attr->local_buf_rdma->buf_size); 1337 | } 1338 | return 0; 1339 | } 1340 | 1341 | 
//============================================================================================ 1342 | int rdma_submit_task(struct rdma_task_attr *attr) 1343 | { 1344 | struct rdma_exec_params exec_params = {}; 1345 | uint16_t rem_lid = 0; 1346 | int is_global = 0; 1347 | union ibv_gid rem_gid; 1348 | int ret_val; 1349 | 1350 | exec_params.wr_id = attr->wr_id; 1351 | exec_params.device = attr->local_buf_rdma->rdma_dev; 1352 | exec_params.flags = attr->flags; 1353 | exec_params.local_buf_mr_lkey = (uint32_t)attr->local_buf_rdma->mr->lkey; 1354 | exec_params.local_buf_addr = attr->local_buf_rdma->buf_addr; 1355 | exec_params.local_buf_iovec = attr->local_buf_iovec; 1356 | exec_params.local_buf_iovcnt = attr->local_buf_iovcnt; 1357 | /* 1358 | * Parse desc string, extracting remote buffer address, size, rkey, lid, dctn, and if global is true, also gid 1359 | */ 1360 | DEBUG_LOG_FAST_PATH("Starting to parse desc string: \"%s\"\n", attr->remote_buf_desc_str); 1361 | /* addr size rkey lid dctn g gid 1362 | * "0102030405060708:01020304:01020304:0102:010203:1:0102030405060708090a0b0c0d0e0f10"*/ 1363 | sscanf(attr->remote_buf_desc_str, "%llx:%lx:%lx:%hx:%lx:%d", 1364 | &exec_params.rem_buf_addr, &exec_params.rem_buf_size, 1365 | &exec_params.rem_buf_rkey, &rem_lid, 1366 | &exec_params.rem_dctn, &is_global); 1367 | memset(&rem_gid, 0, sizeof(rem_gid)); 1368 | if (is_global) { 1369 | wire_gid_to_gid(attr->remote_buf_desc_str + sizeof "0102030405060708:01020304:01020304:0102:010203:1", &rem_gid); 1370 | } 1371 | DEBUG_LOG_FAST_PATH("rem_buf_addr=0x%llx, rem_buf_size=%u, rem_buf_offset=%u, rem_buf_rkey=0x%lx, rem_lid=0x%hx, rem_dctn=0x%lx, is_global=%d\n", 1372 | exec_params.rem_buf_addr, exec_params.rem_buf_size, attr->remote_buf_offset, exec_params.rem_buf_rkey, rem_lid, exec_params.rem_dctn, is_global); 1373 | DEBUG_LOG_FAST_PATH("Rem GID: %02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x\n", 1374 | rem_gid.raw[0], rem_gid.raw[1], rem_gid.raw[2], 
rem_gid.raw[3], 1375 | rem_gid.raw[4], rem_gid.raw[5], rem_gid.raw[6], rem_gid.raw[7], 1376 | rem_gid.raw[8], rem_gid.raw[9], rem_gid.raw[10], rem_gid.raw[11], 1377 | rem_gid.raw[12], rem_gid.raw[13], rem_gid.raw[14], rem_gid.raw[15] ); 1378 | DEBUG_LOG_FAST_PATH("rdma_task_attr_flags=%08x\n", exec_params.flags); 1379 | 1380 | /* upadte the remote buffer addr and size acording to the requested start offset */ 1381 | exec_params.rem_buf_addr += attr->remote_buf_offset; 1382 | exec_params.rem_buf_size -= attr->remote_buf_offset; 1383 | 1384 | /* 1385 | * Pass attr->local_buf_iovec - local_buf_iovcnt elements and check that 1386 | * the sum of local_buf_iovec[i].iov_len doesn't exceed rem_buf_size 1387 | */ 1388 | if (debug_fast_path) { 1389 | /* We do these validation code in debug mode only, because if something 1390 | * is wrong in the fast path, the HW will give completion error */ 1391 | ret_val = buff_size_validation(attr, exec_params.rem_buf_size); 1392 | if (ret_val) { 1393 | return ret_val; 1394 | } 1395 | } 1396 | 1397 | /* Check if address handler corresponding to the given key is present in the hash table, 1398 | if yes - return it and if it is not, create ah and add it to the hash table */ 1399 | struct ibv_ah_attr ah_attr; 1400 | 1401 | memset(&ah_attr, 0, sizeof ah_attr); 1402 | ah_attr.is_global = is_global; 1403 | ah_attr.dlid = rem_lid; 1404 | ah_attr.port_num = exec_params.device->ib_port; 1405 | 1406 | if (ah_attr.is_global) { 1407 | ah_attr.grh.hop_limit = 1; 1408 | ah_attr.grh.dgid = rem_gid; 1409 | ah_attr.grh.sgid_index = exec_params.device->gidx; 1410 | ah_attr.grh.traffic_class = TC_PRIO << 5; // <<3 for dscp2prio, <<2 for ECN bits 1411 | } 1412 | 1413 | if (rdma_create_ah_cached(exec_params.device, &ah_attr, &exec_params.ah)) { 1414 | return 1; 1415 | } 1416 | ret_val = rdma_exec_task(&exec_params); 1417 | 1418 | return ret_val; 1419 | } 1420 | 1421 | 
//============================================================================================ 1422 | int rdma_poll_completions(struct rdma_device *rdma_dev, 1423 | struct rdma_completion_event *event, 1424 | uint32_t num_entries) 1425 | { 1426 | int reported_entries = 0; 1427 | 1428 | if (num_entries > COMP_ARRAY_SIZE) { 1429 | num_entries = COMP_ARRAY_SIZE; /* We don't returne more than 16 entries, 1430 | If user needs more, he can call rdma_poll_completions again */ 1431 | } 1432 | 1433 | /* Polling completion queue */ 1434 | //DEBUG_LOG_FAST_PATH("Polling completion queue: ibv_poll_cq\n"); 1435 | #ifdef PRINT_LATENCY 1436 | struct ibv_poll_cq_attr cq_attr = {}; 1437 | uint64_t comp_ts; 1438 | int ret_val; 1439 | 1440 | ret_val = ibv_start_poll(rdma_dev->cq, &cq_attr); 1441 | if ((ret_val) && (ret_val != ENOENT)) { 1442 | perror("ibv_start_poll"); 1443 | return reported_entries; /*0*/ 1444 | } 1445 | 1446 | while (ret_val != ENOENT) { 1447 | uint64_t cq_wr_id = rdma_dev->cq->wr_id; 1448 | DEBUG_LOG_FAST_PATH("virtual wr_id %llu, original wr_id 0x%llx, num_wrs=%d\n", 1449 | (long long unsigned int)cq_wr_id, 1450 | (long long unsigned int)rdma_dev->app_wr_id[cq_wr_id].wr_id, 1451 | rdma_dev->app_wr_id[cq_wr_id].num_wrs); 1452 | if (rdma_dev->app_wr_id[cq_wr_id].flags & WR_ID_FLAGS_ACTIVE) { 1453 | rdma_dev->app_wr_id[wc[i].wr_id].flags = 0; 1454 | rdma_dev->qp_available_wr += rdma_dev->app_wr_id[cq_wr_id].num_wrs; 1455 | event[reported_entries].wr_id = rdma_dev->app_wr_id[cq_wr_id].wr_id; 1456 | event[reported_entries].status = rdma_dev->cq->status; 1457 | reported_entries++; 1458 | } 1459 | 1460 | rdma_dev->latency[cq_wr_id].completion_ts = ibv_wc_read_completion_ts(rdma_dev->cq); 1461 | 1462 | struct ibv_values_ex ts_values = { 1463 | .comp_mask = IBV_VALUES_MASK_RAW_CLOCK, 1464 | .raw_clock = {} /*struct timespec*/ 1465 | }; 1466 | 1467 | ret_val = ibv_query_rt_values_ex(rdma_dev->context, &ts_values); 1468 | if (ret_val) { 1469 | fprintf(stderr, 
"ibv_query_rt_values_ex failed after ibv_wr_start call\n"); 1470 | ts_values.raw_clock.tv_nsec = 0; 1471 | } 1472 | rdma_dev->latency[cq_wr_id].read_comp_ts = ts_values.raw_clock.tv_nsec; 1473 | 1474 | uint64_t wr_complete_latency = rdma_dev->latency[cq_wr_id].wr_complete_ts - rdma_dev->latency[cq_wr_id].wr_start_ts; 1475 | uint64_t completion_latency = rdma_dev->latency[cq_wr_id].completion_ts - rdma_dev->latency[cq_wr_id].wr_start_ts; 1476 | uint64_t read_comp_latency = rdma_dev->latency[cq_wr_id].read_comp_ts - rdma_dev->latency[cq_wr_id].completion_ts; 1477 | 1478 | rdma_dev->measure_index++; 1479 | rdma_dev->wr_complete_latency_sum += wr_complete_latency; 1480 | rdma_dev->completion_latency_sum += completion_latency; 1481 | rdma_dev->read_comp_latency_sum += read_comp_latency; 1482 | 1483 | rdma_dev->min_wr_complete_latency = (wr_complete_latency < rdma_dev->min_wr_complete_latency)? 1484 | wr_complete_latency: rdma_dev->min_wr_complete_latency; 1485 | rdma_dev->min_completion_latency = (completion_latency < rdma_dev->min_completion_latency)? 1486 | completion_latency: rdma_dev->min_completion_latency; 1487 | rdma_dev->min_read_comp_latency = (read_comp_latency < rdma_dev->min_read_comp_latency)? 1488 | read_comp_latency: rdma_dev->min_read_comp_latency; 1489 | 1490 | rdma_dev->max_wr_complete_latency = (wr_complete_latency > rdma_dev->max_wr_complete_latency)? 1491 | wr_complete_latency: rdma_dev->max_wr_complete_latency; 1492 | rdma_dev->max_completion_latency = (completion_latency > rdma_dev->max_completion_latency)? 1493 | completion_latency: rdma_dev->max_completion_latency; 1494 | rdma_dev->max_read_comp_latency = (read_comp_latency > rdma_dev->max_read_comp_latency)? 
1495 | read_comp_latency: rdma_dev->max_read_comp_latency; 1496 | 1497 | DEBUG_LOG_FAST_PATH("PRINT_LATENCY: wr_id = %6lu, wr_sent latency: current %8lu, min %8lu, max %8lu, avg %8lu (nSec)\n", 1498 | cq_wr_id, 1499 | wr_complete_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1500 | rdma_dev->min_wr_complete_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1501 | rdma_dev->max_wr_complete_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1502 | rdma_dev->wr_complete_latency_sum / rdma_dev->measure_index * 1000000 / rdma_dev->hca_core_clock_kHz); 1503 | 1504 | DEBUG_LOG_FAST_PATH("PRINT_LATENCY: completion latency : current %8lu, min %8lu, max %8lu, avg %8lu (nSec)\n", 1505 | completion_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1506 | rdma_dev->min_completion_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1507 | rdma_dev->max_completion_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1508 | rdma_dev->completion_latency_sum / rdma_dev->measure_index * 1000000 / rdma_dev->hca_core_clock_kHz); 1509 | 1510 | DEBUG_LOG_FAST_PATH("PRINT_LATENCY: read_comp latency : current %8lu, min %8lu, max %8lu, avg %8lu (nSec)\n", 1511 | read_comp_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1512 | rdma_dev->min_read_comp_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1513 | rdma_dev->max_read_comp_latency * 1000000 / rdma_dev->hca_core_clock_kHz, 1514 | rdma_dev->read_comp_latency_sum / rdma_dev->measure_index * 1000000 / rdma_dev->hca_core_clock_kHz); 1515 | 1516 | ret_val = ibv_next_poll(rdma_dev->cq); 1517 | if ((ret_val) && (ret_val != ENOENT)) { 1518 | perror("ibv_start_poll"); 1519 | return reported_entries; 1520 | } 1521 | } 1522 | ibv_end_poll(rdma_dev->cq); 1523 | #else /*PRINT_LATENCY*/ 1524 | struct ibv_wc wc[COMP_ARRAY_SIZE]; 1525 | int i, wcn; 1526 | 1527 | wcn = ibv_poll_cq(rdma_dev->cq, num_entries, wc); 1528 | if (wcn < 0) { 1529 | fprintf(stderr, "poll CQ failed %d\n", wcn); 1530 | return 0; 1531 | } 1532 | 1533 | for (i = 0; i < wcn; ++i) { 
1534 | DEBUG_LOG_FAST_PATH("cqe idx %d: virtual wr_id %llu, original wr_id 0x%llx, num_wrs=%d\n", 1535 | i, (long long unsigned int)wc[i].wr_id, 1536 | (long long unsigned int)rdma_dev->app_wr_id[wc[i].wr_id].wr_id, 1537 | rdma_dev->app_wr_id[wc[i].wr_id].num_wrs); 1538 | if (rdma_dev->app_wr_id[wc[i].wr_id].flags & WR_ID_FLAGS_ACTIVE) { 1539 | rdma_dev->app_wr_id[wc[i].wr_id].flags = 0; 1540 | rdma_dev->qp_available_wr += rdma_dev->app_wr_id[wc[i].wr_id].num_wrs; 1541 | event[reported_entries].wr_id = rdma_dev->app_wr_id[wc[i].wr_id].wr_id; 1542 | event[reported_entries].status = wc[i].status; 1543 | reported_entries++; 1544 | } 1545 | } 1546 | #endif /*PRINT_LATENCY*/ 1547 | return reported_entries; 1548 | } 1549 | -------------------------------------------------------------------------------- /gpu_direct_rdma_access.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 
22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 31 | */ 32 | 33 | #ifndef _GPU_DIRECT_RDMA_ACCESS_H_ 34 | #define _GPU_DIRECT_RDMA_ACCESS_H_ 35 | 36 | #include /* This file defines `struct iovec' */ 37 | 38 | #include 39 | #include 40 | 41 | #ifdef __cplusplus 42 | extern "C" { 43 | #endif 44 | 45 | #define MAX_SEND_SGE 10 46 | 47 | /* 48 | * rdma_device object holds the RDMA resources of the local RDMA device, 49 | * of a Targte or a Source 50 | */ 51 | struct rdma_device; 52 | 53 | /* 54 | * rdma_buffer is used to represent the rdma parameters of a 55 | * applciation buffer on a specific device to be used in the RDMA operations 56 | */ 57 | struct rdma_buffer; 58 | 59 | struct rdma_open_dev_attr { 60 | const char *ib_devname; 61 | int ib_port; 62 | int gidx; 63 | }; 64 | 65 | enum rdma_task_attr_flags { 66 | RDMA_TASK_ATTR_RDMA_READ = 1 << 0, 67 | }; 68 | 69 | struct rdma_task_attr { 70 | char *remote_buf_desc_str; 71 | size_t remote_buf_desc_length; 72 | size_t remote_buf_offset; 73 | struct rdma_buffer *local_buf_rdma; 74 | struct iovec *local_buf_iovec; 75 | int local_buf_iovcnt; 76 | uint32_t flags; /* Use enum rdma_task_attr_flags */ 77 | uint64_t wr_id; 78 | }; 79 | /* 80 | * Open a RDMA device and allocated requiered resources. 81 | * find the capable RDMA device based on the 'addr' as an ip address 82 | * of the RDMA device selected to preform the RDMA operations. 83 | * Creates a PD, CQ, and QP as internal HW resources. 
84 | * 85 | * Source rdma_device preforms the RDMA Read/Write operations to 86 | * the Target rdma_device. 87 | * 88 | * returns: a pointer to a rdma_device object or NULL on error 89 | */ 90 | struct rdma_device *rdma_open_device_client(struct sockaddr *addr); 91 | struct rdma_device *rdma_open_device_server(struct sockaddr *addr); 92 | 93 | /* 94 | * Reset device from failed state back to an operations state 95 | */ 96 | int rdma_reset_device(struct rdma_device *device); 97 | 98 | /* 99 | * Close and release all rdma_device resources 100 | */ 101 | void rdma_close_device(struct rdma_device *device); 102 | 103 | /* 104 | * register and deregister an applciation buffer with the RDMA device 105 | */ 106 | struct rdma_buffer *rdma_buffer_reg(struct rdma_device *device, void *addr, size_t length); 107 | void rdma_buffer_dereg(struct rdma_buffer *buffer); 108 | 109 | /* 110 | * Get a rdma_buffer address description string representations 111 | * 112 | * The Client application should pass this description string to the 113 | * Server which will issue the RDMA Read/Write operation 114 | * 115 | * desc_str is input and output holding the rdma_buffer information 116 | * desc_length is input size in bytes of desc_str 117 | * 118 | * returns: an integer equal to the size of the char data copied into desc_str 119 | */ 120 | int rdma_buffer_get_desc_str(struct rdma_buffer *rdma_buff, char *desc_str, size_t desc_length); 121 | 122 | /* 123 | * Issue a RDMA WRITE operation from a local buffer to a remote buffer, 124 | * or a RDMA READ operation from remote buffer to a local buffer, 125 | * depending on the RDMA_TASK_ATTR_RDMA_READ flag. 126 | * Remote buffer is descibed by the remote_buffer_addr_str, starting at offset remote_buf_offset. 127 | * The local_iov gather list, of size local_iovcnt, hold the buffer addr & size 128 | * pairs, and should be in the range of the local_buffer, which holds relevant 129 | * the rdma info. 
130 | * We don't pass struct rdma_device as parameter, because we can get it using 131 | * rdma_task_attr struct field local_buf_rdma 132 | * 133 | * On completion of the RDMA operation, the status and wr_id will be reported 134 | * from rdma_poll_completions() 135 | * 136 | * returns: 0 on success, or the value of errno on failure 137 | */ 138 | int rdma_submit_task(struct rdma_task_attr *attr); 139 | 140 | enum rdma_completion_status { 141 | RDMA_STATUS_SUCCESS, 142 | RDMA_STATUS_ERR_LAST, 143 | }; 144 | 145 | struct rdma_completion_event { 146 | uint64_t wr_id; 147 | enum rdma_completion_status status; 148 | }; 149 | 150 | /* 151 | * Return rdma operations which have completed. 152 | * the event will hold the requets id (wr_id) and the status of the operation. 153 | * 154 | * returns: number of reported events in the event array (<= num_entries) 155 | */ 156 | int rdma_poll_completions(struct rdma_device *device, 157 | struct rdma_completion_event *event, 158 | uint32_t num_entries); 159 | 160 | #ifdef __cplusplus 161 | } 162 | #endif 163 | 164 | #endif /* _GPU_DIRECT_RDMA_ACCESS_H_ */ 165 | -------------------------------------------------------------------------------- /gpu_mem_util.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. 
You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 
31 | */ 32 | 33 | #if HAVE_CONFIG_H 34 | #include 35 | #endif /* HAVE_CONFIG_H */ 36 | 37 | #define _GNU_SOURCE 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #ifdef HAVE_CUDA 52 | /* "/usr/local/cuda/include/" is added to build include path in the Makefile */ 53 | #include "cuda.h" 54 | #endif //HAVE_CUDA 55 | 56 | #include "gpu_mem_util.h" 57 | 58 | extern int debug; 59 | extern int debug_fast_path; 60 | 61 | #define DEBUG_LOG if (debug) printf 62 | #define DEBUG_LOG_FAST_PATH if (debug_fast_path) printf 63 | #define FDEBUG_LOG if (debug) fprintf 64 | #define FDEBUG_LOG_FAST_PATH if (debug_fast_path) fprintf 65 | 66 | #ifdef HAVE_CUDA 67 | #define ASSERT(x) \ 68 | do { \ 69 | if (!(x)) { \ 70 | fprintf(stdout, "Assertion \"%s\" failed at %s:%d\n", #x, __FILE__, __LINE__);\ 71 | } \ 72 | } while (0) 73 | 74 | #define CUCHECK(stmt) \ 75 | do { \ 76 | CUresult result = (stmt); \ 77 | ASSERT(CUDA_SUCCESS == result); \ 78 | } while (0) 79 | 80 | /*----------------------------------------------------------------------------*/ 81 | 82 | static CUcontext cuContext; 83 | 84 | /* 85 | * Debug print information about all available CUDA devices 86 | */ 87 | static void print_gpu_devices_info(void) 88 | { 89 | int device_count = 0; 90 | int i; 91 | 92 | CUCHECK(cuDeviceGetCount(&device_count)); 93 | 94 | DEBUG_LOG("The number of supporting CUDA devices is %d.\n", device_count); 95 | 96 | for (i = 0; i < device_count; i++) { 97 | CUdevice cu_dev; 98 | char name[128]; 99 | int pci_bus_id = 0; 100 | int pci_device_id = 0; 101 | int pci_func = 0; /*always 0 for CUDA device*/ 102 | 103 | CUCHECK(cuDeviceGet(&cu_dev, i)); 104 | CUCHECK(cuDeviceGetName(name, sizeof(name), cu_dev)); 105 | CUCHECK(cuDeviceGetAttribute (&pci_bus_id , CU_DEVICE_ATTRIBUTE_PCI_BUS_ID , cu_dev)); /*PCI bus identifier of the device*/ 106 | CUCHECK(cuDeviceGetAttribute 
(&pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cu_dev)); /*PCI device (also known as slot) identifier of the device*/ 107 | 108 | DEBUG_LOG("device %d, handle %d, name \"%s\", BDF %02x:%02x.%d\n", 109 | i, cu_dev, name, pci_bus_id, pci_device_id, pci_func); 110 | } 111 | } 112 | 113 | static int get_gpu_device_id_from_bdf(const char *bdf) 114 | { 115 | int given_bus_id = 0; 116 | int given_device_id = 0; 117 | int given_func = 0; 118 | int device_count = 0; 119 | int i; 120 | int ret_val; 121 | 122 | /* "3e:02.0"*/ 123 | ret_val = sscanf(bdf, "%x:%x.%x", &given_bus_id, &given_device_id, &given_func); 124 | if (ret_val != 3){ 125 | fprintf(stderr, "Wrong BDF format \"%s\". Expected format example: \"3e:02.0\", " 126 | "where 3e - bus id, 02 - device id, 0 - function\n", bdf); 127 | return -1; 128 | } 129 | if (given_func != 0) { 130 | fprintf(stderr, "Wrong pci function %d, 0 is expected\n", given_func); 131 | return -1; 132 | } 133 | CUCHECK(cuDeviceGetCount(&device_count)); 134 | 135 | if (device_count == 0) { 136 | fprintf(stderr, "There are no available devices that support CUDA\n"); 137 | return -1; 138 | } 139 | 140 | for (i = 0; i < device_count; i++) { 141 | CUdevice cu_dev; 142 | int pci_bus_id = 0; 143 | int pci_device_id = 0; 144 | 145 | CUCHECK(cuDeviceGet(&cu_dev, i)); 146 | CUCHECK(cuDeviceGetAttribute (&pci_bus_id , CU_DEVICE_ATTRIBUTE_PCI_BUS_ID , cu_dev)); /*PCI bus identifier of the device*/ 147 | CUCHECK(cuDeviceGetAttribute (&pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cu_dev)); /*PCI device (also known as slot) identifier of the device*/ 148 | if ((pci_bus_id == given_bus_id) && (pci_device_id == given_device_id)){ 149 | return i; 150 | } 151 | } 152 | fprintf(stderr, "Given BDF \"%s\" doesn't match one of GPU devices\n", bdf); 153 | return -1; 154 | } 155 | 156 | static void *init_gpu(size_t gpu_buf_size, const char *bdf) 157 | { 158 | const size_t gpu_page_size = 64*1024; 159 | size_t aligned_size; 160 | CUresult cu_result; 161 | 
162 | aligned_size = (gpu_buf_size + gpu_page_size - 1) & ~(gpu_page_size - 1); 163 | printf("initializing CUDA\n"); 164 | cu_result = cuInit(0); 165 | if (cu_result != CUDA_SUCCESS) { 166 | fprintf(stderr, "cuInit(0) returned %d\n", cu_result); 167 | return NULL; 168 | } 169 | 170 | if (debug) { 171 | print_gpu_devices_info(); 172 | } 173 | 174 | int dev_id = get_gpu_device_id_from_bdf(bdf); 175 | if (dev_id < 0) { 176 | fprintf(stderr, "Wrong device index (%d) obtained from bdf \"%s\"\n", 177 | dev_id, bdf); 178 | /* This function returns NULL if there are no CUDA capable devices. */ 179 | return NULL; 180 | } 181 | 182 | /* Pick up device by given dev_id - an ordinal in the range [0, cuDeviceGetCount()-1] */ 183 | CUdevice cu_dev; 184 | CUCHECK(cuDeviceGet(&cu_dev, dev_id)); 185 | 186 | DEBUG_LOG("creating CUDA Contnext\n"); 187 | /* Create context */ 188 | cu_result = cuCtxCreate(&cuContext, CU_CTX_MAP_HOST, cu_dev); 189 | if (cu_result != CUDA_SUCCESS) { 190 | fprintf(stderr, "cuCtxCreate() error=%d\n", cu_result); 191 | return NULL; 192 | } 193 | 194 | DEBUG_LOG("making it the current CUDA Context\n"); 195 | cu_result = cuCtxSetCurrent(cuContext); 196 | if (cu_result != CUDA_SUCCESS) { 197 | fprintf(stderr, "cuCtxSetCurrent() error=%d\n", cu_result); 198 | return NULL; 199 | } 200 | 201 | DEBUG_LOG("cuMemAlloc() of a %zd bytes GPU buffer\n", aligned_size); 202 | CUdeviceptr d_A; 203 | cu_result = cuMemAlloc(&d_A, aligned_size); 204 | if (cu_result != CUDA_SUCCESS) { 205 | fprintf(stderr, "cuMemAlloc error=%d\n", cu_result); 206 | return NULL; 207 | } 208 | DEBUG_LOG("allocated GPU buffer address at %016llx pointer=%p\n", d_A, (void*)d_A); 209 | 210 | return ((void*)d_A); 211 | } 212 | 213 | static int free_gpu(void *gpu_buff) 214 | { 215 | CUdeviceptr d_A = (CUdeviceptr) gpu_buff; 216 | 217 | printf("deallocating RX GPU buffer\n"); 218 | cuMemFree(d_A); 219 | d_A = 0; 220 | 221 | DEBUG_LOG("destroying current CUDA Context\n"); 222 | 
CUCHECK(cuCtxDestroy(cuContext)); 223 | 224 | return 0; 225 | } 226 | #endif //HAVE_CUDA 227 | 228 | /**************************************************************************************** 229 | * Memory allocation on CPU or GPU according to HAVE_CUDA pre-compile option and use_cuda flag 230 | * Return value: Allocated buffer pointer (if success), NULL (if error) 231 | ****************************************************************************************/ 232 | void *work_buffer_alloc(size_t length, int use_cuda, const char *bdf) 233 | { 234 | void *buff = NULL; 235 | 236 | if (use_cuda) { 237 | /* Mem allocation on GPU */ 238 | #ifdef HAVE_CUDA 239 | buff = init_gpu(length, bdf); 240 | #else 241 | fprintf(stderr, "Can't init GPU, HAVE_CUDA mode isn't set"); 242 | #endif //HAVE_CUDA 243 | if (!buff) { 244 | fprintf(stderr, "Couldn't allocate work buffer on GPU.\n"); 245 | return NULL; 246 | } 247 | } else { 248 | /* Mem allocation on CPU */ 249 | int page_size = sysconf(_SC_PAGESIZE); 250 | buff = memalign(page_size, length); 251 | if (!buff) { 252 | fprintf(stderr, "Couldn't allocate work buffer on CPU.\n"); 253 | return NULL; 254 | } 255 | DEBUG_LOG("memory buffer(%p) allocated\n", buff); 256 | } 257 | return buff; 258 | } 259 | 260 | /**************************************************************************************** 261 | * CPU or GPU memory free, according to HAVE_CUDA pre-compile option and use_cuda flag 262 | ****************************************************************************************/ 263 | void work_buffer_free(void *buff, int use_cuda) 264 | { 265 | if (use_cuda) { 266 | #ifdef HAVE_CUDA 267 | free_gpu(buff); 268 | #else 269 | fprintf(stderr, "Can't free GPU, HAVE_CUDA mode isn't set"); 270 | #endif //HAVE_CUDA 271 | } else { 272 | DEBUG_LOG("free memory buffer(%p)\n", buff); 273 | free(buff); 274 | } 275 | } 276 | 277 | /*----------------------------------------------------------------------------*/ 278 | 279 | 
-------------------------------------------------------------------------------- /gpu_mem_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 
31 | */ 32 | 33 | #ifndef _GPU_MEM_UTIL_H_ 34 | #define _GPU_MEM_UTIL_H_ 35 | 36 | #ifdef __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | /* 41 | * Memory allocation on CPU or GPU according to HAVE_CUDA pre-compile option and use_cuda flag 42 | * 43 | * returns: a pointer to the allocated buffer or NULL on error 44 | */ 45 | void *work_buffer_alloc(size_t length, int use_cuda, const char *bdf); 46 | 47 | /* 48 | * CPU or GPU memory free, according to HAVE_CUDA pre-compile option and use_cuda flag 49 | */ 50 | void work_buffer_free(void *buff, int use_cuda); 51 | 52 | 53 | #ifdef __cplusplus 54 | } 55 | #endif 56 | 57 | #endif /* _GPU_MEM_UTIL_H_ */ 58 | -------------------------------------------------------------------------------- /ibv_helper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 
22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 31 | */ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | enum ibv_gid_type { 40 | IBV_GID_TYPE_IB_ROCE_V1, 41 | IBV_GID_TYPE_ROCE_V2, 42 | }; 43 | 44 | static int ibv_read_sysfs_file(const char *dir, const char *file, 45 | char *buf, size_t size) 46 | { 47 | char *path; 48 | int fd; 49 | int len; 50 | 51 | if (asprintf(&path, "%s/%s", dir, file) < 0) 52 | return -1; 53 | 54 | fd = open(path, O_RDONLY | O_CLOEXEC); 55 | if (fd < 0) { 56 | free(path); 57 | return -1; 58 | } 59 | 60 | len = read(fd, buf, size); 61 | 62 | close(fd); 63 | free(path); 64 | 65 | if (len > 0) { 66 | if (buf[len - 1] == '\n') 67 | buf[--len] = '\0'; 68 | else if (len < size) 69 | buf[len] = '\0'; 70 | else 71 | /* We would have to truncate the contents to NULL 72 | * terminate, so we are going to fail no matter 73 | * what we do, either right now or later when 74 | * we pass around an unterminated string. Fail now. 75 | */ 76 | return -1; 77 | } 78 | 79 | return len; 80 | } 81 | 82 | 83 | /* GID types as appear in sysfs, no change is expected as of ABI 84 | * compatibility. 
85 | */ 86 | #define V1_TYPE "IB/RoCE v1" 87 | #define V2_TYPE "RoCE v2" 88 | static int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, 89 | unsigned int index, enum ibv_gid_type *type) 90 | { 91 | char name[32]; 92 | char buff[11]; 93 | 94 | snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num, 95 | index); 96 | 97 | /* Reset errno so that we can rely on its value upon any error flow in 98 | * ibv_read_sysfs_file. 99 | */ 100 | errno = 0; 101 | if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff, 102 | sizeof(buff)) <= 0) { 103 | char *dir_path; 104 | DIR *dir; 105 | 106 | if (errno == EINVAL) { 107 | /* In IB, this file doesn't exist and the kernel sets 108 | * errno to -EINVAL. 109 | */ 110 | *type = IBV_GID_TYPE_IB_ROCE_V1; 111 | return 0; 112 | } 113 | if (asprintf(&dir_path, "%s/%s/%d/%s/", 114 | context->device->ibdev_path, "ports", port_num, 115 | "gid_attrs") < 0) 116 | return -1; 117 | dir = opendir(dir_path); 118 | free(dir_path); 119 | if (!dir) { 120 | if (errno == ENOENT) 121 | /* Assuming that if gid_attrs doesn't exist, 122 | * we have an old kernel and all GIDs are 123 | * IB/RoCE v1 124 | */ 125 | *type = IBV_GID_TYPE_IB_ROCE_V1; 126 | else 127 | return -1; 128 | } else { 129 | closedir(dir); 130 | errno = EFAULT; 131 | return -1; 132 | } 133 | } else { 134 | if (!strcmp(buff, V1_TYPE)) { 135 | *type = IBV_GID_TYPE_IB_ROCE_V1; 136 | } else if (!strcmp(buff, V2_TYPE)) { 137 | *type = IBV_GID_TYPE_ROCE_V2; 138 | } else { 139 | errno = ENOTSUP; 140 | return -1; 141 | } 142 | } 143 | 144 | return 0; 145 | } 146 | 147 | int ibv_find_sgid_type(struct ibv_context *context, uint8_t port_num, 148 | enum ibv_gid_type gid_type, int gid_family) 149 | { 150 | enum ibv_gid_type sgid_type = 0; 151 | union ibv_gid sgid; 152 | int sgid_family = -1; 153 | int idx = 0; 154 | 155 | do { 156 | if (ibv_query_gid(context, port_num, idx, &sgid)) { 157 | errno = EFAULT; 158 | return -1; 159 | } 160 | if 
(ibv_query_gid_type(context, port_num, idx, &sgid_type)) { 161 | errno = EFAULT; 162 | return -1; 163 | } 164 | if (sgid.raw[0] == 0 && sgid.raw[1] == 0) { 165 | sgid_family = AF_INET; 166 | } 167 | 168 | if (gid_type == sgid_type && gid_family == sgid_family) { 169 | return idx; 170 | } 171 | 172 | idx++; 173 | } while (gid_type != sgid_type || gid_family != sgid_family); 174 | 175 | return idx; 176 | } 177 | 178 | 179 | -------------------------------------------------------------------------------- /khash.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "khash.h" 30 | KHASH_MAP_INIT_INT(32, char) 31 | int main() { 32 | int ret, is_missing; 33 | khiter_t k; 34 | khash_t(32) *h = kh_init(32); 35 | k = kh_put(32, h, 5, &ret); 36 | kh_value(h, k) = 10; 37 | k = kh_get(32, h, 10); 38 | is_missing = (k == kh_end(h)); 39 | k = kh_get(32, h, 5); 40 | kh_del(32, h, k); 41 | for (k = kh_begin(h); k != kh_end(h); ++k) 42 | if (kh_exist(h, k)) kh_value(h, k) = 1; 43 | kh_destroy(32, h); 44 | return 0; 45 | } 46 | */ 47 | 48 | /* 49 | 2013-05-02 (0.2.8): 50 | 51 | * Use quadratic probing. When the capacity is power of 2, stepping function 52 | i*(i+1)/2 guarantees to traverse each bucket. It is better than double 53 | hashing on cache performance and is more robust than linear probing. 54 | 55 | In theory, double hashing should be more robust than quadratic probing. 56 | However, my implementation is probably not for large hash tables, because 57 | the second hash function is closely tied to the first hash function, 58 | which reduce the effectiveness of double hashing. 59 | 60 | Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php 61 | 62 | 2011-12-29 (0.2.7): 63 | 64 | * Minor code clean up; no actual effect. 65 | 66 | 2011-09-16 (0.2.6): 67 | 68 | * The capacity is a power of 2. This seems to dramatically improve the 69 | speed for simple keys. Thank Zilong Tan for the suggestion. Reference: 70 | 71 | - http://code.google.com/p/ulib/ 72 | - http://nothings.org/computer/judy/ 73 | 74 | * Allow to optionally use linear probing which usually has better 75 | performance for random input. Double hashing is still the default as it 76 | is more robust to certain non-random input. 77 | 78 | * Added Wang's integer hash function (not used by default). This hash 79 | function is more robust to certain non-random input. 80 | 81 | 2011-02-14 (0.2.5): 82 | 83 | * Allow to declare global functions. 
84 | 85 | 2009-09-26 (0.2.4): 86 | 87 | * Improve portability 88 | 89 | 2008-09-19 (0.2.3): 90 | 91 | * Corrected the example 92 | * Improved interfaces 93 | 94 | 2008-09-11 (0.2.2): 95 | 96 | * Improved speed a little in kh_put() 97 | 98 | 2008-09-10 (0.2.1): 99 | 100 | * Added kh_clear() 101 | * Fixed a compiling error 102 | 103 | 2008-09-02 (0.2.0): 104 | 105 | * Changed to token concatenation which increases flexibility. 106 | 107 | 2008-08-31 (0.1.2): 108 | 109 | * Fixed a bug in kh_get(), which has not been tested previously. 110 | 111 | 2008-08-31 (0.1.1): 112 | 113 | * Added destructor 114 | */ 115 | 116 | 117 | #ifndef __AC_KHASH_H 118 | #define __AC_KHASH_H 119 | 120 | /*! 121 | @header 122 | 123 | Generic hash table library. 124 | */ 125 | 126 | #define AC_VERSION_KHASH_H "0.2.8" 127 | 128 | #include 129 | #include 130 | #include 131 | 132 | /* compiler specific configuration */ 133 | 134 | #if UINT_MAX == 0xffffffffu 135 | typedef unsigned int khint32_t; 136 | #elif ULONG_MAX == 0xffffffffu 137 | typedef unsigned long khint32_t; 138 | #endif 139 | 140 | #if ULONG_MAX == ULLONG_MAX 141 | typedef unsigned long khint64_t; 142 | #else 143 | typedef unsigned long long khint64_t; 144 | #endif 145 | 146 | #ifndef kh_inline 147 | #ifdef _MSC_VER 148 | #define kh_inline __inline 149 | #else 150 | #define kh_inline inline 151 | #endif 152 | #endif /* kh_inline */ 153 | 154 | #ifndef klib_unused 155 | #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) 156 | #define klib_unused __attribute__ ((__unused__)) 157 | #else 158 | #define klib_unused 159 | #endif 160 | #endif /* klib_unused */ 161 | 162 | typedef khint32_t khint_t; 163 | typedef khint_t khiter_t; 164 | 165 | #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) 166 | #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) 167 | #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) 168 | #define __ac_set_isdel_false(flag, i) 
(flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) 169 | #define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) 170 | #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) 171 | #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) 172 | 173 | #define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) 174 | 175 | #ifndef kroundup32 176 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 177 | #endif 178 | 179 | #ifndef kcalloc 180 | #define kcalloc(N,Z) calloc(N,Z) 181 | #endif 182 | #ifndef kmalloc 183 | #define kmalloc(Z) malloc(Z) 184 | #endif 185 | #ifndef krealloc 186 | #define krealloc(P,Z) realloc(P,Z) 187 | #endif 188 | #ifndef kfree 189 | #define kfree(P) free(P) 190 | #endif 191 | #ifndef kmemset 192 | #define kmemset(P,Z,N) memset(P,Z,N) 193 | #endif 194 | 195 | static const double __ac_HASH_UPPER = 0.77; 196 | 197 | #define __KHASH_TYPE(name, khkey_t, khval_t) \ 198 | typedef struct kh_##name##_s { \ 199 | khint_t n_buckets, size, n_occupied, upper_bound; \ 200 | khint32_t *flags; \ 201 | khkey_t *keys; \ 202 | khval_t *vals; \ 203 | } kh_##name##_t; 204 | 205 | #define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ 206 | extern kh_##name##_t *kh_init_##name(void); \ 207 | extern kh_##name##_t *kh_init_##name##_inplace(kh_##name##_t *h); \ 208 | extern void kh_destroy_##name(kh_##name##_t *h); \ 209 | extern void kh_destroy_##name##_inplace(kh_##name##_t *h); \ 210 | extern void kh_clear_##name(kh_##name##_t *h); \ 211 | extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ 212 | extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ 213 | extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ 214 | extern void kh_del_##name(kh_##name##_t *h, khint_t x); 215 | 216 | #define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ 217 | SCOPE kh_##name##_t *kh_init_##name(void) { \ 218 | return 
(kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ 219 | } \ 220 | SCOPE kh_##name##_t *kh_init_##name##_inplace(kh_##name##_t *h) { \ 221 | return (kh_##name##_t*)kmemset(h, 0, sizeof(kh_##name##_t)); \ 222 | } \ 223 | SCOPE void kh_destroy_##name(kh_##name##_t *h) \ 224 | { \ 225 | if (h) { \ 226 | kfree((void *)h->keys); kfree(h->flags); \ 227 | kfree((void *)h->vals); \ 228 | kfree(h); \ 229 | } \ 230 | } \ 231 | SCOPE void kh_destroy_##name##_inplace(kh_##name##_t *h) \ 232 | { \ 233 | kfree((void *)h->keys); \ 234 | kfree((void *)h->flags); \ 235 | kfree((void *)h->vals); \ 236 | } \ 237 | SCOPE void kh_clear_##name(kh_##name##_t *h) \ 238 | { \ 239 | if (h && h->flags) { \ 240 | memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ 241 | h->size = h->n_occupied = 0; \ 242 | } \ 243 | } \ 244 | SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ 245 | { \ 246 | if (h->n_buckets) { \ 247 | khint_t k, i, last, mask, step = 0; \ 248 | mask = h->n_buckets - 1; \ 249 | k = __hash_func(key); i = k & mask; \ 250 | last = i; \ 251 | while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ 252 | i = (i + (++step)) & mask; \ 253 | if (i == last) return h->n_buckets; \ 254 | } \ 255 | return __ac_iseither(h->flags, i)? h->n_buckets : i; \ 256 | } else return 0; \ 257 | } \ 258 | SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ 259 | { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
*/ \ 260 | khint32_t *new_flags = 0; \ 261 | khint_t j = 1; \ 262 | { \ 263 | kroundup32(new_n_buckets); \ 264 | if (new_n_buckets < 4) new_n_buckets = 4; \ 265 | if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ 266 | else { /* hash table size to be changed (shrink or expand); rehash */ \ 267 | new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ 268 | if (!new_flags) return -1; \ 269 | memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ 270 | if (h->n_buckets < new_n_buckets) { /* expand */ \ 271 | khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ 272 | if (!new_keys) { kfree(new_flags); return -1; } \ 273 | h->keys = new_keys; \ 274 | if (kh_is_map) { \ 275 | khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ 276 | if (!new_vals) { kfree(new_flags); return -1; } \ 277 | h->vals = new_vals; \ 278 | } \ 279 | } /* otherwise shrink */ \ 280 | } \ 281 | } \ 282 | if (j) { /* rehashing is needed */ \ 283 | for (j = 0; j != h->n_buckets; ++j) { \ 284 | if (__ac_iseither(h->flags, j) == 0) { \ 285 | khkey_t key = h->keys[j]; \ 286 | khval_t val; \ 287 | khint_t new_mask; \ 288 | new_mask = new_n_buckets - 1; \ 289 | if (kh_is_map) val = h->vals[j]; \ 290 | __ac_set_isdel_true(h->flags, j); \ 291 | while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ 292 | khint_t k, i, step = 0; \ 293 | k = __hash_func(key); \ 294 | i = k & new_mask; \ 295 | while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ 296 | __ac_set_isempty_false(new_flags, i); \ 297 | if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ 298 | { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ 299 | if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ 300 | __ac_set_isdel_true(h->flags, i); /* mark it as deleted in 
the old hash table */ \ 301 | } else { /* write the element and jump out of the loop */ \ 302 | h->keys[i] = key; \ 303 | if (kh_is_map) h->vals[i] = val; \ 304 | break; \ 305 | } \ 306 | } \ 307 | } \ 308 | } \ 309 | if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ 310 | h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ 311 | if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ 312 | } \ 313 | kfree(h->flags); /* free the working space */ \ 314 | h->flags = new_flags; \ 315 | h->n_buckets = new_n_buckets; \ 316 | h->n_occupied = h->size; \ 317 | h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ 318 | } \ 319 | return 0; \ 320 | } \ 321 | SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ 322 | { \ 323 | khint_t x; \ 324 | if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ 325 | if (h->n_buckets > (h->size<<1)) { \ 326 | if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ 327 | *ret = -1; return h->n_buckets; \ 328 | } \ 329 | } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ 330 | *ret = -1; return h->n_buckets; \ 331 | } \ 332 | } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ 333 | { \ 334 | khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ 335 | x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ 336 | if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ 337 | else { \ 338 | last = i; \ 339 | while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ 340 | if (__ac_isdel(h->flags, i)) site = i; \ 341 | i = (i + (++step)) & mask; \ 342 | if (i == last) { x = site; break; } \ 343 | } \ 344 | if (x == h->n_buckets) { \ 345 | if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ 346 | else x = i; \ 347 | } \ 348 | } \ 349 | } \ 
350 | if (__ac_isempty(h->flags, x)) { /* not present at all */ \ 351 | h->keys[x] = key; \ 352 | __ac_set_isboth_false(h->flags, x); \ 353 | ++h->size; ++h->n_occupied; \ 354 | *ret = 1; \ 355 | } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ 356 | h->keys[x] = key; \ 357 | __ac_set_isboth_false(h->flags, x); \ 358 | ++h->size; \ 359 | *ret = 2; \ 360 | } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ 361 | return x; \ 362 | } \ 363 | SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ 364 | { \ 365 | if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ 366 | __ac_set_isdel_true(h->flags, x); \ 367 | --h->size; \ 368 | } \ 369 | } 370 | 371 | #define KHASH_DECLARE(name, khkey_t, khval_t) \ 372 | __KHASH_TYPE(name, khkey_t, khval_t) \ 373 | __KHASH_PROTOTYPES(name, khkey_t, khval_t) 374 | 375 | #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ 376 | __KHASH_TYPE(name, khkey_t, khval_t) \ 377 | __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) 378 | 379 | #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ 380 | KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) 381 | 382 | #define KHASH_TYPE(name, khkey_t, khval_t) \ 383 | __KHASH_TYPE(name, khkey_t, khval_t) 384 | 385 | #define KHASH_IMPL(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ 386 | __KHASH_IMPL(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) 387 | 388 | /* --- BEGIN OF HASH FUNCTIONS --- */ 389 | 390 | /*! @function 391 | @abstract Integer hash function 392 | @param key The integer [khint32_t] 393 | @return The hash value [khint_t] 394 | */ 395 | #define kh_int_hash_func(key) (khint32_t)(key) 396 | /*! @function 397 | @abstract Integer comparison function 398 | */ 399 | #define kh_int_hash_equal(a, b) ((a) == (b)) 400 | /*! 
@function 401 | @abstract 64-bit integer hash function 402 | @param key The integer [khint64_t] 403 | @return The hash value [khint_t] 404 | */ 405 | #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) 406 | /*! @function 407 | @abstract 64-bit integer comparison function 408 | */ 409 | #define kh_int64_hash_equal(a, b) ((a) == (b)) 410 | /*! @function 411 | @abstract const char* hash function 412 | @param s Pointer to a null terminated string 413 | @return The hash value 414 | */ 415 | static kh_inline khint_t __ac_X31_hash_string(const char *s) 416 | { 417 | khint_t h = (khint_t)*s; 418 | if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; 419 | return h; 420 | } 421 | /*! @function 422 | @abstract Another interface to const char* hash function 423 | @param key Pointer to a null terminated string [const char*] 424 | @return The hash value [khint_t] 425 | */ 426 | #define kh_str_hash_func(key) __ac_X31_hash_string(key) 427 | /*! @function 428 | @abstract Const char* comparison function 429 | */ 430 | #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) 431 | 432 | static kh_inline khint_t __ac_Wang_hash(khint_t key) 433 | { 434 | key += ~(key << 15); 435 | key ^= (key >> 10); 436 | key += (key << 3); 437 | key ^= (key >> 6); 438 | key += ~(key << 11); 439 | key ^= (key >> 16); 440 | return key; 441 | } 442 | #define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) 443 | 444 | /* --- END OF HASH FUNCTIONS --- */ 445 | 446 | /* Other convenient macros... */ 447 | 448 | /*! 449 | @abstract Type of the hash table. 450 | @param name Name of the hash table [symbol] 451 | */ 452 | #define khash_t(name) kh_##name##_t 453 | 454 | /*! @function 455 | @abstract Initiate a hash table. 456 | @param name Name of the hash table [symbol] 457 | @return Pointer to the hash table [khash_t(name)*] 458 | */ 459 | #define kh_init(name) kh_init_##name() 460 | 461 | /*! @function 462 | @abstract Initiate a hash table if the in-place case. 
463 | @param name Name of the hash table [symbol] 464 | @param h Pointer to the hash table [khash_t(name)*] 465 | */ 466 | #define kh_init_inplace(name, h) kh_init_##name##_inplace(h) 467 | 468 | /*! @function 469 | @abstract Destroy a hash table. 470 | @param name Name of the hash table [symbol] 471 | @param h Pointer to the hash table [khash_t(name)*] 472 | */ 473 | #define kh_destroy(name, h) kh_destroy_##name(h) 474 | 475 | /*! @function 476 | @abstract Destroy a hash table if the in-place case. 477 | @param name Name of the hash table [symbol] 478 | @param h Pointer to the hash table [khash_t(name)*] 479 | */ 480 | #define kh_destroy_inplace(name, h) kh_destroy_##name##_inplace(h) 481 | 482 | /*! @function 483 | @abstract Reset a hash table without deallocating memory. 484 | @param name Name of the hash table [symbol] 485 | @param h Pointer to the hash table [khash_t(name)*] 486 | */ 487 | #define kh_clear(name, h) kh_clear_##name(h) 488 | 489 | /*! @function 490 | @abstract Resize a hash table. 491 | @param name Name of the hash table [symbol] 492 | @param h Pointer to the hash table [khash_t(name)*] 493 | @param s New size [khint_t] 494 | */ 495 | #define kh_resize(name, h, s) kh_resize_##name(h, s) 496 | 497 | /*! @function 498 | @abstract Insert a key to the hash table. 499 | @param name Name of the hash table [symbol] 500 | @param h Pointer to the hash table [khash_t(name)*] 501 | @param k Key [type of keys] 502 | @param r Extra return code: -1 if the operation failed; 503 | 0 if the key is present in the hash table; 504 | 1 if the bucket is empty (never used); 2 if the element in 505 | the bucket has been deleted [int*] 506 | @return Iterator to the inserted element [khint_t] 507 | */ 508 | #define kh_put(name, h, k, r) kh_put_##name(h, k, r) 509 | 510 | /*! @function 511 | @abstract Retrieve a key from the hash table. 
512 | @param name Name of the hash table [symbol] 513 | @param h Pointer to the hash table [khash_t(name)*] 514 | @param k Key [type of keys] 515 | @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] 516 | */ 517 | #define kh_get(name, h, k) kh_get_##name(h, k) 518 | 519 | /*! @function 520 | @abstract Remove a key from the hash table. 521 | @param name Name of the hash table [symbol] 522 | @param h Pointer to the hash table [khash_t(name)*] 523 | @param k Iterator to the element to be deleted [khint_t] 524 | */ 525 | #define kh_del(name, h, k) kh_del_##name(h, k) 526 | 527 | /*! @function 528 | @abstract Test whether a bucket contains data. 529 | @param h Pointer to the hash table [khash_t(name)*] 530 | @param x Iterator to the bucket [khint_t] 531 | @return 1 if containing data; 0 otherwise [int] 532 | */ 533 | #define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) 534 | 535 | /*! @function 536 | @abstract Get key given an iterator 537 | @param h Pointer to the hash table [khash_t(name)*] 538 | @param x Iterator to the bucket [khint_t] 539 | @return Key [type of keys] 540 | */ 541 | #define kh_key(h, x) ((h)->keys[x]) 542 | 543 | /*! @function 544 | @abstract Get value given an iterator 545 | @param h Pointer to the hash table [khash_t(name)*] 546 | @param x Iterator to the bucket [khint_t] 547 | @return Value [type of values] 548 | @discussion For hash sets, calling this results in segfault. 549 | */ 550 | #define kh_val(h, x) ((h)->vals[x]) 551 | 552 | /*! @function 553 | @abstract Alias of kh_val() 554 | */ 555 | #define kh_value(h, x) ((h)->vals[x]) 556 | 557 | /*! @function 558 | @abstract Get the start iterator 559 | @param h Pointer to the hash table [khash_t(name)*] 560 | @return The start iterator [khint_t] 561 | */ 562 | #define kh_begin(h) (khint_t)(0) 563 | 564 | /*! 
@function 565 | @abstract Get the end iterator 566 | @param h Pointer to the hash table [khash_t(name)*] 567 | @return The end iterator [khint_t] 568 | */ 569 | #define kh_end(h) ((h)->n_buckets) 570 | 571 | /*! @function 572 | @abstract Get the number of elements in the hash table 573 | @param h Pointer to the hash table [khash_t(name)*] 574 | @return Number of elements in the hash table [khint_t] 575 | */ 576 | #define kh_size(h) ((h)->size) 577 | 578 | /*! @function 579 | @abstract Get the number of buckets in the hash table 580 | @param h Pointer to the hash table [khash_t(name)*] 581 | @return Number of buckets in the hash table [khint_t] 582 | */ 583 | #define kh_n_buckets(h) ((h)->n_buckets) 584 | 585 | /*! @function 586 | @abstract Iterate over the entries in the hash table 587 | @param h Pointer to the hash table [khash_t(name)*] 588 | @param kvar Variable to which key will be assigned 589 | @param vvar Variable to which value will be assigned 590 | @param code Block of code to execute 591 | */ 592 | #define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ 593 | for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ 594 | if (!kh_exist(h,__i)) continue; \ 595 | (kvar) = kh_key(h,__i); \ 596 | (vvar) = kh_val(h,__i); \ 597 | code; \ 598 | } } 599 | 600 | /*! @function 601 | @abstract Iterate over the keys in the hash table 602 | @param h Pointer to the hash table [khash_t(name)*] 603 | @param kvar Variable to which key will be assigned 604 | @param code Block of code to execute 605 | */ 606 | #define kh_foreach_key(h, kvar, code) { khint_t __i; \ 607 | for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ 608 | if (!kh_exist(h,__i)) continue; \ 609 | (kvar) = kh_key(h,__i); \ 610 | code; \ 611 | } } 612 | 613 | /*! 
@function 614 | @abstract Iterate over the values in the hash table 615 | @param h Pointer to the hash table [khash_t(name)*] 616 | @param vvar Variable to which value will be assigned 617 | @param code Block of code to execute 618 | */ 619 | #define kh_foreach_value(h, vvar, code) { khint_t __i; \ 620 | for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ 621 | if (!kh_exist(h,__i)) continue; \ 622 | (vvar) = kh_val(h,__i); \ 623 | code; \ 624 | } } 625 | 626 | /* More convenient interfaces */ 627 | 628 | /*! @function 629 | @abstract Instantiate a hash set containing integer keys 630 | @param name Name of the hash table [symbol] 631 | */ 632 | #define KHASH_SET_INIT_INT(name) \ 633 | KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) 634 | 635 | /*! @function 636 | @abstract Instantiate a hash map containing integer keys 637 | @param name Name of the hash table [symbol] 638 | @param khval_t Type of values [type] 639 | */ 640 | #define KHASH_MAP_INIT_INT(name, khval_t) \ 641 | KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) 642 | 643 | /*! @function 644 | @abstract Instantiate a hash set containing 64-bit integer keys 645 | @param name Name of the hash table [symbol] 646 | */ 647 | #define KHASH_SET_INIT_INT64(name) \ 648 | KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) 649 | 650 | /*! @function 651 | @abstract Instantiate a hash map containing 64-bit integer keys 652 | @param name Name of the hash table [symbol] 653 | @param khval_t Type of values [type] 654 | */ 655 | #define KHASH_MAP_INIT_INT64(name, khval_t) \ 656 | KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) 657 | 658 | typedef const char *kh_cstr_t; 659 | /*! 
@function 660 | @abstract Instantiate a hash set containing const char* keys 661 | @param name Name of the hash table [symbol] 662 | */ 663 | #define KHASH_SET_INIT_STR(name) \ 664 | KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) 665 | 666 | /*! @function 667 | @abstract Instantiate a hash map containing const char* keys 668 | @param name Name of the hash table [symbol] 669 | @param khval_t Type of values [type] 670 | */ 671 | #define KHASH_MAP_INIT_STR(name, khval_t) \ 672 | KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) 673 | 674 | #endif /* __AC_KHASH_H */ 675 | -------------------------------------------------------------------------------- /map_pci_nic_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2019 Mellanox Technologies. All rights reserved. 4 | # 5 | # This Software is licensed under one of the following licenses: 6 | # 7 | # 1) under the terms of the "Common Public License 1.0" a copy of which is 8 | # available from the Open Source Initiative, see 9 | # http://www.opensource.org/licenses/cpl.php. 10 | # 11 | # 2) under the terms of the "The BSD License" a copy of which is 12 | # available from the Open Source Initiative, see 13 | # http://www.opensource.org/licenses/bsd-license.php. 14 | # 15 | # 3) under the terms of the "GNU General Public License (GPL) Version 2" a 16 | # copy of which is available from the Open Source Initiative, see 17 | # http://www.opensource.org/licenses/gpl-license.php. 18 | # 19 | # Licensee has the right to choose one of the above licenses. 20 | # 21 | # Redistributions of source code must retain the above copyright 22 | # notice and one of the license notices. 23 | # 24 | # Redistributions in binary form must reproduce both the above copyright 25 | # notice, one of the license notices in the documentation 26 | # and/or other materials provided with the distribution. 
27 | # 28 | # Author: Alex Rosenbaum 29 | # 30 | 31 | if [[ debug == "$1" ]]; then 32 | INSTRUMENTING=yes # any non-null will do 33 | shift 34 | fi 35 | echodbg () { 36 | [[ "$INSTRUMENTING" ]] && builtin echo $@ 37 | } 38 | 39 | DEVS=$1 40 | if [ -z "$DEVS" ] ; then 41 | DEVS=$(ls /sys/class/infiniband/) 42 | fi 43 | 44 | for dev in $DEVS ; do 45 | #echo -e "dev=$dev" 46 | for port in $(ls /sys/class/infiniband/$dev/ports/) ; do 47 | #echo -e " port=$port" 48 | ll=$(cat /sys/class/infiniband/$dev/ports/$port/link_layer); 49 | #echo -e " ll=$ll" 50 | if [ $ll = "Ethernet" ] ; then 51 | ndev=$(cat /sys/class/infiniband/$dev/ports/$port/gid_attrs/ndevs/0) 52 | ipaddr=$(ip -f inet addr show $ndev | grep -Po 'inet \K[\d.]+') 53 | if [ -z "$ipaddr" ] ; then 54 | ipaddr="[no ip addr]" 55 | fi 56 | #echo -e "dev=$dev\tport=$port\tll=$ll\tndev=$ndev\tipaddr=$ipaddr" 57 | mlx_pci_dev_path=$(readlink -f /sys/class/infiniband/$dev/device) 58 | mlx_pci_dev=${mlx_pci_dev_path##*/} 59 | mlx_pci_br_path=$(dirname $(dirname $mlx_pci_dev_path)) 60 | mlx_pci_br=${mlx_pci_br_path##*/} 61 | echodbg -e "dev=$dev\tport=$port\tll=$ll\tndev=$ndev\tipaddr=$ipaddr\tpci_dev=${mlx_pci_dev_path##*/}\tpci_br=$mlx_pci_br" 62 | 63 | for pci in $(ls /sys/class/pci_bus/) ; do 64 | #echo -e "pci: $pci" 65 | pci_dev_path=$(readlink -f /sys/class/pci_bus/$pci) 66 | #echo -e "dev_path: $pci_dev_path" 67 | pci_br_path=$(dirname $(dirname $(dirname $pci_dev_path))) 68 | pci_br=${pci_br_path##*/} 69 | #echo -e "br_path: $pci_br_path '$pci_br'" 70 | if [ $mlx_pci_br = $pci_br ] ; then #same pci bridge 71 | if [ ${mlx_pci_dev:0:7} = $pci ] ; then 72 | #echo -e "ALEXR same pci dev ${mlx_pci_dev:0:7} $pci" 73 | continue 74 | fi 75 | pci_str=$(lspci -D -s $pci:00.0) 76 | if [ -z "$pci_str" ] ; then 77 | pci_str="$pci:00.0" 78 | fi 79 | echo -e "$ipaddr ($dev) is near $pci_str" 80 | fi 81 | done #pci 82 | fi 83 | done #port 84 | done #dev 85 | 86 | 
-------------------------------------------------------------------------------- /server.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 
31 | */ 32 | 33 | #define _GNU_SOURCE 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | 48 | #include "utils.h" 49 | #include "gpu_mem_util.h" 50 | #include "gpu_direct_rdma_access.h" 51 | 52 | #define MAX_SGES 512 53 | #define ACK_MSG "rdma_task completed" 54 | #define PACKAGE_TYPES 2 55 | 56 | extern int debug; 57 | extern int debug_fast_path; 58 | 59 | #define DEBUG_LOG if (debug) printf 60 | #define DEBUG_LOG_FAST_PATH if (debug_fast_path) printf 61 | #define FDEBUG_LOG if (debug) fprintf 62 | #define FDEBUG_LOG_FAST_PATH if (debug_fast_path) sprintf 63 | #define SDEBUG_LOG if (debug) fprintf 64 | #define SDEBUG_LOG_FAST_PATH if (debug_fast_path) sprintf 65 | 66 | struct user_params { 67 | 68 | int persistent; 69 | int port; 70 | unsigned long size; 71 | int iters; 72 | int num_sges; 73 | struct sockaddr hostaddr; 74 | }; 75 | 76 | static volatile int keep_running = 1; 77 | 78 | void sigint_handler(int dummy) 79 | { 80 | keep_running = 0; 81 | } 82 | 83 | /**************************************************************************************** 84 | * Open temporary socket connection on the server side, listening to the client. 85 | * Accepting connection from the client and closing temporary socket. 
86 | * If success, return the accepted socket file descriptor ID 87 | * Return value: socket fd - success, -1 - error 88 | ****************************************************************************************/ 89 | static int open_server_socket(int port) 90 | { 91 | struct addrinfo *res, *t; 92 | struct addrinfo hints = { 93 | .ai_flags = AI_PASSIVE, 94 | .ai_family = AF_UNSPEC, 95 | .ai_socktype = SOCK_STREAM 96 | }; 97 | char *service; 98 | int ret_val; 99 | int sockfd; 100 | int tmp_sockfd = -1; 101 | 102 | ret_val = asprintf(&service, "%d", port); 103 | if (ret_val < 0) 104 | return -1; 105 | 106 | ret_val = getaddrinfo(NULL, service, &hints, &res); 107 | if (ret_val < 0) { 108 | fprintf(stderr, "%s for port %d\n", gai_strerror(ret_val), port); 109 | free(service); 110 | return -1; 111 | } 112 | 113 | for (t = res; t; t = t->ai_next) { 114 | tmp_sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); 115 | if (tmp_sockfd >= 0) { 116 | int optval = 1; 117 | 118 | setsockopt(tmp_sockfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof optval); 119 | 120 | if (!bind(tmp_sockfd, t->ai_addr, t->ai_addrlen)) 121 | break; 122 | close(tmp_sockfd); 123 | tmp_sockfd = -1; 124 | } 125 | } 126 | 127 | freeaddrinfo(res); 128 | free(service); 129 | 130 | if (tmp_sockfd < 0) { 131 | fprintf(stderr, "Couldn't listen to port %d\n", port); 132 | return -1; 133 | } 134 | 135 | listen(tmp_sockfd, 1); 136 | sockfd = accept(tmp_sockfd, NULL, 0); 137 | close(tmp_sockfd); 138 | if (sockfd < 0) { 139 | fprintf(stderr, "accept() failed\n"); 140 | return -1; 141 | } 142 | 143 | return sockfd; 144 | } 145 | 146 | static void usage(const char *argv0) 147 | { 148 | printf("Usage:\n"); 149 | printf(" %s start a server and wait for connection\n", argv0); 150 | printf("\n"); 151 | printf("Options:\n"); 152 | printf(" -P, --persistent server waits for additional client connections after tranfer is completed\n"); 153 | printf(" -a, --addr= ip address of the local host net device 
(mandatory)\n"); 154 | printf(" -p, --port= listen on/connect to port (default 18515)\n"); 155 | printf(" -s, --size= size of message to exchange (default 4096)\n"); 156 | printf(" -n, --iters= number of exchanges (default 1000)\n"); 157 | printf(" -l, --sg_list-len= number of sge-s to send in sg_list (default 0 - old mode)\n"); 158 | printf(" -D, --debug-mask= debug bitmask: bit 0 - debug print enable,\n" 159 | " bit 1 - fast path debug print enable\n"); 160 | } 161 | 162 | static int parse_command_line(int argc, char *argv[], struct user_params *usr_par) 163 | { 164 | memset(usr_par, 0, sizeof *usr_par); 165 | /*Set defaults*/ 166 | usr_par->port = 18515; 167 | usr_par->size = 4096; 168 | usr_par->iters = 1000; 169 | 170 | while (1) { 171 | int c; 172 | 173 | static struct option long_options[] = { 174 | { .name = "persistent", .has_arg = 0, .val = 'P' }, 175 | { .name = "addr", .has_arg = 1, .val = 'a' }, 176 | { .name = "port", .has_arg = 1, .val = 'p' }, 177 | { .name = "size", .has_arg = 1, .val = 's' }, 178 | { .name = "iters", .has_arg = 1, .val = 'n' }, 179 | { .name = "sg_list-len", .has_arg = 1, .val = 'l' }, 180 | { .name = "debug-mask", .has_arg = 1, .val = 'D' }, 181 | { 0 } 182 | }; 183 | 184 | c = getopt_long(argc, argv, "Pa:p:s:n:l:D:", 185 | long_options, NULL); 186 | 187 | if (c == -1) 188 | break; 189 | 190 | switch (c) { 191 | 192 | case 'P': 193 | usr_par->persistent = 1; 194 | break; 195 | 196 | case 'a': 197 | get_addr(optarg, (struct sockaddr *) &usr_par->hostaddr); 198 | break; 199 | 200 | case 'p': 201 | usr_par->port = strtol(optarg, NULL, 0); 202 | if (usr_par->port < 0 || usr_par->port > 65535) { 203 | usage(argv[0]); 204 | return 1; 205 | } 206 | break; 207 | 208 | case 's': 209 | usr_par->size = strtol(optarg, NULL, 0); 210 | break; 211 | 212 | case 'n': 213 | usr_par->iters = strtol(optarg, NULL, 0); 214 | break; 215 | 216 | case 'l': 217 | usr_par->num_sges = strtol(optarg, NULL, 0); 218 | break; 219 | 220 | case 'D': 221 | debug = 
(strtol(optarg, NULL, 0) >> 0) & 1; /*bit 0*/ 222 | debug_fast_path = (strtol(optarg, NULL, 0) >> 1) & 1; /*bit 1*/ 223 | break; 224 | 225 | default: 226 | usage(argv[0]); 227 | return 1; 228 | } 229 | } 230 | 231 | if (optind < argc) { 232 | usage(argv[0]); 233 | return 1; 234 | } 235 | 236 | return 0; 237 | } 238 | 239 | int main(int argc, char *argv[]) 240 | { 241 | struct rdma_device *rdma_dev; 242 | struct timeval start; 243 | int cnt = 0; 244 | struct user_params usr_par; 245 | int ret_val = 0; 246 | int sockfd; 247 | struct iovec buf_iovec[MAX_SGES]; 248 | 249 | srand48(getpid() * time(NULL)); 250 | 251 | ret_val = parse_command_line(argc, argv, &usr_par); 252 | if (ret_val) { 253 | return ret_val; 254 | } 255 | 256 | rdma_dev = rdma_open_device_server(&usr_par.hostaddr); 257 | if (!rdma_dev) { 258 | ret_val = 1; 259 | return ret_val; 260 | } 261 | 262 | /* Local memory buffer allocation */ 263 | /* On the server side, we allocate buffer on CPU and not on GPU */ 264 | void *buff = work_buffer_alloc(usr_par.size, 0 /*use_cuda*/, NULL); 265 | if (!buff) { 266 | ret_val = 1; 267 | goto clean_device; 268 | } 269 | 270 | /* RDMA buffer registration */ 271 | struct rdma_buffer *rdma_buff; 272 | 273 | rdma_buff = rdma_buffer_reg(rdma_dev, buff, usr_par.size); 274 | if (!rdma_buff) { 275 | ret_val = 1; 276 | goto clean_mem_buff; 277 | } 278 | 279 | struct sigaction act; 280 | act.sa_handler = sigint_handler; 281 | sigaction(SIGINT, &act, NULL); 282 | 283 | sock_listen: 284 | printf("Listening to remote client...\n"); 285 | sockfd = open_server_socket(usr_par.port); 286 | if (sockfd < 0) { 287 | goto clean_rdma_buff; 288 | } 289 | printf("Connection accepted.\n"); 290 | 291 | if (gettimeofday(&start, NULL)) { 292 | perror("gettimeofday"); 293 | ret_val = 1; 294 | goto clean_socket; 295 | } 296 | 297 | /**************************************************************************************************** 298 | * The main loop where we client and server send and receive 
"iters" number of messages 299 | */ 300 | for (cnt = 0; cnt < usr_par.iters && keep_running; cnt++) { 301 | 302 | int r_size; 303 | char desc_str[sizeof "0102030405060708:01020304:01020304:0102:010203:1:0102030405060708090a0b0c0d0e0f10"]; 304 | char ackmsg[sizeof ACK_MSG]; 305 | struct rdma_task_attr task_attr; 306 | int i; 307 | uint32_t flags; /* Use enum rdma_task_attr_flags */ 308 | // payload attrs 309 | uint8_t pl_type; 310 | uint16_t pl_size; 311 | //int expected_comp_events = usr_par.num_sges? (usr_par.num_sges+MAX_SEND_SGE-1)/MAX_SEND_SGE: 1; 312 | 313 | for (i = 0; i < PACKAGE_TYPES; i++) { 314 | r_size = recv(sockfd, &pl_type, sizeof(pl_type), MSG_WAITALL); 315 | r_size = recv(sockfd, &pl_size, sizeof(pl_size), MSG_WAITALL); 316 | switch (pl_type) { 317 | case 0: // RDMA_BUF_DESC 318 | /* Receiving RDMA data (address, size, rkey etc.) from socket as a triger to start RDMA Read/Write operation */ 319 | DEBUG_LOG_FAST_PATH("Iteration %d: Waiting to Receive message of size %lu\n", cnt, sizeof desc_str); 320 | r_size = recv(sockfd, desc_str, pl_size * sizeof(char), MSG_WAITALL); 321 | if (r_size != sizeof desc_str) { 322 | fprintf(stderr, "FAILURE: Couldn't receive RDMA data for iteration %d (errno=%d '%m')\n", cnt, errno); 323 | ret_val = 1; 324 | goto clean_socket; 325 | } 326 | break; 327 | case 1: // TASK_ATTRS 328 | /* Receiving rw attr flags */; 329 | int s = pl_size * sizeof(char); 330 | char t[16]; 331 | r_size = recv(sockfd, &t, s, MSG_WAITALL); 332 | if (r_size != s) { 333 | fprintf(stderr, "FAILURE: Couldn't receive RDMA data for iteration %d (errno=%d '%m')\n", cnt, errno); 334 | ret_val = 1; 335 | goto clean_socket; 336 | } 337 | sscanf(t, "%08x", &flags); 338 | break; 339 | } 340 | } 341 | 342 | DEBUG_LOG_FAST_PATH("Received message \"%s\"\n", desc_str); 343 | memset(&task_attr, 0, sizeof task_attr); 344 | task_attr.remote_buf_desc_str = desc_str; 345 | task_attr.remote_buf_desc_length = sizeof desc_str; 346 | task_attr.local_buf_rdma = 
rdma_buff; 347 | task_attr.flags = flags; 348 | task_attr.wr_id = cnt;// * expected_comp_events; 349 | 350 | /* Executing RDMA read */ 351 | SDEBUG_LOG_FAST_PATH ((char*)buff, "Read iteration N %d", cnt); 352 | /* Prepare send sg_list */ 353 | if (usr_par.num_sges) { 354 | if (usr_par.num_sges > MAX_SGES) { 355 | fprintf(stderr, "WARN: num_sges %d is too big (max=%d)\n", usr_par.num_sges, MAX_SGES); 356 | ret_val = 1; 357 | goto clean_socket; 358 | } 359 | memset(buf_iovec, 0, sizeof buf_iovec); 360 | task_attr.local_buf_iovcnt = usr_par.num_sges; 361 | task_attr.local_buf_iovec = buf_iovec; 362 | 363 | size_t portion_size; 364 | portion_size = (usr_par.size / usr_par.num_sges) & 0xFFFFFFC0; /* 64 byte aligned */ 365 | for (i = 0; i < usr_par.num_sges; i++) { 366 | buf_iovec[i].iov_base = buff + (i * portion_size); 367 | buf_iovec[i].iov_len = portion_size; 368 | } 369 | } 370 | ret_val = rdma_submit_task(&task_attr); 371 | if (ret_val) { 372 | goto clean_socket; 373 | } 374 | 375 | /* Completion queue polling loop */ 376 | DEBUG_LOG_FAST_PATH("Polling completion queue\n"); 377 | struct rdma_completion_event rdma_comp_ev[10]; 378 | int reported_ev = 0; 379 | do { 380 | reported_ev += rdma_poll_completions(rdma_dev, &rdma_comp_ev[reported_ev], 10/*expected_comp_events-reported_ev*/); 381 | //TODO - we can put sleep here 382 | } while (reported_ev < 1 && keep_running /*expected_comp_events*/); 383 | DEBUG_LOG_FAST_PATH("Finished polling\n"); 384 | 385 | for (i = 0; i < reported_ev; ++i) { 386 | if (rdma_comp_ev[i].status != IBV_WC_SUCCESS) { 387 | fprintf(stderr, "FAILURE: status \"%s\" (%d) for wr_id %d\n", 388 | ibv_wc_status_str(rdma_comp_ev[i].status), 389 | rdma_comp_ev[i].status, (int) rdma_comp_ev[i].wr_id); 390 | ret_val = 1; 391 | if (usr_par.persistent && keep_running) { 392 | rdma_reset_device(rdma_dev); 393 | } 394 | goto clean_socket; 395 | } 396 | } 397 | 398 | // Sending ack-message to the client, confirming that RDMA read/write has been completet 399 
| if (write(sockfd, ACK_MSG, sizeof(ACK_MSG)) != sizeof(ACK_MSG)) { 400 | fprintf(stderr, "FAILURE: Couldn't send \"%c\" msg (errno=%d '%m')\n", ACK_MSG, errno); 401 | ret_val = 1; 402 | goto clean_socket; 403 | } 404 | } 405 | /****************************************************************************************************/ 406 | 407 | ret_val = print_run_time(start, usr_par.size, usr_par.iters); 408 | if (ret_val) { 409 | goto clean_socket; 410 | } 411 | 412 | clean_socket: 413 | close(sockfd); 414 | if (usr_par.persistent && keep_running) 415 | goto sock_listen; 416 | 417 | clean_rdma_buff: 418 | rdma_buffer_dereg(rdma_buff); 419 | 420 | clean_mem_buff: 421 | work_buffer_free(buff, 0); 422 | 423 | clean_device: 424 | rdma_close_device(rdma_dev); 425 | 426 | return ret_val; 427 | } 428 | -------------------------------------------------------------------------------- /utils.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 
22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 31 | */ 32 | 33 | #define _GNU_SOURCE 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | #include "utils.h" 43 | 44 | int get_addr(char *dst, struct sockaddr *addr) 45 | { 46 | struct addrinfo *res; 47 | int ret; 48 | 49 | ret = getaddrinfo(dst, NULL, NULL, &res); 50 | if (ret) { 51 | printf("getaddrinfo failed (%s) - invalid hostname or IP address\n", gai_strerror(ret)); 52 | return ret; 53 | } 54 | 55 | if (res->ai_family == PF_INET) 56 | memcpy(addr, res->ai_addr, sizeof(struct sockaddr_in)); 57 | else if (res->ai_family == PF_INET6) 58 | memcpy(addr, res->ai_addr, sizeof(struct sockaddr_in6)); 59 | else 60 | ret = -1; 61 | 62 | freeaddrinfo(res); 63 | return ret; 64 | } 65 | 66 | int print_run_time(struct timeval start, unsigned long size, int iters) 67 | { 68 | struct timeval end; 69 | float usec; 70 | long long bytes; 71 | 72 | if (gettimeofday(&end, NULL)) { 73 | perror("gettimeofday"); 74 | return 1; 75 | } 76 | 77 | usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); 78 | bytes = (long long) size * iters; 79 | 80 | printf("%lld bytes in %.2f seconds = %.2f Mbit/sec\n", 81 | bytes, usec / 1000000., bytes * 8. 
/ usec); 82 | printf("%d iters in %.2f seconds = %.2f usec/iter\n", 83 | iters, usec / 1000000., usec / iters); 84 | return 0; 85 | 86 | } 87 | 88 | -------------------------------------------------------------------------------- /utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 3 | * 4 | * This software is available to you under a choice of one of two 5 | * licenses. You may choose to be licensed under the terms of the GNU 6 | * General Public License (GPL) Version 2, available from the file 7 | * COPYING in the main directory of this source tree, or the 8 | * OpenIB.org BSD license below: 9 | * 10 | * Redistribution and use in source and binary forms, with or 11 | * without modification, are permitted provided that the following 12 | * conditions are met: 13 | * 14 | * - Redistributions of source code must retain the above 15 | * copyright notice, this list of conditions and the following 16 | * disclaimer. 17 | * 18 | * - Redistributions in binary form must reproduce the above 19 | * copyright notice, this list of conditions and the following 20 | * disclaimer in the documentation and/or other materials 21 | * provided with the distribution. 22 | * 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | * SOFTWARE. 
31 | */ 32 | 33 | #ifndef _UTILS_H_ 34 | #define _UTILS_H_ 35 | 36 | #ifdef __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | /* 41 | * Convert a hostname or IP address string to a sockaddr (IPv4 or IPv6). 42 | * 43 | * returns: 0 on success or non-zero on error (a getaddrinfo() error code, or -1 if the resolved address family is neither AF_INET nor AF_INET6) 44 | */ 45 | int get_addr(char *dst, struct sockaddr *addr); 46 | 47 | /* 48 | * Print program run time. 49 | * 50 | * returns: 0 on success or 1 on error 51 | */ 52 | int print_run_time(struct timeval start, unsigned long size, int iters); 53 | 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | 59 | #endif /* _UTILS_H_ */ 60 | 61 | --------------------------------------------------------------------------------