├── .gitignore ├── README.md ├── .vscode └── c_cpp_properties.json ├── Makefile ├── install-deps.sh ├── data ├── fi_rma_bw.txt ├── lspci.txt └── ls_sys_bus_pci_devices.txt └── src ├── 8_topo.cpp ├── 4_hello.cpp ├── 5_reverse.cpp ├── 6_write.cpp ├── 7_queue.cpp └── 9_multinet.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/settings.json 2 | build 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # libfabric-efa-demo 2 | 3 | * Tutorial: [Harnessing 3200 Gbps Network: A Journey with RDMA, EFA, and libfabric](https://le.qun.ch/en/blog/2024/12/25/libfabric-efa-0-intro/) 4 | * 中文博客:[驾驭3200Gbps网络](https://abcdabcd987.com/2024/12/25/libfabric-efa-0-intro/) | [知乎](https://zhuanlan.zhihu.com/p/14925828538) 5 | -------------------------------------------------------------------------------- /.vscode/c_cpp_properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "Linux", 5 | "includePath": [ 6 | "${workspaceFolder}/build/libfabric/include", 7 | "/usr/local/cuda/include" 8 | ], 9 | "defines": [], 10 | "compilerPath": "/usr/bin/gcc", 11 | "cStandard": "c17", 12 | "cppStandard": "c++17", 13 | "intelliSenseMode": "linux-gcc-x64" 14 | } 15 | ], 16 | "version": 4 17 | } 18 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -Wall -Werror -std=c++17 -march=native -O2 -g -Ibuild/libfabric/include -I/usr/local/cuda/include 3 | LDFLAGS = -Lbuild/libfabric/lib -L/usr/local/cuda/lib64 4 | LDLIBS = -lfabric -lpthread -lcudart -lcuda 5 | BINARIES = build/4_hello \ 6 | build/5_reverse \ 7 | build/6_write \ 8 | build/7_queue \ 9 | build/8_topo \ 10 | 
build/9_multinet \ 11 | build/10_warmup \ 12 | build/11_multithread \ 13 | build/12_pin \ 14 | build/13_shard \ 15 | build/14_batch \ 16 | build/15_lazy 17 | 18 | export LD_LIBRARY_PATH := $(PWD)/build/libfabric/lib:$(LD_LIBRARY_PATH) 19 | 20 | .PHONY: all clean 21 | 22 | all: $(BINARIES) 23 | 24 | clean: 25 | rm -rf $(BINARIES) 26 | 27 | build/%: src/%.cpp build/libfabric/lib/libfabric.so 28 | $(CXX) $(CXXFLAGS) -o $@ $< $(LDFLAGS) $(LDLIBS) 29 | -------------------------------------------------------------------------------- /install-deps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | mkdir -p build 5 | cd build 6 | BUILD_DIR=$(pwd) 7 | 8 | # RDMA Core 9 | sudo apt-get install -y rdma-core 10 | 11 | # GDRCopy 12 | wget -O gdrcopy-2.4.4.tar.gz https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.4.4.tar.gz 13 | tar xf gdrcopy-2.4.4.tar.gz 14 | cd gdrcopy-2.4.4/ 15 | make prefix="$BUILD_DIR/gdrcopy" \ 16 | CUDA=/usr/local/cuda \ 17 | -j$(nproc --all) all install 18 | cd .. 19 | export LD_LIBRARY_PATH="$BUILD_DIR/gdrcopy/lib:$LD_LIBRARY_PATH" 20 | 21 | # libfabric 22 | wget https://github.com/ofiwg/libfabric/releases/download/v2.0.0/libfabric-2.0.0.tar.bz2 23 | tar xf libfabric-2.0.0.tar.bz2 24 | cd libfabric-2.0.0 25 | ./configure --prefix="$BUILD_DIR/libfabric" \ 26 | --with-cuda=/usr/local/cuda \ 27 | --with-gdrcopy="$BUILD_DIR/gdrcopy" 28 | make -j$(nproc --all) 29 | make install 30 | cd .. 31 | export LD_LIBRARY_PATH="$BUILD_DIR/libfabric/lib:$LD_LIBRARY_PATH" 32 | 33 | # fabtests 34 | wget https://github.com/ofiwg/libfabric/releases/download/v2.0.0/fabtests-2.0.0.tar.bz2 35 | tar xf fabtests-2.0.0.tar.bz2 36 | cd fabtests-2.0.0 37 | ./configure --prefix="$BUILD_DIR/fabtests" \ 38 | --with-cuda=/usr/local/cuda \ 39 | --with-libfabric="$BUILD_DIR/libfabric" 40 | make -j$(nproc --all) 41 | make install 42 | cd .. 
43 | -------------------------------------------------------------------------------- /data/fi_rma_bw.txt: -------------------------------------------------------------------------------- 1 | bytes iters total time MB/sec usec/xfer Mxfers/sec 2 | 1 20k 19k 0.03s 0.75 1.33 0.75 3 | 2 20k 39k 0.03s 1.58 1.27 0.79 4 | 3 20k 58k 0.03s 2.36 1.27 0.79 5 | 4 20k 78k 0.03s 3.18 1.26 0.79 6 | 6 20k 117k 0.03s 4.72 1.27 0.79 7 | 8 20k 156k 0.03s 6.37 1.26 0.80 8 | 12 20k 234k 0.03s 9.48 1.27 0.79 9 | 16 20k 312k 0.03s 12.66 1.26 0.79 10 | 24 20k 468k 0.03s 19.18 1.25 0.80 11 | 32 20k 625k 0.03s 24.78 1.29 0.77 12 | 48 20k 937k 0.02s 38.75 1.24 0.81 13 | 64 20k 1.2m 0.02s 51.87 1.23 0.81 14 | 96 20k 1.8m 0.02s 77.48 1.24 0.81 15 | 128 20k 2.4m 0.02s 103.46 1.24 0.81 16 | 192 20k 3.6m 0.02s 155.55 1.23 0.81 17 | 256 20k 4.8m 0.02s 206.08 1.24 0.80 18 | 384 20k 7.3m 0.02s 307.40 1.25 0.80 19 | 512 20k 9.7m 0.02s 412.82 1.24 0.81 20 | 768 20k 14m 0.03s 614.11 1.25 0.80 21 | 1k 20k 19m 0.03s 814.12 1.26 0.80 22 | 1.5k 20k 29m 0.03s 1212.41 1.27 0.79 23 | 2k 20k 39m 0.03s 1590.49 1.29 0.78 24 | 3k 20k 58m 0.03s 2340.84 1.31 0.76 25 | 4k 20k 78m 0.03s 3068.05 1.34 0.75 26 | 6k 20k 117m 0.03s 4462.85 1.38 0.73 27 | 8k 20k 156m 0.03s 5440.12 1.51 0.66 28 | 12k 20k 234m 0.04s 6770.81 1.81 0.55 29 | 16k 20k 312m 0.04s 7921.86 2.07 0.48 30 | 24k 20k 468m 0.06s 8319.43 2.95 0.34 31 | 32k 20k 625m 0.07s 9470.25 3.46 0.29 32 | 48k 20k 937m 0.10s 10127.33 4.85 0.21 33 | 64k 2k 125m 0.01s 10465.67 6.26 0.16 34 | 96k 2k 187m 0.02s 10885.78 9.03 0.11 35 | 128k 2k 250m 0.03s 10215.66 12.83 0.08 36 | 192k 2k 375m 0.03s 11301.58 17.40 0.06 37 | 256k 2k 500m 0.05s 11501.58 22.79 0.04 38 | 384k 2k 750m 0.08s 10123.87 38.84 0.03 39 | 512k 2k 1000m 0.10s 10749.66 48.77 0.02 40 | 768k 2k 1.4g 0.13s 11843.86 66.40 0.02 41 | 1m 200 200m 0.02s 10968.94 95.60 0.01 42 | 1.5m 200 300m 0.03s 10837.99 145.12 0.01 43 | 2m 200 400m 0.04s 10204.87 205.51 0.00 44 | 3m 200 600m 0.06s 10522.41 298.95 0.00 45 | 4m 
200 800m 0.08s 10826.94 387.39 0.00 46 | 6m 200 1.1g 0.11s 11488.20 547.65 0.00 47 | 8m 200 1.5g 0.16s 10504.60 798.57 0.00 48 | -------------------------------------------------------------------------------- /src/8_topo.cpp: -------------------------------------------------------------------------------- 1 | // clang-format off 2 | /* 3 | Example run: 4 | 5 | server$ ./build/8_topo 6 | GPUs: 8, NICs: 32, Total Bandwidth: 3200 Gbps 7 | PCIe Topology: 8 | cuda:0(53:00.0) NUMA0 CPU 6 rdmap79s0 (4f:00.0) rdmap80s0 (50:00.0) rdmap81s0 (51:00.0) rdmap82s0 (52:00.0) 9 | cuda:1(64:00.0) NUMA0 CPU18 rdmap96s0 (60:00.0) rdmap97s0 (61:00.0) rdmap98s0 (62:00.0) rdmap99s0 (63:00.0) 10 | cuda:2(75:00.0) NUMA0 CPU30 rdmap113s0(71:00.0) rdmap114s0(72:00.0) rdmap115s0(73:00.0) rdmap116s0(74:00.0) 11 | cuda:3(86:00.0) NUMA0 CPU42 rdmap130s0(82:00.0) rdmap131s0(83:00.0) rdmap132s0(84:00.0) rdmap133s0(85:00.0) 12 | cuda:4(97:00.0) NUMA1 CPU54 rdmap147s0(93:00.0) rdmap148s0(94:00.0) rdmap149s0(95:00.0) rdmap150s0(96:00.0) 13 | cuda:5(a8:00.0) NUMA1 CPU66 rdmap164s0(a4:00.0) rdmap165s0(a5:00.0) rdmap166s0(a6:00.0) rdmap167s0(a7:00.0) 14 | cuda:6(b9:00.0) NUMA1 CPU78 rdmap181s0(b5:00.0) rdmap182s0(b6:00.0) rdmap183s0(b7:00.0) rdmap184s0(b8:00.0) 15 | cuda:7(ca:00.0) NUMA1 CPU90 rdmap198s0(c6:00.0) rdmap199s0(c7:00.0) rdmap200s0(c8:00.0) rdmap201s0(c9:00.0) 16 | */ 17 | // clang-format on 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | 54 | #define CHECK(stmt) \ 55 | do { \ 56 | if (!(stmt)) { \ 57 | fprintf(stderr, "%s:%d CHECK(%s)\n", __FILE__, __LINE__, 
#stmt); \ 58 | std::exit(1); \ 59 | } \ 60 | } while (0) 61 | 62 | #define FI_CHECK(stmt) \ 63 | do { \ 64 | int rc = (stmt); \ 65 | if (rc) { \ 66 | fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__, \ 67 | #stmt, rc, fi_strerror(-rc)); \ 68 | std::exit(1); \ 69 | } \ 70 | } while (0) 71 | 72 | #define CUDA_CHECK(stmt) \ 73 | do { \ 74 | cudaError_t rc = (stmt); \ 75 | if (rc != cudaSuccess) { \ 76 | fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__, \ 77 | #stmt, rc, cudaGetErrorString(rc)); \ 78 | std::exit(1); \ 79 | } \ 80 | } while (0) 81 | 82 | #define CU_CHECK(stmt) \ 83 | do { \ 84 | CUresult rc = (stmt); \ 85 | if (rc != CUDA_SUCCESS) { \ 86 | const char *err_str; \ 87 | cuGetErrorString(rc, &err_str); \ 88 | fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__, \ 89 | #stmt, rc, err_str); \ 90 | std::exit(1); \ 91 | } \ 92 | } while (0) 93 | 94 | struct PciAddress { 95 | uint16_t domain : 16; 96 | uint8_t bus : 8; 97 | uint8_t device : 5; 98 | uint8_t function : 3; 99 | 100 | static PciAddress Parse(std::string_view addr) { 101 | CHECK(addr.size() == 12); 102 | uint16_t domain; 103 | uint8_t bus, device, function; 104 | CHECK(sscanf(addr.data(), "%hx:%hhx:%hhx.%hhx", &domain, &bus, &device, 105 | &function) == 4); 106 | return PciAddress{domain, bus, device, function}; 107 | } 108 | 109 | uint32_t AsU32() const { return *(uint32_t *)this; } 110 | 111 | friend bool operator==(const PciAddress &lhs, const PciAddress &rhs) { 112 | return lhs.AsU32() == rhs.AsU32(); 113 | } 114 | }; 115 | static_assert(sizeof(PciAddress) == 4); 116 | 117 | namespace std { 118 | template <> struct hash { 119 | size_t operator()(const PciAddress &addr) const { 120 | return hash()(addr.AsU32()); 121 | } 122 | }; 123 | } // namespace std 124 | 125 | struct TopologyGroup { 126 | int cuda_device; 127 | int numa; 128 | int preferred_cpu; 129 | std::vector fi_infos; 130 | std::vector cpus; 131 | }; 132 | 133 | std::vector 
DetectTopo(struct fi_info *info) { 134 | char buf[256]; 135 | int num_gpus = 0; 136 | CUDA_CHECK(cudaGetDeviceCount(&num_gpus)); 137 | std::vector topo_groups(num_gpus); 138 | 139 | int num_cpus = 0; 140 | std::vector> numa_cpus; 141 | for (const auto &entry : std::filesystem::recursive_directory_iterator( 142 | "/sys/devices/system/node/")) { 143 | if (entry.path().filename().string().rfind("node", 0) != 0) { 144 | continue; 145 | } 146 | numa_cpus.emplace_back(); 147 | } 148 | int hardware_concurrency = std::thread::hardware_concurrency(); 149 | for (size_t node_id = 0; node_id < numa_cpus.size(); ++node_id) { 150 | for (int cpu = 0; cpu < hardware_concurrency; ++cpu) { 151 | snprintf(buf, sizeof(buf), 152 | "/sys/devices/system/node/node%zu/cpu%d/" 153 | "topology/thread_siblings_list", 154 | node_id, cpu); 155 | // Filter out hyperthreads 156 | std::ifstream f(buf); 157 | std::string sibling_list; 158 | if (f >> sibling_list) { 159 | int first_sibling; 160 | try { 161 | first_sibling = std::stoi(sibling_list); 162 | } catch (std::invalid_argument &e) { 163 | continue; 164 | } 165 | if (first_sibling == cpu) { 166 | numa_cpus[node_id].push_back(cpu); 167 | } 168 | } 169 | } 170 | std::sort(numa_cpus[node_id].begin(), numa_cpus[node_id].end()); 171 | num_cpus += numa_cpus[node_id].size(); 172 | } 173 | int cpus_per_gpu = num_cpus / num_gpus; 174 | 175 | std::unordered_map pci_parent_map; 176 | for (const auto &entry : 177 | std::filesystem::recursive_directory_iterator("/sys/bus/pci/devices")) { 178 | if (!entry.is_symlink()) { 179 | continue; 180 | } 181 | auto target = std::filesystem::read_symlink(entry.path()); 182 | auto addr_str = target.filename().string(); 183 | auto parent_addr_str = target.parent_path().filename().string(); 184 | CHECK(addr_str.size() == 12); // 0000:51:00.0 185 | if (parent_addr_str.size() != 12) { // 0000:46:01.2 186 | continue; // pci0000:cc 187 | } 188 | auto addr = PciAddress::Parse(addr_str); 189 | auto parent_bus = 
PciAddress::Parse(parent_addr_str); 190 | parent_bus.device = 0; 191 | parent_bus.function = 0; 192 | pci_parent_map[addr] = parent_bus; 193 | } 194 | 195 | std::vector numa_gpu_count(numa_cpus.size()); 196 | std::unordered_map bus_cuda_map; 197 | for (int i = 0; i < num_gpus; ++i) { 198 | cudaDeviceProp prop; 199 | CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); 200 | auto pci_addr = 201 | PciAddress{(uint16_t)prop.pciDomainID, (uint8_t)prop.pciBusID, 202 | (uint8_t)prop.pciDeviceID, 0}; 203 | auto parent_bus = pci_parent_map.at(pci_addr); 204 | bus_cuda_map[parent_bus] = i; 205 | 206 | topo_groups[i].cuda_device = i; 207 | snprintf(buf, sizeof(buf), 208 | "/sys/bus/pci/devices/%04x:%02x:%02x.0/numa_node", 209 | prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); 210 | std::ifstream f(buf); 211 | CHECK(f >> topo_groups[i].numa); 212 | int numa_gpu_idx = numa_gpu_count[topo_groups[i].numa]++; 213 | auto &cpus = numa_cpus[topo_groups[i].numa]; 214 | int cpu_start = cpus_per_gpu * numa_gpu_idx; 215 | CHECK(cpu_start + cpus_per_gpu <= (int)cpus.size()); 216 | topo_groups[i].cpus.assign(cpus.begin() + cpu_start, 217 | cpus.begin() + cpu_start + cpus_per_gpu); 218 | topo_groups[i].preferred_cpu = topo_groups[i].cpus[cpus_per_gpu / 2]; 219 | } 220 | 221 | for (auto *fi = info; fi; fi = fi->next) { 222 | auto &pci = fi->nic->bus_attr->attr.pci; 223 | auto pci_addr = 224 | PciAddress{pci.domain_id, pci.bus_id, pci.device_id, pci.function_id}; 225 | auto parent_bus = pci_parent_map.at(pci_addr); 226 | auto cuda_device = bus_cuda_map.at(parent_bus); 227 | topo_groups[cuda_device].fi_infos.push_back(fi); 228 | } 229 | 230 | return topo_groups; 231 | } 232 | 233 | void PrintTopologyGroups(const std::vector &topo_groups) { 234 | printf("PCIe Topology:\n"); 235 | for (const auto &topo : topo_groups) { 236 | cudaDeviceProp prop; 237 | CUDA_CHECK(cudaGetDeviceProperties(&prop, topo.cuda_device)); 238 | printf(" cuda:%d(%02x:%02x.0)", topo.cuda_device, prop.pciBusID, 239 | 
prop.pciDeviceID); 240 | printf(" NUMA%d", topo.numa); 241 | printf(" CPU%2d", topo.preferred_cpu); 242 | for (auto *fi : topo.fi_infos) { 243 | printf(" %-10s(%02x:%02x.%d)", fi->nic->device_attr->name, 244 | fi->nic->bus_attr->attr.pci.bus_id, 245 | fi->nic->bus_attr->attr.pci.device_id, 246 | fi->nic->bus_attr->attr.pci.function_id); 247 | } 248 | printf("\n"); 249 | } 250 | } 251 | 252 | struct fi_info *GetInfo() { 253 | struct fi_info *hints, *info; 254 | hints = fi_allocinfo(); 255 | hints->caps = FI_MSG | FI_RMA | FI_HMEM | FI_LOCAL_COMM | FI_REMOTE_COMM; 256 | hints->ep_attr->type = FI_EP_RDM; 257 | hints->fabric_attr->prov_name = strdup("efa"); 258 | hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_HMEM | FI_MR_VIRT_ADDR | 259 | FI_MR_ALLOCATED | FI_MR_PROV_KEY; 260 | hints->domain_attr->threading = FI_THREAD_SAFE; 261 | 262 | FI_CHECK(fi_getinfo(FI_VERSION(2, 0), nullptr, nullptr, 0, hints, &info)); 263 | fi_freeinfo(hints); 264 | return info; 265 | } 266 | 267 | int main() { // Counts the NICs returned by GetInfo(), sums their link speeds, then prints the GPU/NIC topology. 268 | struct fi_info *info = GetInfo(); 269 | int num_nets = 0; 270 | size_t total_bw = 0; 271 | for (auto *fi = info; fi; fi = fi->next) { 272 | ++num_nets; 273 | total_bw += fi->nic->link_attr->speed; // add this NIC's own link speed, not the list head's 274 | } 275 | auto topo_groups = DetectTopo(info); 276 | printf("GPUs: %zu, NICs: %d, Total Bandwidth: %.0f Gbps\n", 277 | topo_groups.size(), num_nets, total_bw * 1e-9); 278 | PrintTopologyGroups(topo_groups); 279 | fi_freeinfo(info); 280 | return 0; 281 | } 282 | -------------------------------------------------------------------------------- /src/4_hello.cpp: -------------------------------------------------------------------------------- 1 | // clang-format off 2 | /* 3 | Example run: 4 | 5 | server$ ./build/4_hello 6 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 7 | Run client with the following command: 8 | ./build/4_hello fe800000000000000853f7fffea442e1000000001826f6400000000000000000 9 | ./build/4_hello
fe800000000000000853f7fffea442e1000000001826f6400000000000000000 "anytext" 10 | ------ 11 | Received message (len=13): Hello, world! 12 | Received message (len=7): anytext 13 | ^C 14 | 15 | client$ ./build/4_hello fe800000000000000853f7fffea442e1000000001826f6400000000000000000 16 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 17 | Sent message (len=13): Hello, world! 18 | 19 | client$ ./build/4_hello fe800000000000000853f7fffea442e1000000001826f6400000000000000000 "anytext" 20 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 21 | Sent message (len=7): anytext 22 | */ 23 | // clang-format on 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #define CHECK(stmt) \ 46 | do { \ 47 | if (!(stmt)) { \ 48 | fprintf(stderr, "%s:%d %s\n", __FILE__, __LINE__, #stmt); \ 49 | std::exit(1); \ 50 | } \ 51 | } while (0) 52 | 53 | #define FI_CHECK(stmt) \ 54 | do { \ 55 | int rc = (stmt); \ 56 | if (rc) { \ 57 | fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__, \ 58 | #stmt, rc, fi_strerror(-rc)); \ 59 | std::exit(1); \ 60 | } \ 61 | } while (0) 62 | 63 | constexpr size_t kBufAlign = 128; // EFA alignment requirement 64 | constexpr size_t kMessageBufferSize = 8192; 65 | constexpr size_t kCompletionQueueReadCount = 16; 66 | 67 | struct Buffer; 68 | struct Network; 69 | 70 | struct EfaAddress { 71 | uint8_t bytes[32]; 72 | 73 | explicit EfaAddress(uint8_t bytes[32]) { memcpy(this->bytes, bytes, 32); } 74 | 75 | std::string ToString() const { 76 | char buf[65]; 77 | for (size_t i = 0; i < 32; i++) { 78 | snprintf(buf + 2 * i, 3, "%02x", bytes[i]); 79 | } 80 | return std::string(buf, 64); 81 | } 82 | 83 | static EfaAddress Parse(const std::string &str) { 84 | if (str.size() != 64) { 85 | 
fprintf(stderr, "Unexpected address length %zu\n", str.size()); 86 | std::exit(1); 87 | } 88 | uint8_t bytes[32]; 89 | for (size_t i = 0; i < 32; i++) { 90 | CHECK(sscanf(str.c_str() + 2 * i, "%02hhx", &bytes[i]) == 1); // reject non-hex input instead of copying uninitialized bytes 91 | } 92 | return EfaAddress(bytes); 93 | } 94 | }; 95 | 96 | enum class RdmaOpType : uint8_t { 97 | kRecv = 0, 98 | kSend = 1, 99 | }; 100 | 101 | struct RdmaRecvOp { 102 | Buffer *buf; 103 | fi_addr_t src_addr; // Set after completion 104 | size_t recv_size; // Set after completion 105 | }; 106 | static_assert(std::is_pod_v == true); 107 | 108 | struct RdmaSendOp { 109 | Buffer *buf; 110 | size_t len; 111 | fi_addr_t dest_addr; 112 | }; 113 | static_assert(std::is_pod_v == true); 114 | 115 | struct RdmaOp { 116 | RdmaOpType type; 117 | union { 118 | RdmaRecvOp recv; 119 | RdmaSendOp send; 120 | }; 121 | std::function callback; 122 | }; 123 | 124 | struct Network { 125 | struct fi_info *fi; 126 | struct fid_fabric *fabric; 127 | struct fid_domain *domain; 128 | struct fid_cq *cq; 129 | struct fid_av *av; 130 | struct fid_ep *ep; 131 | EfaAddress addr; 132 | std::unordered_map mr; 133 | 134 | static Network Open(struct fi_info *fi); 135 | 136 | fi_addr_t AddPeerAddress(const EfaAddress &peer_addr); 137 | void RegisterMemory(Buffer &buf); 138 | struct fid_mr *GetMR(const Buffer &buf); 139 | 140 | void PollCompletion(); 141 | void PostRecv(Buffer &buf, 142 | std::function &&callback); 143 | void PostSend(fi_addr_t addr, Buffer &buf, size_t len, 144 | std::function &&callback); 145 | 146 | Network(const Network &) = delete; 147 | Network(Network &&other) // Takes over every handle, including the registered memory regions. 148 | : fi(other.fi), fabric(other.fabric), domain(other.domain), cq(other.cq), 149 | av(other.av), ep(other.ep), addr(other.addr), mr(std::move(other.mr)) { 150 | other.fi = nullptr; 151 | other.fabric = nullptr; 152 | other.domain = nullptr; 153 | other.cq = nullptr; 154 | other.av = nullptr; 155 | other.ep = nullptr; other.mr.clear(); // the moved-from destructor must not close MRs it no longer owns 156 | } 157 | 158 | ~Network() { 159 | for (const auto &[_, mr] : mr) { 160 | FI_CHECK(fi_close(&mr->fid)); 161 | } 162
| if (ep) 163 | FI_CHECK(fi_close(&ep->fid)); 164 | if (av) 165 | FI_CHECK(fi_close(&av->fid)); 166 | if (cq) 167 | FI_CHECK(fi_close(&cq->fid)); 168 | if (domain) 169 | FI_CHECK(fi_close(&domain->fid)); 170 | if (fabric) 171 | FI_CHECK(fi_close(&fabric->fid)); 172 | } 173 | 174 | private: 175 | Network(struct fi_info *fi, struct fid_fabric *fabric, 176 | struct fid_domain *domain, struct fid_cq *cq, struct fid_av *av, 177 | struct fid_ep *ep, EfaAddress addr) 178 | : fi(fi), fabric(fabric), domain(domain), cq(cq), av(av), ep(ep), 179 | addr(addr) {} 180 | }; 181 | 182 | void *align_up(void *ptr, size_t align) { 183 | uintptr_t addr = (uintptr_t)ptr; 184 | return (void *)((addr + align - 1) & ~(align - 1)); 185 | } 186 | 187 | struct Buffer { 188 | void *data; 189 | size_t size; 190 | 191 | static Buffer Alloc(size_t size, size_t align) { 192 | void *raw_data = malloc(size); 193 | CHECK(raw_data != nullptr); 194 | return Buffer(raw_data, size, align); 195 | } 196 | 197 | Buffer(Buffer &&other) 198 | : data(other.data), size(other.size), raw_data(other.raw_data) { 199 | other.data = nullptr; 200 | other.raw_data = nullptr; 201 | } 202 | 203 | ~Buffer() { free(raw_data); } 204 | 205 | private: 206 | void *raw_data; 207 | 208 | Buffer(void *raw_data, size_t raw_size, size_t align) { 209 | this->raw_data = raw_data; 210 | this->data = align_up(raw_data, align); 211 | this->size = (size_t)((uintptr_t)raw_data + raw_size - (uintptr_t)data); 212 | } 213 | Buffer(const Buffer &) = delete; 214 | }; 215 | 216 | struct fi_info *GetInfo() { 217 | struct fi_info *hints, *info; 218 | hints = fi_allocinfo(); 219 | hints->ep_attr->type = FI_EP_RDM; 220 | hints->fabric_attr->prov_name = strdup("efa"); 221 | FI_CHECK(fi_getinfo(FI_VERSION(2, 0), nullptr, nullptr, 0, hints, &info)); 222 | fi_freeinfo(hints); 223 | return info; 224 | } 225 | 226 | Network Network::Open(struct fi_info *fi) { 227 | struct fid_fabric *fabric; 228 | FI_CHECK(fi_fabric(fi->fabric_attr, &fabric, nullptr)); 
229 | 230 | struct fid_domain *domain; 231 | FI_CHECK(fi_domain(fabric, fi, &domain, nullptr)); 232 | 233 | struct fid_cq *cq; 234 | struct fi_cq_attr cq_attr = {}; 235 | cq_attr.format = FI_CQ_FORMAT_DATA; 236 | FI_CHECK(fi_cq_open(domain, &cq_attr, &cq, nullptr)); 237 | 238 | struct fid_av *av; 239 | struct fi_av_attr av_attr = {}; 240 | FI_CHECK(fi_av_open(domain, &av_attr, &av, nullptr)); 241 | 242 | struct fid_ep *ep; 243 | FI_CHECK(fi_endpoint(domain, fi, &ep, nullptr)); 244 | FI_CHECK(fi_ep_bind(ep, &cq->fid, FI_SEND | FI_RECV)); 245 | FI_CHECK(fi_ep_bind(ep, &av->fid, 0)); 246 | 247 | FI_CHECK(fi_enable(ep)); 248 | 249 | uint8_t addr[64]; 250 | size_t addrlen = sizeof(addr); 251 | FI_CHECK(fi_getname(&ep->fid, addr, &addrlen)); 252 | if (addrlen != 32) { 253 | fprintf(stderr, "Unexpected address length %zu\n", addrlen); 254 | std::exit(1); 255 | } 256 | 257 | return Network(fi, fabric, domain, cq, av, ep, EfaAddress(addr)); 258 | } 259 | 260 | fi_addr_t Network::AddPeerAddress(const EfaAddress &peer_addr) { 261 | fi_addr_t addr = FI_ADDR_UNSPEC; 262 | int ret = fi_av_insert(av, peer_addr.bytes, 1, &addr, 0, nullptr); 263 | if (ret != 1) { 264 | fprintf(stderr, "fi_av_insert failed: %d\n", ret); 265 | std::exit(1); 266 | } 267 | return addr; 268 | } 269 | 270 | void Network::RegisterMemory(Buffer &buf) { 271 | struct fid_mr *mr; 272 | struct fi_mr_attr mr_attr = {}; 273 | struct iovec iov = {.iov_base = buf.data, .iov_len = buf.size}; 274 | mr_attr.mr_iov = &iov; 275 | mr_attr.iov_count = 1; 276 | mr_attr.access = FI_SEND | FI_RECV; 277 | uint64_t flags = 0; 278 | FI_CHECK(fi_mr_regattr(domain, &mr_attr, flags, &mr)); 279 | this->mr[buf.data] = mr; 280 | } 281 | 282 | struct fid_mr *Network::GetMR(const Buffer &buf) { 283 | auto it = mr.find(buf.data); 284 | CHECK(it != mr.end()); 285 | return it->second; 286 | } 287 | 288 | void Network::PostRecv(Buffer &buf, 289 | std::function &&callback) { 290 | auto *op = new RdmaOp{ 291 | .type = RdmaOpType::kRecv, 292 
| .recv = 293 | RdmaRecvOp{.buf = &buf, .src_addr = FI_ADDR_UNSPEC, .recv_size = 0}, 294 | .callback = std::move(callback), 295 | }; 296 | struct iovec iov = { 297 | .iov_base = buf.data, 298 | .iov_len = buf.size, 299 | }; 300 | struct fi_msg msg = { 301 | .msg_iov = &iov, 302 | .desc = &GetMR(buf)->mem_desc, 303 | .iov_count = 1, 304 | .addr = FI_ADDR_UNSPEC, 305 | .context = op, 306 | }; 307 | FI_CHECK(fi_recvmsg(ep, &msg, 0)); // TODO: handle EAGAIN 308 | } 309 | 310 | void Network::PostSend(fi_addr_t addr, Buffer &buf, size_t len, 311 | std::function &&callback) { 312 | CHECK(len <= buf.size); 313 | auto *op = new RdmaOp{ 314 | .type = RdmaOpType::kSend, 315 | .send = RdmaSendOp{.buf = &buf, .len = len, .dest_addr = addr}, 316 | .callback = std::move(callback), 317 | }; 318 | struct iovec iov = { 319 | .iov_base = buf.data, 320 | .iov_len = len, 321 | }; 322 | struct fi_msg msg = { 323 | .msg_iov = &iov, 324 | .desc = &GetMR(buf)->mem_desc, 325 | .iov_count = 1, 326 | .addr = addr, 327 | .context = op, 328 | }; 329 | FI_CHECK(fi_sendmsg(ep, &msg, 0)); // TODO: handle EAGAIN 330 | } 331 | 332 | void HandleCompletion(Network &net, const struct fi_cq_data_entry &cqe) { 333 | auto comp_flags = cqe.flags; 334 | auto op = (RdmaOp *)cqe.op_context; 335 | if (!op) { 336 | return; 337 | } 338 | if (comp_flags & FI_RECV) { 339 | op->recv.recv_size = cqe.len; 340 | if (op->callback) 341 | op->callback(net, *op); 342 | } else if (comp_flags & FI_SEND) { 343 | if (op->callback) 344 | op->callback(net, *op); 345 | } else { 346 | fprintf(stderr, "Unhandled completion type. 
comp_flags=%lx\n", comp_flags); 347 | std::exit(1); 348 | } 349 | delete op; 350 | } 351 | 352 | void Network::PollCompletion() { 353 | struct fi_cq_data_entry cqe[kCompletionQueueReadCount]; 354 | for (;;) { 355 | auto ret = fi_cq_read(cq, cqe, kCompletionQueueReadCount); 356 | if (ret > 0) { 357 | for (ssize_t i = 0; i < ret; i++) { 358 | HandleCompletion(*this, cqe[i]); 359 | } 360 | } else if (ret == -FI_EAVAIL) { 361 | struct fi_cq_err_entry err_entry; 362 | ret = fi_cq_readerr(cq, &err_entry, 0); 363 | if (ret < 0) { 364 | fprintf(stderr, "fi_cq_readerr error: %zd (%s)\n", ret, 365 | fi_strerror(-ret)); 366 | std::exit(1); 367 | } else if (ret > 0) { 368 | fprintf(stderr, "Failed libfabric operation: %s\n", 369 | fi_cq_strerror(cq, err_entry.prov_errno, err_entry.err_data, 370 | nullptr, 0)); 371 | } else { 372 | fprintf(stderr, "fi_cq_readerr returned 0 unexpectedly.\n"); 373 | std::exit(1); 374 | } 375 | } else if (ret == -FI_EAGAIN) { 376 | // No more completions 377 | break; 378 | } else { 379 | fprintf(stderr, "fi_cq_read error: %zd (%s)\n", ret, fi_strerror(-ret)); 380 | std::exit(1); 381 | } 382 | } 383 | } 384 | 385 | int ServerMain(int argc, char **argv) { 386 | struct fi_info *info = GetInfo(); 387 | auto net = Network::Open(info); 388 | printf("domain: %14s", info->domain_attr->name); 389 | printf(", nic: %10s", info->nic->device_attr->name); 390 | printf(", fabric: %s", info->fabric_attr->prov_name); 391 | printf(", link: %.0fGbps", info->nic->link_attr->speed / 1e9); 392 | printf("\n"); 393 | printf("Run client with the following command:\n"); 394 | printf(" %s %s\n", argv[0], net.addr.ToString().c_str()); 395 | printf(" %s %s \"anytext\"\n", argv[0], net.addr.ToString().c_str()); 396 | printf("------\n"); 397 | 398 | auto buf_msg = Buffer::Alloc(kMessageBufferSize, kBufAlign); 399 | net.RegisterMemory(buf_msg); 400 | net.PostRecv(buf_msg, [](Network &net, RdmaOp &op) { 401 | auto *msg = (const char *)op.recv.buf->data; 402 | auto len = 
op.recv.recv_size; 403 | printf("Received message (len=%zu): %.*s\n", len, (int)len, msg); 404 | net.PostRecv(*op.recv.buf, std::move(op.callback)); 405 | }); 406 | 407 | for (;;) { 408 | net.PollCompletion(); 409 | } 410 | 411 | fi_freeinfo(info); 412 | return 0; 413 | } 414 | 415 | int ClientMain(int argc, char **argv) { 416 | CHECK(argc == 2 || argc == 3); 417 | auto server_addrname = EfaAddress::Parse(argv[1]); 418 | std::string message = argc == 3 ? argv[2] : "Hello, world!"; 419 | 420 | struct fi_info *info = GetInfo(); 421 | auto net = Network::Open(info); 422 | printf("domain: %14s", info->domain_attr->name); 423 | printf(", nic: %10s", info->nic->device_attr->name); 424 | printf(", fabric: %s", info->fabric_attr->prov_name); 425 | printf(", link: %.0fGbps", info->nic->link_attr->speed / 1e9); 426 | printf("\n"); 427 | auto server_addr = net.AddPeerAddress(server_addrname); 428 | auto buf_msg = Buffer::Alloc(kMessageBufferSize, kBufAlign); 429 | net.RegisterMemory(buf_msg); 430 | memcpy(buf_msg.data, message.data(), message.size()); 431 | 432 | bool sent = false; 433 | net.PostSend(server_addr, buf_msg, message.size(), 434 | [&sent](Network &net, RdmaOp &op) { 435 | auto *msg = (const char *)op.send.buf->data; 436 | auto len = op.send.len; 437 | printf("Sent message (len=%zu): %.*s\n", len, (int)len, msg); 438 | sent = true; 439 | }); 440 | while (!sent) { 441 | net.PollCompletion(); 442 | } 443 | 444 | fi_freeinfo(info); 445 | return 0; 446 | } 447 | 448 | int main(int argc, char **argv) { 449 | if (argc == 1) { 450 | return ServerMain(argc, argv); 451 | } else { 452 | return ClientMain(argc, argv); 453 | } 454 | } 455 | -------------------------------------------------------------------------------- /data/lspci.txt: -------------------------------------------------------------------------------- 1 | -+-[0000:cc]---00.0-[cd-d2]----00.0-[ce-d2]--+-00.0-[cf]----00.0 NVIDIA Corporation GH100 [H100 NVSwitch] 2 | | +-00.1-[d0]----00.0 NVIDIA Corporation GH100 
[H100 NVSwitch] 3 | | +-00.2-[d1]----00.0 NVIDIA Corporation GH100 [H100 NVSwitch] 4 | | \-00.3-[d2]----00.0 NVIDIA Corporation GH100 [H100 NVSwitch] 5 | +-[0000:bb]---00.0-[bc-cb]----00.0-[bd-cb]--+-00.0-[be]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 6 | | +-00.1-[bf]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 7 | | +-00.2-[c0]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 8 | | +-00.3-[c1]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 9 | | +-00.4-[c2]-- 10 | | +-00.5-[c3]-- 11 | | +-00.6-[c4]-- 12 | | +-00.7-[c5]-- 13 | | +-01.0-[c6]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 14 | | +-01.1-[c7]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 15 | | +-01.2-[c8]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 16 | | +-01.3-[c9]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 17 | | +-01.4-[ca]----00.0 NVIDIA Corporation GH100 [H100 SXM5 80GB] 18 | | \-01.5-[cb]----00.0 Amazon.com, Inc. NVMe SSD Controller 19 | +-[0000:aa]---00.0-[ab-ba]----00.0-[ac-ba]--+-00.0-[ad]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 20 | | +-00.1-[ae]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 21 | | +-00.2-[af]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 22 | | +-00.3-[b0]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 23 | | +-00.4-[b1]-- 24 | | +-00.5-[b2]-- 25 | | +-00.6-[b3]-- 26 | | +-00.7-[b4]-- 27 | | +-01.0-[b5]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 28 | | +-01.1-[b6]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 29 | | +-01.2-[b7]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 30 | | +-01.3-[b8]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 31 | | +-01.4-[b9]----00.0 NVIDIA Corporation GH100 [H100 SXM5 80GB] 32 | | \-01.5-[ba]----00.0 Amazon.com, Inc. NVMe SSD Controller 33 | +-[0000:99]---00.0-[9a-a9]----00.0-[9b-a9]--+-00.0-[9c]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 34 | | +-00.1-[9d]----00.0 Amazon.com, Inc. 
Elastic Network Adapter (ENA) 35 | | +-00.2-[9e]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 36 | | +-00.3-[9f]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 37 | | +-00.4-[a0]-- 38 | | +-00.5-[a1]-- 39 | | +-00.6-[a2]-- 40 | | +-00.7-[a3]-- 41 | | +-01.0-[a4]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 42 | | +-01.1-[a5]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 43 | | +-01.2-[a6]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 44 | | +-01.3-[a7]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 45 | | +-01.4-[a8]----00.0 NVIDIA Corporation GH100 [H100 SXM5 80GB] 46 | | \-01.5-[a9]----00.0 Amazon.com, Inc. NVMe SSD Controller 47 | +-[0000:88]---00.0-[89-98]----00.0-[8a-98]--+-00.0-[8b]-- 48 | | +-00.1-[8c]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 49 | | +-00.2-[8d]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 50 | | +-00.3-[8e]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 51 | | +-00.4-[8f]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 52 | | +-00.5-[90]-- 53 | | +-00.6-[91]-- 54 | | +-00.7-[92]-- 55 | | +-01.0-[93]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 56 | | +-01.1-[94]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 57 | | +-01.2-[95]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 58 | | +-01.3-[96]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 59 | | +-01.4-[97]----00.0 NVIDIA Corporation GH100 [H100 SXM5 80GB] 60 | | \-01.5-[98]----00.0 Amazon.com, Inc. NVMe SSD Controller 61 | +-[0000:77]---00.0-[78-87]----00.0-[79-87]--+-00.0-[7a]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 62 | | +-00.1-[7b]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 63 | | +-00.2-[7c]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 64 | | +-00.3-[7d]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 65 | | +-00.4-[7e]-- 66 | | +-00.5-[7f]-- 67 | | +-00.6-[80]-- 68 | | +-00.7-[81]-- 69 | | +-01.0-[82]----00.0 Amazon.com, Inc. 
Elastic Fabric Adapter (EFA) 70 | | +-01.1-[83]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 71 | | +-01.2-[84]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 72 | | +-01.3-[85]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 73 | | +-01.4-[86]----00.0 NVIDIA Corporation GH100 [H100 SXM5 80GB] 74 | | \-01.5-[87]----00.0 Amazon.com, Inc. NVMe SSD Controller 75 | +-[0000:66]---00.0-[67-76]----00.0-[68-76]--+-00.0-[69]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 76 | | +-00.1-[6a]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 77 | | +-00.2-[6b]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 78 | | +-00.3-[6c]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 79 | | +-00.4-[6d]-- 80 | | +-00.5-[6e]-- 81 | | +-00.6-[6f]-- 82 | | +-00.7-[70]-- 83 | | +-01.0-[71]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 84 | | +-01.1-[72]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 85 | | +-01.2-[73]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 86 | | +-01.3-[74]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 87 | | +-01.4-[75]----00.0 NVIDIA Corporation GH100 [H100 SXM5 80GB] 88 | | \-01.5-[76]----00.0 Amazon.com, Inc. NVMe SSD Controller 89 | +-[0000:55]---00.0-[56-65]----00.0-[57-65]--+-00.0-[58]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 90 | | +-00.1-[59]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 91 | | +-00.2-[5a]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 92 | | +-00.3-[5b]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 93 | | +-00.4-[5c]-- 94 | | +-00.5-[5d]-- 95 | | +-00.6-[5e]-- 96 | | +-00.7-[5f]-- 97 | | +-01.0-[60]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 98 | | +-01.1-[61]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 99 | | +-01.2-[62]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 100 | | +-01.3-[63]----00.0 Amazon.com, Inc. 
Elastic Fabric Adapter (EFA) 101 | | +-01.4-[64]----00.0 NVIDIA Corporation GH100 [H100 SXM5 80GB] 102 | | \-01.5-[65]----00.0 Amazon.com, Inc. NVMe SSD Controller 103 | +-[0000:44]---00.0-[45-54]----00.0-[46-54]--+-00.0-[47]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 104 | | +-00.1-[48]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 105 | | +-00.2-[49]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 106 | | +-00.3-[4a]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 107 | | +-00.4-[4b]----00.0 Amazon.com, Inc. Elastic Network Adapter (ENA) 108 | | +-00.5-[4c]-- 109 | | +-00.6-[4d]-- 110 | | +-00.7-[4e]-- 111 | | +-01.0-[4f]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 112 | | +-01.1-[50]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 113 | | +-01.2-[51]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 114 | | +-01.3-[52]----00.0 Amazon.com, Inc. Elastic Fabric Adapter (EFA) 115 | | +-01.4-[53]----00.0 NVIDIA Corporation GH100 [H100 SXM5 80GB] 116 | | \-01.5-[54]----00.0 Amazon.com, Inc. 
NVMe SSD Controller 117 | +-[0000:01]---00.0-[02-43]----00.0-[03-43]--+-00.0-[04]-- 118 | | +-00.1-[05]-- 119 | | +-00.2-[06]-- 120 | | +-00.3-[07]-- 121 | | +-00.4-[08]-- 122 | | +-00.5-[09]-- 123 | | +-00.6-[0a]-- 124 | | +-00.7-[0b]-- 125 | | +-01.0-[0c]-- 126 | | +-01.1-[0d]-- 127 | | +-01.2-[0e]-- 128 | | +-01.3-[0f]-- 129 | | +-01.4-[10]-- 130 | | +-01.5-[11]-- 131 | | +-01.6-[12]-- 132 | | +-01.7-[13]-- 133 | | +-02.0-[14]-- 134 | | +-02.1-[15]-- 135 | | +-02.2-[16]-- 136 | | +-02.3-[17]-- 137 | | +-02.4-[18]-- 138 | | +-02.5-[19]-- 139 | | +-02.6-[1a]-- 140 | | +-02.7-[1b]-- 141 | | +-03.0-[1c]-- 142 | | +-03.1-[1d]-- 143 | | +-03.2-[1e]-- 144 | | +-03.3-[1f]-- 145 | | +-03.4-[20]-- 146 | | +-03.5-[21]-- 147 | | +-03.6-[22]-- 148 | | +-03.7-[23]-- 149 | | +-04.0-[24]-- 150 | | +-04.1-[25]-- 151 | | +-04.2-[26]-- 152 | | +-04.3-[27]-- 153 | | +-04.4-[28]-- 154 | | +-04.5-[29]-- 155 | | +-04.6-[2a]-- 156 | | +-04.7-[2b]-- 157 | | +-05.0-[2c]-- 158 | | +-05.1-[2d]-- 159 | | +-05.2-[2e]-- 160 | | +-05.3-[2f]-- 161 | | +-05.4-[30]-- 162 | | +-05.5-[31]-- 163 | | +-05.6-[32]-- 164 | | +-05.7-[33]-- 165 | | +-06.0-[34]-- 166 | | +-06.1-[35]-- 167 | | +-06.2-[36]-- 168 | | +-06.3-[37]-- 169 | | +-06.4-[38]-- 170 | | +-06.5-[39]-- 171 | | +-06.6-[3a]-- 172 | | +-06.7-[3b]-- 173 | | +-07.0-[3c]-- 174 | | +-07.1-[3d]-- 175 | | +-07.2-[3e]-- 176 | | +-07.3-[3f]-- 177 | | +-07.4-[40]-- 178 | | +-07.5-[41]-- 179 | | +-07.6-[42]-- 180 | | \-07.7-[43]-- 181 | \-[0000:00]-+-00.0 Intel Corporation 440FX - 82441FX PMC [Natoma] 182 | +-01.0 Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II] 183 | +-01.3 Intel Corporation 82371AB/EB/MB PIIX4 ACPI 184 | +-03.0 Amazon.com, Inc. Device 1111 185 | \-04.0 Amazon.com, Inc. 
NVMe EBS Controller 186 | -------------------------------------------------------------------------------- /src/5_reverse.cpp: -------------------------------------------------------------------------------- 1 | // clang-format off 2 | /* 3 | Example run: 4 | 5 | server$ ./build/5_reverse 6 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 7 | Run client with the following command: 8 | ./build/5_reverse fe800000000000000853f7fffea442e100000000f6d3b3650000000000000000 9 | ./build/5_reverse fe800000000000000853f7fffea442e100000000f6d3b3650000000000000000 "anytext" 10 | ------ 11 | Received CONNECT message from client: fe8000000000000008129efffe237ea1000000005770a1630000000000000000 12 | Received message (len=13): Hello, world! 13 | Sent reversed message to client 14 | Received CONNECT message from client: fe8000000000000008129efffe237ea1000000004cb67a510000000000000000 15 | Received message (len=7): anytext 16 | Sent reversed message to client 17 | ^C 18 | 19 | client$ ./build/5_reverse fe800000000000000853f7fffea442e100000000f6d3b3650000000000000000 20 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 21 | Sent CONNECT message to server 22 | Sent message to server 23 | Received message (len=14): !dlrow ,olleH 24 | 25 | client$ ./build/5_reverse fe800000000000000853f7fffea442e100000000f6d3b3650000000000000000 "anytext" 26 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 27 | Sent CONNECT message to server 28 | Sent message to server 29 | Received message (len=8): txetyna 30 | */ 31 | // clang-format on 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | 53 | #define CHECK(stmt) \ 54 | do { \ 55 | if (!(stmt)) { \ 56 | fprintf(stderr, "%s:%d %s\n", __FILE__, __LINE__, #stmt); \ 57 | 
std::exit(1); \ 58 | } \ 59 | } while (0) 60 | 61 | #define FI_CHECK(stmt) \ 62 | do { \ 63 | int rc = (stmt); \ 64 | if (rc) { \ 65 | fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__, \ 66 | #stmt, rc, fi_strerror(-rc)); \ 67 | std::exit(1); \ 68 | } \ 69 | } while (0) 70 | 71 | constexpr size_t kBufAlign = 128; // EFA alignment requirement 72 | constexpr size_t kMessageBufferSize = 8192; 73 | constexpr size_t kCompletionQueueReadCount = 16; 74 | 75 | struct Buffer; 76 | struct Network; 77 | 78 | struct EfaAddress { 79 | uint8_t bytes[32]; 80 | 81 | explicit EfaAddress(uint8_t bytes[32]) { memcpy(this->bytes, bytes, 32); } 82 | 83 | std::string ToString() const { 84 | char buf[65]; 85 | for (size_t i = 0; i < 32; i++) { 86 | snprintf(buf + 2 * i, 3, "%02x", bytes[i]); 87 | } 88 | return std::string(buf, 64); 89 | } 90 | 91 | static EfaAddress Parse(const std::string &str) { 92 | if (str.size() != 64) { 93 | fprintf(stderr, "Unexpected address length %zu\n", str.size()); 94 | std::exit(1); 95 | } 96 | uint8_t bytes[32]; 97 | for (size_t i = 0; i < 32; i++) { 98 | sscanf(str.c_str() + 2 * i, "%02hhx", &bytes[i]); 99 | } 100 | return EfaAddress(bytes); 101 | } 102 | }; 103 | 104 | enum class RdmaOpType : uint8_t { 105 | kRecv = 0, 106 | kSend = 1, 107 | }; 108 | 109 | struct RdmaRecvOp { 110 | Buffer *buf; 111 | fi_addr_t src_addr; // Set after completion 112 | size_t recv_size; // Set after completion 113 | }; 114 | static_assert(std::is_pod_v == true); 115 | 116 | struct RdmaSendOp { 117 | Buffer *buf; 118 | size_t len; 119 | fi_addr_t dest_addr; 120 | }; 121 | static_assert(std::is_pod_v == true); 122 | 123 | struct RdmaOp { 124 | RdmaOpType type; 125 | union { 126 | RdmaRecvOp recv; 127 | RdmaSendOp send; 128 | }; 129 | std::function callback; 130 | }; 131 | 132 | struct Network { 133 | struct fi_info *fi; 134 | struct fid_fabric *fabric; 135 | struct fid_domain *domain; 136 | struct fid_cq *cq; 137 | struct fid_av *av; 138 | struct fid_ep 
*ep; 139 | EfaAddress addr; 140 | std::unordered_map mr; 141 | 142 | static Network Open(struct fi_info *fi); 143 | 144 | fi_addr_t AddPeerAddress(const EfaAddress &peer_addr); 145 | void RegisterMemory(Buffer &buf); 146 | struct fid_mr *GetMR(const Buffer &buf); 147 | 148 | void PollCompletion(); 149 | void PostRecv(Buffer &buf, 150 | std::function &&callback); 151 | void PostSend(fi_addr_t addr, Buffer &buf, size_t len, 152 | std::function &&callback); 153 | 154 | Network(const Network &) = delete; 155 | Network(Network &&other) 156 | : fi(other.fi), fabric(other.fabric), domain(other.domain), cq(other.cq), 157 | av(other.av), ep(other.ep), addr(other.addr) { 158 | other.fi = nullptr; 159 | other.fabric = nullptr; 160 | other.domain = nullptr; 161 | other.cq = nullptr; 162 | other.av = nullptr; 163 | other.ep = nullptr; 164 | } 165 | 166 | ~Network() { 167 | for (const auto &[_, mr] : mr) { 168 | FI_CHECK(fi_close(&mr->fid)); 169 | } 170 | if (ep) 171 | FI_CHECK(fi_close(&ep->fid)); 172 | if (av) 173 | FI_CHECK(fi_close(&av->fid)); 174 | if (cq) 175 | FI_CHECK(fi_close(&cq->fid)); 176 | if (domain) 177 | FI_CHECK(fi_close(&domain->fid)); 178 | if (fabric) 179 | FI_CHECK(fi_close(&fabric->fid)); 180 | } 181 | 182 | private: 183 | Network(struct fi_info *fi, struct fid_fabric *fabric, 184 | struct fid_domain *domain, struct fid_cq *cq, struct fid_av *av, 185 | struct fid_ep *ep, EfaAddress addr) 186 | : fi(fi), fabric(fabric), domain(domain), cq(cq), av(av), ep(ep), 187 | addr(addr) {} 188 | }; 189 | 190 | void *align_up(void *ptr, size_t align) { 191 | uintptr_t addr = (uintptr_t)ptr; 192 | return (void *)((addr + align - 1) & ~(align - 1)); 193 | } 194 | 195 | struct Buffer { 196 | void *data; 197 | size_t size; 198 | 199 | static Buffer Alloc(size_t size, size_t align) { 200 | void *raw_data = malloc(size); 201 | CHECK(raw_data != nullptr); 202 | return Buffer(raw_data, size, align); 203 | } 204 | 205 | Buffer(Buffer &&other) 206 | : data(other.data), 
size(other.size), raw_data(other.raw_data) { 207 | other.data = nullptr; 208 | other.raw_data = nullptr; 209 | } 210 | 211 | ~Buffer() { free(raw_data); } 212 | 213 | private: 214 | void *raw_data; 215 | 216 | Buffer(void *raw_data, size_t raw_size, size_t align) { 217 | this->raw_data = raw_data; 218 | this->data = align_up(raw_data, align); 219 | this->size = (size_t)((uintptr_t)raw_data + raw_size - (uintptr_t)data); 220 | } 221 | Buffer(const Buffer &) = delete; 222 | }; 223 | 224 | struct fi_info *GetInfo() { 225 | struct fi_info *hints, *info; 226 | hints = fi_allocinfo(); 227 | hints->ep_attr->type = FI_EP_RDM; 228 | hints->fabric_attr->prov_name = strdup("efa"); 229 | FI_CHECK(fi_getinfo(FI_VERSION(2, 0), nullptr, nullptr, 0, hints, &info)); 230 | fi_freeinfo(hints); 231 | return info; 232 | } 233 | 234 | Network Network::Open(struct fi_info *fi) { 235 | struct fid_fabric *fabric; 236 | FI_CHECK(fi_fabric(fi->fabric_attr, &fabric, nullptr)); 237 | 238 | struct fid_domain *domain; 239 | FI_CHECK(fi_domain(fabric, fi, &domain, nullptr)); 240 | 241 | struct fid_cq *cq; 242 | struct fi_cq_attr cq_attr = {}; 243 | cq_attr.format = FI_CQ_FORMAT_DATA; 244 | FI_CHECK(fi_cq_open(domain, &cq_attr, &cq, nullptr)); 245 | 246 | struct fid_av *av; 247 | struct fi_av_attr av_attr = {}; 248 | FI_CHECK(fi_av_open(domain, &av_attr, &av, nullptr)); 249 | 250 | struct fid_ep *ep; 251 | FI_CHECK(fi_endpoint(domain, fi, &ep, nullptr)); 252 | FI_CHECK(fi_ep_bind(ep, &cq->fid, FI_SEND | FI_RECV)); 253 | FI_CHECK(fi_ep_bind(ep, &av->fid, 0)); 254 | 255 | FI_CHECK(fi_enable(ep)); 256 | 257 | uint8_t addr[64]; 258 | size_t addrlen = sizeof(addr); 259 | FI_CHECK(fi_getname(&ep->fid, addr, &addrlen)); 260 | if (addrlen != 32) { 261 | fprintf(stderr, "Unexpected address length %zu\n", addrlen); 262 | std::exit(1); 263 | } 264 | 265 | return Network(fi, fabric, domain, cq, av, ep, EfaAddress(addr)); 266 | } 267 | 268 | fi_addr_t Network::AddPeerAddress(const EfaAddress &peer_addr) { 269 
| fi_addr_t addr = FI_ADDR_UNSPEC; 270 | int ret = fi_av_insert(av, peer_addr.bytes, 1, &addr, 0, nullptr); 271 | if (ret != 1) { 272 | fprintf(stderr, "fi_av_insert failed: %d\n", ret); 273 | std::exit(1); 274 | } 275 | return addr; 276 | } 277 | 278 | void Network::RegisterMemory(Buffer &buf) { 279 | struct fid_mr *mr; 280 | struct fi_mr_attr mr_attr = {}; 281 | struct iovec iov = {.iov_base = buf.data, .iov_len = buf.size}; 282 | mr_attr.mr_iov = &iov; 283 | mr_attr.iov_count = 1; 284 | mr_attr.access = FI_SEND | FI_RECV; 285 | uint64_t flags = 0; 286 | FI_CHECK(fi_mr_regattr(domain, &mr_attr, flags, &mr)); 287 | this->mr[buf.data] = mr; 288 | } 289 | 290 | struct fid_mr *Network::GetMR(const Buffer &buf) { 291 | auto it = mr.find(buf.data); 292 | CHECK(it != mr.end()); 293 | return it->second; 294 | } 295 | 296 | void Network::PostRecv(Buffer &buf, 297 | std::function &&callback) { 298 | auto *op = new RdmaOp{ 299 | .type = RdmaOpType::kRecv, 300 | .recv = 301 | RdmaRecvOp{.buf = &buf, .src_addr = FI_ADDR_UNSPEC, .recv_size = 0}, 302 | .callback = std::move(callback), 303 | }; 304 | struct iovec iov = { 305 | .iov_base = buf.data, 306 | .iov_len = buf.size, 307 | }; 308 | struct fi_msg msg = { 309 | .msg_iov = &iov, 310 | .desc = &GetMR(buf)->mem_desc, 311 | .iov_count = 1, 312 | .addr = FI_ADDR_UNSPEC, 313 | .context = op, 314 | }; 315 | FI_CHECK(fi_recvmsg(ep, &msg, 0)); // TODO: handle EAGAIN 316 | } 317 | 318 | void Network::PostSend(fi_addr_t addr, Buffer &buf, size_t len, 319 | std::function &&callback) { 320 | CHECK(len <= buf.size); 321 | auto *op = new RdmaOp{ 322 | .type = RdmaOpType::kSend, 323 | .send = RdmaSendOp{.buf = &buf, .len = len, .dest_addr = addr}, 324 | .callback = std::move(callback), 325 | }; 326 | struct iovec iov = { 327 | .iov_base = buf.data, 328 | .iov_len = len, 329 | }; 330 | struct fi_msg msg = { 331 | .msg_iov = &iov, 332 | .desc = &GetMR(buf)->mem_desc, 333 | .iov_count = 1, 334 | .addr = addr, 335 | .context = op, 336 | }; 
337 | FI_CHECK(fi_sendmsg(ep, &msg, 0)); // TODO: handle EAGAIN 338 | } 339 | 340 | void HandleCompletion(Network &net, const struct fi_cq_data_entry &cqe) { 341 | auto comp_flags = cqe.flags; 342 | auto op = (RdmaOp *)cqe.op_context; 343 | if (!op) { 344 | return; 345 | } 346 | if (comp_flags & FI_RECV) { 347 | op->recv.recv_size = cqe.len; 348 | if (op->callback) 349 | op->callback(net, *op); 350 | } else if (comp_flags & FI_SEND) { 351 | if (op->callback) 352 | op->callback(net, *op); 353 | } else { 354 | fprintf(stderr, "Unhandled completion type. comp_flags=%lx\n", comp_flags); 355 | std::exit(1); 356 | } 357 | delete op; 358 | } 359 | 360 | void Network::PollCompletion() { 361 | struct fi_cq_data_entry cqe[kCompletionQueueReadCount]; 362 | for (;;) { 363 | auto ret = fi_cq_read(cq, cqe, kCompletionQueueReadCount); 364 | if (ret > 0) { 365 | for (ssize_t i = 0; i < ret; i++) { 366 | HandleCompletion(*this, cqe[i]); 367 | } 368 | } else if (ret == -FI_EAVAIL) { 369 | struct fi_cq_err_entry err_entry; 370 | ret = fi_cq_readerr(cq, &err_entry, 0); 371 | if (ret < 0) { 372 | fprintf(stderr, "fi_cq_readerr error: %zd (%s)\n", ret, 373 | fi_strerror(-ret)); 374 | std::exit(1); 375 | } else if (ret > 0) { 376 | fprintf(stderr, "Failed libfabric operation: %s\n", 377 | fi_cq_strerror(cq, err_entry.prov_errno, err_entry.err_data, 378 | nullptr, 0)); 379 | } else { 380 | fprintf(stderr, "fi_cq_readerr returned 0 unexpectedly.\n"); 381 | std::exit(1); 382 | } 383 | } else if (ret == -FI_EAGAIN) { 384 | // No more completions 385 | break; 386 | } else { 387 | fprintf(stderr, "fi_cq_read error: %zd (%s)\n", ret, fi_strerror(-ret)); 388 | std::exit(1); 389 | } 390 | } 391 | } 392 | 393 | enum class AppMessageType : uint8_t { 394 | kConnect = 0, 395 | kData = 1, 396 | }; 397 | 398 | struct AppMessageBase { 399 | AppMessageType type; 400 | }; 401 | 402 | struct AppConnectMessage { 403 | AppMessageBase base; 404 | EfaAddress client_addr; 405 | }; 406 | 407 | struct 
AppDataMessage { 408 | AppMessageBase base; 409 | // Data follows 410 | }; 411 | 412 | struct ReverseRequestState { 413 | fi_addr_t client_addr = FI_ADDR_UNSPEC; 414 | bool done = false; 415 | 416 | void HandleConnect(Network &net, RdmaOp &op) { 417 | auto *base_msg = (const AppMessageBase *)op.recv.buf->data; 418 | CHECK(base_msg->type == AppMessageType::kConnect); 419 | CHECK(op.recv.recv_size == sizeof(AppConnectMessage)); 420 | auto *msg = (const AppConnectMessage *)base_msg; 421 | printf("Received CONNECT message from client: %s\n", 422 | msg->client_addr.ToString().c_str()); 423 | client_addr = net.AddPeerAddress(msg->client_addr); 424 | } 425 | 426 | void HandleData(Network &net, RdmaOp &op) { 427 | auto *base_msg = (const AppMessageBase *)op.recv.buf->data; 428 | CHECK(base_msg->type == AppMessageType::kData); 429 | auto *msg = (uint8_t *)op.recv.buf->data + sizeof(*base_msg); 430 | auto len = op.recv.recv_size - sizeof(*base_msg); 431 | printf("Received message (len=%zu): %.*s\n", len, (int)len, msg); 432 | for (size_t i = 0, j = len - 1; i < j; ++i, --j) { 433 | auto t = msg[i]; 434 | msg[i] = msg[j]; 435 | msg[j] = t; 436 | } 437 | net.PostSend(client_addr, *op.recv.buf, op.recv.recv_size, 438 | [this](Network &net, RdmaOp &op) { 439 | printf("Sent reversed message to client\n"); 440 | done = true; 441 | }); 442 | } 443 | 444 | void OnRecv(Network &net, RdmaOp &op) { 445 | if (client_addr == FI_ADDR_UNSPEC) { 446 | HandleConnect(net, op); 447 | } else { 448 | HandleData(net, op); 449 | } 450 | } 451 | }; 452 | 453 | int ServerMain(int argc, char **argv) { 454 | struct fi_info *info = GetInfo(); 455 | auto net = Network::Open(info); 456 | printf("domain: %14s", info->domain_attr->name); 457 | printf(", nic: %10s", info->nic->device_attr->name); 458 | printf(", fabric: %s", info->fabric_attr->prov_name); 459 | printf(", link: %.0fGbps", info->nic->link_attr->speed / 1e9); 460 | printf("\n"); 461 | printf("Run client with the following command:\n"); 462 | 
printf(" %s %s\n", argv[0], net.addr.ToString().c_str()); 463 | printf(" %s %s \"anytext\"\n", argv[0], net.addr.ToString().c_str()); 464 | printf("------\n"); 465 | 466 | auto buf1 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 467 | net.RegisterMemory(buf1); 468 | auto buf2 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 469 | net.RegisterMemory(buf2); 470 | 471 | // Loop forever. Accept one client at a time. 472 | for (;;) { 473 | // State machine 474 | ReverseRequestState s; 475 | // RECV for CONNECT 476 | net.PostRecv(buf1, [&s](Network &net, RdmaOp &op) { s.OnRecv(net, op); }); 477 | // RECV for DATA 478 | net.PostRecv(buf2, [&s](Network &net, RdmaOp &op) { s.OnRecv(net, op); }); 479 | // Wait for completion 480 | while (!s.done) { 481 | net.PollCompletion(); 482 | } 483 | } 484 | 485 | fi_freeinfo(info); 486 | return 0; 487 | } 488 | 489 | int ClientMain(int argc, char **argv) { 490 | CHECK(argc == 2 || argc == 3); 491 | auto server_addrname = EfaAddress::Parse(argv[1]); 492 | std::string message = argc == 3 ? 
argv[2] : "Hello, world!"; 493 | 494 | struct fi_info *info = GetInfo(); 495 | auto net = Network::Open(info); 496 | printf("domain: %14s", info->domain_attr->name); 497 | printf(", nic: %10s", info->nic->device_attr->name); 498 | printf(", fabric: %s", info->fabric_attr->prov_name); 499 | printf(", link: %.0fGbps", info->nic->link_attr->speed / 1e9); 500 | printf("\n"); 501 | 502 | auto server_addr = net.AddPeerAddress(server_addrname); 503 | auto buf1 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 504 | net.RegisterMemory(buf1); 505 | auto buf2 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 506 | net.RegisterMemory(buf2); 507 | 508 | // Send address to server 509 | auto *connect_msg = (AppConnectMessage *)buf1.data; 510 | connect_msg->base.type = AppMessageType::kConnect; 511 | connect_msg->client_addr = net.addr; 512 | bool connect_sent = false; 513 | net.PostSend(server_addr, buf1, sizeof(*connect_msg), 514 | [&connect_sent](Network &net, RdmaOp &op) { 515 | printf("Sent CONNECT message to server\n"); 516 | connect_sent = true; 517 | }); 518 | while (!connect_sent) { 519 | net.PollCompletion(); 520 | } 521 | 522 | // Prepare to receive reversed message from server 523 | bool msg_received = false; 524 | net.PostRecv(buf2, [&msg_received](Network &net, RdmaOp &op) { 525 | auto *msg = (const char *)op.recv.buf->data; 526 | auto len = op.recv.recv_size; 527 | printf("Received message (len=%zu): %.*s\n", len, (int)len, msg); 528 | msg_received = true; 529 | }); 530 | 531 | // Send message to server 532 | auto *data_msg = (AppDataMessage *)buf1.data; 533 | data_msg->base.type = AppMessageType::kData; 534 | memcpy((char *)buf1.data + sizeof(*data_msg), message.c_str(), 535 | message.size()); 536 | net.PostSend( 537 | server_addr, buf1, sizeof(*data_msg) + message.size(), 538 | [](Network &net, RdmaOp &op) { printf("Sent message to server\n"); }); 539 | 540 | // Wait for message from server 541 | while (!msg_received) { 542 | net.PollCompletion(); 543 | } 544 | 545 | 
fi_freeinfo(info); 546 | return 0; 547 | } 548 | 549 | int main(int argc, char **argv) { 550 | if (argc == 1) { 551 | return ServerMain(argc, argv); 552 | } else { 553 | return ClientMain(argc, argv); 554 | } 555 | } 556 | -------------------------------------------------------------------------------- /src/6_write.cpp: -------------------------------------------------------------------------------- 1 | // clang-format off 2 | /* 3 | Example run: 4 | 5 | server$ ./build/6_write 6 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 7 | Run client with the following command: 8 | ./build/6_write fe8000000000000008e7effffeeee81d000000003e88df080000000000000000 [page_size num_pages] 9 | Registered 1 buffer on cuda:0 10 | ------ 11 | Received CONNECT message from client: 12 | addr: fe80000000000000083425fffe7d535100000000057558100000000000000000 13 | MR[0]: addr=0x7f2440800000 size=16777216 rkey=0x000000000070000a 14 | MR[1]: addr=0x7f2442400000 size=16777216 rkey=0x0000000000a00031 15 | Received RandomFill request from client: 16 | remote_context: 0x00000123 17 | seed: 0xb584035fabe6ce9b 18 | page_size: 1048576 19 | num_pages: 8 20 | Generating random data.. 21 | Finished RDMA WRITE to the remote GPU memory. 22 | ------ 23 | ^C 24 | 25 | client$ ./build/6_write fe8000000000000008e7effffeeee81d000000003e88df080000000000000000 26 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 27 | Registered 2 buffers on cuda:0 28 | Sent CONNECT message to server 29 | Sent RandomFillRequest to server. page_size: 1048576, num_pages: 8 30 | Received RDMA WRITE to local GPU memory. 
31 | Data is correct 32 | */ 33 | // clang-format on 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | 60 | #define CHECK(stmt) \ 61 | do { \ 62 | if (!(stmt)) { \ 63 | fprintf(stderr, "%s:%d %s\n", __FILE__, __LINE__, #stmt); \ 64 | std::exit(1); \ 65 | } \ 66 | } while (0) 67 | 68 | #define FI_CHECK(stmt) \ 69 | do { \ 70 | int rc = (stmt); \ 71 | if (rc) { \ 72 | fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__, \ 73 | #stmt, rc, fi_strerror(-rc)); \ 74 | std::exit(1); \ 75 | } \ 76 | } while (0) 77 | 78 | #define CUDA_CHECK(stmt) \ 79 | do { \ 80 | cudaError_t rc = (stmt); \ 81 | if (rc != cudaSuccess) { \ 82 | fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__, \ 83 | #stmt, rc, cudaGetErrorString(rc)); \ 84 | std::exit(1); \ 85 | } \ 86 | } while (0) 87 | 88 | #define CU_CHECK(stmt) \ 89 | do { \ 90 | CUresult rc = (stmt); \ 91 | if (rc != CUDA_SUCCESS) { \ 92 | const char *err_str; \ 93 | cuGetErrorString(rc, &err_str); \ 94 | fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__, \ 95 | #stmt, rc, err_str); \ 96 | std::exit(1); \ 97 | } \ 98 | } while (0) 99 | 100 | constexpr size_t kBufAlign = 128; // EFA alignment requirement 101 | constexpr size_t kMessageBufferSize = 8192; 102 | constexpr size_t kCompletionQueueReadCount = 16; 103 | constexpr size_t kMemoryRegionSize = 16 << 20; 104 | constexpr size_t kEfaImmDataSize = 4; 105 | 106 | struct Buffer; 107 | struct Network; 108 | 109 | struct EfaAddress { 110 | uint8_t bytes[32]; 111 | 112 | explicit EfaAddress(uint8_t bytes[32]) { memcpy(this->bytes, bytes, 32); } 113 | 114 | std::string ToString() const { 115 | char buf[65]; 116 | for (size_t i = 0; i < 32; i++) 
{ 117 | snprintf(buf + 2 * i, 3, "%02x", bytes[i]); 118 | } 119 | return std::string(buf, 64); 120 | } 121 | 122 | static EfaAddress Parse(const std::string &str) { 123 | if (str.size() != 64) { 124 | fprintf(stderr, "Unexpected address length %zu\n", str.size()); 125 | std::exit(1); 126 | } 127 | uint8_t bytes[32]; 128 | for (size_t i = 0; i < 32; i++) { 129 | sscanf(str.c_str() + 2 * i, "%02hhx", &bytes[i]); 130 | } 131 | return EfaAddress(bytes); 132 | } 133 | }; 134 | 135 | enum class RdmaOpType : uint8_t { 136 | kRecv = 0, 137 | kSend = 1, 138 | kWrite = 2, 139 | kRemoteWrite = 3, 140 | }; 141 | 142 | struct RdmaRecvOp { 143 | Buffer *buf; 144 | fi_addr_t src_addr; // Set after completion 145 | size_t recv_size; // Set after completion 146 | }; 147 | static_assert(std::is_pod_v); 148 | 149 | struct RdmaSendOp { 150 | Buffer *buf; 151 | size_t len; 152 | fi_addr_t dest_addr; 153 | }; 154 | static_assert(std::is_pod_v); 155 | 156 | struct RdmaWriteOp { 157 | Buffer *buf; 158 | size_t offset; 159 | size_t len; 160 | uint32_t imm_data; 161 | uint64_t dest_ptr; 162 | fi_addr_t dest_addr; 163 | uint64_t dest_key; 164 | }; 165 | static_assert(std::is_pod_v); 166 | 167 | struct RdmaRemoteWriteOp { 168 | uint32_t op_id; 169 | }; 170 | static_assert(std::is_pod_v); 171 | static_assert(sizeof(RdmaRemoteWriteOp) <= kEfaImmDataSize); 172 | 173 | struct RdmaOp { 174 | RdmaOpType type; 175 | union { 176 | RdmaRecvOp recv; 177 | RdmaSendOp send; 178 | RdmaWriteOp write; 179 | RdmaRemoteWriteOp remote_write; 180 | }; 181 | std::function callback; 182 | }; 183 | 184 | struct Network { 185 | struct fi_info *fi; 186 | struct fid_fabric *fabric; 187 | struct fid_domain *domain; 188 | struct fid_cq *cq; 189 | struct fid_av *av; 190 | struct fid_ep *ep; 191 | EfaAddress addr; 192 | 193 | std::unordered_map mr; 194 | std::unordered_map remote_write_ops; 195 | 196 | static Network Open(struct fi_info *fi); 197 | 198 | fi_addr_t AddPeerAddress(const EfaAddress &peer_addr); 199 | void 
RegisterMemory(Buffer &buf); 200 | struct fid_mr *GetMR(const Buffer &buf); 201 | 202 | void PollCompletion(); 203 | void PostRecv(Buffer &buf, 204 | std::function &&callback); 205 | void PostSend(fi_addr_t addr, Buffer &buf, size_t len, 206 | std::function &&callback); 207 | void PostWrite(RdmaWriteOp &&write, 208 | std::function &&callback); 209 | void AddRemoteWrite(uint32_t id, 210 | std::function &&callback); 211 | 212 | Network(const Network &) = delete; 213 | Network(Network &&other) 214 | : fi(other.fi), fabric(other.fabric), domain(other.domain), cq(other.cq), 215 | av(other.av), ep(other.ep), addr(other.addr) { 216 | other.fi = nullptr; 217 | other.fabric = nullptr; 218 | other.domain = nullptr; 219 | other.cq = nullptr; 220 | other.av = nullptr; 221 | other.ep = nullptr; 222 | } 223 | 224 | ~Network() { 225 | for (const auto &[_, mr] : mr) { 226 | FI_CHECK(fi_close(&mr->fid)); 227 | } 228 | if (ep) 229 | FI_CHECK(fi_close(&ep->fid)); 230 | if (av) 231 | FI_CHECK(fi_close(&av->fid)); 232 | if (cq) 233 | FI_CHECK(fi_close(&cq->fid)); 234 | if (domain) 235 | FI_CHECK(fi_close(&domain->fid)); 236 | if (fabric) 237 | FI_CHECK(fi_close(&fabric->fid)); 238 | } 239 | 240 | private: 241 | Network(struct fi_info *fi, struct fid_fabric *fabric, 242 | struct fid_domain *domain, struct fid_cq *cq, struct fid_av *av, 243 | struct fid_ep *ep, EfaAddress addr) 244 | : fi(fi), fabric(fabric), domain(domain), cq(cq), av(av), ep(ep), 245 | addr(addr) {} 246 | }; 247 | 248 | void *align_up(void *ptr, size_t align) { 249 | uintptr_t addr = (uintptr_t)ptr; 250 | return (void *)((addr + align - 1) & ~(align - 1)); 251 | } 252 | 253 | struct Buffer { 254 | void *data; 255 | size_t size; 256 | int cuda_device; 257 | int dmabuf_fd; 258 | 259 | static Buffer Alloc(size_t size, size_t align) { 260 | void *raw_data = malloc(size); 261 | CHECK(raw_data != nullptr); 262 | return Buffer(raw_data, size, align, -1, -1); 263 | } 264 | 265 | static Buffer AllocCuda(size_t size, size_t 
align) { 266 | void *raw_data; 267 | struct cudaPointerAttributes attrs = {}; 268 | CUDA_CHECK(cudaMalloc(&raw_data, size)); 269 | CUDA_CHECK(cudaPointerGetAttributes(&attrs, raw_data)); 270 | CHECK(attrs.type == cudaMemoryTypeDevice); 271 | int cuda_device = attrs.device; 272 | int fd = -1; 273 | CU_CHECK(cuMemGetHandleForAddressRange( 274 | &fd, (CUdeviceptr)align_up(raw_data, align), size, 275 | CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); 276 | return Buffer(raw_data, size, align, cuda_device, fd); 277 | } 278 | 279 | bool is_cuda() const { return cuda_device >= 0; } 280 | 281 | Buffer(Buffer &&other) 282 | : data(other.data), size(other.size), cuda_device(other.cuda_device), 283 | dmabuf_fd(other.dmabuf_fd), raw_data(other.raw_data) { 284 | other.data = nullptr; 285 | other.raw_data = nullptr; 286 | other.size = 0; 287 | other.cuda_device = -1; 288 | other.dmabuf_fd = -1; 289 | } 290 | 291 | ~Buffer() { 292 | if (is_cuda()) { 293 | CUDA_CHECK(cudaFree(raw_data)); 294 | } else { 295 | free(raw_data); 296 | } 297 | } 298 | 299 | private: 300 | void *raw_data; 301 | 302 | Buffer(void *raw_data, size_t raw_size, size_t align, int cuda_device, 303 | int dmabuf_fd) { 304 | this->raw_data = raw_data; 305 | this->data = align_up(raw_data, align); 306 | this->size = (size_t)((uintptr_t)raw_data + raw_size - (uintptr_t)data); 307 | this->cuda_device = cuda_device; 308 | this->dmabuf_fd = dmabuf_fd; 309 | } 310 | Buffer(const Buffer &) = delete; 311 | }; 312 | 313 | struct fi_info *GetInfo() { 314 | struct fi_info *hints, *info; 315 | hints = fi_allocinfo(); 316 | hints->caps = FI_MSG | FI_RMA | FI_HMEM | FI_LOCAL_COMM | FI_REMOTE_COMM; 317 | hints->ep_attr->type = FI_EP_RDM; 318 | hints->fabric_attr->prov_name = strdup("efa"); 319 | hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_HMEM | FI_MR_VIRT_ADDR | 320 | FI_MR_ALLOCATED | FI_MR_PROV_KEY; 321 | hints->domain_attr->threading = FI_THREAD_SAFE; 322 | FI_CHECK(fi_getinfo(FI_VERSION(2, 0), nullptr, nullptr, 0, hints, 
&info)); 323 | fi_freeinfo(hints); 324 | return info; 325 | } 326 | 327 | Network Network::Open(struct fi_info *fi) { 328 | struct fid_fabric *fabric; 329 | FI_CHECK(fi_fabric(fi->fabric_attr, &fabric, nullptr)); 330 | 331 | struct fid_domain *domain; 332 | FI_CHECK(fi_domain(fabric, fi, &domain, nullptr)); 333 | 334 | struct fid_cq *cq; 335 | struct fi_cq_attr cq_attr = {}; 336 | cq_attr.format = FI_CQ_FORMAT_DATA; 337 | FI_CHECK(fi_cq_open(domain, &cq_attr, &cq, nullptr)); 338 | 339 | struct fid_av *av; 340 | struct fi_av_attr av_attr = {}; 341 | FI_CHECK(fi_av_open(domain, &av_attr, &av, nullptr)); 342 | 343 | struct fid_ep *ep; 344 | FI_CHECK(fi_endpoint(domain, fi, &ep, nullptr)); 345 | FI_CHECK(fi_ep_bind(ep, &cq->fid, FI_SEND | FI_RECV)); 346 | FI_CHECK(fi_ep_bind(ep, &av->fid, 0)); 347 | 348 | FI_CHECK(fi_enable(ep)); 349 | 350 | uint8_t addr[64]; 351 | size_t addrlen = sizeof(addr); 352 | FI_CHECK(fi_getname(&ep->fid, addr, &addrlen)); 353 | if (addrlen != 32) { 354 | fprintf(stderr, "Unexpected address length %zu\n", addrlen); 355 | std::exit(1); 356 | } 357 | 358 | return Network(fi, fabric, domain, cq, av, ep, EfaAddress(addr)); 359 | } 360 | 361 | fi_addr_t Network::AddPeerAddress(const EfaAddress &peer_addr) { 362 | fi_addr_t addr = FI_ADDR_UNSPEC; 363 | int ret = fi_av_insert(av, peer_addr.bytes, 1, &addr, 0, nullptr); 364 | if (ret != 1) { 365 | fprintf(stderr, "fi_av_insert failed: %d\n", ret); 366 | std::exit(1); 367 | } 368 | return addr; 369 | } 370 | 371 | void Network::RegisterMemory(Buffer &buf) { 372 | struct fid_mr *mr; 373 | struct fi_mr_attr mr_attr = { 374 | .iov_count = 1, 375 | .access = FI_SEND | FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ | 376 | FI_WRITE | FI_READ, 377 | }; 378 | struct iovec iov = {.iov_base = buf.data, .iov_len = buf.size}; 379 | struct fi_mr_dmabuf dmabuf = { 380 | .fd = buf.dmabuf_fd, .offset = 0, .len = buf.size, .base_addr = buf.data}; 381 | uint64_t flags = 0; 382 | if (buf.is_cuda()) { 383 | mr_attr.iface = 
FI_HMEM_CUDA; 384 | mr_attr.device.cuda = buf.cuda_device; 385 | if (buf.dmabuf_fd != -1) { 386 | mr_attr.dmabuf = &dmabuf; 387 | flags = FI_MR_DMABUF; 388 | } else { 389 | mr_attr.mr_iov = &iov; 390 | } 391 | } else { 392 | mr_attr.mr_iov = &iov; 393 | } 394 | FI_CHECK(fi_mr_regattr(domain, &mr_attr, flags, &mr)); 395 | this->mr[buf.data] = mr; 396 | } 397 | 398 | struct fid_mr *Network::GetMR(const Buffer &buf) { 399 | auto it = mr.find(buf.data); 400 | CHECK(it != mr.end()); 401 | return it->second; 402 | } 403 | 404 | void Network::PostRecv(Buffer &buf, 405 | std::function &&callback) { 406 | auto *op = new RdmaOp{ 407 | .type = RdmaOpType::kRecv, 408 | .recv = 409 | RdmaRecvOp{.buf = &buf, .src_addr = FI_ADDR_UNSPEC, .recv_size = 0}, 410 | .callback = std::move(callback), 411 | }; 412 | struct iovec iov = { 413 | .iov_base = buf.data, 414 | .iov_len = buf.size, 415 | }; 416 | struct fi_msg msg = { 417 | .msg_iov = &iov, 418 | .desc = &GetMR(buf)->mem_desc, 419 | .iov_count = 1, 420 | .addr = FI_ADDR_UNSPEC, 421 | .context = op, 422 | }; 423 | FI_CHECK(fi_recvmsg(ep, &msg, 0)); // TODO: handle EAGAIN 424 | } 425 | 426 | void Network::PostSend(fi_addr_t addr, Buffer &buf, size_t len, 427 | std::function &&callback) { 428 | CHECK(len <= buf.size); 429 | auto *op = new RdmaOp{ 430 | .type = RdmaOpType::kSend, 431 | .send = RdmaSendOp{.buf = &buf, .len = len, .dest_addr = addr}, 432 | .callback = std::move(callback), 433 | }; 434 | struct iovec iov = { 435 | .iov_base = buf.data, 436 | .iov_len = len, 437 | }; 438 | struct fi_msg msg = { 439 | .msg_iov = &iov, 440 | .desc = &GetMR(buf)->mem_desc, 441 | .iov_count = 1, 442 | .addr = addr, 443 | .context = op, 444 | }; 445 | FI_CHECK(fi_sendmsg(ep, &msg, 0)); // TODO: handle EAGAIN 446 | } 447 | 448 | void Network::PostWrite(RdmaWriteOp &&write, 449 | std::function &&callback) { 450 | auto *op = new RdmaOp{ 451 | .type = RdmaOpType::kWrite, 452 | .write = std::move(write), 453 | .callback = std::move(callback), 454 | 
}; 455 | struct iovec iov = { 456 | .iov_base = (uint8_t *)write.buf->data + write.offset, 457 | .iov_len = write.len, 458 | }; 459 | struct fi_rma_iov rma_iov = { 460 | .addr = write.dest_ptr, 461 | .len = write.len, 462 | .key = write.dest_key, 463 | }; 464 | struct fi_msg_rma msg = { 465 | .msg_iov = &iov, 466 | .desc = &GetMR(*write.buf)->mem_desc, 467 | .iov_count = 1, 468 | .addr = write.dest_addr, 469 | .rma_iov = &rma_iov, 470 | .rma_iov_count = 1, 471 | .context = op, 472 | .data = write.imm_data, 473 | }; 474 | uint64_t flags = 0; 475 | if (write.imm_data) { 476 | flags |= FI_REMOTE_CQ_DATA; 477 | } 478 | FI_CHECK(fi_writemsg(ep, &msg, flags)); // TODO: handle EAGAIN 479 | } 480 | 481 | void Network::AddRemoteWrite( 482 | uint32_t id, std::function &&callback) { 483 | CHECK(remote_write_ops.count(id) == 0); 484 | auto *op = new RdmaOp{ 485 | .type = RdmaOpType::kRemoteWrite, 486 | .remote_write = RdmaRemoteWriteOp{.op_id = id}, 487 | .callback = std::move(callback), 488 | }; 489 | remote_write_ops[id] = op; 490 | } 491 | 492 | void HandleCompletion(Network &net, const struct fi_cq_data_entry &cqe) { 493 | RdmaOp *op = nullptr; 494 | if (cqe.flags & FI_REMOTE_WRITE) { 495 | // REMOTE WRITE does not have op_context 496 | // NOTE(lequn): EFA only supports 4 bytes of immediate data. 497 | uint32_t op_id = cqe.data; 498 | if (!op_id) 499 | return; 500 | auto it = net.remote_write_ops.find(op_id); 501 | if (it == net.remote_write_ops.end()) 502 | return; 503 | op = it->second; 504 | net.remote_write_ops.erase(it); 505 | } else { 506 | // RECV / SEND / WRITE 507 | op = (RdmaOp *)cqe.op_context; 508 | if (!op) 509 | return; 510 | if (cqe.flags & FI_RECV) { 511 | op->recv.recv_size = cqe.len; 512 | } else if (cqe.flags & FI_SEND) { 513 | // Nothing special 514 | } else if (cqe.flags & FI_WRITE) { 515 | // Nothing special 516 | } else { 517 | fprintf(stderr, "Unhandled completion type. 
cqe.flags=%lx\n", cqe.flags); 518 | std::exit(1); 519 | } 520 | } 521 | if (op->callback) 522 | op->callback(net, *op); 523 | delete op; 524 | } 525 | 526 | void Network::PollCompletion() { 527 | struct fi_cq_data_entry cqe[kCompletionQueueReadCount]; 528 | for (;;) { 529 | auto ret = fi_cq_read(cq, cqe, kCompletionQueueReadCount); 530 | if (ret > 0) { 531 | for (ssize_t i = 0; i < ret; i++) { 532 | HandleCompletion(*this, cqe[i]); 533 | } 534 | } else if (ret == -FI_EAVAIL) { 535 | struct fi_cq_err_entry err_entry; 536 | ret = fi_cq_readerr(cq, &err_entry, 0); 537 | if (ret < 0) { 538 | fprintf(stderr, "fi_cq_readerr error: %zd (%s)\n", ret, 539 | fi_strerror(-ret)); 540 | std::exit(1); 541 | } else if (ret > 0) { 542 | fprintf(stderr, "Failed libfabric operation: %s\n", 543 | fi_cq_strerror(cq, err_entry.prov_errno, err_entry.err_data, 544 | nullptr, 0)); 545 | } else { 546 | fprintf(stderr, "fi_cq_readerr returned 0 unexpectedly.\n"); 547 | std::exit(1); 548 | } 549 | } else if (ret == -FI_EAGAIN) { 550 | // No more completions 551 | break; 552 | } else { 553 | fprintf(stderr, "fi_cq_read error: %zd (%s)\n", ret, fi_strerror(-ret)); 554 | std::exit(1); 555 | } 556 | } 557 | } 558 | 559 | enum class AppMessageType : uint8_t { 560 | kConnect = 0, 561 | kRandomFill = 1, 562 | }; 563 | 564 | struct AppMessageBase { 565 | AppMessageType type; 566 | }; 567 | 568 | struct AppConnectMessage { 569 | struct MemoryRegion { 570 | uint64_t addr; 571 | uint64_t size; 572 | uint64_t rkey; 573 | }; 574 | 575 | AppMessageBase base; 576 | EfaAddress client_addr; 577 | size_t num_mr; 578 | 579 | MemoryRegion &mr(size_t index) { 580 | CHECK(index < num_mr); 581 | return ((MemoryRegion *)((uintptr_t)&base + sizeof(*this)))[index]; 582 | } 583 | 584 | size_t MessageBytes() const { 585 | return sizeof(*this) + num_mr * sizeof(MemoryRegion); 586 | } 587 | }; 588 | 589 | struct AppRandomFillMessage { 590 | AppMessageBase base; 591 | uint32_t remote_context; 592 | uint64_t seed; 593 | 
size_t page_size; 594 | size_t num_pages; 595 | 596 | uint32_t &page_idx(size_t index) { 597 | CHECK(index < num_pages); 598 | return ((uint32_t *)((uintptr_t)&base + sizeof(*this)))[index]; 599 | } 600 | 601 | size_t MessageBytes() const { 602 | return sizeof(*this) + num_pages * sizeof(uint32_t); 603 | } 604 | }; 605 | 606 | std::vector RandomBytes(uint64_t seed, size_t size) { 607 | CHECK(size % sizeof(uint64_t) == 0); 608 | std::vector buf(size); 609 | std::mt19937_64 gen(seed); 610 | std::uniform_int_distribution dist; 611 | for (size_t i = 0; i < size; i += sizeof(uint64_t)) { 612 | *(uint64_t *)(buf.data() + i) = dist(gen); 613 | } 614 | return buf; 615 | } 616 | 617 | struct RandomFillRequestState { 618 | Buffer *cuda_buf; 619 | fi_addr_t client_addr = FI_ADDR_UNSPEC; 620 | bool done = false; 621 | AppConnectMessage *connect_msg = nullptr; 622 | 623 | explicit RandomFillRequestState(Buffer *cuda_buf) : cuda_buf(cuda_buf) {} 624 | 625 | void HandleConnect(Network &net, RdmaOp &op) { 626 | auto *base_msg = (AppMessageBase *)op.recv.buf->data; 627 | CHECK(base_msg->type == AppMessageType::kConnect); 628 | CHECK(op.recv.recv_size >= sizeof(AppConnectMessage)); 629 | auto &msg = *(AppConnectMessage *)base_msg; 630 | CHECK(op.recv.recv_size == msg.MessageBytes()); 631 | CHECK(msg.num_mr > 0); 632 | 633 | // Save the message. Note that we don't reuse the buffer. 
634 | connect_msg = &msg; 635 | 636 | // Add the client to AV 637 | client_addr = net.AddPeerAddress(msg.client_addr); 638 | 639 | printf("Received CONNECT message from client:\n"); 640 | printf(" addr: %s\n", msg.client_addr.ToString().c_str()); 641 | for (size_t i = 0; i < msg.num_mr; i++) { 642 | printf(" MR[%zu]: addr=0x%012lx size=%lu rkey=0x%016lx\n", i, 643 | msg.mr(i).addr, msg.mr(i).size, msg.mr(i).rkey); 644 | } 645 | } 646 | 647 | void HandleRequest(Network &net, RdmaOp &op) { 648 | auto *base_msg = (const AppMessageBase *)op.recv.buf->data; 649 | CHECK(base_msg->type == AppMessageType::kRandomFill); 650 | CHECK(op.recv.recv_size >= sizeof(AppRandomFillMessage)); 651 | auto &msg = *(AppRandomFillMessage *)base_msg; 652 | CHECK(op.recv.recv_size == msg.MessageBytes()); 653 | 654 | printf("Received RandomFill request from client:\n"); 655 | printf(" remote_context: 0x%08x\n", msg.remote_context); 656 | printf(" seed: 0x%016lx\n", msg.seed); 657 | printf(" page_size: %zu\n", msg.page_size); 658 | printf(" num_pages: %zu\n", msg.num_pages); 659 | 660 | // Generate random data and copy to local GPU memory 661 | printf("Generating random data"); 662 | for (size_t i = 0; i < connect_msg->num_mr; ++i) { 663 | auto bytes = RandomBytes(msg.seed + i, msg.page_size * msg.num_pages); 664 | CUDA_CHECK(cudaMemcpy((uint8_t *)cuda_buf->data + i * bytes.size(), 665 | bytes.data(), bytes.size(), 666 | cudaMemcpyHostToDevice)); 667 | printf("."); 668 | fflush(stdout); 669 | } 670 | printf("\n"); 671 | 672 | // RDMA WRITE the data to remote GPU. 673 | // 674 | // NOTE(lequn): iov_limit==4, rma_iov_limit==1. 675 | // So need multiple WRITE instead of a vectorized WRITE. 676 | for (size_t i = 0; i < connect_msg->num_mr; ++i) { 677 | for (size_t j = 0; j < msg.num_pages; j++) { 678 | uint32_t imm_data = 0; 679 | std::function callback; 680 | if (i + 1 == connect_msg->num_mr && j + 1 == msg.num_pages) { 681 | // The last WRITE. 
682 | // NOTE(lequn): EFA RDM guarantees send-after-send ordering. 683 | imm_data = msg.remote_context; 684 | callback = [this](Network &net, RdmaOp &op) { 685 | CHECK(op.type == RdmaOpType::kWrite); 686 | done = true; 687 | printf("Finished RDMA WRITE to the remote GPU memory.\n"); 688 | }; 689 | } else { 690 | // Don't send immediate data. Don't wake up the remote side. 691 | // Also skip local callback. 692 | } 693 | net.PostWrite( 694 | {.buf = cuda_buf, 695 | .offset = i * (msg.page_size * msg.num_pages) + j * msg.page_size, 696 | .len = msg.page_size, 697 | .imm_data = imm_data, 698 | .dest_ptr = 699 | connect_msg->mr(i).addr + msg.page_idx(j) * msg.page_size, 700 | .dest_addr = client_addr, 701 | .dest_key = connect_msg->mr(i).rkey}, 702 | std::move(callback)); 703 | } 704 | } 705 | } 706 | 707 | void OnRecv(Network &net, RdmaOp &op) { 708 | if (client_addr == FI_ADDR_UNSPEC) { 709 | HandleConnect(net, op); 710 | } else { 711 | HandleRequest(net, op); 712 | } 713 | } 714 | }; 715 | 716 | int ServerMain(int argc, char **argv) { 717 | // Open the network: query provider info, then open the endpoint on it 718 | struct fi_info *info = GetInfo(); 719 | auto net = Network::Open(info); 720 | printf("domain: %14s", info->domain_attr->name); 721 | printf(", nic: %10s", info->nic->device_attr->name); 722 | printf(", fabric: %s", info->fabric_attr->prov_name); 723 | printf(", link: %.0fGbps", info->nic->link_attr->speed / 1e9); 724 | printf("\n"); 725 | printf("Run client with the following command:\n"); 726 | printf(" %s %s [page_size num_pages]\n", argv[0], 727 | net.addr.ToString().c_str()); 728 | 729 | // Allocate and register message buffer 730 | auto buf1 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 731 | net.RegisterMemory(buf1); 732 | auto buf2 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 733 | net.RegisterMemory(buf2); 734 | 735 | // Allocate and register CUDA memory 736 | auto cuda_buf = Buffer::AllocCuda(kMemoryRegionSize * 2, kBufAlign); 737 | net.RegisterMemory(cuda_buf); 738 | printf("Registered 1 
buffer on cuda:%d\n", cuda_buf.cuda_device); 739 | 740 | // Loop forever. Accept one client at a time. 741 | for (;;) { 742 | printf("------\n"); 743 | // State machine 744 | RandomFillRequestState s(&cuda_buf); 745 | // RECV for CONNECT 746 | net.PostRecv(buf1, [&s](Network &net, RdmaOp &op) { s.OnRecv(net, op); }); 747 | // RECV for RandomFillRequest 748 | net.PostRecv(buf2, [&s](Network &net, RdmaOp &op) { s.OnRecv(net, op); }); 749 | // Wait for completion 750 | while (!s.done) { 751 | net.PollCompletion(); 752 | } 753 | } 754 | 755 | fi_freeinfo(info); 756 | return 0; 757 | } 758 | 759 | int ClientMain(int argc, char **argv) { 760 | CHECK(argc == 2 || argc == 4); 761 | auto server_addrname = EfaAddress::Parse(argv[1]); 762 | size_t page_size, num_pages; 763 | if (argc == 4) { 764 | page_size = std::stoull(argv[2]); 765 | num_pages = std::stoull(argv[3]); 766 | } else { 767 | page_size = 1 << 20; 768 | num_pages = 8; 769 | } 770 | size_t max_pages = kMemoryRegionSize / page_size; 771 | CHECK(page_size * num_pages <= kMemoryRegionSize); 772 | 773 | // Open the network: query provider info, then open the endpoint on it 774 | struct fi_info *info = GetInfo(); 775 | auto net = Network::Open(info); 776 | printf("domain: %14s", info->domain_attr->name); 777 | printf(", nic: %10s", info->nic->device_attr->name); 778 | printf(", fabric: %s", info->fabric_attr->prov_name); 779 | printf(", link: %.0fGbps", info->nic->link_attr->speed / 1e9); 780 | printf("\n"); 781 | auto server_addr = net.AddPeerAddress(server_addrname); 782 | 783 | // Allocate and register message buffer 784 | auto buf1 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 785 | net.RegisterMemory(buf1); 786 | 787 | // Allocate and register CUDA memory 788 | auto cuda_buf1 = Buffer::AllocCuda(kMemoryRegionSize, kBufAlign); 789 | net.RegisterMemory(cuda_buf1); 790 | auto cuda_buf2 = Buffer::AllocCuda(kMemoryRegionSize, kBufAlign); 791 | net.RegisterMemory(cuda_buf2); 792 | printf("Registered 2 buffers on cuda:%d\n", cuda_buf1.cuda_device); 793 | 794 | // Prepare 
request 795 | std::mt19937_64 rng(0xabcdabcd987UL); 796 | uint64_t req_seed = rng(); 797 | std::vector page_idx; 798 | std::vector tmp(max_pages); 799 | std::iota(tmp.begin(), tmp.end(), 0); 800 | std::sample(tmp.begin(), tmp.end(), std::back_inserter(page_idx), num_pages, 801 | rng); 802 | 803 | // Send address and MR to server 804 | auto &connect_msg = *(AppConnectMessage *)buf1.data; 805 | connect_msg = { 806 | .base = {.type = AppMessageType::kConnect}, 807 | .client_addr = net.addr, 808 | .num_mr = 2, 809 | }; 810 | connect_msg.mr(0) = {.addr = (uint64_t)cuda_buf1.data, 811 | .size = kMemoryRegionSize, 812 | .rkey = net.GetMR(cuda_buf1)->key}; 813 | connect_msg.mr(1) = {.addr = (uint64_t)cuda_buf2.data, 814 | .size = kMemoryRegionSize, 815 | .rkey = net.GetMR(cuda_buf2)->key}; 816 | bool connect_sent = false; 817 | net.PostSend( 818 | server_addr, buf1, connect_msg.MessageBytes(), 819 | [&connect_sent](Network &net, RdmaOp &op) { connect_sent = true; }); 820 | while (!connect_sent) { 821 | net.PollCompletion(); 822 | } 823 | printf("Sent CONNECT message to server\n"); 824 | 825 | // Prepare to receive the last REMOTE WRITE from server 826 | bool last_remote_write_received = false; 827 | uint32_t remote_write_op_id = 0x123; 828 | net.AddRemoteWrite(remote_write_op_id, 829 | [&last_remote_write_received](Network &net, RdmaOp &op) { 830 | last_remote_write_received = true; 831 | }); 832 | 833 | // Send message to server 834 | auto &req_msg = *(AppRandomFillMessage *)buf1.data; 835 | req_msg = { 836 | .base = {.type = AppMessageType::kRandomFill}, 837 | .remote_context = remote_write_op_id, 838 | .seed = req_seed, 839 | .page_size = page_size, 840 | .num_pages = num_pages, 841 | }; 842 | for (size_t i = 0; i < num_pages; i++) { 843 | req_msg.page_idx(i) = page_idx[i]; 844 | } 845 | bool req_sent = false; 846 | net.PostSend(server_addr, buf1, req_msg.MessageBytes(), 847 | [&req_sent](Network &net, RdmaOp &op) { req_sent = true; }); 848 | while (!req_sent) { 849 | 
net.PollCompletion(); 850 | } 851 | printf("Sent RandomFillRequest to server. page_size: %zu, num_pages: %zu\n", 852 | page_size, num_pages); 853 | 854 | // Wait for REMOTE WRITE from server 855 | while (!last_remote_write_received) { 856 | net.PollCompletion(); 857 | } 858 | printf("Received RDMA WRITE to local GPU memory.\n"); 859 | 860 | // Verify data 861 | auto expected1 = RandomBytes(req_seed, page_size * num_pages); 862 | auto expected2 = RandomBytes(req_seed + 1, page_size * num_pages); 863 | auto actual1 = std::vector(page_size * num_pages); 864 | auto actual2 = std::vector(page_size * num_pages); 865 | for (size_t i = 0; i < num_pages; i++) { 866 | CUDA_CHECK(cudaMemcpy(actual1.data() + i * page_size, 867 | (uint8_t *)cuda_buf1.data + page_idx[i] * page_size, 868 | page_size, cudaMemcpyDeviceToHost)); 869 | CUDA_CHECK(cudaMemcpy(actual2.data() + i * page_size, 870 | (uint8_t *)cuda_buf2.data + page_idx[i] * page_size, 871 | page_size, cudaMemcpyDeviceToHost)); 872 | } 873 | CHECK(expected1 == actual1); 874 | CHECK(expected2 == actual2); 875 | printf("Data is correct\n"); 876 | 877 | fi_freeinfo(info); 878 | return 0; 879 | } 880 | 881 | int main(int argc, char **argv) { 882 | if (argc == 1) { 883 | return ServerMain(argc, argv); 884 | } else { 885 | return ClientMain(argc, argv); 886 | } 887 | } 888 | -------------------------------------------------------------------------------- /data/ls_sys_bus_pci_devices.txt: -------------------------------------------------------------------------------- 1 | total 0 2 | drwxr-xr-x 2 root root 0 Oct 14 04:36 . 3 | drwxr-xr-x 5 root root 0 Oct 14 04:36 .. 
4 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:00:00.0 -> ../../../devices/pci0000:00/0000:00:00.0 5 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:00:01.0 -> ../../../devices/pci0000:00/0000:00:01.0 6 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:00:01.3 -> ../../../devices/pci0000:00/0000:00:01.3 7 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:00:03.0 -> ../../../devices/pci0000:00/0000:00:03.0 8 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:00:04.0 -> ../../../devices/pci0000:00/0000:00:04.0 9 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:01:00.0 -> ../../../devices/pci0000:01/0000:01:00.0 10 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:02:00.0 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0 11 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:00.0 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:00.0 12 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:00.1 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:00.1 13 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:00.2 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:00.2 14 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:00.3 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:00.3 15 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:00.4 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:00.4 16 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:00.5 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:00.5 17 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:00.6 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:00.6 18 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:00.7 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:00.7 19 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:01.0 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:01.0 20 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:01.1 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:01.1 21 | lrwxrwxrwx 1 root root 0 
Oct 14 04:36 0000:03:01.2 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:01.2 22 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:01.3 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:01.3 23 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:01.4 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:01.4 24 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:01.5 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:01.5 25 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:01.6 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:01.6 26 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:01.7 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:01.7 27 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:02.0 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.0 28 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:02.1 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.1 29 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:02.2 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.2 30 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:02.3 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.3 31 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:02.4 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.4 32 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:02.5 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.5 33 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:02.6 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.6 34 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:02.7 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:02.7 35 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:03.0 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:03.0 36 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:03.1 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:03.1 37 | 
lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:03.2 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:03.2 38 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:03.3 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:03.3 39 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:03.4 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:03.4 40 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:03.5 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:03.5 41 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:03.6 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:03.6 42 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:03.7 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:03.7 43 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:04.0 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:04.0 44 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:04.1 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:04.1 45 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:04.2 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:04.2 46 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:04.3 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:04.3 47 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:04.4 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:04.4 48 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:04.5 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:04.5 49 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:04.6 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:04.6 50 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:04.7 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:04.7 51 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:05.0 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:05.0 52 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:05.1 -> 
../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:05.1 53 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:05.2 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:05.2 54 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:05.3 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:05.3 55 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:05.4 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:05.4 56 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:05.5 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:05.5 57 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:05.6 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:05.6 58 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:05.7 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:05.7 59 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:06.0 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:06.0 60 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:06.1 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:06.1 61 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:06.2 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:06.2 62 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:06.3 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:06.3 63 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:06.4 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:06.4 64 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:06.5 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:06.5 65 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:06.6 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:06.6 66 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:06.7 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:06.7 67 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:07.0 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:07.0 68 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 
0000:03:07.1 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:07.1 69 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:07.2 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:07.2 70 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:07.3 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:07.3 71 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:07.4 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:07.4 72 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:07.5 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:07.5 73 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:07.6 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:07.6 74 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:03:07.7 -> ../../../devices/pci0000:01/0000:01:00.0/0000:02:00.0/0000:03:07.7 75 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:44:00.0 -> ../../../devices/pci0000:44/0000:44:00.0 76 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:45:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0 77 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.0 78 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:00.1 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.1 79 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:00.2 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.2 80 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:00.3 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.3 81 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:00.4 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.4 82 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:00.5 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.5 83 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:00.6 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.6 84 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:00.7 -> 
../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.7 85 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:01.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.0 86 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:01.1 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.1 87 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:01.2 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.2 88 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:01.3 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.3 89 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:01.4 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.4 90 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:46:01.5 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.5 91 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:47:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.0/0000:47:00.0 92 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:48:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.1/0000:48:00.0 93 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:49:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.2/0000:49:00.0 94 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:4a:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.3/0000:4a:00.0 95 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:4b:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:00.4/0000:4b:00.0 96 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:4f:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.0/0000:4f:00.0 97 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:50:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.1/0000:50:00.0 98 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:51:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.2/0000:51:00.0 99 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:52:00.0 -> 
../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.3/0000:52:00.0 100 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:53:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.4/0000:53:00.0 101 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:54:00.0 -> ../../../devices/pci0000:44/0000:44:00.0/0000:45:00.0/0000:46:01.5/0000:54:00.0 102 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:55:00.0 -> ../../../devices/pci0000:55/0000:55:00.0 103 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:56:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0 104 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.0 105 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:00.1 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.1 106 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:00.2 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.2 107 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:57:00.3 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.3 108 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:00.4 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.4 109 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:57:00.5 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.5 110 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:00.6 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.6 111 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:00.7 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.7 112 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:01.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.0 113 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:01.1 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.1 114 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:01.2 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.2 115 | lrwxrwxrwx 1 root root 0 
Oct 14 04:36 0000:57:01.3 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.3 116 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:57:01.4 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.4 117 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:57:01.5 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.5 118 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:58:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.0/0000:58:00.0 119 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:59:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.1/0000:59:00.0 120 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:5a:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.2/0000:5a:00.0 121 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:5b:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:00.3/0000:5b:00.0 122 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:60:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.0/0000:60:00.0 123 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:61:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.1/0000:61:00.0 124 | lrwxrwxrwx 1 root root 0 Oct 14 04:36 0000:62:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.2/0000:62:00.0 125 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:63:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.3/0000:63:00.0 126 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:64:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.4/0000:64:00.0 127 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:65:00.0 -> ../../../devices/pci0000:55/0000:55:00.0/0000:56:00.0/0000:57:01.5/0000:65:00.0 128 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:66:00.0 -> ../../../devices/pci0000:66/0000:66:00.0 129 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:67:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0 130 | lrwxrwxrwx 1 root root 0 Dec 3 
02:39 0000:68:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.0 131 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:00.1 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.1 132 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:00.2 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.2 133 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:00.3 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.3 134 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:00.4 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.4 135 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:00.5 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.5 136 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:00.6 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.6 137 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:00.7 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.7 138 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:01.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.0 139 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:01.1 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.1 140 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:01.2 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.2 141 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:01.3 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.3 142 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:01.4 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.4 143 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:68:01.5 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.5 144 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:69:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.0/0000:69:00.0 145 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:6a:00.0 -> 
../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.1/0000:6a:00.0 146 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:6b:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.2/0000:6b:00.0 147 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:6c:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:00.3/0000:6c:00.0 148 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:71:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.0/0000:71:00.0 149 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:72:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.1/0000:72:00.0 150 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:73:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.2/0000:73:00.0 151 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:74:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.3/0000:74:00.0 152 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:75:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.4/0000:75:00.0 153 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:76:00.0 -> ../../../devices/pci0000:66/0000:66:00.0/0000:67:00.0/0000:68:01.5/0000:76:00.0 154 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:77:00.0 -> ../../../devices/pci0000:77/0000:77:00.0 155 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:78:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0 156 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.0 157 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:00.1 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.1 158 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:00.2 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.2 159 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:00.3 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.3 160 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:00.4 -> 
../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.4 161 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:00.5 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.5 162 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:00.6 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.6 163 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:00.7 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.7 164 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:01.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.0 165 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:01.1 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.1 166 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:01.2 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.2 167 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:01.3 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.3 168 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:01.4 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.4 169 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:79:01.5 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.5 170 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:7a:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.0/0000:7a:00.0 171 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:7b:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.1/0000:7b:00.0 172 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:7c:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.2/0000:7c:00.0 173 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:7d:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:00.3/0000:7d:00.0 174 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:82:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.0/0000:82:00.0 175 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:83:00.0 -> 
../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.1/0000:83:00.0 176 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:84:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.2/0000:84:00.0 177 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:85:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.3/0000:85:00.0 178 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:86:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.4/0000:86:00.0 179 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:87:00.0 -> ../../../devices/pci0000:77/0000:77:00.0/0000:78:00.0/0000:79:01.5/0000:87:00.0 180 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:88:00.0 -> ../../../devices/pci0000:88/0000:88:00.0 181 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:89:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0 182 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.0 183 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:00.1 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.1 184 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:00.2 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.2 185 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:00.3 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.3 186 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:00.4 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.4 187 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:00.5 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.5 188 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:00.6 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.6 189 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:00.7 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.7 190 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:01.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.0 191 | lrwxrwxrwx 
1 root root 0 Dec 3 02:39 0000:8a:01.1 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.1 192 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:01.2 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.2 193 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:01.3 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.3 194 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:01.4 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.4 195 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8a:01.5 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.5 196 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8c:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.1/0000:8c:00.0 197 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8d:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.2/0000:8d:00.0 198 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8e:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.3/0000:8e:00.0 199 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:8f:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:00.4/0000:8f:00.0 200 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:93:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.0/0000:93:00.0 201 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:94:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.1/0000:94:00.0 202 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:95:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.2/0000:95:00.0 203 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:96:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.3/0000:96:00.0 204 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:97:00.0 -> ../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.4/0000:97:00.0 205 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:98:00.0 -> 
../../../devices/pci0000:88/0000:88:00.0/0000:89:00.0/0000:8a:01.5/0000:98:00.0 206 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:99:00.0 -> ../../../devices/pci0000:99/0000:99:00.0 207 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9a:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0 208 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.0 209 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:00.1 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.1 210 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:00.2 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.2 211 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:00.3 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.3 212 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:00.4 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.4 213 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:00.5 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.5 214 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:00.6 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.6 215 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:00.7 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.7 216 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:01.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.0 217 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:01.1 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.1 218 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:01.2 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.2 219 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:01.3 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.3 220 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:01.4 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.4 221 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9b:01.5 -> 
../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.5 222 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9c:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.0/0000:9c:00.0 223 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9d:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.1/0000:9d:00.0 224 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9e:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.2/0000:9e:00.0 225 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:9f:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:00.3/0000:9f:00.0 226 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:a4:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.0/0000:a4:00.0 227 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:a5:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.1/0000:a5:00.0 228 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:a6:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.2/0000:a6:00.0 229 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:a7:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.3/0000:a7:00.0 230 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:a8:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.4/0000:a8:00.0 231 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:a9:00.0 -> ../../../devices/pci0000:99/0000:99:00.0/0000:9a:00.0/0000:9b:01.5/0000:a9:00.0 232 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:aa:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0 233 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ab:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0 234 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.0 235 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:00.1 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.1 236 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:00.2 -> 
../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.2 237 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:00.3 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.3 238 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:00.4 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.4 239 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:00.5 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.5 240 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:00.6 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.6 241 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:00.7 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.7 242 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:01.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.0 243 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:01.1 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.1 244 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:01.2 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.2 245 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:01.3 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.3 246 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:01.4 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.4 247 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ac:01.5 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.5 248 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ad:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.0/0000:ad:00.0 249 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ae:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.1/0000:ae:00.0 250 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:af:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.2/0000:af:00.0 251 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:b0:00.0 -> 
../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:00.3/0000:b0:00.0 252 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:b5:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.0/0000:b5:00.0 253 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:b6:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.1/0000:b6:00.0 254 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:b7:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.2/0000:b7:00.0 255 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:b8:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.3/0000:b8:00.0 256 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:b9:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.4/0000:b9:00.0 257 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ba:00.0 -> ../../../devices/pci0000:aa/0000:aa:00.0/0000:ab:00.0/0000:ac:01.5/0000:ba:00.0 258 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bb:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0 259 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bc:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0 260 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.0 261 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:00.1 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.1 262 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:00.2 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.2 263 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:00.3 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.3 264 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:00.4 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.4 265 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:00.5 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.5 266 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:00.6 -> 
../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.6 267 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:00.7 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.7 268 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:01.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.0 269 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:01.1 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.1 270 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:01.2 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.2 271 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:01.3 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.3 272 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:01.4 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.4 273 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bd:01.5 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.5 274 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:be:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.0/0000:be:00.0 275 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:bf:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.1/0000:bf:00.0 276 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:c0:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.2/0000:c0:00.0 277 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:c1:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:00.3/0000:c1:00.0 278 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:c6:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.0/0000:c6:00.0 279 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:c7:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.1/0000:c7:00.0 280 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:c8:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.2/0000:c8:00.0 281 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:c9:00.0 -> 
../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.3/0000:c9:00.0 282 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ca:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.4/0000:ca:00.0 283 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:cb:00.0 -> ../../../devices/pci0000:bb/0000:bb:00.0/0000:bc:00.0/0000:bd:01.5/0000:cb:00.0 284 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:cc:00.0 -> ../../../devices/pci0000:cc/0000:cc:00.0 285 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:cd:00.0 -> ../../../devices/pci0000:cc/0000:cc:00.0/0000:cd:00.0 286 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ce:00.0 -> ../../../devices/pci0000:cc/0000:cc:00.0/0000:cd:00.0/0000:ce:00.0 287 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ce:00.1 -> ../../../devices/pci0000:cc/0000:cc:00.0/0000:cd:00.0/0000:ce:00.1 288 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ce:00.2 -> ../../../devices/pci0000:cc/0000:cc:00.0/0000:cd:00.0/0000:ce:00.2 289 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:ce:00.3 -> ../../../devices/pci0000:cc/0000:cc:00.0/0000:cd:00.0/0000:ce:00.3 290 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:cf:00.0 -> ../../../devices/pci0000:cc/0000:cc:00.0/0000:cd:00.0/0000:ce:00.0/0000:cf:00.0 291 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:d0:00.0 -> ../../../devices/pci0000:cc/0000:cc:00.0/0000:cd:00.0/0000:ce:00.1/0000:d0:00.0 292 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:d1:00.0 -> ../../../devices/pci0000:cc/0000:cc:00.0/0000:cd:00.0/0000:ce:00.2/0000:d1:00.0 293 | lrwxrwxrwx 1 root root 0 Dec 3 02:39 0000:d2:00.0 -> ../../../devices/pci0000:cc/0000:cc:00.0/0000:cd:00.0/0000:ce:00.3/0000:d2:00.0 294 | -------------------------------------------------------------------------------- /src/7_queue.cpp: -------------------------------------------------------------------------------- 1 | // clang-format off 2 | /* 3 | Example run: 4 | 5 | server$ ./build/7_queue 6 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 7 | Run client with the 
following command: 8 | ./build/7_queue fe80000000000000088c03fffecfda9500000000d0bf57530000000000000000 [page_size num_pages] 9 | Registered 1 buffer on cuda:0 10 | ------ 11 | Received CONNECT message from client: 12 | addr: fe800000000000000835cbfffeec498f0000000011b45e020000000000000000 13 | MR[0]: addr=0x7fba60000000 size=1073741824 rkey=0x000000000040001c 14 | MR[1]: addr=0x7fba20000000 size=1073741824 rkey=0x0000000000d00045 15 | Received RandomFill request from client: 16 | remote_context: 0x00000123 17 | seed: 0xb584035fabe6ce9b 18 | page_size: 65536 19 | num_pages: 1000 20 | total_repeat: 500 21 | Generating random data.. 22 | Started RDMA WRITE to the remote GPU memory. 23 | [5.381s] WRITE: 100%, ops=1000000/1000000, posted=1000000(100%), bytes=65536000000/65536000000, bw=97.433Gbps(97.4%), 0.186Mpps 24 | Finished all RDMA WRITEs to the remote GPU memory. 25 | ------ 26 | ^C 27 | 28 | client ./build/7_queue fe80000000000000088c03fffecfda9500000000d0bf57530000000000000000 29 | domain: rdmap79s0-rdm, nic: rdmap79s0, fabric: efa, link: 100Gbps 30 | Registered 2 buffers on cuda:0 31 | Sent CONNECT message to server 32 | Sent RandomFillRequest to server. page_size: 65536, num_pages: 1000 33 | Received RDMA WRITE to local GPU memory. 
/// Raw 32-byte EFA endpoint address.
///
/// The text form, produced by ToString() and consumed by Parse(), is exactly
/// 64 hexadecimal characters: two per byte, most significant nibble first.
struct EfaAddress {
  uint8_t bytes[32];

  /// Copies the first 32 bytes pointed to by `bytes`.
  /// The source is only read, so it is taken as pointer-to-const; callers
  /// passing a mutable array keep working unchanged.
  explicit EfaAddress(const uint8_t bytes[32]) {
    memcpy(this->bytes, bytes, 32);
  }

  /// Returns the address as 64 lowercase hex characters.
  std::string ToString() const {
    static constexpr char kHexDigits[] = "0123456789abcdef";
    std::string text;
    text.reserve(64);
    for (uint8_t byte : bytes) {
      text.push_back(kHexDigits[byte >> 4]);
      text.push_back(kHexDigits[byte & 0x0f]);
    }
    return text;
  }

  /// Parses a 64-character hex string (upper or lower case) into an address.
  ///
  /// Prints a diagnostic to stderr and exits with status 1 when the length is
  /// not 64 or when any character is not a hex digit. Rejecting bad digits
  /// matters: an unchecked parse would leave some of the 32 bytes
  /// uninitialised and hand back a garbage address.
  static EfaAddress Parse(const std::string &str) {
    if (str.size() != 64) {
      fprintf(stderr, "Unexpected address length %zu\n", str.size());
      std::exit(1);
    }
    // Maps one hex character to its value, or -1 if it is not a hex digit.
    const auto nibble = [](char c) -> int {
      if (c >= '0' && c <= '9') return c - '0';
      if (c >= 'a' && c <= 'f') return c - 'a' + 10;
      if (c >= 'A' && c <= 'F') return c - 'A' + 10;
      return -1;
    };
    uint8_t bytes[32];
    for (size_t i = 0; i < 32; i++) {
      const int hi = nibble(str[2 * i]);
      const int lo = nibble(str[2 * i + 1]);
      if (hi < 0 || lo < 0) {
        fprintf(stderr, "Invalid hex digit in address at offset %zu\n", 2 * i);
        std::exit(1);
      }
      bytes[i] = static_cast<uint8_t>((hi << 4) | lo);
    }
    return EfaAddress(bytes);
  }
};
205 | fi_addr_t AddPeerAddress(const EfaAddress &peer_addr); 206 | void RegisterMemory(Buffer &buf); 207 | struct fid_mr *GetMR(const Buffer &buf); 208 | 209 | void PollCompletion(); 210 | void ProgressPendingOps(); 211 | void PostRecv(Buffer &buf, 212 | std::function &&callback); 213 | void PostSend(fi_addr_t addr, Buffer &buf, size_t len, 214 | std::function &&callback); 215 | void PostWrite(RdmaWriteOp &&write, 216 | std::function &&callback); 217 | void AddRemoteWrite(uint32_t id, 218 | std::function &&callback); 219 | 220 | Network(const Network &) = delete; 221 | Network(Network &&other) 222 | : fi(other.fi), fabric(other.fabric), domain(other.domain), cq(other.cq), 223 | av(other.av), ep(other.ep), addr(other.addr) { 224 | other.fi = nullptr; 225 | other.fabric = nullptr; 226 | other.domain = nullptr; 227 | other.cq = nullptr; 228 | other.av = nullptr; 229 | other.ep = nullptr; 230 | } 231 | 232 | ~Network() { 233 | for (const auto &[_, mr] : mr) { 234 | FI_CHECK(fi_close(&mr->fid)); 235 | } 236 | if (ep) 237 | FI_CHECK(fi_close(&ep->fid)); 238 | if (av) 239 | FI_CHECK(fi_close(&av->fid)); 240 | if (cq) 241 | FI_CHECK(fi_close(&cq->fid)); 242 | if (domain) 243 | FI_CHECK(fi_close(&domain->fid)); 244 | if (fabric) 245 | FI_CHECK(fi_close(&fabric->fid)); 246 | } 247 | 248 | private: 249 | Network(struct fi_info *fi, struct fid_fabric *fabric, 250 | struct fid_domain *domain, struct fid_cq *cq, struct fid_av *av, 251 | struct fid_ep *ep, EfaAddress addr) 252 | : fi(fi), fabric(fabric), domain(domain), cq(cq), av(av), ep(ep), 253 | addr(addr) {} 254 | }; 255 | 256 | void *align_up(void *ptr, size_t align) { 257 | uintptr_t addr = (uintptr_t)ptr; 258 | return (void *)((addr + align - 1) & ~(align - 1)); 259 | } 260 | 261 | struct Buffer { 262 | void *data; 263 | size_t size; 264 | int cuda_device; 265 | int dmabuf_fd; 266 | 267 | static Buffer Alloc(size_t size, size_t align) { 268 | void *raw_data = malloc(size); 269 | CHECK(raw_data != nullptr); 270 | return 
Buffer(raw_data, size, align, -1, -1); 271 | } 272 | 273 | static Buffer AllocCuda(size_t size, size_t align) { 274 | void *raw_data; 275 | struct cudaPointerAttributes attrs = {}; 276 | CUDA_CHECK(cudaMalloc(&raw_data, size)); 277 | CUDA_CHECK(cudaPointerGetAttributes(&attrs, raw_data)); 278 | CHECK(attrs.type == cudaMemoryTypeDevice); 279 | int cuda_device = attrs.device; 280 | int fd = -1; 281 | CU_CHECK(cuMemGetHandleForAddressRange( 282 | &fd, (CUdeviceptr)align_up(raw_data, align), size, 283 | CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); 284 | return Buffer(raw_data, size, align, cuda_device, fd); 285 | } 286 | 287 | bool is_cuda() const { return cuda_device >= 0; } 288 | 289 | Buffer(Buffer &&other) 290 | : data(other.data), size(other.size), cuda_device(other.cuda_device), 291 | dmabuf_fd(other.dmabuf_fd), raw_data(other.raw_data) { 292 | other.data = nullptr; 293 | other.raw_data = nullptr; 294 | other.size = 0; 295 | other.cuda_device = -1; 296 | other.dmabuf_fd = -1; 297 | } 298 | 299 | ~Buffer() { 300 | if (is_cuda()) { 301 | CUDA_CHECK(cudaFree(raw_data)); 302 | } else { 303 | free(raw_data); 304 | } 305 | } 306 | 307 | private: 308 | void *raw_data; 309 | 310 | Buffer(void *raw_data, size_t raw_size, size_t align, int cuda_device, 311 | int dmabuf_fd) { 312 | this->raw_data = raw_data; 313 | this->data = align_up(raw_data, align); 314 | this->size = (size_t)((uintptr_t)raw_data + raw_size - (uintptr_t)data); 315 | this->cuda_device = cuda_device; 316 | this->dmabuf_fd = dmabuf_fd; 317 | } 318 | Buffer(const Buffer &) = delete; 319 | }; 320 | 321 | struct fi_info *GetInfo() { 322 | struct fi_info *hints, *info; 323 | hints = fi_allocinfo(); 324 | hints->caps = FI_MSG | FI_RMA | FI_HMEM | FI_LOCAL_COMM | FI_REMOTE_COMM; 325 | hints->ep_attr->type = FI_EP_RDM; 326 | hints->fabric_attr->prov_name = strdup("efa"); 327 | hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_HMEM | FI_MR_VIRT_ADDR | 328 | FI_MR_ALLOCATED | FI_MR_PROV_KEY; 329 | 
hints->domain_attr->threading = FI_THREAD_SAFE; 330 | FI_CHECK(fi_getinfo(FI_VERSION(2, 0), nullptr, nullptr, 0, hints, &info)); 331 | fi_freeinfo(hints); 332 | return info; 333 | } 334 | 335 | Network Network::Open(struct fi_info *fi) { 336 | struct fid_fabric *fabric; 337 | FI_CHECK(fi_fabric(fi->fabric_attr, &fabric, nullptr)); 338 | 339 | struct fid_domain *domain; 340 | FI_CHECK(fi_domain(fabric, fi, &domain, nullptr)); 341 | 342 | struct fid_cq *cq; 343 | struct fi_cq_attr cq_attr = {}; 344 | cq_attr.format = FI_CQ_FORMAT_DATA; 345 | FI_CHECK(fi_cq_open(domain, &cq_attr, &cq, nullptr)); 346 | 347 | struct fid_av *av; 348 | struct fi_av_attr av_attr = {}; 349 | FI_CHECK(fi_av_open(domain, &av_attr, &av, nullptr)); 350 | 351 | struct fid_ep *ep; 352 | FI_CHECK(fi_endpoint(domain, fi, &ep, nullptr)); 353 | FI_CHECK(fi_ep_bind(ep, &cq->fid, FI_SEND | FI_RECV)); 354 | FI_CHECK(fi_ep_bind(ep, &av->fid, 0)); 355 | 356 | FI_CHECK(fi_enable(ep)); 357 | 358 | uint8_t addrbuf[64]; 359 | size_t addrlen = sizeof(addrbuf); 360 | FI_CHECK(fi_getname(&ep->fid, addrbuf, &addrlen)); 361 | if (addrlen != 32) { 362 | fprintf(stderr, "Unexpected address length %zu\n", addrlen); 363 | std::exit(1); 364 | } 365 | auto addr = EfaAddress(addrbuf); 366 | 367 | return Network(fi, fabric, domain, cq, av, ep, addr); 368 | } 369 | 370 | fi_addr_t Network::AddPeerAddress(const EfaAddress &peer_addr) { 371 | fi_addr_t addr = FI_ADDR_UNSPEC; 372 | int ret = fi_av_insert(av, peer_addr.bytes, 1, &addr, 0, nullptr); 373 | if (ret != 1) { 374 | fprintf(stderr, "fi_av_insert failed: %d\n", ret); 375 | std::exit(1); 376 | } 377 | return addr; 378 | } 379 | 380 | void Network::RegisterMemory(Buffer &buf) { 381 | struct fid_mr *mr; 382 | struct fi_mr_attr mr_attr = { 383 | .iov_count = 1, 384 | .access = FI_SEND | FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ | 385 | FI_WRITE | FI_READ, 386 | }; 387 | struct iovec iov = {.iov_base = buf.data, .iov_len = buf.size}; 388 | struct fi_mr_dmabuf dmabuf = { 
389 | .fd = buf.dmabuf_fd, .offset = 0, .len = buf.size, .base_addr = buf.data}; 390 | uint64_t flags = 0; 391 | if (buf.is_cuda()) { 392 | mr_attr.iface = FI_HMEM_CUDA; 393 | mr_attr.device.cuda = buf.cuda_device; 394 | if (buf.dmabuf_fd != -1) { 395 | mr_attr.dmabuf = &dmabuf; 396 | flags = FI_MR_DMABUF; 397 | } else { 398 | mr_attr.mr_iov = &iov; 399 | } 400 | } else { 401 | mr_attr.mr_iov = &iov; 402 | } 403 | FI_CHECK(fi_mr_regattr(domain, &mr_attr, flags, &mr)); 404 | this->mr[buf.data] = mr; 405 | } 406 | 407 | struct fid_mr *Network::GetMR(const Buffer &buf) { 408 | auto it = mr.find(buf.data); 409 | CHECK(it != mr.end()); 410 | return it->second; 411 | } 412 | 413 | void Network::PostRecv(Buffer &buf, 414 | std::function &&callback) { 415 | auto *op = new RdmaOp{ 416 | .type = RdmaOpType::kRecv, 417 | .recv = 418 | RdmaRecvOp{.buf = &buf, .src_addr = FI_ADDR_UNSPEC, .recv_size = 0}, 419 | .callback = std::move(callback), 420 | }; 421 | pending_ops.push_back(op); 422 | ProgressPendingOps(); 423 | } 424 | 425 | void Network::PostSend(fi_addr_t addr, Buffer &buf, size_t len, 426 | std::function &&callback) { 427 | CHECK(len <= buf.size); 428 | auto *op = new RdmaOp{ 429 | .type = RdmaOpType::kSend, 430 | .send = RdmaSendOp{.buf = &buf, .len = len, .dest_addr = addr}, 431 | .callback = std::move(callback), 432 | }; 433 | pending_ops.push_back(op); 434 | ProgressPendingOps(); 435 | } 436 | 437 | void Network::PostWrite(RdmaWriteOp &&write, 438 | std::function &&callback) { 439 | auto *op = new RdmaOp{ 440 | .type = RdmaOpType::kWrite, 441 | .write = std::move(write), 442 | .callback = std::move(callback), 443 | }; 444 | pending_ops.push_back(op); 445 | ProgressPendingOps(); 446 | } 447 | 448 | void Network::AddRemoteWrite( 449 | uint32_t id, std::function &&callback) { 450 | CHECK(remote_write_ops.count(id) == 0); 451 | auto *op = new RdmaOp{ 452 | .type = RdmaOpType::kRemoteWrite, 453 | .remote_write = RdmaRemoteWriteOp{.op_id = id}, 454 | .callback = 
std::move(callback), 455 | }; 456 | remote_write_ops[id] = op; 457 | } 458 | 459 | void Network::ProgressPendingOps() { 460 | while (!pending_ops.empty()) { 461 | auto *op = pending_ops.front(); 462 | pending_ops.pop_front(); 463 | const char *op_name = nullptr; 464 | ssize_t ret = 0; 465 | switch (op->type) { 466 | case RdmaOpType::kRecv: { 467 | op_name = "fi_recv"; 468 | auto &recv = op->recv; 469 | struct iovec iov = { 470 | .iov_base = recv.buf->data, 471 | .iov_len = recv.buf->size, 472 | }; 473 | struct fi_msg msg = { 474 | .msg_iov = &iov, 475 | .desc = &GetMR(*recv.buf)->mem_desc, 476 | .iov_count = 1, 477 | .addr = FI_ADDR_UNSPEC, 478 | .context = op, 479 | }; 480 | ret = fi_recvmsg(ep, &msg, 0); 481 | break; 482 | } 483 | case RdmaOpType::kSend: { 484 | op_name = "fi_send"; 485 | auto &send = op->send; 486 | struct iovec iov = { 487 | .iov_base = send.buf->data, 488 | .iov_len = send.len, 489 | }; 490 | struct fi_msg msg = { 491 | .msg_iov = &iov, 492 | .desc = &GetMR(*send.buf)->mem_desc, 493 | .iov_count = 1, 494 | .addr = send.dest_addr, 495 | .context = op, 496 | }; 497 | ret = fi_sendmsg(ep, &msg, 0); 498 | break; 499 | } 500 | case RdmaOpType::kWrite: { 501 | op_name = "fi_writemsg"; 502 | auto &write = op->write; 503 | struct iovec iov = { 504 | .iov_base = (uint8_t *)write.buf->data + write.offset, 505 | .iov_len = write.len, 506 | }; 507 | struct fi_rma_iov rma_iov = { 508 | .addr = write.dest_ptr, 509 | .len = write.len, 510 | .key = write.dest_key, 511 | }; 512 | struct fi_msg_rma msg = { 513 | .msg_iov = &iov, 514 | .desc = &GetMR(*write.buf)->mem_desc, 515 | .iov_count = 1, 516 | .addr = write.dest_addr, 517 | .rma_iov = &rma_iov, 518 | .rma_iov_count = 1, 519 | .context = op, 520 | .data = write.imm_data, 521 | }; 522 | uint64_t flags = 0; 523 | if (write.imm_data) { 524 | flags |= FI_REMOTE_CQ_DATA; 525 | } 526 | ret = fi_writemsg(ep, &msg, flags); 527 | break; 528 | } 529 | case RdmaOpType::kRemoteWrite: { 530 | CHECK(false); // 
Unreachable 531 | break; 532 | } 533 | } 534 | if (ret == -FI_EAGAIN) { 535 | // Put it back to the front of the queue 536 | pending_ops.push_front(op); 537 | break; 538 | } 539 | if (ret) { 540 | // Unexpected error. Don't put it back. 541 | // Delete the op since it's not going to be in the completion queue. 542 | delete op; 543 | fprintf(stderr, "Failed to ProgressPendingOps. %s() returned %ld (%s)\n", 544 | op_name, ret, fi_strerror(-ret)); 545 | fflush(stderr); 546 | break; 547 | } 548 | } 549 | } 550 | 551 | void HandleCompletion(Network &net, const struct fi_cq_data_entry &cqe) { 552 | RdmaOp *op = nullptr; 553 | if (cqe.flags & FI_REMOTE_WRITE) { 554 | // REMOTE WRITE does not have op_context 555 | // NOTE(lequn): EFA only supports 4 bytes of immediate data. 556 | uint32_t op_id = cqe.data; 557 | if (!op_id) 558 | return; 559 | auto it = net.remote_write_ops.find(op_id); 560 | if (it == net.remote_write_ops.end()) 561 | return; 562 | op = it->second; 563 | net.remote_write_ops.erase(it); 564 | } else { 565 | // RECV / SEND / WRITE 566 | op = (RdmaOp *)cqe.op_context; 567 | if (!op) 568 | return; 569 | if (cqe.flags & FI_RECV) { 570 | op->recv.recv_size = cqe.len; 571 | } else if (cqe.flags & FI_SEND) { 572 | // Nothing special 573 | } else if (cqe.flags & FI_WRITE) { 574 | // Nothing special 575 | } else { 576 | fprintf(stderr, "Unhandled completion type. 
cqe.flags=%lx\n", cqe.flags); 577 | std::exit(1); 578 | } 579 | } 580 | if (op->callback) 581 | op->callback(net, *op); 582 | delete op; 583 | } 584 | 585 | void Network::PollCompletion() { 586 | // Process completions 587 | struct fi_cq_data_entry cqe[kCompletionQueueReadCount]; 588 | for (;;) { 589 | auto ret = fi_cq_read(cq, cqe, kCompletionQueueReadCount); 590 | if (ret > 0) { 591 | for (ssize_t i = 0; i < ret; i++) { 592 | HandleCompletion(*this, cqe[i]); 593 | } 594 | } else if (ret == -FI_EAVAIL) { 595 | struct fi_cq_err_entry err_entry; 596 | ret = fi_cq_readerr(cq, &err_entry, 0); 597 | if (ret < 0) { 598 | fprintf(stderr, "fi_cq_readerr error: %zd (%s)\n", ret, 599 | fi_strerror(-ret)); 600 | std::exit(1); 601 | } else if (ret > 0) { 602 | fprintf(stderr, "Failed libfabric operation: %s\n", 603 | fi_cq_strerror(cq, err_entry.prov_errno, err_entry.err_data, 604 | nullptr, 0)); 605 | } else { 606 | fprintf(stderr, "fi_cq_readerr returned 0 unexpectedly.\n"); 607 | std::exit(1); 608 | } 609 | } else if (ret == -FI_EAGAIN) { 610 | // No more completions 611 | break; 612 | } else { 613 | fprintf(stderr, "fi_cq_read error: %zd (%s)\n", ret, fi_strerror(-ret)); 614 | std::exit(1); 615 | } 616 | } 617 | 618 | // Try to make progress. 
619 | ProgressPendingOps(); 620 | } 621 | 622 | enum class AppMessageType : uint8_t { 623 | kConnect = 0, 624 | kRandomFill = 1, 625 | }; 626 | 627 | struct AppMessageBase { 628 | AppMessageType type; 629 | }; 630 | 631 | struct AppConnectMessage { 632 | struct MemoryRegion { 633 | uint64_t addr; 634 | uint64_t size; 635 | uint64_t rkey; 636 | }; 637 | 638 | AppMessageBase base; 639 | EfaAddress client_addr; 640 | size_t num_mr; 641 | 642 | MemoryRegion &mr(size_t index) { 643 | CHECK(index < num_mr); 644 | return ((MemoryRegion *)((uintptr_t)&base + sizeof(*this)))[index]; 645 | } 646 | 647 | size_t MessageBytes() const { 648 | return sizeof(*this) + num_mr * sizeof(MemoryRegion); 649 | } 650 | }; 651 | 652 | struct AppRandomFillMessage { 653 | AppMessageBase base; 654 | uint32_t remote_context; 655 | uint64_t seed; 656 | size_t page_size; 657 | size_t num_pages; 658 | 659 | uint32_t &page_idx(size_t index) { 660 | CHECK(index < num_pages); 661 | return ((uint32_t *)((uintptr_t)&base + sizeof(*this)))[index]; 662 | } 663 | 664 | size_t MessageBytes() const { 665 | return sizeof(*this) + num_pages * sizeof(uint32_t); 666 | } 667 | }; 668 | 669 | std::vector RandomBytes(uint64_t seed, size_t size) { 670 | CHECK(size % sizeof(uint64_t) == 0); 671 | std::vector buf(size); 672 | std::mt19937_64 gen(seed); 673 | std::uniform_int_distribution dist; 674 | for (size_t i = 0; i < size; i += sizeof(uint64_t)) { 675 | *(uint64_t *)(buf.data() + i) = dist(gen); 676 | } 677 | return buf; 678 | } 679 | 680 | long TimeDeltaNanos( 681 | const std::chrono::time_point &start, 682 | const std::chrono::time_point &end) { 683 | return std::chrono::duration_cast(end - start) 684 | .count(); 685 | } 686 | 687 | struct RandomFillRequestState { 688 | enum class State { 689 | kWaitRequest, 690 | kWrite, 691 | kDone, 692 | }; 693 | 694 | struct WriteState { 695 | size_t i_repeat; 696 | size_t i_buf; 697 | size_t i_page; 698 | }; 699 | 700 | Network *net; 701 | Buffer *cuda_buf; 702 | size_t 
total_bw = 0; 703 | State state = State::kWaitRequest; 704 | 705 | fi_addr_t client_addr = FI_ADDR_UNSPEC; 706 | AppConnectMessage *connect_msg = nullptr; 707 | AppRandomFillMessage *request_msg = nullptr; 708 | 709 | size_t total_repeat = 0; 710 | WriteState write_state; 711 | size_t total_write_ops = 0; 712 | size_t write_op_size = 0; 713 | size_t posted_write_ops = 0; 714 | size_t finished_write_ops = 0; 715 | std::chrono::time_point write_start_at; 716 | 717 | RandomFillRequestState(Network *net, Buffer *cuda_buf) 718 | : net(net), cuda_buf(cuda_buf) { 719 | total_bw = net->fi->nic->link_attr->speed; 720 | } 721 | 722 | void OnRecv(Network &net, RdmaOp &op) { 723 | if (client_addr == FI_ADDR_UNSPEC) { 724 | HandleConnect(net, op); 725 | } else { 726 | HandleRequest(net, op); 727 | } 728 | } 729 | 730 | void HandleConnect(Network &net, RdmaOp &op) { 731 | auto *base_msg = (AppMessageBase *)op.recv.buf->data; 732 | CHECK(base_msg->type == AppMessageType::kConnect); 733 | CHECK(op.recv.recv_size >= sizeof(AppConnectMessage)); 734 | auto &msg = *(AppConnectMessage *)base_msg; 735 | CHECK(op.recv.recv_size == msg.MessageBytes()); 736 | CHECK(msg.num_mr > 0); 737 | 738 | // Save the message. Note that we don't reuse the buffer. 
739 | connect_msg = &msg; 740 | 741 | // Add the client to AV 742 | client_addr = net.AddPeerAddress(msg.client_addr); 743 | 744 | printf("Received CONNECT message from client:\n"); 745 | printf(" addr: %s\n", msg.client_addr.ToString().c_str()); 746 | for (size_t i = 0; i < msg.num_mr; i++) { 747 | printf(" MR[%zu]: addr=0x%012lx size=%lu rkey=0x%016lx\n", i, 748 | msg.mr(i).addr, msg.mr(i).size, msg.mr(i).rkey); 749 | } 750 | } 751 | 752 | void HandleRequest(Network &net, RdmaOp &op) { 753 | auto *base_msg = (const AppMessageBase *)op.recv.buf->data; 754 | CHECK(base_msg->type == AppMessageType::kRandomFill); 755 | CHECK(op.recv.recv_size >= sizeof(AppRandomFillMessage)); 756 | auto &msg = *(AppRandomFillMessage *)base_msg; 757 | CHECK(op.recv.recv_size == msg.MessageBytes()); 758 | 759 | // Save the message. Note that we don't reuse the buffer. 760 | request_msg = &msg; 761 | 762 | printf("Received RandomFill request from client:\n"); 763 | printf(" remote_context: 0x%08x\n", msg.remote_context); 764 | printf(" seed: 0x%016lx\n", msg.seed); 765 | printf(" page_size: %zu\n", msg.page_size); 766 | printf(" num_pages: %zu\n", msg.num_pages); 767 | total_repeat = 500; 768 | printf(" total_repeat: %zu\n", total_repeat); 769 | 770 | // Generate random data and copy to local GPU memory 771 | printf("Generating random data"); 772 | for (size_t i = 0; i < connect_msg->num_mr; ++i) { 773 | auto bytes = RandomBytes(msg.seed + i, msg.page_size * msg.num_pages); 774 | CUDA_CHECK(cudaMemcpy((uint8_t *)cuda_buf->data + i * bytes.size(), 775 | bytes.data(), bytes.size(), 776 | cudaMemcpyHostToDevice)); 777 | printf("."); 778 | fflush(stdout); 779 | } 780 | printf("\n"); 781 | 782 | // Prepare RDMA WRITE the data to remote GPU. 
783 | total_write_ops = 784 | connect_msg->num_mr * request_msg->num_pages * total_repeat; 785 | posted_write_ops = 0; 786 | finished_write_ops = 0; 787 | write_op_size = request_msg->page_size; 788 | write_state = {.i_repeat = 0, .i_buf = 0, .i_page = 0}; 789 | write_start_at = std::chrono::high_resolution_clock::now(); 790 | state = State::kWrite; 791 | printf("Started RDMA WRITE to the remote GPU memory.\n"); 792 | } 793 | 794 | void ContinuePostWrite() { 795 | auto &s = write_state; 796 | if (s.i_repeat == total_repeat) 797 | return; 798 | auto page_size = request_msg->page_size; 799 | auto num_pages = request_msg->num_pages; 800 | 801 | uint32_t imm_data = 0; 802 | if (s.i_repeat + 1 == total_repeat && s.i_buf + 1 == connect_msg->num_mr && 803 | s.i_page + 1 == num_pages) { 804 | // The last WRITE. Pass remote context back. 805 | imm_data = request_msg->remote_context; 806 | } 807 | net->PostWrite( 808 | {.buf = cuda_buf, 809 | .offset = s.i_buf * (page_size * num_pages) + s.i_page * page_size, 810 | .len = page_size, 811 | .imm_data = imm_data, 812 | .dest_ptr = connect_msg->mr(s.i_buf).addr + 813 | request_msg->page_idx(s.i_page) * page_size, 814 | .dest_addr = client_addr, 815 | .dest_key = connect_msg->mr(s.i_buf).rkey}, 816 | [this](Network &net, RdmaOp &op) { HandleWriteCompletion(); }); 817 | ++posted_write_ops; 818 | 819 | if (++s.i_page == num_pages) { 820 | s.i_page = 0; 821 | if (++s.i_buf == connect_msg->num_mr) { 822 | s.i_buf = 0; 823 | if (++s.i_repeat == total_repeat) 824 | return; 825 | } 826 | } 827 | } 828 | 829 | void PrintProgress(std::chrono::high_resolution_clock::time_point now, 830 | uint64_t posted, uint64_t finished) { 831 | auto elapsed = TimeDeltaNanos(write_start_at, now) * 1e-9; 832 | float bw_gbps = 8.0f * write_op_size * finished / (elapsed * 1e9); 833 | float bw_util = bw_gbps / (total_bw * 1e-9); 834 | printf("\r[%.3fs] WRITE: %.0f%%, ops=%zu/%zu, posted=%zu(%.0f%%), " 835 | "bytes=%zu/%zu, bw=%.3fGbps(%.1f%%), 
%.3fMpps\033[K", 836 | // progress 837 | elapsed, 100.0 * finished / total_write_ops, 838 | // ops 839 | finished, total_write_ops, posted, 100.0 * posted / total_write_ops, 840 | // bytes 841 | write_op_size * finished, write_op_size * total_write_ops, 842 | // bw 843 | bw_gbps, 100.0 * bw_util, finished / elapsed * 1e-6); 844 | fflush(stdout); 845 | } 846 | 847 | void HandleWriteCompletion() { 848 | ++finished_write_ops; 849 | if (finished_write_ops % 16384 == 0) { 850 | auto now = std::chrono::high_resolution_clock::now(); 851 | PrintProgress(now, posted_write_ops, finished_write_ops); 852 | } 853 | if (finished_write_ops == total_write_ops) { 854 | auto now = std::chrono::high_resolution_clock::now(); 855 | PrintProgress(now, posted_write_ops, finished_write_ops); 856 | printf("\nFinished all RDMA WRITEs to the remote GPU memory.\n"); 857 | state = State::kDone; 858 | } 859 | } 860 | }; 861 | 862 | int ServerMain(int argc, char **argv) { 863 | // Open Netowrk 864 | struct fi_info *info = GetInfo(); 865 | auto net = Network::Open(info); 866 | printf("domain: %14s", info->domain_attr->name); 867 | printf(", nic: %10s", info->nic->device_attr->name); 868 | printf(", fabric: %s", info->fabric_attr->prov_name); 869 | printf(", link: %.0fGbps", info->nic->link_attr->speed / 1e9); 870 | printf("\n"); 871 | printf("Run client with the following command:\n"); 872 | printf(" %s %s [page_size num_pages]\n", argv[0], 873 | net.addr.ToString().c_str()); 874 | 875 | // Allocate and register message buffer 876 | auto buf1 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 877 | net.RegisterMemory(buf1); 878 | auto buf2 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 879 | net.RegisterMemory(buf2); 880 | 881 | // Allocate and register CUDA memory 882 | auto cuda_buf = Buffer::AllocCuda(kMemoryRegionSize * 2, kBufAlign); 883 | net.RegisterMemory(cuda_buf); 884 | printf("Registered 1 buffer on cuda:%d\n", cuda_buf.cuda_device); 885 | 886 | // Loop forever. 
Accept one client at a time. 887 | for (;;) { 888 | printf("------\n"); 889 | // State machine 890 | RandomFillRequestState s(&net, &cuda_buf); 891 | // RECV for CONNECT 892 | net.PostRecv(buf1, [&s](Network &net, RdmaOp &op) { s.OnRecv(net, op); }); 893 | // RECV for RandomFillRequest 894 | net.PostRecv(buf2, [&s](Network &net, RdmaOp &op) { s.OnRecv(net, op); }); 895 | // Wait for completion 896 | while (s.state != RandomFillRequestState::State::kDone) { 897 | net.PollCompletion(); 898 | switch (s.state) { 899 | case RandomFillRequestState::State::kWaitRequest: 900 | break; 901 | case RandomFillRequestState::State::kWrite: 902 | s.ContinuePostWrite(); 903 | break; 904 | case RandomFillRequestState::State::kDone: 905 | break; 906 | } 907 | } 908 | } 909 | 910 | fi_freeinfo(info); 911 | return 0; 912 | } 913 | 914 | int ClientMain(int argc, char **argv) { 915 | CHECK(argc == 2 || argc == 4); 916 | auto server_addrname = EfaAddress::Parse(argv[1]); 917 | size_t page_size, num_pages; 918 | if (argc == 4) { 919 | page_size = std::stoull(argv[2]); 920 | num_pages = std::stoull(argv[3]); 921 | } else { 922 | page_size = 128 * 8 * 2 * 16 * sizeof(uint16_t); 923 | num_pages = 1000; 924 | } 925 | size_t max_pages = kMemoryRegionSize / page_size; 926 | CHECK(page_size * num_pages <= kMemoryRegionSize); 927 | 928 | // Open Netowrk 929 | struct fi_info *info = GetInfo(); 930 | auto net = Network::Open(info); 931 | printf("domain: %14s", info->domain_attr->name); 932 | printf(", nic: %10s", info->nic->device_attr->name); 933 | printf(", fabric: %s", info->fabric_attr->prov_name); 934 | printf(", link: %.0fGbps", info->nic->link_attr->speed / 1e9); 935 | printf("\n"); 936 | auto server_addr = net.AddPeerAddress(server_addrname); 937 | 938 | // Allocate and register message buffer 939 | auto buf1 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 940 | net.RegisterMemory(buf1); 941 | 942 | // Allocate and register CUDA memory 943 | auto cuda_buf1 = 
Buffer::AllocCuda(kMemoryRegionSize, kBufAlign); 944 | net.RegisterMemory(cuda_buf1); 945 | auto cuda_buf2 = Buffer::AllocCuda(kMemoryRegionSize, kBufAlign); 946 | net.RegisterMemory(cuda_buf2); 947 | printf("Registered 2 buffers on cuda:%d\n", cuda_buf1.cuda_device); 948 | 949 | // Prepare request 950 | std::mt19937_64 rng(0xabcdabcd987UL); 951 | uint64_t req_seed = rng(); 952 | std::vector page_idx; 953 | std::vector tmp(max_pages); 954 | std::iota(tmp.begin(), tmp.end(), 0); 955 | std::sample(tmp.begin(), tmp.end(), std::back_inserter(page_idx), num_pages, 956 | rng); 957 | 958 | // Send address and MR to server 959 | auto &connect_msg = *(AppConnectMessage *)buf1.data; 960 | connect_msg = { 961 | .base = {.type = AppMessageType::kConnect}, 962 | .client_addr = net.addr, 963 | .num_mr = 2, 964 | }; 965 | connect_msg.mr(0) = {.addr = (uint64_t)cuda_buf1.data, 966 | .size = kMemoryRegionSize, 967 | .rkey = net.GetMR(cuda_buf1)->key}; 968 | connect_msg.mr(1) = {.addr = (uint64_t)cuda_buf2.data, 969 | .size = kMemoryRegionSize, 970 | .rkey = net.GetMR(cuda_buf2)->key}; 971 | bool connect_sent = false; 972 | net.PostSend( 973 | server_addr, buf1, connect_msg.MessageBytes(), 974 | [&connect_sent](Network &net, RdmaOp &op) { connect_sent = true; }); 975 | while (!connect_sent) { 976 | net.PollCompletion(); 977 | } 978 | printf("Sent CONNECT message to server\n"); 979 | 980 | // Prepare to receive the last REMOTE WRITE from server 981 | bool last_remote_write_received = false; 982 | uint32_t remote_write_op_id = 0x123; 983 | net.AddRemoteWrite(remote_write_op_id, 984 | [&last_remote_write_received](Network &net, RdmaOp &op) { 985 | last_remote_write_received = true; 986 | }); 987 | 988 | // Send message to server 989 | auto &req_msg = *(AppRandomFillMessage *)buf1.data; 990 | req_msg = { 991 | .base = {.type = AppMessageType::kRandomFill}, 992 | .remote_context = remote_write_op_id, 993 | .seed = req_seed, 994 | .page_size = page_size, 995 | .num_pages = num_pages, 996 
| }; 997 | for (size_t i = 0; i < num_pages; i++) { 998 | req_msg.page_idx(i) = page_idx[i]; 999 | } 1000 | bool req_sent = false; 1001 | net.PostSend(server_addr, buf1, req_msg.MessageBytes(), 1002 | [&req_sent](Network &net, RdmaOp &op) { req_sent = true; }); 1003 | while (!req_sent) { 1004 | net.PollCompletion(); 1005 | } 1006 | printf("Sent RandomFillRequest to server. page_size: %zu, num_pages: %zu\n", 1007 | page_size, num_pages); 1008 | 1009 | // Wait for REMOTE WRITE from server 1010 | while (!last_remote_write_received) { 1011 | net.PollCompletion(); 1012 | } 1013 | printf("Received RDMA WRITE to local GPU memory.\n"); 1014 | 1015 | // Verify data 1016 | auto expected1 = RandomBytes(req_seed, page_size * num_pages); 1017 | auto expected2 = RandomBytes(req_seed + 1, page_size * num_pages); 1018 | auto actual1 = std::vector(page_size * num_pages); 1019 | auto actual2 = std::vector(page_size * num_pages); 1020 | for (size_t i = 0; i < num_pages; i++) { 1021 | CUDA_CHECK(cudaMemcpy(actual1.data() + i * page_size, 1022 | (uint8_t *)cuda_buf1.data + page_idx[i] * page_size, 1023 | page_size, cudaMemcpyDeviceToHost)); 1024 | CUDA_CHECK(cudaMemcpy(actual2.data() + i * page_size, 1025 | (uint8_t *)cuda_buf2.data + page_idx[i] * page_size, 1026 | page_size, cudaMemcpyDeviceToHost)); 1027 | } 1028 | CHECK(expected1 == actual1); 1029 | CHECK(expected2 == actual2); 1030 | printf("Data is correct\n"); 1031 | 1032 | fi_freeinfo(info); 1033 | return 0; 1034 | } 1035 | 1036 | int main(int argc, char **argv) { 1037 | if (argc == 1) { 1038 | return ServerMain(argc, argv); 1039 | } else { 1040 | return ClientMain(argc, argv); 1041 | } 1042 | } 1043 | -------------------------------------------------------------------------------- /src/9_multinet.cpp: -------------------------------------------------------------------------------- 1 | // clang-format off 2 | /* 3 | Example run: 4 | 5 | server$ ./build/9_multinet 6 | GPUs: 8, NICs: 32, Total Bandwidth: 3200 Gbps 7 | PCIe 
Topology: 8 | cuda:0(53:00.0) NUMA0 CPU 0-11 rdmap79s0 (4f:00.0) rdmap80s0 (50:00.0) rdmap81s0 (51:00.0) rdmap82s0 (52:00.0) 9 | cuda:1(64:00.0) NUMA0 CPU12-23 rdmap96s0 (60:00.0) rdmap97s0 (61:00.0) rdmap98s0 (62:00.0) rdmap99s0 (63:00.0) 10 | cuda:2(75:00.0) NUMA0 CPU24-35 rdmap113s0(71:00.0) rdmap114s0(72:00.0) rdmap115s0(73:00.0) rdmap116s0(74:00.0) 11 | cuda:3(86:00.0) NUMA0 CPU36-47 rdmap130s0(82:00.0) rdmap131s0(83:00.0) rdmap132s0(84:00.0) rdmap133s0(85:00.0) 12 | cuda:4(97:00.0) NUMA1 CPU48-59 rdmap147s0(93:00.0) rdmap148s0(94:00.0) rdmap149s0(95:00.0) rdmap150s0(96:00.0) 13 | cuda:5(a8:00.0) NUMA1 CPU60-71 rdmap164s0(a4:00.0) rdmap165s0(a5:00.0) rdmap166s0(a6:00.0) rdmap167s0(a7:00.0) 14 | cuda:6(b9:00.0) NUMA1 CPU72-83 rdmap181s0(b5:00.0) rdmap182s0(b6:00.0) rdmap183s0(b7:00.0) rdmap184s0(b8:00.0) 15 | cuda:7(ca:00.0) NUMA1 CPU84-95 rdmap198s0(c6:00.0) rdmap199s0(c7:00.0) rdmap200s0(c8:00.0) rdmap201s0(c9:00.0) 16 | Run client with the following command: 17 | ./build/9_multinet 8 32 fe80000000000000088c03fffecfda9500000000fa6bac5a0000000000000000 [page_size num_pages] 18 | Registered MR from cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 19 | ------ 20 | Received CONNECT message from client: num_gpus=8, num_nets=32, num_mr=64 21 | Received RandomFill request from client: 22 | remote_context: 0x00000123 23 | seed: 0xb584035fabe6ce9b 24 | page_size: 65536 25 | num_pages: 1000 26 | total_repeat: 2000 27 | Generating random data................ 28 | Started RDMA WRITE to the remote GPU memory. 29 | [58.439s] WRITE: 100%, ops=32000000/32000000, posted=32000000(100%), bytes=2097152000000/2097152000000, bw=287.089Gbps(9.0%), 0.548Mpps 30 | Finished all RDMA WRITEs to the remote GPU memory. 
31 | ------ 32 | ^C 33 | 34 | client$ ./build/9_multinet 8 32 fe80000000000000088c03fffecfda9500000000fa6bac5a0000000000000000 35 | GPUs: 8, NICs: 32, Total Bandwidth: 3200 Gbps 36 | PCIe Topology: 37 | cuda:0(53:00.0) NUMA0 CPU 0-11 rdmap79s0 (4f:00.0) rdmap80s0 (50:00.0) rdmap81s0 (51:00.0) rdmap82s0 (52:00.0) 38 | cuda:1(64:00.0) NUMA0 CPU12-23 rdmap96s0 (60:00.0) rdmap97s0 (61:00.0) rdmap98s0 (62:00.0) rdmap99s0 (63:00.0) 39 | cuda:2(75:00.0) NUMA0 CPU24-35 rdmap113s0(71:00.0) rdmap114s0(72:00.0) rdmap115s0(73:00.0) rdmap116s0(74:00.0) 40 | cuda:3(86:00.0) NUMA0 CPU36-47 rdmap130s0(82:00.0) rdmap131s0(83:00.0) rdmap132s0(84:00.0) rdmap133s0(85:00.0) 41 | cuda:4(97:00.0) NUMA1 CPU48-59 rdmap147s0(93:00.0) rdmap148s0(94:00.0) rdmap149s0(95:00.0) rdmap150s0(96:00.0) 42 | cuda:5(a8:00.0) NUMA1 CPU60-71 rdmap164s0(a4:00.0) rdmap165s0(a5:00.0) rdmap166s0(a6:00.0) rdmap167s0(a7:00.0) 43 | cuda:6(b9:00.0) NUMA1 CPU72-83 rdmap181s0(b5:00.0) rdmap182s0(b6:00.0) rdmap183s0(b7:00.0) rdmap184s0(b8:00.0) 44 | cuda:7(ca:00.0) NUMA1 CPU84-95 rdmap198s0(c6:00.0) rdmap199s0(c7:00.0) rdmap200s0(c8:00.0) rdmap201s0(c9:00.0) 45 | Registered MR from cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 46 | Sent CONNECT message to server. SEND latency: 19562.466us 47 | Sent RandomFillRequest to server. page_size: 65536, num_pages: 1000, SEND latency: 1218.370us 48 | Received RDMA WRITE to local GPU memory. 49 | Verifying................ 
// Aborts the process with file/line and the failed expression text when
// `stmt` evaluates to false.
#define CHECK(stmt)                                                            \
  do {                                                                         \
    if (!(stmt)) {                                                             \
      fprintf(stderr, "%s:%d %s\n", __FILE__, __LINE__, #stmt);                \
      std::exit(1);                                                            \
    }                                                                          \
  } while (0)

// Aborts the process when a libfabric call returns a non-zero status.
#define FI_CHECK(stmt)                                                         \
  do {                                                                         \
    int rc = (stmt);                                                           \
    if (rc) {                                                                  \
      fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__,    \
              #stmt, rc, fi_strerror(-rc));                                    \
      std::exit(1);                                                            \
    }                                                                          \
  } while (0)

// Aborts the process when a CUDA runtime call does not return cudaSuccess.
#define CUDA_CHECK(stmt)                                                       \
  do {                                                                         \
    cudaError_t rc = (stmt);                                                   \
    if (rc != cudaSuccess) {                                                   \
      fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__,    \
              #stmt, rc, cudaGetErrorString(rc));                              \
      std::exit(1);                                                            \
    }                                                                          \
  } while (0)

// Aborts the process when a CUDA driver call does not return CUDA_SUCCESS.
// err_str starts as null and is replaced by a fixed text if the driver does
// not supply a description, so "%s" never receives a null pointer.
#define CU_CHECK(stmt)                                                         \
  do {                                                                         \
    CUresult rc = (stmt);                                                      \
    if (rc != CUDA_SUCCESS) {                                                  \
      const char *err_str = nullptr;                                           \
      cuGetErrorString(rc, &err_str);                                          \
      fprintf(stderr, "%s:%d %s failed with %d (%s)\n", __FILE__, __LINE__,    \
              #stmt, rc, err_str ? err_str : "unknown error");                 \
      std::exit(1);                                                            \
    }                                                                          \
  } while (0)

constexpr size_t kBufAlign = 128; // EFA alignment requirement
constexpr size_t kMessageBufferSize = 1 << 20;
constexpr size_t kCompletionQueueReadCount = 16;
constexpr size_t kMemoryRegionSize = 1UL << 30;
constexpr size_t kEfaImmDataSize = 4;
constexpr size_t kMaxNetworksPerGroup = 4;

struct Buffer;
struct Network;

// PCI address (domain:bus:device.function) packed into 32 bits so it can be
// compared and hashed as a single integer.
struct PciAddress {
  uint16_t domain : 16;
  uint8_t bus : 8;
  uint8_t device : 5;
  uint8_t function : 3;

  // Parses the 12-character sysfs form "dddd:bb:dd.f" (hexadecimal fields).
  // Aborts through CHECK on a wrong length, a malformed string, or a
  // device/function value that does not fit its bit-field.
  static PciAddress Parse(std::string_view addr) {
    CHECK(addr.size() == 12);
    // A string_view need not be NUL-terminated; sscanf requires it. Parse a
    // bounded, terminated copy so nothing past the view is ever read.
    char text[13];
    std::memcpy(text, addr.data(), 12);
    text[12] = '\0';
    uint16_t domain;
    uint8_t bus, device, function;
    CHECK(sscanf(text, "%hx:%hhx:%hhx.%hhx", &domain, &bus, &device,
                 &function) == 4);
    CHECK(device < 32);
    CHECK(function < 8);
    PciAddress result{};
    result.domain = domain;
    result.bus = bus;
    result.device = device;
    result.function = function;
    return result;
  }

  // Returns the object representation as one 32-bit value. memcpy is used
  // instead of a pointer cast to stay within the aliasing rules.
  uint32_t AsU32() const {
    uint32_t value;
    std::memcpy(&value, this, sizeof(value));
    return value;
  }

  friend bool operator==(const PciAddress &lhs, const PciAddress &rhs) {
    return lhs.AsU32() == rhs.AsU32();
  }
};
static_assert(sizeof(PciAddress) == 4);

namespace std {
// Hashes a PciAddress through its packed 32-bit value.
template <> struct hash<PciAddress> {
  size_t operator()(const PciAddress &addr) const {
    return hash<uint32_t>()(addr.AsU32());
  }
};
} // namespace std

// One GPU together with the NICs and CPU cores associated with it.
struct TopologyGroup {
  int cuda_device;
  int numa;
  std::vector<struct fi_info *> fi_infos;
  std::vector<int> cpus;
};
first_sibling; 204 | try { 205 | first_sibling = std::stoi(sibling_list); 206 | } catch (std::invalid_argument &e) { 207 | continue; 208 | } 209 | if (first_sibling == cpu) { 210 | numa_cpus[node_id].push_back(cpu); 211 | } 212 | } 213 | } 214 | std::sort(numa_cpus[node_id].begin(), numa_cpus[node_id].end()); 215 | num_cpus += numa_cpus[node_id].size(); 216 | } 217 | int cpus_per_gpu = num_cpus / num_gpus; 218 | 219 | std::unordered_map pci_parent_map; 220 | for (const auto &entry : 221 | std::filesystem::recursive_directory_iterator("/sys/bus/pci/devices")) { 222 | if (!entry.is_symlink()) { 223 | continue; 224 | } 225 | auto target = std::filesystem::read_symlink(entry.path()); 226 | auto addr_str = target.filename().string(); 227 | auto parent_addr_str = target.parent_path().filename().string(); 228 | CHECK(addr_str.size() == 12); // 0000:51:00.0 229 | if (parent_addr_str.size() != 12) { // 0000:46:01.2 230 | continue; // pci0000:cc 231 | } 232 | auto addr = PciAddress::Parse(addr_str); 233 | auto parent_bus = PciAddress::Parse(parent_addr_str); 234 | parent_bus.device = 0; 235 | parent_bus.function = 0; 236 | pci_parent_map[addr] = parent_bus; 237 | } 238 | 239 | std::vector numa_gpu_count(numa_cpus.size()); 240 | std::unordered_map bus_cuda_map; 241 | for (int i = 0; i < num_gpus; ++i) { 242 | cudaDeviceProp prop; 243 | CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); 244 | auto pci_addr = 245 | PciAddress{(uint16_t)prop.pciDomainID, (uint8_t)prop.pciBusID, 246 | (uint8_t)prop.pciDeviceID, 0}; 247 | auto parent_bus = pci_parent_map.at(pci_addr); 248 | bus_cuda_map[parent_bus] = i; 249 | 250 | topo_groups[i].cuda_device = i; 251 | snprintf(buf, sizeof(buf), 252 | "/sys/bus/pci/devices/%04x:%02x:%02x.0/numa_node", 253 | prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); 254 | std::ifstream f(buf); 255 | CHECK(f >> topo_groups[i].numa); 256 | int numa_gpu_idx = numa_gpu_count[topo_groups[i].numa]++; 257 | auto &cpus = numa_cpus[topo_groups[i].numa]; 258 | int 
// Raw 32-byte EFA endpoint address with a 64-character hex text form.
struct EfaAddress {
  uint8_t bytes[32];

  // Copies exactly 32 bytes from `bytes`; the caller must provide at least
  // that many.
  explicit EfaAddress(const uint8_t bytes[32]) {
    std::memcpy(this->bytes, bytes, 32);
  }

  // Returns the address as 64 lower-case hexadecimal characters.
  std::string ToString() const {
    static const char kHexDigits[] = "0123456789abcdef";
    std::string out(64, '0');
    for (size_t i = 0; i < 32; i++) {
      out[2 * i] = kHexDigits[bytes[i] >> 4];
      out[2 * i + 1] = kHexDigits[bytes[i] & 0x0f];
    }
    return out;
  }

  // Parses the 64-character hex form produced by ToString() (upper- or
  // lower-case digits). Prints a message and exits the process when the
  // length is wrong or a character is not a hex digit, so a malformed
  // command-line address can never yield partially initialized bytes.
  static EfaAddress Parse(const std::string &str) {
    if (str.size() != 64) {
      fprintf(stderr, "Unexpected address length %zu\n", str.size());
      std::exit(1);
    }
    uint8_t bytes[32];
    for (size_t i = 0; i < 32; i++) {
      const int hi = HexValue(str[2 * i]);
      const int lo = HexValue(str[2 * i + 1]);
      if (hi < 0 || lo < 0) {
        fprintf(stderr, "Invalid hex digit in address at offset %zu\n", 2 * i);
        std::exit(1);
      }
      bytes[i] = (uint8_t)((hi << 4) | lo);
    }
    return EfaAddress(bytes);
  }

private:
  // Value of one hexadecimal digit, or -1 when `c` is not a hex digit.
  static int HexValue(char c) {
    if (c >= '0' && c <= '9')
      return c - '0';
    if (c >= 'a' && c <= 'f')
      return c - 'a' + 10;
    if (c >= 'A' && c <= 'F')
      return c - 'A' + 10;
    return -1;
  }
};
std::deque pending_ops; 398 | 399 | std::unordered_map mr; 400 | std::unordered_map remote_write_ops; 401 | 402 | static Network Open(struct fi_info *fi, int cuda_device); 403 | 404 | fi_addr_t AddPeerAddress(const EfaAddress &peer_addr); 405 | void RegisterMemory(Buffer &buf); 406 | struct fid_mr *GetMR(const Buffer &buf); 407 | 408 | void PollCompletion(); 409 | void ProgressPendingOps(); 410 | void PostRecv(Buffer &buf, 411 | std::function &&callback); 412 | void PostSend(fi_addr_t addr, Buffer &buf, size_t len, 413 | std::function &&callback); 414 | void PostWrite(RdmaWriteOp &&write, 415 | std::function &&callback); 416 | void AddRemoteWrite(uint32_t id, 417 | std::function &&callback); 418 | 419 | Network(const Network &) = delete; 420 | Network(Network &&other) 421 | : fi(other.fi), fabric(other.fabric), domain(other.domain), cq(other.cq), 422 | av(other.av), ep(other.ep), addr(other.addr), 423 | cuda_device(other.cuda_device) { 424 | other.fi = nullptr; 425 | other.fabric = nullptr; 426 | other.domain = nullptr; 427 | other.cq = nullptr; 428 | other.av = nullptr; 429 | other.ep = nullptr; 430 | } 431 | 432 | ~Network() { 433 | for (const auto &[_, mr] : mr) { 434 | FI_CHECK(fi_close(&mr->fid)); 435 | } 436 | if (ep) 437 | FI_CHECK(fi_close(&ep->fid)); 438 | if (av) 439 | FI_CHECK(fi_close(&av->fid)); 440 | if (cq) 441 | FI_CHECK(fi_close(&cq->fid)); 442 | if (domain) 443 | FI_CHECK(fi_close(&domain->fid)); 444 | if (fabric) 445 | FI_CHECK(fi_close(&fabric->fid)); 446 | } 447 | 448 | private: 449 | Network(struct fi_info *fi, struct fid_fabric *fabric, 450 | struct fid_domain *domain, struct fid_cq *cq, struct fid_av *av, 451 | struct fid_ep *ep, EfaAddress addr, int cuda_device) 452 | : fi(fi), fabric(fabric), domain(domain), cq(cq), av(av), ep(ep), 453 | addr(addr), cuda_device(cuda_device) {} 454 | }; 455 | 456 | struct NetworkGroup { 457 | std::vector nets; 458 | uint8_t rr_mask; 459 | uint8_t rr_idx = 0; 460 | 461 | NetworkGroup(std::vector &&nets) { 
// Rounds `ptr` up to the next multiple of `align` and returns it unchanged
// when it is already aligned. The bit-mask arithmetic is only meaningful
// when `align` is a power of two.
void *align_up(void *ptr, size_t align) {
  const size_t low_bits = align - 1;
  const uintptr_t bumped = reinterpret_cast<uintptr_t>(ptr) + low_bits;
  return reinterpret_cast<void *>(bumped & ~low_bits);
}
529 | 530 | Buffer(void *raw_data, size_t raw_size, size_t align, int cuda_device, 531 | int dmabuf_fd) { 532 | this->raw_data = raw_data; 533 | this->data = align_up(raw_data, align); 534 | this->size = (size_t)((uintptr_t)raw_data + raw_size - (uintptr_t)data); 535 | this->cuda_device = cuda_device; 536 | this->dmabuf_fd = dmabuf_fd; 537 | } 538 | Buffer(const Buffer &) = delete; 539 | }; 540 | 541 | struct fi_info *GetInfo() { 542 | struct fi_info *hints, *info; 543 | hints = fi_allocinfo(); 544 | hints->caps = FI_MSG | FI_RMA | FI_HMEM | FI_LOCAL_COMM | FI_REMOTE_COMM; 545 | hints->ep_attr->type = FI_EP_RDM; 546 | hints->fabric_attr->prov_name = strdup("efa"); 547 | hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_HMEM | FI_MR_VIRT_ADDR | 548 | FI_MR_ALLOCATED | FI_MR_PROV_KEY; 549 | hints->domain_attr->threading = FI_THREAD_SAFE; 550 | FI_CHECK(fi_getinfo(FI_VERSION(2, 0), nullptr, nullptr, 0, hints, &info)); 551 | fi_freeinfo(hints); 552 | return info; 553 | } 554 | 555 | Network Network::Open(struct fi_info *fi, int cuda_device) { 556 | struct fid_fabric *fabric; 557 | FI_CHECK(fi_fabric(fi->fabric_attr, &fabric, nullptr)); 558 | 559 | struct fid_domain *domain; 560 | FI_CHECK(fi_domain(fabric, fi, &domain, nullptr)); 561 | 562 | struct fid_cq *cq; 563 | struct fi_cq_attr cq_attr = {}; 564 | cq_attr.format = FI_CQ_FORMAT_DATA; 565 | FI_CHECK(fi_cq_open(domain, &cq_attr, &cq, nullptr)); 566 | 567 | struct fid_av *av; 568 | struct fi_av_attr av_attr = {}; 569 | FI_CHECK(fi_av_open(domain, &av_attr, &av, nullptr)); 570 | 571 | struct fid_ep *ep; 572 | FI_CHECK(fi_endpoint(domain, fi, &ep, nullptr)); 573 | FI_CHECK(fi_ep_bind(ep, &cq->fid, FI_SEND | FI_RECV)); 574 | FI_CHECK(fi_ep_bind(ep, &av->fid, 0)); 575 | 576 | FI_CHECK(fi_enable(ep)); 577 | 578 | uint8_t addrbuf[64]; 579 | size_t addrlen = sizeof(addrbuf); 580 | FI_CHECK(fi_getname(&ep->fid, addrbuf, &addrlen)); 581 | if (addrlen != 32) { 582 | fprintf(stderr, "Unexpected address length %zu\n", addrlen); 
583 | std::exit(1); 584 | } 585 | auto addr = EfaAddress(addrbuf); 586 | 587 | return Network(fi, fabric, domain, cq, av, ep, addr, cuda_device); 588 | } 589 | 590 | fi_addr_t Network::AddPeerAddress(const EfaAddress &peer_addr) { 591 | fi_addr_t addr = FI_ADDR_UNSPEC; 592 | int ret = fi_av_insert(av, peer_addr.bytes, 1, &addr, 0, nullptr); 593 | if (ret != 1) { 594 | fprintf(stderr, "fi_av_insert failed: %d\n", ret); 595 | std::exit(1); 596 | } 597 | return addr; 598 | } 599 | 600 | void Network::RegisterMemory(Buffer &buf) { 601 | struct fid_mr *mr; 602 | struct fi_mr_attr mr_attr = { 603 | .iov_count = 1, 604 | .access = FI_SEND | FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ | 605 | FI_WRITE | FI_READ, 606 | }; 607 | struct iovec iov = {.iov_base = buf.data, .iov_len = buf.size}; 608 | struct fi_mr_dmabuf dmabuf = { 609 | .fd = buf.dmabuf_fd, .offset = 0, .len = buf.size, .base_addr = buf.data}; 610 | uint64_t flags = 0; 611 | if (buf.is_cuda()) { 612 | CHECK(buf.cuda_device == cuda_device); 613 | mr_attr.iface = FI_HMEM_CUDA; 614 | mr_attr.device.cuda = buf.cuda_device; 615 | if (buf.dmabuf_fd != -1) { 616 | mr_attr.dmabuf = &dmabuf; 617 | flags = FI_MR_DMABUF; 618 | } else { 619 | mr_attr.mr_iov = &iov; 620 | } 621 | } else { 622 | mr_attr.mr_iov = &iov; 623 | } 624 | FI_CHECK(fi_mr_regattr(domain, &mr_attr, flags, &mr)); 625 | this->mr[buf.data] = mr; 626 | } 627 | 628 | struct fid_mr *Network::GetMR(const Buffer &buf) { 629 | auto it = mr.find(buf.data); 630 | CHECK(it != mr.end()); 631 | return it->second; 632 | } 633 | 634 | void Network::PostRecv(Buffer &buf, 635 | std::function &&callback) { 636 | auto *op = new RdmaOp{ 637 | .type = RdmaOpType::kRecv, 638 | .recv = 639 | RdmaRecvOp{.buf = &buf, .src_addr = FI_ADDR_UNSPEC, .recv_size = 0}, 640 | .callback = std::move(callback), 641 | }; 642 | pending_ops.push_back(op); 643 | ProgressPendingOps(); 644 | } 645 | 646 | void Network::PostSend(fi_addr_t addr, Buffer &buf, size_t len, 647 | std::function 
&&callback) { 648 | CHECK(len <= buf.size); 649 | auto *op = new RdmaOp{ 650 | .type = RdmaOpType::kSend, 651 | .send = RdmaSendOp{.buf = &buf, .len = len, .dest_addr = addr}, 652 | .callback = std::move(callback), 653 | }; 654 | pending_ops.push_back(op); 655 | ProgressPendingOps(); 656 | } 657 | 658 | void Network::PostWrite(RdmaWriteOp &&write, 659 | std::function &&callback) { 660 | auto *op = new RdmaOp{ 661 | .type = RdmaOpType::kWrite, 662 | .write = std::move(write), 663 | .callback = std::move(callback), 664 | }; 665 | pending_ops.push_back(op); 666 | ProgressPendingOps(); 667 | } 668 | 669 | void Network::AddRemoteWrite( 670 | uint32_t id, std::function &&callback) { 671 | CHECK(remote_write_ops.count(id) == 0); 672 | auto *op = new RdmaOp{ 673 | .type = RdmaOpType::kRemoteWrite, 674 | .remote_write = RdmaRemoteWriteOp{.op_id = id}, 675 | .callback = std::move(callback), 676 | }; 677 | remote_write_ops[id] = op; 678 | } 679 | 680 | void Network::ProgressPendingOps() { 681 | while (!pending_ops.empty()) { 682 | auto *op = pending_ops.front(); 683 | pending_ops.pop_front(); 684 | const char *op_name = nullptr; 685 | ssize_t ret = 0; 686 | switch (op->type) { 687 | case RdmaOpType::kRecv: { 688 | op_name = "fi_recv"; 689 | auto &recv = op->recv; 690 | struct iovec iov = { 691 | .iov_base = recv.buf->data, 692 | .iov_len = recv.buf->size, 693 | }; 694 | struct fi_msg msg = { 695 | .msg_iov = &iov, 696 | .desc = &GetMR(*recv.buf)->mem_desc, 697 | .iov_count = 1, 698 | .addr = FI_ADDR_UNSPEC, 699 | .context = op, 700 | }; 701 | ret = fi_recvmsg(ep, &msg, 0); 702 | break; 703 | } 704 | case RdmaOpType::kSend: { 705 | op_name = "fi_send"; 706 | auto &send = op->send; 707 | struct iovec iov = { 708 | .iov_base = send.buf->data, 709 | .iov_len = send.len, 710 | }; 711 | struct fi_msg msg = { 712 | .msg_iov = &iov, 713 | .desc = &GetMR(*send.buf)->mem_desc, 714 | .iov_count = 1, 715 | .addr = send.dest_addr, 716 | .context = op, 717 | }; 718 | ret = fi_sendmsg(ep, 
&msg, 0); 719 | break; 720 | } 721 | case RdmaOpType::kWrite: { 722 | op_name = "fi_writemsg"; 723 | auto &write = op->write; 724 | struct iovec iov = { 725 | .iov_base = (uint8_t *)write.buf->data + write.offset, 726 | .iov_len = write.len, 727 | }; 728 | struct fi_rma_iov rma_iov = { 729 | .addr = write.dest_ptr, 730 | .len = write.len, 731 | .key = write.dest_key, 732 | }; 733 | struct fi_msg_rma msg = { 734 | .msg_iov = &iov, 735 | .desc = &GetMR(*write.buf)->mem_desc, 736 | .iov_count = 1, 737 | .addr = write.dest_addr, 738 | .rma_iov = &rma_iov, 739 | .rma_iov_count = 1, 740 | .context = op, 741 | .data = write.imm_data, 742 | }; 743 | uint64_t flags = 0; 744 | if (write.imm_data) { 745 | flags |= FI_REMOTE_CQ_DATA; 746 | } 747 | ret = fi_writemsg(ep, &msg, flags); 748 | break; 749 | } 750 | case RdmaOpType::kRemoteWrite: { 751 | CHECK(false); // Unreachable 752 | break; 753 | } 754 | } 755 | if (ret == -FI_EAGAIN) { 756 | // Put it back to the front of the queue 757 | pending_ops.push_front(op); 758 | break; 759 | } 760 | if (ret) { 761 | // Unexpected error. Don't put it back. 762 | // Delete the op since it's not going to be in the completion queue. 763 | delete op; 764 | fprintf(stderr, "Failed to ProgressPendingOps. %s() returned %ld (%s)\n", 765 | op_name, ret, fi_strerror(-ret)); 766 | fflush(stderr); 767 | break; 768 | } 769 | } 770 | } 771 | 772 | void HandleCompletion(Network &net, const struct fi_cq_data_entry &cqe) { 773 | RdmaOp *op = nullptr; 774 | if (cqe.flags & FI_REMOTE_WRITE) { 775 | // REMOTE WRITE does not have op_context 776 | // NOTE(lequn): EFA only supports 4 bytes of immediate data. 
777 | uint32_t op_id = cqe.data; 778 | if (!op_id) 779 | return; 780 | auto it = net.remote_write_ops.find(op_id); 781 | if (it == net.remote_write_ops.end()) 782 | return; 783 | op = it->second; 784 | net.remote_write_ops.erase(it); 785 | } else { 786 | // RECV / SEND / WRITE 787 | op = (RdmaOp *)cqe.op_context; 788 | if (!op) 789 | return; 790 | if (cqe.flags & FI_RECV) { 791 | op->recv.recv_size = cqe.len; 792 | } else if (cqe.flags & FI_SEND) { 793 | // Nothing special 794 | } else if (cqe.flags & FI_WRITE) { 795 | // Nothing special 796 | } else { 797 | fprintf(stderr, "Unhandled completion type. cqe.flags=%lx\n", cqe.flags); 798 | std::exit(1); 799 | } 800 | } 801 | if (op->callback) 802 | op->callback(net, *op); 803 | delete op; 804 | } 805 | 806 | void Network::PollCompletion() { 807 | // Process completions 808 | struct fi_cq_data_entry cqe[kCompletionQueueReadCount]; 809 | for (;;) { 810 | auto ret = fi_cq_read(cq, cqe, kCompletionQueueReadCount); 811 | if (ret > 0) { 812 | for (ssize_t i = 0; i < ret; i++) { 813 | HandleCompletion(*this, cqe[i]); 814 | } 815 | } else if (ret == -FI_EAVAIL) { 816 | struct fi_cq_err_entry err_entry; 817 | ret = fi_cq_readerr(cq, &err_entry, 0); 818 | if (ret < 0) { 819 | fprintf(stderr, "fi_cq_readerr error: %zd (%s)\n", ret, 820 | fi_strerror(-ret)); 821 | std::exit(1); 822 | } else if (ret > 0) { 823 | fprintf(stderr, "Failed libfabric operation: %s\n", 824 | fi_cq_strerror(cq, err_entry.prov_errno, err_entry.err_data, 825 | nullptr, 0)); 826 | } else { 827 | fprintf(stderr, "fi_cq_readerr returned 0 unexpectedly.\n"); 828 | std::exit(1); 829 | } 830 | } else if (ret == -FI_EAGAIN) { 831 | // No more completions 832 | break; 833 | } else { 834 | fprintf(stderr, "fi_cq_read error: %zd (%s)\n", ret, fi_strerror(-ret)); 835 | std::exit(1); 836 | } 837 | } 838 | 839 | // Try to make progress. 
// Returns the signed number of nanoseconds from `start` to `end`; the result
// is negative when `end` precedes `start`.
long TimeDeltaNanos(
    const std::chrono::time_point<std::chrono::high_resolution_clock> &start,
    const std::chrono::time_point<std::chrono::high_resolution_clock> &end) {
  const auto elapsed = end - start;
  const auto as_nanos =
      std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed);
  return as_nanos.count();
}
{ 917 | enum class State { 918 | kWaitRequest, 919 | kWrite, 920 | kDone, 921 | }; 922 | 923 | struct WriteState { 924 | size_t i_repeat = 0; 925 | size_t i_buf = 0; 926 | size_t i_page = 0; 927 | }; 928 | 929 | std::vector *nets; 930 | std::vector *net_groups; 931 | std::vector *cuda_bufs; 932 | size_t total_bw = 0; 933 | State state = State::kWaitRequest; 934 | 935 | AppConnectMessage *connect_msg = nullptr; 936 | AppRandomFillMessage *request_msg = nullptr; 937 | 938 | size_t total_repeat = 0; 939 | size_t nets_per_gpu = 0; 940 | size_t buf_per_gpu = 0; 941 | std::vector> remote_addrs; 942 | std::vector write_states; 943 | size_t total_write_ops = 0; 944 | size_t write_op_size = 0; 945 | size_t posted_write_ops = 0; 946 | size_t finished_write_ops = 0; 947 | std::chrono::time_point write_start_at; 948 | 949 | RandomFillRequestState(std::vector *nets, 950 | std::vector *net_groups, 951 | std::vector *cuda_bufs) 952 | : nets(nets), net_groups(net_groups), cuda_bufs(cuda_bufs) { 953 | for (auto &net : *nets) { 954 | total_bw += net.fi->nic->link_attr->speed; 955 | } 956 | } 957 | 958 | void OnRecv(Network &net, RdmaOp &op) { 959 | if (!connect_msg) { 960 | HandleConnect(net, op); 961 | } else { 962 | HandleRequest(net, op); 963 | } 964 | } 965 | 966 | void HandleConnect(Network &net, RdmaOp &op) { 967 | auto *base_msg = (AppMessageBase *)op.recv.buf->data; 968 | CHECK(base_msg->type == AppMessageType::kConnect); 969 | CHECK(op.recv.recv_size >= sizeof(AppConnectMessage)); 970 | auto &msg = *(AppConnectMessage *)base_msg; 971 | CHECK(op.recv.recv_size == msg.MessageBytes()); 972 | CHECK(msg.num_mr > 0); 973 | printf("Received CONNECT message from client: num_gpus=%zu, " 974 | "num_nets=%zu, num_mr=%zu\n", 975 | msg.num_gpus, msg.num_nets, msg.num_mr); 976 | 977 | // Save the message. Note that we don't reuse the buffer. 978 | connect_msg = &msg; 979 | 980 | // Assuming remote has the same number of GPUs and NICs. 
981 | CHECK(msg.num_gpus == cuda_bufs->size()); 982 | CHECK(msg.num_nets == nets->size()); 983 | 984 | // Add peer addresses 985 | nets_per_gpu = msg.num_nets / msg.num_gpus; 986 | buf_per_gpu = connect_msg->num_mr / connect_msg->num_nets; 987 | for (size_t i = 0; i < msg.num_gpus; ++i) { 988 | std::array addrs = {}; 989 | for (size_t j = 0; j < nets_per_gpu; ++j) { 990 | auto idx = i * nets_per_gpu + j; 991 | addrs[j] = nets->at(idx).AddPeerAddress(msg.net_addr(idx)); 992 | } 993 | remote_addrs.push_back(addrs); 994 | } 995 | } 996 | 997 | void HandleRequest(Network &net, RdmaOp &op) { 998 | auto *base_msg = (const AppMessageBase *)op.recv.buf->data; 999 | CHECK(base_msg->type == AppMessageType::kRandomFill); 1000 | CHECK(op.recv.recv_size >= sizeof(AppRandomFillMessage)); 1001 | auto &msg = *(AppRandomFillMessage *)base_msg; 1002 | CHECK(op.recv.recv_size == msg.MessageBytes()); 1003 | 1004 | // Save the message. Note that we don't reuse the buffer. 1005 | request_msg = &msg; 1006 | 1007 | printf("Received RandomFill request from client:\n"); 1008 | printf(" remote_context: 0x%08x\n", msg.remote_context); 1009 | printf(" seed: 0x%016lx\n", msg.seed); 1010 | printf(" page_size: %zu\n", msg.page_size); 1011 | printf(" num_pages: %zu\n", msg.num_pages); 1012 | total_repeat = 500 * nets_per_gpu; 1013 | printf(" total_repeat: %zu\n", total_repeat); 1014 | 1015 | // Generate random data and copy to local GPU memory 1016 | printf("Generating random data"); 1017 | fflush(stdout); 1018 | for (size_t i = 0; i < connect_msg->num_gpus; ++i) { 1019 | for (size_t j = 0; j < buf_per_gpu; ++j) { 1020 | auto bytes = RandomBytes(msg.seed + i * buf_per_gpu + j, 1021 | msg.page_size * msg.num_pages); 1022 | CUDA_CHECK( 1023 | cudaMemcpy((uint8_t *)cuda_bufs->at(i).data + j * bytes.size(), 1024 | bytes.data(), bytes.size(), cudaMemcpyHostToDevice)); 1025 | printf("."); 1026 | fflush(stdout); 1027 | } 1028 | } 1029 | printf("\n"); 1030 | 1031 | // Prepare RDMA WRITE the data to remote 
GPU. 1032 | printf("Started RDMA WRITE to the remote GPU memory.\n"); 1033 | total_write_ops = connect_msg->num_gpus * buf_per_gpu * 1034 | request_msg->num_pages * total_repeat; 1035 | write_op_size = request_msg->page_size; 1036 | write_states.resize(connect_msg->num_gpus); 1037 | write_start_at = std::chrono::high_resolution_clock::now(); 1038 | state = State::kWrite; 1039 | } 1040 | 1041 | void ContinuePostWrite(size_t gpu_idx) { 1042 | auto &s = write_states[gpu_idx]; 1043 | if (s.i_repeat == total_repeat) 1044 | return; 1045 | auto page_size = request_msg->page_size; 1046 | auto num_pages = request_msg->num_pages; 1047 | 1048 | auto net_idx = (*net_groups)[gpu_idx].GetNext(); 1049 | uint32_t imm_data = 0; 1050 | if (s.i_repeat + 1 == total_repeat && s.i_buf + 1 == buf_per_gpu && 1051 | s.i_page + nets_per_gpu >= num_pages) { 1052 | // The last WRITE. Pass remote context back. 1053 | imm_data = request_msg->remote_context; 1054 | } 1055 | const auto &mr = connect_msg->mr( 1056 | (gpu_idx * nets_per_gpu + net_idx) * buf_per_gpu + s.i_buf); 1057 | (*net_groups)[gpu_idx].nets[net_idx]->PostWrite( 1058 | {.buf = &(*cuda_bufs)[gpu_idx], 1059 | .offset = s.i_buf * (page_size * num_pages) + s.i_page * page_size, 1060 | .len = page_size, 1061 | .imm_data = imm_data, 1062 | .dest_ptr = mr.addr + request_msg->page_idx(s.i_page) * page_size, 1063 | .dest_addr = remote_addrs[gpu_idx][net_idx], 1064 | .dest_key = mr.rkey}, 1065 | [this](Network &net, RdmaOp &op) { HandleWriteCompletion(); }); 1066 | ++posted_write_ops; 1067 | 1068 | if (++s.i_page == num_pages) { 1069 | s.i_page = 0; 1070 | if (++s.i_buf == buf_per_gpu) { 1071 | s.i_buf = 0; 1072 | if (++s.i_repeat == total_repeat) 1073 | return; 1074 | } 1075 | } 1076 | } 1077 | 1078 | void PrintProgress(std::chrono::high_resolution_clock::time_point now, 1079 | uint64_t posted, uint64_t finished) { 1080 | auto elapsed = TimeDeltaNanos(write_start_at, now) * 1e-9; 1081 | float bw_gbps = 8.0f * write_op_size * finished / 
(elapsed * 1e9); 1082 | float bw_util = bw_gbps / (total_bw * 1e-9); 1083 | printf("\r[%.3fs] WRITE: %.0f%%, ops=%zu/%zu, posted=%zu(%.0f%%), " 1084 | "bytes=%zu/%zu, bw=%.3fGbps(%.1f%%), %.3fMpps\033[K", 1085 | // progress 1086 | elapsed, 100.0 * finished / total_write_ops, 1087 | // ops 1088 | finished, total_write_ops, posted, 100.0 * posted / total_write_ops, 1089 | // bytes 1090 | write_op_size * finished, write_op_size * total_write_ops, 1091 | // bw 1092 | bw_gbps, 100.0 * bw_util, finished / elapsed * 1e-6); 1093 | fflush(stdout); 1094 | } 1095 | 1096 | void HandleWriteCompletion() { 1097 | ++finished_write_ops; 1098 | if (finished_write_ops % 16384 == 0) { 1099 | auto now = std::chrono::high_resolution_clock::now(); 1100 | PrintProgress(now, posted_write_ops, finished_write_ops); 1101 | } 1102 | if (finished_write_ops == total_write_ops) { 1103 | auto now = std::chrono::high_resolution_clock::now(); 1104 | PrintProgress(now, posted_write_ops, finished_write_ops); 1105 | printf("\nFinished all RDMA WRITEs to the remote GPU memory.\n"); 1106 | state = State::kDone; 1107 | } 1108 | } 1109 | }; 1110 | 1111 | int ServerMain(int argc, char **argv) { 1112 | if (argc != 1 && argc != 3) { 1113 | fprintf(stderr, "Server Usage: \n"); 1114 | fprintf(stderr, "Default runs with all GPUs and all NICs:\n"); 1115 | fprintf(stderr, " %s\n", argv[0]); 1116 | fprintf(stderr, "Alternatively, specify the number of GPUs and NICs:\n"); 1117 | fprintf(stderr, " %s num_gpus num_nics\n", argv[0]); 1118 | std::exit(1); 1119 | } 1120 | 1121 | // Topology detection 1122 | struct fi_info *info = GetInfo(); 1123 | auto topo_groups = DetectTopo(info); 1124 | int num_gpus, num_nets; 1125 | if (argc == 1) { 1126 | num_gpus = topo_groups.size(); 1127 | num_nets = topo_groups[0].fi_infos.size() * topo_groups.size(); 1128 | } else { 1129 | num_gpus = std::stoi(argv[1]); 1130 | num_nets = std::stoi(argv[2]); 1131 | TrimTopo(topo_groups, num_gpus, num_nets); 1132 | } 1133 | int nets_per_gpu = 
num_nets / num_gpus; 1134 | 1135 | // Open Netowrk 1136 | std::vector nets; 1137 | std::vector net_groups; 1138 | nets.reserve(num_nets); 1139 | net_groups.reserve(num_gpus); 1140 | size_t total_bw = 0; 1141 | for (int cuda_device = 0; cuda_device < num_gpus; ++cuda_device) { 1142 | std::vector group_nets; 1143 | for (auto *fi : topo_groups[cuda_device].fi_infos) { 1144 | int cuda_device = nets.size() / nets_per_gpu; 1145 | nets.push_back(Network::Open(fi, cuda_device)); 1146 | group_nets.push_back(&nets.back()); 1147 | total_bw += info->nic->link_attr->speed; 1148 | } 1149 | net_groups.push_back(NetworkGroup(std::move(group_nets))); 1150 | } 1151 | printf("GPUs: %d, NICs: %d, Total Bandwidth: %.0f Gbps\n", num_gpus, num_nets, 1152 | total_bw * 1e-9); 1153 | PrintTopologyGroups(topo_groups); 1154 | printf("Run client with the following command:\n"); 1155 | printf(" %s %d %d %s [page_size num_pages]\n", argv[0], num_gpus, num_nets, 1156 | nets[0].addr.ToString().c_str()); 1157 | 1158 | // Allocate and register message buffer 1159 | auto buf1 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 1160 | auto buf2 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 1161 | nets[0].RegisterMemory(buf1); 1162 | nets[0].RegisterMemory(buf2); 1163 | 1164 | // Allocate and register CUDA memory 1165 | printf("Registered MR from"); 1166 | std::vector cuda_bufs; 1167 | for (int i = 0; i < num_gpus; ++i) { 1168 | CUDA_CHECK(cudaSetDevice(i)); 1169 | cuda_bufs.push_back(Buffer::AllocCuda(kMemoryRegionSize * 2, kBufAlign)); 1170 | for (int j = 0; j < nets_per_gpu; ++j) { 1171 | nets[i * nets_per_gpu + j].RegisterMemory(cuda_bufs.back()); 1172 | } 1173 | printf(" cuda:%d", i); 1174 | fflush(stdout); 1175 | } 1176 | printf("\n"); 1177 | 1178 | // Loop forever. Accept one client at a time. 
1179 | for (;;) { 1180 | printf("------\n"); 1181 | // State machine 1182 | RandomFillRequestState s(&nets, &net_groups, &cuda_bufs); 1183 | // RECV for CONNECT 1184 | nets[0].PostRecv(buf1, 1185 | [&s](Network &net, RdmaOp &op) { s.OnRecv(net, op); }); 1186 | // RECV for RandomFillRequest 1187 | nets[0].PostRecv(buf2, 1188 | [&s](Network &net, RdmaOp &op) { s.OnRecv(net, op); }); 1189 | // Wait for completion 1190 | while (s.state != RandomFillRequestState::State::kDone) { 1191 | for (size_t gpu_idx = 0; gpu_idx < net_groups.size(); ++gpu_idx) { 1192 | for (auto *net : net_groups[gpu_idx].nets) { 1193 | net->PollCompletion(); 1194 | } 1195 | switch (s.state) { 1196 | case RandomFillRequestState::State::kWaitRequest: 1197 | break; 1198 | case RandomFillRequestState::State::kWrite: 1199 | s.ContinuePostWrite(gpu_idx); 1200 | break; 1201 | case RandomFillRequestState::State::kDone: 1202 | break; 1203 | } 1204 | } 1205 | } 1206 | } 1207 | 1208 | fi_freeinfo(info); 1209 | return 0; 1210 | } 1211 | 1212 | int ClientMain(int argc, char **argv) { 1213 | CHECK(argc == 4 || argc == 6); 1214 | auto server_addrname = EfaAddress::Parse(argv[3]); 1215 | size_t page_size, num_pages; 1216 | if (argc == 6) { 1217 | page_size = std::stoull(argv[4]); 1218 | num_pages = std::stoull(argv[5]); 1219 | } else { 1220 | page_size = 128 * 8 * 2 * 16 * sizeof(uint16_t); 1221 | num_pages = 1000; 1222 | } 1223 | size_t max_pages = kMemoryRegionSize / page_size; 1224 | CHECK(page_size * num_pages <= kMemoryRegionSize); 1225 | 1226 | // Topology detection 1227 | struct fi_info *info = GetInfo(); 1228 | auto topo_groups = DetectTopo(info); 1229 | int num_gpus, num_nets; 1230 | if (argc == 1) { 1231 | num_gpus = topo_groups.size(); 1232 | num_nets = topo_groups[0].fi_infos.size() * topo_groups.size(); 1233 | } else { 1234 | num_gpus = std::stoi(argv[1]); 1235 | num_nets = std::stoi(argv[2]); 1236 | TrimTopo(topo_groups, num_gpus, num_nets); 1237 | } 1238 | int nets_per_gpu = num_nets / num_gpus; 
1239 | 1240 | // Open Netowrk 1241 | std::vector nets; 1242 | std::vector net_groups; 1243 | nets.reserve(num_nets); 1244 | net_groups.reserve(num_gpus); 1245 | size_t total_bw = 0; 1246 | for (int cuda_device = 0; cuda_device < num_gpus; ++cuda_device) { 1247 | std::vector group_nets; 1248 | for (auto *fi : topo_groups[cuda_device].fi_infos) { 1249 | int cuda_device = nets.size() / nets_per_gpu; 1250 | nets.push_back(Network::Open(fi, cuda_device)); 1251 | group_nets.push_back(&nets.back()); 1252 | total_bw += info->nic->link_attr->speed; 1253 | } 1254 | net_groups.push_back(NetworkGroup(std::move(group_nets))); 1255 | } 1256 | printf("GPUs: %d, NICs: %d, Total Bandwidth: %.0f Gbps\n", num_gpus, num_nets, 1257 | total_bw * 1e-9); 1258 | PrintTopologyGroups(topo_groups); 1259 | 1260 | // Add server address to the first network 1261 | auto server_addr = nets[0].AddPeerAddress(server_addrname); 1262 | 1263 | // Allocate and register message buffer 1264 | auto buf1 = Buffer::Alloc(kMessageBufferSize, kBufAlign); 1265 | nets[0].RegisterMemory(buf1); 1266 | 1267 | // Allocate and register CUDA memory 1268 | printf("Registered MR from"); 1269 | std::vector cuda_bufs1, cuda_bufs2; 1270 | for (int i = 0; i < num_gpus; ++i) { 1271 | CUDA_CHECK(cudaSetDevice(i)); 1272 | cuda_bufs1.push_back(Buffer::AllocCuda(kMemoryRegionSize, kBufAlign)); 1273 | cuda_bufs2.push_back(Buffer::AllocCuda(kMemoryRegionSize, kBufAlign)); 1274 | for (int j = 0; j < nets_per_gpu; ++j) { 1275 | nets[i * nets_per_gpu + j].RegisterMemory(cuda_bufs1.back()); 1276 | nets[i * nets_per_gpu + j].RegisterMemory(cuda_bufs2.back()); 1277 | } 1278 | printf(" cuda:%d", i); 1279 | fflush(stdout); 1280 | } 1281 | printf("\n"); 1282 | 1283 | // Prepare request 1284 | std::mt19937_64 rng(0xabcdabcd987UL); 1285 | uint64_t req_seed = rng(); 1286 | std::vector page_idx; 1287 | std::vector tmp(max_pages); 1288 | std::iota(tmp.begin(), tmp.end(), 0); 1289 | std::sample(tmp.begin(), tmp.end(), 
std::back_inserter(page_idx), num_pages, 1290 | rng); 1291 | 1292 | // Send address to server 1293 | auto &connect_msg = *(AppConnectMessage *)buf1.data; 1294 | connect_msg = { 1295 | .base = {.type = AppMessageType::kConnect}, 1296 | .num_gpus = (size_t)num_gpus, 1297 | .num_nets = nets.size(), 1298 | .num_mr = nets.size() * 2, 1299 | }; 1300 | for (size_t i = 0; i < nets.size(); i++) { 1301 | connect_msg.net_addr(i) = nets[i].addr; 1302 | int cuda_device = nets[i].cuda_device; 1303 | connect_msg.mr(i * 2) = { 1304 | .addr = (uint64_t)cuda_bufs1[cuda_device].data, 1305 | .size = cuda_bufs1[cuda_device].size, 1306 | .rkey = nets[i].GetMR(cuda_bufs1[cuda_device])->key, 1307 | }; 1308 | connect_msg.mr(i * 2 + 1) = { 1309 | .addr = (uint64_t)cuda_bufs2[cuda_device].data, 1310 | .size = cuda_bufs2[cuda_device].size, 1311 | .rkey = nets[i].GetMR(cuda_bufs2[cuda_device])->key, 1312 | }; 1313 | } 1314 | auto send_at = std::chrono::high_resolution_clock::now(); 1315 | bool connect_sent = false; 1316 | nets[0].PostSend( 1317 | server_addr, buf1, connect_msg.MessageBytes(), 1318 | [&connect_sent](Network &net, RdmaOp &op) { connect_sent = true; }); 1319 | while (!connect_sent) { 1320 | nets[0].PollCompletion(); 1321 | } 1322 | auto sent_at = std::chrono::high_resolution_clock::now(); 1323 | printf("Sent CONNECT message to server. 
SEND latency: %.3fus\n", 1324 | 1e-3 * TimeDeltaNanos(send_at, sent_at)); 1325 | 1326 | // Prepare to receive the last REMOTE WRITE from server 1327 | int cnt_last_remote_write_received = 0; 1328 | uint32_t remote_write_op_id = 0x123; 1329 | for (auto &net : nets) { 1330 | net.AddRemoteWrite(remote_write_op_id, [&cnt_last_remote_write_received]( 1331 | Network &net, RdmaOp &op) { 1332 | ++cnt_last_remote_write_received; 1333 | }); 1334 | } 1335 | 1336 | // Send message to server 1337 | auto &req_msg = *(AppRandomFillMessage *)buf1.data; 1338 | req_msg = { 1339 | .base = {.type = AppMessageType::kRandomFill}, 1340 | .remote_context = remote_write_op_id, 1341 | .seed = req_seed, 1342 | .page_size = page_size, 1343 | .num_pages = num_pages, 1344 | }; 1345 | for (size_t i = 0; i < num_pages; i++) { 1346 | req_msg.page_idx(i) = page_idx[i]; 1347 | } 1348 | send_at = std::chrono::high_resolution_clock::now(); 1349 | bool req_sent = false; 1350 | nets[0].PostSend(server_addr, buf1, req_msg.MessageBytes(), 1351 | [&req_sent](Network &net, RdmaOp &op) { req_sent = true; }); 1352 | while (!req_sent) { 1353 | nets[0].PollCompletion(); 1354 | } 1355 | sent_at = std::chrono::high_resolution_clock::now(); 1356 | printf("Sent RandomFillRequest to server. 
page_size: %zu, num_pages: %zu, " 1357 | "SEND latency: %.3fus\n", 1358 | page_size, num_pages, 1e-3 * TimeDeltaNanos(send_at, sent_at)); 1359 | 1360 | // Wait for REMOTE WRITE from server 1361 | while (cnt_last_remote_write_received != num_nets) { 1362 | for (auto &net : nets) { 1363 | net.PollCompletion(); 1364 | } 1365 | } 1366 | printf("Received RDMA WRITE to local GPU memory.\n"); 1367 | printf("Verifying"); 1368 | fflush(stdout); 1369 | 1370 | // Verify data 1371 | auto verify = [&nets, &page_idx, page_size, 1372 | num_pages](Buffer &cuda_buf, uint64_t seed) -> bool { 1373 | auto actual = std::vector(page_size * num_pages); 1374 | for (size_t j = 0; j < num_pages; ++j) { 1375 | CUDA_CHECK(cudaMemcpy(actual.data() + j * page_size, 1376 | (uint8_t *)cuda_buf.data + page_idx[j] * page_size, 1377 | page_size, cudaMemcpyDeviceToHost)); 1378 | } 1379 | auto expected = RandomBytes(seed, page_size * num_pages); 1380 | return expected == actual; 1381 | }; 1382 | for (int i = 0; i < num_gpus; ++i) { 1383 | CHECK(verify(cuda_bufs1[i], req_seed + i * 2)); 1384 | printf("."); 1385 | fflush(stdout); 1386 | CHECK(verify(cuda_bufs2[i], req_seed + i * 2 + 1)); 1387 | printf("."); 1388 | fflush(stdout); 1389 | } 1390 | printf("\n"); 1391 | printf("Data is correct\n"); 1392 | 1393 | fi_freeinfo(info); 1394 | return 0; 1395 | } 1396 | // Entry point: dispatches to ServerMain when argc <= 3, otherwise to ClientMain. 1397 | int main(int argc, char **argv) { 1398 | if (argc <= 3) { 1399 | return ServerMain(argc, argv); 1400 | } else { 1401 | return ClientMain(argc, argv); 1402 | } 1403 | } 1404 | --------------------------------------------------------------------------------