├── LICENSE ├── Makefile ├── README.md ├── TODO.md ├── c_src ├── rdma_drv.c ├── rdma_drv_buffers.c ├── rdma_drv_buffers.h ├── rdma_drv_options.c └── rdma_drv_options.h ├── cover.spec ├── rebar ├── rebar.config ├── src ├── rdma.erl ├── rdma_dist.app.src ├── rdma_dist.erl └── rdma_server.erl └── test └── rdma_SUITE.erl /LICENSE: -------------------------------------------------------------------------------- 1 | ERLANG PUBLIC LICENSE 2 | Version 1.1 3 | 4 | 1. Definitions. 5 | 6 | 1.1. ``Contributor'' means each entity that creates or contributes to 7 | the creation of Modifications. 8 | 9 | 1.2. ``Contributor Version'' means the combination of the Original 10 | Code, prior Modifications used by a Contributor, and the Modifications 11 | made by that particular Contributor. 12 | 13 | 1.3. ``Covered Code'' means the Original Code or Modifications or the 14 | combination of the Original Code and Modifications, in each case 15 | including portions thereof. 16 | 17 | 1.4. ``Electronic Distribution Mechanism'' means a mechanism generally 18 | accepted in the software development community for the electronic 19 | transfer of data. 20 | 21 | 1.5. ``Executable'' means Covered Code in any form other than Source 22 | Code. 23 | 24 | 1.6. ``Initial Developer'' means the individual or entity identified 25 | as the Initial Developer in the Source Code notice required by Exhibit 26 | A. 27 | 28 | 1.7. ``Larger Work'' means a work which combines Covered Code or 29 | portions thereof with code not governed by the terms of this License. 30 | 31 | 1.8. ``License'' means this document. 32 | 33 | 1.9. ``Modifications'' means any addition to or deletion from the 34 | substance or structure of either the Original Code or any previous 35 | Modifications. When Covered Code is released as a series of files, a 36 | Modification is: 37 | 38 | A. Any addition to or deletion from the contents of a file containing 39 | Original Code or previous Modifications. 40 | 41 | B. 
Any new file that contains any part of the Original Code or 42 | previous Modifications. 43 | 44 | 1.10. ``Original Code'' means Source Code of computer software code 45 | which is described in the Source Code notice required by Exhibit A as 46 | Original Code, and which, at the time of its release under this 47 | License is not already Covered Code governed by this License. 48 | 49 | 1.11. ``Source Code'' means the preferred form of the Covered Code for 50 | making modifications to it, including all modules it contains, plus 51 | any associated interface definition files, scripts used to control 52 | compilation and installation of an Executable, or a list of source 53 | code differential comparisons against either the Original Code or 54 | another well known, available Covered Code of the Contributor's 55 | choice. The Source Code can be in a compressed or archival form, 56 | provided the appropriate decompression or de-archiving software is 57 | widely available for no charge. 58 | 59 | 1.12. ``You'' means an individual or a legal entity exercising rights 60 | under, and complying with all of the terms of, this License. For legal 61 | entities,``You'' includes any entity which controls, is controlled by, 62 | or is under common control with You. For purposes of this definition, 63 | ``control'' means (a) the power, direct or indirect, to cause the 64 | direction or management of such entity, whether by contract or 65 | otherwise, or (b) ownership of fifty percent (50%) or more of the 66 | outstanding shares or beneficial ownership of such entity. 67 | 68 | 2. Source Code License. 69 | 70 | 2.1. The Initial Developer Grant. 
71 | The Initial Developer hereby grants You a world-wide, royalty-free, 72 | non-exclusive license, subject to third party intellectual property 73 | claims: 74 | 75 | (a) to use, reproduce, modify, display, perform, sublicense and 76 | distribute the Original Code (or portions thereof) with or without 77 | Modifications, or as part of a Larger Work; and 78 | 79 | (b) under patents now or hereafter owned or controlled by Initial 80 | Developer, to make, have made, use and sell (``Utilize'') the 81 | Original Code (or portions thereof), but solely to the extent that 82 | any such patent is reasonably necessary to enable You to Utilize 83 | the Original Code (or portions thereof) and not to any greater 84 | extent that may be necessary to Utilize further Modifications or 85 | combinations. 86 | 87 | 2.2. Contributor Grant. 88 | Each Contributor hereby grants You a world-wide, royalty-free, 89 | non-exclusive license, subject to third party intellectual property 90 | claims: 91 | 92 | (a) to use, reproduce, modify, display, perform, sublicense and 93 | distribute the Modifications created by such Contributor (or 94 | portions thereof) either on an unmodified basis, with other 95 | Modifications, as Covered Code or as part of a Larger Work; and 96 | 97 | (b) under patents now or hereafter owned or controlled by Contributor, 98 | to Utilize the Contributor Version (or portions thereof), but 99 | solely to the extent that any such patent is reasonably necessary 100 | to enable You to Utilize the Contributor Version (or portions 101 | thereof), and not to any greater extent that may be necessary to 102 | Utilize further Modifications or combinations. 103 | 104 | 3. Distribution Obligations. 105 | 106 | 3.1. Application of License. 107 | The Modifications which You contribute are governed by the terms of 108 | this License, including without limitation Section 2.2. 
The Source 109 | Code version of Covered Code may be distributed only under the terms 110 | of this License, and You must include a copy of this License with 111 | every copy of the Source Code You distribute. You may not offer or 112 | impose any terms on any Source Code version that alters or restricts 113 | the applicable version of this License or the recipients' rights 114 | hereunder. However, You may include an additional document offering 115 | the additional rights described in Section 3.5. 116 | 117 | 3.2. Availability of Source Code. 118 | Any Modification which You contribute must be made available in Source 119 | Code form under the terms of this License either on the same media as 120 | an Executable version or via an accepted Electronic Distribution 121 | Mechanism to anyone to whom you made an Executable version available; 122 | and if made available via Electronic Distribution Mechanism, must 123 | remain available for at least twelve (12) months after the date it 124 | initially became available, or at least six (6) months after a 125 | subsequent version of that particular Modification has been made 126 | available to such recipients. You are responsible for ensuring that 127 | the Source Code version remains available even if the Electronic 128 | Distribution Mechanism is maintained by a third party. 129 | 130 | 3.3. Description of Modifications. 131 | You must cause all Covered Code to which you contribute to contain a 132 | file documenting the changes You made to create that Covered Code and 133 | the date of any change. You must include a prominent statement that 134 | the Modification is derived, directly or indirectly, from Original 135 | Code provided by the Initial Developer and including the name of the 136 | Initial Developer in (a) the Source Code, and (b) in any notice in an 137 | Executable version or related documentation in which You describe the 138 | origin or ownership of the Covered Code. 139 | 140 | 3.4. 
Intellectual Property Matters 141 | 142 | (a) Third Party Claims. 143 | If You have knowledge that a party claims an intellectual property 144 | right in particular functionality or code (or its utilization 145 | under this License), you must include a text file with the source 146 | code distribution titled ``LEGAL'' which describes the claim and 147 | the party making the claim in sufficient detail that a recipient 148 | will know whom to contact. If you obtain such knowledge after You 149 | make Your Modification available as described in Section 3.2, You 150 | shall promptly modify the LEGAL file in all copies You make 151 | available thereafter and shall take other steps (such as notifying 152 | appropriate mailing lists or newsgroups) reasonably calculated to 153 | inform those who received the Covered Code that new knowledge has 154 | been obtained. 155 | 156 | (b) Contributor APIs. 157 | If Your Modification is an application programming interface and 158 | You own or control patents which are reasonably necessary to 159 | implement that API, you must also include this information in the 160 | LEGAL file. 161 | 162 | 3.5. Required Notices. 163 | You must duplicate the notice in Exhibit A in each file of the Source 164 | Code, and this License in any documentation for the Source Code, where 165 | You describe recipients' rights relating to Covered Code. If You 166 | created one or more Modification(s), You may add your name as a 167 | Contributor to the notice described in Exhibit A. If it is not 168 | possible to put such notice in a particular Source Code file due to 169 | its structure, then you must include such notice in a location (such 170 | as a relevant directory file) where a user would be likely to look for 171 | such a notice. You may choose to offer, and to charge a fee for, 172 | warranty, support, indemnity or liability obligations to one or more 173 | recipients of Covered Code. 
However, You may do so only on Your own 174 | behalf, and not on behalf of the Initial Developer or any 175 | Contributor. You must make it absolutely clear than any such warranty, 176 | support, indemnity or liability obligation is offered by You alone, 177 | and You hereby agree to indemnify the Initial Developer and every 178 | Contributor for any liability incurred by the Initial Developer or 179 | such Contributor as a result of warranty, support, indemnity or 180 | liability terms You offer. 181 | 182 | 3.6. Distribution of Executable Versions. 183 | You may distribute Covered Code in Executable form only if the 184 | requirements of Section 3.1-3.5 have been met for that Covered Code, 185 | and if You include a notice stating that the Source Code version of 186 | the Covered Code is available under the terms of this License, 187 | including a description of how and where You have fulfilled the 188 | obligations of Section 3.2. The notice must be conspicuously included 189 | in any notice in an Executable version, related documentation or 190 | collateral in which You describe recipients' rights relating to the 191 | Covered Code. You may distribute the Executable version of Covered 192 | Code under a license of Your choice, which may contain terms different 193 | from this License, provided that You are in compliance with the terms 194 | of this License and that the license for the Executable version does 195 | not attempt to limit or alter the recipient's rights in the Source 196 | Code version from the rights set forth in this License. If You 197 | distribute the Executable version under a different license You must 198 | make it absolutely clear that any terms which differ from this License 199 | are offered by You alone, not by the Initial Developer or any 200 | Contributor. 
You hereby agree to indemnify the Initial Developer and 201 | every Contributor for any liability incurred by the Initial Developer 202 | or such Contributor as a result of any such terms You offer. 203 | 204 | 3.7. Larger Works. 205 | You may create a Larger Work by combining Covered Code with other code 206 | not governed by the terms of this License and distribute the Larger 207 | Work as a single product. In such a case, You must make sure the 208 | requirements of this License are fulfilled for the Covered Code. 209 | 210 | 4. Inability to Comply Due to Statute or Regulation. 211 | If it is impossible for You to comply with any of the terms of this 212 | License with respect to some or all of the Covered Code due to statute 213 | or regulation then You must: (a) comply with the terms of this License 214 | to the maximum extent possible; and (b) describe the limitations and 215 | the code they affect. Such description must be included in the LEGAL 216 | file described in Section 3.4 and must be included with all 217 | distributions of the Source Code. Except to the extent prohibited by 218 | statute or regulation, such description must be sufficiently detailed 219 | for a recipient of ordinary skill to be able to understand it. 220 | 221 | 5. Application of this License. 222 | 223 | This License applies to code to which the Initial Developer has 224 | attached the notice in Exhibit A, and to related Covered Code. 225 | 226 | 6. CONNECTION TO MOZILLA PUBLIC LICENSE 227 | 228 | This Erlang License is a derivative work of the Mozilla Public 229 | License, Version 1.0. It contains terms which differ from the Mozilla 230 | Public License, Version 1.0. 231 | 232 | 7. DISCLAIMER OF WARRANTY. 
233 | 234 | COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN ``AS IS'' BASIS, 235 | WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, 236 | WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF 237 | DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR 238 | NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF 239 | THE COVERED CODE IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE 240 | IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER 241 | CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR 242 | CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART 243 | OF THIS LICENSE. NO USE OF ANY COVERED CODE IS AUTHORIZED HEREUNDER 244 | EXCEPT UNDER THIS DISCLAIMER. 245 | 246 | 8. TERMINATION. 247 | This License and the rights granted hereunder will terminate 248 | automatically if You fail to comply with terms herein and fail to cure 249 | such breach within 30 days of becoming aware of the breach. All 250 | sublicenses to the Covered Code which are properly granted shall 251 | survive any termination of this License. Provisions which, by their 252 | nature, must remain in effect beyond the termination of this License 253 | shall survive. 254 | 255 | 9. DISCLAIMER OF LIABILITY 256 | Any utilization of Covered Code shall not cause the Initial Developer 257 | or any Contributor to be liable for any damages (neither direct nor 258 | indirect). 259 | 260 | 10. MISCELLANEOUS 261 | This License represents the complete agreement concerning the subject 262 | matter hereof. If any provision is held to be unenforceable, such 263 | provision shall be reformed only to the extent necessary to make it 264 | enforceable. This License shall be construed by and in accordance with 265 | the substantive laws of Sweden. 
Any dispute, controversy or claim 266 | arising out of or relating to this License, or the breach, termination 267 | or invalidity thereof, shall be subject to the exclusive jurisdiction 268 | of Swedish courts, with the Stockholm City Court as the first 269 | instance. 270 | 271 | EXHIBIT A. 272 | 273 | ``The contents of this file are subject to the Erlang Public License, 274 | Version 1.1, (the "License"); you may not use this file except in 275 | compliance with the License. You should have received a copy of the 276 | Erlang Public License along with this software. If not, it can be 277 | retrieved via the world wide web at http://www.erlang.org/. 278 | 279 | Software distributed under the License is distributed on an "AS IS" 280 | basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 281 | the License for the specific language governing rights and limitations 282 | under the License. 283 | 284 | The Initial Developer of the Original Code is Ericsson Utvecklings AB. 285 | Portions created by Ericsson are Copyright 1999, Ericsson Utvecklings 286 | AB. All Rights Reserved.'' 287 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | default: compile 2 | 3 | compile: 4 | ./rebar compile 5 | 6 | run: compile 7 | ERL_LIBS=deps:$(shell readlink -f ..) erl +K true -pa ebin -proto_dist rdma 8 | 9 | test: compile 10 | ./rebar -v skip_deps=true ct 11 | 12 | clean: 13 | -./rebar clean 14 | -rm -rf test/*.beam 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RDMA Distribution Driver for Erlang # 2 | This is an alternative distribution driver for the Erlang virtual machine. It 3 | implements a native protocol for RDMA devices such as InfiniBand. 
This can 4 | potentially improve messaging latency and throughput compared to the default 5 | TCP distribution protocol. 6 | 7 | ## Building ## 8 | This software can be added as a dependency in your project using Rebar or Mix. 9 | It requires OFED to be installed. If it is installed somewhere outside of the 10 | default locations, set the `CFLAGS` and `LDFLAGS` environment variables 11 | appropriately before compiling. 12 | 13 | ## Using ## 14 | The software's `ebin` directory must be added to the Erlang VM's code path. 15 | Then you can tell the VM to use the driver by passing the `-proto_dist rdma` 16 | option. Here is an example: 17 | 18 | * Erlang: 19 | 20 | % erl +K true -pa rdma_dist/ebin -proto_dist rdma -name foo@the-rdma-interface.example.com 21 | 22 | * Elixir: 23 | 24 | % iex --erl "+K true -pa rdma_dist/ebin -proto_dist rdma -name foo@the-rdma-interface.example.com" 25 | 26 | Kernel polling (`+K true`) is recommended. If the RDMA device isn't the 27 | default interface on the system, explicitly name it in the `-name` option. For 28 | InfiniBand devices, you must have an IP-over-IB device set up---it will be used 29 | for connection management. Once the connection is established, the driver will 30 | use native RDMA verbs. 31 | 32 | ## Testing ## 33 | Tests can be run with `make test`. 34 | 35 | This software has seen limited testing over the following devices: 36 | 37 | * QLogic QDR InfiniBand 38 | * Mellanox QDR InfiniBand 39 | 40 | No real-world applications have been tested using this distribution driver. If 41 | you have success with other RDMA devices or large Erlang applications, let me 42 | know :) 43 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | * Support IPv6 for RDMA CM. Wouldn't take much...I've just hardcoded AF_INET in several places.
2 | * Add mechanism to set default port options (like num_buffers and buffer_size). 3 | * Explore more advanced RDMA stuff: shared receive queues?, dynamic buffer management?, RDMA reads and writes vs. send and recv? 4 | * Are we doing flow-control the right way? 5 | -------------------------------------------------------------------------------- /c_src/rdma_drv.c: -------------------------------------------------------------------------------- 1 | /* 2 | * rdma_drv.c 3 | * Copyright (C) 2013 James Lee 4 | * 5 | * The contents of this file are subject to the Erlang Public License, 6 | * Version 1.1, (the "License"); you may not use this file except in 7 | * compliance with the License. You should have received a copy of the 8 | * Erlang Public License along with this software. If not, it can be 9 | * retrieved online at http://www.erlang.org/. 10 | * 11 | * Software distributed under the License is distributed on an "AS IS" 12 | * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 13 | * the License for the specific language governing rights and limitations 14 | * under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "rdma_drv_buffers.h" 26 | #include "rdma_drv_options.h" 27 | 28 | #define DRV_CONNECT 'C' 29 | #define DRV_LISTEN 'L' 30 | #define DRV_ACCEPT 'A' 31 | #define DRV_PEERNAME 'P' 32 | #define DRV_SOCKNAME 'S' 33 | #define DRV_RECV 'R' 34 | #define DRV_DISCONNECT 'D' 35 | #define DRV_GETSTAT 'G' 36 | #define DRV_SETOPTS 'O' 37 | #define DRV_CANCEL 'c' 38 | #define DRV_TIMEOUT 'T' 39 | 40 | #define ACK (1 << 31) 41 | 42 | typedef enum { 43 | STATE_DISCONNECTED, 44 | STATE_CONNECTED, 45 | STATE_LISTENING, 46 | } RdmaDrvState; 47 | 48 | typedef enum { 49 | ACTION_NONE, 50 | ACTION_ACCEPTING, 51 | ACTION_RECEIVING, 52 | ACTION_DISCONNECTING, 53 | } RdmaDrvAction; 54 | 55 | typedef struct rdma_drv_data { 56 | RdmaDrvState state; 57 | RdmaDrvAction action; 58 | RdmaDrvOptions options; 59 | ErlDrvPort port; 60 | ErlDrvTermData caller; 61 | 62 | /* rdma connection manager stuff */ 63 | struct rdma_cm_id *id; 64 | struct rdma_event_channel *ec; 65 | 66 | /* ibverbs stuff */ 67 | struct ibv_pd *pd; 68 | struct ibv_comp_channel *comp_channel; 69 | struct ibv_cq *cq; 70 | struct ibv_mr *send_mr, *recv_mr; 71 | RdmaDrvBuffers send_buffers, recv_buffers; 72 | 73 | /* place to accumulate packet that takes more than one recv */ 74 | void *incomplete_recv; 75 | int incomplete_recv_offset; 76 | 77 | /* basic flow control data */ 78 | bool sending_ack; 79 | int pending_acks; 80 | int peer_ready; 81 | 82 | /* stats */ 83 | unsigned long sent; 84 | unsigned long received; 85 | unsigned long buffered; 86 | 87 | /* linked list for holding accepted sockets */ 88 | struct rdma_drv_data *listener; /* (head) */ 89 | struct rdma_drv_data *next; 90 | ErlDrvMutex *list_mutex; 91 | } RdmaDrvData; 92 | 93 | static void rdma_drv_send_error_atom(RdmaDrvData *data, char *str) { 94 | if (data->options.active) { 95 | /* dist_util seems to expect {tcp_closed, Socket} */ 96 | 
ErlDrvTermData spec[] = { 97 | ERL_DRV_ATOM, driver_mk_atom("tcp_closed"), 98 | ERL_DRV_PORT, driver_mk_port(data->port), 99 | ERL_DRV_TUPLE, 2, 100 | }; 101 | 102 | erl_drv_output_term(driver_mk_port(data->port), spec, sizeof(spec) / sizeof(spec[0])); 103 | } else { 104 | ErlDrvTermData spec[] = { 105 | ERL_DRV_PORT, driver_mk_port(data->port), 106 | ERL_DRV_ATOM, driver_mk_atom("error"), 107 | ERL_DRV_ATOM, driver_mk_atom(str), 108 | ERL_DRV_TUPLE, 2, 109 | ERL_DRV_TUPLE, 2, 110 | }; 111 | 112 | erl_drv_send_term(driver_mk_port(data->port), data->caller, spec, sizeof(spec) / sizeof(spec[0])); 113 | } 114 | } 115 | 116 | static void rdma_drv_pause(RdmaDrvData *data) { 117 | data->action = ACTION_NONE; 118 | 119 | if (data->ec) { 120 | driver_select(data->port, (ErlDrvEvent) data->ec->fd, ERL_DRV_READ, 0); 121 | } 122 | 123 | if (data->comp_channel) { 124 | driver_select(data->port, (ErlDrvEvent) data->comp_channel->fd, ERL_DRV_READ, 0); 125 | } 126 | } 127 | 128 | static void rdma_drv_resume(RdmaDrvData *data) { 129 | /* Save who to reply to. 
*/ 130 | data->caller = driver_caller(data->port); 131 | 132 | if (data->ec) { 133 | driver_select(data->port, (ErlDrvEvent) data->ec->fd, ERL_DRV_READ, 1); 134 | } 135 | 136 | if (data->comp_channel) { 137 | driver_select(data->port, (ErlDrvEvent) data->comp_channel->fd, ERL_DRV_READ, 1); 138 | } 139 | } 140 | 141 | static bool rdma_drv_add_data(RdmaDrvData *data, RdmaDrvData *add) { 142 | if (!data->list_mutex) { 143 | data->list_mutex = erl_drv_mutex_create("list_mutex"); 144 | if (!data->list_mutex) { 145 | return false; 146 | } 147 | } 148 | 149 | erl_drv_mutex_lock(data->list_mutex); 150 | add->next = data->next; 151 | data->next = add; 152 | erl_drv_mutex_unlock(data->list_mutex); 153 | 154 | return true; 155 | } 156 | 157 | static bool rdma_drv_remove_data(RdmaDrvData *data, RdmaDrvData *remove) { 158 | RdmaDrvData *i; 159 | bool ret = false; 160 | 161 | if (!data->list_mutex) { 162 | return false; 163 | } 164 | 165 | erl_drv_mutex_lock(data->list_mutex); 166 | 167 | for (i = data; i->next != NULL; i = i->next) { 168 | if (i->next == remove) { 169 | i->next = i->next->next; 170 | ret = true; 171 | break; 172 | } 173 | } 174 | 175 | erl_drv_mutex_unlock(data->list_mutex); 176 | 177 | return ret; 178 | } 179 | 180 | static void rdma_drv_post_ack(RdmaDrvData *data) { 181 | int ret; 182 | struct ibv_send_wr wr = {}, *bad_wr = NULL; 183 | 184 | if (data->sending_ack || data->pending_acks == 0) { 185 | return; 186 | } 187 | 188 | wr.wr_id = ACK; 189 | wr.opcode = IBV_WR_SEND_WITH_IMM; 190 | wr.send_flags = IBV_SEND_SIGNALED; 191 | wr.imm_data = htonl(ACK | data->pending_acks); 192 | 193 | ret = ibv_post_send(data->id->qp, &wr, &bad_wr); 194 | if (ret) { 195 | rdma_drv_send_error_atom(data, "ibv_post_send"); 196 | return; 197 | } 198 | 199 | data->sending_ack = true; 200 | data->pending_acks = 0; 201 | } 202 | 203 | static void rdma_drv_post_recv(RdmaDrvData *data, void *buffer) { 204 | int ret; 205 | struct ibv_recv_wr wr = {}, *bad_wr = NULL; 206 | struct 
ibv_sge sge = {}; 207 | 208 | wr.sg_list = &sge; 209 | wr.num_sge = 1; 210 | 211 | sge.addr = (uintptr_t) buffer; 212 | sge.length = data->options.buffer_size; 213 | sge.lkey = data->recv_mr->lkey; 214 | 215 | ret = ibv_post_recv(data->id->qp, &wr, &bad_wr); 216 | if (ret) { 217 | rdma_drv_send_error_atom(data, "ibv_post_recv"); 218 | return; 219 | } 220 | } 221 | 222 | static void rdma_drv_post_send(RdmaDrvData *data, void *buffer, ErlDrvSizeT remaining) { 223 | int ret; 224 | struct ibv_send_wr wr = {}, *bad_wr = NULL; 225 | struct ibv_sge sge = {}; 226 | 227 | wr.opcode = IBV_WR_SEND_WITH_IMM; 228 | wr.send_flags = IBV_SEND_SIGNALED; 229 | wr.imm_data = htonl(remaining); 230 | 231 | if (remaining) { 232 | wr.sg_list = &sge; 233 | wr.num_sge = 1; 234 | 235 | sge.addr = (uintptr_t) buffer; 236 | sge.length = remaining < data->options.buffer_size ? remaining : data->options.buffer_size; 237 | sge.lkey = data->send_mr->lkey; 238 | } 239 | 240 | ret = ibv_post_send(data->id->qp, &wr, &bad_wr); 241 | if (ret) { 242 | rdma_drv_send_error_atom(data, "ibv_post_send"); 243 | return; 244 | } 245 | 246 | data->peer_ready--; 247 | } 248 | 249 | static bool rdma_drv_init_ibverbs(RdmaDrvData *data) { 250 | int ret; 251 | 252 | /* Allocate a protection domain. */ 253 | data->pd = ibv_alloc_pd(data->id->verbs); 254 | if (!data->pd) { 255 | rdma_drv_send_error_atom(data, "ibv_alloc_pd"); 256 | return false; 257 | } 258 | 259 | /* Create a completion event channel. */ 260 | data->comp_channel = ibv_create_comp_channel(data->id->verbs); 261 | if (!data->comp_channel) { 262 | rdma_drv_send_error_atom(data, "ibv_create_comp_channel"); 263 | return false; 264 | } 265 | 266 | /* Make the completion event channel non-blocking. */ 267 | fcntl(data->comp_channel->fd, F_SETFL, fcntl(data->comp_channel->fd, F_GETFL) | O_NONBLOCK); 268 | 269 | /* 270 | * Create a completion queue large enough to hold all of the work 271 | * items that could be placed in the send and receive queues. 
272 | */ 273 | data->cq = ibv_create_cq(data->id->verbs, data->options.num_buffers * 2, NULL, data->comp_channel, 0); 274 | if (!data->cq) { 275 | rdma_drv_send_error_atom(data, "ibv_create_cq"); 276 | return false; 277 | } 278 | 279 | /* Request a completion queue event in the completion channel. */ 280 | ret = ibv_req_notify_cq(data->cq, 0); 281 | if (ret) { 282 | rdma_drv_send_error_atom(data, "ibv_req_notify_cq"); 283 | return false; 284 | } 285 | 286 | /* Initialize the send and receive buffers. */ 287 | ret = rdma_drv_buffers_init(&data->send_buffers, data->options.buffer_size, data->options.num_buffers - 1); 288 | if (!ret) { 289 | rdma_drv_send_error_atom(data, "rdma_drv_buffers_init_send"); 290 | return false; 291 | } 292 | 293 | ret = rdma_drv_buffers_init(&data->recv_buffers, data->options.buffer_size, data->options.num_buffers); 294 | if (!ret) { 295 | rdma_drv_send_error_atom(data, "rdma_drv_buffers_init_recv"); 296 | return false; 297 | } 298 | 299 | /* Register the buffers with the RDMA device. */ 300 | data->send_mr = ibv_reg_mr(data->pd, data->send_buffers.buffers, data->options.buffer_size * (data->options.num_buffers - 1), IBV_ACCESS_LOCAL_WRITE); 301 | if (!data->send_mr) { 302 | rdma_drv_send_error_atom(data, "ibv_reg_mr_send"); 303 | return false; 304 | } 305 | 306 | data->recv_mr = ibv_reg_mr(data->pd, data->recv_buffers.buffers, data->options.buffer_size * data->options.num_buffers, IBV_ACCESS_LOCAL_WRITE); 307 | if (!data->recv_mr) { 308 | rdma_drv_send_error_atom(data, "ibv_reg_mr_recv"); 309 | return false; 310 | } 311 | 312 | /* Create the queue pair. 
*/ 313 | struct ibv_qp_init_attr qp_attr = {}; 314 | qp_attr.send_cq = data->cq; 315 | qp_attr.recv_cq = data->cq; 316 | qp_attr.qp_type = IBV_QPT_RC; 317 | qp_attr.cap.max_send_wr = data->options.num_buffers; 318 | qp_attr.cap.max_recv_wr = data->options.num_buffers; 319 | qp_attr.cap.max_send_sge = 1; 320 | qp_attr.cap.max_recv_sge = 1; 321 | 322 | ret = rdma_create_qp(data->id, data->pd, &qp_attr); 323 | if (ret) { 324 | rdma_drv_send_error_atom(data, "rdma_create_qp"); 325 | return false; 326 | } 327 | 328 | /* Post all available receive buffers. */ 329 | void *recv_buffer; 330 | while ((recv_buffer = rdma_drv_buffers_reserve_buffer(&data->recv_buffers))) { 331 | rdma_drv_post_recv(data, recv_buffer); 332 | } 333 | 334 | /* 335 | * Assume that the peer has posted the same number of receive work 336 | * items as we have posted, minus one, which is "reserved" for ACKs 337 | */ 338 | data->peer_ready = data->options.num_buffers - 1; 339 | 340 | return true; 341 | } 342 | 343 | static void rdma_drv_free_ibverbs(RdmaDrvData *data) { 344 | /* 345 | * Basically 'rdma_drv_init_ibverbs' in reverse. 
346 | */ 347 | 348 | if (data->id && data->id->qp) { 349 | rdma_destroy_qp(data->id); 350 | } 351 | 352 | if (data->recv_mr) { 353 | ibv_dereg_mr(data->recv_mr); 354 | data->recv_mr = NULL; 355 | } 356 | 357 | if (data->send_mr) { 358 | ibv_dereg_mr(data->send_mr); 359 | data->send_mr = NULL; 360 | } 361 | 362 | rdma_drv_buffers_free(&data->recv_buffers); 363 | rdma_drv_buffers_free(&data->send_buffers); 364 | 365 | if (data->cq) { 366 | ibv_destroy_cq(data->cq); 367 | data->cq = NULL; 368 | } 369 | 370 | if (data->comp_channel) { 371 | ibv_destroy_comp_channel(data->comp_channel); 372 | data->comp_channel = NULL; 373 | } 374 | 375 | if (data->pd) { 376 | ibv_dealloc_pd(data->pd); 377 | data->pd = NULL; 378 | } 379 | } 380 | 381 | static ErlDrvData rdma_drv_start(ErlDrvPort port, char *command) { 382 | RdmaDrvData *data = (RdmaDrvData *) driver_alloc(sizeof(RdmaDrvData)); 383 | if (!data) { 384 | return NULL; 385 | } 386 | 387 | memset(data, 0, sizeof(RdmaDrvData)); 388 | data->port = port; 389 | 390 | /* ei is used in the control interface. */ 391 | set_port_control_flags(port, PORT_CONTROL_FLAG_BINARY); 392 | 393 | return (ErlDrvData) data; 394 | } 395 | 396 | static void rdma_drv_stop(ErlDrvData drv_data) { 397 | RdmaDrvData *data = (RdmaDrvData *) drv_data, *i; 398 | 399 | /* Stop polling event channels. */ 400 | rdma_drv_pause(data); 401 | 402 | /* Kill child sockets. 
*/ 403 | if (data->list_mutex) { 404 | erl_drv_mutex_lock(data->list_mutex); 405 | for (i = data; i->next != NULL; i = i->next) { 406 | i->next->listener = NULL; /* the listener is closed */ 407 | rdma_drv_send_error_atom(i->next, "closed"); 408 | driver_failure_eof(i->next->port); 409 | } 410 | data->next = NULL; 411 | erl_drv_mutex_unlock(data->list_mutex); 412 | erl_drv_mutex_destroy(data->list_mutex); 413 | data->list_mutex = NULL; 414 | } 415 | 416 | rdma_drv_free_ibverbs(data); 417 | 418 | if (data->id) { 419 | rdma_destroy_id(data->id); 420 | } 421 | 422 | if (data->ec) { 423 | rdma_destroy_event_channel(data->ec); 424 | } 425 | 426 | /* 427 | * If this was an "accepted" socket, remove it from the listener's 428 | * list of accepted sockets. 429 | */ 430 | if (data->listener) { 431 | rdma_drv_remove_data(data->listener, data); 432 | } 433 | 434 | driver_free(data); 435 | } 436 | 437 | static ErlDrvSizeT rdma_drv_send_fully(RdmaDrvData *data, void *buf, ErlDrvSizeT len, bool buf_is_vec) { 438 | void *send_buffer; 439 | ErlDrvSizeT remaining = len; 440 | 441 | while (data->peer_ready && (send_buffer = rdma_drv_buffers_reserve_buffer(&data->send_buffers))) { 442 | ErlDrvSizeT send_amount = remaining < data->options.buffer_size ? remaining : data->options.buffer_size; 443 | 444 | if (buf_is_vec) { 445 | ErlIOVec *queue = (ErlIOVec *) buf; 446 | driver_vec_to_buf(queue, send_buffer, send_amount); 447 | driver_deq(data->port, send_amount); 448 | driver_peekqv(data->port, queue); 449 | } else { 450 | memcpy(send_buffer, buf, send_amount); 451 | buf += send_amount; 452 | } 453 | 454 | rdma_drv_post_send(data, send_buffer, remaining); 455 | remaining -= send_amount; 456 | 457 | if (remaining == 0) { 458 | /* Increment the stats counter. 
*/ 459 | data->sent++; 460 | 461 | break; 462 | } 463 | } 464 | 465 | return remaining; 466 | } 467 | 468 | static bool rdma_drv_flush_queue(RdmaDrvData *data) { 469 | ErlIOVec queue; 470 | 471 | while (driver_peekqv(data->port, &queue)) { 472 | ErlDrvSizeT remaining; 473 | driver_vec_to_buf(&queue, (char *) &remaining, sizeof(remaining)); 474 | driver_deq(data->port, sizeof(remaining)); 475 | driver_peekqv(data->port, &queue); 476 | 477 | remaining = rdma_drv_send_fully(data, &queue, remaining, true); 478 | if (remaining > 0) { 479 | driver_pushq(data->port, (char *) &remaining, sizeof(remaining)); 480 | return false; 481 | } else { 482 | /* 483 | * This is where we know a packet was fully removed from 484 | * the queue. 485 | */ 486 | data->buffered--; 487 | } 488 | } 489 | 490 | return true; 491 | } 492 | 493 | static void rdma_drv_output(ErlDrvData drv_data, char *buf, ErlDrvSizeT len) { 494 | RdmaDrvData *data = (RdmaDrvData *) drv_data; 495 | 496 | /* Save who to reply to. */ 497 | data->caller = driver_caller(data->port); 498 | 499 | if (data->state != STATE_CONNECTED) { 500 | rdma_drv_send_error_atom(data, "not_connected"); 501 | return; 502 | } 503 | 504 | if (rdma_drv_flush_queue(data)) { 505 | ErlDrvSizeT remaining = rdma_drv_send_fully(data, buf, len, false); 506 | if (remaining > 0) { 507 | driver_enq(data->port, (char *) &remaining, sizeof(remaining)); 508 | driver_enq(data->port, buf + (len - remaining), remaining); 509 | 510 | /* Packets get put into the queue here... */ 511 | data->buffered++; 512 | } 513 | } else { 514 | driver_enq(data->port, (char *) &len, sizeof(len)); 515 | driver_enq(data->port, buf, len); 516 | 517 | /* ...and here. 
*/ 518 | data->buffered++; 519 | } 520 | } 521 | 522 | static void rdma_drv_handle_rdma_cm_event_connect_request(RdmaDrvData *data, struct rdma_cm_event *cm_event) { 523 | int ret; 524 | ErlDrvPort new_port; 525 | RdmaDrvData *new_data; 526 | struct rdma_conn_param cm_params = {}; 527 | 528 | /* 529 | * We are going to make a new port for the accepted socket. 530 | */ 531 | 532 | new_data = (RdmaDrvData *) driver_alloc(sizeof(RdmaDrvData)); 533 | if (!new_data) { 534 | rdma_drv_send_error_atom(data, "driver_alloc"); 535 | return; 536 | } 537 | 538 | memset(new_data, 0, sizeof(RdmaDrvData)); 539 | new_port = driver_create_port(data->port, data->caller, "rdma_drv", (ErlDrvData) new_data); 540 | 541 | /* ei is used in the control interface. */ 542 | set_port_control_flags(new_port, PORT_CONTROL_FLAG_BINARY); 543 | 544 | /* 545 | * Connect the new port data to the listener so it can be closed 546 | * if the listener decides to first. 547 | */ 548 | new_data->listener = data; 549 | ret = rdma_drv_add_data(data, new_data); 550 | if (!ret) { 551 | rdma_drv_send_error_atom(data, "rdma_drv_add_data"); 552 | return; 553 | } 554 | 555 | new_data->id = cm_event->id; 556 | new_data->port = new_port; 557 | new_data->options = data->options; 558 | 559 | /* Send the port to Erlang. 
*/ 560 | ErlDrvTermData spec[] = { 561 | ERL_DRV_PORT, driver_mk_port(data->port), 562 | ERL_DRV_ATOM, driver_mk_atom("port"), 563 | ERL_DRV_PORT, driver_mk_port(new_port), 564 | ERL_DRV_TUPLE, 2, 565 | ERL_DRV_TUPLE, 2, 566 | }; 567 | 568 | erl_drv_send_term(driver_mk_port(data->port), data->caller, spec, sizeof(spec) / sizeof(spec[0])); 569 | 570 | /* 571 | * For better or worse, events related to the new socket still come 572 | * in on the listener socket, so we have to store the new data 573 | * somewhere it can be accessed...see: 574 | * rdma_drv_handle_rdma_cm_event_established 575 | * rdma_drv_handle_rdma_cm_event_disconnected 576 | */ 577 | new_data->id->context = new_data; 578 | 579 | /* If ibverbs are initialized successfully, accept the connection. */ 580 | if (rdma_drv_init_ibverbs(new_data)) { 581 | ret = rdma_accept(new_data->id, &cm_params); 582 | if (ret) { 583 | rdma_drv_send_error_atom(data, "rdma_accept"); 584 | return; 585 | } 586 | } 587 | } 588 | 589 | static void rdma_drv_handle_rdma_cm_event_addr_resolved(RdmaDrvData *data, struct rdma_cm_event *cm_event) { 590 | int ret; 591 | 592 | ret = rdma_resolve_route(data->id, data->options.timeout / 4); 593 | if (ret) { 594 | rdma_drv_send_error_atom(data, "rdma_resolve_route"); 595 | return; 596 | } 597 | } 598 | 599 | static void rdma_drv_handle_rdma_cm_event_route_resolved(RdmaDrvData *data, struct rdma_cm_event *cm_event) { 600 | int ret; 601 | struct rdma_conn_param cm_params = {}; 602 | 603 | if (rdma_drv_init_ibverbs(data)) { 604 | ret = rdma_connect(data->id, &cm_params); 605 | if (ret) { 606 | rdma_drv_send_error_atom(data, "rdma_connect"); 607 | return; 608 | } 609 | } 610 | } 611 | 612 | static void rdma_drv_handle_rdma_cm_event_established(RdmaDrvData *data, struct rdma_cm_event *cm_event) { 613 | if (cm_event->id->context) { 614 | RdmaDrvData *new_data = (RdmaDrvData *) cm_event->id->context; 615 | new_data->state = STATE_CONNECTED; 616 | 617 | ErlDrvTermData spec[] = { 618 | 
ERL_DRV_PORT, driver_mk_port(data->port), 619 | ERL_DRV_ATOM, driver_mk_atom("accept"), 620 | ERL_DRV_PORT, driver_mk_port(new_data->port), 621 | ERL_DRV_TUPLE, 2, 622 | ERL_DRV_TUPLE, 2, 623 | }; 624 | 625 | erl_drv_send_term(driver_mk_port(data->port), data->caller, spec, sizeof(spec) / sizeof(spec[0])); 626 | 627 | if (new_data->options.active) { 628 | /* 629 | * If the listener was configured to be "active", we want 630 | * to start polling the accepted socket for recvs. 631 | */ 632 | rdma_drv_resume(new_data); 633 | } 634 | 635 | /* We've completed an accept, so stop polling. */ 636 | rdma_drv_pause(data); 637 | } else { 638 | data->state = STATE_CONNECTED; 639 | 640 | /* Stop polling for events unless this is an active socket. */ 641 | ErlDrvTermData spec[] = { 642 | ERL_DRV_PORT, driver_mk_port(data->port), 643 | ERL_DRV_ATOM, driver_mk_atom("established"), 644 | ERL_DRV_TUPLE, 2, 645 | }; 646 | 647 | erl_drv_send_term(driver_mk_port(data->port), data->caller, spec, sizeof(spec) / sizeof(spec[0])); 648 | 649 | if (!data->options.active) { 650 | rdma_drv_pause(data); 651 | } 652 | } 653 | } 654 | 655 | static void rdma_drv_handle_rdma_cm_event_disconnected(RdmaDrvData *data, struct rdma_cm_event *cm_event) { 656 | int ret; 657 | 658 | if (cm_event->id->context) { 659 | /* 660 | * When event occurs in listener, it actually refers to a 661 | * previously accepted socket. 662 | */ 663 | data = (RdmaDrvData *) cm_event->id->context; 664 | } 665 | 666 | if (data->action == ACTION_DISCONNECTING) { 667 | /* 668 | * This event is the result of calling rdma_disconnect earlier, 669 | * which is to say, it is expected. 
670 | */ 671 | ErlDrvTermData spec[] = { 672 | ERL_DRV_PORT, driver_mk_port(data->port), 673 | ERL_DRV_ATOM, driver_mk_atom("disconnected"), 674 | ERL_DRV_TUPLE, 2, 675 | }; 676 | 677 | erl_drv_send_term(driver_mk_port(data->port), data->caller, spec, sizeof(spec) / sizeof(spec[0])); 678 | } else { 679 | /* 680 | * This event is the result of the peer calling 681 | * rdma_disconnect, which is to say, it is unexpected. 682 | */ 683 | ret = rdma_disconnect(data->id); 684 | if (ret) { 685 | rdma_drv_send_error_atom(data, "rdma_disconnect"); 686 | return; 687 | } 688 | 689 | rdma_drv_send_error_atom(data, "closed"); 690 | } 691 | 692 | /* Empty the queue (otherwise the port won't close). */ 693 | /* XXX: Technically, this should require a port data lock. */ 694 | driver_deq(data->port, driver_sizeq(data->port)); 695 | 696 | /* Either way, if we get to this point, we are disconnected. */ 697 | data->state = STATE_DISCONNECTED; 698 | } 699 | 700 | static void rdma_drv_handle_rdma_cm_event(RdmaDrvData *data) { 701 | int ret; 702 | struct rdma_cm_event *cm_event; 703 | 704 | ret = rdma_get_cm_event(data->ec, &cm_event); 705 | if (ret) { 706 | rdma_drv_send_error_atom(data, "cm_get_event_failed"); 707 | return; 708 | } 709 | 710 | switch (cm_event->event) { 711 | case RDMA_CM_EVENT_CONNECT_REQUEST: 712 | rdma_drv_handle_rdma_cm_event_connect_request(data, cm_event); 713 | break; 714 | 715 | case RDMA_CM_EVENT_CONNECT_ERROR: 716 | rdma_drv_send_error_atom(data, "connection_failed"); 717 | break; 718 | 719 | case RDMA_CM_EVENT_ADDR_RESOLVED: 720 | rdma_drv_handle_rdma_cm_event_addr_resolved(data, cm_event); 721 | break; 722 | 723 | case RDMA_CM_EVENT_ADDR_ERROR: 724 | rdma_drv_send_error_atom(data, "address_resolution_failed"); 725 | break; 726 | 727 | case RDMA_CM_EVENT_ROUTE_RESOLVED: 728 | rdma_drv_handle_rdma_cm_event_route_resolved(data, cm_event); 729 | break; 730 | 731 | case RDMA_CM_EVENT_ROUTE_ERROR: 732 | rdma_drv_send_error_atom(data, "route_resolution_failed"); 
733 | break; 734 | 735 | case RDMA_CM_EVENT_UNREACHABLE: 736 | rdma_drv_send_error_atom(data, "unreachable"); 737 | break; 738 | 739 | case RDMA_CM_EVENT_REJECTED: 740 | rdma_drv_send_error_atom(data, "rejected"); 741 | break; 742 | 743 | case RDMA_CM_EVENT_ESTABLISHED: 744 | rdma_drv_handle_rdma_cm_event_established(data, cm_event); 745 | break; 746 | 747 | case RDMA_CM_EVENT_DISCONNECTED: 748 | rdma_drv_handle_rdma_cm_event_disconnected(data, cm_event); 749 | break; 750 | 751 | case RDMA_CM_EVENT_TIMEWAIT_EXIT: 752 | /* XXX: Should we do anything here? */ 753 | break; 754 | 755 | default: 756 | rdma_drv_send_error_atom(data, "unhandled_event"); 757 | break; 758 | } 759 | 760 | ret = rdma_ack_cm_event(cm_event); 761 | if (ret) { 762 | rdma_drv_send_error_atom(data, "cm_ack_event_failed"); 763 | return; 764 | } 765 | } 766 | 767 | static bool rdma_drv_handle_recv_complete(RdmaDrvData *data, struct ibv_wc *wc) { 768 | bool completed = false; 769 | uint32_t imm_data = ntohl(wc->imm_data); 770 | void *recv_buffer = rdma_drv_buffers_current_buffer(&data->recv_buffers); 771 | 772 | if (imm_data & ACK) { 773 | data->peer_ready += imm_data & ~ACK; 774 | rdma_drv_post_recv(data, recv_buffer); 775 | rdma_drv_flush_queue(data); 776 | } else { 777 | ErlDrvSizeT remaining = imm_data; 778 | ErlDrvSizeT recv_amount = remaining < data->options.buffer_size ? 
remaining : data->options.buffer_size; 779 | 780 | if (remaining > data->options.buffer_size && !data->incomplete_recv) { 781 | data->incomplete_recv = driver_alloc(remaining); 782 | data->incomplete_recv_offset = 0; 783 | 784 | if (!data->incomplete_recv) { 785 | rdma_drv_send_error_atom(data, "driver_alloc_recv"); 786 | return false; 787 | } 788 | } 789 | 790 | if (data->incomplete_recv) { 791 | memcpy(data->incomplete_recv + data->incomplete_recv_offset, recv_buffer, recv_amount); 792 | data->incomplete_recv_offset += recv_amount; 793 | } 794 | 795 | if (remaining <= data->options.buffer_size) { 796 | void *output_buffer; 797 | ErlDrvSizeT output_buffer_size; 798 | 799 | if (data->incomplete_recv) { 800 | output_buffer = data->incomplete_recv; 801 | output_buffer_size = data->incomplete_recv_offset; 802 | } else { 803 | output_buffer = recv_buffer; 804 | output_buffer_size = remaining; 805 | } 806 | 807 | /* Send packet to emulator. */ 808 | if (data->options.active) { 809 | /* Based on 'inet_port_data' from 'inet_drv.c' */ 810 | if (!data->options.binary || data->options.packet > output_buffer_size) { 811 | driver_output2(data->port, output_buffer, output_buffer_size, NULL, 0); 812 | } else if (data->options.packet > 0) { 813 | driver_output2(data->port, output_buffer, data->options.packet, output_buffer + data->options.packet, output_buffer_size - data->options.packet); 814 | } else { 815 | driver_output(data->port, output_buffer, output_buffer_size); 816 | } 817 | } else { 818 | ErlDrvTermData spec[] = { 819 | ERL_DRV_PORT, driver_mk_port(data->port), 820 | ERL_DRV_ATOM, driver_mk_atom("data"), 821 | data->options.binary ? ERL_DRV_BUF2BINARY : ERL_DRV_STRING, (ErlDrvTermData) output_buffer, output_buffer_size, 822 | ERL_DRV_TUPLE, 2, 823 | ERL_DRV_TUPLE, 2, 824 | }; 825 | 826 | erl_drv_send_term(driver_mk_port(data->port), data->caller, spec, sizeof(spec) / sizeof(spec[0])); 827 | } 828 | 829 | if (!data->options.active) { 830 | /* Stop polling for events. 
*/ 831 | rdma_drv_pause(data); 832 | 833 | /* Return false so we don't process any more cq entries. */ 834 | completed = true; 835 | } 836 | 837 | if (data->incomplete_recv) { 838 | driver_free(data->incomplete_recv); 839 | data->incomplete_recv = NULL; 840 | data->incomplete_recv_offset = 0; 841 | } 842 | 843 | /* Increment the stats counter. */ 844 | data->received++; 845 | } 846 | 847 | rdma_drv_post_recv(data, recv_buffer); 848 | data->pending_acks++; 849 | } 850 | 851 | return completed; 852 | } 853 | 854 | static void rdma_drv_handle_send_complete(RdmaDrvData *data, struct ibv_wc *wc) { 855 | if (wc->wr_id == ACK) { 856 | data->sending_ack = false; 857 | } else { 858 | rdma_drv_buffers_release_buffer(&data->send_buffers); 859 | rdma_drv_flush_queue(data); 860 | } 861 | } 862 | 863 | static bool rdma_drv_flush_cq(RdmaDrvData *data) { 864 | bool completed = false; 865 | struct ibv_wc wc; 866 | 867 | while (!completed && ibv_poll_cq(data->cq, 1, &wc)) { 868 | if (wc.status == IBV_WC_SUCCESS) { 869 | if (wc.opcode == IBV_WC_RECV) { 870 | completed = rdma_drv_handle_recv_complete(data, &wc); 871 | } else if (wc.opcode == IBV_WC_SEND) { 872 | rdma_drv_handle_send_complete(data, &wc); 873 | } 874 | } 875 | 876 | /* XXX: What to do about unsuccessful work items? */ 877 | } 878 | 879 | rdma_drv_post_ack(data); 880 | 881 | /* Queue flushed completely. */ 882 | return completed; 883 | } 884 | 885 | static void rdma_drv_handle_comp_channel_event(RdmaDrvData *data) { 886 | void *ev_ctx; 887 | struct ibv_cq *cq; 888 | 889 | /* XXX: Check error status of these functions. */ 890 | ibv_get_cq_event(data->comp_channel, &cq, &ev_ctx); 891 | ibv_ack_cq_events(cq, 1); 892 | 893 | /* Request a completion queue event in the completion channel. 
*/ 894 | ibv_req_notify_cq(cq, 0); 895 | 896 | rdma_drv_flush_cq(data); 897 | } 898 | 899 | static void rdma_drv_ready_input(ErlDrvData drv_data, ErlDrvEvent event) { 900 | RdmaDrvData *data = (RdmaDrvData *) drv_data; 901 | 902 | if (data->ec && event == (ErlDrvEvent) data->ec->fd) { 903 | rdma_drv_handle_rdma_cm_event(data); 904 | } else if (data->comp_channel && event == (ErlDrvEvent) data->comp_channel->fd) { 905 | rdma_drv_handle_comp_channel_event(data); 906 | } 907 | } 908 | 909 | static void rdma_drv_encode_error_string(ei_x_buff *x, const char *str) { 910 | ei_x_encode_tuple_header(x, 2); 911 | ei_x_encode_atom(x, "error"); 912 | ei_x_encode_string(x, str); 913 | } 914 | 915 | static void rdma_drv_encode_error_posix(ei_x_buff *x, int error) { 916 | ei_x_encode_tuple_header(x, 2); 917 | ei_x_encode_atom(x, "error"); 918 | ei_x_encode_atom(x, erl_errno_id(error)); 919 | } 920 | 921 | static void rdma_drv_encode_error_atom(ei_x_buff *x, const char *str) { 922 | ei_x_encode_tuple_header(x, 2); 923 | ei_x_encode_atom(x, "error"); 924 | ei_x_encode_atom(x, str); 925 | } 926 | 927 | static void rdma_drv_control_connect(RdmaDrvData *data, char *buf, ei_x_buff *x) { 928 | int ret; 929 | 930 | /* Parse the connection options */ 931 | rdma_drv_options_init(&data->options); 932 | if (!rdma_drv_options_parse(&data->options, buf)) { 933 | rdma_drv_encode_error_atom(x, "bad_options"); 934 | return; 935 | } 936 | 937 | /* Resolve the address and port */ 938 | struct addrinfo *addr; 939 | struct addrinfo *hints = NULL; 940 | ret = getaddrinfo(data->options.dest_host, data->options.dest_port, hints, &addr); 941 | if (ret == EAI_SYSTEM) { 942 | rdma_drv_encode_error_posix(x, errno); 943 | return; 944 | } else if (ret) { 945 | rdma_drv_encode_error_atom(x, "getaddrinfo"); 946 | return; 947 | } 948 | 949 | /* Create the event channel */ 950 | data->ec = rdma_create_event_channel(); 951 | if (!data->ec) { 952 | rdma_drv_encode_error_string(x, "rdma_create_event_channel"); 953 | 
return; 954 | } 955 | 956 | /* Make event channel non-blocking channel. */ 957 | fcntl(data->ec->fd, F_SETFL, fcntl(data->ec->fd, F_GETFL) | O_NONBLOCK); 958 | 959 | /* Create the "socket" */ 960 | ret = rdma_create_id(data->ec, &data->id, NULL, RDMA_PS_TCP); 961 | if (ret) { 962 | rdma_drv_encode_error_posix(x, errno); 963 | return; 964 | } 965 | 966 | /* Start address resolution */ 967 | struct sockaddr_in src_addr = {}; 968 | src_addr.sin_family = AF_INET; 969 | src_addr.sin_port = htons(data->options.port); 970 | 971 | if (data->options.ip[0] && !inet_pton(AF_INET, data->options.ip, &src_addr.sin_addr)) { 972 | rdma_drv_encode_error_string(x, "inet_pton"); 973 | return; 974 | } 975 | 976 | ret = rdma_resolve_addr(data->id, (struct sockaddr *) &src_addr, addr->ai_addr, data->options.timeout / 4); 977 | if (ret) { 978 | rdma_drv_encode_error_posix(x, errno); 979 | return; 980 | } 981 | 982 | freeaddrinfo(addr); 983 | 984 | /* Start polling for events. */ 985 | rdma_drv_resume(data); 986 | 987 | ei_x_encode_atom(x, "ok"); 988 | } 989 | 990 | static void rdma_drv_control_listen(RdmaDrvData *data, char *buf, ei_x_buff *x) { 991 | int ret; 992 | 993 | /* Parse the connection options */ 994 | rdma_drv_options_init(&data->options); 995 | if (!rdma_drv_options_parse(&data->options, buf)) { 996 | rdma_drv_encode_error_atom(x, "bad_options"); 997 | return; 998 | } 999 | 1000 | /* Create the event channel */ 1001 | data->ec = rdma_create_event_channel(); 1002 | if (!data->ec) { 1003 | rdma_drv_encode_error_string(x, "rdma_create_event_channel"); 1004 | return; 1005 | } 1006 | 1007 | /* Make event channel non-blocking channel. 
*/ 1008 | fcntl(data->ec->fd, F_SETFL, fcntl(data->ec->fd, F_GETFL) | O_NONBLOCK); 1009 | 1010 | /* Create the "socket" */ 1011 | ret = rdma_create_id(data->ec, &data->id, NULL, RDMA_PS_TCP); 1012 | if (ret) { 1013 | rdma_drv_encode_error_posix(x, errno); 1014 | return; 1015 | } 1016 | 1017 | /* Bind to a specified port */ 1018 | struct sockaddr_in addr = {}; 1019 | addr.sin_family = AF_INET; 1020 | addr.sin_port = htons(data->options.port); 1021 | 1022 | if (data->options.ip[0] && !inet_pton(AF_INET, data->options.ip, &addr.sin_addr)) { 1023 | rdma_drv_encode_error_string(x, "inet_pton"); 1024 | return; 1025 | } 1026 | 1027 | ret = rdma_bind_addr(data->id, (struct sockaddr *) &addr); 1028 | if (ret) { 1029 | rdma_drv_encode_error_posix(x, errno); 1030 | return; 1031 | } 1032 | 1033 | /* Start listening for incoming connections */ 1034 | ret = rdma_listen(data->id, data->options.backlog); 1035 | if (ret) { 1036 | rdma_drv_encode_error_posix(x, errno); 1037 | return; 1038 | } 1039 | 1040 | data->state = STATE_LISTENING; 1041 | ei_x_encode_atom(x, "ok"); 1042 | } 1043 | 1044 | static void rdma_drv_control_accept(RdmaDrvData *data, ei_x_buff *x) { 1045 | if (data->action == ACTION_ACCEPTING && driver_caller(data->port) != data->caller) { 1046 | rdma_drv_encode_error_atom(x, "already_accepting"); 1047 | return; 1048 | } 1049 | 1050 | if (data->state != STATE_LISTENING) { 1051 | rdma_drv_encode_error_atom(x, "not_listening"); 1052 | return; 1053 | } 1054 | 1055 | data->action = ACTION_ACCEPTING; 1056 | 1057 | /* Start polling. 
*/ 1058 | rdma_drv_resume(data); 1059 | 1060 | ei_x_encode_atom(x, "ok"); 1061 | } 1062 | 1063 | static void rdma_drv_control_peername(RdmaDrvData *data, ei_x_buff *x) { 1064 | struct sockaddr_in *addr = (struct sockaddr_in *) rdma_get_peer_addr(data->id); 1065 | 1066 | ei_x_encode_tuple_header(x, 2); 1067 | ei_x_encode_atom(x, "ok"); 1068 | ei_x_encode_tuple_header(x, 2); 1069 | ei_x_encode_string(x, inet_ntoa(addr->sin_addr)); 1070 | ei_x_encode_ulong(x, ntohs(rdma_get_dst_port(data->id))); 1071 | } 1072 | 1073 | static void rdma_drv_control_sockname(RdmaDrvData *data, ei_x_buff *x) { 1074 | struct sockaddr_in *addr = (struct sockaddr_in *) rdma_get_local_addr(data->id); 1075 | 1076 | ei_x_encode_tuple_header(x, 2); 1077 | ei_x_encode_atom(x, "ok"); 1078 | ei_x_encode_tuple_header(x, 2); 1079 | ei_x_encode_string(x, inet_ntoa(addr->sin_addr)); 1080 | ei_x_encode_ulong(x, ntohs(rdma_get_src_port(data->id))); 1081 | } 1082 | 1083 | static void rdma_drv_control_recv(RdmaDrvData *data, ei_x_buff *x) { 1084 | if (data->options.active) { 1085 | if (driver_caller(data->port) != driver_connected(data->port)) { 1086 | rdma_drv_encode_error_atom(x, "not_owner"); 1087 | return; 1088 | } 1089 | } else { 1090 | if (data->action == ACTION_RECEIVING) { 1091 | rdma_drv_encode_error_atom(x, "already_receiving"); 1092 | return; 1093 | } 1094 | 1095 | if (data->state == STATE_DISCONNECTED) { 1096 | rdma_drv_encode_error_atom(x, "closed"); 1097 | return; 1098 | } 1099 | 1100 | if (data->state != STATE_CONNECTED) { 1101 | rdma_drv_encode_error_atom(x, "not_connected"); 1102 | return; 1103 | } 1104 | 1105 | /* 1106 | * Caller needs to be set in case rdma_drv_flush_cq needs to 1107 | * output anytihng. 1108 | */ 1109 | data->caller = driver_caller(data->port); 1110 | if (!rdma_drv_flush_cq(data)) { 1111 | /* Flushing did not complete a receive, so start polling. 
*/ 1112 | data->action = ACTION_RECEIVING; 1113 | rdma_drv_resume(data); 1114 | } 1115 | } 1116 | 1117 | ei_x_encode_atom(x, "ok"); 1118 | } 1119 | 1120 | static void rdma_drv_control_disconnect(RdmaDrvData *data, ei_x_buff *x) { 1121 | int ret; 1122 | 1123 | /* 1124 | * We only need to initiate a proper disconnection for fully 1125 | * connected sockets. Listeners or other sockets in the process 1126 | * of disconnecting don't need to do anything. 1127 | */ 1128 | 1129 | if (data->state == STATE_CONNECTED && data->action != ACTION_DISCONNECTING) { 1130 | /* Start polling for disconnection. */ 1131 | rdma_drv_resume(data); 1132 | 1133 | data->action = ACTION_DISCONNECTING; 1134 | 1135 | ret = rdma_disconnect(data->id); 1136 | if (ret) { 1137 | rdma_drv_encode_error_posix(x, errno); 1138 | return; 1139 | } 1140 | 1141 | ei_x_encode_atom(x, "wait"); 1142 | return; 1143 | } 1144 | 1145 | ei_x_encode_atom(x, "ok"); 1146 | } 1147 | 1148 | static void rdma_drv_control_getstat(RdmaDrvData *data, ei_x_buff *x) { 1149 | ei_x_encode_tuple_header(x, 4); 1150 | ei_x_encode_atom(x, "ok"); 1151 | ei_x_encode_ulong(x, data->received); 1152 | ei_x_encode_ulong(x, data->sent); 1153 | ei_x_encode_ulong(x, data->buffered); 1154 | } 1155 | 1156 | static void rdma_drv_control_setopts(RdmaDrvData *data, char *buf, ei_x_buff *x) { 1157 | /* Parse the connection options */ 1158 | if (!rdma_drv_options_parse(&data->options, buf)) { 1159 | rdma_drv_encode_error_atom(x, "bad_options"); 1160 | return; 1161 | } 1162 | 1163 | if (data->options.active) { 1164 | /* Start polling. */ 1165 | rdma_drv_resume(data); 1166 | } else { 1167 | /* Stop polling. 
*/ 1168 | rdma_drv_resume(data); 1169 | } 1170 | 1171 | ei_x_encode_atom(x, "ok"); 1172 | } 1173 | 1174 | static void rdma_drv_control_cancel(RdmaDrvData *data, ei_x_buff *x) { 1175 | if (data->action != ACTION_NONE) { 1176 | rdma_drv_send_error_atom(data, "canceled"); 1177 | rdma_drv_pause(data); 1178 | } 1179 | 1180 | ei_x_encode_atom(x, "ok"); 1181 | } 1182 | 1183 | static void rdma_drv_control_timeout(RdmaDrvData *data, ei_x_buff *x) { 1184 | /* Basically the same as cancel, without sending error. */ 1185 | 1186 | if (data->action != ACTION_NONE) { 1187 | rdma_drv_pause(data); 1188 | } 1189 | 1190 | ei_x_encode_atom(x, "ok"); 1191 | } 1192 | 1193 | static ErlDrvBinary * ei_x_to_new_binary(ei_x_buff *x) { 1194 | ErlDrvBinary *bin = driver_alloc_binary(x->index); 1195 | 1196 | if (bin != NULL) 1197 | memcpy(&bin->orig_bytes[0], x->buff, x->index); 1198 | 1199 | return bin; 1200 | } 1201 | 1202 | static ErlDrvSSizeT rdma_drv_control(ErlDrvData drv_data, unsigned int command, char *buf, ErlDrvSizeT len, char **rbuf, ErlDrvSizeT rlen) { 1203 | RdmaDrvData *data = (RdmaDrvData *) drv_data; 1204 | 1205 | ei_x_buff x; 1206 | ei_x_new_with_version(&x); 1207 | 1208 | switch (command) { 1209 | case DRV_CONNECT: 1210 | rdma_drv_control_connect(data, buf, &x); 1211 | break; 1212 | 1213 | case DRV_LISTEN: 1214 | rdma_drv_control_listen(data, buf, &x); 1215 | break; 1216 | 1217 | case DRV_ACCEPT: 1218 | rdma_drv_control_accept(data, &x); 1219 | break; 1220 | 1221 | case DRV_PEERNAME: 1222 | rdma_drv_control_peername(data, &x); 1223 | break; 1224 | 1225 | case DRV_SOCKNAME: 1226 | rdma_drv_control_sockname(data, &x); 1227 | break; 1228 | 1229 | case DRV_RECV: 1230 | rdma_drv_control_recv(data, &x); 1231 | break; 1232 | 1233 | case DRV_DISCONNECT: 1234 | rdma_drv_control_disconnect(data, &x); 1235 | break; 1236 | 1237 | case DRV_GETSTAT: 1238 | rdma_drv_control_getstat(data, &x); 1239 | break; 1240 | 1241 | case DRV_SETOPTS: 1242 | rdma_drv_control_setopts(data, buf, &x); 1243 
| break; 1244 | 1245 | case DRV_CANCEL: 1246 | rdma_drv_control_cancel(data, &x); 1247 | break; 1248 | 1249 | case DRV_TIMEOUT: 1250 | rdma_drv_control_timeout(data, &x); 1251 | break; 1252 | 1253 | default: 1254 | return -1; 1255 | } 1256 | 1257 | *rbuf = (char *) ei_x_to_new_binary(&x); 1258 | ei_x_free(&x); 1259 | 1260 | return 0; 1261 | } 1262 | 1263 | static ErlDrvEntry rdma_drv_entry = { 1264 | NULL, /* init, N/A */ 1265 | rdma_drv_start, /* start, called when port is opened */ 1266 | rdma_drv_stop, /* stop, called when port is closed */ 1267 | rdma_drv_output, /* output, called when erlang has sent */ 1268 | rdma_drv_ready_input, /* ready_input, called when input descriptor ready */ 1269 | NULL, /* ready_output, called when output descriptor ready */ 1270 | "rdma_drv", /* char *driver_name, the argument to open_port */ 1271 | NULL, /* finish, called when unloaded */ 1272 | NULL, /* void * that is not used (BC) */ 1273 | rdma_drv_control, /* control, port_control callback */ 1274 | NULL, /* timeout, called on timeouts */ 1275 | NULL, /* outputv, vector output interface */ 1276 | NULL, /* ready_async callback */ 1277 | NULL, /* flush callback */ 1278 | NULL, /* call callback */ 1279 | NULL, /* event callback */ 1280 | ERL_DRV_EXTENDED_MARKER, /* Extended driver interface marker */ 1281 | ERL_DRV_EXTENDED_MAJOR_VERSION, /* Major version number */ 1282 | ERL_DRV_EXTENDED_MINOR_VERSION, /* Minor version number */ 1283 | ERL_DRV_FLAG_SOFT_BUSY | /* Driver flags. 
Soft busy flag is required for distribution drivers */ 1284 | ERL_DRV_FLAG_USE_PORT_LOCKING, 1285 | NULL, /* Reserved for internal use */ 1286 | NULL, /* process_exit callback */ 1287 | NULL /* stop_select callback */ 1288 | }; 1289 | 1290 | DRIVER_INIT(rdma_drv) { 1291 | return &rdma_drv_entry; 1292 | } 1293 | -------------------------------------------------------------------------------- /c_src/rdma_drv_buffers.c: -------------------------------------------------------------------------------- 1 | /* 2 | * rdma_drv_buffers.c 3 | * Copyright (C) 2013 James Lee 4 | * 5 | * The contents of this file are subject to the Erlang Public License, 6 | * Version 1.1, (the "License"); you may not use this file except in 7 | * compliance with the License. You should have received a copy of the 8 | * Erlang Public License along with this software. If not, it can be 9 | * retrieved online at http://www.erlang.org/. 10 | * 11 | * Software distributed under the License is distributed on an "AS IS" 12 | * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 13 | * the License for the specific language governing rights and limitations 14 | * under the License. 15 | */ 16 | 17 | #include 18 | 19 | #include "rdma_drv_buffers.h" 20 | 21 | bool rdma_drv_buffers_init(RdmaDrvBuffers *buffers, int buffer_size, int num_buffers) { 22 | buffers->buffer_size = buffer_size; 23 | buffers->num_buffers = num_buffers; 24 | buffers->buffers = driver_alloc(num_buffers * buffer_size); 25 | buffers->i = 0; 26 | buffers->free_i = 0; 27 | buffers->busy_i = -1; 28 | 29 | return buffers->buffers != NULL; 30 | } 31 | 32 | void rdma_drv_buffers_free(RdmaDrvBuffers *buffers) { 33 | if (buffers->buffers) { 34 | driver_free(buffers->buffers); 35 | buffers->buffers = NULL; 36 | } 37 | } 38 | 39 | void * rdma_drv_buffers_reserve_buffer(RdmaDrvBuffers *buffers) { 40 | if (buffers->free_i == buffers->busy_i) { 41 | /* All buffers are busy. 
*/ 42 | return NULL; 43 | } 44 | 45 | if (buffers->busy_i == -1) { 46 | /* 47 | * No buffers were busy, but we're reserving one now, so set 48 | * the first busy buffer 49 | */ 50 | buffers->busy_i = buffers->free_i; 51 | } 52 | 53 | void *buffer = buffers->buffers + buffers->free_i * buffers->buffer_size; 54 | buffers->free_i = (buffers->free_i + 1) % buffers->num_buffers; 55 | 56 | return buffer; 57 | } 58 | 59 | void rdma_drv_buffers_release_buffer(RdmaDrvBuffers *buffers) { 60 | /* 61 | * Buffers are freed in the same order that they're reserved, so 62 | * all we have to do is mark the next buffer as the start of the 63 | * busy buffers. 64 | */ 65 | 66 | if (++buffers->busy_i == buffers->free_i) { 67 | /* We've caught up to the first free buffer. */ 68 | buffers->busy_i = -1; 69 | } 70 | } 71 | 72 | void * rdma_drv_buffers_current_buffer(RdmaDrvBuffers *buffers) { 73 | void *buffer = buffers->buffers + buffers->i * buffers->buffer_size; 74 | buffers->i = (buffers->i + 1) % buffers->num_buffers; 75 | 76 | return buffer; 77 | } 78 | -------------------------------------------------------------------------------- /c_src/rdma_drv_buffers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * rdma_drv_buffers.h 3 | * Copyright (C) 2013 James Lee 4 | * 5 | * The contents of this file are subject to the Erlang Public License, 6 | * Version 1.1, (the "License"); you may not use this file except in 7 | * compliance with the License. You should have received a copy of the 8 | * Erlang Public License along with this software. If not, it can be 9 | * retrieved online at http://www.erlang.org/. 10 | * 11 | * Software distributed under the License is distributed on an "AS IS" 12 | * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 13 | * the License for the specific language governing rights and limitations 14 | * under the License. 
15 | */ 16 | 17 | #ifndef RDMA_DRV_BUFFERS_H 18 | #define RDMA_DRV_BUFFERS_H 19 | 20 | #include 21 | 22 | typedef struct { 23 | void *buffers; 24 | int buffer_size; 25 | int num_buffers; 26 | int i; 27 | int free_i; 28 | int busy_i; 29 | } RdmaDrvBuffers; 30 | 31 | bool rdma_drv_buffers_init(RdmaDrvBuffers *buffers, int buffer_size, int num_buffers); 32 | void rdma_drv_buffers_free(RdmaDrvBuffers *buffers); 33 | void * rdma_drv_buffers_reserve_buffer(RdmaDrvBuffers *buffers); 34 | void rdma_drv_buffers_release_buffer(RdmaDrvBuffers *buffers); 35 | void * rdma_drv_buffers_current_buffer(RdmaDrvBuffers *buffers); 36 | 37 | #endif /* RDMA_DRV_BUFFERS_H */ 38 | -------------------------------------------------------------------------------- /c_src/rdma_drv_options.c: -------------------------------------------------------------------------------- 1 | /* 2 | * rdma_drv_options.c 3 | * Copyright (C) 2013 James Lee 4 | * 5 | * The contents of this file are subject to the Erlang Public License, 6 | * Version 1.1, (the "License"); you may not use this file except in 7 | * compliance with the License. You should have received a copy of the 8 | * Erlang Public License along with this software. If not, it can be 9 | * retrieved online at http://www.erlang.org/. 10 | * 11 | * Software distributed under the License is distributed on an "AS IS" 12 | * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 13 | * the License for the specific language governing rights and limitations 14 | * under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | 20 | #include "rdma_drv_options.h" 21 | 22 | void rdma_drv_options_init(RdmaDrvOptions *options) { 23 | options->binary = false; 24 | options->active = false; 25 | options->packet = 0; 26 | options->backlog = 5; 27 | options->port = 0; 28 | options->buffer_size = 1024; 29 | options->num_buffers = 100; 30 | } 31 | 32 | static bool rdma_drv_options_parse_tuple(RdmaDrvOptions *options, char *buf, int *index) { 33 | int arity = 0; 34 | 35 | if (ei_decode_tuple_header(buf, index, &arity) == 0) { 36 | char atom[MAXATOMLEN]; 37 | 38 | if (ei_decode_atom(buf, index, atom) == 0) { 39 | if (strcmp(atom, "list") == 0) { 40 | int list = 0; 41 | if (ei_decode_boolean(buf, index, &list) == 0) { 42 | options->binary = !((bool) list); 43 | } else { 44 | return false; 45 | } 46 | } else if (strcmp(atom, "binary") == 0) { 47 | int binary = 0; 48 | if (ei_decode_boolean(buf, index, &binary) == 0) { 49 | options->binary = (bool) binary; 50 | } else { 51 | return false; 52 | } 53 | } else if (strcmp(atom, "active") == 0) { 54 | int active = 0; 55 | if (ei_decode_boolean(buf, index, &active) == 0) { 56 | options->active = (bool) active; 57 | } else { 58 | return false; 59 | } 60 | } else if (strcmp(atom, "packet") == 0) { 61 | long packet = 0; 62 | if (ei_decode_long(buf, index, &packet) == 0) { 63 | options->packet = (int) packet; 64 | } else { 65 | return false; 66 | } 67 | } else if (strcmp(atom, "backlog") == 0) { 68 | long backlog = 0; 69 | if (ei_decode_long(buf, index, &backlog) == 0) { 70 | options->backlog = (int) backlog; 71 | } else { 72 | return false; 73 | } 74 | } else if (strcmp(atom, "port") == 0) { 75 | long port = 0; 76 | if (ei_decode_long(buf, index, &port) == 0) { 77 | options->port = (short) port; 78 | } else { 79 | return false; 80 | } 81 | } else if (strcmp(atom, "ip") == 0) { 82 | if (ei_decode_string(buf, index, options->ip) != 0) { 83 | return false; 84 | } 85 | } else if (strcmp(atom, "buffer_size") == 0) { 86 | 
long buffer_size = 0; 87 | if (ei_decode_long(buf, index, &buffer_size) == 0) { 88 | options->buffer_size = (int) buffer_size; 89 | } else { 90 | return false; 91 | } 92 | } else if (strcmp(atom, "num_buffers") == 0) { 93 | long num_buffers = 0; 94 | if (ei_decode_long(buf, index, &num_buffers) == 0) { 95 | options->num_buffers = (int) num_buffers; 96 | } else { 97 | return false; 98 | } 99 | } else if (strcmp(atom, "dest_host") == 0) { 100 | if (ei_decode_string(buf, index, options->dest_host) != 0) { 101 | return false; 102 | } 103 | } else if (strcmp(atom, "dest_port") == 0) { 104 | if (ei_decode_string(buf, index, options->dest_port) != 0) { 105 | return false; 106 | } 107 | } else if (strcmp(atom, "timeout") == 0) { 108 | long timeout = 0; 109 | if (ei_decode_long(buf, index, &timeout) == 0) { 110 | options->timeout = (int) timeout; 111 | } else { 112 | return false; 113 | } 114 | } else { 115 | return false; 116 | } 117 | } else { 118 | return false; 119 | } 120 | } else { 121 | return false; 122 | } 123 | 124 | return true; 125 | } 126 | 127 | bool rdma_drv_options_parse(RdmaDrvOptions *options, char *buf) { 128 | int index = 0; 129 | int version = 0; 130 | int arity = 0; 131 | 132 | if (ei_decode_version(buf, &index, &version) != 0) { 133 | return false; 134 | } 135 | 136 | if (ei_decode_list_header(buf, &index, &arity) == 0) { 137 | int i; 138 | 139 | for (i = 0; i < arity; i++) { 140 | int type = 0; 141 | int size = 0; 142 | 143 | if (ei_get_type(buf, &index, &type, &size) == 0) { 144 | switch (type) { 145 | case ERL_SMALL_TUPLE_EXT: 146 | if (!rdma_drv_options_parse_tuple(options, buf, &index)) { 147 | return false; 148 | } 149 | break; 150 | 151 | default: 152 | return false; 153 | } 154 | } else { 155 | return false; 156 | } 157 | } 158 | } else { 159 | return false; 160 | } 161 | 162 | return true; 163 | } 164 | -------------------------------------------------------------------------------- /c_src/rdma_drv_options.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * rdma_drv_options.h 3 | * Copyright (C) 2013 James Lee 4 | * 5 | * The contents of this file are subject to the Erlang Public License, 6 | * Version 1.1, (the "License"); you may not use this file except in 7 | * compliance with the License. You should have received a copy of the 8 | * Erlang Public License along with this software. If not, it can be 9 | * retrieved online at http://www.erlang.org/. 10 | * 11 | * Software distributed under the License is distributed on an "AS IS" 12 | * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 13 | * the License for the specific language governing rights and limitations 14 | * under the License. 15 | */ 16 | 17 | #ifndef RDMA_DRV_OPTIONS_H 18 | #define RDMA_DRV_OPTIONS_H 19 | 20 | #include 21 | 22 | typedef struct { 23 | bool binary; 24 | bool active; 25 | int packet; 26 | int backlog; 27 | short port; 28 | char ip[INET6_ADDRSTRLEN]; 29 | int buffer_size; 30 | int num_buffers; 31 | char dest_host[NI_MAXHOST]; 32 | char dest_port[NI_MAXSERV]; 33 | int timeout; 34 | } RdmaDrvOptions; 35 | 36 | void rdma_drv_options_init(RdmaDrvOptions *options); 37 | bool rdma_drv_options_parse(RdmaDrvOptions *options, char *buf); 38 | 39 | #endif /* RDMA_DRV_OPTIONS_H */ 40 | -------------------------------------------------------------------------------- /cover.spec: -------------------------------------------------------------------------------- 1 | {incl_app, rdma_dist, details}. 
2 | -------------------------------------------------------------------------------- /rebar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/puppetjoy/rdma_dist/c9268bc7c2acb1bb35f57aefb7840e587344a423/rebar -------------------------------------------------------------------------------- /rebar.config: -------------------------------------------------------------------------------- 1 | {port_specs, [{"", "priv/rdma_drv.so", [ 2 | "c_src/rdma_drv.c", 3 | "c_src/rdma_drv_buffers.c", 4 | "c_src/rdma_drv_options.c" 5 | ], [{env, [ 6 | {"LDFLAGS", "$LDFLAGS -lrdmacm -libverbs"} 7 | ]}]}]}. 8 | 9 | {cover_enabled, true}. 10 | {ct_log_dir, "test/logs"}. 11 | {ct_extra_params, "+K true -proto_dist rdma"}. 12 | -------------------------------------------------------------------------------- /src/rdma.erl: -------------------------------------------------------------------------------- 1 | %%% 2 | %%% rdma.erl 3 | %%% Copyright (C) 2013 James Lee 4 | %%% 5 | %%% The contents of this file are subject to the Erlang Public License, 6 | %%% Version 1.1, (the "License"); you may not use this file except in 7 | %%% compliance with the License. You should have received a copy of the 8 | %%% Erlang Public License along with this software. If not, it can be 9 | %%% retrieved online at http://www.erlang.org/. 10 | %%% 11 | %%% Software distributed under the License is distributed on an "AS IS" 12 | %%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 13 | %%% the License for the specific language governing rights and limitations 14 | %%% under the License. 15 | %%% 16 | 17 | -module(rdma). 18 | -author("James Lee "). 19 | 20 | -export([connect/2, connect/3, connect/4, listen/1, listen/2, accept/1, accept/2, peername/1, sockname/1, send/2, recv/1, recv/2, close/1, controlling_process/2, tick/1, getstat/1, setopts/2, cancel/1]). 21 | 22 | -define(DRV_CONNECT, $C). 23 | -define(DRV_LISTEN, $L). 
-define(DRV_ACCEPT, $A).
-define(DRV_PEERNAME, $P).
-define(DRV_SOCKNAME, $S).
-define(DRV_RECV, $R).
-define(DRV_DISCONNECT, $D).
-define(DRV_GETSTAT, $G).
-define(DRV_SETOPTS, $O).
-define(DRV_CANCEL, $c).
-define(DRV_TIMEOUT, $T).

%% Exits the caller with rdma_server_not_started unless a process is
%% registered as rdma_server.
-define(check_server(), case whereis(rdma_server) of
    undefined ->
        exit(rdma_server_not_started);
    _ ->
        ok
end).


%%
%% API
%%

%% connect/2,3 fill in an empty option list and a 500 ms timeout.
connect(Host, PortNumber) ->
    connect(Host, PortNumber, []).

connect(Host, PortNumber, Options) ->
    connect(Host, PortNumber, Options, 500).

%% Opens a driver port and asks it to connect to Host:PortNumber.
%% Returns {ok, Socket} once the driver reports `established`, or
%% {error, Reason} (with the port closed again) on a driver error or when
%% nothing arrives within Timeout.
connect(Host, PortNumber, Options, Timeout)->
    ?check_server(),
    Socket = open_driver_port(Options),
    DestHost = printable_host(Host),
    Request = [{dest_host, DestHost},
               {dest_port, integer_to_list(PortNumber)},
               {timeout, Timeout}
               | prepare_options_list(Options)],
    case control(Socket, ?DRV_CONNECT, term_to_binary(Request)) of
        {error, Reason} ->
            close(Socket),
            {error, Reason};
        ok ->
            await_established(Socket, Timeout)
    end.

%% listen/1 uses an empty option list.
listen(PortNumber) ->
    listen(PortNumber, []).

%% Opens a driver port and puts it in listening mode on PortNumber.
listen(PortNumber, Options) ->
    ?check_server(),
    Socket = open_driver_port(Options),
    Request = term_to_binary([{port, PortNumber} | prepare_options_list(Options)]),
    case control(Socket, ?DRV_LISTEN, Request) of
        {error, Reason} ->
            close(Socket),
            {error, Reason};
        ok ->
            {ok, Socket}
    end.

%% accept/1 waits without a time limit.
accept(Socket) ->
    accept(Socket, infinity).

%% Only the `packet` and `binary` entries of Options are port settings;
%% everything else goes to the driver through control/3.
open_driver_port(Options) ->
    PortSettings = proplists:compact(filter_proplist(Options, [packet, binary])),
    open_port({spawn, "rdma_drv"}, PortSettings).

%% An address tuple becomes its textual form; anything inet:ntoa/1 rejects
%% (for example a host name) is passed on unchanged.
printable_host(Host) ->
    case inet:ntoa(Host) of
        {error, einval} ->
            Host;
        Address ->
            Address
    end.

%% Waits for the driver's verdict on a connect request.
await_established(Socket, Timeout) ->
    receive
        {Socket, established} ->
            {ok, Socket};
        {Socket, {error, Reason}} ->
            close(Socket),
            {error, Reason}
    after Timeout ->
        close(Socket),
        {error, timeout}
    end.

%% Waits up to Timeout for an incoming connection on the listener Socket.
%% Returns {ok, ClientPort}, {error, Reason}, or {error, closed} when
%% Socket is no longer an open port.
%%
%% NOTE(review): a failed incoming connection restarts the wait with the
%% full Timeout, and a timeout after the client port was announced does not
%% close that port here -- confirm the driver cleans it up on DRV_TIMEOUT.
accept(Socket, Timeout) ->
    try control(Socket, ?DRV_ACCEPT, []) of
        ok ->
            await_client_port(Socket, Timeout);
        {error, Reason} ->
            {error, Reason}
    catch
        error:badarg ->
            {error, closed}
    end.

%% First stage of accept/2: the driver announces the port it created for
%% the client.
await_client_port(Socket, Timeout) ->
    receive
        {Socket, {port, ClientPort}} ->
            await_accepted(Socket, ClientPort, Timeout);
        {Socket, {error, Reason}} ->
            {error, Reason}
    after Timeout ->
        timeout(Socket),
        {error, timeout}
    end.

%% Second stage of accept/2: the announced client port is confirmed, or
%% the attempt failed and we go back to accepting.
await_accepted(Socket, ClientPort, Timeout) ->
    receive
        {Socket, {accept, ClientPort}} ->
            {ok, ClientPort};
        {Socket, {error, _Reason}} ->
            close(ClientPort),
            accept(Socket, Timeout)
    after Timeout ->
        timeout(Socket),
        {error, timeout}
    end.

%% {ok, {IPAddress, PortNumber}} of the remote end, or {error, Reason}.
peername(Socket) ->
    address_of(Socket, ?DRV_PEERNAME).

%% {ok, {IPAddress, PortNumber}} of the local end, or {error, Reason}.
sockname(Socket) ->
    address_of(Socket, ?DRV_SOCKNAME).

%% Shared body of peername/1 and sockname/1.  A driver-side {error, Reason}
%% is handed back to the caller, as recv/2 and setopts/2 do, instead of
%% crashing with case_clause.
address_of(Socket, Command) ->
    try control(Socket, Command, []) of
        {ok, {Address, PortNumber}} ->
            case inet:parse_address(Address) of
                {ok, IPAddress} ->
                    {ok, {IPAddress, PortNumber}};
                Error ->
                    Error
            end;
        {error, Reason} ->
            {error, Reason}
    catch
        error:badarg ->
            {error, closed}
    end.

send(Socket, Data) ->
    send(Socket, Data, []).

%% Hands Data to the driver.  An error the driver has already reported for
%% this socket is picked up without waiting; otherwise the result is ok.
send(Socket, Data, Options) ->
    try port_command(Socket, Data, Options) of
        true ->
            receive
                {Socket, {error, Reason}} ->
                    close(Socket),
                    {error, Reason}
            after 0 ->
                ok
            end
    catch
        error:badarg ->
            {error, closed}
    end.

%% recv/1 waits without a time limit.
recv(Socket) ->
    recv(Socket, infinity).

%% Asks the driver for the next packet and waits up to Timeout for it.
recv(Socket, Timeout) ->
    try control(Socket, ?DRV_RECV, []) of
        ok ->
            receive
                {Socket, {data, Data}} ->
                    {ok, Data};
                {Socket, {error, Reason}} ->
                    close(Socket),
                    {error, Reason}
            after Timeout ->
                timeout(Socket),
                {error, timeout}
            end;
        {error, Reason} ->
            % Reasons like 'not_connected'...so don't close socket.
            {error, Reason}
    catch
        error:badarg ->
            {error, closed}
    end.

%% Disconnects and closes the port.  Closing an already closed socket is ok.
%%
%% NOTE(review): when the driver answers `wait` this blocks until it sends
%% `disconnected` or an error; there is no timeout on that wait.
close(Socket) ->
    try control(Socket, ?DRV_DISCONNECT, []) of
        wait ->
            receive
                {Socket, disconnected} ->
                    close_port(Socket);
                {Socket, {error, Reason}} ->
                    close_port(Socket),
                    {error, Reason}
            end;
        ok ->
            close_port(Socket);
        {error, Reason} ->
            close_port(Socket),
            {error, Reason}
    catch
        error:badarg ->
            ok
    end.

%% Makes Pid the owner of Socket and removes the caller's link to it.
controlling_process(Socket, Pid) ->
    try port_connect(Socket, Pid) of
        true ->
            catch unlink(Socket),
            ok
    catch
        error:badarg ->
            {error, closed}
    end.

%% Sends an empty packet with the `force` option.
tick(Socket) ->
    send(Socket, [], [force]).

%% Returns the driver's {ok, R, S, Q} statistics tuple.  A driver-side
%% {error, Reason} is handed back like setopts/2 does, not turned into a
%% case_clause crash.
getstat(Socket) ->
    try control(Socket, ?DRV_GETSTAT, []) of
        {ok, R, S, Q} ->
            {ok, R, S, Q};
        {error, Reason} ->
            {error, Reason}
    catch
        error:badarg ->
            {error, closed}
    end.

%% Applies Options (bare atoms count as {Atom, true}) to an open socket.
setopts(Socket, Options) ->
    try control(Socket, ?DRV_SETOPTS, term_to_binary(prepare_options_list(Options))) of
        ok ->
            ok;
        {error, Reason} ->
            {error, Reason}
    catch
        error:badarg ->
            {error, closed}
    end.

%% Sends the driver's cancel command for Socket.
cancel(Socket) ->
    try control(Socket, ?DRV_CANCEL, []) of
        ok ->
            ok;
        {error, Reason} ->
            {error, Reason}
    catch
        error:badarg ->
            {error, closed}
    end.

% XXX: Create a flush operation.


%%
%% Private Functions
%%

%% Sends Command to the driver and decodes its term-encoded reply.
control(Port, Command, Args) ->
    binary_to_term(port_control(Port, Command, Args)).

%% Normalises an option list for the driver: bare atoms such as 'active'
%% become '{active, true}', and an `ip` option ends up as a string at the
%% head of the list.
prepare_options_list(Options) ->
    NewOptions = proplists:unfold(Options),
    case proplists:get_value(ip, NewOptions) of
        undefined ->
            NewOptions;
        IpAddress ->
            [{ip, ip_to_string(IpAddress)} | proplists:delete(ip, NewOptions)]
    end.

%% Address tuples are rendered with inet:ntoa/1.  A value it rejects (for
%% instance an address that is already a string) is passed on as given,
%% rather than sending '{ip, {error, einval}}' to the driver.
ip_to_string(IpAddress) ->
    case inet:ntoa(IpAddress) of
        {error, einval} ->
            IpAddress;
        String ->
            String
    end.

%% Closes the port; an already closed port is not an error.
close_port(Socket) ->
    try erlang:port_close(Socket) of
        true ->
            ok
    catch
        error:badarg ->
            % Socket is already closed.
            ok
    end.

%% Tells the driver that the pending operation on Socket timed out.
timeout(Socket) ->
    try control(Socket, ?DRV_TIMEOUT, []) of
        ok ->
            ok;
        {error, Reason} ->
            {error, Reason}
    catch
        error:badarg ->
            {error, closed}
    end.

%% All entries of Proplist stored under the keys in Keylist, grouped in
%% Keylist order.
filter_proplist(Proplist, Keylist) ->
    [Entry || Key <- Keylist, Entry <- proplists:lookup_all(Key, Proplist)].

-------------------------------------------------------------------------------- /src/rdma_dist.app.src: -------------------------------------------------------------------------------- 1 | {application, rdma_dist, [ 2 | {description, "RDMA Distribution Driver"}, 3 | {vsn, "1"}, 4 | {registered, [rdma_server]}, 5 | {applications, []}, 6 | {env, []} 7 | ]}. 8 | -------------------------------------------------------------------------------- /src/rdma_dist.erl: -------------------------------------------------------------------------------- 1 | %%% 2 | %%% rdma_dist.erl 3 | %%% Copyright (C) 2013 James Lee 4 | %%% 5 | %%% The contents of this file are subject to the Erlang Public License, 6 | %%% Version 1.1, (the "License"); you may not use this file except in 7 | %%% compliance with the License. You should have received a copy of the 8 | %%% Erlang Public License along with this software. 
If not, it can be 9 | %%% retrieved online at http://www.erlang.org/. 10 | %%% 11 | %%% Software distributed under the License is distributed on an "AS IS" 12 | %%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 13 | %%% the License for the specific language governing rights and limitations 14 | %%% under the License. 15 | %%% 16 | 17 | %%% 18 | %%% Portions of this code are modified from inet6_tcp_dist.erl and 19 | %%% uds_dist.erl in the Erlang/OTP distribution, which is Copyright 20 | %%% (C) Ericsson AB 1997-2013 and used under the terms of the Erlang 21 | %%% Public License, Version 1.1. 22 | %%%

-module(rdma_dist).
-author("James Lee ").

-export([childspecs/0, listen/1, accept/1, accept_connection/5,
         setup/5, close/1, select/1, is_node_name/1]).

%% internal exports
-export([accept_loop/2, do_accept/6, do_setup/6]).

-import(error_logger,[error_msg/2]).

-include_lib("kernel/include/dist.hrl").
-include_lib("kernel/include/dist_util.hrl").
-include_lib("kernel/include/net_address.hrl").

%% Child specification for the rdma_server process.
childspecs() ->
    ServerSpec = {rdma_server,
                  {rdma_server, start_link, []},
                  permanent, 2000, worker,
                  [rdma_server]},
    {ok, [ServerSpec]}.

%% ------------------------------------------------------------
%% Select this protocol based on node name
%% select(Node) => Bool
%%
%% True only for a name of the form name@host whose host part
%% resolves to an IPv4 address.
%% ------------------------------------------------------------
select(Node) ->
    host_resolves(split_node(atom_to_list(Node), $@, [])).

host_resolves([_Name, Host]) ->
    case inet:getaddr(Host, inet) of
        {ok, _} ->
            true;
        _ ->
            false
    end;
host_resolves(_NotNameAtHost) ->
    false.


%% ------------------------------------------------------------
%% Create the listen socket, i.e. the port that this erlang
%% node is accessible through.
%% ------------------------------------------------------------
listen(Name) ->
    register_listener(Name, rdma:listen(0, [{packet, 4}])).

%% Registers the freshly opened listener with epmd and describes it as a
%% #net_address{}; a failed rdma:listen/2 result is returned as is.
register_listener(Name, {ok, Listener}) ->
    {ok, {_, Port} = Address} = rdma:sockname(Listener),
    {ok, Host} = inet:gethostname(),
    {ok, Creation} = erl_epmd:register_node(Name, Port),
    NetAddress = #net_address{
        address = Address,
        host = Host,
        protocol = rdma,
        family = rdma
    },
    {ok, {Listener, NetAddress, Creation}};
register_listener(_Name, Error) ->
    Error.

%% ------------------------------------------------------------
%% Accepts new connection attempts from other Erlang nodes.
%% ------------------------------------------------------------
accept(Listener) ->
    SpawnOptions = [link, {priority, max}],
    spawn_opt(?MODULE, accept_loop, [self(), Listener], SpawnOptions).

%% Accepts connections forever, announcing each one to Kernel and handing
%% the socket to the process Kernel names.  Exits with the error when
%% rdma:accept/1 fails.
accept_loop(Kernel, Listener) ->
    case rdma:accept(Listener) of
        {ok, Socket} ->
            Kernel ! {accept,self(),Socket,rdma,rdma},
            controller(Kernel, Socket),
            accept_loop(Kernel, Listener);
        Error ->
            exit(Error)
    end.

%% Waits for Kernel's answer to an {accept, ...} announcement.
controller(Kernel, Socket) ->
    receive
        {Kernel, unsupported_protocol} ->
            exit(unsupported_protocol);
        {Kernel, controller, ConnectionPid} ->
            ok = rdma:controlling_process(Socket, ConnectionPid),
            ConnectionPid ! {self(), controller}
    end.

%% ------------------------------------------------------------
%% Accepts a new connection attempt from another Erlang node.
%% Performs the handshake with the other side.
%% ------------------------------------------------------------
accept_connection(AcceptPid, Socket, MyNode, Allowed, SetupTime) ->
    Args = [self(), AcceptPid, Socket, MyNode, Allowed, SetupTime],
    spawn_link(?MODULE, do_accept, Args).

%% Runs in the process spawned by accept_connection/5: once the acceptor
%% has handed over the socket, starts the setup timer and performs the
%% handshake for an incoming connection.
do_accept(Kernel, AcceptPid, Socket, MyNode, Allowed, SetupTime) ->
    receive
        {AcceptPid, controller} ->
            Timer = dist_util:start_timer(SetupTime),
            HSData = incoming_hs_data(Kernel, Socket, MyNode, Allowed, Timer),
            dist_util:handshake_other_started(HSData)
    end.

%% Handshake description for a connection the other node initiated.
incoming_hs_data(Kernel, Socket, MyNode, Allowed, Timer) ->
    #hs_data{
        kernel_pid = Kernel,
        this_node = MyNode,
        socket = Socket,
        timer = Timer,
        this_flags = 0,
        allowed = Allowed,
        f_send = fun(Sock, Data) -> rdma:send(Sock, Data) end,
        f_recv = fun(Sock, _Length, Timeout) -> rdma:recv(Sock, Timeout) end,
        f_setopts_pre_nodeup = fun(Sock) -> rdma:setopts(Sock, [{active, false}]) end,
        f_setopts_post_nodeup = fun(Sock) -> rdma:setopts(Sock, [{active, true}]) end,
        f_getll = fun(Sock) -> {ok, Sock} end,
        f_address = fun get_remote_id/2,
        mf_tick = fun rdma:tick/1,
        mf_getstat = fun rdma:getstat/1
    }.

%% #net_address{} of the peer: address from the socket, host name from the
%% part of Node after the '@'.
get_remote_id(Socket, Node) ->
    {ok, PeerAddress} = rdma:peername(Socket),
    [_Name, PeerHost] = split_node(atom_to_list(Node), $@, []),
    #net_address{
        address = PeerAddress,
        host = PeerHost,
        protocol = rdma,
        family = rdma
    }.

%% ------------------------------------------------------------
%% Setup a new connection to another Erlang node.
%% Performs the handshake with the other side.
%% ------------------------------------------------------------
setup(Node, Type, MyNode, LongOrShortNames,SetupTime) ->
    Args = [self(), Node, Type, MyNode, LongOrShortNames, SetupTime],
    spawn_opt(?MODULE, do_setup, Args, [link, {priority, max}]).

%% Runs in the process spawned by setup/5: resolves the host of Node, asks
%% epmd for its port, connects over RDMA and performs the handshake.  Any
%% failing step shuts the attempt down.
do_setup(Kernel, Node, Type, MyNode, LongOrShortNames, SetupTime) ->
    [Name, Address] = splitnode(Node, LongOrShortNames),
    case inet:getaddr(Address, inet) of
        {ok, Ip} ->
            Timer = dist_util:start_timer(SetupTime),
            HSData0 = outgoing_hs_data(Kernel, Node, Type, MyNode, Timer),
            lookup_port(Node, Name, Address, Ip, HSData0);
        _ ->
            ?trace("getaddr~n", []),
            ?shutdown(Node)
    end.

%% The parts of the handshake description that are known before the
%% connection exists.
outgoing_hs_data(Kernel, Node, Type, MyNode, Timer) ->
    #hs_data{
        kernel_pid = Kernel,
        other_node = Node,
        this_node = MyNode,
        timer = Timer,
        this_flags = 0,
        f_send = fun(Sock, Data) -> rdma:send(Sock, Data) end,
        f_recv = fun(Sock, _Length, Timeout) -> rdma:recv(Sock, Timeout) end,
        f_setopts_pre_nodeup = fun(Sock) -> rdma:setopts(Sock, [{active, false}]) end,
        f_setopts_post_nodeup = fun(Sock) -> rdma:setopts(Sock, [{active, true}]) end,
        f_getll = fun(Sock) -> {ok, Sock} end,
        mf_tick = fun rdma:tick/1,
        mf_getstat = fun rdma:getstat/1,
        request_type = Type
    }.

%% Step two: ask epmd on the remote host which port Name listens on.
lookup_port(Node, Name, Address, Ip, HSData0) ->
    case erl_epmd:port_please(Name, Ip) of
        {port, TcpPort, Version} ->
            dist_util:reset_timer(HSData0#hs_data.timer),
            connect_and_handshake(Node, Address, Ip, TcpPort, Version, HSData0);
        _->
            ?trace("port_please~n", []),
            ?shutdown(Node)
    end.

%% Step three: connect and hand the completed description to dist_util.
connect_and_handshake(Node, Address, Ip, TcpPort, Version, HSData0) ->
    case rdma:connect(Ip, TcpPort, [{packet, 4}]) of
        {ok, Socket} ->
            RemoteAddress = #net_address{
                address = {Ip, TcpPort},
                host = Address,
                protocol = rdma,
                family = rdma
            },
            HSData = HSData0#hs_data{
                socket = Socket,
                other_version = Version,
                f_address = fun(_, _) -> RemoteAddress end
            },
            dist_util:handshake_we_started(HSData);
        _ ->
            ?trace("connect~n", []),
            ?shutdown(Node)
    end.

%%
%% Close a socket.
%%
close(Socket) ->
    rdma:close(Socket).

%% If Node is illegal terminate the connection setup!!
%% Returns [Name, Host] for a node name that is acceptable under the
%% current long/short names mode; otherwise logs why and shuts the
%% connection attempt down.
splitnode(Node, LongOrShortNames) ->
    case split_node(atom_to_list(Node), $@, []) of
        [Name | [_ | _] = HostParts] ->
            Host = lists:append(HostParts),
            HostLabels = split_node(Host, $., []),
            check_host(Node, Name, Host, HostLabels, LongOrShortNames);
        [_] ->
            error_logger:error_msg("** Nodename ~p illegal, no '@' character **~n",
                                   [Node]),
            ?shutdown(Node);
        _ ->
            error_logger:error_msg("** Nodename ~p illegal **~n", [Node]),
            ?shutdown(Node)
    end.

%% A dot-less host is only acceptable with long names when it is a strict
%% IPv4 address; a dotted host is not acceptable with short names.
check_host(Node, Name, Host, [_], longnames) ->
    case inet:parse_ipv4strict_address(Host) of
        {ok, _} ->
            [Name, Host];
        _ ->
            error_logger:error_msg("** System running to use "
                                   "fully qualified "
                                   "hostnames **~n"
                                   "** Hostname ~s is illegal **~n",
                                   [Host]),
            ?shutdown(Node)
    end;
check_host(Node, _Name, Host, [_, _ | _], shortnames) ->
    error_logger:error_msg("** System NOT running to use fully qualified "
                           "hostnames **~n"
                           "** Hostname ~s is illegal **~n",
                           [Host]),
    ?shutdown(Node);
check_host(_Node, Name, Host, _HostLabels, _LongOrShortNames) ->
    [Name, Host].

%% Splits a string at every occurrence of Separator.  Acc collects the
%% characters of the current piece in reverse; callers start it as [].
split_node([Separator | Rest], Separator, Acc) ->
    [lists:reverse(Acc) | split_node(Rest, Separator, [])];
split_node([Char | Rest], Separator, Acc) ->
    split_node(Rest, Separator, [Char | Acc]);
split_node([], _Separator, Acc) ->
    [lists:reverse(Acc)].

%% True for an atom containing exactly one '@'.
is_node_name(Node) when is_atom(Node) ->
    case split_node(atom_to_list(Node), $@, []) of
        [_Name, _Host] ->
            true;
        _ ->
            false
    end;
is_node_name(_NotAnAtom) ->
    false.

-------------------------------------------------------------------------------- /src/rdma_server.erl: -------------------------------------------------------------------------------- 1 | %%% 2 | %%% rdma_server.erl 3 | %%% Copyright (C) 2013 James Lee 4 | %%% 5 | %%% The contents of this file are subject to the Erlang Public License, 6 | %%% Version 1.1, (the "License"); you may not use this file except in 7 | %%% compliance with the License. You should have received a copy of the 8 | %%% Erlang Public License along with this software. 
If not, it can be 9 | %%% retrieved online at http://www.erlang.org/. 10 | %%% 11 | %%% Software distributed under the License is distributed on an "AS IS" 12 | %%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 13 | %%% the License for the specific language governing rights and limitations 14 | %%% under the License. 15 | %%% 16 | 17 | %%% 18 | %%% This code is basically 'uds_server.erl' from the Erlang/OTP 19 | %%% distribution which is Copyright (C) Ericsson AB 1997-2013 and used 20 | %%% under the terms of the Erlang Public License, Version 1.1. 21 | %%%

-module(rdma_server).

-behaviour(gen_server).

%% External exports
-export([start_link/0]).

%% gen_server callbacks
-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]).

-define(DRIVER_NAME,"rdma_drv").

%%%----------------------------------------------------------------------
%%% API
%%%----------------------------------------------------------------------

%% Starts the server, registered locally under the module name.
start_link() ->
    ServerName = {local, ?MODULE},
    gen_server:start_link(ServerName, ?MODULE, [], []).

%%%----------------------------------------------------------------------
%%% Callback functions from gen_server
%%%----------------------------------------------------------------------

%%----------------------------------------------------------------------
%% Func: init/1
%% Returns: {ok, State}          |
%%          {ok, State, Timeout} |
%%          ignore               |
%%          {stop, Reason}
%%
%% Loads the driver.  A driver that is already loaded counts as success;
%% any other load result makes the process exit with that result.
%%----------------------------------------------------------------------
init([]) ->
    process_flag(trap_exit,true),
    init_result(load_driver()).

init_result(ok) ->
    {ok, []};
init_result({error, already_loaded}) ->
    {ok, []};
init_result(Error) ->
    exit(Error).


%%----------------------------------------------------------------------
%% Func: handle_call/3
%% Returns: {reply, Reply, State}          |
%%          {reply, Reply, State, Timeout} |
%%          {noreply, State}               |
%%          {noreply, State, Timeout}      |
%%          {stop, Reason, Reply, State}   | (terminate/2 is called)
%%          {stop, Reason, State}            (terminate/2 is called)
%%
%% Every call is answered with ok; the state is left alone.
%%----------------------------------------------------------------------
handle_call(_Request, _From, State) ->
    {reply, ok, State}.

%%----------------------------------------------------------------------
%% Func: handle_cast/2
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%
%% Casts are ignored.
%%----------------------------------------------------------------------
handle_cast(_Cast, State) ->
    {noreply, State}.

%%----------------------------------------------------------------------
%% Func: handle_info/2
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%
%% Other messages are ignored.
%%----------------------------------------------------------------------
handle_info(_Message, State) ->
    {noreply, State}.

%%----------------------------------------------------------------------
%% Func: terminate/2
%% Purpose: Shutdown the server
%% Returns: any (ignored by gen_server)
%%
%% Unloads the driver; the result of the unload is not inspected.
%%----------------------------------------------------------------------
terminate(_Reason, _State) ->
    _ = erl_ddll:unload_driver(?DRIVER_NAME),
    ok.

%%----------------------------------------------------------------------
%% Func: code_change/3
%% Purpose: Convert process state when code is changed
%% Returns: {ok, NewState}
%%----------------------------------------------------------------------
code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%----------------------------------------------------------------------
%%% Internal functions
%%%----------------------------------------------------------------------

%%
%% Actually load the driver.
%%
load_driver() ->
    erl_ddll:load_driver(find_priv_lib(), ?DRIVER_NAME).

%%
%% As this server may be started by the distribution, it is not safe to assume
%% a working code server, neither a working file server.
%% I try to utilize the most primitive interfaces available to determine
%% the directory of the port_program.
%%
%% code:priv_dir/1 reports an unknown application by returning
%% {error, bad_name} rather than by raising, so that result takes the same
%% fallback as a code server that is not running; it must not be passed on
%% as if it were a directory.
%%
find_priv_lib() ->
    try code:priv_dir(rdma_dist) of
        {error, _} ->
            priv_dir_from_load_path();
        Dir ->
            Dir
    catch
        %% Code server probably not started yet
        error:_ ->
            priv_dir_from_load_path();
        exit:_ ->
            priv_dir_from_load_path()
    end.

%% Looks for this module's own object file along the primitive loader's
%% path and derives the priv directory from where it is found.
priv_dir_from_load_path() ->
    {ok, Path} = erl_prim_loader:get_path(),
    ModuleFile = atom_to_list(?MODULE) ++ extension(),
    case search_load_path(Path, ModuleFile) of
        false ->
            exit(rdma_dist_priv_lib_indeterminate);
        PrivDir ->
            PrivDir
    end.

search_load_path([], _ModuleFile) ->
    false;
search_load_path([Dir | Rest], ModuleFile) ->
    %% The file server is probably not started either, so the raw
    %% interface has to be used.
    case file:raw_read_file_info(filename:join([Dir, ModuleFile])) of
        {ok, _} ->
            %% Found our own module: the priv directory is the sibling
            %% "priv" of the directory that holds it.
            Parts = filename:split(Dir),
            filename:join(lists:sublist(Parts, length(Parts) - 1) ++ ["priv"]);
        _ ->
            search_load_path(Rest, ModuleFile)
    end.

%% Object file extension: a dot followed by the machine name in lower
%% case.  Only upper-case ASCII letters are shifted; other characters are
%% kept as they are.
extension() ->
    "." ++ [ascii_lower(Char) || Char <- erlang:system_info(machine)].

ascii_lower(Char) when Char >= $A, Char =< $Z ->
    Char + ($a - $A);
ascii_lower(Char) ->
    Char.
172 | 173 | -------------------------------------------------------------------------------- /test/rdma_SUITE.erl: -------------------------------------------------------------------------------- 1 | %%% 2 | %%% rdma_SUITE.erl 3 | %%% Copyright (C) 2013 James Lee 4 | %%% 5 | %%% The contents of this file are subject to the Erlang Public License, 6 | %%% Version 1.1, (the "License"); you may not use this file except in 7 | %%% compliance with the License. You should have received a copy of the 8 | %%% Erlang Public License along with this software. If not, it can be 9 | %%% retrieved online at http://www.erlang.org/. 10 | %%% 11 | %%% Software distributed under the License is distributed on an "AS IS" 12 | %%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 13 | %%% the License for the specific language governing rights and limitations 14 | %%% under the License. 15 | %%%

-module(rdma_SUITE).
-author("James Lee ").
-compile(export_all).

-include_lib("common_test/include/ct.hrl").

%% Test cases in execution order, grouped by what they exercise.
all() ->
    listen_cases() ++ connect_cases() ++ close_cases() ++ transfer_cases().

listen_cases() ->
    [test_listen_on_random_port,
     test_listen_on_specified_port].

connect_cases() ->
    [test_connect_to_valid_listener,
     test_connect_to_invalid_host,
     test_connect_to_invalid_port,
     test_accept_ignore_failed_connection].

close_cases() ->
    [test_close_client_socket,
     test_close_server_socket,
     test_close_listener,
     test_close_with_buffered_data].

transfer_cases() ->
    [test_send_before_recv,
     test_recv_before_send,
     test_send_recv_binary,
     test_send_recv_zero_byte_buffer,
     test_send_more_than_num_buffers,
     test_send_larger_than_buffer_size,
     test_send_huge,
     test_active_socket].

%% Suite defaults: a timetrap of 5000 for every test case.
suite() ->
    [{timetrap, 5000}].

%% Every case except test_listen_on_random_port gets a listener created
%% with the options that case needs.
init_per_testcase(test_listen_on_random_port, Config) ->
    Config;
init_per_testcase(TestCase, Config) ->
    create_listener(Config, listener_options(TestCase)).

%% Listener options per test case; the default is no options.
listener_options(test_close_with_buffered_data) -> [{num_buffers, 3}];
listener_options(test_send_recv_binary) -> [binary];
listener_options(test_send_more_than_num_buffers) -> [{num_buffers, 3}];
listener_options(test_send_larger_than_buffer_size) -> [{buffer_size, 3}];
listener_options(test_send_huge) -> [{num_buffers, 3}, {buffer_size, 3}];
listener_options(test_active_socket) -> [{active, true}];
listener_options(_TestCase) -> [].

%% Closes the listener that init_per_testcase/2 put into Config.
end_per_testcase(test_listen_on_random_port, _Config) ->
    ok;
end_per_testcase(_TestCase, Config) ->
    Listener = ?config(listener, Config),
    rdma:close(Listener),
    ok.

%%
%% Test Cases
%%

%% Listening on port 0 yields a socket bound to some positive port.
test_listen_on_random_port(_Config) ->
    {ok, LSock} = rdma:listen(0),
    {ok, {_Address, BoundPort}} = rdma:sockname(LSock),
    true = BoundPort > 0,
    rdma:close(LSock),
    ok.

%% Listening on an explicit port binds exactly that port.
test_listen_on_specified_port(_Config) ->
    WantedPort = 12345,
    {ok, LSock} = rdma:listen(WantedPort),
    {ok, {_Address, WantedPort}} = rdma:sockname(LSock),
    rdma:close(LSock),
    ok.

%% A connect to the suite's listener is accepted; both ends close cleanly.
test_connect_to_valid_listener(Config) ->
    LSock = ?config(listener, Config),
    Port = ?config(port_number, Config),
    start_accept(LSock),
    {ok, CSock} = rdma:connect("localhost", Port),
    {ok, SSock} = accept(LSock),
    ok = rdma:close(CSock),
    ok = rdma:close(SSock),
    ok.

%% Connecting to a host name that cannot be resolved fails with getaddrinfo.
test_connect_to_invalid_host(Config) ->
    Port = ?config(port_number, Config),
    {error, getaddrinfo} = rdma:connect("invalid-host", Port),
    ok.

%% Connecting to port 1000 on localhost is expected to be rejected.
test_connect_to_invalid_port(_Config) ->
    Result = rdma:connect("localhost", 1000),
    {error, rejected} = Result,
    ok.

%% A connect that timed out before anyone accepted must not be handed out
%% by a later accept: only the second, successful connect is accepted.
test_accept_ignore_failed_connection(Config) ->
    LSock = ?config(listener, Config),
    Port = ?config(port_number, Config),
    {error, timeout} = rdma:connect("localhost", Port),
    start_accept(LSock),
    {ok, CSock} = rdma:connect("localhost", Port),
    {ok, SSock} = accept(LSock),
    {error, timeout} = accept(LSock),
    ok = rdma:close(CSock),
    ok = rdma:close(SSock),
    ok.

%% Closing the client end makes a pending recv on the server end return
%% {error, closed}.
test_close_client_socket(Config) ->
    LSock = ?config(listener, Config),
    Port = ?config(port_number, Config),
    start_accept(LSock),
    {ok, CSock} = rdma:connect("localhost", Port),
    {ok, SSock} = accept(LSock),
    PendingRecv = rpc:async_call(node(), rdma, recv, [SSock]),
    ok = rdma:close(CSock),
    {error, closed} = rpc:yield(PendingRecv),
    ok = rdma:close(SSock), % just to be sure
    ok.

%% Closing the server end makes a pending recv on the client end return
%% {error, closed}.
test_close_server_socket(Config) ->
    LSock = ?config(listener, Config),
    Port = ?config(port_number, Config),
    start_accept(LSock),
    {ok, CSock} = rdma:connect("localhost", Port),
    {ok, SSock} = accept(LSock),
    PendingRecv = rpc:async_call(node(), rdma, recv, [CSock]),
    ok = rdma:close(SSock),
    {error, closed} = rpc:yield(PendingRecv),
    ok = rdma:close(CSock), % just to be sure
    ok.

%% Closing the listener must make recv calls pending on both ends of an
%% established connection return {error, closed}.
test_close_listener(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port),
    {ok, Server} = accept(Listener),
    ServerRecv = rpc:async_call(node(), rdma, recv, [Server]),
    ClientRecv = rpc:async_call(node(), rdma, recv, [Client]),
    ok = rdma:close(Listener),
    {error, closed} = rpc:yield(ServerRecv),
    {error, closed} = rpc:yield(ClientRecv),
    %% just to be sure
    ok = rdma:close(Server),
    ok = rdma:close(Client).

%% Both sockets must close cleanly while the client still reports one
%% buffered packet in rdma:getstat/1.
test_close_with_buffered_data(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port, [{num_buffers, 3}]),
    {ok, Server} = accept(Listener),
    lists:foreach(fun(Packet) -> ok = rdma:send(Client, Packet) end,
                  ["test", "test", "test"]),
    %% one packet is buffered
    {ok, _, _, 1} = rdma:getstat(Client),
    ok = rdma:close(Client),
    ok = rdma:close(Server).

%% Data sent before the peer calls recv must still be delivered.
test_send_before_recv(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port),
    {ok, Server} = accept(Listener),
    ok = rdma:send(Client, "test"),
    {ok, "test"} = rdma:recv(Server),
    ok = rdma:close(Client),
    ok = rdma:close(Server).

%% A recv started before the peer sends must return the data once it is sent.
test_recv_before_send(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port),
    {ok, Server} = accept(Listener),
    PendingRecv = rpc:async_call(node(), rdma, recv, [Server]),
    %% just to be sure recv is waiting
    timer:sleep(100),
    ok = rdma:send(Client, "test"),
    {ok, "test"} = rpc:yield(PendingRecv),
    ok = rdma:close(Client),
    ok = rdma:close(Server).

%% With the binary option on both ends, a binary sent by the client must be
%% received as the same binary by the server.
test_send_recv_binary(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port, [binary]),
    {ok, Server} = accept(Listener),
    Payload = <<"test">>,
    ok = rdma:send(Client, Payload),
    {ok, Payload} = rdma:recv(Server),
    ok = rdma:close(Client),
    ok = rdma:close(Server).

%% Sending an empty list must be received as an empty list.
test_send_recv_zero_byte_buffer(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port),
    {ok, Server} = accept(Listener),
    ok = rdma:send(Client, []),
    {ok, []} = rdma:recv(Server),
    ok = rdma:close(Client),
    ok = rdma:close(Server).

%% Sends four packets over a connection configured with {num_buffers, 3} and
%% checks, step by step, the buffered-packet count reported by
%% rdma:getstat/1 and the order in which the server receives the packets.
%% The statement order is significant.
test_send_more_than_num_buffers(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port, [{num_buffers, 3}]),
    {ok, Server} = accept(Listener),
    lists:foreach(fun(Packet) -> ok = rdma:send(Client, Packet) end,
                  ["test1", "test2", "test3", "test4"]),
    %% two packets are buffered
    {ok, _, _, 2} = rdma:getstat(Client),
    {ok, "test1"} = rdma:recv(Server),
    {ok, "test2"} = rdma:recv(Server),
    %% no packet available
    {error, timeout} = rdma:recv(Server, 100),
    %% two packets are still buffered
    {ok, _, _, 2} = rdma:getstat(Client),
    %% gets ack from server and flushes buffer
    {error, timeout} = rdma:recv(Client, 0),
    %% see?
    {ok, _, _, 0} = rdma:getstat(Client),
    {ok, "test3"} = rdma:recv(Server),
    {ok, "test4"} = rdma:recv(Server),
    ok = rdma:close(Client),
    ok = rdma:close(Server).

%% A six-byte payload sent over a connection configured with
%% {buffer_size, 3} must arrive at the server as one complete message.
test_send_larger_than_buffer_size(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port, [{buffer_size, 3}]),
    {ok, Server} = accept(Listener),
    ok = rdma:send(Client, "foobar"),
    {ok, "foobar"} = rdma:recv(Server),
    ok = rdma:close(Client),
    ok = rdma:close(Server).

%% Sends a nine-byte payload over a connection configured with
%% {num_buffers, 3} and {buffer_size, 3}. The server must not see the
%% message until the client's buffered remainder has been flushed.
%% The statement order is significant.
test_send_huge(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port,
                                [{num_buffers, 3}, {buffer_size, 3}]),
    {ok, Server} = accept(Listener),
    ok = rdma:send(Client, "foobarbaz"),
    %% "baz" is buffered
    {ok, _, _, 1} = rdma:getstat(Client),
    %% no full packet available
    {error, timeout} = rdma:recv(Server, 100),
    %% gets ack from server and flushes buffer
    {error, timeout} = rdma:recv(Client, 0),
    %% buffer flushed
    {ok, _, _, 0} = rdma:getstat(Client),
    {ok, "foobarbaz"} = rdma:recv(Server),
    ok = rdma:close(Client),
    ok = rdma:close(Server).

%% With {active, true}, data arrives as {Socket, {data, Data}} messages.
%% This process only receives them after rdma:controlling_process/2 has made
%% it the owner of Server. Closing the client must then produce a
%% {tcp_closed, Server} message.
test_active_socket(Config) ->
    Port = ?config(port_number, Config),
    Listener = ?config(listener, Config),
    start_accept(Listener),
    {ok, Client} = rdma:connect("localhost", Port, [{active, true}]),
    {ok, Server} = accept(Listener),
    ok = rdma:send(Client, "test"),

    %% Current process isn't "owner" of Server.
    BeforeOwnership =
        receive
            {Server, {data, "test"}} -> ok
        after 100 ->
            {error, timeout}
        end,
    {error, timeout} = BeforeOwnership,

    ok = rdma:controlling_process(Server, self()),
    ok = rdma:send(Client, "test"),
    AfterOwnership =
        receive
            {Server, {data, "test"}} -> ok
        after 100 ->
            {error, timeout}
        end,
    ok = AfterOwnership,

    ok = rdma:close(Client),
    ClosedNotice =
        receive
            {tcp_closed, Server} -> ok
        after 100 ->
            {error, timeout}
        end,
    ok = ClosedNotice,
    ok = rdma:close(Server).


%%
%% Helper Functions
%%

%% Waits up to one second for the accept loop (see start_accept/1) to report
%% a socket accepted on Listener. Returns {ok, Socket} or {error, timeout}.
accept(Listener) ->
    receive
        {Listener, {accept, AcceptedSocket}} ->
            {ok, AcceptedSocket}
    after 1000 ->
        {error, timeout}
    end.

%% Body of the accept process: forwards every socket accepted on Listener to
%% Pid as {Listener, {accept, Socket}}, and returns ok as soon as
%% rdma:accept/1 returns anything other than {ok, Socket}.
accept_loop(Listener, Pid) ->
    case rdma:accept(Listener) of
        {ok, AcceptedSocket} ->
            Pid ! {Listener, {accept, AcceptedSocket}},
            accept_loop(Listener, Pid);
        _NotAccepted ->
            ok
    end.

%% Spawns a linked process running accept_loop/2 that reports accepted
%% sockets to the calling process. Returns the pid of the new process.
start_accept(Listener) ->
    LoopArgs = [Listener, self()],
    spawn_link(?MODULE, accept_loop, LoopArgs).

%% Opens a listener on port 0 with Options and returns Config extended with
%% the listener and the port number reported by rdma:sockname/1.
create_listener(Config, Options) ->
    {ok, Listener} = rdma:listen(0, Options),
    {ok, {_Address, PortNumber}} = rdma:sockname(Listener),
    ListenerEntries = [{listener, Listener}, {port_number, PortNumber}],
    ListenerEntries ++ Config.