├── CMakeLists.txt ├── README.md ├── common.hpp ├── example ├── client.cpp └── server.cpp ├── logging.hpp ├── mr.hpp ├── msg_interface.hpp ├── pre_connector.hpp ├── qp.hpp ├── qp_impl.hpp ├── ralloc ├── Makefile ├── README ├── include-x86_64 │ ├── atomic.h │ ├── bitops.h │ ├── cpu.h │ ├── double-list.h │ └── queue.h ├── new_delete.cpp ├── ralloc.h ├── ssmalloc.c └── ssmalloc.h ├── rdma_ctrl.hpp ├── rdma_ctrl_impl.hpp ├── rnic.hpp └── ud_adapter.hpp /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project (rlib) 2 | 3 | cmake_minimum_required(VERSION 2.8) 4 | 5 | ## use C++11 features 6 | add_definitions(-std=c++11) 7 | 8 | set(CMAKE_INCLUDE_CURRENT_DIR ON) 9 | set(CMAKE_CXX_COMPILER /usr/bin/g++) 10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") 11 | 12 | ## path to ralloc lib, which you need to build manually 13 | set(RALLOC_LIB "${PROJECT_SOURCE_DIR}/ralloc/libssmalloc.a") 14 | 15 | ## include paths 16 | include_directories(ralloc) 17 | 18 | file(GLOB SOURCES "*.hpp") 19 | 20 | add_library(rdma STATIC ${SOURCES}) 21 | set_target_properties(rdma PROPERTIES LINKER_LANGUAGE CXX) 22 | target_link_libraries(rdma -lpthread ibverbs ${RALLOC_LIB}) 23 | 24 | add_executable(server "example/server.cpp") 25 | add_executable(client "example/client.cpp") 26 | 27 | target_link_libraries(server rdma) 28 | target_link_libraries(client rdma) 29 | 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## RLib 2 | 3 | ### Notice 4 | The latest version has been transferred to https://github.com/wxdwfc/rlibv2, which is now actively maintained. 5 | 6 | ### Intro 7 | 8 | RLib is a header-only library for **easier** use of RDMA using C++. Basically it is a set of wrappers over the interfaces of `libibverbs`, 9 | yet it additionally handles many tedious things, such as establishing connections between RDMA QPs, and simplifies many configurations. 10 | 11 | ------ 12 | 13 | ### To use 14 | 15 | `#include "rdma_ctrl.hpp"` is all you need. 16 | 17 | ------ 18 | 19 | ### Example 20 | 21 | Usually very few lines of code are needed to use RDMA with RLib. Below is a snippet of using RLib to implement a 22 | simple pingpong application using one-sided RDMA primitives. 23 | 24 | Server side 25 | ```c++ 26 | /** 27 | * Note, RDMA usually uses some other communication method (e.g. TCP/IP) to exchange QP information. 28 | * RLib uses TCP for the pre-communication. 29 | */ 30 | int server_node_id = 1; 31 | int tcp_port = 8888; 32 | int client_port = 8000; 33 | 34 | using namespace rdmaio; 35 | 36 | RdmaCtrl *c = new RdmaCtrl(server_node_id,tcp_port); 37 | RdmaCtrl::DevIdx idx {.dev_id = 0,.port_id = 1 }; // using the first RNIC's first port 38 | c->open_thread_local_device(idx); 39 | 40 | // register a buffer to the previously opened device, using id = 73 41 | char *buffer = (char *)malloc(4096); 42 | memset(buffer, 0, 4096); 43 | RDMA_ASSERT(c->register_memory(73,buffer,4096,c->get_device()) == true); 44 | 45 | char s[] = "hello world"; 46 | memcpy(buffer, s, strlen(s)); 47 | 48 | MemoryAttr local_mr = c->get_local_mr(73); RCQP *qp = c->create_rc_qp(create_rc_idx(1,0),c->get_device(),&local_mr); 49 | 50 | // server also needs to "connect" the client.
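// (client_ip below is the client's hostname or IP string; its definition is omitted from this snippet)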
51 | while(qp->connect(client_ip,client_port) != SUCC) { 52 | usleep(2000); 53 | } 54 | 55 | while(true) { 56 | // This is RDMA, server does not need to do anything :) 57 | sleep(1); 58 | } 59 | ``` 60 | 61 | Client side 62 | ```c++ 63 | int client_node_id = 0; 64 | int tcp_port = 8000; 65 | int server_port = 8888; 66 | 67 | using namespace rdmaio; 68 | 69 | RdmaCtrl *c = new RdmaCtrl(client_node_id,tcp_port); 70 | RdmaCtrl::DevIdx idx {.dev_id = 0,.port_id = 1 }; // using the first RNIC's first port 71 | c->open_thread_local_device(idx); 72 | 73 | // register a buffer to the previously opened device, using id = 73 74 | char *buffer = (char *)malloc(4096); 75 | RDMA_ASSERT(c->register_memory(73,buffer,4096,c->get_device()) == true); 76 | 77 | // get remote server's memory information 78 | MemoryAttr mr; 79 | while(QP::get_remote_mr(server_ip,server_port,73,&mr) != SUCC) { 80 | usleep(2000); 81 | } 82 | 83 | // create the RC qp to access remote server's memory, using the previously registered memory 84 | MemoryAttr local_mr = c->get_local_mr(73); RCQP *qp = c->create_rc_qp(create_rc_idx(1,0),c->get_device(),&local_mr); 85 | qp->bind_remote_mr(mr); // bind to the previously allocated mr 86 | 87 | while(qp->connect(server_ip,server_port) != SUCC) { 88 | usleep(2000); 89 | } 90 | 91 | // main pingpong loop 92 | 93 | ibv_wc wc; 94 | while(true) { 95 | char *local_buf = buffer; 96 | uint64_t address = 0; 97 | int msg_len = 11; // length of "hello world" 98 | // read the message from the server 99 | auto rc = qp->post_send(IBV_WR_RDMA_READ,local_buf,msg_len,address,IBV_SEND_SIGNALED); 100 | qp->poll_till_completion(wc, no_timeout); 101 | // then get the results, stored in the local buffer 102 | } 103 | 104 | ``` 105 | 106 | ### Acknowledgments 107 | TODO 108 | -------------------------------------------------------------------------------- /common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <infiniband/verbs.h> 4 | 5 | #include "logging.hpp" 6 | #include "rnic.hpp" 7 | #include "mr.hpp" 8 | 9 | namespace rdmaio { 10 | 11 | // connection status 12 | enum ConnStatus { 13 | SUCC = 0, 14 | TIMEOUT = 1, 15 | WRONG_ARG = 2, 16 | ERR = 3, 17 | NOT_READY = 4, 18 | UNKNOWN = 5 19 | }; 20 | 21 | /** 22 | * The connection information exchanged between different QPs. 23 | * RC/UC QPs use lid & addr to connect to remote QPs, while qpn is used when posting send requests. 24 | * node_id & port_id are used by UD QPs to create addresses. 25 | */ 26 | struct QPAttr { 27 | address_t addr; 28 | uint16_t lid; 29 | uint32_t qpn; 30 | uint32_t psn; 31 | uint16_t node_id; 32 | uint16_t port_id; 33 | }; 34 | 35 | /** 36 | * The QP connection requests sent to remote. 37 | * from_node & from_worker identify which QP it shall connect to 38 | */ 39 | struct QPConnArg { 40 | uint16_t from_node; 41 | uint8_t from_worker; 42 | uint8_t qp_type; // RC QP or UD QP 43 | }; 44 | 45 | /** 46 | * The MR connection requests sent to remote.
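* mr_id names a memory region registered at the remote RdmaCtrl; the reply carries that region's MemoryAttr (buf & key).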
47 | */ 48 | struct MRConnArg { 49 | uint64_t mr_id; 50 | }; 51 | 52 | struct ConnArg { 53 | enum { MR, QP } type; 54 | union { 55 | QPConnArg qp; 56 | MRConnArg mr; 57 | } payload; 58 | }; 59 | 60 | struct ConnReply { 61 | ConnStatus ack; 62 | union { 63 | QPAttr qp; 64 | MemoryAttr mr; 65 | } payload; 66 | }; 67 | 68 | inline int convert_mtu(ibv_mtu type) { 69 | int mtu = 0; 70 | switch(type) { 71 | case IBV_MTU_256: 72 | mtu = 256; 73 | break; 74 | case IBV_MTU_512: 75 | mtu = 512; 76 | break; 77 | case IBV_MTU_1024: 78 | mtu = 1024; 79 | break; 80 | case IBV_MTU_2048: 81 | mtu = 2048; 82 | break; 83 | case IBV_MTU_4096: 84 | mtu = 4096; 85 | break; 86 | } 87 | return mtu; 88 | } 89 | 90 | // The structure used to configure UDQP 91 | typedef struct { 92 | int max_send_size; 93 | int max_recv_size; 94 | int qkey; 95 | int psn; 96 | } UDConfig; 97 | 98 | typedef struct { 99 | int access_flags; 100 | int max_rd_atomic; 101 | int max_dest_rd_atomic; 102 | int rq_psn; 103 | int sq_psn; 104 | int timeout; 105 | } RCConfig; 106 | 107 | } // namespace rdmaio 108 | -------------------------------------------------------------------------------- /example/client.cpp: -------------------------------------------------------------------------------- 1 | #include "rdma_ctrl.hpp" 2 | #include <cstdio> 3 | #include <unistd.h> 4 | 5 | int client_node_id = 0; 6 | int tcp_port = 8000; 7 | int server_port = 8888; 8 | 9 | using namespace rdmaio; 10 | 11 | int main(int argc, char *argv[]) 12 | { 13 | RdmaCtrl *c = new RdmaCtrl(client_node_id,tcp_port); 14 | RdmaCtrl::DevIdx idx {.dev_id = 0,.port_id = 1 }; // using the first RNIC's first port 15 | c->open_thread_local_device(idx); 16 | 17 | // register a buffer to the previously opened device, using id = 73 18 | char *buffer = (char *)malloc(4096); 19 | memset(buffer, 0, 4096); 20 | RDMA_ASSERT(c->register_memory(73,buffer,4096,c->get_device()) == true); 21 | 22 | // get remote server's memory information 23 | MemoryAttr remote_mr; 24 | while(QP::get_remote_mr("localhost",server_port,73,&remote_mr) != SUCC) { 25 | usleep(2000); 26 | } 27 | 28 | // create the RC qp to access remote server's memory, using the previously registered memory 29 | MemoryAttr local_mr = c->get_local_mr(73); 30 | RCQP *qp = c->create_rc_qp(create_rc_idx(1,0),c->get_device(), &local_mr); 31 | qp->bind_remote_mr(remote_mr); // bind to the previously allocated mr 32 | 33 | while(qp->connect("localhost",server_port) != SUCC) { 34 | usleep(2000); 35 | } 36 | 37 | printf("client: QP connected!\n"); 38 | ibv_wc wc; 39 | char *local_buf = buffer; 40 | uint64_t address = 0; 41 | int msg_len = 11; // length of "hello world" 42 | 43 | // read the message from the server 44 | auto rc = qp->post_send(IBV_WR_RDMA_READ,local_buf,msg_len,address,IBV_SEND_SIGNALED); 45 | if (rc == SUCC) { 46 | printf("client: post ok\n"); 47 | } else { 48 | printf("client: post fail. rc=%d\n", rc); 49 | } 50 | rc = qp->poll_till_completion(wc, no_timeout); 51 | // then get the results, stored in the local buffer 52 | if (rc == SUCC) { 53 | printf("client: poll ok\n"); 54 | printf("msg read: %s\n", local_buf); 55 | } else { 56 | printf("client: poll fail. rc=%d\n", rc); 57 | } 58 | 59 | return 0; 60 | 61 | } 62 | 63 | -------------------------------------------------------------------------------- /example/server.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "rdma_ctrl.hpp" 3 | #include <cstdio> 4 | #include <unistd.h> 5 | 6 | 7 | /** 8 | * Note, RDMA usually uses some other communication method (e.g.
TCP/IP) to exchange QP information. 9 | * RLib uses TCP for the pre-communication. 10 | */ 11 | int server_node_id = 1; 12 | int tcp_port = 8888; 13 | int client_port = 8000; 14 | 15 | using namespace rdmaio; 16 | 17 | int main(int argc, char *argv[]) 18 | { 19 | RdmaCtrl *c = new RdmaCtrl(server_node_id,tcp_port); 20 | RdmaCtrl::DevIdx idx {.dev_id = 0,.port_id = 1 }; // using the first RNIC's first port 21 | c->open_thread_local_device(idx); 22 | 23 | // register a buffer to the previously opened device, using id = 73 24 | char *buffer = (char *)malloc(4096); 25 | memset(buffer, 0, 4096); 26 | RDMA_ASSERT(c->register_memory(73,buffer,4096,c->get_device()) == true); 27 | 28 | char s[] = "hello world"; 29 | memcpy(buffer, s, strlen(s)); 30 | 31 | MemoryAttr local_mr = c->get_local_mr(73); 32 | RCQP *qp = c->create_rc_qp(create_rc_idx(1,0),c->get_device(), &local_mr); 33 | 34 | // server also needs to "connect" the client. 35 | while(qp->connect("localhost", client_port, create_rc_idx(1,0)) != SUCC) { 36 | usleep(2000); 37 | } 38 | 39 | printf("server: QP connected!\n"); 40 | while(true) { 41 | // This is RDMA, server does not need to do anything :) 42 | sleep(1); 43 | } 44 | 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /logging.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * The logging utilities used in libRDMA. 3 | */ 4 | 5 | #pragma once 6 | 7 | #include <sstream> 8 | #include <iostream> 9 | 10 | namespace rdmaio { 11 | 12 | /** 13 | * \def FATAL 14 | * Used for fatal and probably irrecoverable conditions 15 | * \def ERROR 16 | * Used for errors which are recoverable within the scope of the function 17 | * \def WARNING 18 | * Logs interesting conditions which are probably not fatal 19 | * \def EMPH 20 | * Outputs as INFO, but in WARNING colors. Useful for 21 | * outputting information you want to emphasize. 22 | * \def INFO 23 | * Used for providing general useful information 24 | * \def DEBUG 25 | * Debugging purposes only 26 | * \def EVERYTHING 27 | * Log everything 28 | */ 29 | 30 | enum loglevel { 31 | NONE = 7, 32 | FATAL = 6, 33 | ERROR = 5, 34 | WARNING = 4, 35 | EMPH = 3, 36 | INFO = 2, 37 | DEBUG = 1, 38 | EVERYTHING = 0 39 | }; 40 | 41 | #define unlikely(x) __builtin_expect(!!(x), 0) 42 | 43 | #ifndef RDMA_LOG_LEVEL 44 | #define RDMA_LOG_LEVEL ::rdmaio::INFO 45 | #endif 46 | 47 | // logging macro definitions 48 | // default log 49 | #define RDMA_LOG(n) \ 50 | if (n >= RDMA_LOG_LEVEL) \ 51 | ::rdmaio::MessageLogger((char*)__FILE__, __LINE__, n).stream() 52 | 53 | // log with tag 54 | #define RDMA_TLOG(n,t) \ 55 | if(n >= RDMA_LOG_LEVEL) \ 56 | ::rdmaio::MessageLogger((char*)__FILE__, __LINE__, n).stream() \ 57 | << "[" << (t) << "]" 58 | 59 | #define RDMA_LOG_IF(n,condition) \ 60 | if(n >= RDMA_LOG_LEVEL && (condition)) \ 61 | ::rdmaio::MessageLogger((char*)__FILE__, __LINE__, n).stream() 62 | 63 | #define RDMA_ASSERT(condition) \ 64 | if(unlikely(!(condition))) \ 65 | ::rdmaio::MessageLogger((char*)__FILE__, __LINE__, ::rdmaio::FATAL + 1).stream() << "Assertion!
" 66 | 67 | #define RDMA_VERIFY(n,condition) RDMA_LOG_IF(n,(!(condition))) 68 | 69 | class MessageLogger { 70 | public: 71 | MessageLogger(const char *file, int line, int level) :level_(level) { 72 | if(level_ < RDMA_LOG_LEVEL) 73 | return; 74 | stream_ << "[" << StripBasename(std::string(file)) << ":" << line << "] "; 75 | } 76 | 77 | ~MessageLogger() { 78 | if(level_ >= RDMA_LOG_LEVEL) { 79 | stream_ << "\n"; 80 | std::cout << "\033[" << RDMA_DEBUG_LEVEL_COLOR[std::min(level_,6)] << "m" 81 | << stream_.str() << EndcolorFlag(); 82 | if(level_ >= ::rdmaio::FATAL) 83 | abort(); 84 | } 85 | } 86 | 87 | // Return the stream associated with the logger object. 88 | std::stringstream &stream() { return stream_; } 89 | private: 90 | std::stringstream stream_; 91 | int level_; 92 | 93 | // control flags for color 94 | #define R_BLACK 39 95 | #define R_RED 31 96 | #define R_GREEN 32 97 | #define R_YELLOW 33 98 | #define R_BLUE 34 99 | #define R_MAGENTA 35 100 | #define R_CYAN 36 101 | #define R_WHITE 37 102 | 103 | const int RDMA_DEBUG_LEVEL_COLOR[7] = {R_BLACK,R_YELLOW,R_BLACK,R_GREEN,R_MAGENTA,R_RED,R_RED}; 104 | 105 | static std::string StripBasename(const std::string &full_path) { 106 | const char kSeparator = '/'; 107 | size_t pos = full_path.rfind(kSeparator); 108 | if (pos != std::string::npos) { 109 | return full_path.substr(pos + 1, std::string::npos); 110 | } else { 111 | return full_path; 112 | } 113 | } 114 | 115 | static std::string EndcolorFlag() { 116 | char flag[7]; 117 | snprintf(flag,7, "%c[0m", 0x1B); 118 | return std::string(flag); 119 | } 120 | }; 121 | 122 | }; 123 | -------------------------------------------------------------------------------- /mr.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "logging.hpp" 5 | 6 | namespace rdmaio { 7 | 8 | struct MemoryAttr { 9 | uintptr_t buf; 10 | uint32_t key; 11 | }; 12 | 13 | class Memory { 14 | public: 15 | /** 16 | * The default protection flag of a memory region. 17 | * In default, the memory can be read/write by local and remote RNIC operations. 
18 | */ 19 | static const int DEFAULT_PROTECTION_FLAG = (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | \ 20 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC ); 21 | 22 | Memory(const char *addr,uint64_t len,ibv_pd *pd,int flag): 23 | addr(addr), 24 | len(len), 25 | mr(ibv_reg_mr(pd,(void *)addr,len,flag)) 26 | { 27 | if(mr == nullptr) { 28 | RDMA_LOG(WARNING) << "failed to register mr, for addr " << addr << "; len " << len; 29 | } else { 30 | rattr.buf = (uintptr_t)addr; 31 | rattr.key = mr->rkey; 32 | } 33 | } 34 | 35 | ~Memory() { 36 | if(mr != nullptr) { 37 | int rc = ibv_dereg_mr(mr); 38 | RDMA_LOG_IF(ERROR,rc != 0) << "dereg mr error: " << strerror(errno); 39 | } 40 | } 41 | 42 | bool valid() { 43 | return mr != nullptr; 44 | } 45 | 46 | const char *addr; 47 | uint64_t len; 48 | 49 | MemoryAttr rattr; // RDMA registered attr 50 | ibv_mr *mr = nullptr; // mr in the driver 51 | }; 52 | 53 | 54 | }; // namespace rdmaio 55 | -------------------------------------------------------------------------------- /msg_interface.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <functional> 4 | #include <set> 5 | #include <string> 6 | 7 | #include "common.hpp" 8 | 9 | namespace rdmaio { 10 | 11 | typedef std::function msg_callback_t_; 12 | 13 | /** 14 | * An abstract message interface 15 | * Assumption: one per thread 16 | */ 17 | class MsgAdapter { 18 | public: 19 | 20 | MsgAdapter(msg_callback_t_ callback) 21 | : callback_(callback) { 22 | } 23 | 24 | MsgAdapter() { 25 | 26 | } 27 | 28 | void set_callback(msg_callback_t_ callback) { 29 | callback_ = callback; 30 | } 31 | 32 | virtual ConnStatus connect(std::string ip,int port) = 0; 33 | 34 | /** 35 | * Basic send interfaces 36 | */ 37 | virtual ConnStatus send_to(int node_id,const char *msg,int len) = 0; 38 | 39 | virtual ConnStatus send_to(int node_id,int tid,const char *msg,int len) { 40 | return send_to(node_id,msg,len); 41 | } 42 | 43 | /** 44 | * Interfaces which allow batching at the sender's side 45 | */ 46 | virtual void prepare_pending() { 47 | } 48 | 49 | virtual ConnStatus send_pending(int node_id,const char *msg,int len) { 50 | RDMA_ASSERT(false); // not implemented 51 | } 52 | 53 | virtual ConnStatus send_pending(int node_id,int tid,const char *msg,int len) { 54 | return send_pending(node_id,msg,len); 55 | } 56 | 57 | /** 58 | * Flush all the currently pending messages 59 | */ 60 | virtual ConnStatus flush_pending() { 61 | return SUCC; 62 | } 63 | 64 | /** 65 | * Examples of using batching at the sender side 66 | * Broadcast the message to a set of servers 67 | */ 68 | virtual ConnStatus broadcast_to(const std::set<int> &nodes, const char *msg,int len) { 69 | prepare_pending(); 70 | for(auto it = nodes.begin(); it != nodes.end(); ++it) { 71 | send_pending(*it,msg,len); 72 | } 73 | flush_pending(); 74 | return SUCC; // TODO 75 | } 76 | 77 | virtual ConnStatus broadcast_to(int *nodes,int num, const char *msg,int len) { 78 | prepare_pending(); 79 | for(int i = 0;i < num;++i) { 80 | send_pending(nodes[i],msg,len); 81 | } 82 | flush_pending(); 83 | return SUCC; // TODO 84 | } 85 | 86 | /** 87 | * The receive function 88 | */ 89 | virtual void poll_comps() = 0; 90 | 91 | /** 92 | * The size of meta data used by the MsgAdapter for each message 93 | */ 94 | virtual int msg_meta_len() { 95 | return 0; 96 | } 97 | 98 | protected: 99 | msg_callback_t_ callback_; 100 | }; 101 | 102 | }; 103 | -------------------------------------------------------------------------------- /pre_connector.hpp:
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "logging.hpp" 4 | 5 | #include <netdb.h> //hostent 6 | #include <sys/socket.h> 7 | #include <arpa/inet.h> 8 | #include <unistd.h> 9 | #include <fcntl.h> 10 | #include <string.h> 11 | #include <sys/time.h> 12 | 13 | #include <map> 14 | 15 | namespace rdmaio { 16 | 17 | constexpr struct timeval default_timeout = {0,8000}; 18 | constexpr struct timeval no_timeout = {0,0}; // it means forever 19 | 20 | inline __attribute__ ((always_inline)) // inline to avoid multiple definitions 21 | int64_t diff_time(const struct timeval &end, const struct timeval &start) { // returns microseconds 22 | int64_t diff = (end.tv_sec > start.tv_sec)?(end.tv_sec - start.tv_sec) * 1000000:0; 23 | if (end.tv_usec > start.tv_usec) { 24 | diff += (end.tv_usec - start.tv_usec); 25 | } else { 26 | diff -= (start.tv_usec - end.tv_usec); 27 | } 28 | return diff; 29 | } 30 | 31 | class PreConnector { // helper class used to exchange QP information using TCP/IP 32 | public: 33 | static int get_listen_socket(const std::string &addr,int port) { 34 | 35 | struct sockaddr_in serv_addr; 36 | auto sockfd = socket(AF_INET, SOCK_STREAM, 0); 37 | RDMA_ASSERT(sockfd >= 0) << "ERROR opening listen socket: " << strerror(errno); 38 | 39 | /* setup the host_addr structure for use in bind call */ 40 | // server byte order 41 | serv_addr.sin_family = AF_INET; 42 | 43 | serv_addr.sin_addr.s_addr = INADDR_ANY; 44 | 45 | // port 46 | serv_addr.sin_port = htons(port); 47 | 48 | RDMA_ASSERT(bind(sockfd, (struct sockaddr *) &serv_addr, 49 | sizeof(serv_addr)) == 0) << "ERROR on binding: " << strerror(errno); 50 | return sockfd; 51 | } 52 | 53 | static int get_send_socket(const std::string &addr,int port,struct timeval timeout = default_timeout) { 54 | int sockfd; 55 | struct sockaddr_in serv_addr; 56 | 57 | RDMA_ASSERT((sockfd = socket(AF_INET, SOCK_STREAM, 0)) >= 0) << "Error open socket for send!"; 58 | fcntl(sockfd, F_SETFL, O_NONBLOCK); 59 | 60 | serv_addr.sin_family = AF_INET; 61 | serv_addr.sin_port = htons(port); 62 | 63 | auto ip = host_to_ip(addr); 64 | if(ip == "") { 65 | close(sockfd); 66 | return -1; 67 | } 68 | 69 | serv_addr.sin_addr.s_addr = inet_addr(ip.c_str()); 70 | 71 | if(connect(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) == -1) { 72 | if (errno == EINPROGRESS) { 73 | goto PROGRESS; 74 | } 75 | close(sockfd); 76 | return -1; 77 | } 78 | PROGRESS: 79 | // check return status 80 | fd_set fdset; 81 | FD_ZERO(&fdset); 82 | FD_SET(sockfd, &fdset); 83 | 84 | if(select(sockfd + 1, NULL, &fdset, NULL, &timeout) == 1) 85 | { 86 | int so_error; 87 | socklen_t len = sizeof so_error; 88 | 89 | getsockopt(sockfd, SOL_SOCKET, SO_ERROR, &so_error, &len); 90 | 91 | if (so_error == 0) { 92 | // success 93 | } else { 94 | close(sockfd); 95 | return -1; 96 | } 97 | } else { close(sockfd); return -1; } // select timed out or failed: the connection is not established 98 | 99 | return sockfd; 100 | } 101 | 102 | // timeout in microseconds 103 | static bool wait_recv(int socket, uint32_t timeout = 2000) { 104 | 105 | while(true) { 106 | 107 | fd_set rfds; 108 | FD_ZERO(&rfds); 109 | FD_SET(socket, &rfds); 110 | 111 | struct timeval s_timeout = {0,timeout}; 112 | int ready = select(socket + 1, &rfds, NULL, NULL, &s_timeout); 113 | RDMA_ASSERT(ready != -1); 114 | 115 | if(ready == 0) { // no file descriptor found 116 | continue; 117 | } 118 | 119 | if(ready < 0) { // error case 120 | RDMA_ASSERT(false) << "select error " << strerror(errno); 121 | } 122 | 123 | if (FD_ISSET(socket, &rfds)) { 124 | break; // ready 125 | } 126 | } 127 | return true; 128 | } 129 | 130 | static void wait_close(int socket) { 131 | 132 | shutdown(socket,
SHUT_WR); 133 | char buf[2]; 134 | 135 | struct timeval timeout={1,0}; 136 | auto ret = setsockopt(socket,SOL_SOCKET,SO_RCVTIMEO,(const char*)&timeout,sizeof(timeout)); 137 | RDMA_ASSERT(ret == 0); 138 | 139 | recv(socket,buf,2,0); 140 | close(socket); 141 | } 142 | 143 | static int send_to(int fd, char *usrbuf, size_t n) { 144 | size_t nleft = n; 145 | ssize_t nwritten; 146 | char *bufp = usrbuf; 147 | 148 | while (nleft > 0) { 149 | if ((nwritten = write(fd, bufp, nleft)) <= 0) { 150 | if (errno == EINTR) /* Interrupted by sig handler return */ 151 | nwritten = 0; /* and call write() again */ 152 | else 153 | return -1; /* errno set by write() */ 154 | } 155 | nleft -= nwritten; 156 | bufp += nwritten; 157 | } 158 | return n; 159 | } 160 | 161 | typedef std::map<std::string,std::string> ipmap_t; 162 | static ipmap_t &local_ip_cache() { 163 | static __thread ipmap_t cache; 164 | return cache; 165 | } 166 | 167 | static std::string host_to_ip(const std::string &host) { 168 | 169 | ipmap_t &cache = local_ip_cache(); // take a reference, so that the inserted entries persist 170 | if(cache.find(host) != cache.end()) 171 | return cache[host]; 172 | 173 | std::string res = ""; 174 | 175 | struct addrinfo hints, *infoptr; 176 | memset(&hints, 0, sizeof hints); 177 | hints.ai_family = AF_INET; // AF_INET means IPv4 only addresses 178 | 179 | int result = getaddrinfo(host.c_str(), NULL, &hints, &infoptr); 180 | if (result) { 181 | fprintf(stderr, "getaddrinfo: %s at %s\n", gai_strerror(result),host.c_str()); 182 | return ""; 183 | } 184 | char ip[64]; memset(ip,0,sizeof(ip)); 185 | 186 | for(struct addrinfo *p = infoptr; p != NULL; p = p->ai_next) { 187 | getnameinfo(p->ai_addr, p->ai_addrlen, ip, sizeof(ip), NULL, 0, NI_NUMERICHOST); 188 | } 189 | 190 | res = std::string(ip); 191 | if(res != "") 192 | cache.insert(std::make_pair(host,res)); 193 | return res; 194 | } 195 | 196 | }; 197 | 198 | 199 | }; // namespace rdmaio 200 | -------------------------------------------------------------------------------- /qp.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "qp_impl.hpp" // hide the implementation 5 | 6 | namespace rdmaio { 7 | 8 | /** 9 | * The QP managed by RLib is identified by the QPIdx 10 | * Basically it identifies which worker(thread) is using the QP, 11 | * and which machine this QP is connected to. 12 | */ 13 | typedef struct { 14 | int node_id; // the node this QP connects to 15 | int worker_id; // the thread/task this QP belongs to 16 | int index; // multiple QPs may be needed to connect to the node 17 | } QPIdx; 18 | 19 | // some macros for easily computing QP idx, since some fields use default values 20 | constexpr QPIdx create_rc_idx(int nid,int wid) { 21 | return QPIdx { 22 | .node_id = nid, 23 | .worker_id = wid, 24 | .index = 0 25 | }; 26 | } 27 | 28 | constexpr QPIdx create_ud_idx(int worker_id,int idx = 0) { 29 | return QPIdx { 30 | .node_id = 0, // a UD qp can connect to multiple machines 31 | .worker_id = worker_id, 32 | .index = idx 33 | }; 34 | } 35 | 36 | /** 37 | * Wrappers over ibv_qp & ibv_cq 38 | * For easy use and connection setup 39 | */ 40 | class QP { 41 | public: 42 | QP(RNicHandler *rnic,QPIdx idx): 43 | idx_(idx), 44 | rnic_(rnic) 45 | { 46 | } 47 | 48 | ~QP() { 49 | if(qp_ != nullptr) 50 | ibv_destroy_qp(qp_); 51 | if(cq_ != nullptr) 52 | ibv_destroy_cq(cq_); 53 | } 54 | /** 55 | * Connect to remote QP 56 | * Note, we leverage TCP for a pre-connect phase. 57 | * So the IP/Hostname and a TCP port must be given.
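* (The remote RdmaCtrl listens on that TCP port and replies with the peer QP's attributes.)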
 58 | * 59 | * WARNING: 60 | * This function actually contains two steps, connect + change QP status 61 | * maybe split to connect + change status for more flexibility? 62 | */ 63 | /** 64 | * connect to the specific QP at remote, specified by the nid and wid 65 | * return SUCC if the QP is ready. 66 | * return TIMEOUT if there is network error. 67 | * return NOT_READY if remote server fails to find the connected QP 68 | */ 69 | virtual ConnStatus connect(std::string ip,int port,QPIdx idx) = 0; 70 | 71 | // return when a completion event arrives 72 | // this call will block until the timeout expires 73 | virtual ConnStatus poll_till_completion(ibv_wc &wc, struct timeval timeout = default_timeout) { 74 | return QPImpl::poll_till_completion(cq_,wc,timeout); 75 | } 76 | 77 | void bind_local_mr(MemoryAttr attr) { 78 | local_mr_ = attr; 79 | } 80 | 81 | QPAttr get_attr() const { 82 | QPAttr res = { 83 | .addr = rnic_->query_addr(), 84 | .lid = rnic_->lid, 85 | .qpn = (qp_ != nullptr)?qp_->qp_num:0, 86 | .psn = DEFAULT_PSN, // TODO! this may be filled later 87 | .node_id = 0, // a place holder 88 | .port_id = rnic_->port_id 89 | }; 90 | return res; 91 | } 92 | 93 | /** 94 | * Get remote MR attribute 95 | */ 96 | static ConnStatus get_remote_mr(std::string ip,int port,int mr_id,MemoryAttr *attr) { 97 | return QPImpl::get_remote_mr(ip,port,mr_id,attr); 98 | } 99 | 100 | // QP identifiers 101 | const QPIdx idx_; 102 | 103 | public: 104 | // internal verbs structure 105 | struct ibv_qp *qp_ = NULL; 106 | struct ibv_cq *cq_ = NULL; 107 | 108 | // local MR used to post reqs 109 | MemoryAttr local_mr_; 110 | RNicHandler *rnic_; 111 | 112 | protected: 113 | ConnStatus get_remote_helper(ConnArg *arg, ConnReply *reply,std::string ip,int port) { 114 | return QPImpl::get_remote_helper(arg,reply,ip,port); 115 | } 116 | }; 117 | 118 | inline constexpr RCConfig default_rc_config() { 119 | return RCConfig { 120 | .access_flags = (IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC), 121 | .max_rd_atomic = 16, 122 | .max_dest_rd_atomic = 16, 123 | .rq_psn = DEFAULT_PSN, 124 | .sq_psn = DEFAULT_PSN, 125 | .timeout = 20 126 | }; 127 | } 128 | 129 | /** 130 | * Raw RC QP 131 | */ 132 | template <RCConfig (*F)(void) = default_rc_config> 133 | class RRCQP : public QP { 134 | public: 135 | RRCQP(RNicHandler *rnic,QPIdx idx, 136 | MemoryAttr local_mr,MemoryAttr remote_mr) 137 | :RRCQP(rnic,idx) { 138 | bind_local_mr(local_mr); 139 | bind_remote_mr(remote_mr); 140 | } 141 | 142 | RRCQP(RNicHandler *rnic,QPIdx idx,MemoryAttr local_mr) 143 | :RRCQP(rnic,idx) { 144 | bind_local_mr(local_mr); 145 | } 146 | 147 | RRCQP(RNicHandler *rnic,QPIdx idx) 148 | :QP(rnic,idx) 149 | { 150 | RCQPImpl::init<F>(qp_,cq_,rnic_); 151 | } 152 | 153 | ConnStatus connect(std::string ip,int port) { 154 | return connect(ip,port,idx_); 155 | } 156 | 157 | ConnStatus connect(std::string ip,int port,QPIdx idx) { 158 | 159 | // first check whether QP is valid to connect 160 | enum ibv_qp_state state; 161 | if( (state = QPImpl::query_qp_status(qp_)) != IBV_QPS_INIT) { 162 | if(state != IBV_QPS_RTS) 163 | RDMA_LOG(WARNING) << "qp not in a correct state to connect!"; 164 | return (state == IBV_QPS_RTS)?SUCC:UNKNOWN; 165 | } 166 | ConnArg arg = {} ; ConnReply reply = {}; 167 | arg.type = ConnArg::QP; 168 | arg.payload.qp.from_node = idx.node_id; 169 | arg.payload.qp.from_worker = idx.worker_id; 170 | arg.payload.qp.qp_type = IBV_QPT_RC; 171 | 172 | auto ret = QPImpl::get_remote_helper(&arg,&reply,ip,port); 173 | if(ret == SUCC) { 174 | // change QP status 175 |
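// an RC QP must traverse INIT -> RTR (ready to receive) -> RTS (ready to send); the two calls below perform the last two transitions using the attributes returned by the remote side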
if(!RCQPImpl::ready2rcv<F>(qp_,reply.payload.qp,rnic_)) { 176 | RDMA_LOG(WARNING) << "change qp status to ready to receive error: " << strerror(errno); 177 | ret = ERR; 178 | goto CONN_END; 179 | } 180 | 181 | if(!RCQPImpl::ready2send<F>(qp_)) { 182 | RDMA_LOG(WARNING) << "change qp status to ready to send error: " << strerror(errno); 183 | ret = ERR; 184 | goto CONN_END; 185 | } 186 | } 187 | CONN_END: 188 | return ret; 189 | } 190 | 191 | /** 192 | * Bind this QP's operation to a remote memory region according to the MemoryAttr. 193 | * Since one QP usually accesses *one memory region* almost all the time, 194 | * it is more convenient to use a bind-post;bind-post-post fashion. 195 | */ 196 | void bind_remote_mr(MemoryAttr attr) { 197 | remote_mr_ = attr; 198 | } 199 | 200 | ConnStatus post_send_to_mr(MemoryAttr &local_mr,MemoryAttr &remote_mr, 201 | ibv_wr_opcode op,char *local_buf,uint32_t len,uint64_t off,int flags, 202 | uint64_t wr_id = 0, uint32_t imm = 0) { 203 | ConnStatus ret = SUCC; 204 | struct ibv_send_wr *bad_sr; 205 | 206 | // setting the SGE 207 | struct ibv_sge sge { 208 | .addr = (uint64_t)local_buf, 209 | .length = len, 210 | .lkey = local_mr.key 211 | }; 212 | 213 | // setting sr, sr has to be initialized in this style 214 | struct ibv_send_wr sr; 215 | sr.wr_id = wr_id; 216 | sr.opcode = op; 217 | sr.num_sge = 1; 218 | sr.next = NULL; 219 | sr.sg_list = &sge; 220 | sr.send_flags = flags; 221 | sr.imm_data = imm; 222 | 223 | sr.wr.rdma.remote_addr = remote_mr.buf + off; 224 | sr.wr.rdma.rkey = remote_mr.key; 225 | 226 | auto rc = ibv_post_send(qp_,&sr,&bad_sr); 227 | return rc == 0 ? SUCC : ERR; 228 | } 229 | 230 | /** 231 | * Post request(s) to the sending QP. 232 | * This is just a wrapper of ibv_post_send 233 | */ 234 | ConnStatus post_send(ibv_wr_opcode op,char *local_buf,uint32_t len,uint64_t off,int flags, 235 | uint64_t wr_id = 0, uint32_t imm = 0) { 236 | return post_send_to_mr(local_mr_,remote_mr_,op,local_buf,len,off,flags,wr_id,imm); 237 | } 238 | 239 | // one-sided atomic operations 240 | ConnStatus post_cas(char *local_buf,uint64_t off, 241 | uint64_t compare,uint64_t swap,int flags,uint64_t wr_id = 0) { 242 | return post_atomic<IBV_WR_ATOMIC_CMP_AND_SWP>(local_buf,off,compare,swap,flags,wr_id); 243 | } 244 | 245 | // one-sided fetch and add 246 | ConnStatus post_faa(char *local_buf,uint64_t off,uint64_t add_value,int flags,uint64_t wr_id = 0) { 247 | return post_atomic<IBV_WR_ATOMIC_FETCH_AND_ADD>(local_buf,off,add_value,0 /* no swap value is needed*/,flags,wr_id); 248 | } 249 | 250 | template <ibv_wr_opcode type> 251 | ConnStatus post_atomic(char *local_buf,uint64_t off, 252 | uint64_t compare,uint64_t swap,int flags,uint64_t wr_id = 0) { 253 | static_assert(type == IBV_WR_ATOMIC_CMP_AND_SWP || type == IBV_WR_ATOMIC_FETCH_AND_ADD, 254 | "only two atomic operations are currently supported."); 255 | 256 | // check if address (off) is 8-byte aligned 257 | if((off & 0x7) != 0) { 258 | return WRONG_ARG; 259 | } 260 | 261 | ConnStatus ret = SUCC; 262 | struct ibv_send_wr *bad_sr; 263 | 264 | // setting the SGE 265 | struct ibv_sge sge { 266 | .addr = (uint64_t)local_buf, 267 | .length = sizeof(uint64_t), // atomic only supports 8-byte operation 268 | .lkey = local_mr_.key 269 | }; 270 | 271 | struct ibv_send_wr sr; 272 | sr.wr_id = wr_id; 273 | sr.opcode = type; 274 | sr.num_sge = 1; 275 | sr.next = NULL; 276 | sr.sg_list = &sge; 277 | sr.send_flags = flags; 278 | // remote memory 279 | sr.wr.atomic.rkey = remote_mr_.key; 280 | sr.wr.atomic.remote_addr = (off + remote_mr_.buf); 281 | sr.wr.atomic.compare_add = compare; 282 | sr.wr.atomic.swap =
swap; 283 | 284 | auto rc = ibv_post_send(qp_,&sr,&bad_sr); 285 | return rc == 0 ? SUCC : ERR; 286 | } 287 | 288 | ConnStatus post_batch(struct ibv_send_wr *send_sr,ibv_send_wr **bad_sr_addr,int num = 0) { 289 | auto rc = ibv_post_send(qp_,send_sr,bad_sr_addr); 290 | return rc == 0 ? SUCC : ERR; 291 | } 292 | 293 | /** 294 | * Poll completions. These are just wrappers of ibv_poll_cq 295 | */ 296 | int poll_send_completion(ibv_wc &wc) { 297 | return ibv_poll_cq(cq_,1,&wc); 298 | } 299 | 300 | ConnStatus poll_till_completion(ibv_wc &wc,struct timeval timeout = default_timeout) { 301 | auto ret = QP::poll_till_completion(wc,timeout); 302 | if(ret == SUCC) { 303 | low_watermark_ = high_watermark_; 304 | } 305 | return ret; 306 | } 307 | 308 | /** 309 | * Used to count pending reqs 310 | * XD: currently we use 64 as the default, but it is rather application-defined, 311 | * related to how the QP's send queue is created, etc 312 | */ 313 | bool need_poll(int threshold = (RCQPImpl::RC_MAX_SEND_SIZE / 2)) { 314 | return (high_watermark_ - low_watermark_) >= threshold; 315 | } 316 | 317 | uint64_t high_watermark_ = 0; 318 | uint64_t low_watermark_ = 0; 319 | 320 | MemoryAttr remote_mr_; 321 | }; 322 | 323 | inline constexpr UDConfig default_ud_config() { 324 | return UDConfig { 325 | .max_send_size = UDQPImpl::MAX_SEND_SIZE, 326 | .max_recv_size = UDQPImpl::MAX_RECV_SIZE, 327 | .qkey = DEFAULT_QKEY, 328 | .psn = DEFAULT_PSN 329 | }; 330 | } 331 | 332 | /** 333 | * Raw UD QP 334 | */ 335 | template <UDConfig (*F)(void) = default_ud_config> 336 | class RUDQP : public QP { 337 | // the QKEY is used to identify UD QP requests 338 | static const int DEFAULT_QKEY = 0xdeadbeaf; 339 | public: 340 | RUDQP(RNicHandler *rnic,QPIdx idx,MemoryAttr local_mr) 341 | :RUDQP(rnic,idx) { 342 | bind_local_mr(local_mr); 343 | } 344 | 345 | RUDQP(RNicHandler *rnic,QPIdx idx) 346 | :QP(rnic,idx) { 347 | UDQPImpl::init<F>(qp_,cq_,recv_cq_,rnic_); 348 | std::fill_n(ahs_,MAX_SERVER_NUM,nullptr); 349 | } 350 | 351 | bool queue_empty() { 352 | return pendings == 0; 353 | } 354 | 355 | bool need_poll(int threshold = UDQPImpl::MAX_SEND_SIZE / 2) { 356 | return pendings >= threshold; 357 | } 358 | 359 | /** 360 | * Simple wrapper to expose underlying QP structures 361 | */ 362 | inline __attribute__ ((always_inline)) 363 | ibv_cq *recv_queue() { 364 | return recv_cq_; 365 | } 366 | 367 | inline __attribute__ ((always_inline)) 368 | ibv_qp *send_qp() { 369 | return qp_; 370 | } 371 | 372 | ConnStatus connect(std::string ip,int port) { 373 | // a UD QP is not bound to one machine, so use idx to index 374 | return connect(ip,port,idx_); 375 | } 376 | 377 | ConnStatus connect(std::string ip,int port,QPIdx idx) { 378 | 379 | ConnArg arg; ConnReply reply; 380 | arg.type = ConnArg::QP; 381 | arg.payload.qp.from_node = idx.worker_id; 382 | arg.payload.qp.from_worker = idx.index; 383 | arg.payload.qp.qp_type = IBV_QPT_UD; 384 | 385 | auto ret = QPImpl::get_remote_helper(&arg,&reply,ip,port); 386 | 387 | if(ret == SUCC) { 388 | // create the ah, and store the address handler 389 | auto ah = UDQPImpl::create_ah(rnic_,reply.payload.qp); 390 | if(ah == nullptr) { 391 | RDMA_LOG(WARNING) << "create address handler error: " << strerror(errno); 392 | ret = ERR; 393 | } else { 394 | ahs_[reply.payload.qp.node_id] = ah; 395 | attrs_[reply.payload.qp.node_id] = reply.payload.qp; 396 | } 397 | } 398 | CONN_END: 399 | return ret; 400 | } 401 | 402 | /** 403 | * whether this UD QP has posted receives 404 | * a UD QP should first have receives posted; then it can be connected with others 405 | */ 406 |
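// i.e., call set_ready() only after receive buffers have been posted to this QP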
bool ready() { 407 | return ready_; 408 | } 409 | 410 | void set_ready() { 411 | ready_ = true; 412 | } 413 | 414 | friend class UDAdapter; 415 | private: 416 | /** 417 | * FIXME: currently we have limited servers, so we use an array. 418 | * using a map will affect the performance in microbenchmarks. 419 | * remove it, and merge this in UDAdapter? 420 | */ 421 | struct ibv_ah *ahs_[MAX_SERVER_NUM]; 422 | struct QPAttr attrs_[MAX_SERVER_NUM]; 423 | 424 | // current outstanding requests which have not been polled 425 | int pendings = 0; 426 | 427 | struct ibv_cq *recv_cq_ = NULL; 428 | bool ready_ = false; 429 | }; 430 | 431 | }; // end namespace rdmaio 432 | -------------------------------------------------------------------------------- /qp_impl.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <limits> 4 | 5 | #include "pre_connector.hpp" 6 | 7 | namespace rdmaio { 8 | 9 | const int MAX_INLINE_SIZE = 64; 10 | 11 | /** 12 | * These are magic numbers, serving as the keys / identifications 13 | * Currently we do not allow user-defined keys, but they can be simply added 14 | */ 15 | const uint32_t DEFAULT_QKEY = 0x111111; 16 | const uint32_t DEFAULT_PSN = 3185; 17 | 18 | /** 19 | * QP encoder, provides a default naming to identify QPs 20 | */ 21 | enum { 22 | RC_ID_BASE = 0, 23 | UC_ID_BASE = 10000, 24 | UD_ID_BASE = 20000 25 | }; 26 | 27 | inline constexpr uint32_t index_mask() { 28 | return 0xffff; 29 | } 30 | 31 | inline uint32_t mac_mask() { 32 | return ::rdmaio::index_mask() << 16; 33 | } 34 | 35 | inline uint32_t encode_qp_id(int m,int idx) { 36 | return static_cast<uint32_t>(static_cast<uint32_t>(m) << 16) | static_cast<uint32_t>(idx); 37 | } 38 | 39 | inline uint32_t decode_qp_mac(uint32_t key) { 40 | return (key & ::rdmaio::mac_mask()) >> 16; 41 | } 42 | 43 | inline uint32_t decode_qp_index(uint32_t key) { 44 | return key & ::rdmaio::index_mask(); 45 | } 46 | 47 | class QPImpl { 48 | public: 49 | QPImpl() = default; 50 | ~QPImpl() = default; 51 | 52 | static enum ibv_qp_state query_qp_status(ibv_qp *qp) { 53 | struct ibv_qp_attr attr; 54 | struct ibv_qp_init_attr init_attr; 55 | 56 | if (ibv_query_qp(qp, &attr,IBV_QP_STATE, &init_attr)) { 57 | RDMA_ASSERT(false) << "query qp should not fail"; 58 | } 59 | return attr.qp_state; 60 | } 61 | 62 | static ConnStatus get_remote_helper(ConnArg *arg, ConnReply *reply,std::string ip,int port) { 63 | 64 | ConnStatus ret = SUCC; 65 | 66 | auto socket = PreConnector::get_send_socket(ip,port); 67 | if(socket < 0) { 68 | return ERR; 69 | } 70 | 71 | auto n = send(socket,(char *)(arg),sizeof(ConnArg),0); 72 | 73 | if(n != sizeof(ConnArg)) { 74 | ret = ERR; goto CONN_END; 75 | } 76 | 77 | // receive reply 78 | if(!PreConnector::wait_recv(socket,10000)) { 79 | ret = TIMEOUT; goto CONN_END; 80 | } 81 | 82 | n = recv(socket,(char *)((reply)), sizeof(ConnReply), MSG_WAITALL); 83 | if(n != sizeof(ConnReply)) { 84 | ret = ERR; goto CONN_END; 85 | } 86 | if(reply->ack != SUCC) { 87 | ret = NOT_READY; goto CONN_END; 88 | } 89 | CONN_END: 90 | shutdown(socket,SHUT_RDWR); 91 | close(socket); 92 | return ret; 93 | } 94 | 95 | static ConnStatus get_remote_mr(std::string ip,int port,int mr_id,MemoryAttr *attr) { 96 | 97 | ConnArg arg; ConnReply reply; 98 | arg.type = ConnArg::MR; 99 | arg.payload.mr.mr_id = mr_id; 100 | 101 | auto ret = get_remote_helper(&arg,&reply,ip,port); 102 | if(ret == SUCC) { 103 | attr->key = reply.payload.mr.key; 104 | attr->buf = reply.payload.mr.buf; 105 | } 106 | return ret; 107 | } 108 | 109 | static ConnStatus
poll_till_completion(ibv_cq *cq,ibv_wc &wc, struct timeval timeout) { 110 | 111 | struct timeval start_time; gettimeofday (&start_time, nullptr); 112 | int poll_result = 0; int64_t diff; 113 | int64_t numeric_timeout = (timeout.tv_sec == 0 && timeout.tv_usec == 0) ? std::numeric_limits<int64_t>::max() : 114 | timeout.tv_sec * 1000000 + timeout.tv_usec; 115 | do { 116 | asm volatile("" ::: "memory"); 117 | poll_result = ibv_poll_cq (cq, 1, &wc); 118 | 119 | struct timeval cur_time; gettimeofday(&cur_time,nullptr); 120 | diff = diff_time(cur_time,start_time); 121 | } while((poll_result == 0) && (diff <= numeric_timeout)); 122 | 123 | if(poll_result == 0) { 124 | return TIMEOUT; 125 | } 126 | 127 | if(poll_result < 0) { 128 | RDMA_ASSERT(false); 129 | return ERR; 130 | } 131 | RDMA_LOG_IF(WARNING,wc.status != IBV_WC_SUCCESS) << 132 | "poll till completion error: " << wc.status << " " << ibv_wc_status_str(wc.status); 133 | return wc.status == IBV_WC_SUCCESS ? SUCC : ERR; 134 | } 135 | }; 136 | 137 | class RCQPImpl { 138 | public: 139 | RCQPImpl() = default; 140 | ~RCQPImpl() = default; 141 | 142 | static const int RC_MAX_SEND_SIZE = 128; 143 | static const int RC_MAX_RECV_SIZE = 512; 144 | 145 | template <RCConfig (*F)(void) = default_rc_config> 146 | static void ready2init(ibv_qp *qp,RNicHandler *rnic) { 147 | 148 | auto config = F(); 149 | 150 | struct ibv_qp_attr qp_attr = {}; 151 | qp_attr.qp_state = IBV_QPS_INIT; 152 | qp_attr.pkey_index = 0; 153 | qp_attr.port_num = rnic->port_id; 154 | qp_attr.qp_access_flags = config.access_flags; 155 | 156 | int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; 157 | int rc = ibv_modify_qp(qp, &qp_attr,flags); 158 | RDMA_VERIFY(WARNING,rc == 0) << "Failed to modify RC to INIT state: " << strerror(errno); 159 | 160 | if(rc != 0) { 161 | // error handling 162 | RDMA_LOG(WARNING) << " change state to init failed. "; 163 | } 164 | } 165 | 166 | template <RCConfig (*F)(void) = default_rc_config> 167 | static bool ready2rcv(ibv_qp *qp,QPAttr &attr,RNicHandler *rnic) { 168 | 169 | auto config = F(); 170 | 171 | struct ibv_qp_attr qp_attr = {}; 172 | 173 | qp_attr.qp_state = IBV_QPS_RTR; 174 | qp_attr.path_mtu = IBV_MTU_4096; 175 | qp_attr.dest_qp_num = attr.qpn; 176 | qp_attr.rq_psn = config.rq_psn; // should this match the sender's psn ? 177 | qp_attr.max_dest_rd_atomic = config.max_dest_rd_atomic; 178 | qp_attr.min_rnr_timer = 20; 179 | 180 | qp_attr.ah_attr.dlid = attr.lid; 181 | qp_attr.ah_attr.sl = 0; 182 | qp_attr.ah_attr.src_path_bits = 0; 183 | qp_attr.ah_attr.port_num = rnic->port_id; /* Local port!
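The GRH fields below (dgid, sgid_index, hop_limit) describe the global route to the remote NIC.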
 */ 184 | 185 | qp_attr.ah_attr.is_global = 1; 186 | qp_attr.ah_attr.grh.dgid.global.subnet_prefix = attr.addr.subnet_prefix; 187 | qp_attr.ah_attr.grh.dgid.global.interface_id = attr.addr.interface_id; 188 | qp_attr.ah_attr.grh.sgid_index = 0; 189 | qp_attr.ah_attr.grh.flow_label = 0; 190 | qp_attr.ah_attr.grh.hop_limit = 255; 191 | 192 | int flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN 193 | | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; 194 | auto rc = ibv_modify_qp(qp, &qp_attr,flags); 195 | return rc == 0; 196 | 197 | } 198 | 199 | template <RCConfig (*F)(void) = default_rc_config> 200 | static bool ready2send(ibv_qp *qp) { 201 | 202 | auto config = F(); 203 | 204 | int rc, flags; 205 | struct ibv_qp_attr qp_attr = {}; 206 | 207 | qp_attr.qp_state = IBV_QPS_RTS; 208 | qp_attr.sq_psn = config.sq_psn; 209 | qp_attr.timeout = config.timeout; 210 | qp_attr.retry_cnt = 7; 211 | qp_attr.rnr_retry = 7; 212 | qp_attr.max_rd_atomic = config.max_rd_atomic; 213 | qp_attr.max_dest_rd_atomic = config.max_dest_rd_atomic; 214 | 215 | flags = IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | 216 | IBV_QP_MAX_QP_RD_ATOMIC; 217 | rc = ibv_modify_qp(qp, &qp_attr,flags); 218 | return rc == 0; 219 | } 220 | 221 | template <RCConfig (*F)(void) = default_rc_config> 222 | static void init(ibv_qp *&qp,ibv_cq *&cq,RNicHandler *rnic) { 223 | 224 | // create the CQ 225 | cq = ibv_create_cq(rnic->ctx, RC_MAX_SEND_SIZE, nullptr, nullptr, 0); 226 | RDMA_VERIFY(WARNING,cq != nullptr) << "create cq error: " << strerror(errno); 227 | 228 | // create the QP 229 | struct ibv_qp_init_attr qp_init_attr = {}; 230 | 231 | qp_init_attr.send_cq = cq; 232 | qp_init_attr.recv_cq = cq; // TODO, need separate handling for two-sided over RC QP 233 | qp_init_attr.qp_type = IBV_QPT_RC; 234 | 235 | qp_init_attr.cap.max_send_wr = RC_MAX_SEND_SIZE; 236 | qp_init_attr.cap.max_recv_wr = RC_MAX_RECV_SIZE; /* Can be set to 1, if RC Two-sided is not required */ 237 | qp_init_attr.cap.max_send_sge = 1; 238 | qp_init_attr.cap.max_recv_sge = 1; 239 | qp_init_attr.cap.max_inline_data = MAX_INLINE_SIZE; 240 | 241 | qp = ibv_create_qp(rnic->pd, &qp_init_attr); 242 | RDMA_VERIFY(WARNING,qp != nullptr); 243 | 244 | if(qp) 245 | ready2init<F>(qp,rnic); 246 | } 247 | }; 248 | 249 | class UDQPImpl { 250 | public: 251 | UDQPImpl() = default; 252 | ~UDQPImpl() = default; 253 | 254 | static const int MAX_SEND_SIZE = 128; 255 | static const int MAX_RECV_SIZE = 2048; 256 | 257 | template <UDConfig (*F)(void) = default_ud_config> 258 | static void init(ibv_qp *&qp,ibv_cq *&cq,ibv_cq *&recv_cq,RNicHandler *rnic) { 259 | 260 | auto config = F(); // generate the config 261 | RDMA_ASSERT(config.max_send_size <= MAX_SEND_SIZE); 262 | RDMA_ASSERT(config.max_recv_size <= MAX_RECV_SIZE); 263 | 264 | if(qp != nullptr) 265 | return; 266 | 267 | if((cq = ibv_create_cq(rnic->ctx, config.max_send_size, nullptr, nullptr, 0)) == nullptr) { 268 | RDMA_LOG(ERROR) << "create send cq for UD QP error: " << strerror(errno); 269 | return; 270 | } 271 | 272 | if((recv_cq = ibv_create_cq(rnic->ctx, config.max_recv_size, nullptr, nullptr, 0)) == nullptr) { 273 | RDMA_LOG(ERROR) << "create recv cq for UD QP error: " << strerror(errno); 274 | return; 275 | } 276 | 277 | /* Initialize creation attributes */ 278 | struct ibv_qp_init_attr qp_init_attr = {}; 279 | qp_init_attr.send_cq = cq; 280 | qp_init_attr.recv_cq = recv_cq; 281 | qp_init_attr.qp_type = IBV_QPT_UD; 282 | 283 | qp_init_attr.cap.max_send_wr = config.max_send_size; 284 | qp_init_attr.cap.max_recv_wr = config.max_recv_size; 285 | qp_init_attr.cap.max_send_sge = 1; 286 |
qp_init_attr.cap.max_recv_sge = 1; 287 | qp_init_attr.cap.max_inline_data = MAX_INLINE_SIZE; 288 | 289 | if((qp = ibv_create_qp(rnic->pd, &qp_init_attr)) == nullptr) { 290 | RDMA_LOG(ERROR) << "create send qp for UD QP error: " << strerror(errno); 291 | return; 292 | } 293 | 294 | // change QP status 295 | ready2init(qp, rnic,config); // shall always succeed 296 | 297 | if(!ready2rcv(qp,rnic)) { 298 | RDMA_LOG(WARNING) << "change ud qp to ready to recv error: " << strerror(errno); 299 | } 300 | if(!ready2send(qp,config)) { 301 | RDMA_LOG(WARNING) << "change ud qp to ready to send error: " << strerror(errno); 302 | } 303 | } 304 | 305 | /** 306 | * Unlike RC, whose status changes happen at different places, so F, the function which generates configurations, 307 | * is passed as a template. UD QPs, on the other hand, change status all at once, so it is more convenient 308 | * to pass the configuration generated by F directly to the functions. 309 | */ 310 | static void ready2init(ibv_qp *qp,RNicHandler *rnic,UDConfig &config) { 311 | 312 | int rc, flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY; 313 | struct ibv_qp_attr qp_attr = {}; 314 | qp_attr.qp_state = IBV_QPS_INIT; 315 | qp_attr.pkey_index = 0; 316 | qp_attr.port_num = rnic->port_id; 317 | qp_attr.qkey = config.qkey; 318 | 319 | if((rc = ibv_modify_qp(qp, &qp_attr, flags)) != 0) { 320 | RDMA_LOG(WARNING) << "modify ud qp to init error: " << strerror(errno); 321 | } 322 | } 323 | 324 | static bool ready2rcv(ibv_qp *qp,RNicHandler *rnic) { 325 | 326 | int rc, flags = IBV_QP_STATE; 327 | struct ibv_qp_attr qp_attr = {}; 328 | qp_attr.qp_state = IBV_QPS_RTR; 329 | 330 | rc = ibv_modify_qp(qp, &qp_attr, flags); 331 | return rc == 0; 332 | } 333 | 334 | static bool ready2send(ibv_qp *qp,UDConfig &config) { 335 | 336 | int rc, flags = 0; 337 | struct ibv_qp_attr qp_attr = {}; 338 | qp_attr.qp_state = IBV_QPS_RTS; 339 | qp_attr.sq_psn = config.psn; 340 | 341 | flags = IBV_QP_STATE | IBV_QP_SQ_PSN; 342 | rc = ibv_modify_qp(qp, &qp_attr, flags); 343 | return rc == 0; 344 | } 345 | 346 | static ibv_ah *create_ah(RNicHandler *rnic,QPAttr &attr) { 347 | 348 | struct ibv_ah_attr ah_attr; 349 | ah_attr.is_global = 1; 350 | ah_attr.dlid = attr.lid; 351 | ah_attr.sl = 0; 352 | ah_attr.src_path_bits = 0; 353 | ah_attr.port_num = attr.port_id; 354 | 355 | ah_attr.grh.dgid.global.subnet_prefix = attr.addr.subnet_prefix; 356 | ah_attr.grh.dgid.global.interface_id = attr.addr.interface_id; 357 | ah_attr.grh.flow_label = 0; 358 | ah_attr.grh.hop_limit = 255; 359 | ah_attr.grh.sgid_index = rnic->gid; 360 | 361 | return ibv_create_ah(rnic->pd, &ah_attr); 362 | } 363 | 364 | }; 365 | 366 | } // namespace rdmaio 367 | -------------------------------------------------------------------------------- /ralloc/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS += -O3 -Wstrict-prototypes -fomit-frame-pointer -g -Wall 2 | #CFLAGS += -O3 -Wstrict-prototypes -fomit-frame-pointer -g -Wall 3 | 4 | CURDIR=${PWD} 5 | libdir = ${shell readlink -m $(CURDIR)/../lib } 6 | 7 | #libdir = /usr/lib 8 | #libdir = ../lib 9 | LDFLAGS += -lpthread -rpath $(libdir) -version-info 1 10 | CC = gcc 11 | CXX = g++ 12 | LD = ld 13 | ARCH = $(shell uname -m) 14 | 15 | # Check Architecture 16 | SUPPORTED_ARCH = NO 17 | 18 | ifeq ($(ARCH), x86_64) 19 | SUPPORTED_ARCH = YES 20 | endif 21 | 22 | ifeq ($(SUPPORTED_ARCH), NO) 23 | $(error Your architecture $(ARCH) is not currently supported. See README.)
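# only x86_64 is supported: the allocator relies on the inline-assembly headers under include-x86_64/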
24 | endif 25 | 26 | define compile_rule 27 | libtool --mode=compile --tag=CC \ 28 | $(CC) $(CFLAGS) $(CPPFLAGS) -Iinclude-$(ARCH) -c $< 29 | endef 30 | 31 | define cxx_compile_rule 32 | libtool --mode=compile --tag=CC \ 33 | $(CXX) $(CFLAGS) $(CPPFLAGS) -Iinclude-$(ARCH) -c $< 34 | endef 35 | 36 | define link_rule 37 | libtool --mode=link --tag=CC \ 38 | $(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS) 39 | endef 40 | 41 | LIBS = libssmalloc.la 42 | libssmalloc_OBJS = ssmalloc.lo new_delete.lo 43 | 44 | %.lo: %.c 45 | $(call compile_rule) 46 | %.lo: %.cpp 47 | $(call cxx_compile_rule) 48 | 49 | all: libssmalloc.la 50 | 51 | libssmalloc.la: $(libssmalloc_OBJS) 52 | $(call link_rule) 53 | cp .libs/libssmalloc.so ./ 54 | cp .libs/libssmalloc.a ./ 55 | 56 | install/%.la: %.la 57 | libtool --mode=install \ 58 | install -c $(notdir $@) $(libdir)/$(notdir $@) 59 | 60 | install: $(addprefix install/,$(LIBS)) 61 | libtool --mode=finish $(libdir) 62 | # mv libssmalloc.so ../lib 63 | # mv libssmalloc.a ../lib 64 | 65 | 66 | clean: 67 | libtool --mode=clean rm *.la *.lo *.a *.so -f 68 | 69 | -------------------------------------------------------------------------------- /ralloc/README: -------------------------------------------------------------------------------- 1 | This is a malloc for memory allocation in the RDMA-registered area. 2 | It is extended from ssmalloc, thus it is as efficient as ssmalloc. 3 | Notice that using this lib will also override the default malloc implementation. 4 | We will fix this later, but we strongly recommend you use this malloc since it is more efficient for multi-threaded programming. 5 | 6 | Current limitation: 7 | If the allocation size is larger than 128K, then this malloc will use huge malloc. The memory allocated by huge malloc will 8 | never be reclaimed, thus the user must manage this him/herself. 9 | Usually we found that very large allocations are never freed throughout the application's lifecycle, so maybe 10 | this is not a problem. 11 | 12 | Install: 13 | make;make install; 14 | And link the libraries generated in the ../lib directory. 15 | 16 | Following is the original README from ssmalloc. 17 | 18 | SSMalloc 19 | ======== 20 | 21 | SSMalloc is a low-latency, locality-conscious memory 22 | allocator with stable performance scalability. 23 | 24 | 25 | Compilation & Install 26 | ===================== 27 | 28 | SSMalloc requires libtool for compilation. 29 | In the SSMalloc directory, type: 30 | 31 | $ make 32 | $ make install 33 | 34 | 35 | Usage 36 | ===== 37 | Use libssmalloc.a for static linking. If you want to 38 | dynamically link SSMalloc into your program, please 39 | set the LD_PRELOAD environment variable as below.
 40 | 41 | $export LD_PRELOAD=(path)/libssmalloc.so 42 | 43 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/atomic.h: -------------------------------------------------------------------------------- 1 | #ifndef __SYNCHRO_ATOMIC_H__ 2 | #define __SYNCHRO_ATOMIC_H__ 3 | 4 | #define mb() asm volatile ("mfence" : : : "memory") 5 | #define LOCK_PREFIX "lock ; " 6 | 7 | static inline unsigned long fetch_and_store(volatile unsigned int *address, unsigned int value) 8 | { 9 | asm volatile("xchgl %k0,%1" 10 | : "=r" (value) 11 | : "m" (*address), "0" (value) 12 | : "memory"); 13 | 14 | return value; 15 | } 16 | 17 | static inline int atmc_fetch_and_add(volatile unsigned int *address, int value) 18 | { 19 | int prev = value; 20 | 21 | asm volatile( 22 | LOCK_PREFIX "xaddl %0, %1" 23 | : "+r" (value), "+m" (*address) 24 | : : "memory"); 25 | 26 | return prev + value; 27 | } 28 | 29 | static inline long long atmc_fetch_and_add64(volatile unsigned long long *address, long long value) 30 | { 31 | long long prev = value; 32 | 33 | asm volatile( 34 | LOCK_PREFIX "xaddq %0, %1" 35 | : "+r" (value), "+m" (*address) 36 | : : "memory"); 37 | 38 | return prev + value; 39 | } 40 | 41 | static inline void atmc_add32(volatile unsigned int* address, int value) 42 | { 43 | asm volatile( 44 | LOCK_PREFIX "addl %1,%0" 45 | : "=m" (*address) 46 | : "ir" (value), "m" (*address)); 47 | } 48 | 49 | static inline void atmc_add64(volatile unsigned long long* address, unsigned long long value) 50 | { 51 | asm volatile( 52 | LOCK_PREFIX "addq %1,%0" 53 | : "=m" (*address) 54 | : "ir" (value), "m" (*address)); 55 | } 56 | 57 | static inline unsigned int compare_and_swap32(volatile unsigned int *address, unsigned int old_value, unsigned int new_value) 58 | { 59 | unsigned long prev = 0; 60 | 61 | asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2" 62 | : "=a"(prev) 63 | : "r"(new_value), "m"(*address), "0"(old_value) 64 | : "memory"); 65 | 66 | return prev == old_value; 67 | } 68 | 69 | static inline unsigned int compare_and_swap32_value(volatile unsigned int *address, unsigned int old_value, unsigned int new_value) 70 | { 71 | unsigned long prev = 0; 72 | 73 | asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2" 74 | : "=a"(prev) 75 | : "r"(new_value), "m"(*address), "0"(old_value) 76 | : "memory"); 77 | 78 | return prev; 79 | } 80 | 81 | 82 | static inline unsigned int compare_and_swap64(volatile unsigned long long *address, unsigned long old_value, unsigned long new_value) 83 | { 84 | unsigned long prev = 0; 85 | 86 | asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" 87 | : "=a"(prev) 88 | : "r"(new_value), "m"(*address), "0"(old_value) 89 | : "memory"); 90 | 91 | return prev == old_value; 92 | } 93 | 94 | static inline unsigned long compare_and_swap64_value(volatile unsigned long long *address, unsigned long old_value, unsigned long new_value) 95 | { 96 | unsigned long prev = 0; 97 | 98 | asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" 99 | : "=a"(prev) 100 | : "r"(new_value), "m"(*address), "0"(old_value) 101 | : "memory"); 102 | 103 | return prev; 104 | } 105 | 106 | static inline unsigned long compare_and_swap_ptr(volatile void *address, void* old_ptr, void* new_ptr) 107 | { 108 | return compare_and_swap64((volatile unsigned long long *)address, (unsigned long)old_ptr, (unsigned long)new_ptr); 109 | } 110 | 111 | #endif 112 | 113 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/bitops.h:
-------------------------------------------------------------------------------- 1 | #ifndef __X86_64_BITOPS_H_ 2 | #define __X86_64_BITOPS_H_ 3 | 4 | /* 5 | * Copyright 1992, Linus Torvalds. 6 | */ 7 | 8 | #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) 9 | /* Technically wrong, but this avoids compilation errors on some gcc 10 | versions. */ 11 | #define ADDR "=m" (*(volatile long *) addr) 12 | #else 13 | #define ADDR "+m" (*(volatile long *) addr) 14 | #endif 15 | 16 | /** 17 | * __change_bit - Toggle a bit in memory 18 | * @nr: the bit to change 19 | * @addr: the address to start counting from 20 | * 21 | * Unlike change_bit(), this function is non-atomic and may be reordered. 22 | * If it's called on the same region of memory simultaneously, the effect 23 | * may be that only one operation succeeds. 24 | */ 25 | static __inline__ void __change_bit(int nr, volatile void * addr) 26 | { 27 | __asm__ __volatile__( 28 | "btcl %1,%0" 29 | :ADDR 30 | :"dIr" (nr)); 31 | } 32 | 33 | /* WARNING: non atomic and it can be reordered! */ 34 | static __inline__ int __test_and_change_bit(int nr, volatile void * addr) 35 | { 36 | int oldbit; 37 | 38 | __asm__ __volatile__( 39 | "btcl %2,%1\n\tsbbl %0,%0" 40 | :"=r" (oldbit),ADDR 41 | :"dIr" (nr) : "memory"); 42 | return oldbit; 43 | } 44 | 45 | static inline unsigned long __fls(unsigned long word) 46 | { 47 | asm("bsr %1,%0" 48 | : "=r" (word) 49 | : "rm" (word)); 50 | return word; 51 | } 52 | 53 | static __inline__ unsigned int __get_size_class(unsigned int word) { 54 | asm("dec %1\n" 55 | "shr $2,%1\n" 56 | "bsr %1,%0\n" 57 | "cmovz %2,%0\n" 58 | : "=r" (word) 59 | : "rm" (word), "r" (0)); 60 | return word; 61 | } 62 | 63 | #endif /* _X86_64_BITOPS_H */ 64 | 65 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/cpu.h: -------------------------------------------------------------------------------- 1 | #ifndef __CPU_H_ 2 | #define __CPU_H_ 3 | 4 | /* Machine related macros*/ 5 | #define PAGE_SIZE (4096) 6 | #define SUPER_PAGE_SIZE (4*1024*1024) 7 | #define CACHE_LINE_SIZE (64) 8 | #define DEFAULT_BLOCK_CLASS (100) 9 | #define MAX_CORE_ID (8) 10 | 11 | static inline int get_core_id(void) { 12 | return 0; 13 | int result; 14 | __asm__ __volatile__ ( 15 | "mov $1, %%eax\n" 16 | "cpuid\n" 17 | :"=b"(result) 18 | : 19 | :"eax","ecx","edx"); 20 | return (result>>24)%8; 21 | } 22 | 23 | static inline unsigned long read_tsc(void) 24 | { 25 | unsigned a, d; 26 | __asm __volatile("rdtsc":"=a"(a), "=d"(d)); 27 | return ((unsigned long)a) | (((unsigned long) d) << 32); 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/double-list.h: -------------------------------------------------------------------------------- 1 | #ifndef __DOUBLE_LIST_H_ 2 | #define __DOUBLE_LIST_H_ 3 | 4 | typedef struct double_list_elem double_list_elem_t; 5 | typedef struct double_list double_list_t; 6 | 7 | struct double_list_elem { 8 | void* __padding; 9 | struct double_list_elem* next; 10 | struct double_list_elem* prev; 11 | }; 12 | 13 | struct double_list { 14 | struct double_list_elem* head; 15 | struct double_list_elem* tail; 16 | }; 17 | 18 | 19 | /* Places new_node at the front of the list. 
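Runs in O(1). These list helpers are not thread-safe; callers must serialize access.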
*/ 20 | static void double_list_insert_front(void* new_node, double_list_t* list) 21 | { 22 | double_list_elem_t* elem_new = (double_list_elem_t*)new_node; 23 | double_list_elem_t* old_head = list->head; 24 | 25 | if (old_head == NULL) { 26 | list->tail = elem_new; 27 | } 28 | else { 29 | old_head->prev = elem_new; 30 | } 31 | 32 | elem_new->next = old_head; 33 | elem_new->prev = NULL; 34 | list->head = elem_new; 35 | } 36 | 37 | /* Removes node from the list. */ 38 | static void double_list_remove(void* node, double_list_t* list) 39 | { 40 | double_list_elem_t* elem_node = (double_list_elem_t*)node; 41 | 42 | if (elem_node->prev != NULL) { 43 | elem_node->prev->next = elem_node->next; 44 | } 45 | else { 46 | list->head = elem_node->next; 47 | } 48 | 49 | if (elem_node->next != NULL) { 50 | elem_node->next->prev = elem_node->prev; 51 | } 52 | else { 53 | list->tail = elem_node->prev; 54 | } 55 | 56 | if (list->head != NULL && list->head->next == NULL) { 57 | list->tail = list->head; 58 | } 59 | else if (list->tail != NULL && list->tail->prev == NULL) { 60 | list->head = list->tail; 61 | } 62 | } 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/queue.h: -------------------------------------------------------------------------------- 1 | #ifndef __QUEUE_H_ 2 | #define __QUEUE_H_ 3 | 4 | #include "atomic.h" 5 | #include 6 | 7 | #define CACHE_LINE_SIZE (64) 8 | #define CACHE_ALIGN __attribute__ ((aligned (CACHE_LINE_SIZE))) 9 | 10 | typedef unsigned long long ptr_t; 11 | 12 | #define ABA_ADDR_BIT (48) 13 | #define ABA_ADDR_MASK ((1L<head = 0; 33 | } 34 | 35 | static inline void mc_enqueue(queue_head_t *queue, void *element, int next_off) 36 | { 37 | unsigned long long old_head; 38 | unsigned long long new_head; 39 | 40 | while(1) { 41 | old_head = queue->head; 42 | NEXT_NODE(element, next_off) = (ptr_t) ABA_ADDR(old_head); 43 | new_head = (ptr_t)element; 44 | new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE; 45 | if (compare_and_swap64(&queue->head, old_head, new_head)) { 46 | return; 47 | } 48 | } 49 | } 50 | 51 | static inline void *mc_dequeue(queue_head_t *queue, int next_off) 52 | { 53 | unsigned long long old_head; 54 | unsigned long long new_head; 55 | void* old_addr; 56 | 57 | while(1) { 58 | old_head = queue->head; 59 | old_addr = ABA_ADDR(old_head); 60 | if(old_addr == NULL) { 61 | return NULL; 62 | } 63 | new_head = NEXT_NODE(old_addr, next_off); 64 | new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE; 65 | if (compare_and_swap64(&queue->head, old_head, new_head)) { 66 | return old_addr; 67 | } 68 | } 69 | } 70 | 71 | /* Single-Consumer LIFO Queue */ 72 | 73 | static inline void sc_queue_init(queue_head_t *queue) 74 | { 75 | queue->head = 0; 76 | } 77 | 78 | static inline void sc_enqueue(queue_head_t *queue, void *element, int next_off) 79 | { 80 | unsigned long long old_head; 81 | unsigned long long new_head; 82 | 83 | while(1) { 84 | old_head = queue->head; 85 | NEXT_NODE(element, next_off) = old_head; 86 | new_head = (ptr_t)element; 87 | if (compare_and_swap64(&queue->head, old_head, new_head)) { 88 | return; 89 | } 90 | } 91 | } 92 | 93 | static inline void *sc_dequeue(queue_head_t *queue, int next_off) 94 | { 95 | unsigned long long old_head; 96 | unsigned long long new_head; 97 | 98 | while(1) { 99 | old_head = queue->head; 100 | if(old_head == 0) { 101 | return NULL; 102 | } 103 | new_head = NEXT_NODE(old_head, next_off); 104 | if (compare_and_swap64(&queue->head, old_head, new_head)) { 105 | return 
(void*)old_head; 106 | } 107 | } 108 | } 109 | 110 | static inline void *sc_chain_dequeue(queue_head_t *queue) 111 | { 112 | unsigned long long old_head; 113 | while(1) { 114 | old_head = queue->head; 115 | if(old_head == 0) { 116 | return NULL; 117 | } 118 | if (compare_and_swap64(&queue->head, old_head, 0)) { 119 | return (void*)old_head; 120 | } 121 | } 122 | } 123 | 124 | /* Sequential LIFO Queue */ 125 | 126 | static inline void seq_queue_init(seq_queue_head_t *queue) 127 | { 128 | *queue = NULL; 129 | } 130 | 131 | static inline void seq_enqueue(seq_queue_head_t *queue, void *element) 132 | { 133 | *(void**)element = *queue; 134 | *queue = element; 135 | } 136 | 137 | static inline void *seq_dequeue(seq_queue_head_t *queue) 138 | { 139 | void* old_head = *queue; 140 | if(old_head == NULL) { 141 | return NULL; 142 | } 143 | *queue = *(void**)old_head; 144 | return old_head; 145 | } 146 | 147 | #define seq_head(queue) (queue) 148 | 149 | /* Counted Queue */ 150 | static inline void* counted_enqueue(queue_head_t *queue, void* elem) { 151 | unsigned long long old_head, new_head, prev; 152 | do { 153 | old_head = queue->head; 154 | *(ptr_t*)elem = (ptr_t)ABA_ADDR(old_head); 155 | new_head = (ptr_t)elem; 156 | new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE; 157 | 158 | } while((prev=compare_and_swap64_value ( 159 | &queue->head, 160 | old_head, 161 | new_head 162 | ))!=old_head); 163 | 164 | return (void*)prev; 165 | } 166 | 167 | static inline void* counted_chain_enqueue(queue_head_t *queue, void* elems, void* tail, int cnt) { 168 | unsigned long long old_head, new_head, prev; 169 | do { 170 | old_head = queue->head; 171 | *(ptr_t*)tail = (ptr_t)ABA_ADDR(old_head); 172 | new_head = (ptr_t)elems; 173 | new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE * cnt; 174 | 175 | } while((prev=compare_and_swap64_value ( 176 | &queue->head, 177 | old_head, 178 | new_head 179 | ))!=old_head); 180 | 181 | return (void*)prev; 182 | } 183 | 184 | static inline void* counted_chain_dequeue(queue_head_t* queue, uint32_t *count) { 185 | unsigned long long old_head; 186 | while(1) { 187 | old_head = *(ptr_t*)queue; 188 | if (old_head == 0) 189 | return(NULL); 190 | if (compare_and_swap64(&queue->head, old_head, 0)) { 191 | *count = ABA_COUNT(old_head) >> ABA_ADDR_BIT; 192 | return(ABA_ADDR(old_head)); 193 | } 194 | } 195 | } 196 | 197 | #endif 198 | -------------------------------------------------------------------------------- /ralloc/new_delete.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | using namespace std; 4 | 5 | extern "C" { 6 | void* malloc(size_t); 7 | void free(void*); 8 | } 9 | 10 | void* operator new(size_t size) throw (std::bad_alloc) 11 | { 12 | return malloc(size); 13 | } 14 | 15 | void * operator new(size_t size, const std::nothrow_t&) throw() 16 | { 17 | return malloc(size); 18 | } 19 | 20 | void operator delete(void *ptr) 21 | { 22 | free(ptr); 23 | } 24 | 25 | void* operator new[](size_t size) throw (std::bad_alloc) 26 | { 27 | return malloc(size); 28 | } 29 | 30 | void * operator new[](size_t size, const std::nothrow_t&) throw() 31 | { 32 | return malloc(size); 33 | } 34 | 35 | void operator delete[](void *ptr) 36 | { 37 | free(ptr); 38 | } 39 | 40 | -------------------------------------------------------------------------------- /ralloc/ralloc.h: -------------------------------------------------------------------------------- 1 | #ifndef RDMA_MALLOC 2 | #define RDMA_MALLOC 3 | 4 | #include 5 | #include 6 | 7 | /* This file 
provides a malloc-style interface for managing registered RDMA regions. 8 | It shall be linked against the dedicated ssmalloc library, which can be installed 9 | by following the instructions in ../ralloc/README.md. 10 | 11 | Usage: 12 | To manage allocations in an RDMA-registered region, pass the start pointer and the 13 | size to RInit() for initialization. 14 | Each thread must call RThreadLocalInit() before its first allocation. 15 | 16 | Rmalloc and Rfree work the same as standard malloc and free; the addresses they 17 | return lie in the registered memory region. 18 | 19 | Limitation: 20 | We assume there is exactly one RDMA region per machine, which is enough most of the time. 21 | */ 22 | 23 | extern "C" { 24 | /* Initialize the lib with the dedicated memory buffer. Must be called exactly once. 25 | @ret 26 | 0 - An error occurred: the memory region is not large enough. 27 | A size - The actual usable size of the memory region; this is typically less than size for alignment 28 | reasons. 29 | */ 30 | uint64_t RInit(char *buffer, uint64_t size); 31 | /* 32 | Initialize thread-local data structures. 33 | Must be called after RInit and before this thread's first call to Rmalloc or Rfree. 34 | */ 35 | void RThreadLocalInit(void); 36 | void *Rmalloc(size_t __size); 37 | void Rfree(void *__ptr); 38 | } 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /ralloc/ssmalloc.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include "ssmalloc.h" 3 | 4 | /* Global metadata */ 5 | init_state global_state = UNINITIALIZED; 6 | init_state r_global_state = UNINITIALIZED; 7 | gpool_t global_pool; 8 | gpool_t r_global_pool; 9 | char *register_rdma_buffer; 10 | uint64_t register_rdma_buffer_size = 0; 11 | 12 | pthread_key_t destructor; 13 | pthread_key_t r_destructor; 14 | pthread_once_t init_once = PTHREAD_ONCE_INIT; 15 | 16 | /* Mappings used in ssmalloc */ 17 | CACHE_ALIGN int cls2size[128]; 18 | char sizemap[256]; 19 | char sizemap2[128]; 20 | 21 | /* Mappings used in r_malloc */ 22 | CACHE_ALIGN int r_cls2size[128]; 23 | char r_sizemap[256]; 24 | char r_sizemap2[128]; 25 | 26 | 27 | /* Private metadata in ssmalloc */ 28 | THREAD_LOCAL init_state thread_state = UNINITIALIZED; 29 | THREAD_LOCAL lheap_t *local_heap = NULL; 30 | 31 | /* Private metadata for r_malloc */ 32 | THREAD_LOCAL init_state r_thread_state = UNINITIALIZED; 33 | THREAD_LOCAL lheap_t * r_local_heap = NULL; 34 | 35 | /* System init functions */ 36 | static void maps_init(void); 37 | static void thread_init(void); 38 | static void thread_exit(void *dummy); 39 | static void r_thread_exit(void *dummy); 40 | static void global_init(void); 41 | inline static void check_init(void); 42 | 43 | /* Global pool management functions */ 44 | inline static void gpool_check_size(void *target); 45 | /* Unlike gpool_check_size, which can grow without limit, RDMA resources are scarce */ 46 | inline static int rpool_check_size(void *target); 47 | static int gpool_grow(void); 48 | static void gpool_init(void); 49 | static void *gpool_make_raw_chunk(void); 50 | static void *rpool_make_raw_chunk(void); 51 | 52 | inline static chunk_t *gpool_acquire_chunk(void); 53 | inline static chunk_t *rpool_acquire_chunk(void); 54 | inline static void gpool_release_chunk(dchunk_t *dc); 55 | 56 | static lheap_t *gpool_acquire_lheap(void); 57 | static lheap_t *r_acquire_lheap(void); 58 | static void 
gpool_release_lheap(lheap_t *lh); 59 | static void rpool_release_lheap(lheap_t *lh); 60 | 61 | 62 | /* Local heap management functions */ 63 | inline static void lheap_init(lheap_t *lh); 64 | inline static void lheap_replace_foreground(lheap_t *lh, int size_cls); 65 | inline static int r_lheap_replace_foreground(lheap_t *lh,int size_cls); 66 | 67 | /* Data chunk management functions */ 68 | inline static void dchunk_change_cls(dchunk_t *dc, int size_cls); 69 | inline static void dchunk_init(dchunk_t *dc, int size_cls); 70 | inline static void dchunk_collect_garbage(dchunk_t *dc); 71 | inline static void *dchunk_alloc_obj(dchunk_t *dc); 72 | inline static dchunk_t* dchunk_extract(void *ptr); 73 | 74 | /* Object buffer management functions */ 75 | inline static void obj_buf_flush(obj_buf_t *obuf); 76 | inline static void obj_buf_flush_all(lheap_t *lh); 77 | inline static void obj_buf_put(obj_buf_t *bbuf, dchunk_t * dc, void *ptr); 78 | 79 | /* Allocator helpers */ 80 | inline static void *r_large_malloc(size_t size); 81 | inline static void *r_small_malloc(int size_cls); 82 | inline static void *large_malloc(size_t size); 83 | inline static void *small_malloc(int size_cls); 84 | inline static void large_free(void *ptr); 85 | //inline static void *r_large_free(void *ptr); 86 | inline static void local_free(lheap_t *lh, dchunk_t *dc, void *ptr); 87 | inline static void remote_free(lheap_t *lh, dchunk_t *dc, void *ptr); 88 | static void *large_memalign(size_t boundary, size_t size); 89 | 90 | /* Misc functions */ 91 | static void* page_alloc(void *pos, size_t size); 92 | static void page_free(void *pos, size_t size); 93 | static void touch_memory_range(void *start, size_t len); 94 | inline static int size2cls(size_t size); 95 | inline static int r_size2cls(size_t size); 96 | 97 | #ifdef DEBUG 98 | static void handler(int sig); 99 | #endif 100 | 101 | /* Interface */ 102 | void *malloc(size_t size); 103 | void free(void* ptr); 104 | void *realloc(void *ptr, size_t size); 105 | void *calloc(size_t nmemb, size_t size); 106 | void *memalign(size_t boundary, size_t size); 107 | int posix_memalign(void **memptr, size_t alignment, size_t size); 108 | void *valloc(size_t size); 109 | void *pvalloc(size_t size); 110 | 111 | #ifdef RETURN_MEMORY 112 | pthread_t gpool_gc_thread; 113 | 114 | static void* gpool_gc(void* arg) 115 | { 116 | pthread_detach(pthread_self()); 117 | char *ptr = NULL; 118 | 119 | /* sleeptime = 100 ms */ 120 | struct timespec sleeptime = {0, 10000000}; 121 | 122 | while(1) { 123 | nanosleep(&sleeptime, NULL); 124 | ptr = (char*) queue_fetch(&global_pool.free_dc_head[get_core_id()]); 125 | if(ptr) { 126 | void *ptr_end = PAGE_ROUNDDOWN(ptr + CHUNK_SIZE); 127 | void *ptr_start = PAGE_ROUNDUP(ptr); 128 | madvise(ptr_start, (uintptr_t)ptr_end - (uintptr_t)ptr_start, MADV_DONTNEED); 129 | queue_put(&global_pool.released_dc_head[get_core_id()], ptr); 130 | } 131 | } 132 | } 133 | #endif 134 | 135 | static void maps_init() 136 | { 137 | int size; 138 | int class; 139 | 140 | /* 8 +4 64 */ 141 | for (size = 8, class = 0; size <= 64; size += 4, class++) { 142 | cls2size[class] = size; 143 | } 144 | 145 | /* 80 +16 128 */ 146 | for (size = 64 + 16; size <= 128; size += 16, class++) { 147 | cls2size[class] = size; 148 | } 149 | 150 | /* 160 +32 256 */ 151 | for (size = 128 + 32; size <= 256; size += 32, class++) { 152 | cls2size[class] = size; 153 | } 154 | 155 | for (size = 256; size < 65536; size <<= 1) { 156 | cls2size[class++] = size + (size >> 1); 157 | cls2size[class++] = size << 1; 158 
| } 159 | 160 | int cur_class = 0; 161 | int cur_size = 0; 162 | 163 | /* init sizemap */ 164 | for (cur_size = 4; cur_size <= 1024; cur_size += 4) { 165 | if (cur_size > cls2size[cur_class]) 166 | cur_class++; 167 | sizemap[(cur_size - 1) >> 2] = cur_class; 168 | } 169 | 170 | /* init sizemap2 */ 171 | for (cur_size = 1024; cur_size <= 65536; cur_size += 512) { 172 | if (cur_size > cls2size[cur_class]) 173 | cur_class++; 174 | sizemap2[(cur_size - 1) >> 9] = cur_class; 175 | } 176 | } 177 | 178 | static void thread_init() 179 | { 180 | /* Register the destructor */ 181 | pthread_setspecific(destructor, ACTIVE); 182 | /* Initialize thread pool */ 183 | local_heap = gpool_acquire_lheap(); 184 | thread_state = READY; 185 | } 186 | 187 | static void r_thread_exit (void *dummy) { 188 | rpool_release_lheap(r_local_heap); 189 | } 190 | static void thread_exit(void *dummy) 191 | { 192 | gpool_release_lheap(local_heap); 193 | } 194 | 195 | 196 | uint64_t RInit(char *buffer,uint64_t size) { 197 | 198 | 199 | pthread_key_create(&r_destructor, r_thread_exit); 200 | 201 | /* Rounding to chunk size */ 202 | uint64_t add_off = CHUNK_SIZE - ((uint64_t )buffer) % CHUNK_SIZE; 203 | if(add_off >= size) 204 | return 0; 205 | size -= add_off; 206 | if(size < 16 * CHUNK_SIZE) { 207 | /* We shall ensure the register rdma area is large enough! */ 208 | return 0; 209 | } 210 | 211 | register_rdma_buffer = buffer + add_off; 212 | register_rdma_buffer_size = size; 213 | 214 | r_global_pool.pool_start = register_rdma_buffer; 215 | r_global_pool.pool_end = register_rdma_buffer + register_rdma_buffer_size; 216 | r_global_pool.free_start = register_rdma_buffer; 217 | 218 | pthread_mutex_init(&r_global_pool.lock, NULL); 219 | 220 | { 221 | /* maps init */ 222 | 223 | int size; 224 | int class; 225 | 226 | /* 8 +4 64 */ 227 | for (size = 8, class = 0; size <= 64; size += 4, class++) { 228 | r_cls2size[class] = size; 229 | } 230 | 231 | /* 80 +16 128 */ 232 | for (size = 64 + 16; size <= 128; size += 16, class++) { 233 | r_cls2size[class] = size; 234 | } 235 | 236 | /* 160 +32 256 */ 237 | for (size = 128 + 32; size <= 256; size += 32, class++) { 238 | r_cls2size[class] = size; 239 | } 240 | 241 | for (size = 256; size < 65536; size <<= 1) { 242 | r_cls2size[class++] = size + (size >> 1); 243 | r_cls2size[class++] = size << 1; 244 | } 245 | 246 | int cur_class = 0; 247 | int cur_size = 0; 248 | 249 | /* init sizemap */ 250 | for (cur_size = 4; cur_size <= 1024; cur_size += 4) { 251 | if (cur_size > r_cls2size[cur_class]) 252 | cur_class++; 253 | r_sizemap[(cur_size - 1) >> 2] = cur_class; 254 | } 255 | 256 | /* init sizemap2 */ 257 | for (cur_size = 1024; cur_size <= 65536; cur_size += 512) { 258 | if (cur_size > r_cls2size[cur_class]) 259 | cur_class++; 260 | r_sizemap2[(cur_size - 1) >> 9] = cur_class; 261 | } 262 | } 263 | r_global_state = READY; 264 | return size; 265 | } 266 | 267 | void RThreadLocalInit () { 268 | 269 | if(unlikely(r_thread_state != READY)) { 270 | pthread_setspecific(r_destructor, ACTIVE); 271 | // r_local_heap = gp 272 | r_local_heap = r_acquire_lheap(); 273 | r_thread_state = READY; 274 | } 275 | } 276 | 277 | static void global_init() 278 | { 279 | #ifdef DEBUG 280 | /* Register the signal handler for backtrace*/ 281 | signal(SIGSEGV, handler); 282 | #endif 283 | pthread_key_create(&destructor, thread_exit); 284 | /* Initialize global data */ 285 | gpool_init(); 286 | maps_init(); 287 | 288 | global_state = READY; 289 | #ifdef RETURN_MEMORY 290 | /* Create the gc thread */ 291 | 
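/* Spawn the background reclaimer defined above under RETURN_MEMORY: gpool_gc()
   wakes every 10 ms ({0, 10000000} ns), pops one freed chunk from the current
   core's free list, returns its pages to the OS with madvise(MADV_DONTNEED),
   and parks the chunk on released_dc_head so it can be reused later without a
   fresh mmap. */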
pthread_create(&gpool_gc_thread, NULL, gpool_gc, NULL); 292 | #endif 293 | } 294 | 295 | inline static void check_init() 296 | { 297 | if (unlikely(thread_state != READY)) { 298 | if (global_state != READY) { 299 | pthread_once(&init_once, global_init); 300 | } 301 | thread_init(); 302 | } 303 | } 304 | 305 | inline static int rpool_check_size(void *target) { 306 | if(r_global_pool.pool_end <= (char *)target) { 307 | return 0 ;//false 308 | } 309 | return 1; //true 310 | } 311 | 312 | inline static void gpool_check_size(void *target) 313 | { 314 | if (global_pool.pool_end <= (char *)target) { 315 | /* Global Pool Full */ 316 | pthread_mutex_lock(&global_pool.lock); 317 | while (global_pool.pool_end <= (char *)target) { 318 | gpool_grow(); 319 | } 320 | pthread_mutex_unlock(&global_pool.lock); 321 | } 322 | } 323 | 324 | static int gpool_grow() 325 | { 326 | /* Enlarge the raw memory pool */ 327 | static int last_alloc = 8; 328 | int alloc_size = ALLOC_UNIT * last_alloc; 329 | if (last_alloc < 32) { 330 | last_alloc *= 2; 331 | } 332 | 333 | void *mem = page_alloc((void *)global_pool.pool_end, alloc_size); 334 | if (mem == MAP_FAILED) { 335 | exit(-1); 336 | return -1; 337 | } 338 | 339 | /* Increase the global pool size */ 340 | global_pool.pool_end += alloc_size; 341 | return 0; 342 | } 343 | 344 | 345 | 346 | /* Initialize the global memory pool */ 347 | static void gpool_init() 348 | { 349 | global_pool.pool_start = RAW_POOL_START; 350 | global_pool.pool_end = RAW_POOL_START; 351 | global_pool.free_start = RAW_POOL_START; 352 | //queue_init(&global_pool.free_dc_head); 353 | pthread_mutex_init(&global_pool.lock, NULL); 354 | gpool_grow(); 355 | } 356 | 357 | 358 | inline static chunk_t *gpool_acquire_chunk() 359 | { 360 | void *ptr = NULL; 361 | 362 | /* Try to alloc a freed chunk from the free list */ 363 | ptr = queue_fetch(&global_pool.free_dc_head[get_core_id()]); 364 | if (ptr) { 365 | return (chunk_t *) ptr; 366 | } 367 | 368 | #ifdef RETURN_MEMORY 369 | ptr = queue_fetch(&global_pool.released_dc_head[get_core_id()]); 370 | if (ptr) { 371 | // XXX: Fix me 372 | ((chunk_t *) ptr)->numa_node = get_core_id(); 373 | touch_memory_range(ptr, CHUNK_SIZE); 374 | return (chunk_t *) ptr; 375 | } 376 | #endif 377 | 378 | /* Or just alloc a new chunk */ 379 | ptr = gpool_make_raw_chunk(); 380 | gpool_check_size(ptr); 381 | ptr -= CHUNK_SIZE; 382 | ((chunk_t *) ptr)->numa_node = get_core_id(); 383 | touch_memory_range(ptr, CHUNK_SIZE); 384 | return (chunk_t *) ptr; 385 | } 386 | 387 | 388 | inline static chunk_t *rpool_acquire_chunk() 389 | { 390 | void *ptr = NULL; 391 | 392 | /* Try to alloc a freed chunk from the free list */ 393 | ptr = queue_fetch(&r_global_pool.free_dc_head[get_core_id()]); 394 | if (ptr) { 395 | return (chunk_t *) ptr; 396 | } 397 | 398 | #ifdef RETURN_MEMORY 399 | ptr = queue_fetch(&r_global_pool.released_dc_head[get_core_id()]); 400 | if (ptr) { 401 | // XXX: Fix me 402 | ((chunk_t *) ptr)->numa_node = get_core_id(); 403 | touch_memory_range(ptr, CHUNK_SIZE); 404 | return (chunk_t *) ptr; 405 | } 406 | #endif 407 | 408 | /* Or just alloc a new chunk */ 409 | ptr = rpool_make_raw_chunk(); 410 | if((char *)ptr > (char *)r_global_pool.pool_end) { 411 | return NULL; 412 | } 413 | 414 | // rpool_check_size(ptr); 415 | ptr -= CHUNK_SIZE; 416 | ((chunk_t *) ptr)->numa_node = get_core_id(); 417 | touch_memory_range(ptr, CHUNK_SIZE); 418 | return (chunk_t *) ptr; 419 | } 420 | 421 | 422 | static void *rpool_make_raw_chunk() { 423 | void *ret = (void 
*)(atmc_fetch_and_add64((unsigned long long *) 424 | &r_global_pool.free_start, 425 | CHUNK_SIZE)) ; 426 | return ret; 427 | } 428 | 429 | static void *gpool_make_raw_chunk() 430 | { 431 | /* Atomic increse the global pool size */ 432 | void *ret = (void *)(atmc_fetch_and_add64((unsigned long long *) 433 | &global_pool.free_start, 434 | CHUNK_SIZE)); 435 | return ret; 436 | } 437 | 438 | 439 | inline static void gpool_release_chunk(dchunk_t *dc) 440 | { 441 | queue_put(&global_pool.free_dc_head[dc->numa_node], dc); 442 | } 443 | 444 | inline static void rpool_release_chunk(dchunk_t *dc) { 445 | queue_put(&r_global_pool.free_dc_head[dc->numa_node], dc); 446 | } 447 | 448 | 449 | static lheap_t *r_acquire_lheap() { 450 | lheap_t *lh; 451 | lh = queue_fetch(&(r_global_pool.free_lh_head[get_core_id()])); 452 | /* Alloc a new one */ 453 | if (!lh) { 454 | lh = (lheap_t *) rpool_acquire_chunk(); 455 | if(lh == NULL) { 456 | fprintf(stderr,"panic, cannot acquire local heap\n"); 457 | return NULL; 458 | } 459 | lheap_init(lh); 460 | } 461 | return lh; 462 | } 463 | 464 | 465 | 466 | static lheap_t *gpool_acquire_lheap() 467 | { 468 | lheap_t *lh; 469 | lh = queue_fetch(&(global_pool.free_lh_head[get_core_id()])); 470 | /* Alloc a new one */ 471 | if (!lh) { 472 | lh = (lheap_t *) gpool_acquire_chunk(); 473 | lheap_init(lh); 474 | } 475 | return lh; 476 | } 477 | 478 | static void gpool_release_lheap(lheap_t *lh) 479 | { 480 | queue_put(&global_pool.free_lh_head[local_heap->numa_node], lh); 481 | } 482 | 483 | static void rpool_release_lheap(lheap_t *lh) 484 | { 485 | queue_put(&r_global_pool.free_lh_head[local_heap->numa_node], lh); 486 | } 487 | 488 | 489 | inline static void lheap_init(lheap_t * lh) 490 | { 491 | memset(&lh->free_head, 0, sizeof(lheap_t)); 492 | 493 | int size_cls; 494 | lh->dummy_chunk.size_cls = DUMMY_CLASS; 495 | lh->dummy_chunk.free_blk_cnt = 1; 496 | 497 | for (size_cls = 0; size_cls < DEFAULT_BLOCK_CLASS; size_cls++) { 498 | /* Install the dummy chunk */ 499 | lh->foreground[size_cls] = &lh->dummy_chunk; 500 | } 501 | } 502 | 503 | inline static int r_lheap_replace_foreground 504 | (lheap_t * lh, int size_cls) { 505 | 506 | dchunk_t *dc; 507 | 508 | /* Try to acquire the block from background list */ 509 | dc = (dchunk_t *) lh->background[size_cls].head; 510 | if (dc != NULL) { 511 | double_list_remove(dc, &lh->background[size_cls]); 512 | goto finish; 513 | } 514 | 515 | /* Try to acquire a block in the remote freed list */ 516 | dc = fast_queue_fetch(&lh->need_gc[size_cls]); 517 | if (dc != NULL) { 518 | dchunk_collect_garbage(dc); 519 | goto finish; 520 | } 521 | 522 | /* Try to acquire the chunk from local pool */ 523 | dc = (dchunk_t *) seq_queue_fetch(&lh->free_head); 524 | if (dc != NULL) { 525 | // fprintf(stdout,"get free head\n"); 526 | lh->free_cnt--; 527 | dchunk_change_cls(dc, size_cls); 528 | goto finish; 529 | } 530 | 531 | /* Acquire the chunk from global pool */ 532 | 533 | dc = (dchunk_t *) rpool_acquire_chunk(); 534 | // fprintf(stdout,"acquire raw pool\n"); 535 | if(unlikely(dc == NULL)) { 536 | return 0; // false 537 | } 538 | // fprintf(stdout,"owner %p\n",lh); 539 | dc->owner = lh; 540 | fast_queue_init((FastQueue *) & (dc->remote_free_head)); 541 | dchunk_init(dc, size_cls); 542 | 543 | finish: 544 | /* Set the foreground chunk */ 545 | lh->foreground[size_cls] = dc; 546 | dc->state = FOREGROUND; 547 | return 1; // true 548 | } 549 | 550 | 551 | inline static void lheap_replace_foreground 552 | (lheap_t * lh, int size_cls) { 553 | dchunk_t *dc; 554 
| 555 | /* Try to acquire the block from background list */ 556 | dc = (dchunk_t *) lh->background[size_cls].head; 557 | if (dc != NULL) { 558 | double_list_remove(dc, &lh->background[size_cls]); 559 | goto finish; 560 | } 561 | 562 | /* Try to acquire a block in the remote freed list */ 563 | dc = fast_queue_fetch(&lh->need_gc[size_cls]); 564 | if (dc != NULL) { 565 | dchunk_collect_garbage(dc); 566 | goto finish; 567 | } 568 | 569 | /* Try to acquire the chunk from local pool */ 570 | dc = (dchunk_t *) seq_queue_fetch(&lh->free_head); 571 | if (dc != NULL) { 572 | lh->free_cnt--; 573 | dchunk_change_cls(dc, size_cls); 574 | goto finish; 575 | } 576 | 577 | /* Acquire the chunk from global pool */ 578 | 579 | dc = (dchunk_t *) gpool_acquire_chunk(); 580 | dc->owner = lh; 581 | fast_queue_init((FastQueue *) & (dc->remote_free_head)); 582 | dchunk_init(dc, size_cls); 583 | 584 | finish: 585 | /* Set the foreground chunk */ 586 | lh->foreground[size_cls] = dc; 587 | dc->state = FOREGROUND; 588 | } 589 | 590 | inline static void dchunk_change_cls(dchunk_t * dc, int size_cls) 591 | { 592 | int size = cls2size[size_cls]; 593 | int data_offset = DCH; 594 | dc->blk_cnt = (CHUNK_SIZE - data_offset) / size; 595 | dc->free_blk_cnt = dc->blk_cnt; 596 | dc->block_size = size; 597 | dc->free_mem = (char *)dc + data_offset; 598 | dc->size_cls = size_cls; 599 | seq_queue_init(&dc->free_head); 600 | } 601 | 602 | inline static void dchunk_init(dchunk_t * dc, int size_cls) 603 | { 604 | dc->active_link.next = NULL; 605 | dc->active_link.prev = NULL; 606 | dchunk_change_cls(dc, size_cls); 607 | } 608 | 609 | inline static void dchunk_collect_garbage(dchunk_t * dc) 610 | { 611 | seq_head(dc->free_head) = 612 | counted_chain_dequeue(&dc->remote_free_head, &dc->free_blk_cnt); 613 | } 614 | 615 | inline static void *dchunk_alloc_obj(dchunk_t * dc) 616 | { 617 | void *ret; 618 | 619 | /* Dirty implementation of dequeue, avoid one branch */ 620 | ret = seq_head(dc->free_head); 621 | 622 | if (unlikely(!ret)) { 623 | ret = dc->free_mem; 624 | dc->free_mem += dc->block_size; 625 | } else { 626 | seq_head(dc->free_head) = *(void**)ret; 627 | } 628 | 629 | #if 0 630 | /* A clearer implementation with one more branch*/ 631 | ret = seq_lifo_dequeue(&dc->free_head); 632 | if (unlikely(!ret)) { 633 | ret = dc->free_mem; 634 | dc->free_mem += dc->block_size; 635 | } 636 | #endif 637 | 638 | return ret; 639 | } 640 | 641 | inline static dchunk_t *dchunk_extract(void *ptr) 642 | { 643 | return (dchunk_t *) ((uintptr_t)ptr - ((uintptr_t)ptr % CHUNK_SIZE)); 644 | } 645 | 646 | inline static void obj_buf_flush(obj_buf_t * bbuf) 647 | { 648 | void *prev; 649 | 650 | dchunk_t *dc = bbuf->dc; 651 | lheap_t *lh = dc->owner; 652 | 653 | prev = counted_chain_enqueue(&(dc->remote_free_head), 654 | seq_head(bbuf->free_head), bbuf->first, bbuf->count); 655 | bbuf->count = 0; 656 | bbuf->dc = NULL; 657 | bbuf->first = NULL; 658 | seq_head(bbuf->free_head) = NULL; 659 | 660 | /* If I am the first thread done remote free in this memory chunk*/ 661 | if ((unsigned long long)prev == 0L) { 662 | fast_queue_put(&(lh->need_gc[dc->size_cls]), dc); 663 | } 664 | return; 665 | } 666 | 667 | inline static void obj_buf_flush_all(lheap_t *lh) { 668 | int i; 669 | for (i = 0; i < BLOCK_BUF_CNT; i++) { 670 | obj_buf_t *buf = &lh->block_bufs[i]; 671 | if (buf->count == 0) 672 | continue; 673 | obj_buf_flush(buf); 674 | buf->dc = NULL; 675 | } 676 | } 677 | 678 | inline static void obj_buf_put(obj_buf_t *bbuf, dchunk_t * dc, void *ptr) { 679 | if 
(unlikely(bbuf->dc != dc)) { 680 | if (bbuf->dc != NULL) { 681 | obj_buf_flush(bbuf); 682 | } 683 | bbuf->dc = dc; 684 | bbuf->first = ptr; 685 | bbuf->count = 0; 686 | seq_head(bbuf->free_head) = NULL; 687 | } 688 | 689 | seq_queue_put(&bbuf->free_head, ptr); 690 | bbuf->count++; 691 | } 692 | 693 | inline static void *r_large_malloc(size_t size) { 694 | 695 | /* round up the size */ 696 | size_t real_size = size + CHUNK_SIZE - size % CHUNK_SIZE; 697 | void *ret = (void *)(atmc_fetch_and_add64((unsigned long long *) 698 | &r_global_pool.free_start, 699 | real_size)) ; 700 | if(ret > (void *)(r_global_pool.pool_end)) 701 | return NULL; 702 | //void *mem_start = (char *)ret + CHUNK_SIZE - CACHE_LINE_SIZE; 703 | large_header_t *header = (large_header_t *)dchunk_extract(ret); 704 | 705 | header->alloc_size = real_size; 706 | header->mem = ret; 707 | header->owner = LARGE_OWNER; 708 | 709 | return ret; 710 | } 711 | 712 | inline static void *large_malloc(size_t size) 713 | { 714 | size_t alloc_size = PAGE_ROUNDUP(size + CHUNK_SIZE); 715 | void *mem = page_alloc(NULL, alloc_size); 716 | void *mem_start = (char*)mem + CHUNK_SIZE - CACHE_LINE_SIZE; 717 | large_header_t *header = (large_header_t *)dchunk_extract(mem_start); 718 | 719 | /* If space is enough for the header of a large block */ 720 | intptr_t distance = (intptr_t)mem_start - (intptr_t)header; 721 | if (distance >= sizeof(large_header_t)) { 722 | header->alloc_size = alloc_size; 723 | header->mem = mem; 724 | header->owner = LARGE_OWNER; 725 | return mem_start; 726 | } 727 | 728 | /* If not, Retry Allocation */ 729 | void *ret = large_malloc(size); 730 | page_free(mem, alloc_size); 731 | return ret; 732 | } 733 | 734 | inline static void *r_small_malloc(int size_cls) { 735 | 736 | lheap_t *lh = r_local_heap; 737 | dchunk_t *dc; 738 | void *ret; 739 | retry: 740 | dc = lh->foreground[size_cls]; 741 | ret = dchunk_alloc_obj(dc); 742 | // fprintf(stdout,"alloc owner %p\n",dc->owner); 743 | /* Check if the datachunk is full */ 744 | if (unlikely(--dc->free_blk_cnt == 0)) { 745 | dc->state = FULL; 746 | /* There is not enough memory in RDMA region */ 747 | if(unlikely(r_lheap_replace_foreground(lh, size_cls) == 0)) 748 | return NULL; 749 | if (unlikely(dc->size_cls == DUMMY_CLASS)) { 750 | /* A dummy chunk */ 751 | dc->free_blk_cnt = 1; 752 | goto retry; 753 | } 754 | } 755 | 756 | return ret; 757 | } 758 | 759 | inline static void *small_malloc(int size_cls) 760 | { 761 | lheap_t *lh = local_heap; 762 | dchunk_t *dc; 763 | void *ret; 764 | retry: 765 | dc = lh->foreground[size_cls]; 766 | ret = dchunk_alloc_obj(dc); 767 | 768 | /* Check if the datachunk is full */ 769 | if (unlikely(--dc->free_blk_cnt == 0)) { 770 | dc->state = FULL; 771 | lheap_replace_foreground(lh, size_cls); 772 | if (unlikely(dc->size_cls == DUMMY_CLASS)) { 773 | /* A dummy chunk */ 774 | dc->free_blk_cnt = 1; 775 | goto retry; 776 | } 777 | } 778 | 779 | return ret; 780 | } 781 | 782 | inline static void large_free(void *ptr) 783 | { 784 | large_header_t *header = (large_header_t*)dchunk_extract(ptr); 785 | page_free(header->mem, header->alloc_size); 786 | } 787 | 788 | inline static void local_free(lheap_t * lh, dchunk_t * dc, void *ptr) 789 | { 790 | unsigned int free_blk_cnt = ++dc->free_blk_cnt; 791 | seq_queue_put(&dc->free_head, ptr); 792 | 793 | switch (dc->state) { 794 | case FULL: 795 | double_list_insert_front(dc, &lh->background[dc->size_cls]); 796 | dc->state = BACKGROUND; 797 | break; 798 | case BACKGROUND: 799 | if (unlikely(free_blk_cnt == 
dc->blk_cnt)) { 800 | int free_cnt = lh->free_cnt; 801 | double_list_remove(dc, &lh->background[dc->size_cls]); 802 | 803 | if (free_cnt >= MAX_FREE_CHUNK) { 804 | gpool_release_chunk(dc); 805 | } else { 806 | seq_queue_put(&lh->free_head, dc); 807 | lh->free_cnt = free_cnt + 1; 808 | } 809 | } 810 | break; 811 | case FOREGROUND: 812 | /* Tada.. */ 813 | break; 814 | } 815 | } 816 | 817 | THREAD_LOCAL int buf_cnt; 818 | inline static void remote_free(lheap_t * lh, dchunk_t * dc, void *ptr) 819 | { 820 | /* Put the object in a local buffer rather than return it to owner */ 821 | int tag = ((unsigned long long)dc / CHUNK_SIZE) % BLOCK_BUF_CNT; 822 | obj_buf_t *bbuf = &lh->block_bufs[tag]; 823 | obj_buf_put(bbuf, dc, ptr); 824 | 825 | /* Periodically flush buffered remote objects */ 826 | if ((buf_cnt++ & 0xFFFF) == 0) { 827 | obj_buf_flush_all(lh); 828 | } 829 | } 830 | 831 | static void touch_memory_range(void *addr, size_t len) 832 | { 833 | char *ptr = (char *)addr; 834 | char *end = ptr + len; 835 | 836 | for (; ptr < end; ptr += PAGE_SIZE) { 837 | *ptr = 0; 838 | } 839 | } 840 | 841 | static void *large_memalign(size_t boundary, size_t size) { 842 | /* Alloc a large enough memory block */ 843 | size_t padding = boundary + CHUNK_SIZE; 844 | size_t alloc_size = PAGE_ROUNDUP(size + padding); 845 | void *mem = page_alloc(NULL, alloc_size); 846 | 847 | /* Align up the address to boundary */ 848 | void *mem_start = 849 | (void*)((uintptr_t)((char*)mem + padding) & ~(boundary - 1)); 850 | 851 | /* Extract space for an header */ 852 | large_header_t *header = 853 | (large_header_t *)dchunk_extract(mem_start); 854 | 855 | /* If space is enough for the header of a large block */ 856 | intptr_t distance = (intptr_t)mem_start - (intptr_t)header; 857 | if (distance >= sizeof(large_header_t)) { 858 | header->alloc_size = alloc_size; 859 | header->mem = mem; 860 | header->owner = LARGE_OWNER; 861 | return mem_start; 862 | } 863 | 864 | /* If not, retry allocation */ 865 | void *ret = NULL; 866 | 867 | /* Avoid infinite loop if application call memalign(CHUNK_SIZE,size), 868 | * althrough it is actually illegal 869 | */ 870 | if (boundary % CHUNK_SIZE != 0) { 871 | ret = large_memalign(boundary, size); 872 | } 873 | page_free(mem, alloc_size); 874 | return ret; 875 | } 876 | 877 | #ifdef DEBUG 878 | /* Signal handler for debugging use */ 879 | static void handler(int sig) 880 | { 881 | void *array[10]; 882 | size_t size; 883 | 884 | /* get void*'s for all entries on the stack */ 885 | size = backtrace(array, 10); 886 | 887 | /* print out all the frames to stderr */ 888 | fprintf(stderr, "Error: signal %d:\n", sig); 889 | backtrace_symbols_fd(array, size, 2); 890 | exit(1); 891 | } 892 | #endif 893 | 894 | static void *page_alloc(void *pos, size_t size) 895 | { 896 | return mmap(pos, 897 | size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); 898 | } 899 | 900 | static void page_free(void *pos, size_t size) 901 | { 902 | munmap(pos, size); 903 | } 904 | 905 | 906 | inline static int size2cls(size_t size) 907 | { 908 | int ret; 909 | if (likely(size <= 1024)) { 910 | ret = sizemap[(size - 1) >> 2]; 911 | } else if (size <= 65536) { 912 | ret = sizemap2[(size - 1) >> 9]; 913 | } else { 914 | ret = LARGE_CLASS; 915 | } 916 | return ret; 917 | } 918 | 919 | inline static int r_size2cls(size_t size) { 920 | int ret; 921 | if (likely(size <= 1024)) { 922 | ret = r_sizemap[(size - 1) >> 2]; 923 | } else if (size <= 65536) { 924 | ret = r_sizemap2[(size - 1) >> 9]; 925 | } else { 926 | ret = 
LARGE_CLASS; 927 | } 928 | return ret; 929 | } 930 | 931 | void *malloc(size_t size) 932 | { 933 | void *ret = NULL; 934 | 935 | /* Initialize the allocator */ 936 | check_init(); 937 | 938 | /* Deal with zero-size allocation */ 939 | size += (size == 0); 940 | 941 | #if 0 942 | /* The expression above is equivalent to the code below */ 943 | if (unlikely(size == 0)) { 944 | size = 1; 945 | } 946 | #endif 947 | 948 | int size_cls = size2cls(size); 949 | if (likely(size_cls < DEFAULT_BLOCK_CLASS)) { 950 | ret = small_malloc(size_cls); 951 | // return NULL; 952 | } else { 953 | ret = large_malloc(size); 954 | } 955 | if(unlikely(ret == NULL)) 956 | assert(0); 957 | return ret; 958 | } 959 | 960 | void *Rmalloc(size_t size) { 961 | void *ret = NULL; 962 | 963 | /* Deal with zero-size allocation */ 964 | size += (size == 0); 965 | 966 | int size_cls = r_size2cls(size); 967 | if (likely(size_cls < DEFAULT_BLOCK_CLASS)) { 968 | ret = r_small_malloc(size_cls); 969 | } else { 970 | ret = r_large_malloc(size); 971 | } 972 | return ret; 973 | } 974 | 975 | 976 | void free(void *ptr) 977 | { 978 | if(ptr == NULL) { 979 | return; 980 | } 981 | 982 | dchunk_t *dc = dchunk_extract(ptr); 983 | lheap_t *lh = local_heap; 984 | lheap_t *target_lh = dc->owner; 985 | 986 | if (likely(target_lh == lh)) { 987 | local_free(lh, dc, ptr); 988 | } else if(likely(target_lh != LARGE_OWNER)){ 989 | check_init(); 990 | lh = local_heap; 991 | remote_free(lh, dc, ptr); 992 | } else { 993 | large_free(ptr); 994 | } 995 | } 996 | 997 | void Rfree(void *ptr) { 998 | 999 | if(ptr == NULL) { 1000 | return; 1001 | } 1002 | 1003 | dchunk_t *dc = dchunk_extract(ptr); 1004 | lheap_t *lh = r_local_heap; 1005 | lheap_t *target_lh = dc->owner; 1006 | // fprintf(stdout,"check owner %p\n",target_lh); 1007 | 1008 | if (likely(target_lh == lh)) { 1009 | local_free(lh, dc, ptr); 1010 | } else if(likely(target_lh != LARGE_OWNER)) { 1011 | // check_init(); 1012 | lh = r_local_heap; 1013 | remote_free(lh, dc, ptr); 1014 | } else { 1015 | // large_free(ptr); 1016 | } 1017 | } 1018 | 1019 | 1020 | void *realloc(void* ptr, size_t size) 1021 | { 1022 | /* Handle special cases */ 1023 | if (ptr == NULL) { 1024 | void *ret = malloc(size); 1025 | return ret; 1026 | } 1027 | 1028 | if (size == 0) { 1029 | free(ptr); 1030 | } 1031 | 1032 | dchunk_t *dc = dchunk_extract(ptr); 1033 | if (dc->owner != LARGE_OWNER) { 1034 | int old_size = cls2size[dc->size_cls]; 1035 | 1036 | /* Not exceed the current size, return */ 1037 | if (size <= old_size) { 1038 | return ptr; 1039 | } 1040 | 1041 | /* Alloc a new block */ 1042 | void *new_ptr = malloc(size); 1043 | memcpy(new_ptr, ptr, old_size); 1044 | free(ptr); 1045 | return new_ptr; 1046 | } else { 1047 | large_header_t *header = (large_header_t *)dc; 1048 | size_t alloc_size = header->alloc_size; 1049 | void* mem = header->mem; 1050 | size_t offset = (uintptr_t)ptr - (uintptr_t)mem; 1051 | size_t old_size = alloc_size - offset; 1052 | 1053 | /* Not exceed the current size, return */ 1054 | if(size <= old_size) { 1055 | return ptr; 1056 | } 1057 | 1058 | /* Try to do mremap */ 1059 | int new_size = PAGE_ROUNDUP(size + CHUNK_SIZE); 1060 | mem = mremap(mem, alloc_size, new_size, MREMAP_MAYMOVE); 1061 | void* mem_start = (void*)((uintptr_t)mem + offset); 1062 | header = (large_header_t*)dchunk_extract(mem_start); 1063 | 1064 | intptr_t distance = (intptr_t)mem_start - (intptr_t)header; 1065 | if (distance >= sizeof(large_header_t)) { 1066 | header->alloc_size = new_size; 1067 | header->mem = mem; 1068 | 
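    /* Re-stamp the header after mremap: owner == LARGE_OWNER (the 0xDEAD
       sentinel) is how free() and realloc() recognize large allocations. */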
header->owner = LARGE_OWNER; 1069 | return mem_start; 1070 | } 1071 | 1072 | void* new_ptr = large_malloc(size); 1073 | memcpy(new_ptr, mem_start, old_size); 1074 | free(mem); 1075 | return new_ptr; 1076 | } 1077 | } 1078 | 1079 | void * __attribute__((optimize("O0"))) calloc(size_t nmemb, size_t size) 1080 | { 1081 | void *ptr; 1082 | size_t m_size = nmemb * size; 1083 | // ptr = malloc(nmemb * size); 1084 | ptr = malloc(m_size); 1085 | if (!ptr) { 1086 | // assert(0); 1087 | return NULL; 1088 | } 1089 | return memset(ptr, 0, nmemb * size); 1090 | } 1091 | 1092 | void *memalign(size_t boundary, size_t size) { 1093 | /* Deal with zero-size allocation */ 1094 | size += (size == 0); 1095 | if(boundary <= 256 && size <= 65536) { 1096 | /* In this case, we handle it as small allocations */ 1097 | int boundary_cls = size2cls(boundary); 1098 | int size_cls = size2cls(size); 1099 | int alloc_cls = max(boundary_cls, size_cls); 1100 | return small_malloc(alloc_cls); 1101 | } else { 1102 | /* Handle it as a special large allocation */ 1103 | return large_memalign(boundary, size); 1104 | } 1105 | } 1106 | 1107 | int posix_memalign(void **memptr, size_t alignment, size_t size) 1108 | { 1109 | *memptr = memalign(alignment, size); 1110 | if (*memptr) { 1111 | return 0; 1112 | } else { 1113 | /* We have to "personalize" the return value according to the error */ 1114 | return -1; 1115 | } 1116 | } 1117 | 1118 | void *valloc(size_t size) 1119 | { 1120 | return memalign(PAGE_SIZE, size); 1121 | } 1122 | 1123 | void *pvalloc(size_t size) 1124 | { 1125 | fprintf(stderr, "pvalloc() called. Not implemented! Exiting.\n"); 1126 | exit(1); 1127 | } 1128 | -------------------------------------------------------------------------------- /ralloc/ssmalloc.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include "atomic.h" 17 | #include "bitops.h" 18 | #include "queue.h" 19 | #include "double-list.h" 20 | #include "cpu.h" 21 | 22 | /* Configurations */ 23 | #define CHUNK_DATA_SIZE (16*PAGE_SIZE) 24 | #define ALLOC_UNIT (4*1024*1024) 25 | #define MAX_FREE_SIZE (4*1024*1024) 26 | #define RAW_POOL_START ((void*)((0x600000000000/CHUNK_SIZE+1)*CHUNK_SIZE)) 27 | 28 | #define BLOCK_BUF_CNT (16) 29 | 30 | // #define RETURN_MEMORY 31 | // #define DEBUG 32 | 33 | /* Other */ 34 | #define CHUNK_SIZE (CHUNK_DATA_SIZE+sizeof(dchunk_t)) 35 | #define CHUNK_MASK (~(CHUNK_SIZE-1)) 36 | #define LARGE_CLASS (100) 37 | #define DUMMY_CLASS (101) 38 | #define DCH (sizeof(dchunk_t)) 39 | #define MAX_FREE_CHUNK (MAX_FREE_SIZE/CHUNK_SIZE) 40 | #define LARGE_OWNER ((void*)0xDEAD) 41 | #define ACTIVE ((void*)1) 42 | 43 | /* Utility Macros */ 44 | #define ROUNDUP(x,n) ((x+n-1)&(~(n-1))) 45 | #define ROUNDDOWN(x,n) (((x-n)&(~(n-1)))+1) 46 | #define PAGE_ROUNDUP(x) (ROUNDUP((uintptr_t)x,PAGE_SIZE)) 47 | #define PAGE_ROUNDDOWN(x) (ROUNDDOWN((uintptr_t)x,PAGE_SIZE)) 48 | #define CACHE_ALIGN __attribute__ ((aligned (CACHE_LINE_SIZE))) 49 | #define THREAD_LOCAL __attribute__ ((tls_model ("initial-exec"))) __thread 50 | #define likely(x) __builtin_expect(!!(x),1) 51 | #define unlikely(x) __builtin_expect(!!(x),0) 52 | 53 | /* Multi consumer queue */ 54 | #define queue_init(head)\ 55 | mc_queue_init(head) 56 | #define queue_put(head,elem)\ 57 | mc_enqueue(head,elem,0) 58 | #define queue_fetch(head)\ 59 | mc_dequeue(head,0) 60 | 
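/* Illustrative sketch of the multi-consumer wrappers above: queue_put and
   queue_fetch hard-code next_off = 0, so the first word of every queued
   element is reserved for the intrusive next link (given some chunk_t *c0
   whose first word is spare):

       Queue q;
       queue_init(&q);                  // head = 0
       queue_put(&q, c0);               // lock-free push, ABA-counted CAS
       chunk_t *c = queue_fetch(&q);    // lock-free pop, NULL when empty
*/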
typedef queue_head_t Queue; 61 | 62 | /* Single consumer queue */ 63 | #define fast_queue_init(head)\ 64 | sc_queue_init(head) 65 | #define fast_queue_put(head,elem)\ 66 | sc_enqueue(head,elem,0) 67 | #define fast_queue_fetch(head)\ 68 | sc_dequeue(head,0) 69 | #define fast_queue_chain_fetch(head)\ 70 | sc_chain_dequeue(head) 71 | typedef queue_head_t FastQueue; 72 | 73 | /* Sequencial queue */ 74 | #define seq_queue_init(head)\ 75 | seq_queue_init(head) 76 | #define seq_queue_put(head,elem)\ 77 | seq_enqueue(head,elem) 78 | #define seq_queue_fetch(head)\ 79 | seq_dequeue(head) 80 | #define fast_queue_chain_put(head)\ 81 | seq_chain_enqueue(head) 82 | typedef seq_queue_head_t SeqQueue; 83 | 84 | /* Type definations */ 85 | typedef enum { 86 | UNINITIALIZED, 87 | READY 88 | } init_state; 89 | 90 | typedef enum { 91 | FOREGROUND, 92 | BACKGROUND, 93 | FULL 94 | } dchunk_state; 95 | 96 | typedef struct lheap_s lheap_t; 97 | typedef struct gpool_s gpool_t; 98 | typedef struct dchunk_s dchunk_t; 99 | typedef struct chunk_s chunk_t; 100 | typedef struct obj_buf_s obj_buf_t; 101 | typedef struct large_header_s large_header_t; 102 | 103 | typedef double_list_t LinkedList; 104 | typedef double_list_elem_t LinkedListElem; 105 | 106 | struct large_header_s { 107 | CACHE_ALIGN size_t alloc_size; 108 | void* mem; 109 | CACHE_ALIGN lheap_t *owner; 110 | }; 111 | 112 | struct chunk_s { 113 | CACHE_ALIGN LinkedListElem active_link; 114 | uint32_t numa_node; 115 | }; 116 | 117 | /* Data chunk header */ 118 | struct dchunk_s { 119 | /* Local Area */ 120 | CACHE_ALIGN LinkedListElem active_link; 121 | uint32_t numa_node; 122 | 123 | /* Read Area */ 124 | CACHE_ALIGN lheap_t * owner; 125 | uint32_t size_cls; 126 | 127 | /* Local Write Area */ 128 | CACHE_ALIGN dchunk_state state; 129 | uint32_t free_blk_cnt; 130 | uint32_t blk_cnt; 131 | SeqQueue free_head; 132 | uint32_t block_size; 133 | char *free_mem; 134 | 135 | /* Remote Write Area */ 136 | CACHE_ALIGN FastQueue remote_free_head; 137 | }; 138 | 139 | struct gpool_s { 140 | pthread_mutex_t lock; 141 | volatile char *pool_start; 142 | volatile char *pool_end; 143 | volatile char *free_start; 144 | Queue free_dc_head[MAX_CORE_ID]; 145 | Queue free_lh_head[MAX_CORE_ID]; 146 | Queue released_dc_head[MAX_CORE_ID]; 147 | }; 148 | 149 | struct obj_buf_s { 150 | void *dc; 151 | void *first; 152 | SeqQueue free_head; 153 | int count; 154 | }; 155 | 156 | /* Per-thread data chunk pool */ 157 | struct lheap_s { 158 | CACHE_ALIGN LinkedListElem active_link; 159 | uint32_t numa_node; 160 | SeqQueue free_head; 161 | uint32_t free_cnt; 162 | 163 | dchunk_t *foreground[DEFAULT_BLOCK_CLASS]; 164 | LinkedList background[DEFAULT_BLOCK_CLASS]; 165 | dchunk_t dummy_chunk; 166 | obj_buf_t block_bufs[BLOCK_BUF_CNT]; 167 | 168 | CACHE_ALIGN FastQueue need_gc[DEFAULT_BLOCK_CLASS]; 169 | }; 170 | 171 | static inline int max(int a, int b) 172 | { 173 | return (a > b) ? a : b; 174 | } 175 | 176 | 177 | /* The new interfaces which is used for RDMA buffer malloc usage */ 178 | /* Shall only be called once! */ 179 | 180 | /* Return the actual size used. 
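   A minimal call sequence (illustrative sketch; `buf` and `len` stand for the
   to-be-registered RDMA buffer and its length in your application):

       uint64_t usable = RInit(buf, len);   // once per process
       RThreadLocalInit();                  // once per thread, before Rmalloc/Rfree
       void *p = Rmalloc(128);
       Rfree(p);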
If the returned size is 0, the allocation failed. */ 181 | uint64_t RInit(char *buffer, uint64_t size); 182 | void RThreadLocalInit(void); 183 | void *Rmalloc(size_t __size); 184 | void Rfree(void *__ptr); 185 | 186 | void *malloc(size_t __size); 187 | void *realloc(void *__ptr, size_t __size); 188 | void free(void *__ptr); 189 | -------------------------------------------------------------------------------- /rdma_ctrl.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "qp.hpp" 6 | 7 | namespace rdmaio { 8 | 9 | const int MAX_SERVER_SUPPORTED = 16; 10 | typedef RUDQP UDQP; 11 | typedef RRCQP RCQP; 12 | 13 | typedef std::function<void(const QPConnArg &)> connection_callback_t; 14 | 15 | class RdmaCtrl { 16 | public: 17 | RdmaCtrl(int node_id, int tcp_base_port, 18 | connection_callback_t callback = [](const QPConnArg &) { 19 | // the default callback does nothing 20 | }, 21 | std::string ip = "localhost"); 22 | 23 | ~RdmaCtrl(); 24 | 25 | int current_node_id(); 26 | int listening_port(); 27 | 28 | typedef struct { 29 | int dev_id; 30 | int port_id; 31 | } DevIdx; 32 | 33 | /** 34 | * Query device info on this machine. 35 | * Repeated calls return the cached results unless clear_dev_info has been called. 36 | */ 37 | std::vector query_devs(); 38 | 39 | static std::vector query_devs_helper(); 40 | 41 | // clear the device info cached by RdmaCtrl 42 | void clear_dev_info(); 43 | 44 | /** 45 | * Open device handlers. 46 | * RdmaCtrl opens one device per thread. 47 | * get_device returns the device this thread has already opened, if any. 48 | */ 49 | RNicHandler *open_thread_local_device(DevIdx idx); 50 | 51 | RNicHandler *open_device(DevIdx idx); 52 | 53 | RNicHandler *get_device(); 54 | 55 | /** 56 | * The *callback* is invoked whenever a QP connection request arrives at this server 57 | */ 58 | void register_qp_callback(connection_callback_t callback); 59 | 60 | void close_device(); 61 | 62 | void close_device(RNicHandler *); 63 | 64 | /** 65 | * Each RDMA NIC has multiple ports, so we use a two-dimension index to locate the target port. 66 | * convert_port_idx provides a way to translate the one-dimension index into the two-dimension one. 67 | */ 68 | DevIdx convert_port_idx(int idx); 69 | 70 | /** 71 | * Register memory to a specific RNIC handler 72 | */ 73 | bool register_memory(int id,const char *buf,uint64_t size,RNicHandler *rnic, 74 | int flag = Memory::DEFAULT_PROTECTION_FLAG); 75 | 76 | /** 77 | * Get the locally registered memory; 78 | * the result is undefined if mr_id has not been registered. 79 | */ 80 | MemoryAttr get_local_mr(int mr_id); 81 | 82 | /** 83 | * Return an arbitrary registered MR: 84 | * return -1 if no MR is registered to RdmaCtrl, 85 | * otherwise return the index of the first MR found. 86 | */ 87 | int get_default_mr(MemoryAttr &attr); 88 | 89 | /** 90 | * Create and query QPs. 91 | * For creation, an optional local_attr can be provided to bind to this QP: 92 | * it is used as the default local MR for this QP. 93 | * If local_attr == nullptr, the QP is not bound to any MR. 94 | */ 95 | RCQP *create_rc_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *local_attr = NULL); 96 | UDQP *create_ud_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *local_attr = NULL); 97 | 98 | RCQP *get_rc_qp(QPIdx idx); 99 | UDQP *get_ud_qp(QPIdx idx); 100 | 101 | /** 102 | * Some helper functions (example usage of RdmaCtrl). 103 | * Fully link the QPs in a symmetric way for this thread. 
104 | * For example, node 0 can connect to node 1, while node 1 connect to node 0. 105 | */ 106 | bool link_symmetric_rcqps(const std::vector &cluster, 107 | int l_mrid,int mr_id,int wid,int idx = 0); 108 | 109 | private: 110 | class RdmaCtrlImpl; 111 | std::unique_ptr impl_; 112 | }; 113 | } // namespace rdmaio 114 | 115 | #include "rdma_ctrl_impl.hpp" // real implemeatation here 116 | -------------------------------------------------------------------------------- /rdma_ctrl_impl.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace rdmaio { 6 | 7 | /** 8 | * Simple critical section 9 | * It uses a single global block to guard RdmaCtrl. 10 | * This is acceptable, since RdmaCtrl is only the control plane. 11 | */ 12 | class SCS { 13 | public: 14 | SCS() { 15 | get_lock().lock(); 16 | } 17 | 18 | ~SCS() { 19 | get_lock().unlock(); 20 | } 21 | 22 | private: 23 | static std::mutex &get_lock() { 24 | static std::mutex lock; 25 | return lock; 26 | } 27 | }; 28 | 29 | /** 30 | * convert qp idx(node,worker,idx) -> key 31 | */ 32 | inline uint32_t get_rc_key (const QPIdx idx) { 33 | return ::rdmaio::encode_qp_id(idx.node_id,RC_ID_BASE + idx.worker_id * 64 + idx.index); 34 | } 35 | 36 | inline uint32_t get_ud_key(const QPIdx idx) { 37 | return ::rdmaio::encode_qp_id(idx.worker_id,UD_ID_BASE + idx.index); 38 | } 39 | 40 | /** 41 | * Control plane of RLib 42 | */ 43 | class RdmaCtrl::RdmaCtrlImpl { 44 | public: 45 | RdmaCtrlImpl(int node_id, int tcp_base_port,connection_callback_t callback,std::string local_ip): 46 | node_id_(node_id), 47 | tcp_base_port_(tcp_base_port), 48 | local_ip_(local_ip), 49 | qp_callback_(callback) 50 | { 51 | // start the background thread to handle QP connection request 52 | pthread_attr_t attr; 53 | pthread_attr_init(&attr); 54 | pthread_create(&handler_tid_, &attr, &RdmaCtrlImpl::connection_handler_wrapper,this); 55 | } 56 | 57 | ~RdmaCtrlImpl() { 58 | running_ = false; // wait for the handler to join 59 | pthread_join(handler_tid_,NULL); 60 | RDMA_LOG(INFO) << "rdma controler close: does not handle any future connections."; 61 | } 62 | 63 | RNicHandler *open_thread_local_device(DevIdx idx) { 64 | // already openend device 65 | if(rnic_instance() != nullptr) 66 | return rnic_instance(); 67 | 68 | auto handler = open_device(idx); 69 | rnic_instance() = handler; 70 | return rnic_instance(); 71 | } 72 | 73 | RNicHandler *open_device(DevIdx idx) { 74 | 75 | RNicHandler *rnic = nullptr; 76 | 77 | struct ibv_device **dev_list = nullptr; struct ibv_context *ib_ctx = nullptr; struct ibv_pd *pd = nullptr; int num_devices; 78 | int rc; // return code 79 | 80 | dev_list = ibv_get_device_list(&num_devices); 81 | 82 | if(idx.dev_id >= num_devices || idx.dev_id < 0) { 83 | RDMA_LOG(WARNING) << "wrong dev_id: " << idx.dev_id << "; total " << num_devices <<" found"; 84 | goto OPEN_END; 85 | } 86 | 87 | // alloc ctx 88 | ib_ctx = ibv_open_device(dev_list[idx.dev_id]); 89 | if(ib_ctx == nullptr) { 90 | RDMA_LOG(WARNING) << "failed to open ib ctx w error: " << strerror(errno); 91 | goto OPEN_END; 92 | } 93 | 94 | // alloc pd 95 | pd = ibv_alloc_pd(ib_ctx); 96 | if(pd == nullptr) { 97 | RDMA_LOG(WARNING) << "failed to alloc pd w error: " << strerror(errno); 98 | RDMA_VERIFY(INFO,ibv_close_device(ib_ctx) == 0) << "failed to close device " << idx.dev_id; 99 | goto OPEN_END; 100 | } 101 | 102 | // fill the lid 103 | ibv_port_attr port_attr; 104 | rc = ibv_query_port (ib_ctx, idx.port_id, &port_attr); 105 
| if(rc < 0) { 106 | RDMA_LOG(WARNING) << "failed to query port status w error: " << strerror(errno); 107 | RDMA_VERIFY(INFO,ibv_close_device(ib_ctx) == 0) << "failed to close device " << idx.dev_id; 108 | RDMA_VERIFY(INFO,ibv_dealloc_pd(pd) == 0) << "failed to dealloc pd"; 109 | goto OPEN_END; 110 | } 111 | 112 | // success open 113 | { 114 | rnic = new RNicHandler(idx.dev_id,idx.port_id,ib_ctx,pd,port_attr.lid); 115 | } 116 | 117 | OPEN_END: 118 | if(dev_list != nullptr) 119 | ibv_free_device_list(dev_list); 120 | return rnic; 121 | } 122 | 123 | RCQP *get_rc_qp(QPIdx idx) { 124 | RCQP *res = nullptr; 125 | { 126 | SCS s; 127 | res = get_qp(idx); 128 | }; 129 | return res; 130 | } 131 | 132 | UDQP *get_ud_qp(QPIdx idx) { 133 | 134 | UDQP *res = nullptr; 135 | { 136 | SCS s; 137 | res = get_qp(idx); 138 | }; 139 | return res; 140 | } 141 | 142 | /** 143 | * Note! this is not a thread-safe function 144 | */ 145 | template 146 | T *get_qp(QPIdx idx) { 147 | uint32_t key = F(idx); 148 | if(qps_.find(key) == qps_.end()) 149 | return nullptr; 150 | else 151 | return dynamic_cast(qps_[key]); 152 | } 153 | 154 | RCQP *create_rc_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *attr) { 155 | 156 | RCQP *res = nullptr; 157 | { 158 | SCS s; 159 | uint64_t qid = get_rc_key(idx); 160 | if(qps_.find(qid) != qps_.end()) { 161 | res = dynamic_cast(qps_[qid]); 162 | } else { 163 | if(attr == NULL) 164 | res = new RCQP(dev,idx); 165 | else 166 | res = new RCQP(dev,idx,*attr); 167 | qps_.insert(std::make_pair(qid,res)); 168 | } 169 | }; 170 | return res; 171 | } 172 | 173 | UDQP *create_ud_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *attr) { 174 | 175 | UDQP *res = nullptr; 176 | uint64_t qid = get_ud_key(idx); 177 | 178 | { 179 | SCS s; 180 | if(qps_.find(qid) != qps_.end()) { 181 | res = dynamic_cast(qps_[qid]); 182 | } else { 183 | if(attr == NULL) 184 | res = new UDQP(dev,idx); 185 | else 186 | res = new UDQP(dev,idx,*attr); 187 | qps_.insert(std::make_pair(qid,res)); 188 | } 189 | }; 190 | return res; 191 | } 192 | 193 | bool register_memory(int mr_id,const char *buf,uint64_t size,RNicHandler *rnic,int flag) { 194 | 195 | Memory *m = new Memory(buf,size,rnic->pd,flag); 196 | if(!m->valid()) { 197 | RDMA_LOG(WARNING) << "register mr to rnic error: " << strerror(errno); 198 | delete m; 199 | return false; 200 | } 201 | { 202 | SCS s; 203 | if(mrs_.find(mr_id) != mrs_.end()) { 204 | RDMA_LOG(WARNING) << "mr " << mr_id << " has already been registered!"; 205 | delete m; 206 | } else { 207 | mrs_.insert(std::make_pair(mr_id,m)); 208 | } 209 | }; 210 | return true; 211 | } 212 | 213 | int get_default_mr(MemoryAttr &attr) { 214 | SCS s; 215 | for(auto it = mrs_.begin();it != mrs_.end();++it) { 216 | int idx = it->first; attr = it->second->rattr; 217 | return idx; 218 | } 219 | return -1; 220 | } 221 | 222 | MemoryAttr get_local_mr(int mr_id) { 223 | MemoryAttr attr = {}; 224 | { 225 | SCS s; 226 | if(mrs_.find(mr_id) != mrs_.end()) 227 | attr = mrs_[mr_id]->rattr; 228 | } 229 | return attr; 230 | } 231 | 232 | void clear_dev_info() { 233 | cached_infos_.clear(); 234 | } 235 | 236 | static std::vector query_devs_helper() { 237 | int num_devices = 0; struct ibv_device **dev_list = nullptr; 238 | std::vector res; 239 | 240 | { // query the device and its active ports using the underlying APIs 241 | dev_list = ibv_get_device_list(&num_devices); 242 | int temp_devices = num_devices; 243 | 244 | if(dev_list == nullptr) { 245 | RDMA_LOG(ERROR) << "cannot get ib devices."; 246 | num_devices = 0; 247 | goto QUERY_END; 248 | } 
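      /* Walk every device: open it to learn its name; a device that fails to
         open is logged, skipped, and excluded from the reported count. */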
249 | 250 | for(uint dev_id = 0;dev_id < temp_devices;++dev_id) { 251 | 252 | struct ibv_context *ib_ctx = ibv_open_device(dev_list[dev_id]); 253 | if(ib_ctx == nullptr) { 254 | RDMA_LOG(ERROR) << "open dev " << dev_id << " error: " << strerror(errno) << " ignored"; 255 | num_devices -= 1; 256 | continue; 257 | } 258 | res.emplace_back(ibv_get_device_name(ib_ctx->device),dev_id,ib_ctx); 259 | QUERY_DEV_END: 260 | // close ib_ctx 261 | RDMA_VERIFY(INFO,ibv_close_device(ib_ctx) == 0) << "failed to close device " << dev_id; 262 | } 263 | } 264 | 265 | QUERY_END: 266 | if(dev_list != nullptr) 267 | ibv_free_device_list(dev_list); 268 | return res; 269 | } 270 | 271 | std::vector query_devs() { 272 | 273 | if(cached_infos_.size() != 0) { 274 | return cached_infos_; 275 | } 276 | cached_infos_ = query_devs_helper(); 277 | return std::vector(cached_infos_.begin(),cached_infos_.end()); 278 | } 279 | 280 | RdmaCtrl::DevIdx convert_port_idx(int idx) { 281 | 282 | if(cached_infos_.size() == 0) 283 | query_devs(); 284 | 285 | for(int dev_id = 0; dev_id < cached_infos_.size();++dev_id) { 286 | 287 | int port_num = cached_infos_[dev_id].active_ports.size(); 288 | 289 | for(int port_id = 1; port_id <= port_num; port_id++) { 290 | if(idx == 0) { 291 | // find one 292 | return DevIdx {.dev_id = dev_id,.port_id = port_id}; 293 | } 294 | idx -= 1; 295 | } 296 | } 297 | // failed to find the dev according to the idx 298 | return DevIdx {.dev_id = -1,.port_id = -1}; 299 | } 300 | 301 | RNicHandler *get_device() { 302 | return rnic_instance(); 303 | } 304 | 305 | void close_device() { 306 | if(rnic_instance() != nullptr) delete rnic_instance(); 307 | rnic_instance() = nullptr; 308 | } 309 | 310 | void close_device(RNicHandler *rnic) { 311 | if(rnic != nullptr) 312 | delete rnic; 313 | } 314 | 315 | static void *connection_handler_wrapper(void *context) 316 | { 317 | return ((RdmaCtrlImpl *)context)->connection_handler(); 318 | } 319 | 320 | /** 321 | * Using TCP to connect in-coming QP & MR requests 322 | */ 323 | void *connection_handler(void) { 324 | 325 | pthread_detach(pthread_self()); 326 | 327 | auto listenfd = PreConnector::get_listen_socket(local_ip_,tcp_base_port_); 328 | 329 | int opt = 1; 330 | RDMA_VERIFY(ERROR,setsockopt(listenfd,SOL_SOCKET,SO_REUSEADDR | SO_REUSEPORT,&opt,sizeof(int)) == 0) 331 | << "unable to configure socket status."; 332 | RDMA_VERIFY(ERROR,listen(listenfd,24) == 0) << "TCP listen error: " << strerror(errno); 333 | 334 | while(running_) { 335 | 336 | asm volatile("" ::: "memory"); 337 | 338 | struct sockaddr_in cli_addr = {0}; 339 | socklen_t clilen = sizeof(cli_addr); 340 | auto csfd = accept(listenfd,(struct sockaddr *) &cli_addr, &clilen); 341 | 342 | if(csfd < 0) { 343 | RDMA_LOG(ERROR) << "accept a wrong connection error: " << strerror(errno); 344 | continue; 345 | } 346 | 347 | if(!PreConnector::wait_recv(csfd,6000)) { 348 | close(csfd); 349 | continue; 350 | } 351 | 352 | ConnArg arg; 353 | auto n = recv(csfd,(char *)(&arg),sizeof(ConnArg), MSG_WAITALL); 354 | 355 | if(n != sizeof(ConnArg)) { 356 | // an invalid message 357 | close(csfd); 358 | continue; 359 | } 360 | 361 | ConnReply reply; reply.ack = ERR; 362 | 363 | { // in a global critical section 364 | SCS s; 365 | switch(arg.type) { 366 | case ConnArg::MR: 367 | if(mrs_.find(arg.payload.mr.mr_id) != mrs_.end()) { 368 | memcpy((char *)(&(reply.payload.mr)), 369 | (char *)(&(mrs_[arg.payload.mr.mr_id]->rattr)),sizeof(MemoryAttr)); 370 | reply.ack = SUCC; 371 | }; 372 | break; 373 | case ConnArg::QP: { 374 | 
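          /* QP bootstrap: the peer's ConnArg names (from_node, from_worker,
             qp_type); look up the matching local QP and, if it exists (and is
             ready, for UD), reply with its QPAttr so the peer can move its own
             QP to RTR/RTS. Otherwise reply.ack stays ERR. */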
            qp_callback_(arg.payload.qp); // call the user callback
            QP *qp = NULL;
            switch(arg.payload.qp.qp_type) {
              case IBV_QPT_UD:
              {
                UDQP *ud_qp = get_qp<UDQP,get_ud_key>(
                    create_ud_idx(arg.payload.qp.from_node,arg.payload.qp.from_worker));
                if(ud_qp != nullptr && ud_qp->ready()) {
                  qp = ud_qp;
                }
              }
              break;
              case IBV_QPT_RC:
              {
                RCQP *rc_qp = get_qp<RCQP,get_rc_key>(
                    create_rc_idx(arg.payload.qp.from_node,arg.payload.qp.from_worker));
                qp = rc_qp;
              }
              break;
              default:
                RDMA_LOG(ERROR) << "unknown QP connection type: " << arg.payload.qp.qp_type;
            }
            if(qp != nullptr) {
              reply.payload.qp = qp->get_attr();
              reply.ack = SUCC;
            }
            reply.payload.qp.node_id = node_id_;
            break;
          }
          default:
            RDMA_LOG(WARNING) << "received unknown connect type " << arg.type;
        }
      } // end simple critical section protection

      PreConnector::send_to(csfd,(char *)(&reply),sizeof(ConnReply));
      PreConnector::wait_close(csfd); // wait for the client to close the connection
    }
    // end of the server
    close(listenfd);
    return nullptr;
  }

 private:
  friend class RdmaCtrl;
  static RNicHandler* &rnic_instance() {
    static thread_local RNicHandler *handler = NULL;
    return handler;
  }

  std::vector<RNicInfo> cached_infos_;

  // registered MRs at this control manager
  std::map<int,Memory *> mrs_;

  // created QPs on this control manager
  std::map<uint64_t,QP *> qps_;

  // local node information
  const int node_id_;
  const int tcp_base_port_;
  const std::string local_ip_;

  pthread_t handler_tid_;
  bool running_ = true;

  // connection callback function
  connection_callback_t qp_callback_;

  bool link_symmetric_rcqps(const std::vector<std::string> &cluster,int l_mrid,int mr_id,int wid,int idx) {

    std::vector<bool> ready_list(cluster.size(),false);
    std::vector<MemoryAttr> mrs;

    MemoryAttr local_mr = get_local_mr(l_mrid);

    for(auto s : cluster) {
      // get the target mr
   retry:
      MemoryAttr mr = {};
      auto rc = QP::get_remote_mr(s,tcp_base_port_,mr_id,&mr);
      if(rc != SUCC) {
        usleep(2000);
        goto retry;
      }
      mrs.push_back(mr);
    }

    RDMA_ASSERT(mrs.size() == cluster.size());

    while(true) {
      int connected = 0, i = 0;
      for(auto s : cluster) {

        if(ready_list[i]) {
          i++; connected++;
          continue;
        }
        RCQP *qp = create_rc_qp(QPIdx {.node_id = i,.worker_id = wid,.index = idx },
                                get_device(),&local_mr);
        RDMA_ASSERT(qp != nullptr);

        if(qp->connect(s,tcp_base_port_,
                       QPIdx {.node_id = node_id_,.worker_id = wid, .index = idx}) == SUCC) {
          ready_list[i] = true;
          connected++;
          qp->bind_remote_mr(mrs[i]);
        }
        i++;
      }
      if(connected == (int)cluster.size())
        break;
      else
        usleep(1000);
    }
    return true; // never fails: unconnected links are simply retried above
  }

  void register_qp_callback(connection_callback_t callback) {
    qp_callback_ = callback;
  }
}; // end class RdmaCtrlImpl

// link to the main class
inline __attribute__ ((always_inline))
RdmaCtrl::RdmaCtrl(int node_id, int tcp_base_port,connection_callback_t callback,std::string ip)
    :impl_(new RdmaCtrlImpl(node_id,tcp_base_port,callback,ip)){
}

inline __attribute__ ((always_inline))
RdmaCtrl::~RdmaCtrl() {
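  // impl_ is a smart pointer to the pimpl object created in the constructor
  // above; resetting it runs ~RdmaCtrlImpl.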
  impl_.reset();
}

inline __attribute__ ((always_inline))
std::vector<RNicInfo> RdmaCtrl::query_devs() {
  return impl_->query_devs();
}

inline __attribute__ ((always_inline))
void RdmaCtrl::clear_dev_info() {
  return impl_->clear_dev_info();
}

inline __attribute__ ((always_inline))
RNicHandler *RdmaCtrl::get_device() {
  return impl_->get_device();
}

inline __attribute__ ((always_inline))
RNicHandler *RdmaCtrl::open_thread_local_device(DevIdx idx) {
  return impl_->open_thread_local_device(idx);
}

inline __attribute__ ((always_inline))
RNicHandler *RdmaCtrl::open_device(DevIdx idx) {
  return impl_->open_device(idx);
}

inline __attribute__ ((always_inline))
void RdmaCtrl::close_device() {
  return impl_->close_device();
}

inline __attribute__ ((always_inline))
void RdmaCtrl::close_device(RNicHandler *rnic) {
  return impl_->close_device(rnic);
}

inline __attribute__ ((always_inline))
RdmaCtrl::DevIdx RdmaCtrl::convert_port_idx(int idx) {
  return impl_->convert_port_idx(idx);
}

inline __attribute__ ((always_inline))
bool RdmaCtrl::register_memory(int id,const char *buf,uint64_t size,RNicHandler *rnic,int flag) {
  return impl_->register_memory(id,buf,size,rnic,flag);
}

inline __attribute__ ((always_inline))
MemoryAttr RdmaCtrl::get_local_mr(int mr_id) {
  return impl_->get_local_mr(mr_id);
}

inline __attribute__ ((always_inline))
int RdmaCtrl::get_default_mr(MemoryAttr &attr) {
  return impl_->get_default_mr(attr);
}

inline __attribute__ ((always_inline))
RCQP *RdmaCtrl::create_rc_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *attr) {
  return impl_->create_rc_qp(idx,dev,attr);
}

inline __attribute__ ((always_inline))
UDQP *RdmaCtrl::create_ud_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *attr) {
  return impl_->create_ud_qp(idx,dev,attr);
}

inline __attribute__ ((always_inline))
RCQP *RdmaCtrl::get_rc_qp(QPIdx idx) {
  return impl_->get_rc_qp(idx);
}

inline __attribute__ ((always_inline))
UDQP *RdmaCtrl::get_ud_qp(QPIdx idx) {
  return impl_->get_ud_qp(idx);
}

inline __attribute__ ((always_inline))
int RdmaCtrl::current_node_id() {
  return impl_->node_id_;
}

inline __attribute__ ((always_inline))
int RdmaCtrl::listening_port() {
  return impl_->tcp_base_port_;
}

inline __attribute__ ((always_inline))
bool RdmaCtrl::link_symmetric_rcqps(const std::vector<std::string> &cluster,
                                    int l_mrid,int mr_id,int wid,int idx) {
  return impl_->link_symmetric_rcqps(cluster,l_mrid,mr_id,wid,idx);
}

inline __attribute__ ((always_inline))
std::vector<RNicInfo> RdmaCtrl::query_devs_helper() {
  return RdmaCtrlImpl::query_devs_helper();
}

inline __attribute__ ((always_inline))
void RdmaCtrl::register_qp_callback(connection_callback_t callback) {
  impl_->register_qp_callback(callback);
}

} // namespace rdmaio

--------------------------------------------------------------------------------
/rnic.hpp:
--------------------------------------------------------------------------------
#pragma once

#include <vector>
#include <sstream>

#include "logging.hpp"

namespace rdmaio {

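// A short usage sketch for the types below, driven through RdmaCtrl as in
// rdma_ctrl_impl.hpp above (ctrl: an RdmaCtrl instance):
//
//   RdmaCtrl::DevIdx idx {.dev_id = 0, .port_id = 1};
//   RNicHandler *nic = ctrl->open_thread_local_device(idx);
//   address_t addr = nic->query_addr(); // GID-derived address of the port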
// The GID-derived address of a particular port on the RNIC.
typedef struct {
  uint64_t subnet_prefix;
  uint64_t interface_id;
  uint32_t local_id;
} address_t;

struct RNicInfo {

  typedef struct {
    uint port_id;
    std::string link_layer;
  } PortInfo;

  RNicInfo(const char *name,int id,ibv_context *ctx):
      dev_id(id),
      dev_name(name)
  {
    query_port_infos(ctx);
    query_active_gids(ctx);
  }

  bool query_dev_attribute(ibv_context *ctx,ibv_device_attr &attr) {
    int rc = ibv_query_device(ctx, &attr);
    if(rc != 0) {
      RDMA_LOG(ERROR) << "query device attribute error: " << strerror(errno);
      return false;
    }
    return true;
  }

  // fill in the active_ports
  void query_port_infos(ibv_context *ctx) {

    ibv_device_attr attr;
    if(!query_dev_attribute(ctx,attr))
      return;

    // query port info
    for(uint port_id = 1;port_id <= attr.phys_port_cnt;++port_id) {

      struct ibv_port_attr port_attr;
      int rc = ibv_query_port(ctx, port_id, &port_attr);
      if(rc != 0) {
        RDMA_LOG(ERROR) << "query port_id " << port_id << " on device " << dev_id << " error.";
        continue;
      }

      // check port status; IBV_PORT_ACTIVE* are ibv_port_state values,
      // so the logical state field is the one to check
      if(port_attr.state != IBV_PORT_ACTIVE && port_attr.state != IBV_PORT_ACTIVE_DEFER) {
        RDMA_LOG(WARNING) << "port_id " << port_id << " on device " << dev_id << " is not active.";
        continue;
      }

      std::string link_layer = "";
      switch (port_attr.link_layer) {
        case IBV_LINK_LAYER_ETHERNET:
          link_layer = "RoCE";
          break;
        case IBV_LINK_LAYER_INFINIBAND:
          link_layer = "Infiniband";
          break;
        default:
          RDMA_LOG(WARNING) << "unknown link layer at this port: " << port_attr.link_layer;
          link_layer = "Unknown";
      }
      active_ports.push_back({port_id,link_layer});
    }
  }

  /**
   * Assumes all active ports on this RNIC share the same GID table,
   * so only the first active port is queried.
   */
  void query_active_gids(ibv_context *ctx) {

    if(active_ports.size() == 0)
      return;

    int port_id = active_ports[0].port_id;
    struct ibv_port_attr port_attr;
    int rc = ibv_query_port(ctx, port_id, &port_attr);

    if(rc != 0) {
      RDMA_LOG(WARNING) << "query port attribute at dev " << dev_name << ",port " << port_id
                        << "; w error: " << strerror(errno);
      return;
    }

    for(uint i = 0;i < port_attr.gid_tbl_len;++i) {
      ibv_gid gid = {};
      auto ret = ibv_query_gid(ctx,port_id, i, &gid);
      if (ret == 0 && gid.global.interface_id) {
        active_gids.push_back(i);
      }
    }
  }

  void print() const {
    RDMA_LOG(3) << to_string();
  }

  std::string to_string() const {
    std::ostringstream oss;

    oss << "device " << dev_name << " has "<< active_ports.size() << " active ports.";
    for(auto i : active_ports) {
      oss << "port " << i.port_id << " w link layer " << i.link_layer << ".";
    }
    for(uint i = 0;i < active_gids.size();++i) {
      oss << "active gid " << active_gids[i] << ".";
    }
    return oss.str();
  }

  int dev_id;
  std::string dev_name;

  std::vector<PortInfo> active_ports;
  std::vector<int> active_gids;
};

class RdmaCtrl;
struct RNicHandler {

  RNicHandler(int dev_id,int port_id,ibv_context *ctx,ibv_pd *pd,int lid,int gid = 0):
      dev_id(dev_id),
      port_id(port_id),
      ctx(ctx),
      pd(pd),
      lid(lid),
      gid(gid)
  {
  }

  address_t query_addr() {
    return query_addr(gid);
  }

  address_t query_addr(uint8_t gid_index) {

    ibv_gid gid;
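    // For an InfiniBand port the GID holds subnet_prefix + interface_id; for
    // RoCE it encodes the interface's IP address. address_t carries exactly
    // these two fields plus the chosen GID index.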
    RDMA_VERIFY(INFO,ibv_query_gid(ctx,port_id,gid_index,&gid) == 0)
        << "failed to query gid " << gid_index << " at device " << dev_id;

    address_t addr {
      .subnet_prefix = gid.global.subnet_prefix,
      .interface_id = gid.global.interface_id,
      .local_id = gid_index
    };
    return addr;
  }

 private:
  friend class RdmaCtrl;
  ~RNicHandler() {
    // free the pd first: it belongs to ctx and must be deallocated
    // before the device is closed
    RDMA_VERIFY(INFO,ibv_dealloc_pd(pd) == 0) << "failed to dealloc pd at device " << dev_id
                                              << "; w error " << strerror(errno);
    RDMA_VERIFY(INFO,ibv_close_device(ctx) == 0) << "failed to close device " << dev_id;
  }

 public:
  uint16_t dev_id;  // which RNIC
  uint16_t port_id; // which port

  struct ibv_context *ctx;
  struct ibv_pd *pd;
  uint16_t lid;
  uint16_t gid;
};

} // namespace rdmaio
--------------------------------------------------------------------------------
/ud_adapter.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "msg_interface.hpp"
#include "rdma_ctrl.hpp"
#include "ralloc/ralloc.h"

/**
 * This adapter uses UD QPs; its design is based on FaSST RPC.
 */
namespace rdmaio {

class UDRecvManager {
 public:
  UDRecvManager(UDQP *qp,int max_recv_num,MemoryAttr local_mr):
      qp_(qp),max_recv_num_(max_recv_num)
  {
    RDMA_ASSERT(max_recv_num_ <= UDQPImpl::MAX_RECV_SIZE)
        << "UD can register at most " << UDQPImpl::MAX_RECV_SIZE << " buffers.";
    // allocate local heap
    RThreadLocalInit();

    /*
     * recv_buf_size must not exceed the real packet size minus the size of
     * the GRH header; otherwise the message cannot be received.
     */
    int recv_buf_size = MAX_PACKET_SIZE;
    RDMA_ASSERT(recv_buf_size <= MAX_PACKET_SIZE);

    // init receive related structures
    for(uint i = 0;i < max_recv_num_;++i) {
      struct ibv_sge sge {
        .addr = (uintptr_t)(Rmalloc(recv_buf_size)),
        .length = (uint32_t)recv_buf_size,
        .lkey = local_mr.key
      };
      RDMA_ASSERT(sge.addr != 0) << "failed to allocate recv buffer.";
      sges_[i] = sge;

      rrs_[i].wr_id = sges_[i].addr;
      rrs_[i].sg_list = &sges_[i];
      rrs_[i].num_sge = 1;

      rrs_[i].next = (i < (max_recv_num_ - 1)) ? &rrs_[i + 1] : &rrs_[0];
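      // The recv WRs form a ring: the last entry links back to rrs_[0].
      // post_recvs() below temporarily cuts the ring (tail->next = NULL) so a
      // whole batch can be posted with a single ibv_post_recv call, then
      // restores the link afterwards.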
    }

    post_recvs(max_recv_num_);

    // now the qp can receive connection requests
    qp_->set_ready();
  }
 public:
  // the size of the global routing header (GRH)
  static const int GRH_SIZE = 40;
  static const int MAX_PACKET_SIZE = 4096 - GRH_SIZE;

 protected:

  UDQP *qp_ = nullptr;

  int recv_head_ = 0;
  int idle_recv_num_ = 0;
  int max_idle_recv_num_ = 1;
  int max_recv_num_ = 0;

  struct ibv_recv_wr rrs_[UDQPImpl::MAX_RECV_SIZE];
  struct ibv_sge sges_[UDQPImpl::MAX_RECV_SIZE];
  struct ibv_wc wcs_[UDQPImpl::MAX_RECV_SIZE];
  struct ibv_recv_wr *bad_rr_;

  void post_recvs(int recv_num) {

    if(recv_num <= 0) {
      return;
    }

    int tail = recv_head_ + recv_num - 1;
    if(tail >= max_recv_num_)
      tail -= max_recv_num_;

    ibv_recv_wr *head_rr = rrs_ + recv_head_;
    ibv_recv_wr *tail_rr = rrs_ + tail;
    ibv_recv_wr *temp = tail_rr->next;
    tail_rr->next = NULL; // cut the ring so only [head,tail] is posted

    int rc = ibv_post_recv(qp_->qp_,head_rr,&bad_rr_);
    if(rc != 0) {
      RDMA_LOG(ERROR) << "post recv " << recv_num << "; w error: " << strerror(errno);
    }
    tail_rr->next = temp; // restore the ring
    recv_head_ = (tail + 1) % max_recv_num_;
  }
};

class UDAdapter : public MsgAdapter, public UDRecvManager {
  static const int MAX_UD_SEND_DOORBELL = 16;
 public:
  UDAdapter(std::shared_ptr<RdmaCtrl> cm, RNicHandler *rnic, MemoryAttr local_mr,
            int w_id, int max_recv_num):
      UDRecvManager(cm->create_ud_qp(create_ud_idx(w_id,RECV_QP_IDX),rnic,&local_mr),max_recv_num,local_mr),
      node_id_(cm->current_node_id()),
      worker_id_(w_id),
      send_qp_(cm->create_ud_qp(create_ud_idx(w_id,SEND_QP_IDX),rnic,&local_mr))
  {
    // init send structures
    for(uint i = 0;i < MAX_UD_SEND_DOORBELL;++i) {
      srs_[i].opcode = IBV_WR_SEND_WITH_IMM;
      srs_[i].num_sge = 1;
      srs_[i].imm_data = ::rdmaio::encode_qp_id(node_id_,worker_id_);
      RDMA_ASSERT(::rdmaio::decode_qp_mac(srs_[i].imm_data) == node_id_);
      srs_[i].next = &srs_[i+1];
      srs_[i].sg_list = &ssges_[i];

      ssges_[i].lkey = local_mr.key;
    }
  }

  ConnStatus connect(std::string ip,int port) {
    return send_qp_->connect(ip,port,create_ud_idx(worker_id_,RECV_QP_IDX));
  }

  ConnStatus send_to(int node_id,const char *msg,int len) {

    RDMA_ASSERT(current_idx_ == 0) << "There are pending reqs in the msg queue.";
    srs_[0].wr.ud.ah = send_qp_->ahs_[node_id];
    srs_[0].wr.ud.remote_qpn = send_qp_->attrs_[node_id].qpn;
    srs_[0].wr.ud.remote_qkey = DEFAULT_QKEY;
    srs_[0].sg_list = &ssges_[0];
    srs_[0].next = NULL;

    srs_[0].send_flags = ((send_qp_->queue_empty()) ? IBV_SEND_SIGNALED : 0)
                       | ((len < MAX_INLINE_SIZE) ? IBV_SEND_INLINE : 0);
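    // Selective signaling: only a send posted to an empty send queue carries
    // IBV_SEND_SIGNALED; need_poll()/pendings below bound the number of
    // outstanding unsignaled sends, so the send queue cannot overflow.
    // Small payloads are inlined to skip the lkey/DMA path.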

    ssges_[0].addr = (uint64_t)msg;
    ssges_[0].length = len;

    if(send_qp_->need_poll()) {
      ibv_wc wc; auto ret = send_qp_->poll_till_completion(wc);
      RDMA_ASSERT(ret == SUCC) << "poll UD completion reply error: " << ret;
      send_qp_->pendings = 0;
    } else
      send_qp_->pendings += 1;

    int rc = ibv_post_send(send_qp_->qp_, &srs_[0], &bad_sr_);
    // reset the next ptr, which was cleared above for this single send
    srs_[0].next = &srs_[1];
    return (rc == 0) ? SUCC : ERR;
  }

  void prepare_pending() {
    RDMA_ASSERT(current_idx_ == 0);
  }

  ConnStatus send_pending(int node_id,const char *msg,int len) {

    auto i = current_idx_++;
    srs_[i].wr.ud.ah = send_qp_->ahs_[node_id];
    srs_[i].wr.ud.remote_qpn = send_qp_->attrs_[node_id].qpn;
    srs_[i].wr.ud.remote_qkey = DEFAULT_QKEY;

    srs_[i].send_flags = ((send_qp_->queue_empty()) ? IBV_SEND_SIGNALED : 0)
                       | ((len < MAX_INLINE_SIZE) ? IBV_SEND_INLINE : 0);

    if(send_qp_->need_poll()) {
      ibv_wc wc; auto ret = send_qp_->poll_till_completion(wc);
      RDMA_ASSERT(ret == SUCC) << "poll UD completion reply error: " << ret;
      send_qp_->pendings = 0;
    } else {
      send_qp_->pendings += 1;
    }

    ssges_[i].addr = (uintptr_t)msg;
    ssges_[i].length = len;

    if(current_idx_ >= MAX_UD_SEND_DOORBELL)
      return flush_pending(); // the doorbell batch is full
    return SUCC;
  }

  ConnStatus flush_pending() {
    if(current_idx_ > 0) {
      srs_[current_idx_ - 1].next = NULL;
      auto ret = ibv_post_send(send_qp_->qp_, &srs_[0], &bad_sr_);
      srs_[current_idx_ - 1].next = &srs_[current_idx_];
      current_idx_ = 0;
      return (ret == 0) ? SUCC : ERR;
    }
    return SUCC;
  }

  void poll_comps() {

    int poll_result = ibv_poll_cq(qp_->recv_cq_,UDQPImpl::MAX_RECV_SIZE,wcs_);
    RDMA_ASSERT(poll_result >= 0) << "poll recv cq error";
    /**
     * The reply messages are batched in this call
     */
    prepare_pending();
    for(int i = 0;i < poll_result;++i) { // poll_result: number of completions
      RDMA_ASSERT(wcs_[i].status == IBV_WC_SUCCESS)
          << "error wc status " << wcs_[i].status << " at " << worker_id_;
      // wr_id stores the recv buffer's address; the payload starts after the GRH
      callback_((const char *)(wcs_[i].wr_id + GRH_SIZE),::rdmaio::decode_qp_mac(wcs_[i].imm_data),
                ::rdmaio::decode_qp_index(wcs_[i].imm_data));
    }
    flush_pending(); // send the batched replies
    idle_recv_num_ += poll_result;
    if(idle_recv_num_ > max_idle_recv_num_) {
      // re-post recvs to the QP
      post_recvs(idle_recv_num_);
      idle_recv_num_ = 0;
    }
  }

 private:
  const int node_id_;   // my node id
  const int worker_id_; // my thread id
  /**
   * sender structures
   */
  UDQP *send_qp_ = nullptr;
  ibv_send_wr srs_[MAX_UD_SEND_DOORBELL];
  ibv_sge ssges_[MAX_UD_SEND_DOORBELL];
  struct ibv_send_wr *bad_sr_ = nullptr;

  int current_idx_ = 0;

  static const int RECV_QP_IDX = 1;
  static const int SEND_QP_IDX = 0;
};

} // namespace rdmaio
--------------------------------------------------------------------------------