├── CMakeLists.txt ├── README.md ├── common.hpp ├── example ├── client.cpp └── server.cpp ├── logging.hpp ├── mr.hpp ├── msg_interface.hpp ├── pre_connector.hpp ├── qp.hpp ├── qp_impl.hpp ├── ralloc ├── Makefile ├── README ├── include-x86_64 │ ├── atomic.h │ ├── bitops.h │ ├── cpu.h │ ├── double-list.h │ └── queue.h ├── new_delete.cpp ├── ralloc.h ├── ssmalloc.c └── ssmalloc.h ├── rdma_ctrl.hpp ├── rdma_ctrl_impl.hpp ├── rnic.hpp └── ud_adapter.hpp /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project (rlib) 2 | 3 | cmake_minimum_required(VERSION 2.8) 4 | 5 | ## use C++11 features 6 | add_definitions(-std=c++11) 7 | 8 | set(CMAKE_INCLUDE_CURRENT_DIR ON) 9 | set(CMAKE_CXX_COMPILER /usr/bin/g++) 10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") 11 | 12 | ## path to ralloc lib, which you need to build manually 13 | set(RALLOC_LIB "${PROJECT_SOURCE_DIR}/ralloc/libssmalloc.a") 14 | 15 | ## include paths 16 | include_directories(ralloc) 17 | 18 | file(GLOB SOURCES "*.hpp") 19 | 20 | add_library(rdma STATIC ${SOURCES}) 21 | set_target_properties(rdma PROPERTIES LINKER_LANGUAGE CXX) 22 | target_link_libraries(rdma -lpthread ibverbs ${RALLOC_LIB}) 23 | 24 | add_executable(server "example/server.cpp") 25 | add_executable(client "example/client.cpp") 26 | 27 | target_link_libraries(server rdma) 28 | target_link_libraries(client rdma) 29 | 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## RLib 2 | 3 | ### Notice 4 | The latest version has been transferred to https://github.com/wxdwfc/rlibv2, which is now actively maintained. 5 | 6 | ### Intro 7 | 8 | RLib is a header-only library for **easier** use of RDMA using C++. Basically it is a set of wrappers over the interfaces of `libibverbs`, 9 | yet it additionally handles many tedious things, such as establishing connections between RDMA QPs, and simplifies many configurations. 10 | 11 | ------ 12 | 13 | ### To use 14 | 15 | `#include "rdma_ctrl.hpp"` is all you need. 16 | 17 | ------ 18 | 19 | ### Example 20 | 21 | Usually very few lines of code are needed to use RDMA with RLib. Below is a snippet of using RLib to implement a 22 | simple pingpong application using one-sided RDMA primitives. 23 | 24 | Server side 25 | ```c++ 26 | /** 27 | * Note, RDMA usually uses some other communication method (e.g. TCP/IP) to exchange QP information. 28 | * RLib uses TCP for the pre-communication. 29 | */ 30 | int server_node_id = 1; 31 | int tcp_port = 8888; 32 | int client_port = 8000; 33 | 34 | using namespace rdmaio; 35 | 36 | RdmaCtrl *c = new RdmaCtrl(server_node_id,tcp_port); 37 | RdmaCtrl::DevIdx idx {.dev_id = 0,.port_id = 1 }; // using the first RNIC's first port 38 | c->open_thread_local_device(idx); 39 | 40 | // register a buffer to the previously opened device, using id = 73 41 | char *buffer = (char *)malloc(4096); 42 | memset(buffer, 0, 4096); 43 | RDMA_ASSERT(c->register_memory(73,buffer,4096,c->get_device()) == true); 44 | 45 | char s[] = "hello world"; 46 | memcpy(buffer, s, strlen(s)); 47 | 48 | MemoryAttr local_mr = c->get_local_mr(73); RCQP *qp = c->create_rc_qp(create_rc_idx(1,0),c->get_device(),&local_mr); 49 | 50 | // server also needs to "connect" the client.
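// (client_ip below is the client's hostname or IP string; its definition is omitted from this snippet)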
51 | while(qp->connect(client_ip,client_port) != SUCC) { 52 | usleep(2000); 53 | } 54 | 55 | while(true) { 56 | // This is RDMA, server does not need to do anything :) 57 | sleep(1); 58 | } 59 | ``` 60 | 61 | Client side 62 | ```c++ 63 | int client_node_id = 0; 64 | int tcp_port = 8000; 65 | int server_port = 8888; 66 | 67 | using namespace rdmaio; 68 | 69 | RdmaCtrl *c = new RdmaCtrl(client_node_id,tcp_port); 70 | RdmaCtrl::DevIdx idx {.dev_id = 0,.port_id = 1 }; // using the first RNIC's first port 71 | c->open_thread_local_device(idx); 72 | 73 | // register a buffer to the previously opened device, using id = 73 74 | char *buffer = (char *)malloc(4096); 75 | RDMA_ASSERT(c->register_memory(73,buffer,4096,c->get_device()) == true); 76 | 77 | // get remote server's memory information 78 | MemoryAttr mr; 79 | while(QP::get_remote_mr(server_ip,server_port,73,&mr) != SUCC) { 80 | usleep(2000); 81 | } 82 | 83 | // create the RC qp to access remote server's memory, using the previously registered memory 84 | MemoryAttr local_mr = c->get_local_mr(73); RCQP *qp = c->create_rc_qp(create_rc_idx(1,0),c->get_device(),&local_mr); 85 | qp->bind_remote_mr(mr); // bind to the previously allocated mr 86 | 87 | while(qp->connect(server_ip,server_port) != SUCC) { 88 | usleep(2000); 89 | } 90 | 91 | // main pingpong loop 92 | 93 | ibv_wc wc; 94 | while(true) { 95 | char *local_buf = buffer; 96 | uint64_t address = 0; 97 | int msg_len = 11; // length of "hello world" 98 | // read the message from the server 99 | auto rc = qp->post_send(IBV_WR_RDMA_READ,local_buf,msg_len,address,IBV_SEND_SIGNALED); 100 | qp->poll_till_completion(wc, no_timeout); 101 | // then get the results, stored in the local buffer 102 | } 103 | 104 | ``` 105 | 106 | ### Acknowledgments 107 | TODO 108 | -------------------------------------------------------------------------------- /common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <infiniband/verbs.h> 4 | 5 | #include "logging.hpp" 6 | #include "rnic.hpp" 7 | #include "mr.hpp" 8 | 9 | namespace rdmaio { 10 | 11 | // connection status 12 | enum ConnStatus { 13 | SUCC = 0, 14 | TIMEOUT = 1, 15 | WRONG_ARG = 2, 16 | ERR = 3, 17 | NOT_READY = 4, 18 | UNKNOWN = 5 19 | }; 20 | 21 | /** 22 | * The connection information exchanged between different QPs. 23 | * RC/UC QPs use lid & addr to connect to remote QPs, while qpn is used when posting send requests. 24 | * node_id & port_id are used by UD QPs to create addresses. 25 | */ 26 | struct QPAttr { 27 | address_t addr; 28 | uint16_t lid; 29 | uint32_t qpn; 30 | uint32_t psn; 31 | uint16_t node_id; 32 | uint16_t port_id; 33 | }; 34 | 35 | /** 36 | * The QP connection requests sent to remote. 37 | * from_node & from_worker identify which QP it shall connect to 38 | */ 39 | struct QPConnArg { 40 | uint16_t from_node; 41 | uint8_t from_worker; 42 | uint8_t qp_type; // RC QP or UD QP 43 | }; 44 | 45 | /** 46 | * The MR connection requests sent to remote.
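* mr_id names a memory region registered at the remote RdmaCtrl; the reply carries that region's MemoryAttr (buf & key).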
47 | */ 48 | struct MRConnArg { 49 | uint64_t mr_id; 50 | }; 51 | 52 | struct ConnArg { 53 | enum { MR, QP } type; 54 | union { 55 | QPConnArg qp; 56 | MRConnArg mr; 57 | } payload; 58 | }; 59 | 60 | struct ConnReply { 61 | ConnStatus ack; 62 | union { 63 | QPAttr qp; 64 | MemoryAttr mr; 65 | } payload; 66 | }; 67 | 68 | inline int convert_mtu(ibv_mtu type) { 69 | int mtu = 0; 70 | switch(type) { 71 | case IBV_MTU_256: 72 | mtu = 256; 73 | break; 74 | case IBV_MTU_512: 75 | mtu = 512; 76 | break; 77 | case IBV_MTU_1024: 78 | mtu = 1024; 79 | break; 80 | case IBV_MTU_2048: 81 | mtu = 2048; 82 | break; 83 | case IBV_MTU_4096: 84 | mtu = 4096; 85 | break; 86 | } 87 | return mtu; 88 | } 89 | 90 | // The structure used to configure UDQP 91 | typedef struct { 92 | int max_send_size; 93 | int max_recv_size; 94 | int qkey; 95 | int psn; 96 | } UDConfig; 97 | 98 | typedef struct { 99 | int access_flags; 100 | int max_rd_atomic; 101 | int max_dest_rd_atomic; 102 | int rq_psn; 103 | int sq_psn; 104 | int timeout; 105 | } RCConfig; 106 | 107 | } // namespace rdmaio 108 | -------------------------------------------------------------------------------- /example/client.cpp: -------------------------------------------------------------------------------- 1 | #include "rdma_ctrl.hpp" 2 | #include <cstdio> 3 | #include <unistd.h> 4 | 5 | int client_node_id = 0; 6 | int tcp_port = 8000; 7 | int server_port = 8888; 8 | 9 | using namespace rdmaio; 10 | 11 | int main(int argc, char *argv[]) 12 | { 13 | RdmaCtrl *c = new RdmaCtrl(client_node_id,tcp_port); 14 | RdmaCtrl::DevIdx idx {.dev_id = 0,.port_id = 1 }; // using the first RNIC's first port 15 | c->open_thread_local_device(idx); 16 | 17 | // register a buffer to the previously opened device, using id = 73 18 | char *buffer = (char *)malloc(4096); 19 | memset(buffer, 0, 4096); 20 | RDMA_ASSERT(c->register_memory(73,buffer,4096,c->get_device()) == true); 21 | 22 | // get remote server's memory information 23 | MemoryAttr remote_mr; 24 | while(QP::get_remote_mr("localhost",server_port,73,&remote_mr) != SUCC) { 25 | usleep(2000); 26 | } 27 | 28 | // create the RC qp to access remote server's memory, using the previously registered memory 29 | MemoryAttr local_mr = c->get_local_mr(73); 30 | RCQP *qp = c->create_rc_qp(create_rc_idx(1,0),c->get_device(), &local_mr); 31 | qp->bind_remote_mr(remote_mr); // bind to the previously allocated mr 32 | 33 | while(qp->connect("localhost",server_port) != SUCC) { 34 | usleep(2000); 35 | } 36 | 37 | printf("client: QP connected!\n"); 38 | ibv_wc wc; 39 | char *local_buf = buffer; 40 | uint64_t address = 0; 41 | int msg_len = 11; // length of "hello world" 42 | 43 | // read the message from the server 44 | auto rc = qp->post_send(IBV_WR_RDMA_READ,local_buf,msg_len,address,IBV_SEND_SIGNALED); 45 | if (rc == SUCC) { 46 | printf("client: post ok\n"); 47 | } else { 48 | printf("client: post fail. rc=%d\n", rc); 49 | } 50 | rc = qp->poll_till_completion(wc, no_timeout); 51 | // then get the results, stored in the local buffer 52 | if (rc == SUCC) { 53 | printf("client: poll ok\n"); 54 | printf("msg read: %s\n", local_buf); 55 | } else { 56 | printf("client: poll fail. rc=%d\n", rc); 57 | } 58 | 59 | return 0; 60 | 61 | } 62 | 63 | -------------------------------------------------------------------------------- /example/server.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "rdma_ctrl.hpp" 3 | #include <cstdio> 4 | #include <unistd.h> 5 | 6 | 7 | /** 8 | * Note, RDMA usually uses some other communication method (e.g.
TCP/IP) to exchange QP information. 9 | * RLib uses TCP for the pre-communication. 10 | */ 11 | int server_node_id = 1; 12 | int tcp_port = 8888; 13 | int client_port = 8000; 14 | 15 | using namespace rdmaio; 16 | 17 | int main(int argc, char *argv[]) 18 | { 19 | RdmaCtrl *c = new RdmaCtrl(server_node_id,tcp_port); 20 | RdmaCtrl::DevIdx idx {.dev_id = 0,.port_id = 1 }; // using the first RNIC's first port 21 | c->open_thread_local_device(idx); 22 | 23 | // register a buffer to the previously opened device, using id = 73 24 | char *buffer = (char *)malloc(4096); 25 | memset(buffer, 0, 4096); 26 | RDMA_ASSERT(c->register_memory(73,buffer,4096,c->get_device()) == true); 27 | 28 | char s[] = "hello world"; 29 | memcpy(buffer, s, strlen(s)); 30 | 31 | MemoryAttr local_mr = c->get_local_mr(73); 32 | RCQP *qp = c->create_rc_qp(create_rc_idx(1,0),c->get_device(), &local_mr); 33 | 34 | // server also needs to "connect" the client. 35 | while(qp->connect("localhost", client_port, create_rc_idx(1,0)) != SUCC) { 36 | usleep(2000); 37 | } 38 | 39 | printf("server: QP connected!\n"); 40 | while(true) { 41 | // This is RDMA, server does not need to do anything :) 42 | sleep(1); 43 | } 44 | 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /logging.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * The logging utilities used in libRDMA. 3 | */ 4 | 5 | #pragma once 6 | 7 | #include <sstream> 8 | #include <iostream> 9 | 10 | namespace rdmaio { 11 | 12 | /** 13 | * \def FATAL 14 | * Used for fatal and probably irrecoverable conditions 15 | * \def ERROR 16 | * Used for errors which are recoverable within the scope of the function 17 | * \def WARNING 18 | * Logs interesting conditions which are probably not fatal 19 | * \def EMPH 20 | * Outputs as INFO, but in WARNING colors. Useful for 21 | * outputting information you want to emphasize. 22 | * \def INFO 23 | * Used for providing general useful information 24 | * \def DEBUG 25 | * Debugging purposes only 26 | * \def EVERYTHING 27 | * Log everything 28 | */ 29 | 30 | enum loglevel { 31 | NONE = 7, 32 | FATAL = 6, 33 | ERROR = 5, 34 | WARNING = 4, 35 | EMPH = 3, 36 | INFO = 2, 37 | DEBUG = 1, 38 | EVERYTHING = 0 39 | }; 40 | 41 | #define unlikely(x) __builtin_expect(!!(x), 0) 42 | 43 | #ifndef RDMA_LOG_LEVEL 44 | #define RDMA_LOG_LEVEL ::rdmaio::INFO 45 | #endif 46 | 47 | // logging macro definitions 48 | // default log 49 | #define RDMA_LOG(n) \ 50 | if (n >= RDMA_LOG_LEVEL) \ 51 | ::rdmaio::MessageLogger((char*)__FILE__, __LINE__, n).stream() 52 | 53 | // log with tag 54 | #define RDMA_TLOG(n,t) \ 55 | if(n >= RDMA_LOG_LEVEL) \ 56 | ::rdmaio::MessageLogger((char*)__FILE__, __LINE__, n).stream() \ 57 | << "[" << (t) << "]" 58 | 59 | #define RDMA_LOG_IF(n,condition) \ 60 | if(n >= RDMA_LOG_LEVEL && (condition)) \ 61 | ::rdmaio::MessageLogger((char*)__FILE__, __LINE__, n).stream() 62 | 63 | #define RDMA_ASSERT(condition) \ 64 | if(unlikely(!(condition))) \ 65 | ::rdmaio::MessageLogger((char*)__FILE__, __LINE__, ::rdmaio::FATAL + 1).stream() << "Assertion!
" 66 | 67 | #define RDMA_VERIFY(n,condition) RDMA_LOG_IF(n,(!(condition))) 68 | 69 | class MessageLogger { 70 | public: 71 | MessageLogger(const char *file, int line, int level) :level_(level) { 72 | if(level_ < RDMA_LOG_LEVEL) 73 | return; 74 | stream_ << "[" << StripBasename(std::string(file)) << ":" << line << "] "; 75 | } 76 | 77 | ~MessageLogger() { 78 | if(level_ >= RDMA_LOG_LEVEL) { 79 | stream_ << "\n"; 80 | std::cout << "\033[" << RDMA_DEBUG_LEVEL_COLOR[std::min(level_,6)] << "m" 81 | << stream_.str() << EndcolorFlag(); 82 | if(level_ >= ::rdmaio::FATAL) 83 | abort(); 84 | } 85 | } 86 | 87 | // Return the stream associated with the logger object. 88 | std::stringstream &stream() { return stream_; } 89 | private: 90 | std::stringstream stream_; 91 | int level_; 92 | 93 | // control flags for color 94 | #define R_BLACK 39 95 | #define R_RED 31 96 | #define R_GREEN 32 97 | #define R_YELLOW 33 98 | #define R_BLUE 34 99 | #define R_MAGENTA 35 100 | #define R_CYAN 36 101 | #define R_WHITE 37 102 | 103 | const int RDMA_DEBUG_LEVEL_COLOR[7] = {R_BLACK,R_YELLOW,R_BLACK,R_GREEN,R_MAGENTA,R_RED,R_RED}; 104 | 105 | static std::string StripBasename(const std::string &full_path) { 106 | const char kSeparator = '/'; 107 | size_t pos = full_path.rfind(kSeparator); 108 | if (pos != std::string::npos) { 109 | return full_path.substr(pos + 1, std::string::npos); 110 | } else { 111 | return full_path; 112 | } 113 | } 114 | 115 | static std::string EndcolorFlag() { 116 | char flag[7]; 117 | snprintf(flag,7, "%c[0m", 0x1B); 118 | return std::string(flag); 119 | } 120 | }; 121 | 122 | }; 123 | -------------------------------------------------------------------------------- /mr.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "logging.hpp" 5 | 6 | namespace rdmaio { 7 | 8 | struct MemoryAttr { 9 | uintptr_t buf; 10 | uint32_t key; 11 | }; 12 | 13 | class Memory { 14 | public: 15 | /** 16 | * The default protection flag of a memory region. 17 | * In default, the memory can be read/write by local and remote RNIC operations. 
18 | */ 19 | static const int DEFAULT_PROTECTION_FLAG = (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | \ 20 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC ); 21 | 22 | Memory(const char *addr,uint64_t len,ibv_pd *pd,int flag): 23 | addr(addr), 24 | len(len), 25 | mr(ibv_reg_mr(pd,(void *)addr,len,flag)) 26 | { 27 | if(mr == nullptr) { 28 | RDMA_LOG(WARNING) << "failed to register mr, for addr " << addr << "; len " << len; 29 | } else { 30 | rattr.buf = (uintptr_t)addr; 31 | rattr.key = mr->rkey; 32 | } 33 | } 34 | 35 | ~Memory() { 36 | if(mr != nullptr) { 37 | int rc = ibv_dereg_mr(mr); 38 | RDMA_LOG_IF(ERROR,rc != 0) << "dereg mr error: " << strerror(errno); 39 | } 40 | } 41 | 42 | bool valid() { 43 | return mr != nullptr; 44 | } 45 | 46 | const char *addr; 47 | uint64_t len; 48 | 49 | MemoryAttr rattr; // RDMA registered attr 50 | ibv_mr *mr = nullptr; // mr in the driver 51 | }; 52 | 53 | 54 | }; // namespace rdmaio 55 | -------------------------------------------------------------------------------- /msg_interface.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <functional> 4 | #include <set> 5 | #include <string> 6 | 7 | #include "common.hpp" 8 | 9 | namespace rdmaio { 10 | 11 | typedef std::function msg_callback_t_; 12 | 13 | /** 14 | * An abstract message interface 15 | * Assumption: one per thread 16 | */ 17 | class MsgAdapter { 18 | public: 19 | 20 | MsgAdapter(msg_callback_t_ callback) 21 | : callback_(callback) { 22 | } 23 | 24 | MsgAdapter() { 25 | 26 | } 27 | 28 | void set_callback(msg_callback_t_ callback) { 29 | callback_ = callback; 30 | } 31 | 32 | virtual ConnStatus connect(std::string ip,int port) = 0; 33 | 34 | /** 35 | * Basic send interfaces 36 | */ 37 | virtual ConnStatus send_to(int node_id,const char *msg,int len) = 0; 38 | 39 | virtual ConnStatus send_to(int node_id,int tid,const char *msg,int len) { 40 | return send_to(node_id,msg,len); 41 | } 42 | 43 | /** 44 | * Interfaces which allow batching at the sender's side 45 | */ 46 | virtual void prepare_pending() { 47 | } 48 | 49 | virtual ConnStatus send_pending(int node_id,const char *msg,int len) { 50 | RDMA_ASSERT(false); // not implemented 51 | } 52 | 53 | virtual ConnStatus send_pending(int node_id,int tid,const char *msg,int len) { 54 | return send_pending(node_id,msg,len); 55 | } 56 | 57 | /** 58 | * Flush all the currently pending messages 59 | */ 60 | virtual ConnStatus flush_pending() { 61 | return SUCC; 62 | } 63 | 64 | /** 65 | * Examples of using batching at the sender side 66 | * Broadcast the message to a set of servers 67 | */ 68 | virtual ConnStatus broadcast_to(const std::set<int> &nodes, const char *msg,int len) { 69 | prepare_pending(); 70 | for(auto it = nodes.begin(); it != nodes.end(); ++it) { 71 | send_pending(*it,msg,len); 72 | } 73 | flush_pending(); 74 | return SUCC; // TODO 75 | } 76 | 77 | virtual ConnStatus broadcast_to(int *nodes,int num, const char *msg,int len) { 78 | prepare_pending(); 79 | for(int i = 0;i < num;++i) { 80 | send_pending(nodes[i],msg,len); 81 | } 82 | flush_pending(); 83 | return SUCC; // TODO 84 | } 85 | 86 | /** 87 | * The receive function 88 | */ 89 | virtual void poll_comps() = 0; 90 | 91 | /** 92 | * The size of meta data used by the MsgAdapter for each message 93 | */ 94 | virtual int msg_meta_len() { 95 | return 0; 96 | } 97 | 98 | protected: 99 | msg_callback_t_ callback_; 100 | }; 101 | 102 | }; 103 | -------------------------------------------------------------------------------- /pre_connector.hpp:
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "logging.hpp" 4 | 5 | #include <netdb.h> //hostent 6 | #include <sys/socket.h> 7 | #include <arpa/inet.h> 8 | #include <unistd.h> 9 | #include <fcntl.h> 10 | #include <string.h> 11 | #include <sys/time.h> 12 | 13 | #include <map> 14 | 15 | namespace rdmaio { 16 | 17 | constexpr struct timeval default_timeout = {0,8000}; 18 | constexpr struct timeval no_timeout = {0,0}; // it means forever 19 | 20 | inline __attribute__ ((always_inline)) // inline to avoid multiple definitions 21 | int64_t diff_time(const struct timeval &end, const struct timeval &start) { // returns microseconds 22 | int64_t diff = (end.tv_sec > start.tv_sec)?(end.tv_sec - start.tv_sec) * 1000000:0; 23 | if (end.tv_usec > start.tv_usec) { 24 | diff += (end.tv_usec - start.tv_usec); 25 | } else { 26 | diff -= (start.tv_usec - end.tv_usec); 27 | } 28 | return diff; 29 | } 30 | 31 | class PreConnector { // helper class used to exchange QP information using TCP/IP 32 | public: 33 | static int get_listen_socket(const std::string &addr,int port) { 34 | 35 | struct sockaddr_in serv_addr; 36 | auto sockfd = socket(AF_INET, SOCK_STREAM, 0); 37 | RDMA_ASSERT(sockfd >= 0) << "ERROR opening listen socket: " << strerror(errno); 38 | 39 | /* setup the host_addr structure for use in bind call */ 40 | // server byte order 41 | serv_addr.sin_family = AF_INET; 42 | 43 | serv_addr.sin_addr.s_addr = INADDR_ANY; 44 | 45 | // port 46 | serv_addr.sin_port = htons(port); 47 | 48 | RDMA_ASSERT(bind(sockfd, (struct sockaddr *) &serv_addr, 49 | sizeof(serv_addr)) == 0) << "ERROR on binding: " << strerror(errno); 50 | return sockfd; 51 | } 52 | 53 | static int get_send_socket(const std::string &addr,int port,struct timeval timeout = default_timeout) { 54 | int sockfd; 55 | struct sockaddr_in serv_addr; 56 | 57 | RDMA_ASSERT((sockfd = socket(AF_INET, SOCK_STREAM, 0)) >= 0) << "Error open socket for send!"; 58 | fcntl(sockfd, F_SETFL, O_NONBLOCK); 59 | 60 | serv_addr.sin_family = AF_INET; 61 | serv_addr.sin_port = htons(port); 62 | 63 | auto ip = host_to_ip(addr); 64 | if(ip == "") { 65 | close(sockfd); 66 | return -1; 67 | } 68 | 69 | serv_addr.sin_addr.s_addr = inet_addr(ip.c_str()); 70 | 71 | if(connect(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) == -1) { 72 | if (errno == EINPROGRESS) { 73 | goto PROGRESS; 74 | } 75 | close(sockfd); 76 | return -1; 77 | } 78 | PROGRESS: 79 | // check return status 80 | fd_set fdset; 81 | FD_ZERO(&fdset); 82 | FD_SET(sockfd, &fdset); 83 | 84 | if(select(sockfd + 1, NULL, &fdset, NULL, &timeout) == 1) 85 | { 86 | int so_error; 87 | socklen_t len = sizeof so_error; 88 | 89 | getsockopt(sockfd, SOL_SOCKET, SO_ERROR, &so_error, &len); 90 | 91 | if (so_error == 0) { 92 | // success 93 | } else { 94 | close(sockfd); 95 | return -1; 96 | } 97 | } else { close(sockfd); return -1; } // select timed out or failed: the connection is not established 98 | 99 | return sockfd; 100 | } 101 | 102 | // timeout in microseconds 103 | static bool wait_recv(int socket, uint32_t timeout = 2000) { 104 | 105 | while(true) { 106 | 107 | fd_set rfds; 108 | FD_ZERO(&rfds); 109 | FD_SET(socket, &rfds); 110 | 111 | struct timeval s_timeout = {0,timeout}; 112 | int ready = select(socket + 1, &rfds, NULL, NULL, &s_timeout); 113 | RDMA_ASSERT(ready != -1); 114 | 115 | if(ready == 0) { // no file descriptor found 116 | continue; 117 | } 118 | 119 | if(ready < 0) { // error case 120 | RDMA_ASSERT(false) << "select error " << strerror(errno); 121 | } 122 | 123 | if (FD_ISSET(socket, &rfds)) { 124 | break; // ready 125 | } 126 | } 127 | return true; 128 | } 129 | 130 | static void wait_close(int socket) { 131 | 132 | shutdown(socket,
SHUT_WR); 133 | char buf[2]; 134 | 135 | struct timeval timeout={1,0}; 136 | auto ret = setsockopt(socket,SOL_SOCKET,SO_RCVTIMEO,(const char*)&timeout,sizeof(timeout)); 137 | RDMA_ASSERT(ret == 0); 138 | 139 | recv(socket,buf,2,0); 140 | close(socket); 141 | } 142 | 143 | static int send_to(int fd, char *usrbuf, size_t n) { 144 | size_t nleft = n; 145 | ssize_t nwritten; 146 | char *bufp = usrbuf; 147 | 148 | while (nleft > 0) { 149 | if ((nwritten = write(fd, bufp, nleft)) <= 0) { 150 | if (errno == EINTR) /* Interrupted by sig handler return */ 151 | nwritten = 0; /* and call write() again */ 152 | else 153 | return -1; /* errno set by write() */ 154 | } 155 | nleft -= nwritten; 156 | bufp += nwritten; 157 | } 158 | return n; 159 | } 160 | 161 | typedef std::map<std::string,std::string> ipmap_t; 162 | static ipmap_t &local_ip_cache() { 163 | static __thread ipmap_t cache; 164 | return cache; 165 | } 166 | 167 | static std::string host_to_ip(const std::string &host) { 168 | 169 | ipmap_t &cache = local_ip_cache(); // take a reference, so that the inserted entries persist 170 | if(cache.find(host) != cache.end()) 171 | return cache[host]; 172 | 173 | std::string res = ""; 174 | 175 | struct addrinfo hints, *infoptr; 176 | memset(&hints, 0, sizeof hints); 177 | hints.ai_family = AF_INET; // AF_INET means IPv4 only addresses 178 | 179 | int result = getaddrinfo(host.c_str(), NULL, &hints, &infoptr); 180 | if (result) { 181 | fprintf(stderr, "getaddrinfo: %s at %s\n", gai_strerror(result),host.c_str()); 182 | return ""; 183 | } 184 | char ip[64]; memset(ip,0,sizeof(ip)); 185 | 186 | for(struct addrinfo *p = infoptr; p != NULL; p = p->ai_next) { 187 | getnameinfo(p->ai_addr, p->ai_addrlen, ip, sizeof(ip), NULL, 0, NI_NUMERICHOST); 188 | } 189 | 190 | res = std::string(ip); 191 | if(res != "") 192 | cache.insert(std::make_pair(host,res)); 193 | return res; 194 | } 195 | 196 | }; 197 | 198 | 199 | }; // namespace rdmaio 200 | -------------------------------------------------------------------------------- /qp.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "qp_impl.hpp" // hide the implementation 5 | 6 | namespace rdmaio { 7 | 8 | /** 9 | * The QP managed by RLib is identified by the QPIdx 10 | * Basically it identifies which worker(thread) is using the QP, 11 | * and which machine this QP is connected to. 12 | */ 13 | typedef struct { 14 | int node_id; // the node this QP connects to 15 | int worker_id; // the thread/task this QP belongs to 16 | int index; // multiple QPs may be needed to connect to the node 17 | } QPIdx; 18 | 19 | // some macros for easily computing QP idx, since some fields use default values 20 | constexpr QPIdx create_rc_idx(int nid,int wid) { 21 | return QPIdx { 22 | .node_id = nid, 23 | .worker_id = wid, 24 | .index = 0 25 | }; 26 | } 27 | 28 | constexpr QPIdx create_ud_idx(int worker_id,int idx = 0) { 29 | return QPIdx { 30 | .node_id = 0, // a UD qp can connect to multiple machines 31 | .worker_id = worker_id, 32 | .index = idx 33 | }; 34 | } 35 | 36 | /** 37 | * Wrappers over ibv_qp & ibv_cq 38 | * For easy use and connection setup 39 | */ 40 | class QP { 41 | public: 42 | QP(RNicHandler *rnic,QPIdx idx): 43 | idx_(idx), 44 | rnic_(rnic) 45 | { 46 | } 47 | 48 | ~QP() { 49 | if(qp_ != nullptr) 50 | ibv_destroy_qp(qp_); 51 | if(cq_ != nullptr) 52 | ibv_destroy_cq(cq_); 53 | } 54 | /** 55 | * Connect to remote QP 56 | * Note, we leverage TCP for a pre-connect phase. 57 | * So the IP/Hostname and a TCP port must be given.
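* (The remote RdmaCtrl listens on that TCP port and replies with the peer QP's attributes.)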
 58 | * 59 | * WARNING: 60 | * This function actually contains two steps, connect + change QP status 61 | * maybe split to connect + change status for more flexibility? 62 | */ 63 | /** 64 | * connect to the specific QP at remote, specified by the nid and wid 65 | * return SUCC if the QP is ready. 66 | * return TIMEOUT if there is network error. 67 | * return NOT_READY if remote server fails to find the connected QP 68 | */ 69 | virtual ConnStatus connect(std::string ip,int port,QPIdx idx) = 0; 70 | 71 | // return when a completion event arrives 72 | // this call will block until the timeout expires 73 | virtual ConnStatus poll_till_completion(ibv_wc &wc, struct timeval timeout = default_timeout) { 74 | return QPImpl::poll_till_completion(cq_,wc,timeout); 75 | } 76 | 77 | void bind_local_mr(MemoryAttr attr) { 78 | local_mr_ = attr; 79 | } 80 | 81 | QPAttr get_attr() const { 82 | QPAttr res = { 83 | .addr = rnic_->query_addr(), 84 | .lid = rnic_->lid, 85 | .qpn = (qp_ != nullptr)?qp_->qp_num:0, 86 | .psn = DEFAULT_PSN, // TODO! this may be filled later 87 | .node_id = 0, // a place holder 88 | .port_id = rnic_->port_id 89 | }; 90 | return res; 91 | } 92 | 93 | /** 94 | * Get remote MR attribute 95 | */ 96 | static ConnStatus get_remote_mr(std::string ip,int port,int mr_id,MemoryAttr *attr) { 97 | return QPImpl::get_remote_mr(ip,port,mr_id,attr); 98 | } 99 | 100 | // QP identifiers 101 | const QPIdx idx_; 102 | 103 | public: 104 | // internal verbs structure 105 | struct ibv_qp *qp_ = NULL; 106 | struct ibv_cq *cq_ = NULL; 107 | 108 | // local MR used to post reqs 109 | MemoryAttr local_mr_; 110 | RNicHandler *rnic_; 111 | 112 | protected: 113 | ConnStatus get_remote_helper(ConnArg *arg, ConnReply *reply,std::string ip,int port) { 114 | return QPImpl::get_remote_helper(arg,reply,ip,port); 115 | } 116 | }; 117 | 118 | inline constexpr RCConfig default_rc_config() { 119 | return RCConfig { 120 | .access_flags = (IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC), 121 | .max_rd_atomic = 16, 122 | .max_dest_rd_atomic = 16, 123 | .rq_psn = DEFAULT_PSN, 124 | .sq_psn = DEFAULT_PSN, 125 | .timeout = 20 126 | }; 127 | } 128 | 129 | /** 130 | * Raw RC QP 131 | */ 132 | template <RCConfig (*F)(void) = default_rc_config> 133 | class RRCQP : public QP { 134 | public: 135 | RRCQP(RNicHandler *rnic,QPIdx idx, 136 | MemoryAttr local_mr,MemoryAttr remote_mr) 137 | :RRCQP(rnic,idx) { 138 | bind_local_mr(local_mr); 139 | bind_remote_mr(remote_mr); 140 | } 141 | 142 | RRCQP(RNicHandler *rnic,QPIdx idx,MemoryAttr local_mr) 143 | :RRCQP(rnic,idx) { 144 | bind_local_mr(local_mr); 145 | } 146 | 147 | RRCQP(RNicHandler *rnic,QPIdx idx) 148 | :QP(rnic,idx) 149 | { 150 | RCQPImpl::init<F>(qp_,cq_,rnic_); 151 | } 152 | 153 | ConnStatus connect(std::string ip,int port) { 154 | return connect(ip,port,idx_); 155 | } 156 | 157 | ConnStatus connect(std::string ip,int port,QPIdx idx) { 158 | 159 | // first check whether QP is valid to connect 160 | enum ibv_qp_state state; 161 | if( (state = QPImpl::query_qp_status(qp_)) != IBV_QPS_INIT) { 162 | if(state != IBV_QPS_RTS) 163 | RDMA_LOG(WARNING) << "qp not in a correct state to connect!"; 164 | return (state == IBV_QPS_RTS)?SUCC:UNKNOWN; 165 | } 166 | ConnArg arg = {} ; ConnReply reply = {}; 167 | arg.type = ConnArg::QP; 168 | arg.payload.qp.from_node = idx.node_id; 169 | arg.payload.qp.from_worker = idx.worker_id; 170 | arg.payload.qp.qp_type = IBV_QPT_RC; 171 | 172 | auto ret = QPImpl::get_remote_helper(&arg,&reply,ip,port); 173 | if(ret == SUCC) { 174 | // change QP status 175 |
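// an RC QP must traverse INIT -> RTR (ready to receive) -> RTS (ready to send); the two calls below perform the last two transitions using the attributes returned by the remote side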
if(!RCQPImpl::ready2rcv<F>(qp_,reply.payload.qp,rnic_)) { 176 | RDMA_LOG(WARNING) << "change qp status to ready to receive error: " << strerror(errno); 177 | ret = ERR; 178 | goto CONN_END; 179 | } 180 | 181 | if(!RCQPImpl::ready2send<F>(qp_)) { 182 | RDMA_LOG(WARNING) << "change qp status to ready to send error: " << strerror(errno); 183 | ret = ERR; 184 | goto CONN_END; 185 | } 186 | } 187 | CONN_END: 188 | return ret; 189 | } 190 | 191 | /** 192 | * Bind this QP's operation to a remote memory region according to the MemoryAttr. 193 | * Since one QP usually accesses *one memory region* almost all the time, 194 | * it is more convenient to use a bind-post;bind-post-post fashion. 195 | */ 196 | void bind_remote_mr(MemoryAttr attr) { 197 | remote_mr_ = attr; 198 | } 199 | 200 | ConnStatus post_send_to_mr(MemoryAttr &local_mr,MemoryAttr &remote_mr, 201 | ibv_wr_opcode op,char *local_buf,uint32_t len,uint64_t off,int flags, 202 | uint64_t wr_id = 0, uint32_t imm = 0) { 203 | ConnStatus ret = SUCC; 204 | struct ibv_send_wr *bad_sr; 205 | 206 | // setting the SGE 207 | struct ibv_sge sge { 208 | .addr = (uint64_t)local_buf, 209 | .length = len, 210 | .lkey = local_mr.key 211 | }; 212 | 213 | // setting sr, sr has to be initialized in this style 214 | struct ibv_send_wr sr; 215 | sr.wr_id = wr_id; 216 | sr.opcode = op; 217 | sr.num_sge = 1; 218 | sr.next = NULL; 219 | sr.sg_list = &sge; 220 | sr.send_flags = flags; 221 | sr.imm_data = imm; 222 | 223 | sr.wr.rdma.remote_addr = remote_mr.buf + off; 224 | sr.wr.rdma.rkey = remote_mr.key; 225 | 226 | auto rc = ibv_post_send(qp_,&sr,&bad_sr); 227 | return rc == 0 ? SUCC : ERR; 228 | } 229 | 230 | /** 231 | * Post request(s) to the sending QP. 232 | * This is just a wrapper of ibv_post_send 233 | */ 234 | ConnStatus post_send(ibv_wr_opcode op,char *local_buf,uint32_t len,uint64_t off,int flags, 235 | uint64_t wr_id = 0, uint32_t imm = 0) { 236 | return post_send_to_mr(local_mr_,remote_mr_,op,local_buf,len,off,flags,wr_id,imm); 237 | } 238 | 239 | // one-sided atomic operations 240 | ConnStatus post_cas(char *local_buf,uint64_t off, 241 | uint64_t compare,uint64_t swap,int flags,uint64_t wr_id = 0) { 242 | return post_atomic<IBV_WR_ATOMIC_CMP_AND_SWP>(local_buf,off,compare,swap,flags,wr_id); 243 | } 244 | 245 | // one-sided fetch and add 246 | ConnStatus post_faa(char *local_buf,uint64_t off,uint64_t add_value,int flags,uint64_t wr_id = 0) { 247 | return post_atomic<IBV_WR_ATOMIC_FETCH_AND_ADD>(local_buf,off,add_value,0 /* no swap value is needed*/,flags,wr_id); 248 | } 249 | 250 | template <ibv_wr_opcode type> 251 | ConnStatus post_atomic(char *local_buf,uint64_t off, 252 | uint64_t compare,uint64_t swap,int flags,uint64_t wr_id = 0) { 253 | static_assert(type == IBV_WR_ATOMIC_CMP_AND_SWP || type == IBV_WR_ATOMIC_FETCH_AND_ADD, 254 | "only two atomic operations are currently supported."); 255 | 256 | // check if address (off) is 8-byte aligned 257 | if((off & 0x7) != 0) { 258 | return WRONG_ARG; 259 | } 260 | 261 | ConnStatus ret = SUCC; 262 | struct ibv_send_wr *bad_sr; 263 | 264 | // setting the SGE 265 | struct ibv_sge sge { 266 | .addr = (uint64_t)local_buf, 267 | .length = sizeof(uint64_t), // atomic only supports 8-byte operation 268 | .lkey = local_mr_.key 269 | }; 270 | 271 | struct ibv_send_wr sr; 272 | sr.wr_id = wr_id; 273 | sr.opcode = type; 274 | sr.num_sge = 1; 275 | sr.next = NULL; 276 | sr.sg_list = &sge; 277 | sr.send_flags = flags; 278 | // remote memory 279 | sr.wr.atomic.rkey = remote_mr_.key; 280 | sr.wr.atomic.remote_addr = (off + remote_mr_.buf); 281 | sr.wr.atomic.compare_add = compare; 282 | sr.wr.atomic.swap =
swap; 283 | 284 | auto rc = ibv_post_send(qp_,&sr,&bad_sr); 285 | return rc == 0 ? SUCC : ERR; 286 | } 287 | 288 | ConnStatus post_batch(struct ibv_send_wr *send_sr,ibv_send_wr **bad_sr_addr,int num = 0) { 289 | auto rc = ibv_post_send(qp_,send_sr,bad_sr_addr); 290 | return rc == 0 ? SUCC : ERR; 291 | } 292 | 293 | /** 294 | * Poll completions. These are just wrappers of ibv_poll_cq 295 | */ 296 | int poll_send_completion(ibv_wc &wc) { 297 | return ibv_poll_cq(cq_,1,&wc); 298 | } 299 | 300 | ConnStatus poll_till_completion(ibv_wc &wc,struct timeval timeout = default_timeout) { 301 | auto ret = QP::poll_till_completion(wc,timeout); 302 | if(ret == SUCC) { 303 | low_watermark_ = high_watermark_; 304 | } 305 | return ret; 306 | } 307 | 308 | /** 309 | * Used to count pending reqs 310 | * XD: currently we use 64 as the default, but it is rather application-defined, 311 | * related to how the QP's send queue is created, etc 312 | */ 313 | bool need_poll(int threshold = (RCQPImpl::RC_MAX_SEND_SIZE / 2)) { 314 | return (high_watermark_ - low_watermark_) >= threshold; 315 | } 316 | 317 | uint64_t high_watermark_ = 0; 318 | uint64_t low_watermark_ = 0; 319 | 320 | MemoryAttr remote_mr_; 321 | }; 322 | 323 | inline constexpr UDConfig default_ud_config() { 324 | return UDConfig { 325 | .max_send_size = UDQPImpl::MAX_SEND_SIZE, 326 | .max_recv_size = UDQPImpl::MAX_RECV_SIZE, 327 | .qkey = DEFAULT_QKEY, 328 | .psn = DEFAULT_PSN 329 | }; 330 | } 331 | 332 | /** 333 | * Raw UD QP 334 | */ 335 | template <UDConfig (*F)(void) = default_ud_config> 336 | class RUDQP : public QP { 337 | // the QKEY is used to identify UD QP requests 338 | static const int DEFAULT_QKEY = 0xdeadbeaf; 339 | public: 340 | RUDQP(RNicHandler *rnic,QPIdx idx,MemoryAttr local_mr) 341 | :RUDQP(rnic,idx) { 342 | bind_local_mr(local_mr); 343 | } 344 | 345 | RUDQP(RNicHandler *rnic,QPIdx idx) 346 | :QP(rnic,idx) { 347 | UDQPImpl::init<F>(qp_,cq_,recv_cq_,rnic_); 348 | std::fill_n(ahs_,MAX_SERVER_NUM,nullptr); 349 | } 350 | 351 | bool queue_empty() { 352 | return pendings == 0; 353 | } 354 | 355 | bool need_poll(int threshold = UDQPImpl::MAX_SEND_SIZE / 2) { 356 | return pendings >= threshold; 357 | } 358 | 359 | /** 360 | * Simple wrapper to expose underlying QP structures 361 | */ 362 | inline __attribute__ ((always_inline)) 363 | ibv_cq *recv_queue() { 364 | return recv_cq_; 365 | } 366 | 367 | inline __attribute__ ((always_inline)) 368 | ibv_qp *send_qp() { 369 | return qp_; 370 | } 371 | 372 | ConnStatus connect(std::string ip,int port) { 373 | // a UD QP is not bound to one machine, so use idx to index 374 | return connect(ip,port,idx_); 375 | } 376 | 377 | ConnStatus connect(std::string ip,int port,QPIdx idx) { 378 | 379 | ConnArg arg; ConnReply reply; 380 | arg.type = ConnArg::QP; 381 | arg.payload.qp.from_node = idx.worker_id; 382 | arg.payload.qp.from_worker = idx.index; 383 | arg.payload.qp.qp_type = IBV_QPT_UD; 384 | 385 | auto ret = QPImpl::get_remote_helper(&arg,&reply,ip,port); 386 | 387 | if(ret == SUCC) { 388 | // create the ah, and store the address handler 389 | auto ah = UDQPImpl::create_ah(rnic_,reply.payload.qp); 390 | if(ah == nullptr) { 391 | RDMA_LOG(WARNING) << "create address handler error: " << strerror(errno); 392 | ret = ERR; 393 | } else { 394 | ahs_[reply.payload.qp.node_id] = ah; 395 | attrs_[reply.payload.qp.node_id] = reply.payload.qp; 396 | } 397 | } 398 | CONN_END: 399 | return ret; 400 | } 401 | 402 | /** 403 | * whether this UD QP has posted receives 404 | * a UD QP should first have receives posted; then it can be connected with others 405 | */ 406 |
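// i.e., call set_ready() only after receive buffers have been posted to this QP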
bool ready() { 407 | return ready_; 408 | } 409 | 410 | void set_ready() { 411 | ready_ = true; 412 | } 413 | 414 | friend class UDAdapter; 415 | private: 416 | /** 417 | * FIXME: currently we have limited servers, so we use an array. 418 | * using a map will affect the performance in microbenchmarks. 419 | * remove it, and merge this in UDAdapter? 420 | */ 421 | struct ibv_ah *ahs_[MAX_SERVER_NUM]; 422 | struct QPAttr attrs_[MAX_SERVER_NUM]; 423 | 424 | // current outstanding requests which have not been polled 425 | int pendings = 0; 426 | 427 | struct ibv_cq *recv_cq_ = NULL; 428 | bool ready_ = false; 429 | }; 430 | 431 | }; // end namespace rdmaio 432 | -------------------------------------------------------------------------------- /qp_impl.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <limits> 4 | 5 | #include "pre_connector.hpp" 6 | 7 | namespace rdmaio { 8 | 9 | const int MAX_INLINE_SIZE = 64; 10 | 11 | /** 12 | * These are magic numbers, serving as the keys / identifications 13 | * Currently we do not allow user-defined keys, but they can be simply added 14 | */ 15 | const uint32_t DEFAULT_QKEY = 0x111111; 16 | const uint32_t DEFAULT_PSN = 3185; 17 | 18 | /** 19 | * QP encoder, provides a default naming to identify QPs 20 | */ 21 | enum { 22 | RC_ID_BASE = 0, 23 | UC_ID_BASE = 10000, 24 | UD_ID_BASE = 20000 25 | }; 26 | 27 | inline constexpr uint32_t index_mask() { 28 | return 0xffff; 29 | } 30 | 31 | inline uint32_t mac_mask() { 32 | return ::rdmaio::index_mask() << 16; 33 | } 34 | 35 | inline uint32_t encode_qp_id(int m,int idx) { 36 | return static_cast<uint32_t>(static_cast<uint32_t>(m) << 16) | static_cast<uint32_t>(idx); 37 | } 38 | 39 | inline uint32_t decode_qp_mac(uint32_t key) { 40 | return (key & ::rdmaio::mac_mask()) >> 16; 41 | } 42 | 43 | inline uint32_t decode_qp_index(uint32_t key) { 44 | return key & ::rdmaio::index_mask(); 45 | } 46 | 47 | class QPImpl { 48 | public: 49 | QPImpl() = default; 50 | ~QPImpl() = default; 51 | 52 | static enum ibv_qp_state query_qp_status(ibv_qp *qp) { 53 | struct ibv_qp_attr attr; 54 | struct ibv_qp_init_attr init_attr; 55 | 56 | if (ibv_query_qp(qp, &attr,IBV_QP_STATE, &init_attr)) { 57 | RDMA_ASSERT(false) << "query qp should not fail"; 58 | } 59 | return attr.qp_state; 60 | } 61 | 62 | static ConnStatus get_remote_helper(ConnArg *arg, ConnReply *reply,std::string ip,int port) { 63 | 64 | ConnStatus ret = SUCC; 65 | 66 | auto socket = PreConnector::get_send_socket(ip,port); 67 | if(socket < 0) { 68 | return ERR; 69 | } 70 | 71 | auto n = send(socket,(char *)(arg),sizeof(ConnArg),0); 72 | 73 | if(n != sizeof(ConnArg)) { 74 | ret = ERR; goto CONN_END; 75 | } 76 | 77 | // receive reply 78 | if(!PreConnector::wait_recv(socket,10000)) { 79 | ret = TIMEOUT; goto CONN_END; 80 | } 81 | 82 | n = recv(socket,(char *)((reply)), sizeof(ConnReply), MSG_WAITALL); 83 | if(n != sizeof(ConnReply)) { 84 | ret = ERR; goto CONN_END; 85 | } 86 | if(reply->ack != SUCC) { 87 | ret = NOT_READY; goto CONN_END; 88 | } 89 | CONN_END: 90 | shutdown(socket,SHUT_RDWR); 91 | close(socket); 92 | return ret; 93 | } 94 | 95 | static ConnStatus get_remote_mr(std::string ip,int port,int mr_id,MemoryAttr *attr) { 96 | 97 | ConnArg arg; ConnReply reply; 98 | arg.type = ConnArg::MR; 99 | arg.payload.mr.mr_id = mr_id; 100 | 101 | auto ret = get_remote_helper(&arg,&reply,ip,port); 102 | if(ret == SUCC) { 103 | attr->key = reply.payload.mr.key; 104 | attr->buf = reply.payload.mr.buf; 105 | } 106 | return ret; 107 | } 108 | 109 | static ConnStatus
poll_till_completion(ibv_cq *cq,ibv_wc &wc, struct timeval timeout) { 110 | 111 | struct timeval start_time; gettimeofday (&start_time, nullptr); 112 | int poll_result = 0; int64_t diff; 113 | int64_t numeric_timeout = (timeout.tv_sec == 0 && timeout.tv_usec == 0) ? std::numeric_limits<int64_t>::max() : 114 | timeout.tv_sec * 1000000 + timeout.tv_usec; 115 | do { 116 | asm volatile("" ::: "memory"); 117 | poll_result = ibv_poll_cq (cq, 1, &wc); 118 | 119 | struct timeval cur_time; gettimeofday(&cur_time,nullptr); 120 | diff = diff_time(cur_time,start_time); 121 | } while((poll_result == 0) && (diff <= numeric_timeout)); 122 | 123 | if(poll_result == 0) { 124 | return TIMEOUT; 125 | } 126 | 127 | if(poll_result < 0) { 128 | RDMA_ASSERT(false); 129 | return ERR; 130 | } 131 | RDMA_LOG_IF(WARNING,wc.status != IBV_WC_SUCCESS) << 132 | "poll till completion error: " << wc.status << " " << ibv_wc_status_str(wc.status); 133 | return wc.status == IBV_WC_SUCCESS ? SUCC : ERR; 134 | } 135 | }; 136 | 137 | class RCQPImpl { 138 | public: 139 | RCQPImpl() = default; 140 | ~RCQPImpl() = default; 141 | 142 | static const int RC_MAX_SEND_SIZE = 128; 143 | static const int RC_MAX_RECV_SIZE = 512; 144 | 145 | template <RCConfig (*F)(void) = default_rc_config> 146 | static void ready2init(ibv_qp *qp,RNicHandler *rnic) { 147 | 148 | auto config = F(); 149 | 150 | struct ibv_qp_attr qp_attr = {}; 151 | qp_attr.qp_state = IBV_QPS_INIT; 152 | qp_attr.pkey_index = 0; 153 | qp_attr.port_num = rnic->port_id; 154 | qp_attr.qp_access_flags = config.access_flags; 155 | 156 | int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; 157 | int rc = ibv_modify_qp(qp, &qp_attr,flags); 158 | RDMA_VERIFY(WARNING,rc == 0) << "Failed to modify RC to INIT state: " << strerror(errno); 159 | 160 | if(rc != 0) { 161 | // error handling 162 | RDMA_LOG(WARNING) << " change state to init failed. "; 163 | } 164 | } 165 | 166 | template <RCConfig (*F)(void) = default_rc_config> 167 | static bool ready2rcv(ibv_qp *qp,QPAttr &attr,RNicHandler *rnic) { 168 | 169 | auto config = F(); 170 | 171 | struct ibv_qp_attr qp_attr = {}; 172 | 173 | qp_attr.qp_state = IBV_QPS_RTR; 174 | qp_attr.path_mtu = IBV_MTU_4096; 175 | qp_attr.dest_qp_num = attr.qpn; 176 | qp_attr.rq_psn = config.rq_psn; // should this match the sender's psn ? 177 | qp_attr.max_dest_rd_atomic = config.max_dest_rd_atomic; 178 | qp_attr.min_rnr_timer = 20; 179 | 180 | qp_attr.ah_attr.dlid = attr.lid; 181 | qp_attr.ah_attr.sl = 0; 182 | qp_attr.ah_attr.src_path_bits = 0; 183 | qp_attr.ah_attr.port_num = rnic->port_id; /* Local port!
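The GRH fields below (dgid, sgid_index, hop_limit) describe the global route to the remote NIC.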
 */ 184 | 185 | qp_attr.ah_attr.is_global = 1; 186 | qp_attr.ah_attr.grh.dgid.global.subnet_prefix = attr.addr.subnet_prefix; 187 | qp_attr.ah_attr.grh.dgid.global.interface_id = attr.addr.interface_id; 188 | qp_attr.ah_attr.grh.sgid_index = 0; 189 | qp_attr.ah_attr.grh.flow_label = 0; 190 | qp_attr.ah_attr.grh.hop_limit = 255; 191 | 192 | int flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN 193 | | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; 194 | auto rc = ibv_modify_qp(qp, &qp_attr,flags); 195 | return rc == 0; 196 | 197 | } 198 | 199 | template <RCConfig (*F)(void) = default_rc_config> 200 | static bool ready2send(ibv_qp *qp) { 201 | 202 | auto config = F(); 203 | 204 | int rc, flags; 205 | struct ibv_qp_attr qp_attr = {}; 206 | 207 | qp_attr.qp_state = IBV_QPS_RTS; 208 | qp_attr.sq_psn = config.sq_psn; 209 | qp_attr.timeout = config.timeout; 210 | qp_attr.retry_cnt = 7; 211 | qp_attr.rnr_retry = 7; 212 | qp_attr.max_rd_atomic = config.max_rd_atomic; 213 | qp_attr.max_dest_rd_atomic = config.max_dest_rd_atomic; 214 | 215 | flags = IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | 216 | IBV_QP_MAX_QP_RD_ATOMIC; 217 | rc = ibv_modify_qp(qp, &qp_attr,flags); 218 | return rc == 0; 219 | } 220 | 221 | template <RCConfig (*F)(void) = default_rc_config> 222 | static void init(ibv_qp *&qp,ibv_cq *&cq,RNicHandler *rnic) { 223 | 224 | // create the CQ 225 | cq = ibv_create_cq(rnic->ctx, RC_MAX_SEND_SIZE, nullptr, nullptr, 0); 226 | RDMA_VERIFY(WARNING,cq != nullptr) << "create cq error: " << strerror(errno); 227 | 228 | // create the QP 229 | struct ibv_qp_init_attr qp_init_attr = {}; 230 | 231 | qp_init_attr.send_cq = cq; 232 | qp_init_attr.recv_cq = cq; // TODO, need separate handling for two-sided over RC QP 233 | qp_init_attr.qp_type = IBV_QPT_RC; 234 | 235 | qp_init_attr.cap.max_send_wr = RC_MAX_SEND_SIZE; 236 | qp_init_attr.cap.max_recv_wr = RC_MAX_RECV_SIZE; /* Can be set to 1, if RC Two-sided is not required */ 237 | qp_init_attr.cap.max_send_sge = 1; 238 | qp_init_attr.cap.max_recv_sge = 1; 239 | qp_init_attr.cap.max_inline_data = MAX_INLINE_SIZE; 240 | 241 | qp = ibv_create_qp(rnic->pd, &qp_init_attr); 242 | RDMA_VERIFY(WARNING,qp != nullptr); 243 | 244 | if(qp) 245 | ready2init<F>(qp,rnic); 246 | } 247 | }; 248 | 249 | class UDQPImpl { 250 | public: 251 | UDQPImpl() = default; 252 | ~UDQPImpl() = default; 253 | 254 | static const int MAX_SEND_SIZE = 128; 255 | static const int MAX_RECV_SIZE = 2048; 256 | 257 | template <UDConfig (*F)(void) = default_ud_config> 258 | static void init(ibv_qp *&qp,ibv_cq *&cq,ibv_cq *&recv_cq,RNicHandler *rnic) { 259 | 260 | auto config = F(); // generate the config 261 | RDMA_ASSERT(config.max_send_size <= MAX_SEND_SIZE); 262 | RDMA_ASSERT(config.max_recv_size <= MAX_RECV_SIZE); 263 | 264 | if(qp != nullptr) 265 | return; 266 | 267 | if((cq = ibv_create_cq(rnic->ctx, config.max_send_size, nullptr, nullptr, 0)) == nullptr) { 268 | RDMA_LOG(ERROR) << "create send cq for UD QP error: " << strerror(errno); 269 | return; 270 | } 271 | 272 | if((recv_cq = ibv_create_cq(rnic->ctx, config.max_recv_size, nullptr, nullptr, 0)) == nullptr) { 273 | RDMA_LOG(ERROR) << "create recv cq for UD QP error: " << strerror(errno); 274 | return; 275 | } 276 | 277 | /* Initialize creation attributes */ 278 | struct ibv_qp_init_attr qp_init_attr = {}; 279 | qp_init_attr.send_cq = cq; 280 | qp_init_attr.recv_cq = recv_cq; 281 | qp_init_attr.qp_type = IBV_QPT_UD; 282 | 283 | qp_init_attr.cap.max_send_wr = config.max_send_size; 284 | qp_init_attr.cap.max_recv_wr = config.max_recv_size; 285 | qp_init_attr.cap.max_send_sge = 1; 286 |
qp_init_attr.cap.max_recv_sge = 1; 287 | qp_init_attr.cap.max_inline_data = MAX_INLINE_SIZE; 288 | 289 | if((qp = ibv_create_qp(rnic->pd, &qp_init_attr)) == nullptr) { 290 | RDMA_LOG(ERROR) << "create send qp for UD QP error: " << strerror(errno); 291 | return; 292 | } 293 | 294 | // change QP status 295 | ready2init(qp, rnic,config); // shall always succeed 296 | 297 | if(!ready2rcv(qp,rnic)) { 298 | RDMA_LOG(WARNING) << "change ud qp to ready to recv error: " << strerror(errno); 299 | } 300 | if(!ready2send(qp,config)) { 301 | RDMA_LOG(WARNING) << "change ud qp to ready to send error: " << strerror(errno); 302 | } 303 | } 304 | 305 | /** 306 | * Unlike RC, whose status changes happen at different places, so F, the function which generates configurations, 307 | * is passed as a template. UD QPs, on the other hand, change status all at once, so it is more convenient 308 | * to pass the configuration generated by F directly to the functions. 309 | */ 310 | static void ready2init(ibv_qp *qp,RNicHandler *rnic,UDConfig &config) { 311 | 312 | int rc, flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY; 313 | struct ibv_qp_attr qp_attr = {}; 314 | qp_attr.qp_state = IBV_QPS_INIT; 315 | qp_attr.pkey_index = 0; 316 | qp_attr.port_num = rnic->port_id; 317 | qp_attr.qkey = config.qkey; 318 | 319 | if((rc = ibv_modify_qp(qp, &qp_attr, flags)) != 0) { 320 | RDMA_LOG(WARNING) << "modify ud qp to init error: " << strerror(errno); 321 | } 322 | } 323 | 324 | static bool ready2rcv(ibv_qp *qp,RNicHandler *rnic) { 325 | 326 | int rc, flags = IBV_QP_STATE; 327 | struct ibv_qp_attr qp_attr = {}; 328 | qp_attr.qp_state = IBV_QPS_RTR; 329 | 330 | rc = ibv_modify_qp(qp, &qp_attr, flags); 331 | return rc == 0; 332 | } 333 | 334 | static bool ready2send(ibv_qp *qp,UDConfig &config) { 335 | 336 | int rc, flags = 0; 337 | struct ibv_qp_attr qp_attr = {}; 338 | qp_attr.qp_state = IBV_QPS_RTS; 339 | qp_attr.sq_psn = config.psn; 340 | 341 | flags = IBV_QP_STATE | IBV_QP_SQ_PSN; 342 | rc = ibv_modify_qp(qp, &qp_attr, flags); 343 | return rc == 0; 344 | } 345 | 346 | static ibv_ah *create_ah(RNicHandler *rnic,QPAttr &attr) { 347 | 348 | struct ibv_ah_attr ah_attr; 349 | ah_attr.is_global = 1; 350 | ah_attr.dlid = attr.lid; 351 | ah_attr.sl = 0; 352 | ah_attr.src_path_bits = 0; 353 | ah_attr.port_num = attr.port_id; 354 | 355 | ah_attr.grh.dgid.global.subnet_prefix = attr.addr.subnet_prefix; 356 | ah_attr.grh.dgid.global.interface_id = attr.addr.interface_id; 357 | ah_attr.grh.flow_label = 0; 358 | ah_attr.grh.hop_limit = 255; 359 | ah_attr.grh.sgid_index = rnic->gid; 360 | 361 | return ibv_create_ah(rnic->pd, &ah_attr); 362 | } 363 | 364 | }; 365 | 366 | } // namespace rdmaio 367 | -------------------------------------------------------------------------------- /ralloc/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS += -O3 -Wstrict-prototypes -fomit-frame-pointer -g -Wall 2 | #CFLAGS += -O3 -Wstrict-prototypes -fomit-frame-pointer -g -Wall 3 | 4 | CURDIR=${PWD} 5 | libdir = ${shell readlink -m $(CURDIR)/../lib } 6 | 7 | #libdir = /usr/lib 8 | #libdir = ../lib 9 | LDFLAGS += -lpthread -rpath $(libdir) -version-info 1 10 | CC = gcc 11 | CXX = g++ 12 | LD = ld 13 | ARCH = $(shell uname -m) 14 | 15 | # Check Architecture 16 | SUPPORTED_ARCH = NO 17 | 18 | ifeq ($(ARCH), x86_64) 19 | SUPPORTED_ARCH = YES 20 | endif 21 | 22 | ifeq ($(SUPPORTED_ARCH), NO) 23 | $(error Your architecture $(ARCH) is not currently supported. See README.)
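# only x86_64 is supported: the allocator relies on the inline-assembly headers under include-x86_64/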
24 | endif 25 | 26 | define compile_rule 27 | libtool --mode=compile --tag=CC \ 28 | $(CC) $(CFLAGS) $(CPPFLAGS) -Iinclude-$(ARCH) -c $< 29 | endef 30 | 31 | define cxx_compile_rule 32 | libtool --mode=compile --tag=CC \ 33 | $(CXX) $(CFLAGS) $(CPPFLAGS) -Iinclude-$(ARCH) -c $< 34 | endef 35 | 36 | define link_rule 37 | libtool --mode=link --tag=CC \ 38 | $(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS) 39 | endef 40 | 41 | LIBS = libssmalloc.la 42 | libssmalloc_OBJS = ssmalloc.lo new_delete.lo 43 | 44 | %.lo: %.c 45 | $(call compile_rule) 46 | %.lo: %.cpp 47 | $(call cxx_compile_rule) 48 | 49 | all: libssmalloc.la 50 | 51 | libssmalloc.la: $(libssmalloc_OBJS) 52 | $(call link_rule) 53 | cp .libs/libssmalloc.so ./ 54 | cp .libs/libssmalloc.a ./ 55 | 56 | install/%.la: %.la 57 | libtool --mode=install \ 58 | install -c $(notdir $@) $(libdir)/$(notdir $@) 59 | 60 | install: $(addprefix install/,$(LIBS)) 61 | libtool --mode=finish $(libdir) 62 | # mv libssmalloc.so ../lib 63 | # mv libssmalloc.a ../lib 64 | 65 | 66 | clean: 67 | libtool --mode=clean rm *.la *.lo *.a *.so -f 68 | 69 | -------------------------------------------------------------------------------- /ralloc/README: -------------------------------------------------------------------------------- 1 | This is a malloc for memory allocation in the RDMA-registered area. 2 | It is extended from ssmalloc, thus it is as efficient as ssmalloc. 3 | Notice that using this lib will also override the default malloc implementation. 4 | We will fix this later, but we strongly recommend you use this malloc since it is more efficient for multi-threaded programming. 5 | 6 | Current limitation: 7 | If the allocation size is larger than 128K, then this malloc will use huge malloc. The memory allocated by huge malloc will 8 | never be reclaimed, thus the user must manage this him/herself. 9 | Usually we found that very large allocations are never freed throughout the application's lifecycle, so maybe 10 | this is not a problem. 11 | 12 | Install: 13 | make;make install; 14 | And link the libraries generated in the ../lib directory. 15 | 16 | Following is the original README from ssmalloc. 17 | 18 | SSMalloc 19 | ======== 20 | 21 | SSMalloc is a low-latency, locality-conscious memory 22 | allocator with stable performance scalability. 23 | 24 | 25 | Compilation & Install 26 | ===================== 27 | 28 | SSMalloc requires libtool for compilation. 29 | In the SSMalloc directory, type: 30 | 31 | $ make 32 | $ make install 33 | 34 | 35 | Usage 36 | ===== 37 | Use libssmalloc.a for static linking. If you want to 38 | dynamically link SSMalloc into your program, please 39 | set the LD_PRELOAD environment variable as below.
 40 | 41 | $export LD_PRELOAD=(path)/libssmalloc.so 42 | 43 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/atomic.h: -------------------------------------------------------------------------------- 1 | #ifndef __SYNCHRO_ATOMIC_H__ 2 | #define __SYNCHRO_ATOMIC_H__ 3 | 4 | #define mb() asm volatile ("mfence" : : : "memory") 5 | #define LOCK_PREFIX "lock ; " 6 | 7 | static inline unsigned long fetch_and_store(volatile unsigned int *address, unsigned int value) 8 | { 9 | asm volatile("xchgl %k0,%1" 10 | : "=r" (value) 11 | : "m" (*address), "0" (value) 12 | : "memory"); 13 | 14 | return value; 15 | } 16 | 17 | static inline int atmc_fetch_and_add(volatile unsigned int *address, int value) 18 | { 19 | int prev = value; 20 | 21 | asm volatile( 22 | LOCK_PREFIX "xaddl %0, %1" 23 | : "+r" (value), "+m" (*address) 24 | : : "memory"); 25 | 26 | return prev + value; 27 | } 28 | 29 | static inline long long atmc_fetch_and_add64(volatile unsigned long long *address, long long value) 30 | { 31 | long long prev = value; 32 | 33 | asm volatile( 34 | LOCK_PREFIX "xaddq %0, %1" 35 | : "+r" (value), "+m" (*address) 36 | : : "memory"); 37 | 38 | return prev + value; 39 | } 40 | 41 | static inline void atmc_add32(volatile unsigned int* address, int value) 42 | { 43 | asm volatile( 44 | LOCK_PREFIX "addl %1,%0" 45 | : "=m" (*address) 46 | : "ir" (value), "m" (*address)); 47 | } 48 | 49 | static inline void atmc_add64(volatile unsigned long long* address, unsigned long long value) 50 | { 51 | asm volatile( 52 | LOCK_PREFIX "addq %1,%0" 53 | : "=m" (*address) 54 | : "ir" (value), "m" (*address)); 55 | } 56 | 57 | static inline unsigned int compare_and_swap32(volatile unsigned int *address, unsigned int old_value, unsigned int new_value) 58 | { 59 | unsigned long prev = 0; 60 | 61 | asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2" 62 | : "=a"(prev) 63 | : "r"(new_value), "m"(*address), "0"(old_value) 64 | : "memory"); 65 | 66 | return prev == old_value; 67 | } 68 | 69 | static inline unsigned int compare_and_swap32_value(volatile unsigned int *address, unsigned int old_value, unsigned int new_value) 70 | { 71 | unsigned long prev = 0; 72 | 73 | asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2" 74 | : "=a"(prev) 75 | : "r"(new_value), "m"(*address), "0"(old_value) 76 | : "memory"); 77 | 78 | return prev; 79 | } 80 | 81 | 82 | static inline unsigned int compare_and_swap64(volatile unsigned long long *address, unsigned long old_value, unsigned long new_value) 83 | { 84 | unsigned long prev = 0; 85 | 86 | asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" 87 | : "=a"(prev) 88 | : "r"(new_value), "m"(*address), "0"(old_value) 89 | : "memory"); 90 | 91 | return prev == old_value; 92 | } 93 | 94 | static inline unsigned long compare_and_swap64_value(volatile unsigned long long *address, unsigned long old_value, unsigned long new_value) 95 | { 96 | unsigned long prev = 0; 97 | 98 | asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" 99 | : "=a"(prev) 100 | : "r"(new_value), "m"(*address), "0"(old_value) 101 | : "memory"); 102 | 103 | return prev; 104 | } 105 | 106 | static inline unsigned long compare_and_swap_ptr(volatile void *address, void* old_ptr, void* new_ptr) 107 | { 108 | return compare_and_swap64((volatile unsigned long long *)address, (unsigned long)old_ptr, (unsigned long)new_ptr); 109 | } 110 | 111 | #endif 112 | 113 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/bitops.h:
-------------------------------------------------------------------------------- 1 | #ifndef __X86_64_BITOPS_H_ 2 | #define __X86_64_BITOPS_H_ 3 | 4 | /* 5 | * Copyright 1992, Linus Torvalds. 6 | */ 7 | 8 | #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) 9 | /* Technically wrong, but this avoids compilation errors on some gcc 10 | versions. */ 11 | #define ADDR "=m" (*(volatile long *) addr) 12 | #else 13 | #define ADDR "+m" (*(volatile long *) addr) 14 | #endif 15 | 16 | /** 17 | * __change_bit - Toggle a bit in memory 18 | * @nr: the bit to change 19 | * @addr: the address to start counting from 20 | * 21 | * Unlike change_bit(), this function is non-atomic and may be reordered. 22 | * If it's called on the same region of memory simultaneously, the effect 23 | * may be that only one operation succeeds. 24 | */ 25 | static __inline__ void __change_bit(int nr, volatile void * addr) 26 | { 27 | __asm__ __volatile__( 28 | "btcl %1,%0" 29 | :ADDR 30 | :"dIr" (nr)); 31 | } 32 | 33 | /* WARNING: non atomic and it can be reordered! */ 34 | static __inline__ int __test_and_change_bit(int nr, volatile void * addr) 35 | { 36 | int oldbit; 37 | 38 | __asm__ __volatile__( 39 | "btcl %2,%1\n\tsbbl %0,%0" 40 | :"=r" (oldbit),ADDR 41 | :"dIr" (nr) : "memory"); 42 | return oldbit; 43 | } 44 | 45 | static inline unsigned long __fls(unsigned long word) 46 | { 47 | asm("bsr %1,%0" 48 | : "=r" (word) 49 | : "rm" (word)); 50 | return word; 51 | } 52 | 53 | static __inline__ unsigned int __get_size_class(unsigned int word) { 54 | asm("dec %1\n" 55 | "shr $2,%1\n" 56 | "bsr %1,%0\n" 57 | "cmovz %2,%0\n" 58 | : "=r" (word) 59 | : "rm" (word), "r" (0)); 60 | return word; 61 | } 62 | 63 | #endif /* _X86_64_BITOPS_H */ 64 | 65 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/cpu.h: -------------------------------------------------------------------------------- 1 | #ifndef __CPU_H_ 2 | #define __CPU_H_ 3 | 4 | /* Machine related macros*/ 5 | #define PAGE_SIZE (4096) 6 | #define SUPER_PAGE_SIZE (4*1024*1024) 7 | #define CACHE_LINE_SIZE (64) 8 | #define DEFAULT_BLOCK_CLASS (100) 9 | #define MAX_CORE_ID (8) 10 | 11 | static inline int get_core_id(void) { 12 | return 0; 13 | int result; 14 | __asm__ __volatile__ ( 15 | "mov $1, %%eax\n" 16 | "cpuid\n" 17 | :"=b"(result) 18 | : 19 | :"eax","ecx","edx"); 20 | return (result>>24)%8; 21 | } 22 | 23 | static inline unsigned long read_tsc(void) 24 | { 25 | unsigned a, d; 26 | __asm __volatile("rdtsc":"=a"(a), "=d"(d)); 27 | return ((unsigned long)a) | (((unsigned long) d) << 32); 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/double-list.h: -------------------------------------------------------------------------------- 1 | #ifndef __DOUBLE_LIST_H_ 2 | #define __DOUBLE_LIST_H_ 3 | 4 | typedef struct double_list_elem double_list_elem_t; 5 | typedef struct double_list double_list_t; 6 | 7 | struct double_list_elem { 8 | void* __padding; 9 | struct double_list_elem* next; 10 | struct double_list_elem* prev; 11 | }; 12 | 13 | struct double_list { 14 | struct double_list_elem* head; 15 | struct double_list_elem* tail; 16 | }; 17 | 18 | 19 | /* Places new_node at the front of the list. 
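Runs in O(1). These list helpers are not thread-safe; callers must serialize access.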
*/ 20 | static void double_list_insert_front(void* new_node, double_list_t* list) 21 | { 22 | double_list_elem_t* elem_new = (double_list_elem_t*)new_node; 23 | double_list_elem_t* old_head = list->head; 24 | 25 | if (old_head == NULL) { 26 | list->tail = elem_new; 27 | } 28 | else { 29 | old_head->prev = elem_new; 30 | } 31 | 32 | elem_new->next = old_head; 33 | elem_new->prev = NULL; 34 | list->head = elem_new; 35 | } 36 | 37 | /* Removes node from the list. */ 38 | static void double_list_remove(void* node, double_list_t* list) 39 | { 40 | double_list_elem_t* elem_node = (double_list_elem_t*)node; 41 | 42 | if (elem_node->prev != NULL) { 43 | elem_node->prev->next = elem_node->next; 44 | } 45 | else { 46 | list->head = elem_node->next; 47 | } 48 | 49 | if (elem_node->next != NULL) { 50 | elem_node->next->prev = elem_node->prev; 51 | } 52 | else { 53 | list->tail = elem_node->prev; 54 | } 55 | 56 | if (list->head != NULL && list->head->next == NULL) { 57 | list->tail = list->head; 58 | } 59 | else if (list->tail != NULL && list->tail->prev == NULL) { 60 | list->head = list->tail; 61 | } 62 | } 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /ralloc/include-x86_64/queue.h: -------------------------------------------------------------------------------- 1 | #ifndef __QUEUE_H_ 2 | #define __QUEUE_H_ 3 | 4 | #include "atomic.h" 5 | #include 6 | 7 | #define CACHE_LINE_SIZE (64) 8 | #define CACHE_ALIGN __attribute__ ((aligned (CACHE_LINE_SIZE))) 9 | 10 | typedef unsigned long long ptr_t; 11 | 12 | #define ABA_ADDR_BIT (48) 13 | #define ABA_ADDR_MASK ((1L<head = 0; 33 | } 34 | 35 | static inline void mc_enqueue(queue_head_t *queue, void *element, int next_off) 36 | { 37 | unsigned long long old_head; 38 | unsigned long long new_head; 39 | 40 | while(1) { 41 | old_head = queue->head; 42 | NEXT_NODE(element, next_off) = (ptr_t) ABA_ADDR(old_head); 43 | new_head = (ptr_t)element; 44 | new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE; 45 | if (compare_and_swap64(&queue->head, old_head, new_head)) { 46 | return; 47 | } 48 | } 49 | } 50 | 51 | static inline void *mc_dequeue(queue_head_t *queue, int next_off) 52 | { 53 | unsigned long long old_head; 54 | unsigned long long new_head; 55 | void* old_addr; 56 | 57 | while(1) { 58 | old_head = queue->head; 59 | old_addr = ABA_ADDR(old_head); 60 | if(old_addr == NULL) { 61 | return NULL; 62 | } 63 | new_head = NEXT_NODE(old_addr, next_off); 64 | new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE; 65 | if (compare_and_swap64(&queue->head, old_head, new_head)) { 66 | return old_addr; 67 | } 68 | } 69 | } 70 | 71 | /* Single-Consumer LIFO Queue */ 72 | 73 | static inline void sc_queue_init(queue_head_t *queue) 74 | { 75 | queue->head = 0; 76 | } 77 | 78 | static inline void sc_enqueue(queue_head_t *queue, void *element, int next_off) 79 | { 80 | unsigned long long old_head; 81 | unsigned long long new_head; 82 | 83 | while(1) { 84 | old_head = queue->head; 85 | NEXT_NODE(element, next_off) = old_head; 86 | new_head = (ptr_t)element; 87 | if (compare_and_swap64(&queue->head, old_head, new_head)) { 88 | return; 89 | } 90 | } 91 | } 92 | 93 | static inline void *sc_dequeue(queue_head_t *queue, int next_off) 94 | { 95 | unsigned long long old_head; 96 | unsigned long long new_head; 97 | 98 | while(1) { 99 | old_head = queue->head; 100 | if(old_head == 0) { 101 | return NULL; 102 | } 103 | new_head = NEXT_NODE(old_head, next_off); 104 | if (compare_and_swap64(&queue->head, old_head, new_head)) { 105 | return 
(void*)old_head; 106 | } 107 | } 108 | } 109 | 110 | static inline void *sc_chain_dequeue(queue_head_t *queue) 111 | { 112 | unsigned long long old_head; 113 | while(1) { 114 | old_head = queue->head; 115 | if(old_head == 0) { 116 | return NULL; 117 | } 118 | if (compare_and_swap64(&queue->head, old_head, 0)) { 119 | return (void*)old_head; 120 | } 121 | } 122 | } 123 | 124 | /* Sequential LIFO Queue */ 125 | 126 | static inline void seq_queue_init(seq_queue_head_t *queue) 127 | { 128 | *queue = NULL; 129 | } 130 | 131 | static inline void seq_enqueue(seq_queue_head_t *queue, void *element) 132 | { 133 | *(void**)element = *queue; 134 | *queue = element; 135 | } 136 | 137 | static inline void *seq_dequeue(seq_queue_head_t *queue) 138 | { 139 | void* old_head = *queue; 140 | if(old_head == NULL) { 141 | return NULL; 142 | } 143 | *queue = *(void**)old_head; 144 | return old_head; 145 | } 146 | 147 | #define seq_head(queue) (queue) 148 | 149 | /* Counted Queue */ 150 | static inline void* counted_enqueue(queue_head_t *queue, void* elem) { 151 | unsigned long long old_head, new_head, prev; 152 | do { 153 | old_head = queue->head; 154 | *(ptr_t*)elem = (ptr_t)ABA_ADDR(old_head); 155 | new_head = (ptr_t)elem; 156 | new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE; 157 | 158 | } while((prev=compare_and_swap64_value ( 159 | &queue->head, 160 | old_head, 161 | new_head 162 | ))!=old_head); 163 | 164 | return (void*)prev; 165 | } 166 | 167 | static inline void* counted_chain_enqueue(queue_head_t *queue, void* elems, void* tail, int cnt) { 168 | unsigned long long old_head, new_head, prev; 169 | do { 170 | old_head = queue->head; 171 | *(ptr_t*)tail = (ptr_t)ABA_ADDR(old_head); 172 | new_head = (ptr_t)elems; 173 | new_head |= ABA_COUNT(old_head) + ABA_COUNT_ONE * cnt; 174 | 175 | } while((prev=compare_and_swap64_value ( 176 | &queue->head, 177 | old_head, 178 | new_head 179 | ))!=old_head); 180 | 181 | return (void*)prev; 182 | } 183 | 184 | static inline void* counted_chain_dequeue(queue_head_t* queue, uint32_t *count) { 185 | unsigned long long old_head; 186 | while(1) { 187 | old_head = *(ptr_t*)queue; 188 | if (old_head == 0) 189 | return(NULL); 190 | if (compare_and_swap64(&queue->head, old_head, 0)) { 191 | *count = ABA_COUNT(old_head) >> ABA_ADDR_BIT; 192 | return(ABA_ADDR(old_head)); 193 | } 194 | } 195 | } 196 | 197 | #endif 198 | -------------------------------------------------------------------------------- /ralloc/new_delete.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | using namespace std; 4 | 5 | extern "C" { 6 | void* malloc(size_t); 7 | void free(void*); 8 | } 9 | 10 | void* operator new(size_t size) throw (std::bad_alloc) 11 | { 12 | return malloc(size); 13 | } 14 | 15 | void * operator new(size_t size, const std::nothrow_t&) throw() 16 | { 17 | return malloc(size); 18 | } 19 | 20 | void operator delete(void *ptr) 21 | { 22 | free(ptr); 23 | } 24 | 25 | void* operator new[](size_t size) throw (std::bad_alloc) 26 | { 27 | return malloc(size); 28 | } 29 | 30 | void * operator new[](size_t size, const std::nothrow_t&) throw() 31 | { 32 | return malloc(size); 33 | } 34 | 35 | void operator delete[](void *ptr) 36 | { 37 | free(ptr); 38 | } 39 | 40 | -------------------------------------------------------------------------------- /ralloc/ralloc.h: -------------------------------------------------------------------------------- 1 | #ifndef RDMA_MALLOC 2 | #define RDMA_MALLOC 3 | 4 | #include 5 | #include 6 | 7 | /* This file 
provides a malloc-style interface for managing registered RDMA regions. 8 | It shall be linked against the dedicated ssmalloc library, which can be installed 9 | by following the instructions in ../ralloc/README.md. 10 | 11 | Usage: 12 | To manage allocations in an RDMA-registered region, pass the start pointer and the 13 | size to RInit() for initialization. 14 | Each thread must call RThreadLocalInit() before its first allocation. 15 | 16 | Rmalloc and Rfree work the same as standard malloc and free; the addresses they 17 | return lie in the registered memory region. 18 | 19 | Limitation: 20 | We assume there is exactly one RDMA region per machine, which is enough most of the time. 21 | */ 22 | 23 | extern "C" { 24 | /* Initialize the lib with the dedicated memory buffer. Must be called exactly once. 25 | @ret 26 | 0 - An error occurred: the memory region is not large enough. 27 | A size - The actual usable size of the memory region; this is typically less than size for alignment 28 | reasons. 29 | */ 30 | uint64_t RInit(char *buffer, uint64_t size); 31 | /* 32 | Initialize thread-local data structures. 33 | Must be called after RInit and before this thread's first call to Rmalloc or Rfree. 34 | */ 35 | void RThreadLocalInit(void); 36 | void *Rmalloc(size_t __size); 37 | void Rfree(void *__ptr); 38 | } 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /ralloc/ssmalloc.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include "ssmalloc.h" 3 | 4 | /* Global metadata */ 5 | init_state global_state = UNINITIALIZED; 6 | init_state r_global_state = UNINITIALIZED; 7 | gpool_t global_pool; 8 | gpool_t r_global_pool; 9 | char *register_rdma_buffer; 10 | uint64_t register_rdma_buffer_size = 0; 11 | 12 | pthread_key_t destructor; 13 | pthread_key_t r_destructor; 14 | pthread_once_t init_once = PTHREAD_ONCE_INIT; 15 | 16 | /* Mappings used in ssmalloc */ 17 | CACHE_ALIGN int cls2size[128]; 18 | char sizemap[256]; 19 | char sizemap2[128]; 20 | 21 | /* Mappings used in r_malloc */ 22 | CACHE_ALIGN int r_cls2size[128]; 23 | char r_sizemap[256]; 24 | char r_sizemap2[128]; 25 | 26 | 27 | /* Private metadata in ssmalloc */ 28 | THREAD_LOCAL init_state thread_state = UNINITIALIZED; 29 | THREAD_LOCAL lheap_t *local_heap = NULL; 30 | 31 | /* Private metadata for r_malloc */ 32 | THREAD_LOCAL init_state r_thread_state = UNINITIALIZED; 33 | THREAD_LOCAL lheap_t * r_local_heap = NULL; 34 | 35 | /* System init functions */ 36 | static void maps_init(void); 37 | static void thread_init(void); 38 | static void thread_exit(void *dummy); 39 | static void r_thread_exit(void *dummy); 40 | static void global_init(void); 41 | inline static void check_init(void); 42 | 43 | /* Global pool management functions */ 44 | inline static void gpool_check_size(void *target); 45 | /* Unlike gpool_check_size, which can grow without limit, RDMA resources are scarce */ 46 | inline static int rpool_check_size(void *target); 47 | static int gpool_grow(void); 48 | static void gpool_init(void); 49 | static void *gpool_make_raw_chunk(void); 50 | static void *rpool_make_raw_chunk(void); 51 | 52 | inline static chunk_t *gpool_acquire_chunk(void); 53 | inline static chunk_t *rpool_acquire_chunk(void); 54 | inline static void gpool_release_chunk(dchunk_t *dc); 55 | 56 | static lheap_t *gpool_acquire_lheap(void); 57 | static lheap_t *r_acquire_lheap(void); 58 | static void 
gpool_release_lheap(lheap_t *lh); 59 | static void rpool_release_lheap(lheap_t *lh); 60 | 61 | 62 | /* Local heap management functions */ 63 | inline static void lheap_init(lheap_t *lh); 64 | inline static void lheap_replace_foreground(lheap_t *lh, int size_cls); 65 | inline static int r_lheap_replace_foreground(lheap_t *lh,int size_cls); 66 | 67 | /* Data chunk management functions */ 68 | inline static void dchunk_change_cls(dchunk_t *dc, int size_cls); 69 | inline static void dchunk_init(dchunk_t *dc, int size_cls); 70 | inline static void dchunk_collect_garbage(dchunk_t *dc); 71 | inline static void *dchunk_alloc_obj(dchunk_t *dc); 72 | inline static dchunk_t* dchunk_extract(void *ptr); 73 | 74 | /* Object buffer management functions */ 75 | inline static void obj_buf_flush(obj_buf_t *obuf); 76 | inline static void obj_buf_flush_all(lheap_t *lh); 77 | inline static void obj_buf_put(obj_buf_t *bbuf, dchunk_t * dc, void *ptr); 78 | 79 | /* Allocator helpers */ 80 | inline static void *r_large_malloc(size_t size); 81 | inline static void *r_small_malloc(int size_cls); 82 | inline static void *large_malloc(size_t size); 83 | inline static void *small_malloc(int size_cls); 84 | inline static void large_free(void *ptr); 85 | //inline static void *r_large_free(void *ptr); 86 | inline static void local_free(lheap_t *lh, dchunk_t *dc, void *ptr); 87 | inline static void remote_free(lheap_t *lh, dchunk_t *dc, void *ptr); 88 | static void *large_memalign(size_t boundary, size_t size); 89 | 90 | /* Misc functions */ 91 | static void* page_alloc(void *pos, size_t size); 92 | static void page_free(void *pos, size_t size); 93 | static void touch_memory_range(void *start, size_t len); 94 | inline static int size2cls(size_t size); 95 | inline static int r_size2cls(size_t size); 96 | 97 | #ifdef DEBUG 98 | static void handler(int sig); 99 | #endif 100 | 101 | /* Interface */ 102 | void *malloc(size_t size); 103 | void free(void* ptr); 104 | void *realloc(void *ptr, size_t size); 105 | void *calloc(size_t nmemb, size_t size); 106 | void *memalign(size_t boundary, size_t size); 107 | int posix_memalign(void **memptr, size_t alignment, size_t size); 108 | void *valloc(size_t size); 109 | void *pvalloc(size_t size); 110 | 111 | #ifdef RETURN_MEMORY 112 | pthread_t gpool_gc_thread; 113 | 114 | static void* gpool_gc(void* arg) 115 | { 116 | pthread_detach(pthread_self()); 117 | char *ptr = NULL; 118 | 119 | /* sleeptime = 100 ms */ 120 | struct timespec sleeptime = {0, 10000000}; 121 | 122 | while(1) { 123 | nanosleep(&sleeptime, NULL); 124 | ptr = (char*) queue_fetch(&global_pool.free_dc_head[get_core_id()]); 125 | if(ptr) { 126 | void *ptr_end = PAGE_ROUNDDOWN(ptr + CHUNK_SIZE); 127 | void *ptr_start = PAGE_ROUNDUP(ptr); 128 | madvise(ptr_start, (uintptr_t)ptr_end - (uintptr_t)ptr_start, MADV_DONTNEED); 129 | queue_put(&global_pool.released_dc_head[get_core_id()], ptr); 130 | } 131 | } 132 | } 133 | #endif 134 | 135 | static void maps_init() 136 | { 137 | int size; 138 | int class; 139 | 140 | /* 8 +4 64 */ 141 | for (size = 8, class = 0; size <= 64; size += 4, class++) { 142 | cls2size[class] = size; 143 | } 144 | 145 | /* 80 +16 128 */ 146 | for (size = 64 + 16; size <= 128; size += 16, class++) { 147 | cls2size[class] = size; 148 | } 149 | 150 | /* 160 +32 256 */ 151 | for (size = 128 + 32; size <= 256; size += 32, class++) { 152 | cls2size[class] = size; 153 | } 154 | 155 | for (size = 256; size < 65536; size <<= 1) { 156 | cls2size[class++] = size + (size >> 1); 157 | cls2size[class++] = size << 1; 158 
| } 159 | 160 | int cur_class = 0; 161 | int cur_size = 0; 162 | 163 | /* init sizemap */ 164 | for (cur_size = 4; cur_size <= 1024; cur_size += 4) { 165 | if (cur_size > cls2size[cur_class]) 166 | cur_class++; 167 | sizemap[(cur_size - 1) >> 2] = cur_class; 168 | } 169 | 170 | /* init sizemap2 */ 171 | for (cur_size = 1024; cur_size <= 65536; cur_size += 512) { 172 | if (cur_size > cls2size[cur_class]) 173 | cur_class++; 174 | sizemap2[(cur_size - 1) >> 9] = cur_class; 175 | } 176 | } 177 | 178 | static void thread_init() 179 | { 180 | /* Register the destructor */ 181 | pthread_setspecific(destructor, ACTIVE); 182 | /* Initialize thread pool */ 183 | local_heap = gpool_acquire_lheap(); 184 | thread_state = READY; 185 | } 186 | 187 | static void r_thread_exit (void *dummy) { 188 | rpool_release_lheap(r_local_heap); 189 | } 190 | static void thread_exit(void *dummy) 191 | { 192 | gpool_release_lheap(local_heap); 193 | } 194 | 195 | 196 | uint64_t RInit(char *buffer,uint64_t size) { 197 | 198 | 199 | pthread_key_create(&r_destructor, r_thread_exit); 200 | 201 | /* Rounding to chunk size */ 202 | uint64_t add_off = CHUNK_SIZE - ((uint64_t )buffer) % CHUNK_SIZE; 203 | if(add_off >= size) 204 | return 0; 205 | size -= add_off; 206 | if(size < 16 * CHUNK_SIZE) { 207 | /* We shall ensure the register rdma area is large enough! */ 208 | return 0; 209 | } 210 | 211 | register_rdma_buffer = buffer + add_off; 212 | register_rdma_buffer_size = size; 213 | 214 | r_global_pool.pool_start = register_rdma_buffer; 215 | r_global_pool.pool_end = register_rdma_buffer + register_rdma_buffer_size; 216 | r_global_pool.free_start = register_rdma_buffer; 217 | 218 | pthread_mutex_init(&r_global_pool.lock, NULL); 219 | 220 | { 221 | /* maps init */ 222 | 223 | int size; 224 | int class; 225 | 226 | /* 8 +4 64 */ 227 | for (size = 8, class = 0; size <= 64; size += 4, class++) { 228 | r_cls2size[class] = size; 229 | } 230 | 231 | /* 80 +16 128 */ 232 | for (size = 64 + 16; size <= 128; size += 16, class++) { 233 | r_cls2size[class] = size; 234 | } 235 | 236 | /* 160 +32 256 */ 237 | for (size = 128 + 32; size <= 256; size += 32, class++) { 238 | r_cls2size[class] = size; 239 | } 240 | 241 | for (size = 256; size < 65536; size <<= 1) { 242 | r_cls2size[class++] = size + (size >> 1); 243 | r_cls2size[class++] = size << 1; 244 | } 245 | 246 | int cur_class = 0; 247 | int cur_size = 0; 248 | 249 | /* init sizemap */ 250 | for (cur_size = 4; cur_size <= 1024; cur_size += 4) { 251 | if (cur_size > r_cls2size[cur_class]) 252 | cur_class++; 253 | r_sizemap[(cur_size - 1) >> 2] = cur_class; 254 | } 255 | 256 | /* init sizemap2 */ 257 | for (cur_size = 1024; cur_size <= 65536; cur_size += 512) { 258 | if (cur_size > r_cls2size[cur_class]) 259 | cur_class++; 260 | r_sizemap2[(cur_size - 1) >> 9] = cur_class; 261 | } 262 | } 263 | r_global_state = READY; 264 | return size; 265 | } 266 | 267 | void RThreadLocalInit () { 268 | 269 | if(unlikely(r_thread_state != READY)) { 270 | pthread_setspecific(r_destructor, ACTIVE); 271 | // r_local_heap = gp 272 | r_local_heap = r_acquire_lheap(); 273 | r_thread_state = READY; 274 | } 275 | } 276 | 277 | static void global_init() 278 | { 279 | #ifdef DEBUG 280 | /* Register the signal handler for backtrace*/ 281 | signal(SIGSEGV, handler); 282 | #endif 283 | pthread_key_create(&destructor, thread_exit); 284 | /* Initialize global data */ 285 | gpool_init(); 286 | maps_init(); 287 | 288 | global_state = READY; 289 | #ifdef RETURN_MEMORY 290 | /* Create the gc thread */ 291 | 
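/* Spawn the background reclaimer defined above under RETURN_MEMORY: gpool_gc()
   wakes every 10 ms ({0, 10000000} ns), pops one freed chunk from the current
   core's free list, returns its pages to the OS with madvise(MADV_DONTNEED),
   and parks the chunk on released_dc_head so it can be reused later without a
   fresh mmap. */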
pthread_create(&gpool_gc_thread, NULL, gpool_gc, NULL); 292 | #endif 293 | } 294 | 295 | inline static void check_init() 296 | { 297 | if (unlikely(thread_state != READY)) { 298 | if (global_state != READY) { 299 | pthread_once(&init_once, global_init); 300 | } 301 | thread_init(); 302 | } 303 | } 304 | 305 | inline static int rpool_check_size(void *target) { 306 | if(r_global_pool.pool_end <= (char *)target) { 307 | return 0 ;//false 308 | } 309 | return 1; //true 310 | } 311 | 312 | inline static void gpool_check_size(void *target) 313 | { 314 | if (global_pool.pool_end <= (char *)target) { 315 | /* Global Pool Full */ 316 | pthread_mutex_lock(&global_pool.lock); 317 | while (global_pool.pool_end <= (char *)target) { 318 | gpool_grow(); 319 | } 320 | pthread_mutex_unlock(&global_pool.lock); 321 | } 322 | } 323 | 324 | static int gpool_grow() 325 | { 326 | /* Enlarge the raw memory pool */ 327 | static int last_alloc = 8; 328 | int alloc_size = ALLOC_UNIT * last_alloc; 329 | if (last_alloc < 32) { 330 | last_alloc *= 2; 331 | } 332 | 333 | void *mem = page_alloc((void *)global_pool.pool_end, alloc_size); 334 | if (mem == MAP_FAILED) { 335 | exit(-1); 336 | return -1; 337 | } 338 | 339 | /* Increase the global pool size */ 340 | global_pool.pool_end += alloc_size; 341 | return 0; 342 | } 343 | 344 | 345 | 346 | /* Initialize the global memory pool */ 347 | static void gpool_init() 348 | { 349 | global_pool.pool_start = RAW_POOL_START; 350 | global_pool.pool_end = RAW_POOL_START; 351 | global_pool.free_start = RAW_POOL_START; 352 | //queue_init(&global_pool.free_dc_head); 353 | pthread_mutex_init(&global_pool.lock, NULL); 354 | gpool_grow(); 355 | } 356 | 357 | 358 | inline static chunk_t *gpool_acquire_chunk() 359 | { 360 | void *ptr = NULL; 361 | 362 | /* Try to alloc a freed chunk from the free list */ 363 | ptr = queue_fetch(&global_pool.free_dc_head[get_core_id()]); 364 | if (ptr) { 365 | return (chunk_t *) ptr; 366 | } 367 | 368 | #ifdef RETURN_MEMORY 369 | ptr = queue_fetch(&global_pool.released_dc_head[get_core_id()]); 370 | if (ptr) { 371 | // XXX: Fix me 372 | ((chunk_t *) ptr)->numa_node = get_core_id(); 373 | touch_memory_range(ptr, CHUNK_SIZE); 374 | return (chunk_t *) ptr; 375 | } 376 | #endif 377 | 378 | /* Or just alloc a new chunk */ 379 | ptr = gpool_make_raw_chunk(); 380 | gpool_check_size(ptr); 381 | ptr -= CHUNK_SIZE; 382 | ((chunk_t *) ptr)->numa_node = get_core_id(); 383 | touch_memory_range(ptr, CHUNK_SIZE); 384 | return (chunk_t *) ptr; 385 | } 386 | 387 | 388 | inline static chunk_t *rpool_acquire_chunk() 389 | { 390 | void *ptr = NULL; 391 | 392 | /* Try to alloc a freed chunk from the free list */ 393 | ptr = queue_fetch(&r_global_pool.free_dc_head[get_core_id()]); 394 | if (ptr) { 395 | return (chunk_t *) ptr; 396 | } 397 | 398 | #ifdef RETURN_MEMORY 399 | ptr = queue_fetch(&r_global_pool.released_dc_head[get_core_id()]); 400 | if (ptr) { 401 | // XXX: Fix me 402 | ((chunk_t *) ptr)->numa_node = get_core_id(); 403 | touch_memory_range(ptr, CHUNK_SIZE); 404 | return (chunk_t *) ptr; 405 | } 406 | #endif 407 | 408 | /* Or just alloc a new chunk */ 409 | ptr = rpool_make_raw_chunk(); 410 | if((char *)ptr > (char *)r_global_pool.pool_end) { 411 | return NULL; 412 | } 413 | 414 | // rpool_check_size(ptr); 415 | ptr -= CHUNK_SIZE; 416 | ((chunk_t *) ptr)->numa_node = get_core_id(); 417 | touch_memory_range(ptr, CHUNK_SIZE); 418 | return (chunk_t *) ptr; 419 | } 420 | 421 | 422 | static void *rpool_make_raw_chunk() { 423 | void *ret = (void 
*)(atmc_fetch_and_add64((unsigned long long *) 424 | &r_global_pool.free_start, 425 | CHUNK_SIZE)) ; 426 | return ret; 427 | } 428 | 429 | static void *gpool_make_raw_chunk() 430 | { 431 | /* Atomic increse the global pool size */ 432 | void *ret = (void *)(atmc_fetch_and_add64((unsigned long long *) 433 | &global_pool.free_start, 434 | CHUNK_SIZE)); 435 | return ret; 436 | } 437 | 438 | 439 | inline static void gpool_release_chunk(dchunk_t *dc) 440 | { 441 | queue_put(&global_pool.free_dc_head[dc->numa_node], dc); 442 | } 443 | 444 | inline static void rpool_release_chunk(dchunk_t *dc) { 445 | queue_put(&r_global_pool.free_dc_head[dc->numa_node], dc); 446 | } 447 | 448 | 449 | static lheap_t *r_acquire_lheap() { 450 | lheap_t *lh; 451 | lh = queue_fetch(&(r_global_pool.free_lh_head[get_core_id()])); 452 | /* Alloc a new one */ 453 | if (!lh) { 454 | lh = (lheap_t *) rpool_acquire_chunk(); 455 | if(lh == NULL) { 456 | fprintf(stderr,"panic, cannot acquire local heap\n"); 457 | return NULL; 458 | } 459 | lheap_init(lh); 460 | } 461 | return lh; 462 | } 463 | 464 | 465 | 466 | static lheap_t *gpool_acquire_lheap() 467 | { 468 | lheap_t *lh; 469 | lh = queue_fetch(&(global_pool.free_lh_head[get_core_id()])); 470 | /* Alloc a new one */ 471 | if (!lh) { 472 | lh = (lheap_t *) gpool_acquire_chunk(); 473 | lheap_init(lh); 474 | } 475 | return lh; 476 | } 477 | 478 | static void gpool_release_lheap(lheap_t *lh) 479 | { 480 | queue_put(&global_pool.free_lh_head[local_heap->numa_node], lh); 481 | } 482 | 483 | static void rpool_release_lheap(lheap_t *lh) 484 | { 485 | queue_put(&r_global_pool.free_lh_head[local_heap->numa_node], lh); 486 | } 487 | 488 | 489 | inline static void lheap_init(lheap_t * lh) 490 | { 491 | memset(&lh->free_head, 0, sizeof(lheap_t)); 492 | 493 | int size_cls; 494 | lh->dummy_chunk.size_cls = DUMMY_CLASS; 495 | lh->dummy_chunk.free_blk_cnt = 1; 496 | 497 | for (size_cls = 0; size_cls < DEFAULT_BLOCK_CLASS; size_cls++) { 498 | /* Install the dummy chunk */ 499 | lh->foreground[size_cls] = &lh->dummy_chunk; 500 | } 501 | } 502 | 503 | inline static int r_lheap_replace_foreground 504 | (lheap_t * lh, int size_cls) { 505 | 506 | dchunk_t *dc; 507 | 508 | /* Try to acquire the block from background list */ 509 | dc = (dchunk_t *) lh->background[size_cls].head; 510 | if (dc != NULL) { 511 | double_list_remove(dc, &lh->background[size_cls]); 512 | goto finish; 513 | } 514 | 515 | /* Try to acquire a block in the remote freed list */ 516 | dc = fast_queue_fetch(&lh->need_gc[size_cls]); 517 | if (dc != NULL) { 518 | dchunk_collect_garbage(dc); 519 | goto finish; 520 | } 521 | 522 | /* Try to acquire the chunk from local pool */ 523 | dc = (dchunk_t *) seq_queue_fetch(&lh->free_head); 524 | if (dc != NULL) { 525 | // fprintf(stdout,"get free head\n"); 526 | lh->free_cnt--; 527 | dchunk_change_cls(dc, size_cls); 528 | goto finish; 529 | } 530 | 531 | /* Acquire the chunk from global pool */ 532 | 533 | dc = (dchunk_t *) rpool_acquire_chunk(); 534 | // fprintf(stdout,"acquire raw pool\n"); 535 | if(unlikely(dc == NULL)) { 536 | return 0; // false 537 | } 538 | // fprintf(stdout,"owner %p\n",lh); 539 | dc->owner = lh; 540 | fast_queue_init((FastQueue *) & (dc->remote_free_head)); 541 | dchunk_init(dc, size_cls); 542 | 543 | finish: 544 | /* Set the foreground chunk */ 545 | lh->foreground[size_cls] = dc; 546 | dc->state = FOREGROUND; 547 | return 1; // true 548 | } 549 | 550 | 551 | inline static void lheap_replace_foreground 552 | (lheap_t * lh, int size_cls) { 553 | dchunk_t *dc; 554 
| 555 | /* Try to acquire the block from background list */ 556 | dc = (dchunk_t *) lh->background[size_cls].head; 557 | if (dc != NULL) { 558 | double_list_remove(dc, &lh->background[size_cls]); 559 | goto finish; 560 | } 561 | 562 | /* Try to acquire a block in the remote freed list */ 563 | dc = fast_queue_fetch(&lh->need_gc[size_cls]); 564 | if (dc != NULL) { 565 | dchunk_collect_garbage(dc); 566 | goto finish; 567 | } 568 | 569 | /* Try to acquire the chunk from local pool */ 570 | dc = (dchunk_t *) seq_queue_fetch(&lh->free_head); 571 | if (dc != NULL) { 572 | lh->free_cnt--; 573 | dchunk_change_cls(dc, size_cls); 574 | goto finish; 575 | } 576 | 577 | /* Acquire the chunk from global pool */ 578 | 579 | dc = (dchunk_t *) gpool_acquire_chunk(); 580 | dc->owner = lh; 581 | fast_queue_init((FastQueue *) & (dc->remote_free_head)); 582 | dchunk_init(dc, size_cls); 583 | 584 | finish: 585 | /* Set the foreground chunk */ 586 | lh->foreground[size_cls] = dc; 587 | dc->state = FOREGROUND; 588 | } 589 | 590 | inline static void dchunk_change_cls(dchunk_t * dc, int size_cls) 591 | { 592 | int size = cls2size[size_cls]; 593 | int data_offset = DCH; 594 | dc->blk_cnt = (CHUNK_SIZE - data_offset) / size; 595 | dc->free_blk_cnt = dc->blk_cnt; 596 | dc->block_size = size; 597 | dc->free_mem = (char *)dc + data_offset; 598 | dc->size_cls = size_cls; 599 | seq_queue_init(&dc->free_head); 600 | } 601 | 602 | inline static void dchunk_init(dchunk_t * dc, int size_cls) 603 | { 604 | dc->active_link.next = NULL; 605 | dc->active_link.prev = NULL; 606 | dchunk_change_cls(dc, size_cls); 607 | } 608 | 609 | inline static void dchunk_collect_garbage(dchunk_t * dc) 610 | { 611 | seq_head(dc->free_head) = 612 | counted_chain_dequeue(&dc->remote_free_head, &dc->free_blk_cnt); 613 | } 614 | 615 | inline static void *dchunk_alloc_obj(dchunk_t * dc) 616 | { 617 | void *ret; 618 | 619 | /* Dirty implementation of dequeue, avoid one branch */ 620 | ret = seq_head(dc->free_head); 621 | 622 | if (unlikely(!ret)) { 623 | ret = dc->free_mem; 624 | dc->free_mem += dc->block_size; 625 | } else { 626 | seq_head(dc->free_head) = *(void**)ret; 627 | } 628 | 629 | #if 0 630 | /* A clearer implementation with one more branch*/ 631 | ret = seq_lifo_dequeue(&dc->free_head); 632 | if (unlikely(!ret)) { 633 | ret = dc->free_mem; 634 | dc->free_mem += dc->block_size; 635 | } 636 | #endif 637 | 638 | return ret; 639 | } 640 | 641 | inline static dchunk_t *dchunk_extract(void *ptr) 642 | { 643 | return (dchunk_t *) ((uintptr_t)ptr - ((uintptr_t)ptr % CHUNK_SIZE)); 644 | } 645 | 646 | inline static void obj_buf_flush(obj_buf_t * bbuf) 647 | { 648 | void *prev; 649 | 650 | dchunk_t *dc = bbuf->dc; 651 | lheap_t *lh = dc->owner; 652 | 653 | prev = counted_chain_enqueue(&(dc->remote_free_head), 654 | seq_head(bbuf->free_head), bbuf->first, bbuf->count); 655 | bbuf->count = 0; 656 | bbuf->dc = NULL; 657 | bbuf->first = NULL; 658 | seq_head(bbuf->free_head) = NULL; 659 | 660 | /* If I am the first thread done remote free in this memory chunk*/ 661 | if ((unsigned long long)prev == 0L) { 662 | fast_queue_put(&(lh->need_gc[dc->size_cls]), dc); 663 | } 664 | return; 665 | } 666 | 667 | inline static void obj_buf_flush_all(lheap_t *lh) { 668 | int i; 669 | for (i = 0; i < BLOCK_BUF_CNT; i++) { 670 | obj_buf_t *buf = &lh->block_bufs[i]; 671 | if (buf->count == 0) 672 | continue; 673 | obj_buf_flush(buf); 674 | buf->dc = NULL; 675 | } 676 | } 677 | 678 | inline static void obj_buf_put(obj_buf_t *bbuf, dchunk_t * dc, void *ptr) { 679 | if 
(unlikely(bbuf->dc != dc)) { 680 | if (bbuf->dc != NULL) { 681 | obj_buf_flush(bbuf); 682 | } 683 | bbuf->dc = dc; 684 | bbuf->first = ptr; 685 | bbuf->count = 0; 686 | seq_head(bbuf->free_head) = NULL; 687 | } 688 | 689 | seq_queue_put(&bbuf->free_head, ptr); 690 | bbuf->count++; 691 | } 692 | 693 | inline static void *r_large_malloc(size_t size) { 694 | 695 | /* round up the size */ 696 | size_t real_size = size + CHUNK_SIZE - size % CHUNK_SIZE; 697 | void *ret = (void *)(atmc_fetch_and_add64((unsigned long long *) 698 | &r_global_pool.free_start, 699 | real_size)) ; 700 | if(ret > (void *)(r_global_pool.pool_end)) 701 | return NULL; 702 | //void *mem_start = (char *)ret + CHUNK_SIZE - CACHE_LINE_SIZE; 703 | large_header_t *header = (large_header_t *)dchunk_extract(ret); 704 | 705 | header->alloc_size = real_size; 706 | header->mem = ret; 707 | header->owner = LARGE_OWNER; 708 | 709 | return ret; 710 | } 711 | 712 | inline static void *large_malloc(size_t size) 713 | { 714 | size_t alloc_size = PAGE_ROUNDUP(size + CHUNK_SIZE); 715 | void *mem = page_alloc(NULL, alloc_size); 716 | void *mem_start = (char*)mem + CHUNK_SIZE - CACHE_LINE_SIZE; 717 | large_header_t *header = (large_header_t *)dchunk_extract(mem_start); 718 | 719 | /* If space is enough for the header of a large block */ 720 | intptr_t distance = (intptr_t)mem_start - (intptr_t)header; 721 | if (distance >= sizeof(large_header_t)) { 722 | header->alloc_size = alloc_size; 723 | header->mem = mem; 724 | header->owner = LARGE_OWNER; 725 | return mem_start; 726 | } 727 | 728 | /* If not, Retry Allocation */ 729 | void *ret = large_malloc(size); 730 | page_free(mem, alloc_size); 731 | return ret; 732 | } 733 | 734 | inline static void *r_small_malloc(int size_cls) { 735 | 736 | lheap_t *lh = r_local_heap; 737 | dchunk_t *dc; 738 | void *ret; 739 | retry: 740 | dc = lh->foreground[size_cls]; 741 | ret = dchunk_alloc_obj(dc); 742 | // fprintf(stdout,"alloc owner %p\n",dc->owner); 743 | /* Check if the datachunk is full */ 744 | if (unlikely(--dc->free_blk_cnt == 0)) { 745 | dc->state = FULL; 746 | /* There is not enough memory in RDMA region */ 747 | if(unlikely(r_lheap_replace_foreground(lh, size_cls) == 0)) 748 | return NULL; 749 | if (unlikely(dc->size_cls == DUMMY_CLASS)) { 750 | /* A dummy chunk */ 751 | dc->free_blk_cnt = 1; 752 | goto retry; 753 | } 754 | } 755 | 756 | return ret; 757 | } 758 | 759 | inline static void *small_malloc(int size_cls) 760 | { 761 | lheap_t *lh = local_heap; 762 | dchunk_t *dc; 763 | void *ret; 764 | retry: 765 | dc = lh->foreground[size_cls]; 766 | ret = dchunk_alloc_obj(dc); 767 | 768 | /* Check if the datachunk is full */ 769 | if (unlikely(--dc->free_blk_cnt == 0)) { 770 | dc->state = FULL; 771 | lheap_replace_foreground(lh, size_cls); 772 | if (unlikely(dc->size_cls == DUMMY_CLASS)) { 773 | /* A dummy chunk */ 774 | dc->free_blk_cnt = 1; 775 | goto retry; 776 | } 777 | } 778 | 779 | return ret; 780 | } 781 | 782 | inline static void large_free(void *ptr) 783 | { 784 | large_header_t *header = (large_header_t*)dchunk_extract(ptr); 785 | page_free(header->mem, header->alloc_size); 786 | } 787 | 788 | inline static void local_free(lheap_t * lh, dchunk_t * dc, void *ptr) 789 | { 790 | unsigned int free_blk_cnt = ++dc->free_blk_cnt; 791 | seq_queue_put(&dc->free_head, ptr); 792 | 793 | switch (dc->state) { 794 | case FULL: 795 | double_list_insert_front(dc, &lh->background[dc->size_cls]); 796 | dc->state = BACKGROUND; 797 | break; 798 | case BACKGROUND: 799 | if (unlikely(free_blk_cnt == 
dc->blk_cnt)) { 800 | int free_cnt = lh->free_cnt; 801 | double_list_remove(dc, &lh->background[dc->size_cls]); 802 | 803 | if (free_cnt >= MAX_FREE_CHUNK) { 804 | gpool_release_chunk(dc); 805 | } else { 806 | seq_queue_put(&lh->free_head, dc); 807 | lh->free_cnt = free_cnt + 1; 808 | } 809 | } 810 | break; 811 | case FOREGROUND: 812 | /* Tada.. */ 813 | break; 814 | } 815 | } 816 | 817 | THREAD_LOCAL int buf_cnt; 818 | inline static void remote_free(lheap_t * lh, dchunk_t * dc, void *ptr) 819 | { 820 | /* Put the object in a local buffer rather than return it to owner */ 821 | int tag = ((unsigned long long)dc / CHUNK_SIZE) % BLOCK_BUF_CNT; 822 | obj_buf_t *bbuf = &lh->block_bufs[tag]; 823 | obj_buf_put(bbuf, dc, ptr); 824 | 825 | /* Periodically flush buffered remote objects */ 826 | if ((buf_cnt++ & 0xFFFF) == 0) { 827 | obj_buf_flush_all(lh); 828 | } 829 | } 830 | 831 | static void touch_memory_range(void *addr, size_t len) 832 | { 833 | char *ptr = (char *)addr; 834 | char *end = ptr + len; 835 | 836 | for (; ptr < end; ptr += PAGE_SIZE) { 837 | *ptr = 0; 838 | } 839 | } 840 | 841 | static void *large_memalign(size_t boundary, size_t size) { 842 | /* Alloc a large enough memory block */ 843 | size_t padding = boundary + CHUNK_SIZE; 844 | size_t alloc_size = PAGE_ROUNDUP(size + padding); 845 | void *mem = page_alloc(NULL, alloc_size); 846 | 847 | /* Align up the address to boundary */ 848 | void *mem_start = 849 | (void*)((uintptr_t)((char*)mem + padding) & ~(boundary - 1)); 850 | 851 | /* Extract space for an header */ 852 | large_header_t *header = 853 | (large_header_t *)dchunk_extract(mem_start); 854 | 855 | /* If space is enough for the header of a large block */ 856 | intptr_t distance = (intptr_t)mem_start - (intptr_t)header; 857 | if (distance >= sizeof(large_header_t)) { 858 | header->alloc_size = alloc_size; 859 | header->mem = mem; 860 | header->owner = LARGE_OWNER; 861 | return mem_start; 862 | } 863 | 864 | /* If not, retry allocation */ 865 | void *ret = NULL; 866 | 867 | /* Avoid infinite loop if application call memalign(CHUNK_SIZE,size), 868 | * althrough it is actually illegal 869 | */ 870 | if (boundary % CHUNK_SIZE != 0) { 871 | ret = large_memalign(boundary, size); 872 | } 873 | page_free(mem, alloc_size); 874 | return ret; 875 | } 876 | 877 | #ifdef DEBUG 878 | /* Signal handler for debugging use */ 879 | static void handler(int sig) 880 | { 881 | void *array[10]; 882 | size_t size; 883 | 884 | /* get void*'s for all entries on the stack */ 885 | size = backtrace(array, 10); 886 | 887 | /* print out all the frames to stderr */ 888 | fprintf(stderr, "Error: signal %d:\n", sig); 889 | backtrace_symbols_fd(array, size, 2); 890 | exit(1); 891 | } 892 | #endif 893 | 894 | static void *page_alloc(void *pos, size_t size) 895 | { 896 | return mmap(pos, 897 | size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); 898 | } 899 | 900 | static void page_free(void *pos, size_t size) 901 | { 902 | munmap(pos, size); 903 | } 904 | 905 | 906 | inline static int size2cls(size_t size) 907 | { 908 | int ret; 909 | if (likely(size <= 1024)) { 910 | ret = sizemap[(size - 1) >> 2]; 911 | } else if (size <= 65536) { 912 | ret = sizemap2[(size - 1) >> 9]; 913 | } else { 914 | ret = LARGE_CLASS; 915 | } 916 | return ret; 917 | } 918 | 919 | inline static int r_size2cls(size_t size) { 920 | int ret; 921 | if (likely(size <= 1024)) { 922 | ret = r_sizemap[(size - 1) >> 2]; 923 | } else if (size <= 65536) { 924 | ret = r_sizemap2[(size - 1) >> 9]; 925 | } else { 926 | ret = 
LARGE_CLASS; 927 | } 928 | return ret; 929 | } 930 | 931 | void *malloc(size_t size) 932 | { 933 | void *ret = NULL; 934 | 935 | /* Initialize the allocator */ 936 | check_init(); 937 | 938 | /* Deal with zero-size allocation */ 939 | size += (size == 0); 940 | 941 | #if 0 942 | /* The expression above is equivalent to the code below */ 943 | if (unlikely(size == 0)) { 944 | size = 1; 945 | } 946 | #endif 947 | 948 | int size_cls = size2cls(size); 949 | if (likely(size_cls < DEFAULT_BLOCK_CLASS)) { 950 | ret = small_malloc(size_cls); 951 | // return NULL; 952 | } else { 953 | ret = large_malloc(size); 954 | } 955 | if(unlikely(ret == NULL)) 956 | assert(0); 957 | return ret; 958 | } 959 | 960 | void *Rmalloc(size_t size) { 961 | void *ret = NULL; 962 | 963 | /* Deal with zero-size allocation */ 964 | size += (size == 0); 965 | 966 | int size_cls = r_size2cls(size); 967 | if (likely(size_cls < DEFAULT_BLOCK_CLASS)) { 968 | ret = r_small_malloc(size_cls); 969 | } else { 970 | ret = r_large_malloc(size); 971 | } 972 | return ret; 973 | } 974 | 975 | 976 | void free(void *ptr) 977 | { 978 | if(ptr == NULL) { 979 | return; 980 | } 981 | 982 | dchunk_t *dc = dchunk_extract(ptr); 983 | lheap_t *lh = local_heap; 984 | lheap_t *target_lh = dc->owner; 985 | 986 | if (likely(target_lh == lh)) { 987 | local_free(lh, dc, ptr); 988 | } else if(likely(target_lh != LARGE_OWNER)){ 989 | check_init(); 990 | lh = local_heap; 991 | remote_free(lh, dc, ptr); 992 | } else { 993 | large_free(ptr); 994 | } 995 | } 996 | 997 | void Rfree(void *ptr) { 998 | 999 | if(ptr == NULL) { 1000 | return; 1001 | } 1002 | 1003 | dchunk_t *dc = dchunk_extract(ptr); 1004 | lheap_t *lh = r_local_heap; 1005 | lheap_t *target_lh = dc->owner; 1006 | // fprintf(stdout,"check owner %p\n",target_lh); 1007 | 1008 | if (likely(target_lh == lh)) { 1009 | local_free(lh, dc, ptr); 1010 | } else if(likely(target_lh != LARGE_OWNER)) { 1011 | // check_init(); 1012 | lh = r_local_heap; 1013 | remote_free(lh, dc, ptr); 1014 | } else { 1015 | // large_free(ptr); 1016 | } 1017 | } 1018 | 1019 | 1020 | void *realloc(void* ptr, size_t size) 1021 | { 1022 | /* Handle special cases */ 1023 | if (ptr == NULL) { 1024 | void *ret = malloc(size); 1025 | return ret; 1026 | } 1027 | 1028 | if (size == 0) { 1029 | free(ptr); 1030 | } 1031 | 1032 | dchunk_t *dc = dchunk_extract(ptr); 1033 | if (dc->owner != LARGE_OWNER) { 1034 | int old_size = cls2size[dc->size_cls]; 1035 | 1036 | /* Not exceed the current size, return */ 1037 | if (size <= old_size) { 1038 | return ptr; 1039 | } 1040 | 1041 | /* Alloc a new block */ 1042 | void *new_ptr = malloc(size); 1043 | memcpy(new_ptr, ptr, old_size); 1044 | free(ptr); 1045 | return new_ptr; 1046 | } else { 1047 | large_header_t *header = (large_header_t *)dc; 1048 | size_t alloc_size = header->alloc_size; 1049 | void* mem = header->mem; 1050 | size_t offset = (uintptr_t)ptr - (uintptr_t)mem; 1051 | size_t old_size = alloc_size - offset; 1052 | 1053 | /* Not exceed the current size, return */ 1054 | if(size <= old_size) { 1055 | return ptr; 1056 | } 1057 | 1058 | /* Try to do mremap */ 1059 | int new_size = PAGE_ROUNDUP(size + CHUNK_SIZE); 1060 | mem = mremap(mem, alloc_size, new_size, MREMAP_MAYMOVE); 1061 | void* mem_start = (void*)((uintptr_t)mem + offset); 1062 | header = (large_header_t*)dchunk_extract(mem_start); 1063 | 1064 | intptr_t distance = (intptr_t)mem_start - (intptr_t)header; 1065 | if (distance >= sizeof(large_header_t)) { 1066 | header->alloc_size = new_size; 1067 | header->mem = mem; 1068 | 
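    /* Re-stamp the header after mremap: owner == LARGE_OWNER (the 0xDEAD
       sentinel) is how free() and realloc() recognize large allocations. */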
header->owner = LARGE_OWNER; 1069 | return mem_start; 1070 | } 1071 | 1072 | void* new_ptr = large_malloc(size); 1073 | memcpy(new_ptr, mem_start, old_size); 1074 | free(mem); 1075 | return new_ptr; 1076 | } 1077 | } 1078 | 1079 | void * __attribute__((optimize("O0"))) calloc(size_t nmemb, size_t size) 1080 | { 1081 | void *ptr; 1082 | size_t m_size = nmemb * size; 1083 | // ptr = malloc(nmemb * size); 1084 | ptr = malloc(m_size); 1085 | if (!ptr) { 1086 | // assert(0); 1087 | return NULL; 1088 | } 1089 | return memset(ptr, 0, nmemb * size); 1090 | } 1091 | 1092 | void *memalign(size_t boundary, size_t size) { 1093 | /* Deal with zero-size allocation */ 1094 | size += (size == 0); 1095 | if(boundary <= 256 && size <= 65536) { 1096 | /* In this case, we handle it as small allocations */ 1097 | int boundary_cls = size2cls(boundary); 1098 | int size_cls = size2cls(size); 1099 | int alloc_cls = max(boundary_cls, size_cls); 1100 | return small_malloc(alloc_cls); 1101 | } else { 1102 | /* Handle it as a special large allocation */ 1103 | return large_memalign(boundary, size); 1104 | } 1105 | } 1106 | 1107 | int posix_memalign(void **memptr, size_t alignment, size_t size) 1108 | { 1109 | *memptr = memalign(alignment, size); 1110 | if (*memptr) { 1111 | return 0; 1112 | } else { 1113 | /* We have to "personalize" the return value according to the error */ 1114 | return -1; 1115 | } 1116 | } 1117 | 1118 | void *valloc(size_t size) 1119 | { 1120 | return memalign(PAGE_SIZE, size); 1121 | } 1122 | 1123 | void *pvalloc(size_t size) 1124 | { 1125 | fprintf(stderr, "pvalloc() called. Not implemented! Exiting.\n"); 1126 | exit(1); 1127 | } 1128 | -------------------------------------------------------------------------------- /ralloc/ssmalloc.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include "atomic.h" 17 | #include "bitops.h" 18 | #include "queue.h" 19 | #include "double-list.h" 20 | #include "cpu.h" 21 | 22 | /* Configurations */ 23 | #define CHUNK_DATA_SIZE (16*PAGE_SIZE) 24 | #define ALLOC_UNIT (4*1024*1024) 25 | #define MAX_FREE_SIZE (4*1024*1024) 26 | #define RAW_POOL_START ((void*)((0x600000000000/CHUNK_SIZE+1)*CHUNK_SIZE)) 27 | 28 | #define BLOCK_BUF_CNT (16) 29 | 30 | // #define RETURN_MEMORY 31 | // #define DEBUG 32 | 33 | /* Other */ 34 | #define CHUNK_SIZE (CHUNK_DATA_SIZE+sizeof(dchunk_t)) 35 | #define CHUNK_MASK (~(CHUNK_SIZE-1)) 36 | #define LARGE_CLASS (100) 37 | #define DUMMY_CLASS (101) 38 | #define DCH (sizeof(dchunk_t)) 39 | #define MAX_FREE_CHUNK (MAX_FREE_SIZE/CHUNK_SIZE) 40 | #define LARGE_OWNER ((void*)0xDEAD) 41 | #define ACTIVE ((void*)1) 42 | 43 | /* Utility Macros */ 44 | #define ROUNDUP(x,n) ((x+n-1)&(~(n-1))) 45 | #define ROUNDDOWN(x,n) (((x-n)&(~(n-1)))+1) 46 | #define PAGE_ROUNDUP(x) (ROUNDUP((uintptr_t)x,PAGE_SIZE)) 47 | #define PAGE_ROUNDDOWN(x) (ROUNDDOWN((uintptr_t)x,PAGE_SIZE)) 48 | #define CACHE_ALIGN __attribute__ ((aligned (CACHE_LINE_SIZE))) 49 | #define THREAD_LOCAL __attribute__ ((tls_model ("initial-exec"))) __thread 50 | #define likely(x) __builtin_expect(!!(x),1) 51 | #define unlikely(x) __builtin_expect(!!(x),0) 52 | 53 | /* Multi consumer queue */ 54 | #define queue_init(head)\ 55 | mc_queue_init(head) 56 | #define queue_put(head,elem)\ 57 | mc_enqueue(head,elem,0) 58 | #define queue_fetch(head)\ 59 | mc_dequeue(head,0) 60 | 
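/* Illustrative sketch of the multi-consumer wrappers above: queue_put and
   queue_fetch hard-code next_off = 0, so the first word of every queued
   element is reserved for the intrusive next link (given some chunk_t *c0
   whose first word is spare):

       Queue q;
       queue_init(&q);                  // head = 0
       queue_put(&q, c0);               // lock-free push, ABA-counted CAS
       chunk_t *c = queue_fetch(&q);    // lock-free pop, NULL when empty
*/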
typedef queue_head_t Queue; 61 | 62 | /* Single consumer queue */ 63 | #define fast_queue_init(head)\ 64 | sc_queue_init(head) 65 | #define fast_queue_put(head,elem)\ 66 | sc_enqueue(head,elem,0) 67 | #define fast_queue_fetch(head)\ 68 | sc_dequeue(head,0) 69 | #define fast_queue_chain_fetch(head)\ 70 | sc_chain_dequeue(head) 71 | typedef queue_head_t FastQueue; 72 | 73 | /* Sequencial queue */ 74 | #define seq_queue_init(head)\ 75 | seq_queue_init(head) 76 | #define seq_queue_put(head,elem)\ 77 | seq_enqueue(head,elem) 78 | #define seq_queue_fetch(head)\ 79 | seq_dequeue(head) 80 | #define fast_queue_chain_put(head)\ 81 | seq_chain_enqueue(head) 82 | typedef seq_queue_head_t SeqQueue; 83 | 84 | /* Type definations */ 85 | typedef enum { 86 | UNINITIALIZED, 87 | READY 88 | } init_state; 89 | 90 | typedef enum { 91 | FOREGROUND, 92 | BACKGROUND, 93 | FULL 94 | } dchunk_state; 95 | 96 | typedef struct lheap_s lheap_t; 97 | typedef struct gpool_s gpool_t; 98 | typedef struct dchunk_s dchunk_t; 99 | typedef struct chunk_s chunk_t; 100 | typedef struct obj_buf_s obj_buf_t; 101 | typedef struct large_header_s large_header_t; 102 | 103 | typedef double_list_t LinkedList; 104 | typedef double_list_elem_t LinkedListElem; 105 | 106 | struct large_header_s { 107 | CACHE_ALIGN size_t alloc_size; 108 | void* mem; 109 | CACHE_ALIGN lheap_t *owner; 110 | }; 111 | 112 | struct chunk_s { 113 | CACHE_ALIGN LinkedListElem active_link; 114 | uint32_t numa_node; 115 | }; 116 | 117 | /* Data chunk header */ 118 | struct dchunk_s { 119 | /* Local Area */ 120 | CACHE_ALIGN LinkedListElem active_link; 121 | uint32_t numa_node; 122 | 123 | /* Read Area */ 124 | CACHE_ALIGN lheap_t * owner; 125 | uint32_t size_cls; 126 | 127 | /* Local Write Area */ 128 | CACHE_ALIGN dchunk_state state; 129 | uint32_t free_blk_cnt; 130 | uint32_t blk_cnt; 131 | SeqQueue free_head; 132 | uint32_t block_size; 133 | char *free_mem; 134 | 135 | /* Remote Write Area */ 136 | CACHE_ALIGN FastQueue remote_free_head; 137 | }; 138 | 139 | struct gpool_s { 140 | pthread_mutex_t lock; 141 | volatile char *pool_start; 142 | volatile char *pool_end; 143 | volatile char *free_start; 144 | Queue free_dc_head[MAX_CORE_ID]; 145 | Queue free_lh_head[MAX_CORE_ID]; 146 | Queue released_dc_head[MAX_CORE_ID]; 147 | }; 148 | 149 | struct obj_buf_s { 150 | void *dc; 151 | void *first; 152 | SeqQueue free_head; 153 | int count; 154 | }; 155 | 156 | /* Per-thread data chunk pool */ 157 | struct lheap_s { 158 | CACHE_ALIGN LinkedListElem active_link; 159 | uint32_t numa_node; 160 | SeqQueue free_head; 161 | uint32_t free_cnt; 162 | 163 | dchunk_t *foreground[DEFAULT_BLOCK_CLASS]; 164 | LinkedList background[DEFAULT_BLOCK_CLASS]; 165 | dchunk_t dummy_chunk; 166 | obj_buf_t block_bufs[BLOCK_BUF_CNT]; 167 | 168 | CACHE_ALIGN FastQueue need_gc[DEFAULT_BLOCK_CLASS]; 169 | }; 170 | 171 | static inline int max(int a, int b) 172 | { 173 | return (a > b) ? a : b; 174 | } 175 | 176 | 177 | /* The new interfaces which is used for RDMA buffer malloc usage */ 178 | /* Shall only be called once! */ 179 | 180 | /* Return the actual size used. 
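   A minimal call sequence (illustrative sketch; `buf` and `len` stand for the
   to-be-registered RDMA buffer and its length in your application):

       uint64_t usable = RInit(buf, len);   // once per process
       RThreadLocalInit();                  // once per thread, before Rmalloc/Rfree
       void *p = Rmalloc(128);
       Rfree(p);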
If the returned size is 0, the allocation failed. */ 181 | uint64_t RInit(char *buffer, uint64_t size); 182 | void RThreadLocalInit(void); 183 | void *Rmalloc(size_t __size); 184 | void Rfree(void *__ptr); 185 | 186 | void *malloc(size_t __size); 187 | void *realloc(void *__ptr, size_t __size); 188 | void free(void *__ptr); 189 | -------------------------------------------------------------------------------- /rdma_ctrl.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "qp.hpp" 6 | 7 | namespace rdmaio { 8 | 9 | const int MAX_SERVER_SUPPORTED = 16; 10 | typedef RUDQP UDQP; 11 | typedef RRCQP RCQP; 12 | 13 | typedef std::function<void(const QPConnArg &)> connection_callback_t; 14 | 15 | class RdmaCtrl { 16 | public: 17 | RdmaCtrl(int node_id, int tcp_base_port, 18 | connection_callback_t callback = [](const QPConnArg &) { 19 | // the default callback does nothing 20 | }, 21 | std::string ip = "localhost"); 22 | 23 | ~RdmaCtrl(); 24 | 25 | int current_node_id(); 26 | int listening_port(); 27 | 28 | typedef struct { 29 | int dev_id; 30 | int port_id; 31 | } DevIdx; 32 | 33 | /** 34 | * Query device info on this machine. 35 | * Repeated calls return the cached results unless clear_dev_info has been called. 36 | */ 37 | std::vector query_devs(); 38 | 39 | static std::vector query_devs_helper(); 40 | 41 | // clear the device info cached by RdmaCtrl 42 | void clear_dev_info(); 43 | 44 | /** 45 | * Open device handlers. 46 | * RdmaCtrl opens one device per thread. 47 | * get_device returns the device this thread has already opened, if any. 48 | */ 49 | RNicHandler *open_thread_local_device(DevIdx idx); 50 | 51 | RNicHandler *open_device(DevIdx idx); 52 | 53 | RNicHandler *get_device(); 54 | 55 | /** 56 | * The *callback* is invoked whenever a QP connection request arrives at this server 57 | */ 58 | void register_qp_callback(connection_callback_t callback); 59 | 60 | void close_device(); 61 | 62 | void close_device(RNicHandler *); 63 | 64 | /** 65 | * Each RDMA NIC has multiple ports, so we use a two-dimension index to locate the target port. 66 | * convert_port_idx provides a way to translate the one-dimension index into the two-dimension one. 67 | */ 68 | DevIdx convert_port_idx(int idx); 69 | 70 | /** 71 | * Register memory to a specific RNIC handler 72 | */ 73 | bool register_memory(int id,const char *buf,uint64_t size,RNicHandler *rnic, 74 | int flag = Memory::DEFAULT_PROTECTION_FLAG); 75 | 76 | /** 77 | * Get the locally registered memory; 78 | * the result is undefined if mr_id has not been registered. 79 | */ 80 | MemoryAttr get_local_mr(int mr_id); 81 | 82 | /** 83 | * Return an arbitrary registered MR: 84 | * return -1 if no MR is registered to RdmaCtrl, 85 | * otherwise return the index of the first MR found. 86 | */ 87 | int get_default_mr(MemoryAttr &attr); 88 | 89 | /** 90 | * Create and query QPs. 91 | * For creation, an optional local_attr can be provided to bind to this QP: 92 | * it is used as the default local MR for this QP. 93 | * If local_attr == nullptr, the QP is not bound to any MR. 94 | */ 95 | RCQP *create_rc_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *local_attr = NULL); 96 | UDQP *create_ud_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *local_attr = NULL); 97 | 98 | RCQP *get_rc_qp(QPIdx idx); 99 | UDQP *get_ud_qp(QPIdx idx); 100 | 101 | /** 102 | * Some helper functions (example usage of RdmaCtrl). 103 | * Fully link the QPs in a symmetric way for this thread. 
104 | * For example, node 0 can connect to node 1, while node 1 connect to node 0. 105 | */ 106 | bool link_symmetric_rcqps(const std::vector &cluster, 107 | int l_mrid,int mr_id,int wid,int idx = 0); 108 | 109 | private: 110 | class RdmaCtrlImpl; 111 | std::unique_ptr impl_; 112 | }; 113 | } // namespace rdmaio 114 | 115 | #include "rdma_ctrl_impl.hpp" // real implemeatation here 116 | -------------------------------------------------------------------------------- /rdma_ctrl_impl.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace rdmaio { 6 | 7 | /** 8 | * Simple critical section 9 | * It uses a single global block to guard RdmaCtrl. 10 | * This is acceptable, since RdmaCtrl is only the control plane. 11 | */ 12 | class SCS { 13 | public: 14 | SCS() { 15 | get_lock().lock(); 16 | } 17 | 18 | ~SCS() { 19 | get_lock().unlock(); 20 | } 21 | 22 | private: 23 | static std::mutex &get_lock() { 24 | static std::mutex lock; 25 | return lock; 26 | } 27 | }; 28 | 29 | /** 30 | * convert qp idx(node,worker,idx) -> key 31 | */ 32 | inline uint32_t get_rc_key (const QPIdx idx) { 33 | return ::rdmaio::encode_qp_id(idx.node_id,RC_ID_BASE + idx.worker_id * 64 + idx.index); 34 | } 35 | 36 | inline uint32_t get_ud_key(const QPIdx idx) { 37 | return ::rdmaio::encode_qp_id(idx.worker_id,UD_ID_BASE + idx.index); 38 | } 39 | 40 | /** 41 | * Control plane of RLib 42 | */ 43 | class RdmaCtrl::RdmaCtrlImpl { 44 | public: 45 | RdmaCtrlImpl(int node_id, int tcp_base_port,connection_callback_t callback,std::string local_ip): 46 | node_id_(node_id), 47 | tcp_base_port_(tcp_base_port), 48 | local_ip_(local_ip), 49 | qp_callback_(callback) 50 | { 51 | // start the background thread to handle QP connection request 52 | pthread_attr_t attr; 53 | pthread_attr_init(&attr); 54 | pthread_create(&handler_tid_, &attr, &RdmaCtrlImpl::connection_handler_wrapper,this); 55 | } 56 | 57 | ~RdmaCtrlImpl() { 58 | running_ = false; // wait for the handler to join 59 | pthread_join(handler_tid_,NULL); 60 | RDMA_LOG(INFO) << "rdma controler close: does not handle any future connections."; 61 | } 62 | 63 | RNicHandler *open_thread_local_device(DevIdx idx) { 64 | // already openend device 65 | if(rnic_instance() != nullptr) 66 | return rnic_instance(); 67 | 68 | auto handler = open_device(idx); 69 | rnic_instance() = handler; 70 | return rnic_instance(); 71 | } 72 | 73 | RNicHandler *open_device(DevIdx idx) { 74 | 75 | RNicHandler *rnic = nullptr; 76 | 77 | struct ibv_device **dev_list = nullptr; struct ibv_context *ib_ctx = nullptr; struct ibv_pd *pd = nullptr; int num_devices; 78 | int rc; // return code 79 | 80 | dev_list = ibv_get_device_list(&num_devices); 81 | 82 | if(idx.dev_id >= num_devices || idx.dev_id < 0) { 83 | RDMA_LOG(WARNING) << "wrong dev_id: " << idx.dev_id << "; total " << num_devices <<" found"; 84 | goto OPEN_END; 85 | } 86 | 87 | // alloc ctx 88 | ib_ctx = ibv_open_device(dev_list[idx.dev_id]); 89 | if(ib_ctx == nullptr) { 90 | RDMA_LOG(WARNING) << "failed to open ib ctx w error: " << strerror(errno); 91 | goto OPEN_END; 92 | } 93 | 94 | // alloc pd 95 | pd = ibv_alloc_pd(ib_ctx); 96 | if(pd == nullptr) { 97 | RDMA_LOG(WARNING) << "failed to alloc pd w error: " << strerror(errno); 98 | RDMA_VERIFY(INFO,ibv_close_device(ib_ctx) == 0) << "failed to close device " << idx.dev_id; 99 | goto OPEN_END; 100 | } 101 | 102 | // fill the lid 103 | ibv_port_attr port_attr; 104 | rc = ibv_query_port (ib_ctx, idx.port_id, &port_attr); 105 
| if(rc < 0) { 106 | RDMA_LOG(WARNING) << "failed to query port status w error: " << strerror(errno); 107 | RDMA_VERIFY(INFO,ibv_close_device(ib_ctx) == 0) << "failed to close device " << idx.dev_id; 108 | RDMA_VERIFY(INFO,ibv_dealloc_pd(pd) == 0) << "failed to dealloc pd"; 109 | goto OPEN_END; 110 | } 111 | 112 | // success open 113 | { 114 | rnic = new RNicHandler(idx.dev_id,idx.port_id,ib_ctx,pd,port_attr.lid); 115 | } 116 | 117 | OPEN_END: 118 | if(dev_list != nullptr) 119 | ibv_free_device_list(dev_list); 120 | return rnic; 121 | } 122 | 123 | RCQP *get_rc_qp(QPIdx idx) { 124 | RCQP *res = nullptr; 125 | { 126 | SCS s; 127 | res = get_qp(idx); 128 | }; 129 | return res; 130 | } 131 | 132 | UDQP *get_ud_qp(QPIdx idx) { 133 | 134 | UDQP *res = nullptr; 135 | { 136 | SCS s; 137 | res = get_qp(idx); 138 | }; 139 | return res; 140 | } 141 | 142 | /** 143 | * Note! this is not a thread-safe function 144 | */ 145 | template 146 | T *get_qp(QPIdx idx) { 147 | uint32_t key = F(idx); 148 | if(qps_.find(key) == qps_.end()) 149 | return nullptr; 150 | else 151 | return dynamic_cast(qps_[key]); 152 | } 153 | 154 | RCQP *create_rc_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *attr) { 155 | 156 | RCQP *res = nullptr; 157 | { 158 | SCS s; 159 | uint64_t qid = get_rc_key(idx); 160 | if(qps_.find(qid) != qps_.end()) { 161 | res = dynamic_cast(qps_[qid]); 162 | } else { 163 | if(attr == NULL) 164 | res = new RCQP(dev,idx); 165 | else 166 | res = new RCQP(dev,idx,*attr); 167 | qps_.insert(std::make_pair(qid,res)); 168 | } 169 | }; 170 | return res; 171 | } 172 | 173 | UDQP *create_ud_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *attr) { 174 | 175 | UDQP *res = nullptr; 176 | uint64_t qid = get_ud_key(idx); 177 | 178 | { 179 | SCS s; 180 | if(qps_.find(qid) != qps_.end()) { 181 | res = dynamic_cast(qps_[qid]); 182 | } else { 183 | if(attr == NULL) 184 | res = new UDQP(dev,idx); 185 | else 186 | res = new UDQP(dev,idx,*attr); 187 | qps_.insert(std::make_pair(qid,res)); 188 | } 189 | }; 190 | return res; 191 | } 192 | 193 | bool register_memory(int mr_id,const char *buf,uint64_t size,RNicHandler *rnic,int flag) { 194 | 195 | Memory *m = new Memory(buf,size,rnic->pd,flag); 196 | if(!m->valid()) { 197 | RDMA_LOG(WARNING) << "register mr to rnic error: " << strerror(errno); 198 | delete m; 199 | return false; 200 | } 201 | { 202 | SCS s; 203 | if(mrs_.find(mr_id) != mrs_.end()) { 204 | RDMA_LOG(WARNING) << "mr " << mr_id << " has already been registered!"; 205 | delete m; 206 | } else { 207 | mrs_.insert(std::make_pair(mr_id,m)); 208 | } 209 | }; 210 | return true; 211 | } 212 | 213 | int get_default_mr(MemoryAttr &attr) { 214 | SCS s; 215 | for(auto it = mrs_.begin();it != mrs_.end();++it) { 216 | int idx = it->first; attr = it->second->rattr; 217 | return idx; 218 | } 219 | return -1; 220 | } 221 | 222 | MemoryAttr get_local_mr(int mr_id) { 223 | MemoryAttr attr = {}; 224 | { 225 | SCS s; 226 | if(mrs_.find(mr_id) != mrs_.end()) 227 | attr = mrs_[mr_id]->rattr; 228 | } 229 | return attr; 230 | } 231 | 232 | void clear_dev_info() { 233 | cached_infos_.clear(); 234 | } 235 | 236 | static std::vector query_devs_helper() { 237 | int num_devices = 0; struct ibv_device **dev_list = nullptr; 238 | std::vector res; 239 | 240 | { // query the device and its active ports using the underlying APIs 241 | dev_list = ibv_get_device_list(&num_devices); 242 | int temp_devices = num_devices; 243 | 244 | if(dev_list == nullptr) { 245 | RDMA_LOG(ERROR) << "cannot get ib devices."; 246 | num_devices = 0; 247 | goto QUERY_END; 248 | } 
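      /* Walk every device: open it to learn its name; a device that fails to
         open is logged, skipped, and excluded from the reported count. */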
249 | 250 | for(uint dev_id = 0;dev_id < temp_devices;++dev_id) { 251 | 252 | struct ibv_context *ib_ctx = ibv_open_device(dev_list[dev_id]); 253 | if(ib_ctx == nullptr) { 254 | RDMA_LOG(ERROR) << "open dev " << dev_id << " error: " << strerror(errno) << " ignored"; 255 | num_devices -= 1; 256 | continue; 257 | } 258 | res.emplace_back(ibv_get_device_name(ib_ctx->device),dev_id,ib_ctx); 259 | QUERY_DEV_END: 260 | // close ib_ctx 261 | RDMA_VERIFY(INFO,ibv_close_device(ib_ctx) == 0) << "failed to close device " << dev_id; 262 | } 263 | } 264 | 265 | QUERY_END: 266 | if(dev_list != nullptr) 267 | ibv_free_device_list(dev_list); 268 | return res; 269 | } 270 | 271 | std::vector query_devs() { 272 | 273 | if(cached_infos_.size() != 0) { 274 | return cached_infos_; 275 | } 276 | cached_infos_ = query_devs_helper(); 277 | return std::vector(cached_infos_.begin(),cached_infos_.end()); 278 | } 279 | 280 | RdmaCtrl::DevIdx convert_port_idx(int idx) { 281 | 282 | if(cached_infos_.size() == 0) 283 | query_devs(); 284 | 285 | for(int dev_id = 0; dev_id < cached_infos_.size();++dev_id) { 286 | 287 | int port_num = cached_infos_[dev_id].active_ports.size(); 288 | 289 | for(int port_id = 1; port_id <= port_num; port_id++) { 290 | if(idx == 0) { 291 | // find one 292 | return DevIdx {.dev_id = dev_id,.port_id = port_id}; 293 | } 294 | idx -= 1; 295 | } 296 | } 297 | // failed to find the dev according to the idx 298 | return DevIdx {.dev_id = -1,.port_id = -1}; 299 | } 300 | 301 | RNicHandler *get_device() { 302 | return rnic_instance(); 303 | } 304 | 305 | void close_device() { 306 | if(rnic_instance() != nullptr) delete rnic_instance(); 307 | rnic_instance() = nullptr; 308 | } 309 | 310 | void close_device(RNicHandler *rnic) { 311 | if(rnic != nullptr) 312 | delete rnic; 313 | } 314 | 315 | static void *connection_handler_wrapper(void *context) 316 | { 317 | return ((RdmaCtrlImpl *)context)->connection_handler(); 318 | } 319 | 320 | /** 321 | * Using TCP to connect in-coming QP & MR requests 322 | */ 323 | void *connection_handler(void) { 324 | 325 | pthread_detach(pthread_self()); 326 | 327 | auto listenfd = PreConnector::get_listen_socket(local_ip_,tcp_base_port_); 328 | 329 | int opt = 1; 330 | RDMA_VERIFY(ERROR,setsockopt(listenfd,SOL_SOCKET,SO_REUSEADDR | SO_REUSEPORT,&opt,sizeof(int)) == 0) 331 | << "unable to configure socket status."; 332 | RDMA_VERIFY(ERROR,listen(listenfd,24) == 0) << "TCP listen error: " << strerror(errno); 333 | 334 | while(running_) { 335 | 336 | asm volatile("" ::: "memory"); 337 | 338 | struct sockaddr_in cli_addr = {0}; 339 | socklen_t clilen = sizeof(cli_addr); 340 | auto csfd = accept(listenfd,(struct sockaddr *) &cli_addr, &clilen); 341 | 342 | if(csfd < 0) { 343 | RDMA_LOG(ERROR) << "accept a wrong connection error: " << strerror(errno); 344 | continue; 345 | } 346 | 347 | if(!PreConnector::wait_recv(csfd,6000)) { 348 | close(csfd); 349 | continue; 350 | } 351 | 352 | ConnArg arg; 353 | auto n = recv(csfd,(char *)(&arg),sizeof(ConnArg), MSG_WAITALL); 354 | 355 | if(n != sizeof(ConnArg)) { 356 | // an invalid message 357 | close(csfd); 358 | continue; 359 | } 360 | 361 | ConnReply reply; reply.ack = ERR; 362 | 363 | { // in a global critical section 364 | SCS s; 365 | switch(arg.type) { 366 | case ConnArg::MR: 367 | if(mrs_.find(arg.payload.mr.mr_id) != mrs_.end()) { 368 | memcpy((char *)(&(reply.payload.mr)), 369 | (char *)(&(mrs_[arg.payload.mr.mr_id]->rattr)),sizeof(MemoryAttr)); 370 | reply.ack = SUCC; 371 | }; 372 | break; 373 | case ConnArg::QP: { 374 | 
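          /* QP bootstrap: the peer's ConnArg names (from_node, from_worker,
             qp_type); look up the matching local QP and, if it exists (and is
             ready, for UD), reply with its QPAttr so the peer can move its own
             QP to RTR/RTS. Otherwise reply.ack stays ERR. */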
            qp_callback_(arg.payload.qp); // call the user callback
            QP *qp = NULL;
            switch(arg.payload.qp.qp_type) {
              case IBV_QPT_UD:
              {
                UDQP *ud_qp = get_qp<UDQP,get_ud_key>(
                    create_ud_idx(arg.payload.qp.from_node,arg.payload.qp.from_worker));
                if(ud_qp != nullptr && ud_qp->ready()) {
                  qp = ud_qp;
                }
              }
              break;
              case IBV_QPT_RC:
              {
                RCQP *rc_qp = get_qp<RCQP,get_rc_key>(
                    create_rc_idx(arg.payload.qp.from_node,arg.payload.qp.from_worker));
                qp = rc_qp;
              }
              break;
              default:
                RDMA_LOG(ERROR) << "unknown QP connection type: " << arg.payload.qp.qp_type;
            }
            if(qp != nullptr) {
              reply.payload.qp = qp->get_attr();
              reply.ack = SUCC;
            }
            reply.payload.qp.node_id = node_id_;
            break;
          }
          default:
            RDMA_LOG(WARNING) << "received unknown connect type " << arg.type;
        }
      } // end simple critical section protection

      PreConnector::send_to(csfd,(char *)(&reply),sizeof(ConnReply));
      PreConnector::wait_close(csfd); // wait for the client to close the connection
    }
    // end of the server
    close(listenfd);
    return nullptr;
  }

 private:
  friend class RdmaCtrl;
  static RNicHandler* &rnic_instance() {
    static thread_local RNicHandler *handler = NULL;
    return handler;
  }

  std::vector<RNicInfo> cached_infos_;

  // registered MRs at this control manager
  std::map<int,Memory *> mrs_;

  // created QPs on this control manager
  std::map<uint64_t,QP *> qps_;

  // local node information
  const int node_id_;
  const int tcp_base_port_;
  const std::string local_ip_;

  pthread_t handler_tid_;
  bool running_ = true;

  // connection callback function
  connection_callback_t qp_callback_;

  bool link_symmetric_rcqps(const std::vector<std::string> &cluster,int l_mrid,int mr_id,int wid,int idx) {

    std::vector<bool> ready_list(cluster.size(),false);
    std::vector<MemoryAttr> mrs;

    MemoryAttr local_mr = get_local_mr(l_mrid);

    for(auto s : cluster) {
      // get the target mr
   retry:
      MemoryAttr mr = {};
      auto rc = QP::get_remote_mr(s,tcp_base_port_,mr_id,&mr);
      if(rc != SUCC) {
        usleep(2000);
        goto retry;
      }
      mrs.push_back(mr);
    }

    RDMA_ASSERT(mrs.size() == cluster.size());

    while(true) {
      int connected = 0, i = 0;
      for(auto s : cluster) {

        if(ready_list[i]) {
          i++; connected++;
          continue;
        }
        RCQP *qp = create_rc_qp(QPIdx {.node_id = i,.worker_id = wid,.index = idx },
                                get_device(),&local_mr);
        RDMA_ASSERT(qp != nullptr);

        if(qp->connect(s,tcp_base_port_,
                       QPIdx {.node_id = node_id_,.worker_id = wid, .index = idx}) == SUCC) {
          ready_list[i] = true;
          connected++;
          qp->bind_remote_mr(mrs[i]);
        }
        i++;
      }
      if(connected == (int)cluster.size())
        break;
      else
        usleep(1000);
    }
    return true; // never fails: unconnected links are simply retried above
  }

  void register_qp_callback(connection_callback_t callback) {
    qp_callback_ = callback;
  }
}; // end class RdmaCtrlImpl

// link to the main class
inline __attribute__ ((always_inline))
RdmaCtrl::RdmaCtrl(int node_id, int tcp_base_port,connection_callback_t callback,std::string ip)
    :impl_(new RdmaCtrlImpl(node_id,tcp_base_port,callback,ip)){
}

inline __attribute__ ((always_inline))
RdmaCtrl::~RdmaCtrl() {
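  // impl_ is a smart pointer to the pimpl object created in the constructor
  // above; resetting it runs ~RdmaCtrlImpl.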
  impl_.reset();
}

inline __attribute__ ((always_inline))
std::vector<RNicInfo> RdmaCtrl::query_devs() {
  return impl_->query_devs();
}

inline __attribute__ ((always_inline))
void RdmaCtrl::clear_dev_info() {
  return impl_->clear_dev_info();
}

inline __attribute__ ((always_inline))
RNicHandler *RdmaCtrl::get_device() {
  return impl_->get_device();
}

inline __attribute__ ((always_inline))
RNicHandler *RdmaCtrl::open_thread_local_device(DevIdx idx) {
  return impl_->open_thread_local_device(idx);
}

inline __attribute__ ((always_inline))
RNicHandler *RdmaCtrl::open_device(DevIdx idx) {
  return impl_->open_device(idx);
}

inline __attribute__ ((always_inline))
void RdmaCtrl::close_device() {
  return impl_->close_device();
}

inline __attribute__ ((always_inline))
void RdmaCtrl::close_device(RNicHandler *rnic) {
  return impl_->close_device(rnic);
}

inline __attribute__ ((always_inline))
RdmaCtrl::DevIdx RdmaCtrl::convert_port_idx(int idx) {
  return impl_->convert_port_idx(idx);
}

inline __attribute__ ((always_inline))
bool RdmaCtrl::register_memory(int id,const char *buf,uint64_t size,RNicHandler *rnic,int flag) {
  return impl_->register_memory(id,buf,size,rnic,flag);
}

inline __attribute__ ((always_inline))
MemoryAttr RdmaCtrl::get_local_mr(int mr_id) {
  return impl_->get_local_mr(mr_id);
}

inline __attribute__ ((always_inline))
int RdmaCtrl::get_default_mr(MemoryAttr &attr) {
  return impl_->get_default_mr(attr);
}

inline __attribute__ ((always_inline))
RCQP *RdmaCtrl::create_rc_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *attr) {
  return impl_->create_rc_qp(idx,dev,attr);
}

inline __attribute__ ((always_inline))
UDQP *RdmaCtrl::create_ud_qp(QPIdx idx, RNicHandler *dev,MemoryAttr *attr) {
  return impl_->create_ud_qp(idx,dev,attr);
}

inline __attribute__ ((always_inline))
RCQP *RdmaCtrl::get_rc_qp(QPIdx idx) {
  return impl_->get_rc_qp(idx);
}

inline __attribute__ ((always_inline))
UDQP *RdmaCtrl::get_ud_qp(QPIdx idx) {
  return impl_->get_ud_qp(idx);
}

inline __attribute__ ((always_inline))
int RdmaCtrl::current_node_id() {
  return impl_->node_id_;
}

inline __attribute__ ((always_inline))
int RdmaCtrl::listening_port() {
  return impl_->tcp_base_port_;
}

inline __attribute__ ((always_inline))
bool RdmaCtrl::link_symmetric_rcqps(const std::vector<std::string> &cluster,
                                    int l_mrid,int mr_id,int wid,int idx) {
  return impl_->link_symmetric_rcqps(cluster,l_mrid,mr_id,wid,idx);
}

inline __attribute__ ((always_inline))
std::vector<RNicInfo> RdmaCtrl::query_devs_helper() {
  return RdmaCtrlImpl::query_devs_helper();
}

inline __attribute__ ((always_inline))
void RdmaCtrl::register_qp_callback(connection_callback_t callback) {
  impl_->register_qp_callback(callback);
}

} // namespace rdmaio

--------------------------------------------------------------------------------
/rnic.hpp:
--------------------------------------------------------------------------------
#pragma once

#include <vector>
#include <sstream>

#include "logging.hpp"

namespace rdmaio {

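// A short usage sketch for the types below, driven through RdmaCtrl as in
// rdma_ctrl_impl.hpp above (ctrl: an RdmaCtrl instance):
//
//   RdmaCtrl::DevIdx idx {.dev_id = 0, .port_id = 1};
//   RNicHandler *nic = ctrl->open_thread_local_device(idx);
//   address_t addr = nic->query_addr(); // GID-derived address of the port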
// The GID-derived address of a particular port on the RNIC.
typedef struct {
  uint64_t subnet_prefix;
  uint64_t interface_id;
  uint32_t local_id;
} address_t;

struct RNicInfo {

  typedef struct {
    uint port_id;
    std::string link_layer;
  } PortInfo;

  RNicInfo(const char *name,int id,ibv_context *ctx):
      dev_id(id),
      dev_name(name)
  {
    query_port_infos(ctx);
    query_active_gids(ctx);
  }

  bool query_dev_attribute(ibv_context *ctx,ibv_device_attr &attr) {
    int rc = ibv_query_device(ctx, &attr);
    if(rc != 0) {
      RDMA_LOG(ERROR) << "query device attribute error: " << strerror(errno);
      return false;
    }
    return true;
  }

  // fill in the active_ports
  void query_port_infos(ibv_context *ctx) {

    ibv_device_attr attr;
    if(!query_dev_attribute(ctx,attr))
      return;

    // query port info
    for(uint port_id = 1;port_id <= attr.phys_port_cnt;++port_id) {

      struct ibv_port_attr port_attr;
      int rc = ibv_query_port(ctx, port_id, &port_attr);
      if(rc != 0) {
        RDMA_LOG(ERROR) << "query port_id " << port_id << " on device " << dev_id << " error.";
        continue;
      }

      // check port status; IBV_PORT_ACTIVE* are ibv_port_state values,
      // so the logical state field is the one to check
      if(port_attr.state != IBV_PORT_ACTIVE && port_attr.state != IBV_PORT_ACTIVE_DEFER) {
        RDMA_LOG(WARNING) << "port_id " << port_id << " on device " << dev_id << " is not active.";
        continue;
      }

      std::string link_layer = "";
      switch (port_attr.link_layer) {
        case IBV_LINK_LAYER_ETHERNET:
          link_layer = "RoCE";
          break;
        case IBV_LINK_LAYER_INFINIBAND:
          link_layer = "Infiniband";
          break;
        default:
          RDMA_LOG(WARNING) << "unknown link layer at this port: " << port_attr.link_layer;
          link_layer = "Unknown";
      }
      active_ports.push_back({port_id,link_layer});
    }
  }

  /**
   * Assumes all active ports on this RNIC share the same GID table,
   * so only the first active port is queried.
   */
  void query_active_gids(ibv_context *ctx) {

    if(active_ports.size() == 0)
      return;

    int port_id = active_ports[0].port_id;
    struct ibv_port_attr port_attr;
    int rc = ibv_query_port(ctx, port_id, &port_attr);

    if(rc != 0) {
      RDMA_LOG(WARNING) << "query port attribute at dev " << dev_name << ",port " << port_id
                        << "; w error: " << strerror(errno);
      return;
    }

    for(uint i = 0;i < port_attr.gid_tbl_len;++i) {
      ibv_gid gid = {};
      auto ret = ibv_query_gid(ctx,port_id, i, &gid);
      if (ret == 0 && gid.global.interface_id) {
        active_gids.push_back(i);
      }
    }
  }

  void print() const {
    RDMA_LOG(3) << to_string();
  }

  std::string to_string() const {
    std::ostringstream oss;

    oss << "device " << dev_name << " has "<< active_ports.size() << " active ports.";
    for(auto i : active_ports) {
      oss << "port " << i.port_id << " w link layer " << i.link_layer << ".";
    }
    for(uint i = 0;i < active_gids.size();++i) {
      oss << "active gid " << active_gids[i] << ".";
    }
    return oss.str();
  }

  int dev_id;
  std::string dev_name;

  std::vector<PortInfo> active_ports;
  std::vector<int> active_gids;
};

class RdmaCtrl;
struct RNicHandler {

  RNicHandler(int dev_id,int port_id,ibv_context *ctx,ibv_pd *pd,int lid,int gid = 0):
      dev_id(dev_id),
      port_id(port_id),
      ctx(ctx),
      pd(pd),
      lid(lid),
      gid(gid)
  {
  }

  address_t query_addr() {
    return query_addr(gid);
  }

  address_t query_addr(uint8_t gid_index) {

    ibv_gid gid;
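    // For an InfiniBand port the GID holds subnet_prefix + interface_id; for
    // RoCE it encodes the interface's IP address. address_t carries exactly
    // these two fields plus the chosen GID index.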
    RDMA_VERIFY(INFO,ibv_query_gid(ctx,port_id,gid_index,&gid) == 0)
        << "failed to query gid " << gid_index << " at device " << dev_id;

    address_t addr {
      .subnet_prefix = gid.global.subnet_prefix,
      .interface_id = gid.global.interface_id,
      .local_id = gid_index
    };
    return addr;
  }

 private:
  friend class RdmaCtrl;
  ~RNicHandler() {
    // free the pd first: it belongs to ctx and must be deallocated
    // before the device is closed
    RDMA_VERIFY(INFO,ibv_dealloc_pd(pd) == 0) << "failed to dealloc pd at device " << dev_id
                                              << "; w error " << strerror(errno);
    RDMA_VERIFY(INFO,ibv_close_device(ctx) == 0) << "failed to close device " << dev_id;
  }

 public:
  uint16_t dev_id;  // which RNIC
  uint16_t port_id; // which port

  struct ibv_context *ctx;
  struct ibv_pd *pd;
  uint16_t lid;
  uint16_t gid;
};

} // namespace rdmaio
--------------------------------------------------------------------------------
/ud_adapter.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "msg_interface.hpp"
#include "rdma_ctrl.hpp"
#include "ralloc/ralloc.h"

/**
 * This adapter uses UD QPs; its design is based on FaSST RPC.
 */
namespace rdmaio {

class UDRecvManager {
 public:
  UDRecvManager(UDQP *qp,int max_recv_num,MemoryAttr local_mr):
      qp_(qp),max_recv_num_(max_recv_num)
  {
    RDMA_ASSERT(max_recv_num_ <= UDQPImpl::MAX_RECV_SIZE)
        << "UD can register at most " << UDQPImpl::MAX_RECV_SIZE << " buffers.";
    // allocate local heap
    RThreadLocalInit();

    /*
     * recv_buf_size must not exceed the real packet size minus the size of
     * the GRH header; otherwise the message cannot be received.
     */
    int recv_buf_size = MAX_PACKET_SIZE;
    RDMA_ASSERT(recv_buf_size <= MAX_PACKET_SIZE);

    // init receive related structures
    for(uint i = 0;i < max_recv_num_;++i) {
      struct ibv_sge sge {
        .addr = (uintptr_t)(Rmalloc(recv_buf_size)),
        .length = (uint32_t)recv_buf_size,
        .lkey = local_mr.key
      };
      RDMA_ASSERT(sge.addr != 0) << "failed to allocate recv buffer.";
      sges_[i] = sge;

      rrs_[i].wr_id = sges_[i].addr;
      rrs_[i].sg_list = &sges_[i];
      rrs_[i].num_sge = 1;

      rrs_[i].next = (i < (max_recv_num_ - 1)) ? &rrs_[i + 1] : &rrs_[0];
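      // The recv WRs form a ring: the last entry links back to rrs_[0].
      // post_recvs() below temporarily cuts the ring (tail->next = NULL) so a
      // whole batch can be posted with a single ibv_post_recv call, then
      // restores the link afterwards.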
    }

    post_recvs(max_recv_num_);

    // now the qp can receive connection requests
    qp_->set_ready();
  }
 public:
  // the size of the global routing header (GRH)
  static const int GRH_SIZE = 40;
  static const int MAX_PACKET_SIZE = 4096 - GRH_SIZE;

 protected:

  UDQP *qp_ = nullptr;

  int recv_head_ = 0;
  int idle_recv_num_ = 0;
  int max_idle_recv_num_ = 1;
  int max_recv_num_ = 0;

  struct ibv_recv_wr rrs_[UDQPImpl::MAX_RECV_SIZE];
  struct ibv_sge sges_[UDQPImpl::MAX_RECV_SIZE];
  struct ibv_wc wcs_[UDQPImpl::MAX_RECV_SIZE];
  struct ibv_recv_wr *bad_rr_;

  void post_recvs(int recv_num) {

    if(recv_num <= 0) {
      return;
    }

    int tail = recv_head_ + recv_num - 1;
    if(tail >= max_recv_num_)
      tail -= max_recv_num_;

    ibv_recv_wr *head_rr = rrs_ + recv_head_;
    ibv_recv_wr *tail_rr = rrs_ + tail;
    ibv_recv_wr *temp = tail_rr->next;
    tail_rr->next = NULL; // cut the ring so only [head,tail] is posted

    int rc = ibv_post_recv(qp_->qp_,head_rr,&bad_rr_);
    if(rc != 0) {
      RDMA_LOG(ERROR) << "post recv " << recv_num << "; w error: " << strerror(errno);
    }
    tail_rr->next = temp; // restore the ring
    recv_head_ = (tail + 1) % max_recv_num_;
  }
};

class UDAdapter : public MsgAdapter, public UDRecvManager {
  static const int MAX_UD_SEND_DOORBELL = 16;
 public:
  UDAdapter(std::shared_ptr<RdmaCtrl> cm, RNicHandler *rnic, MemoryAttr local_mr,
            int w_id, int max_recv_num):
      UDRecvManager(cm->create_ud_qp(create_ud_idx(w_id,RECV_QP_IDX),rnic,&local_mr),max_recv_num,local_mr),
      node_id_(cm->current_node_id()),
      worker_id_(w_id),
      send_qp_(cm->create_ud_qp(create_ud_idx(w_id,SEND_QP_IDX),rnic,&local_mr))
  {
    // init send structures
    for(uint i = 0;i < MAX_UD_SEND_DOORBELL;++i) {
      srs_[i].opcode = IBV_WR_SEND_WITH_IMM;
      srs_[i].num_sge = 1;
      srs_[i].imm_data = ::rdmaio::encode_qp_id(node_id_,worker_id_);
      RDMA_ASSERT(::rdmaio::decode_qp_mac(srs_[i].imm_data) == node_id_);
      srs_[i].next = &srs_[i+1];
      srs_[i].sg_list = &ssges_[i];

      ssges_[i].lkey = local_mr.key;
    }
  }

  ConnStatus connect(std::string ip,int port) {
    return send_qp_->connect(ip,port,create_ud_idx(worker_id_,RECV_QP_IDX));
  }

  ConnStatus send_to(int node_id,const char *msg,int len) {

    RDMA_ASSERT(current_idx_ == 0) << "There are pending reqs in the msg queue.";
    srs_[0].wr.ud.ah = send_qp_->ahs_[node_id];
    srs_[0].wr.ud.remote_qpn = send_qp_->attrs_[node_id].qpn;
    srs_[0].wr.ud.remote_qkey = DEFAULT_QKEY;
    srs_[0].sg_list = &ssges_[0];
    srs_[0].next = NULL;

    srs_[0].send_flags = ((send_qp_->queue_empty()) ? IBV_SEND_SIGNALED : 0)
                       | ((len < MAX_INLINE_SIZE) ? IBV_SEND_INLINE : 0);
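    // Selective signaling: only a send posted to an empty send queue carries
    // IBV_SEND_SIGNALED; need_poll()/pendings below bound the number of
    // outstanding unsignaled sends, so the send queue cannot overflow.
    // Small payloads are inlined to skip the lkey/DMA path.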

    ssges_[0].addr = (uint64_t)msg;
    ssges_[0].length = len;

    if(send_qp_->need_poll()) {
      ibv_wc wc; auto ret = send_qp_->poll_till_completion(wc);
      RDMA_ASSERT(ret == SUCC) << "poll UD completion reply error: " << ret;
      send_qp_->pendings = 0;
    } else
      send_qp_->pendings += 1;

    int rc = ibv_post_send(send_qp_->qp_, &srs_[0], &bad_sr_);
    // reset the next ptr, which was cleared above for this single send
    srs_[0].next = &srs_[1];
    return (rc == 0) ? SUCC : ERR;
  }

  void prepare_pending() {
    RDMA_ASSERT(current_idx_ == 0);
  }

  ConnStatus send_pending(int node_id,const char *msg,int len) {

    auto i = current_idx_++;
    srs_[i].wr.ud.ah = send_qp_->ahs_[node_id];
    srs_[i].wr.ud.remote_qpn = send_qp_->attrs_[node_id].qpn;
    srs_[i].wr.ud.remote_qkey = DEFAULT_QKEY;

    srs_[i].send_flags = ((send_qp_->queue_empty()) ? IBV_SEND_SIGNALED : 0)
                       | ((len < MAX_INLINE_SIZE) ? IBV_SEND_INLINE : 0);

    if(send_qp_->need_poll()) {
      ibv_wc wc; auto ret = send_qp_->poll_till_completion(wc);
      RDMA_ASSERT(ret == SUCC) << "poll UD completion reply error: " << ret;
      send_qp_->pendings = 0;
    } else {
      send_qp_->pendings += 1;
    }

    ssges_[i].addr = (uintptr_t)msg;
    ssges_[i].length = len;

    if(current_idx_ >= MAX_UD_SEND_DOORBELL)
      return flush_pending(); // the doorbell batch is full
    return SUCC;
  }

  ConnStatus flush_pending() {
    if(current_idx_ > 0) {
      srs_[current_idx_ - 1].next = NULL;
      auto ret = ibv_post_send(send_qp_->qp_, &srs_[0], &bad_sr_);
      srs_[current_idx_ - 1].next = &srs_[current_idx_];
      current_idx_ = 0;
      return (ret == 0) ? SUCC : ERR;
    }
    return SUCC;
  }

  void poll_comps() {

    int poll_result = ibv_poll_cq(qp_->recv_cq_,UDQPImpl::MAX_RECV_SIZE,wcs_);
    RDMA_ASSERT(poll_result >= 0) << "poll recv cq error";
    /**
     * The reply messages are batched in this call
     */
    prepare_pending();
    for(int i = 0;i < poll_result;++i) { // poll_result: number of completions
      RDMA_ASSERT(wcs_[i].status == IBV_WC_SUCCESS)
          << "error wc status " << wcs_[i].status << " at " << worker_id_;
      // wr_id stores the recv buffer's address; the payload starts after the GRH
      callback_((const char *)(wcs_[i].wr_id + GRH_SIZE),::rdmaio::decode_qp_mac(wcs_[i].imm_data),
                ::rdmaio::decode_qp_index(wcs_[i].imm_data));
    }
    flush_pending(); // send the batched replies
    idle_recv_num_ += poll_result;
    if(idle_recv_num_ > max_idle_recv_num_) {
      // re-post recvs to the QP
      post_recvs(idle_recv_num_);
      idle_recv_num_ = 0;
    }
  }

 private:
  const int node_id_;   // my node id
  const int worker_id_; // my thread id
  /**
   * sender structures
   */
  UDQP *send_qp_ = nullptr;
  ibv_send_wr srs_[MAX_UD_SEND_DOORBELL];
  ibv_sge ssges_[MAX_UD_SEND_DOORBELL];
  struct ibv_send_wr *bad_sr_ = nullptr;

  int current_idx_ = 0;

  static const int RECV_QP_IDX = 1;
  static const int SEND_QP_IDX = 0;
};

} // namespace rdmaio
--------------------------------------------------------------------------------