├── memcached.conf ├── .gitignore ├── script ├── clear_hugepage.sh ├── hugepage.sh ├── check_nic.sh └── restartMemc.sh ├── run.sh ├── src ├── Cache.cpp ├── RawMessageConnection.cpp ├── Common.cpp ├── ThreadConnection.cpp ├── rdma │ ├── Utility.cpp │ ├── StateTrans.cpp │ ├── Resource.cpp │ └── Operation.cpp ├── DirectoryConnection.cpp ├── Directory.cpp ├── AbstractMessageConnection.cpp ├── Debug.cpp ├── Keeper.cpp ├── DSMKeeper.cpp ├── DSM.cpp └── Tree.cpp ├── include ├── Cache.h ├── HugePageAlloc.h ├── Config.h ├── Connection.h ├── RawMessageConnection.h ├── ThreadConnection.h ├── Directory.h ├── DirectoryConnection.h ├── GlobalAddress.h ├── LocalAllocator.h ├── Debug.h ├── Timer.h ├── AbstractMessageConnection.h ├── CacheEntry.h ├── Keeper.h ├── GlobalAllocator.h ├── WRLock.h ├── DSMKeeper.h ├── RdmaBuffer.h ├── Common.h ├── Rdma.h ├── third_party │ ├── slice.h │ └── random.h ├── IndexCache.h ├── DSM.h └── Tree.h ├── CMakeLists.txt ├── test ├── tree_test.cpp ├── skiplist_test.cpp ├── benchmark.cpp └── zipf.h ├── .github └── workflows │ └── jekyll-gh-pages.yml └── README.md /memcached.conf: -------------------------------------------------------------------------------- 1 | 10.0.2.111 2 | 2378 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | build/* 3 | .vscode/ 4 | 5 | -------------------------------------------------------------------------------- /script/clear_hugepage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sysctl -w vm.nr_hugepages=0 3 | -------------------------------------------------------------------------------- /script/hugepage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sysctl -w vm.nr_hugepages=32768 3 | -------------------------------------------------------------------------------- 
/run.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/bash 2 | 3 | if [ ! -d "build" ]; then 4 | mkdir build 5 | fi 6 | 7 | cp script/* build 8 | cd build 9 | 10 | cmake .. && make -j 11 | -------------------------------------------------------------------------------- /src/Cache.cpp: -------------------------------------------------------------------------------- 1 | #include "Cache.h" 2 | 3 | Cache::Cache(const CacheConfig &cache_config) { 4 | size = cache_config.cacheSize; 5 | data = (uint64_t)hugePageAlloc(size * define::GB); 6 | } -------------------------------------------------------------------------------- /include/Cache.h: -------------------------------------------------------------------------------- 1 | #if !defined(_CACHE_H_) 2 | #define _CACHE_H_ 3 | 4 | #include "Config.h" 5 | #include "HugePageAlloc.h" 6 | 7 | class Cache { 8 | 9 | public: 10 | Cache(const CacheConfig &cache_config); 11 | 12 | uint64_t data; 13 | uint64_t size; 14 | 15 | private: 16 | }; 17 | 18 | #endif // _CACHE_H_ 19 | -------------------------------------------------------------------------------- /script/check_nic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | net_name=`ibdev2netdev | awk '{print $5}'` 5 | ip=`ifconfig 2>>/dev/null | grep 10.0.2. 
| awk '{print $2}'` 6 | for net in $net_name; do 7 | str1=`ibdev2netdev | grep $net` 8 | str2=`cat /sys/class/net/$net/device/numa_node` 9 | echo ${ip} ${str1} numa-${str2} 10 | done 11 | # echo $net_name -------------------------------------------------------------------------------- /script/restartMemc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | addr=$(head -1 ../memcached.conf) 4 | port=$(awk 'NR==2{print}' ../memcached.conf) 5 | 6 | # kill old me 7 | ssh ${addr} "cat /tmp/memcached.pid | xargs kill" 8 | 9 | # launch memcached 10 | ssh ${addr} "memcached -u root -l ${addr} -p ${port} -c 10000 -d -P /tmp/memcached.pid" 11 | sleep 1 12 | 13 | # init 14 | echo -e "set serverNum 0 0 1\r\n0\r\nquit\r" | nc ${addr} ${port} 15 | echo -e "set clientNum 0 0 1\r\n0\r\nquit\r" | nc ${addr} ${port} 16 | -------------------------------------------------------------------------------- /include/HugePageAlloc.h: -------------------------------------------------------------------------------- 1 | #ifndef __HUGEPAGEALLOC_H__ 2 | #define __HUGEPAGEALLOC_H__ 3 | 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | 11 | char *getIP(); 12 | inline void *hugePageAlloc(size_t size) { 13 | 14 | void *res = mmap(NULL, size, PROT_READ | PROT_WRITE, 15 | MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); 16 | if (res == MAP_FAILED) { 17 | Debug::notifyError("%s mmap failed!\n", getIP()); 18 | } 19 | 20 | return res; 21 | } 22 | 23 | #endif /* __HUGEPAGEALLOC_H__ */ 24 | -------------------------------------------------------------------------------- /include/Config.h: -------------------------------------------------------------------------------- 1 | #ifndef __CONFIG_H__ 2 | #define __CONFIG_H__ 3 | 4 | #include "Common.h" 5 | 6 | class CacheConfig { 7 | public: 8 | uint32_t cacheSize; 9 | 10 | CacheConfig(uint32_t cacheSize = 1) : cacheSize(cacheSize) {} 11 | }; 12 | 13 | class DSMConfig { 14 | public: 15 | CacheConfig 
cacheConfig; 16 | uint32_t machineNR; 17 | uint64_t dsmSize; // G 18 | 19 | DSMConfig(const CacheConfig &cacheConfig = CacheConfig(), 20 | uint32_t machineNR = 2, uint64_t dsmSize = 8) 21 | : cacheConfig(cacheConfig), machineNR(machineNR), dsmSize(dsmSize) {} 22 | }; 23 | 24 | #endif /* __CONFIG_H__ */ 25 | -------------------------------------------------------------------------------- /include/Connection.h: -------------------------------------------------------------------------------- 1 | #ifndef __CONNECTION_H__ 2 | #define __CONNECTION_H__ 3 | 4 | #include "Common.h" 5 | #include "RawMessageConnection.h" 6 | 7 | #include "ThreadConnection.h" 8 | #include "DirectoryConnection.h" 9 | 10 | struct RemoteConnection { 11 | // directory 12 | uint64_t dsmBase; 13 | 14 | uint32_t dsmRKey[NR_DIRECTORY]; 15 | uint32_t dirMessageQPN[NR_DIRECTORY]; 16 | ibv_ah *appToDirAh[MAX_APP_THREAD][NR_DIRECTORY]; 17 | 18 | // cache 19 | uint64_t cacheBase; 20 | 21 | // lock memory 22 | uint64_t lockBase; 23 | uint32_t lockRKey[NR_DIRECTORY]; 24 | 25 | // app thread 26 | uint32_t appRKey[MAX_APP_THREAD]; 27 | uint32_t appMessageQPN[MAX_APP_THREAD]; 28 | ibv_ah *dirToAppAh[NR_DIRECTORY][MAX_APP_THREAD]; 29 | }; 30 | 31 | #endif /* __CONNECTION_H__ */ 32 | -------------------------------------------------------------------------------- /include/RawMessageConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef __RAWMESSAGECONNECTION_H__ 2 | #define __RAWMESSAGECONNECTION_H__ 3 | 4 | #include "AbstractMessageConnection.h" 5 | #include "GlobalAddress.h" 6 | 7 | #include 8 | 9 | enum RpcType : uint8_t { 10 | MALLOC, 11 | FREE, 12 | NEW_ROOT, 13 | NOP, 14 | }; 15 | 16 | struct RawMessage { 17 | RpcType type; 18 | 19 | uint16_t node_id; 20 | uint16_t app_id; 21 | 22 | GlobalAddress addr; // for malloc 23 | int level; 24 | } __attribute__((packed)); 25 | 26 | class RawMessageConnection : public AbstractMessageConnection { 27 | 28 | public: 29 
| RawMessageConnection(RdmaContext &ctx, ibv_cq *cq, uint32_t messageNR); 30 | 31 | void initSend(); 32 | void sendRawMessage(RawMessage *m, uint32_t remoteQPN, ibv_ah *ah); 33 | }; 34 | 35 | #endif /* __RAWMESSAGECONNECTION_H__ */ 36 | -------------------------------------------------------------------------------- /src/RawMessageConnection.cpp: -------------------------------------------------------------------------------- 1 | #include "RawMessageConnection.h" 2 | 3 | #include 4 | 5 | RawMessageConnection::RawMessageConnection(RdmaContext &ctx, ibv_cq *cq, 6 | uint32_t messageNR) 7 | : AbstractMessageConnection(IBV_QPT_UD, 0, 40, ctx, cq, messageNR) {} 8 | 9 | void RawMessageConnection::initSend() {} 10 | 11 | void RawMessageConnection::sendRawMessage(RawMessage *m, uint32_t remoteQPN, 12 | ibv_ah *ah) { 13 | 14 | if ((sendCounter & SIGNAL_BATCH) == 0 && sendCounter > 0) { 15 | ibv_wc wc; 16 | pollWithCQ(send_cq, 1, &wc); 17 | } 18 | 19 | rdmaSend(message, (uint64_t)m - sendPadding, sizeof(RawMessage) + sendPadding, 20 | messageLkey, ah, remoteQPN, (sendCounter & SIGNAL_BATCH) == 0); 21 | 22 | ++sendCounter; 23 | } 24 | -------------------------------------------------------------------------------- /include/ThreadConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef __THREADCONNECTION_H__ 2 | #define __THREADCONNECTION_H__ 3 | 4 | #include "Common.h" 5 | #include "RawMessageConnection.h" 6 | 7 | struct RemoteConnection; 8 | 9 | // app thread 10 | struct ThreadConnection { 11 | 12 | uint16_t threadID; 13 | 14 | RdmaContext ctx; 15 | ibv_cq *cq; // for one-side verbs 16 | ibv_cq *rpc_cq; 17 | 18 | RawMessageConnection *message; 19 | 20 | ibv_qp **data[NR_DIRECTORY]; 21 | 22 | ibv_mr *cacheMR; 23 | void *cachePool; 24 | uint32_t cacheLKey; 25 | RemoteConnection *remoteInfo; 26 | 27 | ThreadConnection(uint16_t threadID, void *cachePool, uint64_t cacheSize, 28 | uint32_t machineNR, RemoteConnection *remoteInfo); 
29 | 30 | void sendMessage2Dir(RawMessage *m, uint16_t node_id, uint16_t dir_id = 0); 31 | }; 32 | 33 | #endif /* __THREADCONNECTION_H__ */ 34 | -------------------------------------------------------------------------------- /include/Directory.h: -------------------------------------------------------------------------------- 1 | #ifndef __DIRECTORY_H__ 2 | #define __DIRECTORY_H__ 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "Common.h" 9 | 10 | #include "Connection.h" 11 | #include "GlobalAllocator.h" 12 | 13 | 14 | class Directory { 15 | public: 16 | Directory(DirectoryConnection *dCon, RemoteConnection *remoteInfo, 17 | uint32_t machineNR, uint16_t dirID, uint16_t nodeID); 18 | 19 | ~Directory(); 20 | 21 | private: 22 | DirectoryConnection *dCon; 23 | RemoteConnection *remoteInfo; 24 | 25 | uint32_t machineNR; 26 | uint16_t dirID; 27 | uint16_t nodeID; 28 | 29 | std::thread *dirTh; 30 | 31 | GlobalAllocator *chunckAlloc; 32 | 33 | void dirThread(); 34 | 35 | void sendData2App(const RawMessage *m); 36 | 37 | void process_message(const RawMessage *m); 38 | 39 | }; 40 | 41 | #endif /* __DIRECTORY_H__ */ 42 | -------------------------------------------------------------------------------- /include/DirectoryConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef __DIRECTORYCONNECTION_H__ 2 | #define __DIRECTORYCONNECTION_H__ 3 | 4 | #include "Common.h" 5 | #include "RawMessageConnection.h" 6 | 7 | struct RemoteConnection; 8 | 9 | // directory thread 10 | struct DirectoryConnection { 11 | uint16_t dirID; 12 | 13 | RdmaContext ctx; 14 | ibv_cq *cq; 15 | 16 | RawMessageConnection *message; 17 | 18 | ibv_qp **data2app[MAX_APP_THREAD]; 19 | 20 | ibv_mr *dsmMR; 21 | void *dsmPool; 22 | uint64_t dsmSize; 23 | uint32_t dsmLKey; 24 | 25 | ibv_mr *lockMR; 26 | void *lockPool; // address on-chip 27 | uint64_t lockSize; 28 | uint32_t lockLKey; 29 | 30 | RemoteConnection *remoteInfo; 31 | 32 | DirectoryConnection(uint16_t 
dirID, void *dsmPool, uint64_t dsmSize, 33 | uint32_t machineNR, RemoteConnection *remoteInfo); 34 | 35 | void sendMessage2App(RawMessage *m, uint16_t node_id, uint16_t th_id); 36 | }; 37 | 38 | #endif /* __DIRECTORYCONNECTION_H__ */ 39 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.5 FATAL_ERROR) 2 | project(Sherman) 3 | 4 | # disable boost warning 5 | add_definitions(-DBOOST_COROUTINES_NO_DEPRECATION_WARNING) 6 | 7 | #Compiler options 8 | set(CMAKE_C_FLAGS "-Wall -Wno-deprecated-declarations -Wsign-compare -O3 -g") 9 | set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -std=c++11") 10 | 11 | # Link Options 12 | set(LINKS_FLAGS "-lcityhash -lboost_system -lboost_coroutine -lpthread -libverbs -lmemcached") 13 | 14 | set(INCLUDE_BASE ${PROJECT_SOURCE_DIR}/include) 15 | include_directories(${INCLUDE_BASE}) 16 | 17 | #Source file define 18 | set(COMMON_SRC ${PROJECT_SOURCE_DIR}/src) 19 | 20 | #Used by both server and clients 21 | file(GLOB_RECURSE COMMON_FILE ${COMMON_SRC}/*.cpp) 22 | add_library(sherman STATIC ${COMMON_FILE}) 23 | link_libraries(sherman) 24 | 25 | file(GLOB TEST_SRC ${PROJECT_SOURCE_DIR}/test/*.cpp) 26 | foreach (TEST ${TEST_SRC}) 27 | get_filename_component(TEST_NAME ${TEST} NAME_WE) 28 | add_executable(${TEST_NAME} ${TEST}) 29 | target_link_libraries(${TEST_NAME} ${LINKS_FLAGS}) 30 | endforeach() 31 | 32 | -------------------------------------------------------------------------------- /include/GlobalAddress.h: -------------------------------------------------------------------------------- 1 | #ifndef __GLOBALADDRESS_H__ 2 | #define __GLOBALADDRESS_H__ 3 | 4 | #include "Common.h" 5 | 6 | 7 | class GlobalAddress { 8 | public: 9 | 10 | union { 11 | struct { 12 | uint64_t nodeID: 16; 13 | uint64_t offset : 48; 14 | }; 15 | uint64_t val; 16 | }; 17 | 18 | operator uint64_t() { 19 | return val; 
20 | } 21 | 22 | static GlobalAddress Null() { 23 | static GlobalAddress zero{0, 0}; 24 | return zero; 25 | }; 26 | } __attribute__((packed)); 27 | 28 | static_assert(sizeof(GlobalAddress) == sizeof(uint64_t), "XXX"); 29 | 30 | inline GlobalAddress GADD(const GlobalAddress &addr, int off) { 31 | auto ret = addr; 32 | ret.offset += off; 33 | return ret; 34 | } 35 | 36 | inline bool operator==(const GlobalAddress &lhs, const GlobalAddress &rhs) { 37 | return (lhs.nodeID == rhs.nodeID) && (lhs.offset == rhs.offset); 38 | } 39 | 40 | inline bool operator!=(const GlobalAddress &lhs, const GlobalAddress &rhs) { 41 | return !(lhs == rhs); 42 | } 43 | 44 | inline std::ostream &operator<<(std::ostream &os, const GlobalAddress &obj) { 45 | os << "[" << (int)obj.nodeID << ", " << obj.offset << "]"; 46 | return os; 47 | } 48 | 49 | #endif /* __GLOBALADDRESS_H__ */ 50 | -------------------------------------------------------------------------------- /src/Common.cpp: -------------------------------------------------------------------------------- 1 | #include "Common.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | void bindCore(uint16_t core) { 12 | 13 | cpu_set_t cpuset; 14 | CPU_ZERO(&cpuset); 15 | CPU_SET(core, &cpuset); 16 | int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); 17 | if (rc != 0) { 18 | Debug::notifyError("can't bind core!"); 19 | } 20 | } 21 | 22 | char *getIP() { 23 | struct ifreq ifr; 24 | int fd = socket(AF_INET, SOCK_DGRAM, 0); 25 | 26 | ifr.ifr_addr.sa_family = AF_INET; 27 | strncpy(ifr.ifr_name, "ib0", IFNAMSIZ - 1); 28 | 29 | ioctl(fd, SIOCGIFADDR, &ifr); 30 | close(fd); 31 | 32 | return inet_ntoa(((struct sockaddr_in*)&ifr.ifr_addr)->sin_addr); 33 | } 34 | 35 | char *getMac() { 36 | static struct ifreq ifr; 37 | int fd = socket(AF_INET, SOCK_DGRAM, 0); 38 | 39 | ifr.ifr_addr.sa_family = AF_INET; 40 | strncpy(ifr.ifr_name, "ens2", IFNAMSIZ - 1); 41 | 42 | ioctl(fd, 
SIOCGIFHWADDR, &ifr); 43 | close(fd); 44 | 45 | return (char *)ifr.ifr_hwaddr.sa_data; 46 | } 47 | 48 | -------------------------------------------------------------------------------- /include/LocalAllocator.h: -------------------------------------------------------------------------------- 1 | #if !defined(_LOCAL_ALLOC_H_) 2 | #define _LOCAL_ALLOC_H_ 3 | 4 | #include "Common.h" 5 | #include "GlobalAddress.h" 6 | 7 | #include 8 | 9 | // for fine-grained shared memory alloc 10 | // not thread safe 11 | // now it is a simple log-structure alloctor 12 | // TODO: slab-based alloctor 13 | class LocalAllocator { 14 | 15 | public: 16 | LocalAllocator() { 17 | head = GlobalAddress::Null(); 18 | cur = GlobalAddress::Null(); 19 | } 20 | 21 | GlobalAddress malloc(size_t size, bool &need_chunck, bool align = false) { 22 | 23 | if (align) { 24 | } 25 | 26 | GlobalAddress res = cur; 27 | if (log_heads.empty() || 28 | (cur.offset + size > head.offset + define::kChunkSize)) { 29 | need_chunck = true; 30 | } else { 31 | need_chunck = false; 32 | cur.offset += size; 33 | } 34 | 35 | // assert(res.addr + size <= 40 * define::GB); 36 | 37 | return res; 38 | } 39 | 40 | void set_chunck(GlobalAddress &addr) { 41 | log_heads.push_back(addr); 42 | head = cur = addr; 43 | } 44 | 45 | void free(const GlobalAddress &addr) { 46 | // TODO 47 | } 48 | 49 | private: 50 | GlobalAddress head; 51 | GlobalAddress cur; 52 | std::vector log_heads; 53 | }; 54 | 55 | #endif // _LOCAL_ALLOC_H_ 56 | -------------------------------------------------------------------------------- /include/Debug.h: -------------------------------------------------------------------------------- 1 | /*** Debug header. ***/ 2 | 3 | /** Version 1 + Functional Model Modification **/ 4 | 5 | /** Redundance check. **/ 6 | #ifndef DEBUG_HEADER 7 | #define DEBUG_HEADER 8 | 9 | /** Included files. **/ 10 | #include /* Standard I/O operations. E.g. vprintf() */ 11 | #include /* Standard argument operations. E.g. 
va_list */ 12 | #include /* Time functions. E.g. gettimeofday() */ 13 | 14 | /** Defninitions. **/ 15 | #define MAX_FORMAT_LEN 255 16 | #define DEBUG false 17 | #define TITLE false 18 | #define TIMER false 19 | #define CUR false 20 | 21 | #define DEBUG_ON true 22 | /** Classes. **/ 23 | 24 | class Debug 25 | { 26 | private: 27 | static long startTime; /* Last start time in milliseconds. */ 28 | 29 | public: 30 | static void debugTitle(const char *str); /* Print debug title string. */ 31 | static void debugItem(const char *format, ...); /* Print debug item string. */ 32 | static void debugCur(const char *format, ...); /* Print debug item string. */ 33 | static void notifyInfo(const char *format, ...); /* Print normal notification. */ 34 | static void notifyError(const char *format, ...); /* Print error information. */ 35 | }; 36 | 37 | /** Redundance check. **/ 38 | #endif 39 | -------------------------------------------------------------------------------- /include/Timer.h: -------------------------------------------------------------------------------- 1 | #if !defined(_TIMER_H_) 2 | #define _TIMER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | class Timer { 9 | public: 10 | Timer() = default; 11 | 12 | void begin() { clock_gettime(CLOCK_REALTIME, &s); } 13 | 14 | uint64_t end(uint64_t loop = 1) { 15 | this->loop = loop; 16 | clock_gettime(CLOCK_REALTIME, &e); 17 | uint64_t ns_all = 18 | (e.tv_sec - s.tv_sec) * 1000000000ull + (e.tv_nsec - s.tv_nsec); 19 | ns = ns_all / loop; 20 | 21 | return ns; 22 | } 23 | 24 | void print() { 25 | 26 | if (ns < 1000) { 27 | printf("%ldns per loop\n", ns); 28 | } else { 29 | printf("%lfus per loop\n", ns * 1.0 / 1000); 30 | } 31 | } 32 | 33 | static uint64_t get_time_ns() { 34 | timespec now; 35 | clock_gettime(CLOCK_REALTIME, &now); 36 | return 1000000000ull * now.tv_sec + now.tv_nsec; 37 | } 38 | 39 | static void sleep(uint64_t sleep_ns) { 40 | Timer clock; 41 | 42 | clock.begin(); 43 | while (true) { 44 | if 
(clock.end() >= sleep_ns) { 45 | return; 46 | } 47 | } 48 | } 49 | 50 | void end_print(uint64_t loop = 1) { 51 | end(loop); 52 | print(); 53 | } 54 | 55 | private: 56 | timespec s, e; 57 | uint64_t loop; 58 | uint64_t ns; 59 | }; 60 | 61 | #endif // _TIMER_H_ 62 | -------------------------------------------------------------------------------- /src/ThreadConnection.cpp: -------------------------------------------------------------------------------- 1 | #include "ThreadConnection.h" 2 | 3 | #include "Connection.h" 4 | 5 | ThreadConnection::ThreadConnection(uint16_t threadID, void *cachePool, 6 | uint64_t cacheSize, uint32_t machineNR, 7 | RemoteConnection *remoteInfo) 8 | : threadID(threadID), remoteInfo(remoteInfo) { 9 | createContext(&ctx); 10 | 11 | cq = ibv_create_cq(ctx.ctx, RAW_RECV_CQ_COUNT, NULL, NULL, 0); 12 | // rpc_cq = cq; 13 | rpc_cq = ibv_create_cq(ctx.ctx, RAW_RECV_CQ_COUNT, NULL, NULL, 0); 14 | 15 | message = new RawMessageConnection(ctx, rpc_cq, APP_MESSAGE_NR); 16 | 17 | this->cachePool = cachePool; 18 | cacheMR = createMemoryRegion((uint64_t)cachePool, cacheSize, &ctx); 19 | cacheLKey = cacheMR->lkey; 20 | 21 | // dir, RC 22 | for (int i = 0; i < NR_DIRECTORY; ++i) { 23 | data[i] = new ibv_qp *[machineNR]; 24 | for (size_t k = 0; k < machineNR; ++k) { 25 | createQueuePair(&data[i][k], IBV_QPT_RC, cq, &ctx); 26 | } 27 | } 28 | } 29 | 30 | void ThreadConnection::sendMessage2Dir(RawMessage *m, uint16_t node_id, 31 | uint16_t dir_id) { 32 | 33 | message->sendRawMessage(m, remoteInfo[node_id].dirMessageQPN[dir_id], 34 | remoteInfo[node_id].appToDirAh[threadID][dir_id]); 35 | } 36 | -------------------------------------------------------------------------------- /include/AbstractMessageConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef __ABSTRACTMESSAGECONNECTION_H__ 2 | #define __ABSTRACTMESSAGECONNECTION_H__ 3 | 4 | #include "Common.h" 5 | 6 | #define SIGNAL_BATCH 31 7 | 8 | class Message; 9 | 
10 | // #messageNR send pool and #messageNR message pool 11 | class AbstractMessageConnection { 12 | 13 | const static int kBatchCount = 4; 14 | 15 | protected: 16 | ibv_qp *message; // ud or raw packet 17 | uint16_t messageNR; 18 | 19 | ibv_mr *messageMR; 20 | void *messagePool; 21 | uint32_t messageLkey; 22 | 23 | uint16_t curMessage; 24 | 25 | void *sendPool; 26 | uint16_t curSend; 27 | 28 | ibv_recv_wr *recvs[kBatchCount]; 29 | ibv_sge *recv_sgl[kBatchCount]; 30 | uint32_t subNR; 31 | 32 | ibv_cq *send_cq; 33 | uint64_t sendCounter; 34 | 35 | uint16_t sendPadding; // ud: 0 36 | // rp: ? 37 | uint16_t recvPadding; // ud: 40 38 | // rp: ? 39 | 40 | public: 41 | AbstractMessageConnection(ibv_qp_type type, uint16_t sendPadding, 42 | uint16_t recvPadding, RdmaContext &ctx, ibv_cq *cq, 43 | uint32_t messageNR); 44 | 45 | void initRecv(); 46 | 47 | char *getMessage(); 48 | char *getSendPool(); 49 | 50 | uint32_t getQPN() { return message->qp_num; } 51 | }; 52 | 53 | #endif /* __ABSTRACTMESSAGECONNECTION_H__ */ 54 | -------------------------------------------------------------------------------- /test/tree_test.cpp: -------------------------------------------------------------------------------- 1 | #include "DSM.h" 2 | #include "Tree.h" 3 | 4 | int main() { 5 | 6 | DSMConfig config; 7 | config.machineNR = 2; 8 | DSM *dsm = DSM::getInstance(config); 9 | 10 | dsm->registerThread(); 11 | 12 | auto tree = new Tree(dsm); 13 | 14 | Value v; 15 | 16 | if (dsm->getMyNodeID() != 0) { 17 | while (true) 18 | ; 19 | } 20 | 21 | for (uint64_t i = 1; i < 10240; ++i) { 22 | tree->insert(i, i * 2); 23 | } 24 | 25 | for (uint64_t i = 10240 - 1; i >= 1; --i) { 26 | tree->insert(i, i * 3); 27 | } 28 | 29 | for (uint64_t i = 1; i < 10240; ++i) { 30 | auto res = tree->search(i, v); 31 | assert(res && v == i * 3); 32 | std::cout << "search result: " << res << " v: " << v << std::endl; 33 | } 34 | 35 | for (uint64_t i = 1; i < 10240; ++i) { 36 | tree->del(i); 37 | } 38 | 39 | for (uint64_t 
i = 1; i < 10240; ++i) { 40 | auto res = tree->search(i, v); 41 | std::cout << "search result: " << res << std::endl; 42 | } 43 | 44 | for (uint64_t i = 10240 - 1; i >= 1; --i) { 45 | tree->insert(i, i * 3); 46 | } 47 | 48 | for (uint64_t i = 1; i < 10240; ++i) { 49 | auto res = tree->search(i, v); 50 | assert(res && v == i * 3); 51 | std::cout << "search result: " << res << " v: " << v << std::endl; 52 | } 53 | 54 | printf("Hello\n"); 55 | 56 | while (true) 57 | ; 58 | } -------------------------------------------------------------------------------- /include/CacheEntry.h: -------------------------------------------------------------------------------- 1 | #if !defined(_CACHE_ENTRY_H_) 2 | #define _CACHE_ENTRY_H_ 3 | 4 | #include "Common.h" 5 | #include "Tree.h" 6 | 7 | struct CacheEntry { 8 | Key from; 9 | Key to; // [from, to] 10 | mutable InternalPage *ptr; 11 | } 12 | __attribute__((packed)); 13 | 14 | static_assert(sizeof(CacheEntry) == 2 * sizeof(Key) + sizeof(uint64_t), "XXX"); 15 | 16 | inline std::ostream &operator<<(std::ostream &os, const CacheEntry &obj) { 17 | os << "[" << (int)obj.from << ", " << obj.to + 1 << ")"; 18 | return os; 19 | } 20 | 21 | inline static CacheEntry Decode(const char *val) { return *(CacheEntry *)val; } 22 | 23 | struct CacheEntryComparator { 24 | typedef CacheEntry DecodedType; 25 | 26 | static DecodedType decode_key(const char *b) { return Decode(b); } 27 | 28 | int cmp(const DecodedType a_v, const DecodedType b_v) const { 29 | if (a_v.to < b_v.to) { 30 | return -1; 31 | } 32 | 33 | if (a_v.to > b_v.to) { 34 | return +1; 35 | } 36 | 37 | if (a_v.from < b_v.from) { 38 | return +1; 39 | } else if (a_v.from > b_v.from) { 40 | return -1; 41 | } else { 42 | return 0; 43 | } 44 | } 45 | 46 | int operator()(const char *a, const char *b) const { 47 | return cmp(Decode(a), Decode(b)); 48 | } 49 | 50 | int operator()(const char *a, const DecodedType b) const { 51 | return cmp(Decode(a), b); 52 | } 53 | }; 54 | 55 | #endif // 
_CACHE_ENTRY_H_ 56 | -------------------------------------------------------------------------------- /include/Keeper.h: -------------------------------------------------------------------------------- 1 | #ifndef __KEEPER__H__ 2 | #define __KEEPER__H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | #include "Config.h" 18 | #include "Debug.h" 19 | #include "Rdma.h" 20 | 21 | class Keeper { 22 | 23 | private: 24 | static const char *SERVER_NUM_KEY; 25 | 26 | uint32_t maxServer; 27 | uint16_t curServer; 28 | uint16_t myNodeID; 29 | std::string myIP; 30 | uint16_t myPort; 31 | 32 | memcached_st *memc; 33 | 34 | protected: 35 | bool connectMemcached(); 36 | bool disconnectMemcached(); 37 | void serverConnect(); 38 | void serverEnter(); 39 | virtual bool connectNode(uint16_t remoteID) = 0; 40 | 41 | 42 | public: 43 | Keeper(uint32_t maxServer = 12); 44 | ~Keeper(); 45 | 46 | uint16_t getMyNodeID() const { return this->myNodeID; } 47 | uint16_t getServerNR() const { return this->maxServer; } 48 | uint16_t getMyPort() const { return this->myPort; } 49 | 50 | std::string getMyIP() const { return this->myIP; } 51 | 52 | 53 | void memSet(const char *key, uint32_t klen, const char *val, uint32_t vlen); 54 | char *memGet(const char *key, uint32_t klen, size_t *v_size = nullptr); 55 | uint64_t memFetchAndAdd(const char *key, uint32_t klen); 56 | }; 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /.github/workflows/jekyll-gh-pages.yml: -------------------------------------------------------------------------------- 1 | # Sample workflow for building and deploying a Jekyll site to GitHub Pages 2 | name: Deploy Jekyll with GitHub Pages dependencies preinstalled 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["main"] 8 | 9 | # Allows you to run this workflow manually from the 
Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow one concurrent deployment 19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: true 22 | 23 | jobs: 24 | # Build job 25 | build: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - name: Checkout 29 | uses: actions/checkout@v3 30 | - name: Setup Pages 31 | uses: actions/configure-pages@v2 32 | - name: Build with Jekyll 33 | uses: actions/jekyll-build-pages@v1 34 | with: 35 | source: ./ 36 | destination: ./_site 37 | - name: Upload artifact 38 | uses: actions/upload-pages-artifact@v1 39 | 40 | # Deployment job 41 | deploy: 42 | environment: 43 | name: github-pages 44 | url: ${{ steps.deployment.outputs.page_url }} 45 | runs-on: ubuntu-latest 46 | needs: build 47 | steps: 48 | - name: Deploy to GitHub Pages 49 | id: deployment 50 | uses: actions/deploy-pages@v1 51 | -------------------------------------------------------------------------------- /include/GlobalAllocator.h: -------------------------------------------------------------------------------- 1 | #if !defined(_GLOBAL_ALLOCATOR_H_) 2 | #define _GLOBAL_ALLOCATOR_H_ 3 | 4 | #include "Common.h" 5 | #include "Debug.h" 6 | #include "GlobalAddress.h" 7 | 8 | #include 9 | 10 | 11 | 12 | // global allocator for coarse-grained (chunck level) alloc 13 | // used by home agent 14 | // bitmap based 15 | class GlobalAllocator { 16 | 17 | public: 18 | GlobalAllocator(const GlobalAddress &start, size_t size) 19 | : start(start), size(size) { 20 | bitmap_len = size / define::kChunkSize; 21 | bitmap = new bool[bitmap_len]; 22 | memset(bitmap, 0, bitmap_len); 23 | 24 | // null ptr 25 | bitmap[0] = true; 26 | bitmap_tail = 1; 27 | } 28 | 29 | ~GlobalAllocator() { delete[] bitmap; } 30 | 31 | GlobalAddress alloc_chunck() { 32 | 33 | GlobalAddress res = start; 34 | if (bitmap_tail >= bitmap_len) { 35 | 
assert(false); 36 | Debug::notifyError("shared memory space run out"); 37 | } 38 | 39 | if (bitmap[bitmap_tail] == false) { 40 | bitmap[bitmap_tail] = true; 41 | res.offset += bitmap_tail * define::kChunkSize; 42 | 43 | bitmap_tail++; 44 | } else { 45 | assert(false); 46 | Debug::notifyError("TODO"); 47 | } 48 | 49 | return res; 50 | } 51 | 52 | void free_chunk(const GlobalAddress &addr) { 53 | bitmap[(addr.offset - start.offset) / define::kChunkSize] = false; 54 | } 55 | 56 | private: 57 | GlobalAddress start; 58 | size_t size; 59 | 60 | bool *bitmap; 61 | size_t bitmap_len; 62 | size_t bitmap_tail; 63 | }; 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /src/rdma/Utility.cpp: -------------------------------------------------------------------------------- 1 | #include "Rdma.h" 2 | 3 | int kMaxDeviceMemorySize = 0; 4 | 5 | void rdmaQueryQueuePair(ibv_qp *qp) { 6 | struct ibv_qp_attr attr; 7 | struct ibv_qp_init_attr init_attr; 8 | ibv_query_qp(qp, &attr, IBV_QP_STATE, &init_attr); 9 | switch (attr.qp_state) { 10 | case IBV_QPS_RESET: 11 | printf("QP state: IBV_QPS_RESET\n"); 12 | break; 13 | case IBV_QPS_INIT: 14 | printf("QP state: IBV_QPS_INIT\n"); 15 | break; 16 | case IBV_QPS_RTR: 17 | printf("QP state: IBV_QPS_RTR\n"); 18 | break; 19 | case IBV_QPS_RTS: 20 | printf("QP state: IBV_QPS_RTS\n"); 21 | break; 22 | case IBV_QPS_SQD: 23 | printf("QP state: IBV_QPS_SQD\n"); 24 | break; 25 | case IBV_QPS_SQE: 26 | printf("QP state: IBV_QPS_SQE\n"); 27 | break; 28 | case IBV_QPS_ERR: 29 | printf("QP state: IBV_QPS_ERR\n"); 30 | break; 31 | case IBV_QPS_UNKNOWN: 32 | printf("QP state: IBV_QPS_UNKNOWN\n"); 33 | break; 34 | } 35 | } 36 | 37 | void checkDMSupported(struct ibv_context *ctx) { 38 | struct ibv_exp_device_attr attrs; 39 | 40 | attrs.comp_mask = IBV_EXP_DEVICE_ATTR_UMR; 41 | attrs.comp_mask |= IBV_EXP_DEVICE_ATTR_MAX_DM_SIZE; 42 | 43 | if (ibv_exp_query_device(ctx, &attrs)) { 44 | printf("Couldn't query 
device attributes\n"); 45 | } 46 | 47 | if (!(attrs.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_DM_SIZE)) { 48 | fprintf(stderr, "Can not support device memory!\n"); 49 | exit(-1); 50 | } else if (!(attrs.max_dm_size)) { 51 | } else { 52 | kMaxDeviceMemorySize = attrs.max_dm_size; 53 | printf("The RNIC has %dKB device memory\n", kMaxDeviceMemorySize / 1024); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/DirectoryConnection.cpp: -------------------------------------------------------------------------------- 1 | #include "DirectoryConnection.h" 2 | 3 | #include "Connection.h" 4 | 5 | DirectoryConnection::DirectoryConnection(uint16_t dirID, void *dsmPool, 6 | uint64_t dsmSize, uint32_t machineNR, 7 | RemoteConnection *remoteInfo) 8 | : dirID(dirID), remoteInfo(remoteInfo) { 9 | 10 | createContext(&ctx); 11 | cq = ibv_create_cq(ctx.ctx, RAW_RECV_CQ_COUNT, NULL, NULL, 0); 12 | message = new RawMessageConnection(ctx, cq, DIR_MESSAGE_NR); 13 | 14 | message->initRecv(); 15 | message->initSend(); 16 | 17 | // dsm memory 18 | this->dsmPool = dsmPool; 19 | this->dsmSize = dsmSize; 20 | this->dsmMR = createMemoryRegion((uint64_t)dsmPool, dsmSize, &ctx); 21 | this->dsmLKey = dsmMR->lkey; 22 | 23 | // on-chip lock memory 24 | if (dirID == 0) { 25 | this->lockPool = (void *)define::kLockStartAddr; 26 | this->lockSize = define::kLockChipMemSize; 27 | this->lockMR = createMemoryRegionOnChip((uint64_t)this->lockPool, 28 | this->lockSize, &ctx); 29 | this->lockLKey = lockMR->lkey; 30 | } 31 | 32 | // app, RC 33 | for (int i = 0; i < MAX_APP_THREAD; ++i) { 34 | data2app[i] = new ibv_qp *[machineNR]; 35 | for (size_t k = 0; k < machineNR; ++k) { 36 | createQueuePair(&data2app[i][k], IBV_QPT_RC, cq, &ctx); 37 | } 38 | } 39 | } 40 | 41 | void DirectoryConnection::sendMessage2App(RawMessage *m, uint16_t node_id, 42 | uint16_t th_id) { 43 | message->sendRawMessage(m, remoteInfo[node_id].appMessageQPN[th_id], 44 | 
remoteInfo[node_id].dirToAppAh[dirID][th_id]); 45 | ; 46 | } 47 | -------------------------------------------------------------------------------- /test/skiplist_test.cpp: -------------------------------------------------------------------------------- 1 | #include "third_party/inlineskiplist.h" 2 | #include "Timer.h" 3 | 4 | // Our test skip list stores 8-byte unsigned integers 5 | typedef uint64_t Key; 6 | 7 | // static const char *Encode(const uint64_t *key) { 8 | // return reinterpret_cast(key); 9 | // } 10 | 11 | static Key Decode(const char *key) { 12 | Key rv; 13 | memcpy(&rv, key, sizeof(Key)); 14 | return rv; 15 | } 16 | 17 | struct TestComparator { 18 | typedef Key DecodedType; 19 | 20 | static DecodedType decode_key(const char *b) { return Decode(b); } 21 | 22 | int operator()(const char *a, const char *b) const { 23 | if (Decode(a) < Decode(b)) { 24 | return -1; 25 | } else if (Decode(a) > Decode(b)) { 26 | return +1; 27 | } else { 28 | return 0; 29 | } 30 | } 31 | 32 | int operator()(const char *a, const DecodedType b) const { 33 | if (Decode(a) < b) { 34 | return -1; 35 | } else if (Decode(a) > b) { 36 | return +1; 37 | } else { 38 | return 0; 39 | } 40 | } 41 | }; 42 | 43 | int main() { 44 | Allocator alloc; 45 | TestComparator cmp; 46 | InlineSkipList list(cmp, &alloc, 21); 47 | 48 | InlineSkipList::Iterator iter(&list); 49 | 50 | const uint64_t Space = 100000ull; 51 | const int loop = 10000; 52 | for (uint64_t i = 0; i < Space; ++i) { 53 | auto buf = list.AllocateKey(sizeof(Key)); 54 | *(Key *)buf = i; 55 | bool res = list.InsertConcurrently(buf); 56 | (void)res; 57 | } 58 | 59 | 60 | Timer t; 61 | t.begin(); 62 | for (int i = 0; i < loop; ++i) { 63 | uint64_t k = rand() % Space; 64 | iter.Seek((char *)&k); 65 | } 66 | t.end_print(loop); 67 | 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /include/WRLock.h: -------------------------------------------------------------------------------- 1 
| #ifndef __WRLOCK_H__ 2 | #define __WRLOCK_H__ 3 | 4 | #include 5 | 6 | class WRLock { 7 | 8 | private: 9 | std::atomic l; 10 | const static uint16_t UNLOCKED = 0; 11 | const static uint16_t LOCKED = 1; 12 | 13 | public: 14 | WRLock() { init(); } 15 | 16 | bool is_unlock() { 17 | return l.load() == UNLOCKED; 18 | } 19 | 20 | void init() { l.store(UNLOCKED); } 21 | 22 | void wLock() { 23 | while (true) { 24 | while (l.load(std::memory_order_relaxed) != UNLOCKED) { 25 | ; 26 | } 27 | 28 | uint16_t f = UNLOCKED; 29 | if (l.compare_exchange_strong(f, LOCKED)) { 30 | break; 31 | } 32 | } 33 | }; 34 | 35 | bool try_wLock() { 36 | if (l.load(std::memory_order_relaxed) != UNLOCKED) 37 | return false; 38 | 39 | uint16_t f = UNLOCKED; 40 | return l.compare_exchange_strong(f, LOCKED); 41 | } 42 | 43 | void rLock() { 44 | while (true) { 45 | uint16_t v; 46 | while ((v = l.load(std::memory_order_relaxed)) == LOCKED) { 47 | ; 48 | } 49 | 50 | uint16_t b = v + 2; 51 | 52 | if (l.compare_exchange_strong(v, b)) { 53 | break; 54 | } 55 | } 56 | } 57 | 58 | bool try_rLock() { 59 | retry: 60 | uint16_t v = l.load(std::memory_order_relaxed); 61 | if (v == LOCKED) 62 | return false; 63 | 64 | uint16_t b = v + 2; 65 | if (!l.compare_exchange_strong(v, b)) { 66 | goto retry; // concurrent reader; 67 | } 68 | 69 | return true; 70 | } 71 | 72 | void rUnlock() { 73 | while (true) { 74 | uint16_t v = l.load(); 75 | uint16_t b = v - 2; 76 | 77 | if (l.compare_exchange_strong(v, b)) { 78 | break; 79 | } 80 | } 81 | } 82 | 83 | void wUnlock() { l.store(UNLOCKED, std::memory_order_release); } 84 | }; 85 | 86 | #endif /* __FAIRWRLOCK_H__ */ 87 | -------------------------------------------------------------------------------- /src/Directory.cpp: -------------------------------------------------------------------------------- 1 | #include "Directory.h" 2 | #include "Common.h" 3 | 4 | #include "Connection.h" 5 | 6 | GlobalAddress g_root_ptr = GlobalAddress::Null(); 7 | int g_root_level = -1; 8 | 
bool enable_cache; 9 | 10 | Directory::Directory(DirectoryConnection *dCon, RemoteConnection *remoteInfo, 11 | uint32_t machineNR, uint16_t dirID, uint16_t nodeID) 12 | : dCon(dCon), remoteInfo(remoteInfo), machineNR(machineNR), dirID(dirID), 13 | nodeID(nodeID), dirTh(nullptr) { 14 | 15 | { // chunck alloctor 16 | GlobalAddress dsm_start; 17 | uint64_t per_directory_dsm_size = dCon->dsmSize / NR_DIRECTORY; 18 | dsm_start.nodeID = nodeID; 19 | dsm_start.offset = per_directory_dsm_size * dirID; 20 | chunckAlloc = new GlobalAllocator(dsm_start, per_directory_dsm_size); 21 | } 22 | 23 | dirTh = new std::thread(&Directory::dirThread, this); 24 | } 25 | 26 | Directory::~Directory() { delete chunckAlloc; } 27 | 28 | void Directory::dirThread() { 29 | 30 | bindCore(23 - dirID); 31 | Debug::notifyInfo("thread %d in memory nodes runs...\n", dirID); 32 | 33 | while (true) { 34 | struct ibv_wc wc; 35 | pollWithCQ(dCon->cq, 1, &wc); 36 | 37 | switch (int(wc.opcode)) { 38 | case IBV_WC_RECV: // control message 39 | { 40 | 41 | auto *m = (RawMessage *)dCon->message->getMessage(); 42 | 43 | process_message(m); 44 | 45 | break; 46 | } 47 | case IBV_WC_RDMA_WRITE: { 48 | break; 49 | } 50 | case IBV_WC_RECV_RDMA_WITH_IMM: { 51 | 52 | break; 53 | } 54 | default: 55 | assert(false); 56 | } 57 | } 58 | } 59 | 60 | void Directory::process_message(const RawMessage *m) { 61 | 62 | RawMessage *send = nullptr; 63 | switch (m->type) { 64 | case RpcType::MALLOC: { 65 | 66 | send = (RawMessage *)dCon->message->getSendPool(); 67 | 68 | send->addr = chunckAlloc->alloc_chunck(); 69 | break; 70 | } 71 | 72 | case RpcType::NEW_ROOT: { 73 | 74 | if (g_root_level < m->level) { 75 | g_root_ptr = m->addr; 76 | g_root_level = m->level; 77 | if (g_root_level >= 3) { 78 | enable_cache = true; 79 | } 80 | } 81 | 82 | break; 83 | } 84 | 85 | default: 86 | assert(false); 87 | } 88 | 89 | if (send) { 90 | dCon->sendMessage2App(send, m->node_id, m->app_id); 91 | } 92 | } 
-------------------------------------------------------------------------------- /include/DSMKeeper.h: -------------------------------------------------------------------------------- 1 | #ifndef __LINEAR_KEEPER__H__ 2 | #define __LINEAR_KEEPER__H__ 3 | 4 | #include 5 | 6 | #include "Keeper.h" 7 | 8 | struct ThreadConnection; 9 | struct DirectoryConnection; 10 | struct CacheAgentConnection; 11 | struct RemoteConnection; 12 | 13 | struct ExPerThread { 14 | uint16_t lid; 15 | uint8_t gid[16]; 16 | 17 | uint32_t rKey; 18 | 19 | uint32_t lock_rkey; //for directory on-chip memory 20 | } __attribute__((packed)); 21 | 22 | struct ExchangeMeta { 23 | uint64_t dsmBase; 24 | uint64_t cacheBase; 25 | uint64_t lockBase; 26 | 27 | ExPerThread appTh[MAX_APP_THREAD]; 28 | ExPerThread dirTh[NR_DIRECTORY]; 29 | 30 | uint32_t appUdQpn[MAX_APP_THREAD]; 31 | uint32_t dirUdQpn[NR_DIRECTORY]; 32 | 33 | uint32_t appRcQpn2dir[MAX_APP_THREAD][NR_DIRECTORY]; 34 | 35 | uint32_t dirRcQpn2app[NR_DIRECTORY][MAX_APP_THREAD]; 36 | 37 | } __attribute__((packed)); 38 | 39 | class DSMKeeper : public Keeper { 40 | 41 | private: 42 | static const char *OK; 43 | static const char *ServerPrefix; 44 | 45 | ThreadConnection **thCon; 46 | DirectoryConnection **dirCon; 47 | RemoteConnection *remoteCon; 48 | 49 | ExchangeMeta localMeta; 50 | 51 | std::vector serverList; 52 | 53 | std::string setKey(uint16_t remoteID) { 54 | return std::to_string(getMyNodeID()) + "-" + std::to_string(remoteID); 55 | } 56 | 57 | std::string getKey(uint16_t remoteID) { 58 | return std::to_string(remoteID) + "-" + std::to_string(getMyNodeID()); 59 | } 60 | 61 | void initLocalMeta(); 62 | 63 | void connectMySelf(); 64 | void initRouteRule(); 65 | 66 | void setDataToRemote(uint16_t remoteID); 67 | void setDataFromRemote(uint16_t remoteID, ExchangeMeta *remoteMeta); 68 | 69 | protected: 70 | virtual bool connectNode(uint16_t remoteID) override; 71 | 72 | public: 73 | DSMKeeper(ThreadConnection **thCon, DirectoryConnection **dirCon, 
RemoteConnection *remoteCon, 74 | uint32_t maxServer = 12) 75 | : Keeper(maxServer), thCon(thCon), dirCon(dirCon), 76 | remoteCon(remoteCon) { 77 | 78 | initLocalMeta(); 79 | 80 | if (!connectMemcached()) { 81 | return; 82 | } 83 | serverEnter(); 84 | 85 | serverConnect(); 86 | connectMySelf(); 87 | 88 | initRouteRule(); 89 | } 90 | 91 | ~DSMKeeper() { disconnectMemcached(); } 92 | void barrier(const std::string &barrierKey); 93 | uint64_t sum(const std::string &sum_key, uint64_t value); 94 | }; 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /include/RdmaBuffer.h: -------------------------------------------------------------------------------- 1 | #if !defined(_RDMA_BUFFER_H_) 2 | #define _RDMA_BUFFER_H_ 3 | 4 | #include "Common.h" 5 | 6 | // abstract rdma registered buffer 7 | class RdmaBuffer { 8 | 9 | private: 10 | static const int kPageBufferCnt = 8; // async, buffer safty 11 | static const int kSiblingBufferCnt = 8; // async, buffer safty 12 | static const int kCasBufferCnt = 8; // async, buffer safty 13 | 14 | char *buffer; 15 | 16 | uint64_t *cas_buffer; 17 | uint64_t *unlock_buffer; 18 | uint64_t *zero_64bit; 19 | char *page_buffer; 20 | char *sibling_buffer; 21 | char *entry_buffer; 22 | 23 | int page_buffer_cur; 24 | int sibling_buffer_cur; 25 | int cas_buffer_cur; 26 | 27 | int kPageSize; 28 | 29 | public: 30 | RdmaBuffer(char *buffer) { 31 | set_buffer(buffer); 32 | 33 | page_buffer_cur = 0; 34 | sibling_buffer_cur = 0; 35 | cas_buffer_cur = 0; 36 | } 37 | 38 | RdmaBuffer() = default; 39 | 40 | void set_buffer(char *buffer) { 41 | 42 | // printf("set buffer %p\n", buffer); 43 | 44 | kPageSize = std::max(kLeafPageSize, kInternalPageSize); 45 | this->buffer = buffer; 46 | cas_buffer = (uint64_t *)buffer; 47 | unlock_buffer = 48 | (uint64_t *)((char *)cas_buffer + sizeof(uint64_t) * kCasBufferCnt); 49 | zero_64bit = (uint64_t *)((char *)unlock_buffer + sizeof(uint64_t)); 50 | page_buffer = (char 
*)zero_64bit + sizeof(uint64_t); 51 | sibling_buffer = (char *)page_buffer + kPageSize * kPageBufferCnt; 52 | entry_buffer = (char *)sibling_buffer + kPageSize * kSiblingBufferCnt; 53 | *zero_64bit = 0; 54 | 55 | assert((char *)zero_64bit + 8 - buffer < define::kPerCoroRdmaBuf); 56 | } 57 | 58 | uint64_t *get_cas_buffer() { 59 | cas_buffer_cur = (cas_buffer_cur + 1) % kCasBufferCnt; 60 | return cas_buffer + cas_buffer_cur; 61 | } 62 | 63 | uint64_t *get_unlock_buffer() const { return unlock_buffer; } 64 | 65 | uint64_t *get_zero_64bit() const { return zero_64bit; } 66 | 67 | char *get_page_buffer() { 68 | page_buffer_cur = (page_buffer_cur + 1) % kPageBufferCnt; 69 | return page_buffer + (page_buffer_cur * kPageSize); 70 | } 71 | 72 | char *get_range_buffer() { 73 | return page_buffer; 74 | } 75 | 76 | char *get_sibling_buffer() { 77 | sibling_buffer_cur = (sibling_buffer_cur + 1) % kSiblingBufferCnt; 78 | return sibling_buffer + (sibling_buffer_cur * kPageSize); 79 | } 80 | 81 | char *get_entry_buffer() const { return entry_buffer; } 82 | }; 83 | 84 | #endif // _RDMA_BUFFER_H_ 85 | -------------------------------------------------------------------------------- /src/AbstractMessageConnection.cpp: -------------------------------------------------------------------------------- 1 | #include "AbstractMessageConnection.h" 2 | 3 | AbstractMessageConnection::AbstractMessageConnection( 4 | ibv_qp_type type, uint16_t sendPadding, uint16_t recvPadding, 5 | RdmaContext &ctx, ibv_cq *cq, uint32_t messageNR) 6 | : messageNR(messageNR), curMessage(0), curSend(0), sendCounter(0), 7 | sendPadding(sendPadding), recvPadding(recvPadding) { 8 | 9 | assert(messageNR % kBatchCount == 0); 10 | 11 | send_cq = ibv_create_cq(ctx.ctx, 128, NULL, NULL, 0); 12 | 13 | createQueuePair(&message, type, send_cq, cq, &ctx); 14 | modifyUDtoRTS(message, &ctx); 15 | 16 | messagePool = hugePageAlloc(2 * messageNR * MESSAGE_SIZE); 17 | messageMR = createMemoryRegion((uint64_t)messagePool, 18 | 2 * 
messageNR * MESSAGE_SIZE, &ctx); 19 | sendPool = (char *)messagePool + messageNR * MESSAGE_SIZE; 20 | messageLkey = messageMR->lkey; 21 | } 22 | 23 | void AbstractMessageConnection::initRecv() { 24 | subNR = messageNR / kBatchCount; 25 | 26 | for (int i = 0; i < kBatchCount; ++i) { 27 | recvs[i] = new ibv_recv_wr[subNR]; 28 | recv_sgl[i] = new ibv_sge[subNR]; 29 | } 30 | 31 | for (int k = 0; k < kBatchCount; ++k) { 32 | for (size_t i = 0; i < subNR; ++i) { 33 | auto &s = recv_sgl[k][i]; 34 | memset(&s, 0, sizeof(s)); 35 | 36 | s.addr = (uint64_t)messagePool + (k * subNR + i) * MESSAGE_SIZE; 37 | s.length = MESSAGE_SIZE; 38 | s.lkey = messageLkey; 39 | 40 | auto &r = recvs[k][i]; 41 | memset(&r, 0, sizeof(r)); 42 | 43 | r.sg_list = &s; 44 | r.num_sge = 1; 45 | r.next = (i == subNR - 1) ? NULL : &recvs[k][i + 1]; 46 | } 47 | } 48 | 49 | struct ibv_recv_wr *bad; 50 | for (int i = 0; i < kBatchCount; ++i) { 51 | if (ibv_post_recv(message, &recvs[i][0], &bad)) { 52 | Debug::notifyError("Receive failed."); 53 | } 54 | } 55 | } 56 | 57 | char *AbstractMessageConnection::getMessage() { 58 | struct ibv_recv_wr *bad; 59 | char *m = (char *)messagePool + curMessage * MESSAGE_SIZE + recvPadding; 60 | 61 | ADD_ROUND(curMessage, messageNR); 62 | 63 | if (curMessage % subNR == 0) { 64 | if (ibv_post_recv( 65 | message, 66 | &recvs[(curMessage / subNR - 1 + kBatchCount) % kBatchCount][0], 67 | &bad)) { 68 | Debug::notifyError("Receive failed."); 69 | } 70 | } 71 | 72 | return m; 73 | } 74 | 75 | char *AbstractMessageConnection::getSendPool() { 76 | char *s = (char *)sendPool + curSend * MESSAGE_SIZE + sendPadding; 77 | 78 | ADD_ROUND(curSend, messageNR); 79 | 80 | return s; 81 | } 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sherman: A Write-Optimized Distributed B+Tree Index on Disaggregated Memory 2 | 3 | Sherman is a B+Tree on disaggregated 
memory; it uses one-sided RDMA verbs to perform all index operations. 4 | Sherman includes three techniques to boost write performance: 5 | 6 | - A hierarchical locks leveraging on-chip memory of RDMA NICs. 7 | - Coalescing dependent RDMA commands 8 | - Two-level version layout in leaf nodes 9 | 10 | For more details, please refer to our [paper](https://dl.acm.org/doi/abs/10.1145/3514221.3517824): 11 | 12 | [**SIGMOG'22**] Sherman: A Write-Optimized Distributed B+Tree Index on Disaggregated Memory. Qing Wang and Youyou Lu and Jiwu Shu. 13 | 14 | 15 | ## Update (2024.10) 16 | Please use [Deft](https://github.com/thustorage/deft) for evaluation, which improving Sherman in performance and correct synchronization. 17 | 18 | 19 | ## System Requirements 20 | 21 | 1. Mellanox ConnectX-5 NICs and above 22 | 2. RDMA Driver: MLNX_OFED_LINUX-4.7-3.2.9.0 (If you use MLNX_OFED_LINUX-5**, you should modify codes to resolve interface incompatibility) 23 | 3. NIC Firmware: version 16.26.4012 and above (to support on-chip memory, you can use `ibstat` to obtain the version) 24 | 4. memcached (to exchange QP information) 25 | 5. cityhash 26 | 6. boost 1.53 (to support `boost::coroutines::symmetric_coroutine`) 27 | 28 | ## Setup about RDMA Network 29 | 30 | **1. RDMA NIC Selection.** 31 | 32 | You can modify this line according the RDMA NIC you want to use, where `ibv_get_device_name(deviceList[i])` is the name of RNIC (e.g., mlx5_0) 33 | https://github.com/thustorage/Sherman/blob/9bb950887cd066ebf4f906edbb43bae8e728548d/src/rdma/Resource.cpp#L28 34 | 35 | **2. Gid Selection.** 36 | 37 | If you use RoCE, modify `gidIndex` in this line according to the shell command `show_gids`, which is usually 3. 38 | https://github.com/thustorage/Sherman/blob/c5ee9d85e090006df39c0afe025c8f54756a7aea/include/Rdma.h#L60 39 | 40 | **3. 
MTU Selection.** 41 | 42 | If you use RoCE and the MTU of your NIC is not equal to 4200 (check with `ifconfig`), modify the value `path_mtu` in `src/rdma/StateTrans.cpp` 43 | 44 | **4. On-Chip Memory Size Selection.** 45 | 46 | Change the constant ``kLockChipMemSize`` in `include/Commmon.h`, making it <= max size of on-chip memory. 47 | 48 | ## Getting Started 49 | 50 | - `cd Sherman` 51 | - `./script/hugepage.sh` to request huge pages from OS (use `./script/clear_hugepage.sh` to return huge pages) 52 | - `mkdir build; cd build; cmake ..; make -j` 53 | - `cp ../script/restartMemc.sh .` 54 | - configure `../memcached.conf`, where the 1st line is memcached IP, the 2nd is memcached port 55 | 56 | For each run with `kNodeCount` servers: 57 | - `./restartMemc.sh` (to initialize memcached server) 58 | - In each server, execute `./benchmark kNodeCount kReadRatio kThreadCount` 59 | 60 | > We emulate each server as one compute node and one memory node: In each server, as the compute node, 61 | we launch `kThreadCount` client threads; as the memory node, we launch one memory thread. `kReadRatio` is the ratio of `get` operations. 62 | 63 | > In `./test/benchmark.cpp`, we can modify `kKeySpace` and `zipfan`, to generate different workloads. 64 | > In addition, we can open the macro `USE_CORO` to bind `kCoroCnt` coroutine on each client thread. 65 | 66 | ## Known bugs 67 | 68 | - The two-level version may induce inconsistency in some concurrent cases. Refer to [this SIGMOD'23 paper](https://dl.acm.org/doi/10.1145/3589276) 69 | 70 | ## TODO 71 | - Re-write `delete` operations 72 | -------------------------------------------------------------------------------- /src/Debug.cpp: -------------------------------------------------------------------------------- 1 | /*** Debug source. ***/ 2 | 3 | /** Version 1 + Functional Model Modification **/ 4 | 5 | /** Included files. **/ 6 | #include "Debug.h" 7 | 8 | /** Implemented functions. **/ 9 | /* Print debug title string. 
10 | @param str String of debug title. */ 11 | void Debug::debugTitle(const char *str) { 12 | if (TITLE == true) /* If debug option is set. */ 13 | printf("\033[0;45;1m%s\033[0m\n", str); /* Print debug title string. */ 14 | } 15 | 16 | /* Print debug item string. Can be used in a formatted style like a printf(). 17 | @param format Format of debug item. Same as printf(). 18 | POTENTIALPROBLEM: the length of format can not exceed 19 | MAX_FORMAT_LEN - 1, but there is no check. 20 | @param ... Other argument variables to print. Same as printf(). */ 21 | void Debug::debugItem(const char *format, ...) { 22 | char newFormat[MAX_FORMAT_LEN]; 23 | 24 | va_list args; 25 | va_start(args, format); /* Start of variable arguments. */ 26 | 27 | if (DEBUG_ON == true) /* If debug option is set. */ 28 | { 29 | sprintf(newFormat, "\033[0;42;1m%s\033[0m\n", 30 | format); /* Wrap format in a style. */ 31 | vprintf(newFormat, args); /* Print string of debug item. */ 32 | } 33 | 34 | va_end(args); /* End of variable arguments. */ 35 | } 36 | 37 | void Debug::debugCur(const char *format, ...) { 38 | char newFormat[MAX_FORMAT_LEN]; 39 | 40 | va_list args; 41 | va_start(args, format); /* Start of variable arguments. */ 42 | 43 | if (CUR == true) /* If debug option is set. */ 44 | { 45 | sprintf(newFormat, "%s\n", format); /* Wrap format in a style. */ 46 | vprintf(newFormat, args); /* Print string of debug item. */ 47 | } 48 | 49 | va_end(args); /* End of variable arguments. */ 50 | } 51 | /* Print necessary information at start period. Can be used in a formatted style 52 | like a printf(). 53 | @param format Format of debug item. Same as printf(). 54 | POTENTIALPROBLEM: the length of format can not exceed 55 | MAX_FORMAT_LEN - 1, but there is no check. 56 | @param ... Other argument variables to print. Same as printf(). */ 57 | void Debug::notifyInfo(const char *format, ...) 
{ 58 | char newFormat[MAX_FORMAT_LEN]; 59 | 60 | va_list args; 61 | va_start(args, format); /* Start of variable arguments. */ 62 | sprintf(newFormat, "\033[4m%s\033[0m\n", 63 | format); /* Wrap format in a style. */ 64 | vprintf(newFormat, args); /* Print string of notify information. */ 65 | va_end(args); /* End of variable arguments. */ 66 | } 67 | 68 | /* Print error information at start period. Can be used in a formatted style 69 | like a printf(). 70 | @param format Format of debug item. Same as printf(). 71 | POTENTIALPROBLEM: the length of format can not exceed 72 | MAX_FORMAT_LEN - 1, but there is no check. 73 | @param ... Other argument variables to print. Same as printf(). */ 74 | void Debug::notifyError(const char *format, ...) { 75 | char newFormat[MAX_FORMAT_LEN]; 76 | 77 | va_list args; 78 | va_start(args, format); /* Start of variable arguments. */ 79 | sprintf(newFormat, "\033[0;31m%s\033[0m\n", 80 | format); /* Wrap format in a style. */ 81 | vprintf(newFormat, args); /* Print string of notify information. */ 82 | va_end(args); /* End of variable arguments. 
*/ 83 | } 84 | 85 | -------------------------------------------------------------------------------- /include/Common.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_H__ 2 | #define __COMMON_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include "Debug.h" 14 | #include "HugePageAlloc.h" 15 | #include "Rdma.h" 16 | 17 | #include "WRLock.h" 18 | 19 | // CONFIG_ENABLE_EMBEDDING_LOCK and CONFIG_ENABLE_CRC 20 | // **cannot** be ON at the same time 21 | 22 | // #define CONFIG_ENABLE_EMBEDDING_LOCK 23 | // #define CONFIG_ENABLE_CRC 24 | 25 | #define LATENCY_WINDOWS 1000000 26 | 27 | #define STRUCT_OFFSET(type, field) \ 28 | (char *)&((type *)(0))->field - (char *)((type *)(0)) 29 | 30 | #define MAX_MACHINE 8 31 | 32 | #define ADD_ROUND(x, n) ((x) = ((x) + 1) % (n)) 33 | 34 | #define MESSAGE_SIZE 96 // byte 35 | 36 | #define POST_RECV_PER_RC_QP 128 37 | 38 | #define RAW_RECV_CQ_COUNT 128 39 | 40 | // { app thread 41 | #define MAX_APP_THREAD 26 42 | 43 | #define APP_MESSAGE_NR 96 44 | 45 | // } 46 | 47 | // { dir thread 48 | #define NR_DIRECTORY 1 49 | 50 | #define DIR_MESSAGE_NR 128 51 | // } 52 | 53 | void bindCore(uint16_t core); 54 | char *getIP(); 55 | char *getMac(); 56 | 57 | inline int bits_in(std::uint64_t u) { 58 | auto bs = std::bitset<64>(u); 59 | return bs.count(); 60 | } 61 | 62 | #include 63 | 64 | using CoroYield = boost::coroutines::symmetric_coroutine::yield_type; 65 | using CoroCall = boost::coroutines::symmetric_coroutine::call_type; 66 | 67 | struct CoroContext { 68 | CoroYield *yield; 69 | CoroCall *master; 70 | int coro_id; 71 | }; 72 | 73 | namespace define { 74 | 75 | constexpr uint64_t MB = 1024ull * 1024; 76 | constexpr uint64_t GB = 1024ull * MB; 77 | constexpr uint16_t kCacheLineSize = 64; 78 | 79 | // for remote allocate 80 | constexpr uint64_t kChunkSize = 32 * MB; 81 | 82 | // for store root pointer 83 | constexpr 
uint64_t kRootPointerStoreOffest = kChunkSize / 2; 84 | static_assert(kRootPointerStoreOffest % sizeof(uint64_t) == 0, "XX"); 85 | 86 | // lock on-chip memory 87 | constexpr uint64_t kLockStartAddr = 0; 88 | constexpr uint64_t kLockChipMemSize = 256 * 1024; 89 | 90 | // number of locks 91 | // we do not use 16-bit locks, since 64-bit locks can provide enough concurrency. 92 | // if you want to use 16-bit locks, call *cas_dm_mask* 93 | constexpr uint64_t kNumOfLock = kLockChipMemSize / sizeof(uint64_t); 94 | 95 | // level of tree 96 | constexpr uint64_t kMaxLevelOfTree = 7; 97 | 98 | constexpr uint16_t kMaxCoro = 8; 99 | constexpr int64_t kPerCoroRdmaBuf = 128 * 1024; 100 | 101 | constexpr uint8_t kMaxHandOverTime = 8; 102 | 103 | constexpr int kIndexCacheSize = 1000; // MB 104 | } // namespace define 105 | 106 | static inline unsigned long long asm_rdtsc(void) { 107 | unsigned hi, lo; 108 | __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); 109 | return ((unsigned long long)lo) | (((unsigned long long)hi) << 32); 110 | } 111 | 112 | // For Tree 113 | using Key = uint64_t; 114 | using Value = uint64_t; 115 | constexpr Key kKeyMin = std::numeric_limits::min(); 116 | constexpr Key kKeyMax = std::numeric_limits::max(); 117 | constexpr Value kValueNull = 0; 118 | 119 | // Note: our RNICs can read 1KB data in increasing address order (but not for 4KB) 120 | constexpr uint32_t kInternalPageSize = 1024; 121 | constexpr uint32_t kLeafPageSize = 1024; 122 | 123 | __inline__ unsigned long long rdtsc(void) { 124 | unsigned hi, lo; 125 | __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); 126 | return ((unsigned long long)lo) | (((unsigned long long)hi) << 32); 127 | } 128 | 129 | inline void mfence() { asm volatile("mfence" ::: "memory"); } 130 | 131 | inline void compiler_barrier() { asm volatile("" ::: "memory"); } 132 | 133 | #endif /* __COMMON_H__ */ 134 | -------------------------------------------------------------------------------- /src/Keeper.cpp: 
-------------------------------------------------------------------------------- 1 | #include "Keeper.h" 2 | #include 3 | #include 4 | #include 5 | 6 | char *getIP(); 7 | 8 | std::string trim(const std::string &s) { 9 | std::string res = s; 10 | if (!res.empty()) { 11 | res.erase(0, res.find_first_not_of(" ")); 12 | res.erase(res.find_last_not_of(" ") + 1); 13 | } 14 | return res; 15 | } 16 | 17 | const char *Keeper::SERVER_NUM_KEY = "serverNum"; 18 | 19 | Keeper::Keeper(uint32_t maxServer) 20 | : maxServer(maxServer), curServer(0), memc(NULL) {} 21 | 22 | Keeper::~Keeper() { 23 | // listener.detach(); 24 | 25 | disconnectMemcached(); 26 | } 27 | 28 | bool Keeper::connectMemcached() { 29 | memcached_server_st *servers = NULL; 30 | memcached_return rc; 31 | 32 | std::ifstream conf("../memcached.conf"); 33 | 34 | if (!conf) { 35 | fprintf(stderr, "can't open memcached.conf\n"); 36 | return false; 37 | } 38 | 39 | std::string addr, port; 40 | std::getline(conf, addr); 41 | std::getline(conf, port); 42 | 43 | memc = memcached_create(NULL); 44 | servers = memcached_server_list_append(servers, trim(addr).c_str(), 45 | std::stoi(trim(port)), &rc); 46 | rc = memcached_server_push(memc, servers); 47 | 48 | if (rc != MEMCACHED_SUCCESS) { 49 | fprintf(stderr, "Counld't add server:%s\n", memcached_strerror(memc, rc)); 50 | sleep(1); 51 | return false; 52 | } 53 | 54 | memcached_behavior_set(memc, MEMCACHED_BEHAVIOR_BINARY_PROTOCOL, 1); 55 | return true; 56 | } 57 | 58 | bool Keeper::disconnectMemcached() { 59 | if (memc) { 60 | memcached_quit(memc); 61 | memcached_free(memc); 62 | memc = NULL; 63 | } 64 | return true; 65 | } 66 | 67 | void Keeper::serverEnter() { 68 | memcached_return rc; 69 | uint64_t serverNum; 70 | 71 | while (true) { 72 | rc = memcached_increment(memc, SERVER_NUM_KEY, strlen(SERVER_NUM_KEY), 1, 73 | &serverNum); 74 | if (rc == MEMCACHED_SUCCESS) { 75 | 76 | myNodeID = serverNum - 1; 77 | 78 | printf("I am server %d\n", myNodeID); 79 | return; 80 | } 81 | 
fprintf(stderr, "Server %d Counld't incr value and get ID: %s, retry...\n", 82 | myNodeID, memcached_strerror(memc, rc)); 83 | usleep(10000); 84 | } 85 | } 86 | 87 | void Keeper::serverConnect() { 88 | 89 | size_t l; 90 | uint32_t flags; 91 | memcached_return rc; 92 | 93 | while (curServer < maxServer) { 94 | char *serverNumStr = memcached_get(memc, SERVER_NUM_KEY, 95 | strlen(SERVER_NUM_KEY), &l, &flags, &rc); 96 | if (rc != MEMCACHED_SUCCESS) { 97 | fprintf(stderr, "Server %d Counld't get serverNum: %s, retry\n", myNodeID, 98 | memcached_strerror(memc, rc)); 99 | continue; 100 | } 101 | uint32_t serverNum = atoi(serverNumStr); 102 | free(serverNumStr); 103 | 104 | // /connect server K 105 | for (size_t k = curServer; k < serverNum; ++k) { 106 | if (k != myNodeID) { 107 | connectNode(k); 108 | printf("I connect server %zu\n", k); 109 | } 110 | } 111 | curServer = serverNum; 112 | } 113 | } 114 | 115 | void Keeper::memSet(const char *key, uint32_t klen, const char *val, 116 | uint32_t vlen) { 117 | 118 | memcached_return rc; 119 | while (true) { 120 | rc = memcached_set(memc, key, klen, val, vlen, (time_t)0, (uint32_t)0); 121 | if (rc == MEMCACHED_SUCCESS) { 122 | break; 123 | } 124 | usleep(400); 125 | } 126 | } 127 | 128 | char *Keeper::memGet(const char *key, uint32_t klen, size_t *v_size) { 129 | 130 | size_t l; 131 | char *res; 132 | uint32_t flags; 133 | memcached_return rc; 134 | 135 | while (true) { 136 | 137 | res = memcached_get(memc, key, klen, &l, &flags, &rc); 138 | if (rc == MEMCACHED_SUCCESS) { 139 | break; 140 | } 141 | usleep(400 * myNodeID); 142 | } 143 | 144 | if (v_size != nullptr) { 145 | *v_size = l; 146 | } 147 | 148 | return res; 149 | } 150 | 151 | uint64_t Keeper::memFetchAndAdd(const char *key, uint32_t klen) { 152 | uint64_t res; 153 | while (true) { 154 | memcached_return rc = memcached_increment(memc, key, klen, 1, &res); 155 | if (rc == MEMCACHED_SUCCESS) { 156 | return res; 157 | } 158 | usleep(10000); 159 | } 160 | } 161 | 
-------------------------------------------------------------------------------- /include/Rdma.h: -------------------------------------------------------------------------------- 1 | #ifndef _RDMA_H__ 2 | #define _RDMA_H__ 3 | 4 | #define forceinline inline __attribute__((always_inline)) 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | #include "Debug.h" 17 | 18 | #define DCT_ACCESS_KEY 3185 19 | #define UD_PKEY 0x11111111 20 | #define PSN 3185 21 | 22 | constexpr int kOroMax = 3; 23 | struct RdmaOpRegion { 24 | uint64_t source; 25 | uint64_t dest; 26 | uint64_t size; 27 | 28 | uint32_t lkey; 29 | union { 30 | uint32_t remoteRKey; 31 | bool is_on_chip; 32 | }; 33 | }; 34 | 35 | extern int kMaxDeviceMemorySize; 36 | 37 | struct RdmaContext { 38 | uint8_t devIndex; 39 | uint8_t port; 40 | int gidIndex; 41 | 42 | ibv_context *ctx; 43 | ibv_pd *pd; 44 | 45 | uint16_t lid; 46 | union ibv_gid gid; 47 | 48 | RdmaContext() : ctx(NULL), pd(NULL) {} 49 | }; 50 | 51 | struct Region { 52 | uint64_t source; 53 | uint32_t size; 54 | 55 | uint64_t dest; 56 | }; 57 | 58 | //// Resource.cpp 59 | bool createContext(RdmaContext *context, uint8_t port = 1, int gidIndex = 3, 60 | uint8_t devIndex = 0); 61 | bool destoryContext(RdmaContext *context); 62 | 63 | ibv_mr *createMemoryRegion(uint64_t mm, uint64_t mmSize, RdmaContext *ctx); 64 | ibv_mr *createMemoryRegionOnChip(uint64_t mm, uint64_t mmSize, 65 | RdmaContext *ctx); 66 | 67 | bool createQueuePair(ibv_qp **qp, ibv_qp_type mode, ibv_cq *cq, 68 | RdmaContext *context, uint32_t qpsMaxDepth = 128, 69 | uint32_t maxInlineData = 0); 70 | 71 | bool createQueuePair(ibv_qp **qp, ibv_qp_type mode, ibv_cq *send_cq, 72 | ibv_cq *recv_cq, RdmaContext *context, 73 | uint32_t qpsMaxDepth = 128, uint32_t maxInlineData = 0); 74 | 75 | bool createDCTarget(ibv_exp_dct **dct, ibv_cq *cq, RdmaContext *context, 76 | uint32_t qpsMaxDepth = 128, uint32_t maxInlineData = 0); 
77 | void fillAhAttr(ibv_ah_attr *attr, uint32_t remoteLid, uint8_t *remoteGid, 78 | RdmaContext *context); 79 | 80 | //// StateTrans.cpp 81 | bool modifyQPtoInit(struct ibv_qp *qp, RdmaContext *context); 82 | bool modifyQPtoRTR(struct ibv_qp *qp, uint32_t remoteQPN, uint16_t remoteLid, 83 | uint8_t *gid, RdmaContext *context); 84 | bool modifyQPtoRTS(struct ibv_qp *qp); 85 | 86 | bool modifyUDtoRTS(struct ibv_qp *qp, RdmaContext *context); 87 | 88 | 89 | //// Operation.cpp 90 | int pollWithCQ(ibv_cq *cq, int pollNumber, struct ibv_wc *wc); 91 | int pollOnce(ibv_cq *cq, int pollNumber, struct ibv_wc *wc); 92 | 93 | bool rdmaSend(ibv_qp *qp, uint64_t source, uint64_t size, uint32_t lkey, 94 | ibv_ah *ah, uint32_t remoteQPN, bool isSignaled = false); 95 | 96 | bool rdmaSend(ibv_qp *qp, uint64_t source, uint64_t size, uint32_t lkey, 97 | int32_t imm = -1); 98 | 99 | bool rdmaReceive(ibv_qp *qp, uint64_t source, uint64_t size, uint32_t lkey, 100 | uint64_t wr_id = 0); 101 | bool rdmaReceive(ibv_srq *srq, uint64_t source, uint64_t size, uint32_t lkey); 102 | 103 | bool rdmaRead(ibv_qp *qp, uint64_t source, uint64_t dest, uint64_t size, 104 | uint32_t lkey, uint32_t remoteRKey, bool signal = true, 105 | uint64_t wrID = 0); 106 | 107 | bool rdmaWrite(ibv_qp *qp, uint64_t source, uint64_t dest, uint64_t size, 108 | uint32_t lkey, uint32_t remoteRKey, int32_t imm = -1, 109 | bool isSignaled = true, uint64_t wrID = 0); 110 | 111 | bool rdmaFetchAndAdd(ibv_qp *qp, uint64_t source, uint64_t dest, uint64_t add, 112 | uint32_t lkey, uint32_t remoteRKey); 113 | bool rdmaFetchAndAddBoundary(ibv_qp *qp, uint64_t source, uint64_t dest, 114 | uint64_t add, uint32_t lkey, uint32_t remoteRKey, 115 | uint64_t boundary = 63, bool singal = true, 116 | uint64_t wr_id = 0); 117 | 118 | bool rdmaCompareAndSwap(ibv_qp *qp, uint64_t source, uint64_t dest, 119 | uint64_t compare, uint64_t swap, uint32_t lkey, 120 | uint32_t remoteRKey, bool signal = true, 121 | uint64_t wrID = 0); 122 | bool 
rdmaCompareAndSwapMask(ibv_qp *qp, uint64_t source, uint64_t dest, 123 | uint64_t compare, uint64_t swap, uint32_t lkey, 124 | uint32_t remoteRKey, uint64_t mask = ~(0ull), 125 | bool signal = true); 126 | 127 | //// Utility.cpp 128 | void rdmaQueryQueuePair(ibv_qp *qp); 129 | void checkDMSupported(struct ibv_context *ctx); 130 | 131 | 132 | //// specified 133 | bool rdmaWriteBatch(ibv_qp *qp, RdmaOpRegion *ror, int k, bool isSignaled, 134 | uint64_t wrID = 0); 135 | bool rdmaCasRead(ibv_qp *qp, const RdmaOpRegion &cas_ror, 136 | const RdmaOpRegion &read_ror, uint64_t compare, uint64_t swap, 137 | bool isSignaled, uint64_t wrID = 0); 138 | bool rdmaWriteFaa(ibv_qp *qp, const RdmaOpRegion &write_ror, 139 | const RdmaOpRegion &faa_ror, uint64_t add_val, 140 | bool isSignaled, uint64_t wrID = 0); 141 | bool rdmaWriteCas(ibv_qp *qp, const RdmaOpRegion &write_ror, 142 | const RdmaOpRegion &cas_ror, uint64_t compare, uint64_t swap, 143 | bool isSignaled, uint64_t wrID = 0); 144 | #endif 145 | -------------------------------------------------------------------------------- /src/DSMKeeper.cpp: -------------------------------------------------------------------------------- 1 | #include "DSMKeeper.h" 2 | 3 | #include "Connection.h" 4 | 5 | const char *DSMKeeper::OK = "OK"; 6 | const char *DSMKeeper::ServerPrefix = "SPre"; 7 | 8 | void DSMKeeper::initLocalMeta() { 9 | localMeta.dsmBase = (uint64_t)dirCon[0]->dsmPool; 10 | localMeta.lockBase = (uint64_t)dirCon[0]->lockPool; 11 | localMeta.cacheBase = (uint64_t)thCon[0]->cachePool; 12 | 13 | // per thread APP 14 | for (int i = 0; i < MAX_APP_THREAD; ++i) { 15 | localMeta.appTh[i].lid = thCon[i]->ctx.lid; 16 | localMeta.appTh[i].rKey = thCon[i]->cacheMR->rkey; 17 | memcpy((char *)localMeta.appTh[i].gid, (char *)(&thCon[i]->ctx.gid), 18 | 16 * sizeof(uint8_t)); 19 | 20 | localMeta.appUdQpn[i] = thCon[i]->message->getQPN(); 21 | } 22 | 23 | // per thread DIR 24 | for (int i = 0; i < NR_DIRECTORY; ++i) { 25 | 
localMeta.dirTh[i].lid = dirCon[i]->ctx.lid; 26 | localMeta.dirTh[i].rKey = dirCon[i]->dsmMR->rkey; 27 | localMeta.dirTh[i].lock_rkey = dirCon[i]->lockMR->rkey; 28 | memcpy((char *)localMeta.dirTh[i].gid, (char *)(&dirCon[i]->ctx.gid), 29 | 16 * sizeof(uint8_t)); 30 | 31 | localMeta.dirUdQpn[i] = dirCon[i]->message->getQPN(); 32 | } 33 | 34 | } 35 | 36 | bool DSMKeeper::connectNode(uint16_t remoteID) { 37 | 38 | setDataToRemote(remoteID); 39 | 40 | std::string setK = setKey(remoteID); 41 | memSet(setK.c_str(), setK.size(), (char *)(&localMeta), sizeof(localMeta)); 42 | 43 | std::string getK = getKey(remoteID); 44 | ExchangeMeta *remoteMeta = (ExchangeMeta *)memGet(getK.c_str(), getK.size()); 45 | 46 | setDataFromRemote(remoteID, remoteMeta); 47 | 48 | free(remoteMeta); 49 | return true; 50 | } 51 | 52 | void DSMKeeper::setDataToRemote(uint16_t remoteID) { 53 | for (int i = 0; i < NR_DIRECTORY; ++i) { 54 | auto &c = dirCon[i]; 55 | 56 | for (int k = 0; k < MAX_APP_THREAD; ++k) { 57 | localMeta.dirRcQpn2app[i][k] = c->data2app[k][remoteID]->qp_num; 58 | } 59 | } 60 | 61 | for (int i = 0; i < MAX_APP_THREAD; ++i) { 62 | auto &c = thCon[i]; 63 | for (int k = 0; k < NR_DIRECTORY; ++k) { 64 | localMeta.appRcQpn2dir[i][k] = c->data[k][remoteID]->qp_num; 65 | } 66 | 67 | } 68 | } 69 | 70 | void DSMKeeper::setDataFromRemote(uint16_t remoteID, ExchangeMeta *remoteMeta) { 71 | for (int i = 0; i < NR_DIRECTORY; ++i) { 72 | auto &c = dirCon[i]; 73 | 74 | for (int k = 0; k < MAX_APP_THREAD; ++k) { 75 | auto &qp = c->data2app[k][remoteID]; 76 | 77 | assert(qp->qp_type == IBV_QPT_RC); 78 | modifyQPtoInit(qp, &c->ctx); 79 | modifyQPtoRTR(qp, remoteMeta->appRcQpn2dir[k][i], 80 | remoteMeta->appTh[k].lid, remoteMeta->appTh[k].gid, 81 | &c->ctx); 82 | modifyQPtoRTS(qp); 83 | } 84 | } 85 | 86 | for (int i = 0; i < MAX_APP_THREAD; ++i) { 87 | auto &c = thCon[i]; 88 | for (int k = 0; k < NR_DIRECTORY; ++k) { 89 | auto &qp = c->data[k][remoteID]; 90 | 91 | assert(qp->qp_type == 
IBV_QPT_RC); 92 | modifyQPtoInit(qp, &c->ctx); 93 | modifyQPtoRTR(qp, remoteMeta->dirRcQpn2app[k][i], 94 | remoteMeta->dirTh[k].lid, remoteMeta->dirTh[k].gid, 95 | &c->ctx); 96 | modifyQPtoRTS(qp); 97 | } 98 | } 99 | 100 | auto &info = remoteCon[remoteID]; 101 | info.dsmBase = remoteMeta->dsmBase; 102 | info.cacheBase = remoteMeta->cacheBase; 103 | info.lockBase = remoteMeta->lockBase; 104 | 105 | for (int i = 0; i < NR_DIRECTORY; ++i) { 106 | info.dsmRKey[i] = remoteMeta->dirTh[i].rKey; 107 | info.lockRKey[i] = remoteMeta->dirTh[i].lock_rkey; 108 | info.dirMessageQPN[i] = remoteMeta->dirUdQpn[i]; 109 | 110 | for (int k = 0; k < MAX_APP_THREAD; ++k) { 111 | struct ibv_ah_attr ahAttr; 112 | fillAhAttr(&ahAttr, remoteMeta->dirTh[i].lid, remoteMeta->dirTh[i].gid, 113 | &thCon[k]->ctx); 114 | info.appToDirAh[k][i] = ibv_create_ah(thCon[k]->ctx.pd, &ahAttr); 115 | 116 | assert(info.appToDirAh[k][i]); 117 | } 118 | } 119 | 120 | 121 | for (int i = 0; i < MAX_APP_THREAD; ++i) { 122 | info.appRKey[i] = remoteMeta->appTh[i].rKey; 123 | info.appMessageQPN[i] = remoteMeta->appUdQpn[i]; 124 | 125 | for (int k = 0; k < NR_DIRECTORY; ++k) { 126 | struct ibv_ah_attr ahAttr; 127 | fillAhAttr(&ahAttr, remoteMeta->appTh[i].lid, remoteMeta->appTh[i].gid, 128 | &dirCon[k]->ctx); 129 | info.dirToAppAh[k][i] = ibv_create_ah(dirCon[k]->ctx.pd, &ahAttr); 130 | 131 | assert(info.dirToAppAh[k][i]); 132 | } 133 | } 134 | } 135 | 136 | void DSMKeeper::connectMySelf() { 137 | setDataToRemote(getMyNodeID()); 138 | setDataFromRemote(getMyNodeID(), &localMeta); 139 | } 140 | 141 | void DSMKeeper::initRouteRule() { 142 | 143 | std::string k = 144 | std::string(ServerPrefix) + std::to_string(this->getMyNodeID()); 145 | memSet(k.c_str(), k.size(), getMyIP().c_str(), getMyIP().size()); 146 | } 147 | 148 | void DSMKeeper::barrier(const std::string &barrierKey) { 149 | 150 | std::string key = std::string("barrier-") + barrierKey; 151 | if (this->getMyNodeID() == 0) { 152 | memSet(key.c_str(), 
key.size(), "0", 1); 153 | } 154 | memFetchAndAdd(key.c_str(), key.size()); 155 | while (true) { 156 | uint64_t v = std::stoull(memGet(key.c_str(), key.size())); 157 | if (v == this->getServerNR()) { 158 | return; 159 | } 160 | } 161 | } 162 | 163 | uint64_t DSMKeeper::sum(const std::string &sum_key, uint64_t value) { 164 | std::string key_prefix = std::string("sum-") + sum_key; 165 | 166 | std::string key = key_prefix + std::to_string(this->getMyNodeID()); 167 | memSet(key.c_str(), key.size(), (char *)&value, sizeof(value)); 168 | 169 | uint64_t ret = 0; 170 | for (int i = 0; i < this->getServerNR(); ++i) { 171 | key = key_prefix + std::to_string(i); 172 | ret += *(uint64_t *)memGet(key.c_str(), key.size()); 173 | } 174 | 175 | return ret; 176 | } 177 | -------------------------------------------------------------------------------- /include/third_party/slice.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 | // This source code is licensed under both the GPLv2 (found in the 3 | // COPYING file in the root directory) and Apache 2.0 License 4 | // (found in the LICENSE.Apache file in the root directory). 5 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 6 | // Use of this source code is governed by a BSD-style license that can be 7 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 8 | // 9 | // Slice is a simple structure containing a pointer into some external 10 | // storage and a size. The user of a Slice must ensure that the slice 11 | // is not used after the corresponding external storage has been 12 | // deallocated. 13 | // 14 | // Multiple threads can invoke const methods on a Slice without 15 | // external synchronization, but if any of the threads may call a 16 | // non-const method, all threads accessing the same Slice must use 17 | // external synchronization. 
#pragma once

// NOTE(review): the angle-bracket include targets were stripped during
// extraction and are kept verbatim; presumably <cassert>, <cstddef>,
// <cstdio>, <cstring>, <string> per upstream rocksdb -- confirm in the repo.
#include
#include
#include
#include
#include

#ifdef __cpp_lib_string_view
#include
#endif

// #include "rocksdb/cleanable.h"

// Non-owning view over a [data_, data_ + size_) byte range.
// Vendored from RocksDB; see the header comment above for the contract.
class Slice {
 public:
  // Create an empty slice.
  Slice() : data_(""), size_(0) {}

  // Create a slice that refers to d[0,n-1].
  Slice(const char* d, size_t n) : data_(d), size_(n) {}

  // Create a slice that refers to the contents of "s"
  /* implicit */
  Slice(const std::string& s) : data_(s.data()), size_(s.size()) {}

#ifdef __cpp_lib_string_view
  // Create a slice that refers to the same contents as "sv"
  /* implicit */
  Slice(std::string_view sv) : data_(sv.data()), size_(sv.size()) {}
#endif

  // Create a slice that refers to s[0,strlen(s)-1]
  /* implicit */
  Slice(const char* s) : data_(s) { size_ = (s == nullptr) ? 0 : strlen(s); }

  // Create a single slice from SliceParts using buf as storage.
  // buf must exist as long as the returned Slice exists.
  Slice(const struct SliceParts& parts, std::string* buf);

  // Return a pointer to the beginning of the referenced data
  const char* data() const { return data_; }

  // Return the length (in bytes) of the referenced data
  size_t size() const { return size_; }

  // Return true iff the length of the referenced data is zero
  bool empty() const { return size_ == 0; }

  // Return the ith byte in the referenced data.
  // REQUIRES: n < size()
  char operator[](size_t n) const {
    assert(n < size());
    return data_[n];
  }

  // Change this slice to refer to an empty array
  void clear() {
    data_ = "";
    size_ = 0;
  }

  // Drop the first "n" bytes from this slice.
  void remove_prefix(size_t n) {
    assert(n <= size());
    data_ += n;
    size_ -= n;
  }

  // Drop the last "n" bytes from this slice.
  void remove_suffix(size_t n) {
    assert(n <= size());
    size_ -= n;
  }

  // Return a string that contains the copy of the referenced data.
  // when hex is true, returns a string of twice the length hex encoded (0-9A-F)
  std::string ToString(bool hex = false) const;

#ifdef __cpp_lib_string_view
  // Return a string_view that references the same data as this slice.
  std::string_view ToStringView() const {
    return std::string_view(data_, size_);
  }
#endif

  // Decodes the current slice interpreted as an hexadecimal string into result,
  // if successful returns true, if this isn't a valid hex string
  // (e.g not coming from Slice::ToString(true)) DecodeHex returns false.
  // This slice is expected to have an even number of 0-9A-F characters
  // also accepts lowercase (a-f)
  bool DecodeHex(std::string* result) const;

  // Three-way comparison.  Returns value:
  //   <  0 iff "*this" <  "b",
  //   == 0 iff "*this" == "b",
  //   >  0 iff "*this" >  "b"
  int compare(const Slice& b) const;

  // Return true iff "x" is a prefix of "*this"
  bool starts_with(const Slice& x) const {
    return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
  }

  // Return true iff "x" is a suffix of "*this"
  bool ends_with(const Slice& x) const {
    return ((size_ >= x.size_) &&
            (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0));
  }

  // Compare two slices and returns the first byte where they differ
  size_t difference_offset(const Slice& b) const;

  // private: make these public for rocksdbjni access
  const char* data_;
  size_t size_;

  // Intentionally copyable
};

// A set of Slices that are virtually concatenated together.  'parts' points
// to an array of Slices.  The number of elements in the array is 'num_parts'.
struct SliceParts {
  SliceParts(const Slice* _parts, int _num_parts)
      : parts(_parts), num_parts(_num_parts) {}
  SliceParts() : parts(nullptr), num_parts(0) {}

  const Slice* parts;
  int num_parts;
};

inline bool operator==(const Slice& x, const Slice& y) {
  return ((x.size() == y.size()) &&
          (memcmp(x.data(), y.data(), x.size()) == 0));
}

inline bool operator!=(const Slice& x, const Slice& y) { return !(x == y); }

inline int Slice::compare(const Slice& b) const {
  assert(data_ != nullptr && b.data_ != nullptr);
  const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
  int r = memcmp(data_, b.data_, min_len);
  if (r == 0) {
    // Equal common prefix: the shorter slice orders first.
    if (size_ < b.size_)
      r = -1;
    else if (size_ > b.size_)
      r = +1;
  }
  return r;
}

inline size_t Slice::difference_offset(const Slice& b) const {
  size_t off = 0;
  const size_t len = (size_ < b.size_) ? size_ : b.size_;
  for (; off < len; off++) {
    if (data_[off] != b.data_[off]) break;
  }
  // If one slice is a prefix of the other, returns the shorter length.
  return off;
}
--------------------------------------------------------------------------------
/src/rdma/StateTrans.cpp:
--------------------------------------------------------------------------------
#include "Rdma.h"

// Move `qp` from RESET to INIT on the context's port, granting remote
// access flags according to the QP type (RC gets read/write/atomic,
// UC write-only). DC initiator and other types are not implemented and
// fall through with qp_access_flags == 0.
bool modifyQPtoInit(struct ibv_qp *qp, RdmaContext *context) {

  struct ibv_qp_attr attr;
  memset(&attr, 0, sizeof(attr));

  attr.qp_state = IBV_QPS_INIT;
  attr.port_num = context->port;
  attr.pkey_index = 0;

  switch (qp->qp_type) {
  case IBV_QPT_RC:
    attr.qp_access_flags = IBV_ACCESS_REMOTE_READ |
                           IBV_ACCESS_REMOTE_WRITE |
                           IBV_ACCESS_REMOTE_ATOMIC;
    break;

  case IBV_QPT_UC:
    attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
    break;

  case IBV_EXP_QPT_DC_INI:
    Debug::notifyError("implement me:)");
    break;

  default:
    Debug::notifyError("implement me:)");
  }

  if (ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX |
                                   IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) {
    Debug::notifyError("Failed to modify QP state to INIT");
    return false;
  }
  return true;
}

// Move `qp` from INIT to RTR, pointing it at the remote QP identified by
// (remoteQPN, remoteLid, remoteGid). RC additionally needs the responder
// atomic depth and the RNR timer.
bool modifyQPtoRTR(struct ibv_qp *qp, uint32_t remoteQPN, uint16_t remoteLid,
                   uint8_t *remoteGid, RdmaContext *context) {

  struct ibv_qp_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.qp_state = IBV_QPS_RTR;

  attr.path_mtu = IBV_MTU_4096;
  attr.dest_qp_num = remoteQPN;
  attr.rq_psn = PSN; // both sides start from the same fixed PSN (Rdma.h)

  fillAhAttr(&attr.ah_attr, remoteLid, remoteGid, context);

  int flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
              IBV_QP_RQ_PSN;

  if (qp->qp_type == IBV_QPT_RC) {
    attr.max_dest_rd_atomic = 16;
    attr.min_rnr_timer = 12;
    flags |= IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER;
  }

  if (ibv_modify_qp(qp, &attr, flags)) {
    Debug::notifyError("failed to modify QP state to RTR");
    return false;
  }
  return true;
}

// Move `qp` from RTR to RTS. RC additionally sets retry/timeout policy and
// the requester atomic depth.
bool modifyQPtoRTS(struct ibv_qp *qp) {
  struct ibv_qp_attr attr;
  int flags;
  memset(&attr, 0, sizeof(attr));

  attr.qp_state = IBV_QPS_RTS;
  attr.sq_psn = PSN;
  flags = IBV_QP_STATE | IBV_QP_SQ_PSN;

  if (qp->qp_type == IBV_QPT_RC) {
    attr.timeout = 14;
    attr.retry_cnt = 7;
    attr.rnr_retry = 7; // 7 == infinite RNR retries
    attr.max_rd_atomic = 16;
    flags |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
             IBV_QP_MAX_QP_RD_ATOMIC;
  }

  if (ibv_modify_qp(qp, &attr, flags)) {
    Debug::notifyError("failed to modify QP state to RTS");
    return false;
  }
  return true;
}

// Bring an unconnected (UD or similar -- the non-UD branch presumably covers
// raw-packet QPs, confirm with callers) QP straight through
// INIT -> RTR -> RTS; such QPs carry no peer address state.
bool modifyUDtoRTS(struct ibv_qp *qp, RdmaContext *context) {
  // assert(qp->qp_type == IBV_QPT_UD);

  struct ibv_qp_attr attr;
  memset(&attr, 0, sizeof(attr));

  attr.qp_state = IBV_QPS_INIT;
  attr.pkey_index = 0;
  attr.port_num = context->port;
  attr.qkey = UD_PKEY;

  if (qp->qp_type == IBV_QPT_UD) {
    if (ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX |
                                     IBV_QP_PORT | IBV_QP_QKEY)) {
      Debug::notifyError("Failed to modify QP state to INIT");
      return false;
    }
  } else {
    if (ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_PORT)) {
      Debug::notifyError("Failed to modify QP state to INIT");
      return false;
    }
  }

  memset(&attr, 0, sizeof(attr));
  attr.qp_state = IBV_QPS_RTR;
  if (ibv_modify_qp(qp, &attr, IBV_QP_STATE)) {
    Debug::notifyError("failed to modify QP state to RTR");
    return false;
  }

  memset(&attr, 0, sizeof(attr));
  attr.qp_state = IBV_QPS_RTS;
  attr.sq_psn = PSN;

  if (qp->qp_type == IBV_QPT_UD) {
    if (ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) {
      Debug::notifyError("failed to modify QP state to RTS");
      return false;
    }
  } else {
    if (ibv_modify_qp(qp, &attr, IBV_QP_STATE)) {
      Debug::notifyError("failed to modify QP state to RTS");
      return false;
    }
  }
  return true;
}

// Bring a DC initiator QP to RTS via the experimental verbs API; the peer
// is addressed per-WR, so only LID/GID (not a QPN) are needed here.
// NOTE: `attr` is deliberately not re-memset between stages -- the exp API
// only consumes the fields selected by each mask.
bool modifyDCtoRTS(struct ibv_qp *qp, uint16_t remoteLid, uint8_t *remoteGid,
                   RdmaContext *context) {
  // assert(qp->qp_type == IBV_EXP_QPT_DC_INI);

  struct ibv_exp_qp_attr attr;
  memset(&attr, 0, sizeof(attr));

  attr.qp_state = IBV_QPS_INIT;
  attr.pkey_index = 0;
  attr.port_num = context->port;
  attr.qp_access_flags = 0;
  attr.dct_key = DCT_ACCESS_KEY;

  if (ibv_exp_modify_qp(qp, &attr, IBV_EXP_QP_STATE | IBV_EXP_QP_PKEY_INDEX |
                                       IBV_EXP_QP_PORT | IBV_EXP_QP_DC_KEY)) {
    Debug::notifyError("failed to modify QP state to INI");
    return false;
  }

  attr.qp_state = IBV_QPS_RTR;
  attr.path_mtu = IBV_MTU_4096;

  fillAhAttr(&attr.ah_attr, remoteLid, remoteGid, context);
  if (ibv_exp_modify_qp(qp, &attr, IBV_EXP_QP_STATE | IBV_EXP_QP_PATH_MTU |
                                       IBV_EXP_QP_AV)) {
    Debug::notifyError("failed to modify QP state to RTR");
    return false;
  }

  attr.qp_state = IBV_QPS_RTS;
  attr.timeout = 14;
  attr.retry_cnt = 7;
  attr.rnr_retry = 7;
  attr.max_rd_atomic = 16;
  if (ibv_exp_modify_qp(qp, &attr, IBV_EXP_QP_STATE | IBV_EXP_QP_TIMEOUT |
                                       IBV_EXP_QP_RETRY_CNT |
                                       IBV_EXP_QP_RNR_RETRY |
                                       IBV_EXP_QP_MAX_QP_RD_ATOMIC)) {

    Debug::notifyError("failed to modify QP state to RTS");
    return false;
  }

  return true;
}
--------------------------------------------------------------------------------
/include/third_party/random.h:
--------------------------------------------------------------------------------
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 | // This source code is licensed under both the GPLv2 (found in the 3 | // COPYING file in the root directory) and Apache 2.0 License 4 | // (found in the LICENSE.Apache file in the root directory). 5 | // 6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 7 | // Use of this source code is governed by a BSD-style license that can be 8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. 9 | 10 | #pragma once 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | 19 | // A very simple random number generator. Not especially good at 20 | // generating truly random bits, but good enough for our needs in this 21 | // package. 22 | class Random { 23 | private: 24 | enum : uint32_t { 25 | M = 2147483647L // 2^31-1 26 | }; 27 | enum : uint64_t { 28 | A = 16807 // bits 14, 8, 7, 5, 2, 1, 0 29 | }; 30 | 31 | uint32_t seed_; 32 | 33 | static uint32_t GoodSeed(uint32_t s) { return (s & M) != 0 ? (s & M) : 1; } 34 | 35 | public: 36 | // This is the largest value that can be returned from Next() 37 | enum : uint32_t { kMaxNext = M }; 38 | 39 | explicit Random(uint32_t s) : seed_(GoodSeed(s)) {} 40 | 41 | void Reset(uint32_t s) { seed_ = GoodSeed(s); } 42 | 43 | uint32_t Next() { 44 | // We are computing 45 | // seed_ = (seed_ * A) % M, where M = 2^31-1 46 | // 47 | // seed_ must not be zero or M, or else all subsequent computed values 48 | // will be zero or M respectively. For all other values, seed_ will end 49 | // up cycling through every number in [1,M-1] 50 | uint64_t product = seed_ * A; 51 | 52 | // Compute (product % M) using the fact that ((x << 31) % M) == x. 53 | seed_ = static_cast((product >> 31) + (product & M)); 54 | // The first reduction may overflow by 1 bit, so we may need to 55 | // repeat. mod == M is not possible; using > allows the faster 56 | // sign-bit-based test. 
57 | if (seed_ > M) { 58 | seed_ -= M; 59 | } 60 | return seed_; 61 | } 62 | 63 | // Returns a uniformly distributed value in the range [0..n-1] 64 | // REQUIRES: n > 0 65 | uint32_t Uniform(int n) { return Next() % n; } 66 | 67 | // Randomly returns true ~"1/n" of the time, and false otherwise. 68 | // REQUIRES: n > 0 69 | bool OneIn(int n) { return Uniform(n) == 0; } 70 | 71 | // "Optional" one-in-n, where 0 or negative always returns false 72 | // (may or may not consume a random value) 73 | bool OneInOpt(int n) { return n > 0 && OneIn(n); } 74 | 75 | // Returns random bool that is true for the given percentage of 76 | // calls on average. Zero or less is always false and 100 or more 77 | // is always true (may or may not consume a random value) 78 | bool PercentTrue(int percentage) { 79 | return static_cast(Uniform(100)) < percentage; 80 | } 81 | 82 | // Skewed: pick "base" uniformly from range [0,max_log] and then 83 | // return "base" random bits. The effect is to pick a number in the 84 | // range [0,2^max_log-1] with exponential bias towards smaller numbers. 85 | uint32_t Skewed(int max_log) { return Uniform(1 << Uniform(max_log + 1)); } 86 | 87 | // Returns a Random instance for use by the current thread without 88 | // additional locking 89 | static Random *GetTLSInstance() { 90 | thread_local Random *tls_instance; 91 | thread_local std::aligned_storage::type tls_instance_bytes; 92 | 93 | auto rv = tls_instance; 94 | if (rv == nullptr) { 95 | size_t seed = std::hash()(std::this_thread::get_id()); 96 | rv = new (&tls_instance_bytes) Random((uint32_t)seed); 97 | tls_instance = rv; 98 | } 99 | return rv; 100 | } 101 | }; 102 | 103 | // A good 32-bit random number generator based on std::mt19937. 104 | // This exists in part to avoid compiler variance in warning about coercing 105 | // uint_fast32_t from mt19937 to uint32_t. 
106 | class Random32 { 107 | private: 108 | std::mt19937 generator_; 109 | 110 | public: 111 | explicit Random32(uint32_t s) : generator_(s) {} 112 | 113 | // Generates the next random number 114 | uint32_t Next() { return static_cast(generator_()); } 115 | 116 | // Returns a uniformly distributed value in the range [0..n-1] 117 | // REQUIRES: n > 0 118 | uint32_t Uniform(uint32_t n) { 119 | return static_cast( 120 | std::uniform_int_distribution(0, n - 1)( 121 | generator_)); 122 | } 123 | 124 | // Returns an *almost* uniformly distributed value in the range [0..n-1]. 125 | // Much faster than Uniform(). 126 | // REQUIRES: n > 0 127 | uint32_t Uniformish(uint32_t n) { 128 | // fastrange (without the header) 129 | return static_cast((uint64_t(generator_()) * uint64_t(n)) >> 32); 130 | } 131 | 132 | // Randomly returns true ~"1/n" of the time, and false otherwise. 133 | // REQUIRES: n > 0 134 | bool OneIn(uint32_t n) { return Uniform(n) == 0; } 135 | 136 | // Skewed: pick "base" uniformly from range [0,max_log] and then 137 | // return "base" random bits. The effect is to pick a number in the 138 | // range [0,2^max_log-1] with exponential bias towards smaller numbers. 
139 | uint32_t Skewed(int max_log) { 140 | return Uniform(uint32_t{1} << Uniform(max_log + 1)); 141 | } 142 | 143 | // Reset the seed of the generator to the given value 144 | void Seed(uint32_t new_seed) { generator_.seed(new_seed); } 145 | }; 146 | 147 | // A good 64-bit random number generator based on std::mt19937_64 148 | class Random64 { 149 | private: 150 | std::mt19937_64 generator_; 151 | 152 | public: 153 | explicit Random64(uint64_t s) : generator_(s) {} 154 | 155 | // Generates the next random number 156 | uint64_t Next() { return generator_(); } 157 | 158 | // Returns a uniformly distributed value in the range [0..n-1] 159 | // REQUIRES: n > 0 160 | uint64_t Uniform(uint64_t n) { 161 | return std::uniform_int_distribution(0, n - 1)(generator_); 162 | } 163 | 164 | // Randomly returns true ~"1/n" of the time, and false otherwise. 165 | // REQUIRES: n > 0 166 | bool OneIn(uint64_t n) { return Uniform(n) == 0; } 167 | 168 | // Skewed: pick "base" uniformly from range [0,max_log] and then 169 | // return "base" random bits. The effect is to pick a number in the 170 | // range [0,2^max_log-1] with exponential bias towards smaller numbers. 
171 | uint64_t Skewed(int max_log) { 172 | return Uniform(uint64_t(1) << Uniform(max_log + 1)); 173 | } 174 | }; 175 | 176 | // A seeded replacement for removed std::random_shuffle 177 | template 178 | void RandomShuffle(RandomIt first, RandomIt last, uint32_t seed) { 179 | std::mt19937 rng(seed); 180 | std::shuffle(first, last, rng); 181 | } 182 | 183 | // A replacement for removed std::random_shuffle 184 | template void RandomShuffle(RandomIt first, RandomIt last) { 185 | RandomShuffle(first, last, std::random_device{}()); 186 | } 187 | -------------------------------------------------------------------------------- /test/benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include "Timer.h" 2 | #include "Tree.h" 3 | #include "zipf.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | //////////////////// workload parameters ///////////////////// 14 | 15 | // #define USE_CORO 16 | const int kCoroCnt = 3; 17 | 18 | int kReadRatio; 19 | int kThreadCount; 20 | int kNodeCount; 21 | uint64_t kKeySpace = 64 * define::MB; 22 | double kWarmRatio = 0.8; 23 | double zipfan = 0; 24 | 25 | //////////////////// workload parameters ///////////////////// 26 | 27 | 28 | extern uint64_t cache_miss[MAX_APP_THREAD][8]; 29 | extern uint64_t cache_hit[MAX_APP_THREAD][8]; 30 | 31 | 32 | 33 | std::thread th[MAX_APP_THREAD]; 34 | uint64_t tp[MAX_APP_THREAD][8]; 35 | 36 | extern uint64_t latency[MAX_APP_THREAD][LATENCY_WINDOWS]; 37 | uint64_t latency_th_all[LATENCY_WINDOWS]; 38 | 39 | Tree *tree; 40 | DSM *dsm; 41 | 42 | inline Key to_key(uint64_t k) { 43 | return (CityHash64((char *)&k, sizeof(k)) + 1) % kKeySpace; 44 | } 45 | 46 | class RequsetGenBench : public RequstGen { 47 | 48 | public: 49 | RequsetGenBench(int coro_id, DSM *dsm, int id) 50 | : coro_id(coro_id), dsm(dsm), id(id) { 51 | seed = rdtsc(); 52 | mehcached_zipf_init(&state, kKeySpace, zipfan, 53 | (rdtsc() & 
(0x0000ffffffffffffull)) ^ id); 54 | } 55 | 56 | Request next() override { 57 | Request r; 58 | uint64_t dis = mehcached_zipf_next(&state); 59 | 60 | r.k = to_key(dis); 61 | r.v = 23; 62 | r.is_search = rand_r(&seed) % 100 < kReadRatio; 63 | 64 | tp[id][0]++; 65 | 66 | return r; 67 | } 68 | 69 | private: 70 | int coro_id; 71 | DSM *dsm; 72 | int id; 73 | 74 | unsigned int seed; 75 | struct zipf_gen_state state; 76 | }; 77 | 78 | RequstGen *coro_func(int coro_id, DSM *dsm, int id) { 79 | return new RequsetGenBench(coro_id, dsm, id); 80 | } 81 | 82 | Timer bench_timer; 83 | std::atomic warmup_cnt{0}; 84 | std::atomic_bool ready{false}; 85 | void thread_run(int id) { 86 | 87 | bindCore(id); 88 | 89 | dsm->registerThread(); 90 | 91 | uint64_t all_thread = kThreadCount * dsm->getClusterSize(); 92 | uint64_t my_id = kThreadCount * dsm->getMyNodeID() + id; 93 | 94 | printf("I am thread %ld on compute nodes\n", my_id); 95 | 96 | if (id == 0) { 97 | bench_timer.begin(); 98 | } 99 | 100 | uint64_t end_warm_key = kWarmRatio * kKeySpace; 101 | for (uint64_t i = 1; i < end_warm_key; ++i) { 102 | if (i % all_thread == my_id) { 103 | tree->insert(to_key(i), i * 2); 104 | } 105 | } 106 | 107 | warmup_cnt.fetch_add(1); 108 | 109 | if (id == 0) { 110 | while (warmup_cnt.load() != kThreadCount) 111 | ; 112 | printf("node %d finish\n", dsm->getMyNodeID()); 113 | dsm->barrier("warm_finish"); 114 | 115 | uint64_t ns = bench_timer.end(); 116 | printf("warmup time %lds\n", ns / 1000 / 1000 / 1000); 117 | 118 | tree->index_cache_statistics(); 119 | tree->clear_statistics(); 120 | 121 | ready = true; 122 | 123 | warmup_cnt.store(0); 124 | } 125 | 126 | while (warmup_cnt.load() != 0) 127 | ; 128 | 129 | #ifdef USE_CORO 130 | tree->run_coroutine(coro_func, id, kCoroCnt); 131 | #else 132 | 133 | /// without coro 134 | unsigned int seed = rdtsc(); 135 | struct zipf_gen_state state; 136 | mehcached_zipf_init(&state, kKeySpace, zipfan, 137 | (rdtsc() & (0x0000ffffffffffffull)) ^ id); 138 | 139 | 
Timer timer; 140 | while (true) { 141 | 142 | uint64_t dis = mehcached_zipf_next(&state); 143 | uint64_t key = to_key(dis); 144 | 145 | Value v; 146 | timer.begin(); 147 | 148 | if (rand_r(&seed) % 100 < kReadRatio) { // GET 149 | tree->search(key, v); 150 | } else { 151 | v = 12; 152 | tree->insert(key, v); 153 | } 154 | 155 | auto us_10 = timer.end() / 100; 156 | if (us_10 >= LATENCY_WINDOWS) { 157 | us_10 = LATENCY_WINDOWS - 1; 158 | } 159 | latency[id][us_10]++; 160 | 161 | tp[id][0]++; 162 | } 163 | #endif 164 | 165 | } 166 | 167 | void parse_args(int argc, char *argv[]) { 168 | if (argc != 4) { 169 | printf("Usage: ./benchmark kNodeCount kReadRatio kThreadCount\n"); 170 | exit(-1); 171 | } 172 | 173 | kNodeCount = atoi(argv[1]); 174 | kReadRatio = atoi(argv[2]); 175 | kThreadCount = atoi(argv[3]); 176 | 177 | printf("kNodeCount %d, kReadRatio %d, kThreadCount %d\n", kNodeCount, 178 | kReadRatio, kThreadCount); 179 | } 180 | 181 | void cal_latency() { 182 | uint64_t all_lat = 0; 183 | for (int i = 0; i < LATENCY_WINDOWS; ++i) { 184 | latency_th_all[i] = 0; 185 | for (int k = 0; k < MAX_APP_THREAD; ++k) { 186 | latency_th_all[i] += latency[k][i]; 187 | } 188 | all_lat += latency_th_all[i]; 189 | } 190 | 191 | uint64_t th50 = all_lat / 2; 192 | uint64_t th90 = all_lat * 9 / 10; 193 | uint64_t th95 = all_lat * 95 / 100; 194 | uint64_t th99 = all_lat * 99 / 100; 195 | uint64_t th999 = all_lat * 999 / 1000; 196 | 197 | uint64_t cum = 0; 198 | for (int i = 0; i < LATENCY_WINDOWS; ++i) { 199 | cum += latency_th_all[i]; 200 | 201 | if (cum >= th50) { 202 | printf("p50 %f\t", i / 10.0); 203 | th50 = -1; 204 | } 205 | if (cum >= th90) { 206 | printf("p90 %f\t", i / 10.0); 207 | th90 = -1; 208 | } 209 | if (cum >= th95) { 210 | printf("p95 %f\t", i / 10.0); 211 | th95 = -1; 212 | } 213 | if (cum >= th99) { 214 | printf("p99 %f\t", i / 10.0); 215 | th99 = -1; 216 | } 217 | if (cum >= th999) { 218 | printf("p999 %f\n", i / 10.0); 219 | th999 = -1; 220 | return; 221 | } 222 
| } 223 | } 224 | 225 | int main(int argc, char *argv[]) { 226 | 227 | parse_args(argc, argv); 228 | 229 | DSMConfig config; 230 | config.machineNR = kNodeCount; 231 | dsm = DSM::getInstance(config); 232 | 233 | dsm->registerThread(); 234 | tree = new Tree(dsm); 235 | 236 | if (dsm->getMyNodeID() == 0) { 237 | for (uint64_t i = 1; i < 1024000; ++i) { 238 | tree->insert(to_key(i), i * 2); 239 | } 240 | } 241 | 242 | dsm->barrier("benchmark"); 243 | dsm->resetThread(); 244 | 245 | for (int i = 0; i < kThreadCount; i++) { 246 | th[i] = std::thread(thread_run, i); 247 | } 248 | 249 | while (!ready.load()) 250 | ; 251 | 252 | timespec s, e; 253 | uint64_t pre_tp = 0; 254 | 255 | int count = 0; 256 | 257 | clock_gettime(CLOCK_REALTIME, &s); 258 | while (true) { 259 | 260 | sleep(2); 261 | clock_gettime(CLOCK_REALTIME, &e); 262 | int microseconds = (e.tv_sec - s.tv_sec) * 1000000 + 263 | (double)(e.tv_nsec - s.tv_nsec) / 1000; 264 | 265 | uint64_t all_tp = 0; 266 | for (int i = 0; i < kThreadCount; ++i) { 267 | all_tp += tp[i][0]; 268 | } 269 | uint64_t cap = all_tp - pre_tp; 270 | pre_tp = all_tp; 271 | 272 | uint64_t all = 0; 273 | uint64_t hit = 0; 274 | for (int i = 0; i < MAX_APP_THREAD; ++i) { 275 | all += (cache_hit[i][0] + cache_miss[i][0]); 276 | hit += cache_hit[i][0]; 277 | } 278 | 279 | clock_gettime(CLOCK_REALTIME, &s); 280 | 281 | if (++count % 3 == 0 && dsm->getMyNodeID() == 0) { 282 | cal_latency(); 283 | } 284 | 285 | double per_node_tp = cap * 1.0 / microseconds; 286 | uint64_t cluster_tp = dsm->sum((uint64_t)(per_node_tp * 1000)); 287 | 288 | printf("%d, throughput %.4f\n", dsm->getMyNodeID(), per_node_tp); 289 | 290 | if (dsm->getMyNodeID() == 0) { 291 | printf("cluster throughput %.3f\n", cluster_tp / 1000.0); 292 | printf("cache hit rate: %lf\n", hit * 1.0 / all); 293 | } 294 | } 295 | 296 | return 0; 297 | } -------------------------------------------------------------------------------- /test/zipf.h: 
-------------------------------------------------------------------------------- 1 | // Copyright 2014 Carnegie Mellon University 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #ifndef _ZIPF_H_ 18 | #define _ZIPF_H_ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | // #include "util.h" 27 | 28 | struct zipf_gen_state { 29 | uint64_t n; // number of items (input) 30 | double theta; // skewness (input) in (0, 1); or, 0 = uniform, 1 = always 31 | // zero 32 | double alpha; // only depends on theta 33 | double thres; // only depends on theta 34 | uint64_t last_n; // last n used to calculate the following 35 | double dbl_n; 36 | double zetan; 37 | double eta; 38 | // unsigned short rand_state[3]; // prng state 39 | uint64_t rand_state; 40 | }; 41 | 42 | static double mehcached_rand_d(uint64_t *state) { 43 | // caution: this is maybe too non-random 44 | *state = (*state * 0x5deece66dUL + 0xbUL) & ((1UL << 48) - 1); 45 | return (double)*state / (double)((1UL << 48) - 1); 46 | } 47 | 48 | static double mehcached_pow_approx(double a, double b) { 49 | // from 50 | // http://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/ 51 | 52 | // calculate approximation with fraction of the exponent 53 | int e = (int)b; 54 | union { 55 | double d; 56 | int x[2]; 57 | } u = { a }; 58 | u.x[1] = 59 | (int)((b - (double)e) * (double)(u.x[1] - 
1072632447) + 1072632447.); 60 | u.x[0] = 0; 61 | 62 | // exponentiation by squaring with the exponent's integer part 63 | // double r = u.d makes everything much slower, not sure why 64 | // TODO: use popcount? 65 | double r = 1.; 66 | while (e) { 67 | if (e & 1) 68 | r *= a; 69 | a *= a; 70 | e >>= 1; 71 | } 72 | 73 | return r * u.d; 74 | } 75 | 76 | static void mehcached_zipf_init(struct zipf_gen_state *state, uint64_t n, 77 | double theta, uint64_t rand_seed) { 78 | assert(n > 0); 79 | if (theta > 0.992 && theta < 1) 80 | fprintf(stderr, 81 | "theta > 0.992 will be inaccurate due to approximation\n"); 82 | if (theta >= 1. && theta < 40.) { 83 | fprintf(stderr, "theta in [1., 40.) is not supported\n"); 84 | assert(false); 85 | } 86 | assert(theta == -1. || (theta >= 0. && theta < 1.) || theta >= 40.); 87 | assert(rand_seed < (1UL << 48)); 88 | memset(state, 0, sizeof(struct zipf_gen_state)); 89 | state->n = n; 90 | state->theta = theta; 91 | if (theta == -1.) 92 | rand_seed = rand_seed % n; 93 | else if (theta > 0. && theta < 1.) { 94 | state->alpha = 1. / (1. - theta); 95 | state->thres = 1. 
+ mehcached_pow_approx(0.5, theta); 96 | } else { 97 | state->alpha = 0.; // unused 98 | state->thres = 0.; // unused 99 | } 100 | state->last_n = 0; 101 | state->zetan = 0.; 102 | // state->rand_state[0] = (unsigned short)(rand_seed >> 0); 103 | // state->rand_state[1] = (unsigned short)(rand_seed >> 16); 104 | // state->rand_state[2] = (unsigned short)(rand_seed >> 32); 105 | state->rand_state = rand_seed; 106 | } 107 | 108 | static void mehcached_zipf_init_copy(struct zipf_gen_state *state, 109 | const struct zipf_gen_state *src_state, 110 | uint64_t rand_seed) { 111 | 112 | (void)mehcached_zipf_init_copy; 113 | assert(rand_seed < (1UL << 48)); 114 | memcpy(state, src_state, sizeof(struct zipf_gen_state)); 115 | // state->rand_state[0] = (unsigned short)(rand_seed >> 0); 116 | // state->rand_state[1] = (unsigned short)(rand_seed >> 16); 117 | // state->rand_state[2] = (unsigned short)(rand_seed >> 32); 118 | state->rand_state = rand_seed; 119 | } 120 | 121 | static void mehcached_zipf_change_n(struct zipf_gen_state *state, uint64_t n) { 122 | (void)mehcached_zipf_change_n; 123 | state->n = n; 124 | } 125 | 126 | static double mehcached_zeta(uint64_t last_n, double last_sum, uint64_t n, 127 | double theta) { 128 | if (last_n > n) { 129 | last_n = 0; 130 | last_sum = 0.; 131 | } 132 | while (last_n < n) { 133 | last_sum += 1. / mehcached_pow_approx((double)last_n + 1., theta); 134 | last_n++; 135 | } 136 | return last_sum; 137 | } 138 | 139 | static uint64_t mehcached_zipf_next(struct zipf_gen_state *state) { 140 | if (state->last_n != state->n) { 141 | if (state->theta > 0. && state->theta < 1.) { 142 | state->zetan = mehcached_zeta(state->last_n, state->zetan, state->n, 143 | state->theta); 144 | state->eta = 145 | (1. - mehcached_pow_approx(2. / (double)state->n, 146 | 1. - state->theta)) / 147 | (1. 
- mehcached_zeta(0, 0., 2, state->theta) / state->zetan); 148 | } 149 | state->last_n = state->n; 150 | state->dbl_n = (double)state->n; 151 | } 152 | 153 | if (state->theta == -1.) { 154 | uint64_t v = state->rand_state; 155 | if (++state->rand_state >= state->n) 156 | state->rand_state = 0; 157 | return v; 158 | } else if (state->theta == 0.) { 159 | double u = mehcached_rand_d(&state->rand_state); 160 | return (uint64_t)(state->dbl_n * u); 161 | } else if (state->theta >= 40.) { 162 | return 0UL; 163 | } else { 164 | // from J. Gray et al. Quickly generating billion-record synthetic 165 | // databases. In SIGMOD, 1994. 166 | 167 | // double u = erand48(state->rand_state); 168 | double u = mehcached_rand_d(&state->rand_state); 169 | double uz = u * state->zetan; 170 | if (uz < 1.) 171 | return 0UL; 172 | else if (uz < state->thres) 173 | return 1UL; 174 | else 175 | return (uint64_t)( 176 | state->dbl_n * 177 | mehcached_pow_approx(state->eta * (u - 1.) + 1., state->alpha)); 178 | } 179 | } 180 | 181 | void mehcached_test_zipf(double theta) { 182 | 183 | (void)(mehcached_test_zipf); 184 | 185 | double zetan = 0.; 186 | const uint64_t n = 10000000000UL; 187 | uint64_t i; 188 | 189 | for (i = 0; i < n; i++) 190 | zetan += 1. / pow((double)i + 1., theta); 191 | 192 | struct zipf_gen_state state; 193 | if (theta < 1. || theta >= 40.) 194 | mehcached_zipf_init(&state, n, theta, 0); 195 | 196 | uint64_t num_key0 = 0; 197 | const uint64_t num_samples = 10000000UL; 198 | if (theta < 1. || theta >= 40.) { 199 | for (i = 0; i < num_samples; i++) 200 | if (mehcached_zipf_next(&state) == 0) 201 | num_key0++; 202 | } 203 | 204 | printf("theta = %lf; using pow(): %.10lf", theta, 1. / zetan); 205 | if (theta < 1. || theta >= 40.) 
206 | printf(", using approx-pow(): %.10lf", 207 | (double)num_key0 / (double)num_samples); 208 | printf("\n"); 209 | } 210 | 211 | #endif 212 | -------------------------------------------------------------------------------- /include/IndexCache.h: -------------------------------------------------------------------------------- 1 | #if !defined(_INDEX_CACHE_H_) 2 | #define _INDEX_CACHE_H_ 3 | 4 | #include "CacheEntry.h" 5 | #include "HugePageAlloc.h" 6 | #include "Timer.h" 7 | #include "WRLock.h" 8 | #include "third_party/inlineskiplist.h" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | extern bool enter_debug; 15 | 16 | using CacheSkipList = InlineSkipList; 17 | 18 | class IndexCache { 19 | 20 | public: 21 | IndexCache(int cache_size); 22 | 23 | bool add_to_cache(InternalPage *page); 24 | const CacheEntry *search_from_cache(const Key &k, GlobalAddress *addr, 25 | bool is_leader = false); 26 | 27 | void search_range_from_cache(const Key &from, const Key &to, 28 | std::vector &result); 29 | 30 | bool add_entry(const Key &from, const Key &to, InternalPage *ptr); 31 | const CacheEntry *find_entry(const Key &k); 32 | const CacheEntry *find_entry(const Key &from, const Key &to); 33 | 34 | bool invalidate(const CacheEntry *entry); 35 | 36 | const CacheEntry *get_a_random_entry(uint64_t &freq); 37 | 38 | void statistics(); 39 | 40 | void bench(); 41 | 42 | private: 43 | uint64_t cache_size; // MB; 44 | std::atomic free_page_cnt; 45 | std::atomic skiplist_node_cnt; 46 | int64_t all_page_cnt; 47 | 48 | std::queue> delay_free_list; 49 | WRLock free_lock; 50 | 51 | // SkipList 52 | CacheSkipList *skiplist; 53 | CacheEntryComparator cmp; 54 | Allocator alloc; 55 | 56 | void evict_one(); 57 | }; 58 | 59 | inline IndexCache::IndexCache(int cache_size) : cache_size(cache_size) { 60 | skiplist = new CacheSkipList(cmp, &alloc, 21); 61 | uint64_t memory_size = define::MB * cache_size; 62 | 63 | all_page_cnt = memory_size / sizeof(InternalPage); 64 | 
free_page_cnt.store(all_page_cnt); 65 | skiplist_node_cnt.store(0); 66 | } 67 | 68 | // [from, to) 69 | inline bool IndexCache::add_entry(const Key &from, const Key &to, 70 | InternalPage *ptr) { 71 | 72 | // TODO memory leak 73 | auto buf = skiplist->AllocateKey(sizeof(CacheEntry)); 74 | auto &e = *(CacheEntry *)buf; 75 | e.from = from; 76 | e.to = to - 1; // !IMPORTANT; 77 | e.ptr = ptr; 78 | 79 | return skiplist->InsertConcurrently(buf); 80 | } 81 | 82 | inline const CacheEntry *IndexCache::find_entry(const Key &from, 83 | const Key &to) { 84 | CacheSkipList::Iterator iter(skiplist); 85 | 86 | CacheEntry e; 87 | e.from = from; 88 | e.to = to - 1; 89 | iter.Seek((char *)&e); 90 | if (iter.Valid()) { 91 | auto val = (const CacheEntry *)iter.key(); 92 | return val; 93 | } else { 94 | return nullptr; 95 | } 96 | } 97 | 98 | inline const CacheEntry *IndexCache::find_entry(const Key &k) { 99 | return find_entry(k, k + 1); 100 | } 101 | 102 | inline bool IndexCache::add_to_cache(InternalPage *page) { 103 | auto new_page = (InternalPage *)malloc(kInternalPageSize); 104 | memcpy(new_page, page, kInternalPageSize); 105 | new_page->index_cache_freq = 0; 106 | 107 | if (this->add_entry(page->hdr.lowest, page->hdr.highest, new_page)) { 108 | skiplist_node_cnt.fetch_add(1); 109 | auto v = free_page_cnt.fetch_add(-1); 110 | if (v <= 0) { 111 | evict_one(); 112 | } 113 | 114 | return true; 115 | } else { // conflicted 116 | auto e = this->find_entry(page->hdr.lowest, page->hdr.highest); 117 | if (e && e->from == page->hdr.lowest && e->to == page->hdr.highest - 1) { 118 | auto ptr = e->ptr; 119 | if (ptr == nullptr && 120 | __sync_bool_compare_and_swap(&(e->ptr), 0ull, new_page)) { 121 | auto v = free_page_cnt.fetch_add(-1); 122 | if (v <= 0) { 123 | evict_one(); 124 | } 125 | return true; 126 | } 127 | } 128 | 129 | free(new_page); 130 | return false; 131 | } 132 | } 133 | 134 | inline const CacheEntry *IndexCache::search_from_cache(const Key &k, 135 | GlobalAddress *addr, 136 
| bool is_leader) { 137 | // notice: please ensure the thread 0 can make progress 138 | if (is_leader && 139 | !delay_free_list.empty()) { // try to free a page in the delay-free-list 140 | auto p = delay_free_list.front(); 141 | if (asm_rdtsc() - p.second > 3000ull * 10) { 142 | free(p.first); 143 | free_page_cnt.fetch_add(1); 144 | 145 | free_lock.wLock(); 146 | delay_free_list.pop(); 147 | free_lock.wUnlock(); 148 | } 149 | } 150 | 151 | auto entry = find_entry(k); 152 | 153 | InternalPage *page = entry ? entry->ptr : nullptr; 154 | 155 | if (page && entry->from <= k && entry->to >= k) { 156 | 157 | page->index_cache_freq++; 158 | 159 | auto cnt = page->hdr.last_index + 1; 160 | if (k < page->records[0].key) { 161 | *addr = page->hdr.leftmost_ptr; 162 | } else { 163 | 164 | bool find = false; 165 | for (int i = 1; i < cnt; ++i) { 166 | if (k < page->records[i].key) { 167 | find = true; 168 | *addr = page->records[i - 1].ptr; 169 | break; 170 | } 171 | } 172 | if (!find) { 173 | *addr = page->records[cnt - 1].ptr; 174 | } 175 | } 176 | 177 | compiler_barrier(); 178 | if (entry->ptr) { // check if it is freed. 
179 | return entry; 180 | } 181 | } 182 | 183 | return nullptr; 184 | } 185 | 186 | inline void 187 | IndexCache::search_range_from_cache(const Key &from, const Key &to, 188 | std::vector &result) { 189 | CacheSkipList::Iterator iter(skiplist); 190 | 191 | result.clear(); 192 | CacheEntry e; 193 | e.from = from; 194 | e.to = from; 195 | iter.Seek((char *)&e); 196 | 197 | while (iter.Valid()) { 198 | auto val = (const CacheEntry *)iter.key(); 199 | if (val->ptr) { 200 | if (val->from > to) { 201 | return; 202 | } 203 | result.push_back(val->ptr); 204 | } 205 | iter.Next(); 206 | } 207 | } 208 | 209 | inline bool IndexCache::invalidate(const CacheEntry *entry) { 210 | auto ptr = entry->ptr; 211 | 212 | if (ptr == nullptr) { 213 | return false; 214 | } 215 | 216 | if (__sync_bool_compare_and_swap(&(entry->ptr), ptr, 0)) { 217 | 218 | free_lock.wLock(); 219 | delay_free_list.push(std::make_pair(ptr, asm_rdtsc())); 220 | free_lock.wUnlock(); 221 | return true; 222 | } 223 | 224 | return false; 225 | } 226 | 227 | inline const CacheEntry *IndexCache::get_a_random_entry(uint64_t &freq) { 228 | uint32_t seed = asm_rdtsc(); 229 | GlobalAddress tmp_addr; 230 | retry: 231 | auto k = rand_r(&seed) % (1000ull * define::MB); 232 | auto e = this->search_from_cache(k, &tmp_addr); 233 | if (!e) { 234 | goto retry; 235 | } 236 | auto ptr = e->ptr; 237 | if (!ptr) { 238 | goto retry; 239 | } 240 | 241 | freq = ptr->index_cache_freq; 242 | if (e->ptr != ptr) { 243 | goto retry; 244 | } 245 | return e; 246 | } 247 | 248 | inline void IndexCache::evict_one() { 249 | 250 | uint64_t freq1, freq2; 251 | auto e1 = get_a_random_entry(freq1); 252 | auto e2 = get_a_random_entry(freq2); 253 | 254 | if (freq1 < freq2) { 255 | invalidate(e1); 256 | } else { 257 | invalidate(e2); 258 | } 259 | } 260 | 261 | inline void IndexCache::statistics() { 262 | printf("[skiplist node: %ld] [page cache: %ld]\n", skiplist_node_cnt.load(), 263 | all_page_cnt - free_page_cnt.load()); 264 | } 265 | 266 | inline 
void IndexCache::bench() { 267 | 268 | Timer t; 269 | t.begin(); 270 | const int loop = 100000; 271 | 272 | for (int i = 0; i < loop; ++i) { 273 | uint64_t r = rand() % (5 * define::MB); 274 | this->find_entry(r); 275 | } 276 | 277 | t.end_print(loop); 278 | } 279 | 280 | #endif // _INDEX_CACHE_H_ 281 | -------------------------------------------------------------------------------- /src/rdma/Resource.cpp: -------------------------------------------------------------------------------- 1 | #include "Rdma.h" 2 | 3 | bool createContext(RdmaContext *context, uint8_t port, int gidIndex, 4 | uint8_t devIndex) { 5 | 6 | ibv_device *dev = NULL; 7 | ibv_context *ctx = NULL; 8 | ibv_pd *pd = NULL; 9 | ibv_port_attr portAttr; 10 | 11 | // get device names in the system 12 | int devicesNum; 13 | struct ibv_device **deviceList = ibv_get_device_list(&devicesNum); 14 | if (!deviceList) { 15 | Debug::notifyError("failed to get IB devices list"); 16 | goto CreateResourcesExit; 17 | } 18 | 19 | // if there isn't any IB device in host 20 | if (!devicesNum) { 21 | Debug::notifyInfo("found %d device(s)", devicesNum); 22 | goto CreateResourcesExit; 23 | } 24 | // Debug::notifyInfo("Open IB Device"); 25 | 26 | for (int i = 0; i < devicesNum; ++i) { 27 | // printf("Device %d: %s\n", i, ibv_get_device_name(deviceList[i])); 28 | if (ibv_get_device_name(deviceList[i])[5] == '0') { 29 | devIndex = i; 30 | break; 31 | } 32 | } 33 | 34 | if (devIndex >= devicesNum) { 35 | Debug::notifyError("ib device wasn't found"); 36 | goto CreateResourcesExit; 37 | } 38 | 39 | dev = deviceList[devIndex]; 40 | // printf("I open %s :)\n", ibv_get_device_name(dev)); 41 | 42 | // get device handle 43 | ctx = ibv_open_device(dev); 44 | if (!ctx) { 45 | Debug::notifyError("failed to open device"); 46 | goto CreateResourcesExit; 47 | } 48 | /* We are now done with device list, free it */ 49 | ibv_free_device_list(deviceList); 50 | deviceList = NULL; 51 | 52 | // query port properties 53 | if (ibv_query_port(ctx, 
port, &portAttr)) { 54 | Debug::notifyError("ibv_query_port failed"); 55 | goto CreateResourcesExit; 56 | } 57 | 58 | // allocate Protection Domain 59 | // Debug::notifyInfo("Allocate Protection Domain"); 60 | pd = ibv_alloc_pd(ctx); 61 | if (!pd) { 62 | Debug::notifyError("ibv_alloc_pd failed"); 63 | goto CreateResourcesExit; 64 | } 65 | 66 | if (ibv_query_gid(ctx, port, gidIndex, &context->gid)) { 67 | Debug::notifyError("could not get gid for port: %d, gidIndex: %d", port, 68 | gidIndex); 69 | goto CreateResourcesExit; 70 | } 71 | 72 | // Success :) 73 | context->devIndex = devIndex; 74 | context->gidIndex = gidIndex; 75 | context->port = port; 76 | context->ctx = ctx; 77 | context->pd = pd; 78 | context->lid = portAttr.lid; 79 | 80 | // check device memory support 81 | if (kMaxDeviceMemorySize == 0) { 82 | checkDMSupported(ctx); 83 | } 84 | 85 | return true; 86 | 87 | /* Error encountered, cleanup */ 88 | CreateResourcesExit: 89 | Debug::notifyError("Error Encountered, Cleanup ..."); 90 | 91 | if (pd) { 92 | ibv_dealloc_pd(pd); 93 | pd = NULL; 94 | } 95 | if (ctx) { 96 | ibv_close_device(ctx); 97 | ctx = NULL; 98 | } 99 | if (deviceList) { 100 | ibv_free_device_list(deviceList); 101 | deviceList = NULL; 102 | } 103 | 104 | return false; 105 | } 106 | 107 | bool destoryContext(RdmaContext *context) { 108 | bool rc = true; 109 | if (context->pd) { 110 | if (ibv_dealloc_pd(context->pd)) { 111 | Debug::notifyError("Failed to deallocate PD"); 112 | rc = false; 113 | } 114 | } 115 | if (context->ctx) { 116 | if (ibv_close_device(context->ctx)) { 117 | Debug::notifyError("failed to close device context"); 118 | rc = false; 119 | } 120 | } 121 | 122 | return rc; 123 | } 124 | 125 | ibv_mr *createMemoryRegion(uint64_t mm, uint64_t mmSize, RdmaContext *ctx) { 126 | 127 | ibv_mr *mr = NULL; 128 | mr = ibv_reg_mr(ctx->pd, (void *)mm, mmSize, 129 | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | 130 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC); 131 | 132 | if 
(!mr) { 133 | Debug::notifyError("Memory registration failed"); 134 | } 135 | 136 | return mr; 137 | } 138 | 139 | ibv_mr *createMemoryRegionOnChip(uint64_t mm, uint64_t mmSize, 140 | RdmaContext *ctx) { 141 | 142 | /* Device memory allocation request */ 143 | struct ibv_exp_alloc_dm_attr dm_attr; 144 | memset(&dm_attr, 0, sizeof(dm_attr)); 145 | dm_attr.length = mmSize; 146 | struct ibv_exp_dm *dm = ibv_exp_alloc_dm(ctx->ctx, &dm_attr); 147 | if (!dm) { 148 | Debug::notifyError("Allocate on-chip memory failed"); 149 | return nullptr; 150 | } 151 | 152 | /* Device memory registration as memory region */ 153 | struct ibv_exp_reg_mr_in mr_in; 154 | memset(&mr_in, 0, sizeof(mr_in)); 155 | mr_in.pd = ctx->pd, mr_in.addr = (void *)mm, mr_in.length = mmSize, 156 | mr_in.exp_access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | 157 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC, 158 | mr_in.create_flags = 0; 159 | mr_in.dm = dm; 160 | mr_in.comp_mask = IBV_EXP_REG_MR_DM; 161 | struct ibv_mr *mr = ibv_exp_reg_mr(&mr_in); 162 | if (!mr) { 163 | Debug::notifyError("Memory registration failed"); 164 | return nullptr; 165 | } 166 | 167 | // init zero 168 | char *buffer = (char *)malloc(mmSize); 169 | memset(buffer, 0, mmSize); 170 | 171 | struct ibv_exp_memcpy_dm_attr cpy_attr; 172 | memset(&cpy_attr, 0, sizeof(cpy_attr)); 173 | cpy_attr.memcpy_dir = IBV_EXP_DM_CPY_TO_DEVICE; 174 | cpy_attr.host_addr = (void *)buffer; 175 | cpy_attr.length = mmSize; 176 | cpy_attr.dm_offset = 0; 177 | ibv_exp_memcpy_dm(dm, &cpy_attr); 178 | 179 | free(buffer); 180 | 181 | return mr; 182 | } 183 | 184 | bool createQueuePair(ibv_qp **qp, ibv_qp_type mode, ibv_cq *send_cq, 185 | ibv_cq *recv_cq, RdmaContext *context, 186 | uint32_t qpsMaxDepth, uint32_t maxInlineData) { 187 | 188 | struct ibv_exp_qp_init_attr attr; 189 | memset(&attr, 0, sizeof(attr)); 190 | 191 | attr.qp_type = mode; 192 | attr.sq_sig_all = 0; 193 | attr.send_cq = send_cq; 194 | attr.recv_cq = recv_cq; 195 | attr.pd = 
context->pd; 196 | 197 | if (mode == IBV_QPT_RC) { 198 | attr.comp_mask = IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | 199 | IBV_EXP_QP_INIT_ATTR_PD | IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG; 200 | attr.max_atomic_arg = 32; 201 | } else { 202 | attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD; 203 | } 204 | 205 | attr.cap.max_send_wr = qpsMaxDepth; 206 | attr.cap.max_recv_wr = qpsMaxDepth; 207 | attr.cap.max_send_sge = 1; 208 | attr.cap.max_recv_sge = 1; 209 | attr.cap.max_inline_data = maxInlineData; 210 | 211 | *qp = ibv_exp_create_qp(context->ctx, &attr); 212 | if (!(*qp)) { 213 | Debug::notifyError("Failed to create QP"); 214 | return false; 215 | } 216 | 217 | // Debug::notifyInfo("Create Queue Pair with Num = %d", (*qp)->qp_num); 218 | 219 | return true; 220 | } 221 | 222 | bool createQueuePair(ibv_qp **qp, ibv_qp_type mode, ibv_cq *cq, 223 | RdmaContext *context, uint32_t qpsMaxDepth, 224 | uint32_t maxInlineData) { 225 | return createQueuePair(qp, mode, cq, cq, context, qpsMaxDepth, maxInlineData); 226 | } 227 | 228 | bool createDCTarget(ibv_exp_dct **dct, ibv_cq *cq, RdmaContext *context, 229 | uint32_t qpsMaxDepth, uint32_t maxInlineData) { 230 | 231 | // construct SRQ fot DC Target :) 232 | struct ibv_srq_init_attr attr; 233 | memset(&attr, 0, sizeof(attr)); 234 | attr.attr.max_wr = qpsMaxDepth; 235 | attr.attr.max_sge = 1; 236 | ibv_srq *srq = ibv_create_srq(context->pd, &attr); 237 | 238 | ibv_exp_dct_init_attr dAttr; 239 | memset(&dAttr, 0, sizeof(dAttr)); 240 | dAttr.pd = context->pd; 241 | dAttr.cq = cq; 242 | dAttr.srq = srq; 243 | dAttr.dc_key = DCT_ACCESS_KEY; 244 | dAttr.port = context->port; 245 | dAttr.access_flags = IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_READ | 246 | IBV_ACCESS_REMOTE_ATOMIC; 247 | dAttr.min_rnr_timer = 2; 248 | dAttr.tclass = 0; 249 | dAttr.flow_label = 0; 250 | dAttr.mtu = IBV_MTU_4096; 251 | dAttr.pkey_index = 0; 252 | dAttr.hop_limit = 1; 253 | dAttr.create_flags = 0; 254 | dAttr.inline_size = maxInlineData; 255 | 256 | *dct = 
ibv_exp_create_dct(context->ctx, &dAttr); 257 | if (dct == NULL) { 258 | Debug::notifyError("failed to create dc target"); 259 | return false; 260 | } 261 | 262 | return true; 263 | } 264 | 265 | void fillAhAttr(ibv_ah_attr *attr, uint32_t remoteLid, uint8_t *remoteGid, 266 | RdmaContext *context) { 267 | 268 | (void)remoteGid; 269 | 270 | memset(attr, 0, sizeof(ibv_ah_attr)); 271 | attr->dlid = remoteLid; 272 | attr->sl = 0; 273 | attr->src_path_bits = 0; 274 | attr->port_num = context->port; 275 | 276 | // attr->is_global = 0; 277 | 278 | // fill ah_attr with GRH 279 | attr->is_global = 1; 280 | memcpy(&attr->grh.dgid, remoteGid, 16); 281 | attr->grh.flow_label = 0; 282 | attr->grh.hop_limit = 1; 283 | attr->grh.sgid_index = context->gidIndex; 284 | attr->grh.traffic_class = 0; 285 | } 286 | -------------------------------------------------------------------------------- /include/DSM.h: -------------------------------------------------------------------------------- 1 | #ifndef __DSM_H__ 2 | #define __DSM_H__ 3 | 4 | #include 5 | 6 | #include "Cache.h" 7 | #include "Config.h" 8 | #include "Connection.h" 9 | #include "DSMKeeper.h" 10 | #include "GlobalAddress.h" 11 | #include "LocalAllocator.h" 12 | #include "RdmaBuffer.h" 13 | 14 | class DSMKeeper; 15 | class Directory; 16 | 17 | class DSM { 18 | 19 | public: 20 | // obtain netowrk resources for a thread 21 | void registerThread(); 22 | 23 | // clear the network resources for all threads 24 | void resetThread() { appID.store(0); } 25 | 26 | static DSM *getInstance(const DSMConfig &conf); 27 | 28 | uint16_t getMyNodeID() { return myNodeID; } 29 | uint16_t getMyThreadID() { return thread_id; } 30 | uint16_t getClusterSize() { return conf.machineNR; } 31 | uint64_t getThreadTag() { return thread_tag; } 32 | 33 | // RDMA operations 34 | // buffer is registered memory 35 | void read(char *buffer, GlobalAddress gaddr, size_t size, bool signal = true, 36 | CoroContext *ctx = nullptr); 37 | void read_sync(char *buffer, 
GlobalAddress gaddr, size_t size, 38 | CoroContext *ctx = nullptr); 39 | 40 | void write(const char *buffer, GlobalAddress gaddr, size_t size, 41 | bool signal = true, CoroContext *ctx = nullptr); 42 | void write_sync(const char *buffer, GlobalAddress gaddr, size_t size, 43 | CoroContext *ctx = nullptr); 44 | 45 | void write_batch(RdmaOpRegion *rs, int k, bool signal = true, 46 | CoroContext *ctx = nullptr); 47 | void write_batch_sync(RdmaOpRegion *rs, int k, CoroContext *ctx = nullptr); 48 | 49 | void write_faa(RdmaOpRegion &write_ror, RdmaOpRegion &faa_ror, 50 | uint64_t add_val, bool signal = true, 51 | CoroContext *ctx = nullptr); 52 | void write_faa_sync(RdmaOpRegion &write_ror, RdmaOpRegion &faa_ror, 53 | uint64_t add_val, CoroContext *ctx = nullptr); 54 | 55 | void write_cas(RdmaOpRegion &write_ror, RdmaOpRegion &cas_ror, uint64_t equal, 56 | uint64_t val, bool signal = true, CoroContext *ctx = nullptr); 57 | void write_cas_sync(RdmaOpRegion &write_ror, RdmaOpRegion &cas_ror, 58 | uint64_t equal, uint64_t val, CoroContext *ctx = nullptr); 59 | 60 | void cas(GlobalAddress gaddr, uint64_t equal, uint64_t val, 61 | uint64_t *rdma_buffer, bool signal = true, 62 | CoroContext *ctx = nullptr); 63 | bool cas_sync(GlobalAddress gaddr, uint64_t equal, uint64_t val, 64 | uint64_t *rdma_buffer, CoroContext *ctx = nullptr); 65 | 66 | void cas_read(RdmaOpRegion &cas_ror, RdmaOpRegion &read_ror, uint64_t equal, 67 | uint64_t val, bool signal = true, CoroContext *ctx = nullptr); 68 | bool cas_read_sync(RdmaOpRegion &cas_ror, RdmaOpRegion &read_ror, 69 | uint64_t equal, uint64_t val, CoroContext *ctx = nullptr); 70 | 71 | void cas_mask(GlobalAddress gaddr, uint64_t equal, uint64_t val, 72 | uint64_t *rdma_buffer, uint64_t mask = ~(0ull), 73 | bool signal = true); 74 | bool cas_mask_sync(GlobalAddress gaddr, uint64_t equal, uint64_t val, 75 | uint64_t *rdma_buffer, uint64_t mask = ~(0ull)); 76 | 77 | void faa_boundary(GlobalAddress gaddr, uint64_t add_val, 78 | uint64_t 
*rdma_buffer, uint64_t mask = 63, 79 | bool signal = true, CoroContext *ctx = nullptr); 80 | void faa_boundary_sync(GlobalAddress gaddr, uint64_t add_val, 81 | uint64_t *rdma_buffer, uint64_t mask = 63, 82 | CoroContext *ctx = nullptr); 83 | 84 | // for on-chip device memory 85 | void read_dm(char *buffer, GlobalAddress gaddr, size_t size, 86 | bool signal = true, CoroContext *ctx = nullptr); 87 | void read_dm_sync(char *buffer, GlobalAddress gaddr, size_t size, 88 | CoroContext *ctx = nullptr); 89 | 90 | void write_dm(const char *buffer, GlobalAddress gaddr, size_t size, 91 | bool signal = true, CoroContext *ctx = nullptr); 92 | void write_dm_sync(const char *buffer, GlobalAddress gaddr, size_t size, 93 | CoroContext *ctx = nullptr); 94 | 95 | void cas_dm(GlobalAddress gaddr, uint64_t equal, uint64_t val, 96 | uint64_t *rdma_buffer, bool signal = true, 97 | CoroContext *ctx = nullptr); 98 | bool cas_dm_sync(GlobalAddress gaddr, uint64_t equal, uint64_t val, 99 | uint64_t *rdma_buffer, CoroContext *ctx = nullptr); 100 | 101 | void cas_dm_mask(GlobalAddress gaddr, uint64_t equal, uint64_t val, 102 | uint64_t *rdma_buffer, uint64_t mask = ~(0ull), 103 | bool signal = true); 104 | bool cas_dm_mask_sync(GlobalAddress gaddr, uint64_t equal, uint64_t val, 105 | uint64_t *rdma_buffer, uint64_t mask = ~(0ull)); 106 | 107 | void faa_dm_boundary(GlobalAddress gaddr, uint64_t add_val, 108 | uint64_t *rdma_buffer, uint64_t mask = 63, 109 | bool signal = true, CoroContext *ctx = nullptr); 110 | void faa_dm_boundary_sync(GlobalAddress gaddr, uint64_t add_val, 111 | uint64_t *rdma_buffer, uint64_t mask = 63, 112 | CoroContext *ctx = nullptr); 113 | 114 | uint64_t poll_rdma_cq(int count = 1); 115 | bool poll_rdma_cq_once(uint64_t &wr_id); 116 | 117 | uint64_t sum(uint64_t value) { 118 | static uint64_t count = 0; 119 | return keeper->sum(std::string("sum-") + std::to_string(count++), value); 120 | } 121 | 122 | // Memcached operations for sync 123 | size_t Put(uint64_t key, const 
void *value, size_t count) { 124 | 125 | std::string k = std::string("gam-") + std::to_string(key); 126 | keeper->memSet(k.c_str(), k.size(), (char *)value, count); 127 | return count; 128 | } 129 | 130 | size_t Get(uint64_t key, void *value) { 131 | 132 | std::string k = std::string("gam-") + std::to_string(key); 133 | size_t size; 134 | char *ret = keeper->memGet(k.c_str(), k.size(), &size); 135 | memcpy(value, ret, size); 136 | 137 | return size; 138 | } 139 | 140 | private: 141 | DSM(const DSMConfig &conf); 142 | ~DSM(); 143 | 144 | void initRDMAConnection(); 145 | void fill_keys_dest(RdmaOpRegion &ror, GlobalAddress addr, bool is_chip); 146 | 147 | DSMConfig conf; 148 | std::atomic_int appID; 149 | Cache cache; 150 | 151 | static thread_local int thread_id; 152 | static thread_local ThreadConnection *iCon; 153 | static thread_local char *rdma_buffer; 154 | static thread_local LocalAllocator local_allocator; 155 | static thread_local RdmaBuffer rbuf[define::kMaxCoro]; 156 | static thread_local uint64_t thread_tag; 157 | 158 | uint64_t baseAddr; 159 | uint32_t myNodeID; 160 | 161 | RemoteConnection *remoteInfo; 162 | ThreadConnection *thCon[MAX_APP_THREAD]; 163 | DirectoryConnection *dirCon[NR_DIRECTORY]; 164 | DSMKeeper *keeper; 165 | 166 | Directory *dirAgent[NR_DIRECTORY]; 167 | 168 | public: 169 | bool is_register() { return thread_id != -1; } 170 | void barrier(const std::string &ss) { keeper->barrier(ss); } 171 | 172 | char *get_rdma_buffer() { return rdma_buffer; } 173 | RdmaBuffer &get_rbuf(int coro_id) { return rbuf[coro_id]; } 174 | 175 | GlobalAddress alloc(size_t size); 176 | void free(GlobalAddress addr); 177 | 178 | void rpc_call_dir(const RawMessage &m, uint16_t node_id, 179 | uint16_t dir_id = 0) { 180 | 181 | auto buffer = (RawMessage *)iCon->message->getSendPool(); 182 | 183 | memcpy(buffer, &m, sizeof(RawMessage)); 184 | buffer->node_id = myNodeID; 185 | buffer->app_id = thread_id; 186 | 187 | iCon->sendMessage2Dir(buffer, node_id, dir_id); 
188 | } 189 | 190 | RawMessage *rpc_wait() { 191 | ibv_wc wc; 192 | 193 | pollWithCQ(iCon->rpc_cq, 1, &wc); 194 | return (RawMessage *)iCon->message->getMessage(); 195 | } 196 | }; 197 | 198 | inline GlobalAddress DSM::alloc(size_t size) { 199 | 200 | thread_local int next_target_node = 201 | (getMyThreadID() + getMyNodeID()) % conf.machineNR; 202 | thread_local int next_target_dir_id = 203 | (getMyThreadID() + getMyNodeID()) % NR_DIRECTORY; 204 | 205 | bool need_chunk = false; 206 | auto addr = local_allocator.malloc(size, need_chunk); 207 | if (need_chunk) { 208 | RawMessage m; 209 | m.type = RpcType::MALLOC; 210 | 211 | this->rpc_call_dir(m, next_target_node, next_target_dir_id); 212 | local_allocator.set_chunck(rpc_wait()->addr); 213 | 214 | if (++next_target_dir_id == NR_DIRECTORY) { 215 | next_target_node = (next_target_node + 1) % conf.machineNR; 216 | next_target_dir_id = 0; 217 | } 218 | 219 | // retry 220 | addr = local_allocator.malloc(size, need_chunk); 221 | } 222 | 223 | return addr; 224 | } 225 | 226 | inline void DSM::free(GlobalAddress addr) { local_allocator.free(addr); } 227 | #endif /* __DSM_H__ */ 228 | -------------------------------------------------------------------------------- /include/Tree.h: -------------------------------------------------------------------------------- 1 | #if !defined(_TREE_H_) 2 | #define _TREE_H_ 3 | 4 | #include "DSM.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | class IndexCache; 11 | 12 | struct LocalLockNode { 13 | std::atomic ticket_lock; 14 | bool hand_over; 15 | uint8_t hand_time; 16 | }; 17 | 18 | struct Request { 19 | bool is_search; 20 | Key k; 21 | Value v; 22 | }; 23 | 24 | class RequstGen { 25 | public: 26 | RequstGen() = default; 27 | virtual Request next() { return Request{}; } 28 | }; 29 | 30 | using CoroFunc = std::function; 31 | 32 | struct SearchResult { 33 | bool is_leaf; 34 | uint8_t level; 35 | GlobalAddress slibing; 36 | GlobalAddress next_level; 37 | Value val; 38 | }; 39 | 
40 | class InternalPage; 41 | class LeafPage; 42 | class Tree { 43 | 44 | public: 45 | Tree(DSM *dsm, uint16_t tree_id = 0); 46 | 47 | void insert(const Key &k, const Value &v, CoroContext *cxt = nullptr, 48 | int coro_id = 0); 49 | bool search(const Key &k, Value &v, CoroContext *cxt = nullptr, 50 | int coro_id = 0); 51 | void del(const Key &k, CoroContext *cxt = nullptr, int coro_id = 0); 52 | 53 | uint64_t range_query(const Key &from, const Key &to, Value *buffer, 54 | CoroContext *cxt = nullptr, int coro_id = 0); 55 | 56 | void print_and_check_tree(CoroContext *cxt = nullptr, int coro_id = 0); 57 | 58 | void run_coroutine(CoroFunc func, int id, int coro_cnt); 59 | 60 | void lock_bench(const Key &k, CoroContext *cxt = nullptr, int coro_id = 0); 61 | 62 | void index_cache_statistics(); 63 | void clear_statistics(); 64 | 65 | private: 66 | DSM *dsm; 67 | uint64_t tree_id; 68 | GlobalAddress root_ptr_ptr; // the address which stores root pointer; 69 | 70 | // static thread_local int coro_id; 71 | static thread_local CoroCall worker[define::kMaxCoro]; 72 | static thread_local CoroCall master; 73 | 74 | LocalLockNode *local_locks[MAX_MACHINE]; 75 | 76 | IndexCache *index_cache; 77 | 78 | void print_verbose(); 79 | 80 | void before_operation(CoroContext *cxt, int coro_id); 81 | 82 | GlobalAddress get_root_ptr_ptr(); 83 | GlobalAddress get_root_ptr(CoroContext *cxt, int coro_id); 84 | 85 | void coro_worker(CoroYield &yield, RequstGen *gen, int coro_id); 86 | void coro_master(CoroYield &yield, int coro_cnt); 87 | 88 | void broadcast_new_root(GlobalAddress new_root_addr, int root_level); 89 | bool update_new_root(GlobalAddress left, const Key &k, GlobalAddress right, 90 | int level, GlobalAddress old_root, CoroContext *cxt, 91 | int coro_id); 92 | 93 | void insert_internal(const Key &k, GlobalAddress v, CoroContext *cxt, 94 | int coro_id, int level); 95 | 96 | bool try_lock_addr(GlobalAddress lock_addr, uint64_t tag, uint64_t *buf, 97 | CoroContext *cxt, int coro_id); 98 
| void unlock_addr(GlobalAddress lock_addr, uint64_t tag, uint64_t *buf, 99 | CoroContext *cxt, int coro_id, bool async); 100 | void write_page_and_unlock(char *page_buffer, GlobalAddress page_addr, 101 | int page_size, uint64_t *cas_buffer, 102 | GlobalAddress lock_addr, uint64_t tag, 103 | CoroContext *cxt, int coro_id, bool async); 104 | void lock_and_read_page(char *page_buffer, GlobalAddress page_addr, 105 | int page_size, uint64_t *cas_buffer, 106 | GlobalAddress lock_addr, uint64_t tag, 107 | CoroContext *cxt, int coro_id); 108 | 109 | bool page_search(GlobalAddress page_addr, const Key &k, SearchResult &result, 110 | CoroContext *cxt, int coro_id, bool from_cache = false); 111 | void internal_page_search(InternalPage *page, const Key &k, 112 | SearchResult &result); 113 | void leaf_page_search(LeafPage *page, const Key &k, SearchResult &result); 114 | 115 | void internal_page_store(GlobalAddress page_addr, const Key &k, 116 | GlobalAddress value, GlobalAddress root, int level, 117 | CoroContext *cxt, int coro_id); 118 | bool leaf_page_store(GlobalAddress page_addr, const Key &k, const Value &v, 119 | GlobalAddress root, int level, CoroContext *cxt, 120 | int coro_id, bool from_cache = false); 121 | bool leaf_page_del(GlobalAddress page_addr, const Key &k, int level, 122 | CoroContext *cxt, int coro_id, bool from_cache = false); 123 | 124 | bool acquire_local_lock(GlobalAddress lock_addr, CoroContext *cxt, 125 | int coro_id); 126 | bool can_hand_over(GlobalAddress lock_addr); 127 | void releases_local_lock(GlobalAddress lock_addr); 128 | }; 129 | 130 | class Header { 131 | private: 132 | GlobalAddress leftmost_ptr; 133 | GlobalAddress sibling_ptr; 134 | uint8_t level; 135 | int16_t last_index; 136 | Key lowest; 137 | Key highest; 138 | 139 | friend class InternalPage; 140 | friend class LeafPage; 141 | friend class Tree; 142 | friend class IndexCache; 143 | 144 | public: 145 | Header() { 146 | leftmost_ptr = GlobalAddress::Null(); 147 | sibling_ptr = 
GlobalAddress::Null(); 148 | last_index = -1; 149 | lowest = kKeyMin; 150 | highest = kKeyMax; 151 | } 152 | 153 | void debug() const { 154 | std::cout << "leftmost=" << leftmost_ptr << ", " 155 | << "sibling=" << sibling_ptr << ", " 156 | << "level=" << (int)level << "," 157 | << "cnt=" << last_index + 1 << "," 158 | << "range=[" << lowest << " - " << highest << "]"; 159 | } 160 | } __attribute__((packed)); 161 | ; 162 | 163 | class InternalEntry { 164 | public: 165 | Key key; 166 | GlobalAddress ptr; 167 | 168 | InternalEntry() { 169 | ptr = GlobalAddress::Null(); 170 | key = 0; 171 | } 172 | } __attribute__((packed)); 173 | 174 | class LeafEntry { 175 | public: 176 | uint8_t f_version : 4; 177 | Key key; 178 | Value value; 179 | uint8_t r_version : 4; 180 | 181 | LeafEntry() { 182 | f_version = 0; 183 | r_version = 0; 184 | value = kValueNull; 185 | key = 0; 186 | } 187 | } __attribute__((packed)); 188 | 189 | constexpr int kInternalCardinality = (kInternalPageSize - sizeof(Header) - 190 | sizeof(uint8_t) * 2 - sizeof(uint64_t)) / 191 | sizeof(InternalEntry); 192 | 193 | constexpr int kLeafCardinality = 194 | (kLeafPageSize - sizeof(Header) - sizeof(uint8_t) * 2 - sizeof(uint64_t)) / 195 | sizeof(LeafEntry); 196 | 197 | class InternalPage { 198 | private: 199 | union { 200 | uint32_t crc; 201 | uint64_t embedding_lock; 202 | uint64_t index_cache_freq; 203 | }; 204 | 205 | uint8_t front_version; 206 | Header hdr; 207 | InternalEntry records[kInternalCardinality]; 208 | 209 | // uint8_t padding[3]; 210 | uint8_t rear_version; 211 | 212 | friend class Tree; 213 | friend class IndexCache; 214 | 215 | public: 216 | // this is called when tree grows 217 | InternalPage(GlobalAddress left, const Key &key, GlobalAddress right, 218 | uint32_t level = 0) { 219 | hdr.leftmost_ptr = left; 220 | hdr.level = level; 221 | records[0].key = key; 222 | records[0].ptr = right; 223 | records[1].ptr = GlobalAddress::Null(); 224 | 225 | hdr.last_index = 0; 226 | 227 | front_version = 
0; 228 | rear_version = 0; 229 | } 230 | 231 | InternalPage(uint32_t level = 0) { 232 | hdr.level = level; 233 | records[0].ptr = GlobalAddress::Null(); 234 | 235 | front_version = 0; 236 | rear_version = 0; 237 | 238 | embedding_lock = 0; 239 | } 240 | 241 | void set_consistent() { 242 | front_version++; 243 | rear_version = front_version; 244 | #ifdef CONFIG_ENABLE_CRC 245 | this->crc = 246 | CityHash32((char *)&front_version, (&rear_version) - (&front_version)); 247 | #endif 248 | } 249 | 250 | bool check_consistent() const { 251 | 252 | bool succ = true; 253 | #ifdef CONFIG_ENABLE_CRC 254 | auto cal_crc = 255 | CityHash32((char *)&front_version, (&rear_version) - (&front_version)); 256 | succ = cal_crc == this->crc; 257 | #endif 258 | succ = succ && (rear_version == front_version); 259 | 260 | return succ; 261 | } 262 | 263 | void debug() const { 264 | std::cout << "InternalPage@ "; 265 | hdr.debug(); 266 | std::cout << "version: [" << (int)front_version << ", " << (int)rear_version 267 | << "]" << std::endl; 268 | } 269 | 270 | void verbose_debug() const { 271 | this->debug(); 272 | for (int i = 0; i < this->hdr.last_index + 1; ++i) { 273 | printf("[%lu %lu] ", this->records[i].key, this->records[i].ptr.val); 274 | } 275 | printf("\n"); 276 | } 277 | 278 | } __attribute__((packed)); 279 | 280 | class LeafPage { 281 | private: 282 | union { 283 | uint32_t crc; 284 | uint64_t embedding_lock; 285 | }; 286 | uint8_t front_version; 287 | Header hdr; 288 | LeafEntry records[kLeafCardinality]; 289 | 290 | // uint8_t padding[1]; 291 | uint8_t rear_version; 292 | 293 | friend class Tree; 294 | 295 | public: 296 | LeafPage(uint32_t level = 0) { 297 | hdr.level = level; 298 | records[0].value = kValueNull; 299 | 300 | front_version = 0; 301 | rear_version = 0; 302 | 303 | embedding_lock = 0; 304 | } 305 | 306 | void set_consistent() { 307 | front_version++; 308 | rear_version = front_version; 309 | #ifdef CONFIG_ENABLE_CRC 310 | this->crc = 311 | CityHash32((char 
*)&front_version, (&rear_version) - (&front_version)); 312 | #endif 313 | } 314 | 315 | bool check_consistent() const { 316 | 317 | bool succ = true; 318 | #ifdef CONFIG_ENABLE_CRC 319 | auto cal_crc = 320 | CityHash32((char *)&front_version, (&rear_version) - (&front_version)); 321 | succ = cal_crc == this->crc; 322 | #endif 323 | 324 | succ = succ && (rear_version == front_version); 325 | 326 | return succ; 327 | } 328 | 329 | void debug() const { 330 | std::cout << "LeafPage@ "; 331 | hdr.debug(); 332 | std::cout << "version: [" << (int)front_version << ", " << (int)rear_version 333 | << "]" << std::endl; 334 | } 335 | 336 | } __attribute__((packed)); 337 | 338 | #endif // _TREE_H_ 339 | -------------------------------------------------------------------------------- /src/rdma/Operation.cpp: -------------------------------------------------------------------------------- 1 | #include "Rdma.h" 2 | 3 | int pollWithCQ(ibv_cq *cq, int pollNumber, struct ibv_wc *wc) { 4 | int count = 0; 5 | 6 | do { 7 | 8 | int new_count = ibv_poll_cq(cq, 1, wc); 9 | count += new_count; 10 | 11 | } while (count < pollNumber); 12 | 13 | if (count < 0) { 14 | Debug::notifyError("Poll Completion failed."); 15 | sleep(5); 16 | return -1; 17 | } 18 | 19 | if (wc->status != IBV_WC_SUCCESS) { 20 | Debug::notifyError("Failed status %s (%d) for wr_id %d", 21 | ibv_wc_status_str(wc->status), wc->status, 22 | (int)wc->wr_id); 23 | sleep(5); 24 | return -1; 25 | } 26 | 27 | return count; 28 | } 29 | 30 | int pollOnce(ibv_cq *cq, int pollNumber, struct ibv_wc *wc) { 31 | int count = ibv_poll_cq(cq, pollNumber, wc); 32 | if (count <= 0) { 33 | return 0; 34 | } 35 | if (wc->status != IBV_WC_SUCCESS) { 36 | Debug::notifyError("Failed status %s (%d) for wr_id %d", 37 | ibv_wc_status_str(wc->status), wc->status, 38 | (int)wc->wr_id); 39 | return -1; 40 | } else { 41 | return count; 42 | } 43 | } 44 | 45 | static inline void fillSgeWr(ibv_sge &sg, ibv_send_wr &wr, uint64_t source, 46 | uint64_t size, 
uint32_t lkey) { 47 | memset(&sg, 0, sizeof(sg)); 48 | sg.addr = (uintptr_t)source; 49 | sg.length = size; 50 | sg.lkey = lkey; 51 | 52 | memset(&wr, 0, sizeof(wr)); 53 | wr.wr_id = 0; 54 | wr.sg_list = &sg; 55 | wr.num_sge = 1; 56 | } 57 | 58 | static inline void fillSgeWr(ibv_sge &sg, ibv_recv_wr &wr, uint64_t source, 59 | uint64_t size, uint32_t lkey) { 60 | memset(&sg, 0, sizeof(sg)); 61 | sg.addr = (uintptr_t)source; 62 | sg.length = size; 63 | sg.lkey = lkey; 64 | 65 | memset(&wr, 0, sizeof(wr)); 66 | wr.wr_id = 0; 67 | wr.sg_list = &sg; 68 | wr.num_sge = 1; 69 | } 70 | 71 | static inline void fillSgeWr(ibv_sge &sg, ibv_exp_send_wr &wr, uint64_t source, 72 | uint64_t size, uint32_t lkey) { 73 | memset(&sg, 0, sizeof(sg)); 74 | sg.addr = (uintptr_t)source; 75 | sg.length = size; 76 | sg.lkey = lkey; 77 | 78 | memset(&wr, 0, sizeof(wr)); 79 | wr.wr_id = 0; 80 | wr.sg_list = &sg; 81 | wr.num_sge = 1; 82 | } 83 | 84 | // for UD and DC 85 | bool rdmaSend(ibv_qp *qp, uint64_t source, uint64_t size, uint32_t lkey, 86 | ibv_ah *ah, uint32_t remoteQPN /* remote dct_number */, 87 | bool isSignaled) { 88 | 89 | struct ibv_sge sg; 90 | struct ibv_send_wr wr; 91 | struct ibv_send_wr *wrBad; 92 | 93 | fillSgeWr(sg, wr, source, size, lkey); 94 | 95 | wr.opcode = IBV_WR_SEND; 96 | 97 | wr.wr.ud.ah = ah; 98 | wr.wr.ud.remote_qpn = remoteQPN; 99 | wr.wr.ud.remote_qkey = UD_PKEY; 100 | 101 | if (isSignaled) 102 | wr.send_flags = IBV_SEND_SIGNALED; 103 | if (ibv_post_send(qp, &wr, &wrBad)) { 104 | Debug::notifyError("Send with RDMA_SEND failed."); 105 | return false; 106 | } 107 | return true; 108 | } 109 | 110 | // for RC & UC 111 | bool rdmaSend(ibv_qp *qp, uint64_t source, uint64_t size, uint32_t lkey, 112 | int32_t imm) { 113 | 114 | struct ibv_sge sg; 115 | struct ibv_send_wr wr; 116 | struct ibv_send_wr *wrBad; 117 | 118 | fillSgeWr(sg, wr, source, size, lkey); 119 | 120 | if (imm != -1) { 121 | wr.imm_data = imm; 122 | wr.opcode = IBV_WR_SEND_WITH_IMM; 123 | } else { 124 
| wr.opcode = IBV_WR_SEND; 125 | } 126 | 127 | wr.send_flags = IBV_SEND_SIGNALED; 128 | if (ibv_post_send(qp, &wr, &wrBad)) { 129 | Debug::notifyError("Send with RDMA_SEND failed."); 130 | return false; 131 | } 132 | return true; 133 | } 134 | 135 | bool rdmaReceive(ibv_qp *qp, uint64_t source, uint64_t size, uint32_t lkey, 136 | uint64_t wr_id) { 137 | struct ibv_sge sg; 138 | struct ibv_recv_wr wr; 139 | struct ibv_recv_wr *wrBad; 140 | 141 | fillSgeWr(sg, wr, source, size, lkey); 142 | 143 | wr.wr_id = wr_id; 144 | 145 | if (ibv_post_recv(qp, &wr, &wrBad)) { 146 | Debug::notifyError("Receive with RDMA_RECV failed."); 147 | return false; 148 | } 149 | return true; 150 | } 151 | 152 | bool rdmaReceive(ibv_srq *srq, uint64_t source, uint64_t size, uint32_t lkey) { 153 | 154 | struct ibv_sge sg; 155 | struct ibv_recv_wr wr; 156 | struct ibv_recv_wr *wrBad; 157 | 158 | fillSgeWr(sg, wr, source, size, lkey); 159 | 160 | if (ibv_post_srq_recv(srq, &wr, &wrBad)) { 161 | Debug::notifyError("Receive with RDMA_RECV failed."); 162 | return false; 163 | } 164 | return true; 165 | } 166 | 167 | 168 | 169 | // for RC & UC 170 | bool rdmaRead(ibv_qp *qp, uint64_t source, uint64_t dest, uint64_t size, 171 | uint32_t lkey, uint32_t remoteRKey, bool signal, uint64_t wrID) { 172 | struct ibv_sge sg; 173 | struct ibv_send_wr wr; 174 | struct ibv_send_wr *wrBad; 175 | 176 | fillSgeWr(sg, wr, source, size, lkey); 177 | 178 | wr.opcode = IBV_WR_RDMA_READ; 179 | 180 | if (signal) { 181 | wr.send_flags = IBV_SEND_SIGNALED; 182 | } 183 | 184 | wr.wr.rdma.remote_addr = dest; 185 | wr.wr.rdma.rkey = remoteRKey; 186 | wr.wr_id = wrID; 187 | 188 | if (ibv_post_send(qp, &wr, &wrBad)) { 189 | Debug::notifyError("Send with RDMA_READ failed."); 190 | return false; 191 | } 192 | return true; 193 | } 194 | 195 | 196 | // for RC & UC 197 | bool rdmaWrite(ibv_qp *qp, uint64_t source, uint64_t dest, uint64_t size, 198 | uint32_t lkey, uint32_t remoteRKey, int32_t imm, bool isSignaled, 199 | uint64_t 
wrID) { 200 | 201 | struct ibv_sge sg; 202 | struct ibv_send_wr wr; 203 | struct ibv_send_wr *wrBad; 204 | 205 | fillSgeWr(sg, wr, source, size, lkey); 206 | 207 | if (imm == -1) { 208 | wr.opcode = IBV_WR_RDMA_WRITE; 209 | } else { 210 | wr.imm_data = imm; 211 | wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; 212 | } 213 | 214 | if (isSignaled) { 215 | wr.send_flags = IBV_SEND_SIGNALED; 216 | } 217 | 218 | wr.wr.rdma.remote_addr = dest; 219 | wr.wr.rdma.rkey = remoteRKey; 220 | wr.wr_id = wrID; 221 | 222 | if (ibv_post_send(qp, &wr, &wrBad) != 0) { 223 | Debug::notifyError("Send with RDMA_WRITE(WITH_IMM) failed."); 224 | sleep(10); 225 | return false; 226 | } 227 | return true; 228 | } 229 | 230 | // RC & UC 231 | bool rdmaFetchAndAdd(ibv_qp *qp, uint64_t source, uint64_t dest, uint64_t add, 232 | uint32_t lkey, uint32_t remoteRKey) { 233 | struct ibv_sge sg; 234 | struct ibv_send_wr wr; 235 | struct ibv_send_wr *wrBad; 236 | 237 | fillSgeWr(sg, wr, source, 8, lkey); 238 | 239 | wr.opcode = IBV_WR_ATOMIC_FETCH_AND_ADD; 240 | wr.send_flags = IBV_SEND_SIGNALED; 241 | 242 | wr.wr.atomic.remote_addr = dest; 243 | wr.wr.atomic.rkey = remoteRKey; 244 | wr.wr.atomic.compare_add = add; 245 | 246 | if (ibv_post_send(qp, &wr, &wrBad)) { 247 | Debug::notifyError("Send with ATOMIC_FETCH_AND_ADD failed."); 248 | return false; 249 | } 250 | return true; 251 | } 252 | 253 | bool rdmaFetchAndAddBoundary(ibv_qp *qp, uint64_t source, uint64_t dest, 254 | uint64_t add, uint32_t lkey, uint32_t remoteRKey, 255 | uint64_t boundary, bool singal, uint64_t wr_id) { 256 | struct ibv_sge sg; 257 | struct ibv_exp_send_wr wr; 258 | struct ibv_exp_send_wr *wrBad; 259 | 260 | fillSgeWr(sg, wr, source, 8, lkey); 261 | 262 | wr.exp_opcode = IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD; 263 | wr.exp_send_flags = IBV_EXP_SEND_EXT_ATOMIC_INLINE; 264 | wr.wr_id = wr_id; 265 | 266 | if (singal) { 267 | wr.exp_send_flags |= IBV_EXP_SEND_SIGNALED; 268 | } 269 | 270 | wr.ext_op.masked_atomics.log_arg_sz = 3; 271 | 
wr.ext_op.masked_atomics.remote_addr = dest; 272 | wr.ext_op.masked_atomics.rkey = remoteRKey; 273 | 274 | auto &op = wr.ext_op.masked_atomics.wr_data.inline_data.op.fetch_add; 275 | op.add_val = add; 276 | op.field_boundary = 1ull << boundary; 277 | 278 | if (ibv_exp_post_send(qp, &wr, &wrBad)) { 279 | Debug::notifyError("Send with MASK FETCH_AND_ADD failed."); 280 | return false; 281 | } 282 | return true; 283 | } 284 | 285 | 286 | // for RC & UC 287 | bool rdmaCompareAndSwap(ibv_qp *qp, uint64_t source, uint64_t dest, 288 | uint64_t compare, uint64_t swap, uint32_t lkey, 289 | uint32_t remoteRKey, bool signal, uint64_t wrID) { 290 | struct ibv_sge sg; 291 | struct ibv_send_wr wr; 292 | struct ibv_send_wr *wrBad; 293 | 294 | fillSgeWr(sg, wr, source, 8, lkey); 295 | 296 | wr.opcode = IBV_WR_ATOMIC_CMP_AND_SWP; 297 | 298 | if (signal) { 299 | wr.send_flags = IBV_SEND_SIGNALED; 300 | } 301 | 302 | wr.wr.atomic.remote_addr = dest; 303 | wr.wr.atomic.rkey = remoteRKey; 304 | wr.wr.atomic.compare_add = compare; 305 | wr.wr.atomic.swap = swap; 306 | wr.wr_id = wrID; 307 | 308 | if (ibv_post_send(qp, &wr, &wrBad)) { 309 | Debug::notifyError("Send with ATOMIC_CMP_AND_SWP failed."); 310 | sleep(5); 311 | return false; 312 | } 313 | return true; 314 | } 315 | 316 | bool rdmaCompareAndSwapMask(ibv_qp *qp, uint64_t source, uint64_t dest, 317 | uint64_t compare, uint64_t swap, uint32_t lkey, 318 | uint32_t remoteRKey, uint64_t mask, bool singal) { 319 | struct ibv_sge sg; 320 | struct ibv_exp_send_wr wr; 321 | struct ibv_exp_send_wr *wrBad; 322 | 323 | fillSgeWr(sg, wr, source, 8, lkey); 324 | 325 | wr.exp_opcode = IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP; 326 | wr.exp_send_flags = IBV_EXP_SEND_EXT_ATOMIC_INLINE; 327 | 328 | if (singal) { 329 | wr.exp_send_flags |= IBV_EXP_SEND_SIGNALED; 330 | } 331 | 332 | wr.ext_op.masked_atomics.log_arg_sz = 3; 333 | wr.ext_op.masked_atomics.remote_addr = dest; 334 | wr.ext_op.masked_atomics.rkey = remoteRKey; 335 | 336 | auto &op = 
wr.ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap; 337 | op.compare_val = compare; 338 | op.swap_val = swap; 339 | 340 | op.compare_mask = mask; 341 | op.swap_mask = mask; 342 | 343 | if (ibv_exp_post_send(qp, &wr, &wrBad)) { 344 | Debug::notifyError("Send with MASK ATOMIC_CMP_AND_SWP failed."); 345 | return false; 346 | } 347 | return true; 348 | } 349 | 350 | 351 | bool rdmaWriteBatch(ibv_qp *qp, RdmaOpRegion *ror, int k, bool isSignaled, 352 | uint64_t wrID) { 353 | 354 | struct ibv_sge sg[kOroMax]; 355 | struct ibv_send_wr wr[kOroMax]; 356 | struct ibv_send_wr *wrBad; 357 | 358 | for (int i = 0; i < k; ++i) { 359 | fillSgeWr(sg[i], wr[i], ror[i].source, ror[i].size, ror[i].lkey); 360 | 361 | wr[i].next = (i == k - 1) ? NULL : &wr[i + 1]; 362 | 363 | wr[i].opcode = IBV_WR_RDMA_WRITE; 364 | 365 | if (i == k - 1 && isSignaled) { 366 | wr[i].send_flags = IBV_SEND_SIGNALED; 367 | } 368 | 369 | wr[i].wr.rdma.remote_addr = ror[i].dest; 370 | wr[i].wr.rdma.rkey = ror[i].remoteRKey; 371 | wr[i].wr_id = wrID; 372 | } 373 | 374 | if (ibv_post_send(qp, &wr[0], &wrBad) != 0) { 375 | Debug::notifyError("Send with RDMA_WRITE(WITH_IMM) failed."); 376 | sleep(10); 377 | return false; 378 | } 379 | return true; 380 | } 381 | 382 | bool rdmaCasRead(ibv_qp *qp, const RdmaOpRegion &cas_ror, 383 | const RdmaOpRegion &read_ror, uint64_t compare, uint64_t swap, 384 | bool isSignaled, uint64_t wrID) { 385 | 386 | struct ibv_sge sg[2]; 387 | struct ibv_send_wr wr[2]; 388 | struct ibv_send_wr *wrBad; 389 | 390 | fillSgeWr(sg[0], wr[0], cas_ror.source, 8, cas_ror.lkey); 391 | wr[0].opcode = IBV_WR_ATOMIC_CMP_AND_SWP; 392 | wr[0].wr.atomic.remote_addr = cas_ror.dest; 393 | wr[0].wr.atomic.rkey = cas_ror.remoteRKey; 394 | wr[0].wr.atomic.compare_add = compare; 395 | wr[0].wr.atomic.swap = swap; 396 | wr[0].next = &wr[1]; 397 | 398 | fillSgeWr(sg[1], wr[1], read_ror.source, read_ror.size, read_ror.lkey); 399 | wr[1].opcode = IBV_WR_RDMA_READ; 400 | wr[1].wr.rdma.remote_addr = 
read_ror.dest; 401 | wr[1].wr.rdma.rkey = read_ror.remoteRKey; 402 | wr[1].wr_id = wrID; 403 | wr[1].send_flags |= IBV_SEND_FENCE; 404 | if (isSignaled) { 405 | wr[1].send_flags |= IBV_SEND_SIGNALED; 406 | } 407 | 408 | if (ibv_post_send(qp, &wr[0], &wrBad)) { 409 | Debug::notifyError("Send with CAS_READs failed."); 410 | sleep(10); 411 | return false; 412 | } 413 | return true; 414 | } 415 | 416 | bool rdmaWriteFaa(ibv_qp *qp, const RdmaOpRegion &write_ror, 417 | const RdmaOpRegion &faa_ror, uint64_t add_val, 418 | bool isSignaled, uint64_t wrID) { 419 | 420 | struct ibv_sge sg[2]; 421 | struct ibv_send_wr wr[2]; 422 | struct ibv_send_wr *wrBad; 423 | 424 | fillSgeWr(sg[0], wr[0], write_ror.source, write_ror.size, write_ror.lkey); 425 | wr[0].opcode = IBV_WR_RDMA_WRITE; 426 | wr[0].wr.rdma.remote_addr = write_ror.dest; 427 | wr[0].wr.rdma.rkey = write_ror.remoteRKey; 428 | wr[0].next = &wr[1]; 429 | 430 | fillSgeWr(sg[1], wr[1], faa_ror.source, 8, faa_ror.lkey); 431 | wr[1].opcode = IBV_WR_ATOMIC_FETCH_AND_ADD; 432 | wr[1].wr.atomic.remote_addr = faa_ror.dest; 433 | wr[1].wr.atomic.rkey = faa_ror.remoteRKey; 434 | wr[1].wr.atomic.compare_add = add_val; 435 | wr[1].wr_id = wrID; 436 | 437 | if (isSignaled) { 438 | wr[1].send_flags |= IBV_SEND_SIGNALED; 439 | } 440 | 441 | if (ibv_post_send(qp, &wr[0], &wrBad)) { 442 | Debug::notifyError("Send with Write Faa failed."); 443 | sleep(10); 444 | return false; 445 | } 446 | return true; 447 | } 448 | 449 | bool rdmaWriteCas(ibv_qp *qp, const RdmaOpRegion &write_ror, 450 | const RdmaOpRegion &cas_ror, uint64_t compare, uint64_t swap, 451 | bool isSignaled, uint64_t wrID) { 452 | 453 | struct ibv_sge sg[2]; 454 | struct ibv_send_wr wr[2]; 455 | struct ibv_send_wr *wrBad; 456 | 457 | fillSgeWr(sg[0], wr[0], write_ror.source, write_ror.size, write_ror.lkey); 458 | wr[0].opcode = IBV_WR_RDMA_WRITE; 459 | wr[0].wr.rdma.remote_addr = write_ror.dest; 460 | wr[0].wr.rdma.rkey = write_ror.remoteRKey; 461 | wr[0].next = &wr[1]; 462 
| 463 | fillSgeWr(sg[1], wr[1], cas_ror.source, 8, cas_ror.lkey); 464 | wr[1].opcode = IBV_WR_ATOMIC_CMP_AND_SWP; 465 | wr[1].wr.atomic.remote_addr = cas_ror.dest; 466 | wr[1].wr.atomic.rkey = cas_ror.remoteRKey; 467 | wr[1].wr.atomic.compare_add = compare; 468 | wr[1].wr.atomic.swap = swap; 469 | wr[1].wr_id = wrID; 470 | 471 | if (isSignaled) { 472 | wr[1].send_flags |= IBV_SEND_SIGNALED; 473 | } 474 | 475 | if (ibv_post_send(qp, &wr[0], &wrBad)) { 476 | Debug::notifyError("Send with Write Cas failed."); 477 | sleep(10); 478 | return false; 479 | } 480 | return true; 481 | } -------------------------------------------------------------------------------- /src/DSM.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "DSM.h" 3 | #include "Directory.h" 4 | #include "HugePageAlloc.h" 5 | 6 | #include "DSMKeeper.h" 7 | 8 | #include 9 | 10 | thread_local int DSM::thread_id = -1; 11 | thread_local ThreadConnection *DSM::iCon = nullptr; 12 | thread_local char *DSM::rdma_buffer = nullptr; 13 | thread_local LocalAllocator DSM::local_allocator; 14 | thread_local RdmaBuffer DSM::rbuf[define::kMaxCoro]; 15 | thread_local uint64_t DSM::thread_tag = 0; 16 | 17 | DSM *DSM::getInstance(const DSMConfig &conf) { 18 | static DSM *dsm = nullptr; 19 | static WRLock lock; 20 | 21 | lock.wLock(); 22 | if (!dsm) { 23 | dsm = new DSM(conf); 24 | } else { 25 | } 26 | lock.wUnlock(); 27 | 28 | return dsm; 29 | } 30 | 31 | DSM::DSM(const DSMConfig &conf) 32 | : conf(conf), appID(0), cache(conf.cacheConfig) { 33 | 34 | baseAddr = (uint64_t)hugePageAlloc(conf.dsmSize * define::GB); 35 | 36 | Debug::notifyInfo("shared memory size: %dGB, 0x%lx", conf.dsmSize, baseAddr); 37 | Debug::notifyInfo("cache size: %dGB", conf.cacheConfig.cacheSize); 38 | 39 | // warmup 40 | // memset((char *)baseAddr, 0, conf.dsmSize * define::GB); 41 | for (uint64_t i = baseAddr; i < baseAddr + conf.dsmSize * define::GB; 42 | i += 2 * define::MB) { 43 | *(char *)i = 0; 
44 | } 45 | 46 | // clear up first chunk 47 | memset((char *)baseAddr, 0, define::kChunkSize); 48 | 49 | initRDMAConnection(); 50 | 51 | Debug::notifyInfo("number of threads on memory node: %d", NR_DIRECTORY); 52 | for (int i = 0; i < NR_DIRECTORY; ++i) { 53 | dirAgent[i] = 54 | new Directory(dirCon[i], remoteInfo, conf.machineNR, i, myNodeID); 55 | } 56 | 57 | keeper->barrier("DSM-init"); 58 | } 59 | 60 | DSM::~DSM() {} 61 | 62 | void DSM::registerThread() { 63 | 64 | static bool has_init[MAX_APP_THREAD]; 65 | 66 | if (thread_id != -1) 67 | return; 68 | 69 | thread_id = appID.fetch_add(1); 70 | thread_tag = thread_id + (((uint64_t)this->getMyNodeID()) << 32) + 1; 71 | 72 | iCon = thCon[thread_id]; 73 | 74 | if (!has_init[thread_id]) { 75 | iCon->message->initRecv(); 76 | iCon->message->initSend(); 77 | 78 | has_init[thread_id] = true; 79 | } 80 | 81 | rdma_buffer = (char *)cache.data + thread_id * 12 * define::MB; 82 | 83 | for (int i = 0; i < define::kMaxCoro; ++i) { 84 | rbuf[i].set_buffer(rdma_buffer + i * define::kPerCoroRdmaBuf); 85 | } 86 | } 87 | 88 | void DSM::initRDMAConnection() { 89 | 90 | Debug::notifyInfo("number of servers (colocated MN/CN): %d", conf.machineNR); 91 | 92 | remoteInfo = new RemoteConnection[conf.machineNR]; 93 | 94 | for (int i = 0; i < MAX_APP_THREAD; ++i) { 95 | thCon[i] = 96 | new ThreadConnection(i, (void *)cache.data, cache.size * define::GB, 97 | conf.machineNR, remoteInfo); 98 | } 99 | 100 | for (int i = 0; i < NR_DIRECTORY; ++i) { 101 | dirCon[i] = 102 | new DirectoryConnection(i, (void *)baseAddr, conf.dsmSize * define::GB, 103 | conf.machineNR, remoteInfo); 104 | } 105 | 106 | keeper = new DSMKeeper(thCon, dirCon, remoteInfo, conf.machineNR); 107 | 108 | myNodeID = keeper->getMyNodeID(); 109 | } 110 | 111 | void DSM::read(char *buffer, GlobalAddress gaddr, size_t size, bool signal, 112 | CoroContext *ctx) { 113 | if (ctx == nullptr) { 114 | rdmaRead(iCon->data[0][gaddr.nodeID], (uint64_t)buffer, 115 | 
remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset, size, 116 | iCon->cacheLKey, remoteInfo[gaddr.nodeID].dsmRKey[0], signal); 117 | } else { 118 | rdmaRead(iCon->data[0][gaddr.nodeID], (uint64_t)buffer, 119 | remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset, size, 120 | iCon->cacheLKey, remoteInfo[gaddr.nodeID].dsmRKey[0], true, 121 | ctx->coro_id); 122 | (*ctx->yield)(*ctx->master); 123 | } 124 | } 125 | 126 | void DSM::read_sync(char *buffer, GlobalAddress gaddr, size_t size, 127 | CoroContext *ctx) { 128 | read(buffer, gaddr, size, true, ctx); 129 | 130 | if (ctx == nullptr) { 131 | ibv_wc wc; 132 | pollWithCQ(iCon->cq, 1, &wc); 133 | } 134 | } 135 | 136 | void DSM::write(const char *buffer, GlobalAddress gaddr, size_t size, 137 | bool signal, CoroContext *ctx) { 138 | 139 | if (ctx == nullptr) { 140 | rdmaWrite(iCon->data[0][gaddr.nodeID], (uint64_t)buffer, 141 | remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset, size, 142 | iCon->cacheLKey, remoteInfo[gaddr.nodeID].dsmRKey[0], -1, signal); 143 | } else { 144 | rdmaWrite(iCon->data[0][gaddr.nodeID], (uint64_t)buffer, 145 | remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset, size, 146 | iCon->cacheLKey, remoteInfo[gaddr.nodeID].dsmRKey[0], -1, true, 147 | ctx->coro_id); 148 | (*ctx->yield)(*ctx->master); 149 | } 150 | } 151 | 152 | void DSM::write_sync(const char *buffer, GlobalAddress gaddr, size_t size, 153 | CoroContext *ctx) { 154 | write(buffer, gaddr, size, true, ctx); 155 | 156 | if (ctx == nullptr) { 157 | ibv_wc wc; 158 | pollWithCQ(iCon->cq, 1, &wc); 159 | } 160 | } 161 | 162 | void DSM::fill_keys_dest(RdmaOpRegion &ror, GlobalAddress gaddr, bool is_chip) { 163 | ror.lkey = iCon->cacheLKey; 164 | if (is_chip) { 165 | ror.dest = remoteInfo[gaddr.nodeID].lockBase + gaddr.offset; 166 | ror.remoteRKey = remoteInfo[gaddr.nodeID].lockRKey[0]; 167 | } else { 168 | ror.dest = remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset; 169 | ror.remoteRKey = remoteInfo[gaddr.nodeID].dsmRKey[0]; 170 | } 171 | } 172 | 173 | void 
DSM::write_batch(RdmaOpRegion *rs, int k, bool signal, CoroContext *ctx) { 174 | 175 | int node_id = -1; 176 | for (int i = 0; i < k; ++i) { 177 | 178 | GlobalAddress gaddr; 179 | gaddr.val = rs[i].dest; 180 | node_id = gaddr.nodeID; 181 | fill_keys_dest(rs[i], gaddr, rs[i].is_on_chip); 182 | } 183 | 184 | if (ctx == nullptr) { 185 | rdmaWriteBatch(iCon->data[0][node_id], rs, k, signal); 186 | } else { 187 | rdmaWriteBatch(iCon->data[0][node_id], rs, k, true, ctx->coro_id); 188 | (*ctx->yield)(*ctx->master); 189 | } 190 | } 191 | 192 | void DSM::write_batch_sync(RdmaOpRegion *rs, int k, CoroContext *ctx) { 193 | write_batch(rs, k, true, ctx); 194 | 195 | if (ctx == nullptr) { 196 | ibv_wc wc; 197 | pollWithCQ(iCon->cq, 1, &wc); 198 | } 199 | } 200 | 201 | void DSM::write_faa(RdmaOpRegion &write_ror, RdmaOpRegion &faa_ror, 202 | uint64_t add_val, bool signal, CoroContext *ctx) { 203 | int node_id; 204 | { 205 | GlobalAddress gaddr; 206 | gaddr.val = write_ror.dest; 207 | node_id = gaddr.nodeID; 208 | 209 | fill_keys_dest(write_ror, gaddr, write_ror.is_on_chip); 210 | } 211 | { 212 | GlobalAddress gaddr; 213 | gaddr.val = faa_ror.dest; 214 | 215 | fill_keys_dest(faa_ror, gaddr, faa_ror.is_on_chip); 216 | } 217 | if (ctx == nullptr) { 218 | rdmaWriteFaa(iCon->data[0][node_id], write_ror, faa_ror, add_val, signal); 219 | } else { 220 | rdmaWriteFaa(iCon->data[0][node_id], write_ror, faa_ror, add_val, true, 221 | ctx->coro_id); 222 | (*ctx->yield)(*ctx->master); 223 | } 224 | } 225 | void DSM::write_faa_sync(RdmaOpRegion &write_ror, RdmaOpRegion &faa_ror, 226 | uint64_t add_val, CoroContext *ctx) { 227 | write_faa(write_ror, faa_ror, add_val, true, ctx); 228 | if (ctx == nullptr) { 229 | ibv_wc wc; 230 | pollWithCQ(iCon->cq, 1, &wc); 231 | } 232 | } 233 | 234 | void DSM::write_cas(RdmaOpRegion &write_ror, RdmaOpRegion &cas_ror, 235 | uint64_t equal, uint64_t val, bool signal, 236 | CoroContext *ctx) { 237 | int node_id; 238 | { 239 | GlobalAddress gaddr; 240 | gaddr.val 
= write_ror.dest; 241 | node_id = gaddr.nodeID; 242 | 243 | fill_keys_dest(write_ror, gaddr, write_ror.is_on_chip); 244 | } 245 | { 246 | GlobalAddress gaddr; 247 | gaddr.val = cas_ror.dest; 248 | 249 | fill_keys_dest(cas_ror, gaddr, cas_ror.is_on_chip); 250 | } 251 | if (ctx == nullptr) { 252 | rdmaWriteCas(iCon->data[0][node_id], write_ror, cas_ror, equal, val, 253 | signal); 254 | } else { 255 | rdmaWriteCas(iCon->data[0][node_id], write_ror, cas_ror, equal, val, true, 256 | ctx->coro_id); 257 | (*ctx->yield)(*ctx->master); 258 | } 259 | } 260 | void DSM::write_cas_sync(RdmaOpRegion &write_ror, RdmaOpRegion &cas_ror, 261 | uint64_t equal, uint64_t val, CoroContext *ctx) { 262 | write_cas(write_ror, cas_ror, equal, val, true, ctx); 263 | if (ctx == nullptr) { 264 | ibv_wc wc; 265 | pollWithCQ(iCon->cq, 1, &wc); 266 | } 267 | } 268 | 269 | void DSM::cas_read(RdmaOpRegion &cas_ror, RdmaOpRegion &read_ror, 270 | uint64_t equal, uint64_t val, bool signal, 271 | CoroContext *ctx) { 272 | 273 | int node_id; 274 | { 275 | GlobalAddress gaddr; 276 | gaddr.val = cas_ror.dest; 277 | node_id = gaddr.nodeID; 278 | fill_keys_dest(cas_ror, gaddr, cas_ror.is_on_chip); 279 | } 280 | { 281 | GlobalAddress gaddr; 282 | gaddr.val = read_ror.dest; 283 | fill_keys_dest(read_ror, gaddr, read_ror.is_on_chip); 284 | } 285 | 286 | if (ctx == nullptr) { 287 | rdmaCasRead(iCon->data[0][node_id], cas_ror, read_ror, equal, val, signal); 288 | } else { 289 | rdmaCasRead(iCon->data[0][node_id], cas_ror, read_ror, equal, val, true, 290 | ctx->coro_id); 291 | (*ctx->yield)(*ctx->master); 292 | } 293 | } 294 | 295 | bool DSM::cas_read_sync(RdmaOpRegion &cas_ror, RdmaOpRegion &read_ror, 296 | uint64_t equal, uint64_t val, CoroContext *ctx) { 297 | cas_read(cas_ror, read_ror, equal, val, true, ctx); 298 | 299 | if (ctx == nullptr) { 300 | ibv_wc wc; 301 | pollWithCQ(iCon->cq, 1, &wc); 302 | } 303 | 304 | return equal == *(uint64_t *)cas_ror.source; 305 | } 306 | 307 | void DSM::cas(GlobalAddress 
gaddr, uint64_t equal, uint64_t val, 308 | uint64_t *rdma_buffer, bool signal, CoroContext *ctx) { 309 | 310 | if (ctx == nullptr) { 311 | rdmaCompareAndSwap(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 312 | remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset, equal, 313 | val, iCon->cacheLKey, 314 | remoteInfo[gaddr.nodeID].dsmRKey[0], signal); 315 | } else { 316 | rdmaCompareAndSwap(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 317 | remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset, equal, 318 | val, iCon->cacheLKey, 319 | remoteInfo[gaddr.nodeID].dsmRKey[0], true, ctx->coro_id); 320 | (*ctx->yield)(*ctx->master); 321 | } 322 | } 323 | 324 | bool DSM::cas_sync(GlobalAddress gaddr, uint64_t equal, uint64_t val, 325 | uint64_t *rdma_buffer, CoroContext *ctx) { 326 | cas(gaddr, equal, val, rdma_buffer, true, ctx); 327 | 328 | if (ctx == nullptr) { 329 | ibv_wc wc; 330 | pollWithCQ(iCon->cq, 1, &wc); 331 | } 332 | 333 | return equal == *rdma_buffer; 334 | } 335 | 336 | void DSM::cas_mask(GlobalAddress gaddr, uint64_t equal, uint64_t val, 337 | uint64_t *rdma_buffer, uint64_t mask, bool signal) { 338 | rdmaCompareAndSwapMask(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 339 | remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset, equal, 340 | val, iCon->cacheLKey, 341 | remoteInfo[gaddr.nodeID].dsmRKey[0], mask, signal); 342 | } 343 | 344 | bool DSM::cas_mask_sync(GlobalAddress gaddr, uint64_t equal, uint64_t val, 345 | uint64_t *rdma_buffer, uint64_t mask) { 346 | cas_mask(gaddr, equal, val, rdma_buffer, mask); 347 | ibv_wc wc; 348 | pollWithCQ(iCon->cq, 1, &wc); 349 | 350 | return (equal & mask) == (*rdma_buffer & mask); 351 | } 352 | 353 | void DSM::faa_boundary(GlobalAddress gaddr, uint64_t add_val, 354 | uint64_t *rdma_buffer, uint64_t mask, bool signal, 355 | CoroContext *ctx) { 356 | if (ctx == nullptr) { 357 | rdmaFetchAndAddBoundary(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 358 | remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset, 359 | 
add_val, iCon->cacheLKey, 360 | remoteInfo[gaddr.nodeID].dsmRKey[0], mask, signal); 361 | } else { 362 | rdmaFetchAndAddBoundary(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 363 | remoteInfo[gaddr.nodeID].dsmBase + gaddr.offset, 364 | add_val, iCon->cacheLKey, 365 | remoteInfo[gaddr.nodeID].dsmRKey[0], mask, true, 366 | ctx->coro_id); 367 | (*ctx->yield)(*ctx->master); 368 | } 369 | } 370 | 371 | void DSM::faa_boundary_sync(GlobalAddress gaddr, uint64_t add_val, 372 | uint64_t *rdma_buffer, uint64_t mask, 373 | CoroContext *ctx) { 374 | faa_boundary(gaddr, add_val, rdma_buffer, mask, true, ctx); 375 | if (ctx == nullptr) { 376 | ibv_wc wc; 377 | pollWithCQ(iCon->cq, 1, &wc); 378 | } 379 | } 380 | 381 | void DSM::read_dm(char *buffer, GlobalAddress gaddr, size_t size, bool signal, 382 | CoroContext *ctx) { 383 | 384 | if (ctx == nullptr) { 385 | rdmaRead(iCon->data[0][gaddr.nodeID], (uint64_t)buffer, 386 | remoteInfo[gaddr.nodeID].lockBase + gaddr.offset, size, 387 | iCon->cacheLKey, remoteInfo[gaddr.nodeID].lockRKey[0], signal); 388 | } else { 389 | rdmaRead(iCon->data[0][gaddr.nodeID], (uint64_t)buffer, 390 | remoteInfo[gaddr.nodeID].lockBase + gaddr.offset, size, 391 | iCon->cacheLKey, remoteInfo[gaddr.nodeID].lockRKey[0], true, 392 | ctx->coro_id); 393 | (*ctx->yield)(*ctx->master); 394 | } 395 | } 396 | 397 | void DSM::read_dm_sync(char *buffer, GlobalAddress gaddr, size_t size, 398 | CoroContext *ctx) { 399 | read_dm(buffer, gaddr, size, true, ctx); 400 | 401 | if (ctx == nullptr) { 402 | ibv_wc wc; 403 | pollWithCQ(iCon->cq, 1, &wc); 404 | } 405 | } 406 | 407 | void DSM::write_dm(const char *buffer, GlobalAddress gaddr, size_t size, 408 | bool signal, CoroContext *ctx) { 409 | if (ctx == nullptr) { 410 | rdmaWrite(iCon->data[0][gaddr.nodeID], (uint64_t)buffer, 411 | remoteInfo[gaddr.nodeID].lockBase + gaddr.offset, size, 412 | iCon->cacheLKey, remoteInfo[gaddr.nodeID].lockRKey[0], -1, 413 | signal); 414 | } else { 415 | 
rdmaWrite(iCon->data[0][gaddr.nodeID], (uint64_t)buffer, 416 | remoteInfo[gaddr.nodeID].lockBase + gaddr.offset, size, 417 | iCon->cacheLKey, remoteInfo[gaddr.nodeID].lockRKey[0], -1, true, 418 | ctx->coro_id); 419 | (*ctx->yield)(*ctx->master); 420 | } 421 | } 422 | 423 | void DSM::write_dm_sync(const char *buffer, GlobalAddress gaddr, size_t size, 424 | CoroContext *ctx) { 425 | write_dm(buffer, gaddr, size, true, ctx); 426 | 427 | if (ctx == nullptr) { 428 | ibv_wc wc; 429 | pollWithCQ(iCon->cq, 1, &wc); 430 | } 431 | } 432 | 433 | void DSM::cas_dm(GlobalAddress gaddr, uint64_t equal, uint64_t val, 434 | uint64_t *rdma_buffer, bool signal, CoroContext *ctx) { 435 | 436 | if (ctx == nullptr) { 437 | rdmaCompareAndSwap(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 438 | remoteInfo[gaddr.nodeID].lockBase + gaddr.offset, equal, 439 | val, iCon->cacheLKey, 440 | remoteInfo[gaddr.nodeID].lockRKey[0], signal); 441 | } else { 442 | rdmaCompareAndSwap(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 443 | remoteInfo[gaddr.nodeID].lockBase + gaddr.offset, equal, 444 | val, iCon->cacheLKey, 445 | remoteInfo[gaddr.nodeID].lockRKey[0], true, 446 | ctx->coro_id); 447 | (*ctx->yield)(*ctx->master); 448 | } 449 | } 450 | 451 | bool DSM::cas_dm_sync(GlobalAddress gaddr, uint64_t equal, uint64_t val, 452 | uint64_t *rdma_buffer, CoroContext *ctx) { 453 | cas_dm(gaddr, equal, val, rdma_buffer, true, ctx); 454 | 455 | if (ctx == nullptr) { 456 | ibv_wc wc; 457 | pollWithCQ(iCon->cq, 1, &wc); 458 | } 459 | 460 | return equal == *rdma_buffer; 461 | } 462 | 463 | void DSM::cas_dm_mask(GlobalAddress gaddr, uint64_t equal, uint64_t val, 464 | uint64_t *rdma_buffer, uint64_t mask, bool signal) { 465 | rdmaCompareAndSwapMask(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 466 | remoteInfo[gaddr.nodeID].lockBase + gaddr.offset, 467 | equal, val, iCon->cacheLKey, 468 | remoteInfo[gaddr.nodeID].lockRKey[0], mask, signal); 469 | } 470 | 471 | bool 
DSM::cas_dm_mask_sync(GlobalAddress gaddr, uint64_t equal, uint64_t val, 472 | uint64_t *rdma_buffer, uint64_t mask) { 473 | cas_dm_mask(gaddr, equal, val, rdma_buffer, mask); 474 | ibv_wc wc; 475 | pollWithCQ(iCon->cq, 1, &wc); 476 | 477 | return (equal & mask) == (*rdma_buffer & mask); 478 | } 479 | 480 | void DSM::faa_dm_boundary(GlobalAddress gaddr, uint64_t add_val, 481 | uint64_t *rdma_buffer, uint64_t mask, bool signal, 482 | CoroContext *ctx) { 483 | if (ctx == nullptr) { 484 | 485 | rdmaFetchAndAddBoundary(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 486 | remoteInfo[gaddr.nodeID].lockBase + gaddr.offset, 487 | add_val, iCon->cacheLKey, 488 | remoteInfo[gaddr.nodeID].lockRKey[0], mask, signal); 489 | } else { 490 | rdmaFetchAndAddBoundary(iCon->data[0][gaddr.nodeID], (uint64_t)rdma_buffer, 491 | remoteInfo[gaddr.nodeID].lockBase + gaddr.offset, 492 | add_val, iCon->cacheLKey, 493 | remoteInfo[gaddr.nodeID].lockRKey[0], mask, true, 494 | ctx->coro_id); 495 | (*ctx->yield)(*ctx->master); 496 | } 497 | } 498 | 499 | void DSM::faa_dm_boundary_sync(GlobalAddress gaddr, uint64_t add_val, 500 | uint64_t *rdma_buffer, uint64_t mask, 501 | CoroContext *ctx) { 502 | faa_dm_boundary(gaddr, add_val, rdma_buffer, mask, true, ctx); 503 | if (ctx == nullptr) { 504 | ibv_wc wc; 505 | pollWithCQ(iCon->cq, 1, &wc); 506 | } 507 | } 508 | 509 | uint64_t DSM::poll_rdma_cq(int count) { 510 | ibv_wc wc; 511 | pollWithCQ(iCon->cq, count, &wc); 512 | 513 | return wc.wr_id; 514 | } 515 | 516 | bool DSM::poll_rdma_cq_once(uint64_t &wr_id) { 517 | ibv_wc wc; 518 | int res = pollOnce(iCon->cq, 1, &wc); 519 | 520 | wr_id = wc.wr_id; 521 | 522 | return res == 1; 523 | } -------------------------------------------------------------------------------- /src/Tree.cpp: -------------------------------------------------------------------------------- 1 | #include "Tree.h" 2 | #include "IndexCache.h" 3 | #include "RdmaBuffer.h" 4 | #include "Timer.h" 5 | 6 | #include 7 | #include 8 | 
bool enter_debug = false;

// Per-thread index-cache hit/miss counters (slot [tid][0] is used below).
uint64_t cache_miss[MAX_APP_THREAD][8];
uint64_t cache_hit[MAX_APP_THREAD][8];
uint64_t latency[MAX_APP_THREAD][LATENCY_WINDOWS];

// Per-thread coroutine state: worker coroutines plus the master scheduler.
thread_local CoroCall Tree::worker[define::kMaxCoro];
thread_local CoroCall Tree::master;
// Per-coroutine record of the page visited at each tree level, used by
// internal_page_store to find the parent after a split.
thread_local GlobalAddress path_stack[define::kMaxCoro]
                                     [define::kMaxLevelOfTree];

thread_local Timer timer;
// NOTE(review): the element type was stripped in this listing
// (std::queue<...>) — confirm against the original Tree.cpp.
thread_local std::queue hot_wait_queue;

// Build the tree client: initialize per-node local lock tables, then try to
// install an empty leaf as the root (only one node's CAS wins).
Tree::Tree(DSM *dsm, uint16_t tree_id) : dsm(dsm), tree_id(tree_id) {

  for (int i = 0; i < dsm->getClusterSize(); ++i) {
    local_locks[i] = new LocalLockNode[define::kNumOfLock];
    for (size_t k = 0; k < define::kNumOfLock; ++k) {
      auto &n = local_locks[i][k];
      n.ticket_lock.store(0);
      n.hand_over = false;
      n.hand_time = 0;
    }
  }

  assert(dsm->is_register());
  print_verbose();

  index_cache = new IndexCache(define::kIndexCacheSize);

  root_ptr_ptr = get_root_ptr_ptr();

  // try to init tree and install root pointer
  auto page_buffer = (dsm->get_rbuf(0)).get_page_buffer();
  auto root_addr = dsm->alloc(kLeafPageSize);
  auto root_page = new (page_buffer) LeafPage;

  root_page->set_consistent();
  dsm->write_sync(page_buffer, root_addr, kLeafPageSize);

  auto cas_buffer = (dsm->get_rbuf(0)).get_cas_buffer();
  // CAS the root-pointer slot from 0; only the first node to get here wins.
  bool res = dsm->cas_sync(root_ptr_ptr, 0, root_addr.val, cas_buffer);
  if (res) {
    std::cout << "Tree root pointer value " << root_addr << std::endl;
  } else {
    // std::cout << "fail\n";
  }
}

// Sanity-check page layout (headers must be at the same offset in leaf and
// internal pages) and print sizing info on node 0.
void Tree::print_verbose() {

  int kLeafHdrOffset = STRUCT_OFFSET(LeafPage, hdr);
  int kInternalHdrOffset = STRUCT_OFFSET(InternalPage, hdr);
  if (kLeafHdrOffset != kInternalHdrOffset) {
    std::cerr << "format error" << std::endl;
  }

  if (dsm->getMyNodeID() == 0) {
    std::cout << "Header size: " << sizeof(Header) << std::endl;
    std::cout << "Internal Page size: " << sizeof(InternalPage) << " ["
              << kInternalPageSize << "]" << std::endl;
    std::cout << "Internal per Page: " << kInternalCardinality << std::endl;
    std::cout << "Leaf Page size: " << sizeof(LeafPage) << " [" << kLeafPageSize
              << "]" << std::endl;
    std::cout << "Leaf per Page: " << kLeafCardinality << std::endl;
    std::cout << "LeafEntry size: " << sizeof(LeafEntry) << std::endl;
    std::cout << "InternalEntry size: " << sizeof(InternalEntry) << std::endl;
  }
}

// Reset this coroutine's path stack before each tree operation.
inline void Tree::before_operation(CoroContext *cxt, int coro_id) {
  for (size_t i = 0; i < define::kMaxLevelOfTree; ++i) {
    path_stack[coro_id][i] = GlobalAddress::Null();
  }
}

// Address of the slot (on node 0) that stores this tree's root pointer.
GlobalAddress Tree::get_root_ptr_ptr() {
  GlobalAddress addr;
  addr.nodeID = 0;
  addr.offset =
      define::kRootPointerStoreOffest + sizeof(GlobalAddress) * tree_id;

  return addr;
}

extern GlobalAddress g_root_ptr;
extern int g_root_level;
extern bool enable_cache;
// Return the cached root pointer, or fetch it from node 0 when the cached
// copy is Null (e.g. after a root change broadcast).
GlobalAddress Tree::get_root_ptr(CoroContext *cxt, int coro_id) {

  if (g_root_ptr == GlobalAddress::Null()) {
    auto page_buffer = (dsm->get_rbuf(coro_id)).get_page_buffer();
    dsm->read_sync(page_buffer, root_ptr_ptr, sizeof(GlobalAddress), cxt);
    GlobalAddress root_ptr = *(GlobalAddress *)page_buffer;
    return root_ptr;
  } else {
    return g_root_ptr;
  }

  // std::cout << "root ptr " << root_ptr << std::endl;
}

// Tell every node's directory thread about a new root (after a root split).
void Tree::broadcast_new_root(GlobalAddress new_root_addr, int root_level) {
  RawMessage m;
  m.type = RpcType::NEW_ROOT;
  m.addr = new_root_addr;
  m.level = root_level;
  for (int i = 0; i < dsm->getClusterSize(); ++i) {
    dsm->rpc_call_dir(m, i);
  }
}

// Install a brand-new root page pointing at {left, right} split children,
// then CAS the root-pointer slot from old_root. Returns false if another
// client won the race.
bool Tree::update_new_root(GlobalAddress left, const Key &k,
                           GlobalAddress right, int level,
                           GlobalAddress old_root, CoroContext *cxt,
                           int coro_id) {

  auto page_buffer = dsm->get_rbuf(coro_id).get_page_buffer();
  auto cas_buffer = dsm->get_rbuf(coro_id).get_cas_buffer();
  auto new_root = new (page_buffer) InternalPage(left, k, right, level);

  auto new_root_addr = dsm->alloc(kInternalPageSize);

  new_root->set_consistent();
  // Write the page first, then publish it via CAS on the root slot.
  dsm->write_sync(page_buffer, new_root_addr, kInternalPageSize, cxt);
  if (dsm->cas_sync(root_ptr_ptr, old_root, new_root_addr, cas_buffer, cxt)) {
    broadcast_new_root(new_root_addr, level);
    std::cout << "new root level " << level << " " << new_root_addr
              << std::endl;
    return true;
  } else {
    std::cout << "cas root fail " << std::endl;
  }

  return false;
}

// Debug walk: descend the leftmost spine to level 0, then scan the leaf
// chain through sibling pointers.
void Tree::print_and_check_tree(CoroContext *cxt, int coro_id) {
  assert(dsm->is_register());

  auto root = get_root_ptr(cxt, coro_id);
  // SearchResult result;

  GlobalAddress p = root;
  GlobalAddress levels[define::kMaxLevelOfTree];
  int level_cnt = 0;
  auto page_buffer = (dsm->get_rbuf(coro_id)).get_page_buffer();
  GlobalAddress leaf_head;

next_level:

  dsm->read_sync(page_buffer, p, kLeafPageSize);
  auto header = (Header *)(page_buffer + (STRUCT_OFFSET(LeafPage, hdr)));
  levels[level_cnt++] = p;
  if (header->level != 0) {
    p = header->leftmost_ptr;
    goto next_level;
  } else {
    leaf_head = p;
  }

next:
  dsm->read_sync(page_buffer, leaf_head, kLeafPageSize);
  auto page = (LeafPage *)page_buffer;
  for (int i = 0; i < kLeafCardinality; ++i) {
    if (page->records[i].value != kValueNull) {
    }
  }
  // The `while` acts as an `if`: the goto restarts at `next` for each
  // sibling until the chain ends.
  while (page->hdr.sibling_ptr != GlobalAddress::Null()) {
    leaf_head = page->hdr.sibling_ptr;
    goto next;
  }

  // for (int i = 0; i < level_cnt; ++i) {
  //   dsm->read_sync(page_buffer, levels[i], kLeafPageSize);
  //   auto header = (Header *)(page_buffer + (STRUCT_OFFSET(LeafPage, hdr)));
  //   // std::cout << "addr: " << levels[i] << " ";
  //   // header->debug();
  //   // std::cout << " | ";
  //   while (header->sibling_ptr != GlobalAddress::Null()) {
  //     dsm->read_sync(page_buffer, header->sibling_ptr, kLeafPageSize);
  //     header = (Header *)(page_buffer + (STRUCT_OFFSET(LeafPage, hdr)));
  //     // std::cout << "addr: " << header->sibling_ptr << " ";
  //     // header->debug();
  //     // std::cout << " | ";
  //   }
  //   // std::cout << "\n------------------------------------" << std::endl;
  //   // std::cout << "------------------------------------" << std::endl;
  // }
}

// Acquire the global lock word at lock_addr. First tries the local
// hand-over fast path; otherwise spins on a remote CAS (0 -> tag) against
// the on-chip lock region. `tag` encodes node id + thread id for deadlock
// diagnostics.
inline bool Tree::try_lock_addr(GlobalAddress lock_addr, uint64_t tag,
                                uint64_t *buf, CoroContext *cxt, int coro_id) {

  bool hand_over = acquire_local_lock(lock_addr, cxt, coro_id);
  if (hand_over) {
    // A local thread handed the (already-held) global lock to us.
    return true;
  }

  {

    uint64_t retry_cnt = 0;
    uint64_t pre_tag = 0;
    uint64_t conflict_tag = 0;
  retry:
    retry_cnt++;
    if (retry_cnt > 1000000) {
      // The same holder has blocked us for 1M consecutive tries.
      std::cout << "Deadlock " << lock_addr << std::endl;

      std::cout << dsm->getMyNodeID() << ", " << dsm->getMyThreadID()
                << " locked by " << (conflict_tag >> 32) << ", "
                << (conflict_tag << 32 >> 32) << std::endl;
      assert(false);
    }

    bool res = dsm->cas_dm_sync(lock_addr, 0, tag, buf, cxt);

    if (!res) {
      // Reset the retry counter whenever the lock holder changes.
      conflict_tag = *buf - 1;
      if (conflict_tag != pre_tag) {
        retry_cnt = 0;
        pre_tag = conflict_tag;
      }
      goto retry;
    }
  }

  return true;
}

// Release the global lock: skip the remote write entirely if a local waiter
// will take over the lock; otherwise zero the lock word (async = fire and
// forget, no completion wait).
inline void Tree::unlock_addr(GlobalAddress lock_addr, uint64_t tag,
                              uint64_t *buf, CoroContext *cxt, int coro_id,
                              bool async) {

  bool hand_over_other = can_hand_over(lock_addr);
  if (hand_over_other) {
    releases_local_lock(lock_addr);
    return;
  }

  auto cas_buf = dsm->get_rbuf(coro_id).get_cas_buffer();

  *cas_buf = 0;
  if (async) {
    dsm->write_dm((char *)cas_buf, lock_addr, sizeof(uint64_t), false);
  } else {
    dsm->write_dm_sync((char *)cas_buf, lock_addr, sizeof(uint64_t), cxt);
  }

  releases_local_lock(lock_addr);
}

// Write a page back and release its global lock. When no hand-over is
// possible, the page write and the lock-word clear are posted as one
// doorbelled batch (page to data region, zero to on-chip lock region).
void Tree::write_page_and_unlock(char *page_buffer, GlobalAddress page_addr,
                                 int page_size, uint64_t *cas_buffer,
                                 GlobalAddress lock_addr, uint64_t tag,
                                 CoroContext *cxt, int coro_id, bool async) {

  bool hand_over_other = can_hand_over(lock_addr);
  if (hand_over_other) {
    // Keep holding the global lock for the next local waiter; only the page
    // itself needs to reach the remote side.
    dsm->write_sync(page_buffer, page_addr, page_size, cxt);
    releases_local_lock(lock_addr);
    return;
  }

  RdmaOpRegion rs[2];
  rs[0].source = (uint64_t)page_buffer;
  rs[0].dest = page_addr;
  rs[0].size = page_size;
  rs[0].is_on_chip = false;

  rs[1].source = (uint64_t)dsm->get_rbuf(coro_id).get_cas_buffer();
  rs[1].dest = lock_addr;
  rs[1].size = sizeof(uint64_t);

  rs[1].is_on_chip = true;

  *(uint64_t *)rs[1].source = 0;
  if (async) {
    dsm->write_batch(rs, 2, false);
  } else {
    dsm->write_batch_sync(rs, 2, cxt);
  }

  releases_local_lock(lock_addr);
}

// Lock-then-read: acquire the page's global lock, then fetch the page.
void Tree::lock_and_read_page(char *page_buffer, GlobalAddress page_addr,
                              int page_size, uint64_t *cas_buffer,
                              GlobalAddress lock_addr, uint64_t tag,
                              CoroContext *cxt, int coro_id) {

  try_lock_addr(lock_addr, tag, cas_buffer, cxt, coro_id);

  dsm->read_sync(page_buffer, page_addr, page_size, cxt);
}

// Microbenchmark helper: lock/unlock the lock word hashed from key k.
void Tree::lock_bench(const Key &k, CoroContext *cxt, int coro_id) {
  uint64_t lock_index = CityHash64((char *)&k, sizeof(k)) % define::kNumOfLock;

  GlobalAddress lock_addr;
  lock_addr.nodeID = 0;
  lock_addr.offset = lock_index * sizeof(uint64_t);
  auto cas_buffer = dsm->get_rbuf(coro_id).get_cas_buffer();

  // bool res = dsm->cas_sync(lock_addr, 0, 1, cas_buffer, cxt);
  try_lock_addr(lock_addr, 1, cas_buffer, cxt, coro_id);
  unlock_addr(lock_addr, 1, cas_buffer, cxt, coro_id, true);
}
lock_addr.nodeID = 0; 315 | lock_addr.offset = lock_index * sizeof(uint64_t); 316 | auto cas_buffer = dsm->get_rbuf(coro_id).get_cas_buffer(); 317 | 318 | // bool res = dsm->cas_sync(lock_addr, 0, 1, cas_buffer, cxt); 319 | try_lock_addr(lock_addr, 1, cas_buffer, cxt, coro_id); 320 | unlock_addr(lock_addr, 1, cas_buffer, cxt, coro_id, true); 321 | } 322 | 323 | void Tree::insert_internal(const Key &k, GlobalAddress v, CoroContext *cxt, 324 | int coro_id, int level) { 325 | auto root = get_root_ptr(cxt, coro_id); 326 | SearchResult result; 327 | 328 | GlobalAddress p = root; 329 | 330 | next: 331 | 332 | if (!page_search(p, k, result, cxt, coro_id)) { 333 | std::cout << "SEARCH WARNING insert" << std::endl; 334 | p = get_root_ptr(cxt, coro_id); 335 | sleep(1); 336 | goto next; 337 | } 338 | 339 | assert(result.level != 0); 340 | if (result.slibing != GlobalAddress::Null()) { 341 | p = result.slibing; 342 | goto next; 343 | } 344 | 345 | p = result.next_level; 346 | if (result.level != level + 1) { 347 | goto next; 348 | } 349 | 350 | internal_page_store(p, k, v, root, level, cxt, coro_id); 351 | } 352 | 353 | void Tree::insert(const Key &k, const Value &v, CoroContext *cxt, int coro_id) { 354 | assert(dsm->is_register()); 355 | 356 | before_operation(cxt, coro_id); 357 | 358 | if (enable_cache) { 359 | GlobalAddress cache_addr; 360 | auto entry = index_cache->search_from_cache(k, &cache_addr, 361 | dsm->getMyThreadID() == 0); 362 | if (entry) { // cache hit 363 | auto root = get_root_ptr(cxt, coro_id); 364 | if (leaf_page_store(cache_addr, k, v, root, 0, cxt, coro_id, true)) { 365 | 366 | cache_hit[dsm->getMyThreadID()][0]++; 367 | return; 368 | } 369 | // cache stale, from root, 370 | index_cache->invalidate(entry); 371 | } 372 | cache_miss[dsm->getMyThreadID()][0]++; 373 | } 374 | 375 | auto root = get_root_ptr(cxt, coro_id); 376 | SearchResult result; 377 | 378 | GlobalAddress p = root; 379 | 380 | next: 381 | 382 | if (!page_search(p, k, result, cxt, coro_id)) { 
383 | std::cout << "SEARCH WARNING insert" << std::endl; 384 | p = get_root_ptr(cxt, coro_id); 385 | sleep(1); 386 | goto next; 387 | } 388 | 389 | if (!result.is_leaf) { 390 | assert(result.level != 0); 391 | if (result.slibing != GlobalAddress::Null()) { 392 | p = result.slibing; 393 | goto next; 394 | } 395 | 396 | p = result.next_level; 397 | if (result.level != 1) { 398 | goto next; 399 | } 400 | } 401 | 402 | leaf_page_store(p, k, v, root, 0, cxt, coro_id); 403 | } 404 | 405 | bool Tree::search(const Key &k, Value &v, CoroContext *cxt, int coro_id) { 406 | assert(dsm->is_register()); 407 | 408 | auto root = get_root_ptr(cxt, coro_id); 409 | SearchResult result; 410 | 411 | GlobalAddress p = root; 412 | 413 | bool from_cache = false; 414 | const CacheEntry *entry = nullptr; 415 | if (enable_cache) { 416 | GlobalAddress cache_addr; 417 | entry = index_cache->search_from_cache(k, &cache_addr, 418 | dsm->getMyThreadID() == 0); 419 | if (entry) { // cache hit 420 | cache_hit[dsm->getMyThreadID()][0]++; 421 | from_cache = true; 422 | p = cache_addr; 423 | 424 | } else { 425 | cache_miss[dsm->getMyThreadID()][0]++; 426 | } 427 | } 428 | 429 | next: 430 | if (!page_search(p, k, result, cxt, coro_id, from_cache)) { 431 | if (from_cache) { // cache stale 432 | index_cache->invalidate(entry); 433 | cache_hit[dsm->getMyThreadID()][0]--; 434 | cache_miss[dsm->getMyThreadID()][0]++; 435 | from_cache = false; 436 | 437 | p = root; 438 | } else { 439 | std::cout << "SEARCH WARNING search" << std::endl; 440 | sleep(1); 441 | } 442 | goto next; 443 | } 444 | if (result.is_leaf) { 445 | if (result.val != kValueNull) { // find 446 | v = result.val; 447 | return true; 448 | } 449 | if (result.slibing != GlobalAddress::Null()) { // turn right 450 | p = result.slibing; 451 | goto next; 452 | } 453 | return false; // not found 454 | } else { // internal 455 | p = result.slibing != GlobalAddress::Null() ? 
result.slibing 456 | : result.next_level; 457 | goto next; 458 | } 459 | } 460 | 461 | uint64_t Tree::range_query(const Key &from, const Key &to, Value *value_buffer, 462 | CoroContext *cxt, int coro_id) { 463 | 464 | const int kParaFetch = 32; 465 | thread_local std::vector result; 466 | thread_local std::vector leaves; 467 | 468 | result.clear(); 469 | leaves.clear(); 470 | index_cache->search_range_from_cache(from, to, result); 471 | 472 | // FIXME: here, we assume all innernal nodes are cached in compute node 473 | if (result.empty()) { 474 | return 0; 475 | } 476 | 477 | uint64_t counter = 0; 478 | for (auto page : result) { 479 | auto cnt = page->hdr.last_index + 1; 480 | auto addr = page->hdr.leftmost_ptr; 481 | 482 | // [from, to] 483 | // [lowest, page->records[0].key); 484 | bool no_fetch = from > page->records[0].key || to < page->hdr.lowest; 485 | if (!no_fetch) { 486 | leaves.push_back(addr); 487 | } 488 | for (int i = 1; i < cnt; ++i) { 489 | no_fetch = from > page->records[i].key || to < page->records[i - 1].key; 490 | if (!no_fetch) { 491 | leaves.push_back(page->records[i - 1].ptr); 492 | } 493 | } 494 | 495 | no_fetch = from > page->hdr.highest || to < page->records[cnt - 1].key; 496 | if (!no_fetch) { 497 | leaves.push_back(page->records[cnt - 1].ptr); 498 | } 499 | } 500 | 501 | int cq_cnt = 0; 502 | char *range_buffer = (dsm->get_rbuf(coro_id)).get_range_buffer(); 503 | for (size_t i = 0; i < leaves.size(); ++i) { 504 | if (i > 0 && i % kParaFetch == 0) { 505 | dsm->poll_rdma_cq(kParaFetch); 506 | cq_cnt -= kParaFetch; 507 | for (int k = 0; k < kParaFetch; ++k) { 508 | auto page = (LeafPage *)(range_buffer + k * kLeafPageSize); 509 | for (int i = 0; i < kLeafCardinality; ++i) { 510 | auto &r = page->records[i]; 511 | if (r.value != kValueNull && r.f_version == r.r_version) { 512 | if (r.key >= from && r.key <= to) { 513 | value_buffer[counter++] = r.value; 514 | } 515 | } 516 | } 517 | } 518 | } 519 | dsm->read(range_buffer + kLeafPageSize * (i 
% kParaFetch), leaves[i], 520 | kLeafPageSize, true); 521 | cq_cnt++; 522 | } 523 | 524 | if (cq_cnt != 0) { 525 | dsm->poll_rdma_cq(cq_cnt); 526 | for (int k = 0; k < cq_cnt; ++k) { 527 | auto page = (LeafPage *)(range_buffer + k * kLeafPageSize); 528 | for (int i = 0; i < kLeafCardinality; ++i) { 529 | auto &r = page->records[i]; 530 | if (r.value != kValueNull && r.f_version == r.r_version) { 531 | if (r.key >= from && r.key <= to) { 532 | value_buffer[counter++] = r.value; 533 | } 534 | } 535 | } 536 | } 537 | } 538 | 539 | return counter; 540 | } 541 | 542 | void Tree::del(const Key &k, CoroContext *cxt, int coro_id) { 543 | assert(dsm->is_register()); 544 | 545 | before_operation(cxt, coro_id); 546 | 547 | if (enable_cache) { 548 | GlobalAddress cache_addr; 549 | auto entry = index_cache->search_from_cache(k, &cache_addr, 550 | dsm->getMyThreadID() == 0); 551 | if (entry) { // cache hit 552 | if (leaf_page_del(cache_addr, k, 0, cxt, coro_id, true)) { 553 | 554 | cache_hit[dsm->getMyThreadID()][0]++; 555 | return; 556 | } 557 | // cache stale, from root, 558 | index_cache->invalidate(entry); 559 | } 560 | cache_miss[dsm->getMyThreadID()][0]++; 561 | } 562 | 563 | auto root = get_root_ptr(cxt, coro_id); 564 | SearchResult result; 565 | 566 | GlobalAddress p = root; 567 | 568 | next: 569 | 570 | if (!page_search(p, k, result, cxt, coro_id)) { 571 | std::cout << "SEARCH WARNING del" << std::endl; 572 | p = get_root_ptr(cxt, coro_id); 573 | sleep(1); 574 | goto next; 575 | } 576 | 577 | if (!result.is_leaf) { 578 | assert(result.level != 0); 579 | if (result.slibing != GlobalAddress::Null()) { 580 | p = result.slibing; 581 | goto next; 582 | } 583 | 584 | p = result.next_level; 585 | if (result.level != 1) { 586 | goto next; 587 | } 588 | } 589 | 590 | leaf_page_del(p, k, 0, cxt, coro_id); 591 | } 592 | 593 | bool Tree::page_search(GlobalAddress page_addr, const Key &k, 594 | SearchResult &result, CoroContext *cxt, int coro_id, 595 | bool from_cache) { 596 | auto 
// Read the page at page_addr and classify it for key k. Fills `result`
// (leaf/internal, level, value or next pointer). Returns false when a
// cached address turned out stale, or on an impossible lower-bound miss.
// Re-reads until the page's front/rear version check passes.
bool Tree::page_search(GlobalAddress page_addr, const Key &k,
                       SearchResult &result, CoroContext *cxt, int coro_id,
                       bool from_cache) {
  auto page_buffer = (dsm->get_rbuf(coro_id)).get_page_buffer();
  // Header sits at the same offset in both page types (checked in
  // print_verbose), so it can be inspected before knowing the page kind.
  auto header = (Header *)(page_buffer + (STRUCT_OFFSET(LeafPage, hdr)));

  int counter = 0;
re_read:
  if (++counter > 100) {
    printf("re read too many times\n");
    sleep(1);
  }
  dsm->read_sync(page_buffer, page_addr, kLeafPageSize, cxt);

  memset(&result, 0, sizeof(result));
  result.is_leaf = header->leftmost_ptr == GlobalAddress::Null();
  result.level = header->level;
  path_stack[coro_id][result.level] = page_addr;
  // std::cout << "level " << (int)result.level << " " << page_addr <<
  // std::endl;

  if (result.is_leaf) {
    auto page = (LeafPage *)page_buffer;
    if (!page->check_consistent()) {
      // Torn read (concurrent writer); fetch the page again.
      goto re_read;
    }

    if (from_cache &&
        (k < page->hdr.lowest || k >= page->hdr.highest)) { // cache is stale
      return false;
    }

    assert(result.level == 0);
    if (k >= page->hdr.highest) { // should turn right
      result.slibing = page->hdr.sibling_ptr;
      return true;
    }
    if (k < page->hdr.lowest) {
      assert(false);
      return false;
    }
    leaf_page_search(page, k, result);
  } else {
    assert(result.level != 0);
    assert(!from_cache);
    auto page = (InternalPage *)page_buffer;

    if (!page->check_consistent()) {
      goto re_read;
    }

    // Level-1 pages (parents of leaves) are what the index cache stores.
    if (result.level == 1 && enable_cache) {
      index_cache->add_to_cache(page);
    }

    if (k >= page->hdr.highest) { // should turn right
      result.slibing = page->hdr.sibling_ptr;
      return true;
    }
    if (k < page->hdr.lowest) {
      printf("key %ld error in level %d\n", k, page->hdr.level);
      sleep(10);
      print_and_check_tree();
      assert(false);
      return false;
    }
    internal_page_search(page, k, result);
  }

  return true;
}

// Binary-fence-free child selection: pick the child whose key range
// contains k (leftmost_ptr covers keys below records[0].key).
void Tree::internal_page_search(InternalPage *page, const Key &k,
                                SearchResult &result) {

  assert(k >= page->hdr.lowest);
  assert(k < page->hdr.highest);

  auto cnt = page->hdr.last_index + 1;
  // page->debug();
  if (k < page->records[0].key) {
    result.next_level = page->hdr.leftmost_ptr;
    return;
  }

  for (int i = 1; i < cnt; ++i) {
    if (k < page->records[i].key) {
      result.next_level = page->records[i - 1].ptr;
      return;
    }
  }
  result.next_level = page->records[cnt - 1].ptr;
}

// Linear scan of an (unsorted) leaf for k; only version-consistent live
// records count. Leaves result.val untouched (kValueNull) when absent.
void Tree::leaf_page_search(LeafPage *page, const Key &k,
                            SearchResult &result) {

  for (int i = 0; i < kLeafCardinality; ++i) {
    auto &r = page->records[i];
    if (r.key == k && r.value != kValueNull && r.f_version == r.r_version) {
      result.val = r.value;
      break;
    }
  }
}

// Insert (k -> v) into the internal page at page_addr under its global
// lock; splits the page when full and recurses into the parent (from
// path_stack) or installs a new root.
void Tree::internal_page_store(GlobalAddress page_addr, const Key &k,
                               GlobalAddress v, GlobalAddress root, int level,
                               CoroContext *cxt, int coro_id) {
  uint64_t lock_index =
      CityHash64((char *)&page_addr, sizeof(page_addr)) % define::kNumOfLock;

  GlobalAddress lock_addr;
  lock_addr.nodeID = page_addr.nodeID;
  lock_addr.offset = lock_index * sizeof(uint64_t);

  auto &rbuf = dsm->get_rbuf(coro_id);
  uint64_t *cas_buffer = rbuf.get_cas_buffer();
  auto page_buffer = rbuf.get_page_buffer();

  auto tag = dsm->getThreadTag();
  assert(tag != 0);

  lock_and_read_page(page_buffer, page_addr, kInternalPageSize, cas_buffer,
                     lock_addr, tag, cxt, coro_id);

  auto page = (InternalPage *)page_buffer;

  assert(page->hdr.level == level);
  assert(page->check_consistent());
  if (k >= page->hdr.highest) {
    // Page was split concurrently; retry on the right sibling.
    this->unlock_addr(lock_addr, tag, cas_buffer, cxt, coro_id, true);

    assert(page->hdr.sibling_ptr != GlobalAddress::Null());

    this->internal_page_store(page->hdr.sibling_ptr, k, v, root, level, cxt,
                              coro_id);

    return;
  }
  assert(k >= page->hdr.lowest);

  auto cnt = page->hdr.last_index + 1;

  bool is_update = false;
  uint16_t insert_index = 0;
  // Records are sorted: scan from the right for an existing key or the
  // insertion slot.
  for (int i = cnt - 1; i >= 0; --i) {
    if (page->records[i].key == k) { // find and update
      page->records[i].ptr = v;
      // assert(false);
      is_update = true;
      break;
    }
    if (page->records[i].key < k) {
      insert_index = i + 1;
      break;
    }
  }

  assert(cnt != kInternalCardinality);

  if (!is_update) { // insert and shift
    for (int i = cnt; i > insert_index; --i) {
      page->records[i].key = page->records[i - 1].key;
      page->records[i].ptr = page->records[i - 1].ptr;
    }
    page->records[insert_index].key = k;
    page->records[insert_index].ptr = v;

    page->hdr.last_index++;
  }

  cnt = page->hdr.last_index + 1;
  bool need_split = cnt == kInternalCardinality;
  Key split_key;
  GlobalAddress sibling_addr;
  if (need_split) { // need split
    sibling_addr = dsm->alloc(kInternalPageSize);
    auto sibling_buf = rbuf.get_sibling_buffer();

    auto sibling = new (sibling_buf) InternalPage(page->hdr.level);

    // std::cout << "addr " << sibling_addr << " | level " <<
    // (int)(page->hdr.level) << std::endl;

    // records[m] is promoted: its key becomes the split key and its child
    // becomes the sibling's leftmost pointer.
    int m = cnt / 2;
    split_key = page->records[m].key;
    assert(split_key > page->hdr.lowest);
    assert(split_key < page->hdr.highest);
    for (int i = m + 1; i < cnt; ++i) { // move
      sibling->records[i - m - 1].key = page->records[i].key;
      sibling->records[i - m - 1].ptr = page->records[i].ptr;
    }
    page->hdr.last_index -= (cnt - m);
    sibling->hdr.last_index += (cnt - m - 1);

    sibling->hdr.leftmost_ptr = page->records[m].ptr;
    sibling->hdr.lowest = page->records[m].key;
    sibling->hdr.highest = page->hdr.highest;
    page->hdr.highest = page->records[m].key;

    // link
    sibling->hdr.sibling_ptr = page->hdr.sibling_ptr;
    page->hdr.sibling_ptr = sibling_addr;

    sibling->set_consistent();
    dsm->write_sync(sibling_buf, sibling_addr, kInternalPageSize, cxt);
  }

  page->set_consistent();
  // async unlock only when a split follows (the parent update serializes).
  write_page_and_unlock(page_buffer, page_addr, kInternalPageSize, cas_buffer,
                        lock_addr, tag, cxt, coro_id, need_split);

  if (!need_split)
    return;

  if (root == page_addr) { // update root

    if (update_new_root(page_addr, split_key, sibling_addr, level + 1, root,
                        cxt, coro_id)) {
      return;
    }
  }

  auto up_level = path_stack[coro_id][level + 1];

  if (up_level != GlobalAddress::Null()) {
    internal_page_store(up_level, split_key, sibling_addr, root, level + 1, cxt,
                        coro_id);
  } else {
    assert(false);
  }
}
page->hdr.lowest || k >= page->hdr.highest)) { // cache is stale 861 | this->unlock_addr(lock_addr, tag, cas_buffer, cxt, coro_id, true); 862 | return false; 863 | } 864 | 865 | if (k >= page->hdr.highest) { 866 | 867 | this->unlock_addr(lock_addr, tag, cas_buffer, cxt, coro_id, true); 868 | assert(page->hdr.sibling_ptr != GlobalAddress::Null()); 869 | this->leaf_page_store(page->hdr.sibling_ptr, k, v, root, level, cxt, 870 | coro_id); 871 | return true; 872 | } 873 | assert(k >= page->hdr.lowest); 874 | 875 | int cnt = 0; 876 | int empty_index = -1; 877 | char *update_addr = nullptr; 878 | for (int i = 0; i < kLeafCardinality; ++i) { 879 | 880 | auto &r = page->records[i]; 881 | if (r.value != kValueNull) { 882 | cnt++; 883 | if (r.key == k) { 884 | r.value = v; 885 | r.f_version++; 886 | r.r_version = r.f_version; 887 | update_addr = (char *)&r; 888 | break; 889 | } 890 | } else if (empty_index == -1) { 891 | empty_index = i; 892 | } 893 | } 894 | 895 | assert(cnt != kLeafCardinality); 896 | 897 | if (update_addr == nullptr) { // insert new item 898 | if (empty_index == -1) { 899 | printf("%d cnt\n", cnt); 900 | assert(false); 901 | } 902 | 903 | auto &r = page->records[empty_index]; 904 | r.key = k; 905 | r.value = v; 906 | r.f_version++; 907 | r.r_version = r.f_version; 908 | 909 | update_addr = (char *)&r; 910 | 911 | cnt++; 912 | } 913 | 914 | bool need_split = cnt == kLeafCardinality; 915 | if (!need_split) { 916 | assert(update_addr); 917 | write_page_and_unlock( 918 | update_addr, GADD(page_addr, (update_addr - (char *)page)), 919 | sizeof(LeafEntry), cas_buffer, lock_addr, tag, cxt, coro_id, false); 920 | 921 | return true; 922 | } else { 923 | std::sort( 924 | page->records, page->records + kLeafCardinality, 925 | [](const LeafEntry &a, const LeafEntry &b) { return a.key < b.key; }); 926 | } 927 | 928 | Key split_key; 929 | GlobalAddress sibling_addr; 930 | if (need_split) { // need split 931 | sibling_addr = dsm->alloc(kLeafPageSize); 932 | auto 
sibling_buf = rbuf.get_sibling_buffer(); 933 | 934 | auto sibling = new (sibling_buf) LeafPage(page->hdr.level); 935 | 936 | // std::cout << "addr " << sibling_addr << " | level " << 937 | // (int)(page->hdr.level) << std::endl; 938 | 939 | int m = cnt / 2; 940 | split_key = page->records[m].key; 941 | assert(split_key > page->hdr.lowest); 942 | assert(split_key < page->hdr.highest); 943 | 944 | for (int i = m; i < cnt; ++i) { // move 945 | sibling->records[i - m].key = page->records[i].key; 946 | sibling->records[i - m].value = page->records[i].value; 947 | page->records[i].key = 0; 948 | page->records[i].value = kValueNull; 949 | } 950 | page->hdr.last_index -= (cnt - m); 951 | sibling->hdr.last_index += (cnt - m); 952 | 953 | sibling->hdr.lowest = split_key; 954 | sibling->hdr.highest = page->hdr.highest; 955 | page->hdr.highest = split_key; 956 | 957 | // link 958 | sibling->hdr.sibling_ptr = page->hdr.sibling_ptr; 959 | page->hdr.sibling_ptr = sibling_addr; 960 | 961 | sibling->set_consistent(); 962 | dsm->write_sync(sibling_buf, sibling_addr, kLeafPageSize, cxt); 963 | } 964 | 965 | page->set_consistent(); 966 | 967 | write_page_and_unlock(page_buffer, page_addr, kLeafPageSize, cas_buffer, 968 | lock_addr, tag, cxt, coro_id, need_split); 969 | 970 | if (!need_split) 971 | return true; 972 | 973 | if (root == page_addr) { // update root 974 | if (update_new_root(page_addr, split_key, sibling_addr, level + 1, root, 975 | cxt, coro_id)) { 976 | return true; 977 | } 978 | } 979 | 980 | auto up_level = path_stack[coro_id][level + 1]; 981 | 982 | if (up_level != GlobalAddress::Null()) { 983 | internal_page_store(up_level, split_key, sibling_addr, root, level + 1, cxt, 984 | coro_id); 985 | } else { 986 | assert(from_cache); 987 | insert_internal(split_key, sibling_addr, cxt, coro_id, level + 1); 988 | } 989 | 990 | return true; 991 | } 992 | 993 | bool Tree::leaf_page_del(GlobalAddress page_addr, const Key &k, int level, 994 | CoroContext *cxt, int coro_id, bool 
from_cache) { 995 | uint64_t lock_index = 996 | CityHash64((char *)&page_addr, sizeof(page_addr)) % define::kNumOfLock; 997 | 998 | GlobalAddress lock_addr; 999 | 1000 | #ifdef CONFIG_ENABLE_EMBEDDING_LOCK 1001 | lock_addr = page_addr; 1002 | #else 1003 | lock_addr.nodeID = page_addr.nodeID; 1004 | lock_addr.offset = lock_index * sizeof(uint64_t); 1005 | #endif 1006 | 1007 | auto &rbuf = dsm->get_rbuf(coro_id); 1008 | uint64_t *cas_buffer = rbuf.get_cas_buffer(); 1009 | auto page_buffer = rbuf.get_page_buffer(); 1010 | 1011 | auto tag = dsm->getThreadTag(); 1012 | assert(tag != 0); 1013 | 1014 | lock_and_read_page(page_buffer, page_addr, kLeafPageSize, cas_buffer, 1015 | lock_addr, tag, cxt, coro_id); 1016 | 1017 | auto page = (LeafPage *)page_buffer; 1018 | 1019 | assert(page->hdr.level == level); 1020 | assert(page->check_consistent()); 1021 | 1022 | if (from_cache && 1023 | (k < page->hdr.lowest || k >= page->hdr.highest)) { // cache is stale 1024 | this->unlock_addr(lock_addr, tag, cas_buffer, cxt, coro_id, true); 1025 | return false; 1026 | } 1027 | 1028 | if (k >= page->hdr.highest) { 1029 | this->unlock_addr(lock_addr, tag, cas_buffer, cxt, coro_id, true); 1030 | assert(page->hdr.sibling_ptr != GlobalAddress::Null()); 1031 | this->leaf_page_del(page->hdr.sibling_ptr, k, level, cxt, coro_id); 1032 | return true; 1033 | } 1034 | 1035 | assert(k >= page->hdr.lowest); 1036 | 1037 | char *update_addr = nullptr; 1038 | for (int i = 0; i < kLeafCardinality; ++i) { 1039 | auto &r = page->records[i]; 1040 | if (r.key == k && r.value != kValueNull) { 1041 | r.value = kValueNull; 1042 | r.f_version++; 1043 | r.r_version = r.f_version; 1044 | update_addr = (char *)&r; 1045 | break; 1046 | } 1047 | } 1048 | 1049 | if (update_addr) { 1050 | write_page_and_unlock( 1051 | update_addr, GADD(page_addr, (update_addr - (char *)page)), 1052 | sizeof(LeafEntry), cas_buffer, lock_addr, tag, cxt, coro_id, false); 1053 | } else { 1054 | this->unlock_addr(lock_addr, tag, cas_buffer, 
cxt, coro_id, false); 1055 | } 1056 | return true; 1057 | } 1058 | 1059 | void Tree::run_coroutine(CoroFunc func, int id, int coro_cnt) { 1060 | 1061 | using namespace std::placeholders; 1062 | 1063 | assert(coro_cnt <= define::kMaxCoro); 1064 | for (int i = 0; i < coro_cnt; ++i) { 1065 | auto gen = func(i, dsm, id); 1066 | worker[i] = CoroCall(std::bind(&Tree::coro_worker, this, _1, gen, i)); 1067 | } 1068 | 1069 | master = CoroCall(std::bind(&Tree::coro_master, this, _1, coro_cnt)); 1070 | 1071 | master(); 1072 | } 1073 | 1074 | void Tree::coro_worker(CoroYield &yield, RequstGen *gen, int coro_id) { 1075 | CoroContext ctx; 1076 | ctx.coro_id = coro_id; 1077 | ctx.master = &master; 1078 | ctx.yield = &yield; 1079 | 1080 | Timer coro_timer; 1081 | auto thread_id = dsm->getMyThreadID(); 1082 | 1083 | while (true) { 1084 | 1085 | auto r = gen->next(); 1086 | 1087 | coro_timer.begin(); 1088 | if (r.is_search) { 1089 | Value v; 1090 | this->search(r.k, v, &ctx, coro_id); 1091 | } else { 1092 | this->insert(r.k, r.v, &ctx, coro_id); 1093 | } 1094 | auto us_10 = coro_timer.end() / 100; 1095 | if (us_10 >= LATENCY_WINDOWS) { 1096 | us_10 = LATENCY_WINDOWS - 1; 1097 | } 1098 | latency[thread_id][us_10]++; 1099 | } 1100 | } 1101 | 1102 | void Tree::coro_master(CoroYield &yield, int coro_cnt) { 1103 | 1104 | for (int i = 0; i < coro_cnt; ++i) { 1105 | yield(worker[i]); 1106 | } 1107 | 1108 | while (true) { 1109 | 1110 | uint64_t next_coro_id; 1111 | 1112 | if (dsm->poll_rdma_cq_once(next_coro_id)) { 1113 | yield(worker[next_coro_id]); 1114 | } 1115 | 1116 | if (!hot_wait_queue.empty()) { 1117 | next_coro_id = hot_wait_queue.front(); 1118 | hot_wait_queue.pop(); 1119 | yield(worker[next_coro_id]); 1120 | } 1121 | } 1122 | } 1123 | 1124 | // Local Locks 1125 | inline bool Tree::acquire_local_lock(GlobalAddress lock_addr, CoroContext *cxt, 1126 | int coro_id) { 1127 | auto &node = local_locks[lock_addr.nodeID][lock_addr.offset / 8]; 1128 | 1129 | uint64_t lock_val = 
node.ticket_lock.fetch_add(1); 1130 | 1131 | uint32_t ticket = lock_val << 32 >> 32; 1132 | uint32_t current = lock_val >> 32; 1133 | 1134 | while (ticket != current) { // lock failed 1135 | 1136 | if (cxt != nullptr) { 1137 | hot_wait_queue.push(coro_id); 1138 | (*cxt->yield)(*cxt->master); 1139 | } 1140 | 1141 | current = node.ticket_lock.load(std::memory_order_relaxed) >> 32; 1142 | } 1143 | 1144 | node.hand_time++; 1145 | 1146 | return node.hand_over; 1147 | } 1148 | 1149 | inline bool Tree::can_hand_over(GlobalAddress lock_addr) { 1150 | 1151 | auto &node = local_locks[lock_addr.nodeID][lock_addr.offset / 8]; 1152 | uint64_t lock_val = node.ticket_lock.load(std::memory_order_relaxed); 1153 | 1154 | uint32_t ticket = lock_val << 32 >> 32; 1155 | uint32_t current = lock_val >> 32; 1156 | 1157 | if (ticket <= current + 1) { // no pending locks 1158 | node.hand_over = false; 1159 | } else { 1160 | node.hand_over = node.hand_time < define::kMaxHandOverTime; 1161 | } 1162 | if (!node.hand_over) { 1163 | node.hand_time = 0; 1164 | } 1165 | 1166 | return node.hand_over; 1167 | } 1168 | 1169 | inline void Tree::releases_local_lock(GlobalAddress lock_addr) { 1170 | auto &node = local_locks[lock_addr.nodeID][lock_addr.offset / 8]; 1171 | 1172 | node.ticket_lock.fetch_add((1ull << 32)); 1173 | } 1174 | 1175 | void Tree::index_cache_statistics() { 1176 | index_cache->statistics(); 1177 | index_cache->bench(); 1178 | } 1179 | 1180 | void Tree::clear_statistics() { 1181 | for (int i = 0; i < MAX_APP_THREAD; ++i) { 1182 | cache_hit[i][0] = 0; 1183 | cache_miss[i][0] = 0; 1184 | } 1185 | } 1186 | --------------------------------------------------------------------------------