├── AUTHORS ├── LICENSE ├── Makefile ├── README.md ├── alloc ├── alloc.hpp ├── alloc_adapter.hpp ├── block.hpp ├── block_alloc.hpp ├── local_block.hpp ├── memfd.h ├── size_table.hpp ├── superblock.hpp └── thread_alloc.hpp ├── common └── common.hpp ├── compact.cpp ├── compaction_latency.cpp ├── core.sh ├── latency.cpp ├── load.cpp ├── local_read_benchmark.cpp ├── main.cpp ├── paper └── corm.pdf ├── rdma ├── connectRDMA.hpp ├── rdma_helpers.hpp ├── rdma_memory_manager.hpp └── verbsEP.hpp ├── remote_read_benchmark.cpp ├── run_compaction.sh ├── run_latency.sh ├── run_read_throughput.sh ├── run_throughput.sh ├── run_throughput_compaction.sh ├── thread ├── messenger.hpp └── thread.hpp ├── unload.cpp ├── utilities ├── block_home_table.h ├── cxxopts.hpp ├── debug.h ├── rcu.h ├── timer.h ├── ycsb.hpp └── zipf.hpp ├── worker ├── ReaderWriter.hpp ├── client_api.hpp ├── communication.hpp ├── generic_worker.hpp └── worker.hpp └── workload_readwrite.cpp /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the official list of CoRM authors (individuals or organizations) for 2 | # copyright purposes. 3 | 4 | ETH Zurich 5 | Konstantin Taranov 6 | Salvatore Di Girolamo 7 | Torsten Hoefler 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020-2021, ETH Zurich, and all contributors listed in AUTHORS 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: remote_read_benchmark workload_readwrite latency server load unload compaction compact local_read_benchmark 2 | 3 | CFLAGS += -Wall -std=c++14 -O2 -I./ -libverbs -lpthread -lrdmacm -lev #-DDEBUG #-g -D_GNU_SOURCE 4 | CPP = #./alloc/alloc_adapter.cpp 5 | 6 | 7 | remote_read_benchmark: 8 | rm -f remote_read_benchmark 9 | g++ remote_read_benchmark.cpp $(CFLAGS) -o remote_read_benchmark 10 | 11 | latency: 12 | rm -f latency 13 | g++ latency.cpp $(CFLAGS) -o latency 14 | 15 | server: 16 | rm -f server 17 | g++ main.cpp $(CFLAGS) $(CPP) -o server 18 | 19 | load: 20 | rm -f load 21 | g++ load.cpp $(CFLAGS) $(CPP) -o load 22 | 23 | unload: 24 | rm -f unload 25 | g++ unload.cpp $(CFLAGS) $(CPP) -o unload 26 | 27 | compaction: 28 | rm -f compaction 29 | g++ compaction_latency.cpp $(CFLAGS) $(CPP) -o compaction 30 | 31 | compact: 32 | rm -f compact 33 | g++ compact.cpp $(CFLAGS) $(CPP) -o compact 34 | 35 | workload_readwrite: 36 | rm -f workload_readwrite 37 | g++ workload_readwrite.cpp $(CFLAGS) $(CPP) -o workload_readwrite 38 | 39 | local_read_benchmark: 40 | rm -f local_read_benchmark 41 | g++ local_read_benchmark.cpp $(CFLAGS) $(CPP) -o local_read_benchmark 42 | 43 | clean: 44 | rm -f remote_read_benchmark workload_readwrite latency server load unload compaction compact local_read_benchmark 45 | .DELETE_ON_ERROR: 46 | .PHONY: all clean 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CoRM: Compactable Remote Memory over RDMA 2 | A remote memory system that supports compaction over RDMA networks. 3 | This is the source code for our [SIGMOD 2021 paper](paper/corm.pdf). 4 | 5 | ## Requirements 6 | * GCC >= 4.9 with C++14 features 7 | * rdma-core library, or an equivalent RDMA verbs library 8 | * RDMA-capable network devices must have assigned IP addresses 9 | * Boost lockfree queue 10 | * libev-dev library 11 | 12 | ## Usage 13 | 14 | To compile the code, simply run `make`. 15 | We provide a series of bash scripts to launch CoRM. For that, modify the IP addresses of your servers accordingly. 16 | 17 | 18 | ## Basic usage 19 | ``` 20 | make 21 | ./server -a 192.168.1.10 --threads=1 % start CoRM with 1 thread. CoRM will print size-class info and then periodically report stats of the worker thread. 22 | ./latency -a 192.168.1.10 % start a basic latency test 23 | ``` 24 | 25 | Note that CoRM prints only RPC stats, as it is unaware of completed one-sided RDMA reads. 26 | 27 | 28 | ## Debugging 29 | For debugging, include the `-DDEBUG` flag in `CFLAGS` of the Makefile. It enables printing of debug messages. 30 | 31 | ## Implementation details 32 | For research purposes, CoRM has the following implementation artifacts: 33 | 34 | #### Connection establishment model 35 | Each new client is directly connected to a remote thread worker. 36 | The thread worker is assigned in round-robin order, which helps to manage and debug the thread that is responsible for a client. 37 | To have direct connections to all threads, a client can open multiple connections to CoRM. 38 | 39 | #### Key/Addr sizes 40 | All sizes of blocks, superblocks, and ids are hard-coded. Refer to `common/common.hpp` to tune the parameters. 41 | 42 | #### Compaction 43 | Compaction is triggered manually; for that, you can use the `./compact` binary.
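For example, a typical invocation (flag names as defined in `compact.cpp`; run `./compact --help` for the authoritative list): ``` ./compact --server=192.168.1.10 --size=24 % trigger compaction for the size class that fits 24-byte objects ```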
44 | 45 | #### Loading/Unloading 46 | For loading and unloading data, please use the `./load` and `./unload` binaries. `load` needs to know the number of remote threads to evenly load each remote worker with objects. This is a side effect of the "Connection establishment model". 47 | 48 | 49 | ## Implementation notice 50 | I am not a professional software developer, and that is why the code is not of production quality. 51 | Notably, I did not invest time in splitting the code into `*.cpp` and `*.hpp` files to improve the compilation process. 52 | Also, the settings related to block, ID, and key sizes are hard-coded and can be managed in `common/common.hpp`. 53 | 54 | 55 | 56 | ## Citing this work 57 | 58 | If you use our code, please consider citing our [SIGMOD 2021 paper](paper/corm.pdf): 59 | 60 | ``` 61 | @inproceedings{taranov-corm, 62 | author = {Taranov, Konstantin and Di Girolamo, Salvatore and Hoefler, Torsten}, 63 | title = {Co{RM}: {C}ompactable {R}emote {M}emory over {RDMA}}, 64 | year = {2021}, 65 | isbn = {9781450383431}, 66 | publisher = {Association for Computing Machinery}, 67 | url = {https://doi.org/10.1145/3448016.3452817}, 68 | doi = {10.1145/3448016.3452817}, 69 | booktitle = {Proceedings of the 2021 ACM SIGMOD International Conference on Management of Data}, 70 | location = {Virtual Event, China}, 71 | numpages = {14}, 72 | series = {SIGMOD'21} 73 | } 74 | ``` 75 | 76 | ## Contact 77 | If you have questions, please contact: 78 | 79 | Konstantin Taranov (konstantin.taranov "at" inf.ethz.ch) 80 | -------------------------------------------------------------------------------- /alloc/alloc.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * Interfaces and callbacks for allocators 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include "block.hpp" 15 | #include "../common/common.hpp" 16 | #include 17 | #include 18 | 19 | static const int SUCCESS = 0; 20 | static const int BLOCK_IS_NOT_FOUND = -1; 21 | static const int OBJECT_DOES_NOT_EXIST = -2; 22 | 23 | 24 | // Thread allocator callbacks. 25 | typedef void (*thread_alloc_cb)( client_addr_t ret_addr, void *owner); 26 | typedef void (*thread_free_cb)( int status, addr_t newaddr, void *owner); 27 | typedef void (*thread_find_cb)( addr_t newaddr, uint16_t slot_size, void *owner); 28 | typedef void (*fixpointer_cb)( int ret, client_addr_t ret_addr, void *owner); 29 | typedef void (*helper_cb)( void *owner); 30 | 31 | 32 | // A per-thread allocator object. 33 | // This class is responsible for assigning object ids. 34 | class ThreadAlloc { 35 | public: 36 | 37 | struct CompactionCtx{ 38 | boost::lockfree::queue q; 39 | std::atomic counter; 40 | const uint8_t type; 41 | const uint8_t initiator; 42 | ThreadAlloc* master; 43 | const bool with_compaction; 44 | std::chrono::time_point t1; 45 | 46 | CompactionCtx(uint8_t num_threads, uint8_t type, uint8_t initiator, ThreadAlloc* master, bool with_compaction): 47 | q(num_threads), counter(num_threads-1), type(type), initiator(initiator), master(master), with_compaction(with_compaction) 48 | { 49 | t1 = std::chrono::high_resolution_clock::now(); 50 | } 51 | }; 52 | 53 | 54 | virtual ~ThreadAlloc() = default; 55 | 56 | // Allocate a slot.
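// The call completes through the callback: `cb` receives the client_addr_t of the newly reserved slot, and `owner` is passed back to the callback unchanged.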
57 | virtual void Alloc(uint32_t size, thread_alloc_cb cb, void *owner) = 0; 58 | 59 | 60 | // Allocate a slot via adapter. 61 | virtual void AllocAtHome(ThreadAlloc* alloc, uint32_t size, thread_alloc_cb cb, void *owner) = 0; 62 | 63 | // Free a previously allocated block on this server. 64 | virtual void Free(client_addr_t client_addr, thread_free_cb cb, void *owner) = 0; 65 | 66 | // shortcut to find address of objects 67 | virtual void FindObjectAddr(client_addr_t client_addr, thread_find_cb cb, void *owner) = 0; 68 | 69 | // Get the id of the home thread. 70 | virtual int GetHomeThreadMpIdx() const = 0; 71 | 72 | virtual void FixClientAddr(client_addr_t client_addr, fixpointer_cb cb, void *owner ) = 0; 73 | 74 | virtual void Compaction(uint8_t type) = 0; 75 | 76 | virtual void print_stats() = 0; 77 | 78 | virtual void SendBlocksTo(CompactionCtx *ctx, helper_cb cb, void* owner) = 0; 79 | }; 80 | 81 | 82 | #include 83 | typedef void (*block_alloc_cb)(Block *b, addr_t addr, void *owner); 84 | typedef void (*block_free_cb)(bool success, void *owner); 85 | typedef void (*install_blocks_cb)( void *owner); 86 | 87 | class BlockAlloc { 88 | public: 89 | virtual ~BlockAlloc() = default; 90 | 91 | virtual void AllocBlock(ThreadAlloc *alloc, uint8_t type, block_alloc_cb cb, void *owner) = 0; 92 | 93 | virtual void RemoveVirtAddr(addr_t addr, helper_cb cb, void *owner) = 0; 94 | 95 | virtual bool FreePhysBlock(_block_phys_addr_t addr, uint8_t type) = 0; 96 | 97 | virtual uint32_t GetBlockSize() const = 0; 98 | // get the thread that owns this address 99 | virtual ThreadAlloc *GetHomeAlloc(addr_t addr) = 0 ; 100 | 101 | virtual int GetHomeThreadMpIdx() const = 0 ; 102 | 103 | virtual void print_stats() = 0; 104 | // move blocks from one thread to another 105 | virtual void UpdateOwnership( std::forward_list *addresses, ThreadAlloc *newalloc, install_blocks_cb cb, void *owner) = 0; 106 | }; 107 | 108 | -------------------------------------------------------------------------------- /alloc/alloc_adapter.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * Allocation adapter that helps redirect allocation requests to threads depending on size-class 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #pragma once 12 | 13 | #include "size_table.hpp" 14 | #include "alloc.hpp" 15 | #include 16 | #include 17 | #include "../thread/messenger.hpp" 18 | 19 | /// alloc adapter is also responsible for triggering compaction 20 | class AllocAdapter{ 21 | private: 22 | 23 | AllocAdapter(uint32_t threshold_popular_class, uint32_t threshold_size_class); 24 | 25 | static AllocAdapter& getInstanceImpl(uint32_t threshold_popular_class = 0, uint32_t threshold_size_class = 0) 26 | { 27 | static AllocAdapter instance{ threshold_popular_class, threshold_size_class}; 28 | return instance; 29 | } 30 | 31 | public: 32 | static AllocAdapter& getInstance() 33 | { 34 | return getInstanceImpl(); 35 | } 36 | 37 | static void init(uint32_t threshold_popular_class, uint32_t threshold_size_class) 38 | { 39 | getInstanceImpl(threshold_popular_class,threshold_size_class ); 40 | } 41 | 42 | AllocAdapter(AllocAdapter const&) = delete; 43 | void operator=(AllocAdapter const&) = delete; 44 | 45 | 46 | public: 47 | void RegThread(ThreadAlloc* t, uint32_t id); 48 | std::atomic* GetBstats( ); 49 | 50 | 51 | void Alloc(uint32_t home_thread_id, uint32_t user_size, void *owner); 52 | void Free(uint32_t home_thread_id, client_addr_t client_addr, void *owner); 53 | 54 | 55 | void processAllocReply(client_addr_t ret_addr); 56 | void processFreeReply(int status, uint8_t type ); 57 | 58 | static void AllocReplyCb( client_addr_t ret_addr, void *owner); 59 | static void FreeReplyCb(int status, addr_t newaddr, void *owner); 60 | 61 | static void CompactionTmsg(mts::thread_msg_t *tmsg); 62 | static void CompactionMsgCb(void *owner); 63 | 64 | static void CompactionMsgRetCb(mts::thread_msg_t *msg); 65 | 66 | void print_stats(){ 67 | info(log_fp, "[AllocAdapter] Stats; Compaction times in us:\n"); 68 | 69 | for(uint32_t i = 0; i alloc_stats[SizeTable::ClassCount]; 84 | std::atomic bstat[SizeTable::ClassCount]; 85 | std::atomic pending[SizeTable::ClassCount]; 86 | std::vector all_allocs; 87 | 88 | std::vector> compaction_statistics; 89 | 90 | const uint32_t num_threads; 91 | const uint32_t threshold_popular_class; 92 | const uint32_t threshold_size_class; 93 | 94 | ThreadAlloc* getAlloc(uint32_t id); 95 | uint32_t get_thread_id(uint32_t size , uint32_t home_thread_id); 96 | uint8_t get_compaction_master(uint8_t type); 97 | void trigger_type_collection(uint8_t type, bool with_compaction = false); 98 | int get_best_compaction_candidate(); 99 | void finishCompaction(ThreadAlloc::CompactionCtx* ctx); 100 | }; 101 | 102 | 103 | /***************************************************************************** 104 | 105 | Implementation of alloc adapter 106 | 107 | ******************************************************************************/ 108 | 109 | 110 | 111 | AllocAdapter::AllocAdapter(uint32_t threshold_popular_class, uint32_t threshold_size_class): 112 | num_threads(mts::num_threads), threshold_popular_class(threshold_popular_class),threshold_size_class(threshold_size_class) 113 | { 114 | all_allocs.resize(num_threads); 115 | for(uint32_t i = 0; i* AllocAdapter::GetBstats(){ 129 | return this->bstat; 130 | } 131 | 132 | 133 | uint32_t AllocAdapter::get_thread_id(uint32_t user_size, uint32_t home_thread_id){ 134 | //assert(0 && "Not implemented"); 135 | uint8_t type = SizeTable::getInstance().GetClassFromUserSize(user_size); 136 | 137 | if(type < threshold_size_class || alloc_stats[type].load(std::memory_order_relaxed) > threshold_popular_class ){ 138 | return home_thread_id; 139 | 
} 140 | 141 | text(log_fp, "Redirect alloc to another thread! \n"); 142 | return type % num_threads; 143 | } 144 | 145 | 146 | void AllocAdapter::Alloc(uint32_t home_thread_id, uint32_t user_size, void *owner){ 147 | uint32_t thread_id = get_thread_id(user_size, home_thread_id); 148 | all_allocs[home_thread_id]->AllocAtHome(all_allocs[thread_id], user_size, AllocReplyCb, owner); 149 | } 150 | 151 | 152 | 153 | void AllocAdapter::processAllocReply( client_addr_t ret_addr ) { 154 | // for debugging. To activate one compaction after 5 allocs 155 | /* if( alloc_stats[ret_addr.comp.type] > 5 && !( pending[ret_addr.comp.type].load(std::memory_order_relaxed) ) && num_threads > 1 ){ 156 | 157 | text(log_fp, "(%d) Trigger collection! \n", mts::thread_id); 158 | trigger_type_collection(ret_addr.comp.type, true); 159 | } */ 160 | 161 | if( ret_addr.comp.addr == (0ULL) ){ 162 | int ret = get_best_compaction_candidate(); 163 | if(ret < 0){ 164 | //we run out of memory 165 | if (num_threads > 1 ){ 166 | // we will enforce one thread to have the class and allocate from that thread 167 | // This compaction will collect all not full blocks at one thread and ask to allocate the object 168 | trigger_type_collection(ret_addr.comp.type, false); 169 | } 170 | } else { 171 | // trigger compaction to find room for allocation 172 | uint8_t type = (uint8_t)ret; 173 | if( !pending[type].load(std::memory_order_relaxed) ){ 174 | 175 | if(num_threads > 1){ 176 | trigger_type_collection(ret_addr.comp.type, true); 177 | }else{ 178 | all_allocs[0]->Compaction(ret_addr.comp.type); 179 | } 180 | } 181 | } 182 | } else { 183 | alloc_stats[ret_addr.comp.type]++; 184 | } 185 | } 186 | 187 | // This compaction will collect all not full blocks at one thread and ask to allocate the object 188 | void AllocAdapter::trigger_type_collection(uint8_t type, bool with_compaction ){ 189 | bool expected = false; 190 | bool exchanged = pending[type].compare_exchange_strong(expected, true); 191 | 192 | if(!exchanged){ 193 | // only one collection at a time 194 | return; 195 | } 196 | 197 | text(log_fp, "(%d) Prepare compaction ctx! \n", mts::thread_id); 198 | 199 | 200 | uint8_t compaction_master = get_compaction_master(type); // the thread which will gather the blocks 201 | text(log_fp, "(%d) compaction_master for type %u is %u! \n", mts::thread_id,type, compaction_master); 202 | 203 | ThreadAlloc::CompactionCtx* ctx = new ThreadAlloc::CompactionCtx(num_threads, type, mts::thread_id, all_allocs[compaction_master], with_compaction); 204 | 205 | mts::thread_msg_t *tmsg = new mts::thread_msg_t(); 206 | tmsg->cb = &CompactionTmsg; 207 | tmsg->payload[0] = ctx; 208 | 209 | for(uint32_t thread_id = 0; thread_id < num_threads; thread_id++){ 210 | if(thread_id!=compaction_master && thread_id!=mts::thread_id){ 211 | mts::send_msg_to_thread_and_notify(thread_id, tmsg); 212 | text(log_fp, "Send ctx to thread ! %d \n", thread_id); 213 | } 214 | } 215 | 216 | if( mts::thread_id != compaction_master){ 217 | text(log_fp, "SendBlocksTo locally ctx to thread ! 
%d \n", mts::thread_id); 218 | all_allocs[mts::thread_id]->SendBlocksTo(ctx, CompactionMsgCb, tmsg); // there is a chance that this thread will be the last 219 | } 220 | } 221 | 222 | uint8_t AllocAdapter::get_compaction_master(uint8_t type){ 223 | return type % num_threads; 224 | } 225 | 226 | ThreadAlloc* AllocAdapter::getAlloc(uint32_t id) { 227 | return all_allocs[id]; 228 | } 229 | 230 | void AllocAdapter::CompactionTmsg(mts::thread_msg_t *tmsg){ 231 | ThreadAlloc* alloc = AllocAdapter::getInstance().getAlloc(mts::thread_id); 232 | ThreadAlloc::CompactionCtx* ctx = (ThreadAlloc::CompactionCtx*)(tmsg->payload[0]); 233 | alloc->SendBlocksTo(ctx, CompactionMsgCb, tmsg); 234 | } 235 | 236 | 237 | 238 | void AllocAdapter::CompactionMsgCb(void *owner){ 239 | mts::thread_msg_t *msg = (mts::thread_msg_t *)owner; 240 | ThreadAlloc::CompactionCtx* ctx = (ThreadAlloc::CompactionCtx*)msg->payload[0]; 241 | 242 | if(ctx->initiator != mts::thread_id){ 243 | msg->cb = &AllocAdapter::CompactionMsgRetCb; 244 | mts::send_msg_to_thread_and_notify(ctx->initiator, msg); 245 | return; 246 | } 247 | 248 | CompactionMsgRetCb(msg); 249 | } 250 | 251 | void AllocAdapter::CompactionMsgRetCb(mts::thread_msg_t *msg){ 252 | 253 | ThreadAlloc::CompactionCtx* ctx = (ThreadAlloc::CompactionCtx*)msg->payload[0]; 254 | text(log_fp,"Collection is completed for class %u %s \n", ctx->type, ctx->with_compaction ? "with compaction": ""); 255 | 256 | AllocAdapter::getInstance().finishCompaction(ctx); 257 | 258 | delete ctx; 259 | delete msg; 260 | } 261 | 262 | void AllocAdapter::finishCompaction( ThreadAlloc::CompactionCtx* ctx){ 263 | // measure latency 264 | auto t2 = std::chrono::high_resolution_clock::now(); 265 | uint64_t nanosec = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - ctx->t1 ).count(); 266 | compaction_statistics[ctx->type].push_back(nanosec); 267 | // measure latency 268 | 269 | pending[ctx->type].store( false ); 270 | } 271 | 272 | 273 | 274 | int AllocAdapter::get_best_compaction_candidate(){ 275 | std::vector res; 276 | res.resize(SizeTable::ClassCount); 277 | 278 | int type = -1; 279 | float best_score = 0; 280 | 281 | for( uint32_t i=0; i < SizeTable::ClassCount; i++){ 282 | uint64_t blocks_allocated = bstat[i].load(std::memory_order_relaxed); 283 | 284 | if( blocks_allocated && !( pending[i].load(std::memory_order_relaxed) ) ){ 285 | float score = 1.0 - 286 | (alloc_stats[i].load(std::memory_order_relaxed) + 0.0) / 287 | (blocks_allocated * SizeTable::getInstance().objects_per_class[i] ) ; 288 | if(score >best_score ){ 289 | best_score = score; 290 | type = i; 291 | } 292 | } 293 | } 294 | 295 | if(best_score > 0.3){ // some threshold 296 | return type; 297 | } 298 | 299 | return -1; 300 | } 301 | 302 | void AllocAdapter::Free(uint32_t home_thread_id, client_addr_t client_addr, void *owner){ 303 | all_allocs[home_thread_id]->Free(client_addr, FreeReplyCb, owner); 304 | } 305 | 306 | 307 | 308 | void AllocAdapter::processFreeReply(int status, uint8_t type){ 309 | if(status==SUCCESS){ 310 | alloc_stats[type]--; 311 | } 312 | } 313 | -------------------------------------------------------------------------------- /alloc/block.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * Block is a wraper over memfd descriptor that helps to create virtual addresses 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #pragma once 12 | #include 13 | #include "../common/common.hpp" 14 | 15 | 16 | class Block{ 17 | const int fd; 18 | const uint32_t offset_in_blocks; 19 | public: 20 | 21 | Block(int fd, uint32_t offset_in_blocks): fd(fd), offset_in_blocks(offset_in_blocks){ 22 | // nothing 23 | } 24 | 25 | uint32_t GetSize() const{ 26 | return BLOCK_SIZE; 27 | } 28 | 29 | _block_phys_addr_t GetPhysAddr() const { 30 | return _block_phys_addr_t({fd,offset_in_blocks}); 31 | } 32 | 33 | uint64_t CreateNewAddr() const{ 34 | // Make address alligned to the size of the block 35 | if(BLOCK_SIZE > 4096 ){ 36 | addr_t futurebuf = (addr_t)(char*)mmap(NULL, 2*BLOCK_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED| MAP_ANONYMOUS, -1, 0); 37 | addr_t alligned_addr = GetVirtBaseAddr(futurebuf); 38 | if(futurebuf - alligned_addr > 0){ 39 | alligned_addr+=BLOCK_SIZE; 40 | // printf(" %" PRIx64 " %" PRIx64 " " , futurebuf, ( alligned_addr - futurebuf ) ); 41 | munmap((void*)futurebuf, ( alligned_addr - futurebuf ) ) ; 42 | } 43 | uint64_t extra = futurebuf + 2*BLOCK_SIZE - alligned_addr - BLOCK_SIZE; 44 | if(extra > 0){ 45 | // printf(" %" PRIx64 " %" PRIx64 " " , alligned_addr + BLOCK_SIZE, extra ); 46 | munmap((void*)(alligned_addr + BLOCK_SIZE), extra ) ; 47 | } 48 | 49 | char* res = (char*)mmap((void*)alligned_addr, BLOCK_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset_in_blocks*BLOCK_SIZE); 50 | if (res == MAP_FAILED ){ 51 | perror("mmap failed with NULL"); 52 | exit(1); 53 | } 54 | return (uint64_t)alligned_addr; 55 | } else { 56 | char* alligned_addr = (char*)mmap(NULL, BLOCK_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset_in_blocks*BLOCK_SIZE); 57 | if (alligned_addr == MAP_FAILED ){ 58 | perror("mmap failed with NULL"); 59 | exit(1); 60 | } 61 | return (uint64_t)alligned_addr; 62 | } 63 | } 64 | 65 | void RemapVirtAddrToMe(addr_t virt_addr) const { 66 | char* ret = (char*)mmap(ADDR_T_TO_PTR(virt_addr), BLOCK_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED , fd, offset_in_blocks*BLOCK_SIZE); 67 | if (ret == MAP_FAILED){ 68 | perror("mmap when is mapped to file with MAP_FIXED"); 69 | exit(1); 70 | } 71 | } 72 | 73 | ~Block(){ 74 | /* nothing */ 75 | } 76 | 77 | }; 78 | 79 | -------------------------------------------------------------------------------- /alloc/block_alloc.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * Implmenetation of a block allocator. It also manages ownerships of each block. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include "alloc.hpp" 15 | 16 | 17 | #include "../utilities/block_home_table.h" 18 | 19 | #include 20 | #include 21 | 22 | #include "superblock.hpp" 23 | 24 | 25 | 26 | class BlockAllocImpl: public BlockAlloc { 27 | 28 | 29 | public: 30 | 31 | BlockAllocImpl(uint32_t thread_id, uint32_t prealloc_superblock_num, std::atomic *bstat = NULL): 32 | home_thread_id(thread_id) ,_superblock_counter (0), bstat(bstat) 33 | { 34 | preallocate_superblocks(prealloc_superblock_num); 35 | 36 | } 37 | 38 | virtual void AllocBlock(ThreadAlloc *alloc, uint8_t type, block_alloc_cb cb, void *owner) override; 39 | 40 | virtual void RemoveVirtAddr(addr_t addr, helper_cb cb, void *owner) override; 41 | 42 | virtual bool FreePhysBlock(_block_phys_addr_t addr, uint8_t type) override; 43 | 44 | virtual uint32_t GetBlockSize() const override; 45 | 46 | virtual ThreadAlloc *GetHomeAlloc(addr_t addr) override; 47 | 48 | virtual int GetHomeThreadMpIdx() const override{ 49 | return home_thread_id; 50 | } 51 | 52 | virtual void print_stats() override{ 53 | 54 | info(log_fp, "[BlockAllocImpl(%d)] Stats; \n", this->home_thread_id ); 55 | // todo. print stats. 56 | } 57 | 58 | 59 | 60 | virtual void UpdateOwnership( std::forward_list *addresses, ThreadAlloc *newalloc, install_blocks_cb cb, void *owner) override{ 61 | assert(this->home_thread_id == mts::thread_id && "Only home thread can modify"); 62 | while (!addresses->empty()){ 63 | addr_t addr = addresses->front(); 64 | addresses->pop_front(); 65 | 66 | text(log_fp, "[UpdateOwnership] Find %" PRIx64 " \n",addr ); 67 | ThreadAlloc * alloc = home_table.Lookup(addr); 68 | assert(alloc!=nullptr && "no addr in table"); 69 | 70 | bool changed = home_table.Update(addr, newalloc); 71 | assert(changed && "Failed to update owner of block addr"); 72 | } 73 | 74 | if(cb!=NULL){ 75 | cb(owner); 76 | } 77 | } 78 | 79 | // for debugging 80 | Block* AllocBlock(); 81 | 82 | 83 | ~BlockAllocImpl(){ 84 | 85 | for (auto const& item : all_superblocks) 86 | { 87 | delete item.second; 88 | } 89 | } 90 | 91 | 92 | private: 93 | const int home_thread_id; 94 | 95 | uint32_t _superblock_counter; 96 | 97 | BlockHomeTable home_table; 98 | std::atomic *const bstat; 99 | std::unordered_map all_superblocks; 100 | std::list free_superblocks; 101 | void preallocate_superblocks(uint32_t prealloc_superblock_num); 102 | 103 | 104 | 105 | }; 106 | 107 | 108 | 109 | ThreadAlloc* BlockAllocImpl::GetHomeAlloc(addr_t addr){ 110 | 111 | ThreadAlloc * alloc = home_table.Lookup(addr); 112 | 113 | return alloc; 114 | } 115 | 116 | 117 | uint32_t BlockAllocImpl::GetBlockSize() const{ 118 | return BLOCK_SIZE; 119 | } 120 | 121 | 122 | void BlockAllocImpl::RemoveVirtAddr(addr_t addr, helper_cb cb, void *owner){ 123 | home_table.Remove(addr); 124 | text(log_fp, " RemoveVirtAddr %" PRIx64 " \n",addr); 125 | int ret = munmap((void*)addr, BLOCK_SIZE); 126 | assert(ret==0 && "munmap failed"); 127 | if(cb!=NULL){ 128 | cb(owner); 129 | } 130 | } 131 | 132 | Block* BlockAllocImpl::AllocBlock(){ 133 | SuperBlock *sb; 134 | if(free_superblocks.empty()){ 135 | sb = new SuperBlock(_superblock_counter++); 136 | all_superblocks.insert({sb->getFD(), sb}); 137 | free_superblocks.push_front(sb); 138 | } 139 | sb = free_superblocks.front(); 140 | 141 | 142 | Block* b = sb->allocateBlock(); 143 | if(sb->isFull()){ 144 | free_superblocks.pop_front(); 145 | } 146 | 147 | return b; 148 | } 149 | 150 | void BlockAllocImpl::AllocBlock(ThreadAlloc *alloc, 
uint8_t type, block_alloc_cb cb, void *owner){ 151 | 152 | Block *b = AllocBlock(); 153 | text(log_fp, "[BlockAllocImpl] insert %p \n",b); 154 | addr_t addr = b->CreateNewAddr(); 155 | text(log_fp, "[BlockAllocImpl] insert %" PRIx64 " \n",addr); 156 | home_table.Insert(home_thread_id, addr, alloc); 157 | text(log_fp, "[BlockAllocImpl] insert %" PRIx64 " \n",addr); 158 | assert(alloc==home_table.Lookup(addr) && "home_table does not work correctly"); 159 | assert(alloc!=nullptr && "home_table does not work correctly"); 160 | 161 | if(bstat!=NULL){ 162 | bstat[type]++; 163 | } 164 | 165 | cb(b, addr, owner); 166 | } 167 | 168 | 169 | bool BlockAllocImpl::FreePhysBlock(_block_phys_addr_t phys, uint8_t type){ 170 | if(bstat!=NULL){ 171 | bstat[type]--; 172 | } 173 | auto it = all_superblocks.find(phys.fd); 174 | assert(it != all_superblocks.end()); 175 | SuperBlock* sb = it->second; 176 | 177 | bool wasFull = sb->isFull(); 178 | 179 | sb->freeBlock(phys); 180 | 181 | if(wasFull){ 182 | free_superblocks.push_front(sb); 183 | } 184 | 185 | return true; 186 | }; 187 | 188 | 189 | 190 | void BlockAllocImpl::preallocate_superblocks(uint32_t prealloc_superblock_num){ 191 | text(log_fp, "\t\t\t[BlockAllocImpl] preallocate_blocks %" PRIu32 " blocks \n",prealloc_superblock_num ); 192 | for(uint32_t i=0; i< prealloc_superblock_num; i++){ 193 | SuperBlock* sb = new SuperBlock(_superblock_counter++); 194 | free_superblocks.push_front(sb); 195 | all_superblocks.insert({sb->getFD(), sb}); 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /alloc/local_block.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * Implmenetation of a thread-local block. It helps to allocate addresses and manage metadata. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #pragma once 12 | 13 | #include "block.hpp" 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | 22 | struct RandomGenerator{ 23 | 24 | std::default_random_engine generator; 25 | std::uniform_int_distribution dis; // not that I hard-coded only 2^16 ids. Change to have more 26 | 27 | RandomGenerator(uint32_t seed):generator(std::default_random_engine(seed)), dis(0,0xFFFF){ // not that I hard-coded only 2^16 ids. Change to have more 28 | 29 | } 30 | uint16_t GetNewRandomNumber(){ // not that I hard-coded only 2^16 ids. 
Change to have more 31 | return dis(generator); 32 | } 33 | }; 34 | 35 | 36 | 37 | struct LocalBlock 38 | { 39 | 40 | LocalBlock(RandomGenerator &gen, Block* b, uint8_t type, uint16_t slot_size): 41 | _gen(gen), _b(b), _size(_b->GetSize()), _type(type), _slot_size(slot_size), _slots( (BLOCK_USEFUL_SIZE) / slot_size) 42 | { 43 | _freeslots = _slots; 44 | _allocatedslots = 0; 45 | _obj_ids.clear(); 46 | 47 | 48 | std::vector temp_vector_for_shuffle; 49 | temp_vector_for_shuffle.reserve(_slots); 50 | for(uint32_t slot_id= _slots-1; slot_id > 0; slot_id--){ 51 | 52 | temp_vector_for_shuffle.push_back(slot_id*slot_size); 53 | } 54 | temp_vector_for_shuffle.push_back(0); 55 | 56 | std::shuffle ( temp_vector_for_shuffle.begin(), temp_vector_for_shuffle.end(), _gen.generator); 57 | 58 | _free_list = std::forward_list(temp_vector_for_shuffle.begin(), temp_vector_for_shuffle.end()); 59 | 60 | 61 | } 62 | 63 | std::list< struct ibv_mr* > all_virt_addr; 64 | 65 | RandomGenerator &_gen; 66 | Block * const _b; 67 | const uint16_t _size; 68 | const uint8_t _type; 69 | const uint16_t _slot_size; 70 | const uint16_t _slots; 71 | 72 | uint16_t _allocatedslots; 73 | uint16_t _freeslots; 74 | 75 | std::forward_list _free_list; 76 | // it stores offsets 77 | std::map _obj_ids; // obj_id to offset 78 | 79 | uint32_t hasObjects( ) const { 80 | return _obj_ids.size(); 81 | } 82 | 83 | void RemapVirtAddrToMe(addr_t addr) const { 84 | _b->RemapVirtAddrToMe(addr); 85 | } 86 | 87 | void AddNewVirtAddr(struct ibv_mr* mr){ 88 | if(all_virt_addr.empty()){ 89 | mr->lkey = 0; // I reuse lkey as allocated counted 90 | } 91 | all_virt_addr.push_back(mr); 92 | } 93 | 94 | void GetAllAddrs(std::forward_list *list){ 95 | for(auto &mr: all_virt_addr){ 96 | list->push_front((addr_t)mr->addr); 97 | } 98 | } 99 | 100 | struct ibv_mr* RemoveVirtAddr(addr_t addr){ 101 | auto it = std::find_if(all_virt_addr.begin(), all_virt_addr.end(), [&addr] (struct ibv_mr* mr) { return (uint64_t)mr->addr == addr; }); 102 | struct ibv_mr* mr = *it; 103 | all_virt_addr.erase(it); 104 | return mr; 105 | } 106 | 107 | struct ibv_mr* PopVirtAddr(){ 108 | if(all_virt_addr.empty()){ 109 | return NULL; 110 | } 111 | 112 | struct ibv_mr* mr = all_virt_addr.front(); 113 | all_virt_addr.pop_front(); 114 | return mr; 115 | } 116 | 117 | addr_t GetBaseAddr() const{ 118 | assert(!all_virt_addr.empty()); 119 | return (addr_t)(all_virt_addr.front()->addr); 120 | } 121 | 122 | uint32_t GetRKey() const{ 123 | assert(!all_virt_addr.empty()); 124 | return (all_virt_addr.front()->rkey); 125 | } 126 | 127 | _block_phys_addr_t GetPhysAddr() const{ 128 | return _b->GetPhysAddr(); 129 | } 130 | 131 | uint8_t GetType() const{ 132 | return _type; 133 | } 134 | 135 | uint16_t GetSlotSize() const{ 136 | return _slot_size; 137 | } 138 | 139 | offset_t AllocSlot( ){ 140 | offset_t offset = _free_list.front(); 141 | _free_list.pop_front(); 142 | _freeslots--; 143 | _allocatedslots++; 144 | 145 | all_virt_addr.front()->lkey++; // use to count objects in this virtaddr 146 | 147 | return offset; 148 | } 149 | 150 | 151 | offset_t AllocObject(uint16_t *obj_id){ 152 | text(log_fp, "\t\t\t[LocalBlock] AllocObject \n"); 153 | 154 | offset_t offset = AllocSlot(); 155 | 156 | // find free id 157 | uint16_t number = (_gen.GetNewRandomNumber() & mask_of_bits(ID_SIZE_BITS)); // bits ID_SIZE_BITS 158 | auto it = _obj_ids.find (number); 159 | 160 | while ( it != _obj_ids.end() ){ 161 | number = _gen.GetNewRandomNumber() & mask_of_bits(ID_SIZE_BITS); 162 | it = _obj_ids.find (number); 163 | } 
164 | 165 | text(log_fp, "\t\t\t[LocalBlock] Assigned obj_id = %" PRIu16 " \n", number); 166 | 167 | _obj_ids.insert({number, offset}); 168 | *obj_id = number; 169 | 170 | return offset; 171 | } 172 | 173 | offset_t FindObject( uint16_t obj_id){ 174 | text(log_fp, "\t\t\t[LocalBlock] Find obj_id = %" PRIu16 " \n", obj_id); 175 | 176 | auto it = _obj_ids.find (obj_id); 177 | assert(it != _obj_ids.end()); 178 | 179 | return it->second; 180 | } 181 | 182 | 183 | offset_t RemoveObject(uint16_t obj_id){ 184 | 185 | text(log_fp, "\t\t\t[LocalBlock] Remove obj_id = %" PRIu16 " \n", obj_id); 186 | 187 | auto it = _obj_ids.find(obj_id); 188 | 189 | if(it == _obj_ids.end()){ 190 | info(log_fp, "\t\t\t[LocalBlock] obj_id = %" PRIu16 " does not exist \n", obj_id); 191 | return std::numeric_limits::max(); 192 | } 193 | 194 | offset_t offset = it->second; 195 | 196 | _obj_ids.erase(it); 197 | _free_list.push_front(offset); 198 | 199 | _freeslots++; 200 | _allocatedslots--; 201 | return offset; 202 | } 203 | 204 | bool is_full() const{ 205 | return _freeslots == 0; 206 | } 207 | 208 | bool RemoveOneAddr(addr_t old_addr){ 209 | auto it = std::find_if(all_virt_addr.begin(), all_virt_addr.end(), [&old_addr] (struct ibv_mr* mr) { return (uint64_t)mr->addr == old_addr; }); 210 | struct ibv_mr* mr = *it; 211 | mr->lkey--; 212 | bool can_be_unmapped = false; 213 | 214 | // if mr is not the main one and we deallocated all objects 215 | if(all_virt_addr.front() != mr && mr->lkey==0 ){ 216 | can_be_unmapped = true; 217 | } 218 | return can_be_unmapped; 219 | } 220 | 221 | bool Compactible(LocalBlock* from){ 222 | 223 | if( from->_type != this->_type || this->_freeslots < from->_allocatedslots){ 224 | return false; 225 | } 226 | 227 | // it assumes that both maps are sorted. 
Use merge sort to find intersections 228 | auto A_it = this->_obj_ids.begin(); 229 | auto B_it = from->_obj_ids.begin(); 230 | for (; A_it != this->_obj_ids.end() && B_it != from->_obj_ids.end(); ) 231 | { 232 | if(A_it->first == B_it->first){ 233 | return false; 234 | } 235 | 236 | if(A_it->first < B_it->first) 237 | { 238 | A_it++; 239 | }else{ 240 | B_it++; 241 | } 242 | } 243 | 244 | return true; 245 | } 246 | 247 | 248 | 249 | void AddEntriesFrom(LocalBlock* from) 250 | { 251 | text(log_fp, "\t\t\t[LocalBlock] Moving data from one block to another \n"); 252 | 253 | addr_t from_base = from->GetBaseAddr(); 254 | addr_t to_base = this->GetBaseAddr(); 255 | 256 | std::set set_free_list (_free_list.cbegin(),_free_list.cend() ); 257 | 258 | 259 | // here we try to place objects at the same offsets 260 | for (auto it = from->_obj_ids.cbegin(); it!=from->_obj_ids.cend(); /* no increment */) 261 | { 262 | 263 | auto sit = set_free_list.find(it->second); 264 | if ( sit != set_free_list.end() ) 265 | { 266 | 267 | offset_t offset = it->second; 268 | this->_obj_ids.insert({it->first, offset}); 269 | text(log_fp, "\t\t\t[LocalBlock] Moved object to the same offset %" PRIx32 " \n", offset); 270 | 271 | while(!ReaderWriter::try_lock_slot_for_compaction(from_base+offset)) { /* empty */}; 272 | memcpy((void*)(to_base+offset), (void*)(from_base+offset), _slot_size ); 273 | while(!ReaderWriter::unlock_slot_from_compaction(to_base+offset)) {/* empty */ }; 274 | 275 | this->_freeslots--; 276 | this->_allocatedslots++; 277 | from->_obj_ids.erase(it++); 278 | set_free_list.erase(sit); 279 | } 280 | else 281 | { 282 | ++it; 283 | } 284 | } 285 | 286 | _free_list.clear(); 287 | 288 | if(!set_free_list.empty()){ 289 | std::vector temp_vector_for_shuffle(set_free_list.begin(), set_free_list.end()); 290 | std::shuffle ( temp_vector_for_shuffle.begin(), temp_vector_for_shuffle.end(), _gen.generator); 291 | _free_list = std::forward_list(temp_vector_for_shuffle.begin(), temp_vector_for_shuffle.end()); 292 | } 293 | 294 | // here we try to place blocks at other offsets 295 | for(auto it = from->_obj_ids.begin(); it!=from->_obj_ids.end(); it++ ){ 296 | 297 | addr_t offset = AllocSlot(); 298 | text(log_fp, "\t\t\t[LocalBlock] Moved object to a new offset %" PRIx64 " \n", offset); 299 | this->_obj_ids.insert({it->first, offset}); 300 | 301 | while(!ReaderWriter::try_lock_slot_for_compaction(from_base+it->second)) { /* empty */}; 302 | memcpy((void*)(to_base+offset), (void*)(from_base+it->second), _slot_size ); 303 | while(!ReaderWriter::unlock_slot_from_compaction(to_base+offset)) {/* empty */ }; 304 | } 305 | } 306 | 307 | 308 | }; 309 | // helps to sort classes in c++ 310 | struct LocalBlockComp 311 | { 312 | using is_transparent = void; 313 | bool operator()(const LocalBlock* lhs, const LocalBlock* rhs) const { 314 | if(lhs->_freeslots == rhs->_freeslots){ 315 | return lhs < rhs; 316 | } else { 317 | return lhs->_freeslots < rhs->_freeslots; 318 | } 319 | } 320 | 321 | 322 | bool operator() (const LocalBlock* lhs, const uint32_t val) const 323 | { 324 | return lhs->_freeslots < val; 325 | } 326 | }; 327 | -------------------------------------------------------------------------------- /alloc/memfd.h: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * It is a helper class to use memfd in old linux kernels. I do not use it anymore. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #ifndef _MEMFD_H 13 | #define _MEMFD_H 14 | 15 | /* 16 | * No glibc wrappers exist for memfd_create(2), so provide our own. 17 | * 18 | * Also define memfd fcntl sealing macros. While they are already 19 | * defined in the kernel header file , that file as 20 | * a whole conflicts with the original glibc header . 21 | */ 22 | 23 | static inline int memfd_create(const char *name, unsigned int flags) { 24 | return syscall(__NR_memfd_create, name, flags); 25 | } 26 | 27 | #ifndef F_LINUX_SPECIFIC_BASE 28 | #define F_LINUX_SPECIFIC_BASE 1024 29 | #endif 30 | 31 | #ifndef F_ADD_SEALS 32 | #define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) 33 | #define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) 34 | 35 | #define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ 36 | #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ 37 | #define F_SEAL_GROW 0x0004 /* prevent file from growing */ 38 | #define F_SEAL_WRITE 0x0008 /* prevent writes */ 39 | #endif 40 | 41 | #endif /* _MEMFD_H */ 42 | -------------------------------------------------------------------------------- /alloc/size_table.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * It is a helper class to map user sizes to slot sizes. Note that I need to add cache verions and a header to each user's object. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | 15 | #include "../utilities/debug.h" 16 | #include "../common/common.hpp" 17 | 18 | 19 | class SizeTable{ 20 | public: 21 | // We want 256 classes so they can be indexed with uint8_t. 22 | static const uint32_t ClassCount = 64; 23 | // Align to word boundary. 24 | public: 25 | static SizeTable& getInstance() 26 | { 27 | static SizeTable instance; // Guaranteed to be destroyed. 28 | // Instantiated on first use. 29 | return instance; 30 | } 31 | 32 | SizeTable(SizeTable const&) = delete; 33 | void operator=(SizeTable const&) = delete; 34 | 35 | uint32_t objects_per_class[ClassCount]; // for compaction 36 | 37 | private: 38 | static const uint32_t ClassAlignmentLog = 3; 39 | static const uint32_t ClassAlignment = (1 << ClassAlignmentLog); 40 | 41 | // The fist class is 64 bytes as we are unlikely to need more. 42 | static const uint32_t FirstClassSize = 8; 43 | 44 | static const uint32_t MaxAllowedFragmentation = 128; 45 | 46 | // Size of the lookup table. 
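// GetClass() below resolves sizes up to MaxSizeInLut bytes with a single lut[] lookup; larger sizes fall back to binary search via SearchClass().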
47 | static const uint32_t LutBytes = 256; 48 | 49 | static const uint32_t MaxSizeInLut = LutBytes << ClassAlignmentLog; 50 | 51 | 52 | uint32_t size_map[ClassCount]; // real size 53 | uint8_t lut[LutBytes]; 54 | uint8_t last_lut_class; 55 | 56 | 57 | // private 58 | SizeTable(){ 59 | 60 | uint32_t current_size = FirstClassSize; 61 | const uint32_t last_class_size = BLOCK_USEFUL_SIZE; 62 | 63 | // populate the array used for lookups 64 | for(uint32_t i = 0; i < ClassCount; i++) { 65 | // set the size 66 | uint32_t num_obj = last_class_size / current_size; 67 | 68 | while((current_size <=last_class_size) && (last_class_size % current_size > MaxAllowedFragmentation || num_obj == last_class_size / current_size)){ 69 | current_size+=8; 70 | } 71 | 72 | num_obj = last_class_size / current_size; 73 | while((current_size <=last_class_size) && num_obj == last_class_size / current_size){ 74 | current_size+=8; 75 | } 76 | current_size-=8; 77 | 78 | size_map[i] = current_size; 79 | 80 | objects_per_class[i] = (BLOCK_USEFUL_SIZE)/ current_size; 81 | } 82 | 83 | // initialize the lut 84 | for(uint32_t i = 0; i < LutBytes;i++) { 85 | uint32_t size = i << ClassAlignmentLog; 86 | lut[i] = (uint8_t)SearchClass(size, 0); 87 | } 88 | 89 | last_lut_class = lut[LutBytes - 1]; 90 | 91 | }; 92 | 93 | 94 | public: 95 | 96 | void PrintTable(){ 97 | for(uint32_t i=0; i < ClassCount; i++ ){ 98 | info(log_fp,"Class[%u] -> real:%u user:%u;\n", i ,GetRealSize(i), GetUserSize(i)); 99 | } 100 | } 101 | 102 | uint32_t SearchClass(uint32_t size, uint32_t start_idx) const { 103 | // Binary search classes with the caveat that we are not 104 | // looking for the exact match, but for the next larger size. 105 | uint32_t end_idx = ClassCount; 106 | 107 | while(start_idx < end_idx) { 108 | uint32_t mid_idx = (start_idx + end_idx) / 2; 109 | 110 | if(size_map[mid_idx] == size) { 111 | start_idx = end_idx = mid_idx; 112 | } else if(size_map[mid_idx] < size) { 113 | start_idx = mid_idx + 1; 114 | } else { 115 | end_idx = mid_idx; 116 | } 117 | } 118 | 119 | return end_idx; 120 | } 121 | 122 | 123 | uint8_t GetClassFromUserSize(uint32_t user_size) const{ 124 | uint32_t real_size = user_size + sizeof(slot_header_t) + (user_size+sizeof(slot_header_t)-1)/CACHELINE ; 125 | uint8_t class1 = GetClass(real_size); 126 | return GetUserSize(class1) >= user_size ? class1 : class1 + 1; 127 | }; 128 | 129 | 130 | uint8_t GetClass(uint32_t real_size) const{ 131 | if(real_size <= MaxSizeInLut) { 132 | return lut[(real_size + ClassAlignment - 1) >> ClassAlignmentLog]; 133 | } 134 | 135 | return SearchClass(real_size, last_lut_class); 136 | }; 137 | 138 | uint32_t GetRealSize(uint8_t size_class) const{ 139 | return size_map[size_class]; 140 | }; 141 | 142 | uint32_t GetUserSize(uint8_t size_class) const{ 143 | uint32_t real_size = size_map[size_class]; 144 | 145 | if(real_size <= CACHELINE && CACHELINE % real_size == 0){ 146 | return real_size - sizeof(slot_header_t); 147 | } else { 148 | return real_size - sizeof(slot_header_t) - (real_size+CACHELINE-1)/CACHELINE; 149 | } 150 | 151 | 152 | }; 153 | 154 | 155 | }; 156 | -------------------------------------------------------------------------------- /alloc/superblock.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * It is a super-block allocator. IT exists to use less fds to address physical memeory. 5 | * So I can use one physical region for multiple blocks. 
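* (Each SuperBlock is one memfd file of BLOCKS_IN_SUPERBLOCK * BLOCK_SIZE bytes, with a bitset tracking which blocks are still free.)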
6 | * 7 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 8 | * 9 | * Author(s): Konstantin Taranov 10 | * 11 | */ 12 | 13 | #pragma once 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #ifndef MFD_HUGE_SHIFT // for old linux kernels 23 | #include "memfd.h" 24 | #endif 25 | #include "block.hpp" 26 | #include 27 | 28 | 29 | class SuperBlock{ 30 | const uint32_t _id; 31 | int _fd; 32 | std::bitset _blocks; // 1 - is free, 0 - is allocated 33 | 34 | public: 35 | 36 | SuperBlock(uint32_t id): _id(id) { 37 | text(log_fp,"Create Block name: %s\n",std::to_string(_id).c_str() ); 38 | 39 | _fd = memfd_create( std::to_string(_id).c_str() , MFD_CLOEXEC ); 40 | if (_fd == -1){ 41 | exit(1); 42 | } 43 | 44 | int ret = ftruncate(_fd, BLOCKS_IN_SUPERBLOCK * BLOCK_SIZE); 45 | if (ret == -1){ 46 | exit(1); 47 | } 48 | // set all bits to true; 49 | _blocks.set(); 50 | text(log_fp,"Success for superblock %s with file descriptor %d \n",std::to_string(_id).c_str(), _fd); 51 | } 52 | 53 | 54 | uint32_t getID() const{ 55 | return _id; 56 | } 57 | 58 | int getFD() const{ 59 | return _fd; 60 | } 61 | 62 | size_t getSize() const{ 63 | return BLOCKS_IN_SUPERBLOCK * BLOCK_SIZE; 64 | } 65 | 66 | bool isFree() const{ 67 | return _blocks.all(); 68 | } 69 | 70 | bool isFull() const{ 71 | return _blocks.none(); 72 | } 73 | 74 | bool hasFreeBlocks() const{ 75 | return _blocks.any(); 76 | } 77 | 78 | Block* allocateBlock(){ 79 | assert(hasFreeBlocks() && "SuperBlock has no free blocks"); 80 | 81 | uint32_t offset_in_blocks = 0; 82 | for (uint32_t i = 0; i < _blocks.size(); ++i) { 83 | 84 | if(_blocks[i]){ 85 | _blocks.reset(i); // set bit to 0 86 | offset_in_blocks = i; 87 | break; 88 | } 89 | } 90 | return new Block(_fd, offset_in_blocks);; 91 | } 92 | 93 | bool freeBlock(_block_phys_addr_t phys){ 94 | assert(phys.fd == _fd && "attempt to deallocate to foreign block"); 95 | _blocks.set(phys.offset_in_blocks); // set bit to 1 96 | return true; 97 | } 98 | 99 | ~SuperBlock(){ 100 | text(log_fp,"\n========== DESTROY SUPERBLOCK[%u] ============ \n", _id); 101 | if(_fd!=-1){ 102 | close(_fd); 103 | } 104 | } 105 | 106 | }; 107 | 108 | -------------------------------------------------------------------------------- /common/common.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * It is a file responsible for corm's settings. It helps to choose size of pointers, addr_t, block headers. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | 13 | #pragma once 14 | 15 | #include "../utilities/debug.h" 16 | #include 17 | #include 18 | #include 19 | 20 | typedef unsigned __int128 uint128_t; 21 | 22 | 23 | static const uint64_t CACHELINE = 64ULL; 24 | static const uint64_t CACHELINE_MASK = ~((uint64_t)0x3F); 25 | 26 | static constexpr uint64_t mask_of_bits(uint16_t bits){ 27 | return ((uint64_t)1 << bits) - 1; 28 | } 29 | 30 | static_assert ( mask_of_bits(16) == 0xFFFF , "mask_of_bits does not work correctly") ; 31 | static_assert ( mask_of_bits(2) == 0b11 , "mask_of_bits does not work correctly") ; 32 | static_assert ( mask_of_bits(12) == 0xFFF , "mask_of_bits does not work correctly") ; 33 | 34 | // Note that in my experiments I manually changed the sizes and recompiled the code. 
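// For example, BLOCK_BIT_SIZE = 12 below yields 4 KiB blocks with 16 blocks per superblock, while the commented-out variant (BLOCK_BIT_SIZE = 30) yields 1 GiB blocks with a single block per superblock.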
35 | 36 | // 4KB blocks 37 | //static const size_t PAGE_SIZE = (4096); 38 | 39 | static const uint32_t BLOCK_BIT_SIZE = (12) ; 40 | static const uint32_t BLOCK_SIZE = (uint32_t)(1 << BLOCK_BIT_SIZE); 41 | 42 | #define BLOCKS_IN_SUPERBLOCK (16) 43 | 44 | /* 45 | static const uint32_t BLOCK_BIT_SIZE = (30) ; 46 | static const uint32_t BLOCK_SIZE = (uint32_t)(1 << BLOCK_BIT_SIZE); 47 | 48 | #define BLOCKS_IN_SUPERBLOCK (1) 49 | */ 50 | 51 | // client pointer: | object addr | object id | rkey | type | padding | 52 | // 128bits: | 48 bits | 16 bits | 32 bits | 8 bits | 24 | 53 | 54 | #define VIRTUAL_ADDRESS_SIZE (48) // on linux x86-64 it is guaranteed 55 | #define ID_SIZE_BITS (16) // 2^16 different ids 56 | #define TYPE_SIZE (8) 57 | #define BASE_ADDR_BITS (VIRTUAL_ADDRESS_SIZE - BLOCK_BIT_SIZE) // 36 58 | 59 | 60 | typedef union { 61 | uint128_t whole; 62 | struct { 63 | // uint64_t padding: 128 - TYPE_SIZE - 32 - ID_SIZE_BITS - VIRTUAL_ADDRESS_SIZE - 8; 64 | uint64_t version: 8; 65 | uint64_t type : TYPE_SIZE; 66 | uint64_t obj_id : ID_SIZE_BITS; 67 | uint64_t rkey : 32; 68 | uint64_t addr ;// : VIRTUAL_ADDRESS_SIZE; 69 | } comp; 70 | uint64_t parts[2]; 71 | } client_addr_t; 72 | 73 | typedef uint64_t addr_t; // actual memory address 74 | 75 | typedef uint32_t offset_t; // offset from base 76 | 77 | 78 | #define ADDR_T_TO_PTR(x) ((void*)x) 79 | 80 | 81 | typedef struct { 82 | int fd; 83 | uint32_t offset_in_blocks; 84 | } _block_phys_addr_t; 85 | 86 | 87 | typedef union { 88 | uint128_t whole; 89 | struct { 90 | uint64_t padding : 32 - TYPE_SIZE; 91 | uint64_t type : TYPE_SIZE; 92 | uint64_t rkey : 32; 93 | uint64_t base : 64; //BASE_ADDR_BITS 94 | } comp; 95 | uint64_t parts[2]; 96 | } block_header_t; 97 | 98 | static const uint32_t BLOCK_USEFUL_SIZE = (BLOCK_SIZE - sizeof(block_header_t)); 99 | 100 | // Slot header: | V obj | Lock | compaction | old addr | id | 101 | // |8 bits | 1 bit | 1 bits | BASE_ADDR_BITS bits | ID_SIZE_BITS bits | = 16 bytes per object 102 | 103 | struct slot_header_t{ 104 | uint64_t version : 8; 105 | uint64_t lock : 1; 106 | uint64_t allocated : 1; 107 | uint64_t compaction : 1; // the idea is that if slot is compacted it means the object most likely has been moved 108 | uint64_t padding: (53 - ID_SIZE_BITS - BASE_ADDR_BITS); 109 | uint64_t obj_id : ID_SIZE_BITS; 110 | uint64_t oldbase : BASE_ADDR_BITS; // is used to distinguish between native objects in block and one which came from another block 111 | }; 112 | 113 | 114 | 115 | static_assert(std::is_pod::value, "slot_header_t is not POD"); 116 | static_assert(std::is_pod::value, "client_addr_t is not POD"); 117 | static_assert(std::is_pod::value, "block_header_t is not POD"); 118 | static_assert(sizeof(client_addr_t) == sizeof(uint128_t), "client_addr_t is not 128 bits?"); 119 | static_assert(sizeof(slot_header_t) == sizeof(uint64_t), "slot_header_t is not 64 bits?"); 120 | static_assert(sizeof(uint64_t) == sizeof(unsigned long), " Type check "); 121 | static_assert(sizeof(uint64_t) == sizeof(unsigned long long), " Type check "); 122 | 123 | 124 | inline addr_t GetVirtBaseAddr(addr_t addr){ 125 | return (addr >> BLOCK_BIT_SIZE) << BLOCK_BIT_SIZE ; 126 | } 127 | 128 | inline addr_t GetAddressOffset(addr_t addr){ 129 | return addr & mask_of_bits(BLOCK_BIT_SIZE); 130 | } 131 | 132 | inline uint8_t GetSlotType(addr_t addr){ 133 | addr_t baseaddr = GetVirtBaseAddr(addr); 134 | return ((block_header_t*)(baseaddr+BLOCK_USEFUL_SIZE))->comp.type; 135 | } 136 | 137 | inline addr_t GetSlotNewestBaseAddr(addr_t addr){ 138 
| addr_t baseaddr = GetVirtBaseAddr(addr); 139 | return (addr_t)(((block_header_t*)(baseaddr+BLOCK_USEFUL_SIZE))->comp.base); 140 | //return (addr_t)(((addr_t)((block_header_t*)(baseaddr+BLOCK_USEFUL_SIZE))->comp.base) << BLOCK_BIT_SIZE); 141 | } 142 | 143 | inline uint32_t GetSlotNewestRkey(addr_t addr){ 144 | addr_t baseaddr = GetVirtBaseAddr(addr); 145 | return (uint32_t)(((block_header_t*)(baseaddr+BLOCK_USEFUL_SIZE))->comp.rkey); 146 | } 147 | 148 | inline uint8_t GetSlotVersion(addr_t addr){ 149 | return (uint8_t)(((slot_header_t*)(char*)addr)->version); 150 | } 151 | 152 | inline uint16_t GetSlotObjId(addr_t addr){ 153 | return (uint16_t)(((slot_header_t*)(char*)addr)->obj_id); 154 | } 155 | 156 | inline client_addr_t CreateClientAddr(addr_t addr, uint32_t rkey, uint16_t obj_id, uint8_t version, uint8_t type){ 157 | client_addr_t client_addr; 158 | client_addr.comp.version = version; 159 | client_addr.comp.type = type; 160 | client_addr.comp.rkey = rkey; 161 | client_addr.comp.obj_id = obj_id; 162 | client_addr.comp.addr = addr; 163 | return client_addr; 164 | } 165 | 166 | 167 | inline addr_t GetObjectAddr(client_addr_t addr){ 168 | return (addr_t)(addr.comp.addr); 169 | } 170 | 171 | inline uint16_t GetObjId(client_addr_t client_addr){ 172 | return (uint16_t)client_addr.comp.obj_id; 173 | } 174 | 175 | 176 | -------------------------------------------------------------------------------- /compact.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple code to trigger compaction. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #include // std::cout 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | FILE *log_fp; 26 | 27 | 28 | #include "worker/client_api.hpp" 29 | #include "rdma/connectRDMA.hpp" 30 | 31 | #include "utilities/cxxopts.hpp" 32 | 33 | cxxopts::ParseResult 34 | parse(int argc, char* argv[]) 35 | { 36 | cxxopts::Options options(argv[0], "Trigger compaction for a given size"); 37 | options 38 | .positional_help("[optional args]") 39 | .show_positional_help(); 40 | 41 | try 42 | { 43 | 44 | options.add_options() 45 | ("server", "Another address", cxxopts::value(), "IP") 46 | ("size", "objects size", cxxopts::value()->default_value("24"), "N") 47 | ("help", "Print help") 48 | ; 49 | 50 | auto result = options.parse(argc, argv); 51 | 52 | if (result.count("help")) 53 | { 54 | std::cout << options.help({""}) << std::endl; 55 | exit(0); 56 | } 57 | 58 | if (!result.count("server")) 59 | { 60 | throw cxxopts::OptionException("input must be specified"); 61 | } 62 | 63 | 64 | 65 | return result; 66 | 67 | } catch (const cxxopts::OptionException& e) 68 | { 69 | std::cout << "error parsing options: " << e.what() << std::endl; 70 | std::cout << options.help({""}) << std::endl; 71 | exit(1); 72 | } 73 | } 74 | 75 | 76 | int main(int argc, char* argv[]){ 77 | 78 | 79 | auto allparams = parse(argc,argv); 80 | 81 | log_fp=stdout; 82 | 83 | std::string server = allparams["server"].as(); 84 | uint32_t size = allparams["size"].as(); 85 | 86 | ClientRDMA rdma((char*)server.c_str(),9999); 87 | struct rdma_cm_id * id = rdma.sendConnectRequest(); 88 | 89 | struct ibv_pd * pd = ClientRDMA::create_pd(id); 90 | 91 | struct ibv_qp_init_attr attr; 92 | struct rdma_conn_param conn_param; 
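// Set up a reliable-connection (RC) queue pair with small send/receive queues and dedicated completion queues before connecting to the CoRM server.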
93 | memset(&attr, 0, sizeof(attr)); 94 | attr.cap.max_send_wr = 32; 95 | attr.cap.max_recv_wr = 32; 96 | attr.cap.max_send_sge = 1; 97 | attr.cap.max_recv_sge = 1; 98 | attr.cap.max_inline_data = 0; 99 | attr.qp_type = IBV_QPT_RC; 100 | 101 | attr.send_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0); 102 | attr.recv_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0); 103 | 104 | memset(&conn_param, 0 , sizeof(conn_param)); 105 | conn_param.responder_resources = 0; 106 | conn_param.initiator_depth = 5; 107 | conn_param.retry_count = 3; 108 | conn_param.rnr_retry_count = 3; 109 | 110 | VerbsEP* ep = ClientRDMA::connectEP(id, &attr, &conn_param, pd); 111 | 112 | printf("Connected\n"); 113 | sleep(1); 114 | 115 | RemoteMemoryClient* RMAPI = new RemoteMemoryClient(0,ep); 116 | 117 | 118 | uint8_t slot_type = SizeTable::getInstance().GetClassFromUserSize(size); 119 | printf("Trigger compaction for size-class %u \n",slot_type); 120 | 121 | RMAPI->TriggerCompaction(slot_type, true, true); 122 | 123 | sleep(0.5); 124 | printf("done one\n"); 125 | 126 | return 0; 127 | } 128 | -------------------------------------------------------------------------------- /compaction_latency.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple code to measure compaction latency. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #include // std::cout 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | FILE *log_fp; 26 | 27 | 28 | #include "worker/client_api.hpp" 29 | #include "rdma/connectRDMA.hpp" 30 | 31 | #include "utilities/cxxopts.hpp" 32 | 33 | 34 | 35 | uint64_t num; 36 | 37 | cxxopts::ParseResult 38 | parse(int argc, char* argv[]) 39 | { 40 | cxxopts::Options options(argv[0], "Measure compaction latency"); 41 | options 42 | .positional_help("[optional args]") 43 | .show_positional_help(); 44 | 45 | try 46 | { 47 | 48 | options.add_options() 49 | ("server", "Another address", cxxopts::value(), "IP") 50 | ("t,threads", "the number of remote threads", cxxopts::value()->default_value(std::to_string(1)), "N") 51 | ("n,num", "Number of requests to run", cxxopts::value()->default_value("123"), "N") 52 | ("size", "objects size", cxxopts::value()->default_value("24"), "N") 53 | ("compaction", "enable compact objects") 54 | ("collection", "enable collection objects") 55 | ("help", "Print help") 56 | ; 57 | 58 | auto result = options.parse(argc, argv); 59 | 60 | if (result.count("help")) 61 | { 62 | std::cout << options.help({""}) << std::endl; 63 | exit(0); 64 | } 65 | 66 | if (!result.count("server")) 67 | { 68 | throw cxxopts::OptionException("input must be specified"); 69 | } 70 | 71 | 72 | 73 | return result; 74 | 75 | } catch (const cxxopts::OptionException& e) 76 | { 77 | std::cout << "error parsing options: " << e.what() << std::endl; 78 | std::cout << options.help({""}) << std::endl; 79 | exit(1); 80 | } 81 | } 82 | 83 | 84 | int main(int argc, char* argv[]){ 85 | 86 | 87 | auto allparams = parse(argc,argv); 88 | 89 | log_fp=stdout; 90 | 91 | std::string server = allparams["server"].as(); 92 | uint32_t threads = allparams["threads"].as(); 93 | uint32_t size = allparams["size"].as(); 94 | num = allparams["num"].as(); 95 | 96 | bool with_compaction = allparams.count("compaction"); 97 | bool 
with_collection = allparams.count("collection"); 98 | 99 | ClientRDMA rdma((char*)server.c_str(),9999); 100 | struct rdma_cm_id * id = rdma.sendConnectRequest(); 101 | 102 | struct ibv_pd * pd = ClientRDMA::create_pd(id); 103 | 104 | struct ibv_qp_init_attr attr; 105 | struct rdma_conn_param conn_param; 106 | memset(&attr, 0, sizeof(attr)); 107 | attr.cap.max_send_wr = 32; 108 | attr.cap.max_recv_wr = 32; 109 | attr.cap.max_send_sge = 1; 110 | attr.cap.max_recv_sge = 1; 111 | attr.cap.max_inline_data = 0; 112 | attr.qp_type = IBV_QPT_RC; 113 | 114 | attr.send_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0); 115 | attr.recv_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0); 116 | 117 | memset(&conn_param, 0 , sizeof(conn_param)); 118 | conn_param.responder_resources = 0; 119 | conn_param.initiator_depth = 5; 120 | conn_param.retry_count = 3; 121 | conn_param.rnr_retry_count = 3; 122 | 123 | std::vector conns; 124 | 125 | conns.push_back(ClientRDMA::connectEP(id, &attr, &conn_param, pd)); 126 | 127 | for(uint32_t i = 1 ; i < threads; i++){ 128 | struct rdma_cm_id * tid = rdma.sendConnectRequest(); 129 | attr.send_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0); 130 | attr.recv_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0); 131 | conns.push_back(ClientRDMA::connectEP(tid, &attr, &conn_param, pd)); 132 | } 133 | 134 | if(threads>1){ 135 | assert(conns[0]->qp->send_cq != conns[1]->qp->send_cq && "Different connections must use Different CQ") ; 136 | } 137 | 138 | printf("Connected\n"); 139 | sleep(1); 140 | 141 | std::vector RMAPI; 142 | for( auto ep:conns ){ 143 | RMAPI.push_back(new RemoteMemoryClient(0,ep)); 144 | } 145 | 146 | 147 | printf("Start compaction test \n"); 148 | for(uint32_t i = 0; i< num; i++){ 149 | std::vector objects; 150 | 151 | for( auto x : RMAPI ){ 152 | LocalObjectHandler* obj1 = x->Alloc(size); 153 | objects.push_back(obj1); 154 | } 155 | uint8_t slot_type = objects[0]->addr.comp.type; 156 | RMAPI[0]->TriggerCompaction(slot_type,with_collection, with_compaction); 157 | sleep(1.2); 158 | for(uint32_t i = 0; i < threads; i++){ 159 | RMAPI[i]->Free(objects[i]); 160 | } 161 | sleep(0.5); 162 | printf("done one\n"); 163 | } 164 | 165 | printf("Done compaction test\n"); 166 | 167 | return 0; 168 | } 169 | -------------------------------------------------------------------------------- /core.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # CoRM: Compactable Remote Memory over RDMA 5 | # 6 | # Help functions to deploy CoRM 7 | # 8 | # Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
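#
# Usage sketch (illustrative only; the IP address and sizes are placeholders,
# not values taken from this repository):
#
#   source core.sh
#   startCorm "10.0.0.1" "--threads=8"    # start the CoRM server over ssh
#   loadCorm 128 100000                   # load 100000 objects of 128 bytes
#   runLatency 128 1000 latency.txt       # run the latency client locally
#   killCorm                              # kill the remote server process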
9 | # 10 | # Author(s): Konstantin Taranov 11 | # 12 | 13 | 14 | 15 | define(){ IFS='\n' read -r -d '' ${1} || true; } 16 | redirection=( "> out" "2> err" "< /dev/null" ) 17 | 18 | declare -A processPids 19 | 20 | __count_process=0 21 | __corm="" 22 | 23 | 24 | WORKDIR="$PWD" 25 | LOCALWORKDIR="$PWD" 26 | __VERBOSE=1 27 | 28 | function log () { 29 | if [[ $__VERBOSE -ge 1 ]]; then 30 | echo -e "$@" 31 | fi 32 | } 33 | 34 | function debug () { 35 | if [[ $__VERBOSE -ge 2 ]]; then 36 | echo -e "$@" 37 | fi 38 | } 39 | 40 | scpFileTo(){ 41 | local server="$1" 42 | local filename="$2" 43 | local cmd=( "scp" "$2" "$USER@$server:${WORKDIR}/" ) 44 | debug "\t\tExecuting: ${cmd[@]}" 45 | $("${cmd[@]}") 46 | } 47 | 48 | scpFileFrom(){ 49 | local server="$1" 50 | local filename="$2" 51 | local cmd=("scp" "$USER@$server:${WORKDIR}/$2" ./) 52 | debug "\t\tExecuting: ${cmd[@]}" 53 | $("${cmd[@]}") 54 | } 55 | 56 | sshCommandAsync() { 57 | local server=$1 58 | local command=$2 59 | local valredirect="${redirection[@]}" 60 | if ! [[ -z $3 ]] 61 | then 62 | valredirect="> "$3" 2>/dev/null" 63 | fi 64 | local cmd=( "ssh" "-oStrictHostKeyChecking=no" "$USER@$server" "nohup" "$command" "$valredirect" "&" "echo \$!" ) 65 | local pid=$("${cmd[@]}") 66 | echo "$pid" 67 | } 68 | 69 | sshCommandSync() { 70 | local server="$1" 71 | local command="$2" 72 | local valredirect="${redirection[@]}" 73 | if ! [[ -z $3 ]] 74 | then 75 | valredirect="> "$3" 2>/dev/null" 76 | fi 77 | local cmd=( "ssh" "-oStrictHostKeyChecking=no" "$USER@$server" "$command" "$valredirect" ) 78 | debug "\t\tExecuting: ${cmd[@]}" 79 | $("${cmd[@]}") 80 | } 81 | 82 | sshKillCommand() { 83 | local server=$1 84 | local pid=$2 85 | cmd=( "ssh" "$USER@$server" "kill -9" "${pid}" ) 86 | debug "\t\tExecuting: ${cmd[@]}" 87 | $("${cmd[@]}") 88 | } 89 | 90 | sshStopCommand() { 91 | local server=$1 92 | local pid=$2 93 | cmd=( "ssh" "$USER@$server" "kill -2" "${pid}" ) 94 | debug "\t\tExecuting: ${cmd[@]}" 95 | $("${cmd[@]}") 96 | } 97 | 98 | startCorm(){ 99 | local server=$1 100 | local params=$2 101 | local pid=$(sshCommandAsync $server "${WORKDIR}/server --server=$server ${params}") 102 | log "\tCorm is started at ${server} with PID$pid and params ${params}" 103 | 104 | __corm="$server,$pid" 105 | __corm_server="$server" 106 | } 107 | 108 | loadCorm(){ 109 | local size=$1 110 | local num=$2 111 | local comm="$WORKDIR/load --server=${__corm_server} --num=$num --size=$size --threads=8 --randomize" 112 | log "\tStart loading server with $num elements with user size: $size" 113 | ${comm} 114 | log "\t Loading is done" 115 | } 116 | 117 | unloadCorm(){ 118 | local num=$1 119 | local comm="$WORKDIR/unload --server=${__corm_server} --num=$num " 120 | log "\tStart unloading server with $num elements" 121 | ${comm} 122 | log "\t unLoading is done" 123 | } 124 | 125 | killCorm(){ 126 | local servername=$( echo $__corm | cut -d, -f1) 127 | local pid=$( echo $__corm | cut -d, -f2) 128 | sshKillCommand $servername $pid 129 | log "\tCorm is killed at $servername" 130 | } 131 | 132 | stopCorm(){ 133 | local servername=$( echo $__corm | cut -d, -f1) 134 | local pid=$( echo $__corm | cut -d, -f2) 135 | sshStopCommand $servername $pid 136 | log "\tCorm is stoppped at $servername" 137 | } 138 | 139 | killAllProcesses(){ 140 | echo "try to kill ${__count_process} processes" 141 | echo "the dict has ${!processPids[@]} entries" 142 | for id in "${!processPids[@]}" 143 | do 144 | local temp=${processPids[$id]} 145 | local servername=$( echo $temp | cut -d, -f1) 146 | local 
pid=$( echo $temp | cut -d, -f2) 147 | sshKillCommand $servername $pid 148 | log "\tClient is killed at $servername" 149 | done 150 | processPids=() 151 | __count_process=0 152 | } 153 | 154 | runLatency(){ 155 | local size=$1 156 | local num=$2 157 | local filename=$3 158 | 159 | debug "runLatency " 160 | local comm="$WORKDIR/latency --server=${__corm_server} --num=$num --size=$size" 161 | log "\tStart runLatency with size=$size" 162 | ${comm} > $filename 163 | log "\t runLatency is done" 164 | } 165 | 166 | runWorkloadRemoteAsync(){ 167 | local server=$1 168 | local prob=$2 169 | local num=$3 170 | local threads=$4 171 | local seed=$5 172 | local flags=$6 173 | local filename=$7 174 | local comm="$WORKDIR/workload_readwrite --server=${__corm_server} --target=2000000 \ 175 | --prob=${prob} --num=${num} --threads=${threads} --seed=${seed} --input=${WORKDIR}/test.bin ${flags}" 176 | echo "$comm" 177 | local pid=$(sshCommandAsync $server "$comm" $filename) 178 | log "\tWorkload is started at ${server} with PID$pid" 179 | processPids[${__count_process}]="$server,$pid" 180 | __count_process=$((${__count_process}+1)) 181 | } 182 | -------------------------------------------------------------------------------- /latency.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple code to measuring latency of various requests. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #include // std::cout 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | 23 | #include "thread/thread.hpp" 24 | #include "utilities/timer.h" 25 | 26 | FILE *log_fp; 27 | 28 | #include "worker/client_api.hpp" 29 | #include "rdma/connectRDMA.hpp" 30 | 31 | 32 | #include "utilities/cxxopts.hpp" 33 | 34 | 35 | cxxopts::ParseResult 36 | parse(int argc, char* argv[]) 37 | { 38 | cxxopts::Options options(argv[0], "Various latency test for CoRM"); 39 | options 40 | .positional_help("[optional args]") 41 | .show_positional_help(); 42 | 43 | try 44 | { 45 | 46 | options.add_options() 47 | ("a,server", "Another address", cxxopts::value(), "IP") 48 | ("s,size", "Object size", cxxopts::value()->default_value("8"), "N") 49 | ("n,num", "Number of tests", cxxopts::value()->default_value("1024"), "N") 50 | ("help", "Print help") 51 | ; 52 | 53 | auto result = options.parse(argc, argv); 54 | 55 | if (result.count("help")) 56 | { 57 | std::cout << options.help({""}) << std::endl; 58 | exit(0); 59 | } 60 | 61 | if (!result.count("server")) 62 | { 63 | throw cxxopts::OptionException("input must be specified"); 64 | } 65 | 66 | 67 | 68 | return result; 69 | 70 | } catch (const cxxopts::OptionException& e) 71 | { 72 | std::cout << "error parsing options: " << e.what() << std::endl; 73 | std::cout << options.help({""}) << std::endl; 74 | exit(1); 75 | } 76 | } 77 | 78 | 79 | 80 | int main(int argc, char* argv[]){ 81 | auto allparams = parse(argc,argv); 82 | 83 | log_fp=stdout; 84 | 85 | std::string server = allparams["server"].as(); 86 | uint32_t size = allparams["size"].as(); 87 | uint32_t N = allparams["num"].as(); 88 | 89 | 90 | ClientRDMA rdma((char*)server.c_str(),9999); 91 | struct rdma_cm_id * id = rdma.sendConnectRequest(); 92 | 93 | struct ibv_pd * pd = ClientRDMA::create_pd(id); 94 | 95 | struct ibv_qp_init_attr attr; 96 | struct rdma_conn_param conn_param; 97 | memset(&attr, 
0, sizeof(attr)); 98 | attr.cap.max_send_wr = 32; 99 | attr.cap.max_recv_wr = 32; 100 | attr.cap.max_send_sge = 1; 101 | attr.cap.max_recv_sge = 1; 102 | attr.cap.max_inline_data = 0; 103 | attr.qp_type = IBV_QPT_RC; 104 | 105 | memset(&conn_param, 0 , sizeof(conn_param)); 106 | conn_param.responder_resources = 0; 107 | conn_param.initiator_depth = 5; 108 | conn_param.retry_count = 3; 109 | conn_param.rnr_retry_count = 3; 110 | 111 | 112 | VerbsEP* ep = ClientRDMA::connectEP(id, &attr, &conn_param, pd); 113 | 114 | printf("Connected\n"); 115 | sleep(1); 116 | 117 | RemoteMemoryClient* RMAPI = new RemoteMemoryClient(0,ep); 118 | 119 | 120 | std::vector time_alloc; 121 | std::vector time_free; 122 | time_alloc.reserve(N); 123 | time_free.reserve(N); 124 | 125 | printf("Start latency test for allocation/deallocation\n"); 126 | for(uint32_t i = 0; i< N; i++){ 127 | 128 | auto t1 = std::chrono::high_resolution_clock::now(); 129 | LocalObjectHandler* obj1 = RMAPI->Alloc(size); 130 | auto t2 = std::chrono::high_resolution_clock::now(); 131 | RMAPI->Free(obj1); 132 | auto t3 = std::chrono::high_resolution_clock::now(); 133 | 134 | auto alloc_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count(); 135 | auto free_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t3 - t2 ).count(); 136 | time_alloc.push_back(alloc_nano / (float)1000.0 ); 137 | time_free.push_back(free_nano / (float)1000.0 ); 138 | } 139 | 140 | printf("Start latency test for read/Write\n"); 141 | std::vector time_read; 142 | std::vector time_read_rdma; 143 | std::vector time_write; 144 | time_read.reserve(N); 145 | time_read_rdma.reserve(N); 146 | time_write.reserve(N); 147 | 148 | char* buffer = (char*)malloc(size); 149 | LocalObjectHandler* obj1 = RMAPI->Alloc(size); 150 | for(uint32_t i = 0; i< N; i++){ 151 | auto t1 = std::chrono::high_resolution_clock::now(); 152 | RMAPI->Read(obj1, buffer, size); 153 | auto t2 = std::chrono::high_resolution_clock::now(); 154 | auto read_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count(); 155 | time_read.push_back(read_nano / (float)1000.0 ); 156 | } 157 | 158 | for(uint32_t i = 0; i< N; i++){ 159 | auto t1 = std::chrono::high_resolution_clock::now(); 160 | RMAPI->ReadOneSided(obj1, buffer, size); 161 | auto t2 = std::chrono::high_resolution_clock::now(); 162 | auto read_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count(); 163 | time_read_rdma.push_back(read_nano / (float)1000.0 ); 164 | } 165 | 166 | for(uint32_t i = 0; i< N; i++){ 167 | auto t1 = std::chrono::high_resolution_clock::now(); 168 | RMAPI->Write(obj1, buffer, size, false); 169 | auto t2 = std::chrono::high_resolution_clock::now(); 170 | auto write_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count(); 171 | time_write.push_back(write_nano / (float)1000.0 ); 172 | } 173 | 174 | 175 | printf("Start latency test for fixpointer\n"); 176 | std::vector time_fix; 177 | std::vector time_fix_read; 178 | std::vector time_fix_read_rdma; 179 | std::vector time_fix_write; 180 | time_fix_write.reserve(N); 181 | time_fix_read.reserve(N); 182 | time_fix_read_rdma.reserve(N); 183 | time_fix.reserve(N); 184 | 185 | LocalObjectHandler* obj2 = RMAPI->Alloc(size); 186 | uint64_t direct_addr = obj2->addr.comp.addr; 187 | uint64_t base_addr = GetVirtBaseAddr(obj2->addr.comp.addr); 188 | if(base_addr == direct_addr){ 189 | printf("Warning! Object was block aligned! 
the test is not valid for it!\n"); 190 | } 191 | if(obj2->addr.comp.rkey==0) printf("zero rkey;"); 192 | printf("Start latency test for fix read\n"); 193 | for(uint32_t i = 0; i< N; i++){ 194 | obj2->addr.comp.addr = base_addr; // set wrong offset. so we will need to find the object by ID 195 | auto t1 = std::chrono::high_resolution_clock::now(); 196 | RMAPI->Read(obj2, buffer, size); 197 | auto t2 = std::chrono::high_resolution_clock::now(); 198 | auto read_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count(); 199 | time_fix_read.push_back(read_nano / (float)1000.0 ); 200 | } 201 | if(obj2->addr.comp.rkey==0) printf("zero rkey;"); 202 | 203 | printf("Start latency test for fix read onesided\n"); 204 | for(uint32_t i = 0; i< N; i++){ 205 | obj2->addr.comp.addr = base_addr; 206 | auto t1 = std::chrono::high_resolution_clock::now(); 207 | int ret = RMAPI->ReadOneSided(obj2, buffer, size); 208 | if(ret==NOT_FOUND){ 209 | RMAPI->ReadOneSidedFix(obj2, buffer, size); 210 | } 211 | auto t2 = std::chrono::high_resolution_clock::now(); 212 | auto read_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count(); 213 | time_fix_read_rdma.push_back(read_nano / (float)1000.0 ); 214 | } 215 | printf("Start latency test for fix write\n"); 216 | for(uint32_t i = 0; i< N; i++){ 217 | obj2->addr.comp.addr = base_addr; 218 | auto t1 = std::chrono::high_resolution_clock::now(); 219 | RMAPI->Write(obj2, buffer, size, false); 220 | auto t2 = std::chrono::high_resolution_clock::now(); 221 | auto write_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count(); 222 | time_fix_write.push_back(write_nano / (float)1000.0 ); 223 | } 224 | 225 | printf("Start latency test for fix fix pointer\n"); 226 | for(uint32_t i = 0; i< N; i++){ 227 | obj2->addr.comp.addr = base_addr; 228 | auto t1 = std::chrono::high_resolution_clock::now(); 229 | RMAPI->FixPointer(obj2); 230 | auto t2 = std::chrono::high_resolution_clock::now(); 231 | auto fix_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count(); 232 | time_fix.push_back(fix_nano / (float)1000.0 ); 233 | } 234 | 235 | 236 | 237 | std::vector time_rpc; 238 | std::vector time_rdma; 239 | time_rpc.reserve(N); 240 | time_rdma.reserve(N); 241 | 242 | printf("Start latency test for RDMA rpc and onesided\n"); 243 | for(uint32_t i = 0; i< N; i++){ 244 | 245 | auto t1 = std::chrono::high_resolution_clock::now(); 246 | RMAPI->RpcFake(buffer,size); 247 | auto t2 = std::chrono::high_resolution_clock::now(); 248 | RMAPI->ReadOneSidedFake(obj1,buffer,size); 249 | auto t3 = std::chrono::high_resolution_clock::now(); 250 | 251 | auto rpc_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count(); 252 | auto rdma_nano = std::chrono::duration_cast< std::chrono::nanoseconds >( t3 - t2 ).count(); 253 | time_rpc.push_back(rpc_nano / (float)1000.0 ); 254 | time_rdma.push_back(rdma_nano / (float)1000.0 ); 255 | } 256 | 257 | 258 | 259 | printf("Done\n"); 260 | RMAPI->Free(obj1); 261 | RMAPI->Free(obj2); 262 | 263 | printf("alloc: "); 264 | for(auto &x : time_alloc){ 265 | printf("%.2f ",x); 266 | } 267 | printf("\nfree: "); 268 | for(auto &x : time_free){ 269 | printf("%.2f ",x); 270 | } 271 | printf("\nread: "); 272 | for(auto &x : time_read){ 273 | printf("%.2f ",x); 274 | } 275 | printf("\nreadrdma: "); 276 | for(auto &x : time_read_rdma){ 277 | printf("%.2f ",x); 278 | } 279 | printf("\nwrite: "); 280 | for(auto &x : time_write){ 281 | printf("%.2f ",x); 282 | } 283 | 
printf("\nfixread: "); 284 | for(auto &x : time_fix_read){ 285 | printf("%.2f ",x); 286 | } 287 | printf("\nfixreadrdma: "); 288 | for(auto &x : time_fix_read_rdma){ 289 | printf("%.2f ",x); 290 | } 291 | printf("\nfixwrite: "); 292 | for(auto &x : time_fix_write){ 293 | printf("%.2f ",x); 294 | } 295 | printf("\nfixfix: "); 296 | for(auto &x : time_fix){ 297 | printf("%.2f ",x); 298 | } 299 | printf("\nrpc: "); 300 | for(auto &x : time_rpc){ 301 | printf("%.2f ",x); 302 | } 303 | printf("\nrdma: "); 304 | for(auto &x : time_rdma){ 305 | printf("%.2f ",x); 306 | } 307 | printf("\n"); 308 | free(buffer); 309 | return 0; 310 | } 311 | -------------------------------------------------------------------------------- /load.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple code for loading CoRM 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #include // std::cout 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "thread/thread.hpp" 24 | FILE *log_fp; 25 | 26 | #include "worker/client_api.hpp" 27 | #include "rdma/connectRDMA.hpp" 28 | 29 | #include "utilities/cxxopts.hpp" 30 | 31 | 32 | cxxopts::ParseResult 33 | parse(int argc, char* argv[]) 34 | { 35 | cxxopts::Options options(argv[0], "load data to CoRM"); 36 | options 37 | .positional_help("[optional args]") 38 | .show_positional_help(); 39 | 40 | try 41 | { 42 | 43 | options.add_options() 44 | ("a,server", "Another address", cxxopts::value(), "IP") 45 | ("threads", "Total threads CoRM has", cxxopts::value()->default_value(std::to_string(1)), "N") 46 | ("o,output", "Output file", cxxopts::value()->default_value("test.bin"), "FILE") 47 | ("size", "Object size in bytes", cxxopts::value()->default_value("123"), "N") 48 | ("n,num", "Number of objects to allocate", cxxopts::value()->default_value("123"), "N") 49 | ("unload", "Number of objects to deallocate", cxxopts::value()->default_value("0"), "N") 50 | ("randomize","randomize objects") 51 | ("help", "Print help") 52 | ; 53 | 54 | auto result = options.parse(argc, argv); 55 | 56 | if (result.count("help")) 57 | { 58 | std::cout << options.help({""}) << std::endl; 59 | exit(0); 60 | } 61 | 62 | if (!result.count("server")) 63 | { 64 | throw cxxopts::OptionException("input must be specified"); 65 | } 66 | 67 | return result; 68 | 69 | } catch (const cxxopts::OptionException& e) 70 | { 71 | std::cout << "error parsing options: " << e.what() << std::endl; 72 | std::cout << options.help({""}) << std::endl; 73 | exit(1); 74 | } 75 | } 76 | 77 | int main(int argc, char* argv[]){ 78 | 79 | 80 | auto allparams = parse(argc,argv); 81 | 82 | log_fp=stdout; 83 | 84 | std::string server = allparams["server"].as(); 85 | 86 | uint32_t threads = allparams["threads"].as(); 87 | uint32_t N = allparams["num"].as(); 88 | uint32_t size = allparams["size"].as(); 89 | std::string output = allparams["output"].as(); 90 | 91 | 92 | ClientRDMA rdma((char*)server.c_str(),9999); 93 | struct rdma_cm_id * id = rdma.sendConnectRequest(); 94 | 95 | struct ibv_pd * pd = ClientRDMA::create_pd(id); 96 | 97 | struct ibv_qp_init_attr attr; 98 | struct rdma_conn_param conn_param; 99 | memset(&attr, 0, sizeof(attr)); 100 | attr.cap.max_send_wr = 32; 101 | attr.cap.max_recv_wr = 32; 102 | attr.cap.max_send_sge = 1; 103 | 
attr.cap.max_recv_sge = 1; 104 | attr.cap.max_inline_data = 0; 105 | attr.qp_type = IBV_QPT_RC; 106 | 107 | memset(&conn_param, 0 , sizeof(conn_param)); 108 | conn_param.responder_resources = 0; 109 | conn_param.initiator_depth = 5; 110 | conn_param.retry_count = 3; 111 | conn_param.rnr_retry_count = 3; 112 | 113 | 114 | std::vector connections(threads,NULL); 115 | 116 | connections[0] = ClientRDMA::connectEP(id, &attr, &conn_param, pd); 117 | 118 | for(uint32_t i =1; i apis(threads); 127 | for(uint32_t i = 0; i objects; 133 | 134 | for(uint32_t i = 0; i < N; i++){ 135 | objects.push_back(apis[i%threads]->Alloc(size)); 136 | } 137 | 138 | 139 | 140 | if(allparams.count("randomize")){ 141 | std::random_shuffle(objects.begin(), objects.end()); 142 | } 143 | 144 | uint32_t unload = allparams["unload"].as(); 145 | if(unload > 0){ 146 | std::random_shuffle(objects.begin(), objects.end()); 147 | 148 | for(uint32_t i = 0; i < unload; i++){ 149 | apis[0]->Free(objects[i]); 150 | free(objects[i]); 151 | } 152 | } 153 | 154 | std::fstream fout; 155 | fout.open(output.c_str(), std::ios::out|std::ios::trunc|std::ios::binary); 156 | 157 | 158 | uint32_t rest = N - unload; 159 | fout.write((char*)&rest,sizeof(rest)); 160 | for(uint32_t i = 0; i < rest; i++){ 161 | fout.write((char*)objects[unload+i],sizeof(LocalObjectHandler)); 162 | } 163 | 164 | fout.close(); 165 | 166 | printf("Object keys are written to file %s\n", output.c_str()); 167 | 168 | return 0; 169 | } 170 | -------------------------------------------------------------------------------- /local_read_benchmark.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple code for measuring read local bandwidth 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
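 *
 * What the code below does: it fills roughly 1 GiB of superblocks with objects
 * of the requested size class, generates a uniform access trace over all
 * slots, and then times three local read paths back to back: the CoRM read
 * (client_read_object_to_buffer_lim), the FaRM-style read
 * (client_read_object_farm), and the Mesh-style read (client_read_fast).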
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #include // std::cout 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | FILE *log_fp; 24 | 25 | #include "alloc/block_alloc.hpp" 26 | #include 27 | 28 | #include 29 | #include "alloc/thread_alloc.hpp" 30 | #include "worker/worker.hpp" 31 | #include "rdma/rdma_memory_manager.hpp" 32 | 33 | #include "utilities/zipf.hpp" 34 | #include "utilities/ycsb.hpp" 35 | 36 | #include "utilities/cxxopts.hpp" 37 | 38 | 39 | 40 | cxxopts::ParseResult 41 | parse(int argc, char* argv[]) 42 | { 43 | cxxopts::Options options(argv[0], "simple read local bandwidth benchmark"); 44 | options 45 | .positional_help("[optional args]") 46 | .show_positional_help(); 47 | 48 | try 49 | { 50 | 51 | options.add_options() 52 | ("size", "user entry size", cxxopts::value()->default_value(std::to_string(8)), "N") 53 | ("help", "Print help") 54 | ; 55 | 56 | auto result = options.parse(argc, argv); 57 | 58 | if (result.count("help")) 59 | { 60 | std::cout << options.help({""}) << std::endl; 61 | exit(0); 62 | } 63 | 64 | return result; 65 | 66 | } catch (const cxxopts::OptionException& e) 67 | { 68 | std::cout << "error parsing options: " << e.what() << std::endl; 69 | std::cout << options.help({""}) << std::endl; 70 | exit(1); 71 | } 72 | } 73 | 74 | 75 | int main(int argc, char* argv[]) 76 | { 77 | 78 | auto allparams = parse(argc,argv); 79 | 80 | log_fp = stdout; 81 | uint32_t user_size = allparams["size"].as(); 82 | 83 | size_t totest = 1024*1024*1024*1ULL; // 1 GiB 84 | size_t sbsize = BLOCKS_IN_SUPERBLOCK * BLOCK_SIZE; 85 | 86 | uint8_t type = SizeTable::getInstance().GetClassFromUserSize(user_size); 87 | uint16_t slot_size = SizeTable::getInstance().GetRealSize(type); 88 | 89 | uint32_t slotsperblock = BLOCK_SIZE / slot_size; 90 | 91 | printf("entry size %u\n",type); 92 | 93 | std::vector superblocks; 94 | 95 | for(size_t i=1; i <= totest/sbsize; i++){ 96 | superblocks.push_back(new SuperBlock(i)); 97 | } 98 | 99 | printf("total superblocks %lu\n",superblocks.size()); 100 | 101 | std::vector blocks; 102 | std::vector addresses; 103 | 104 | uint8_t version = 0; 105 | uint16_t object_id = 1; 106 | 107 | for( auto & sb : superblocks){ 108 | char* src = (char*)malloc(user_size); 109 | while(sb->hasFreeBlocks()){ 110 | Block *b = sb->allocateBlock(); 111 | uint64_t alligned_addr = b->CreateNewAddr(); 112 | blocks.push_back(b); 113 | addresses.push_back(alligned_addr); 114 | 115 | ReaderWriter::WriteBlockHeader(alligned_addr, type , 0); 116 | for(uint32_t i =0; i< slotsperblock; i++){ 117 | 118 | ReaderWriter::SetNewObject(alligned_addr + i*slot_size, object_id, &version ); 119 | 120 | ReaderWriter::WriteBufToObject(alligned_addr + i*slot_size, object_id, slot_size, 121 | src, user_size, NULL, NULL); 122 | } 123 | } 124 | free(src); 125 | } 126 | printf("total blocks %lu\n",blocks.size()); 127 | uint64_t totalobj = blocks.size()*slotsperblock; 128 | printf("total objects %lu\n",totalobj); 129 | 130 | Trace *trace = new Uniform(10,0.0,(uint32_t)totalobj); 131 | 132 | char* dest = (char*)malloc(user_size); 133 | 134 | std::vector accesses; 135 | 136 | uint32_t maxi = totest*4/slot_size; 137 | 138 | for(uint32_t i = 0; i< maxi; i++){ 139 | uint32_t id = trace->get_next().first; 140 | uint64_t alligned_addr = addresses[id/slotsperblock] + (id%slotsperblock)*slot_size; 141 | accesses.push_back(alligned_addr); 142 | } 143 | 144 | 145 | 146 | { 147 | auto t1 = 
std::chrono::high_resolution_clock::now(); 148 | for(uint32_t i = 0; i< maxi; i++){ 149 | uint64_t alligned_addr = accesses[i]; 150 | uint32_t lim_size = user_size; 151 | ReaderWriter::client_read_object_to_buffer_lim((uint64_t)dest, (uint64_t)alligned_addr, 152 | (uint64_t)alligned_addr, slot_size,object_id, &lim_size); 153 | } 154 | auto t2 = std::chrono::high_resolution_clock::now(); 155 | printf("CoRM Time: %lu\n", std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count() ); 156 | } 157 | 158 | { 159 | auto t1 = std::chrono::high_resolution_clock::now(); 160 | for(uint32_t i = 0; i< maxi; i++){ 161 | uint64_t alligned_addr = accesses[i]; 162 | uint32_t lim_size = user_size; 163 | ReaderWriter::client_read_object_farm((uint64_t)dest, (uint64_t)alligned_addr, 164 | (uint64_t)alligned_addr, slot_size, &lim_size); 165 | } 166 | auto t2 = std::chrono::high_resolution_clock::now(); 167 | printf("Farm Time: %lu\n", std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count() ); 168 | } 169 | 170 | { 171 | auto t1 = std::chrono::high_resolution_clock::now(); 172 | for(uint32_t i = 0; i< maxi; i++){ 173 | uint64_t alligned_addr = accesses[i]; 174 | uint32_t lim_size = user_size; 175 | ReaderWriter::client_read_fast((uint64_t)dest, (uint64_t)alligned_addr, slot_size, &lim_size); 176 | } 177 | auto t2 = std::chrono::high_resolution_clock::now(); 178 | printf("Mesh Time: %lu\n", std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count() ); 179 | } 180 | 181 | return 0; 182 | } 183 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple code for launching CoRM 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #include // std::cout 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | FILE *log_fp; 24 | 25 | #include "alloc/block_alloc.hpp" 26 | #include 27 | 28 | #include 29 | #include "alloc/thread_alloc.hpp" 30 | #include "worker/worker.hpp" 31 | #include "rdma/rdma_memory_manager.hpp" 32 | 33 | 34 | 35 | #include "utilities/cxxopts.hpp" 36 | 37 | 38 | 39 | cxxopts::ParseResult 40 | parse(int argc, char* argv[]) 41 | { 42 | cxxopts::Options options(argv[0], "Launch CoRM server"); 43 | options 44 | .positional_help("[optional args]") 45 | .show_positional_help(); 46 | 47 | try 48 | { 49 | 50 | options.add_options() 51 | ("a,server", "server address. 
I use port 9999", cxxopts::value(), "IP") 52 | ("threads", "Total threads CoRM has", cxxopts::value()->default_value(std::to_string(1)), "N") 53 | ("thclass", "threshold class", cxxopts::value()->default_value(std::to_string(50)), "N") 54 | ("thpopularity", "threshold popularity", cxxopts::value()->default_value(std::to_string(100)), "N") 55 | ("preallocate", "preallocate superblocks", cxxopts::value()->default_value(std::to_string(1)), "N") 56 | ("num_recv_buf", "Number of receive buffers per thread", cxxopts::value()->default_value(std::to_string(256)), "N") 57 | ("recv_buf_size", "The size of each receive buffer", cxxopts::value()->default_value(std::to_string(2048)), "N") 58 | ("send_buf_size", "The total size of send buffer per thread", cxxopts::value()->default_value(std::to_string(1024*16)), "N") 59 | ("log_file", "output file", cxxopts::value(), "file") 60 | ("odp", "enable ODP with prefetch if supported") 61 | ("help", "Print help") 62 | ; 63 | 64 | auto result = options.parse(argc, argv); 65 | 66 | if (result.count("help")) 67 | { 68 | std::cout << options.help({""}) << std::endl; 69 | exit(0); 70 | } 71 | 72 | if (!result.count("server")) 73 | { 74 | throw cxxopts::OptionException("input must be specified"); 75 | } 76 | 77 | return result; 78 | 79 | } catch (const cxxopts::OptionException& e) 80 | { 81 | std::cout << "error parsing options: " << e.what() << std::endl; 82 | std::cout << options.help({""}) << std::endl; 83 | exit(1); 84 | } 85 | } 86 | 87 | 88 | int main(int argc, char* argv[]) 89 | { 90 | 91 | auto allparams = parse(argc,argv); 92 | if(allparams.count("log_file")){ 93 | std::string name = allparams["log_file"].as(); 94 | log_fp = fopen(name.c_str(), "w+"); 95 | if (log_fp==NULL) { 96 | printf("Cannot open log file\n"); 97 | exit(1); 98 | } 99 | } else { 100 | log_fp = stdout; 101 | } 102 | 103 | std::string ip = allparams["server"].as(); 104 | uint32_t threads = allparams["threads"].as(); 105 | 106 | uint32_t total_thread_num = threads; 107 | LauncherMaster *m = new LauncherMaster(total_thread_num); // 10 threads 108 | 109 | ServerRDMA *server = new ServerRDMA((char *)ip.c_str(), 9999); 110 | struct ibv_pd *pd = server->create_pd(); 111 | 112 | SizeTable::getInstance().PrintTable(); 113 | 114 | uint32_t threshold_popular_class = allparams["thpopularity"].as(); // popular classes are allocated by local thread. The class is popular once it has 100+ requests. 
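    // The two thresholds below steer allocation: smaller size classes (those
    // below the thclass threshold) and popular classes (those that reach the
    // thpopularity request count) are allocated by the local worker thread.
    // Each worker created in the loop further down registers its own
    // ThreadAlloc with the AllocAdapter.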
115 | uint32_t threshold_size_class = allparams["thclass"].as(); // smaller size classes are allocated by local thread 116 | 117 | AllocAdapter::init(threshold_popular_class, threshold_size_class); 118 | 119 | uint32_t preallocate = allparams["preallocate"].as(); // how many super blocks preallocate 120 | 121 | BlockAllocImpl *balloc = new BlockAllocImpl(0, preallocate); 122 | ibv_memory_manager *ibv = new ibv_memory_manager(pd, allparams.count("odp") ); 123 | 124 | uint32_t recv_buffers_num = allparams["num_recv_buf"].as(); 125 | uint32_t recv_buffer_size = allparams["recv_buf_size"].as(); 126 | uint32_t send_buffer_size = allparams["send_buf_size"].as(); 127 | 128 | for(uint32_t i = 0; i < total_thread_num; i++) 129 | { 130 | ThreadAlloc *alloc = new ThreadAllocImpl(i, balloc, ibv); 131 | AllocAdapter::getInstance().RegThread(alloc, i); 132 | Worker *w = new Worker(i, alloc, pd, recv_buffers_num, recv_buffer_size, send_buffer_size ); 133 | if(i == 0) 134 | { 135 | w->set_rdma_server(server); 136 | } 137 | m->add_worker(w); 138 | } 139 | 140 | m->launch(); 141 | 142 | delete m; 143 | 144 | if(allparams.count("log_file")){ 145 | fclose(log_fp); 146 | } 147 | 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /paper/corm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/CoRM/2bcae859eafad28ba51a92ec73e57239febef147/paper/corm.pdf -------------------------------------------------------------------------------- /rdma/connectRDMA.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple/naive code to connect 2 endpoints with rdma using RDMA-CM. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
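 *
 * Usage sketch (illustrative; attr and conn_param are assumed to be filled in
 * by the caller, as the client programs in this repository do):
 *
 *   // server side
 *   ServerRDMA server(ip, 9999);
 *   struct ibv_pd *pd = server.create_pd();
 *   struct rdma_cm_id *id = server.getConnectRequest();
 *   VerbsEP *ep = ServerRDMA::acceptEP(id, &attr, &conn_param, pd);
 *
 *   // client side
 *   ClientRDMA rdma(ip, 9999);
 *   struct rdma_cm_id *cid = rdma.sendConnectRequest();
 *   VerbsEP *cep = ClientRDMA::connectEP(cid, &attr, &conn_param, ClientRDMA::create_pd(cid));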
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | 13 | #pragma once 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "verbsEP.hpp" 24 | 25 | 26 | 27 | struct ibv_device *ctx_find_dev(const char *ib_devname) { 28 | int num_of_device; 29 | struct ibv_device **dev_list; 30 | struct ibv_device *ib_dev = NULL; 31 | 32 | dev_list = ibv_get_device_list(&num_of_device); 33 | 34 | if (num_of_device <= 0) { 35 | fprintf(stderr, " Did not detect devices \n"); 36 | fprintf(stderr, " If device exists, check if driver is up\n"); 37 | return NULL; 38 | } 39 | 40 | if (!ib_devname) { 41 | ib_dev = dev_list[0]; 42 | if (!ib_dev) { 43 | fprintf(stderr, "No IB devices found\n"); 44 | exit(1); 45 | } 46 | } else { 47 | for (; (ib_dev = *dev_list); ++dev_list) 48 | if (!strcmp(ibv_get_device_name(ib_dev), ib_devname)) break; 49 | if (!ib_dev) fprintf(stderr, "IB device %s not found\n", ib_devname); 50 | } 51 | return ib_dev; 52 | } 53 | 54 | 55 | class ServerRDMA{ 56 | 57 | struct rdma_event_channel *cm_channel; 58 | struct rdma_cm_id *listen_id = NULL; 59 | //struct ibv_context *ctx; 60 | 61 | public: 62 | ServerRDMA(char* ip, int port){ 63 | int ret; 64 | struct rdma_addrinfo hints; 65 | struct rdma_addrinfo *addrinfo; 66 | 67 | /* struct ibv_device *ib_dev = NULL; 68 | ib_dev = ctx_find_dev(devname); 69 | ctx = ibv_open_device(ib_dev);*/ 70 | 71 | memset(&hints, 0, sizeof hints); 72 | hints.ai_flags = RAI_PASSIVE; 73 | hints.ai_port_space = RDMA_PS_TCP; 74 | 75 | char strport[80]; 76 | sprintf(strport, "%d", port); 77 | 78 | ret = rdma_getaddrinfo(ip, strport, &hints, &addrinfo); 79 | if (ret) { 80 | perror("rdma_getaddrinfo\n"); 81 | exit(1); 82 | } 83 | /* 84 | this->cm_channel = rdma_create_event_channel(); 85 | 86 | if (this->cm_channel == NULL) { 87 | perror(" rdma_create_event_channel failed\n"); 88 | exit(1); 89 | } 90 | ret = rdma_create_id(this->cm_channel, &listen_id, NULL, RDMA_PS_TCP); 91 | if (ret) { 92 | perror("Failed to create RDMA CM server control ID."); 93 | exit(1); 94 | } 95 | 96 | ret = rdma_bind_addr(listen_id, addrinfo->ai_src_addr); 97 | if (ret) { 98 | perror("Failed to bind RDMA CM address on the server."); 99 | exit(1); 100 | } 101 | */ 102 | 103 | ret = rdma_create_ep(&listen_id, addrinfo, NULL, NULL); 104 | if (ret) { 105 | perror("rdma_create_ep\n"); 106 | exit(1); 107 | } 108 | 109 | rdma_freeaddrinfo(addrinfo); 110 | 111 | ret = rdma_listen(listen_id, 2); 112 | if (ret) { 113 | perror("rdma_listen"); 114 | exit(1); 115 | } 116 | 117 | } 118 | 119 | int get_listen_fd() 120 | { 121 | 122 | assert(this->listen_id->channel!=NULL); 123 | int options = fcntl(this->listen_id->channel->fd, F_GETFL, 0); 124 | 125 | if (fcntl(this->listen_id->channel->fd, F_SETFL, options | O_NONBLOCK)) { 126 | perror("[RDMA_COM] cannot set server_client to non-blocking mode"); 127 | exit(1); 128 | return 0; 129 | } 130 | 131 | return this->listen_id->channel->fd; 132 | } 133 | 134 | struct ibv_pd * create_pd(){ 135 | return ibv_alloc_pd(listen_id->verbs); 136 | } 137 | 138 | 139 | static struct ibv_srq* create_srq(struct ibv_pd * pd, uint32_t max_wr, uint32_t max_sge=1){ 140 | 141 | struct ibv_srq_init_attr attr; 142 | memset(&attr, 0, sizeof attr); 143 | attr.attr.max_wr = max_wr; 144 | attr.attr.max_sge = max_sge; 145 | return ibv_create_srq(pd, &attr); 146 | } 147 | 148 | struct ibv_cq *create_cq(uint32_t max_wr, struct ibv_comp_channel *channel = NULL){ 149 | return 
ibv_create_cq(listen_id->verbs, max_wr, NULL,channel, 0); 150 | } 151 | 152 | 153 | struct rdma_cm_id * getConnectRequest(){ 154 | int ret; 155 | struct rdma_cm_id *id; 156 | 157 | ret = rdma_get_request(this->listen_id, &id); 158 | if (ret) { 159 | perror("rdma_get_request"); 160 | exit(1); 161 | } 162 | return id; 163 | } 164 | 165 | 166 | static VerbsEP* acceptEP(struct rdma_cm_id *id, struct ibv_qp_init_attr *attr, struct rdma_conn_param *conn_param, struct ibv_pd* pd = NULL){ 167 | int ret; 168 | attr->qp_type = IBV_QPT_RC; 169 | 170 | ret = rdma_create_qp(id, pd, attr); 171 | if (ret) { 172 | perror("rdma_create_qp"); 173 | exit(1); 174 | } 175 | 176 | ret = rdma_accept(id, conn_param); 177 | if (ret) { 178 | perror("rdma_accept"); 179 | exit(1); 180 | } 181 | 182 | return new VerbsEP(id, attr->cap.max_inline_data, attr->cap.max_send_wr, attr->cap.max_recv_wr ); 183 | } 184 | 185 | }; 186 | 187 | 188 | 189 | class ClientRDMA{ 190 | 191 | struct rdma_addrinfo *addrinfo; 192 | 193 | public: 194 | ClientRDMA(char* ip, int port){ 195 | int ret; 196 | struct rdma_addrinfo hints; 197 | 198 | memset(&hints, 0, sizeof hints); 199 | hints.ai_port_space = RDMA_PS_TCP; 200 | 201 | char strport[80]; 202 | sprintf(strport, "%d", port); 203 | 204 | ret = rdma_getaddrinfo(ip, strport, &hints, &addrinfo); 205 | if (ret) { 206 | perror("rdma_getaddrinfo\n"); 207 | exit(1); 208 | } 209 | 210 | } 211 | 212 | ~ClientRDMA(){ 213 | rdma_freeaddrinfo(addrinfo); 214 | } 215 | 216 | struct rdma_cm_id * sendConnectRequest(){ 217 | int ret; 218 | struct rdma_cm_id *id; 219 | 220 | ret = rdma_create_ep(&id, this->addrinfo, NULL, NULL); 221 | if (ret) { 222 | perror("rdma_create_ep"); 223 | exit(1); 224 | } 225 | return id; 226 | } 227 | 228 | static struct ibv_pd * create_pd(struct rdma_cm_id *id){ 229 | return ibv_alloc_pd(id->verbs); 230 | } 231 | 232 | 233 | static struct ibv_srq* create_srq(struct ibv_pd * pd, uint32_t max_wr, uint32_t max_sge=1){ 234 | struct ibv_srq_init_attr attr; 235 | memset(&attr, 0, sizeof attr); 236 | attr.attr.max_wr = max_wr; 237 | attr.attr.max_sge = max_sge; 238 | return ibv_create_srq(pd, &attr); 239 | } 240 | 241 | static struct ibv_cq *create_cq(struct rdma_cm_id * id, uint32_t max_wr, struct ibv_comp_channel *channel = NULL){ 242 | return ibv_create_cq(id->verbs, max_wr, NULL, channel, 0); 243 | } 244 | 245 | static VerbsEP* connectEP(struct rdma_cm_id * id, struct ibv_qp_init_attr *attr, struct rdma_conn_param *conn_param, struct ibv_pd* pd = NULL){ 246 | int ret; 247 | attr->qp_type = IBV_QPT_RC; 248 | ret = rdma_create_qp(id, pd, attr); 249 | if (ret) { 250 | perror("rdma_create_qp"); 251 | exit(1); 252 | } 253 | 254 | ret = rdma_connect(id, conn_param); 255 | if (ret) { 256 | perror("rdma_accept"); 257 | exit(1); 258 | } 259 | 260 | return new VerbsEP(id, attr->cap.max_inline_data, attr->cap.max_send_wr, attr->cap.max_recv_wr ); 261 | } 262 | 263 | }; -------------------------------------------------------------------------------- /rdma/rdma_helpers.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple manager of fixed-size regions used for send-receive communication. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
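 *
 * Usage sketch (illustrative; pd is an ibv_pd created elsewhere, and the
 * buffer counts/sizes mirror the defaults in main.cpp):
 *
 *   ReceiveBuffers recv_bufs(256, 2048, pd);   // 256 fixed-size receive slots
 *   char *rbuf = recv_bufs.get_buffer(0);      // address of the i-th slot
 *
 *   SendBuffers send_bufs(pd, 16*1024);        // ring of registered send space
 *   char *msg = send_bufs.Alloc(256);          // may return NULL when the ring is full
 *   // ... post a send referencing msg and send_bufs.get_lkey() ...
 *   send_bufs.Free(msg);                       // allocations are expected to be short-lived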
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | #include 16 | 17 | // it allocates buffers of fixed size 18 | struct ReceiveBuffers{ 19 | 20 | uint32_t const num_of_buffers; 21 | uint64_t const buffer_size; 22 | 23 | char *buffer; 24 | size_t const total_size; 25 | uint32_t lkey; 26 | struct ibv_mr * mr; 27 | 28 | ReceiveBuffers(uint32_t num_of_buffers, uint32_t buffer_size, struct ibv_pd *pd): 29 | num_of_buffers(num_of_buffers), buffer_size(buffer_size), total_size(num_of_buffers*buffer_size) 30 | { 31 | this->buffer = (char*)aligned_alloc(64, total_size); 32 | assert(buffer != NULL && "error memory allocation"); 33 | 34 | this->mr = ibv_reg_mr(pd, buffer, total_size, IBV_ACCESS_LOCAL_WRITE); 35 | this->lkey = mr->lkey; 36 | } 37 | 38 | char* get_buffer(uint32_t i) const { 39 | return buffer + i*buffer_size; 40 | } 41 | 42 | uint32_t get_lkey() const { 43 | return lkey; 44 | } 45 | 46 | uint32_t get_buffer_length() const { 47 | return buffer_size; 48 | } 49 | 50 | ~ReceiveBuffers(){ 51 | if(mr) 52 | ibv_dereg_mr(mr); 53 | if(buffer) 54 | free(buffer); 55 | } 56 | 57 | }; 58 | 59 | // it allocates buffers of any size. 60 | // but it assumes short lifetime of objects. 61 | struct SendBuffers{ 62 | char *buffer; 63 | size_t const total_size; 64 | uint32_t lkey; 65 | struct ibv_mr * mr; 66 | 67 | uint32_t current_offset; 68 | uint32_t first_notfree; 69 | 70 | std::set allocated_offsets; 71 | 72 | const uint32_t Allignment_bits = 6; 73 | const uint32_t Allignment = 1<buffer = (char*)aligned_alloc(Allignment, total_size); 80 | assert(buffer != NULL && "error memory allocation"); 81 | 82 | this->mr = ibv_reg_mr(pd, buffer, total_size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); 83 | if(mr==NULL){ 84 | printf("Error reg mr\n"); 85 | } 86 | this->lkey = mr->lkey; 87 | } 88 | 89 | uint32_t get_lkey() const { 90 | return lkey; 91 | } 92 | 93 | char* Alloc(uint32_t size) { 94 | 95 | uint32_t to_allocate = (size + Allignment - 1) & ~Allignment_mask; 96 | 97 | if(current_offset+to_allocate >= total_size){ 98 | // wrap around 99 | current_offset = 0; 100 | if(allocated_offsets.empty()){ 101 | first_notfree = total_size; 102 | }else{ 103 | first_notfree = *(allocated_offsets.begin()); 104 | } 105 | } 106 | 107 | if(first_notfree-current_offset < to_allocate){ 108 | // don't have memory 109 | printf("return don't have memory %u, total mem%lu\n",size, total_size); 110 | return NULL; 111 | } 112 | 113 | size_t return_offset = current_offset; 114 | current_offset+=to_allocate; 115 | allocated_offsets.insert(return_offset); 116 | return buffer + return_offset; 117 | } 118 | 119 | void Free(char* buf) { 120 | uint32_t offset = (uint32_t)(buf - buffer); 121 | auto it = allocated_offsets.find(offset); 122 | assert(it != allocated_offsets.end() && "address does not exist"); 123 | if(first_notfree == offset){ 124 | auto next = std::next(it); 125 | if(next == allocated_offsets.end()){ 126 | first_notfree = total_size; 127 | }else{ 128 | first_notfree = *next; 129 | } 130 | } 131 | allocated_offsets.erase(it); 132 | } 133 | 134 | ~SendBuffers(){ 135 | if(mr) 136 | ibv_dereg_mr(mr); 137 | if(buffer) 138 | free(buffer); 139 | } 140 | }; 141 | -------------------------------------------------------------------------------- /rdma/rdma_memory_manager.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple manager of block registration. 
It also used to support the experimental API, but I removed it as it is deprecated by MOFED. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include 15 | 16 | /* 17 | #ifdef HAVE_ODP_MR_PREFETCH 18 | #warning "Prefetch is loaded" 19 | #include 20 | #else 21 | #warning "Prefetch is not supported" 22 | #endif 23 | */ 24 | 25 | struct ibv_memory_manager{ 26 | 27 | struct ibv_pd * const pd; 28 | const bool _withODP; 29 | 30 | ibv_memory_manager(struct ibv_pd *pd, bool with_odp): pd(pd), _withODP(with_odp) { 31 | // empty 32 | } 33 | 34 | struct ibv_mr * mem_reg_odp(void *addr,uint32_t size){ 35 | return ibv_reg_mr(pd,addr,size,IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE \ 36 | | IBV_ACCESS_REMOTE_READ | (IBV_ACCESS_ON_DEMAND * (int)this->_withODP )); 37 | } 38 | 39 | void mem_rereg(struct ibv_mr * mr){ 40 | if( !_withODP ){ 41 | int ret = ibv_rereg_mr(mr,IBV_REREG_MR_CHANGE_TRANSLATION, pd, mr->addr,mr->length,0); 42 | assert(ret==0 && "ibv_rereg_mr failed"); 43 | return; 44 | } 45 | 46 | struct ibv_sge sge; 47 | sge.addr = (uint64_t)mr->addr; 48 | sge.length = mr->length; 49 | sge.lkey = mr->lkey; 50 | 51 | int ret = ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH, 52 | IBV_ADVISE_MR_FLAG_FLUSH, 53 | &sge, 1); 54 | 55 | assert(ret==0 && "ibv_advise_mr failed"); 56 | 57 | #if 0 58 | struct ibv_exp_prefetch_attr prefetch_attr; 59 | prefetch_attr.flags = IBV_EXP_PREFETCH_WRITE_ACCESS; 60 | prefetch_attr.addr = (uint64_t)mr->addr; 61 | prefetch_attr.length = mr->length; 62 | prefetch_attr.comp_mask = 0; 63 | int ret =ibv_exp_prefetch_mr(mr, &prefetch_attr); 64 | #endif 65 | 66 | 67 | } 68 | 69 | 70 | void mem_dereg(struct ibv_mr * mr){ 71 | int ret = ibv_dereg_mr(mr); 72 | assert(ret==0 && "ibv_dereg_mr failed"); 73 | return; 74 | } 75 | }; -------------------------------------------------------------------------------- /rdma/verbsEP.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple class for managing an endpoint. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
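 *
 * Usage sketch (illustrative; ep comes from ClientRDMA::connectEP or
 * ServerRDMA::acceptEP, and buf/size are a caller-owned buffer):
 *
 *   struct ibv_mr *mr = ep->reg_mem(buf, size);            // register the buffer
 *   ep->send_signaled(1, (uint64_t)buf, mr->lkey, size);   // two-sided send
 *   struct ibv_wc wc;
 *   while (ep->poll_send_completion(&wc) == 0) { }         // spin on the send CQ
 *   ep->dereg_mem(mr);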
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #pragma once 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | 22 | class VerbsEP{ 23 | struct rdma_cm_id * const id; 24 | public: 25 | struct ibv_qp * const qp; 26 | struct ibv_pd * const pd; 27 | const uint32_t max_inline_data; 28 | const uint32_t max_send_size; 29 | const uint32_t max_recv_size; 30 | 31 | VerbsEP(struct rdma_cm_id *id, uint32_t max_inline_data, uint32_t max_send_size, uint32_t max_recv_size): 32 | id(id), qp(id->qp), pd(qp->pd), max_inline_data(0), max_send_size(max_send_size), max_recv_size(max_recv_size) 33 | { 34 | // empty 35 | } 36 | 37 | ~VerbsEP(){ 38 | // empty 39 | } 40 | 41 | int get_event_fd() 42 | { 43 | assert(this->id->channel!=NULL); 44 | int options = fcntl(this->id->channel->fd, F_GETFL, 0); 45 | 46 | if (fcntl(this->id->channel->fd, F_SETFL, options | O_NONBLOCK)) { 47 | perror("[RDMA_COM] cannot set server_client to non-blocking mode"); 48 | exit(1); 49 | return 0; 50 | } 51 | 52 | return this->id->channel->fd; 53 | } 54 | 55 | enum rdma_cm_event_type get_event(){ 56 | int ret; 57 | struct rdma_cm_event *event; 58 | 59 | ret = rdma_get_cm_event(id->channel, &event); 60 | if (ret) { 61 | perror("rdma_get_cm_event"); 62 | exit(ret); 63 | } 64 | enum rdma_cm_event_type out = event->event; 65 | /* switch (event->event){ 66 | case RDMA_CM_EVENT_ADDR_ERROR: 67 | case RDMA_CM_EVENT_ROUTE_ERROR: 68 | case RDMA_CM_EVENT_CONNECT_ERROR: 69 | case RDMA_CM_EVENT_UNREACHABLE: 70 | case RDMA_CM_EVENT_REJECTED: 71 | 72 | text(log_fp,"[rdma_get_cm_event] Error %u \n",event->event); 73 | break; 74 | 75 | case RDMA_CM_EVENT_DISCONNECTED: 76 | text(log_fp,"[rdma_get_cm_event] Disconnect %u \n",event->event); 77 | break; 78 | 79 | case RDMA_CM_EVENT_DEVICE_REMOVAL: 80 | text(log_fp,"[rdma_get_cm_event] Removal %u \n",event->event); 81 | break; 82 | default: 83 | text(log_fp,"[rdma_get_cm_event] %u \n",event->event); 84 | 85 | }*/ 86 | rdma_ack_cm_event(event); 87 | return out; 88 | } 89 | 90 | uint32_t get_qp_num() const{ 91 | return qp->qp_num; 92 | } 93 | 94 | struct ibv_mr * reg_mem(void *buf, uint32_t size){ 95 | return ibv_reg_mr(this->pd, buf, size, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ); 96 | } 97 | 98 | struct ibv_mr * reg_mem_with_atomic(void *buf, uint32_t size){ 99 | return ibv_reg_mr(this->pd, buf, size, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC); 100 | } 101 | 102 | void dereg_mem(struct ibv_mr * mr){ 103 | ibv_dereg_mr(mr); 104 | } 105 | 106 | inline int poll_send_completion(struct ibv_wc* wc, int num = 1){ 107 | return ibv_poll_cq(this->qp->send_cq, num, wc); 108 | } 109 | 110 | inline int poll_recv_completion(struct ibv_wc* wc, int num = 1){ 111 | return ibv_poll_cq(this->qp->recv_cq, num, wc); 112 | } 113 | 114 | static inline int post_srq_recv(struct ibv_srq *srq, uint64_t wr_id, uint64_t local_addr=0ULL, uint32_t lkey=0, uint32_t length=0){ 115 | struct ibv_sge sge; 116 | 117 | sge.addr = local_addr; 118 | sge.length = length; 119 | sge.lkey = lkey; 120 | 121 | struct ibv_recv_wr wr, *bad; 122 | 123 | wr.wr_id = wr_id; 124 | wr.next = NULL; 125 | wr.sg_list = &sge; 126 | wr.num_sge = 1; 127 | 128 | return ibv_post_srq_recv(srq,&wr, &bad); 129 | } 130 | 131 | inline int post_recv(uint64_t wr_id, struct ibv_mr * mr){ 132 | return post_recv(wr_id, (uint64_t)mr->addr, mr->lkey, mr->length); 133 | } 134 | 135 | inline int post_recv(uint64_t 
wr_id, uint64_t local_addr=0ULL, uint32_t lkey=0, uint32_t length=0){ 136 | struct ibv_sge sge; 137 | 138 | sge.addr = local_addr; 139 | sge.length = length; 140 | sge.lkey = lkey; 141 | 142 | struct ibv_recv_wr wr, *bad; 143 | 144 | wr.wr_id = wr_id; 145 | wr.next = NULL; 146 | wr.sg_list = &sge; 147 | wr.num_sge = 1; 148 | 149 | return ibv_post_recv(qp, &wr, &bad); 150 | } 151 | 152 | inline int post_recv(struct ibv_recv_wr * wr){ 153 | struct ibv_recv_wr *bad; 154 | return ibv_post_recv(qp, wr, &bad); 155 | } 156 | 157 | inline int post_shared_recv(uint64_t wr_id, struct ibv_mr * mr){ 158 | return post_shared_recv(wr_id, (uint64_t)mr->addr, mr->lkey, mr->length); 159 | } 160 | 161 | inline int post_shared_recv(uint64_t wr_id, uint64_t local_addr=0ULL, uint32_t lkey=0, uint32_t length=0){ 162 | struct ibv_sge sge; 163 | 164 | sge.addr = local_addr; 165 | sge.length = length; 166 | sge.lkey = lkey; 167 | 168 | struct ibv_recv_wr wr, *bad; 169 | 170 | wr.wr_id = wr_id; 171 | wr.next = NULL; 172 | wr.sg_list = &sge; 173 | wr.num_sge = 1; 174 | 175 | return ibv_post_srq_recv(qp->srq, &wr, &bad); 176 | } 177 | 178 | inline int send_signaled(uint64_t wr_id, uint64_t local_addr, uint32_t lkey, uint32_t length){ 179 | unsigned int send_flags = IBV_SEND_SIGNALED; 180 | 181 | if(length!=0 && length<=max_inline_data){ 182 | send_flags |= IBV_SEND_INLINE; 183 | } 184 | 185 | return two_sided( IBV_WR_SEND, send_flags, wr_id, 0,local_addr, lkey, length); 186 | } 187 | 188 | inline int send(uint64_t wr_id, uint64_t local_addr, uint32_t lkey, uint32_t length){ 189 | unsigned int send_flags = 0; 190 | 191 | if(length!=0 && length<=max_inline_data){ 192 | send_flags |= IBV_SEND_INLINE; 193 | } 194 | return two_sided( IBV_WR_SEND, send_flags, wr_id, 0,local_addr, lkey, length); 195 | } 196 | 197 | 198 | inline int send_with_imm_signaled(uint64_t wr_id, uint32_t imm_data, uint64_t local_addr, uint32_t lkey, uint32_t length){ 199 | unsigned int send_flags = IBV_SEND_SIGNALED; 200 | 201 | if(length!=0 && length<=max_inline_data){ 202 | send_flags |= IBV_SEND_INLINE; 203 | } 204 | 205 | return two_sided( IBV_WR_SEND_WITH_IMM, send_flags, wr_id, imm_data,local_addr, lkey, length); 206 | } 207 | 208 | inline int send_with_imm(uint64_t wr_id, uint32_t imm_data, uint64_t local_addr, uint32_t lkey, uint32_t length){ 209 | unsigned int send_flags = 0; 210 | 211 | if(length!=0 && length<=max_inline_data){ 212 | send_flags |= IBV_SEND_INLINE; 213 | } 214 | return two_sided( IBV_WR_SEND_WITH_IMM, send_flags, wr_id, imm_data,local_addr, lkey, length); 215 | } 216 | 217 | inline int write_signaled(uint64_t wr_id, uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, uint32_t rkey, uint32_t length){ 218 | 219 | unsigned int send_flags = IBV_SEND_SIGNALED; 220 | 221 | if(length!=0 && length<=max_inline_data){ 222 | send_flags |= IBV_SEND_INLINE; 223 | } 224 | return one_sided(IBV_WR_RDMA_WRITE,send_flags,wr_id,0,local_addr,lkey,remote_addr,rkey,length); 225 | } 226 | 227 | 228 | inline int write(uint64_t wr_id, uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, uint32_t rkey, uint32_t length){ 229 | 230 | unsigned int send_flags = 0; 231 | 232 | if(length!=0 && length<=max_inline_data){ 233 | send_flags |= IBV_SEND_INLINE; 234 | } 235 | return one_sided(IBV_WR_RDMA_WRITE,send_flags,wr_id,0,local_addr,lkey,remote_addr,rkey,length); 236 | } 237 | 238 | inline int write_send_signaled(uint64_t wr_id, uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, uint32_t rkey, uint32_t length, uint32_t payload){ 239 | 
struct ibv_sge sge[2]; 240 | 241 | 242 | sge[0].addr = local_addr; 243 | sge[0].length = length; 244 | sge[0].lkey = lkey; 245 | struct ibv_send_wr wr[2], *bad; 246 | 247 | wr[0].wr_id = wr_id; 248 | wr[0].next = &wr[1]; 249 | wr[0].sg_list = &sge[0]; 250 | wr[0].num_sge = 1; 251 | wr[0].opcode = IBV_WR_RDMA_WRITE; 252 | 253 | wr[0].send_flags = (length<=max_inline_data ? IBV_SEND_INLINE : 0); 254 | 255 | wr[0].wr.rdma.remote_addr = remote_addr; 256 | wr[0].wr.rdma.rkey = rkey; 257 | 258 | sge[1].addr = local_addr; 259 | sge[1].length = payload; 260 | sge[1].lkey = lkey; 261 | 262 | wr[1].wr_id = wr_id; 263 | wr[1].next = NULL; 264 | wr[1].sg_list = &sge[1]; 265 | wr[1].num_sge = 1; 266 | wr[1].opcode = IBV_WR_SEND; 267 | wr[1].send_flags = IBV_SEND_SIGNALED | (payload<=max_inline_data ? IBV_SEND_INLINE : 0); 268 | 269 | 270 | return ibv_post_send(this->qp, wr, &bad); 271 | 272 | } 273 | 274 | 275 | inline int write_write_signaled(uint64_t wr_id, uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, uint32_t rkey, uint32_t length, uint32_t payload){ 276 | struct ibv_sge sge[2]; 277 | 278 | sge[0].addr = local_addr; 279 | sge[0].length = length; 280 | sge[0].lkey = lkey; 281 | struct ibv_send_wr wr[2], *bad; 282 | 283 | wr[0].wr_id = wr_id; 284 | wr[0].next = &wr[1]; 285 | wr[0].sg_list = &sge[0]; 286 | wr[0].num_sge = 1; 287 | wr[0].opcode = IBV_WR_RDMA_WRITE; 288 | 289 | wr[0].send_flags = (length<=max_inline_data ? IBV_SEND_INLINE : 0); ; 290 | 291 | wr[0].wr.rdma.remote_addr = remote_addr; 292 | wr[0].wr.rdma.rkey = rkey; 293 | 294 | sge[1].addr = local_addr; 295 | sge[1].length = payload; 296 | sge[1].lkey = lkey; 297 | 298 | wr[1].wr_id = wr_id; 299 | wr[1].next = NULL; 300 | wr[1].sg_list = &sge[1]; 301 | wr[1].num_sge = 1; 302 | wr[1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; 303 | wr[1].send_flags = IBV_SEND_SIGNALED | (payload<=max_inline_data ? 
IBV_SEND_INLINE : 0); 304 | 305 | wr[1].wr.rdma.remote_addr = remote_addr; 306 | wr[1].wr.rdma.rkey = rkey; 307 | return ibv_post_send(this->qp, wr, &bad); 308 | } 309 | 310 | inline int send_cas_signaled(uint64_t wr_id, uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, uint32_t rkey, uint64_t expected, uint64_t swap ){ 311 | 312 | struct ibv_sge sge; 313 | 314 | sge.addr = local_addr; 315 | sge.length = 8; 316 | sge.lkey = lkey; 317 | struct ibv_send_wr wr, *bad; 318 | 319 | wr.wr_id = wr_id; 320 | wr.next = NULL; 321 | wr.sg_list = &sge; 322 | wr.num_sge = 1; 323 | wr.opcode = IBV_WR_ATOMIC_CMP_AND_SWP; 324 | 325 | wr.send_flags = IBV_SEND_SIGNALED ; //| IBV_SEND_INLINE 326 | 327 | wr.wr.atomic.remote_addr = remote_addr; 328 | wr.wr.atomic.rkey = rkey; 329 | wr.wr.atomic.compare_add = expected; /* expected value in remote address */ 330 | wr.wr.atomic.swap = swap; /* the value that remote address will be assigned to */ 331 | 332 | return ibv_post_send(this->qp, &wr, &bad); 333 | 334 | } 335 | 336 | inline int write_with_imm_signaled(uint64_t wr_id, uint32_t imm_data, 337 | uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, uint32_t rkey, uint32_t length){ 338 | 339 | unsigned int send_flags = IBV_SEND_SIGNALED; 340 | 341 | if(length!=0 && length<=max_inline_data){ 342 | send_flags |= IBV_SEND_INLINE; 343 | } 344 | return one_sided(IBV_WR_RDMA_WRITE_WITH_IMM,send_flags,wr_id,imm_data,local_addr,lkey,remote_addr,rkey,length); 345 | } 346 | 347 | 348 | inline int write_with_imm(uint64_t wr_id, uint32_t imm_data, 349 | uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, uint32_t rkey, uint32_t length){ 350 | 351 | unsigned int send_flags = 0; 352 | 353 | if(length!=0 && length<=max_inline_data){ 354 | send_flags |= IBV_SEND_INLINE; 355 | } 356 | return one_sided(IBV_WR_RDMA_WRITE_WITH_IMM,send_flags,wr_id,imm_data,local_addr,lkey,remote_addr,rkey,length); 357 | } 358 | 359 | 360 | inline int read_signaled(uint64_t wr_id, uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, 361 | uint32_t rkey, uint32_t length) 362 | { 363 | unsigned int send_flags = IBV_SEND_SIGNALED; 364 | 365 | return one_sided(IBV_WR_RDMA_READ,send_flags,wr_id,0,local_addr,lkey,remote_addr,rkey,length); 366 | } 367 | 368 | inline int read(uint64_t wr_id, uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, uint32_t rkey, uint32_t length) 369 | { 370 | unsigned int send_flags = 0; 371 | 372 | return one_sided(IBV_WR_RDMA_READ,send_flags,wr_id,0,local_addr,lkey,remote_addr,rkey,length); 373 | } 374 | 375 | 376 | inline int one_sided(enum ibv_wr_opcode opcode, unsigned int send_flags, uint64_t wr_id, uint32_t imm_data, 377 | uint64_t local_addr, uint32_t lkey, uint64_t remote_addr, uint32_t rkey, uint32_t length) 378 | { 379 | struct ibv_sge sge; 380 | 381 | sge.addr = local_addr; 382 | sge.length = length; 383 | sge.lkey = lkey; 384 | struct ibv_send_wr wr, *bad; 385 | 386 | wr.wr_id = wr_id; 387 | wr.next = NULL; 388 | wr.sg_list = &sge; 389 | wr.num_sge = 1; 390 | wr.opcode = opcode; 391 | 392 | wr.send_flags = send_flags; 393 | wr.imm_data = imm_data; 394 | 395 | 396 | wr.wr.rdma.remote_addr = remote_addr; 397 | wr.wr.rdma.rkey = rkey; 398 | 399 | return ibv_post_send(this->qp, &wr, &bad); 400 | } 401 | 402 | 403 | inline int two_sided(enum ibv_wr_opcode opcode, unsigned int send_flags, uint64_t wr_id, uint32_t imm_data, 404 | uint64_t local_addr, uint32_t lkey, uint32_t length) 405 | { 406 | struct ibv_sge sge; 407 | 408 | sge.addr = local_addr; 409 | sge.length = length; 410 | sge.lkey = 
lkey ; 411 | struct ibv_send_wr wr, *bad; 412 | 413 | wr.wr_id = wr_id; 414 | wr.next = NULL; 415 | wr.sg_list = &sge; 416 | wr.num_sge = 1; 417 | wr.opcode = opcode; 418 | 419 | wr.send_flags = send_flags; 420 | wr.imm_data = imm_data; 421 | 422 | return ibv_post_send(this->qp, &wr, &bad); 423 | } 424 | 425 | inline int post_send(struct ibv_send_wr *wr) 426 | { 427 | struct ibv_send_wr *bad; 428 | return ibv_post_send(this->qp, wr, &bad); 429 | } 430 | 431 | 432 | }; 433 | -------------------------------------------------------------------------------- /remote_read_benchmark.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple code to measure Remote Read throughput for Farm/Mesh/Corm 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #include // std::cout 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | #include "thread/thread.hpp" 27 | FILE *log_fp; 28 | 29 | 30 | 31 | #include "worker/client_api.hpp" 32 | #include "rdma/connectRDMA.hpp" 33 | #include "utilities/zipf.hpp" 34 | #include "utilities/ycsb.hpp" 35 | 36 | #include "utilities/cxxopts.hpp" 37 | 38 | 39 | cxxopts::ParseResult 40 | parse(int argc, char* argv[]) 41 | { 42 | cxxopts::Options options(argv[0], "Remote Read benchmark for Farm/Mesh/Corm"); 43 | options 44 | .positional_help("[optional args]") 45 | .show_positional_help(); 46 | 47 | try 48 | { 49 | 50 | options.add_options() 51 | ("server", "Another address", cxxopts::value(), "IP") 52 | ("i,input", "input file", cxxopts::value()->default_value("test.bin"), "FILE") 53 | ("target", "expected rate ops/sec", cxxopts::value()->default_value(std::to_string(1000)), "N") 54 | ("rpc", "Use rpc reads") 55 | ("rdmaread", "Use one-sided reads") 56 | ("mesh", "Use mesh reads") 57 | ("farm", "Use farm reads") 58 | ("n,num", "Number of requests to run", cxxopts::value()->default_value("123"), "N") 59 | ("help", "Print help") 60 | ; 61 | 62 | auto result = options.parse(argc, argv); 63 | 64 | if (result.count("help")) 65 | { 66 | std::cout << options.help({""}) << std::endl; 67 | exit(0); 68 | } 69 | 70 | if (!result.count("server")) 71 | { 72 | throw cxxopts::OptionException("input must be specified"); 73 | } 74 | 75 | 76 | 77 | return result; 78 | 79 | } catch (const cxxopts::OptionException& e) 80 | { 81 | std::cout << "error parsing options: " << e.what() << std::endl; 82 | std::cout << options.help({""}) << std::endl; 83 | exit(1); 84 | } 85 | } 86 | 87 | 88 | 89 | 90 | 91 | 92 | int main(int argc, char* argv[]){ 93 | 94 | int seed = 3; 95 | 96 | auto allparams = parse(argc,argv); 97 | 98 | log_fp=stdout; 99 | 100 | std::string server = allparams["server"].as(); 101 | std::string input = allparams["input"].as(); 102 | uint64_t target = allparams["target"].as(); 103 | uint64_t num = allparams["num"].as(); 104 | 105 | ClientRDMA rdma((char*)server.c_str(),9999); 106 | struct rdma_cm_id * id = rdma.sendConnectRequest(); 107 | 108 | struct ibv_pd * pd = ClientRDMA::create_pd(id); 109 | 110 | struct ibv_qp_init_attr attr; 111 | struct rdma_conn_param conn_param; 112 | memset(&attr, 0, sizeof(attr)); 113 | attr.cap.max_send_wr = 32; 114 | attr.cap.max_recv_wr = 32; 115 | attr.cap.max_send_sge = 1; 116 | attr.cap.max_recv_sge = 1; 117 | attr.cap.max_inline_data = 0; 118 | 
attr.qp_type = IBV_QPT_RC; 119 | 120 | memset(&conn_param, 0 , sizeof(conn_param)); 121 | conn_param.responder_resources = 0; 122 | conn_param.initiator_depth = 5; 123 | conn_param.retry_count = 3; 124 | conn_param.rnr_retry_count = 3; 125 | 126 | VerbsEP* ep = ClientRDMA::connectEP(id, &attr, &conn_param, pd); 127 | 128 | printf("Connected\n"); 129 | sleep(1); 130 | 131 | RemoteMemoryClient* api = new RemoteMemoryClient(0,ep); 132 | 133 | std::fstream fout; 134 | fout.open(input.c_str(), std::ios::in|std::ios::binary); 135 | uint32_t NN = 0; 136 | 137 | fout.read((char*)&NN,sizeof(NN)); 138 | 139 | std::vector objects(NN); 140 | 141 | for(uint32_t i = 0; i < NN; i++){ 142 | LocalObjectHandler* obj = (LocalObjectHandler*)malloc(sizeof(LocalObjectHandler)); 143 | fout.read((char*)obj,sizeof(LocalObjectHandler)); 144 | // obj->print(); 145 | objects[i] = obj; 146 | } 147 | fout.close(); 148 | printf("Finished reading %u objects from file\n", NN); 149 | uint32_t size = objects[0]->requested_size; 150 | char* buffer = (char*)malloc(size); 151 | 152 | Trace *trace = new Uniform(seed,1.0,NN); 153 | 154 | using ReadFuncPtr = int (RemoteMemoryClient::*)( LocalObjectHandler* obj, char* buffer, uint32_t length ); 155 | 156 | ReadFuncPtr readfunc = nullptr; 157 | 158 | if(allparams.count("rdmaread")){ 159 | if(allparams.count("mesh")){ 160 | readfunc = &RemoteMemoryClient::ReadOneSidedFake; 161 | }else if(allparams.count("farm")){ 162 | readfunc = &RemoteMemoryClient::ReadOneSidedFarm; 163 | } 164 | else{ 165 | readfunc = &RemoteMemoryClient::ReadOneSided; 166 | } 167 | } else { 168 | if(allparams.count("mesh")){ 169 | readfunc = &RemoteMemoryClient::RpcFakeMesh; 170 | }else if(allparams.count("farm")){ 171 | readfunc = &RemoteMemoryClient::Read; 172 | } 173 | else{ 174 | readfunc = &RemoteMemoryClient::Read; 175 | } 176 | } 177 | 178 | std::chrono::seconds sec(1); 179 | 180 | uint64_t nanodelay = std::chrono::nanoseconds(sec).count() / target ; // per request 181 | auto starttime = std::chrono::high_resolution_clock::now(); 182 | 183 | uint32_t interval = 256; 184 | 185 | std::vector request_bw; 186 | request_bw.reserve(1024); 187 | 188 | auto bwt1 = std::chrono::high_resolution_clock::now(); 189 | uint32_t count = 0; 190 | auto req = trace->get_next(); 191 | for(uint64_t i=0; i*readfunc)(obj, buffer, size); 199 | assert(ret==0 && "one sided read failed"); 200 | 201 | count++; 202 | if(count > interval){ 203 | auto bwt2 = std::chrono::high_resolution_clock::now(); 204 | request_bw.push_back(std::chrono::duration_cast(bwt2 - bwt1).count()); 205 | bwt1 = bwt2; 206 | count=0; 207 | } 208 | 209 | auto const sleep_end_time = starttime + std::chrono::nanoseconds(nanodelay*i); 210 | while (std::chrono::high_resolution_clock::now() < sleep_end_time){ 211 | // nothing 212 | } 213 | } 214 | auto endtime = std::chrono::high_resolution_clock::now(); 215 | 216 | printf("throughput(Kreq/sec): "); 217 | for(auto &x : request_bw){ 218 | printf("%.2f ",(interval*1000.0)/x); 219 | } 220 | 221 | printf("\nFinished workload in %lu ms\n", std::chrono::duration_cast< std::chrono::milliseconds >( endtime - starttime ).count() ); 222 | 223 | return 0; 224 | } 225 | -------------------------------------------------------------------------------- /run_compaction.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # 5 | # CoRM: Compactable Remote Memory over RDMA 6 | # 7 | # Help functions to deploy CoRM 8 | # 9 | # Copyright (c) 2020-2021 ETH-Zurich. 
All rights reserved. 10 | # 11 | # Author(s): Konstantin Taranov 12 | # 13 | 14 | source core.sh 15 | trap 'echo -ne "Stop all servers..." && killAllProcesses && killCorm && echo "done" && exit 1' INT 16 | 17 | define HELP <<'EOF' 18 | 19 | Script for starting a compaction experiment. 20 | usage : $0 [options] 21 | options: --server=IP # file containing IP addressof the CoRM server 22 | --num=INT # the number of repetitions 23 | --dir=PATH #absolute path to CoRM 24 | EOF 25 | 26 | usage () { 27 | echo -e "$HELP" 28 | } 29 | 30 | bits=20 # block size in bits. it is hard-coded in CoRM's code 31 | num=10000 32 | server="" 33 | 34 | for arg in "$@" 35 | do 36 | case ${arg} in 37 | --help|-help|-h) 38 | usage 39 | exit 1 40 | ;; 41 | --server=*) 42 | server=`echo $arg | sed -e 's/--server=//'` 43 | server=`eval echo ${server}` # tilde and variable expansion 44 | ;; 45 | --num=*) 46 | num=`echo $arg | sed -e 's/--num=//'` 47 | num=`eval echo ${num}` # tilde and variable expansion 48 | ;; 49 | --dir=*) 50 | WORKDIR=`echo $arg | sed -e 's/--dir=//'` 51 | WORKDIR=`eval echo ${WORKDIR}` # tilde and variable expansion 52 | ;; 53 | esac 54 | done 55 | 56 | 57 | for ((thread=2;thread<=16;thread=thread*2)); do 58 | name="collection_${bits}_${thread}_24.txt" 59 | startCorm $server "--send_buf_size=65536 --threads=${thread} --recv_buf_size=4096 --num_recv_buf=256 --log_file=${WORKDIR}/$name" 60 | sleep 0.5 61 | ./compaction --server=$server --threads=${thread} --num=50 --size=24 --collection 62 | 63 | sleep 0.5 64 | stopCorm 65 | 66 | sleep 1.5 67 | killCorm 68 | done 69 | 70 | echo "----------Done--------------" 71 | 72 | 73 | for ((thread=2;thread<=16;thread=thread*2)); do 74 | name="compaction_${bits}_${thread}_24.txt" 75 | startCorm $server "--send_buf_size=65536 --threads=${thread} --recv_buf_size=4096 --num_recv_buf=256 --log_file=${WORKDIR}/$name" 76 | sleep 0.5 77 | ./compaction --server=$server --threads=${thread} --num=50 --size=24 --collection --compaction 78 | 79 | sleep 0.5 80 | stopCorm 81 | 82 | sleep 1.5 83 | killCorm 84 | done 85 | 86 | echo "----------Done--------------" 87 | -------------------------------------------------------------------------------- /run_latency.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # 5 | # CoRM: Compactable Remote Memory over RDMA 6 | # 7 | # Help functions to measure latency of CoRM 8 | # 9 | # Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 10 | # 11 | # Author(s): Konstantin Taranov 12 | # 13 | 14 | source core.sh 15 | trap 'echo -ne "Stop all servers..." 
&& killAllProcesses && killCorm && echo "done" && exit 1' INT 16 | 17 | define HELP <<'EOF' 18 | 19 | Script for measuring latency 20 | usage : $0 [options] 21 | options: --server=IP # file containing IP addressof the CoRM server 22 | --num=INT # the number of records to consume 23 | --dir=PATH #absolute path to corm 24 | EOF 25 | 26 | usage () { 27 | echo -e "$HELP" 28 | } 29 | 30 | 31 | num=10000 32 | server="" 33 | 34 | for arg in "$@" 35 | do 36 | case ${arg} in 37 | --help|-help|-h) 38 | usage 39 | exit 1 40 | ;; 41 | --server=*) 42 | server=`echo $arg | sed -e 's/--server=//'` 43 | server=`eval echo ${server}` # tilde and variable expansion 44 | ;; 45 | --num=*) 46 | num=`echo $arg | sed -e 's/--num=//'` 47 | num=`eval echo ${num}` # tilde and variable expansion 48 | ;; 49 | --dir=*) 50 | WORKDIR=`echo $arg | sed -e 's/--dir=//'` 51 | WORKDIR=`eval echo ${WORKDIR}` # tilde and variable expansion 52 | ;; 53 | esac 54 | done 55 | 56 | 57 | startCorm $server "--send_buf_size=65536 --threads=1 --recv_buf_size=4096 --num_recv_buf=256" 58 | 59 | sleep 0.5 60 | 61 | #real:16 user:8; 62 | #real:24 user:15; 63 | #real:32 user:24; 64 | #real:64 user:56; 65 | #real:128 user:118; 66 | #real:248 user:236; 67 | #real:504 user:488; 68 | #real:1016 user:992; 69 | #real:2040 user:2000; 70 | allSizes=(8 15 24 56 118 236 488 992 2000) 71 | 72 | echo "Starting latency test" 73 | for size in ${allSizes[@]}; do 74 | outputfilename=latency_${size}.txt 75 | runLatency $size $num $outputfilename 76 | done 77 | 78 | 79 | killCorm 80 | echo "----------Done--------------" 81 | -------------------------------------------------------------------------------- /run_read_throughput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # CoRM: Compactable Remote Memory over RDMA 5 | # 6 | # Help functions to deploy CoRM and read thoughput tests 7 | # 8 | # Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 9 | # 10 | # Author(s): Konstantin Taranov 11 | # 12 | 13 | source core.sh 14 | trap 'echo -ne "Stop all servers..." && killAllProcesses && killCorm && echo "done" && exit 1' INT 15 | 16 | define HELP <<'EOF' 17 | 18 | Script for starting a single consumer latency benchmark. 19 | The consumer is always launched locally. 
20 | usage : $0 [options] 21 | options: --server=IP # file containing IP addressof the CoRM server 22 | --num=INT # the number of records to consume 23 | --dir=PATH #absolute path to corm 24 | EOF 25 | 26 | usage () { 27 | echo -e "$HELP" 28 | } 29 | 30 | 31 | num=8000000 32 | size=24 33 | server="" 34 | 35 | for arg in "$@" 36 | do 37 | case ${arg} in 38 | --help|-help|-h) 39 | usage 40 | exit 1 41 | ;; 42 | --server=*) 43 | server=`echo $arg | sed -e 's/--server=//'` 44 | server=`eval echo ${server}` # tilde and variable expansion 45 | ;; 46 | --size=*) 47 | size=`echo $arg | sed -e 's/--size=//'` 48 | size=`eval echo ${size}` # tilde and variable expansion 49 | ;; 50 | --num=*) 51 | num=`echo $arg | sed -e 's/--num=//'` 52 | num=`eval echo ${num}` # tilde and variable expansion 53 | ;; 54 | --dir=*) 55 | WORKDIR=`echo $arg | sed -e 's/--dir=//'` 56 | WORKDIR=`eval echo ${WORKDIR}` # tilde and variable expansion 57 | ;; 58 | esac 59 | done 60 | 61 | 62 | startCorm $server "--send_buf_size=116384 --threads=1 --recv_buf_size=4096 --num_recv_buf=256" 63 | 64 | sleep 0.5 65 | 66 | loadCorm $size $num 67 | 68 | allFlags=("--rpc" "--rpc --farm" "--rpc --mesh" "--rdmaread" "--rdmaread --farm" "--rdmaread --mesh") 69 | 70 | 71 | echo "Starting throughput test" 72 | 73 | for flag in "${allFlags[@]}"; do 74 | suffix=${flag//[[:blank:]]/} 75 | outputfilename=meshworkload_${size}_${suffix}.txt 76 | echo ${outputfilename} 77 | ./remote_read_benchmark --server=${__corm_server} --target=2000000 --num=1000000 --seed=10 ${flag}> $outputfilename 78 | echo "--------------done $outputfilename" 79 | done 80 | 81 | 82 | killCorm 83 | echo "----------Done--------------" 84 | -------------------------------------------------------------------------------- /run_throughput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # 5 | # CoRM: Compactable Remote Memory over RDMA 6 | # 7 | # Help functions to deploy CoRM and run read/write benchmarks 8 | # 9 | # Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 10 | # 11 | # Author(s): Konstantin Taranov 12 | # 13 | 14 | source core.sh 15 | trap 'echo -ne "Stop all servers..." && killAllProcesses && killCorm && echo "done" && exit 1' INT 16 | 17 | define HELP <<'EOF' 18 | 19 | Script for running read/write benchmarks 20 | The client is always launched locally. 
21 | usage : $0 [options] 22 | options: --server=IP # file containing IP addressof the CoRM server 23 | --num=INT # the number of records to consume 24 | --dir=PATH #absolute path to corm 25 | EOF 26 | 27 | usage () { 28 | echo -e "$HELP" 29 | } 30 | 31 | 32 | num=8000000 33 | size=24 34 | server="" 35 | 36 | for arg in "$@" 37 | do 38 | case ${arg} in 39 | --help|-help|-h) 40 | usage 41 | exit 1 42 | ;; 43 | --server=*) 44 | server=`echo $arg | sed -e 's/--server=//'` 45 | server=`eval echo ${server}` # tilde and variable expansion 46 | ;; 47 | --size=*) 48 | size=`echo $arg | sed -e 's/--size=//'` 49 | size=`eval echo ${size}` # tilde and variable expansion 50 | ;; 51 | --num=*) 52 | num=`echo $arg | sed -e 's/--num=//'` 53 | num=`eval echo ${num}` # tilde and variable expansion 54 | ;; 55 | --dir=*) 56 | WORKDIR=`echo $arg | sed -e 's/--dir=//'` 57 | WORKDIR=`eval echo ${WORKDIR}` # tilde and variable expansion 58 | ;; 59 | esac 60 | done 61 | 62 | 63 | startCorm $server "--send_buf_size=116384 --threads=8 --recv_buf_size=4096 --num_recv_buf=256" 64 | 65 | sleep 0.5 66 | 67 | loadCorm $size $num 68 | 69 | 70 | allReadProb=(0.5) #1.0 71 | allFlags=("" "--zipf" "--rdmaread" "--zipf --rdmaread") 72 | allThreads=() #1 2 4 73 | 74 | #allReadProb=(1.0) 75 | #declare -a allFlags=("" "--zipf") 76 | #allThreads=(1) 77 | 78 | echo "Starting throughput test" 79 | for th in ${allThreads[@]}; do 80 | continue 81 | for rp in ${allReadProb[@]}; do 82 | for flag in "${allFlags[@]}"; do 83 | suffix=${flag//[[:blank:]]/} 84 | outputfilename=workload_1_1_${th}_${rp}_${suffix}.txt 85 | ./workload_readwrite --server=${__corm_server} --target=2000000 --prob=${rp} --num=1000000 --threads=$th --seed=10 ${flag}> $outputfilename 86 | echo "--------------done $outputfilename" 87 | done 88 | done 89 | done 90 | 91 | nodes=("192.168.1.72" "192.168.1.73" "192.168.1.74" "192.168.1.75" "192.168.1.76") 92 | echo "Starting throughput test woth multiple nodes" 93 | echo "The clients will be deployed at ${nodes[@]}" 94 | 95 | for rp in ${allReadProb[@]}; do 96 | for flag in "${allFlags[@]}"; do 97 | for ((total=2;total<=4;++total)); do 98 | seed=111 99 | for ((client=1;client $outputfilename 109 | sleep 4 110 | echo "------------------done $outputfilename" 111 | killAllProcesses # for debugging 112 | # break #for debugging 113 | done 114 | # break 115 | done 116 | # break 117 | done 118 | 119 | 120 | 121 | 122 | 123 | killCorm 124 | echo "----------Done--------------" 125 | -------------------------------------------------------------------------------- /run_throughput_compaction.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # 5 | # CoRM: Compactable Remote Memory over RDMA 6 | # 7 | # Help functions to deploy CoRM and do a compaction experiments 8 | # 9 | # Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 10 | # 11 | # Author(s): Konstantin Taranov 12 | # 13 | 14 | source core.sh 15 | trap 'echo -ne "Stop all servers..." 
&& killAllProcesses && killCorm && echo "done" && exit 1' INT 16 | 17 | define HELP <<'EOF' 18 | 19 | Script for measuring compaction latency 20 | usage : $0 [options] 21 | options: --server=IP # file containing IP addressof the CoRM server 22 | --num=INT # the number of records to consume 23 | --dir=PATH #absolute path to corm 24 | EOF 25 | 26 | usage () { 27 | echo -e "$HELP" 28 | } 29 | 30 | 31 | num=8000000 32 | size=24 33 | server="" 34 | 35 | for arg in "$@" 36 | do 37 | case ${arg} in 38 | --help|-help|-h) 39 | usage 40 | exit 1 41 | ;; 42 | --server=*) 43 | server=`echo $arg | sed -e 's/--server=//'` 44 | server=`eval echo ${server}` # tilde and variable expansion 45 | ;; 46 | --size=*) 47 | size=`echo $arg | sed -e 's/--size=//'` 48 | size=`eval echo ${size}` # tilde and variable expansion 49 | ;; 50 | --num=*) 51 | num=`echo $arg | sed -e 's/--num=//'` 52 | num=`eval echo ${num}` # tilde and variable expansion 53 | ;; 54 | --dir=*) 55 | WORKDIR=`echo $arg | sed -e 's/--dir=//'` 56 | WORKDIR=`eval echo ${WORKDIR}` # tilde and variable expansion 57 | ;; 58 | esac 59 | done 60 | 61 | 62 | 63 | allReadProb=(0.5 1.0 0.95) #1.0 64 | allFlags=("" "--zipf" "--rdmaread" "--zipf --rdmaread") 65 | 66 | 67 | nodes=("192.168.1.72" "192.168.1.73" "192.168.1.74" "192.168.1.75" "192.168.1.76") 68 | echo "Starting throughput test woth multiple nodes" 69 | echo "The clients will be deployed at ${nodes[@]}" 70 | 71 | for rp in ${allReadProb[@]}; do 72 | for flag in "${allFlags[@]}"; do 73 | for ((total=1;total<=4;++total)); do 74 | seed=111 75 | startCorm $server "--send_buf_size=116384 --threads=8 --recv_buf_size=4096 --num_recv_buf=256" 76 | sleep 0.5 77 | loadCorm $size $num 78 | echo "loading is done" 79 | sleep 0.5 80 | unloadCorm "$(echo $num*0.8/1 | bc)" 81 | echo "unloading is done" 82 | sleep 0.5 83 | for ((client=0;client 9 | * 10 | */ 11 | #pragma once 12 | 13 | #define PAYLOAD_SIZE 7 // no more than 7 pointers as a payload. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | 25 | namespace mts 26 | { 27 | 28 | typedef void (*thread_delay_cb)( int revents, void *arg); 29 | 30 | struct thread_msg_t{ 31 | typedef void (*thread_msg_cb)(thread_msg_t *msg); 32 | thread_msg_cb cb; 33 | void* payload[PAYLOAD_SIZE]; // PAYLOAD_SIZE Can be avoided by using circular memory pool for objects. 
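// Note (added remark): cb (one function pointer) plus the 7 pointer-sized payload slots make the message exactly 64 bytes on a 64-bit build, i.e. one cache line, as checked by the static_assert below.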
34 | thread_msg_t() {} 35 | }; 36 | static_assert(sizeof(thread_msg_t) == 64, "size of thread_msg_t is incorrect"); 37 | 38 | thread_local uint8_t thread_id; 39 | 40 | uint32_t num_threads; 41 | std::mutex reg_mutex; 42 | uint32_t reged_threads; 43 | 44 | std::vector*> msg_queues; 45 | std::vector notifies; 46 | std::vector loops; 47 | std::vector workers; 48 | 49 | 50 | void* GetWorker(uint32_t id){ 51 | assert(reged_threads==num_threads); 52 | return workers[id]; 53 | } 54 | 55 | void SetWorker(uint32_t id, void* worker){ 56 | workers[id] = worker; 57 | } 58 | 59 | 60 | void send_msg_to_thread(uint32_t dst, thread_msg_t* msg){ 61 | assert(reged_threads==num_threads); 62 | msg_queues[dst]->push(msg); 63 | } 64 | 65 | 66 | 67 | void send_msg_to_thread_and_notify(uint32_t dst, thread_msg_t* msg){ 68 | assert(reged_threads==num_threads); 69 | msg_queues[dst]->push(msg); 70 | notifies[dst]->send(); 71 | } 72 | 73 | 74 | bool poll_receive(uint32_t thread_id, thread_msg_t** msg){ 75 | return msg_queues[thread_id]->pop(*msg); 76 | } 77 | 78 | 79 | void setup_threads(uint32_t _num_threads){ 80 | num_threads = _num_threads; 81 | reged_threads = 0; 82 | msg_queues.resize(num_threads); 83 | notifies.resize(num_threads); 84 | loops.resize(num_threads); 85 | workers.resize(num_threads); 86 | }; 87 | 88 | } -------------------------------------------------------------------------------- /thread/thread.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A modifier of a thread to support io events 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #pragma once 12 | 13 | #include "../worker/generic_worker.hpp" 14 | #include "messenger.hpp" 15 | #include 16 | #include 17 | #include "../utilities/debug.h" 18 | 19 | #define NOW 0.000000001 20 | 21 | 22 | class Thread: public IOWatcher{ 23 | struct my_io: public ev::io{ 24 | my_io(uint32_t id, io_cb cb, void *ctx): id(id), cb(cb), ctx(ctx){ 25 | //empty 26 | } 27 | uint32_t id; 28 | io_cb cb; 29 | void *ctx; 30 | }; 31 | // events 32 | ev::async stopper; // for termination 33 | ev::idle main_event; // main event which is called when is not busy 34 | ev::timer timer_event; // call something time to time 35 | ev::dynamic_loop loop; // loop of the thread 36 | 37 | 38 | std::map io_events; // io events 39 | uint32_t current_io_id; 40 | 41 | ev::async notify; // for notifying about incoming messeges 42 | 43 | 44 | 45 | const int queue_size = 100; 46 | const uint32_t _thread_id; 47 | 48 | boost::lockfree::queue *_msg_queue; 49 | GenericWorker* const _worker; 50 | 51 | std::thread the_thread; 52 | 53 | 54 | 55 | public: 56 | Thread(uint32_t id, GenericWorker *w); 57 | 58 | 59 | ~Thread(){ 60 | text(log_fp,"\t[Thread] Try to destroy worker(%u)\n",_thread_id); 61 | delete _msg_queue; 62 | delete _worker; 63 | 64 | text(log_fp,"\t\t[Thread] Worker is destroyed\n"); 65 | } 66 | 67 | void Start() { 68 | this->the_thread = std::thread(&Thread::main_method,this); 69 | } 70 | 71 | uint32_t GetId() const { 72 | return _thread_id; 73 | }; 74 | 75 | 76 | boost::lockfree::queue* GetMsgQueue() const{ 77 | return _msg_queue; 78 | }; 79 | 80 | ev::async* GetNotify(){ 81 | return &(this->notify); 82 | }; 83 | 84 | ev::dynamic_loop* GetLoop(){ 85 | return &(this->loop); 86 | }; 87 | 88 | 89 | void main_cb (ev::idle &w, int revents){ 90 | 91 | _worker->main_cb(); 92 | //this->poll_message_cb(); 93 | } 94 | 
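// main_method below is the body of the spawned thread: it registers the per-thread libev watchers (an async "stopper" for termination, an async "notify" for incoming cross-thread messages, a coarse periodic timer driving sometimes_cb, and an idle watcher that keeps invoking the worker's main_cb), records the thread id in thread-local storage, and then runs the event loop until Stop() breaks it.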
95 | 96 | void main_method(){ 97 | // create async stopper for terminating the tread 98 | 99 | 100 | this->stopper.set(this->loop); 101 | this->stopper.set(this); 102 | this->stopper.priority = EV_MAXPRI-1; 103 | this->stopper.start(); 104 | 105 | 106 | this->notify.set(this->loop); 107 | this->notify.set(this); 108 | this->notify.priority = EV_MAXPRI-1; 109 | this->notify.start(); 110 | 111 | 112 | 113 | this->timer_event.set(this->loop); 114 | this->timer_event.set(this); 115 | this->timer_event.set(10,50); // after 10 repeat 50 116 | this->timer_event.priority = EV_MAXPRI-1; 117 | this->timer_event.start(10,50); // after 10 repeat 50 118 | 119 | 120 | 121 | this->main_event.set(this->loop); 122 | this->main_event.set(this); 123 | this->main_event.priority = EV_MAXPRI; 124 | this->main_event.start(); 125 | 126 | mts::thread_id = _thread_id; 127 | 128 | this->loop.run(0); 129 | 130 | } 131 | 132 | 133 | void timer_cb (ev::timer &w, int revents){ 134 | _worker->sometimes_cb(); 135 | 136 | w.repeat = 5; // repeat after 5 137 | w.again(); 138 | } 139 | 140 | void Stop(){ 141 | text(log_fp,"[Thread] Try stopping %d \n", this->GetId() ); 142 | this->stopper.send(); 143 | if(this->the_thread.joinable()){ 144 | this->the_thread.join(); 145 | } 146 | 147 | } 148 | 149 | 150 | void terminate_cb() { 151 | 152 | 153 | for (auto &pair: io_events) 154 | { 155 | pair.second->stop(); 156 | delete pair.second; 157 | } 158 | 159 | 160 | this->stopper.stop(); 161 | this->timer_event.stop(); 162 | this->main_event.stop(); 163 | this->notify.stop(); 164 | 165 | this->loop.break_loop(ev::ALL); 166 | text(log_fp,"[Thread] Thread(%d) is terminated\n", this->GetId() ); 167 | 168 | // print stats 169 | _worker->print_stats(); 170 | 171 | } 172 | 173 | 174 | void poll_message_cb() { 175 | // text(log_fp, "Poll Message %d \n", this->GetId() ); 176 | bool found = true; 177 | while(found){ 178 | 179 | mts::thread_msg_t* message = nullptr; 180 | this->_msg_queue->pop(message); 181 | found = (message!=nullptr); 182 | if(found){ 183 | //mts::thread_msg_cb cb = (mts::thread_msg_cb)message->cb; 184 | message->cb(message); 185 | } else { 186 | // text(log_fp,"No message %d \n", this->GetId() ); 187 | } 188 | } 189 | 190 | } 191 | 192 | void io_process (ev::io &w, int revents){ 193 | my_io& new_d = static_cast(w); 194 | text(log_fp,"filed = %d\n", new_d.fd); 195 | new_d.cb(new_d.id, new_d.ctx); 196 | } 197 | 198 | void install_io(int fd, io_cb cb, void* ctx ) override{ 199 | my_io *io = new my_io(current_io_id, cb,ctx); 200 | io_events[current_io_id] = io; 201 | current_io_id++; 202 | io->set(this->loop); 203 | io->set(this); 204 | io->start(fd, ev::READ); 205 | } 206 | 207 | void stop_io(uint32_t io_id) override{ 208 | auto it = io_events.find(io_id); 209 | assert(it!=io_events.end()); 210 | delete it->second; 211 | io_events.erase(it); 212 | } 213 | 214 | private: 215 | Thread(const Thread &) = delete; 216 | void operator=(const Thread &) = delete; 217 | 218 | }; 219 | 220 | 221 | 222 | namespace mts{ 223 | void RegisterThread(Thread *t){ 224 | reg_mutex.lock(); 225 | 226 | reged_threads++; 227 | msg_queues[t->GetId()] = t->GetMsgQueue(); 228 | notifies[t->GetId()] = t->GetNotify(); 229 | loops[t->GetId()] = t->GetLoop(); 230 | 231 | reg_mutex.unlock(); 232 | } 233 | 234 | void SCHEDULE_CALLBACK(int thread_id, ev_tstamp time, thread_delay_cb cb, void *arg) { 235 | ev_once (*loops[thread_id], 0, 0, time, cb, arg); 236 | } 237 | 238 | } 239 | 240 | 241 | Thread::Thread(uint32_t id, GenericWorker *w): loop(ev::AUTO), 
_thread_id(id), _worker(w) 242 | { 243 | //empty 244 | this->_msg_queue = new boost::lockfree::queue(queue_size); 245 | mts::RegisterThread(this); 246 | 247 | } 248 | 249 | 250 | 251 | 252 | // it helps to launch all threads. 253 | class LauncherMaster{ 254 | 255 | 256 | public: 257 | 258 | LauncherMaster (uint32_t tot_threads): current_thread_id(0), num_threads(tot_threads) 259 | { 260 | 261 | instance = this; 262 | std::signal(SIGINT, LauncherMaster::signal_handler); 263 | 264 | mts::setup_threads(tot_threads); 265 | } 266 | 267 | ~LauncherMaster(){ 268 | 269 | text(log_fp,"[LauncherMaster] Try to destroy all threads\n"); 270 | 271 | for (auto &iter: threads) 272 | { 273 | delete iter; 274 | } 275 | 276 | 277 | text(log_fp,"\t[LauncherMaster] All threads are destroyed\n"); 278 | 279 | } 280 | 281 | 282 | uint32_t add_worker(GenericWorker *worker){ 283 | assert(current_thread_idset_thread_id(current_thread_id); 287 | worker->set_io_watcher(t); 288 | 289 | return (current_thread_id++); 290 | } 291 | 292 | void launch(){ 293 | 294 | for (uint32_t i=1; i < current_thread_id; ++i) { 295 | text(log_fp,"\t[LauncherMaster] start thread %u\n",i); 296 | threads[i]->Start(); 297 | } 298 | text(log_fp,"\t[LauncherMaster] start thread 0\n"); 299 | 300 | threads[0]->main_method(); 301 | } 302 | 303 | 304 | private: 305 | 306 | void handler_wraper (int signum) 307 | { 308 | 309 | text(log_fp," Signal %d detected \n",signum); 310 | // handling code 311 | 312 | for (uint32_t i=0; i Stop(); 314 | } 315 | text(log_fp," All threads are stopped\n"); 316 | 317 | } 318 | 319 | 320 | std::vector threads; 321 | uint32_t current_thread_id; 322 | const uint32_t num_threads; 323 | 324 | 325 | 326 | static LauncherMaster* instance; 327 | 328 | static void signal_handler(int signum) 329 | { 330 | instance->handler_wraper(signum); 331 | } 332 | }; 333 | 334 | LauncherMaster* LauncherMaster::instance = nullptr; 335 | -------------------------------------------------------------------------------- /unload.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A simple code to partially unload data from CoRM to have fragmentation. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #include // std::cout 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "thread/thread.hpp" 25 | FILE *log_fp; 26 | 27 | #include "worker/client_api.hpp" 28 | #include "rdma/connectRDMA.hpp" 29 | 30 | #include "utilities/cxxopts.hpp" 31 | 32 | 33 | cxxopts::ParseResult 34 | parse(int argc, char* argv[]) 35 | { 36 | cxxopts::Options options(argv[0], "unload random objects from CoRM"); 37 | options 38 | .positional_help("[optional args]") 39 | .show_positional_help(); 40 | 41 | try 42 | { 43 | 44 | options.add_options() 45 | ("server", "Another address", cxxopts::value(), "IP") 46 | ("i,input", "input file", cxxopts::value()->default_value("test.bin"), "FILE") 47 | ("n,num", "Number of objects to deallocate", cxxopts::value()->default_value("123"), "N") 48 | ("help", "Print help") 49 | ; 50 | 51 | auto result = options.parse(argc, argv); 52 | 53 | if (result.count("help")) 54 | { 55 | std::cout << options.help({""}) << std::endl; 56 | exit(0); 57 | } 58 | 59 | if (!result.count("server")) 60 | { 61 | throw cxxopts::OptionException("input must be specified"); 62 | } 63 | 64 | 65 | 66 | return result; 67 | 68 | } catch (const cxxopts::OptionException& e) 69 | { 70 | std::cout << "error parsing options: " << e.what() << std::endl; 71 | std::cout << options.help({""}) << std::endl; 72 | exit(1); 73 | } 74 | } 75 | 76 | int main(int argc, char* argv[]){ 77 | 78 | 79 | auto allparams = parse(argc,argv); 80 | 81 | log_fp=stdout; 82 | 83 | std::string server = allparams["server"].as(); 84 | 85 | uint32_t todelete = allparams["num"].as(); 86 | std::string input = allparams["input"].as(); 87 | 88 | ClientRDMA rdma((char*)server.c_str(),9999); 89 | struct rdma_cm_id * id = rdma.sendConnectRequest(); 90 | 91 | struct ibv_pd * pd = ClientRDMA::create_pd(id); 92 | 93 | struct ibv_qp_init_attr attr; 94 | struct rdma_conn_param conn_param; 95 | memset(&attr, 0, sizeof(attr)); 96 | attr.cap.max_send_wr = 32; 97 | attr.cap.max_recv_wr = 32; 98 | attr.cap.max_send_sge = 1; 99 | attr.cap.max_recv_sge = 1; 100 | attr.cap.max_inline_data = 0; 101 | attr.qp_type = IBV_QPT_RC; 102 | 103 | memset(&conn_param, 0 , sizeof(conn_param)); 104 | conn_param.responder_resources = 0; 105 | conn_param.initiator_depth = 5; 106 | conn_param.retry_count = 3; 107 | conn_param.rnr_retry_count = 3; 108 | 109 | 110 | VerbsEP* ep = ClientRDMA::connectEP(id, &attr, &conn_param, pd); 111 | 112 | printf("Connected\n"); 113 | sleep(1); 114 | 115 | RemoteMemoryClient* api = new RemoteMemoryClient(0,ep); 116 | 117 | 118 | 119 | std::fstream fout; 120 | fout.open(input.c_str(), std::ios::in|std::ios::binary); 121 | uint32_t NN = 0; 122 | 123 | fout.read((char*)&NN,sizeof(NN)); 124 | 125 | std::vector objects(NN); 126 | 127 | for(uint32_t i = 0; i < NN; i++){ 128 | LocalObjectHandler* obj = (LocalObjectHandler*)malloc(sizeof(LocalObjectHandler)); 129 | fout.read((char*)obj,sizeof(LocalObjectHandler)); 130 | // obj->print(); 131 | objects[i] = obj; 132 | } 133 | fout.close(); 134 | 135 | 136 | if(NN != todelete){ 137 | std::random_shuffle(objects.begin(), objects.end()); 138 | } 139 | 140 | for(uint32_t i = 0; i < todelete; i++){ 141 | api->Free(objects[i]); 142 | free(objects[i]); 143 | } 144 | 145 | 146 | if(NN != todelete){ 147 | uint32_t rest = NN-todelete; 148 | std::fstream fout; 149 | fout.open(input.c_str(), 
std::ios::trunc|std::ios::out|std::ios::binary); 150 | 151 | fout.write((char*)&rest,sizeof(rest)); 152 | 153 | for(uint32_t i = 0; i < rest; i++){ 154 | fout.write((char*)objects[todelete+i],sizeof(LocalObjectHandler)); 155 | } 156 | 157 | fout.close(); 158 | 159 | printf("Objects' keys are written to file %s\n", input.c_str()); 160 | } else { 161 | std::remove(input.c_str()); 162 | } 163 | 164 | 165 | return 0; 166 | } 167 | -------------------------------------------------------------------------------- /utilities/block_home_table.h: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A class for mapping a block to a thread allocator that owns that block. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include "../alloc/alloc.hpp" 15 | #include 16 | #include 17 | #include "rcu.h" 18 | 19 | class BlockHomeTable { 20 | public: 21 | BlockHomeTable(); 22 | ~BlockHomeTable(); 23 | 24 | ThreadAlloc *Lookup(addr_t addr); 25 | void Insert(int thread_id, addr_t addr, ThreadAlloc *alloc); 26 | bool Update(addr_t addr, ThreadAlloc *alloc); 27 | void Remove(addr_t addr); 28 | 29 | protected: 30 | struct KeyValuePair { 31 | addr_t addr; 32 | ThreadAlloc *alloc; 33 | }; 34 | 35 | static const size_t BucketBytes = CACHELINE; 36 | static const size_t ItemsInBucket = BucketBytes / sizeof(KeyValuePair); 37 | 38 | static const size_t InitialBucketLog = 15; 39 | static const size_t InitialBucketCount = 1 << InitialBucketLog; 40 | 41 | struct Bucket { 42 | KeyValuePair items[ItemsInBucket]; 43 | }; 44 | 45 | static_assert(sizeof(Bucket) == BucketBytes, "Bucket of unexpected size"); 46 | 47 | struct Header { 48 | size_t buckets; 49 | uint8_t pad[BucketBytes - 50 | sizeof(size_t) ]; 51 | }; 52 | 53 | static_assert(sizeof(Header) == BucketBytes, "Header of unexpected size"); 54 | 55 | ThreadAlloc *LookupIn(Bucket *bucket_array, size_t bucket_count, 56 | uint64_t (*hash_fun)(addr_t addr), addr_t addr); 57 | ThreadAlloc *LookupIn(Bucket *bucket, addr_t addr); 58 | 59 | bool UpdateIn(ThreadAlloc *newalloc, Bucket *bucket, addr_t addr); 60 | 61 | 62 | bool InsertInto(Header *hdr, addr_t addr, ThreadAlloc *alloc); 63 | bool InsertInto(Bucket *bucket_array, size_t bucket_count, 64 | uint64_t (*hash_fun)(addr_t addr), addr_t addr, ThreadAlloc *alloc); 65 | bool InsertInto(Bucket *bucket, addr_t addr, ThreadAlloc *alloc); 66 | 67 | bool RemoveFrom(Bucket *array, addr_t addr); 68 | 69 | void Grow(int thread_id); 70 | bool CopyItems(Header *new_hdr, Header *old_hdr); 71 | bool CopyBucket(Header *new_hdr, Bucket *bucket); 72 | 73 | static void AllFreeCb(void *owner); 74 | 75 | static void *AllocMemory(size_t size); 76 | static void FreeMemory(void *ptr); 77 | 78 | protected: 79 | 80 | // a pointer to the table 81 | std::atomic
table; 82 | }; 83 | 84 | // we use three hash functions for geting raget bucket in the hash table. 85 | inline uint64_t hash_1(addr_t addr) { 86 | return addr >> BLOCK_BIT_SIZE; 87 | } 88 | 89 | inline uint64_t hash_2(addr_t addr) { 90 | uint64_t key = addr >> BLOCK_BIT_SIZE; 91 | key ^= key >> 33; 92 | key *= 0xff51afd7ed558ccdull; 93 | key ^= key >> 33; 94 | key *= 0xc4ceb9fe1a85ec53ull; 95 | key ^= key >> 33; 96 | return key; 97 | } 98 | 99 | inline uint64_t hash_3(addr_t addr) { 100 | uint64_t key = addr >> BLOCK_BIT_SIZE; 101 | key *= 0xc6a4a7935bd1e995ULL; 102 | key ^= key >> 47; 103 | key *= 0xc6a4a7935bd1e995ULL; 104 | return key; 105 | } 106 | 107 | inline uint64_t index(uint64_t hash, uint64_t buckets) { 108 | return hash & (buckets - 1); 109 | } 110 | 111 | BlockHomeTable::BlockHomeTable() { 112 | size_t size = InitialBucketCount * sizeof(Bucket) + sizeof(Header); 113 | Header *hdr = (Header *)AllocMemory(size); 114 | hdr->buckets = InitialBucketCount; 115 | table.store(hdr); 116 | } 117 | 118 | BlockHomeTable::~BlockHomeTable() { 119 | FreeMemory(table.load()); 120 | } 121 | 122 | ThreadAlloc *BlockHomeTable::Lookup(addr_t addr) { 123 | ThreadAlloc *ret = nullptr; 124 | Header *hdr = table.load(std::memory_order_acquire); 125 | Bucket *bucket_array = (Bucket *)(hdr + 1); 126 | Bucket *bucket = nullptr; 127 | 128 | bucket = bucket_array + index(hash_1(addr), hdr->buckets); 129 | if((ret = LookupIn(bucket, addr)) != nullptr) { 130 | return ret; 131 | } 132 | bucket = bucket_array + index(hash_2(addr), hdr->buckets); 133 | if((ret = LookupIn(bucket, addr)) != nullptr) { 134 | return ret; 135 | } 136 | bucket = bucket_array + index(hash_3(addr), hdr->buckets); 137 | if((ret = LookupIn(bucket, addr)) != nullptr) { 138 | return ret; 139 | } 140 | 141 | // may return nullptr during compaction 142 | return nullptr; 143 | } 144 | 145 | ThreadAlloc *BlockHomeTable::LookupIn(Bucket *bucket, addr_t addr) { 146 | 147 | for(unsigned i = 0;i < ItemsInBucket;i++) { 148 | if(bucket->items[i].addr == addr) { 149 | std::atomic *addr_ptr = (std::atomic *)&bucket->items[i].addr; 150 | if(addr_ptr->load(std::memory_order_acquire) == addr) { 151 | return bucket->items[i].alloc; 152 | } 153 | } 154 | } 155 | 156 | return nullptr; 157 | } 158 | 159 | 160 | bool BlockHomeTable::Update(addr_t addr, ThreadAlloc *newalloc ) { 161 | Header *hdr = table.load(std::memory_order_acquire); 162 | Bucket *bucket_array = (Bucket *)(hdr + 1); 163 | Bucket *bucket = nullptr; 164 | 165 | bucket = bucket_array + index(hash_1(addr), hdr->buckets); 166 | if(UpdateIn(newalloc, bucket, addr)) { 167 | return true; 168 | } 169 | bucket = bucket_array + index(hash_2(addr), hdr->buckets); 170 | if(UpdateIn(newalloc, bucket, addr)) { 171 | return true; 172 | } 173 | bucket = bucket_array + index(hash_3(addr), hdr->buckets); 174 | if(UpdateIn(newalloc, bucket, addr)) { 175 | return true; 176 | } 177 | 178 | // may return nullptr during compaction 179 | return false; 180 | } 181 | 182 | bool BlockHomeTable::UpdateIn(ThreadAlloc *newalloc, Bucket *bucket, addr_t addr) { 183 | 184 | for(unsigned i = 0;i < ItemsInBucket;i++) { 185 | if(bucket->items[i].addr == addr) { 186 | std::atomic *addr_ptr = (std::atomic *)&bucket->items[i].addr; 187 | 188 | if(addr_ptr->load(std::memory_order_acquire) == addr) { 189 | bucket->items[i].alloc = newalloc; 190 | return true; 191 | } 192 | } 193 | } 194 | 195 | return false; 196 | } 197 | 198 | 199 | 200 | void BlockHomeTable::Insert(int thread_id, addr_t addr, ThreadAlloc *alloc) { 201 | 
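// Try to place the block address into one of the three candidate buckets chosen by hash_1/hash_2/hash_3; if all three are full, grow (double) the table and retry. The old table is reclaimed only after every thread has drained via BroadcastDrain (see Grow()).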
while(!InsertInto(table.load(std::memory_order_relaxed), addr, alloc)) { 202 | printf("[BlockHomeTable] grow block table event\n"); 203 | Grow(thread_id); 204 | } 205 | } 206 | 207 | bool BlockHomeTable::InsertInto(Header *hdr, addr_t addr, ThreadAlloc *alloc) { 208 | Bucket *bucket_array = (Bucket *)(hdr + 1); 209 | Bucket *bucket = nullptr; 210 | 211 | bucket = bucket_array + index(hash_1(addr), hdr->buckets); 212 | if(InsertInto(bucket, addr, alloc)) { 213 | return true; 214 | } 215 | bucket = bucket_array + index(hash_2(addr), hdr->buckets); 216 | if(InsertInto(bucket, addr, alloc)) { 217 | return true; 218 | } 219 | bucket = bucket_array + index(hash_3(addr), hdr->buckets); 220 | if(InsertInto(bucket, addr, alloc)) { 221 | return true; 222 | } 223 | 224 | return false; 225 | } 226 | 227 | bool BlockHomeTable::InsertInto(Bucket *bucket, addr_t addr, ThreadAlloc *alloc) { 228 | for(unsigned i = 0;i < ItemsInBucket;i++) { 229 | if(bucket->items[i].addr == 0) { 230 | bucket->items[i].alloc = alloc; 231 | std::atomic *addr_ptr = (std::atomic *)&bucket->items[i].addr; 232 | addr_ptr->store(addr, std::memory_order_release); 233 | return true; 234 | } 235 | 236 | assert(bucket->items[i].addr != addr); 237 | } 238 | 239 | return false; 240 | } 241 | 242 | void BlockHomeTable::Remove(addr_t addr) { 243 | Header *hdr = table.load(std::memory_order_relaxed); 244 | Bucket *bucket_array = (Bucket *)(hdr + 1); 245 | Bucket *bucket = nullptr; 246 | 247 | bucket = bucket_array + index(hash_1(addr), hdr->buckets); 248 | if(RemoveFrom(bucket, addr)) { 249 | return; 250 | } 251 | bucket = bucket_array + index(hash_2(addr), hdr->buckets); 252 | if(RemoveFrom(bucket, addr)) { 253 | return; 254 | } 255 | bucket = bucket_array + index(hash_3(addr), hdr->buckets); 256 | if(RemoveFrom(bucket, addr)) { 257 | return; 258 | } 259 | 260 | assert(0); 261 | } 262 | 263 | bool BlockHomeTable::RemoveFrom(Bucket *bucket, addr_t addr) { 264 | for(unsigned i = 0;i < ItemsInBucket;i++) { 265 | if(bucket->items[i].addr == addr) { 266 | std::atomic *addr_ptr = (std::atomic *)&bucket->items[i].addr; 267 | addr_ptr->store(0, std::memory_order_release); 268 | return true; 269 | } 270 | } 271 | 272 | return false; 273 | } 274 | 275 | void BlockHomeTable::Grow(int thread_id) { 276 | Header *hdr = table.load(std::memory_order_relaxed); 277 | size_t new_buckets = hdr->buckets; 278 | Header *new_hdr; 279 | 280 | while(true) { 281 | new_buckets <<= 1; 282 | size_t new_size = new_buckets * sizeof(Bucket) + sizeof(Header); 283 | 284 | new_hdr = (Header *)AllocMemory(new_size); 285 | new_hdr->buckets = new_buckets; 286 | 287 | if(CopyItems(new_hdr, hdr)) { 288 | break; 289 | } 290 | 291 | FreeMemory(new_hdr); 292 | } 293 | 294 | table.store(new_hdr, std::memory_order_release); 295 | 296 | BroadcastDrain::Drain(thread_id, AllFreeCb, hdr); 297 | } 298 | 299 | bool BlockHomeTable::CopyItems(Header *new_hdr, Header *old_hdr) { 300 | size_t old_bucket_count = old_hdr->buckets; 301 | Bucket *old_buckets = (Bucket *)(old_hdr + 1); 302 | 303 | for(unsigned i = 0;i < old_bucket_count;i++) { 304 | if(!CopyBucket(new_hdr, old_buckets + i)) { 305 | return false; 306 | } 307 | } 308 | 309 | return true; 310 | } 311 | 312 | bool BlockHomeTable::CopyBucket(Header *new_hdr, Bucket *bucket) { 313 | for(unsigned i = 0;i < ItemsInBucket;i++) { 314 | ThreadAlloc *alloc = bucket->items[i].alloc; 315 | 316 | if(alloc != nullptr) { 317 | addr_t addr = bucket->items[i].addr; 318 | 319 | if(!InsertInto(new_hdr, addr, alloc)) { 320 | return false; 321 | } 322 | 
} 323 | } 324 | 325 | return true; 326 | } 327 | 328 | void BlockHomeTable::AllFreeCb(void *owner) { 329 | FreeMemory(owner); 330 | } 331 | 332 | void *BlockHomeTable::AllocMemory(size_t size) { 333 | void *ret = aligned_alloc(CACHELINE, size); 334 | memset(ret, 0, size); 335 | return ret; 336 | } 337 | 338 | void BlockHomeTable::FreeMemory(void *ptr) { 339 | free(ptr); 340 | } 341 | 342 | -------------------------------------------------------------------------------- /utilities/debug.h: -------------------------------------------------------------------------------- 1 | /** 2 | * ring (Direct Access REplication) 3 | * 4 | * Debugging and logging utilities 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Marius Poke 9 | * 10 | */ 11 | 12 | #ifndef DEBUG_H_ 13 | #define DEBUG_H_ 14 | #include 15 | #include 16 | #include 17 | #define __STDC_FORMAT_MACROS 18 | #include 19 | #include 20 | 21 | //#define DEBUG 22 | //extern struct timeval prev_tv; 23 | //extern uint64_t jump_cnt; 24 | 25 | #define info(stream, fmt, ...) do {\ 26 | fprintf(stream, fmt, ##__VA_ARGS__); \ 27 | fflush(stream); \ 28 | } while(0) 29 | #define info_wtime(stream, fmt, ...) do {\ 30 | struct timeval _debug_tv;\ 31 | gettimeofday(&_debug_tv,NULL);\ 32 | fprintf(stream, "[%lu:%06lu] " fmt, _debug_tv.tv_sec, _debug_tv.tv_usec, ##__VA_ARGS__); \ 33 | fflush(stream); \ 34 | } while(0) 35 | 36 | #ifdef DEBUG 37 | #define debug(stream, fmt, ...) do {\ 38 | struct timeval _debug_tv;\ 39 | gettimeofday(&_debug_tv,NULL);\ 40 | fprintf(stream, "[DEBUG %lu:%lu] %s/%d/%s() " fmt, _debug_tv.tv_sec, _debug_tv.tv_usec, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ 41 | fflush(stream); \ 42 | } while(0) 43 | #define text(stream, fmt, ...) do {\ 44 | fprintf(stream, fmt, ##__VA_ARGS__); \ 45 | fflush(stream); \ 46 | } while(0) 47 | #define text_wtime(stream, fmt, ...) do {\ 48 | struct timeval _debug_tv;\ 49 | gettimeofday(&_debug_tv,NULL);\ 50 | fprintf(stream, "[%lu:%lu] " fmt, _debug_tv.tv_sec, _debug_tv.tv_usec, ##__VA_ARGS__); \ 51 | fflush(stream); \ 52 | } while(0) 53 | #else 54 | #define debug(stream, fmt, ...) 55 | #define text(stream, fmt, ...) 56 | #define text_wtime(stream, fmt, ...) 57 | #endif 58 | 59 | //#ifdef DEBUG 60 | 61 | //#else 62 | //#define error(stream, fmt, ...) 63 | //#endif 64 | 65 | //#ifdef DEBUG 66 | #define error_return(rc, stream, fmt, ...) do { \ 67 | fprintf(stream, "[ERROR] %s/%d/%s() " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ 68 | fflush(stream); \ 69 | return (rc); \ 70 | } while(0) 71 | //#else 72 | //#define error_return(rc, stream, fmt, ...) return (rc) 73 | //#endif 74 | 75 | //#ifdef DEBUG 76 | #define error_exit(rc, stream, fmt, ...) do { \ 77 | fprintf(stream, "[ERROR] %s/%d/%s() " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ 78 | fflush(stream); \ 79 | exit(rc); \ 80 | } while(0) 81 | //#else 82 | //#define error_exit(rc, stream, fmt, ...) exit(rc) 83 | //#endif 84 | 85 | extern FILE *log_fp; 86 | 87 | #endif /* DEBUG_H_ */ 88 | 89 | -------------------------------------------------------------------------------- /utilities/rcu.h: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A class for ensuring that each thread does not access the hometable during a grow. 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | #pragma once 13 | 14 | #include "../thread/thread.hpp" 15 | 16 | // This is a one-shot object. 17 | class BroadcastDrain { 18 | public: 19 | typedef void (*timer_rcu_cb)(void *owner); 20 | 21 | public: 22 | static void Drain(int home_thread_id, timer_rcu_cb cb, void *owner); 23 | 24 | protected: 25 | BroadcastDrain(int home_thread_id, timer_rcu_cb cb, void *owner); 26 | ~BroadcastDrain(); 27 | 28 | void Drain(); 29 | 30 | static void DrainingDoneOne(BroadcastDrain *_this); 31 | 32 | void SendTmsg(int thread_id); 33 | bool DrainingDoneOne(); 34 | 35 | static void RequestTmsg(mts::thread_msg_t *tmsg); 36 | static void ResponseTmsg(mts::thread_msg_t *tmsg); 37 | 38 | protected: 39 | const int host_thread_id; 40 | 41 | timer_rcu_cb cb; 42 | void *owner; 43 | 44 | // threads remaining to drain 45 | int remaining; 46 | }; 47 | 48 | 49 | 50 | void BroadcastDrain::Drain(int home_thread_id, timer_rcu_cb cb, void *owner) { 51 | BroadcastDrain *rcu = new BroadcastDrain(home_thread_id, cb, owner); 52 | rcu->Drain(); 53 | } 54 | 55 | BroadcastDrain::BroadcastDrain(int thread_id, timer_rcu_cb cb, void *owner) 56 | : host_thread_id(thread_id), 57 | cb(cb), 58 | owner(owner), 59 | remaining(0) { 60 | // empty 61 | } 62 | 63 | BroadcastDrain::~BroadcastDrain() { 64 | // empty 65 | } 66 | 67 | void BroadcastDrain::Drain() { 68 | int thread_count = mts::num_threads; 69 | remaining = 1; 70 | 71 | for(int i = 0;i < thread_count;i++) { 72 | if(i != host_thread_id) { 73 | SendTmsg(i); 74 | remaining++; 75 | } 76 | } 77 | 78 | DrainingDoneOne(this); 79 | } 80 | 81 | void BroadcastDrain::SendTmsg(int thread_id) { 82 | mts::thread_msg_t *tmsg = new mts::thread_msg_t(); 83 | //Tmsg *ctx = new (tmsg->payload) Tmsg(this, host_thread_id); 84 | tmsg->cb = &BroadcastDrain::RequestTmsg; 85 | tmsg->payload[0] = this; 86 | tmsg->payload[1] = (void*)(uint64_t)host_thread_id; 87 | mts::send_msg_to_thread_and_notify(thread_id, tmsg); 88 | } 89 | 90 | void BroadcastDrain::DrainingDoneOne(BroadcastDrain *_this) { 91 | if(_this->DrainingDoneOne()) { 92 | delete _this; 93 | } 94 | } 95 | 96 | bool BroadcastDrain::DrainingDoneOne() { 97 | bool ret; 98 | 99 | if(--remaining == 0) { 100 | timer_rcu_cb cb = this->cb; 101 | void *owner = this->owner; 102 | 103 | cb(owner); 104 | 105 | ret = true; 106 | } else { 107 | ret = false; 108 | } 109 | 110 | return ret; 111 | } 112 | 113 | void BroadcastDrain::RequestTmsg(mts::thread_msg_t *tmsg) { 114 | tmsg->cb = &BroadcastDrain::ResponseTmsg; 115 | int _resp_thread_id = (int)(uint64_t)tmsg->payload[1]; 116 | mts::send_msg_to_thread_and_notify(_resp_thread_id, tmsg); 117 | } 118 | 119 | void BroadcastDrain::ResponseTmsg(mts::thread_msg_t *tmsg) { 120 | 121 | BroadcastDrain *_this = (BroadcastDrain*)tmsg->payload[0]; 122 | free(tmsg); 123 | 124 | DrainingDoneOne(_this); 125 | } 126 | -------------------------------------------------------------------------------- /utilities/timer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * ring (Direct Access REplication) 3 | * 4 | * Timer implementation 5 | * 6 | * Copyright (c) 2014-2015 ETH-Zurich. All rights reserved. 7 | * 8 | * Copyright (c) 2009 The Trustees of Indiana University and Indiana 9 | * University Research and Technology 10 | * Corporation. All rights reserved. 
11 | * 12 | * Author(s): Torsten Hoefler 13 | */ 14 | 15 | #ifndef TIMER_H_ 16 | #define TIMER_H_ 17 | 18 | #include 19 | 20 | #define UINT32_T uint32_t 21 | #define UINT64_T uint64_t 22 | 23 | #define HRT_CALIBRATE(freq) do { \ 24 | static volatile HRT_TIMESTAMP_T t1, t2; \ 25 | static volatile UINT64_T elapsed_ticks, min = (UINT64_T)(~0x1); \ 26 | int notsmaller=0; \ 27 | while(notsmaller<3) { \ 28 | HRT_GET_TIMESTAMP(t1); \ 29 | sleep(1); \ 30 | /* nanosleep((struct timespec[]){{0, 10000000}}, NULL); */ \ 31 | HRT_GET_TIMESTAMP(t2); \ 32 | HRT_GET_ELAPSED_TICKS(t1, t2, &elapsed_ticks); \ 33 | notsmaller++; \ 34 | if(elapsed_ticks 10 | * 11 | */ 12 | #pragma once 13 | #include // std::cout 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | #include 27 | #include "zipf.hpp" 28 | 29 | class Trace{ 30 | public: 31 | virtual ~Trace() = default; 32 | virtual std::pair get_next() = 0; 33 | }; 34 | 35 | class YCSB: public Trace 36 | { 37 | 38 | const uint32_t max_value_uni = 0xFFFFFFFF; 39 | std::mt19937 generator; 40 | zipf_distribution<> zipf; 41 | std::uniform_int_distribution dis; 42 | const uint32_t read_threshold; 43 | 44 | public: 45 | 46 | YCSB(unsigned long seed, double read_prob, uint32_t N, double theta): 47 | generator(seed), zipf{N,theta}, dis(0,max_value_uni-1), read_threshold((uint32_t)(read_prob*max_value_uni)) 48 | { 49 | //empty 50 | } 51 | 52 | 53 | virtual std::pair get_next() override 54 | { 55 | uint32_t rank = zipf(generator)-1; 56 | uint32_t val = dis(generator); 57 | char type = (val < read_threshold) ? 'r' : 'w'; 58 | return std::make_pair(rank, type); 59 | } 60 | 61 | }; 62 | 63 | 64 | class Uniform: public Trace 65 | { 66 | const uint32_t max_value_uni = 0xFFFFFFFF; 67 | std::mt19937 generator; 68 | std::uniform_int_distribution uni; 69 | std::uniform_int_distribution dis; 70 | const uint32_t read_threshold; 71 | 72 | public: 73 | 74 | Uniform(unsigned long seed, double read_prob, uint32_t N): 75 | generator(seed), uni{0,N-1}, dis(0,max_value_uni-1), read_threshold((uint32_t)(read_prob*max_value_uni)) 76 | { 77 | //empty 78 | } 79 | 80 | virtual std::pair get_next() override 81 | { 82 | uint32_t rank = uni(generator); 83 | uint32_t val = dis(generator); 84 | char type = (val < read_threshold) ? 'r' : 'w'; 85 | return std::make_pair(rank, type); 86 | } 87 | 88 | }; -------------------------------------------------------------------------------- /utilities/zipf.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * A zipf implementation from a repo on github. 5 | * 6 | * 7 | */ 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | /** Zipf-like random distribution. 
15 | * 16 | * "Rejection-inversion to generate variates from monotone discrete 17 | * distributions", Wolfgang Hörmann and Gerhard Derflinger 18 | * ACM TOMACS 6.3 (1996): 169-184 19 | */ 20 | template 21 | class zipf_distribution 22 | { 23 | public: 24 | typedef RealType input_type; 25 | typedef IntType result_type; 26 | 27 | static_assert(std::numeric_limits::is_integer, ""); 28 | static_assert(!std::numeric_limits::is_integer, ""); 29 | 30 | zipf_distribution(const IntType n=std::numeric_limits::max(), 31 | const RealType q=1.0) 32 | : n(n) 33 | , q(q) 34 | , H_x1(H(1.5) - 1.0) 35 | , H_n(H(n + 0.5)) 36 | , dist(H_x1, H_n) 37 | {} 38 | 39 | IntType operator()(std::mt19937& rng) 40 | { 41 | while (true) { 42 | const RealType u = dist(rng); 43 | const RealType x = H_inv(u); 44 | const IntType k = clamp(std::round(x), 1, n); 45 | if (u >= H(k + 0.5) - h(k)) { 46 | return k; 47 | } 48 | } 49 | } 50 | 51 | private: 52 | /** Clamp x to [min, max]. */ 53 | template 54 | static constexpr T clamp(const T x, const T min, const T max) 55 | { 56 | return std::max(min, std::min(max, x)); 57 | } 58 | 59 | /** exp(x) - 1 / x */ 60 | static double 61 | expxm1bx(const double x) 62 | { 63 | return (std::abs(x) > epsilon) 64 | ? std::expm1(x) / x 65 | : (1.0 + x/2.0 * (1.0 + x/3.0 * (1.0 + x/4.0))); 66 | } 67 | 68 | /** H(x) = log(x) if q == 1, (x^(1-q) - 1)/(1 - q) otherwise. 69 | * H(x) is an integral of h(x). 70 | * 71 | * Note the numerator is one less than in the paper order to work with all 72 | * positive q. 73 | */ 74 | const RealType H(const RealType x) 75 | { 76 | const RealType log_x = std::log(x); 77 | return expxm1bx((1.0 - q) * log_x) * log_x; 78 | } 79 | 80 | /** log(1 + x) / x */ 81 | static RealType 82 | log1pxbx(const RealType x) 83 | { 84 | return (std::abs(x) > epsilon) 85 | ? std::log1p(x) / x 86 | : 1.0 - x * ((1/2.0) - x * ((1/3.0) - x * (1/4.0))); 87 | } 88 | 89 | /** The inverse function of H(x) */ 90 | const RealType H_inv(const RealType x) 91 | { 92 | const RealType t = std::max(-1.0, x * (1.0 - q)); 93 | return std::exp(log1pxbx(t) * x); 94 | } 95 | 96 | /** That hat function h(x) = 1 / (x ^ q) */ 97 | const RealType h(const RealType x) 98 | { 99 | return std::exp(-q * std::log(x)); 100 | } 101 | 102 | static constexpr RealType epsilon = 1e-8; 103 | 104 | IntType n; ///< Number of elements 105 | RealType q; ///< Exponent 106 | RealType H_x1; ///< H(x_1) 107 | RealType H_n; ///< H(n) 108 | std::uniform_real_distribution dist; ///< [H(x_1), H(n)] 109 | }; -------------------------------------------------------------------------------- /worker/communication.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * Types for server-client communication 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | 12 | 13 | #pragma once 14 | 15 | 16 | #include "../common/common.hpp" 17 | 18 | struct request_t{ 19 | uint8_t type; 20 | uint8_t version; 21 | uint32_t size; 22 | uint32_t req_id; 23 | client_addr_t addr; 24 | }; 25 | 26 | enum RequestType: uint8_t 27 | { 28 | READ = 1, 29 | WRITE, 30 | WRITEATOMIC, 31 | ALLOC, 32 | FREE, 33 | FIXPOINTER, 34 | COMPACT, // for debugging and benchmarking 35 | DISCONNECT 36 | }; 37 | 38 | struct message_header_t{ 39 | uint8_t thread_id; // destination/source lid 40 | uint8_t type; // type of the message 41 | }; 42 | 43 | struct reply_t{ 44 | uint8_t version; 45 | uint8_t status; 46 | client_addr_t ret_addr; 47 | uint32_t data_size; 48 | uint32_t id; 49 | }; 50 | 51 | -------------------------------------------------------------------------------- /worker/generic_worker.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * Interfaces for an IO-enabled thread 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 7 | * 8 | * Author(s): Konstantin Taranov 9 | * 10 | */ 11 | #pragma once 12 | 13 | 14 | typedef void (*io_cb)(uint32_t id, void* ctx); 15 | class IOWatcher{ 16 | public: 17 | virtual void install_io(int fd, io_cb cb, void* ctx ) = 0; 18 | virtual void stop_io(uint32_t io_id) = 0; 19 | virtual ~IOWatcher() = default; 20 | }; 21 | 22 | 23 | class GenericWorker { 24 | 25 | 26 | protected: 27 | 28 | uint8_t local_thread_id; 29 | IOWatcher* local_io_watcher; 30 | 31 | public: 32 | virtual ~GenericWorker() = default; 33 | 34 | // Allocate a block. 35 | virtual void main_cb() = 0; 36 | 37 | virtual void sometimes_cb() = 0; 38 | 39 | virtual void set_io_watcher(IOWatcher *w){ 40 | this->local_io_watcher = w; 41 | } 42 | 43 | virtual void set_thread_id(uint8_t id){ 44 | this->local_thread_id = id; 45 | } 46 | 47 | virtual void print_stats() = 0; 48 | 49 | }; 50 | 51 | 52 | -------------------------------------------------------------------------------- /workload_readwrite.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * CoRM: Compactable Remote Memory over RDMA 3 | * 4 | * Various read/write workload for CoRM 5 | * 6 | * Copyright (c) 2020-2021 ETH-Zurich. All rights reserved. 
 *
 * Author(s): Konstantin Taranov
 *
 */
#include <iostream>   // std::cout
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cassert>
#include <string>
#include <utility>
#include <vector>
#include <atomic>
#include <thread>
#include <chrono>
#include <unistd.h>

FILE *log_fp;


#include "worker/client_api.hpp"
#include "rdma/connectRDMA.hpp"
#include "utilities/zipf.hpp"
#include "utilities/ycsb.hpp"

#include "utilities/cxxopts.hpp"


using ReadFuncPtr = int (RemoteMemoryClient::*)( LocalObjectHandler* obj, char* buffer, uint32_t length );

std::atomic<int> order(0);

uint64_t num;

cxxopts::ParseResult
parse(int argc, char* argv[])
{
    cxxopts::Options options(argv[0], "Read/write workload for CoRM");
    options
        .positional_help("[optional args]")
        .show_positional_help();

    try
    {
        options.add_options()
            ("server", "Server address", cxxopts::value<std::string>(), "IP")
            ("i,input", "input file", cxxopts::value<std::string>()->default_value("test.bin"), "FILE")
            ("t,threads", "the number of threads", cxxopts::value<uint32_t>()->default_value(std::to_string(1)), "N")
            ("target", "expected rate ops/sec", cxxopts::value<uint64_t>()->default_value(std::to_string(1000)), "N")
            ("p,prob", "Probability of read", cxxopts::value<float>()->default_value(std::to_string(0.5f)), "N")
            ("seed", "seed", cxxopts::value<int>()->default_value(std::to_string(3)), "N")
            ("zipf", "use zipf distribution as in YCSB")
            ("rdmaread", "Use one-sided reads")
            ("n,num", "Number of requests to run", cxxopts::value<uint64_t>()->default_value("123"), "N")
            ("help", "Print help")
        ;

        auto result = options.parse(argc, argv);

        if (result.count("help"))
        {
            std::cout << options.help({""}) << std::endl;
            exit(0);
        }

        if (!result.count("server"))
        {
            throw cxxopts::OptionException("server must be specified");
        }

        return result;

    } catch (const cxxopts::OptionException& e)
    {
        std::cout << "error parsing options: " << e.what() << std::endl;
        std::cout << options.help({""}) << std::endl;
        exit(1);
    }
}



void workload_worker(int threadid, VerbsEP *ep, ReadFuncPtr readfunc, LocalObjectHandler *objects_orig, uint32_t NN, bool is_zipf, int seed, float read_prob, uint64_t target){
    RemoteMemoryClient* api = new RemoteMemoryClient(0,ep);

    Trace *trace = nullptr;
    if (is_zipf)
    {
        trace = new YCSB(seed,read_prob,NN,0.99);
    }
    else
    {
        trace = new Uniform(seed,read_prob,NN);
    }

    LocalObjectHandler *objects = (LocalObjectHandler*)malloc(NN*sizeof(LocalObjectHandler));
    memcpy((char*)objects,(char*)objects_orig,NN*sizeof(LocalObjectHandler));

    uint32_t size = objects[0].requested_size;
    char* buffer = (char*)malloc(size);


    std::chrono::seconds sec(1);
    uint64_t nanodelay = std::chrono::nanoseconds(sec).count() / target; // per request
    auto starttime = std::chrono::high_resolution_clock::now();

    uint32_t interval = 2560;

    std::vector<uint64_t> request_bw;
    request_bw.reserve(1024);
#ifdef LATENCY
    std::vector<uint64_t> request_latency;
    request_latency.reserve(num);
#endif
    uint32_t conflicts = 0;
    auto bwt1 = std::chrono::high_resolution_clock::now();
    uint32_t count = 0;
    for(uint64_t i=0; i<num; i++){
        auto req = trace->get_next();
        LocalObjectHandler* obj = &objects[req.first];
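        // Each iteration takes the next (object index, operation) pair from the
        // trace and issues it against CoRM. Reads go through `readfunc` (the RPC
        // Read path, or ReadOneSided when --rdmaread is set); a negative return
        // value is counted as a conflict. Writes always use the RPC Write path.
        // After every `interval` completed requests the elapsed time is recorded
        // for the throughput report, and the loop then spins until the open-loop
        // slot of request i (starttime + i * nanodelay), which paces the thread
        // to roughly `target` operations per second.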
        assert(obj!=nullptr && "object cannot be null");


#ifdef LATENCY
        auto t1 = std::chrono::high_resolution_clock::now();
#endif

        // the following piece of code was used to force indirect pointers
        // uint64_t direct_addr = obj->addr.comp.addr;
        // uint64_t base_addr = GetVirtBaseAddr(obj->addr.comp.addr);
        // if(direct_addr == base_addr){
        //     base_addr += 32;
        // }
        // obj->addr.comp.addr = base_addr;
        if(req.second == 'r'){
            int ret = (api->*readfunc)(obj, buffer, size);
            if(ret<0){
                conflicts++;
            }
            // api->Read(obj, buffer, size);
        } else {
            api->Write(obj, buffer, size, false);
        }
#ifdef LATENCY
        auto t2 = std::chrono::high_resolution_clock::now();
        request_latency.push_back( std::chrono::duration_cast< std::chrono::nanoseconds >( t2 - t1 ).count() );
#endif
        count++;
        if(count > interval){
            auto bwt2 = std::chrono::high_resolution_clock::now();
            request_bw.push_back(std::chrono::duration_cast<std::chrono::microseconds>(bwt2 - bwt1).count());
            bwt1 = bwt2;
            count=0;
        }


        auto const sleep_end_time = starttime + std::chrono::nanoseconds(nanodelay*i);
        while (std::chrono::high_resolution_clock::now() < sleep_end_time){
            // nothing
        }
    }
    auto endtime = std::chrono::high_resolution_clock::now();


    while(order.load() != threadid ){
        // wait for our turn to print
    }

    printf("Data thread #%d: \n",threadid);
    printf("throughput(Kreq/sec): ");
    for(auto &x : request_bw){
        printf("%.2f ",(interval*1000.0)/x);
    }
#ifdef LATENCY
    printf("latency(us): ");
    for(auto &x : request_latency){
        printf("%.2f ",x/1000.0);
    }
#endif
    printf("\nFinished workload in %lu ms with %u conflicts\n", std::chrono::duration_cast< std::chrono::milliseconds >( endtime - starttime ).count(), conflicts );

    order++;

    return;
}


int main(int argc, char* argv[]){


    auto allparams = parse(argc,argv);

    log_fp=stdout;

    std::string server = allparams["server"].as<std::string>();
    std::string input = allparams["input"].as<std::string>();
    uint64_t target = allparams["target"].as<uint64_t>();
    uint32_t threads = allparams["threads"].as<uint32_t>();
    num = allparams["num"].as<uint64_t>();
    float read_prob = allparams["prob"].as<float>();
    int seed = allparams["seed"].as<int>();

    ClientRDMA rdma((char*)server.c_str(),9999);
    struct rdma_cm_id * id = rdma.sendConnectRequest();

    struct ibv_pd * pd = ClientRDMA::create_pd(id);

    struct ibv_qp_init_attr attr;
    struct rdma_conn_param conn_param;
    memset(&attr, 0, sizeof(attr));
    attr.cap.max_send_wr = 32;
    attr.cap.max_recv_wr = 32;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1;
    attr.cap.max_inline_data = 0;
    attr.qp_type = IBV_QPT_RC;

    attr.send_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0);
    attr.recv_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0);

    memset(&conn_param, 0 , sizeof(conn_param));
    conn_param.responder_resources = 0;
    conn_param.initiator_depth = 5;
    conn_param.retry_count = 3;
    conn_param.rnr_retry_count = 3;

    std::vector<VerbsEP*> conns;

    conns.push_back(ClientRDMA::connectEP(id, &attr, &conn_param, pd));

    for(uint32_t i = 1 ; i < threads; i++){
        struct rdma_cm_id * tid = rdma.sendConnectRequest();
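        // Each additional worker thread gets its own RDMA connection with freshly
        // created send and recv completion queues, so threads never share a CQ
        // (the assert below checks this).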
        attr.send_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0);
        attr.recv_cq = ibv_create_cq(pd->context, 32, NULL, NULL, 0);
        conns.push_back(ClientRDMA::connectEP(tid, &attr, &conn_param, pd));
    }

    if(threads>1){
        assert(conns[0]->qp->send_cq != conns[1]->qp->send_cq && "Different connections must use different CQs");
    }

    printf("Connected\n");
    sleep(1);

    std::fstream fout;
    // printf("File name %s \n",input.c_str());
    fout.open(input.c_str(), std::ios::in|std::ios::binary);
    uint32_t NN = 0;

    fout.read((char*)&NN,sizeof(NN));

    LocalObjectHandler *objects;
    objects = (LocalObjectHandler*)malloc(NN*sizeof(LocalObjectHandler));

    for(uint32_t i = 0; i < NN; i++){
        LocalObjectHandler* obj = &objects[i];
        fout.read((char*)obj,sizeof(LocalObjectHandler));
        // obj->print();
    }
    fout.close();
    printf("Finished reading %u objects from file\n", NN);


    ReadFuncPtr readfunc = nullptr;
    if(allparams.count("rdmaread")){
        readfunc = &RemoteMemoryClient::ReadOneSided;
    }else {
        readfunc = &RemoteMemoryClient::Read;
    }

    std::vector<std::thread> workers;

    for(int i = 0; i < (int)threads; i++){
        workers.push_back(std::thread(workload_worker,i,conns[i],readfunc, objects, NN, allparams.count("zipf"), seed + i, read_prob,target));
    }

    for (auto& th : workers) th.join();

    return 0;
}
--------------------------------------------------------------------------------
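For reference, a plausible invocation of this benchmark (assuming a CoRM server is already listening on port 9999 at `$SERVER_IP`, and that the input file contains the object handles written by a previous loading run) might look like:
```
./workload_readwrite --server=$SERVER_IP --threads=2 --input=test.bin \
                     --num=100000 --target=50000 --prob=0.95 --zipf --rdmaread
```
With `--rdmaread`, reads are issued as one-sided RDMA reads via `ReadOneSided`; otherwise they use the RPC `Read` path. Writes always go through `Write`.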