├── LICENSE
├── Makefile
├── README
├── README.md
├── backing_store.cpp
├── backing_store.hpp
├── betree.hpp
├── debug.hpp
├── swap_space.cpp
├── swap_space.hpp
└── test.cpp


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, Rob Johnson
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
25 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CXXFLAGS=-Wall -std=c++11 -g -O3 
 2 | #CXXFLAGS=-Wall -std=c++11 -g -pg
 3 | #CXXFLAGS=-Wall -std=c++11 -g -pg -DDEBUG
 4 | CC=g++
 5 | 
 6 | test: test.cpp betree.hpp swap_space.o backing_store.o
 7 | 
 8 | swap_space.o: swap_space.cpp swap_space.hpp backing_store.hpp
 9 | 
10 | backing_store.o: backing_store.hpp backing_store.cpp
11 | 
12 | clean:
13 | 	$(RM) *.o test
14 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
  1 | Betree: a small, simple implementation of a B^e-tree, as described in
  2 | the September 2015 ;login: article,
  3 |       "An Introduction to B^e-trees and Write-Optimization"
  4 | by Michael A. Bender, Martin Farach-Colton, William Jannen, Rob
  5 | Johnson, Bradley C. Kuszmaul, Donald E. Porter, Jun Yuan, and Yang
  6 | Zhan
  7 | 
  8 | Code by Rob Johnson <rob@cs.stonybrook.edu>
  9 | 
 10 | A B^e-tree is an on-disk data structure with an interface similar to
 11 | a B-tree.  It stores a mapping from keys to values, supporting
 12 | inserts, queries, deletes, updates, and efficient iteration.  The key
 13 | features of a B^e-tree are extremely I/O-efficient insertions,
 14 | updates, and iteration, with query performance comparable to a B-tree.
 15 | See the above-referenced article for more details.
 16 | 
 17 | This distribution includes
 18 | - the B^e-tree implementation
 19 | - a test program that checks correctness and demonstrates
 20 |   how to use the B^e-tree implementation
 21 | 
 22 | 
 23 | BUILDING AND RUNNING THE TEST PROGRAM
 24 | -------------------------------------
 25 | 
 26 | To build, run
 27 |   $ make
 28 |   $ mkdir tmpdir
 29 |   $ ./test -d tmpdir
 30 | 
 31 | The test takes about a minute to run and should print "Test PASSED".
 32 | The test performs a random sequence of operations on a betree and on
 33 | an STL map, verifying that it always gets the same result from each
 34 | data structure.  If it ever finds a discrepancy, it will abort with an
 35 | assertion failure, and will likely leave some files in tmpdir.  A
 36 | successful run should leave tmpdir empty.
 37 | 
 38 | The code has been tested on a Debian 8.2 Linux installation with
 39 | - g++ 4.9.2
 40 | - GNU make 4.0
 41 | - libstdc++ 6.0.20
 42 | - libc 2.19
 43 | If you have trouble compiling or running the test on other systems,
 44 | please submit bug reports to rob@cs.stonybrook.edu.  Patches are
 45 | definitely appreciated.
 46 | 
 47 | GUIDE TO THE CODE
 48 | -----------------
 49 | 
 50 | test.cpp: The main test program.  Demonstrates how to construct and
 51 |           use a betree.
 52 | 
 53 | betree.hpp: The core of the betree implementation.  This class handles
 54 |             flushing messages down the tree, splitting nodes,
 55 |             performing inserts, queries, etc, and provides an iterator
 56 |             for scanning key/value pairs in the tree.
 57 | 
 58 |             The betree is written almost completely as an in-memory
 59 | 	    data structure.  All I/O is handled transparently by
 60 | 	    swap_space.
 61 | 
 62 | swap_space.{cpp,hpp}: Swaps objects to/from disk.  Maintains a cache
 63 | 		      of in-memory objects.  When the cache becomes
 64 | 		      too large the least-recently-used object is
 65 | 		      written to disk and removed from the cache.
 66 | 		      Automatically loads the object back into memory
 67 | 		      when it is referenced.  Garbage collects objects
 68 | 		      that are no longer referenced by any other
 69 | 		      object.  Tracks when objects are modified in
 70 | 		      memory so that it knows to write them back to
 71 | 		      disk next time they get evicted.
 72 | 
 73 | backing_store.{cpp,hpp}: This defines a generic interface used by
 74 |                          swap_space to manage on-disk space.  It
 75 |                          supports allocating and deallocating on-disk
 76 |                          space. The file also defines a simple
 77 |                          implementation of the interface that stores
 78 |                          one object per file on disk.
 79 | 
 80 | 
 81 | INTERESTING PROJECTS AND TODOS
 82 | ------------------------------
 83 | 
 84 | - Implement logging, transactions, and MVCC.  If this can be done in a
 85 |   way that does not touch the internals of betree, that would be extra
 86 |   cool.
 87 | 
 88 | - Implement range upsert messages (and range deletes).  One way to
 89 |   approach this might be to replace the currently-used std::map for
 90 |   betree::node::elements with a boost interval map.
 91 | 
 92 | - Implement efficient garbage collection of nodes that contain only
 93 |   keys that are covered by a range delete message.
 94 | 
 95 | - Implement "sub-nodes".  Sub-nodes are written to disk contiguously
 96 |   as part of their parent node, but can be deserialized individually.
 97 |   This would enable the tree to write a node out to contiguous disk
 98 |   space, enabling fast range queries over the node.  Point queries,
 99 |   however, would be able to deserialize only the sub-node needed to
100 |   answer the query, saving disk bandwidth.
101 | 
102 | - Modify system to track sizes in bytes instead of nodes, keys,
103 |   values, etc.
104 | 
105 | - Add multi-threading support.
106 | 
107 | - Implement checkpointing and saving/loading of the betree.
108 | 
109 | - Use boost serialization.  The main challenge that I see is that the
110 |   deserialization code needs a context for the deserialization, but
111 |   boost serialization does not provide this.
112 | 
113 | - Implement compression and partial eviction (i.e. "evict" a node by
114 |   compressing it but keeping it in memory)
115 | 
116 | - Implement a backing_store that manages space in a single file.
117 | 
118 | - Reinsert results computed from several upserts into the top of the
119 |   tree so we don't have to recompute them.
120 | 
121 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Be-Tree
2 | A simple, reference implementation of a B^e-tree
3 | 


--------------------------------------------------------------------------------
/backing_store.cpp:
--------------------------------------------------------------------------------
 1 | #include "backing_store.hpp"
 2 | #include <iostream>
 3 | #include <ext/stdio_filebuf.h>
 4 | #include <unistd.h>
 5 | #include <cassert>
 6 | 
 7 | /////////////////////////////////////////////////////////////
 8 | // Implementation of the one_file_per_object_backing_store //
 9 | /////////////////////////////////////////////////////////////
10 | one_file_per_object_backing_store::one_file_per_object_backing_store(std::string rt)
11 |   : root(rt),
12 |     nextid(1)
13 | {}
14 | 
15 | uint64_t one_file_per_object_backing_store::allocate(size_t n) {
16 |   uint64_t id = nextid++;
17 |   std::string filename = root + "/" + std::to_string(id);
18 |   std::fstream dummy(filename, std::fstream::out);
19 |   dummy.flush();
20 |   assert(dummy.good());
21 |   return id;
22 | }
23 | 
24 | void one_file_per_object_backing_store::deallocate(uint64_t id) {
25 |   std::string filename = root + "/" + std::to_string(id);
26 |   assert(unlink(filename.c_str()) == 0);
27 | }
28 | 
29 | std::iostream * one_file_per_object_backing_store::get(uint64_t id) {
30 |   __gnu_cxx::stdio_filebuf<char> *fb = new __gnu_cxx::stdio_filebuf<char>;
31 |   std::string filename = root + "/" + std::to_string(id);
32 |   fb->open(filename, std::fstream::in | std::fstream::out);
33 |   std::fstream *ios = new std::fstream;
34 |   ios->std::ios::rdbuf(fb);
35 |   ios->exceptions(std::fstream::badbit | std::fstream::failbit | std::fstream::eofbit);
36 |   assert(ios->good());
37 |   
38 |   return ios;
39 | }
40 | 
41 | void one_file_per_object_backing_store::put(std::iostream *ios)
42 | {
43 |   ios->flush();
44 |   __gnu_cxx::stdio_filebuf<char> *fb = (__gnu_cxx::stdio_filebuf<char> *)ios->rdbuf();
45 |   fsync(fb->fd());
46 |   delete ios;
47 |   delete fb;
48 | }
49 | 


--------------------------------------------------------------------------------
/backing_store.hpp:
--------------------------------------------------------------------------------
 1 | // Generic interface to the disk.  Used by swap_space to store
 2 | // objects.
 3 | 
 4 | #ifndef BACKING_STORE_HPP
 5 | #define BACKING_STORE_HPP
 6 | 
 7 | #include <cstdint>
 8 | #include <cstddef>
 9 | #include <iostream>
10 | 
// Abstract interface to on-disk storage.  Objects are identified by
// 64-bit ids and are read/written through iostreams.
class backing_store {
public:
  // Implementations are used through backing_store pointers, so the
  // destructor must be virtual for destruction through the base to be
  // well-defined.
  virtual ~backing_store() = default;

  // Reserve space for an object of (up to) n units; returns its id.
  virtual uint64_t allocate(size_t n) = 0;
  // Release the space associated with id.
  virtual void deallocate(uint64_t id) = 0;
  // Obtain a stream for reading/writing object id.
  virtual std::iostream * get(uint64_t id) = 0;
  // Return a stream previously obtained from get().
  virtual void            put(std::iostream *ios) = 0;
};
18 | 
19 | class one_file_per_object_backing_store: public backing_store {
20 | public:
21 |   one_file_per_object_backing_store(std::string rt);
22 |   uint64_t	  allocate(size_t n);
23 |   void		  deallocate(uint64_t id);
24 |   std::iostream * get(uint64_t id);
25 |   void            put(std::iostream *ios);
26 |   
27 | private:
28 |   std::string	root;
29 |   uint64_t	nextid;
30 | };
31 | 
32 | #endif // BACKING_STORE_HPP
33 | 


--------------------------------------------------------------------------------
/betree.hpp:
--------------------------------------------------------------------------------
  1 | // A basic B^e-tree implementation templated on types Key and Value.
  2 | // Keys and Values must be serializable (see swap_space.hpp).
  3 | // Keys must be comparable (via operator< and operator==).
  4 | // Values must be addable (via operator+).
  5 | // See test.cpp for example usage.
  6 | 
  7 | // This implementation represents in-memory nodes as objects with two
  8 | // fields:
  9 | // - a std::map mapping keys to child pointers
 10 | // - a std::map mapping (key, timestamp) pairs to messages
 11 | // Nodes are de/serialized to/from an on-disk representation.
 12 | // I/O is managed transparently by a swap_space object.
 13 | 
 14 | // This implementation deviates from a "textbook" implementation in
 15 | // that there is not a fixed division of a node's space between pivots
 16 | // and buffered messages.
 17 | 
 18 | // In a textbook implementation, nodes have size B, B^e space is
 19 | // devoted to pivots and child pointers, and B-B^e space is devoted to
 20 | // buffering messages.  Whenever a leaf gets too many messages, it
 21 | // splits.  Whenever an internal node gets too many messages, it
 22 | // performs a flush.  Whenever an internal node gets too many
 23 | // children, it splits.  This policy ensures that, whenever the tree
 24 | // needs to flush messages from a node to one of its children, it can
 25 | // always move a batch of size at least (B-B^e) / B^e = B^(1-e) - 1
 26 | // messages.
 27 | 
 28 | // In this implementation, nodes have a fixed maximum size.  Whenever
 29 | // a leaf exceeds this max size, it splits.  Whenever an internal node
 30 | // exceeds this maximum size, it checks to see if it can flush a large
 31 | // batch of elements to one of its children.  If it can, it does so.
 32 | // If it cannot, then it splits.
 33 | 
 34 | // In-memory nodes may temporarily exceed the maximum size
 35 | // restriction.  During a flush, we move all the incoming messages
 36 | // into the destination node.  At that point the node may exceed the
 37 | // max size.  The flushing procedure then performs further flushes or
 38 | // splits to restore the max-size invariant.  Thus, whenever a flush
 39 | // returns, all the nodes in the subtree of that node are guaranteed
 40 | // to satisfy the max-size requirement.
 41 | 
 42 | // This implementation also optimizes I/O based on which nodes are
 43 | // on-disk, clean in memory, or dirty in memory.  For example,
 44 | // inserted items are always immediately flushed as far down the tree
 45 | // as they can go without dirtying any new nodes.  This is because
 46 | // flushing an item to a node that is already dirty will not require
 47 | // any additional I/O, since the node already has to be written back
 48 | // anyway.  Furthermore, it will flush smaller batches to clean
 49 | // in-memory nodes than to on-disk nodes.  This is because dirtying a
 50 | // clean in-memory node only requires a write-back, whereas flushing
 51 | // to an on-disk node requires reading it in and writing it out.
 52 | 
 53 | #include <map>
 54 | #include <vector>
 55 | #include <cassert>
 56 | #include "swap_space.hpp"
 57 | #include "backing_store.hpp"
 58 | 
 59 | ////////////////// Upserts
 60 | 
 61 | // Internally, we store data indexed by both the user-specified key
 62 | // and a timestamp, so that we can apply upserts in the correct order.
 63 | template<class Key>
 64 | class MessageKey {
 65 | public:
 66 |   MessageKey(void) :
 67 |     key(),
 68 |     timestamp(0)
 69 |   {}
 70 | 
 71 |   MessageKey(const Key & k, uint64_t tstamp) :
 72 |     key(k),
 73 |     timestamp(tstamp)
 74 |   {}
 75 | 
 76 |   static MessageKey range_start(const Key &key) {
 77 |     return MessageKey(key, 0);
 78 |   }
 79 |   
 80 |   static MessageKey range_end(const Key &key) {
 81 |     return MessageKey(key, UINT64_MAX);
 82 |   }
 83 |   
 84 |   MessageKey range_start(void) const {
 85 |     return range_start(key);
 86 |   }
 87 | 
 88 |   MessageKey range_end(void) const {
 89 |     return range_end(key);
 90 |   }
 91 | 
 92 |   void _serialize(std::iostream &fs, serialization_context &context) const {
 93 |     fs << timestamp << " ";
 94 |     serialize(fs, context, key);
 95 |   } 
 96 | 
 97 |   void _deserialize(std::iostream &fs, serialization_context &context) {
 98 |     fs >> timestamp;
 99 |     deserialize(fs, context, key);
100 |   }
101 | 
102 |   Key key;
103 |   uint64_t timestamp;
104 | };
105 | 
106 | template<class Key>
107 | bool operator<(const MessageKey<Key> & mkey1, const MessageKey<Key> & mkey2) {
108 |   return mkey1.key < mkey2.key ||
109 | 		     (mkey1.key == mkey2.key && mkey1.timestamp < mkey2.timestamp);
110 | }
111 | 
112 | template<class Key>
113 | bool operator<(const Key & key, const MessageKey<Key> & mkey) {
114 |   return key < mkey.key;
115 | }
116 | 
117 | template<class Key>
118 | bool operator<(const MessageKey<Key> & mkey, const Key & key) {
119 |   return mkey.key < key;
120 | }
121 | 
122 | template<class Key>
123 | bool operator==(const MessageKey<Key> &a, const MessageKey<Key> &b) {
124 |   return a.key == b.key && a.timestamp == b.timestamp;
125 | }
126 |   
127 | 
128 | // The three types of upsert.  An UPDATE specifies a value, v, that
129 | // will be added (using operator+) to the old value associated to some
130 | // key in the tree.  If there is no old value associated with the key,
131 | // then it will add v to the result of a Value obtained using the
132 | // default zero-argument constructor.
133 | #define INSERT (0)
134 | #define DELETE (1)
135 | #define UPDATE (2)
136 | 
137 | template<class Value>
138 | class Message {
139 | public:
140 |   Message(void) :
141 |     opcode(INSERT),
142 |     val()
143 |   {}
144 | 
145 |   Message(int opc, const Value &v) :
146 |     opcode(opc),
147 |     val(v)
148 |   {}
149 |   
150 |   void _serialize(std::iostream &fs, serialization_context &context) {
151 |     fs << opcode << " ";
152 |     serialize(fs, context, val);
153 |   } 
154 | 
155 |   void _deserialize(std::iostream &fs, serialization_context &context) {
156 |     fs >> opcode;
157 |     deserialize(fs, context, val);
158 |   }
159 | 
160 |   int opcode;
161 |   Value val;
162 | };
163 | 
164 | template <class Value>
165 | bool operator==(const Message<Value> &a, const Message<Value> &b) {
166 |   return a.opcode == b.opcode && a.val == b.val;
167 | }
168 | 
169 | // Measured in messages.
170 | #define DEFAULT_MAX_NODE_SIZE (1ULL<<18)
171 | 
172 | // The minimum number of messages that we will flush to an out-of-cache node.
173 | // Note: we will flush even a single element to a child that is already dirty.
174 | // Note: we will flush MIN_FLUSH_SIZE/2 items to a clean in-memory child.
175 | #define DEFAULT_MIN_FLUSH_SIZE (DEFAULT_MAX_NODE_SIZE / 16ULL)
176 | 
177 | 
178 | template<class Key, class Value> class betree {
179 | private:
180 | 
181 |   class node;
182 |   // We let a swap_space handle all the I/O.
183 |   typedef typename swap_space::pointer<node> node_pointer;
184 |   class child_info : public serializable {
185 |   public:
186 |     child_info(void)
187 |       : child(),
188 | 	child_size(0)
189 |     {}
190 |     
191 |     child_info(node_pointer child, uint64_t child_size)
192 |       : child(child),
193 | 	child_size(child_size)
194 |     {}
195 | 
196 |     void _serialize(std::iostream &fs, serialization_context &context) {
197 |       serialize(fs, context, child);
198 |       fs << " ";
199 |       serialize(fs, context, child_size);
200 |     }
201 | 
202 |     void _deserialize(std::iostream &fs, serialization_context &context) {
203 |       deserialize(fs, context, child);
204 |       deserialize(fs, context, child_size);
205 |     }
206 |     
207 |     node_pointer child;
208 |     uint64_t child_size;
209 |   };
210 |   typedef typename std::map<Key, child_info> pivot_map;
211 |   typedef typename std::map<MessageKey<Key>, Message<Value> > message_map;
212 |     
213 |   class node : public serializable {
214 |   public:
215 | 
216 |     // Child pointers
217 |     pivot_map pivots;
218 |     message_map elements;
219 | 
220 |     bool is_leaf(void) const {
221 |       return pivots.empty();
222 |     }
223 | 
224 |     // Holy frick-a-moly.  We want to write a const function that
225 |     // returns a const_iterator when called from a const function and
226 |     // a non-const function that returns a (non-const_)iterator when
227 |     // called from a non-const function.  And we don't want to
228 |     // duplicate the code.  The following solution is from
229 |     //         http://stackoverflow.com/a/858893
230 |     template<class OUT, class IN>
231 |     static OUT get_pivot(IN & mp, const Key & k) {
232 |       assert(mp.size() > 0);
233 |       auto it = mp.lower_bound(k);
234 |       if (it == mp.begin() && k < it->first)
235 | 	throw std::out_of_range("Key does not exist "
236 | 				"(it is smaller than any key in DB)");
237 |       if (it == mp.end() || k < it->first)
238 | 	--it;
239 |       return it;      
240 |     }
241 | 
242 |     // Instantiate the above template for const and non-const
243 |     // calls. (template inference doesn't seem to work on this code)
244 |     typename pivot_map::const_iterator get_pivot(const Key & k) const {
245 |       return get_pivot<typename pivot_map::const_iterator,
246 | 		       const pivot_map>(pivots, k);
247 |     }
248 | 
249 |     typename pivot_map::iterator
250 |     get_pivot(const Key & k) {
251 |       return get_pivot<typename pivot_map::iterator, pivot_map>(pivots, k);
252 |     }
253 | 
254 |     // Return iterator pointing to the first element with mk >= k.
255 |     // (Same const/non-const templating trick as above)
256 |     template<class OUT, class IN>
257 |     static OUT get_element_begin(IN & elts, const Key &k) {
258 |       return elts.lower_bound(MessageKey<Key>::range_start(k));
259 |     }
260 | 
261 |     typename message_map::iterator get_element_begin(const Key &k) {
262 |       return get_element_begin<typename message_map::iterator,
263 | 			       message_map>(elements, k);
264 |     }
265 | 
266 |     typename message_map::const_iterator get_element_begin(const Key &k) const {
267 |       return get_element_begin<typename message_map::const_iterator,
268 | 			       const message_map>(elements, k);
269 |     }
270 | 
271 |     // Return iterator pointing to the first element that goes to
272 |     // child indicated by it
273 |     typename message_map::iterator
274 |     get_element_begin(const typename pivot_map::iterator it) {
275 |       return it == pivots.end() ? elements.end() : get_element_begin(it->first);
276 |     }
277 | 
278 |     // Apply a message to ourself.
279 |     void apply(const MessageKey<Key> &mkey, const Message<Value> &elt,
280 | 	       Value &default_value) {
281 |       switch (elt.opcode) {
282 |       case INSERT:
283 | 	elements.erase(elements.lower_bound(mkey.range_start()),
284 | 		       elements.upper_bound(mkey.range_end()));
285 | 	elements[mkey] = elt;
286 | 	break;
287 | 
288 |       case DELETE:
289 | 	elements.erase(elements.lower_bound(mkey.range_start()),
290 | 		       elements.upper_bound(mkey.range_end()));
291 | 	if (!is_leaf())
292 | 	  elements[mkey] = elt;
293 | 	break;
294 | 
295 |       case UPDATE:
296 | 	{
297 | 	  auto iter = elements.upper_bound(mkey.range_end());
298 | 	  if (iter != elements.begin())
299 | 	    iter--;
300 | 	  if (iter == elements.end() || iter->first.key != mkey.key)
301 | 	    if (is_leaf()) {
302 | 	      Value dummy = default_value;
303 | 	      apply(mkey, Message<Value>(INSERT, dummy + elt.val),
304 | 		    default_value);
305 | 	    } else {
306 | 	      elements[mkey] = elt;
307 | 	    }
308 | 	  else {
309 | 	    assert(iter != elements.end() && iter->first.key == mkey.key);
310 | 	    if (iter->second.opcode == INSERT) {
311 | 	      apply(mkey, Message<Value>(INSERT, iter->second.val + elt.val),
312 | 		    default_value);	  
313 | 	    } else {
314 | 	      elements[mkey] = elt;	      
315 | 	    }
316 | 	  }
317 | 	}
318 | 	break;
319 | 
320 |       default:
321 | 	assert(0);
322 |       }
323 |     }
324 |     
325 |     // Requires: there are less than MIN_FLUSH_SIZE things in elements
326 |     //           destined for each child in pivots);
327 |     pivot_map split(betree &bet) {
328 |       assert(pivots.size() + elements.size() >= bet.max_node_size);
329 |       // This size split does a good job of causing the resulting
330 |       // nodes to have size between 0.4 * MAX_NODE_SIZE and 0.6 * MAX_NODE_SIZE.
331 |       int num_new_leaves =
332 | 	(pivots.size() + elements.size())  / (10 * bet.max_node_size / 24);
333 |       int things_per_new_leaf =
334 | 	(pivots.size() + elements.size() + num_new_leaves - 1) / num_new_leaves;
335 | 
336 |       pivot_map result;
337 |       auto pivot_idx = pivots.begin();
338 |       auto elt_idx = elements.begin();
339 |       int things_moved = 0;
340 |       for (int i = 0; i < num_new_leaves; i++) {
341 | 	if (pivot_idx == pivots.end() && elt_idx == elements.end())
342 | 	  break;
343 | 	node_pointer new_node = bet.ss->allocate(new node);
344 | 	result[pivot_idx != pivots.end() ?
345 | 	       pivot_idx->first :
346 | 	       elt_idx->first.key] = child_info(new_node,
347 | 						new_node->elements.size() +
348 | 						new_node->pivots.size());
349 | 	while(things_moved < (i+1) * things_per_new_leaf &&
350 | 	      (pivot_idx != pivots.end() || elt_idx != elements.end())) {
351 | 	  if (pivot_idx != pivots.end()) {
352 | 	    new_node->pivots[pivot_idx->first] = pivot_idx->second;
353 | 	    ++pivot_idx;
354 | 	    things_moved++;
355 | 	    auto elt_end = get_element_begin(pivot_idx);
356 | 	    while (elt_idx != elt_end) {
357 | 	      new_node->elements[elt_idx->first] = elt_idx->second;
358 | 	      ++elt_idx;
359 | 	      things_moved++;
360 | 	    }
361 | 	  } else {
362 | 	    // Must be a leaf
363 | 	    assert(pivots.size() == 0);
364 | 	    new_node->elements[elt_idx->first] = elt_idx->second;
365 | 	    ++elt_idx;
366 | 	    things_moved++;	    
367 | 	  }
368 | 	}
369 |       }
370 |       
371 |       for (auto it = result.begin(); it != result.end(); ++it)
372 | 	it->second.child_size = it->second.child->elements.size() +
373 | 	  it->second.child->pivots.size();
374 |       
375 |       assert(pivot_idx == pivots.end());
376 |       assert(elt_idx == elements.end());
377 |       pivots.clear();
378 |       elements.clear();
379 |       return result;
380 |     }
381 | 
382 |     node_pointer merge(betree &bet,
383 | 		       typename pivot_map::iterator begin,
384 | 		       typename pivot_map::iterator end) {
385 |       node_pointer new_node = bet.ss->allocate(new node);
386 |       for (auto it = begin; it != end; ++it) {
387 | 	new_node->elements.insert(it->second.child->elements.begin(),
388 | 				  it->second.child->elements.end());
389 | 	new_node->pivots.insert(it->second.child->pivots.begin(),
390 | 				  it->second.child->pivots.end());
391 |       }
392 |       return new_node;
393 |     }
394 | 
395 |     void merge_small_children(betree &bet) {
396 |       if (is_leaf())
397 | 	return;
398 | 
399 |       for (auto beginit = pivots.begin(); beginit != pivots.end(); ++beginit) {
400 | 	uint64_t total_size = 0;
401 | 	auto endit = beginit;
402 | 	while (endit != pivots.end()) {
403 | 	  if (total_size + beginit->second.child_size > 6 * bet.max_node_size / 10)
404 | 	    break;
405 | 	  total_size += beginit->second.child_size;
406 | 	  ++endit;
407 | 	}
408 | 	if (endit != beginit) {
409 | 	  node_pointer merged_node = merge(bet, beginit, endit);
410 | 	  for (auto tmp = beginit; tmp != endit; ++tmp) {
411 | 	    tmp->second.child->elements.clear();
412 | 	    tmp->second.child->pivots.clear();
413 | 	  }
414 | 	  Key key = beginit->first;
415 | 	  pivots.erase(beginit, endit);
416 | 	  pivots[key] = child_info(merged_node, merged_node->pivots.size() + merged_node->elements.size());
417 | 	  beginit = pivots.lower_bound(key);
418 | 	}
419 |       }
420 |     }
421 |     
422 |     // Receive a collection of new messages and perform recursive
423 |     // flushes or splits as necessary.  If we split, return a
424 |     // map with the new pivot keys pointing to the new nodes.
425 |     // Otherwise return an empty map.
426 |     pivot_map flush(betree &bet, message_map &elts)
427 |     {
428 |       debug(std::cout << "Flushing " << this << std::endl);
429 |       pivot_map result;
430 | 
431 |       if (elts.size() == 0) {
432 | 	debug(std::cout << "Done (empty input)" << std::endl);
433 | 	return result;
434 |       }
435 | 
436 |       if (is_leaf()) {
437 | 	for (auto it = elts.begin(); it != elts.end(); ++it)
438 | 	  apply(it->first, it->second, bet.default_value);
439 | 	if (elements.size() + pivots.size() >= bet.max_node_size)
440 | 	  result = split(bet);
441 | 	return result;
442 |       }	
443 | 
444 |       ////////////// Non-leaf
445 |       
446 |       // Update the key of the first child, if necessary
447 |       Key oldmin = pivots.begin()->first;
448 |       MessageKey<Key> newmin = elts.begin()->first;
449 |       if (newmin < oldmin) {
450 | 	pivots[newmin.key] = pivots[oldmin];
451 | 	pivots.erase(oldmin);
452 |       }
453 | 
454 |       // If everything is going to a single dirty child, go ahead
455 |       // and put it there.
456 |       auto first_pivot_idx = get_pivot(elts.begin()->first.key);
457 |       auto last_pivot_idx = get_pivot((--elts.end())->first.key);
458 |       if (first_pivot_idx == last_pivot_idx &&
459 | 	  first_pivot_idx->second.child.is_dirty()) {
460 |       	// There shouldn't be anything in our buffer for this child,
461 |       	// but lets assert that just to be safe.
462 | 	{
463 | 	  auto next_pivot_idx = next(first_pivot_idx);
464 | 	  auto elt_start = get_element_begin(first_pivot_idx);
465 | 	  auto elt_end = get_element_begin(next_pivot_idx); 
466 | 	  assert(elt_start == elt_end);
467 | 	}
468 |       	pivot_map new_children = first_pivot_idx->second.child->flush(bet, elts);
469 |       	if (!new_children.empty()) {
470 |       	  pivots.erase(first_pivot_idx);
471 |       	  pivots.insert(new_children.begin(), new_children.end());
472 |       	} else {
473 | 	  first_pivot_idx->second.child_size =
474 | 	    first_pivot_idx->second.child->pivots.size() +
475 | 	    first_pivot_idx->second.child->elements.size();
476 | 	}
477 | 
478 |       } else {
479 | 	
480 | 	for (auto it = elts.begin(); it != elts.end(); ++it)
481 | 	  apply(it->first, it->second, bet.default_value);
482 | 
483 | 	// Now flush to out-of-core or clean children as necessary
484 | 	while (elements.size() + pivots.size() >= bet.max_node_size) {
485 | 	  // Find the child with the largest set of messages in our buffer
486 | 	  unsigned int max_size = 0;
487 | 	  auto child_pivot = pivots.begin();
488 | 	  auto next_pivot = pivots.begin();
489 | 	  for (auto it = pivots.begin(); it != pivots.end(); ++it) {
490 | 	    auto it2 = next(it);
491 | 	    auto elt_it = get_element_begin(it); 
492 | 	    auto elt_it2 = get_element_begin(it2); 
493 | 	    unsigned int dist = distance(elt_it, elt_it2);
494 | 	    if (dist > max_size) {
495 | 	      child_pivot = it;
496 | 	      next_pivot = it2;
497 | 	      max_size = dist;
498 | 	    }
499 | 	  }
500 | 	  if (!(max_size > bet.min_flush_size ||
501 | 		(max_size > bet.min_flush_size/2 &&
502 | 		 child_pivot->second.child.is_in_memory())))
503 | 	    break; // We need to split because we have too many pivots
504 | 	  auto elt_child_it = get_element_begin(child_pivot);
505 | 	  auto elt_next_it = get_element_begin(next_pivot);
506 | 	  message_map child_elts(elt_child_it, elt_next_it);
507 | 	  pivot_map new_children = child_pivot->second.child->flush(bet, child_elts);
508 | 	  elements.erase(elt_child_it, elt_next_it);
509 | 	  if (!new_children.empty()) {
510 | 	    pivots.erase(child_pivot);
511 | 	    pivots.insert(new_children.begin(), new_children.end());
512 | 	  } else {
513 | 	    first_pivot_idx->second.child_size =
514 | 	      child_pivot->second.child->pivots.size() +
515 | 	      child_pivot->second.child->elements.size();
516 | 	  }
517 | 	}
518 | 
519 | 	// We have too many pivots to efficiently flush stuff down, so split
520 | 	if (elements.size() + pivots.size() > bet.max_node_size) {
521 | 	  result = split(bet);
522 | 	}
523 |       }
524 | 
525 |       //merge_small_children(bet);
526 |       
527 |       debug(std::cout << "Done flushing " << this << std::endl);
528 |       return result;
529 |     }
530 | 
531 |     // Return the value stored for k in the subtree rooted at this node,
532 |     // taking into account every message for k buffered in this node.
533 |     // Throws std::out_of_range when k does not exist in the subtree.
534 |     Value query(const betree & bet, const Key k) const
535 |     {
536 |       debug(std::cout << "Querying " << this << std::endl);
537 |       if (is_leaf()) {
538 |         // A message found for k in a leaf is expected to be an INSERT.
539 |         auto it = elements.lower_bound(MessageKey<Key>::range_start(k));
540 |         if (it != elements.end() && it->first.key == k) {
541 |           assert(it->second.opcode == INSERT);
542 |           return it->second.val;
543 |         }
544 |         throw std::out_of_range("Key does not exist");
545 |       }
546 | 
547 |       ///////////// Non-leaf
548 |       auto message_iter = get_element_begin(k);
549 |       Value v = bet.default_value;
550 | 
551 |       if (message_iter == elements.end() || k < message_iter->first) {
552 |         // No messages for this key here: the answer lives further down.
553 |         v = get_pivot(k)->second.child->query(bet, k);
554 |       } else if (message_iter->second.opcode == UPDATE) {
555 |         // Updates are pending for this key.  Apply them to whatever the
556 |         // child subtree holds, or to the default value if the key is
557 |         // absent below.  Catch by const reference to avoid copying the
558 |         // exception object.
559 |         try {
560 |           v = get_pivot(k)->second.child->query(bet, k);
561 |         } catch (const std::out_of_range &) {}
562 |       } else if (message_iter->second.opcode == DELETE) {
563 |         // A delete hides everything below this node.  Unless later
564 |         // messages for this key follow it, the key does not exist (in
565 |         // this subtree).
566 |         ++message_iter;
567 |         if (message_iter == elements.end() || k < message_iter->first)
568 |           throw std::out_of_range("Key does not exist");
569 |       } else if (message_iter->second.opcode == INSERT) {
570 |         // An insert also hides everything below; start from its value
571 |         // and let the loop below apply any later updates.
572 |         v = message_iter->second.val;
573 |         ++message_iter;
574 |       }
575 | 
576 |       // Apply the remaining messages for k (all expected to be updates)
577 |       // to the value obtained above.
578 |       while (message_iter != elements.end() && message_iter->first.key == k) {
579 |         assert(message_iter->second.opcode == UPDATE);
580 |         v = v + message_iter->second.val;
581 |         ++message_iter;
582 |       }
583 | 
584 |       return v;
585 |     }
586 | 
587 |     // Next message after *mkey (the first one when mkey is NULL) found in
588 |     // the children, tried in pivot order; throws std::out_of_range if none.
589 |     std::pair<MessageKey<Key>, Message<Value> >
590 |     get_next_message_from_children(const MessageKey<Key> *mkey) const {
591 |       if (mkey && *mkey < pivots.begin()->first)
592 |         mkey = NULL;
593 |       auto it = mkey ? get_pivot(mkey->key) : pivots.begin();
594 |       for (; it != pivots.end(); ++it) {
595 |         try { return it->second.child->get_next_message(mkey); }
596 |         catch (const std::out_of_range &) {} // this child has nothing left
597 |       }
598 |       throw std::out_of_range("No more messages in any children");
599 |     }
600 |     
601 |     // Return the first message after *mkey in this subtree, or the very
602 |     // first message when mkey is NULL, considering both this node's
603 |     // buffer and its children.  Throws std::out_of_range if none is left.
604 |     std::pair<MessageKey<Key>, Message<Value> >
605 |     get_next_message(const MessageKey<Key> *mkey) const {
606 |       auto it = mkey ? elements.upper_bound(*mkey) : elements.begin();
607 |       const bool have_local = it != elements.end();
608 |       if (is_leaf()) {
609 |         if (!have_local)
610 |           throw std::out_of_range("No more messages in sub-tree");
611 |         return std::make_pair(it->first, it->second);
612 |       }
613 | 
614 |       // Nothing buffered here: defer entirely to the children.
615 |       if (!have_local)
616 |         return get_next_message_from_children(mkey);
617 |       try {
618 |         auto kids = get_next_message_from_children(mkey);
619 |         if (kids.first < it->first)
620 |           return kids;
621 |       } catch (const std::out_of_range &) {} // children have nothing left
622 |       return std::make_pair(it->first, it->second);
623 |     }
624 |     
625 |     void _serialize(std::iostream &fs, serialization_context &context) { // pivots, then elements
626 |       fs << "pivots:" << std::endl;      // label line preceding the pivot map
627 |       serialize(fs, context, pivots);
628 |       fs << "elements:" << std::endl;    // label line preceding the message map
629 |       serialize(fs, context, elements);
630 |     }
631 |     
632 |     // Read back pivots then elements, discarding the label word before each.
633 |     void _deserialize(std::iostream &fs, serialization_context &context) {
634 |       std::string label;
635 |       fs >> label;
636 |       deserialize(fs, context, pivots);
637 |       fs >> label;
638 |       deserialize(fs, context, elements);
639 |     }
640 |     
641 |   };
642 | 
643 |   swap_space *ss;               // swap space from which nodes are allocated
644 |   uint64_t min_flush_size;      // buffer-size threshold consulted by node::flush
645 |   uint64_t max_node_size;       // elements + pivots limit before flushing/splitting
646 |   uint64_t min_node_size;       // set by the constructor
647 |   node_pointer root;            // current root node
648 |   uint64_t next_timestamp = 1; // Nothing has a timestamp of 0
649 |   Value default_value;          // starting value when updating an absent key
650 |   
651 | public:
652 |   // Create an empty tree whose nodes are allocated from *sspace.
653 |   betree(swap_space *sspace,
654 |          uint64_t maxnodesize = DEFAULT_MAX_NODE_SIZE,
655 |          uint64_t minnodesize = DEFAULT_MAX_NODE_SIZE / 4,
656 |          uint64_t minflushsize = DEFAULT_MIN_FLUSH_SIZE) :
657 |     ss(sspace), min_flush_size(minflushsize),
658 |     max_node_size(maxnodesize), min_node_size(minnodesize),
659 |     default_value() // value-initialized, so scalar Values start at zero
660 |   {
661 |     root = ss->allocate(new node);
662 |   }
663 | 
664 |   // Stamp (k, v) with the next timestamp, wrap it in a message with the
665 |   // given opcode and flush it into the root.  If the flush reports new
666 |   // pivots (the root split), install a fresh root that holds them.
667 |   void upsert(int opcode, Key k, Value v) {
668 |     message_map batch;
669 |     batch[MessageKey<Key>(k, next_timestamp++)] = Message<Value>(opcode, v);
670 |     pivot_map split_pivots = root->flush(*this, batch);
671 |     if (!split_pivots.empty()) {
672 |       root = ss->allocate(new node);
673 |       root->pivots = split_pivots;
674 |     }
675 |   }
676 | 
677 |   // Queue an INSERT message for key k carrying value v.
678 |   void insert(Key k, Value v) {
679 |     upsert(INSERT, k, v);
680 |   }
681 | 
682 |   // Queue an UPDATE message that combines v into k's value via operator+.
683 |   void update(Key k, Value v) {
684 |     upsert(UPDATE, k, v);
685 |   }
686 | 
687 |   // Queue a DELETE message for key k (the message carries default_value).
688 |   void erase(Key k) {
689 |     upsert(DELETE, k, default_value);
690 |   }
691 |   
692 |   // Return the current value of k by querying from the root; the
693 |   // exception thrown by the node query for a missing key propagates.
694 |   Value query(Key k) {
695 |     return root->query(*this, k);
696 |   }
697 | 
698 |   // Print every message in the tree to std::cout, one per line as
699 |   // "key timestamp opcode value", after a banner line.
700 |   void dump_messages(void) {
701 |     std::cout << "############### BEGIN DUMP ##############" << std::endl;
702 |     try {
703 |       // The scan ends when get_next_message throws std::out_of_range.
704 |       auto current = root->get_next_message(NULL);
705 |       for (;;) {
706 |         std::cout << current.first.key       << " "
707 |                   << current.first.timestamp << " "
708 |                   << current.second.opcode   << " "
709 |                   << current.second.val      << std::endl;
710 |         current = root->get_next_message(&current.first);
711 |       }
712 |     } catch (const std::out_of_range &) {}
713 |   }
714 | 
715 |   // Forward iterator over the tree's key/value pairs.  Each step folds
716 |   // all messages for one key into the public first/second members.
717 |   class iterator {
718 |   public:
719 |     // Past-the-end iterator: no current element and no pending message.
720 |     iterator(const betree &bet)
721 |       : bet(bet), position(), is_valid(false), pos_is_valid(false),
722 |         first(), second()
723 |     {}
724 | 
725 |     // Iterator over the elements built from the messages that follow
726 |     // *mkey (from the first message when mkey is NULL).
727 |     iterator(const betree &bet, const MessageKey<Key> *mkey)
728 |       : bet(bet), position(), is_valid(false), pos_is_valid(false),
729 |         first(), second()
730 |     {
731 |       try {
732 |         position = bet.root->get_next_message(mkey);
733 |         pos_is_valid = true;
734 |         setup_next_element();
735 |       } catch (const std::out_of_range &) {} // no message to start from
736 |     }
737 | 
738 |     // Fold one message into the current element.
739 |     void apply(const MessageKey<Key> &msgkey, const Message<Value> &msg) {
740 |       switch (msg.opcode) {
741 |       case INSERT:
742 |         first = msgkey.key;
743 |         second = msg.val;
744 |         is_valid = true;
745 |         break;
746 |       case UPDATE:
747 |         first = msgkey.key;
748 |         if (!is_valid)
749 |           second = bet.default_value;
750 |         second = second + msg.val;
751 |         is_valid = true;
752 |         break;
753 |       case DELETE:
754 |         is_valid = false;
755 |         break;
756 |       default:
757 |         abort();
758 |       }
759 |     }
760 | 
761 |     // Advance to the next element: keep applying messages until one
762 |     // key's messages leave a valid value or no messages remain.
763 |     void setup_next_element(void) {
764 |       is_valid = false;
765 |       while (pos_is_valid && (!is_valid || position.first.key == first)) {
766 |         apply(position.first, position.second);
767 |         try {
768 |           position = bet.root->get_next_message(&position.first);
769 |         } catch (const std::exception &) {
770 |           pos_is_valid = false;
771 |         }
772 |       }
773 |     }
774 | 
775 |     // Equal when both refer to the same tree and agree on every piece of
776 |     // state that is currently meaningful; const so temporaries compare.
777 |     bool operator==(const iterator &other) const {
778 |       return &bet == &other.bet &&
779 |         is_valid == other.is_valid &&
780 |         pos_is_valid == other.pos_is_valid &&
781 |         (!pos_is_valid || position == other.position) &&
782 |         (!is_valid || (first == other.first && second == other.second));
783 |     }
784 | 
785 |     bool operator!=(const iterator &other) const {
786 |       return !operator==(other);
787 |     }
788 | 
789 |     iterator &operator++(void) {
790 |       setup_next_element();
791 |       return *this;
792 |     }
793 | 
794 |     const betree &bet;
795 |     std::pair<MessageKey<Key>, Message<Value> > position; // next message to apply
796 |     bool is_valid;     // first/second currently hold an element
797 |     bool pos_is_valid; // position currently holds a message
798 |     Key first;
799 |     Value second;
800 |   };
801 | 
802 |   iterator begin(void) const { // iterator starting from the tree's first message
803 |     return iterator(*this, NULL);
804 |   }
805 | 
806 |   iterator lower_bound(Key key) const { // iterate from the messages after range_start(key)
807 |     const MessageKey<Key> start = MessageKey<Key>::range_start(key);
808 |     return iterator(*this, &start);
809 |   }
810 |   
811 |   iterator upper_bound(Key key) const { // iterate from the messages after range_end(key)
812 |     const MessageKey<Key> past = MessageKey<Key>::range_end(key);
813 |     return iterator(*this, &past);
814 |   }
815 |   
816 |   iterator end(void) const { // past-the-end iterator: no current element
817 |     return iterator(*this);
818 |   }
819 | };
820 | 


--------------------------------------------------------------------------------
/debug.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef DEBUG_HPP
 2 | #define DEBUG_HPP
 3 | 
 4 | #ifdef DEBUG
 5 | #define debug(x) (x) // DEBUG builds: evaluate the expression x
 6 | #else
 7 | #define debug(x)     // otherwise: expands to nothing, x is not evaluated
 8 | #endif
 9 | 
10 | #endif // DEBUG_HPP
11 | 


--------------------------------------------------------------------------------
/swap_space.cpp:
--------------------------------------------------------------------------------
  1 | #include "swap_space.hpp"
  2 | 
  3 | // Write x as decimal text followed by one space.
  4 | void serialize(std::iostream &fs, serialization_context &context, uint64_t x) {
  5 |   fs << x << " ";
  6 |   assert(fs.good());
  7 | }
  8 | 
  9 | // Read an unsigned integer in text form into x.
 10 | void deserialize(std::iostream &fs, serialization_context &context, uint64_t &x) {
 11 |   fs >> x;
 12 |   assert(fs.good());
 13 | }
 14 | 
 15 | // Write x as signed decimal text followed by one space.
 16 | void serialize(std::iostream &fs, serialization_context &context, int64_t x) {
 17 |   fs << x << " ";
 18 |   assert(fs.good());
 19 | }
 20 | 
 21 | // Read a signed integer in text form into x.
 22 | void deserialize(std::iostream &fs, serialization_context &context, int64_t &x) {
 23 |   fs >> x;
 24 |   assert(fs.good());
 25 | }
 26 | 
 27 | // Write x as "<length>," followed by its raw bytes.
 28 | void serialize(std::iostream &fs, serialization_context &context, std::string x) {
 29 |   fs << x.length() << ",";
 30 |   assert(fs.good());
 31 |   fs.write(x.data(), x.length());
 32 |   assert(fs.good());
 33 | }
 34 | 
 35 | // Read "<length>," and then exactly <length> raw bytes into x.
 36 | void deserialize(std::iostream &fs, serialization_context &context, std::string &x) {
 37 |   size_t length;
 38 |   char comma;
 39 |   fs >> length >> comma;
 40 |   assert(fs.good());
 41 |   // Read straight into a std::string: there is no new[] array that would
 42 |   // have to be released with delete[].
 43 |   std::string bytes(length, '\0');
 44 |   fs.read(&bytes[0], length);
 45 |   assert(fs.good());
 46 |   x.swap(bytes);
 47 | }
 48 | 
 49 | bool swap_space::cmp_by_last_access(swap_space::object *a, swap_space::object *b) {
 50 |   return a->last_access < b->last_access; // smaller (older) access time sorts first
 51 | }
 52 | 
 53 | // Create a swap space over backing store bs whose eviction threshold is
 54 | // n in-memory objects.
 55 | swap_space::swap_space(backing_store *bs, uint64_t n) :
 56 |   backstore(bs), max_in_memory_objects(n),
 57 |   objects(), lru_pqueue(cmp_by_last_access)
 58 | {}
 59 | 
 60 | // A new object is dirty and unpinned, has one reference and no
 61 | // backing-store id yet (bsid 0).
 62 | swap_space::object::object(swap_space *sspace, serializable * tgt)
 63 |   : target(tgt),
 64 |     id(sspace->next_id++),
 65 |     bsid(0), is_leaf(false),
 66 |     refcount(1),
 67 |     last_access(sspace->next_access_time++),
 68 |     target_is_dirty(true), pincount(0)
 69 | {}
 70 | 
 71 | void swap_space::set_cache_size(uint64_t sz) { // set a new in-memory object limit
 72 |   assert(sz > 0);
 73 |   max_in_memory_objects = sz;
 74 |   maybe_evict_something(); // apply the new limit right away
 75 | }
 76 | 
 77 | // Serialize obj's target and, if the target is dirty, store the bytes
 78 | // in a newly allocated backing-store slot, releasing the previous one.
 79 | void swap_space::write_back(swap_space::object *obj) {
 80 |   assert(objects.count(obj->id) > 0);
 81 | 
 82 |   debug(std::cout << "Writing back " << obj->id
 83 |         << " (" << obj->target << ") "
 84 |         << "with last access time " << obj->last_access << std::endl);
 85 | 
 86 |   // Serialize even when the target is clean: this calls _serialize on
 87 |   // every pointer in the object, which keeps refcounts right later on
 88 |   // when the in-memory copy is deleted.
 89 |   serialization_context ctxt(*this);
 90 |   std::stringstream sstream;
 91 |   serialize(sstream, ctxt, *obj->target);
 92 |   obj->is_leaf = ctxt.is_leaf;
 93 | 
 94 |   if (!obj->target_is_dirty)
 95 |     return; // nothing new to store
 96 | 
 97 |   const std::string bytes = sstream.str();
 98 |   const uint64_t fresh_bsid = backstore->allocate(bytes.length());
 99 |   std::iostream *out = backstore->get(fresh_bsid);
100 |   out->write(bytes.data(), bytes.length());
101 |   backstore->put(out);
102 |   if (obj->bsid > 0)
103 |     backstore->deallocate(obj->bsid);
104 |   obj->bsid = fresh_bsid;
105 |   obj->target_is_dirty = false;
106 | }
107 | 
108 | // Evict unpinned objects, least recently accessed first, until the
109 | // in-memory count is within the limit or only pinned objects remain.
110 | void swap_space::maybe_evict_something(void) {
111 |   while (current_in_memory_objects > max_in_memory_objects) {
112 |     // lru_pqueue is ordered by last access: take its first unpinned entry.
113 |     auto victim = lru_pqueue.begin();
114 |     while (victim != lru_pqueue.end() && (*victim)->pincount != 0)
115 |       ++victim;
116 |     if (victim == lru_pqueue.end())
117 |       return;
118 | 
119 |     object *obj = *victim;
120 |     lru_pqueue.erase(victim);
121 |     write_back(obj);
122 | 
123 |     delete obj->target;
124 |     obj->target = NULL;
125 |     current_in_memory_objects--;
126 |   }
127 | }
128 | 
129 | 


--------------------------------------------------------------------------------
/swap_space.hpp:
--------------------------------------------------------------------------------
  1 | // A scheme for transparently swapping data structures in and out of
  2 | // memory.
  3 | 
  4 | // WARNING: this is very incomplete.  It's just enough functionality
  5 | //          for the betree.cpp.  In particular, the current system
  6 | //          does not handle cycles in the pointer graph or pointers
  7 | //          into the middle of objects (such as into an array).
  8 | 
  9 | // The goal of this code is to enable users to write complex in-memory
 10 | // data structures and have a separate layer (i.e. this code) manage
 11 | // I/O.  Users should be able to define their data structures as they
 12 | // see fit (i.e. they can use pointers, etc) but still control the
 13 | // granularity at which items are swapped to/from memory.
 14 | 
 15 | // Therefore, we define a swap_space::pointer type that represents a
 16 | // pointer from one swappable unit to another.  When the swapper elects
 17 | // to swap out an object X, it will swap out all the objects that X
 18 | // points to through regular C++ pointers.  All these objects will be
 19 | // written to a single place on the backing store, so this will be
 20 | // I/O-efficient.  The swapper does not traverse swap_space::pointers
 21 | // -- they point to separate things that should be swapped out
 22 | // independently of the thing pointing to them.
 23 | 
 24 | // The betree code provides an example of how this is used.  We want
 25 | // each node to be swapped in/out as a single unit, but separate nodes
 26 | // in the tree should be able to be swapped in/out independently of
 27 | // each other.  Therefore, nodes use swap_space::pointers to point to
 28 | // each other.  They use regular C++ pointers to point to internal
 29 | // items that should be serialized as part of the node.
 30 | 
 31 | // The swap_space needs to manage all pointers to swappable objects.
 32 | // New swappable objects should be created like this:
 33 | //      swap_space ss;
 34 | //      swap_space::pointer<T> p = ss.allocate(new T(constructor args));
 35 | 
 36 | // You can then use operator-> as normal, e.g.
 37 | //      p->some_field
 38 | //      p->some_method(args)
 39 | // Although operator* is not defined, it should be straightforward
 40 | // to add one.
 41 | 
 42 | // Invoking p->some_method() pins the object referred to by p in
 43 | // memory.  Thus, during the execution of some_method(), it is safe to
 44 | // dereference "this" and any other plain C++ pointers in the object.
 45 | 
 46 | // Objects are automatically garbage collected.  The garbage collector
 47 | // uses reference counting.
 48 | 
 49 | // The current system uses LRU to select items to swap.  The swap
 50 | // space has a user-specified in-memory cache size.  The cache size
 51 | // can be adjusted dynamically.
 52 | 
 53 | // Don't try to get your hands on an unwrapped pointer to the object
 54 | // or anything that is swapped in/out as part of the object.  It can
 55 | // only lead to trouble.  Casting is also probably a bad idea.  Just
 56 | // write nice, clean, type-safe, well-encapsulated code and everything
 57 | // should work just fine.
 58 | 
 59 | // Objects managed by this system must be sub-types of class
 60 | // serializable.  This basically defines two methods for serializing
 61 | // and deserializing the object.  See the betree for examples of
 62 | // implementing these methods.  We provide default implementations for
 63 | // a few basic types and STL containers.  Feel free to add more and
 64 | // submit patches as you need them.
 65 | 
 66 | // The current implementation serializes to a textual file format.
 67 | // This is just a convenience.  It would be nice to be able to swap in
 68 | // different formats.
 69 | 
 70 | #ifndef SWAP_SPACE_HPP
 71 | #define SWAP_SPACE_HPP
 72 | 
 73 | #include <cstdint>
 74 | #include <unordered_map>
 75 | #include <map>
 76 | #include <set>
 77 | #include <functional>
 78 | #include <sstream>
 79 | #include <cassert>
 80 | #include "backing_store.hpp"
 81 | #include "debug.hpp"
 82 | 
 83 | class swap_space;
 84 | 
 85 | // State shared by one (de)serialization pass: the swap space involved
 86 | // and a flag that starts true and is cleared when a pointer is written.
 87 | class serialization_context {
 88 | public:
 89 |   serialization_context(swap_space &sspace) : ss(sspace), is_leaf(true) {}
 90 | 
 91 |   swap_space &ss;
 92 |   bool is_leaf;
 93 | };
 94 | 
 95 | class serializable { // interface for anything that can be written out and read back
 96 | public:
 97 |   virtual void _serialize(std::iostream &fs, serialization_context &context) = 0;
 98 |   virtual void _deserialize(std::iostream &fs, serialization_context &context) = 0;
 99 |   virtual ~serializable(void) {}
100 | };
101 | 
102 | void serialize(std::iostream &fs, serialization_context &context, uint64_t x);
103 | void deserialize(std::iostream &fs, serialization_context &context, uint64_t &x);
104 | // Signed counterparts of the integer overloads above.
105 | void serialize(std::iostream &fs, serialization_context &context, int64_t x);
106 | void deserialize(std::iostream &fs, serialization_context &context, int64_t &x);
107 | // Strings are stored as "<length>," followed by the raw bytes.
108 | void serialize(std::iostream &fs, serialization_context &context, std::string x);
109 | void deserialize(std::iostream &fs, serialization_context &context, std::string &x);
110 | 
111 | // Write mp as "map <size> {", one "  key -> value" line per entry, "}".
112 | template<class Key, class Value>
113 | void serialize(std::iostream &fs, serialization_context &context,
114 |                std::map<Key, Value> &mp) {
115 |   fs << "map " << mp.size() << " {" << std::endl;
116 |   assert(fs.good());
117 |   for (auto &entry : mp) {
118 |     fs << "  ";
119 |     serialize(fs, context, entry.first);
120 |     fs << " -> ";
121 |     serialize(fs, context, entry.second);
122 |     fs << std::endl;
123 |   }
124 |   fs << "}" << std::endl;
125 | }
126 | 
127 | // Read a map in the "map <size> { key -> value ... }" text layout into mp.
128 | template<class Key, class Value>
129 | void deserialize(std::iostream &fs, serialization_context &context,
130 |                  std::map<Key, Value> &mp) {
131 |   std::string token; // receives the literal words and punctuation
132 |   int count = 0;
133 |   fs >> token >> count >> token;
134 |   assert(fs.good());
135 |   for (int remaining = count; remaining > 0; --remaining) {
136 |     Key k;
137 |     Value v;
138 |     deserialize(fs, context, k);
139 |     fs >> token; // the "->" between key and value
140 |     deserialize(fs, context, v);
141 |     mp[k] = v;
142 |   }
143 |   fs >> token; // the closing "}"
144 | }
145 | 
146 | // Write the "pointer " tag followed by the pointee itself.
147 | template<class X> void serialize(std::iostream &fs, serialization_context &context, X *&x) {
148 |   fs << "pointer ";
149 |   serialize(fs, context, *x);
150 | }
151 | 
152 | // Allocate a fresh X and fill it from the stream after the "pointer" tag.
153 | template<class X> void deserialize(std::iostream &fs, serialization_context &context, X *&x) {
154 |   x = new X;
155 |   std::string tag;
156 |   fs >> tag;
157 |   assert (tag == "pointer");
158 |   deserialize(fs, context, *x);
159 | }
160 | 
161 | // Generic overload: delegate to the object's own _serialize().
162 | template<class X> void serialize(std::iostream &fs, serialization_context &context, X &x) {
163 |   x._serialize(fs, context);
164 | }
165 | 
166 | // Generic overload: delegate to the object's own _deserialize().
167 | template<class X> void deserialize(std::iostream &fs, serialization_context &context, X &x) {
168 |   x._deserialize(fs, context);
169 | }
170 | 
171 | // Manages serializable objects on behalf of its pointers, keeping a
172 | // bounded number of them in memory and the rest in a backing_store.
173 | class swap_space {
174 | public:
175 |   swap_space(backing_store *bs, uint64_t n);
176 | 
177 |   template<class Referent> class pointer;
178 | 
179 |   // Register tgt with this swap space and return the pointer managing it.
180 |   template<class Referent>
181 |   pointer<Referent> allocate(Referent * tgt) {
182 |     return pointer<Referent>(this, tgt);
183 |   }
184 | 
185 |   // This pins an object in memory for the duration of a member
186 |   // access.  It's sort of an instance of the "resource acquisition is
187 |   // initialization" paradigm.
188 |   template<class Referent>
189 |   class pin {
190 |   public:
191 |     const Referent * operator->(void) const {
192 |       assert(ss->objects.count(target) > 0);
193 |       debug(std::cout << "Accessing (constly) " << target
194 |             << " (" << ss->objects[target]->target << ")" << std::endl);
195 |       access(target, false);
196 |       return static_cast<const Referent *>(ss->objects[target]->target);
197 |     }
198 | 
199 |     Referent * operator->(void) {
200 |       assert(ss->objects.count(target) > 0);
201 |       debug(std::cout << "Accessing " << target
202 |             << " (" << ss->objects[target]->target << ")" << std::endl);
203 |       access(target, true);
204 |       return static_cast<Referent *>(ss->objects[target]->target);
205 |     }
206 | 
207 |     pin(const pointer<Referent> *p) : ss(NULL), target(0) {
208 |       dopin(p->ss, p->target);
209 |     }
210 | 
211 |     pin(void) : ss(NULL), target(0) {}
212 | 
213 |     // A copy takes its own pin, so each destructor call is matched by
214 |     // exactly one pincount increment.
215 |     pin(const pin &other) : ss(NULL), target(0) {
216 |       dopin(other.ss, other.target);
217 |     }
218 | 
219 |     ~pin(void) { unpin(); }
220 | 
221 |     pin &operator=(const pin &other) {
222 |       if (&other != this) {
223 |         unpin();
224 |         dopin(other.ss, other.target);
225 |       }
226 |       return *this;
227 |     }
228 | 
229 |   private:
230 |     void unpin(void) {
231 |       if (target > 0) {
232 |         assert(ss->objects.count(target) > 0);
233 |         // Log only for a real target: ss is NULL when nothing is pinned.
234 |         debug(std::cout << "Unpinning " << target
235 |               << " (" << ss->objects[target]->target << ")" << std::endl);
236 |         ss->objects[target]->pincount--;
237 |         ss->maybe_evict_something();
238 |       }
239 |       ss = NULL;
240 |       target = 0;
241 |     }
242 | 
243 |     void dopin(swap_space *newss, uint64_t newtarget) {
244 |       assert(ss == NULL && target == 0);
245 |       ss = newss;
246 |       target = newtarget;
247 |       if (target > 0) {
248 |         assert(ss->objects.count(target) > 0);
249 |         debug(std::cout << "Pinning " << target
250 |               << " (" << ss->objects[target]->target << ")" << std::endl);
251 |         ss->objects[target]->pincount++;
252 |       }
253 |     }
254 | 
255 |     // Mark the object as just used (and dirty if requested), make sure
256 |     // it is loaded, then give the swap space a chance to evict.
257 |     void access(uint64_t tgt, bool dirty) const {
258 |       assert(ss->objects.count(tgt) > 0);
259 |       object *obj = ss->objects[tgt];
260 |       ss->lru_pqueue.erase(obj);
261 |       obj->last_access = ss->next_access_time++;
262 |       ss->lru_pqueue.insert(obj);
263 |       obj->target_is_dirty |= dirty;
264 |       ss->load<Referent>(tgt);
265 |       ss->maybe_evict_something();
266 |     }
267 | 
268 |     swap_space *ss;
269 |     uint64_t target;
270 |   };
271 | 
272 |   // Reference-counted handle to an object managed by the swap space.
273 |   // Going through operator-> pins the object for the member access.
274 |   template<class Referent>
275 |   class pointer : public serializable {
276 |     friend class swap_space;
277 |     friend class pin<Referent>;
278 | 
279 |   public:
280 |     pointer(void) : ss(NULL), target(0) {}
281 | 
282 |     pointer(const pointer &other) {
283 |       ss = other.ss;
284 |       target = other.target;
285 |       if (target > 0) {
286 |         assert(ss->objects.count(target) > 0);
287 |         ss->objects[target]->refcount++;
288 |       }
289 |     }
290 | 
291 |     ~pointer(void) { depoint(); }
292 | 
293 |     // Drop this reference and destroy the object if it was the last one.
294 |     void depoint(void) {
295 |       if (target == 0)
296 |         return;
297 |       assert(ss->objects.count(target) > 0);
298 | 
299 |       object *obj = ss->objects[target];
300 |       assert(obj->refcount > 0);
301 |       if ((--obj->refcount) == 0) {
302 |         debug(std::cout << "Erasing " << target << std::endl);
303 |         // Load it into memory so we can recursively free stuff
304 |         if (obj->target == NULL) {
305 |           assert(obj->bsid > 0);
306 |           if (!obj->is_leaf) {
307 |             ss->load<Referent>(target);
308 |           } else {
309 |             debug(std::cout << "Skipping load of leaf " << target << std::endl);
310 |           }
311 |         }
312 |         ss->objects.erase(target);
313 |         ss->lru_pqueue.erase(obj);
314 |         // Only a loaded target counts towards current_in_memory_objects;
315 |         // a leaf that was never loaded must not decrement the counter.
316 |         if (obj->target) {
317 |           delete obj->target;
318 |           ss->current_in_memory_objects--;
319 |         }
320 |         if (obj->bsid > 0)
321 |           ss->backstore->deallocate(obj->bsid);
322 |         delete obj;
323 |       }
324 |       target = 0;
325 |     }
326 | 
327 |     // Take the new reference before dropping the old one, in case other
328 |     // lives inside the object that depoint() may destroy.
329 |     pointer & operator=(const pointer &other) {
330 |       if (&other != this) {
331 |         swap_space *newss = other.ss;
332 |         const uint64_t newtarget = other.target;
333 |         if (newtarget > 0) {
334 |           assert(newss->objects.count(newtarget) > 0);
335 |           newss->objects[newtarget]->refcount++;
336 |         }
337 |         depoint();
338 |         ss = newss;
339 |         target = newtarget;
340 |       }
341 |       return *this;
342 |     }
343 | 
344 |     bool operator==(const pointer &other) const {
345 |       return ss == other.ss && target == other.target;
346 |     }
347 | 
348 |     bool operator!=(const pointer &other) const {
349 |       return !operator==(other);
350 |     }
351 | 
352 |     const pin<Referent> operator->(void) const { return pin<Referent>(this); }
353 |     pin<Referent> operator->(void) { return pin<Referent>(this); }
354 | 
355 |     pin<Referent> get_pin(void) { return pin<Referent>(this); }
356 |     const pin<Referent> get_pin(void) const { return pin<Referent>(this); }
357 | 
358 |     bool is_in_memory(void) const {
359 |       if (target == 0)
360 |         return false; // a null pointer refers to nothing
361 |       assert(ss->objects.count(target) > 0);
362 |       return ss->objects[target]->target != NULL;
363 |     }
364 | 
365 |     bool is_dirty(void) const {
366 |       if (target == 0)
367 |         return false;
368 |       assert(ss->objects.count(target) > 0);
369 |       return ss->objects[target]->target && ss->objects[target]->target_is_dirty;
370 |     }
371 | 
372 |     // Write the target id and clear this pointer without touching the
373 |     // refcount: the serialized copy now stands for the reference.
374 |     void _serialize(std::iostream &fs, serialization_context &context) {
375 |       assert(target > 0);
376 |       assert(context.ss.objects.count(target) > 0);
377 |       fs << target << " ";
378 |       target = 0;
379 |       assert(fs.good());
380 |       context.is_leaf = false;
381 |     }
382 | 
383 |     void _deserialize(std::iostream &fs, serialization_context &context) {
384 |       assert(target == 0);
385 |       ss = &context.ss;
386 |       fs >> target;
387 |       assert(fs.good());
388 |       assert(context.ss.objects.count(target) > 0);
389 |       // We just created a new reference to this object and
390 |       // invalidated the on-disk reference, so the total refcount
391 |       // stays the same.
392 |     }
393 | 
394 |   private:
395 |     swap_space *ss;
396 |     uint64_t target;
397 | 
398 |     // Only callable through swap_space::allocate(...)
399 |     pointer(swap_space *sspace, Referent *tgt) {
400 |       ss = sspace;
401 |       object *o = new object(sspace, tgt);
402 |       assert(o != NULL);
403 |       target = o->id;
404 |       assert(ss->objects.count(target) == 0);
405 |       ss->objects[target] = o;
406 |       ss->lru_pqueue.insert(o);
407 |       ss->current_in_memory_objects++;
408 |       ss->maybe_evict_something();
409 |     }
410 |   };
411 | 
412 | private:
413 |   backing_store *backstore;
414 | 
415 |   uint64_t next_id = 1;          // ids start at 1; target 0 means "no object"
416 |   uint64_t next_access_time = 0;
417 | 
418 |   // Bookkeeping record for one managed object.
419 |   class object {
420 |   public:
421 |     object(swap_space *sspace, serializable * tgt);
422 | 
423 |     serializable * target;   // in-memory copy, or NULL while swapped out
424 |     uint64_t id;
425 |     uint64_t bsid;           // backing-store id; 0 until first written back
426 |     bool is_leaf;
427 |     uint64_t refcount;
428 |     uint64_t last_access;
429 |     bool target_is_dirty;
430 |     uint64_t pincount;
431 |   };
432 | 
433 |   // Orders lru_pqueue: least recently accessed object first.
434 |   static bool cmp_by_last_access(object *a, object *b);
435 | 
436 |   // Bring object tgt into memory from the backing store if it is not
437 |   // already there.
438 |   template<class Referent>
439 |   void load(uint64_t tgt) {
440 |     assert(objects.count(tgt) > 0);
441 |     if (objects[tgt]->target == NULL) {
442 |       object *obj = objects[tgt];
443 |       debug(std::cout << "Loading " << obj->id << std::endl);
444 |       std::iostream *in = backstore->get(obj->bsid);
445 |       Referent *r = new Referent();
446 |       serialization_context ctxt(*this);
447 |       deserialize(*in, ctxt, *r);
448 |       backstore->put(in);
449 |       obj->target = r;
450 |       current_in_memory_objects++;
451 |     }
452 |   }
453 | 
454 |   void set_cache_size(uint64_t sz);
455 | 
456 |   void write_back(object *obj);
457 |   void maybe_evict_something(void);
458 | 
459 |   uint64_t max_in_memory_objects;
460 |   uint64_t current_in_memory_objects = 0;
461 |   std::unordered_map<uint64_t, object *> objects;
462 |   std::set<object *, bool (*)(object *, object *)> lru_pqueue;
463 | };
464 | 
465 | #endif // SWAP_SPACE_HPP
466 | 


--------------------------------------------------------------------------------
/test.cpp:
--------------------------------------------------------------------------------
  1 | // This test program performs a series of inserts, deletes, updates,
  2 | // and queries to a betree.  It performs the same sequence of
  3 | // operations on a std::map.  It checks that it always gets the same
  4 | // result from both data structures.
  5 | 
  6 | // The program takes 1 command-line parameter -- the number of
  7 | // distinct keys it can use in the test.
  8 | 
  9 | // The values in this test are strings.  Since updates use operator+
 10 | // on the values, this test performs concatenation on the strings.
 11 | 
 12 | #include <string.h>
 13 | #include <sys/types.h>
 14 | #include <sys/time.h>
 15 | #include <unistd.h>
 16 | #include "betree.hpp"
 17 | 
 18 | // Start an interval: subtract the current time (microseconds) from timer.
 19 | void timer_start(uint64_t &timer) {
 20 |   struct timeval t;
 21 |   if (gettimeofday(&t, NULL) != 0) abort(); // checked even under NDEBUG
 22 |   timer -= 1000000*t.tv_sec + t.tv_usec;
 23 | }
 24 | 
 25 | // End an interval: add the current time (microseconds) to timer.
 26 | void timer_stop(uint64_t &timer) {
 27 |   struct timeval t;
 28 |   if (gettimeofday(&t, NULL) != 0) abort(); // checked even under NDEBUG
 29 |   timer += 1000000*t.tv_sec + t.tv_usec;
 30 | }
 31 | 
 32 | // Parse one command: sets *op (0..6) and *arg; returns 0, or EOF at end.
 33 | int next_command(FILE *input, int *op, uint64_t *arg) {
 34 |   char command[64];
 35 |   unsigned long long value = 0; // matches %llu; copied to *arg once parsed
 36 |   // %63s keeps the token and its terminator inside command[].
 37 |   int ret = fscanf(input, "%63s %llu", command, &value);
 38 |   if (ret == EOF)
 39 |     return EOF;
 40 |   else if (ret != 2) {
 41 |     fprintf(stderr, "Parse error\n");
 42 |     exit(3);
 43 |   }
 44 |   *arg = value;
 45 | 
 46 |   if (strcmp(command, "Inserting") == 0) {
 47 |     *op = 0;
 48 |   } else if (strcmp(command, "Updating") == 0) {
 49 |     *op = 1;
 50 |   } else if (strcmp(command, "Deleting") == 0) {
 51 |     *op = 2;
 52 |   } else if (strcmp(command, "Query") == 0) {
 53 |     *op = 3;
 54 |     if (1 != fscanf(input, " -> %63s", command)) {
 55 |       fprintf(stderr, "Parse error\n");
 56 |       exit(3);
 57 |     }
 58 |   } else if (strcmp(command, "Full_scan") == 0) {
 59 |     *op = 4;
 60 |   } else if (strcmp(command, "Lower_bound_scan") == 0) {
 61 |     *op = 5;
 62 |   } else if (strcmp(command, "Upper_bound_scan") == 0) {
 63 |     *op = 6;
 64 |   } else {
 65 |     fprintf(stderr, "Unknown command: %s\n", command);
 66 |     exit(1);
 67 |   }
 68 |   return 0;
 69 | }
 70 | 
 71 | template<class Key, class Value>
 72 | void do_scan(typename betree<Key, Value>::iterator &betit,
 73 | 	     typename std::map<Key, Value>::iterator &refit,
 74 | 	     betree<Key, Value> &b,
 75 | 	     typename std::map<Key, Value> &reference)
 76 | {
 77 |   while (refit != reference.end()) {
 78 |     assert(betit != b.end());
 79 |     assert(betit.first == refit->first);
 80 |     assert(betit.second == refit->second);
 81 |     ++refit;
 82 |     if (refit == reference.end()) {
 83 |       debug(std::cout << "Almost done" << std::endl);
 84 |     }
 85 |     ++betit;
 86 |   }
 87 |   assert(betit == b.end());
 88 | }
 89 | 
 90 | #define DEFAULT_TEST_MAX_NODE_SIZE (1ULL<<6)
 91 | #define DEFAULT_TEST_MIN_FLUSH_SIZE (DEFAULT_TEST_MAX_NODE_SIZE / 4)
 92 | #define DEFAULT_TEST_CACHE_SIZE (4)
 93 | #define DEFAULT_TEST_NDISTINCT_KEYS (1ULL << 10)
 94 | #define DEFAULT_TEST_NOPS (1ULL << 12)
 95 | 
 96 | void usage(char *name)
 97 | {
 98 |   std::cout
 99 |     << "Usage: " << name << " [OPTIONS]" << std::endl
100 |     << "Tests the betree implementation" << std::endl
101 |     << std::endl
102 |     << "Options are" << std::endl
103 |     << "  Required:"   << std::endl
104 |     << "    -d <backing_store_directory>                    [ default: none, parameter is required ]"           << std::endl
105 |     << "    -m  <mode>  (test or benchmark-<mode>)          [ default: none, parameter required ]"              << std::endl
106 |     << "        benchmark modes:"                                                                               << std::endl
107 |     << "          upserts    "                                                                                  << std::endl
108 |     << "          queries    "                                                                                  << std::endl
109 |     << "  Betree tuning parameters:" << std::endl
110 |     << "    -N <max_node_size>            (in elements)     [ default: " << DEFAULT_TEST_MAX_NODE_SIZE  << " ]" << std::endl
111 |     << "    -f <min_flush_size>           (in elements)     [ default: " << DEFAULT_TEST_MIN_FLUSH_SIZE << " ]" << std::endl
112 |     << "    -C <max_cache_size>           (in betree nodes) [ default: " << DEFAULT_TEST_CACHE_SIZE     << " ]" << std::endl
113 |     << "  Options for both tests and benchmarks" << std::endl
114 |     << "    -k <number_of_distinct_keys>                    [ default: " << DEFAULT_TEST_NDISTINCT_KEYS << " ]" << std::endl
115 |     << "    -t <number_of_operations>                       [ default: " << DEFAULT_TEST_NOPS           << " ]" << std::endl
116 |     << "    -s <random_seed>                                [ default: random ]"                                << std::endl
117 |     << "  Test scripting options" << std::endl
118 |     << "    -o <output_script>                              [ default: no output ]"                             << std::endl
119 |     << "    -i <script_file>                                [ default: none ]"                                  << std::endl;
120 | }
121 | 
122 | int test(betree<uint64_t, std::string> &b,
123 | 	 uint64_t nops,
124 | 	 uint64_t number_of_distinct_keys,
125 | 	 FILE *script_input,
126 | 	 FILE *script_output)
127 | {
128 |   std::map<uint64_t, std::string> reference;
129 | 
130 |   for (unsigned int i = 0; i < nops; i++) {
131 |     int op;
132 |     uint64_t t;
133 |     if (script_input) {
134 |       int r = next_command(script_input, &op, &t);
135 |       if (r == EOF)
136 | 	exit(0);
137 |       else if (r < 0)
138 | 	exit(4);
139 |     } else {
140 |       op = rand() % 7;
141 |       t = rand() % number_of_distinct_keys;
142 |     }
143 |     
144 |     switch (op) {
145 |     case 0: // insert
146 |       if (script_output)
147 | 	fprintf(script_output, "Inserting %lu\n", t);
148 |       b.insert(t, std::to_string(t) + ":");
149 |       reference[t] = std::to_string(t) + ":";
150 |       break;
151 |     case 1: // update
152 |       if (script_output)
153 | 	fprintf(script_output, "Updating %lu\n", t);
154 |       b.update(t, std::to_string(t) + ":");
155 |       if (reference.count(t) > 0)
156 |       	reference[t] += std::to_string(t) + ":";
157 |       else
158 |       	reference[t] = std::to_string(t) + ":";
159 |       break;
160 |     case 2: // delete
161 |       if (script_output)
162 | 	fprintf(script_output, "Deleting %lu\n", t);
163 |       b.erase(t);
164 |       reference.erase(t);
165 |       break;
166 |     case 3: // query
167 |       try {
168 | 	std::string bval = b.query(t);
169 | 	assert(reference.count(t) > 0);
170 | 	std::string rval = reference[t];
171 | 	assert(bval == rval);
172 | 	if (script_output)
173 | 	  fprintf(script_output, "Query %lu -> %s\n", t, bval.c_str());
174 |       } catch (std::out_of_range e) {
175 | 	if (script_output)
176 | 	  fprintf(script_output, "Query %lu -> DNE\n", t);
177 | 	assert(reference.count(t) == 0);
178 |       }
179 |       break;
180 |     case 4: // full scan
181 |       {
182 | 	if (script_output)
183 | 	  fprintf(script_output, "Full_scan 0\n");
184 | 	auto betit = b.begin();
185 | 	auto refit = reference.begin();
186 | 	do_scan(betit, refit, b, reference);
187 |       }
188 |       break;
189 |     case 5: // lower-bound scan
190 |       {
191 | 	if (script_output)
192 | 	  fprintf(script_output, "Lower_bound_scan %lu\n", t);
193 | 	auto betit = b.lower_bound(t);
194 | 	auto refit = reference.lower_bound(t);
195 | 	do_scan(betit, refit, b, reference);
196 |       }
197 |       break;
198 |     case 6: // scan
199 |       {
200 | 	if (script_output)
201 | 	  fprintf(script_output, "Upper_bound_scan %lu\n", t);
202 | 	auto betit = b.upper_bound(t);
203 | 	auto refit = reference.upper_bound(t);
204 | 	do_scan(betit, refit, b, reference);
205 |       }
206 |       break;
207 |     default:
208 |       abort();
209 |     }
210 |   }
211 | 
212 |   std::cout << "Test PASSED" << std::endl;
213 |   
214 |   return 0;
215 | }
216 | 
217 | void benchmark_upserts(betree<uint64_t, std::string> &b,
218 | 		       uint64_t nops,
219 | 		       uint64_t number_of_distinct_keys,
220 | 		       uint64_t random_seed)
221 | {
222 |   uint64_t overall_timer = 0;
223 |   for (uint64_t j = 0; j < 100; j++) {
224 |     uint64_t timer = 0;
225 |     timer_start(timer);
226 |     for (uint64_t i = 0; i < nops / 100; i++) {
227 |       uint64_t t = rand() % number_of_distinct_keys;
228 |       b.update(t, std::to_string(t) + ":");
229 |     }
230 |     timer_stop(timer);
231 |     printf("%ld %ld %ld\n", j, nops/100, timer);
232 |     overall_timer += timer;
233 |   }
234 |   printf("# overall: %ld %ld\n", 100*(nops/100), overall_timer);
235 | }
236 | 
237 | void benchmark_queries(betree<uint64_t, std::string> &b,
238 | 		       uint64_t nops,
239 | 		       uint64_t number_of_distinct_keys,
240 | 		       uint64_t random_seed)
241 | {
242 |   
243 |   // Pre-load the tree with data
244 |   srand(random_seed);
245 |   for (uint64_t i = 0; i < nops; i++) {
246 |     uint64_t t = rand() % number_of_distinct_keys;
247 |     b.update(t, std::to_string(t) + ":");
248 |   }
249 | 
250 | 	// Now go back and query it
251 |   srand(random_seed);
252 |   uint64_t overall_timer = 0;
253 | 	timer_start(overall_timer);
254 |   for (uint64_t i = 0; i < nops; i++) {
255 |     uint64_t t = rand() % number_of_distinct_keys;
256 |     b.query(t);
257 |   }
258 | 	timer_stop(overall_timer);
259 |   printf("# overall: %ld %ld\n", nops, overall_timer);
260 | 
261 | }
262 | 
263 | int main(int argc, char **argv)
264 | {
265 |   char *mode = NULL;
266 |   uint64_t max_node_size = DEFAULT_TEST_MAX_NODE_SIZE;
267 |   uint64_t min_flush_size = DEFAULT_TEST_MIN_FLUSH_SIZE;
268 |   uint64_t cache_size = DEFAULT_TEST_CACHE_SIZE;
269 |   char *backing_store_dir = NULL;
270 |   uint64_t number_of_distinct_keys = DEFAULT_TEST_NDISTINCT_KEYS;
271 |   uint64_t nops = DEFAULT_TEST_NOPS;
272 |   char *script_infile = NULL;
273 |   char *script_outfile = NULL;
274 |   unsigned int random_seed = time(NULL) * getpid();
275 |  
276 |   int opt;
277 |   char *term;
278 |     
279 |   //////////////////////
280 |   // Argument parsing //
281 |   //////////////////////
282 |   
283 |   while ((opt = getopt(argc, argv, "m:d:N:f:C:o:k:t:s:i:")) != -1) {
284 |     switch (opt) {
285 |     case 'm':
286 |       mode = optarg;
287 |       break;
288 |     case 'd':
289 |       backing_store_dir = optarg;
290 |       break;
291 |     case 'N':
292 |       max_node_size = strtoull(optarg, &term, 10);
293 |       if (*term) {
294 | 	std::cerr << "Argument to -N must be an integer" << std::endl;
295 | 	usage(argv[0]);
296 | 	exit(1);
297 |       }
298 |       break;
299 |     case 'f':
300 |       min_flush_size = strtoull(optarg, &term, 10);
301 |       if (*term) {
302 | 	std::cerr << "Argument to -f must be an integer" << std::endl;
303 | 	usage(argv[0]);
304 | 	exit(1);
305 |       }
306 |       break;
307 |     case 'C':
308 |       cache_size = strtoull(optarg, &term, 10);
309 |       if (*term) {
310 | 	std::cerr << "Argument to -C must be an integer" << std::endl;
311 | 	usage(argv[0]);
312 | 	exit(1);
313 |       }
314 |       break;
315 |     case 'o':
316 |       script_outfile = optarg;
317 |       break;
318 |     case 'k':
319 |       number_of_distinct_keys = strtoull(optarg, &term, 10);
320 |       if (*term) {
321 | 	std::cerr << "Argument to -k must be an integer" << std::endl;
322 | 	usage(argv[0]);
323 | 	exit(1);
324 |       }
325 |       break;
326 |     case 't':
327 |       nops = strtoull(optarg, &term, 10);
328 |       if (*term) {
329 | 	std::cerr << "Argument to -t must be an integer" << std::endl;
330 | 	usage(argv[0]);
331 | 	exit(1);
332 |       }
333 |       break;
334 |     case 's':
335 |       random_seed = strtoull(optarg, &term, 10);
336 |       if (*term) {
337 | 	std::cerr << "Argument to -s must be an integer" << std::endl;
338 | 	usage(argv[0]);
339 | 	exit(1);
340 |       }
341 |       break;
342 |     case 'i':
343 |       script_infile = optarg;
344 |       break;
345 |     default:
346 |       std::cerr << "Unknown option '" << (char)opt << "'" << std::endl;
347 |       usage(argv[0]);
348 |       exit(1);
349 |     }
350 |   }
351 |   
352 |   FILE *script_input = NULL;
353 |   FILE *script_output = NULL;
354 | 
355 |   if (mode == NULL ||
356 |       (strcmp(mode, "test") != 0
357 |        && strcmp(mode, "benchmark-upserts") != 0
358 | 			 && strcmp(mode, "benchmark-queries") != 0)) {
359 |     std::cerr << "Must specify a mode of \"test\" or \"benchmark\"" << std::endl;
360 |     usage(argv[0]);
361 |     exit(1);
362 |   }
363 | 
364 |   if (strncmp(mode, "benchmark", strlen("benchmark")) == 0) {
365 |     if (script_infile) {
366 |       std::cerr << "Cannot specify an input script in benchmark mode" << std::endl;
367 |       usage(argv[0]);
368 |       exit(1);
369 |     }
370 |     if (script_outfile) {
371 |       std::cerr << "Cannot specify an output script in benchmark mode" << std::endl;
372 |       usage(argv[0]);
373 |       exit(1);
374 |     }
375 |   }
376 |   
377 |   if (script_infile) {
378 |     script_input = fopen(script_infile, "r");
379 |     if (script_input == NULL) {
380 |       perror("Couldn't open input file");
381 |       exit(1);
382 |     }
383 |   }
384 |   
385 |   if (script_outfile) {
386 |     script_output = fopen(script_outfile, "w");
387 |     if (script_output == NULL) {
388 |       perror("Couldn't open output file");
389 |       exit(1);
390 |     }
391 |   }
392 | 
393 |   srand(random_seed);
394 | 
395 |   if (backing_store_dir == NULL) {
396 |     std::cerr << "-d <backing_store_directory> is required" << std::endl;
397 |     usage(argv[0]);
398 |     exit(1);
399 |   }
400 |   
401 |   ////////////////////////////////////////////////////////
402 |   // Construct a betree and run the tests or benchmarks //
403 |   ////////////////////////////////////////////////////////
404 |   
405 |   one_file_per_object_backing_store ofpobs(backing_store_dir);
406 |   swap_space sspace(&ofpobs, cache_size);
407 |   betree<uint64_t, std::string> b(&sspace, max_node_size, min_flush_size);
408 | 
409 |   if (strcmp(mode, "test") == 0) 
410 |     test(b, nops, number_of_distinct_keys, script_input, script_output);
411 |   else if (strcmp(mode, "benchmark-upserts") == 0)
412 |     benchmark_upserts(b, nops, number_of_distinct_keys, random_seed);
413 |   else if (strcmp(mode, "benchmark-queries") == 0)
414 |     benchmark_queries(b, nops, number_of_distinct_keys, random_seed);
415 |   
416 |   if (script_input)
417 |     fclose(script_input);
418 |   
419 |   if (script_output)
420 |     fclose(script_output);
421 | 
422 |   return 0;
423 | }
424 | 
425 | 


--------------------------------------------------------------------------------